Source code for mlens.parallel.blend

"""ML-ENSEMBLE

:author: Sebastian Flennerhag
:copyright: 2017
:licence: MIT

Estimation engine for parallel preprocessing of blend layer.
"""
from ._base_functions import predict_fold_est, fit, predict, construct_args
from .estimation import BaseEstimator
from ..externals.joblib import delayed
from ..externals.sklearn.base import clone

from copy import deepcopy


FUNCS = {'fit': fit,
         'predict': predict,
         'predict_proba': predict,
         }


###############################################################################
class Blender(BaseEstimator):

    """Estimation sub-process that fits a layer by blending.

    Parameters
    ----------
    job : object
        Job specification. Its ``dir`` attribute is stored on the instance
        and its ``job`` attribute names the function to execute.
    layer : object
        Layer to estimate; forwarded to the parent constructor.
    """

    def __init__(self, job, layer):
        super(Blender, self).__init__(layer=layer)
        self.dir = job.dir

        # 'transform' is implemented in this module; every other job name is
        # resolved through the FUNCS lookup table.
        if job.job == 'transform':
            self.execute = transform
        else:
            self.execute = FUNCS[job.job]

        self.args = construct_args(self.execute, job)

    def run(self, parallel):
        """Execute blending by delegating to the parent ``run``."""
        super(Blender, self).run(parallel)

    def _format_instance_list(self):
        """Expand estimators and preprocessing over the layer's indexer.

        The expanded estimator list is stored on ``self.e`` and the expanded
        preprocessing list on ``self.t``.
        """
        lyr = self.layer
        self.e = _expand_instance_list(lyr.estimators, lyr.indexer)
        self.t = _expand_instance_list(lyr.preprocessing, lyr.indexer)

    def _get_col_id(self):
        """Store a unique column id for every estimator on ``self.c``."""
        lyr = self.layer
        # Layers without a ``classes_`` attribute use a single label.
        n_labels = getattr(lyr, 'classes_', 1)
        n_propagated = lyr.n_feature_prop
        self.c = _get_col_idx(
            lyr.preprocessing, lyr.estimators, n_labels, n_propagated)

    def _build_scores(self, s):
        """Build a cv-score mapping.

        Parameters
        ----------
        s : iterable
            Pairs of ``(key, score)`` where ``key`` splits on ``'___'`` into
            exactly a case name and an estimator name.

        Returns
        -------
        dict
            Maps ``'<case>__<est>'`` (or just ``'<est>'`` when the case name
            is empty) to a ``(mean, std)`` tuple; std is always ``0.``.
        """
        scores = {}
        for key, mean_score in s:
            case_name, est_name = key.split('___')
            name = (est_name if case_name == ''
                    else '%s__%s' % (case_name, est_name))
            scores[name] = (mean_score, 0.)
        return scores
def transform(inst, X, P, parallel):
    """Predict X.

    A blend ensemble does not use folds, so transform coincides with
    predict, except that the prediction made during fitting covers only a
    subset of X.

    Parameters
    ----------
    inst : object
        Estimation instance; must provide ``_check_fitted``, ``_retrieve``
        and a ``layer`` with a ``_predict_attr`` attribute.
    X : object
        Input data handed to every prediction job as ``x``.
    P : object
        Handed to every prediction job as ``pred`` (presumably the
        prediction matrix to fill -- confirm against ``predict_fold_est``).
    parallel : callable
        Called once with the iterable of delayed prediction jobs.
    """
    inst._check_fitted()
    pred_method = inst.layer._predict_attr

    # Blend keeps only estimators fitted on 'full': no folds are used when
    # building the prediction matrix during fitting.
    prep, ests = inst._retrieve('full')

    def _transformers_for(case):
        # Each job gets its own copy of the preprocessing pipeline; with no
        # preprocessing at all, an empty pipeline is used instead.
        if prep is None:
            return []
        return deepcopy(prep[case])

    jobs = (
        delayed(predict_fold_est)(tr_list=_transformers_for(case),
                                  est=est,
                                  x=X,
                                  pred=P,
                                  idx=idx,
                                  attr=pred_method)
        for case, (_, est, idx) in ests
    )
    parallel(jobs)
############################################################################### def _expand_instance_list(instance_list, indexer=None): """Build a list of estimation tuples with train and test indices.""" if instance_list is None or len(instance_list) == 0: # Capture cases when there is no preprocessing to avoid running a # parallel job. return None elif isinstance(instance_list, dict): # List entries have format: # (case, train_idx, test_idx, est_list) # Each est_list have entries (est_name, cloned_est) if indexer is not None: return [('%s' % case, tri, tei, [('%s' % n, clone(e)) for n, e in instance_list[case]]) for case in sorted(instance_list) for tri, tei in indexer.generate() ] else: # No cases to worry about: expand the list of named instance tuples # List entries have format: # ('inst', train_idx, test_idx, est_list) # Each est_list have entries (est_name, cloned_est) if indexer is not None: return [(None, tri, tei, [('%s' % n, clone(e)) for n, e in instance_list]) for tri, tei in indexer.generate() ] def _get_col_idx(preprocessing, estimators, labels, n_feature_prop): """Utility for assigning each ``est`` in each ``prep`` a unique ``col_id``. Parameters ---------- preprocessing : dict dictionary of preprocessing cases. estimators : dict dictionary of lists of estimators per preprocessing case. labels : int number of labels to expand col_id with n_feature_prop : int number of features being propagated. Predictions are concatenated from the right. """ inc = 1 if labels is None else labels if isinstance(preprocessing, list) or preprocessing is None: # Simple iteration of list idx = {(None, inst_name): int(n_feature_prop + inc * i) for i, (inst_name, _) in enumerate(estimators)} else: # Nested for loop required case_list, idx, col = sorted(preprocessing), dict(), n_feature_prop for case in case_list: for inst_name, _ in estimators[case]: idx[case, inst_name] = col col += inc return idx