Source code for mlens.parallel.stack


:author: Sebastian Flennerhag
:copyright: 2017
:licence: MIT

Estimation engine for parallel preprocessing of a stacked layer.

from .estimation import BaseEstimator
from ..externals.sklearn.base import clone

class Stacker(BaseEstimator):
    """Sub-process estimator that fits a layer by stacking.

    Construction is delegated to the base estimator, after which the
    default initialization routine is run for the requested job.

    Parameters
    ----------
    job : object
        job specification forwarded to ``_default_initialization``.

    layer : object
        layer instance to fit; forwarded to the base class constructor.
    """

    def __init__(self, job, layer):
        super(Stacker, self).__init__(layer=layer)
        self._default_initialization(job)

    def run(self, parallel):
        """Run the stacking job by delegating to the base class ``run``."""
        super(Stacker, self).run(parallel)

    def _format_instance_list(self):
        """Build the per-fold estimator and preprocessing lists.

        The expanded estimators are stored on ``self.e`` and the expanded
        preprocessing pipelines on ``self.t``.
        """
        # Estimators are expanded before the preprocessing pipelines.
        self.e = _expand_instance_list(
            self.layer.estimators, self.layer.indexer)
        self.t = _expand_instance_list(
            self.layer.preprocessing, self.layer.indexer)

    def _get_col_id(self):
        """Compute the column mapping of every estimator into ``self.c``."""
        # ``classes_`` is optional on the layer; default to one column
        # per estimator when it is absent.
        n_labels = getattr(self.layer, 'classes_', 1)
        n_cases = len(self.layer.cases)
        n_propagated = self.layer.n_feature_prop
        self.c = _get_col_idx(self.e, n_cases, n_labels, n_propagated)
############################################################################### def _expand_instance_list(instance_list, indexer): """Build a list of fold-specific estimation tuples w. train and test idx. The full learner library is copied for each fold and used for building the Z matrix of dimensions n * (L), where n is the number of training samples and L number of base learners. Examples -------- Passing a list estimators >>> import numpy as np >>> from mlens.utils.dummy import OLS >>> from mlens.base import FoldIndex >>> from mlens.parallel.stack import _expand_instance_list >>> X = np.arange(12) >>> indexer = FoldIndex(3, X=X) >>> instance_list = [('%i' % i, OLS()) for i in range(2)] >>> _expand_instance_list(instance_list, indexer) [(None, None, None, [('0', OLS(offset=0)), ('1', OLS(offset=0))]), (None, ((4, 12),), (0, 4), [('0__f0', OLS(offset=0)), ('1__f0', OLS(offset=0))]), (None, ((0, 4), (8, 12)), (4, 8), [('0__f1', OLS(offset=0)), ('1__f1', OLS(offset=0))]), (None, ((0, 8),), (8, 12), [('0__f2', OLS(offset=0)), ('1__f2', OLS(offset=0))])] Passing a dict estimators per cases >>> import numpy as np >>> from mlens.utils.dummy import OLS >>> from mlens.base import FoldIndex >>> from mlens.parallel.stack import _expand_instance_list >>> X = np.arange(12) >>> indexer = FoldIndex(3, X=X) >>> instance_list = {'a': [('%i' % i, OLS()) for i in range(2)], ... 'b': [('%i' % i, OLS(1)) for i in range(1)]} >>> _expand_instance_list(instance_list, indexer) [list of estimation tuples, beginning with main estimators] """ splits = indexer.n_splits if instance_list is None or len(instance_list) == 0: # Capture cases when there is no preprocessing to avoid running a # parallel job. return None elif isinstance(instance_list, dict): # We need to build fit list on a case basis # --- Full data --- # Estimators to be fitted on full data. 
List entries have format: # (case, no_train_idx, no_test_idx, est_list) # Each est_list have entries (inst_name, cloned_est) ls = [('%s' % case, None, None, [(n, clone(e)) for n, e in instance_list[case]]) for case in sorted(instance_list)] # --- Folds --- # Estimators to be fitted on each fold. List entries have format: # (case__fold_num, train_idx, test_idx, est_list) # Each est_list have entries (inst_name__fol_num, cloned_est) if indexer is not None: fd = [('%s__f%i' % (case, i % splits), tri, tei, [('%s__f%i' % (n, i % splits), clone(e)) for n, e in instance_list[case]]) for case in sorted(instance_list) for i, (tri, tei) in enumerate(indexer.generate()) ] ls.extend(fd) else: # No cases to worry about: expand the list of named instance tuples # --- Full data --- # Estimators to be fitted on full data. List entries have format: # (no_case, no_train_idx, no_test_idx, est_list) # Each est_list have entries (inst_name, cloned_est) ls = [(None, None, None, [(n, clone(e)) for n, e in instance_list])] # --- Folds --- # Estimators to be fitted on each fold. List entries have format: # (None, train_idx, test_idx, est_list) # Each est_list have entries (inst_name__fol_num, cloned_est) if indexer is not None: ls.extend([(None, tri, tei, [('%s__f%i' % (n, i % splits), clone(e)) for n, e in instance_list]) for i, (tri, tei) in enumerate(indexer.generate()) ]) return ls def _get_col_idx(instance_list, n_main, labels, n_feature_prop): """Utility for assigning columns ids to each fold-specific estimator. Parameters ---------- instance_list : list list of instances per case and per cv fold n_main : int number of main cases. labels : int number of labels to expand col_id with n_feature_prop : int number of features being propagated. Predictions are concatenated from the right. 
""" inc = 1 if labels is None else labels # Set up estimator column mapping # We select the main estimators by filtering out # fold-specific estimators and assigning each of the main ests a col_id idx, col = dict(), n_feature_prop for meta_name, _, _, estimator_list in instance_list[:n_main]: for est_name, _ in estimator_list: idx[(meta_name, est_name)] = col col += inc # Map every fold-specific estimator back onto the just created column # mapping for the final estimators. The fold-specific estimators should # have the same col_id as the main estimators. for meta_name_w_fold, _, _, estimator_list in instance_list[n_main:]: # 'meta_name__f0' > 'meta_name' try: # Fails if meta_name is None meta_name = '__'.join(meta_name_w_fold.split('__')[:-1]) except AttributeError: meta_name = None # Assign a column to estimators in belonging to the case__fold entry for est_name_w_fold, _ in estimator_list: # 'est_name__f0' > 'est_name' est_name = '__'.join(est_name_w_fold.split('__')[:-1]) idx[meta_name_w_fold, est_name_w_fold] = idx[meta_name, est_name] return idx