"""ML-ENSEMBLE
:author: Sebastian Flennerhag
:copyright: 2017
:licence: MIT
Estimation engine for parallel preprocessing of stacked layer.
"""
from .estimation import BaseEstimator
from ..externals.sklearn.base import clone
###############################################################################
[docs]class Stacker(BaseEstimator):
"""Stacked fit sub-process class.
Class for fitting a Layer using Stacking.
"""
def __init__(self, job, layer):
super(Stacker, self).__init__(layer=layer)
self._default_initialization(job)
[docs] def run(self, parallel):
"""Execute stacking."""
super(Stacker, self).run(parallel)
def _format_instance_list(self):
"""Expand the instance lists to every fold with associated indices."""
self.e = _expand_instance_list(self.layer.estimators,
self.layer.indexer)
self.t = _expand_instance_list(self.layer.preprocessing,
self.layer.indexer)
def _get_col_id(self):
"""Assign unique col_id to every estimator."""
c = getattr(self.layer, 'classes_', 1)
p = len(self.layer.cases)
k = self.layer.n_feature_prop
self.c = _get_col_idx(self.e, p, c, k)
###############################################################################
def _expand_instance_list(instance_list, indexer):
"""Build a list of fold-specific estimation tuples w. train and test idx.
The full learner library is copied for each fold and used for building
the Z matrix of dimensions n * (L), where n is the number of training
samples and L number of base learners.
Examples
--------
Passing a list estimators
>>> import numpy as np
>>> from mlens.utils.dummy import OLS
>>> from mlens.base import FoldIndex
>>> from mlens.parallel.stack import _expand_instance_list
>>> X = np.arange(12)
>>> indexer = FoldIndex(3, X=X)
>>> instance_list = [('%i' % i, OLS()) for i in range(2)]
>>> _expand_instance_list(instance_list, indexer)
[(None, None, None, [('0', OLS(offset=0)), ('1', OLS(offset=0))]),
(None,
((4, 12),),
(0, 4),
[('0__f0', OLS(offset=0)), ('1__f0', OLS(offset=0))]),
(None,
((0, 4), (8, 12)),
(4, 8),
[('0__f1', OLS(offset=0)), ('1__f1', OLS(offset=0))]),
(None,
((0, 8),),
(8, 12),
[('0__f2', OLS(offset=0)), ('1__f2', OLS(offset=0))])]
Passing a dict estimators per cases
>>> import numpy as np
>>> from mlens.utils.dummy import OLS
>>> from mlens.base import FoldIndex
>>> from mlens.parallel.stack import _expand_instance_list
>>> X = np.arange(12)
>>> indexer = FoldIndex(3, X=X)
>>> instance_list = {'a': [('%i' % i, OLS()) for i in range(2)],
... 'b': [('%i' % i, OLS(1)) for i in range(1)]}
>>> _expand_instance_list(instance_list, indexer)
[list of estimation tuples, beginning with main estimators]
"""
splits = indexer.n_splits
if instance_list is None or len(instance_list) == 0:
# Capture cases when there is no preprocessing to avoid running a
# parallel job.
return None
elif isinstance(instance_list, dict):
# We need to build fit list on a case basis
# --- Full data ---
# Estimators to be fitted on full data. List entries have format:
# (case, no_train_idx, no_test_idx, est_list)
# Each est_list have entries (inst_name, cloned_est)
ls = [('%s' % case, None, None,
[(n, clone(e)) for n, e in instance_list[case]])
for case in sorted(instance_list)]
# --- Folds ---
# Estimators to be fitted on each fold. List entries have format:
# (case__fold_num, train_idx, test_idx, est_list)
# Each est_list have entries (inst_name__fol_num, cloned_est)
if indexer is not None:
fd = [('%s__f%i' % (case, i % splits),
tri,
tei,
[('%s__f%i' % (n, i % splits), clone(e)) for n, e in
instance_list[case]])
for case in sorted(instance_list)
for i, (tri, tei) in enumerate(indexer.generate())
]
ls.extend(fd)
else:
# No cases to worry about: expand the list of named instance tuples
# --- Full data ---
# Estimators to be fitted on full data. List entries have format:
# (no_case, no_train_idx, no_test_idx, est_list)
# Each est_list have entries (inst_name, cloned_est)
ls = [(None, None, None, [(n, clone(e)) for n, e in instance_list])]
# --- Folds ---
# Estimators to be fitted on each fold. List entries have format:
# (None, train_idx, test_idx, est_list)
# Each est_list have entries (inst_name__fol_num, cloned_est)
if indexer is not None:
ls.extend([(None,
tri,
tei,
[('%s__f%i' % (n, i % splits), clone(e)) for n, e in
instance_list])
for i, (tri, tei) in enumerate(indexer.generate())
])
return ls
def _get_col_idx(instance_list, n_main, labels, n_feature_prop):
"""Utility for assigning columns ids to each fold-specific estimator.
Parameters
----------
instance_list : list
list of instances per case and per cv fold
n_main : int
number of main cases.
labels : int
number of labels to expand col_id with
n_feature_prop : int
number of features being propagated. Predictions are concatenated from
the right.
"""
inc = 1 if labels is None else labels
# Set up estimator column mapping
# We select the main estimators by filtering out
# fold-specific estimators and assigning each of the main ests a col_id
idx, col = dict(), n_feature_prop
for meta_name, _, _, estimator_list in instance_list[:n_main]:
for est_name, _ in estimator_list:
idx[(meta_name, est_name)] = col
col += inc
# Map every fold-specific estimator back onto the just created column
# mapping for the final estimators. The fold-specific estimators should
# have the same col_id as the main estimators.
for meta_name_w_fold, _, _, estimator_list in instance_list[n_main:]:
# 'meta_name__f0' > 'meta_name'
try:
# Fails if meta_name is None
meta_name = '__'.join(meta_name_w_fold.split('__')[:-1])
except AttributeError:
meta_name = None
# Assign a column to estimators in belonging to the case__fold entry
for est_name_w_fold, _ in estimator_list:
# 'est_name__f0' > 'est_name'
est_name = '__'.join(est_name_w_fold.split('__')[:-1])
idx[meta_name_w_fold, est_name_w_fold] = idx[meta_name, est_name]
return idx