"""ML-ENSEMBLE
:author: Sebastian Flennerhag
:copyright: 2017
:licence: MIT
Estimation engine for parallel preprocessing of blend layer.
"""
from ._base_functions import predict_fold_est, fit, predict, construct_args
from .estimation import BaseEstimator
from ..externals.joblib import delayed
from ..externals.sklearn.base import clone
from copy import deepcopy
# Dispatch table: maps a job name to the function that executes it.
# 'predict_proba' is served by the same ``predict`` function as 'predict'.
FUNCS = dict(
    fit=fit,
    predict=predict,
    predict_proba=predict,
)
###############################################################################
class Blender(BaseEstimator):
    """Blended fit sub-process class.

    Class for fitting a Layer using Blending.

    Parameters
    ----------
    job : object
        job specification. Its ``dir`` attribute is stored on the instance
        and its ``job`` attribute (the job name) selects the function to
        execute.
    layer : object
        layer instance, forwarded to the base estimator.
    """

    def __init__(self, job, layer):
        super(Blender, self).__init__(layer=layer)
        self.dir = job.dir

        # NOTE(review): ``transform`` is not among the names imported at the
        # top of this module as shown -- confirm it is defined somewhere,
        # otherwise a 'transform' job raises NameError here.
        if job.job == 'transform':
            self.execute = transform
        else:
            self.execute = FUNCS[job.job]

        self.args = construct_args(self.execute, job)

    def run(self, parallel):
        """Execute the job by delegating to the base estimator's ``run``."""
        super(Blender, self).run(parallel)

    def _format_instance_list(self):
        """Expand estimators and preprocessing over the indexer's folds.

        Stores the expanded estimators on ``self.e`` and the expanded
        preprocessing on ``self.t``.
        """
        layer = self.layer
        self.e = _expand_instance_list(layer.estimators, layer.indexer)
        self.t = _expand_instance_list(layer.preprocessing, layer.indexer)

    def _get_col_id(self):
        """Assign a unique col_id to every estimator, stored on ``self.c``."""
        layer = self.layer
        # Fall back to one column per estimator when the layer has no
        # ``classes_`` attribute.
        n_labels = getattr(layer, 'classes_', 1)
        n_propagated = layer.n_feature_prop
        self.c = _get_col_idx(layer.preprocessing, layer.estimators,
                              n_labels, n_propagated)

    def _build_scores(self, s):
        """Build a cv-score mapping.

        ``s`` is an iterable of ``(key, value)`` pairs where each key has
        the form ``'<case>___<estimator>'``. The returned dict maps
        ``'<case>__<estimator>'`` (or just ``'<estimator>'`` when the case
        part is empty) to ``(value, 0.)``.
        """
        scores = {}
        for key, value in s:
            case_name, est_name = key.split('___')
            if case_name != '':
                label = '%s__%s' % (case_name, est_name)
            else:
                label = est_name
            scores[label] = (value, 0.)  # mean, std
        return scores
###############################################################################
def _expand_instance_list(instance_list, indexer=None):
"""Build a list of estimation tuples with train and test indices."""
if instance_list is None or len(instance_list) == 0:
# Capture cases when there is no preprocessing to avoid running a
# parallel job.
return None
elif isinstance(instance_list, dict):
# List entries have format:
# (case, train_idx, test_idx, est_list)
# Each est_list have entries (est_name, cloned_est)
if indexer is not None:
return [('%s' % case, tri, tei,
[('%s' % n, clone(e)) for n, e in instance_list[case]])
for case in sorted(instance_list)
for tri, tei in indexer.generate()
]
else:
# No cases to worry about: expand the list of named instance tuples
# List entries have format:
# ('inst', train_idx, test_idx, est_list)
# Each est_list have entries (est_name, cloned_est)
if indexer is not None:
return [(None, tri, tei,
[('%s' % n, clone(e)) for n, e in instance_list])
for tri, tei in indexer.generate()
]
def _get_col_idx(preprocessing, estimators, labels, n_feature_prop):
"""Utility for assigning each ``est`` in each ``prep`` a unique ``col_id``.
Parameters
----------
preprocessing : dict
dictionary of preprocessing cases.
estimators : dict
dictionary of lists of estimators per preprocessing case.
labels : int
number of labels to expand col_id with
n_feature_prop : int
number of features being propagated. Predictions are concatenated from
the right.
"""
inc = 1 if labels is None else labels
if isinstance(preprocessing, list) or preprocessing is None:
# Simple iteration of list
idx = {(None, inst_name): int(n_feature_prop + inc * i)
for i, (inst_name, _) in enumerate(estimators)}
else:
# Nested for loop required
case_list, idx, col = sorted(preprocessing), dict(), n_feature_prop
for case in case_list:
for inst_name, _ in estimators[case]:
idx[case, inst_name] = col
col += inc
return idx