Source code for mlens.model_selection.model_selection

"""ML-ENSEMBLE

:author: Sebastian Flennerhag
:copyright: 2017
:licence: MIT

Class for parallel tuning a set of estimators that share a common
preprocessing pipeline.
"""

from __future__ import division

import gc
import sys
import numpy as np

from .. import config
from ..base import FoldIndex
from ..parallel import ParallelEvaluation
from ..utils import (print_time,
                     safe_print,
                     check_instances,
                     assert_correct_format,
                     check_inputs)

try:
    from time import perf_counter as time
except ImportError:
    from time import time

try:
    from collections import OrderedDict as _dict
except ImportError:
    _dict = dict

from operator import itemgetter
import warnings


def _check_scorer(scorer):
    """Check that the scorer instance passed behaves as expected."""
    if type(scorer).__name__ not in ['_PredictScorer', '_ProbaScorer']:
        raise ValueError("The passed scorer does not seem to be a valid "
                         "scorer. Expected type '_PredictScorer' or "
                         "'_ProbaScorer', got '%s'. Use the "
                         "mlens.metrics.make_scorer function to construct "
                         "a valid scorer." % type(scorer).__name__)


class Evaluator(object):

    r"""Model selection across several estimators and preprocessing pipelines.

    The :class:`Evaluator` allows users to evaluate several models in one
    call across a set of preprocessing pipelines. The class is useful for
    comparing a set of estimators, especially when several preprocessing
    pipelines are to be evaluated. By pre-making all folds and iteratively
    fitting estimators with different parameter settings, array slicing and
    preprocessing is kept to a minimum. This can greatly reduce fit time
    compared to creating a pipeline class for each estimator and pipeline
    and fitting them one at a time in a Scikit-learn
    :class:`sklearn.model_selection.GridSearchCV` class.

    Preprocessing can be done before making any evaluation, and several
    evaluations can be made on the pre-made folds. The current implementation
    relies on randomized grid search, so parameter grids must be specified
    as SciPy distributions (or any class that implements an ``rvs`` method).

    Parameters
    ----------
    scorer : function
        a scoring function that follows the Scikit-learn API::

            score = scorer(estimator, X, y)

        A user-defined scoring function, ``score = f(y_true, y_pred)``, can
        be made into a scorer with the ML-Ensemble implementation of
        Scikit-learn's ``make_scorer``. NOTE: do **not** use Scikit-learn's
        ``make_scorer`` if the Evaluator is to be pickled. ::

            from mlens.metrics import make_scorer
            scorer = make_scorer(scoring_function, **kwargs)

    error_score : int, optional
        score to assign when fitting an estimator fails. If ``None``, the
        evaluator will raise an error.

    cv : int or obj (default = 2)
        cross validation folds to use. Pass either the number of folds, or
        a ``KFold`` class that obeys the Scikit-learn API.

    metrics : list, optional
        list of aggregation metrics to calculate on scores. Default is
        mean and standard deviation.

    shuffle : bool (default = True)
        whether to shuffle input data before creating cv folds.

    random_state : int, optional
        seed for creating folds (if shuffled) and parameter draws.

    array_check : int (default = 2)
        level of strictness in checking input arrays.

            - ``array_check = 0`` will not check ``X`` or ``y``

            - ``array_check = 1`` will check ``X`` and ``y`` for
              inconsistencies and warn when format looks suspicious,
              but retain original format.

            - ``array_check = 2`` will impose Scikit-learn array checks,
              which converts ``X`` and ``y`` to numpy arrays and raises
              an error if conversion fails.

    backend : str, optional
        parallel backend to use. Defaults to ``mlens.config.BACKEND``.

    n_jobs : int (default = -1)
        number of CPU cores to use.

    verbose : bool or int (default = False)
        level of printed messages.

    Attributes
    ----------
    summary : dict
        Summary output that shows data for best mean test scores, such as
        test and train scores, std, fit times, and params.

    cv_results : dict
        a nested ``dict`` of data from each fit. Includes mean and std of
        test and train scores and fit times, as well as param draw index
        and parameters.
    """

    def __init__(self,
                 scorer,
                 cv=2,
                 shuffle=True,
                 random_state=None,
                 backend=None,
                 error_score=None,
                 metrics=None,
                 array_check=2,
                 n_jobs=-1,
                 verbose=False):

        self.cv = cv
        self.indexer = FoldIndex(cv)
        self.shuffle = shuffle
        self.backend = backend if backend is not None else config.BACKEND
        self.n_jobs = n_jobs
        self.error_score = error_score
        self.metrics = [np.mean, np.std] if metrics is None else metrics
        self.array_check = array_check
        self.random_state = random_state
        self.verbose = verbose

        self.evaluator = None
        self.preprocessing = None

        _check_scorer(scorer)
        self.scorer = scorer
        self.scores_ = None
    def initialize(self, X, y):
        """Set up :class:`ParallelEvaluation` job manager."""
        self.indexer.fit(X, y)
        self.evaluator = ParallelEvaluation(self)
        self.evaluator.initialize('evaluate', X, y)
    def terminate(self):
        """Terminate evaluation job."""
        self.evaluator.terminate()
        self.evaluator = None
        gc.collect()
    def fit(self, X, y, estimators, param_dicts, n_iter=2,
            preprocessing=None):
        """Fit the Evaluator to given data, estimators and preprocessing.

        Utility function that calls ``preprocess`` and ``evaluate``. The
        following is equivalent::

            # Explicitly calling preprocess and evaluate
            evaluator.preprocess(X, y, preprocessing)
            evaluator.evaluate(X, y, estimators, param_dicts, n_iter)

            # Calling fit
            evaluator.fit(X, y, estimators, param_dicts, n_iter,
                          preprocessing)

        Parameters
        ----------
        X : array-like, shape=[n_samples, n_features]
            input data to preprocess and create folds from.

        y : array-like, shape=[n_samples, ]
            training labels.

        estimators : list or dict
            set of estimators to use. If no preprocessing is desired, or if
            only one preprocessing pipeline should apply to all estimators,
            pass a list of estimators. The list can contain named tuples
            (i.e. ``('my_name', my_est)``).

            If different estimators should be mapped to preprocessing cases,
            pass a dictionary that maps estimators to each case:
            ``{'case_a': list_of_est, ...}``.

        param_dicts : dict
            parameter distribution mapping for estimators. The current
            implementation only supports randomized grid search, so each
            distribution object passed must have an ``rvs`` method.
            See :mod:`scipy.stats` for details.

            There is quite some flexibility in specifying ``param_dicts``.
            If there is no preprocessing, or if all estimators are fitted on
            all preprocessing cases, the ``param_dicts`` should have keys
            matching the names of the estimators. ::

                estimators = [('name', est), est]

                param_dicts = {'name': {'param-1': some_distribution},
                               'est': {'param-1': some_distribution}}

            It is possible to specify different distributions for some or
            all preprocessing cases::

                preprocessing = {'case-1': transformer_list,
                                 'case-2': transformer_list}

                estimators = [('name', est), est]

                param_dicts = {'name':
                                   {'param-1': some_distribution},
                               ('case-1', 'est'):
                                   {'param-1': some_distribution},
                               ('case-2', 'est'):
                                   {'param-1': some_distribution,
                                    'param-2': some_distribution}}

            If estimators are mapped on a per-preprocessing case basis as a
            dictionary, ``param_dicts`` must have key entries of the form
            ``(case_name, est_name)``.

        n_iter : int
            number of parameter draws to evaluate.

        preprocessing : dict, optional
            preprocessing cases to consider. Pass a dictionary mapping a
            case name to a preprocessing pipeline. ::

                preprocessing = {'case_name': transformer_list,}

        Returns
        -------
        self : instance
            class instance with stored estimator evaluation results.
        """
        if preprocessing is not None:
            self.preprocess(X, y, preprocessing)

        return self.evaluate(X, y, estimators, param_dicts, n_iter)
    def preprocess(self, X, y, preprocessing=None):
        """Preprocess folds.

        Method for preprocessing data separately from the evaluation method.
        Helpful if preprocessing is costly relative to estimator fitting and
        several ``evaluate`` calls might be desired.

        Parameters
        ----------
        X : array-like, shape=[n_samples, n_features]
            input data to preprocess and create folds from.

        y : array-like, shape=[n_samples, ]
            training labels.

        preprocessing : list or dict, optional
            preprocessing cases to consider. Pass a dictionary mapping a
            case name to a preprocessing pipeline. ::

                preprocessing = {'case_name': transformer_list,}

        Returns
        -------
        self : instance
            class instance with stored estimator evaluation results.
        """
        if preprocessing is None:
            raise ValueError("No preprocessing specified.")

        X, y = check_inputs(X, y, self.array_check)

        self.preprocessing = check_instances(preprocessing)

        if self.verbose > 0:
            printout = sys.stdout if self.verbose >= 50 else sys.stderr
            t0 = time()
            self._print_prep_start(t0, printout)

        self.initialize(X, y)

        try:
            self.evaluator.process('preprocess')
        finally:
            # Always terminate cache
            self.terminate()

        if self.verbose > 0:
            print_time(t0, 'Preprocessing done', file=printout)

        return self
    def evaluate(self, X, y, estimators, param_dicts, n_iter=2):
        """Evaluate set of estimators.

        Function for evaluating a set of estimators using cross validation.
        Similar to a randomized grid search, but applies the grid search to
        all specified preprocessing pipelines.

        Parameters
        ----------
        X : array-like, shape=[n_samples, n_features]
            input data to preprocess and create folds from.

        y : array-like, shape=[n_samples, ]
            training labels.

        estimators : list or dict
            set of estimators to use. If no preprocessing is desired, or if
            only one preprocessing pipeline should apply to all estimators,
            pass a list of estimators. The list can contain named tuples
            (i.e. ``('my_name', my_est)``).

            If different estimators should be mapped to preprocessing cases,
            pass a dictionary that maps estimators to each case:
            ``{'case_a': list_of_est, ...}``.

        param_dicts : dict
            parameter distribution mapping for estimators. The current
            implementation only supports randomized grid search, so each
            distribution object passed must have an ``rvs`` method.
            See :mod:`scipy.stats` for details.

            There is quite some flexibility in specifying ``param_dicts``.
            If there is no preprocessing, or if all estimators are fitted on
            all preprocessing cases, the ``param_dicts`` should have keys
            matching the names of the estimators. ::

                estimators = [('name', est), est]

                param_dicts = {'name': {'param-1': some_distribution},
                               'est': {'param-1': some_distribution}}

            It is possible to specify different distributions for some or
            all preprocessing cases::

                preprocessing = {'case-1': transformer_list,
                                 'case-2': transformer_list}

                estimators = [('name', est), est]

                param_dicts = {'name':
                                   {'param-1': some_distribution},
                               ('case-1', 'est'):
                                   {'param-1': some_distribution},
                               ('case-2', 'est'):
                                   {'param-1': some_distribution,
                                    'param-2': some_distribution}}

            If estimators are mapped on a per-preprocessing case basis as a
            dictionary, ``param_dicts`` must have key entries of the form
            ``(case_name, est_name)``.

        n_iter : int
            number of parameter draws to evaluate.

        Returns
        -------
        self : instance
            class instance with stored estimator evaluation results.
        """
        X, y = check_inputs(X, y, check_level=self.array_check)

        # First check if the list of estimators should be expanded to
        # every preprocessing case
        estimators, param_dicts = self._format(estimators, param_dicts)

        self.estimators = check_instances(estimators)
        self.n_iter = n_iter
        self._param_sets(param_dicts)

        if self.verbose > 0:
            printout = sys.stdout if self.verbose >= 50 else sys.stderr
            t0 = time()
            self._print_eval_start(printout)

        self.initialize(X, y)

        # Run evaluation
        try:
            self.evaluator.process('evaluate')
            self._collect()
        finally:
            # Always terminate job
            self.evaluator.terminate()
            del self.evaluator
            gc.collect()

        if self.verbose > 0:
            print_time(t0, 'Evaluation done', file=printout)

        return self
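    # Orientation for the private helpers below (descriptive comments added
    # by the editor; shapes are inferred from the methods that follow and
    # assume the default ``metrics=[np.mean, np.std]``):
    #
    #   self.scores_   : list of (case, est, draw_num, train_score,
    #                    test_score, fit_time) tuples, one per fold.
    #   self.cv_results: {name: {draw_num: {'test_score_mean': ...,
    #                    'test_score_std': ..., 'train_score_mean': ...,
    #                    'train_score_std': ..., 'fit_time_mean': ...,
    #                    'fit_time_std': ...}}}, where ``name`` is
    #                    ``est_name`` or ``(case_name, est_name)``.
    #   self.summary   : {metric: {name: value}} for the best draw of each
    #                    name, ordered by mean test score.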
    def _format(self, estimators, param_dicts):
        """Ensure estimator object and param_dict object have right format."""
        preprocessing = getattr(self, 'preprocessing', None)

        if preprocessing is not None and isinstance(estimators, list):
            # Set all estimators in the list as ests for each case
            ests_ = estimators
            estimators = {case: ests_ for case in preprocessing.keys()}

        # Set parameter draws for each case
        if preprocessing is not None:
            params = dict()
            for key, pars in param_dicts.items():
                if isinstance(key, tuple):
                    # Check that naming is of the (case, est) form
                    if key[0] not in preprocessing:
                        msg = ("param_dict poorly formatted. Valid keys are "
                               "'(case_name, est_name)' or 'est_name'."
                               " Failed on key entry {}. \nAll keys: "
                               "{}".format(key, list(preprocessing)))
                        raise ValueError(msg)

                    params[key] = pars
                else:
                    # Have an est_name key entry. Need to generate
                    # keys of the form (case, est)
                    for case in preprocessing.keys():
                        if (case, key) in params:
                            # Do not overwrite user-specified dists
                            continue
                        params[(case, key)] = pars
        else:
            params = param_dicts

        # Finally, check that estimators and preprocessing are correctly
        # formatted for estimation
        assert_correct_format(estimators, preprocessing)

        return estimators, params

    def _draw_params(self, param_dists):
        """Draw a list of param dictionaries for estimator."""
        # Set up empty list of parameter settings
        param_draws = [{} for _ in range(self.n_iter)]

        # Fill list of parameter settings by param
        for param, dist in param_dists.items():

            draws = dist.rvs(size=self.n_iter,
                             random_state=self.random_state)

            for i, draw in enumerate(draws):
                param_draws[i][param] = draw

        return param_draws

    def _set_params(self, param_dicts, key):
        """Try to set params, and on failure set an empty list."""
        try:
            self.params[key] = self._draw_params(param_dicts[key])
        except KeyError:
            # No param draws desired. Set empty dict.
            warnings.warn("No valid parameters found for {}. Will fit and "
                          "score once with given parameter "
                          "settings.".format(key))
            self.params[key] = [{}]

    def _param_sets(self, param_dicts):
        """For each estimator, create a mapping of parameter draws."""
        self.params = dict()

        if not self.preprocessing:
            # No preprocessing
            # the expected param_dicts key is 'est_name'
            for est_name, _ in self.estimators:
                self._set_params(param_dicts, est_name)
        else:
            # Preprocessing
            # Iterate over cases; the expected param_dicts key is
            # ('case_name', 'est_name')
            if isinstance(self.preprocessing, dict):
                for case in self.preprocessing:
                    for est_name, _ in self.estimators[case]:
                        self._set_params(param_dicts, (case, est_name))
            else:
                for est_name, _ in self.estimators:
                    self._set_params(param_dicts, (None, est_name))

    def _collect(self):
        """Collect output and format into dicts."""
        # Scores are returned as a list of tuples for each case, est, draw
        # and fold. We need to aggregate them up to case, est and draw level.
        scores = self._aggregate_scores()

        # To build the cv_results dictionary, we loop over the scores dict
        # and aggregate the lists created with the metrics specified.
        cv_res = self._get_results(scores)

        # Summarize the best draw for each case-est pair
        summary = self._summarize(cv_res)

        # Finally, sort the summary in order of best mean test score
        rank = [case_est for case_est, _ in
                sorted(summary['test_score_mean'].items(),
                       key=itemgetter(1), reverse=True)]

        pretty_summary = _dict()
        for metric, data in summary.items():
            pretty_summary[metric] = _dict()

            for case_est in rank:
                pretty_summary[metric][case_est] = data[case_est]

        self.cv_results = cv_res
        self.summary = pretty_summary

    def _summarize(self, cv_res):
        """For each case-estimator, return best param draw from cv results."""
        summary = _dict()
        for case_est, data in cv_res.items():

            # For each case and estimator, iterate over draws to find the
            # best test score
            best_data = None
            for draw_num, draw_data in data.items():

                if best_data is None:
                    best_data, best_draw = draw_data, draw_num
                    best_data['params'] = self.params[case_est][best_draw]

                if draw_data['test_score_mean'] > \
                        best_data['test_score_mean']:
                    best_data, best_draw = draw_data, draw_num
                    best_data['params'] = self.params[case_est][best_draw]

            # Assign data associated with best test score to summary dict.
            # We invert the dictionary nesting here.
            for metric, val in best_data.items():
                if metric not in summary:
                    summary[metric] = _dict()

                summary[metric][case_est] = val

        return summary

    def _aggregate_scores(self):
        """Aggregate scores to one list per case, est and param draw level."""
        scores = _dict()
        for case, est, draw_num, train_sc, test_sc, fit_time in self.scores_:

            # Strip fold data
            if case is not None:
                name = (case.split('__')[0], est.split('__')[0])
            else:
                name = est.split('__')[0]

            if name not in scores:
                scores[name] = _dict()

            if draw_num not in scores[name]:
                scores[name][draw_num] = _dict(test_score=[],
                                               train_score=[],
                                               fit_time=[])

            scores[name][draw_num]['test_score'].append(test_sc)
            scores[name][draw_num]['train_score'].append(train_sc)
            scores[name][draw_num]['fit_time'].append(fit_time)

        return scores

    def _get_results(self, scores):
        """Return score metrics for each case, est and param draw level."""
        cv_res = _dict()
        for name, case_est_data in scores.items():

            if name not in cv_res:
                cv_res[name] = _dict()

            for draw_num, draw_data in case_est_data.items():

                if draw_num not in cv_res[name]:
                    cv_res[name][draw_num] = _dict()

                for key, values in draw_data.items():
                    for n, m in zip(['mean', 'std'], self.metrics):
                        cv_res[name][draw_num]['%s_%s' % (key, n)] = \
                            m(values)

        return cv_res

    def _print_prep_start(self, t0, printout):
        """Print preprocessing start and return timer."""
        msg = 'Preprocessing %i preprocessing pipelines over %i CV folds'

        p = len(getattr(self, 'preprocessing', [1]))
        c = self.cv if isinstance(self.cv, int) else self.cv.n_splits
        safe_print(msg % (p, c), file=printout)
        return t0

    def _print_eval_start(self, printout):
        """Print initiation message and return timer."""
        preprocessing = getattr(self, 'preprocessing', None)

        if preprocessing is None:
            msg = ('Evaluating %i models for %i parameter draws over %i '
                   'CV folds, totalling %i fits')

            e, c, tot = self._get_count(preprocessing)
            safe_print(msg % (e, self.n_iter, c, tot), file=printout)
        else:
            msg = ('Evaluating %i models for %i parameter draws over %i '
                   'preprocessing pipelines and %i CV folds, '
                   'totalling %i fits')

            e, p, c, tot = self._get_count(preprocessing)
            safe_print(msg % (e, self.n_iter, p, c, tot), file=printout)

    def _get_count(self, preprocessing):
        """Utility for counting number of fits to make."""
        c = self.cv if isinstance(self.cv, int) else self.cv.n_splits

        if preprocessing is None:
            # Simply grab length of estimator list
            e = len(self.estimators)
            tot = e * c * self.n_iter
            return int(e), int(c), int(tot)
        else:
            # Need to consider cases
            p = len(preprocessing)

            if isinstance(self.estimators, list):
                # If all estimators are applied to all cases, just grab
                # the length of the list and multiply by the number of cases
                e = len(self.estimators) * p
            else:
                # Sum over cases
                e = 0
                for v in self.estimators.values():
                    e += len(v)

            tot = e * c * self.n_iter
            return int(e), int(p), int(c), int(tot)
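
# Usage sketch (illustrative only, not part of the original module). It
# follows the documented ``fit`` API; the estimator, transformer and
# parameter names are examples, and scikit-learn plus scipy are assumed
# to be installed.
#
#     from scipy.stats import randint, uniform
#     from sklearn.datasets import make_classification
#     from sklearn.ensemble import RandomForestClassifier
#     from sklearn.linear_model import LogisticRegression
#     from sklearn.preprocessing import MinMaxScaler, StandardScaler
#     from sklearn.metrics import accuracy_score
#     from mlens.metrics import make_scorer
#
#     X, y = make_classification(n_samples=500, random_state=1)
#
#     scorer = make_scorer(accuracy_score)
#     evaluator = Evaluator(scorer, cv=5, random_state=1, verbose=1)
#
#     preprocessing = {'std': [StandardScaler()],
#                      'minmax': [MinMaxScaler()]}
#     estimators = [('rf', RandomForestClassifier(random_state=1)),
#                   ('lr', LogisticRegression())]
#     param_dicts = {'rf': {'max_depth': randint(2, 10)},
#                    'lr': {'C': uniform(0.01, 10)}}
#
#     evaluator.fit(X, y, estimators, param_dicts, n_iter=10,
#                   preprocessing=preprocessing)
#     print(evaluator.summary)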