Source code for mlens.parallel.subset

"""ML-ENSEMBLE

:author: Sebastian Flennerhag
:copyright: 2017
:licence: MIT

Estimation engine for parallel preprocessing of a subsemble layer.
"""

from .estimation import BaseEstimator
from ..externals.sklearn.base import clone


###############################################################################
class SubStacker(BaseEstimator):

    """Stacked subset fit sub-process class.

    Class for fitting a Layer using Subsemble.
    """

    def __init__(self, job, layer):
        super(SubStacker, self).__init__(layer=layer)
        self._default_initialization(job)

    def run(self, parallel):
        """Execute subsembling."""
        super(SubStacker, self).run(parallel)

    def _format_instance_list(self):
        """Expand the instance lists to every fold with associated indices."""
        self.e = _expand_instance_list(self.layer.estimators,
                                       self.layer.indexer)

        self.t = _expand_instance_list(self.layer.preprocessing,
                                       self.layer.indexer)

    def _get_col_id(self):
        """Assign unique col_id to every estimator."""
        c = getattr(self.layer, 'classes_', 1)
        p = len(self.layer.cases) * self.layer.indexer.n_partitions
        k = self.layer.n_feature_prop
        self.c = _get_col_idx(self.e, p, c, k)
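
# A minimal sketch of the column arithmetic behind ``SubStacker._get_col_id``,
# using hypothetical numbers (two preprocessing cases, three partitions, two
# base learners per case, two classes, one propagated feature): predictions
# are written to the right of the propagated feature columns, and every main
# estimator claims one column per class.
#
#     n_main      = n_cases * n_partitions          = 2 * 3     = 6
#     pred_cols   = n_main * n_learners * n_classes = 6 * 2 * 2 = 24
#     total_width = n_feature_prop + pred_cols      = 1 + 24    = 25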
###############################################################################
def _expand_instance_list(instance_list, indexer):
    """Build a list of subset-specific estimation tuples with train and test idx.

    The subset's ``_expand_instance_list`` function expands a list of base
    learners in two dimensions:

        1. Partitions
        2. Folds

    For each partition, the full learner library is copied as final estimators
    on that partition. The full learner library is then copied again for each
    fold within that partition, and these estimators are used for building the
    Z matrix of dimensions n * (J * L), where n is the number of training
    samples, J the number of partitions, and L the number of base learners.

    Examples
    --------
    Passing a list of estimators

    >>> import numpy as np
    >>> from mlens.utils.dummy import OLS
    >>> from mlens.base import SubsetIndex
    >>> from mlens.parallel.subset import _expand_instance_list
    >>> X = np.arange(12)
    >>> indexer = SubsetIndex(3, X=X)
    >>> instance_list = [('%i' % i, OLS()) for i in range(2)]
    >>> _expand_instance_list(instance_list, indexer)
    [list of estimation tuples, beginning with main estimators]

    Passing a dict of estimators per case

    >>> import numpy as np
    >>> from mlens.utils.dummy import OLS
    >>> from mlens.base import SubsetIndex
    >>> from mlens.parallel.subset import _expand_instance_list
    >>> X = np.arange(12)
    >>> indexer = SubsetIndex(3, X=X)
    >>> instance_list = {'a': [('%i' % i, OLS()) for i in range(2)],
    ...                  'b': [('%i' % i, OLS(1)) for i in range(1)]}
    >>> _expand_instance_list(instance_list, indexer)
    [list of estimation tuples, beginning with main estimators]
    """
    splits = indexer.n_splits

    if instance_list is None or len(instance_list) == 0:
        # Capture cases when there is no preprocessing to avoid running a
        # parallel job.
        return None

    elif isinstance(instance_list, dict):
        # We need to build the fit list on a case basis

        # --- Full data ---
        # Estimators to be fitted on full data. List entries have format:
        # (case, no_train_idx, no_test_idx, est_list)
        # Each est_list has entries (inst_name, cloned_est)
        ls = [('%s__j%i' % (case, j), partition, None,
               [(n, clone(e)) for n, e in instance_list[case]])
              for case in sorted(instance_list)
              for j, partition in enumerate(indexer.partition())]

        # --- Folds ---
        # Estimators to be fitted on each fold. List entries have format:
        # (case__fold_num, train_idx, test_idx, est_list)
        # Each est_list has entries (inst_name__fold_num, cloned_est)
        if indexer is not None:
            fd = [('%s__j%i__f%i' % (case, i // splits, i % splits),
                   tri, tei,
                   [('%s__f%i' % (n, i % splits), clone(e))
                    for n, e in instance_list[case]])
                  for case in sorted(instance_list)
                  for i, (tri, tei) in enumerate(indexer.generate())
                  ]
            ls.extend(fd)
    else:
        # No cases to worry about: expand the list of named instance tuples

        # --- Full data ---
        # Estimators to be fitted on full data. List entries have format:
        # (no_case, no_train_idx, no_test_idx, est_list)
        # Each est_list has entries (inst_name, cloned_est)
        ls = [('j%i' % i, partition, None,
               [(n, clone(e)) for n, e in instance_list])
              for i, partition in enumerate(indexer.partition())]

        # --- Folds ---
        # Estimators to be fitted on each fold. List entries have format:
        # (fold_num, train_idx, test_idx, est_list)
        # Each est_list has entries (inst_name__fold_num, cloned_est)
        if indexer is not None:
            ls.extend([('j%i__f%i' % (i // splits, i % splits),
                        tri, tei,
                        [('%s__f%i' % (n, i % splits), clone(e))
                         for n, e in instance_list])
                       for i, (tri, tei) in enumerate(indexer.generate())
                       ])
    return ls


def _get_col_idx(instance_list, n_main, labels, n_feature_prop):
    """Utility for assigning column ids to each subset-specific estimator.

    Parameters
    ----------
    instance_list : list
        list of instances per case and per cv fold

    n_main : int
        number of main cases. Either ``n_partitions`` or
        ``n_partitions * n_cases``.

    labels : int
        number of labels to expand col_id with

    n_feature_prop : int
        number of features being propagated. Predictions are concatenated
        from the right.
    """
    inc = 1 if labels is None else labels

    # Set up estimator column mapping
    # We select the main estimators by filtering out fold-specific estimators
    # and assigning each of the main ests a col_id
    idx, col = dict(), n_feature_prop
    for meta_name, _, _, estimator_list in instance_list[:n_main]:
        for est_name, _ in estimator_list:
            idx[(meta_name, est_name)] = col
            col += inc

    # Map every fold-specific estimator back onto the just created column
    # mapping for the final estimators:
    # the fold-specific estimators in a partition j and fold f should have
    # the same col_id as the main estimators for partition j.
    for meta_name_w_fold, _, _, estimator_list in instance_list[n_main:]:
        # 'case__j0__f0' > 'case__j0' or 'j0__f0' > 'j0'
        meta_name = '__'.join(meta_name_w_fold.split('__')[:-1])

        # Assign a column to estimators belonging to the case__fold entry
        for est_name_w_fold, _ in estimator_list:
            # 'est_name__f0' > 'est_name'
            est_name = '__'.join(est_name_w_fold.split('__')[:-1])

            idx[meta_name_w_fold, est_name_w_fold] = idx[meta_name, est_name]

    return idx
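

###############################################################################
# A minimal end-to-end sketch tying the two helpers together. It reuses the
# dummy ``OLS`` estimator and the ``SubsetIndex`` setup from the docstring
# examples above; the learner names ('ols-0', 'ols-1') are illustrative only.
if __name__ == '__main__':
    import numpy as np

    from mlens.base import SubsetIndex
    from mlens.utils.dummy import OLS

    X = np.arange(12)
    indexer = SubsetIndex(3, X=X)
    instances = [('ols-%i' % i, OLS()) for i in range(2)]

    # Main (partition-level) entries come first, fold-level entries after.
    expanded = _expand_instance_list(instances, indexer)

    # With a plain list (no preprocessing cases), the number of main entries
    # equals the number of partitions; no labels or propagated features here.
    col_map = _get_col_idx(expanded, 3, None, 0)

    # Fold-specific copies share the column of their partition's main
    # estimator: ('j0__f0', 'ols-0__f0') maps to the column of ('j0', 'ols-0').
    assert col_map[('j0__f0', 'ols-0__f0')] == col_map[('j0', 'ols-0')]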