Source code for mlens.utils.validation

"""ML-ENSEMBLE

:author: Sebastian Flennerhag
:copyright: 2017
:license: MIT

Input validation module. Builds on Scikit-learn's ``validation`` module, but
extends it with soft checks that issue warnings instead of forcing changes
to the inputs.
"""

import warnings

import numpy as np
import scipy.sparse as sp

from ..externals import six
from mlens.externals.sklearn.validation import check_X_y, _num_samples, \
    _shape_repr, check_array, check_consistent_length
from ..utils.exceptions import InputDataWarning, NonBLASDotWarning

FLOAT_DTYPES = (np.float64, np.float32, np.float16)

# Silenced by default to reduce verbosity. Turn on at runtime for
# performance profiling.
warnings.simplefilter('ignore', NonBLASDotWarning)
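

# Usage sketch (hypothetical helper, illustrative only): how a caller could
# re-enable the silenced warning at runtime, e.g. when profiling whether
# BLAS-backed dot products are being used.
def _example_enable_nonblas_warning():
    """Sketch: turn ``NonBLASDotWarning`` back on for this process."""
    warnings.simplefilter('always', NonBLASDotWarning)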


def _get_context(estimator=None):
    """Get context name for warning messages."""
    if estimator is not None:
        if isinstance(estimator, six.string_types):
            estimator_name = estimator.lower()
        else:
            estimator_name = estimator.__class__.__name__.lower()

        estimator_name = "[%s] " % estimator_name
    else:
        estimator_name = ""

    return estimator_name
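

# Usage sketch (hypothetical helper, illustrative only): the warning prefix
# produced for a string name and for no estimator.
def _example_get_context():
    """Sketch: ``_get_context`` builds the prefix used in warnings below."""
    assert _get_context("MyEstimator") == "[myestimator] "
    assert _get_context(None) == ""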


def soft_check_array(array, accept_sparse=True, dtype=None, ensure_2d=True,
                     force_all_finite=True, allow_nd=True,
                     ensure_min_samples=1, ensure_min_features=1,
                     estimator=None):
    """Input validation on an array, list, sparse matrix or similar.

    Like Scikit-learn's ``check_array``, but issues warnings on failed tests
    and performs no forced array conversion.

    Parameters
    ----------
    array : array-like
        Input object, expected to be array-like, to check / convert.

    accept_sparse : string, list of string or None (default=True)
        String[s] representing allowed sparse matrix formats, such as 'csc',
        'csr', etc. None means that sparse matrix input will raise an error.
        If the input is sparse but not in an allowed format, a warning
        recommends conversion to the first listed format.

    dtype : string, type, list of types or None (default=None)
        Expected data type of the input. If None, the dtype of the input is
        accepted as is. If "numeric", a warning is raised if array.dtype is
        object. If dtype is a list of types, a warning is raised if
        array.dtype is not a member of the list.

    force_all_finite : boolean (default=True)
        Whether to warn on np.inf and np.nan in X.

    ensure_2d : boolean (default=True)
        Whether to warn if X is not at least 2d.

    allow_nd : boolean (default=True)
        Whether to allow X.ndim > 2.

    ensure_min_samples : int (default=1)
        Make sure that the array has a minimum number of samples in its
        first axis (rows for a 2D array). Setting to 0 disables this check.

    ensure_min_features : int (default=1)
        Make sure that the 2D array has some minimum number of features
        (columns). The default value of 1 rejects empty datasets. This check
        is only enforced when the input data has effectively 2 dimensions or
        is originally 1D and ``ensure_2d`` is True. Setting to 0 disables
        this check.

    estimator : str or estimator instance (default=None)
        If passed, include the name of the estimator in warning messages.

    Returns
    -------
    CHANGE : bool
        Whether X should be changed.
    """
    # Set initial change flag to False. Will be set to True if any test fails.
    CHANGE = False

    context = _get_context(estimator)

    # ---- Check dtype -----
    # store whether originally we wanted numeric dtype
    dtype_numeric = dtype == "numeric"

    # Get input array's dtype
    dtype_orig = getattr(array, "dtype", None)
    if not hasattr(dtype_orig, 'kind'):
        # not a data type (e.g. a column named dtype in a pandas DataFrame)
        dtype_orig = None

    if dtype_numeric:
        # We want to check that the dtype is numeric.
        if dtype_orig is not None and dtype_orig.kind == "O":
            dtype = np.float64
        else:
            dtype = None

    wrong_dtype = False
    if dtype is not None:
        if isinstance(dtype, (list, tuple)):
            wrong_dtype = dtype_orig is not None and dtype_orig not in dtype
        else:
            wrong_dtype = dtype_orig is not None and dtype_orig != dtype

    if wrong_dtype:
        CHANGE = True
        msg = ("%sDtype of input array not the expected type [dtype: %s]. "
               "Consider changing to %r")
        warnings.warn(msg % (context, dtype_orig, dtype), InputDataWarning)

    # ----- check array shape ------
    if isinstance(accept_sparse, str):
        accept_sparse = [accept_sparse]

    if sp.issparse(array):
        CHANGE = _check_sparse_format(array, accept_sparse, dtype,
                                      force_all_finite, context)
    else:
        # Check if X is 2d
        if ensure_2d:
            if array.ndim == 1:
                if (ensure_min_samples >= 2) and (len(array) == 1):
                    # Raise error if we want X to be 2d, but only have one obs
                    raise ValueError("%sexpected at least 2 samples provided "
                                     "in a 2 dimensional array-like input"
                                     % context)

                # Else, flag for bad formatting
                CHANGE = True
                msg = ("%sX is one-dimensional. Reshape your data either "
                       "using X.reshape(-1, 1) if your data has a single "
                       "feature or X.reshape(1, -1) if it contains a single "
                       "sample.")
                warnings.warn(msg % context, InputDataWarning)

        # Check for number of dimensions
        if not allow_nd and array.ndim >= 3:
            warnings.warn("%sFound array with dim %d. %s expected <= 2."
                          % (context, array.ndim, context), InputDataWarning)

        # Check for finite inputs
        if force_all_finite:
            ALL_FINITE = _check_all_finite(array)
            if not ALL_FINITE:
                CHANGE = True
                msg = ("%sNot all elements in array are finite. This may "
                       "cause estimation problems. Consider nan conversion "
                       "and replacing infinite values.")
                warnings.warn(msg % context, InputDataWarning)

    # Check shape
    try:
        shape_repr = _shape_repr(array.shape)
    except Exception as e:
        CHANGE = True
        warnings.warn("%sCannot infer shape of input data: may not be "
                      "a suitable data type for estimation. Will proceed "
                      "without checking dimensionality. "
                      "Details:\n%r" % (context, e), InputDataWarning)
        shape_repr = 'NaN'

    if ensure_min_samples > 0:
        try:
            n_samples = _num_samples(array)
        except Exception as e:
            CHANGE = True
            warnings.warn("%sCannot infer sample size of input data: may "
                          "not be a suitable data type for estimation. "
                          "Will proceed without checking sample size. "
                          "Details:\n%r" % (context, e), InputDataWarning)
            n_samples = np.inf

        if n_samples < ensure_min_samples:
            CHANGE = True
            msg = ("%sFound array with %d sample(s) (shape=%s) "
                   "while a minimum of %d is required.")
            warnings.warn(msg % (context, n_samples, shape_repr,
                                 ensure_min_samples), InputDataWarning)

    if ensure_min_features > 0 and array.ndim == 2:
        try:
            n_features = array.shape[1]
        except Exception as e:
            CHANGE = True
            warnings.warn("%sCannot infer feature size of input data: may "
                          "not be a suitable data type for estimation. "
                          "Will proceed without checking feature size. "
                          "Details:\n%r" % (context, e), InputDataWarning)
            n_features = np.inf

        if n_features < ensure_min_features:
            CHANGE = True
            msg = ("%sFound array with %d feature(s) (shape=%s) while "
                   "a minimum of %d is required.")
            warnings.warn(msg % (context, n_features, shape_repr,
                                 ensure_min_features), InputDataWarning)

    if CHANGE:
        warnings.warn("%sInput data failed initial test. Estimation may "
                      "fail. Consider converting input data to a numpy "
                      "array with finite elements and no missing values."
                      % context, InputDataWarning)

    return CHANGE
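

# Usage sketch (hypothetical helper, illustrative only): the soft check only
# warns about a one-dimensional, non-finite input and reports whether a
# change is recommended; the array itself is left untouched.
def _example_soft_check_array():
    """Sketch: soft-checking a 1d array containing a NaN."""
    X = np.array([1.0, 2.0, np.nan])
    needs_change = soft_check_array(X)  # issues InputDataWarning(s)
    return needs_change                 # True, but X is not modified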


def _check_all_finite(X):
    """General check for all finite values in X."""
    # First try an O(n) time, O(1) space solution for the common case that
    # everything is finite; fall back to O(n) space np.isfinite to prevent
    # false positives from overflow in the sum method.
    try:
        if (X.dtype.char in np.typecodes['AllFloat'] and
                not np.isfinite(X.sum()) and
                not np.isfinite(X).all()):
            return False
        else:
            return True
    except Exception as e:
        warnings.warn('Could not check array for all finite. Ensure X is an '
                      'array type, and consider converting to an ndarray or '
                      'scipy sparse array. Details:\n%r' % e,
                      InputDataWarning)


def check_all_finite(X):
    """Return False if X contains NaN or infinity."""
    return _check_all_finite(X.data if sp.issparse(X) else X)
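

# Usage sketch (hypothetical helper, illustrative only): the finiteness check
# accepts both dense and sparse inputs.
def _example_check_all_finite():
    """Sketch: finiteness checks on dense and sparse data."""
    dense = np.array([[1.0, 2.0], [3.0, np.inf]])
    sparse = sp.csr_matrix(np.array([[0.0, 1.0], [2.0, 0.0]]))
    return check_all_finite(dense), check_all_finite(sparse)  # (False, True)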


def _check_sparse_format(spmatrix, accept_sparse=True, dtype=None,
                         force_all_finite=True, context=""):
    """Check if a sparse array needs format changes.

    Checks the sparse format of spmatrix and alerts if changes are
    recommended. Like Scikit-learn's ``_assert_sparse_format``, but without
    forced conversion.

    Parameters
    ----------
    spmatrix : scipy sparse matrix
        Input to validate.

    accept_sparse : string, list of string or None (default=True)
        String[s] representing allowed sparse matrix formats ('csc', 'csr',
        'coo', 'dok', 'bsr', 'lil', 'dia'). None means that sparse matrix
        input will raise an error. If the input is sparse but not in an
        allowed format, a warning recommends conversion to the first listed
        format.

    dtype : string, type or None (default=None)
        Expected data type of the input. If None, the dtype of the input is
        accepted as is.

    force_all_finite : boolean (default=True)
        Whether to warn on np.inf and np.nan in X.

    context : str
        Contextual message to begin warnings with.

    Returns
    -------
    CHANGE : bool
        False if no change is required, True if change is required.
    """
    if accept_sparse in [None, False]:
        raise TypeError('%sA sparse matrix was passed, but dense '
                        'data is required. Use X.toarray() to '
                        'convert to a dense numpy array.' % context)

    if dtype is None:
        dtype = spmatrix.dtype

    CHANGE_FORMAT = False
    if (isinstance(accept_sparse, (list, tuple)) and
            spmatrix.format not in accept_sparse):
        CHANGE_FORMAT = True

    if CHANGE_FORMAT:
        msg = ("%sSparse format not one of recommended [format: %s]. "
               "Consider changing to one of %r")
        warnings.warn(msg % (context, spmatrix.format, accept_sparse),
                      InputDataWarning)

    CHANGE_DTYPE = False
    if dtype != spmatrix.dtype:
        # dtype differs from the expected type
        CHANGE_DTYPE = True

    if CHANGE_DTYPE:
        msg = ("%sDtype of sparse array not the expected type [dtype: %s]. "
               "Consider changing to %r")
        warnings.warn(msg % (context, spmatrix.dtype, dtype),
                      InputDataWarning)

    ALL_FINITE = True
    if force_all_finite:
        if not hasattr(spmatrix, "data"):
            msg = "%sCan't check %s sparse matrix for nan or inf."
            warnings.warn(msg % (context, spmatrix.format))
        else:
            ALL_FINITE = check_all_finite(spmatrix.data)

    if not ALL_FINITE:
        msg = ("%sNot all elements in array are finite. This may cause "
               "estimation problems. Consider nan conversion and replacing "
               "infinite values.")
        warnings.warn(msg % context, InputDataWarning)

    return CHANGE_DTYPE or CHANGE_FORMAT or not ALL_FINITE
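

# Usage sketch (hypothetical helper, illustrative only): a CSC matrix checked
# against a CSR-only recommendation triggers a format warning but is not
# converted.
def _example_check_sparse_format():
    """Sketch: format recommendation for a sparse matrix."""
    S = sp.csc_matrix(np.eye(3))
    return _check_sparse_format(S, accept_sparse=['csr'])  # warns, True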


def soft_check_x_y(X, y, accept_sparse=True, dtype=None,
                   force_all_finite=True, ensure_2d=True, allow_nd=True,
                   multi_output=False, ensure_min_samples=1,
                   ensure_min_features=1, y_numeric=False, estimator=None):
    """Input validation before estimation.

    Checks X and y for consistent length, and X 2d and y 1d. Standard input
    checks are only applied to y, such as checking that y does not have
    np.nan or np.inf targets. For multi-label y, set multi_output=True to
    allow 2d and sparse y. Raises warnings if the dtype is object.

    Parameters
    ----------
    X : nd-array, list or sparse matrix
        Input data.

    y : nd-array, list or sparse matrix
        Labels.

    accept_sparse : string, list of string or None (default=True)
        String[s] representing allowed sparse matrix formats, such as 'csc',
        'csr', etc. None means that sparse matrix input will raise an error.
        If the input is sparse but not in an allowed format, a warning
        recommends conversion to the first listed format.

    dtype : string, type, list of types or None (default=None)
        Expected data type of the input. If None, the dtype of the input is
        accepted as is. If "numeric", a warning is raised if array.dtype is
        object. If dtype is a list of types, a warning is raised if the
        dtype of the input is not in the list.

    force_all_finite : boolean (default=True)
        Whether to warn on np.inf and np.nan in X. This parameter does not
        influence whether y can have np.inf or np.nan values.

    ensure_2d : boolean (default=True)
        Whether to warn if X is not at least 2d.

    allow_nd : boolean (default=True)
        Whether to allow X.ndim > 2.

    multi_output : boolean (default=False)
        Whether to allow 2d y (array or sparse matrix). If False, y is
        validated as a vector. y cannot have np.nan or np.inf values if
        multi_output=True.

    ensure_min_samples : int (default=1)
        Make sure that X has a minimum number of samples in its first axis
        (rows for a 2D array).

    ensure_min_features : int (default=1)
        Make sure that the 2D array has some minimum number of features
        (columns). The default value of 1 rejects empty datasets. This check
        is only enforced when X has effectively 2 dimensions or is
        originally 1D and ``ensure_2d`` is True. Setting to 0 disables this
        check.

    y_numeric : boolean (default=False)
        Whether to ensure that y has a numeric type. If the dtype of y is
        object, a warning is raised. Should only be used for regression
        algorithms.

    estimator : str or estimator instance (default=None)
        If passed, include the name of the estimator in warning messages.

    Returns
    -------
    CHANGE : bool
        Whether X or y should be changed.
    """
    # ------ Check X ------
    # Use keyword arguments: the parameter order of soft_check_array differs
    # from the argument order here.
    CHANGE_X = soft_check_array(X, accept_sparse=accept_sparse, dtype=dtype,
                                ensure_2d=ensure_2d,
                                force_all_finite=force_all_finite,
                                allow_nd=allow_nd,
                                ensure_min_samples=ensure_min_samples,
                                ensure_min_features=ensure_min_features,
                                estimator=estimator)

    # ------ Check y ------
    if multi_output:
        CHANGE_y = soft_check_array(y, accept_sparse=['csr'],
                                    force_all_finite=force_all_finite,
                                    ensure_2d=False, dtype=dtype,
                                    estimator=estimator)
    else:
        CHANGE_y = soft_check_1d(y, y_numeric, estimator)

    # Check consistent lengths. This raises an error if the test fails.
    check_consistent_length(X, y)

    return CHANGE_X or CHANGE_y
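

# Usage sketch (hypothetical helper, illustrative only): a 2d X with a 1d
# numeric y passes silently; a column-vector y would only raise an
# InputDataWarning rather than an error.
def _example_soft_check_x_y():
    """Sketch: joint soft check of X and y."""
    X = np.arange(6, dtype=np.float64).reshape(3, 2)
    y = np.array([0.0, 1.0, 0.0])
    return soft_check_x_y(X, y)  # False: no changes recommended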


def soft_check_1d(y, y_numeric, estimator):
    """Check if y is numeric, finite and one-dimensional."""
    context = _get_context(estimator)

    CHANGE_y = _check_column_or_1d(y)

    ALL_FINITE = _check_all_finite(y)
    if not ALL_FINITE:
        CHANGE_y = True
        msg = ("%sNot all elements in array are finite. This may "
               "cause estimation problems. Consider nan conversion "
               "and replacing infinite values.")
        warnings.warn(msg % context, InputDataWarning)

    if y_numeric and y.dtype.kind == 'O':
        CHANGE_y = True
        msg = ("%sDtype of y not the expected type [dtype: %s]. "
               "Consider changing to 'float' or 'int'.")
        warnings.warn(msg % (context, y.dtype.kind), InputDataWarning)

    if CHANGE_y:
        msg = ("%sy array failed initial test. Estimation may fail. "
               "Consider converting input data to a numpy array with "
               "finite elements and no missing values.")
        warnings.warn(msg % context, InputDataWarning)

    return CHANGE_y
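

# Usage sketch (hypothetical helper, illustrative only): an object-dtype
# label vector is flagged when ``y_numeric`` is requested.
def _example_soft_check_1d():
    """Sketch: soft check of an object-dtype label vector."""
    y = np.array(['a', 'b', 'a'], dtype=object)
    return soft_check_1d(y, y_numeric=True, estimator="demo")  # warns, True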


def _check_column_or_1d(y, context=""):
    """Check if y can be raveled."""
    CHANGE = False

    try:
        s = tuple(np.shape(y))
    except Exception as e:
        raise ValueError("%sCould not get shape of y. "
                         "y should be an ndarray or scipy sparse csr "
                         "/csc matrix of shape (n_samples, ). Got %s. "
                         "Details:\n%r" % (context, type(y), e))

    if len(s) == 0:
        raise ValueError("%sy is empty: y = %r." % (context, y))

    if len(s) == 2 and s[1] == 1:
        CHANGE = True
        warnings.warn("%sA column-vector y was passed when a 1d array was "
                      "expected. Change the shape of y to (n_samples, ), "
                      "for example using ravel()." % context,
                      InputDataWarning)

    if len(s) == 2 and s[1] > 1:
        CHANGE = True
        warnings.warn("%sA matrix y was passed as labels. Most estimators "
                      "expect a one-dimensional label vector. Consider "
                      "changing the shape of y to (n_samples, )." % context,
                      InputDataWarning)

    return CHANGE


def _check_x_y(X, y):
    """Wrapper for our default arguments - relax some Scikit-learn defaults."""
    return check_X_y(X, y,
                     accept_sparse=['csr', 'csc'],  # Accept sparse csr, csc
                     order=None,             # Make no C or Fortran imposition
                     copy=False,             # Do not trigger copying
                     force_all_finite=True,  # Raise error on np.inf or np.nan
                     ensure_2d=True,         # Force 'X' to be a matrix
                     allow_nd=True,          # Allow 'X.ndim' > 2
                     multi_output=True,      # Allow 'y.shape[1]' > 1
                     warn_on_dtype=False     # Mute as 'dtype' is 'None'
                     )


def _check_array(X):
    """Wrapper for our default arguments - relax some Scikit-learn defaults."""
    return check_array(X,
                       accept_sparse=['csr', 'csc'],  # Accept sparse csr, csc
                       order=None,             # Do not enforce C or Fortran
                       copy=False,             # Do not trigger copying
                       force_all_finite=True,  # Raise error on np.inf/np.nan
                       ensure_2d=True,         # Force 'X' to be a matrix
                       allow_nd=True,          # Allow 'X.ndim' > 2
                       warn_on_dtype=False     # Mute as 'dtype' is 'None'
                       )
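

# Usage sketch (hypothetical helper, illustrative only): unlike the soft
# checks above, the strict wrappers convert their input, here a nested list
# into a 2d ndarray.
def _example_strict_check_array():
    """Sketch: the strict check converts list input to an ndarray."""
    X = _check_array([[1.0, 2.0], [3.0, 4.0]])
    return X.shape  # (2, 2)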


def check_inputs(X, y=None, check_level=0):
    r"""Pre-checks on input arrays X and y.

    Checks input data according to ``check_level`` to ensure the format is
    roughly in line with what a typical estimator expects.

    If ``check_level = 0`` this test is turned off.

    Parameters
    ----------
    X : nd-array, list or sparse matrix
        Input data.

    y : nd-array, list or sparse matrix
        Labels.

    check_level : int (default = 0)
        Level of strictness in checking the input arrays.

            - ``check_level = 0`` : no checks; X and y are returned as
              passed.

            - ``check_level = 1`` : raises warnings if any non-critical
              test fails, but does not modify X or y.

            - ``check_level = 2`` : imposes the Scikit-learn array checks,
              which convert ``X`` and ``y`` to numpy arrays and raise an
              error if the conversion fails.

    Returns
    -------
    X_converted : object
        The validated X. Converted to a numpy array only if
        ``check_level = 2``; otherwise returned as passed.

    y_converted : object
        The validated y. Converted to a numpy array only if
        ``check_level = 2``; otherwise returned as passed.
    """
    if check_level == 1:
        soft_check_x_y(X, y)

    if check_level == 2:
        if y is None:
            X = _check_array(X)
        else:
            X, y = _check_x_y(X, y)

    return X, y
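

# Usage sketch (hypothetical helper, illustrative only): the effect of the
# three check levels on the same list-based inputs. Level 0 is a passthrough,
# level 1 only warns, level 2 returns converted numpy arrays or raises.
def _example_check_inputs():
    """Sketch: ``check_level`` controls how strictly inputs are validated."""
    X = [[1.0, 2.0], [3.0, 4.0]]
    y = [0, 1]
    X0, y0 = check_inputs(X, y, check_level=0)  # returned as passed
    X2, y2 = check_inputs(X, y, check_level=2)  # converted to ndarrays
    return type(X0), type(X2)                   # (list, numpy.ndarray)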