# Source code for imputena.recommendation.recommend_method

import pandas as pd


from .utils import (
    is_series, is_dataframe, is_categorical, contains_categorical,
    is_temporal, has_lt_10_percent_na, has_gt_80_percent_cor)


def _recommend_for_series(series, messages):
    """Choose a method for a stand-alone series.

    Each decision step is appended to ``messages`` (mutated in place) and the
    title of the chosen method is returned.
    """
    messages.append('The data is a series.')
    if is_categorical(series):
        messages.append('The series contains categorical values.')
        return 'random sample imputation'
    messages.append('The series contains numerical values.')
    if is_temporal(series):
        messages.append('The series is a time series.')
        return 'interpolation with seasonal adjustment'
    messages.append('The series is not a time series.')
    return 'mean substitution'


def _recommend_for_whole_frame(data, messages):
    """Choose a single method to apply to every column of a data frame.

    Each decision step is appended to ``messages`` (mutated in place) and the
    title of the chosen method is returned.
    """
    messages.append(
        'You want to apply the same method to the whole data frame.')
    if contains_categorical(data):
        messages.append('The data frame contains categorical data.')
        return 'most-frequent substitution'
    messages.append('The data frame does not contain categorical data.')
    return 'imputation using k-NN'


def _recommend_for_column(data, column, messages):
    """Choose a method for one column of a data frame.

    The caller must already have verified that ``column`` is a column of
    ``data``. Each decision step is appended to ``messages`` (mutated in
    place) and the title of the chosen method is returned.
    """
    series = data[column]
    if is_categorical(series):
        messages.append(
            'The column {} contains categorical values.'.format(column))
        return 'logistic regression imputation'
    messages.append(
        'The column {} contains numerical values.'.format(column))
    if is_temporal(series):
        messages.append(
            'The column {} represents a time series.'.format(column))
        return 'interpolation with seasonal adjustment'
    messages.append(
        'The column {} does not represent a time series.'.format(column))
    if has_lt_10_percent_na(series):
        messages.append(
            'Less than 10% of the values in the column {} are '
            'missing.'.format(column))
        return 'mean substitution'
    messages.append(
        '10% or more of the values in the column {} are '
        'missing.'.format(column))
    # With many missing values, the choice depends on whether another column
    # correlates strongly enough with this one.
    if has_gt_80_percent_cor(data, column):
        messages.append(
            'The column {} has high correlations (> 0.8) with at least one '
            'other column.'.format(column))
        return 'linear regression imputation'
    messages.append(
        'The column {} does not have high correlations (> 0.8) with any '
        'other column.'.format(column))
    return 'imputation using k-NN'


def recommend_method(data=None, column=None, title_only=False):
    """Recommends an imputation method to use on a series, data frame,
    or particular column of a data frame. If title_only is True, only the
    title of the recommended method is returned, otherwise a numbered
    description of the decision process is provided as well.

    :param data: The data for which an imputation method should be recommended.
    :type data: pandas.Series or pandas.DataFrame
    :param column: If data is a data frame, the column for which an
        imputation method should be recommended
    :type column: string, optional
    :param title_only: If true, return only the title of the imputation
        method, otherwise provide a description of the decision process as
        well.
    :type title_only: bool, default False
    :return: The title of the recommended imputation method and a
        description of the decision model if title_only is False.
    :rtype: string
    :raises TypeError: If data is neither a Series nor a DataFrame.
    :raises ValueError: If a column is given together with a Series, or if
        the column is not a column of the data frame.
    """
    # Validate all arguments before running any part of the decision process.
    if not isinstance(data, (pd.Series, pd.DataFrame)):
        raise TypeError('The data has to be a Series or DataFrame.')
    if column is not None:
        if isinstance(data, pd.Series):
            raise ValueError('A column can only be specified if the data is '
                             'a DataFrame.')
        if column not in data.columns:
            # str.format (rather than '+') keeps this a ValueError even when
            # the column label is not a string.
            raise ValueError(
                '{} is not a column of the data.'.format(column))
    # Each helper records its decision steps in `messages` and returns the
    # title of the method it selected.
    messages = []
    method = None
    if is_series(data):
        method = _recommend_for_series(data, messages)
    if is_dataframe(data):
        messages.append('The data is a data frame.')
        if column is None:
            method = _recommend_for_whole_frame(data, messages)
        else:
            method = _recommend_for_column(data, column, messages)
    if title_only:
        return method
    # Number the decision steps starting at 1 and close with the verdict.
    steps = ''.join(
        '{}. {}\n'.format(number, message)
        for number, message in enumerate(messages, start=1))
    return steps + 'Therefore you should apply {}.'.format(method)