Source code for imputena.recommendation.recommend_method

import pandas as pd


from .utils import (
    is_series, is_dataframe, is_categorical, contains_categorical,
    is_temporal, has_lt_10_percent_na, has_gt_80_percent_cor)


def _recommend_for_series(series, messages):
    """Choose a method for a stand-alone series, logging each decision in
    ``messages``."""
    if is_categorical(series):
        messages.append('The series contains categorical values.')
        return 'random sample imputation'
    messages.append('The series contains numerical values.')
    if is_temporal(series):
        messages.append('The series is a time series.')
        return 'interpolation with seasonal adjustment'
    messages.append('The series is not a time series.')
    return 'mean substitution'


def _recommend_for_frame(data, messages):
    """Choose one method for a whole data frame, logging each decision in
    ``messages``."""
    messages.append(
        'You want to apply the same method to the whole data frame.')
    if contains_categorical(data):
        messages.append('The data frame contains categorical data.')
        return 'most-frequent substitution'
    messages.append('The data frame does not contain categorical data.')
    return 'imputation using k-NN'


def _recommend_for_column(data, column, messages):
    """Choose a method for one column of a data frame, logging each decision
    in ``messages``. The caller has already verified that ``column`` exists."""
    series = data[column]
    if is_categorical(series):
        messages.append(
            'The column {} contains categorical values.'.format(column))
        return 'logistic regression imputation'
    messages.append(
        'The column {} contains numerical values.'.format(column))
    if is_temporal(series):
        messages.append(
            'The column {} represent a time series.'.format(column))
        return 'interpolation with seasonal adjustment'
    messages.append(
        'The column {} does not represent a time series.'.format(column))
    if has_lt_10_percent_na(series):
        messages.append(
            'Less than 10% of the values in the column {} are '
            'missing.'.format(column))
        return 'mean substitution'
    # The separating space before the placeholder matters: without it the
    # column name is glued onto the word "column" in the output.
    messages.append(
        '10% or more of the values in the column {} are '
        'missing.'.format(column))
    if has_gt_80_percent_cor(data, column):
        messages.append(
            'The column {} has high correlations (> 0.8) with at least one '
            'other column.'.format(column))
        return 'linear regression imputation'
    messages.append(
        'The column {} does not have high correlations (> 0.8) with any '
        'other column.'.format(column))
    return 'imputation using k-NN'


def recommend_method(data=None, column=None, title_only=False):
    """Recommends an imputation method to use on a series, data frame, or
    particular column of a data frame. If title_only is True, only the title
    of the recommended method is returned, otherwise a description of the
    decision process is provided as well.

    :param data: The data for which an imputation method should be
        recommended.
    :type data: pandas.Series or pandas.DataFrame
    :param column: If data is a data frame, the column for which an
        imputation method should be recommended
    :type column: string, optional
    :param title_only: If true, return only the title of the imputation
        method, otherwise provide a description of the decision process as
        well.
    :type title_only: bool, default False
    :return: The title of the recommended imputation method and a
        description of the decision model if title_only is False.
    :rtype: string
    :raises TypeError: If data is neither a Series nor a DataFrame.
    :raises ValueError: If a column is given together with a Series, or if
        the column is not a column of the data frame.
    """
    # Validate all arguments up front so that bad input fails before any
    # analysis of the data is attempted.
    if not isinstance(data, (pd.Series, pd.DataFrame)):
        raise TypeError('The data has to be a Series or DataFrame.')
    if column is not None:
        if isinstance(data, pd.Series):
            raise ValueError('A column can only be specified if the data is a '
                             'DataFrame.')
        if column not in data.columns:
            # format() instead of '+' so that non-string labels (e.g. integer
            # column names) still produce the intended ValueError.
            raise ValueError(
                '{} is not a column of the data.'.format(column))

    # Each decision taken is recorded so it can be reported to the caller.
    messages = []
    method = None
    if is_series(data):
        messages.append('The data is a series.')
        method = _recommend_for_series(data, messages)
    elif is_dataframe(data):
        messages.append('The data is a data frame.')
        if column is None:
            method = _recommend_for_frame(data, messages)
        else:
            method = _recommend_for_column(data, column, messages)

    if title_only:
        return method
    numbered = [
        '{}. {}\n'.format(position, message)
        for position, message in enumerate(messages, start=1)]
    return ''.join(numbered) + 'Therefore you should apply {}.'.format(method)