Source code for imputena.simple_imputation.mean_substitution

import pandas as pd
import warnings


def mean_substitution(data=None, method='mean', columns=None, inplace=False):
    """Fill in missing values with the average of the same column or series.

    For a dataframe, every column is imputed with its own average; for a
    series, the average of the series as a whole is used. If the data is
    passed as a dataframe, the operation can be applied to all columns, by
    leaving the parameter columns empty, or to selected columns, passed as
    an array of column labels.

    :param data: The data on which to perform the mean substitution.
    :type data: pandas.Series or pandas.DataFrame
    :param method: Method to use to calculate the average.
    :type method: {'mean', 'median'}, default 'mean'
    :param columns: Columns on which to apply the operation.
    :type columns: array-like, optional
    :param inplace: If True, do operation inplace and return None.
    :type inplace: bool, default False
    :return: The series or dataframe with NA values filled in, or None if
        inplace=True.
    :rtype: pandas.Series, pandas.DataFrame, or None
    :raises: TypeError if data is neither a series nor a dataframe;
        ValueError if columns are given for a series, if method is not
        'mean' or 'median', or if a selected column is not in the data.
    """
    # Check if data is a series or dataframe:
    if not isinstance(data, (pd.Series, pd.DataFrame)):
        raise TypeError('The data has to be a Series or DataFrame.')
    # Raise a ValueError if columns are selected for a series:
    if isinstance(data, pd.Series) and columns is not None:
        raise ValueError('Columns can only be selected if the data is a '
                         'DataFrame.')
    # Raise a ValueError if the method is neither mean nor median. The
    # message is formatted (not concatenated) so that a non-string method
    # still produces a ValueError instead of a TypeError:
    if method not in ('mean', 'median'):
        raise ValueError(
            f'{method!r} is not a valid method for calculating the average.')
    # Validate every selected column before touching any data, so that an
    # inplace call never leaves the caller's dataframe partially imputed:
    if columns is not None:
        # Materialise once: columns may be a one-shot iterable.
        columns = list(columns)
        for column in columns:
            if column not in data.columns:
                raise ValueError(
                    f"'{column}' is not a column of the data.")
    # Assign a reference or copy to res, depending on inplace:
    res = data if inplace else data.copy()
    with warnings.catch_warnings():
        # Silence the warning emitted when averaging data that is all NaN
        warnings.filterwarnings('ignore', 'All-NaN slice encountered')
        if columns is None:
            # Treatment for a series or all columns of a dataframe
            fill = data.mean() if method == 'mean' else data.median()
            res.fillna(fill, inplace=True)
        else:
            # Treatment for selected columns of a dataframe
            for column in columns:
                source = data[column]
                fill = source.mean() if method == 'mean' else source.median()
                # Assign the filled column back rather than calling
                # fillna(inplace=True) on res[column]: that chained call
                # acts on a temporary object and does not update res when
                # pandas Copy-on-Write is active.
                res[column] = res[column].fillna(fill)
    # Return the imputed data, or None if inplace:
    return None if inplace else res