Source code for imputena.simple_imputation.random_sample_imputation

import pandas as pd
import numpy as np


def _fill_with_random_sample(target, column=None):
    """Fill NA entries of ``target`` (or one of its columns) in place.

    Each missing entry is replaced by a value drawn with replacement from
    the observed (non-NA) entries of the same series/column. Nothing is
    done when there are no missing entries or no observed entries.

    :param target: Object that is modified in place.
    :type target: pandas.Series or pandas.DataFrame
    :param column: Column label to treat when ``target`` is a dataframe;
        None when ``target`` is a series.
    """
    values = target if column is None else target[column]
    missing = values.isnull()
    number_missing = missing.sum()
    observed_values = values[~missing]
    # Without missing entries there is nothing to fill; without observed
    # entries there is nothing to sample from.
    if number_missing == 0 or len(observed_values) == 0:
        return
    sampled = np.random.choice(observed_values, number_missing, replace=True)
    if column is None:
        target.loc[missing] = sampled
    else:
        target.loc[missing, column] = sampled


def random_sample_imputation(data=None, columns=None, inplace=False):
    """Performs random sample imputation on the data.

    Missing values in each column are replaced by randomly selected
    observed values of the same column, if available. Columns that contain
    no observed value are left untouched. The operation can be applied to
    a series, a whole dataframe, or a selection of columns of a dataframe.

    Values are drawn with ``numpy.random.choice`` (with replacement), so
    results depend on NumPy's global random state.

    :param data: The data on which to perform the random sample imputation
    :type data: pandas.Series or pandas.DataFrame
    :param columns: Columns on which to apply the operation.
    :type columns: array-like, optional
    :param inplace: If True, do operation inplace and return None.
    :type inplace: bool, default False
    :return: The series or dataframe with NA values filled in, or None if
        inplace=True.
    :rtype: pandas.Series, pandas.DataFrame, or None
    :raises: TypeError if data is neither a Series nor a DataFrame;
        ValueError if columns is given for a Series or names a column
        that does not exist in the data.
    """
    # Check if data is of the correct type:
    if not isinstance(data, (pd.Series, pd.DataFrame)):
        raise TypeError('The data has to be a Series or DataFrame.')

    if isinstance(data, pd.Series):
        if columns is not None:
            raise ValueError('Columns can only be selected if the data is a '
                             'DataFrame.')
        selected = None
    else:
        # Materialize once so one-shot iterables survive validation.
        selected = list(data.columns if columns is None else columns)
        # Validate every label before touching the data, so that an
        # inplace call never leaves the frame partially imputed.
        for column in selected:
            if column not in data.columns:
                # format() keeps the message working for non-string labels
                # (e.g. integer column names).
                raise ValueError(
                    "'{}' is not a column of the data.".format(column))

    # Assign a reference or copy to res, depending on inplace:
    res = data if inplace else data.copy()

    if selected is None:
        _fill_with_random_sample(res)
    else:
        for column in selected:
            _fill_with_random_sample(res, column)

    # Return the imputed data, or None if inplace:
    return None if inplace else res