Source code for imputena.simple_imputation.random_value_imputation

import pandas as pd
import numpy as np


def _draw_random_values(distribution, shape, vmin, vmax, sigma, mu):
    """Draws random values from the requested distribution.

    :param distribution: One of 'uniform', 'normal' or 'integer'.
    :param shape: Tuple with the shape of the result. An empty tuple yields
        a single scalar instead of an array.
    :param vmin: Lower bound (inclusive) for 'uniform' and 'integer'.
    :param vmax: Upper bound (exclusive) for 'uniform' and 'integer'.
    :param sigma: Scale factor applied to the standard normal draws.
    :param mu: Offset added to the scaled standard normal draws.
    :return: A scalar if shape is empty, otherwise a numpy array of that
        shape.
    """
    if distribution == 'uniform':
        return (vmax - vmin) * np.random.rand(*shape) + vmin
    if distribution == 'normal':
        return sigma * np.random.randn(*shape) + mu
    # 'integer': size=None makes randint return a scalar rather than an
    # array, which is what is needed when filling a single cell.
    return np.random.randint(
        low=vmin, high=vmax, size=shape if shape else None)


def random_value_imputation(
        data=None, distribution='uniform', vmin=0, vmax=1, sigma=1, mu=0,
        columns=None, inplace=False):
    """Fills in missing values with a randomly generated number.

    If distribution is uniform, a float between vmin (inclusive) and vmax
    (exclusive) will be generated. If distribution is normal, a float from a
    normal distribution specified by mu and sigma will be generated. If
    distribution is integer, an integer between vmin (inclusive) and vmax
    (exclusive) will be drawn from a uniform distribution. If the data is
    passed as a dataframe, the operation can be applied to all columns, by
    leaving the parameter columns empty, or to selected columns, passed as
    an array of strings.

    :param data: The data on which to perform the random value imputation
    :type data: pandas.Series or pandas.DataFrame
    :param distribution: The distribution from which to draw the random
        values.
    :type distribution: {'uniform', 'normal', 'integer'}, default 'uniform'
    :param vmin: The lowest value to be drawn (inclusive)
    :type vmin: scalar, default 0
    :param vmax: The upper bound of the values to be drawn (exclusive)
    :type vmax: scalar, default 1
    :param sigma: The sigma value to be used when drawing from a normal
        distribution.
    :type sigma: scalar, default 1
    :param mu: The mu value to be used when drawing from a normal
        distribution.
    :type mu: scalar, default 0
    :param columns: Columns on which to apply the operation.
    :type columns: array-like, optional
    :param inplace: If True, do operation inplace and return None.
    :type inplace: bool, default False
    :return: The series or dataframe with NA values filled in, or None if
        inplace=True.
    :rtype: pandas.Series, pandas.DataFrame, or None
    :raises: TypeError, ValueError
    """
    # Check if data is a series or dataframe:
    if not isinstance(data, (pd.Series, pd.DataFrame)):
        raise TypeError('The data has to be a Series or DataFrame.')
    # Raise a ValueError if columns are selected for a series:
    if isinstance(data, pd.Series) and columns is not None:
        raise ValueError('Columns can only be selected if the data is a '
                         'DataFrame.')
    # Check if the distribution has a valid value:
    if distribution not in ('uniform', 'normal', 'integer'):
        raise ValueError(
            str(distribution) + ' is not a supported distribution.')
    # Assign a reference or copy to res, depending on inplace:
    res = data if inplace else data.copy()
    if isinstance(data, pd.DataFrame):
        # Treatment for a DataFrame: build a frame of random values covering
        # the selected columns and let update() copy them into the cells of
        # res that are missing (overwrite=False leaves existing values).
        if columns is None:
            columns = data.columns
        shape = (len(res.index), len(columns))
        rand = pd.DataFrame(
            _draw_random_values(distribution, shape, vmin, vmax, sigma, mu),
            columns=columns, index=res.index)
        res.update(rand, overwrite=False)
    else:
        # Treatment for a Series: draw one scalar per missing item.
        filled = res.apply(
            lambda item: _draw_random_values(
                distribution, (), vmin, vmax, sigma, mu)
            if pd.isnull(item) else item)
        if inplace:
            # Write through the caller's object so the change is visible.
            data.loc[:] = filled
        else:
            res = filled
    # Return the imputed data, or None if inplace:
    return None if inplace else res