# Source code for imputena.simple_imputation.random_sample_imputation

import pandas as pd
import numpy as np


def _draw_replacements(series):
    """Draw random replacement values for the missing entries of a series.

    :param series: The series to inspect.
    :type series: pandas.Series
    :return: A tuple ``(missing, values)``. ``missing`` is the boolean mask
        of NA entries of the series. ``values`` holds one value per NA
        entry, sampled with replacement from the observed entries, or is
        None when there is nothing to fill or nothing to sample from.
    :rtype: tuple
    """
    missing = series.isnull()
    number_missing = int(missing.sum())
    observed_values = series.loc[~missing]
    if number_missing == 0 or observed_values.empty:
        return missing, None
    return missing, np.random.choice(
        observed_values, number_missing, replace=True)


def random_sample_imputation(data=None, columns=None, inplace=False):
    """Performs random sample imputation on the data. Missing values in each
    column are replaced by randomly selected observed values of the same
    column, if available. Columns without any observed value are left
    untouched. The operation can be applied to a series, a whole dataframe,
    or a selection of columns of a dataframe.

    Values are drawn with ``numpy.random.choice`` (sampling with
    replacement).

    :param data: The data on which to perform the random sample imputation
    :type data: pandas.Series or pandas.DataFrame
    :param columns: Columns on which to apply the operation.
    :type columns: array-like, optional
    :param inplace: If True, do operation inplace and return None.
    :type inplace: bool, default False
    :return: The series or dataframe with NA values filled in, or
        None if inplace=True.
    :rtype: pandas.Series, pandas.DataFrame, or None
    :raises TypeError: If data is neither a Series nor a DataFrame.
    :raises ValueError: If columns is given for a Series, or if a requested
        column is not a column of the data.
    """
    # Check if data is of the correct type:
    if not isinstance(data, (pd.Series, pd.DataFrame)):
        raise TypeError('The data has to be a Series or DataFrame.')
    is_series = isinstance(data, pd.Series)
    # Validate every argument before touching the data, so that a failing
    # call with inplace=True never leaves the data partially imputed:
    if is_series:
        if columns is not None:
            raise ValueError('Columns can only be selected if the data is a '
                             'DataFrame.')
    else:
        # Materialize the selection so one-shot iterables survive both the
        # validation pass and the imputation pass:
        columns = list(data.columns if columns is None else columns)
        for column in columns:
            if column not in data.columns:
                # format() instead of '+' so that non-string labels still
                # produce a ValueError rather than a TypeError:
                raise ValueError(
                    "'{}' is not a column of the data.".format(column)
                )
    # Assign a reference or copy to res, depending on inplace:
    res = data if inplace else data.copy()
    if is_series:
        missing, values = _draw_replacements(data)
        if values is not None:
            res.loc[missing] = values
    else:
        for column in columns:
            missing, values = _draw_replacements(data[column])
            if values is not None:
                res.loc[missing, column] = values
    # Return the imputed data, or None if inplace:
    return None if inplace else res