Source code for imputena.multiple_imputation.srmi

import pandas as pd
from sklearn import linear_model
from imputena.simple_imputation.linear_regression import get_imputed_row
import logging


def srmi(data=None, sample_size=10, imputations=3, regressions='available'):
    """Performs sequential regression multiple imputation on the data.
    Several (parameter imputations) imputations are performed and the
    resulting dataframes returned as a list. For each one, a regression
    model is created based on a sample of the available rows. The size of
    these samples is fixed by the parameter sample_size. If, in the same
    row as a missing value in the dependent variable the value for any
    predictor variable is missing, a regression model based on all available
    predictors is calculated just to impute those values where the
    predictor(s) are missing. This behavior can be changed by assigning to
    the parameter regressions the value 'complete'. In this case, rows in
    which a predictor variable is missing do not get imputed.

    :param data: The data on which to perform the SRMI.
    :type data: pandas.DataFrame
    :param sample_size: Maximum size of the set of rows used to compute the
        regression model. Has to be at least 2.
    :type sample_size: scalar, default 10
    :param imputations: Number of imputations to perform
    :type imputations: scalar, default 3
    :param regressions: If 'available': Impute missing values by modeling a
        regression based on all available predictors if some predictors have
        missing values themselves. If 'complete': Only impute with a
        regression model based on all predictors and leave missing values in
        rows in which some predictor value is missing itself unimputed.
    :type regressions: {'available', 'complete'}, default 'available'
    :return: A list of linear regression imputations performed based on
        regression models calculated from different samples.
    :rtype: list of pandas.DataFrame
    :raises: TypeError if data is not a DataFrame, ValueError if regressions
        is neither 'available' nor 'complete'
    """
    # Check if data is a dataframe:
    if not isinstance(data, pd.DataFrame):
        raise TypeError('The data has to be a DataFrame.')
    # Translate the regressions option into a boolean flag:
    if regressions == 'available':
        do_available_regressions = True
    elif regressions == 'complete':
        do_available_regressions = False
    else:
        # str() keeps this a ValueError even when the argument is not a
        # string; the leading space separates the value from the message.
        raise ValueError(str(regressions) + ' could not be understood')
    # Impute several times; every call draws its own sample, so each
    # element of the returned list is an independent imputation:
    return [
        srmi_one_imputation(data, sample_size, do_available_regressions)
        for _ in range(imputations)]


def srmi_one_imputation(data, sample_size, do_available_regressions):
    """Auxiliary function that runs a single imputation pass over the data:
    every column that has missing values in the input is imputed in turn by
    srmi_one_dependent(), using all remaining columns as predictors.

    :param data: The data on which to perform the linear regression imputation.
    :type data: pandas.DataFrame
    :param sample_size: Maximum size of the set of rows used to compute the
        regression model.
    :type sample_size: scalar
    :param do_available_regressions: Whether to do regressions for all
        available predictor combinations or only on complete ones
    :type do_available_regressions: bool
    :return: A copy of the dataframe in which each column that had missing
        values has been processed by srmi_one_dependent().
    :rtype: pandas.DataFrame
    """
    # The input is left untouched; all writes go to this copy:
    imputed = data.copy()
    # Which columns need imputing is decided on the original data, so the
    # selection does not depend on the values filled in along the way:
    incomplete_columns = [
        name for name in data.columns if data[name].isna().any()]
    for name in incomplete_columns:
        # Passing None as predictors makes srmi_one_dependent() use every
        # column except the dependent one. Each column is imputed on top of
        # the results obtained for the previous columns.
        imputed.loc[:, :] = srmi_one_dependent(
            imputed, name, None, do_available_regressions, sample_size)
    return imputed


def srmi_one_dependent(
        data, dependent, predictors, do_available_regressions, sample_size):
    """Auxiliary function that performs linear regression imputation for the
    dependent column only. srmi_iter() is first called with the full set of
    predictors. If do_available_regressions is set, it is then called again
    for every predictor combination found in limited_predictors_combs that
    has not been processed yet, until none is left.

    :param data: The data on which to perform the linear regression imputation.
    :type data: pandas.DataFrame
    :param dependent: The dependent variable in which the missing values
        should be imputed.
    :type dependent: String
    :param predictors: The predictor variables on which the dependent variable
        is dependent. If None, all columns except the dependent one are used.
    :type predictors: array-like
    :param do_available_regressions: Whether to do regressions for all
        available predictor combinations or only on complete ones
    :type do_available_regressions: bool
    :param sample_size: Maximum size of the set of rows used to compute the
        regression model.
    :type sample_size: scalar
    :return: A copy of the dataframe with linear regression imputation
        performed for the dependent variable.
    :rtype: pandas.DataFrame
    """
    # Work on a copy so that the caller's dataframe is never modified:
    imputed = data.copy()
    # Default predictor set: every column other than the dependent one.
    if predictors is None:
        predictors = list(data.columns)
        predictors.remove(dependent)
    # Bookkeeping of predictor combinations. The set is handed to
    # srmi_iter() on every call; presumably it gets filled there with the
    # reduced combinations that still have to be tried.
    limited_predictors_combs = set()
    finished = []
    pending = [tuple(predictors)]
    while pending:
        current = pending.pop(0)
        logging.info(
            'Applying regression imputation with predictors: ' + str(current))
        # Impute using the current predictor combination:
        imputed.loc[:, :] = srmi_iter(
            imputed, dependent, list(current), sample_size,
            limited_predictors_combs)
        finished.append(current)
        if do_available_regressions:
            # Recompute the work list: every combination requested so far
            # minus the ones that have already been processed.
            outstanding = set(limited_predictors_combs) - set(finished)
            pending = list(outstanding)
        logging.info('Predictor combinations done: ' + str(finished))
        logging.info('Predictor combinations to do: ' + str(pending))
    return imputed


def srmi_iter(
        data, dependent, predictors, sample_size, limited_predictors_combs):
    """Auxiliary function that performs (simple or multiple) linear
    regression imputation on the data, for the dependent column only. The
    regression model is fitted on the rows that are complete for the
    dependent variable and all predictors; if there are more than
    sample_size such rows, a random sample of sample_size rows is used
    instead. The imputation of each row is delegated to get_imputed_row().
    The input dataframe is not modified; a new dataframe is returned.

    :param data: The data on which to perform the linear regression imputation.
    :type data: pandas.DataFrame
    :param dependent: The dependent variable in which the missing values
        should be imputed.
    :type dependent: String
    :param predictors: The predictor variables on which the dependent variable
        is dependent.
    :type predictors: list
    :param sample_size: Maximum size of the set of rows used to compute the
        regression model.
    :type sample_size: scalar
    :param limited_predictors_combs: Reference to the set which contains all
        limited predictor combinations that are necessary to use because
        some predictor had a missing value in some row. It is passed on to
        get_imputed_row() unchanged.
    :type limited_predictors_combs: set
    :return: A new dataframe with linear regression imputation performed for
        the dependent variable.
    :rtype: pandas.DataFrame
    :raises: ValueError if no row is complete for the dependent variable and
        all predictors
    """
    # Keep only the rows in which the dependent variable and all predictors
    # are present (dropna returns a new dataframe, data stays untouched):
    variables = list(predictors) + [dependent]
    complete_rows = data.dropna(subset=variables)
    # Without any complete row there is nothing to fit a model on; fail
    # with an explicit message instead of an error from inside the fit:
    if len(complete_rows) == 0:
        raise ValueError(
            'No complete rows available to fit a regression of '
            + str(dependent) + ' on ' + str(list(predictors)))
    # Select sample_size random rows if more than that are available:
    if len(complete_rows) > sample_size:
        data_sampled = complete_rows.sample(sample_size)
    else:
        data_sampled = complete_rows
    # Calculate the regression:
    x = data_sampled[predictors]
    y = data_sampled[dependent]
    model = linear_model.LinearRegression()
    model.fit(x, y)
    # Extract the regression parameters from the model
    intercept = model.intercept_
    coefs = model.coef_
    # Log regression equation. Column labels are converted with str() so
    # that non-string labels (e.g. integers) do not break the logging:
    eq = str(dependent) + ' = ' + str(intercept)
    for coef, predictor in zip(coefs, predictors):
        eq += ' + ' + str(coef) + '*' + str(predictor)
    logging.info('Regression equation: ' + eq)
    # Standard deviation of the residuals on the sampled rows:
    std_error = (model.predict(x) - y).std()
    logging.info('Standard error: ' + str(std_error))
    # Impute row by row; 'broadcast' keeps the original shape, index and
    # columns in the result.
    # NOTE(review): the positional False is presumably the noise switch of
    # get_imputed_row() - confirm against its signature.
    return data.apply(
        lambda row: get_imputed_row(
            row, dependent, predictors, intercept, coefs, False, std_error,
            limited_predictors_combs),
        axis=1, result_type='broadcast')