# Source code for imputena.multiple_imputation.mice

import pandas as pd
from pandas.api.types import is_numeric_dtype
import numpy as np
from random import shuffle

from imputena import (
    mean_substitution, linear_regression, logistic_regression,
    random_sample_imputation)


def mice(data=None, imputations=3):
    """Performs multiple imputation by chained equations (MICE) on the data.

    Several (parameter imputations) independent imputations of the dataset
    are performed. For each one, a random order of imputation of the columns
    with missing values is generated. Then the dataset is given a
    preliminary fill (mean substitution for numeric columns, random sample
    imputation otherwise). For each column with missing data, and in the
    previously generated order, (1) the preliminarily filled values are set
    missing again, (2) a regression model is calculated based on the
    available data (linear regression for numeric columns, logistic
    regression otherwise) and (3) the predictions from the model are used to
    impute the missing values.

    :param data: The data on which to perform the MICE imputation.
    :type data: pandas.DataFrame
    :param imputations: Number of imputations to perform. It is passed to
        ``range()``, so it has to be an integer.
    :type imputations: int, default 3
    :return: A list of MICE imputations performed with randomly chosen
        orders of column imputations. The input dataframe is not modified.
    :rtype: list of pandas.DataFrame
    :raises TypeError: if ``data`` is not a pandas.DataFrame.

    Other exceptions (the original documentation lists ValueError) may
    propagate from the underlying imputation routines.
    """
    # Check if data is a dataframe:
    if not isinstance(data, pd.DataFrame):
        raise TypeError('The data has to be a DataFrame.')
    # Impute several times; every call works on its own copy of the data and
    # draws its own random column order:
    return [mice_one_imputation(data) for _ in range(imputations)]


def mice_one_imputation(data):
    """Auxiliary function that performs one MICE imputation, choosing the
    order in which the columns are imputed at random.

    Numeric columns are imputed with linear regression, using the numeric
    columns of the dataframe as predictors; all other columns are imputed
    with logistic regression.

    :param data: The data on which to perform the imputation. It is not
        modified.
    :type data: pandas.DataFrame
    :return: A copy of the dataframe with one MICE imputation performed.
    :rtype: pandas.DataFrame
    """
    # This auxiliary function always returns a copy:
    res = data.copy()
    # Save the mask of missing values, so that the preliminary fills can be
    # undone column by column later on:
    na_mask = pd.isna(data)
    # Determine once which columns are numeric (data itself is never
    # modified, so the answer cannot change while imputing):
    is_numeric = {col: is_numeric_dtype(data[col]) for col in data.columns}
    # Compute the list of columns with missing values:
    columns_with_na = [
        column for column in data.columns if data[column].isna().any()]
    # Shuffle the list of columns to impute:
    shuffle(columns_with_na)
    # Preliminary fill, so that every column can be used while modelling the
    # others: mean substitution for numeric columns, random sample
    # imputation for the rest:
    for column in columns_with_na:
        if is_numeric[column]:
            mean_substitution(res, columns=[column], inplace=True)
        else:
            random_sample_imputation(res, columns=[column], inplace=True)
    # Compute which columns are numeric in order to use them as predictors:
    numerics = [col for col in data.columns if is_numeric[col]]
    # Impute each column, in the shuffled order:
    for column in columns_with_na:
        if is_numeric[column]:
            # Set the preliminarily filled values missing again:
            res.loc[na_mask[column], column] = np.nan
            # NOTE(review): numerics also contains `column` itself; confirm
            # that linear_regression leaves the dependent variable out of
            # its predictors.
            linear_regression(res, column, predictors=numerics, inplace=True)
        else:
            # Set the preliminarily filled values missing again:
            res.loc[na_mask[column], column] = None
            logistic_regression(res, column, inplace=True)
    return res