Source code for imputena.deletion.delete_columns

import pandas as pd


def delete_columns(data=None, columns=None, threshold=None, inplace=False):
    """Drops columns (variables) that contain NA values from the data.

    If a list of column names is passed, only those columns are candidates
    for dropping and all other columns are left untouched; otherwise every
    column of the data is considered. If a threshold is given, a considered
    column is dropped only when it holds fewer non-NA values than the
    threshold.

    :param data: The data from which to drop columns.
    :type data: pandas.DataFrame
    :param columns: The columns which should be considered. If not passed or
        None, all columns will be considered.
    :type columns: array-like, optional
    :param threshold: Require that many non-NA values in order to not drop a
        column. If not passed or None, all considered columns with any NA
        value will be dropped.
    :type threshold: int, optional
    :param inplace: If True, do operation inplace and return None.
    :type inplace: bool, default False
    :return: The dataframe with the selected columns dropped, or None if
        inplace=True.
    :rtype: pandas.DataFrame or None
    :raises: TypeError if data is not a DataFrame, ValueError if one of the
        given column names is not a column of the data.
    """
    # Check if data is a dataframe:
    if not isinstance(data, pd.DataFrame):
        raise TypeError('The data has to be a DataFrame.')
    # If no columns are given, the operation will be applied to all of them:
    if columns is None:
        columns = data.columns
    # Columns in which the number of non-NA values is not at least the
    # threshold (or, without a threshold, which contain any NA at all):
    columns_under_threshold = []
    for column in columns:
        # Raise error if the column name doesn't exist in the data. The label
        # is formatted rather than concatenated so that non-string labels
        # (e.g. integer column names) yield the documented ValueError instead
        # of a TypeError from str + int:
        if column not in data.columns:
            raise ValueError(
                '\'{}\' is not a column of the data.'.format(column)
            )
        if threshold is None:
            # If no threshold is given, drop the column if it contains any NA:
            if data[column].isna().any():
                columns_under_threshold.append(column)
        else:
            # If a threshold is given, drop the column if the number of non-NA
            # values is not at least the threshold:
            if data[column].notna().sum() < threshold:
                columns_under_threshold.append(column)
    # Drop the columns collected in columns_under_threshold:
    if inplace:
        data.drop(columns=columns_under_threshold, inplace=True)
        return None
    else:
        return data.drop(columns=columns_under_threshold)