Source code for pydit.wrangling.coalesce_dataframe_values

"""Creates a new column with the top N most frequent values and the rest are replaced by Other"""

import logging

import pandas as pd
import numpy as np

logger = logging.getLogger(__name__)


[docs] def coalesce_values( df_in, cols, top_n_values_to_keep=10, translation_dict=None, other_label="OTHER", nan_label="N/A", case_insensitive=True, show_nan=True, ): """ Creates a new column with the top N most frequent values and the rest are replaced by Other. Also can take a translation dictionary to do the manual translation prior to applying that top N limit. Parameters ---------- df_in : pandas.DataFrame The dataframe to clean up cols : list The column names to coalesce top_n_values_to_keep : int, optional, default 10 The number of top values to keep. translation_dict : dict, optional, default None A dictionary to use for manual translation/coalescing. other_label : str or int, optional, default "OTHER" The label to use for the other values. case_insensitive : bool, optional, default True Whether to do a case insensitive comparison. dropna : bool, optional, default True Whether to ignore np.nan values. If False, NA values will be treated as a category with "N/A" as the label. Returns ------- pandas.DataFrame Pandas DataFrame with new column with coalesced values. """ # We ensure we create a copy so not to mutate the original DataFrame df = df_in.copy() if not isinstance(cols, (list, str)): raise TypeError("cols must be a string or a list") if isinstance(cols, str): if cols == "": raise ValueError("Column must be a non-empty string or a list") cols = [cols] if len(cols) == 0: raise ValueError("Cols must be a non-empty list") if not set(cols).issubset(df.columns): raise ValueError("Not all columns found in DataFrame") if isinstance(other_label, str): flag_str_label = True elif isinstance(other_label, int): flag_str_label = False else: raise TypeError("other_label must be a string or an integer") if top_n_values_to_keep <= 0: raise ValueError("top_n_values_to_keep must be greater than 0") if len(cols) == 1: col_root = cols[0] df[col_root + "_source"] = df[cols[0]].copy() else: def concat_categories(r, cols): try: v = "_".join([str(v) for v in r[cols].values]) except Exception: v = np.NAN return v col_root = "_".join(cols) df[col_root + "source"] = df.apply(lambda r: concat_categories(r, cols), axis=1) if translation_dict: df[col_root + "_translate"] = df[col_root + "_source"].apply( lambda v: translation_dict[v] if v in translation_dict else other_label, ) col_output = col_root + "_translate" else: col_output = col_root + "_source" logger.info("Processing column %s", cols) logger.info("Will keep top %s values", top_n_values_to_keep) logger.info("Case insensitive: %s", case_insensitive) logger.info("Show NaN values as N/A: %s", show_nan) if case_insensitive: try: df[col_output] = df[col_output].str.upper() except Exception: pass if show_nan: df[col_output] = df[col_output].fillna(nan_label) if flag_str_label: df[col_output] = df[col_output].astype(str).str.strip().str.upper() val_counts = df[col_output].value_counts().reset_index() val_counts_top_n = list(val_counts[0:top_n_values_to_keep][col_output]) df[col_root + "_collapsed"] = df.apply( lambda r: r[col_output] if r[col_output] in val_counts_top_n else other_label, axis=1, ) logger.info("Unique values after: %s", len(df[col_root + "_collapsed"].unique())) logger.info("Value counts:\n%s", df[col_root + "_collapsed"].value_counts()) return df
if __name__ == "__main__": data = { "a": [ "Label 1", "Label 2", "Label 2", "Label 3", "Label 3", "Label 3", "Label 4", ], "b": [1, 2, 2, 3, 3, 3, 4], "c": ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul"], "d": [1, 2, 3, 4, 5, 6, 7], "e": ["Red", "Red", "Red", "Red", "Red", "Red", "Red"], "f": ["a", "b", "c", "d", "e", "f", "g"], "g": ["a", "a", "a", "b", "b", "c", np.nan], "h": [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan], "i": [np.nan, np.nan, np.nan, "b", "b", "c", "d"], } df = pd.DataFrame(data) vc = df["a"].value_counts().reset_index() print(vc) value_counts_topN = list(vc[0:2]["a"]) print(value_counts_topN) result = coalesce_values(df, "a", top_n_values_to_keep=2) print(result)