Source code for pydit.wrangling.groupby_text_concatenate

"""Groupby text column into concatenated text"""

import logging

import pandas as pd

logger = logging.getLogger(__name__)


[docs] def groupby_text( df, key_cols, value_cols=None, target_col_name="groupby_text", field_separator=" ", row_separator="\n", unique=False, ): """Groupby text column into concatenated text Parameters ---------- df : DataFrame Pandas DataFrame to apply the function to key_cols : list or str Key columns used for grouping value_cols : list or str or None, optional, default None Value colums to concatenate. target_col_name : str, optional, default "groupby_text" Name for the resulting column. field_separator : str, optional, default " " If multiple value_cols provided then how to concatenate. row_separator : str, optional, default newline Separator for the rows. unique : bool, optional, default False If True, concatenate only unique values. Returns ------- DataFrame A grouped dataframe with the concatenated text. This function does not mutate the input dataframe. """ if not isinstance(df, pd.DataFrame): raise TypeError("df must be a pandas DataFrame") if not isinstance(key_cols, (list, str)): raise TypeError("key_cols must be a list or string") if isinstance(key_cols, str): key_cols = [key_cols] if not set(key_cols).issubset(set(df.columns)): raise ValueError(f"Key column {key_cols} not in dataframe") if value_cols is None: value_cols = [ x for x in df.columns if x not in key_cols and df[x].dtype == "object" ] logger.debug("No value columns provided, using all columns: %s", value_cols) elif isinstance(value_cols, str): if value_cols in df.columns: value_cols = [value_cols] else: raise ValueError(f"Value column {value_cols} not in dataframe") elif isinstance(value_cols, list): if not set(value_cols).issubset(set(df.columns)): raise ValueError(f"One or more from {value_cols} not in dataframe") else: raise TypeError("value_cols must be a string or list of strings") # pick just the columns we need, remove nans and convert to string dfkeys = df[key_cols].copy() dfvalues = df[value_cols].fillna("").astype(str).copy() dfvalues = dfvalues.apply( lambda x: x.str.strip() ) # remove leading and trailing whitespace # here we join the value columns if we need to if len(dfvalues.columns) > 1: dfvalues = dfvalues.apply(lambda x: str.strip(field_separator.join(x)), axis=1) # dfvalues = dfvalues.stack().groupby(level=0).agg(field_separator.join) dfjoined = dfkeys.assign(**{target_col_name: dfvalues}) # here is where the true row concatenation happens, reset_index() makes in to a flat dataframe if unique: df_groupby = ( dfjoined.groupby(key_cols)[target_col_name] .apply(set) .apply(lambda s: row_separator.join(sorted(list(s)))) .reset_index() ) else: df_groupby = ( dfjoined.groupby(key_cols)[target_col_name] .apply(row_separator.join) .reset_index() ) # possibly not needed, but just in case we trim the whitespaces df_groupby[target_col_name] = df_groupby[target_col_name].str.strip() return df_groupby
if __name__ == "__main__": pass