Source code for pydit.wrangling.groupby_text_concatenate

"""Groupby text column into concatenated text"""

import logging

import pandas as pd

logger = logging.getLogger(__name__)



[docs]
def groupby_text(
    df,
    key_cols,
    value_cols=None,
    target_col_name="groupby_text",
    field_separator=" ",
    row_separator="\n",
    unique=False,
):
    """Groupby text column into concatenated text

    Parameters
    ----------
    df : DataFrame
        Pandas DataFrame to apply the function to
    key_cols : list or str
        Key columns used for grouping
    value_cols : list or str or None, optional, default None
        Value colums to concatenate.
    target_col_name : str, optional, default "groupby_text"
        Name for the resulting column.
    field_separator : str, optional, default " "
        If multiple value_cols provided then how to concatenate.
    row_separator : str, optional, default newline
        Separator for the rows.
    unique : bool, optional, default False
        If True, concatenate only unique values.


    Returns
    -------
    DataFrame
        A grouped dataframe with the concatenated text.

        This function does not mutate the input dataframe.

    """
    if not isinstance(df, pd.DataFrame):
        raise TypeError("df must be a pandas DataFrame")
    if not isinstance(key_cols, (list, str)):
        raise TypeError("key_cols must be a list or string")
    if isinstance(key_cols, str):
        key_cols = [key_cols]
    if not set(key_cols).issubset(set(df.columns)):
        raise ValueError(f"Key column {key_cols} not in dataframe")
    if value_cols is None:
        value_cols = [
            x for x in df.columns if x not in key_cols and df[x].dtype == "object"
        ]
        logger.debug("No value columns provided, using all columns: %s", value_cols)
    elif isinstance(value_cols, str):
        if value_cols in df.columns:
            value_cols = [value_cols]
        else:
            raise ValueError(f"Value column {value_cols} not in dataframe")
    elif isinstance(value_cols, list):
        if not set(value_cols).issubset(set(df.columns)):
            raise ValueError(f"One or more from {value_cols} not in dataframe")
    else:
        raise TypeError("value_cols must be a string or list of strings")

    # pick just the columns we need, remove nans and convert to string
    dfkeys = df[key_cols].copy()
    dfvalues = df[value_cols].fillna("").astype(str).copy()
    dfvalues = dfvalues.apply(
        lambda x: x.str.strip()
    )  # remove leading and trailing whitespace
    # here we join the value columns if we need to
    if len(dfvalues.columns) > 1:
        dfvalues = dfvalues.apply(lambda x: str.strip(field_separator.join(x)), axis=1)
        # dfvalues = dfvalues.stack().groupby(level=0).agg(field_separator.join)

    dfjoined = dfkeys.assign(**{target_col_name: dfvalues})

    # here is where the true row concatenation happens, reset_index() makes in to a flat dataframe
    if unique:
        df_groupby = (
            dfjoined.groupby(key_cols)[target_col_name]
            .apply(set)
            .apply(lambda s: row_separator.join(sorted(list(s))))
            .reset_index()
        )

    else:
        df_groupby = (
            dfjoined.groupby(key_cols)[target_col_name]
            .apply(row_separator.join)
            .reset_index()
        )

    # possibly not needed, but just in case we trim the whitespaces
    df_groupby[target_col_name] = df_groupby[target_col_name].str.strip()
    return df_groupby



if __name__ == "__main__":
    pass