# Source code for pydit.statistics.percentile

"""Adds a percentile column to a DataFrame, optionally computed within groups."""

import logging

import pandas as pd

# Module-level logger, named after this module via ``__name__``.
logger = logging.getLogger(__name__)



# [docs]
def add_percentile(df, col, col_group=None):
    """
    Add a percentile column for a chosen column of a DataFrame.

    Without ``col_group`` the new column is named ``percentile_in_<col>`` and
    holds ``(rank - 1) / (n - 1)`` on a 0 to 1 scale, where ``rank`` is
    ``df[col].rank(method="max")`` and ``n`` is the number of rows.

    With ``col_group`` the new column is named ``percentile_in_<labels>``
    (group labels joined with ``"_"``) and holds ``rank(pct=True) * 100``
    computed inside each group, i.e. a 0 to 100 scale. The two branches
    intentionally keep their historical, different formulas (see TODO #31).

    Parameters
    ----------
    df : DataFrame
        A pandas DataFrame object. It is never modified; a deep copy is
        returned.
    col : str
        The column to calculate the percentile for. A list holding exactly
        one column name is also accepted.
    col_group : list, optional, default None
        The column(s) to group by. A single column name given as a string,
        or a tuple of names, is accepted as well. ``None`` or an empty list
        means the percentile is computed over the whole population.

    Returns
    -------
    pandas.DataFrame
        Returns a copy of the dataframe with the new column added.

    Raises
    ------
    TypeError
        If ``df`` is not a DataFrame or ``col`` is neither a string nor a
        list.
    ValueError
        If ``col`` is a list that does not hold exactly one element, if
        ``col`` is not a column of ``df``, or if ``col_group`` names columns
        that are not in ``df``.

    Notes
    -----
    In the ungrouped case ``n`` counts every row, including rows where
    ``col`` is missing. When ``df`` has fewer than two rows the formula is
    undefined and the new column is filled with NaN instead of raising.

    See Also
    --------
    https://stackoverflow.com/questions/50804120/how-do-i-get-the-percentile-for-a-row-in-a-pandas-dataframe
    Using the percentile with linear interpolation method, but kept various
    ranks calculations for reference.

    These are alternative ways of calculating for reference/debugging:

    df["PCNT_RANK"] = df[col].rank(method="max", pct=True)

    df["POF"] = df[col].apply(lambda x: stats.percentileofscore(df[col], x, kind="weak"))

    df["QUANTILE_VALUE"] = df["PCNT_RANK"].apply(lambda x: df[col].quantile(x, "lower"))

    df["CHK"] = df["PCNT_LIN"].apply(lambda x: df[col].quantile(x))

    You can check these methods in action in the test suite
    """

    if not isinstance(df, pd.DataFrame):
        raise TypeError("df must be a pandas DataFrame")
    if not isinstance(col, (str, list)):
        raise TypeError("col must be a string or, at a stretch, a list of one element")
    if isinstance(col, list):
        # "!= 1" rather than "> 1": an empty list used to slip through and
        # blow up with IndexError on col[0].
        if len(col) != 1:
            raise ValueError("expected one element in col")
        col = col[0]

    if col not in df.columns:
        raise ValueError("col not found in dataframe")

    # A bare string would otherwise be iterated character by character by
    # set() and "_".join() below; a tuple is not treated as a list of keys by
    # groupby. Normalise both to a list of column labels.
    if isinstance(col_group, str):
        col_group = [col_group]
    elif isinstance(col_group, tuple):
        col_group = list(col_group)

    if col_group and not set(col_group).issubset(set(df.columns)):
        raise ValueError("col_group has elements not found in dataframe")

    df = df.copy(deep=True)

    logger.debug("Adding percentile column based on column %s", col)
    if col_group:
        # str() lets non-string column labels take part in the new name.
        col_group_joined = "_".join(str(label) for label in col_group)
        df["percentile_in_" + col_group_joined] = (
            df.groupby(col_group)[col].rank(pct=True).mul(100)
        )
        # TODO: #31 investigate why we use here a different formula when grouping vs full population below
        logger.debug("and grouping by column percentile_in_%s", col_group_joined)
    else:
        # Keep the ranks in a local Series instead of a scratch "RANKTMP"
        # column, so a caller's column with that name is never clobbered.
        ranks = df[col].rank(method="max")
        denominator = len(ranks) - 1
        if denominator > 0:
            percentile = (ranks - 1) / denominator
        else:
            # Zero or one row: (rank - 1) / (n - 1) is undefined.
            percentile = pd.Series(float("nan"), index=df.index, dtype="float64")
        df["percentile_in_" + col] = percentile

    return df