# Source code for pydit.statistics.percentile

"""Add a percentile column to a DataFrame, optionally computed within groups."""

import logging

import pandas as pd

logger = logging.getLogger(__name__)


def add_percentile(df, col, col_group=None):
    """Add a percentile column for a chosen column of a DataFrame.

    The percentile can be computed over the whole frame or, when
    ``col_group`` is given, within each group of rows.

    Parameters
    ----------
    df : DataFrame
        A pandas DataFrame object. It is not modified; a deep copy is
        returned.
    col : str
        The column to calculate the percentile for. A list holding exactly
        one column name is also accepted.
    col_group : list, optional, default None
        The column(s) to group by. A single column name given as a plain
        string is treated as a one-element list. When None (or empty) the
        percentile is computed over the whole frame.

    Returns
    -------
    pandas.DataFrame
        A copy of the dataframe with one new column added:

        * without ``col_group``: ``"percentile_in_<col>"``, on a 0-1 scale,
          computed as ``(rank - 1) / (n_rows - 1)`` with ``method="max"``
          ranks (linear interpolation). For a single-row frame the value is
          NaN because the formula is undefined there.
        * with ``col_group``: ``"percentile_in_<group cols joined by _>"``,
          on a 0-100 scale, computed as the within-group ``rank(pct=True)``
          multiplied by 100.

    Raises
    ------
    TypeError
        If ``df`` is not a DataFrame or ``col`` is neither a str nor a list.
    ValueError
        If ``col`` is a list that does not hold exactly one element, if
        ``col`` is not a column of ``df``, or if ``col_group`` names columns
        that are not in ``df``.

    Notes
    -----
    See https://stackoverflow.com/questions/50804120/how-do-i-get-the-percentile-for-a-row-in-a-pandas-dataframe

    The ungrouped path uses the percentile with linear interpolation method.
    These are alternative ways of calculating, kept for reference/debugging::

        df["PCNT_RANK"] = df[col].rank(method="max", pct=True)
        df["POF"] = df[col].apply(
            lambda x: stats.percentileofscore(df[col], x, kind="weak"))
        df["QUANTILE_VALUE"] = df["PCNT_RANK"].apply(
            lambda x: df[col].quantile(x, "lower"))
        df["CHK"] = df["PCNT_LIN"].apply(lambda x: df[col].quantile(x))
    """
    if not isinstance(df, pd.DataFrame):
        raise TypeError("df must be a pandas DataFrame")
    if not isinstance(col, (str, list)):
        raise TypeError("col must be a string or, at a stretch, a list of one element")
    if isinstance(col, list):
        # An empty list used to slip through and fail with IndexError below.
        if len(col) != 1:
            raise ValueError("expected one element in col")
        col = col[0]
    if col not in df.columns:
        raise ValueError("col not found in dataframe")

    # A bare string would otherwise be iterated character by character by
    # both the membership check and the "_".join below.
    if isinstance(col_group, str):
        col_group = [col_group]
    if col_group and not set(col_group).issubset(set(df.columns)):
        raise ValueError("col_group has elements not found in dataframe")

    df = df.copy(deep=True)
    logger.debug("Adding percentile column based on column %s", col)

    if col_group:
        col_group_joined = "_".join(col_group)
        df["percentile_in_" + col_group_joined] = (
            df.groupby(col_group)[col].rank(pct=True).mul(100)
        )
        # TODO: #31 investigate why we use here a different formula when
        # grouping vs full population below
        logger.debug("and grouping by column percentile_in_%s", col_group_joined)
    else:
        # Keep the ranks in a local Series instead of a scratch column, so a
        # caller's own "RANKTMP" column (or col == "RANKTMP") is never
        # overwritten and dropped.
        ranks = df[col].rank(method="max")
        denominator = len(ranks) - 1
        if denominator == 0:
            # One row: (rank - 1) / (n - 1) is 0 / 0, so report NaN instead
            # of raising ZeroDivisionError.
            df["percentile_in_" + col] = float("nan")
        else:
            df["percentile_in_" + col] = (ranks - 1) / denominator
    return df