Source code for pydit.wrangling.split_transactions

"""Utility functions to do analysis/detection of split purchases/expenses"""

import logging
from datetime import timedelta
import pandas as pd

# pylint: disable=unused-variable

logger = logging.getLogger(__name__)



[docs]
def check_for_split_transactions(
    df,
    limits,
    amount_col="amount",
    categ_col="supplier",
    date_col="date",
    tolerance_perc=0.01,
    tolerance_abs=100,
    days_horizon=30,
):
    """checks for transactions that are just below a threshold

    This function checks for transactions that are just below a threshold
    and returns a DataFrame with the original columns, sorted by category and
    date, flagging those transactions that would have accumulated a hit just
    below the threshold or going over the threshold, within the specified
    tolerance and days horizon.

    Parameters
    ----------
    df : pd.DataFrame
        The dataframe to check
    limits : list or tuple
        The list of limits to check for, expressed in the same units as the amount column
    amount_col : str
        The name of the column in the dataframe that contains the amounts
    categ_col : str
        The name of the column in the dataframe that contains the categories
        e.g. supplier, submitter, etc.
    date_col : str
        The name of the column in the dataframe that contains the dates
    tolerance_perc : float
        The percentage tolerance to apply to the limits
        Default is 0.01
    tolerance_abs : float
        The absolute tolerance to apply to the limits
        Default is 100
    days_horizon : int
        The number of days to look back for the running total
        Default is 30

    Returns
    -------
    pd.DataFrame
        A new DataFrame with the original columns, sorted (asc) by category and
        date, plus the following columns:
        - highest_limit_hit_just_below: the highest limit hit just below
        - highest_limit_hit_above: the highest limit hit just above
        - running_total: the running total of the amounts for the category

    """
    if isinstance(limits, int) or isinstance(limits, float):
        limits = [limits]
    if not isinstance(limits, list) and not isinstance(limits, tuple):
        raise ValueError("limits should be a list or tuple")
    if not isinstance(df, pd.DataFrame):
        raise ValueError("df should be a pandas DataFrame")
    if not all([c in df.columns for c in [amount_col, categ_col, date_col]]):
        raise ValueError("amount_col, categ_col, date_col should be columns in df")
    if not all([isinstance(limit, (int, float)) for limit in limits]):
        raise ValueError("limits should be a list of integers or floats")

    df1 = df.sort_values([categ_col, date_col]).copy()
    categ = ""
    running_total = 0
    running_total_counts = 0
    date_back_bracket = df1[date_col].min()
    df1["highest_limit_hit_just_below"] = None
    df1["highest_limit_hit_above"] = None

    for n, r in df1.iterrows():
        limits_hit_just_below = []
        limits_hit_above = []
        if categ != r[categ_col]:
            categ = r[categ_col]
            running_total = 0
            running_total_counts = 0
            date_back_bracket = r[date_col]

        if r[date_col] > date_back_bracket + timedelta(days=days_horizon):
            running_total = 0
            running_total_counts = 0
            date_back_bracket = r[date_col]

        running_total += r[amount_col]
        running_total_counts += 1
        for limit in limits:
            if (
                running_total >= limit - limit * tolerance_perc
                and running_total < limit
            ) or (running_total >= limit - tolerance_abs and running_total < limit):
                limits_hit_just_below.append(limit)
            if running_total >= limit:
                limits_hit_above.append(limit)
        highest_limit_hit_just_below = (
            max(limits_hit_just_below) if limits_hit_just_below else None
        )
        highest_limit_hit_above = max(limits_hit_above) if limits_hit_above else None
        df1.loc[n, "highest_limit_hit_just_below"] = highest_limit_hit_just_below
        df1.loc[n, "highest_limit_hit_above"] = highest_limit_hit_above
        df1.loc[n, "running_total"] = running_total
        df1.loc[n, "running_total_counts"] = running_total_counts
        df1.loc[n, "split_transaction_hit_flag"] = any(
            [
                (highest_limit_hit_above and running_total_counts > 1),
                highest_limit_hit_just_below,
            ]
        )

    return df1