Source code for pydit.wrangling.split_transactions

"""Utility functions to do analysis/detection of split purchases/expenses"""

import logging
from datetime import timedelta
import pandas as pd

# pylint: disable=unused-variable

logger = logging.getLogger(__name__)


[docs] def check_for_split_transactions( df, limits, amount_col="amount", categ_col="supplier", date_col="date", tolerance_perc=0.01, tolerance_abs=100, days_horizon=30, ): """checks for transactions that are just below a threshold This function checks for transactions that are just below a threshold and returns a DataFrame with the original columns, sorted by category and date, flagging those transactions that would have accumulated a hit just below the threshold or going over the threshold, within the specified tolerance and days horizon. Parameters ---------- df : pd.DataFrame The dataframe to check limits : list or tuple The list of limits to check for, expressed in the same units as the amount column amount_col : str The name of the column in the dataframe that contains the amounts categ_col : str The name of the column in the dataframe that contains the categories e.g. supplier, submitter, etc. date_col : str The name of the column in the dataframe that contains the dates tolerance_perc : float The percentage tolerance to apply to the limits Default is 0.01 tolerance_abs : float The absolute tolerance to apply to the limits Default is 100 days_horizon : int The number of days to look back for the running total Default is 30 Returns ------- pd.DataFrame A new DataFrame with the original columns, sorted (asc) by category and date, plus the following columns: - highest_limit_hit_just_below: the highest limit hit just below - highest_limit_hit_above: the highest limit hit just above - running_total: the running total of the amounts for the category """ if isinstance(limits, int) or isinstance(limits, float): limits = [limits] if not isinstance(limits, list) and not isinstance(limits, tuple): raise ValueError("limits should be a list or tuple") if not isinstance(df, pd.DataFrame): raise ValueError("df should be a pandas DataFrame") if not all([c in df.columns for c in [amount_col, categ_col, date_col]]): raise ValueError("amount_col, categ_col, date_col should be columns in df") if not all([isinstance(limit, (int, float)) for limit in limits]): raise ValueError("limits should be a list of integers or floats") df1 = df.sort_values([categ_col, date_col]).copy() categ = "" running_total = 0 running_total_counts = 0 date_back_bracket = df1[date_col].min() df1["highest_limit_hit_just_below"] = None df1["highest_limit_hit_above"] = None for n, r in df1.iterrows(): limits_hit_just_below = [] limits_hit_above = [] if categ != r[categ_col]: categ = r[categ_col] running_total = 0 running_total_counts = 0 date_back_bracket = r[date_col] if r[date_col] > date_back_bracket + timedelta(days=days_horizon): running_total = 0 running_total_counts = 0 date_back_bracket = r[date_col] running_total += r[amount_col] running_total_counts += 1 for limit in limits: if ( running_total >= limit - limit * tolerance_perc and running_total < limit ) or (running_total >= limit - tolerance_abs and running_total < limit): limits_hit_just_below.append(limit) if running_total >= limit: limits_hit_above.append(limit) highest_limit_hit_just_below = ( max(limits_hit_just_below) if limits_hit_just_below else None ) highest_limit_hit_above = max(limits_hit_above) if limits_hit_above else None df1.loc[n, "highest_limit_hit_just_below"] = highest_limit_hit_just_below df1.loc[n, "highest_limit_hit_above"] = highest_limit_hit_above df1.loc[n, "running_total"] = running_total df1.loc[n, "running_total_counts"] = running_total_counts df1.loc[n, "split_transaction_hit_flag"] = any( [ (highest_limit_hit_above and running_total_counts > 1), highest_limit_hit_just_below, ] ) return df1