Source code for pydit.wrangling.merge

"""Module to merge dataframes with prefixes or suffixes for all fields
not just those that have colissions.
This is useful when you want to keep all the columns from both dataframes
and avoid name clashes.

"""

import logging
import pandas as pd

logger = logging.getLogger(__name__)



[docs]
def merge_smart(df1, df2, rename_merge_key=False, **kwargs):
    """Merge two dataframes, with prefixes or suffixes for all fields not just collissions.

    Parameters
    ----------
    df1 : pandas.DataFrame
        The left dataframe
    df2 : pandas.DataFrame
        The right dataframe
    rename_merge_key : bool, optional, default False
        If True, the key columns will be renamed with the suffixes or prefixes
    kwargs : the keyword arguments you can pass to pandas.DataFrame.merge()
        See https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.merge.html

    Returns
    -------
    pandas.DataFrame
        A new merged dataframe

    """

    merge_args = kwargs.copy()
    dfa = df1.copy()
    dfb = df2.copy()
    if on := merge_args.pop("on", None):
        left_on = on
        right_on = on
    else:
        left_on = merge_args.pop("left_on", None)
        right_on = merge_args.pop("right_on", None)
    if not (left_on and right_on):
        raise ValueError("Missing: on, left_on or right_on")

    left_on = [*left_on]  # unpack the iterable to ensure it is a list
    right_on = [*right_on]

    if suff := merge_args.pop("suffixes", None):
        if suff[0]:
            cols = []
            for c in dfa.columns:
                if not rename_merge_key and c in left_on:
                    cols.append(c)
                else:
                    cols.append(str(c) + str(suff[0]))
            if rename_merge_key:
                left_on = [v + str(suff[0]) for v in left_on]
            dfa.columns = cols
        if suff[1]:
            cols = []
            for c in dfb.columns:
                if (not rename_merge_key) and (c in right_on):
                    cols.append(c)
                else:
                    cols.append(str(c) + str(suff[1]))
            if rename_merge_key:
                right_on = [v + str(suff[1]) for v in right_on]
            dfb.columns = cols

    if pref := merge_args.pop("prefixes", None):
        if pref[0]:
            cols = []
            for c in dfa.columns:
                if (not rename_merge_key) and (c in left_on):
                    cols.append(c)
                else:
                    cols.append(str(pref[0] + str(c)))

            if rename_merge_key:
                left_on = [str(pref[0]) + str(v) for v in left_on]
            dfa.columns = cols
        if pref[1]:
            cols = []
            for c in dfb.columns:
                if (not rename_merge_key) and (c in right_on):
                    cols.append(c)
                else:
                    cols.append(str(pref[1]) + str(c))
            if rename_merge_key:
                right_on = [str(pref[1]) + str(v) for v in right_on]
            dfb.columns = cols
    dfres = pd.merge(dfa, dfb, left_on=left_on, right_on=right_on, **merge_args)
    return dfres




[docs]
def merge_outer_and_split(
    dffact,
    dfdim,
    on=None,
    left_on=None,
    right_on=None,
    suffixes=(None, None),
):
    """Merge two dataframes, and keep the joinable 1:1 or 1:n
    the nan or unmatched are returned in separate dataframes or Excel sheets.
    Under the bonnet it will
    - filter nulls from the key columns (and store in a separate dataframe
    to return those exceptions
    - on the clean version will do an outer merge with indicator=True
    - will split the results into the 3 dataframes (both, left, right)
    - will write the results to an Excel file if specified

    It will also check for duplicates in the key columns and exit if found in
    the right dataframe (presumed to be a dimension/ file ) or warn
    if they are in the left dataframe (presumably the transaction/fact file).


    Parameters
    ----------
    dffact : pandas.DataFrame
        The left dataframe
    dfdim : pandas.DataFrame
        The right dataframe
    left_on : list
        The list of key columns to join on in the left dataframe
    right_on : list
        The list of key columns to join on in the right dataframe
    suffixes : tuple, optional
        The suffixes to use for the left and right dataframes, by default (None,None)

    Returns
    -------
    tuple
        A tuple of dataframes, (both, left, right, left_na, right_na)
        both : the rows that matched
        left : the rows that matched but had nulls in the right dataframe
        right : the rows that matched but had nulls in the left dataframe
        left_na : the rows that have nulls in the specified key columns in the left dataframe
        right_na : the rows that have nulls in the specified key columns in the right dataframe

    Raises
    ------
    ValueError
        If the right dataframe has duplicates in the key columns

    """
    if on is not None:
        left_on = on
        right_on = on
    elif left_on is None or right_on is None:
        raise ValueError("You must specify either on or left_on and right_on")
    else:
        pass

    dffact_na = dffact[dffact[left_on].isnull()]
    dfdim_na = dfdim[dfdim[right_on].isnull()]
    dffact_notna = dffact.dropna(subset=left_on)
    dfdim_notna = dfdim.dropna(subset=right_on)
    # check if dropna creates a copy or is some sort of view/filter
    logger.info(f"Detected {dffact_na.shape[0]} null key values in the left dataframe")
    logger.info(f"Detected {dfdim_na.shape[0]} null key values in the right dataframe")
    if (dupe_count := dfdim_notna.duplicated(subset=right_on).values.sum()) > 0:
        raise ValueError(
            f"Right dataframe has {dupe_count} duplicates on field {right_on}, fix and retry"
        )
    if (dupe_count := dffact_notna.duplicated(subset=left_on).values.sum()) > 0:
        print(
            f"For info , your left dataframe has {dupe_count} duplicates on field {right_on}, should be ok if it is a fact table"
        )
    dfouter = pd.merge(
        dffact_notna,
        dfdim_notna,
        how="outer",
        left_on=left_on,
        right_on=right_on,
        indicator=True,
        suffixes=suffixes,
    )
    dfboth = dfouter[dfouter["_merge"] == "both"].drop(columns=["_merge"])
    dfleft = dfouter[dfouter["_merge"] == "left_only"].drop(columns=["_merge"])
    dfright = dfouter[dfouter["_merge"] == "right_only"].drop(columns=["_merge"])
    return dfboth, dfleft, dfright, dffact_na, dfdim_na