Source code for pydit.wrangling.fuzzy_matching

"""Module with utility functions for fuzzy matching"""

import unicodedata
import re
import string
import logging

# pylint: disable=unused-variable

logger = logging.getLogger(__name__)



[docs]
def clean_string(
    t=None,
    keep_dot=False,
    keep_dash=False,
    keep_apostrophe=False,
    keep_ampersand=False,
    keep_spaces=True,
    space_to_underscore=True,
    to_case="lower",
):
    """Sanitise a value into a plain ASCII string.

    Applies the following transformations, in order:
    - Returns "" for missing values (None, NaN, pandas.NA)
    - Converts the value to str
    - Normalises unicode (NFKD) and drops anything that is not ASCII, which
      removes accents and other symbols
    - Optionally changes the case
    - Replaces dot, dash, apostrophe and ampersand with a space unless the
      corresponding ``keep_*`` option is set
    - Replaces every remaining character outside [a-zA-Z0-9.&'-] with a space
    - Strips leading/trailing spaces and collapses runs of spaces into a
      single underscore, a single space, or nothing

    This is a naive/slow implementation, useful for sanitising things like
    a filename or column headers or small datasets. If you need to cleanup
    large datasets, you need to look into pandas/numpy tools, and vectorised
    functions.

    Parameters
    ----------
    t : str
        String to clean. Other objects are converted with ``str()``.
    keep_dot : bool, optional, default False
        Whether to keep the dot in the string
    keep_dash : bool, optional, default False
        Whether to keep the dash in the string (useful for names)
    keep_apostrophe : bool, optional, default False
        Whether to keep the apostrophe in the string (useful for names)
    keep_ampersand : True, False, "expand", default False
        False replaces "&" with a space, "expand" replaces it with the
        literal text "and" (no padding is added, so "a&b" becomes "aandb"),
        any other truthy value keeps the "&".
    keep_spaces : bool, optional, default True
        Whether to keep the spaces in the string.
        If true we still remove double spaces, and by default we replace
        spaces with underscores.
    space_to_underscore : bool, optional, default True
        Whether to replace spaces with underscores (only applies when
        ``keep_spaces`` is true)
    to_case : str, optional, default "lower", choices=["lower", "upper"]
        Case to convert the string to. Any other value leaves the case
        unchanged.

    Returns
    -------
    str
        Cleaned string, or "" if the input is missing or cannot be
        converted to str.

    """
    if t is None:
        return ""
    try:
        # NaN is the only common value that is not equal to itself.
        if t != t:
            return ""
    except TypeError:
        # pandas.NA compares to NA, whose truth value is undefined and
        # raises TypeError; treat it as a missing value as well.
        return ""

    try:
        t = str(t)
    except Exception:
        # Best effort: an object that cannot be rendered cleans to nothing.
        return ""

    # we are going to normalize using NFKD
    # this will convert characters to their closest ASCII equivalent
    # e.g. é will become e
    # https://stackoverflow.com/questions/517923/what-is-the-best-way-to-remove-accents-normalize-in-a-python-unicode-string
    r = (
        unicodedata.normalize("NFKD", t)
        .encode("ascii", errors="ignore")
        .decode("utf-8")
    )

    if to_case == "lower":
        r = r.lower()
    elif to_case == "upper":
        r = r.upper()

    if not keep_dot:
        r = re.sub(r"[\.]", " ", r)
    if not keep_dash:
        r = re.sub(r"[-]", " ", r)
    if not keep_apostrophe:
        r = re.sub(r"[']", " ", r)
    if not keep_ampersand:
        r = re.sub(r"[&]", " ", r)
    elif keep_ampersand == "expand":
        r = re.sub(r"[&]", "and", r)

    # Anything that survived the optional removals above and is still not
    # alphanumeric (or one of the keepable symbols) becomes a space.
    r = re.sub(r"[^a-zA-Z0-9\.\-\&']", " ", r)
    r = r.strip()

    if keep_spaces:
        if space_to_underscore:
            r = re.sub(" +", "_", r)
        else:
            r = re.sub(" +", " ", r)
    else:
        r = re.sub(" +", "", r)
    return r



# We enable caching in this small piece; maxsize can be set to None (unlimited),
# but we could add a limit: apparently having an actual limit makes it
# marginally faster in some conditions.



[docs]
def create_fuzzy_key(
    df,
    input_col,
    output_col="fuzzy_key",
    token_sort=None,
):
    """
    Create a fuzzy key column for a dataframe.

    The input column is lowercased, common company suffixes (ltd, plc, inc,
    llp, limited) and titles (mr, mrs, miss) are removed, a leading "o'" is
    folded into "o", and the result is passed through clean_string with
    "&" expanded to "and".

    Note that this key preserves the spaces after tokenisation, as we think
    this may work better when computing the Levenshtein distance. If you want
    a more compact string you need to tweak the code to set the clean_string
    function to remove spaces.

    Parameters
    ----------
    df : pd.DataFrame
        The dataframe to create the fuzzy key for. It is not modified.
    input_col : str
        The column to create the fuzzy key from
    output_col : str, optional
        The column to create the fuzzy key to, by default "fuzzy_key"
    token_sort : str, optional
        Whether to use a token sorting algorithm or not and rely on other libraries.
        Can be "token_set_sort" (sorted unique tokens), "token_sort" (sorted
        tokens, duplicates kept) or None (tokens left in their original order)

    Returns
    -------
        pandas.DataFrame
            A copy of ``df`` with the fuzzy key stored in ``output_col``

    Raises
    ------
    ValueError
        If ``token_sort`` is not one of the accepted values

    """
    if token_sort not in [None, "token_set_sort", "token_sort"]:
        raise ValueError(
            f"token_sort must be None, token_set_sort or token_sort, got {token_sort}"
        )

    # Built once and shared by every row instead of once per value.
    strip_punctuation = str.maketrans("", "", string.punctuation)

    def _sorted_tokens(s, unique):
        """Return the whitespace-separated tokens of s, sorted and re-joined."""
        tokens = str(s).translate(strip_punctuation).split()
        if unique:
            tokens = set(tokens)
        return " ".join(sorted(tokens))

    df = df.copy()

    # Lowercase, drop company suffixes and personal titles, then sanitise.
    # The word boundaries (\b) make sure only whole words are removed:
    # without them "acme incorporated" would lose its " inc" and
    # "dismiss the" would lose its "miss ".
    df[output_col] = (
        df[input_col]
        .fillna("")
        .str.lower()
        .replace(r" (ltd|plc|inc|llp|limited)\b", " ", regex=True)
        .replace(r"\b(mr\.?|mrs\.?|miss\.?) ", " ", regex=True)
        .replace("o'", "o", regex=True)
        .replace(" +", " ", regex=True)
        .str.strip()
        .apply(
            lambda v: clean_string(
                v, keep_spaces=True, space_to_underscore=False, keep_ampersand="expand"
            )
        )
    )

    if token_sort == "token_set_sort":
        df[output_col] = df[output_col].apply(lambda v: _sorted_tokens(v, unique=True))

    if token_sort == "token_sort":
        df[output_col] = df[output_col].apply(lambda v: _sorted_tokens(v, unique=False))

    return df