# Source code for pydit.wrangling.keyword_search_batch

"""Functions to sweep a dataframe for keywords and return a matrix of matches."""

import logging
import re
import math

import pandas as pd
import numpy as np
from pandas import DataFrame


# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)


def _keyword_search_re(keywords, df, case_sensitive):
    """Search the ``dummy_keyword_search`` column with regular expressions.

    Every pattern is compiled before any searching starts, so an invalid
    pattern is reported without doing partial work. Each pattern is then
    applied with ``re.search`` semantics (a match anywhere in the text
    counts) to every value of ``df["dummy_keyword_search"]``.

    Arguments
    ---------
    keywords : list of strings
        The regular expressions to search for, as strings.
    df : pandas.DataFrame
        The dataframe to search. It must contain a column named
        ``dummy_keyword_search``; the values are passed to ``re.search``
        as-is.
    case_sensitive : bool
        If True the patterns are compiled as given, otherwise they are
        compiled with ``re.IGNORECASE``.

    Returns
    -------
    pandas.DataFrame
        One boolean column per pattern, named ``kw_match01``,
        ``kw_match02``, ... in the order of ``keywords``. The numeric
        suffix is zero padded to the number of digits of the keyword count
        (at least two). The result has a default integer index, not the
        index of ``df``. An empty ``keywords`` list yields an empty
        dataframe.

    Raises
    ------
    ValueError
        If any of the patterns cannot be compiled.

    """
    flags = 0 if case_sensitive else re.IGNORECASE

    # Compile everything up front so that a bad pattern fails fast.
    re_compiled = []
    for re_text in keywords:
        try:
            re_compiled.append(re.compile(re_text, flags))
        except Exception as e:
            # re.compile can fail with several exception types (re.error,
            # TypeError for non-strings, ...); callers get a single
            # ValueError. Formatting instead of concatenating keeps the
            # message itself from failing when re_text is not a string.
            raise ValueError(f"Invalid regular expression: {re_text}") from e

    if not re_compiled:
        # Nothing to search for: return an empty frame.
        return pd.DataFrame()

    # Digits needed for the highest keyword number, never fewer than two.
    zeroes = max(len(str(len(re_compiled))), 2)
    values = df["dummy_keyword_search"].values

    matches = {}
    for n, p in enumerate(re_compiled, start=1):
        logger.info("Searching for keyword: %s", p.pattern)
        # An explicit bool dtype keeps this working for a zero-row input.
        res = np.fromiter(
            (p.search(x) is not None for x in values), dtype=bool
        )
        logger.info("Found %d matches", int(res.sum()))
        matches["kw_match" + str(n).zfill(zeroes)] = res

    # Build the frame in one go rather than inserting column by column.
    return pd.DataFrame(matches)


def _keyword_search_str(keywords, df, case_sensitive):
    """Search the ``dummy_keyword_search`` column with plain substring tests.

    No regular expressions are involved: a keyword matches a row when it
    occurs anywhere inside that row's text (Python ``in`` operator).

    Arguments
    ---------
    keywords : list of strings
        The keywords to search for.
    df : pandas.DataFrame
        The dataframe to search. It must contain a column named
        ``dummy_keyword_search``.
    case_sensitive : bool
        If True the keywords and the text are compared as given, otherwise
        both are lower-cased before comparing.

    Returns
    -------
    pandas.DataFrame
        One boolean column per keyword, named ``kw_match01``,
        ``kw_match02``, ... in the order of ``keywords``. The numeric
        suffix is zero padded to the number of digits of the keyword count
        (at least two), the same scheme the regexp variant uses. The result
        has a default integer index, not the index of ``df``. An empty
        ``keywords`` list yields an empty dataframe.

    """
    keywords = list(keywords)
    column = df["dummy_keyword_search"]
    if case_sensitive:
        # The values are used directly.
        listed = column.tolist()
    else:
        keywords = [x.lower() for x in keywords]
        listed = column.str.lower().tolist()

    if not keywords:
        # Nothing to search for: return an empty frame.
        return pd.DataFrame()

    # Digits needed for the highest keyword number, never fewer than two.
    zeroes = max(len(str(len(keywords))), 2)

    matches = {}
    for i, kw in enumerate(keywords, start=1):
        # An explicit bool dtype keeps the column boolean for zero rows.
        matches["kw_match" + str(i).zfill(zeroes)] = np.fromiter(
            (kw in text for text in listed), dtype=bool
        )

    # Build the frame in one go rather than inserting column by column.
    return pd.DataFrame(matches)