Source code for pydit.wrangling.blanks

"""Checks for various types of nulls/blanks in a dataframe and returns counts."""

import pandas as pd
import numpy as np

import logging

logger = logging.getLogger(__name__)


[docs] def check_blanks( obj, columns=None, include_zeroes=False, include_nullstrings_and_spaces=False, totals_only=True, silent=False, ): """ Returns by default a summary dictionary with column names as key and count of blanks as value, for the columns selected (or all if no column list provided) If "total_only" is False it would return detailed information of the blanks original/copied dataframe with: a) one boolean column per input columns, True when there are blanks in that record b) a summary boolean column if any of the previous is true Check out https://github.com/ResidentMario/missingno library for a nice visualization (seems to come with Anaconda) Parameters ---------- obj : DataFrame or Series The dataframe or series to check for blanks columns : list, optional, default None The columns to check for blanks. If None, all columns are checked. include_zeroes : bool, optional, default False If True, checks for zeroes as blanks include_nullstrings_and_spaces : bool, optional, default False If True, checks for null strings and spaces as blanks totals_only : bool, optional, default False If True, only the total counts are returned silent : bool, optional, default False If True, logging level set to critical, ie no info messages shown Returns ------- pandas.DataFrame A dataframe with the counts of blanks in each column. Or a summary dictionary with various counts. See also -------- profile_dataframe() : Profile the dataframe, includes metrics on blanks Examples -------- Basic usage with a DataFrame containing NaN values: >>> import pandas as pd >>> import numpy as np >>> df = pd.DataFrame({ ... 'A': [1, 2, None, 4], ... 'B': ['x', 'y', None, 'z'], ... 'C': [1.0, 2.0, 3.0, 4.0] ... }) >>> result = check_blanks(df, silent=True) >>> result['A'] 1 >>> result['B'] 1 >>> result['C'] 0 Test with specific columns: >>> result = check_blanks(df, columns=['A', 'B'], silent=True) >>> len(result) 2 >>> 'C' in result False Test including zeroes as blanks: >>> df_zeros = pd.DataFrame({'A': [1, 0, 3], 'B': [0, 2, 0]}) >>> result = check_blanks(df_zeros, include_zeroes=True, silent=True) >>> result['A'] 1 >>> result['B'] 2 Test including null strings and spaces: >>> df_strings = pd.DataFrame({ ... 'text': ['hello', '', ' ', 'world', None] ... }) >>> result = check_blanks(df_strings, include_nullstrings_and_spaces=True, silent=True) >>> result['text'] 3 Test with Series input: >>> series = pd.Series([1, None, 3, None], name='my_series') >>> result = check_blanks(series, silent=True) >>> result['my_series'] 2 Test with totals_only=False to get detailed DataFrame: >>> df_small = pd.DataFrame({'A': [1, None], 'B': [None, 2]}) >>> result = check_blanks(df_small, totals_only=False, silent=True) >>> 'A_blanks' in result.columns True >>> 'has_blanks' in result.columns True >>> int(result['has_blanks'].sum()) 2 """ if silent: logger.setLevel(logging.CRITICAL) # We validate and standardise the input if not isinstance(obj, (pd.DataFrame, pd.Series)): raise TypeError("Expecting a dataframe or a Series") if isinstance(obj, pd.Series): if not obj.name: name = "data" else: name = obj.name df = obj.to_frame(name=name) else: df = obj.copy() if isinstance(columns, list): cols = columns elif isinstance(columns, str): cols = [columns] else: cols = df.columns logger.debug("No columns provided, checking all columns") if not set(cols).issubset(set(df.columns)): raise ValueError("Column(s) provided not found in the dataframe") fields = ",".join(cols) logger.info("Checking for blanks in %s", fields) if include_zeroes: logger.info("Including zeroes as blanks") if include_nullstrings_and_spaces: logger.info("Including null strings and spaces as blanks") total_results = {} for c in cols: s1 = pd.isna(df[c]) if include_zeroes: s2 = df[c] == 0 else: s2 = pd.Series(False, index=df.index) if include_nullstrings_and_spaces: s3 = ~df[c].fillna("").astype(str).str.strip().astype(bool) else: s3 = pd.Series(False, index=df.index) s = s1 | s2 | s3 if not totals_only: df[c + "_blanks"] = s total_results[c] = int(s.sum()) if not totals_only: new_cols = [c + "_blanks" for c in cols] df["has_blanks"] = np.any(df[new_cols], axis=1) logger.info( "Blanks per column:%s", ",".join([f"{k}:{v}" for k, v in total_results.items()]) ) if totals_only: return total_results return df