Source code for pydit.statistics.profile_dataframe_statistics
"""Calculate basic dataframe metrics on data completion/quality/uniqueness"""
import logging
import numpy as np
import pandas as pd
from pandas.api.types import is_datetime64_any_dtype as is_datetime
logger = logging.getLogger(__name__)
[docs]
def profile_dataframe(obj, return_dict=False, unique_min=10):
    """Create a summary of a DataFrame with various statistics.

    Returns a DataFrame or a dict with common statistics to profile the data.
    In particular it focuses on unique (cardinality) blanks, nulls, and
    datetimes. The index is reset first, so it is profiled like any other
    column.

    Parameters
    ----------
    obj : pandas.DataFrame
        DataFrame to profile.
    return_dict : bool, optional, default=False
        If True, return a dict keyed by column name instead of a DataFrame.
    unique_min : int, optional, default=10
        Columns with at most this many distinct non-null values get their
        full value counts (nulls included) stored as a dict under
        ``value_counts``; other columns get an empty list.

    Returns
    -------
    DataFrame or dict
        One row (or one dict entry) per column with the metrics ``dtype``,
        ``records``, ``count_unique``, ``nans``, ``zeroes``,
        ``empty_strings``, ``cardinality_perc``, ``max``, ``min``, ``sum``,
        ``sum_abs``, ``std`` and ``value_counts``. In the DataFrame, metrics
        that do not apply to a column are left as missing values.

    Raises
    ------
    TypeError
        If ``obj`` is not a pandas.DataFrame.
    """
    if not isinstance(obj, pd.DataFrame):
        raise TypeError("df must be a pandas.DataFrame")

    # reset_index() already returns a new object, so the input is not mutated.
    df = obj.reset_index()
    dtypes = df.dtypes.to_dict()
    logger.info("Profiling dataframe: %s rows , %s columns", df.shape[0], df.shape[1])

    col_metrics = []
    for col, typ in dtypes.items():
        series = df[col]
        metrics = {
            "column": col,
            "dtype": typ,
            "records": len(series),
            "count_unique": series.nunique(),  # nulls are not counted
            "nans": int(series.isna().sum()),
        }
        if metrics["count_unique"] <= unique_min:
            metrics["value_counts"] = series.value_counts(dropna=False).to_dict()
        else:
            metrics["value_counts"] = []

        is_float = pd.api.types.is_float_dtype(typ)
        if is_float or pd.api.types.is_integer_dtype(typ):
            # Series reductions skip missing values; the builtin max()/min()
            # return order-dependent results when NaN is present and fail on
            # empty columns.
            metrics["max"] = series.max()
            metrics["min"] = series.min()
            metrics["sum"] = series.sum(skipna=True)
            metrics["sum_abs"] = series.abs().sum(skipna=True)
            metrics["std"] = series.std()
            metrics["zeroes"] = int((series == 0).sum())
        elif is_datetime(series):
            metrics["max"] = series.max()
            metrics["min"] = series.min()
        elif typ == "object" or pd.api.types.is_string_dtype(typ):
            values = series.fillna("").astype(str).str.strip()
            # Keep only digits, decimal points and minus signs so that text
            # such as "-1,234.5" can be parsed as a number.
            # TODO: refactor this regex for more general cases, doesn't cover
            # negative parentheses
            numeric_chars = values.str.replace(r"[^0-9.\-]+", "", regex=True)
            numeric = pd.to_numeric(
                numeric_chars[numeric_chars.str.len() > 0], errors="coerce"
            )
            # Count values that actually parse to zero, not every string that
            # merely contains a "0" digit.
            metrics["zeroes"] = int((numeric == 0).sum())
            if not numeric.empty:
                metrics["max"] = numeric.max()
                metrics["min"] = numeric.min()
            metrics["empty_strings"] = int((values.str.len() == 0).sum())
        col_metrics.append(metrics)

    df_metrics = pd.DataFrame(col_metrics)
    df_metrics["cardinality_perc"] = df_metrics["count_unique"] / df_metrics["records"]
    if return_dict:
        return df_metrics.set_index("column").T.to_dict()

    cols = [
        "column",
        "dtype",
        "records",
        "count_unique",
        "nans",
        "zeroes",
        "empty_strings",
        "cardinality_perc",
        "max",
        "min",
        "sum",
        "sum_abs",
        "std",
        "value_counts",
    ]
    # reindex() (rather than df_metrics[cols]) tolerates metrics that no column
    # produced, e.g. "empty_strings" when the frame has no text columns.
    return df_metrics.reindex(columns=cols)