Source code for pydit.statistics.simulation

"""This module provides a class to generate
parametrised distributions for simulation purposes

It is still in experimental stage and may not be fully functional
or tested. Use at your own risk.


"""

import math
import numpy as np

import matplotlib.pyplot as plt
from scipy.stats import lognorm, norm, uniform, triang



[docs]
class Simulation:
    """convenience class to generate parametrised distributions"""

    def __init__(
        self,
        mean=None,
        std_dev=None,
        lower_bound=None,
        upper_bound=None,
        probability=None,
        seed=None,
    ):
        try:
            assert (
                lower_bound is None and upper_bound is None and probability is None
            ) or (lower_bound < upper_bound and probability > 0 and probability < 1.0)
        except AssertionError:
            print(
                "Ensure you provide either: a) lower_bound, upper_bound and probability,  or b) mean and std_dev"
            )
        try:
            assert (mean is None and std_dev is None) or (mean != 0 and std_dev > 0)
        except AssertionError:
            print("Error in the parameters mean and std_dev:", mean, std_dev)
        self.mean = mean  # mean
        self.std_dev = std_dev  # standard deviation
        self.seed = seed
        self.samples = None
        self.lower_bound = lower_bound
        self.upper_bound = upper_bound
        self.probability = probability  # probability of the confidence interval
        np.random.seed(self.seed)
        self.random_series = None


[docs]
    def generate(self, size=1, random_series=None):
        """generate simulation, by default uses normal distribution"""
        alpha = 1 - self.probability
        z = norm.ppf(1 - alpha / 2)
        if self.mean is None:
            self.mean = (self.upper_bound + self.lower_bound) / 2
        if self.std_dev is None:
            self.std_dev = (self.upper_bound - self.lower_bound) / (2 * z)
        if (random_series is not None) and (len(random_series) >= size):
            print("Random series provided")
            self.random_series = random_series[:size]
        else:
            # uniform distribution 0,1
            self.random_series = np.random.uniform(0, 1, size)

        self.samples = norm.ppf(q=self.random_series, loc=self.mean, scale=self.std_dev)
        self.simulated_mean = np.mean(self.samples)
        self.simulated_median = np.median(self.samples)
        self.simulated_std_dev = np.std(self.samples)
        self.size = size



[docs]
    def plot_log_scale(self):
        """plot the histogram of the samples in log scale"""
        plt.hist(self.samples, bins=100, edgecolor="k", alpha=0.7)
        plt.title("Monte Carlo Simulation")
        plt.xlabel("Values")
        plt.ylabel("Frequency")
        plt.xscale("log")
        plt.yscale("log")
        plt.show()



[docs]
    def plot(self, bins=100):
        """plot the histogram of the samples"""
        hist, bins, _ = plt.hist(self.samples, bins=bins, edgecolor="k", alpha=0.7)
        plt.title("Monte Carlo Simulation")
        plt.xlabel("Values")
        plt.ylabel("Frequency")
        # count of observations within confidence interval
        num_within_interval = np.sum(
            (self.samples >= self.lower_bound) & (self.samples <= self.upper_bound)
        )
        # Calculate the proportion of observations within the confidence interval
        proportion_within_interval = num_within_interval / self.size

        plt.axvline(
            x=self.lower_bound,
            color="r",
            linestyle="dashed",
            linewidth=2,
            label=f"Lower bound ({self.lower_bound})",
        )
        plt.axvline(
            x=self.upper_bound,
            color="g",
            linestyle="dashed",
            linewidth=2,
            label=f"Upper bound ({self.upper_bound})",
        )
        plt.axvline(
            x=self.mean,
            color="b",
            linestyle="dashed",
            linewidth=2,
            label=f"Mean ({int(self.mean)})",
        )

        # add mean, median and std deviation to the plot as floating box
        plt.text(
            0.8,
            0.9,
            f"Mean: {self.simulated_mean:,.2f}\nMedian: {self.simulated_median:,.2f}\nStd Dev: {self.simulated_std_dev:,.2f}\nProportion within interval:{proportion_within_interval:.2}",
            horizontalalignment="center",
            verticalalignment="center",
            transform=plt.gca().transAxes,
        )
        plt.show()



[docs]
    def statistics(self):
        """returns a dictionary with all the statistics"""
        return {
            "mean": self.simulated_mean,
            "median": self.simulated_median,
            "std_dev": self.simulated_std_dev,
        }





[docs]
class SimulationLognormal(Simulation):
    """Generates lognormal distribution with the lower and upper bound provided"""

    def __init__(
        self,
        mean=None,
        std_dev=None,
        lower_bound=None,
        upper_bound=None,
        probability=None,
        seed=None,
    ):
        super().__init__(
            mean=mean,
            std_dev=std_dev,
            lower_bound=lower_bound,
            upper_bound=upper_bound,
            probability=probability,
            seed=seed,
        )


[docs]
    def generate(
        self, size=1, random_series=None
    ):  # generate the lognormal distribution
        assert size > 0, "size must be greater than 0"
        if (random_series is not None) and (len(random_series) >= size):
            self.random_series = random_series[:size]
        else:
            # uniform distribution 0,1
            self.random_series = np.random.uniform(0, 1, size)

        if self.mean is not None and self.std_dev is not None:
            # self.samples = np.random.lognormal(self.mean, self.std_dev, size)
            self.samples = lognorm(self.mean, scale=self.std_dev).ppf(
                self.random_series
            )
        elif self.lower_bound is not None and self.upper_bound is not None:
            # Convert the confidence level to a z-score
            z_score = norm.ppf((1 + self.probability) / 2)
            # Calculate the log-transformed mean and confidence interval
            log_lower_bound = np.log(self.lower_bound)
            log_upper_bound = np.log(self.upper_bound)
            # Calculate the mean and standard deviation of the underlying normal distribution
            mu_Y = (log_lower_bound + log_upper_bound) / 2
            sigma_Y = (log_upper_bound - log_lower_bound) / (2 * z_score)
            # samples_normal = np.random.normal(mu_Y, sigma_Y, size)
            # Theoretical mean from a lognormal based on the upper and lower band

            lognormal_mean = np.exp(mu_Y + (sigma_Y**2) / 2)
            self.mean = lognormal_mean
            # We now transform that normal into exp to generate the lognormal distribution
            samples_normal = norm.ppf(q=self.random_series, loc=mu_Y, scale=sigma_Y)
            self.samples = np.exp(samples_normal)
        else:
            raise ValueError(
                "Either mean and sigma or lower and upper bounds must be provided"
            )

        self.simulated_mean = np.mean(self.samples)
        self.simulated_median = np.median(self.samples)
        self.simulated_std_dev = np.std(self.samples)
        self.size = size





[docs]
class SimulationUniform(Simulation):
    """Generates uniform distribution with the lower and upper bound provided"""

    def __init__(self, lower_bound=None, upper_bound=None, probability=None, seed=None):
        mean = (lower_bound + upper_bound) / 2
        std_dev = (upper_bound - lower_bound) / math.sqrt(12)
        super().__init__(
            mean=mean,
            std_dev=std_dev,
            lower_bound=lower_bound,
            upper_bound=upper_bound,
            probability=probability,
            seed=seed,
        )


[docs]
    def generate(self, size=1, random_series=None):  # generate the uniform distribution
        assert size > 0, "size must be greater than 0"
        if (random_series is not None) and (len(random_series) >= size):
            self.random_series = random_series[:size]
        else:
            # uniform distribution 0,1
            self.random_series = np.random.uniform(0, 1, size)

        if self.lower_bound is not None and self.upper_bound is not None:
            # self.samples = np.random.uniform(self.lower_bound, self.upper_bound, size)
            self.samples = uniform.ppf(
                q=self.random_series, loc=self.mean, scale=self.std_dev
            )
        else:
            raise ValueError("Both lower and upper bounds must be provided")

        self.simulated_mean = np.mean(self.samples)
        self.simulated_median = np.median(self.samples)
        self.simulated_std_dev = np.std(self.samples)
        self.size = size





[docs]
class SimulationTriangular(Simulation):
    """Generates triangular distribution with the lower and upper bound provided"""

    def __init__(
        self, mode=None, lower_bound=None, upper_bound=None, probability=None, seed=None
    ):
        if lower_bound is None or upper_bound is None or mode is None:
            raise ValueError(
                "All mode, lower and upper bounds must be provided for the triangular distribution"
            )
        std_dev = math.sqrt(
            (
                lower_bound**2
                + upper_bound**2
                + mode**2
                - lower_bound * upper_bound
                - lower_bound * mode
                - upper_bound * mode
            )
            / 18
        )
        mean = (lower_bound + upper_bound + mode) / 3
        super().__init__(
            mean=mean,
            std_dev=std_dev,
            lower_bound=lower_bound,
            upper_bound=upper_bound,
            probability=probability,
            seed=seed,
        )
        self.mode = mode
        self.mode_scaled = (mode - lower_bound) / (
            upper_bound - lower_bound
        )  # c parameter for the triangular distribution


[docs]
    def generate(
        self, size=1, random_series=None
    ):  # generate the triangular distribution
        assert size > 0, "size must be greater than 0"
        if (random_series is not None) and (len(random_series) >= size):
            self.random_series = random_series[:size]
        else:
            # uniform distribution 0,1
            self.random_series = np.random.uniform(0, 1, size)

        if (
            self.mode is not None
            and self.lower_bound is not None
            and self.upper_bound is not None
        ):
            # self.samples = np.random.triangular(left=self.lower_bound, mode=self.mode, right=self.upper_bound, size=size)
            self.samples = triang.ppf(
                q=self.random_series,
                c=self.mode_scaled,
                loc=self.lower_bound,
                scale=self.upper_bound - self.lower_bound,
            )

        else:
            raise ValueError(
                "All mode, lower and upper bounds must be provided for the triangular distribution"
            )

        self.simulated_mean = np.mean(self.samples)
        self.simulated_median = np.median(self.samples)
        self.simulated_std_dev = np.std(self.samples)
        self.size = size