From 893250e7b566e50be443592b76357dbbea7e868f Mon Sep 17 00:00:00 2001
From: Daniel Heras Quesada
Date: Fri, 19 Dec 2025 00:24:16 +0100
Subject: [PATCH] minor statistics definitions

---
Review notes:
- generic_critical_z_value: use the "probability" parameter; the body read an
  undefined name "p", so every call raised NameError.
- confidence_interval: call margin_of_error() and stop rebinding a local named
  like the callee; the original line raised UnboundLocalError.
- confidence_interval: the unused "critical_z" local was dropped; see the
  NOTE(review) comment in the function about "probability" vs "z_value".

 src/modules/statistics.py | 32 ++++++++++++++++++++++++++++++++
 1 file changed, 32 insertions(+)

diff --git a/src/modules/statistics.py b/src/modules/statistics.py
index 2a65059..becb530 100644
--- a/src/modules/statistics.py
+++ b/src/modules/statistics.py
@@ -1,6 +1,13 @@
+## This module represents the third chapter of the book
+## "Essential Math for Data Science" - Thomas Nield
+## Chapter 3 - Statistics
+
 from math import sqrt, pi, e, exp
 from scipy.stats import norm
 
+import random
+import plotly.express as px
+
 def mean(list):
     return sum(list) / len(list)
 
@@ -73,6 +80,27 @@ def z_score(value, data_mean, std_deviation):
 def coeficient_of_variation(std_deviation, mean):
     return (std_deviation / mean)
 
+def test_central_limit_theorem(sample_size, sample_count):
+    x_values = [(sum([random.uniform(0.0,1.0) for i in range(sample_size)]) / sample_size) for _ in range(sample_count)]
+    y_values = [1 for _ in range(sample_count)]
+    px.histogram(x=x_values, y=y_values, nbins=20).show()
+
+def generic_critical_z_value(probability):
+    norm_dist = norm(loc=0.0, scale=1.0)
+    left_tail_area = (1.0 - probability) / 2.0
+    upper_area = 1.0 - ((1.0 - probability) / 2.0)
+    return norm_dist.ppf(left_tail_area), norm_dist.ppf(upper_area)
+
+def margin_of_error(sample_size, standard_deviation, z_value):
+    return z_value * (standard_deviation / sqrt(sample_size)) # the sign follows z_value (negative for the lower tail, positive for the upper one)
+
+# Confidence interval for a population metric estimated from a sample (the interval we are "probability" sure the value will be in)
+def confidence_interval(probability, sample_size, standard_deviation, z_value, mean):
+    # NOTE(review): the caller-supplied z_value drives the interval; "probability" is not used in the computation yet - confirm intent
+    error_margin = margin_of_error(sample_size, standard_deviation, z_value)
+    return mean + error_margin, mean - error_margin
+
+
 def test_statistics_module():
     print("=== Statistics module ===")
     list = [ 1, 2, 3, 4, 5, 6]
@@ -100,3 +128,7 @@ def test_statistics_module():
     print("The House A is much more expensive because its z-score is higher.")
     print("The neighborhood of B has a coeficient of variation: {0}, and the one of A: {1}".format(coeficient_of_variation(3000, 140000), coeficient_of_variation(10000, 800000)))
     print("This means that the neighborhood of A has more spread in its prices")
+
+    ## Central limit theorem
+    test_central_limit_theorem(sample_size=1, sample_count=1000)
+    test_central_limit_theorem(sample_size=31, sample_count=1000)