feat: statistics chapter done

This commit is contained in:
2026-02-22 16:45:11 +01:00
parent 106205935e
commit dd21fd19c6
3 changed files with 110 additions and 7 deletions

View File

@@ -3,11 +3,12 @@ import unittest
from modules.essential_math.examples.statistics_example import (
normal_distribution_example,
normal_distribution_exercise,
t_distribution_example,
basic_statistic_concepts_example,
z_scores_example,
final_exercises
)
# Manual runner for the chapter's statistics examples: uncomment whichever
# example you want to execute; only the final exercises run by default.
if __name__=="__main__":
    # basic_statistic_concepts_example()
    # normal_distribution_example()
    # z_scores_example()
    final_exercises()

View File

@@ -16,6 +16,7 @@ from modules.essential_math.statistics import (
generic_critical_z_value,
margin_of_error,
confidence_interval,
get_critical_value_range_t,
)
def basic_statistic_concepts_example():
@@ -59,6 +60,55 @@ def normal_distribution_example():
expected_value = inverse_cumulative_density_function(target_probability, mean, std_dev);
print(">> For a probability of .5 we expect the value: ", expected_value)
def normal_distribution_exercise():
    """Worked hypothesis-testing exercise on a normal distribution.

    Population: cold recovery time with mean 18 days and std dev 1.5 days.
    First computes the probability of recovering within 15-21 days, then
    tests (one- and two-tailed, by critical value and by p-value) whether a
    drug that produced a sample mean of 16 days had a real effect.
    """
    mu = 18        # population mean recovery time, in days
    sigma = 1.5    # population standard deviation, in days
    window_start = 15
    window_end = 21

    # P(15 <= X <= 21) as a difference of CDF values.
    p_in_window = normal_cumulative_density_function(window_end, mu, sigma) - normal_cumulative_density_function(window_start, mu, sigma)
    print("Chances of recovering from a cold between 15 and 21 days: ", p_in_window)
    print("Chances of recovering before 15 days or after 21: ", 1.0 - p_in_window)
    # The normal distribution is symmetric, so each tail holds half the rest.
    print("Chances of recovering before 15 days: ", (1.0 - p_in_window) / 2)

    # Give the drug to 40 people and observe a sample mean of 16 days.
    # Test whether the drug improved recovery or the result is chance.
    treated_mean = 16
    alpha = 0.05  # conventional significance threshold

    ## One-tailed test: the inverse CDF gives the cutoff value below which
    ## only alpha of the population mass falls.
    critical_mean = inverse_cumulative_density_function(alpha, mu, sigma)
    if treated_mean <= critical_mean:
        print("The rug (drug) worked.")
    else:
        print("The rug (drug) did nothing.")

    ## One-tailed test restated with a p-value.
    p_value = normal_cumulative_density_function(treated_mean, mu, sigma)
    if p_value <= alpha:
        print("The rug (drug) worked.")
    else:
        print("The rug (drug) did nothing.")

    ## Two-tailed test: alpha is split across both tails, so the effect must
    ## be twice as extreme; this also catches the drug making recovery worse.
    half_alpha = alpha / 2
    lower_critical = inverse_cumulative_density_function(half_alpha, mu, sigma)
    upper_critical = inverse_cumulative_density_function(1.0 - half_alpha, mu, sigma)
    if lower_critical <= treated_mean <= upper_critical:
        print("The rug (drug) did nothing.", lower_critical, upper_critical)
    else:
        print("The rug (drug) worked.", lower_critical, upper_critical)

    ## Two-tailed test with a p-value: add the symmetric right-tail mass.
    left_p = normal_cumulative_density_function(treated_mean, mu, sigma)
    mirrored_mean = mu + (mu - treated_mean)  # 16 reflected around 18 -> 20
    right_p = 1.0 - normal_cumulative_density_function(mirrored_mean, mu, sigma)
    p_value = left_p + right_p
    if p_value >= alpha:
        print("The rug (drug) did nothing.", p_value)
    else:
        print("The rug (drug) worked.", p_value)
    #### CONCEPT: P-hacking, searching for data (in big data scenarios) that passes the p_value < 0.05 test and claiming for a relation.
def z_scores_example():
print("== Z-scores ==")
@@ -72,3 +122,47 @@ def central_limit_theorem_example():
## Central limit theorem
test_central_limit_theorem(sample_size=1, sample_count=1000)
test_central_limit_theorem(sample_size=31, sample_count=1000)
def t_distribution_example():
    """Print the critical t-value range for 95% confidence on a 25-item sample."""
    confidence_level = 0.95
    n = 25
    lo, hi = get_critical_value_range_t(confidence_level, n)
    print("The confidence interval is: ", lo, hi)
def final_exercises():
    """End-of-chapter exercises 1-4.

    1. Sample mean / standard deviation of measured pool widths.
    2. Probability mass of a normal distribution between two values.
    3. Confidence interval around a sampled filament diameter.
    4. Two-tailed significance test on a sales campaign.
    """
    # 1.
    widths = (1.78, 1.75, 1.72, 1.74, 1.77)
    width_mean = mean(widths)
    width_std = standard_deviation(widths, True)
    print("1: ", width_mean, width_std)

    # 2. P(20 <= X <= 30) for X ~ Normal(42, 8).
    mu = 42
    sigma = 8
    probability = normal_cumulative_density_function(30, mu, sigma) - normal_cumulative_density_function(20, mu, sigma)
    print("2: ", probability)

    # 3. 99% confidence interval from a 34-item filament sample.
    observed_value = 1.75
    n = 34
    sample_mean = 1.715588
    sample_std = 0.029252
    z = z_score(observed_value, sample_mean, sample_std)
    # confidence_interval returns (mean + margin, mean - margin).
    upper_bound, lower_bound = confidence_interval(.99, n, sample_std, z, sample_mean)
    print("3: ", upper_bound, lower_bound)

    # 4. Did a campaign move mean daily sales of Normal(10345, 552)?
    baseline_mean = 10345
    baseline_std = 552
    campaign_mean = 11641
    alpha = 0.05
    right_tail = 1.0 - normal_cumulative_density_function(campaign_mean, baseline_mean, baseline_std)
    p_value = right_tail * 2  # take advantage of symmetry
    if p_value >= alpha:
        print("The sales campaing did NOT work")
    else:
        print("The sales campaing worked", p_value)

View File

@@ -3,7 +3,7 @@
## Chapter 3 - Statistics
from math import sqrt, pi, e, exp
from scipy.stats import norm
from scipy.stats import norm, t
import random
import plotly.express as px
@@ -85,8 +85,8 @@ def test_central_limit_theorem(sample_size, sample_count):
def generic_critical_z_value(probability):
    """Return the (lower, upper) critical z-values of the standard normal
    distribution that bound the central *probability* mass.

    E.g. probability=0.95 leaves 0.025 in each tail and returns roughly
    (-1.96, +1.96).
    """
    norm_dist = norm(loc=0.0, scale=1.0)
    # Mass left over outside the central region, split evenly per tail.
    left_tail_area = (1.0 - probability) / 2.0
    upper_area = 1.0 - left_tail_area
    return norm_dist.ppf(left_tail_area), norm_dist.ppf(upper_area)
def margin_of_error(sample_size, standard_deviation, z_value):
@@ -95,6 +95,14 @@ def margin_of_error(sample_size, standard_deviation, z_value):
# How confident we are at a population metric given a sample (the interval we are "probability" sure the value will be)
def confidence_interval(probability, sample_size, standard_deviation, z_value, mean):
    """Return the confidence interval around *mean* as (upper, lower).

    The interval half-width comes from margin_of_error(sample_size,
    standard_deviation, z_value); note the upper bound is returned first.
    """
    # NOTE(review): `probability` is currently unused — the caller supplies
    # z_value directly; generic_critical_z_value(probability) could derive
    # the critical z here instead. Parameter kept for interface stability.
    margin_error = margin_of_error(sample_size, standard_deviation, z_value)
    return mean + margin_error, mean - margin_error
## T Distribution
## Similar to the normal distribution but made for smaller sample-sizes (30 or less)
## When we get close to the 31 items, both are identical
def get_critical_value_range_t(conficence_percentage: float, sample_size: int):
    """Return the (lower, upper) critical t-values bounding the central
    *conficence_percentage* mass, using sample_size - 1 degrees of freedom."""
    dof = sample_size - 1
    # Mass outside the confidence region, half of it in each tail.
    untrusted = 1.0 - conficence_percentage
    half_tail = untrusted / 2
    lower_critical = t.ppf(half_tail, df=dof)
    upper_critical = t.ppf(conficence_percentage + half_tail, df=dof)
    return (lower_critical, upper_critical)