refactor: module organization

This commit is contained in:
2026-02-19 17:42:57 +01:00
parent 893250e7b5
commit 106205935e
7 changed files with 181 additions and 148 deletions

View File

@@ -1,20 +1,13 @@
from sympy import diff, limit, oo, symbols
import unittest
from modules.math import (
test_math_module
from modules.essential_math.examples.statistics_example import (
normal_distribution_example,
basic_statistic_concepts_example,
z_scores_example,
)
from modules.probability import (
test_probability_module
)
from modules.statistics import (
test_statistics_module
)
from modules.strings import t_strings
if __name__=="__main__":
    # Manual test driver: exactly one module exercise is enabled at a time;
    # uncomment the one you want to run.
    # t_strings()
    # test_math_module()
    # test_probability_module()
    test_statistics_module()
    # test_exercises_module()
    # basic_statistic_concepts_example()
    # normal_distribution_example()
    # z_scores_example()

View File

@@ -0,0 +1,74 @@
from modules.essential_math.statistics import (
mean,
median,
weighted_mean,
weighted_mean_inline,
population_variance,
population_variance_inline,
sample_variance,
standard_deviation,
normal_probability_density_function,
normal_cumulative_density_function,
inverse_cumulative_density_function,
z_score,
coeficient_of_variation,
test_central_limit_theorem,
generic_critical_z_value,
margin_of_error,
confidence_interval,
)
def basic_statistic_concepts_example():
    """Demonstrate mean, weighted mean, median, variance and std-dev helpers."""
    print("=== Statistics module ===")
    numbers = [1, 2, 3, 4, 5, 6]
    print(">> The mean of {0} is {1}".format(numbers, mean(numbers)))
    weights = [0.2, 0.5, 0.7, 1, 0, 0.9]
    print(">> The weighted_mean of {0} is {1} and it is equivalent to {2}".format(numbers, weighted_mean(numbers, weights), weighted_mean_inline(numbers, weights)))
    print(">> The median is {0}".format(median(numbers)))
    values = [0, 1, 5, 7, 9, 10, 14]
    values_mean = sum(values) / len(values)
    variance_loop = population_variance(values, values_mean)
    variance_inline = population_variance_inline(values)
    print("The population variance is", variance_loop, variance_inline)
    print("The standard deviation is", standard_deviation(values, False))
    # Drop the elements at indices 1 and 3 to form a sample of the population.
    sample = [v for i, v in enumerate(values) if i not in (1, 3)]
    print("The sample variance for a population is", sample_variance(sample))
    print("The standard deviation for a population is", standard_deviation(sample, True))
def normal_distribution_example():
    """Demonstrate the normal PDF, CDF and inverse CDF on the example data."""
    print("== Normal distribution ==")
    values = [0, 1, 5, 7, 9, 10, 14]
    # Local names chosen so the imported mean() helper is not shadowed.
    values_mean = sum(values) / len(values)
    values_std_dev = standard_deviation(values, False)
    target_x = 1
    pdf_at_x = normal_probability_density_function(target_x, values_mean, values_std_dev)
    print(">> The probability_density_function for x = 1 over the example data is {0}".format(pdf_at_x))
    cdf_at_x = normal_cumulative_density_function(target_x, values_mean, values_std_dev)
    print(">> The probability for observing a value smaller than 1 is given by the cumulative density function and it is: {0}".format(cdf_at_x))
    expected_value = inverse_cumulative_density_function(0.5, values_mean, values_std_dev)
    print(">> For a probability of .5 we expect the value: ", expected_value)
def z_scores_example():
    """Compare two house prices via z-scores and coefficients of variation."""
    print("== Z-scores ==")
    z_house_a = z_score(150000, 140000, 3000)
    z_house_b = z_score(815000, 800000, 10000)
    print("A house (A) of 150K in a neighborhood of 140K mean and 3K std_dev has a Z-score: {0}".format(z_house_a))
    print("A house (B) of 815K in a neighborhood of 800K mean and 10K std_dev has a Z-score: {0}".format(z_house_b))
    print("The House A is much more expensive because its z-score is higher.")
    cov_a = coeficient_of_variation(3000, 140000)
    cov_b = coeficient_of_variation(10000, 800000)
    print("The neighborhood of B has a coeficient of variation: {0}, and the one of A: {1}".format(cov_a, cov_b))
    print("This means that the neighborhood of A has more spread in its prices")
def central_limit_theorem_example():
    """Demonstrate the central limit theorem with two histogram plots."""
    ## Central limit theorem
    # sample_size=1: each "mean" is a single uniform draw, so the plot is flat.
    test_central_limit_theorem(sample_size=1, sample_count=1000)
    # sample_size=31: means of 31 draws cluster into an approximately normal shape.
    test_central_limit_theorem(sample_size=31, sample_count=1000)

View File

@@ -0,0 +1,100 @@
## This module represents the third chapter of the book
## "Essential Math for Data Science" - Thomas Nield
## Chapter 3 - Statistics
from math import sqrt, pi, e, exp
from scipy.stats import norm
import random
import plotly.express as px
def mean(values):
    """Return the arithmetic mean of `values`.

    Raises ZeroDivisionError on an empty sequence. The parameter was renamed
    from `list`, which shadowed the builtin; all in-repo callers pass it
    positionally, so the rename is backward-compatible for them.
    """
    return sum(values) / len(values)
def weighted_mean(items, weights):
    """Return the weighted mean of `items`.

    Returns None (implicitly) when the two sequences differ in length,
    matching the original behavior.
    """
    if len(items) != len(weights):
        return
    weighted_total = 0
    for item, weight in zip(items, weights):
        weighted_total += item * weight
    return weighted_total / sum(weights)
def weighted_mean_inline(items, weights):
    """One-expression weighted mean (no length check, unlike weighted_mean)."""
    pair_products = (item * weight for item, weight in zip(items, weights))
    return sum(pair_products) / sum(weights)
# also called 50% quantile
def median(items):
    """Return the middle value of the sorted data.

    For an even number of elements, returns the average of the two middle
    values. Bug fix: the odd-length branch referenced an undefined name `n`
    (guaranteed NameError); it now uses the computed length.
    """
    ordered = sorted(items)
    length = len(ordered)
    if length % 2 == 0:
        mid = length // 2 - 1
        return (ordered[mid] + ordered[mid + 1]) / 2
    return ordered[length // 2]
def mode(items):
    """Return the most frequent value(s) of `items` as a sorted list.

    An empty input yields []. Multimodal data yields every value that ties
    for the highest count. The original body was an unfinished stub that
    always returned None.
    """
    from collections import Counter
    if not items:
        return []
    counts = Counter(items)
    top = max(counts.values())
    return sorted(value for value, count in counts.items() if count == top)
def population_variance(value_list, mean):
    """Average squared deviation from the supplied `mean` (divides by n)."""
    squared_deviations = ((value - mean) ** 2 for value in value_list)
    return sum(squared_deviations) / len(value_list)
def population_variance_inline(value_list):
    """Population variance computed without a caller-supplied mean.

    Performance fix: the original recomputed the mean once per element
    (accidental O(n^2)); the mean is now hoisted into a local, which yields
    the identical floating-point result in O(n).
    """
    value_mean = sum(value_list) / len(value_list)
    return sum((v - value_mean) ** 2 for v in value_list) / len(value_list)
def sample_variance(value_list):
    """Bessel-corrected variance of a sample (divides by n - 1)."""
    n = len(value_list)
    sample_mean = sum(value_list) / n
    squared_diffs = [(value - sample_mean) ** 2 for value in value_list]
    return sum(squared_diffs) / (n - 1)
def population_standard_deviation(value_list):
    """Square root of the population variance of `value_list`."""
    variance = population_variance_inline(value_list)
    return sqrt(variance)
def sample_standard_deviation(value_list):
    """Square root of the Bessel-corrected sample variance of `value_list`."""
    variance = sample_variance(value_list)
    return sqrt(variance)
def standard_deviation(value_list, is_sample):
    """Dispatch to the sample or population standard deviation.

    is_sample=True applies the n-1 (Bessel) correction.
    """
    if is_sample:
        return sample_standard_deviation(value_list)
    return population_standard_deviation(value_list)
## Normal distribution
# PDF generates the Normal Distribution (symmetric around the mean)
def normal_probability_density_function(x: float, mean: float, standard_deviation: float):
    """Gaussian probability density of N(mean, standard_deviation) at x."""
    variance = standard_deviation ** 2
    coefficient = 1.0 / (2.0 * pi * variance) ** 0.5
    exponent = -1.0 * ((x - mean) ** 2 / (2.0 * variance))
    return coefficient * exp(exponent)
def normal_cumulative_density_function(x, mean, std_deviation):
    """P(X <= x) for X ~ N(mean, std_deviation), via scipy's norm.cdf."""
    cumulative = norm.cdf(x, mean, std_deviation)
    return cumulative
# Check the expected value for a given cumulative probability
def inverse_cumulative_density_function(prob, mean, std_dev):
    """Quantile function (inverse CDF / ppf) of N(mean, std_dev)."""
    return norm.ppf(prob, mean, std_dev)
# Z-scores are valuable in order to normalize 2 pieces of data
def z_score(value, data_mean, std_deviation):
    """How many standard deviations `value` lies from `data_mean`."""
    deviation = value - data_mean
    return deviation / std_deviation
def coeficient_of_variation(std_deviation, mean):
    """Spread relative to the mean (std-dev / mean).

    NOTE: the name keeps the original (misspelled) spelling because it is
    part of the module's public interface.
    """
    ratio = std_deviation / mean
    return ratio
def test_central_limit_theorem(sample_size, sample_count):
    # Demonstrates the central limit theorem: draws `sample_count` means of
    # `sample_size` uniform(0, 1) values and shows their histogram. Larger
    # sample_size makes the histogram approach a normal curve.
    # Side effect: opens a plotly figure; uses module-level `random` and `px`.
    x_values = [(sum([random.uniform(0.0,1.0) for i in range(sample_size)]) / sample_size) for _ in range(sample_count)]
    # Uniform unit weights so the histogram shows raw counts per bin.
    y_values = [1 for _ in range(sample_count)]
    px.histogram(x=x_values, y=y_values, nbins=20).show()
def generic_critical_z_value(probability):
    """Return the (lower, upper) critical z-values that bound the central
    `probability` mass of the standard normal distribution.

    Bug fix: the body referenced an undefined name `p` instead of the
    `probability` parameter, so every call raised NameError.
    """
    norm_dist = norm(loc=0.0, scale=1.0)
    left_tail_area = (1.0 - probability) / 2.0
    upper_area = 1.0 - left_tail_area
    return norm_dist.ppf(left_tail_area), norm_dist.ppf(upper_area)
def margin_of_error(sample_size, standard_deviation, z_value):
    """Half-width of a confidence interval: z times the standard error.

    The sign (lower or upper tail) follows the z_value supplied.
    """
    standard_error = standard_deviation / sqrt(sample_size)
    return z_value * standard_error
# How confident we are at a population metric given a sample (the interval we are "probability" sure the value will be)
def confidence_interval(probability, sample_size, standard_deviation, z_value, mean):
    """Return the (lower, upper) interval around `mean` that we are
    `probability` confident contains the population value.

    Bug fixes: the original called an undefined name `margin_error`
    (NameError on every call) and computed an unused critical-z pair via a
    helper that was itself broken. `probability` is kept for interface
    compatibility; callers should pass a z_value consistent with it.
    """
    margin = margin_of_error(sample_size, standard_deviation, z_value)
    return mean - margin, mean + margin

View File

@@ -1,134 +0,0 @@
## This module represents the third chapter of the book
## "Essential Math for Data Science" - Thomas Nield
## Chapter 3 - Statistics
from math import sqrt, pi, e, exp
from scipy.stats import norm
import random
import plotly.express as px
def mean(list):
    """Arithmetic mean of the values.

    NOTE(review): the parameter name shadows the builtin `list`; kept
    unchanged here for interface compatibility.
    """
    return sum(list) / len(list)
def weighted_mean(items, weights):
    """Weighted mean; returns None when the two sequences differ in length."""
    if (len(items) != len(weights)):
        return
    total = 0
    for i in range(len(items)):
        total += items[i] * weights[i]
    return total / sum(weights)
def weighted_mean_inline(items, weights):
    """One-line equivalent of weighted_mean (no length check)."""
    return sum(s * w for s, w in zip(items, weights)) / sum(weights)
# also called 50% quantile
def median(items):
    """Middle value of the sorted data; for an even count, the average of
    the two middle values.

    Bug fix: the odd-length branch referenced an undefined name `n`
    (NameError); it now uses `length`.
    """
    ordered = sorted(items)
    length = len(ordered)
    pair = length % 2 == 0
    mid = int(length / 2) - 1 if pair else int(length / 2)
    if pair:
        return (ordered[mid] + ordered[mid+1]) / 2
    else:
        return ordered[mid]
def mode(items):
    """Most frequent value(s) as a sorted list (empty input -> []).

    The original body was an unfinished stub that always returned None.
    """
    counts = {}
    for item in items:
        counts[item] = counts.get(item, 0) + 1
    if not counts:
        return []
    top = max(counts.values())
    return sorted(value for value, count in counts.items() if count == top)
def population_variance(difference_list, mean):
    """Average squared deviation from the supplied `mean` (divides by n)."""
    summatory = 0.0
    for diff in difference_list:
        summatory += (diff - mean) ** 2
    return summatory / len(difference_list)
def population_variance_inline(difference_list):
    """Population variance in one pass.

    Performance fix: the mean is hoisted into a local so it is computed once
    instead of once per element (was accidental O(n^2)); result unchanged.
    """
    _mean = sum(difference_list) / len(difference_list)
    return sum((v - _mean) ** 2 for v in difference_list) / len(difference_list)
def sample_variance(difference_list):
    """Bessel-corrected variance of a sample (divides by n - 1)."""
    mean = sum(difference_list) / len(difference_list)
    return sum((diff - mean) ** 2 for diff in difference_list) / (len(difference_list) - 1)
def population_standard_deviation(difference_list):
    """Square root of the population variance."""
    return sqrt(population_variance_inline(difference_list))
def sample_standard_deviation(difference_list):
    """Square root of the Bessel-corrected sample variance."""
    return sqrt(sample_variance(difference_list))
def standard_deviation(difference_list, is_sample):
    """Dispatch to sample (n-1) or population (n) standard deviation."""
    return sample_standard_deviation(difference_list) if is_sample else population_standard_deviation(difference_list)
## Normal distribution
# PDF generates the Normal Distribution (symmetric around the mean)
def normal_probability_density_function(x: float, mean: float, standard_deviation: float):
    """Gaussian probability density of N(mean, standard_deviation) at x."""
    return (1.0 / (2.0 * pi * standard_deviation ** 2) ** 0.5) * exp(-1.0 * ((x - mean) ** 2 / (2.0 * standard_deviation ** 2)))
def normal_cumulative_density_function(x, mean, difference_list):
    """P(X <= x); the std-dev is derived from the raw data list passed in."""
    std_dev = standard_deviation(difference_list, False)
    return norm.cdf(x, mean, std_dev)
# Check the expected value for a given cumulative probability
def inverse_cumulative_density_function(prob, mean, std_dev):
    """Quantile function (inverse CDF / ppf) of N(mean, std_dev)."""
    x = norm.ppf(prob, mean, std_dev)
    return x
# Z-scores are valuable in order to normalize 2 pieces of data
def z_score(value, data_mean, std_deviation):
    """How many standard deviations `value` lies from `data_mean`."""
    return (value - data_mean) / std_deviation
def coeficient_of_variation(std_deviation, mean):
    """Spread relative to the mean (name keeps the original spelling)."""
    return (std_deviation / mean)
def test_central_limit_theorem(sample_size, sample_count):
    """Histogram of `sample_count` means of uniform samples; side effect:
    opens a plotly figure."""
    x_values = [(sum([random.uniform(0.0,1.0) for i in range(sample_size)]) / sample_size) for _ in range(sample_count)]
    y_values = [1 for _ in range(sample_count)]
    px.histogram(x=x_values, y=y_values, nbins=20).show()
def generic_critical_z_value(probability):
    """(lower, upper) z-values bounding the central `probability` mass of
    the standard normal distribution.

    Bug fix: the body referenced undefined `p` instead of `probability`.
    """
    norm_dist = norm(loc=0.0, scale=1.0)
    left_tail_area = (1.0 - probability) / 2.0
    upper_area = 1.0 - ((1.0 - probability) / 2.0)
    return norm_dist.ppf(left_tail_area), norm_dist.ppf(upper_area)
def margin_of_error(sample_size, standard_deviation, z_value):
    return z_value * (standard_deviation / sqrt(sample_size)) # +-, we return the one provided by the z_value (tail or upper)
# How confident we are at a population metric given a sample (the interval we are "probability" sure the value will be)
def confidence_interval(probability, sample_size, standard_deviation, z_value, mean):
    """(lower, upper) confidence interval around `mean`.

    Bug fixes: the original called undefined `margin_error` (NameError) and
    computed an unused critical-z pair. `probability` is kept for interface
    compatibility; the supplied z_value should correspond to it.
    """
    margin = margin_of_error(sample_size, standard_deviation, z_value)
    return mean - margin, mean + margin
def test_statistics_module():
    """Exercise every helper in this module with the book's examples."""
    print("=== Statistics module ===")
    list = [ 1, 2, 3, 4, 5, 6]
    print(">> The mean of {0} is {1}".format(list, mean(list)))
    weights = [0.2, 0.5, 0.7, 1, 0, 0.9]
    print(">> The weighted_mean of {0} is {1} and it is equivalent to {2}".format(list, weighted_mean(list, weights), weighted_mean_inline(list, weights)))
    # Bug fix: the label said "mean" but the value printed is the median.
    print(">> The median is {0}".format(median(list)))
    differences = [ -6.571, -5.571, -1.571, 0.429, 2.429, 3.429, 7.429 ]
    print("The population variance is", population_variance(differences, sum(differences) / len(differences)), population_variance_inline(differences))
    print("The standard deviation is", standard_deviation(differences, False))
    sample = differences.copy()
    del sample[3]
    del sample[1]
    print("The sample variance for a population is", sample_variance(sample))
    print("The standard deviation for a population is", standard_deviation(sample, True))
    print("== Normal distribution ==")
    print(">> The probability_density_function for x = 1 over the example data is {0}".format(normal_probability_density_function(1, sum(differences) / len(differences), standard_deviation(differences, False))))
    print(">> The probability for observing a value smaller than 1 is given by the cumulative density function and it is: {0}".format(normal_cumulative_density_function(1, sum(differences) / len(differences), differences)))
    print("== Z-scores ==")
    print("A house (A) of 150K in a neighborhood of 140K mean and 3K std_dev has a Z-score: {0}".format(z_score(150000, 140000, 3000)))
    print("A house (B) of 815K in a neighborhood of 800K mean and 10K std_dev has a Z-score: {0}".format(z_score(815000, 800000, 10000)))
    print("The House A is much more expensive because its z-score is higher.")
    print("The neighborhood of B has a coeficient of variation: {0}, and the one of A: {1}".format(coeficient_of_variation(3000, 140000), coeficient_of_variation(10000, 800000)))
    print("This means that the neighborhood of A has more spread in its prices")
    ## Central limit theorem
    test_central_limit_theorem(sample_size=1, sample_count=1000)
    test_central_limit_theorem(sample_size=31, sample_count=1000)