minor statistics definitions

feat: some more statistics
feat: normal distribution
2025-12-19 00:24:16 +01:00 · 2025-12-05 23:12:26 +01:00 · 2025-11-24 22:00:26 +01:00 · 2025-11-23 23:54:34 +01:00 · 2025-11-23 23:15:49 +01:00 · 2025-11-23 17:10:25 +01:00
7 changed files with 332 additions and 31 deletions
--- a/requirements.txt
+++ b/requirements.txt
@@ -8,6 +8,8 @@ Pygments==2.19.2
 pytest==8.4.2
 python-dateutil==2.9.0.post0
 pytz==2025.2
 ruff==0.14.5
 scipy==1.16.3
 six==1.17.0
 sympy==1.14.0
 tzdata==2025.2
--- a/src/main.py
+++ b/src/main.py
@@ -1,4 +1,5 @@
 from sympy import diff, limit, oo, symbols
 import unittest
 from modules.math import (
 	test_math_module
@@ -6,14 +7,14 @@ from modules.math import (
 from modules.probability import (
 	test_probability_module
 )
 from modules.statistics import (
 	test_statistics_module
 )
 from modules.strings import t_strings
 if __name__=="__main__":
-	t_strings()
+	# t_strings()
 	print(">> Math module")
 	# test_math_module()
-	print(">>>>>>>>>>>>>>>>>>>>")
+	# test_probability_module()
-	print(">> Probability module")
+	test_statistics_module()
-	test_probability_module()
+	# test_exercises_module()
 	print(">>>>>>>>>>>>>>>>>>>>")
--- a/src/modules/exercises/essentials.py
+++ b/src/modules/exercises/essentials.py
@@ -0,0 +1,51 @@
 def maximum_subarray_sum(input_array: list[int]):
 	"""Find the contiguous, non-empty subarray with the largest sum.
 	Returns a two-element list ``[max_sum, subarray]``. When several subarrays
 	share the largest sum, the one with the smallest start index (and then the
 	shortest length) is returned. Raises IndexError when ``input_array`` is empty.
 	"""
 	max_sum = input_array[0]
 	best_origin, best_end = 0, 1
 	for origin in range(len(input_array)):
 		# Grow the window one element at a time and keep a running total
 		# instead of re-slicing and re-summing every candidate subarray.
 		running_sum = 0
 		for end in range(origin, len(input_array)):
 			running_sum += input_array[end]
 			if running_sum > max_sum:
 				max_sum = running_sum
 				best_origin, best_end = origin, end + 1
 	return [max_sum, input_array[best_origin:best_end]]
 def get_array_range(min, max):
 	"""Return the integers from ``min`` to ``max`` (both included) as a list."""
 	return list(range(min, max + 1))
 def find_missing(input_aray: list[int]):
 	"""Return the total of 1..len(input_aray)+1 minus the total of ``input_aray``.
 	Assuming the input holds every integer from 1 to ``len(input_aray) + 1``
 	except one (in any order), the result is that missing integer.
 	"""
 	upper = len(input_aray) + 1
 	# Gauss closed form for 1 + 2 + ... + upper; no need to build the whole
 	# range as a list just to add it up.
 	expected_total = upper * (upper + 1) // 2
 	return expected_total - sum(input_aray)
 def trap_rain_water(input_aray: list[int]):
 	"""Return how many unit cells of water the elevation map ``input_aray`` traps.
 	Each entry is the height of a one-unit-wide wall. Water above a wall is
 	bounded by the lower of the tallest walls on each side of it. Heights
 	below zero count as zero, and an empty map traps nothing.
 	"""
 	heights = [max(height, 0) for height in input_aray]
 	left, right = 0, len(heights) - 1
 	left_max = right_max = 0
 	total_water = 0
 	# Two-pointer scan: the side with the lower wall is the limiting one, so
 	# the water above it is fully determined by the running maximum on that side.
 	while left < right:
 		if heights[left] <= heights[right]:
 			left_max = max(left_max, heights[left])
 			total_water += left_max - heights[left]
 			left += 1
 		else:
 			right_max = max(right_max, heights[right])
 			total_water += right_max - heights[right]
 			right -= 1
 	return total_water
--- a/src/modules/math.py
+++ b/src/modules/math.py
@@ -91,6 +91,7 @@ def t_calculate_integral(f, init, end, symbol):
    return integrate(f, (symbol, init, end))
 def test_math_module():
 	print("=== Math module ===")
 	t_exponent(2,8)
 	print(t_compound_interest(100, 20 / 100, 2, 12))
 	print(t_compound_interest_algorigthm(100, 20 / 100, 2, 12))
--- a/src/modules/probability.py
+++ b/src/modules/probability.py
@@ -1,8 +1,8 @@
 ## This module represents the second chapter of the book 
 ##  "Essential Math for Data Science" - Thomas Nield
-##  Chaper 2 - Probability 
+##  Chapter 2 - Probability 
-from scipy.stats import binom
+from scipy.stats import binom, beta
 from math import factorial
 # Odds of 2.0 mean that an event is twice as likely to happen as not to happen
@@ -12,31 +12,102 @@ def p_odds_to_probability(o):
 def p_probability_to_odds(p):
 	"""Convert a probability ``p`` into odds: ``p`` divided by its complement."""
 	complement = 1 - p
 	return p / complement
-## Binomial distribution
+class BinomialDistribution:
-def p_binomial_distribution_example():
+	@staticmethod
-	n = 10
+	def example():
-	p = 0.9
+		n = 10
 		p = 0.9
-	for k in range(n + 1):
+		for k in range(n + 1):
-		probability = binom.pmf(k, n, p)
+			probability = binom.pmf(k, n, p)
-		print("{0} >> {1}".format(k, probability))
+			print("{0} >> {1}".format(k, probability))
-def binomial_coeficient(pool, count):
+	@staticmethod
-	return factorial(pool) / (factorial(count) * factorial(pool - count))
+	def binomial_coeficient(pool, count):
 		return factorial(pool) / (factorial(count) * factorial(pool - count))
-def p_binomial_distribution_scratch(p, n):
+	@staticmethod
-	# For each number calc the probability of that exact number of outcomes (no order)
+	def from_scratch(p, n):
-	for k in range(n + 1):
+		# For each number calc the probability of that exact number of outcomes (no order)
-		# 1. Simple combinatory with the binomial coeficient (combinations of k elements out of a pool of n without repetition without order)
+		for k in range(n + 1):
-		combinatory = binomial_coeficient(n, k)	
+			# 1. Simple combinatory with the binomial coeficient (combinations of k elements out of a pool of n without repetition without order)
-		# 2. Probability of success, the probability of making it k times
+			combinatory = BinomialDistribution.binomial_coeficient(n, k)	
-		probability_of_success = p ** k # p * p, k times
+			# 2. Probability of success, the probability of making it k times
-		# 3. Probability of failure, inverse of the success
+			probability_of_success = p ** k
-		probability_of_failure = (1 - p) ** (n - k) # inverse of probability the rest of the times
+			# 3. Probability of failure, inverse of the success
-		k_binomital_distribution_probability = combinatory * probability_of_success * probability_of_failure
+			probability_of_failure = (1 - p) ** (n - k)
-		print("[{0}]: {1}".format(k, k_binomital_distribution_probability))
+			k_binomital_distribution_probability = combinatory * probability_of_success * probability_of_failure
 			print("[{0}]: {1}".format(k, k_binomital_distribution_probability))
 # Study probability of probabilities
 # > Given X success rate, get the likelihood of that success rate
 # > Returns a continuous function, so the final probability of X or better must be calculated using integrals (the area under the curve)
 class BetaDistribution:
 	"""Areas under the beta curve, computed with ``scipy.stats.beta.cdf``."""
 	@staticmethod
 	def calc(probability, success_count, failure_count):
 		# Area to the left of `probability` only
 		left_area = beta.cdf(probability, success_count, failure_count)
 		return left_area
 	@staticmethod
 	def calc_right(probability, success_count, failure_count):
 		# Area to the right: complement of the left area
 		left_area = BetaDistribution.calc(probability, success_count, failure_count)
 		return 1.0 - left_area
 	@staticmethod
 	def calc_region(init_probability, end_probability, success_count, failure_count):
 		# Area between the two probabilities: left area of the end minus left area of the start
 		up_to_end = BetaDistribution.calc(end_probability, success_count, failure_count)
 		up_to_init = BetaDistribution.calc(init_probability, success_count, failure_count)
 		return up_to_end - up_to_init
 # Probability of rain and probability that the umbrella order arrives
 p_r = 0.3
 p_u = 0.4
 class Exercises:
 	"""Worked answers to the chapter exercises; each numbered method returns one probability."""
 	@staticmethod
 	def one():
 		# 30% chance of rain and 40% chance your umbrella order will arrive. P(R AND U)
 		return p_r * p_u
 	@staticmethod
 	def two():
 		# Same Ps as previous. P(!R OR U)
 		# Sum rule: P(!R) + P(U) - P(!R AND U)
 		return (1 - p_r)  + p_u - ((1 - p_r) * p_u)
 	@staticmethod
 	def three():
 		# The umbrella arrival now depends on rain: P(U AND R) = P(U | R) * P(R)
 		p_u_given_rain = 0.2
 		return p_u_given_rain * p_r
 	@staticmethod
 	def four():
 		# Binomial with n = 137 trials and p = 0.4: add up the probability of
 		# every outcome from 50 successes up to n
 		n = 137
 		p_bail = 0.4
 		p_at_least_50_bail = 0.0
 		for x in range(50, n + 1):
 			p_at_least_50_bail += binom.pmf(x, n, p_bail)
 		return p_at_least_50_bail
 	@staticmethod
 	def five():
 		# Beta distribution with 8 successes (h) and 2 failures (t): area to the right of 0.5
 		h = 8
 		t = 2
 		return BetaDistribution.calc_right(0.5, h, t)
 	@staticmethod
 	def test():
 		print("P(R AND U) = {0}".format(Exercises.one()))
 		print("P(!R OR U) = {0}".format(Exercises.two()))
 		print("P(U AND R) = {0}".format(Exercises.three()))
 		print("P(At least 50 bail) = {0}".format(Exercises.four()))
 		print("P(fair) = {0}".format(Exercises.five()))
 def test_probability_module():
-	p_binomial_distribution_example()
+	print("=== Probability module ===")
-	p_binomial_distribution_scratch(0.9, 10)
+	print(">> Binomial distribution")
 	BinomialDistribution.example()
 	BinomialDistribution.from_scratch(0.9, 10)
 	print(">> Beta distribution")
 	print(BetaDistribution.calc(0.9,8,2))
 	print(BetaDistribution.calc_region(0.8, 0.9, 8, 2))
 	print(">> Exercises")
 	Exercises.test()
--- a/src/modules/statistics.py
+++ b/src/modules/statistics.py
@@ -0,0 +1,134 @@
 ## This module represents the third chapter of the book 
 ##  "Essential Math for Data Science" - Thomas Nield
 ##  Chapter 3 - Statistics 
 from math import sqrt, pi, e, exp
 from scipy.stats import norm
 import random
 import plotly.express as px
 def mean(list):
 	"""Arithmetic mean: the total of the values divided by how many there are."""
 	total = sum(list)
 	count = len(list)
 	return total / count
 def weighted_mean(items, weights):
 	"""Mean of ``items`` where each item counts proportionally to its weight.
 	Returns None when ``items`` and ``weights`` differ in length.
 	"""
 	if len(items) == len(weights):
 		weighted_total = 0
 		for item, weight in zip(items, weights):
 			weighted_total += item * weight
 		return weighted_total / sum(weights)
 	return None
 def weighted_mean_inline(items, weights):
 	"""Weighted mean of ``items``; items and weights are paired positionally with ``zip``."""
 	weighted_total = sum(item * weight for item, weight in zip(items, weights))
 	weight_total = sum(weights)
 	return weighted_total / weight_total
 # also called 50% quantile
 def median(items):
 	"""Return the middle value of ``items`` once sorted.
 	For an even number of items the two central values are averaged.
 	Raises IndexError when ``items`` is empty.
 	"""
 	ordered = sorted(items)
 	mid = len(ordered) // 2
 	if len(ordered) % 2 == 0:
 		# Even count: no single middle element, average the two central ones
 		return (ordered[mid - 1] + ordered[mid]) / 2
 	return ordered[mid]
 def mode(items):
 	"""Return the most frequent value(s) of ``items`` as a list.
 	Every value tied for the highest count is included, in order of first
 	appearance. An empty input gives an empty list.
 	"""
 	counts = {}
 	for item in items:
 		counts[item] = counts.get(item, 0) + 1
 	if not counts:
 		return []
 	highest = max(counts.values())
 	return [item for item, count in counts.items() if count == highest]
 def population_variance(difference_list, mean):
 	"""Average squared distance between each value and the supplied ``mean``."""
 	squared_total = 0.0
 	for value in difference_list:
 		deviation = value - mean
 		squared_total += deviation ** 2
 	return squared_total / len(difference_list)
 def population_variance_inline(difference_list):
 	"""Population variance: the average squared distance of each value from the mean.
 	Raises ZeroDivisionError when ``difference_list`` is empty.
 	"""
 	# Compute the mean once rather than once per element
 	mean = sum(difference_list) / len(difference_list)
 	return sum((v - mean) ** 2 for v in difference_list) / len(difference_list)
 def sample_variance(difference_list):
 	"""Variance of a sample: squared deviations from the mean, divided by n - 1."""
 	count = len(difference_list)
 	sample_mean = sum(difference_list) / count
 	squared_deviations = ((value - sample_mean) ** 2 for value in difference_list)
 	return sum(squared_deviations) / (count - 1)
 def population_standard_deviation(difference_list):
 	"""Square root of the population variance of ``difference_list``."""
 	variance = population_variance_inline(difference_list)
 	return sqrt(variance)
 def sample_standard_deviation(difference_list):
 	"""Square root of the sample variance of ``difference_list``."""
 	variance = sample_variance(difference_list)
 	return sqrt(variance)
 def standard_deviation(difference_list, is_sample):
 	"""Standard deviation of ``difference_list``: the sample formula when ``is_sample`` is truthy, the population formula otherwise."""
 	if is_sample:
 		return sample_standard_deviation(difference_list)
 	return population_standard_deviation(difference_list)
 ## Normal distribution
 # PDF generates the Normal Distribution (symmetric around the mean)
 def normal_probability_density_function(x: float, mean: float, standard_deviation: float):
 	"""Height of the normal curve at ``x`` for the given mean and standard deviation."""
 	variance = standard_deviation ** 2
 	# Scaling factor in front of the exponential: 1 / sqrt(2 * pi * variance)
 	scale = 1.0 / (2.0 * pi * variance) ** 0.5
 	exponent = -1.0 * ((x - mean) ** 2 / (2.0 * variance))
 	return scale * exp(exponent)
 def normal_cumulative_density_function(x, mean, difference_list):
 	"""``norm.cdf`` at ``x`` for the given ``mean``, with the population standard deviation of ``difference_list`` as the spread."""
 	return norm.cdf(x, mean, standard_deviation(difference_list, False))
 # Get the expected value for a given probability
 def inverse_cumulative_density_function(prob, mean, std_dev):
 	"""Value of x at which the normal CDF with this mean and spread equals ``prob`` (``norm.ppf``)."""
 	return norm.ppf(prob, loc=mean, scale=std_dev)
 # Z-scores are valuable in order to normalize 2 pieces of data
 def z_score(value, data_mean, std_deviation):
 	"""Distance of ``value`` from ``data_mean``, measured in standard deviations."""
 	deviation = value - data_mean
 	return deviation / std_deviation
 def coeficient_of_variation(std_deviation, mean):
 	"""Spread relative to the mean: the standard deviation divided by the mean."""
 	relative_spread = std_deviation / mean
 	return relative_spread
 def test_central_limit_theorem(sample_size, sample_count):
 	"""Show a 20-bin histogram of ``sample_count`` means, each taken over ``sample_size`` draws from uniform(0, 1)."""
 	sample_means = []
 	for _ in range(sample_count):
 		draws = [random.uniform(0.0,1.0) for _ in range(sample_size)]
 		sample_means.append(sum(draws) / sample_size)
 	# Every sample mean is plotted with a y value of 1
 	unit_weights = [1] * sample_count
 	figure = px.histogram(x=sample_means, y=unit_weights, nbins=20)
 	figure.show()
 def generic_critical_z_value(probability):
 	"""Return the ``(lower, upper)`` z-values of the standard normal distribution
 	that enclose a central area of ``probability``.
 	"""
 	norm_dist = norm(loc=0.0, scale=1.0)
 	# The area left outside the central region is split evenly between both tails
 	left_tail_area = (1.0 - probability) / 2.0
 	upper_area = 1.0 - left_tail_area
 	return norm_dist.ppf(left_tail_area), norm_dist.ppf(upper_area)
 def margin_of_error(sample_size, standard_deviation, z_value):
 	"""Margin of error: ``z_value`` times the standard error (standard deviation over the square root of the sample size).
 	The sign follows the sign of ``z_value``, so pass the lower or the upper critical value as needed.
 	"""
 	standard_error = standard_deviation / sqrt(sample_size)
 	return z_value * standard_error
 # How confident we are at a population metric given a sample (the interval we are "probability" sure the value will be)
 def confidence_interval(probability, sample_size, standard_deviation, z_value, mean):
 	"""Return ``(mean + margin, mean - margin)``, where the margin of error is
 	computed by ``margin_of_error`` from ``sample_size``, ``standard_deviation`` and ``z_value``.
 	"""
 	# NOTE(review): `probability` is kept in the signature for existing callers but is
 	# not used; the critical value comes in through `z_value`. Confirm whether the
 	# interval should derive it from `probability` instead.
 	error = margin_of_error(sample_size, standard_deviation, z_value)
 	return mean + error, mean - error
 def test_statistics_module():
 	"""Print a walkthrough of the statistics helpers on small example data sets, then show two central limit theorem histograms."""
 	print("=== Statistics module ===")
 	values = [ 1, 2, 3, 4, 5, 6]
 	print(">> The mean of {0} is {1}".format(values, mean(values)))
 	weights = [0.2, 0.5, 0.7, 1, 0, 0.9]
 	print(">> The weighted_mean of {0} is {1} and it is equivalent to {2}".format(values, weighted_mean(values, weights), weighted_mean_inline(values, weights)))
 	print(">> The median is {0}".format(median(values)))
 	differences = [ -6.571, -5.571, -1.571, 0.429, 2.429, 3.429, 7.429 ]
 	print("The population variance is", population_variance(differences, sum(differences) / len(differences)), population_variance_inline(differences))
 	print("The standard deviation is", standard_deviation(differences, False))
 	# Build a sample by dropping two entries from the population
 	sample = differences.copy()
 	del sample[3]
 	del sample[1]
 	print("The sample variance for a population is", sample_variance(sample))
 	print("The standard deviation for a population is", standard_deviation(sample, True))
 	print("== Normal distribution ==")
 	print(">> The probability_density_function for x = 1 over the example data is {0}".format(normal_probability_density_function(1, sum(differences) / len(differences), standard_deviation(differences, False))))
 	print(">> The probability for observing a value smaller than 1 is given by the cumulative density function and it is: {0}".format(normal_cumulative_density_function(1, sum(differences) / len(differences), differences)))
 	print("== Z-scores ==")
 	print("A house (A) of 150K in a neighborhood of 140K mean and 3K std_dev has a Z-score: {0}".format(z_score(150000, 140000, 3000)))
 	print("A house (B) of 815K in a neighborhood of 800K mean and 10K std_dev has a Z-score: {0}".format(z_score(815000, 800000, 10000)))
 	print("The House A is much more expensive because its z-score is higher.")
 	# The first argument pair (3K over 140K) belongs to neighborhood A, the second (10K over 800K) to B
 	print("The neighborhood of A has a coeficient of variation: {0}, and the one of B: {1}".format(coeficient_of_variation(3000, 140000), coeficient_of_variation(10000, 800000)))
 	print("This means that the neighborhood of A has more spread in its prices")
 	## Central limit theorem
 	test_central_limit_theorem(sample_size=1, sample_count=1000)
 	test_central_limit_theorem(sample_size=31, sample_count=1000)
--- a/tests/test_essentials.py
+++ b/tests/test_essentials.py
@@ -0,0 +1,41 @@
 import pytest
 from src.modules.exercises.essentials import *
 def test_subarray_sum():
 	"""maximum_subarray_sum returns [best sum, best subarray] for mixed, all-negative and all-positive inputs."""
 	cases = [
 		([2,3,-8,7,-1,2,3], [11, [7, -1, 2, 3]]),
 		([-2,-4], [-2, [-2]]),
 		([5,4,1,7,8], [25, [5,4,1,7,8]]),
 	]
 	for values, expected in cases:
 		assert maximum_subarray_sum(values) == expected
 def test_find_missing():
 	"""find_missing returns the absent number for a shuffled and for an ordered input."""
 	cases = [
 		([8,2,4,5,3,7,1], 6),
 		([1,2,3,5], 4),
 	]
 	for values, expected in cases:
 		assert find_missing(values) == expected
 def test_rain_watter():
 	"""trap_rain_water returns the trapped amount for two maps with basins and for a rising staircase."""
 	cases = [
 		([3,0,1,0,4,0,2], 10),
 		([3,0,2,0,4], 7),
 		([1,2,3,4], 0),
 	]
 	for heights, expected in cases:
 		assert trap_rain_water(heights) == expected
Author	SHA1	Message	Date
Daniel Heras Quesada	893250e7b5	minor statistics definitions	2025-12-19 00:24:16 +01:00
Daniel Heras Quesada	170697c955	feat: some more statistics	2025-12-05 23:12:26 +01:00
Daniel Heras Quesada	c37dba44a8	feat: normal distribution	2025-11-24 22:00:26 +01:00
Daniel Heras Quesada	6a86e0ab9d	feat: trapping rain water problem done	2025-11-23 23:54:34 +01:00
Daniel Heras Quesada	feb122711a	feat: find_missing	2025-11-23 23:15:49 +01:00
Daniel Heras Quesada	a26970f873	feat: more statistics + a simple exercise	2025-11-23 17:10:25 +01:00
Daniel Heras Quesada	86e86ea567	refactor: organization	2025-11-19 00:28:30 +01:00
Daniel Heras Quesada	cf7185c853	feat: requirements updated	2025-11-18 21:13:31 +01:00
Daniel Heras Quesada	f9a6cd540d	feat: probability exercises	2025-11-18 17:47:10 +01:00
Daniel Heras Quesada	8cff7e1c5a	feat: beta distribution	2025-11-18 13:09:24 +01:00