minor statistics definitions

feat: some more statistics
feat: normal distribution
2025-12-19 00:24:16 +01:00 · 2025-12-05 23:12:26 +01:00 · 2025-11-24 22:00:26 +01:00 · 2025-11-23 23:54:34 +01:00 · 2025-11-23 23:15:49 +01:00 · 2025-11-23 17:10:25 +01:00
7 changed files with 332 additions and 31 deletions
--- a/requirements.txt
+++ b/requirements.txt
@@ -8,6 +8,8 @@ Pygments==2.19.2
 pytest==8.4.2
 python-dateutil==2.9.0.post0
 pytz==2025.2
+ruff==0.14.5
+scipy==1.16.3
 six==1.17.0
 sympy==1.14.0
 tzdata==2025.2
--- a/src/main.py
+++ b/src/main.py
@@ -1,4 +1,5 @@
 from sympy import diff, limit, oo, symbols
+import unittest

 from modules.math import (
 	test_math_module
@@ -6,14 +7,14 @@ from modules.math import (
 from modules.probability import (
 	test_probability_module
 )
+from modules.statistics import (
+	test_statistics_module
+)
 from modules.strings import t_strings

 if __name__=="__main__":
-	t_strings()
-
-	print(">> Math module")
+	# t_strings()
 	# test_math_module()
-	print(">>>>>>>>>>>>>>>>>>>>")
-	print(">> Probability module")
-	test_probability_module()
-	print(">>>>>>>>>>>>>>>>>>>>")
+	# test_probability_module()
+	test_statistics_module()
+	# test_exercises_module()
--- a/src/modules/exercises/essentials.py
+++ b/src/modules/exercises/essentials.py
@@ -0,0 +1,51 @@
+def maximum_subarray_sum(input_array: list[int]):
+	"""Find the contiguous, non-empty subarray with the largest sum.
+
+	Every (origin, end) window is visited in the same order as a brute-force
+	scan, but the window sum is extended one element at a time instead of
+	re-slicing and re-summing, so the cost is O(n^2) rather than O(n^3).
+
+	Returns a two-item list ``[max_sum, subarray]``. When several windows
+	share the best sum, the first one visited wins (earliest origin, then
+	shortest). Raises IndexError when ``input_array`` is empty.
+	"""
+	best_sum = input_array[0]
+	best_start, best_stop = 0, 1
+	size = len(input_array)
+	for origin in range(size):
+		running_sum = 0
+		for end in range(origin, size):
+			running_sum += input_array[end]
+			# Strict comparison keeps the earliest window on ties.
+			if running_sum > best_sum:
+				best_sum = running_sum
+				best_start, best_stop = origin, end + 1
+	return [best_sum, input_array[best_start:best_stop]]
+
+def get_array_range(min, max):
+	"""Return the integers from ``min`` to ``max`` (both included) as a list."""
+	return list(range(min, max + 1))
+
+def find_missing(input_aray: list[int]):
+	"""Return the number missing from a list holding 1..n with one value removed.
+
+	The expected total of 1..n (n is the list length plus one) is computed
+	with the closed form n * (n + 1) / 2; the gap to the actual total is the
+	missing value.
+	"""
+	upper_bound = len(input_aray) + 1
+	expected_total = upper_bound * (upper_bound + 1) // 2
+	return expected_total - sum(input_aray)
+
+def trap_rain_water(input_aray: list[int]):
+	"""Return how many units of water the bar heights in ``input_aray`` trap.
+
+	Two-pointer scan: the side with the lower bar is always bounded by the
+	tallest bar seen so far on that side, so its water level is known and the
+	pointer can advance. This runs in O(n) time and O(1) extra memory and does
+	not depend on the bar heights, unlike materialising a height-by-width grid.
+
+	Heights below zero are treated as zero (no wall). An empty list traps
+	nothing and returns 0.
+	"""
+	if not input_aray:
+		return 0
+
+	left, right = 0, len(input_aray) - 1
+	left_max = right_max = 0
+	total_water = 0
+	while left < right:
+		left_height = max(input_aray[left], 0)
+		right_height = max(input_aray[right], 0)
+		if left_height <= right_height:
+			# A wall at least this tall exists on the right, so the left
+			# running maximum decides the water level over this column.
+			left_max = max(left_max, left_height)
+			total_water += left_max - left_height
+			left += 1
+		else:
+			right_max = max(right_max, right_height)
+			total_water += right_max - right_height
+			right -= 1
+	return total_water
+
--- a/src/modules/math.py
+++ b/src/modules/math.py
@@ -91,6 +91,7 @@ def t_calculate_integral(f, init, end, symbol):
    return integrate(f, (symbol, init, end))

 def test_math_module():
+	print("=== Math module ===")
 	t_exponent(2,8)
 	print(t_compound_interest(100, 20 / 100, 2, 12))
 	print(t_compound_interest_algorigthm(100, 20 / 100, 2, 12))
--- a/src/modules/probability.py
+++ b/src/modules/probability.py
@@ -1,8 +1,8 @@
 ## This module represents the second chapter of the book 
 ##  "Essential Math for Data Science" - Thomas Nield
-##  Chaper 2 - Probability 
+##  Chapter 2 - Probability 

-from scipy.stats import binom
+from scipy.stats import binom, beta
 from math import factorial

 # 2.0 odds means than an events has twice the probabilities to happen than not
@@ -12,8 +12,9 @@ def p_odds_to_probability(o):
 def p_probability_to_odds(p):
 	return (p / (1 - p))

-## Binomial distribution
-def p_binomial_distribution_example():
+class BinomialDistribution:
+	@staticmethod
+	def example():
 		n = 10
 		p = 0.9

@@ -21,22 +22,92 @@ def p_binomial_distribution_example():
 			probability = binom.pmf(k, n, p)
 			print("{0} >> {1}".format(k, probability))

-def binomial_coeficient(pool, count):
+	@staticmethod
+	def binomial_coeficient(pool, count):
 		return factorial(pool) / (factorial(count) * factorial(pool - count))

-def p_binomial_distribution_scratch(p, n):
+	@staticmethod
+	def from_scratch(p, n):
 		# For each number calc the probability of that exact number of outcomes (no order)
 		for k in range(n + 1):
 			# 1. Simple combinatory with the binomial coeficient (combinations of k elements out of a pool of n without repetition without order)
-		combinatory = binomial_coeficient(n, k)	
+			combinatory = BinomialDistribution.binomial_coeficient(n, k)	
 			# 2. Probability of success, the probability of making it k times
-		probability_of_success = p ** k # p * p, k times
+			probability_of_success = p ** k
 			# 3. Probability of failure, inverse of the success
-		probability_of_failure = (1 - p) ** (n - k) # inverse of probability the rest of the times
+			probability_of_failure = (1 - p) ** (n - k)
 			k_binomital_distribution_probability = combinatory * probability_of_success * probability_of_failure
 			print("[{0}]: {1}".format(k, k_binomital_distribution_probability))

+# Study probability of probabilities
+# > Given X success rate, get the likelihood of that success rate
+# > Returns a continuous function, so the final probability of X or better must be calculated using integrals (the area under the curve)
+class BetaDistribution:
+	"""Cumulative-probability helpers built on ``scipy.stats.beta``."""
+
+	@staticmethod
+	def calc(probability, success_count, failure_count):
+		"""Return the area to the left of ``probability`` (the beta CDF)."""
+		left_area = beta.cdf(probability, success_count, failure_count)
+		return left_area
+
+	@staticmethod
+	def calc_right(probability, success_count, failure_count):
+		"""Return the area to the right of ``probability``: 1.0 minus the CDF."""
+		left_area = BetaDistribution.calc(probability, success_count, failure_count)
+		return 1.0 - left_area
+
+	@staticmethod
+	def calc_region(init_probability, end_probability, success_count, failure_count):
+		"""Return the area between ``init_probability`` and ``end_probability``."""
+		upper_area = BetaDistribution.calc(end_probability, success_count, failure_count)
+		lower_area = BetaDistribution.calc(init_probability, success_count, failure_count)
+		return upper_area - lower_area
+
+# Shared inputs for the exercises: probability of rain and probability that
+# the umbrella order arrives.
+p_r = 0.3
+p_u = 0.4
+class Exercises:
+	"""Worked answers to the chapter exercises; ``test`` prints all of them."""
+
+	@staticmethod
+	def one():
+		# 30% chance of rain and 40% chance your umbrella order will arrive. P(R AND U)
+		return p_r * p_u
+
+	@staticmethod
+	def two():
+		# Same Ps as previous. P(!R OR U)
+		# RU !RU !R!U R!U
+		# Sum rule: P(A) + P(B) - P(A) * P(B), with A = "no rain".
+		p_not_rain = 1 - p_r
+		return p_not_rain + p_u - (p_not_rain * p_u)
+
+	@staticmethod
+	def three():
+		# Joint probability from a conditional: P(U | R) * P(R)
+		p_u_given_rain = 0.2
+		return p_u_given_rain * p_r
+
+	@staticmethod
+	def four():
+		# Binomial upper tail: add the probability of every outcome from 50 up to n.
+		n = 137
+		p_bail = 0.4
+		return sum((binom.pmf(x, n, p_bail) for x in range(50, n + 1)), 0.0)
+
+	@staticmethod
+	def five():
+		# Area to the right of 0.5 under a beta distribution with parameters h and t.
+		h = 8
+		t = 2
+		return 1.0 - BetaDistribution.calc(0.5, h, t)
+
+	@staticmethod
+	def test():
+		"""Print the answer to every exercise."""
+		print("P(R AND U) = {0}".format(Exercises.one()))
+		print("P(!R OR U) = {0}".format(Exercises.two()))
+		print("P(U AND R) = {0}".format(Exercises.three()))
+		print("P(At least 50 bail) = {0}".format(Exercises.four()))
+		print("P(fair) = {0}".format(Exercises.five()))
+

 def test_probability_module():
-	p_binomial_distribution_example()
-	p_binomial_distribution_scratch(0.9, 10)
+	print("=== Probability module ===")
+	print(">> Binomial distribution")
+	BinomialDistribution.example()
+	BinomialDistribution.from_scratch(0.9, 10)
+	print(">> Beta distribution")
+	print(BetaDistribution.calc(0.9,8,2))
+	print(BetaDistribution.calc_region(0.8, 0.9, 8, 2))
+	print(">> Exercises")
+	Exercises.test()
--- a/src/modules/statistics.py
+++ b/src/modules/statistics.py
@@ -0,0 +1,134 @@
+## This module represents the third chapter of the book 
+##  "Essential Math for Data Science" - Thomas Nield
+##  Chapter 3 - Statistics 
+
+from math import sqrt, pi, e, exp
+from scipy.stats import norm
+
+import random
+import plotly.express as px
+
+def mean(list):
+	"""Return the arithmetic mean: the total of the values divided by their count."""
+	total = sum(list)
+	return total / len(list)
+
+def weighted_mean(items, weights):
+	"""Return the mean of ``items`` with each one scaled by its weight.
+
+	Returns None when the two sequences differ in length.
+	"""
+	if len(items) != len(weights):
+		return None
+	weighted_total = 0
+	for item, weight in zip(items, weights):
+		weighted_total += item * weight
+	return weighted_total / sum(weights)
+
+def weighted_mean_inline(items, weights):
+	"""Return the weighted mean of ``items``, pairing values and weights with zip."""
+	weighted_total = sum(item * weight for item, weight in zip(items, weights))
+	return weighted_total / sum(weights)
+
+# also called 50% quantile
+def median(items):
+	"""Return the middle value of ``items`` once sorted.
+
+	For an even number of items the result is the average of the two central
+	values; for an odd number it is the central value itself. Raises
+	IndexError when ``items`` is empty.
+	"""
+	ordered = sorted(items)
+	length = len(ordered)
+	half = length // 2
+	if length % 2 == 0:
+		return (ordered[half - 1] + ordered[half]) / 2
+	return ordered[half]
+
+def mode(items):
+	"""Return the most frequent value(s) of ``items`` as a list.
+
+	More than one value is returned when several share the highest count;
+	values appear in the order they were first seen. An empty input gives
+	an empty list.
+	"""
+	from collections import Counter
+
+	counts = Counter(items)
+	if not counts:
+		return []
+	highest = max(counts.values())
+	return [value for value, count in counts.items() if count == highest]
+
+def population_variance(difference_list, mean):
+	"""Return the average squared distance of each value from ``mean``."""
+	squared_gaps = ((value - mean) ** 2 for value in difference_list)
+	return sum(squared_gaps, 0.0) / len(difference_list)
+
+def population_variance_inline(difference_list):
+	"""Return the population variance of ``difference_list``.
+
+	The mean is computed once up front rather than once per element, which
+	keeps the work linear in the number of values. Raises ZeroDivisionError
+	for an empty input.
+	"""
+	count = len(difference_list)
+	centre = sum(difference_list) / count
+	return sum((v - centre) ** 2 for v in difference_list) / count
+
+def sample_variance(difference_list):
+	"""Return the sample variance: squared distances from the mean over n - 1."""
+	count = len(difference_list)
+	centre = sum(difference_list) / count
+	squared_total = 0
+	for value in difference_list:
+		squared_total += (value - centre) ** 2
+	return squared_total / (count - 1)
+
+def population_standard_deviation(difference_list):
+	"""Return the square root of the population variance of ``difference_list``."""
+	variance = population_variance_inline(difference_list)
+	return sqrt(variance)
+
+def sample_standard_deviation(difference_list):
+	"""Return the square root of the sample variance of ``difference_list``."""
+	variance = sample_variance(difference_list)
+	return sqrt(variance)
+
+def standard_deviation(difference_list, is_sample):
+	"""Return the sample standard deviation when ``is_sample`` is truthy, else the population one."""
+	if is_sample:
+		return sample_standard_deviation(difference_list)
+	return population_standard_deviation(difference_list)
+
+## Normal distribution
+# The PDF generates the normal distribution (symmetric around the mean)
+def normal_probability_density_function(x: float, mean: float, standard_deviation: float):
+	"""Return the height of the normal curve at ``x`` for the given mean and standard deviation."""
+	variance = standard_deviation ** 2
+	normaliser = 1.0 / (2.0 * pi * variance) ** 0.5
+	exponent = -1.0 * ((x - mean) ** 2 / (2.0 * variance))
+	return normaliser * exp(exponent)
+
+def normal_cumulative_density_function(x, mean, difference_list):
+	"""Return the normal CDF at ``x``.
+
+	The scale is the population standard deviation of ``difference_list``;
+	the centre is the ``mean`` passed in.
+	"""
+	return norm.cdf(x, mean, standard_deviation(difference_list, False))
+
+# Find the x value whose cumulative probability equals the given probability
+def inverse_cumulative_density_function(prob, mean, std_dev):
+	"""Return ``norm.ppf(prob, mean, std_dev)``: the inverse of the normal CDF."""
+	return norm.ppf(prob, mean, std_dev)
+
+# Z-scores are valuable in order to normalize 2 pieces of data
+def z_score(value, data_mean, std_deviation):
+	"""Return how many standard deviations ``value`` lies from ``data_mean``."""
+	distance = value - data_mean
+	return distance / std_deviation
+
+def coeficient_of_variation(std_deviation, mean):
+	"""Return the standard deviation expressed as a fraction of the mean."""
+	ratio = std_deviation / mean
+	return ratio
+
+def test_central_limit_theorem(sample_size, sample_count):
+	"""Show a 20-bin histogram of ``sample_count`` sample means.
+
+	Each mean is taken over ``sample_size`` draws from ``random.uniform(0.0, 1.0)``.
+	"""
+	sample_means = []
+	for _ in range(sample_count):
+		draws = [random.uniform(0.0, 1.0) for _ in range(sample_size)]
+		sample_means.append(sum(draws) / sample_size)
+	# Every sample mean contributes a weight of 1 to its bin.
+	unit_weights = [1] * sample_count
+	figure = px.histogram(x=sample_means, y=unit_weights, nbins=20)
+	figure.show()
+
+def generic_critical_z_value(probability):
+	"""Return the (lower, upper) z-values that enclose ``probability`` of the standard normal.
+
+	The area left outside the interval is split evenly between the two
+	tails, so the pair is symmetric around zero.
+	"""
+	norm_dist = norm(loc=0.0, scale=1.0)
+	left_tail_area = (1.0 - probability) / 2.0
+	upper_area = 1.0 - left_tail_area
+	return norm_dist.ppf(left_tail_area), norm_dist.ppf(upper_area)
+
+def margin_of_error(sample_size, standard_deviation, z_value):
+	"""Return ``z_value`` times the standard error (standard deviation over sqrt of sample size).
+
+	The result carries the sign of ``z_value``, so a lower-tail z gives a
+	negative margin and an upper-tail z a positive one.
+	"""
+	standard_error = standard_deviation / sqrt(sample_size)
+	return z_value * standard_error
+
+# How confident we are in a population metric given a sample (the interval in which we are "probability" sure the value will lie)
+def confidence_interval(probability, sample_size, standard_deviation, z_value, mean):
+	"""Return the interval around ``mean`` as ``(mean + margin, mean - margin)``.
+
+	The margin is ``margin_of_error(sample_size, standard_deviation, z_value)``.
+	When ``z_value`` is None, the upper critical z-value for ``probability``
+	on the standard normal is used; otherwise ``probability`` is not
+	consulted and the caller's ``z_value`` is taken as given.
+	"""
+	if z_value is None:
+		# Upper critical value: everything except the upper tail lies to its left.
+		z_value = norm.ppf(1.0 - (1.0 - probability) / 2.0)
+	margin = margin_of_error(sample_size, standard_deviation, z_value)
+	return mean + margin, mean - margin
+
+
+def test_statistics_module():
+	"""Print a walkthrough of the statistics helpers, then show the two central-limit histograms."""
+	print("=== Statistics module ===")
+	values = [ 1, 2, 3, 4, 5, 6]
+	print(">> The mean of {0} is {1}".format(values, mean(values)))
+	weights = [0.2, 0.5, 0.7, 1, 0, 0.9]
+	print(">> The weighted_mean of {0} is {1} and it is equivalent to {2}".format(values, weighted_mean(values, weights), weighted_mean_inline(values, weights)))
+	print(">> The median is {0}".format(median(values)))
+
+	differences = [ -6.571, -5.571, -1.571, 0.429, 2.429, 3.429, 7.429 ]
+	differences_mean = sum(differences) / len(differences)
+	print("The population variance is", population_variance(differences, differences_mean), population_variance_inline(differences))
+	print("The standard deviation is", standard_deviation(differences, False))
+	# Build a sample by dropping two of the population values.
+	sample = differences.copy()
+	del sample[3]
+	del sample[1]
+	print("The sample variance for a population is", sample_variance(sample))
+	print("The sample standard deviation for a population is", standard_deviation(sample, True))
+
+	print("== Normal distribution ==")
+	print(">> The probability_density_function for x = 1 over the example data is {0}".format(normal_probability_density_function(1, differences_mean, standard_deviation(differences, False))))
+	print(">> The probability for observing a value smaller than 1 is given by the cumulative density function and it is: {0}".format(normal_cumulative_density_function(1, differences_mean, differences)))
+
+	print("== Z-scores ==")
+	print("A house (A) of 150K in a neighborhood of 140K mean and 3K std_dev has a Z-score: {0}".format(z_score(150000, 140000, 3000)))
+	print("A house (B) of 815K in a neighborhood of 800K mean and 10K std_dev has a Z-score: {0}".format(z_score(815000, 800000, 10000)))
+	print("The House A is much more expensive because its z-score is higher.")
+	# First argument pair belongs to neighborhood A, second to B; the labels follow that order.
+	print("The neighborhood of A has a coeficient of variation: {0}, and the one of B: {1}".format(coeficient_of_variation(3000, 140000), coeficient_of_variation(10000, 800000)))
+	print("This means that the neighborhood of A has more spread in its prices")
+
+	## Central limit theorem
+	test_central_limit_theorem(sample_size=1, sample_count=1000)
+	test_central_limit_theorem(sample_size=31, sample_count=1000)
--- a/tests/test_essentials.py
+++ b/tests/test_essentials.py
@@ -0,0 +1,41 @@
+import pytest
+
+from src.modules.exercises.essentials import *
+
+def test_subarray_sum():
+	"""Check maximum_subarray_sum against (values, [sum, subarray]) pairs."""
+	cases = [
+		([2,3,-8,7,-1,2,3], [11, [7, -1, 2, 3]]),
+		([-2,-4], [-2, [-2]]),
+		([5,4,1,7,8], [25, [5,4,1,7,8]]),
+	]
+	for values, expected in cases:
+		assert maximum_subarray_sum(values) == expected
+
+def test_find_missing():
+	"""Check find_missing against (values, missing number) pairs."""
+	cases = [
+		([8,2,4,5,3,7,1], 6),
+		([1,2,3,5], 4),
+	]
+	for values, expected in cases:
+		assert find_missing(values) == expected
+
+def test_rain_watter():
+	"""Check trap_rain_water against (heights, trapped units) pairs."""
+	cases = [
+		([3,0,1,0,4,0,2], 10),
+		([3,0,2,0,4], 7),
+		([1,2,3,4], 0),
+	]
+	for heights, expected in cases:
+		assert trap_rain_water(heights) == expected
Author	SHA1	Message	Date
Daniel Heras Quesada	893250e7b5	minor statistics definitions	2025-12-19 00:24:16 +01:00
Daniel Heras Quesada	170697c955	feat: some more statistics	2025-12-05 23:12:26 +01:00
Daniel Heras Quesada	c37dba44a8	feat: normal distribution	2025-11-24 22:00:26 +01:00
Daniel Heras Quesada	6a86e0ab9d	feat: trapping rain water problem done	2025-11-23 23:54:34 +01:00
Daniel Heras Quesada	feb122711a	feat: find_missing	2025-11-23 23:15:49 +01:00
Daniel Heras Quesada	a26970f873	feat: more statistics + a simple exercise	2025-11-23 17:10:25 +01:00
Daniel Heras Quesada	86e86ea567	refactor: organization	2025-11-19 00:28:30 +01:00
Daniel Heras Quesada	cf7185c853	feat: requirements updated	2025-11-18 21:13:31 +01:00
Daniel Heras Quesada	f9a6cd540d	feat: probability exercises	2025-11-18 17:47:10 +01:00
Daniel Heras Quesada	8cff7e1c5a	feat: beta distribution	2025-11-18 13:09:24 +01:00