refactor: module organization
This commit is contained in:
21
src/main.py
21
src/main.py
@@ -1,20 +1,13 @@
|
|||||||
from sympy import diff, limit, oo, symbols
|
from sympy import diff, limit, oo, symbols
|
||||||
import unittest
|
import unittest
|
||||||
|
|
||||||
from modules.math import (
|
from modules.essential_math.examples.statistics_example import (
|
||||||
test_math_module
|
normal_distribution_example,
|
||||||
|
basic_statistic_concepts_example,
|
||||||
|
z_scores_example,
|
||||||
)
|
)
|
||||||
from modules.probability import (
|
|
||||||
test_probability_module
|
|
||||||
)
|
|
||||||
from modules.statistics import (
|
|
||||||
test_statistics_module
|
|
||||||
)
|
|
||||||
from modules.strings import t_strings
|
|
||||||
|
|
||||||
if __name__ == "__main__":
    # Entry point: uncomment the example(s) to run.
    # basic_statistic_concepts_example()
    # normal_distribution_example()
    # z_scores_example()
    # Fix: with every call commented out the `if` body was empty,
    # which is a SyntaxError in Python; `pass` keeps the file runnable.
    pass
||||||
|
|||||||
74
src/modules/essential_math/examples/statistics_example.py
Normal file
74
src/modules/essential_math/examples/statistics_example.py
Normal file
@@ -0,0 +1,74 @@
|
|||||||
|
from modules.essential_math.statistics import (
|
||||||
|
mean,
|
||||||
|
median,
|
||||||
|
weighted_mean,
|
||||||
|
weighted_mean_inline,
|
||||||
|
population_variance,
|
||||||
|
population_variance_inline,
|
||||||
|
sample_variance,
|
||||||
|
standard_deviation,
|
||||||
|
normal_probability_density_function,
|
||||||
|
normal_cumulative_density_function,
|
||||||
|
inverse_cumulative_density_function,
|
||||||
|
z_score,
|
||||||
|
coeficient_of_variation,
|
||||||
|
test_central_limit_theorem,
|
||||||
|
generic_critical_z_value,
|
||||||
|
margin_of_error,
|
||||||
|
confidence_interval,
|
||||||
|
)
|
||||||
|
|
||||||
|
def basic_statistic_concepts_example():
    """Walk through mean, weighted mean, median, variance and deviation
    using two small hard-coded samples, printing each result."""
    print("=== Statistics module ===")

    numbers = [1, 2, 3, 4, 5, 6]
    print(">> The mean of {0} is {1}".format(numbers, mean(numbers)))

    weights = [0.2, 0.5, 0.7, 1, 0, 0.9]
    print(">> The weighted_mean of {0} is {1} and it is equivalent to {2}".format(numbers, weighted_mean(numbers, weights), weighted_mean_inline(numbers, weights)))

    print(">> The median is {0}".format(median(numbers)))

    values = [0, 1, 5, 7, 9, 10, 14]
    values_mean = sum(values) / len(values)
    variance_from_mean = population_variance(values, values_mean)
    variance_inline = population_variance_inline(values)
    print("The population variance is", variance_from_mean, variance_inline)

    std_dev = standard_deviation(values, False)
    print("The standard deviation is", std_dev)

    # Drop two elements to simulate a sample drawn from the population.
    sample = values.copy()
    del sample[3]
    del sample[1]
    print("The sample variance for a population is", sample_variance(sample))
    print("The standard deviation for a population is", standard_deviation(sample, True))
|
||||||
|
|
||||||
|
def normal_distribution_example():
    """Demonstrate the normal PDF, CDF and inverse-CDF helpers over a
    small fixed data set."""
    print("== Normal distribution ==")

    values = [0, 1, 5, 7, 9, 10, 14]
    # Renamed from `mean` to avoid shadowing the imported mean() helper.
    data_mean = sum(values) / len(values)
    std_dev = standard_deviation(values, False)
    target_x = 1

    print(">> The probability_density_function for x = 1 over the example data is {0}".format(normal_probability_density_function(target_x, data_mean, std_dev)))

    print(">> The probability for observing a value smaller than 1 is given by the cumulative density function and it is: {0}".format(normal_cumulative_density_function(target_x, data_mean, std_dev)))

    target_probability = 0.5
    expected_value = inverse_cumulative_density_function(target_probability, data_mean, std_dev)
    print(">> For a probability of .5 we expect the value: ", expected_value)
|
||||||
|
|
||||||
|
def z_scores_example():
    """Compare two house prices from different neighborhoods via z-scores
    and coefficients of variation."""
    print("== Z-scores ==")

    score_a = z_score(150000, 140000, 3000)
    score_b = z_score(815000, 800000, 10000)
    print("A house (A) of 150K in a neighborhood of 140K mean and 3K std_dev has a Z-score: {0}".format(score_a))
    print("A house (B) of 815K in a neighborhood of 800K mean and 10K std_dev has a Z-score: {0}".format(score_b))
    print("The House A is much more expensive because its z-score is higher.")
    print("The neighborhood of B has a coeficient of variation: {0}, and the one of A: {1}".format(coeficient_of_variation(3000, 140000), coeficient_of_variation(10000, 800000)))
    print("This means that the neighborhood of A has more spread in its prices")
|
||||||
|
|
||||||
|
def central_limit_theorem_example():
    """Illustrate the central limit theorem: histograms of sample means
    approach a normal shape as the sample size grows."""
    for size in (1, 31):
        test_central_limit_theorem(sample_size=size, sample_count=1000)
|
||||||
100
src/modules/essential_math/statistics.py
Normal file
100
src/modules/essential_math/statistics.py
Normal file
@@ -0,0 +1,100 @@
|
|||||||
|
## This module represents the third chapter of the book
|
||||||
|
## "Essential Math for Data Science" - Thomas Nield
|
||||||
|
## Chapter 3 - Statistics
|
||||||
|
|
||||||
|
from math import sqrt, pi, e, exp
|
||||||
|
from scipy.stats import norm
|
||||||
|
|
||||||
|
import random
|
||||||
|
import plotly.express as px
|
||||||
|
|
||||||
|
def mean(list):
    """Return the arithmetic mean of the values in *list*.

    NOTE(review): the parameter shadows the builtin ``list``; kept as-is
    because the name is part of the public signature.
    """
    total = sum(list)
    return total / len(list)
|
||||||
|
|
||||||
|
def weighted_mean(items, weights):
    """Return the weighted mean of *items* with the given *weights*.

    Returns ``None`` when the two sequences differ in length (preserves
    the original best-effort contract).
    """
    if len(items) != len(weights):
        return
    weighted_total = sum(item * weight for item, weight in zip(items, weights))
    return weighted_total / sum(weights)
|
||||||
|
|
||||||
|
def weighted_mean_inline(items, weights):
    """Weighted mean of *items*; explicit-accumulator variant of
    weighted_mean (assumes equal-length sequences)."""
    numerator = 0
    for value, weight in zip(items, weights):
        numerator += value * weight
    return numerator / sum(weights)
|
||||||
|
# also called 50% quantile
def median(items):
    """Return the median of *items*.

    For an even count, the median is the average of the two middle values
    of the sorted sequence; for an odd count, the single middle value.

    Bug fix: the odd-length branch referenced an undefined name ``n``
    (``int(n/2)``), raising NameError; it now uses ``length // 2``.
    """
    ordered = sorted(items)
    length = len(ordered)
    pair = length % 2 == 0
    mid = length // 2 - 1 if pair else length // 2

    if pair:
        return (ordered[mid] + ordered[mid + 1]) / 2
    else:
        return ordered[mid]
|
||||||
|
|
||||||
|
def mode(items):
    """Return the list of most frequent value(s) in *items*.

    Ties are all returned, in first-seen order; an empty input yields [].

    Bug fix: the original was an unfinished stub (it only created an
    unused local and implicitly returned None).
    """
    from collections import Counter  # local import keeps this fix self-contained
    if not items:
        return []
    counts = Counter(items)
    highest = max(counts.values())
    return [value for value, count in counts.items() if count == highest]
|
||||||
|
|
||||||
|
def population_variance(value_list, mean):
    """Population variance of *value_list* around the caller-supplied *mean*
    (divides by N, not N-1)."""
    squared_deviations = [(value - mean) ** 2 for value in value_list]
    return sum(squared_deviations) / len(value_list)
|
||||||
|
|
||||||
|
def population_variance_inline(value_list):
    """Population variance of *value_list*, computing the mean internally.

    Perf fix: the original recomputed ``sum(...)/len(...)`` for every
    element (accidental O(n^2)); the mean is now hoisted out of the loop.
    The numeric result is unchanged.
    """
    mean_value = sum(value_list) / len(value_list)
    return sum((v - mean_value) ** 2 for v in value_list) / len(value_list)
|
||||||
|
|
||||||
|
def sample_variance(value_list):
    """Unbiased sample variance of *value_list* (divides by n - 1)."""
    n = len(value_list)
    average = sum(value_list) / n
    total = 0.0
    for value in value_list:
        total += (value - average) ** 2
    return total / (n - 1)
|
||||||
|
|
||||||
|
def population_standard_deviation(value_list):
    """Square root of the population variance of *value_list*."""
    variance = population_variance_inline(value_list)
    return sqrt(variance)
|
||||||
|
|
||||||
|
def sample_standard_deviation(value_list):
    """Square root of the unbiased sample variance of *value_list*."""
    variance = sample_variance(value_list)
    return sqrt(variance)
|
||||||
|
|
||||||
|
def standard_deviation(value_list, is_sample):
    """Dispatch to the sample (n-1) or population (n) standard deviation
    depending on the *is_sample* flag."""
    if is_sample:
        return sample_standard_deviation(value_list)
    return population_standard_deviation(value_list)
|
||||||
|
|
||||||
|
## Normal distribution
|
||||||
|
# PDF of the Normal Distribution (symmetric around the mean)
def normal_probability_density_function(x: float, mean: float, standard_deviation: float):
    """Evaluate the normal PDF at *x* for the given mean and standard
    deviation: exp(-(x-mean)^2 / (2*sigma^2)) / sqrt(2*pi*sigma^2)."""
    variance = standard_deviation ** 2
    coefficient = 1.0 / (2.0 * pi * variance) ** 0.5
    return coefficient * exp(-1.0 * ((x - mean) ** 2 / (2.0 * variance)))
|
||||||
|
|
||||||
|
def normal_cumulative_density_function(x, mean, std_deviation):
    """P(X <= x) for a normal distribution with the given mean and
    standard deviation, via scipy's norm.cdf."""
    cumulative = norm.cdf(x, mean, std_deviation)
    return cumulative
|
||||||
|
|
||||||
|
# Check expected value for a given probability
def inverse_cumulative_density_function(prob, mean, std_dev):
    """Return the value x such that P(X <= x) == *prob* for a normal
    distribution (the quantile function, norm.ppf)."""
    return norm.ppf(prob, mean, std_dev)
|
||||||
|
|
||||||
|
# Z-scores are valuable in order to normalize 2 pieces of data
def z_score(value, data_mean, std_deviation):
    """Number of standard deviations *value* lies away from *data_mean*."""
    deviation = value - data_mean
    return deviation / std_deviation
|
||||||
|
|
||||||
|
def coeficient_of_variation(std_deviation, mean):
    """Ratio of standard deviation to mean — relative spread of a data set.

    NOTE(review): "coeficient" is a typo for "coefficient", but the name
    is public API, so it is kept.
    """
    return std_deviation / mean
|
||||||
|
|
||||||
|
def test_central_limit_theorem(sample_size, sample_count):
    """Draw *sample_count* samples of *sample_size* uniform(0,1) values,
    and show a histogram of the sample means (CLT demo; opens a plot)."""
    sample_means = []
    for _ in range(sample_count):
        draws = [random.uniform(0.0, 1.0) for _ in range(sample_size)]
        sample_means.append(sum(draws) / sample_size)
    bar_weights = [1] * sample_count
    px.histogram(x=sample_means, y=bar_weights, nbins=20).show()
|
||||||
|
|
||||||
|
def generic_critical_z_value(probability):
    """Return the (lower, upper) critical z-values that bound the central
    *probability* mass of the standard normal distribution.

    Bug fix: the body referenced an undefined name ``p`` instead of the
    ``probability`` parameter, raising NameError on every call.
    """
    norm_dist = norm(loc=0.0, scale=1.0)
    tail_area = (1.0 - probability) / 2.0
    lower = norm_dist.ppf(tail_area)
    upper = norm_dist.ppf(1.0 - tail_area)
    return lower, upper
|
||||||
|
|
||||||
|
def margin_of_error(sample_size, standard_deviation, z_value):
    """Half-width of a confidence interval: z * (sigma / sqrt(n)).

    The sign/tail is chosen by the caller through *z_value*.
    """
    standard_error = standard_deviation / sqrt(sample_size)
    return z_value * standard_error
|
||||||
|
|
||||||
|
# How confident we are at a population metric given a sample (the interval we are "probability" sure the value will be)
def confidence_interval(probability, sample_size, standard_deviation, z_value, mean):
    """Return the (upper, lower) confidence-interval bounds around *mean*.

    Bug fixes: the original called ``margin_error(...)`` — a NameError,
    the helper is ``margin_of_error`` — and computed an unused critical
    z value.  *probability* is retained for backward compatibility;
    derive *z_value* from generic_critical_z_value(probability) if needed.
    """
    margin = margin_of_error(sample_size, standard_deviation, z_value)
    return mean + margin, mean - margin
|
||||||
|
|
||||||
@@ -1,134 +0,0 @@
|
|||||||
## This module represents the third chapter of the book
|
|
||||||
## "Essential Math for Data Science" - Thomas Nield
|
|
||||||
## Chapter 3 - Statistics
|
|
||||||
|
|
||||||
from math import sqrt, pi, e, exp
|
|
||||||
from scipy.stats import norm
|
|
||||||
|
|
||||||
import random
|
|
||||||
import plotly.express as px
|
|
||||||
|
|
||||||
def mean(list):
|
|
||||||
return sum(list) / len(list)
|
|
||||||
|
|
||||||
def weighted_mean(items, weights):
|
|
||||||
if (len(items) != len(weights)):
|
|
||||||
return
|
|
||||||
total = 0
|
|
||||||
for i in range(len(items)):
|
|
||||||
total += items[i] * weights[i]
|
|
||||||
return total / sum(weights)
|
|
||||||
|
|
||||||
def weighted_mean_inline(items, weights):
|
|
||||||
return sum(s * w for s, w in zip(items, weights)) / sum(weights)
|
|
||||||
|
|
||||||
# also called 50% quantile
|
|
||||||
def median(items):
|
|
||||||
ordered = sorted(items)
|
|
||||||
length = len(ordered)
|
|
||||||
pair = length % 2 == 0
|
|
||||||
mid = int(length / 2) - 1 if pair else int(n/2)
|
|
||||||
|
|
||||||
if pair:
|
|
||||||
return (ordered[mid] + ordered[mid+1]) / 2
|
|
||||||
else:
|
|
||||||
return ordered[mid]
|
|
||||||
|
|
||||||
def mode(items):
|
|
||||||
sums = []
|
|
||||||
|
|
||||||
def population_variance(difference_list, mean):
|
|
||||||
summatory = 0.0
|
|
||||||
for diff in difference_list:
|
|
||||||
summatory += (diff - mean) ** 2
|
|
||||||
return summatory / len(difference_list)
|
|
||||||
|
|
||||||
def population_variance_inline(difference_list):
|
|
||||||
return sum((v - (sum(difference_list) / len(difference_list))) ** 2 for v in difference_list) / len(difference_list)
|
|
||||||
|
|
||||||
def sample_variance(difference_list):
|
|
||||||
mean = sum(difference_list) / len(difference_list)
|
|
||||||
return sum((diff - mean) ** 2 for diff in difference_list) / (len(difference_list) - 1)
|
|
||||||
|
|
||||||
def population_standard_deviation(difference_list):
|
|
||||||
return sqrt(population_variance_inline(difference_list))
|
|
||||||
|
|
||||||
def sample_standard_deviation(difference_list):
|
|
||||||
return sqrt(sample_variance(difference_list))
|
|
||||||
|
|
||||||
def standard_deviation(difference_list, is_sample):
|
|
||||||
return sample_standard_deviation(difference_list) if is_sample else population_standard_deviation(difference_list)
|
|
||||||
|
|
||||||
## Normal distribution
|
|
||||||
# PDF generates the Normal Distribution (symetric arround the mean)
|
|
||||||
def normal_probability_density_function(x: float, mean: float, standard_deviation: float):
|
|
||||||
return (1.0 / (2.0 * pi * standard_deviation ** 2) ** 0.5) * exp(-1.0 * ((x - mean) ** 2 / (2.0 * standard_deviation ** 2)))
|
|
||||||
|
|
||||||
def normal_cumulative_density_function(x, mean, difference_list):
|
|
||||||
std_dev = standard_deviation(difference_list, False)
|
|
||||||
return norm.cdf(x, mean, std_dev)
|
|
||||||
|
|
||||||
# Check exected value for a given probability
|
|
||||||
def inverse_cumulative_density_function(prob, mean, std_dev):
|
|
||||||
x = norm.ppf(prob, mean, std_dev)
|
|
||||||
return x
|
|
||||||
|
|
||||||
# Z-scores are valuable in order to normalize 2 pieces of data
|
|
||||||
def z_score(value, data_mean, std_deviation):
|
|
||||||
return (value - data_mean) / std_deviation
|
|
||||||
|
|
||||||
def coeficient_of_variation(std_deviation, mean):
|
|
||||||
return (std_deviation / mean)
|
|
||||||
|
|
||||||
def test_central_limit_theorem(sample_size, sample_count):
|
|
||||||
x_values = [(sum([random.uniform(0.0,1.0) for i in range(sample_size)]) / sample_size) for _ in range(sample_count)]
|
|
||||||
y_values = [1 for _ in range(sample_count)]
|
|
||||||
px.histogram(x=x_values, y=y_values, nbins=20).show()
|
|
||||||
|
|
||||||
def generic_critical_z_value(probability):
|
|
||||||
norm_dist = norm(loc=0.0, scale=1.0)
|
|
||||||
left_tail_area = (1.0 - p) / 2.0
|
|
||||||
upper_area = 1.0 - ((1.0 - p) / 2.0)
|
|
||||||
return norm_dist.ppf(left_tail_area), norm_dist.ppf(upper_area)
|
|
||||||
|
|
||||||
def margin_of_error(sample_size, standard_deviation, z_value):
|
|
||||||
return z_value * (standard_deviation / sqrt(sample_size)) # +-, we return the one provided by the z_value (tail or upper)
|
|
||||||
|
|
||||||
# How confident we are at a population metric given a sample (the interval we are "probability" sure the value will be)
|
|
||||||
def confidence_interval(probability, sample_size, standard_deviation, z_value, mean):
|
|
||||||
critical_z = generic_critical_z_value(probability)
|
|
||||||
margin_error = margin_error(sample_size, standard_deviation, z_value)
|
|
||||||
return mean + margin_error, mean - margin_error
|
|
||||||
|
|
||||||
|
|
||||||
def test_statistics_module():
|
|
||||||
print("=== Statistics module ===")
|
|
||||||
list = [ 1, 2, 3, 4, 5, 6]
|
|
||||||
print(">> The mean of {0} is {1}".format(list, mean(list)))
|
|
||||||
weights = [0.2, 0.5, 0.7, 1, 0, 0.9]
|
|
||||||
print(">> The weighted_mean of {0} is {1} and it is equivalent to {2}".format(list, weighted_mean(list, weights), weighted_mean_inline(list, weights)))
|
|
||||||
print(">> The mean is {0}".format(median(list)))
|
|
||||||
|
|
||||||
differences = [ -6.571, -5.571, -1.571, 0.429, 2.429, 3.429, 7.429 ]
|
|
||||||
print("The population variance is", population_variance(differences, sum(differences) / len(differences)), population_variance_inline(differences))
|
|
||||||
print("The standard deviation is", standard_deviation(differences, False))
|
|
||||||
sample = differences.copy()
|
|
||||||
del sample[3]
|
|
||||||
del sample[1]
|
|
||||||
print("The sample variance for a population is", sample_variance(sample))
|
|
||||||
print("The standard deviation for a population is", standard_deviation(sample, True))
|
|
||||||
|
|
||||||
print("== Normal distribution ==")
|
|
||||||
print(">> The probability_density_function for x = 1 over the example data is {0}".format(normal_probability_density_function(1, sum(differences) / len(differences), standard_deviation(differences, False))))
|
|
||||||
print(">> The probability for observing a value smaller than 1 is given by the cumulative density function and it is: {0}".format(normal_cumulative_density_function(1, sum(differences) / len(differences), differences)))
|
|
||||||
|
|
||||||
print("== Z-scores ==")
|
|
||||||
print("A house (A) of 150K in a neighborhood of 140K mean and 3K std_dev has a Z-score: {0}".format(z_score(150000, 140000, 3000)))
|
|
||||||
print("A house (B) of 815K in a neighborhood of 800K mean and 10K std_dev has a Z-score: {0}".format(z_score(815000, 800000, 10000)))
|
|
||||||
print("The House A is much more expensive because its z-score is higher.")
|
|
||||||
print("The neighborhood of B has a coeficient of variation: {0}, and the one of A: {1}".format(coeficient_of_variation(3000, 140000), coeficient_of_variation(10000, 800000)))
|
|
||||||
print("This means that the neighborhood of A has more spread in its prices")
|
|
||||||
|
|
||||||
## Central limit theorem
|
|
||||||
test_central_limit_theorem(sample_size=1, sample_count=1000)
|
|
||||||
test_central_limit_theorem(sample_size=31, sample_count=1000)
|
|
||||||
Reference in New Issue
Block a user