math.stats: update math.stats using generics (#11482)

pull/11489/head
Ulises Jeremias Cornejo Fandos 2021-09-13 04:20:38 -03:00 committed by GitHub
parent 30029eaf5d
commit 480fe8041a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 356 additions and 120 deletions

View File

@ -2,40 +2,16 @@ module stats
import math import math
// TODO: Implement all of them with generics
// This module defines the following statistical operations on f64 array
// ---------------------------
// | Summary of Functions |
// ---------------------------
// -----------------------------------------------------------------------
// freq - Frequency
// mean - Mean
// geometric_mean - Geometric Mean
// harmonic_mean - Harmonic Mean
// median - Median
// mode - Mode
// rms - Root Mean Square
// population_variance - Population Variance
// sample_variance - Sample Variance
// population_stddev - Population Standard Deviation
// sample_stddev - Sample Standard Deviation
// mean_absdev - Mean Absolute Deviation
// min - Minimum of the Array
// max - Maximum of the Array
// range - Range of the Array ( max - min )
// -----------------------------------------------------------------------
// Measure of Occurance // Measure of Occurance
// Frequency of a given number // Frequency of a given number
// Based on // Based on
// https://www.mathsisfun.com/data/frequency-distribution.html // https://www.mathsisfun.com/data/frequency-distribution.html
pub fn freq(arr []f64, val f64) int { pub fn freq<T>(data []T, val T) int {
if arr.len == 0 { if data.len == 0 {
return 0 return 0
} }
mut count := 0 mut count := 0
for v in arr { for v in data {
if v == val { if v == val {
count++ count++
} }
@ -47,60 +23,60 @@ pub fn freq(arr []f64, val f64) int {
// Mean of the given input array // Mean of the given input array
// Based on // Based on
// https://www.mathsisfun.com/data/central-measures.html // https://www.mathsisfun.com/data/central-measures.html
pub fn mean(arr []f64) f64 { pub fn mean<T>(data []T) T {
if arr.len == 0 { if data.len == 0 {
return f64(0) return T(0)
} }
mut sum := f64(0) mut sum := T(0)
for v in arr { for v in data {
sum += v sum += v
} }
return sum / f64(arr.len) return sum / T(data.len)
} }
// Measure of Central Tendancy // Measure of Central Tendancy
// Geometric Mean of the given input array // Geometric Mean of the given input array
// Based on // Based on
// https://www.mathsisfun.com/numbers/geometric-mean.html // https://www.mathsisfun.com/numbers/geometric-mean.html
pub fn geometric_mean(arr []f64) f64 { pub fn geometric_mean<T>(data []T) T {
if arr.len == 0 { if data.len == 0 {
return f64(0) return T(0)
} }
mut sum := f64(1) mut sum := 1.0
for v in arr { for v in data {
sum *= v sum *= v
} }
return math.pow(sum, f64(1) / arr.len) return math.pow(sum, 1.0 / T(data.len))
} }
// Measure of Central Tendancy // Measure of Central Tendancy
// Harmonic Mean of the given input array // Harmonic Mean of the given input array
// Based on // Based on
// https://www.mathsisfun.com/numbers/harmonic-mean.html // https://www.mathsisfun.com/numbers/harmonic-mean.html
pub fn harmonic_mean(arr []f64) f64 { pub fn harmonic_mean<T>(data []T) T {
if arr.len == 0 { if data.len == 0 {
return f64(0) return T(0)
} }
mut sum := f64(0) mut sum := T(0)
for v in arr { for v in data {
sum += f64(1) / v sum += 1.0 / v
} }
return f64(arr.len) / sum return T(data.len) / sum
} }
// Measure of Central Tendancy // Measure of Central Tendancy
// Median of the given input array ( input array is assumed to be sorted ) // Median of the given input array ( input array is assumed to be sorted )
// Based on // Based on
// https://www.mathsisfun.com/data/central-measures.html // https://www.mathsisfun.com/data/central-measures.html
pub fn median(arr []f64) f64 { pub fn median<T>(sorted_data []T) T {
if arr.len == 0 { if sorted_data.len == 0 {
return f64(0) return T(0)
} }
if arr.len % 2 == 0 { if sorted_data.len % 2 == 0 {
mid := (arr.len / 2) - 1 mid := (sorted_data.len / 2) - 1
return (arr[mid] + arr[mid + 1]) / f64(2) return (sorted_data[mid] + sorted_data[mid + 1]) / T(2)
} else { } else {
return arr[((arr.len - 1) / 2)] return sorted_data[((sorted_data.len - 1) / 2)]
} }
} }
@ -108,114 +84,198 @@ pub fn median(arr []f64) f64 {
// Mode of the given input array // Mode of the given input array
// Based on // Based on
// https://www.mathsisfun.com/data/central-measures.html // https://www.mathsisfun.com/data/central-measures.html
pub fn mode(arr []f64) f64 { pub fn mode<T>(data []T) T {
if arr.len == 0 { if data.len == 0 {
return f64(0) return T(0)
} }
mut freqs := []int{} mut freqs := []int{}
for v in arr { for v in data {
freqs << freq(arr, v) freqs << freq(data, v)
} }
mut max := 0 mut max := 0
for i in 0 .. freqs.len { for i := 0; i < freqs.len; i++ {
if freqs[i] > freqs[max] { if freqs[i] > freqs[max] {
max = i max = i
} }
} }
return arr[max] return data[max]
} }
// Root Mean Square of the given input array // Root Mean Square of the given input array
// Based on // Based on
// https://en.wikipedia.org/wiki/Root_mean_square // https://en.wikipedia.org/wiki/Root_mean_square
pub fn rms(arr []f64) f64 { pub fn rms<T>(data []T) T {
if arr.len == 0 { if data.len == 0 {
return f64(0) return T(0)
} }
mut sum := f64(0) mut sum := T(0)
for v in arr { for v in data {
sum += math.pow(v, 2) sum += math.pow(v, 2)
} }
return math.sqrt(sum / f64(arr.len)) return math.sqrt(sum / T(data.len))
} }
// Measure of Dispersion / Spread // Measure of Dispersion / Spread
// Population Variance of the given input array // Population Variance of the given input array
// Based on // Based on
// https://www.mathsisfun.com/data/standard-deviation.html // https://www.mathsisfun.com/data/standard-deviation.html
pub fn population_variance(arr []f64) f64 { [inline]
if arr.len == 0 { pub fn population_variance<T>(data []T) T {
return f64(0) if data.len == 0 {
return T(0)
} }
m := mean(arr) data_mean := mean<T>(data)
mut sum := f64(0) return population_variance_mean<T>(data, data_mean)
for v in arr { }
sum += math.pow(v - m, 2)
// Measure of Dispersion / Spread
// Population Variance of the given input array
// Based on
// https://www.mathsisfun.com/data/standard-deviation.html
pub fn population_variance_mean<T>(data []T, mean T) T {
if data.len == 0 {
return T(0)
} }
return sum / f64(arr.len) mut sum := T(0)
for v in data {
sum += (v - mean) * (v - mean)
}
return sum / T(data.len)
} }
// Measure of Dispersion / Spread // Measure of Dispersion / Spread
// Sample Variance of the given input array // Sample Variance of the given input array
// Based on // Based on
// https://www.mathsisfun.com/data/standard-deviation.html // https://www.mathsisfun.com/data/standard-deviation.html
pub fn sample_variance(arr []f64) f64 { [inline]
if arr.len == 0 { pub fn sample_variance<T>(data []T) T {
return f64(0) if data.len == 0 {
return T(0)
} }
m := mean(arr) data_mean := mean<T>(data)
mut sum := f64(0) return sample_variance_mean<T>(data, data_mean)
for v in arr { }
sum += math.pow(v - m, 2)
// Measure of Dispersion / Spread
// Sample Variance of the given input array
// Based on
// https://www.mathsisfun.com/data/standard-deviation.html
pub fn sample_variance_mean<T>(data []T, mean T) T {
if data.len == 0 {
return T(0)
} }
return sum / f64(arr.len - 1) mut sum := T(0)
for v in data {
sum += (v - mean) * (v - mean)
}
return sum / T(data.len - 1)
} }
// Measure of Dispersion / Spread // Measure of Dispersion / Spread
// Population Standard Deviation of the given input array // Population Standard Deviation of the given input array
// Based on // Based on
// https://www.mathsisfun.com/data/standard-deviation.html // https://www.mathsisfun.com/data/standard-deviation.html
pub fn population_stddev(arr []f64) f64 { [inline]
if arr.len == 0 { pub fn population_stddev<T>(data []T) T {
return f64(0) if data.len == 0 {
return T(0)
} }
return math.sqrt(population_variance(arr)) return math.sqrt(population_variance<T>(data))
}
// Measure of Dispersion / Spread
// Population Standard Deviation of the given input array
// Based on
// https://www.mathsisfun.com/data/standard-deviation.html
[inline]
pub fn population_stddev_mean<T>(data []T, mean T) T {
if data.len == 0 {
return T(0)
}
return T(math.sqrt(f64(population_variance_mean<T>(data, mean))))
} }
// Measure of Dispersion / Spread // Measure of Dispersion / Spread
// Sample Standard Deviation of the given input array // Sample Standard Deviation of the given input array
// Based on // Based on
// https://www.mathsisfun.com/data/standard-deviation.html // https://www.mathsisfun.com/data/standard-deviation.html
pub fn sample_stddev(arr []f64) f64 { [inline]
if arr.len == 0 { pub fn sample_stddev<T>(data []T) T {
return f64(0) if data.len == 0 {
return T(0)
} }
return math.sqrt(sample_variance(arr)) return T(math.sqrt(f64(sample_variance<T>(data))))
}
// Measure of Dispersion / Spread
// Sample Standard Deviation of the given input array
// Based on
// https://www.mathsisfun.com/data/standard-deviation.html
[inline]
pub fn sample_stddev_mean<T>(data []T, mean T) T {
if data.len == 0 {
return T(0)
}
return T(math.sqrt(f64(sample_variance_mean<T>(data, mean))))
} }
// Measure of Dispersion / Spread // Measure of Dispersion / Spread
// Mean Absolute Deviation of the given input array // Mean Absolute Deviation of the given input array
// Based on // Based on
// https://en.wikipedia.org/wiki/Average_absolute_deviation // https://en.wikipedia.org/wiki/Average_absolute_deviation
pub fn mean_absdev(arr []f64) f64 { [inline]
if arr.len == 0 { pub fn absdev<T>(data []T) T {
return f64(0) if data.len == 0 {
return T(0)
} }
amean := mean(arr) data_mean := mean<T>(data)
mut sum := f64(0) return absdev_mean<T>(data, data_mean)
for v in arr { }
sum += math.abs(v - amean)
// Measure of Dispersion / Spread
// Mean Absolute Deviation of the given input array
// Based on
// https://en.wikipedia.org/wiki/Average_absolute_deviation
pub fn absdev_mean<T>(data []T, mean T) T {
if data.len == 0 {
return T(0)
} }
return sum / f64(arr.len) mut sum := T(0)
for v in data {
sum += math.abs(v - mean)
}
return sum / T(data.len)
}
// Sum of squares
[inline]
pub fn tss<T>(data []T) T {
if data.len == 0 {
return T(0)
}
data_mean := mean<T>(data)
return tss_mean<T>(data, data_mean)
}
// Sum of squares about the mean
pub fn tss_mean<T>(data []T, mean T) T {
if data.len == 0 {
return T(0)
}
mut tss := T(0)
for v in data {
tss += (v - mean) * (v - mean)
}
return tss
} }
// Minimum of the given input array // Minimum of the given input array
pub fn min(arr []f64) f64 { pub fn min<T>(data []T) T {
if arr.len == 0 { if data.len == 0 {
return f64(0) return T(0)
} }
mut min := arr[0] mut min := data[0]
for v in arr { for v in data {
if v < min { if v < min {
min = v min = v
} }
@ -224,12 +284,12 @@ pub fn min(arr []f64) f64 {
} }
// Maximum of the given input array // Maximum of the given input array
pub fn max(arr []f64) f64 { pub fn max<T>(data []T) T {
if arr.len == 0 { if data.len == 0 {
return f64(0) return T(0)
} }
mut max := arr[0] mut max := data[0]
for v in arr { for v in data {
if v > max { if v > max {
max = v max = v
} }
@ -237,13 +297,188 @@ pub fn max(arr []f64) f64 {
return max return max
} }
// Minimum and maximum of the given input array
pub fn minmax<T>(data []T) (T, T) {
if data.len == 0 {
return T(0), T(0)
}
mut max := data[0]
mut min := data[0]
for v in data[1..] {
if v > max {
max = v
}
if v < min {
min = v
}
}
return min, max
}
// Minimum of the given input array
pub fn min_index<T>(data []T) int {
if data.len == 0 {
return 0
}
mut min := data[0]
mut min_index := 0
for i, v in data {
if v < min {
min = v
min_index = i
}
}
return min_index
}
// Maximum of the given input array
pub fn max_index<T>(data []T) int {
if data.len == 0 {
return 0
}
mut max := data[0]
mut max_index := 0
for i, v in data {
if v > max {
max = v
max_index = i
}
}
return max_index
}
// Minimum and maximum of the given input array
pub fn minmax_index<T>(data []T) (int, int) {
if data.len == 0 {
return 0, 0
}
mut min := data[0]
mut max := data[0]
mut min_index := 0
mut max_index := 0
for i, v in data {
if v < min {
min = v
min_index = i
}
if v > max {
max = v
max_index = i
}
}
return min_index, max_index
}
// Measure of Dispersion / Spread // Measure of Dispersion / Spread
// Range ( Maximum - Minimum ) of the given input array // Range ( Maximum - Minimum ) of the given input array
// Based on // Based on
// https://www.mathsisfun.com/data/range.html // https://www.mathsisfun.com/data/range.html
pub fn range(arr []f64) f64 { pub fn range<T>(data []T) T {
if arr.len == 0 { if data.len == 0 {
return f64(0) return T(0)
}
min, max := minmax<T>(data)
return max - min
}
[inline]
pub fn covariance<T>(data1 []T, data2 []T) T {
mean1 := mean<T>(data1)
mean2 := mean<T>(data2)
return covariance_mean<T>(data1, data2, mean1, mean2)
}
// Compute the covariance of a dataset using
// the recurrence relation
pub fn covariance_mean<T>(data1 []T, data2 []T, mean1 T, mean2 T) T {
n := int(math.min(data1.len, data2.len))
if n == 0 {
return T(0)
}
mut covariance := T(0)
for i in 0 .. n {
delta1 := data1[i] - mean1
delta2 := data2[i] - mean2
covariance += (delta1 * delta2 - covariance) / (T(i) + 1.0)
}
return covariance
}
[inline]
pub fn lag1_autocorrelation<T>(data []T) T {
data_mean := mean<T>(data)
return lag1_autocorrelation_mean<T>(data, data_mean)
}
// Compute the lag-1 autocorrelation of a dataset using
// the recurrence relation
pub fn lag1_autocorrelation_mean<T>(data []T, mean T) T {
if data.len == 0 {
return T(0)
}
mut q := T(0)
mut v := (data[0] * mean) - (data[0] * mean)
for i := 1; i < data.len; i++ {
delta0 := data[i - 1] - mean
delta1 := data[i] - mean
q += (delta0 * delta1 - q) / (T(i) + 1.0)
v += (delta1 * delta1 - v) / (T(i) + 1.0)
}
return q / v
}
[inline]
pub fn kurtosis<T>(data []T) T {
data_mean := mean<T>(data)
sd := population_stddev_mean<T>(data, data_mean)
return kurtosis_mean_stddev<T>(data, data_mean, sd)
}
// Takes a dataset and finds the kurtosis
// using the fourth moment the deviations, normalized by the sd
pub fn kurtosis_mean_stddev<T>(data []T, mean T, sd T) T {
mut avg := T(0) // find the fourth moment the deviations, normalized by the sd
/*
we use a recurrence relation to stably update a running value so
* there aren't any large sums that can overflow
*/
for i, v in data {
x := (v - mean) / sd
avg += (x * x * x * x - avg) / (T(i) + 1.0)
}
return avg - T(3.0)
}
[inline]
pub fn skew<T>(data []T) T {
data_mean := mean<T>(data)
sd := population_stddev_mean<T>(data, data_mean)
return skew_mean_stddev<T>(data, data_mean, sd)
}
pub fn skew_mean_stddev<T>(data []T, mean T, sd T) T {
mut skew := T(0) // find the sum of the cubed deviations, normalized by the sd.
/*
we use a recurrence relation to stably update a running value so
* there aren't any large sums that can overflow
*/
for i, v in data {
x := (v - mean) / sd
skew += (x * x * x - skew) / (T(i) + 1.0)
}
return skew
}
pub fn quantile<T>(sorted_data []T, f T) T {
if sorted_data.len == 0 {
return T(0)
}
index := f * (T(sorted_data.len) - 1.0)
lhs := int(index)
delta := index - T(lhs)
return if lhs == sorted_data.len - 1 {
sorted_data[lhs]
} else {
(1.0 - delta) * sorted_data[lhs] + delta * sorted_data[(lhs + 1)]
} }
return max(arr) - min(arr)
} }

View File

@ -1,5 +1,5 @@
import math.stats
import math import math
import math.stats
fn test_freq() { fn test_freq() {
// Tests were also verified on Wolfram Alpha // Tests were also verified on Wolfram Alpha
@ -44,8 +44,9 @@ fn test_geometric_mean() {
data = [f64(-3.0), f64(67.31), f64(4.4), f64(1.89)] data = [f64(-3.0), f64(67.31), f64(4.4), f64(1.89)]
o = stats.geometric_mean(data) o = stats.geometric_mean(data)
// Some issue with precision comparison in f64 using == operator hence serializing to string // Some issue with precision comparison in f64 using == operator hence serializing to string
assert o.str() == 'nan' || o.str() == '-nan' || o.str() == '-1.#IND00' || o == f64(0) ok := o.str() == 'nan' || o.str() == '-nan' || o.str() == '-1.#IND00' || o == f64(0)
|| o.str() == '-nan(ind)' // Because in math it yields a complex number || o.str() == '-nan(ind)'
assert ok // Because in math it yields a complex number
data = [f64(12.0), f64(7.88), f64(76.122), f64(54.83)] data = [f64(12.0), f64(7.88), f64(76.122), f64(54.83)]
o = stats.geometric_mean(data) o = stats.geometric_mean(data)
// Some issue with precision comparison in f64 using == operator hence serializing to string // Some issue with precision comparison in f64 using == operator hence serializing to string
@ -194,18 +195,18 @@ fn test_sample_stddev() {
assert tst_res(o.str(), '33.263639') assert tst_res(o.str(), '33.263639')
} }
fn test_mean_absdev() { fn test_absdev() {
// Tests were also verified on Wolfram Alpha // Tests were also verified on Wolfram Alpha
mut data := [f64(10.0), f64(4.45), f64(5.9), f64(2.7)] mut data := [f64(10.0), f64(4.45), f64(5.9), f64(2.7)]
mut o := stats.mean_absdev(data) mut o := stats.absdev(data)
// Some issue with precision comparison in f64 using == operator hence serializing to string // Some issue with precision comparison in f64 using == operator hence serializing to string
assert tst_res(o.str(), '2.187500') assert tst_res(o.str(), '2.187500')
data = [f64(-3.0), f64(67.31), f64(4.4), f64(1.89)] data = [f64(-3.0), f64(67.31), f64(4.4), f64(1.89)]
o = stats.mean_absdev(data) o = stats.absdev(data)
// Some issue with precision comparison in f64 using == operator hence serializing to string // Some issue with precision comparison in f64 using == operator hence serializing to string
assert tst_res(o.str(), '24.830000') assert tst_res(o.str(), '24.830000')
data = [f64(12.0), f64(7.88), f64(76.122), f64(54.83)] data = [f64(12.0), f64(7.88), f64(76.122), f64(54.83)]
o = stats.mean_absdev(data) o = stats.absdev(data)
// Some issue with precision comparison in f64 using == operator hence serializing to string // Some issue with precision comparison in f64 using == operator hence serializing to string
assert tst_res(o.str(), '27.768000') assert tst_res(o.str(), '27.768000')
} }
@ -262,7 +263,7 @@ fn test_passing_empty() {
assert stats.sample_variance(data) == f64(0) assert stats.sample_variance(data) == f64(0)
assert stats.population_stddev(data) == f64(0) assert stats.population_stddev(data) == f64(0)
assert stats.sample_stddev(data) == f64(0) assert stats.sample_stddev(data) == f64(0)
assert stats.mean_absdev(data) == f64(0) assert stats.absdev(data) == f64(0)
assert stats.min(data) == f64(0) assert stats.min(data) == f64(0)
assert stats.max(data) == f64(0) assert stats.max(data) == f64(0)
assert stats.range(data) == f64(0) assert stats.range(data) == f64(0)