From 480fe8041ac8a4e485b2dbaf2ba3cfc93f47f0d9 Mon Sep 17 00:00:00 2001 From: Ulises Jeremias Cornejo Fandos Date: Mon, 13 Sep 2021 04:20:38 -0300 Subject: [PATCH] math.stats: update math.stats using generics (#11482) --- vlib/math/stats/stats.v | 459 ++++++++++++++++++++++++++--------- vlib/math/stats/stats_test.v | 17 +- 2 files changed, 356 insertions(+), 120 deletions(-) diff --git a/vlib/math/stats/stats.v b/vlib/math/stats/stats.v index d7317bf1ef..544391019d 100644 --- a/vlib/math/stats/stats.v +++ b/vlib/math/stats/stats.v @@ -2,40 +2,16 @@ module stats import math -// TODO: Implement all of them with generics - -// This module defines the following statistical operations on f64 array -// --------------------------- -// | Summary of Functions | -// --------------------------- -// ----------------------------------------------------------------------- -// freq - Frequency -// mean - Mean -// geometric_mean - Geometric Mean -// harmonic_mean - Harmonic Mean -// median - Median -// mode - Mode -// rms - Root Mean Square -// population_variance - Population Variance -// sample_variance - Sample Variance -// population_stddev - Population Standard Deviation -// sample_stddev - Sample Standard Deviation -// mean_absdev - Mean Absolute Deviation -// min - Minimum of the Array -// max - Maximum of the Array -// range - Range of the Array ( max - min ) -// ----------------------------------------------------------------------- - // Measure of Occurance // Frequency of a given number // Based on // https://www.mathsisfun.com/data/frequency-distribution.html -pub fn freq(arr []f64, val f64) int { - if arr.len == 0 { +pub fn freq(data []T, val T) int { + if data.len == 0 { return 0 } mut count := 0 - for v in arr { + for v in data { if v == val { count++ } @@ -47,60 +23,60 @@ pub fn freq(arr []f64, val f64) int { // Mean of the given input array // Based on // https://www.mathsisfun.com/data/central-measures.html -pub fn mean(arr []f64) f64 { - if arr.len == 0 { - return f64(0) +pub fn mean(data []T) T { + if data.len == 0 { + return T(0) } - mut sum := f64(0) - for v in arr { + mut sum := T(0) + for v in data { sum += v } - return sum / f64(arr.len) + return sum / T(data.len) } // Measure of Central Tendancy // Geometric Mean of the given input array // Based on // https://www.mathsisfun.com/numbers/geometric-mean.html -pub fn geometric_mean(arr []f64) f64 { - if arr.len == 0 { - return f64(0) +pub fn geometric_mean(data []T) T { + if data.len == 0 { + return T(0) } - mut sum := f64(1) - for v in arr { + mut sum := 1.0 + for v in data { sum *= v } - return math.pow(sum, f64(1) / arr.len) + return math.pow(sum, 1.0 / T(data.len)) } // Measure of Central Tendancy // Harmonic Mean of the given input array // Based on // https://www.mathsisfun.com/numbers/harmonic-mean.html -pub fn harmonic_mean(arr []f64) f64 { - if arr.len == 0 { - return f64(0) +pub fn harmonic_mean(data []T) T { + if data.len == 0 { + return T(0) } - mut sum := f64(0) - for v in arr { - sum += f64(1) / v + mut sum := T(0) + for v in data { + sum += 1.0 / v } - return f64(arr.len) / sum + return T(data.len) / sum } // Measure of Central Tendancy // Median of the given input array ( input array is assumed to be sorted ) // Based on // https://www.mathsisfun.com/data/central-measures.html -pub fn median(arr []f64) f64 { - if arr.len == 0 { - return f64(0) +pub fn median(sorted_data []T) T { + if sorted_data.len == 0 { + return T(0) } - if arr.len % 2 == 0 { - mid := (arr.len / 2) - 1 - return (arr[mid] + arr[mid + 1]) / f64(2) + if sorted_data.len % 2 == 0 { + mid := (sorted_data.len / 2) - 1 + return (sorted_data[mid] + sorted_data[mid + 1]) / T(2) } else { - return arr[((arr.len - 1) / 2)] + return sorted_data[((sorted_data.len - 1) / 2)] } } @@ -108,114 +84,198 @@ pub fn median(arr []f64) f64 { // Mode of the given input array // Based on // https://www.mathsisfun.com/data/central-measures.html -pub fn mode(arr []f64) f64 { - if arr.len == 0 { - return f64(0) +pub fn mode(data []T) T { + if data.len == 0 { + return T(0) } mut freqs := []int{} - for v in arr { - freqs << freq(arr, v) + for v in data { + freqs << freq(data, v) } mut max := 0 - for i in 0 .. freqs.len { + for i := 0; i < freqs.len; i++ { if freqs[i] > freqs[max] { max = i } } - return arr[max] + return data[max] } // Root Mean Square of the given input array // Based on // https://en.wikipedia.org/wiki/Root_mean_square -pub fn rms(arr []f64) f64 { - if arr.len == 0 { - return f64(0) +pub fn rms(data []T) T { + if data.len == 0 { + return T(0) } - mut sum := f64(0) - for v in arr { + mut sum := T(0) + for v in data { sum += math.pow(v, 2) } - return math.sqrt(sum / f64(arr.len)) + return math.sqrt(sum / T(data.len)) } // Measure of Dispersion / Spread // Population Variance of the given input array // Based on // https://www.mathsisfun.com/data/standard-deviation.html -pub fn population_variance(arr []f64) f64 { - if arr.len == 0 { - return f64(0) +[inline] +pub fn population_variance(data []T) T { + if data.len == 0 { + return T(0) } - m := mean(arr) - mut sum := f64(0) - for v in arr { - sum += math.pow(v - m, 2) + data_mean := mean(data) + return population_variance_mean(data, data_mean) +} + +// Measure of Dispersion / Spread +// Population Variance of the given input array +// Based on +// https://www.mathsisfun.com/data/standard-deviation.html +pub fn population_variance_mean(data []T, mean T) T { + if data.len == 0 { + return T(0) } - return sum / f64(arr.len) + mut sum := T(0) + for v in data { + sum += (v - mean) * (v - mean) + } + return sum / T(data.len) } // Measure of Dispersion / Spread // Sample Variance of the given input array // Based on // https://www.mathsisfun.com/data/standard-deviation.html -pub fn sample_variance(arr []f64) f64 { - if arr.len == 0 { - return f64(0) +[inline] +pub fn sample_variance(data []T) T { + if data.len == 0 { + return T(0) } - m := mean(arr) - mut sum := f64(0) - for v in arr { - sum += math.pow(v - m, 2) + data_mean := mean(data) + return sample_variance_mean(data, data_mean) +} + +// Measure of Dispersion / Spread +// Sample Variance of the given input array +// Based on +// https://www.mathsisfun.com/data/standard-deviation.html +pub fn sample_variance_mean(data []T, mean T) T { + if data.len == 0 { + return T(0) } - return sum / f64(arr.len - 1) + mut sum := T(0) + for v in data { + sum += (v - mean) * (v - mean) + } + return sum / T(data.len - 1) } // Measure of Dispersion / Spread // Population Standard Deviation of the given input array // Based on // https://www.mathsisfun.com/data/standard-deviation.html -pub fn population_stddev(arr []f64) f64 { - if arr.len == 0 { - return f64(0) +[inline] +pub fn population_stddev(data []T) T { + if data.len == 0 { + return T(0) } - return math.sqrt(population_variance(arr)) + return math.sqrt(population_variance(data)) +} + +// Measure of Dispersion / Spread +// Population Standard Deviation of the given input array +// Based on +// https://www.mathsisfun.com/data/standard-deviation.html +[inline] +pub fn population_stddev_mean(data []T, mean T) T { + if data.len == 0 { + return T(0) + } + return T(math.sqrt(f64(population_variance_mean(data, mean)))) } // Measure of Dispersion / Spread // Sample Standard Deviation of the given input array // Based on // https://www.mathsisfun.com/data/standard-deviation.html -pub fn sample_stddev(arr []f64) f64 { - if arr.len == 0 { - return f64(0) +[inline] +pub fn sample_stddev(data []T) T { + if data.len == 0 { + return T(0) } - return math.sqrt(sample_variance(arr)) + return T(math.sqrt(f64(sample_variance(data)))) +} + +// Measure of Dispersion / Spread +// Sample Standard Deviation of the given input array +// Based on +// https://www.mathsisfun.com/data/standard-deviation.html +[inline] +pub fn sample_stddev_mean(data []T, mean T) T { + if data.len == 0 { + return T(0) + } + return T(math.sqrt(f64(sample_variance_mean(data, mean)))) } // Measure of Dispersion / Spread // Mean Absolute Deviation of the given input array // Based on // https://en.wikipedia.org/wiki/Average_absolute_deviation -pub fn mean_absdev(arr []f64) f64 { - if arr.len == 0 { - return f64(0) +[inline] +pub fn absdev(data []T) T { + if data.len == 0 { + return T(0) } - amean := mean(arr) - mut sum := f64(0) - for v in arr { - sum += math.abs(v - amean) + data_mean := mean(data) + return absdev_mean(data, data_mean) +} + +// Measure of Dispersion / Spread +// Mean Absolute Deviation of the given input array +// Based on +// https://en.wikipedia.org/wiki/Average_absolute_deviation +pub fn absdev_mean(data []T, mean T) T { + if data.len == 0 { + return T(0) } - return sum / f64(arr.len) + mut sum := T(0) + for v in data { + sum += math.abs(v - mean) + } + return sum / T(data.len) +} + +// Sum of squares +[inline] +pub fn tss(data []T) T { + if data.len == 0 { + return T(0) + } + data_mean := mean(data) + return tss_mean(data, data_mean) +} + +// Sum of squares about the mean +pub fn tss_mean(data []T, mean T) T { + if data.len == 0 { + return T(0) + } + mut tss := T(0) + for v in data { + tss += (v - mean) * (v - mean) + } + return tss } // Minimum of the given input array -pub fn min(arr []f64) f64 { - if arr.len == 0 { - return f64(0) +pub fn min(data []T) T { + if data.len == 0 { + return T(0) } - mut min := arr[0] - for v in arr { + mut min := data[0] + for v in data { if v < min { min = v } @@ -224,12 +284,12 @@ pub fn min(arr []f64) f64 { } // Maximum of the given input array -pub fn max(arr []f64) f64 { - if arr.len == 0 { - return f64(0) +pub fn max(data []T) T { + if data.len == 0 { + return T(0) } - mut max := arr[0] - for v in arr { + mut max := data[0] + for v in data { if v > max { max = v } @@ -237,13 +297,188 @@ pub fn max(arr []f64) f64 { return max } +// Minimum and maximum of the given input array +pub fn minmax(data []T) (T, T) { + if data.len == 0 { + return T(0), T(0) + } + mut max := data[0] + mut min := data[0] + for v in data[1..] { + if v > max { + max = v + } + if v < min { + min = v + } + } + return min, max +} + +// Minimum of the given input array +pub fn min_index(data []T) int { + if data.len == 0 { + return 0 + } + mut min := data[0] + mut min_index := 0 + for i, v in data { + if v < min { + min = v + min_index = i + } + } + return min_index +} + +// Maximum of the given input array +pub fn max_index(data []T) int { + if data.len == 0 { + return 0 + } + mut max := data[0] + mut max_index := 0 + for i, v in data { + if v > max { + max = v + max_index = i + } + } + return max_index +} + +// Minimum and maximum of the given input array +pub fn minmax_index(data []T) (int, int) { + if data.len == 0 { + return 0, 0 + } + mut min := data[0] + mut max := data[0] + mut min_index := 0 + mut max_index := 0 + for i, v in data { + if v < min { + min = v + min_index = i + } + if v > max { + max = v + max_index = i + } + } + return min_index, max_index +} + // Measure of Dispersion / Spread // Range ( Maximum - Minimum ) of the given input array // Based on // https://www.mathsisfun.com/data/range.html -pub fn range(arr []f64) f64 { - if arr.len == 0 { - return f64(0) +pub fn range(data []T) T { + if data.len == 0 { + return T(0) + } + min, max := minmax(data) + return max - min +} + +[inline] +pub fn covariance(data1 []T, data2 []T) T { + mean1 := mean(data1) + mean2 := mean(data2) + return covariance_mean(data1, data2, mean1, mean2) +} + +// Compute the covariance of a dataset using +// the recurrence relation +pub fn covariance_mean(data1 []T, data2 []T, mean1 T, mean2 T) T { + n := int(math.min(data1.len, data2.len)) + if n == 0 { + return T(0) + } + mut covariance := T(0) + for i in 0 .. n { + delta1 := data1[i] - mean1 + delta2 := data2[i] - mean2 + covariance += (delta1 * delta2 - covariance) / (T(i) + 1.0) + } + return covariance +} + +[inline] +pub fn lag1_autocorrelation(data []T) T { + data_mean := mean(data) + return lag1_autocorrelation_mean(data, data_mean) +} + +// Compute the lag-1 autocorrelation of a dataset using +// the recurrence relation +pub fn lag1_autocorrelation_mean(data []T, mean T) T { + if data.len == 0 { + return T(0) + } + mut q := T(0) + mut v := (data[0] * mean) - (data[0] * mean) + for i := 1; i < data.len; i++ { + delta0 := data[i - 1] - mean + delta1 := data[i] - mean + q += (delta0 * delta1 - q) / (T(i) + 1.0) + v += (delta1 * delta1 - v) / (T(i) + 1.0) + } + return q / v +} + +[inline] +pub fn kurtosis(data []T) T { + data_mean := mean(data) + sd := population_stddev_mean(data, data_mean) + return kurtosis_mean_stddev(data, data_mean, sd) +} + +// Takes a dataset and finds the kurtosis +// using the fourth moment the deviations, normalized by the sd +pub fn kurtosis_mean_stddev(data []T, mean T, sd T) T { + mut avg := T(0) // find the fourth moment the deviations, normalized by the sd + /* + we use a recurrence relation to stably update a running value so + * there aren't any large sums that can overflow + */ + for i, v in data { + x := (v - mean) / sd + avg += (x * x * x * x - avg) / (T(i) + 1.0) + } + return avg - T(3.0) +} + +[inline] +pub fn skew(data []T) T { + data_mean := mean(data) + sd := population_stddev_mean(data, data_mean) + return skew_mean_stddev(data, data_mean, sd) +} + +pub fn skew_mean_stddev(data []T, mean T, sd T) T { + mut skew := T(0) // find the sum of the cubed deviations, normalized by the sd. + /* + we use a recurrence relation to stably update a running value so + * there aren't any large sums that can overflow + */ + for i, v in data { + x := (v - mean) / sd + skew += (x * x * x - skew) / (T(i) + 1.0) + } + return skew +} + +pub fn quantile(sorted_data []T, f T) T { + if sorted_data.len == 0 { + return T(0) + } + index := f * (T(sorted_data.len) - 1.0) + lhs := int(index) + delta := index - T(lhs) + return if lhs == sorted_data.len - 1 { + sorted_data[lhs] + } else { + (1.0 - delta) * sorted_data[lhs] + delta * sorted_data[(lhs + 1)] } - return max(arr) - min(arr) } diff --git a/vlib/math/stats/stats_test.v b/vlib/math/stats/stats_test.v index c18daff5e5..258523bfce 100644 --- a/vlib/math/stats/stats_test.v +++ b/vlib/math/stats/stats_test.v @@ -1,5 +1,5 @@ -import math.stats import math +import math.stats fn test_freq() { // Tests were also verified on Wolfram Alpha @@ -44,8 +44,9 @@ fn test_geometric_mean() { data = [f64(-3.0), f64(67.31), f64(4.4), f64(1.89)] o = stats.geometric_mean(data) // Some issue with precision comparison in f64 using == operator hence serializing to string - assert o.str() == 'nan' || o.str() == '-nan' || o.str() == '-1.#IND00' || o == f64(0) - || o.str() == '-nan(ind)' // Because in math it yields a complex number + ok := o.str() == 'nan' || o.str() == '-nan' || o.str() == '-1.#IND00' || o == f64(0) + || o.str() == '-nan(ind)' + assert ok // Because in math it yields a complex number data = [f64(12.0), f64(7.88), f64(76.122), f64(54.83)] o = stats.geometric_mean(data) // Some issue with precision comparison in f64 using == operator hence serializing to string @@ -194,18 +195,18 @@ fn test_sample_stddev() { assert tst_res(o.str(), '33.263639') } -fn test_mean_absdev() { +fn test_absdev() { // Tests were also verified on Wolfram Alpha mut data := [f64(10.0), f64(4.45), f64(5.9), f64(2.7)] - mut o := stats.mean_absdev(data) + mut o := stats.absdev(data) // Some issue with precision comparison in f64 using == operator hence serializing to string assert tst_res(o.str(), '2.187500') data = [f64(-3.0), f64(67.31), f64(4.4), f64(1.89)] - o = stats.mean_absdev(data) + o = stats.absdev(data) // Some issue with precision comparison in f64 using == operator hence serializing to string assert tst_res(o.str(), '24.830000') data = [f64(12.0), f64(7.88), f64(76.122), f64(54.83)] - o = stats.mean_absdev(data) + o = stats.absdev(data) // Some issue with precision comparison in f64 using == operator hence serializing to string assert tst_res(o.str(), '27.768000') } @@ -262,7 +263,7 @@ fn test_passing_empty() { assert stats.sample_variance(data) == f64(0) assert stats.population_stddev(data) == f64(0) assert stats.sample_stddev(data) == f64(0) - assert stats.mean_absdev(data) == f64(0) + assert stats.absdev(data) == f64(0) assert stats.min(data) == f64(0) assert stats.max(data) == f64(0) assert stats.range(data) == f64(0)