| // Copyright 2024 Google LLC |
| // SPDX-License-Identifier: Apache-2.0 |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // https://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| |
| #ifndef HIGHWAY_HWY_STATS_H_ |
| #define HIGHWAY_HWY_STATS_H_ |
| |
| #include <stdint.h> |
| #include <stdio.h> |
| |
| #include <cmath> |
| #include <string> |
| |
| #include "third_party/highway/hwy/base.h" // HWY_ASSERT |
| |
| namespace hwy { |
| |
| // Thread-compatible. |
| template <size_t N> |
| class Bins { |
| public: |
| Bins() { Reset(); } |
| |
| template <typename T> |
| void Notify(T bin) { |
| HWY_ASSERT(T{0} <= bin && bin < static_cast<T>(N)); |
| counts_[static_cast<int32_t>(bin)]++; |
| } |
| |
| void Assimilate(const Bins<N>& other) { |
| for (size_t i = 0; i < N; ++i) { |
| counts_[i] += other.counts_[i]; |
| } |
| } |
| |
| void Print(const char* caption) const { |
| fprintf(stderr, "\n%s [%zu]\n", caption, N); |
| size_t last_nonzero = 0; |
| for (size_t i = N - 1; i < N; --i) { |
| if (counts_[i] != 0) { |
| last_nonzero = i; |
| break; |
| } |
| } |
| for (size_t i = 0; i <= last_nonzero; ++i) { |
| fprintf(stderr, " %zu\n", counts_[i]); |
| } |
| } |
| |
| void Reset() { |
| for (size_t i = 0; i < N; ++i) { |
| counts_[i] = 0; |
| } |
| } |
| |
| private: |
| size_t counts_[N]; |
| }; |
| |
| // Descriptive statistics of a variable (4 moments). Thread-compatible. |
| class Stats { |
| public: |
| Stats() { Reset(); } |
| |
| void Notify(const float x) { |
| ++n_; |
| |
| min_ = HWY_MIN(min_, x); |
| max_ = HWY_MAX(max_, x); |
| |
| // Logarithmic transform avoids/delays underflow and overflow. |
| sum_log_ += std::log(static_cast<double>(x)); |
| |
| // Online moments. Reference: https://goo.gl/9ha694 |
| const double d = x - m1_; |
| const double d_div_n = d / static_cast<double>(n_); |
| const double d2n1_div_n = d * (static_cast<double>(n_) - 1) * d_div_n; |
| const int64_t n_poly = n_ * n_ - 3 * n_ + 3; |
| m1_ += d_div_n; |
| m4_ += d_div_n * (d_div_n * (d2n1_div_n * static_cast<double>(n_poly) + 6.0 * m2_) - 4.0 * m3_); |
| m3_ += d_div_n * (d2n1_div_n * (static_cast<double>(n_) - 2) - 3.0 * m2_); |
| m2_ += d2n1_div_n; |
| } |
| |
| void Assimilate(const Stats& other); |
| |
| int64_t Count() const { return n_; } |
| |
| float Min() const { return min_; } |
| float Max() const { return max_; } |
| |
| double GeometricMean() const { |
| return n_ == 0 ? 0.0 : std::exp(sum_log_ / static_cast<double>(n_)); |
| } |
| |
| double Mean() const { return m1_; } |
| // Same as Mu2. Assumes n_ is large. |
| double SampleVariance() const { |
| return n_ == 0 ? 0.0 : m2_ / static_cast<int>(n_); |
| } |
| // Unbiased estimator for population variance even for smaller n_. |
| double Variance() const { |
| if (n_ == 0) return 0.0; |
| if (n_ == 1) return m2_; |
| return m2_ / static_cast<int>(n_ - 1); |
| } |
| double StandardDeviation() const { return std::sqrt(Variance()); } |
| // Near zero for normal distributions; if positive on a unimodal distribution, |
| // the right tail is fatter. Assumes n_ is large. |
| double SampleSkewness() const { |
| if (ScalarAbs(m2_) < 1E-7) return 0.0; |
| return m3_ * std::sqrt(static_cast<double>(n_)) / std::pow(m2_, 1.5); |
| } |
| // Corrected for bias (same as Wikipedia and Minitab but not Excel). |
| double Skewness() const { |
| if (n_ == 0) return 0.0; |
| const double biased = SampleSkewness(); |
| const double r = (static_cast<double>(n_) - 1.0) / static_cast<double>(n_); |
| return biased * std::pow(r, 1.5); |
| } |
| // Near zero for normal distributions; smaller values indicate fewer/smaller |
| // outliers and larger indicates more/larger outliers. Assumes n_ is large. |
| double SampleKurtosis() const { |
| if (ScalarAbs(m2_) < 1E-7) return 0.0; |
| return m4_ * static_cast<double>(n_) / (m2_ * m2_); |
| } |
| // Corrected for bias (same as Wikipedia and Minitab but not Excel). |
| double Kurtosis() const { |
| if (n_ == 0) return 0.0; |
| const double biased = SampleKurtosis(); |
| const double r = (static_cast<double>(n_) - 1.0) / static_cast<double>(n_); |
| return biased * r * r; |
| } |
| |
| // Central moments, useful for "method of moments"-based parameter estimation |
| // of a mixture of two Gaussians. Assumes Count() != 0. |
| double Mu1() const { return m1_; } |
| double Mu2() const { return m2_ / static_cast<int>(n_); } |
| double Mu3() const { return m3_ / static_cast<int>(n_); } |
| double Mu4() const { return m4_ / static_cast<int>(n_); } |
| |
| // Which statistics to EXCLUDE in ToString |
| enum { |
| kNoCount = 1, |
| kNoMeanSD = 2, |
| kNoMinMax = 4, |
| kNoSkewKurt = 8, |
| kNoGeomean = 16 |
| }; |
| std::string ToString(int exclude = 0) const; |
| |
| void Reset() { |
| n_ = 0; |
| |
| min_ = hwy::HighestValue<float>(); |
| max_ = hwy::LowestValue<float>(); |
| |
| sum_log_ = 0.0; |
| |
| m1_ = 0.0; |
| m2_ = 0.0; |
| m3_ = 0.0; |
| m4_ = 0.0; |
| } |
| |
| private: |
| int64_t n_; // signed for faster conversion + safe subtraction |
| |
| float min_; |
| float max_; |
| |
| double sum_log_; // for geomean |
| |
| // Moments |
| double m1_; |
| double m2_; |
| double m3_; |
| double m4_; |
| }; |
| |
| } // namespace hwy |
| |
| #endif // HIGHWAY_HWY_STATS_H_ |