View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      https://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  /*
19   * This is not the original file distributed by the Apache Software Foundation
20   * It has been modified by the Hipparchus project
21   */
22  package org.hipparchus.stat.descriptive;
23  
24  import java.io.Serializable;
25  import java.util.function.DoubleConsumer;
26  
27  import org.hipparchus.exception.NullArgumentException;
28  import org.hipparchus.random.RandomGenerator;
29  import org.hipparchus.stat.descriptive.moment.GeometricMean;
30  import org.hipparchus.stat.descriptive.moment.Mean;
31  import org.hipparchus.stat.descriptive.moment.SecondMoment;
32  import org.hipparchus.stat.descriptive.moment.Variance;
33  import org.hipparchus.stat.descriptive.rank.Max;
34  import org.hipparchus.stat.descriptive.rank.Min;
35  import org.hipparchus.stat.descriptive.rank.RandomPercentile;
36  import org.hipparchus.stat.descriptive.summary.Sum;
37  import org.hipparchus.stat.descriptive.summary.SumOfLogs;
38  import org.hipparchus.stat.descriptive.summary.SumOfSquares;
39  import org.hipparchus.util.FastMath;
40  import org.hipparchus.util.MathUtils;
41  import org.hipparchus.util.Precision;
42  
43  /**
44   * Computes summary statistics for a stream of data values added using the
45   * {@link #addValue(double) addValue} method. The data values are not stored in
46   * memory, so this class can be used to compute statistics for very large data
47   * streams.
48   * <p>
49   * By default, all statistics other than percentiles are maintained.  Percentile
50   * calculations use an embedded {@link RandomPercentile} which carries more memory
51   * and compute overhead than the other statistics, so it is disabled by default.
52   * To enable percentiles, either pass a non-null {@link RandomGenerator} to the
53   * constructor or use a {@link StreamingStatisticsBuilder} to configure an instance
54   * with percentiles turned on. Other stats can also be selectively disabled using
55   * {@code StreamingStatisticsBuilder}.
56   * <p>
57   * Note: This class is not thread-safe.
58   */
59  public class StreamingStatistics
60      implements StatisticalSummary, AggregatableStatistic<StreamingStatistics>,
61                 DoubleConsumer, Serializable {
62  
63      /** Serialization UID. */
64      private static final long serialVersionUID = 20160422L;
65  
66      /** Number of values added so far; also grows when another instance is aggregated into this one. */
67      private long n;
68  
69      /** Second moment shared by the mean and both variance statistics; null when moments are disabled. */
70      private final SecondMoment secondMoment;
71      /** Minimum of the values that have been added; null when extrema are disabled. */
72      private final Min minImpl;
73      /** Maximum of the values that have been added; null when extrema are disabled. */
74      private final Max maxImpl;
75      /** Sum of the values that have been added; null when moments are disabled. */
76      private final Sum sumImpl;
77      /** Sum of the squares of the values that have been added; null when sum of squares is disabled. */
78      private final SumOfSquares sumOfSquaresImpl;
79      /** Sum of the logs of the values that have been added; null when sum of logs is disabled. */
80      private final SumOfLogs sumOfLogsImpl;
81      /** Mean of the values that have been added, backed by {@link #secondMoment}; null when moments are disabled. */
82      private final Mean meanImpl;
83      /** Variance of the values that have been added, backed by {@link #secondMoment}; null when moments are disabled. */
84      private final Variance varianceImpl;
85      /** Geometric mean of the values that have been added, backed by {@link #sumOfLogsImpl}; null when sum of logs is disabled. */
86      private final GeometricMean geoMeanImpl;
87      /** Population variance (built with a {@code false} flag) backed by {@link #secondMoment}; null when moments are disabled. */
88      private final Variance populationVariance;
89      /** Source of percentile estimates; null when no random generator was supplied. */
90      private final RandomPercentile randomPercentile;
91  
92      /** Whether or not moment stats (sum, mean, variance) are maintained. */
93      private final boolean computeMoments;
94      /** Whether or not sum of squares and quadratic mean are maintained. */
95      private final boolean computeSumOfSquares;
96      /** Whether or not sum of logs and geometric mean are maintained. */
97      private final boolean computeSumOfLogs;
98      /** Whether or not min and max are maintained. */
99      private final boolean computeExtrema;
100 
101     /**
102      * Construct a new StreamingStatistics instance, maintaining all statistics
103      * other than percentiles.
104      */
105     public StreamingStatistics() {
106        this(Double.NaN, null);
107     }
108 
109     /**
110      * Construct a new StreamingStatistics instance, maintaining all statistics
111      * other than percentiles and with/without percentiles per the arguments.
112      *
113      * @param epsilon bound on quantile estimation error (see {@link RandomGenerator})
114      * @param randomGenerator PRNG used in sampling and merge operations (null if percentiles should not be computed)
115      * @since 2.3
116      */
117     public StreamingStatistics(final double epsilon, final RandomGenerator randomGenerator) {
118        this(true, true, true, true, epsilon, randomGenerator);
119     }
120 
121     /**
122      * Private constructor used by {@link StreamingStatisticsBuilder}.
123      *
124      * @param computeMoments whether or not moment stats (mean, sum, variance) are maintained
125      * @param computeSumOfLogs whether or not sum of logs and geometric mean are maintained
126      * @param computeSumOfSquares whether or not sum of squares and quadratic mean are maintained
127      * @param computeExtrema whether or not min and max are maintained
128      * @param epsilon bound on quantile estimation error (see {@link RandomGenerator})
129      * @param randomGenerator PRNG used in sampling and merge operations (null if percentiles should not be computed)
130      * @since 2.3
131      */
132     private StreamingStatistics(final boolean computeMoments,
133                                 final boolean computeSumOfLogs, final boolean computeSumOfSquares,
134                                 final boolean computeExtrema,
135                                 final double epsilon, final RandomGenerator randomGenerator) {
136         this.computeMoments = computeMoments;
137         this.computeSumOfLogs = computeSumOfLogs;
138         this.computeSumOfSquares = computeSumOfSquares;
139         this.computeExtrema = computeExtrema;
140 
141         this.secondMoment = computeMoments ? new SecondMoment() : null;
142         this.maxImpl = computeExtrema ? new Max() : null;
143         this.minImpl = computeExtrema ? new Min() : null;
144         this.sumImpl = computeMoments ? new Sum() : null;
145         this.sumOfSquaresImpl = computeSumOfSquares ? new SumOfSquares() : null;
146         this.sumOfLogsImpl = computeSumOfLogs ? new SumOfLogs() : null;
147         this.meanImpl = computeMoments ? new Mean(this.secondMoment) : null;
148         this.varianceImpl = computeMoments ?  new Variance(this.secondMoment) : null;
149         this.geoMeanImpl = computeSumOfLogs ? new GeometricMean(this.sumOfLogsImpl) : null;
150         this.populationVariance = computeMoments ? new Variance(false, this.secondMoment) : null;
151         this.randomPercentile = randomGenerator == null ? null : new RandomPercentile(epsilon, randomGenerator);
152     }
153 
154     /**
155      * A copy constructor. Creates a deep-copy of the {@code original}.
156      *
157      * @param original the {@code StreamingStatistics} instance to copy
158      * @throws NullArgumentException if original is null
159      */
160     StreamingStatistics(StreamingStatistics original) throws NullArgumentException {
161         MathUtils.checkNotNull(original);
162 
163         this.n                = original.n;
164         this.secondMoment     = original.computeMoments ? original.secondMoment.copy() : null;
165         this.maxImpl          = original.computeExtrema ? original.maxImpl.copy() : null;
166         this.minImpl          = original.computeExtrema ? original.minImpl.copy() : null;
167         this.sumImpl          = original.computeMoments ? original.sumImpl.copy() : null;
168         this.sumOfLogsImpl    = original.computeSumOfLogs ? original.sumOfLogsImpl.copy() : null;
169         this.sumOfSquaresImpl = original.computeSumOfSquares ? original.sumOfSquaresImpl.copy() : null;
170 
171         // Keep statistics with embedded moments in synch
172         this.meanImpl     = original.computeMoments ? new Mean(this.secondMoment) : null;
173         this.varianceImpl = original.computeMoments ? new Variance(this.secondMoment) : null;
174         this.geoMeanImpl  = original.computeSumOfLogs ? new GeometricMean(this.sumOfLogsImpl) : null;
175         this.populationVariance = original.computeMoments ? new Variance(false, this.secondMoment) : null;
176         this.randomPercentile = original.randomPercentile != null ? original.randomPercentile.copy() : null;
177 
178         this.computeMoments = original.computeMoments;
179         this.computeSumOfLogs = original.computeSumOfLogs;
180         this.computeSumOfSquares = original.computeSumOfSquares;
181         this.computeExtrema = original.computeExtrema;
182     }
183 
184     /**
185      * Returns a copy of this StreamingStatistics instance with the same internal state.
186      *
187      * @return a copy of this
188      */
189     public StreamingStatistics copy() {
190         return new StreamingStatistics(this);
191     }
192 
193     /**
194      * Return a {@link StatisticalSummaryValues} instance reporting current
195      * statistics.
196      * @return Current values of statistics
197      */
198     public StatisticalSummary getSummary() {
199         return new StatisticalSummaryValues(getMean(), getVariance(), getN(),
200                                             getMax(), getMin(), getSum());
201     }
202 
203     /**
204      * Add a value to the data
205      * @param value the value to add
206      */
207     public void addValue(double value) {
208         if (computeMoments) {
209             secondMoment.increment(value);
210             sumImpl.increment(value);
211         }
212         if (computeExtrema) {
213             minImpl.increment(value);
214             maxImpl.increment(value);
215         }
216         if (computeSumOfSquares) {
217             sumOfSquaresImpl.increment(value);
218         }
219         if (computeSumOfLogs) {
220             sumOfLogsImpl.increment(value);
221         }
222         if (randomPercentile != null) {
223             randomPercentile.increment(value);
224         }
225         n++;
226     }
227 
228     /** {@inheritDoc} */
229     @Override
230     public void accept(double value) {
231         addValue(value);
232     }
233 
234     /**
235      * Resets all statistics and storage.
236      */
237     public void clear() {
238         this.n = 0;
239         if (computeExtrema) {
240             minImpl.clear();
241             maxImpl.clear();
242         }
243         if (computeMoments) {
244             sumImpl.clear();
245             secondMoment.clear();
246         }
247         if (computeSumOfLogs) {
248             sumOfLogsImpl.clear();
249         }
250         if (computeSumOfSquares) {
251             sumOfSquaresImpl.clear();
252         }
253         if (randomPercentile != null) {
254             randomPercentile.clear();
255         }
256     }
257 
258     /** {@inheritDoc} */
259     @Override
260     public long getN() {
261         return n;
262     }
263 
264     /** {@inheritDoc} */
265     @Override
266     public double getMax() {
267         return computeExtrema ? maxImpl.getResult() : Double.NaN;
268     }
269 
270     /** {@inheritDoc} */
271     @Override
272     public double getMin() {
273         return computeExtrema ? minImpl.getResult() : Double.NaN;
274     }
275 
276     /** {@inheritDoc} */
277     @Override
278     public double getSum() {
279         return computeMoments ? sumImpl.getResult() : Double.NaN;
280     }
281 
282     /**
283      * Returns the sum of the squares of the values that have been added.
284      * <p>
285      * Double.NaN is returned if no values have been added.
286      *
287      * @return The sum of squares
288      */
289     public double getSumOfSquares() {
290         return computeSumOfSquares ? sumOfSquaresImpl.getResult() : Double.NaN;
291     }
292 
293     /** {@inheritDoc} */
294     @Override
295     public double getMean() {
296         return computeMoments ? meanImpl.getResult() : Double.NaN;
297     }
298 
299     /** {@inheritDoc} */
300     @Override
301     public double getVariance() {
302         return computeMoments ? varianceImpl.getResult() : Double.NaN;
303     }
304 
305     /**
306      * Returns the <a href="http://en.wikibooks.org/wiki/Statistics/Summary/Variance">
307      * population variance</a> of the values that have been added.
308      * <p>
309      * Double.NaN is returned if no values have been added.
310      *
311      * @return the population variance
312      */
313     public double getPopulationVariance() {
314         return computeMoments ? populationVariance.getResult() : Double.NaN;
315     }
316 
317     /**
318      * Returns the geometric mean of the values that have been added.
319      * <p>
320      * Double.NaN is returned if no values have been added.
321      *
322      * @return the geometric mean
323      */
324     public double getGeometricMean() {
325         return computeSumOfLogs ? geoMeanImpl.getResult() : Double.NaN;
326     }
327 
328     /**
329      * Returns the sum of the logs of the values that have been added.
330      * <p>
331      * Double.NaN is returned if no values have been added.
332      *
333      * @return the sum of logs
334      */
335     public double getSumOfLogs() {
336         return computeSumOfLogs ? sumOfLogsImpl.getResult() : Double.NaN;
337     }
338 
339     /**
340      * Returns a statistic related to the Second Central Moment. Specifically,
341      * what is returned is the sum of squared deviations from the sample mean
342      * among the values that have been added.
343      * <p>
344      * Returns <code>Double.NaN</code> if no data values have been added and
345      * returns <code>0</code> if there is just one value in the data set.
346      *
347      * @return second central moment statistic
348      */
349     public double getSecondMoment() {
350         return computeMoments ? secondMoment.getResult() : Double.NaN;
351     }
352 
353     /**
354      * Returns the quadratic mean, a.k.a.
355      * <a href="http://mathworld.wolfram.com/Root-Mean-Square.html">
356      * root-mean-square</a> of the available values
357      *
358      * @return The quadratic mean or {@code Double.NaN} if no values
359      * have been added.
360      */
361     public double getQuadraticMean() {
362         if (computeSumOfSquares) {
363             long size = getN();
364             return size > 0 ? FastMath.sqrt(getSumOfSquares() / size) : Double.NaN;
365         } else {
366             return Double.NaN;
367         }
368     }
369 
370     /**
371      * Returns the standard deviation of the values that have been added.
372      * <p>
373      * Double.NaN is returned if no values have been added.
374      *
375      * @return the standard deviation
376      */
377     @Override
378     public double getStandardDeviation() {
379         long size = getN();
380         if (computeMoments) {
381             if (size > 0) {
382                 return size > 1 ? FastMath.sqrt(getVariance()) : 0.0;
383             } else {
384                 return Double.NaN;
385             }
386         } else {
387             return Double.NaN;
388         }
389     }
390 
391     /**
392      * Returns an estimate of the median of the values that have been entered.
393      * See {@link RandomPercentile} for a description of the algorithm used for large
394      * data streams.
395      *
396      * @return the median
397      */
398     public double getMedian() {
399         return randomPercentile != null ? randomPercentile.getResult(50d) : Double.NaN;
400     }
401 
402     /**
403      * Returns an estimate of the given percentile of the values that have been entered.
404      * See {@link RandomPercentile} for a description of the algorithm used for large
405      * data streams.
406      *
407      * @param percentile the desired percentile (must be between 0 and 100)
408      * @return estimated percentile
409      */
410     public double getPercentile(double percentile) {
411         return randomPercentile == null ? Double.NaN : randomPercentile.getResult(percentile);
412     }
413 
414     /**
415      * {@inheritDoc}
416      * Statistics are aggregated only when both this and other are maintaining them.  For example,
417      * if this.computeMoments is false, but other.computeMoments is true, the moment data in other
418      * will be lost.
419      */
420     @Override
421     public void aggregate(StreamingStatistics other) {
422         MathUtils.checkNotNull(other);
423 
424         if (other.n > 0) {
425             this.n += other.n;
426             if (computeMoments && other.computeMoments) {
427                 this.secondMoment.aggregate(other.secondMoment);
428                 this.sumImpl.aggregate(other.sumImpl);
429             }
430             if (computeExtrema && other.computeExtrema) {
431                 this.minImpl.aggregate(other.minImpl);
432                 this.maxImpl.aggregate(other.maxImpl);
433             }
434             if (computeSumOfLogs && other.computeSumOfLogs) {
435                 this.sumOfLogsImpl.aggregate(other.sumOfLogsImpl);
436             }
437             if (computeSumOfSquares && other.computeSumOfSquares) {
438                 this.sumOfSquaresImpl.aggregate(other.sumOfSquaresImpl);
439             }
440             if (randomPercentile != null && other.randomPercentile != null) {
441                 this.randomPercentile.aggregate(other.randomPercentile);
442             }
443         }
444     }
445 
446     /**
447      * Generates a text report displaying summary statistics from values that
448      * have been added.
449      *
450      * @return String with line feeds displaying statistics
451      */
452     @Override
453     public String toString() {
454         StringBuilder outBuffer = new StringBuilder(200); // the size is just a wild guess
455         String endl = "\n";
456         outBuffer.append("StreamingStatistics:").append(endl).
457                   append("n: ").append(getN()).append(endl).
458                   append("min: ").append(getMin()).append(endl).
459                   append("max: ").append(getMax()).append(endl).
460                   append("sum: ").append(getSum()).append(endl).
461                   append("mean: ").append(getMean()).append(endl).
462                   append("variance: ").append(getVariance()).append(endl).
463                   append("population variance: ").append(getPopulationVariance()).append(endl).
464                   append("standard deviation: ").append(getStandardDeviation()).append(endl).
465                   append("geometric mean: ").append(getGeometricMean()).append(endl).
466                   append("second moment: ").append(getSecondMoment()).append(endl).
467                   append("sum of squares: ").append(getSumOfSquares()).append(endl).
468                   append("sum of logs: ").append(getSumOfLogs()).append(endl);
469         return outBuffer.toString();
470     }
471 
472     /**
473      * Returns true iff <code>object</code> is a <code>StreamingStatistics</code>
474      * instance and all statistics have the same values as this.
475      *
476      * @param object the object to test equality against.
477      * @return true if object equals this
478      */
479     @Override
480     public boolean equals(Object object) {
481         if (object == this) {
482             return true;
483         }
484         if (!(object instanceof StreamingStatistics)) {
485             return false;
486         }
487         StreamingStatistics other = (StreamingStatistics)object;
488         return other.getN() == getN()                                                     &&
489                Precision.equalsIncludingNaN(other.getMax(),           getMax())           &&
490                Precision.equalsIncludingNaN(other.getMin(),           getMin())           &&
491                Precision.equalsIncludingNaN(other.getSum(),           getSum())           &&
492                Precision.equalsIncludingNaN(other.getGeometricMean(), getGeometricMean()) &&
493                Precision.equalsIncludingNaN(other.getMean(),          getMean())          &&
494                Precision.equalsIncludingNaN(other.getSumOfSquares(),  getSumOfSquares())  &&
495                Precision.equalsIncludingNaN(other.getSumOfLogs(),     getSumOfLogs())     &&
496                Precision.equalsIncludingNaN(other.getVariance(),      getVariance())      &&
497                Precision.equalsIncludingNaN(other.getMedian(),        getMedian());
498     }
499 
500     /**
501      * Returns hash code based on values of statistics.
502      * @return hash code
503      */
504     @Override
505     public int hashCode() {
506         int result = 31 + MathUtils.hash(getN());
507         result = result * 31 + MathUtils.hash(getMax());
508         result = result * 31 + MathUtils.hash(getMin());
509         result = result * 31 + MathUtils.hash(getSum());
510         result = result * 31 + MathUtils.hash(getGeometricMean());
511         result = result * 31 + MathUtils.hash(getMean());
512         result = result * 31 + MathUtils.hash(getSumOfSquares());
513         result = result * 31 + MathUtils.hash(getSumOfLogs());
514         result = result * 31 + MathUtils.hash(getVariance());
515         result = result * 31 + MathUtils.hash(getMedian());
516         return result;
517     }
518 
519     /**
520      * Returns a {@link StreamingStatisticsBuilder} to source configured
521      * {@code StreamingStatistics} instances.
522      *
523      * @return a StreamingStatisticsBuilder instance
524      */
525     public static StreamingStatisticsBuilder builder() {
526         return new StreamingStatisticsBuilder();
527     }
528 
529     /**
530      * Builder for StreamingStatistics instances.
531      */
532     public static class StreamingStatisticsBuilder {
533         /** whether or not moment statistics are maintained by instances created by this factory */
534         private boolean computeMoments;
535         /** whether or not sum of squares and quadratic mean are maintained by instances created by this factory */
536         private boolean computeSumOfSquares;
537         /** whether or not sum of logs and geometric mean are maintained by instances created by this factory */
538         private boolean computeSumOfLogs;
539         /** whether or not min and max are maintained by instances created by this factory */
540         private boolean computeExtrema;
541         /** bound on quantile estimation error for percentiles.
542          * @since 2.3
543          */
544         private double epsilon;
545         /** PRNG used in sampling and merge operations.
546          * @since 2.3
547          */
548         private RandomGenerator randomGenerator;
549 
550         /** Simple constructor.
551          */
552         public StreamingStatisticsBuilder() {
553             computeMoments      = true;
554             computeSumOfSquares = true;
555             computeSumOfLogs    = true;
556             computeExtrema      = true;
557             percentiles(Double.NaN, null);
558         }
559 
560         /**
561          * Sets the computeMoments setting of the factory
562          *
563          * @param arg whether or not instances created using {@link #build()} will
564          * maintain moment statistics
565          * @return a factory with the given computeMoments property set
566          */
567         public StreamingStatisticsBuilder moments(boolean arg) {
568             this.computeMoments = arg;
569             return this;
570         }
571 
572         /**
573          * Sets the computeSumOfLogs setting of the factory
574          *
575          * @param arg whether or not instances created using {@link #build()} will
576          * maintain log sums
577          * @return a factory with the given computeSumOfLogs property set
578          */
579         public StreamingStatisticsBuilder sumOfLogs(boolean arg) {
580             this.computeSumOfLogs = arg;
581             return this;
582         }
583 
584         /**
585          * Sets the computeSumOfSquares setting of the factory.
586          *
587          * @param arg whether or not instances created using {@link #build()} will
588          * maintain sums of squares
589          * @return a factory with the given computeSumOfSquares property set
590          */
591         public StreamingStatisticsBuilder sumOfSquares(boolean arg) {
592             this.computeSumOfSquares = arg;
593             return this;
594         }
595 
596         /**
597          * Sets the computePercentiles setting of the factory.
598          * @param epsilonBound bound on quantile estimation error (see {@link RandomGenerator})
599          * @param generator PRNG used in sampling and merge operations
600          * @return a factory with the given computePercentiles property set
601          * @since 2.3
602          */
603         public StreamingStatisticsBuilder percentiles(final double epsilonBound, final RandomGenerator generator) {
604             this.epsilon         = epsilonBound;
605             this.randomGenerator = generator;
606             return this;
607         }
608 
609         /**
610          * Sets the computeExtrema setting of the factory.
611          *
612          * @param arg whether or not instances created using {@link #build()} will
613          * compute min and max
614          * @return a factory with the given computeExtrema property set
615          */
616         public StreamingStatisticsBuilder extrema(boolean arg) {
617             this.computeExtrema = arg;
618             return this;
619         }
620 
621         /**
622          * Builds a StreamingStatistics instance with currently defined properties.
623          *
624          * @return newly configured StreamingStatistics instance
625          */
626         public StreamingStatistics build() {
627             return new StreamingStatistics(computeMoments,
628                                            computeSumOfLogs, computeSumOfSquares,
629                                            computeExtrema,
630                                            epsilon, randomGenerator);
631         }
632     }
633 }