View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      https://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  /*
19   * This is not the original file distributed by the Apache Software Foundation
20   * It has been modified by the Hipparchus project
21   */
22  package org.hipparchus.stat.inference;
23  
24  import java.util.ArrayList;
25  import java.util.Collection;
26  
27  import org.hipparchus.distribution.continuous.FDistribution;
28  import org.hipparchus.exception.MathIllegalArgumentException;
29  import org.hipparchus.exception.MathIllegalStateException;
30  import org.hipparchus.exception.NullArgumentException;
31  import org.hipparchus.stat.LocalizedStatFormats;
32  import org.hipparchus.stat.descriptive.StreamingStatistics;
33  import org.hipparchus.util.MathUtils;
34  
35  /**
36   * Implements one-way ANOVA (analysis of variance) statistics.
37   *
38   * <p> Tests for differences between two or more categories of univariate data
39   * (for example, the body mass index of accountants, lawyers, doctors and
40   * computer programmers).  When two categories are given, this is equivalent to
41   * the {@link TTest}.
42   * </p><p>
43   * Uses the {@link FDistribution
44   * Hipparchus F Distribution implementation} to estimate exact p-values.</p>
45   * <p>This implementation is based on a description at
46   * <a href="http://faculty.vassar.edu/lowry/ch13pt1.html">One way Anova (dead link)</a></p>
47   * <pre>
48   * Abbreviations: bg = between groups,
49   *                wg = within groups,
50   *                ss = sum squared deviations
51   * </pre>
52   *
53   */
54  public class OneWayAnova {
55  
56      /** Empty constructor.
57       * <p>
58       * This constructor is not strictly necessary, but it prevents spurious
59       * javadoc warnings with JDK 18 and later.
60       * </p>
61       * @since 3.0
62       */
63      public OneWayAnova() { // NOPMD - unnecessary constructor added intentionally to make javadoc happy
64          // nothing to do
65      }
66  
67      /**
68       * Computes the ANOVA F-value for a collection of <code>double[]</code>
69       * arrays.
70       *
71       * <p><strong>Preconditions</strong>:</p>
72       * <ul>
73       * <li>The categoryData <code>Collection</code> must contain
74       * <code>double[]</code> arrays.</li>
75       * <li> There must be at least two <code>double[]</code> arrays in the
76       * <code>categoryData</code> collection and each of these arrays must
77       * contain at least two values.</li></ul>
78       * <p>
79       * This implementation computes the F statistic using the definitional
80       * formula</p><pre>
81       *   F = msbg/mswg</pre>
82       * <p>where</p><pre>
83       *  msbg = between group mean square
84       *  mswg = within group mean square</pre>
85       *  <p>
86       * are as defined <a href="http://faculty.vassar.edu/lowry/ch13pt1.html">
87       * here</a></p>
88       *
89       * @param categoryData <code>Collection</code> of <code>double[]</code>
90       * arrays each containing data for one category
91       * @return Fvalue
92       * @throws NullArgumentException if <code>categoryData</code> is <code>null</code>
93       * @throws MathIllegalArgumentException if the length of the <code>categoryData</code>
94       * array is less than 2 or a contained <code>double[]</code> array does not have
95       * at least two values
96       */
97      public double anovaFValue(final Collection<double[]> categoryData)
98          throws MathIllegalArgumentException, NullArgumentException {
99  
100         return anovaStats(categoryData).F;
101 
102     }
103 
104     /**
105      * Computes the ANOVA P-value for a collection of <code>double[]</code>
106      * arrays.
107      *
108      * <p><strong>Preconditions</strong>:</p>
109      * <ul>
110      * <li>The categoryData <code>Collection</code> must contain
111      * <code>double[]</code> arrays.</li>
112      * <li> There must be at least two <code>double[]</code> arrays in the
113      * <code>categoryData</code> collection and each of these arrays must
114      * contain at least two values.</li></ul>
115      * <p>
116      * This implementation uses the
117      * {@link org.hipparchus.distribution.continuous.FDistribution
118      * Hipparchus F Distribution implementation} to estimate the exact
119      * p-value, using the formula</p><pre>
120      *   p = 1 - cumulativeProbability(F)</pre>
121      * <p>where <code>F</code> is the F value and <code>cumulativeProbability</code>
122      * is the Hipparchus implementation of the F distribution.</p>
123      *
124      * @param categoryData <code>Collection</code> of <code>double[]</code>
125      * arrays each containing data for one category
126      * @return Pvalue
127      * @throws NullArgumentException if <code>categoryData</code> is <code>null</code>
128      * @throws MathIllegalArgumentException if the length of the <code>categoryData</code>
129      * array is less than 2 or a contained <code>double[]</code> array does not have
130      * at least two values
131      * @throws MathIllegalStateException if the p-value can not be computed due to a convergence error
132      * @throws MathIllegalStateException if the maximum number of iterations is exceeded
133      */
134     public double anovaPValue(final Collection<double[]> categoryData)
135         throws MathIllegalArgumentException, NullArgumentException,
136         MathIllegalStateException {
137 
138         final AnovaStats a = anovaStats(categoryData);
139         // No try-catch or advertised exception because args are valid
140         final FDistribution fdist = new FDistribution(a.dfbg, a.dfwg);
141         return 1.0 - fdist.cumulativeProbability(a.F);
142 
143     }
144 
145     /**
146      * Computes the ANOVA P-value for a collection of {@link StreamingStatistics}.
147      *
148      * <p><strong>Preconditions</strong>:</p>
149      * <ul>
150      * <li>The categoryData <code>Collection</code> must contain
151      * {@link StreamingStatistics}.</li>
152      * <li> There must be at least two {@link StreamingStatistics} in the
153      * <code>categoryData</code> collection and each of these statistics must
154      * contain at least two values.</li></ul>
155      * <p>
156      * This implementation uses the
157      * {@link org.hipparchus.distribution.continuous.FDistribution
158      * Hipparchus F Distribution implementation} to estimate the exact
159      * p-value, using the formula</p><pre>
160      *   p = 1 - cumulativeProbability(F)</pre>
161      * <p>where <code>F</code> is the F value and <code>cumulativeProbability</code>
162      * is the Hipparchus implementation of the F distribution.</p>
163      *
164      * @param categoryData <code>Collection</code> of {@link StreamingStatistics}
165      * each containing data for one category
166      * @param allowOneElementData if true, allow computation for one catagory
167      * only or for one data element per category
168      * @return Pvalue
169      * @throws NullArgumentException if <code>categoryData</code> is <code>null</code>
170      * @throws MathIllegalArgumentException if the length of the <code>categoryData</code>
171      * array is less than 2 or a contained {@link StreamingStatistics} does not have
172      * at least two values
173      * @throws MathIllegalStateException if the p-value can not be computed due to a convergence error
174      * @throws MathIllegalStateException if the maximum number of iterations is exceeded
175      */
176     public double anovaPValue(final Collection<StreamingStatistics> categoryData,
177                               final boolean allowOneElementData)
178         throws MathIllegalArgumentException, NullArgumentException,
179         MathIllegalStateException {
180 
181         final AnovaStats a = anovaStats(categoryData, allowOneElementData);
182         final FDistribution fdist = new FDistribution(a.dfbg, a.dfwg);
183         return 1.0 - fdist.cumulativeProbability(a.F);
184 
185     }
186 
187     /**
188      * This method calls the method that actually does the calculations (except
189      * P-value).
190      *
191      * @param categoryData
192      *            <code>Collection</code> of <code>double[]</code> arrays each
193      *            containing data for one category
194      * @return computed AnovaStats
195      * @throws NullArgumentException
196      *             if <code>categoryData</code> is <code>null</code>
197      * @throws MathIllegalArgumentException
198      *             if the length of the <code>categoryData</code> array is less
199      *             than 2 or a contained <code>double[]</code> array does not
200      *             contain at least two values
201      */
202     private AnovaStats anovaStats(final Collection<double[]> categoryData)
203         throws MathIllegalArgumentException, NullArgumentException {
204 
205         MathUtils.checkNotNull(categoryData);
206 
207         final Collection<StreamingStatistics> categoryDataSummaryStatistics =
208                 new ArrayList<>(categoryData.size());
209 
210         // convert arrays to SummaryStatistics
211         for (final double[] data : categoryData) {
212             final StreamingStatistics dataSummaryStatistics = new StreamingStatistics();
213             categoryDataSummaryStatistics.add(dataSummaryStatistics);
214             for (final double val : data) {
215                 dataSummaryStatistics.addValue(val);
216             }
217         }
218 
219         return anovaStats(categoryDataSummaryStatistics, false);
220 
221     }
222 
223     /**
224      * Performs an ANOVA test, evaluating the null hypothesis that there
225      * is no difference among the means of the data categories.
226      *
227      * <p><strong>Preconditions</strong>:</p>
228      * <ul>
229      * <li>The categoryData <code>Collection</code> must contain
230      * <code>double[]</code> arrays.</li>
231      * <li> There must be at least two <code>double[]</code> arrays in the
232      * <code>categoryData</code> collection and each of these arrays must
233      * contain at least two values.</li>
234      * <li>alpha must be strictly greater than 0 and less than or equal to 0.5.
235      * </li></ul>
236      * <p>
237      * This implementation uses the
238      * {@link org.hipparchus.distribution.continuous.FDistribution
239      * Hipparchus F Distribution implementation} to estimate the exact
240      * p-value, using the formula</p><pre>
241      *   p = 1 - cumulativeProbability(F)</pre>
242      * <p>where <code>F</code> is the F value and <code>cumulativeProbability</code>
243      * is the Hipparchus implementation of the F distribution.</p>
244      * <p>True is returned iff the estimated p-value is less than alpha.</p>
245      *
246      * @param categoryData <code>Collection</code> of <code>double[]</code>
247      * arrays each containing data for one category
248      * @param alpha significance level of the test
249      * @return true if the null hypothesis can be rejected with
250      * confidence 1 - alpha
251      * @throws NullArgumentException if <code>categoryData</code> is <code>null</code>
252      * @throws MathIllegalArgumentException if the length of the <code>categoryData</code>
253      * array is less than 2 or a contained <code>double[]</code> array does not have
254      * at least two values
255      * @throws MathIllegalArgumentException if <code>alpha</code> is not in the range (0, 0.5]
256      * @throws MathIllegalStateException if the p-value can not be computed due to a convergence error
257      * @throws MathIllegalStateException if the maximum number of iterations is exceeded
258      */
259     public boolean anovaTest(final Collection<double[]> categoryData,
260                              final double alpha)
261         throws MathIllegalArgumentException, NullArgumentException, MathIllegalStateException {
262 
263         if ((alpha <= 0) || (alpha > 0.5)) {
264             throw new MathIllegalArgumentException(
265                     LocalizedStatFormats.OUT_OF_BOUND_SIGNIFICANCE_LEVEL,
266                     alpha, 0, 0.5);
267         }
268         return anovaPValue(categoryData) < alpha;
269     }
270 
271     /**
272      * This method actually does the calculations (except P-value).
273      *
274      * @param categoryData <code>Collection</code> of <code>double[]</code>
275      * arrays each containing data for one category
276      * @param allowOneElementData if true, allow computation for one category
277      * only or for one data element per category
278      * @return computed AnovaStats
279      * @throws NullArgumentException if <code>categoryData</code> is <code>null</code>
280      * @throws MathIllegalArgumentException if <code>allowOneElementData</code> is false and the number of
281      * categories is less than 2 or a contained SummaryStatistics does not contain
282      * at least two values
283      */
284     private AnovaStats anovaStats(final Collection<StreamingStatistics> categoryData,
285                                   final boolean allowOneElementData)
286         throws MathIllegalArgumentException, NullArgumentException {
287 
288         MathUtils.checkNotNull(categoryData);
289 
290         if (!allowOneElementData) {
291             // check if we have enough categories
292             if (categoryData.size() < 2) {
293                 throw new MathIllegalArgumentException(LocalizedStatFormats.TWO_OR_MORE_CATEGORIES_REQUIRED,
294                                                        categoryData.size(), 2);
295             }
296 
297             // check if each category has enough data
298             for (final StreamingStatistics array : categoryData) {
299                 if (array.getN() <= 1) {
300                     throw new MathIllegalArgumentException(LocalizedStatFormats.TWO_OR_MORE_VALUES_IN_CATEGORY_REQUIRED,
301                                                            (int) array.getN(), 2);
302                 }
303             }
304         }
305 
306         int dfwg = 0;
307         double sswg = 0;
308         double totsum = 0;
309         double totsumsq = 0;
310         int totnum = 0;
311 
312         for (final StreamingStatistics data : categoryData) {
313 
314             final double sum = data.getSum();
315             final double sumsq = data.getSumOfSquares();
316             final int num = (int) data.getN();
317             totnum += num;
318             totsum += sum;
319             totsumsq += sumsq;
320 
321             dfwg += num - 1;
322             final double ss = sumsq - ((sum * sum) / num);
323             sswg += ss;
324         }
325 
326         final double sst = totsumsq - ((totsum * totsum) / totnum);
327         final double ssbg = sst - sswg;
328         final int dfbg = categoryData.size() - 1;
329         final double msbg = ssbg / dfbg;
330         final double mswg = sswg / dfwg;
331         final double F = msbg / mswg;
332 
333         return new AnovaStats(dfbg, dfwg, F);
334 
335     }
336 
337     /**
338      * Convenience class to pass dfbg,dfwg,F values around within OneWayAnova.
339      * No get/set methods provided.
340      */
341     private static class AnovaStats {
342 
343         /** Degrees of freedom in numerator (between groups). */
344         private final int dfbg;
345 
346         /** Degrees of freedom in denominator (within groups). */
347         private final int dfwg;
348 
349         /** Statistic. */
350         private final double F;
351 
352         /**
353          * Constructor
354          * @param dfbg degrees of freedom in numerator (between groups)
355          * @param dfwg degrees of freedom in denominator (within groups)
356          * @param F statistic
357          */
358         private AnovaStats(int dfbg, int dfwg, double F) {
359             this.dfbg = dfbg;
360             this.dfwg = dfwg;
361             this.F = F;
362         }
363     }
364 
365 }