View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      https://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  /*
19   * This is not the original file distributed by the Apache Software Foundation
20   * It has been modified by the Hipparchus project
21   */
22  package org.hipparchus.stat.inference;
23  
24  import org.hipparchus.distribution.continuous.ChiSquaredDistribution;
25  import org.hipparchus.exception.LocalizedCoreFormats;
26  import org.hipparchus.exception.MathIllegalArgumentException;
27  import org.hipparchus.exception.MathIllegalStateException;
28  import org.hipparchus.stat.LocalizedStatFormats;
29  import org.hipparchus.util.FastMath;
30  import org.hipparchus.util.MathArrays;
31  import org.hipparchus.util.MathUtils;
32  
33  /**
34   * Implements <a href="http://en.wikipedia.org/wiki/G-test">G Test</a>
35   * statistics.
36   * <p>
37   * This is known in statistical genetics as the McDonald-Kreitman test.
38   * The implementation handles both known and unknown distributions.
39   * <p>
40   * Two samples tests can be used when the distribution is unknown <i>a priori</i>
41   * but provided by one sample, or when the hypothesis under test is that the two
42   * samples come from the same underlying distribution.
43   */
44  public class GTest { // NOPMD - this is not a Junit test class, PMD false positive here
45  
46      /** Empty constructor.
47       * <p>
48       * This constructor is not strictly necessary, but it prevents spurious
49       * javadoc warnings with JDK 18 and later.
50       * </p>
51       * @since 3.0
52       */
53      public GTest() { // NOPMD - unnecessary constructor added intentionally to make javadoc happy
54          // nothing to do
55      }
56  
57      /**
58       * Computes the <a href="http://en.wikipedia.org/wiki/G-test">G statistic
59       * for Goodness of Fit</a> comparing {@code observed} and {@code expected}
60       * frequency counts.
61       * <p>
62       * This statistic can be used to perform a G test (Log-Likelihood Ratio
63       * Test) evaluating the null hypothesis that the observed counts follow the
64       * expected distribution.
65       * <p>
66       * <strong>Preconditions</strong>:
67       * <ul>
68       * <li>Expected counts must all be positive.</li>
69       * <li>Observed counts must all be &ge; 0.</li>
70       * <li>The observed and expected arrays must have the same length and their
71       * common length must be at least 2. </li>
72       * </ul>
73       * <p>
74       * If any of the preconditions are not met, a
75       * {@code MathIllegalArgumentException} is thrown.
76       * <p>
77       * <strong>Note:</strong>This implementation rescales the
78       * {@code expected} array if necessary to ensure that the sum of the
79       * expected and observed counts are equal.
80       *
81       * @param observed array of observed frequency counts
82       * @param expected array of expected frequency counts
83       * @return G-Test statistic
84       * @throws MathIllegalArgumentException if {@code observed} has negative entries
85       * @throws MathIllegalArgumentException if {@code expected} has entries that
86       * are not strictly positive
87       * @throws MathIllegalArgumentException if the array lengths do not match or
88       * are less than 2.
89       */
90      public double g(final double[] expected, final long[] observed)
91              throws MathIllegalArgumentException {
92  
93          if (expected.length < 2) {
94              throw new MathIllegalArgumentException(LocalizedCoreFormats.DIMENSIONS_MISMATCH,
95                                                     expected.length, 2);
96          }
97          MathUtils.checkDimension(expected.length, observed.length);
98          MathArrays.checkPositive(expected);
99          MathArrays.checkNonNegative(observed);
100 
101         double sumExpected = 0d;
102         double sumObserved = 0d;
103         for (int i = 0; i < observed.length; i++) {
104             sumExpected += expected[i];
105             sumObserved += observed[i];
106         }
107         double ratio = 1d;
108         boolean rescale = false;
109         if (FastMath.abs(sumExpected - sumObserved) > 10E-6) {
110             ratio = sumObserved / sumExpected;
111             rescale = true;
112         }
113         double sum = 0d;
114         for (int i = 0; i < observed.length; i++) {
115             final double dev = rescale ?
116                     FastMath.log(observed[i] / (ratio * expected[i])) :
117                         FastMath.log(observed[i] / expected[i]);
118             sum += (observed[i]) * dev;
119         }
120         return 2d * sum;
121     }
122 
123     /**
124      * Returns the <i>observed significance level</i>, or <a href=
125      * "http://www.cas.lancs.ac.uk/glossary_v1.1/hyptest.html#pvalue"> p-value</a>,
126      * associated with a G-Test for goodness of fit comparing the
127      * {@code observed} frequency counts to those in the {@code expected} array.
128      *
129      * <p>The number returned is the smallest significance level at which one
130      * can reject the null hypothesis that the observed counts conform to the
131      * frequency distribution described by the expected counts.</p>
132      *
133      * <p>The probability returned is the tail probability beyond
134      * {@link #g(double[], long[]) g(expected, observed)}
135      * in the ChiSquare distribution with degrees of freedom one less than the
136      * common length of {@code expected} and {@code observed}.</p>
137      *
138      * <p> <strong>Preconditions</strong>:</p>
139      * <ul>
140      * <li>Expected counts must all be positive. </li>
141      * <li>Observed counts must all be &ge; 0. </li>
142      * <li>The observed and expected arrays must have the
143      * same length and their common length must be at least 2.</li>
144      * </ul>
145      *
146      * <p>If any of the preconditions are not met, a
147      * {@code MathIllegalArgumentException} is thrown.</p>
148      *
149      * <p><strong>Note:</strong>This implementation rescales the
150      * {@code expected} array if necessary to ensure that the sum of the
151      *  expected and observed counts are equal.</p>
152      *
153      * @param observed array of observed frequency counts
154      * @param expected array of expected frequency counts
155      * @return p-value
156      * @throws MathIllegalArgumentException if {@code observed} has negative entries
157      * @throws MathIllegalArgumentException if {@code expected} has entries that
158      * are not strictly positive
159      * @throws MathIllegalArgumentException if the array lengths do not match or
160      * are less than 2.
161      * @throws MathIllegalStateException if an error occurs computing the
162      * p-value.
163      */
164     public double gTest(final double[] expected, final long[] observed)
165             throws MathIllegalArgumentException, MathIllegalStateException {
166 
167         final ChiSquaredDistribution distribution =
168                 new ChiSquaredDistribution(expected.length - 1.0);
169         return 1.0 - distribution.cumulativeProbability(g(expected, observed));
170     }
171 
172     /**
173      * Returns the intrinsic (Hardy-Weinberg proportions) p-Value, as described
174      * in p64-69 of McDonald, J.H. 2009. Handbook of Biological Statistics
175      * (2nd ed.). Sparky House Publishing, Baltimore, Maryland.
176      *
177      * <p> The probability returned is the tail probability beyond
178      * {@link #g(double[], long[]) g(expected, observed)}
179      * in the ChiSquare distribution with degrees of freedom two less than the
180      * common length of {@code expected} and {@code observed}.</p>
181      *
182      * @param observed array of observed frequency counts
183      * @param expected array of expected frequency counts
184      * @return p-value
185      * @throws MathIllegalArgumentException if {@code observed} has negative entries
186      * @throws MathIllegalArgumentException {@code expected} has entries that are
187      * not strictly positive
188      * @throws MathIllegalArgumentException if the array lengths do not match or
189      * are less than 2.
190      * @throws MathIllegalStateException if an error occurs computing the
191      * p-value.
192      */
193     public double gTestIntrinsic(final double[] expected, final long[] observed)
194             throws MathIllegalArgumentException, MathIllegalStateException {
195 
196         final ChiSquaredDistribution distribution =
197                 new ChiSquaredDistribution(expected.length - 2.0);
198         return 1.0 - distribution.cumulativeProbability(g(expected, observed));
199     }
200 
201     /**
202      * Performs a G-Test (Log-Likelihood Ratio Test) for goodness of fit
203      * evaluating the null hypothesis that the observed counts conform to the
204      * frequency distribution described by the expected counts, with
205      * significance level {@code alpha}. Returns true iff the null
206      * hypothesis can be rejected with {@code 100 * (1 - alpha)} percent confidence.
207      *
208      * <p><strong>Example:</strong><br> To test the hypothesis that
209      * {@code observed} follows {@code expected} at the 99% level,
210      * use </p><p>
211      * {@code gTest(expected, observed, 0.01)}</p>
212      *
213      * <p>Returns true iff {@link #gTest(double[], long[])
214      *  gTestGoodnessOfFitPValue(expected, observed)} &gt; alpha</p>
215      *
216      * <p><strong>Preconditions</strong>:</p>
217      * <ul>
218      * <li>Expected counts must all be positive. </li>
219      * <li>Observed counts must all be &ge; 0. </li>
220      * <li>The observed and expected arrays must have the same length and their
221      * common length must be at least 2.
222      * <li> {@code 0 < alpha < 0.5} </li></ul>
223      *
224      * <p>If any of the preconditions are not met, a
225      * {@code MathIllegalArgumentException} is thrown.</p>
226      *
227      * <p><strong>Note:</strong>This implementation rescales the
228      * {@code expected} array if necessary to ensure that the sum of the
229      * expected and observed counts are equal.</p>
230      *
231      * @param observed array of observed frequency counts
232      * @param expected array of expected frequency counts
233      * @param alpha significance level of the test
234      * @return true iff null hypothesis can be rejected with confidence 1 -
235      * alpha
236      * @throws MathIllegalArgumentException if {@code observed} has negative entries
237      * @throws MathIllegalArgumentException if {@code expected} has entries that
238      * are not strictly positive
239      * @throws MathIllegalArgumentException if the array lengths do not match or
240      * are less than 2.
241      * @throws MathIllegalStateException if an error occurs computing the
242      * p-value.
243      * @throws MathIllegalArgumentException if alpha is not strictly greater than zero
244      * and less than or equal to 0.5
245      */
246     public boolean gTest(final double[] expected, final long[] observed,
247             final double alpha)
248             throws MathIllegalArgumentException, MathIllegalStateException {
249 
250         if ((alpha <= 0) || (alpha > 0.5)) {
251             throw new MathIllegalArgumentException(LocalizedStatFormats.OUT_OF_BOUND_SIGNIFICANCE_LEVEL,
252                     alpha, 0, 0.5);
253         }
254         return gTest(expected, observed) < alpha;
255     }
256 
257     /**
258      * Calculates the <a href=
259      * "http://en.wikipedia.org/wiki/Entropy_%28information_theory%29">Shannon
260      * entropy</a> for 2 Dimensional Matrix.  The value returned is the entropy
261      * of the vector formed by concatenating the rows (or columns) of {@code k}
262      * to form a vector. See {@link #entropy(long[])}.
263      *
264      * @param k 2 Dimensional Matrix of long values (for ex. the counts of a
265      * trials)
266      * @return Shannon Entropy of the given Matrix
267      *
268      */
269     private double entropy(final long[][] k) {
270         double h = 0d;
271         double sum_k = 0d;
272         for (int i = 0; i < k.length; i++) {
273             for (int j = 0; j < k[i].length; j++) {
274                 sum_k += k[i][j];
275             }
276         }
277         for (int i = 0; i < k.length; i++) {
278             for (int j = 0; j < k[i].length; j++) {
279                 if (k[i][j] != 0) {
280                     final double p_ij = k[i][j] / sum_k;
281                     h += p_ij * FastMath.log(p_ij);
282                 }
283             }
284         }
285         return -h;
286     }
287 
288     /**
289      * Calculates the <a href="http://en.wikipedia.org/wiki/Entropy_%28information_theory%29">
290      * Shannon entropy</a> for a vector.  The values of {@code k} are taken to be
291      * incidence counts of the values of a random variable. What is returned is <br>
292      * &sum;p<sub>i</sub>log(p<sub>i</sub><br>
293      * where p<sub>i</sub> = k[i] / (sum of elements in k)
294      *
295      * @param k Vector (for ex. Row Sums of a trials)
296      * @return Shannon Entropy of the given Vector
297      *
298      */
299     private double entropy(final long[] k) {
300         double h = 0d;
301         double sum_k = 0d;
302         for (int i = 0; i < k.length; i++) {
303             sum_k += k[i];
304         }
305         for (int i = 0; i < k.length; i++) {
306             if (k[i] != 0) {
307                 final double p_i = k[i] / sum_k;
308                 h += p_i * FastMath.log(p_i);
309             }
310         }
311         return -h;
312     }
313 
314     /**
315      * <p>Computes a G (Log-Likelihood Ratio) two sample test statistic for
316      * independence comparing frequency counts in
317      * {@code observed1} and {@code observed2}. The sums of frequency
318      * counts in the two samples are not required to be the same. The formula
319      * used to compute the test statistic is </p>
320      *
321      * <p>{@code 2 * totalSum * [H(rowSums) + H(colSums) - H(k)]}</p>
322      *
323      * <p> where {@code H} is the
324      * <a href="http://en.wikipedia.org/wiki/Entropy_%28information_theory%29">
325      * Shannon Entropy</a> of the random variable formed by viewing the elements
326      * of the argument array as incidence counts; <br>
327      * {@code k} is a matrix with rows {@code [observed1, observed2]}; <br>
328      * {@code rowSums, colSums} are the row/col sums of {@code k}; <br>
329      * and {@code totalSum} is the overall sum of all entries in {@code k}.</p>
330      *
331      * <p>This statistic can be used to perform a G test evaluating the null
332      * hypothesis that both observed counts are independent </p>
333      *
334      * <p> <strong>Preconditions</strong>:</p>
335      * <ul>
336      * <li>Observed counts must be non-negative. </li>
337      * <li>Observed counts for a specific bin must not both be zero. </li>
338      * <li>Observed counts for a specific sample must not all be  0. </li>
339      * <li>The arrays {@code observed1} and {@code observed2} must have
340      * the same length and their common length must be at least 2. </li></ul>
341      *
342      * <p>If any of the preconditions are not met, a
343      * {@code MathIllegalArgumentException} is thrown.</p>
344      *
345      * @param observed1 array of observed frequency counts of the first data set
346      * @param observed2 array of observed frequency counts of the second data
347      * set
348      * @return G-Test statistic
349      * @throws MathIllegalArgumentException the the lengths of the arrays do not
350      * match or their common length is less than 2
351      * @throws MathIllegalArgumentException if any entry in {@code observed1} or
352      * {@code observed2} is negative
353      * @throws MathIllegalArgumentException if either all counts of
354      * {@code observed1} or {@code observed2} are zero, or if the count
355      * at the same index is zero for both arrays.
356      */
357     public double gDataSetsComparison(final long[] observed1, final long[] observed2)
358             throws MathIllegalArgumentException {
359 
360         // Make sure lengths are same
361         if (observed1.length < 2) {
362             throw new MathIllegalArgumentException(LocalizedCoreFormats.DIMENSIONS_MISMATCH,
363                                                    observed1.length, 2);
364         }
365         MathUtils.checkDimension(observed1.length, observed2.length);
366 
367         // Ensure non-negative counts
368         MathArrays.checkNonNegative(observed1);
369         MathArrays.checkNonNegative(observed2);
370 
371         // Compute and compare count sums
372         long countSum1 = 0;
373         long countSum2 = 0;
374 
375         // Compute and compare count sums
376         final long[] collSums = new long[observed1.length];
377         final long[][] k = new long[2][observed1.length];
378 
379         for (int i = 0; i < observed1.length; i++) {
380             if (observed1[i] == 0 && observed2[i] == 0) {
381                 throw new MathIllegalArgumentException(LocalizedCoreFormats.OBSERVED_COUNTS_BOTTH_ZERO_FOR_ENTRY, i);
382             } else {
383                 countSum1 += observed1[i];
384                 countSum2 += observed2[i];
385                 collSums[i] = observed1[i] + observed2[i];
386                 k[0][i] = observed1[i];
387                 k[1][i] = observed2[i];
388             }
389         }
390         // Ensure neither sample is uniformly 0
391         if (countSum1 == 0 || countSum2 == 0) {
392             throw new MathIllegalArgumentException(LocalizedCoreFormats.ZERO_NOT_ALLOWED);
393         }
394         final long[] rowSums = {countSum1, countSum2};
395         final double sum = (double) countSum1 + (double) countSum2;
396         return 2 * sum * (entropy(rowSums) + entropy(collSums) - entropy(k));
397     }
398 
399     /**
400      * Calculates the root log-likelihood ratio for 2 state Datasets. See
401      * {@link #gDataSetsComparison(long[], long[] )}.
402      *
403      * <p>Given two events A and B, let k11 be the number of times both events
404      * occur, k12 the incidence of B without A, k21 the count of A without B,
405      * and k22 the number of times neither A nor B occurs.  What is returned
406      * by this method is </p>
407      *
408      * <p>{@code (sgn) sqrt(gValueDataSetsComparison({k11, k12}, {k21, k22})}</p>
409      *
410      * <p>where {@code sgn} is -1 if {@code k11 / (k11 + k12) < k21 / (k21 + k22))};<br>
411      * 1 otherwise.</p>
412      *
413      * <p>Signed root LLR has two advantages over the basic LLR: a) it is positive
414      * where k11 is bigger than expected, negative where it is lower b) if there is
415      * no difference it is asymptotically normally distributed. This allows one
416      * to talk about "number of standard deviations" which is a more common frame
417      * of reference than the chi^2 distribution.</p>
418      *
419      * @param k11 number of times the two events occurred together (AB)
420      * @param k12 number of times the second event occurred WITHOUT the
421      * first event (notA,B)
422      * @param k21 number of times the first event occurred WITHOUT the
423      * second event (A, notB)
424      * @param k22 number of times something else occurred (i.e. was neither
425      * of these events (notA, notB)
426      * @return root log-likelihood ratio
427      *
428      */
429     public double rootLogLikelihoodRatio(final long k11, long k12,
430             final long k21, final long k22) {
431         final double llr = gDataSetsComparison(
432                 new long[]{k11, k12}, new long[]{k21, k22});
433         double sqrt = FastMath.sqrt(llr);
434         if ((double) k11 / (k11 + k12) < (double) k21 / (k21 + k22)) {
435             sqrt = -sqrt;
436         }
437         return sqrt;
438     }
439 
440     /**
441      * <p>Returns the <i>observed significance level</i>, or <a href=
442      * "http://www.cas.lancs.ac.uk/glossary_v1.1/hyptest.html#pvalue">
443      * p-value</a>, associated with a G-Value (Log-Likelihood Ratio) for two
444      * sample test comparing bin frequency counts in {@code observed1} and
445      * {@code observed2}.</p>
446      *
447      * <p>The number returned is the smallest significance level at which one
448      * can reject the null hypothesis that the observed counts conform to the
449      * same distribution. </p>
450      *
451      * <p>See {@link #gTest(double[], long[])} for details
452      * on how the p-value is computed.  The degrees of of freedom used to
453      * perform the test is one less than the common length of the input observed
454      * count arrays.</p>
455      *
456      * <p><strong>Preconditions</strong>:</p>
457      * <ul> <li>Observed counts must be non-negative. </li>
458      * <li>Observed counts for a specific bin must not both be zero. </li>
459      * <li>Observed counts for a specific sample must not all be 0. </li>
460      * <li>The arrays {@code observed1} and {@code observed2} must
461      * have the same length and their common length must be at least 2. </li>
462      * </ul>
463      * <p> If any of the preconditions are not met, a
464      * {@code MathIllegalArgumentException} is thrown.</p>
465      *
466      * @param observed1 array of observed frequency counts of the first data set
467      * @param observed2 array of observed frequency counts of the second data
468      * set
469      * @return p-value
470      * @throws MathIllegalArgumentException the the length of the arrays does not
471      * match or their common length is less than 2
472      * @throws MathIllegalArgumentException if any of the entries in {@code observed1} or
473      * {@code observed2} are negative
474      * @throws MathIllegalArgumentException if either all counts of {@code observed1} or
475      * {@code observed2} are zero, or if the count at some index is
476      * zero for both arrays
477      * @throws MathIllegalStateException if an error occurs computing the
478      * p-value.
479      */
480     public double gTestDataSetsComparison(final long[] observed1,
481             final long[] observed2)
482             throws MathIllegalArgumentException,
483             MathIllegalStateException {
484 
485         final ChiSquaredDistribution distribution =
486                 new ChiSquaredDistribution((double) observed1.length - 1);
487         return 1 - distribution.cumulativeProbability(
488                 gDataSetsComparison(observed1, observed2));
489     }
490 
491     /**
492      * <p>Performs a G-Test (Log-Likelihood Ratio Test) comparing two binned
493      * data sets. The test evaluates the null hypothesis that the two lists
494      * of observed counts conform to the same frequency distribution, with
495      * significance level {@code alpha}. Returns true iff the null
496      * hypothesis can be rejected  with 100 * (1 - alpha) percent confidence.
497      * </p>
498      * <p>See {@link #gDataSetsComparison(long[], long[])} for details
499      * on the formula used to compute the G (LLR) statistic used in the test and
500      * {@link #gTest(double[], long[])} for information on how
501      * the observed significance level is computed. The degrees of of freedom used
502      * to perform the test is one less than the common length of the input observed
503      * count arrays. </p>
504      *
505      * <p><strong>Preconditions</strong>:</p>
506      * <ul>
507      * <li>Observed counts must be non-negative. </li>
508      * <li>Observed counts for a specific bin must not both be zero. </li>
509      * <li>Observed counts for a specific sample must not all be 0. </li>
510      * <li>The arrays {@code observed1} and {@code observed2} must
511      * have the same length and their common length must be at least 2. </li>
512      * <li>{@code 0 < alpha < 0.5} </li></ul>
513      *
514      * <p>If any of the preconditions are not met, a
515      * {@code MathIllegalArgumentException} is thrown.</p>
516      *
517      * @param observed1 array of observed frequency counts of the first data set
518      * @param observed2 array of observed frequency counts of the second data
519      * set
520      * @param alpha significance level of the test
521      * @return true iff null hypothesis can be rejected with confidence 1 -
522      * alpha
523      * @throws MathIllegalArgumentException the the length of the arrays does not
524      * match
525      * @throws MathIllegalArgumentException if any of the entries in {@code observed1} or
526      * {@code observed2} are negative
527      * @throws MathIllegalArgumentException if either all counts of {@code observed1} or
528      * {@code observed2} are zero, or if the count at some index is
529      * zero for both arrays
530      * @throws MathIllegalArgumentException if {@code alpha} is not in the range
531      * (0, 0.5]
532      * @throws MathIllegalStateException if an error occurs performing the test
533      */
534     public boolean gTestDataSetsComparison(
535             final long[] observed1,
536             final long[] observed2,
537             final double alpha)
538             throws MathIllegalArgumentException, MathIllegalStateException {
539 
540         if (alpha <= 0 || alpha > 0.5) {
541             throw new MathIllegalArgumentException(
542                     LocalizedStatFormats.OUT_OF_BOUND_SIGNIFICANCE_LEVEL, alpha, 0, 0.5);
543         }
544         return gTestDataSetsComparison(observed1, observed2) < alpha;
545     }
546 }