View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      https://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  /*
19   * This is not the original file distributed by the Apache Software Foundation
20   * It has been modified by the Hipparchus project
21   */
22  package org.hipparchus.stat.inference;
23  
24  import java.util.Map;
25  import java.util.TreeMap;
26  import java.util.stream.LongStream;
27  
28  import org.hipparchus.distribution.continuous.NormalDistribution;
29  import org.hipparchus.exception.LocalizedCoreFormats;
30  import org.hipparchus.exception.MathIllegalArgumentException;
31  import org.hipparchus.exception.MathIllegalStateException;
32  import org.hipparchus.exception.NullArgumentException;
33  import org.hipparchus.stat.LocalizedStatFormats;
34  import org.hipparchus.stat.ranking.NaNStrategy;
35  import org.hipparchus.stat.ranking.NaturalRanking;
36  import org.hipparchus.stat.ranking.TiesStrategy;
37  import org.hipparchus.util.FastMath;
38  import org.hipparchus.util.Precision;
39  
40  /**
41   * An implementation of the Mann-Whitney U test.
42   * <p>
43   * The definitions and computing formulas used in this implementation follow
44   * those in the article,
45   * <a href="http://en.wikipedia.org/wiki/Mann%E2%80%93Whitney_U"> Mann-Whitney U
46   * Test</a>
47   * <p>
48   * In general, results correspond to (and have been tested against) the R
49   * wilcox.test function, with {@code exact} meaning the same thing in both APIs
50   * and {@code CORRECT} uniformly true in this implementation. For example,
51   * wilcox.test(x, y, alternative = "two.sided", mu = 0, paired = FALSE, exact = FALSE
52   * correct = TRUE) will return the same p-value as mannWhitneyUTest(x, y,
53   * false). The minimum of the W value returned by R for wilcox.test(x, y...) and
54   * wilcox.test(y, x...) should equal mannWhitneyU(x, y...).
55   */
56  public class MannWhitneyUTest { // NOPMD - this is not a Junit test class, PMD false positive here
57  
58      /**
59       * If the combined dataset contains no more values than this, test defaults to
60       * exact test.
61       */
62      private static final int SMALL_SAMPLE_SIZE = 50;
63  
64      /** Ranking algorithm. */
65      private final NaturalRanking naturalRanking;
66  
67      /** Normal distribution */
68      private final NormalDistribution standardNormal;
69  
70      /**
71       * Create a test instance using where NaN's are left in place and ties get
72       * the average of applicable ranks.
73       */
74      public MannWhitneyUTest() {
75          naturalRanking = new NaturalRanking(NaNStrategy.FIXED,
76                                              TiesStrategy.AVERAGE);
77          standardNormal = new NormalDistribution(0, 1);
78      }
79  
80      /**
81       * Create a test instance using the given strategies for NaN's and ties.
82       *
83       * @param nanStrategy specifies the strategy that should be used for
84       *        Double.NaN's
85       * @param tiesStrategy specifies the strategy that should be used for ties
86       */
87      public MannWhitneyUTest(final NaNStrategy nanStrategy,
88                              final TiesStrategy tiesStrategy) {
89          naturalRanking = new NaturalRanking(nanStrategy, tiesStrategy);
90          standardNormal = new NormalDistribution(0, 1);
91      }
92  
93      /**
94       * Computes the
95       * <a href="http://en.wikipedia.org/wiki/Mann%E2%80%93Whitney_U">
96       * Mann-Whitney U statistic</a> comparing means for two independent samples
97       * possibly of different lengths.
98       * <p>
99       * This statistic can be used to perform a Mann-Whitney U test evaluating
100      * the null hypothesis that the two independent samples have equal mean.
101      * <p>
102      * Let X<sub>i</sub> denote the i'th individual of the first sample and
103      * Y<sub>j</sub> the j'th individual in the second sample. Note that the
104      * samples can have different lengths.
105      * <p>
106      * <strong>Preconditions</strong>:
107      * <ul>
108      * <li>All observations in the two samples are independent.</li>
109      * <li>The observations are at least ordinal (continuous are also
110      * ordinal).</li>
111      * </ul>
112      *
113      * @param x the first sample
114      * @param y the second sample
115      * @return Mann-Whitney U statistic (minimum of U<sup>x</sup> and
116      *         U<sup>y</sup>)
117      * @throws NullArgumentException if {@code x} or {@code y} are {@code null}.
118      * @throws MathIllegalArgumentException if {@code x} or {@code y} are
119      *         zero-length.
120      */
121     public double mannWhitneyU(final double[] x, final double[] y)
122         throws MathIllegalArgumentException, NullArgumentException {
123 
124         ensureDataConformance(x, y);
125 
126         final double[] z = concatenateSamples(x, y);
127         final double[] ranks = naturalRanking.rank(z);
128 
129         double sumRankX = 0;
130 
131         /*
132          * The ranks for x is in the first x.length entries in ranks because x
133          * is in the first x.length entries in z
134          */
135         for (int i = 0; i < x.length; ++i) {
136             sumRankX += ranks[i];
137         }
138 
139         /*
140          * U1 = R1 - (n1 * (n1 + 1)) / 2 where R1 is sum of ranks for sample 1,
141          * e.g. x, n1 is the number of observations in sample 1.
142          */
143         final double U1 = sumRankX - ((long) x.length * (x.length + 1)) / 2;
144 
145         /*
146          * U1 + U2 = n1 * n2
147          */
148         final double U2 = (long) x.length * y.length - U1;
149 
150         return FastMath.min(U1, U2);
151     }
152 
153     /**
154      * Concatenate the samples into one array.
155      *
156      * @param x first sample
157      * @param y second sample
158      * @return concatenated array
159      */
160     private double[] concatenateSamples(final double[] x, final double[] y) {
161         final double[] z = new double[x.length + y.length];
162 
163         System.arraycopy(x, 0, z, 0, x.length);
164         System.arraycopy(y, 0, z, x.length, y.length);
165 
166         return z;
167     }
168 
169     /**
170      * Returns the asymptotic <i>observed significance level</i>, or
171      * <a href="http://www.cas.lancs.ac.uk/glossary_v1.1/hyptest.html#pvalue">
172      * p-value</a>, associated with a <a href=
173      * "http://en.wikipedia.org/wiki/Mann%E2%80%93Whitney_U">Mann-Whitney U
174      * Test</a> comparing means for two independent samples.
175      * <p>
176      * Let X<sub>i</sub> denote the i'th individual of the first sample and
177      * Y<sub>j</sub> the j'th individual in the second sample.
178      * <p>
179      * <strong>Preconditions</strong>:
180      * <ul>
181      * <li>All observations in the two samples are independent.</li>
182      * <li>The observations are at least ordinal.</li>
183      * </ul>
184      * <p>
185      * If there are no ties in the data and both samples are small (less than or
186      * equal to 50 values in the combined dataset), an exact test is performed;
187      * otherwise the test uses the normal approximation (with continuity
188      * correction).
189      * <p>
190      * If the combined dataset contains ties, the variance used in the normal
191      * approximation is bias-adjusted using the formula in the reference above.
192      *
193      * @param x the first sample
194      * @param y the second sample
195      * @return approximate 2-sized p-value
196      * @throws NullArgumentException if {@code x} or {@code y} are {@code null}.
197      * @throws MathIllegalArgumentException if {@code x} or {@code y} are
198      *         zero-length
199      */
200     public double mannWhitneyUTest(final double[] x, final double[] y)
201         throws MathIllegalArgumentException, NullArgumentException {
202         ensureDataConformance(x, y);
203 
204         // If samples are both small and there are no ties, perform exact test
205         if (x.length + y.length <= SMALL_SAMPLE_SIZE &&
206             tiesMap(x, y).isEmpty()) {
207             return mannWhitneyUTest(x, y, true);
208         } else { // Normal approximation
209             return mannWhitneyUTest(x, y, false);
210         }
211     }
212 
213     /**
214      * Returns the asymptotic <i>observed significance level</i>, or
215      * <a href="http://www.cas.lancs.ac.uk/glossary_v1.1/hyptest.html#pvalue">
216      * p-value</a>, associated with a <a href=
217      * "http://en.wikipedia.org/wiki/Mann%E2%80%93Whitney_U">Mann-Whitney U
218      * Test</a> comparing means for two independent samples.
219      * <p>
220      * Let X<sub>i</sub> denote the i'th individual of the first sample and
221      * Y<sub>j</sub> the j'th individual in the second sample.
222      * <p>
223      * <strong>Preconditions</strong>:
224      * <ul>
225      * <li>All observations in the two samples are independent.</li>
226      * <li>The observations are at least ordinal.</li>
227      * </ul>
228      * <p>
229      * If {@code exact} is {@code true}, the p-value reported is exact, computed
230      * using the exact distribution of the U statistic. The computation in this
231      * case requires storage on the order of the product of the two sample
232      * sizes, so this should not be used for large samples.
233      * <p>
234      * If {@code exact} is {@code false}, the normal approximation is used to
235      * estimate the p-value.
236      * <p>
237      * If the combined dataset contains ties and {@code exact} is {@code true},
238      * MathIllegalArgumentException is thrown. If {@code exact} is {@code false}
239      * and the ties are present, the variance used to compute the approximate
240      * p-value in the normal approximation is bias-adjusted using the formula in
241      * the reference above.
242      *
243      * @param x the first sample
244      * @param y the second sample
245      * @param exact true means compute the p-value exactly, false means use the
246      *        normal approximation
247      * @return approximate 2-sided p-value
248      * @throws NullArgumentException if {@code x} or {@code y} are {@code null}.
249      * @throws MathIllegalArgumentException if {@code x} or {@code y} are
250      *         zero-length or if {@code exact} is {@code true} and ties are
251      *         present in the data
252      */
253     public double mannWhitneyUTest(final double[] x, final double[] y,
254                                    final boolean exact)
255         throws MathIllegalArgumentException, NullArgumentException {
256         ensureDataConformance(x, y);
257         final Map<Double, Integer> tiesMap = tiesMap(x, y);
258         final double u = mannWhitneyU(x, y);
259         if (exact) {
260             if (!tiesMap.isEmpty()) {
261                 throw new MathIllegalArgumentException(LocalizedStatFormats.TIES_ARE_NOT_ALLOWED);
262             }
263             return exactP(x.length, y.length, u);
264         }
265 
266         return approximateP(u, x.length, y.length,
267                             varU(x.length, y.length, tiesMap));
268     }
269 
270     /**
271      * Ensures that the provided arrays fulfills the assumptions.
272      *
273      * @param x first sample
274      * @param y second sample
275      * @throws NullArgumentException if {@code x} or {@code y} are {@code null}.
276      * @throws MathIllegalArgumentException if {@code x} or {@code y} are
277      *         zero-length.
278      */
279     private void ensureDataConformance(final double[] x, final double[] y)
280         throws MathIllegalArgumentException, NullArgumentException {
281 
282         if (x == null || y == null) {
283             throw new NullArgumentException();
284         }
285         if (x.length == 0 || y.length == 0) {
286             throw new MathIllegalArgumentException(LocalizedCoreFormats.NO_DATA);
287         }
288     }
289 
290     /**
291      * Estimates the 2-sided p-value associated with a Mann-Whitney U statistic
292      * value using the normal approximation.
293      * <p>
294      * The variance passed in is assumed to be corrected for ties. Continuity
295      * correction is applied to the normal approximation.
296      *
297      * @param u Mann-Whitney U statistic
298      * @param n1 number of subjects in first sample
299      * @param n2 number of subjects in second sample
300      * @param varU variance of U (corrected for ties if these exist)
301      * @return two-sided asymptotic p-value
302      * @throws MathIllegalStateException if the p-value can not be computed due
303      *         to a convergence error
304      * @throws MathIllegalStateException if the maximum number of iterations is
305      *         exceeded
306      */
307     private double approximateP(final double u, final int n1, final int n2,
308                                 final double varU)
309         throws MathIllegalStateException {
310 
311         final double mu = (long) n1 * n2 / 2.0;
312 
313         // If u == mu, return 1
314         if (Precision.equals(mu, u)) {
315             return 1;
316         }
317 
318         // Force z <= 0 so we get tail probability. Also apply continuity
319         // correction
320         final double z = -Math.abs((u - mu) + 0.5) / FastMath.sqrt(varU);
321 
322         return 2 * standardNormal.cumulativeProbability(z);
323     }
324 
325     /**
326      * Calculates the (2-sided) p-value associated with a Mann-Whitney U
327      * statistic.
328      * <p>
329      * To compute the p-value, the probability densities for each value of U up
330      * to and including u are summed and the resulting tail probability is
331      * multiplied by 2.
332      * <p>
333      * The result of this computation is only valid when the combined n + m
334      * sample has no tied values.
335      * <p>
336      * This method should not be used for large values of n or m as it maintains
337      * work arrays of size n*m.
338      *
339      * @param u Mann-Whitney U statistic value
340      * @param n first sample size
341      * @param m second sample size
342      * @return two-sided exact p-value
343      */
344     private double exactP(final int n, final int m, final double u) {
345         final double nm = m * n;
346         if (u > nm) { // Quick exit if u is out of range
347             return 1;
348         }
349         // Need to convert u to a mean deviation, so cumulative probability is
350         // tail probability
351         final double crit = u < nm / 2 ? u : nm / 2 - u;
352 
353         double cum = 0d;
354         for (int ct = 0; ct <= crit; ct++) {
355             cum += uDensity(n, m, ct);
356         }
357         return 2 * cum;
358     }
359 
360     /**
361      * Computes the probability density function for the Mann-Whitney U
362      * statistic.
363      * <p>
364      * This method should not be used for large values of n or m as it maintains
365      * work arrays of size n*m.
366      *
367      * @param n first sample size
368      * @param m second sample size
369      * @param u U-statistic value
370      * @return the probability that a U statistic derived from random samples of
371      *         size n and m (containing no ties) equals u
372      */
373     private double uDensity(final int n, final int m, double u) {
374         if (u < 0 || u > m * n) {
375             return 0;
376         }
377         final long[] freq = uFrequencies(n, m);
378         return freq[(int) FastMath.round(u + 1)] /
379                (double) LongStream.of(freq).sum();
380     }
381 
382     /**
383      * Computes frequency counts for values of the Mann-Whitney U statistc. If
384      * freq[] is the returned array, freq[u + 1] counts the frequency of U = u
385      * among all possible n-m orderings. Therefore, P(u = U) = freq[u + 1] / sum
386      * where sum is the sum of the values in the returned array.
387      * <p>
388      * Implements the algorithm presented in "Algorithm AS 62: A Generator for
389      * the Sampling Distribution of the Mann-Whitney U Statistic", L. C. Dinneen
390      * and B. C. Blakesley Journal of the Royal Statistical Society. Series C
391      * (Applied Statistics) Vol. 22, No. 2 (1973), pp. 269-273.
392      *
393      * @param n first sample size
394      * @param m second sample size
395      * @return array of U statistic value frequencies
396      */
397     private long[] uFrequencies(final int n, final int m) {
398         final int max = FastMath.max(m, n);
399         if (max > 100) {
400             throw new MathIllegalArgumentException(LocalizedCoreFormats.NUMBER_TOO_LARGE,
401                                                    max, 100);
402         }
403         final int min = FastMath.min(m, n);
404         final long[] out = new long[n * m + 2];
405         final long[] work = new long[n * m + 2];
406         for (int i = 1; i < out.length; i++) {
407             out[i] = (i <= (max + 1)) ? 1 : 0;
408         }
409         work[1] = 0;
410         int in = max;
411         for (int i = 2; i <= min; i++) {
412             work[i] = 0;
413             in = in + max;
414             int n1 = in + 2;
415             long l = 1 + in / 2;
416             int k = i;
417             for (int j = 1; j <= l; j++) {
418                 k++;
419                 n1 = n1 - 1;
420                 final long sum = out[j] + work[j];
421                 out[j] = sum;
422                 work[k] = sum - out[n1];
423                 out[n1] = sum;
424             }
425         }
426         return out;
427     }
428 
429     /**
430      * Computes the variance for a U-statistic associated with samples of
431      * sizes{@code n} and {@code m} and ties described by {@code tiesMap}. If
432      * {@code tiesMap} is non-empty, the multiplicity counts in its values set
433      * are used to adjust the variance.
434      *
435      * @param n first sample size
436      * @param m second sample size
437      * @param tiesMap map of <value, multiplicity>
438      * @return ties-adjusted variance
439      */
440     private double varU(final int n, final int m,
441                         Map<Double, Integer> tiesMap) {
442         final double nm = (long) n * m;
443         if (tiesMap.isEmpty()) {
444             return nm * (n + m + 1) / 12.0;
445         }
446         final long tSum = tiesMap.entrySet().stream()
447             .mapToLong(e -> e.getValue() * e.getValue() * e.getValue() -
448                             e.getValue())
449             .sum();
450         final double totalN = n + m;
451         return (nm / 12) * (totalN + 1 - tSum / (totalN * (totalN - 1)));
452 
453     }
454 
455     /**
456      * Creates a map whose keys are values occurring more than once in the
457      * combined dataset formed from x and y. Map entry values are the number of
458      * occurrences. The returned map is empty iff there are no ties in the data.
459      *
460      * @param x first dataset
461      * @param y second dataset
462      * @return map of <value, number of times it occurs> for values occurring
463      *         more than once or an empty map if there are no ties (the returned
464      *         map is <em>not</em> thread-safe, which is OK in the context of the callers)
465      */
466     private Map<Double, Integer> tiesMap(final double[] x, final double[] y) {
467         final Map<Double, Integer> tiesMap = new TreeMap<>(); // NOPMD - no concurrent access in the callers context
468         for (int i = 0; i < x.length; i++) {
469             tiesMap.merge(x[i], 1, Integer::sum);
470         }
471         for (int i = 0; i < y.length; i++) {
472             tiesMap.merge(y[i], 1, Integer::sum);
473         }
474         tiesMap.entrySet().removeIf(e -> e.getValue() == 1);
475         return tiesMap;
476     }
477 }