1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * https://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
18 /*
19 * This is not the original file distributed by the Apache Software Foundation
20 * It has been modified by the Hipparchus project
21 */
22
23 package org.hipparchus.stat.inference;
24
25 import java.math.RoundingMode;
26 import java.util.Arrays;
27 import java.util.HashSet;
28
29 import org.hipparchus.distribution.RealDistribution;
30 import org.hipparchus.exception.LocalizedCoreFormats;
31 import org.hipparchus.exception.MathIllegalArgumentException;
32 import org.hipparchus.exception.MathIllegalStateException;
33 import org.hipparchus.exception.MathRuntimeException;
34 import org.hipparchus.fraction.BigFraction;
35 import org.hipparchus.fraction.BigFractionField;
36 import org.hipparchus.linear.Array2DRowFieldMatrix;
37 import org.hipparchus.linear.FieldMatrix;
38 import org.hipparchus.linear.MatrixUtils;
39 import org.hipparchus.linear.RealMatrix;
40 import org.hipparchus.random.RandomDataGenerator;
41 import org.hipparchus.random.RandomGenerator;
42 import org.hipparchus.stat.LocalizedStatFormats;
43 import org.hipparchus.util.FastMath;
44 import org.hipparchus.util.MathArrays;
45 import org.hipparchus.util.MathUtils;
46
47 /**
48 * Implementation of the <a href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test">
49 * Kolmogorov-Smirnov (K-S) test</a> for equality of continuous distributions.
50 * <p>
51 * The K-S test uses a statistic based on the maximum deviation of the empirical distribution of
52 * sample data points from the distribution expected under the null hypothesis. For one-sample tests
53 * evaluating the null hypothesis that a set of sample data points follow a given distribution, the
54 * test statistic is \(D_n=\sup_x |F_n(x)-F(x)|\), where \(F\) is the expected distribution and
55 * \(F_n\) is the empirical distribution of the \(n\) sample data points. The distribution of
56 * \(D_n\) is estimated using a method based on [1] with certain quick decisions for extreme values
57 * given in [2].
58 * <p>
59 * Two-sample tests are also supported, evaluating the null hypothesis that the two samples
60 * {@code x} and {@code y} come from the same underlying distribution. In this case, the test
61 * statistic is \(D_{n,m}=\sup_t | F_n(t)-F_m(t)|\) where \(n\) is the length of {@code x}, \(m\) is
62 * the length of {@code y}, \(F_n\) is the empirical distribution that puts mass \(1/n\) at each of
63 * the values in {@code x} and \(F_m\) is the empirical distribution of the {@code y} values. The
64 * default 2-sample test method, {@link #kolmogorovSmirnovTest(double[], double[])} works as
65 * follows:
66 * <ul>
67 * <li>For small samples (where the product of the sample sizes is less than
68 * {@link #LARGE_SAMPLE_PRODUCT}), the method presented in [4] is used to compute the
69 * exact p-value for the 2-sample test.</li>
70 * <li>When the product of the sample sizes exceeds {@link #LARGE_SAMPLE_PRODUCT}, the asymptotic
71 * distribution of \(D_{n,m}\) is used. See {@link #approximateP(double, int, int)} for details on
72 * the approximation.</li>
73 * </ul>
74 * <p>
75 * If the product of the sample sizes is less than {@link #LARGE_SAMPLE_PRODUCT} and the sample
76 * data contains ties, random jitter is added to the sample data to break ties before applying
77 * the algorithm above. Alternatively, the {@link #bootstrap(double[], double[], int, boolean)}
78 * method, modeled after <a href="http://sekhon.berkeley.edu/matching/ks.boot.html">ks.boot</a>
79 * in the R Matching package [3], can be used if ties are known to be present in the data.
80 * <p>
81 * In the two-sample case, \(D_{n,m}\) has a discrete distribution. This makes the p-value
82 * associated with the null hypothesis \(H_0 : D_{n,m} \ge d \) differ from \(H_0 : D_{n,m} > d \)
83 * by the mass of the observed value \(d\). To distinguish these, the two-sample tests use a boolean
84 * {@code strict} parameter. This parameter is ignored for large samples.
85 * <p>
86 * The methods used by the 2-sample default implementation are also exposed directly:
87 * <ul>
88 * <li>{@link #exactP(double, int, int, boolean)} computes exact 2-sample p-values</li>
 * <li>{@link #approximateP(double, int, int)} uses the asymptotic distribution. The {@code boolean}
90 * arguments in the first two methods allow the probability used to estimate the p-value to be
91 * expressed using strict or non-strict inequality. See
92 * {@link #kolmogorovSmirnovTest(double[], double[], boolean)}.</li>
93 * </ul>
94 * <p>
95 * References:
96 * <ul>
97 * <li>[1] <a href="http://www.jstatsoft.org/v08/i18/"> Evaluating Kolmogorov's Distribution</a> by
98 * George Marsaglia, Wai Wan Tsang, and Jingbo Wang</li>
99 * <li>[2] <a href="http://www.jstatsoft.org/v39/i11/"> Computing the Two-Sided Kolmogorov-Smirnov
100 * Distribution</a> by Richard Simard and Pierre L'Ecuyer</li>
101 * <li>[3] Jasjeet S. Sekhon. 2011. <a href="http://www.jstatsoft.org/article/view/v042i07">
102 * Multivariate and Propensity Score Matching Software with Automated Balance Optimization:
103 * The Matching package for R</a> Journal of Statistical Software, 42(7): 1-52.</li>
104 * <li>[4] Kim, P. J. and Jennrich, R. I. (1970). Tables of the Exact Sampling Distribution of the
 * Two-Sample Kolmogorov-Smirnov Criterion D_mn, m &le; n in Selected Tables in Mathematical Statistics,
106 * Vol. 1, H. L. Harter and D. B. Owen, editors.</li>
107 * </ul>
108 * <p>
109 * Note that [1] contains an error in computing h, refer to <a
110 * href="https://issues.apache.org/jira/browse/MATH-437">MATH-437</a> for details.
111 */
112 public class KolmogorovSmirnovTest { // NOPMD - this is not a Junit test class, PMD false positive here
113
114 /**
115 * Bound on the number of partial sums in {@link #ksSum(double, double, int)}
116 */
117 protected static final int MAXIMUM_PARTIAL_SUM_COUNT = 100000;
118
119 /** Convergence criterion for {@link #ksSum(double, double, int)} */
120 protected static final double KS_SUM_CAUCHY_CRITERION = 1E-20;
121
122 /** Convergence criterion for the sums in #pelzGood(double, double, int)} */
123 protected static final double PG_SUM_RELATIVE_ERROR = 1.0e-10;
124
125 /**
126 * When product of sample sizes exceeds this value, 2-sample K-S test uses asymptotic
127 * distribution to compute the p-value.
128 */
129 protected static final int LARGE_SAMPLE_PRODUCT = 10000;
130
131 /**
132 * RandomDataGenerator used by {@link #bootstrap(double[], double[], int)}
133 * or to generate jitter to break ties in the data.
134 */
135 private final RandomDataGenerator gen = new RandomDataGenerator();
136
137 /**
138 * Construct a KolmogorovSmirnovTest instance.
139 */
140 public KolmogorovSmirnovTest() {
141 super();
142 }
143
144 /**
145 * Construct a KolmogorovSmirnovTest instance providing a seed for the PRNG
146 * used by the {@link #bootstrap(double[], double[], int)} method.
147 *
148 * @param seed the seed for the PRNG
149 */
150 public KolmogorovSmirnovTest(long seed) {
151 super();
152 gen.setSeed(seed);
153 }
154
155 /**
156 * Computes the <i>p-value</i>, or <i>observed significance level</i>, of a one-sample <a
157 * href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test"> Kolmogorov-Smirnov test</a>
158 * evaluating the null hypothesis that {@code data} conforms to {@code distribution}. If
159 * {@code exact} is true, the distribution used to compute the p-value is computed using
160 * extended precision. See {@link #cdfExact(double, int)}.
161 *
162 * @param distribution reference distribution
163 * @param data sample being being evaluated
164 * @param exact whether or not to force exact computation of the p-value
165 * @return the p-value associated with the null hypothesis that {@code data} is a sample from
166 * {@code distribution}
167 * @throws MathIllegalArgumentException if {@code data} does not have length at least 2
168 * @throws org.hipparchus.exception.NullArgumentException if {@code data} is null
169 */
170 public double kolmogorovSmirnovTest(RealDistribution distribution, double[] data, boolean exact) {
171 return 1d - cdf(kolmogorovSmirnovStatistic(distribution, data), data.length, exact);
172 }
173
174 /**
175 * Computes the one-sample Kolmogorov-Smirnov test statistic, \(D_n=\sup_x |F_n(x)-F(x)|\) where
176 * \(F\) is the distribution (cdf) function associated with {@code distribution}, \(n\) is the
177 * length of {@code data} and \(F_n\) is the empirical distribution that puts mass \(1/n\) at
178 * each of the values in {@code data}.
179 *
180 * @param distribution reference distribution
181 * @param data sample being evaluated
182 * @return Kolmogorov-Smirnov statistic \(D_n\)
183 * @throws MathIllegalArgumentException if {@code data} does not have length at least 2
184 * @throws org.hipparchus.exception.NullArgumentException if {@code data} is null
185 */
186 public double kolmogorovSmirnovStatistic(RealDistribution distribution, double[] data) {
187 checkArray(data);
188 final int n = data.length;
189 final double nd = n;
190 final double[] dataCopy = new double[n];
191 System.arraycopy(data, 0, dataCopy, 0, n);
192 Arrays.sort(dataCopy);
193 double d = 0d;
194 for (int i = 1; i <= n; i++) {
195 final double yi = distribution.cumulativeProbability(dataCopy[i - 1]);
196 final double currD = FastMath.max(yi - (i - 1) / nd, i / nd - yi);
197 if (currD > d) {
198 d = currD;
199 }
200 }
201 return d;
202 }
203
204 /**
205 * Computes the <i>p-value</i>, or <i>observed significance level</i>, of a two-sample <a
206 * href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test"> Kolmogorov-Smirnov test</a>
207 * evaluating the null hypothesis that {@code x} and {@code y} are samples drawn from the same
208 * probability distribution. Specifically, what is returned is an estimate of the probability
209 * that the {@link #kolmogorovSmirnovStatistic(double[], double[])} associated with a randomly
210 * selected partition of the combined sample into subsamples of sizes {@code x.length} and
211 * {@code y.length} will strictly exceed (if {@code strict} is {@code true}) or be at least as
212 * large as {@code strict = false}) as {@code kolmogorovSmirnovStatistic(x, y)}.
213 * <ul>
214 * <li>For small samples (where the product of the sample sizes is less than
215 * {@link #LARGE_SAMPLE_PRODUCT}), the exact p-value is computed using the method presented
216 * in [4], implemented in {@link #exactP(double, int, int, boolean)}. </li>
217 * <li>When the product of the sample sizes exceeds {@link #LARGE_SAMPLE_PRODUCT}, the
218 * asymptotic distribution of \(D_{n,m}\) is used. See {@link #approximateP(double, int, int)}
219 * for details on the approximation.</li>
220 * </ul><p>
221 * If {@code x.length * y.length} < {@link #LARGE_SAMPLE_PRODUCT} and the combined set of values in
222 * {@code x} and {@code y} contains ties, random jitter is added to {@code x} and {@code y} to
223 * break ties before computing \(D_{n,m}\) and the p-value. The jitter is uniformly distributed
224 * on (-minDelta / 2, minDelta / 2) where minDelta is the smallest pairwise difference between
225 * values in the combined sample.</p>
226 * <p>
227 * If ties are known to be present in the data, {@link #bootstrap(double[], double[], int, boolean)}
228 * may be used as an alternative method for estimating the p-value.</p>
229 *
230 * @param x first sample dataset
231 * @param y second sample dataset
232 * @param strict whether or not the probability to compute is expressed as a strict inequality
233 * (ignored for large samples)
234 * @return p-value associated with the null hypothesis that {@code x} and {@code y} represent
235 * samples from the same distribution
236 * @throws MathIllegalArgumentException if either {@code x} or {@code y} does not have length at
237 * least 2
238 * @throws org.hipparchus.exception.NullArgumentException if either {@code x} or {@code y} is null
239 * @see #bootstrap(double[], double[], int, boolean)
240 */
241 public double kolmogorovSmirnovTest(double[] x, double[] y, boolean strict) {
242 final long lengthProduct = (long) x.length * y.length;
243 final double[] xa;
244 final double[] ya;
245 if (lengthProduct < LARGE_SAMPLE_PRODUCT && hasTies(x,y)) {
246 xa = x.clone();
247 ya = y.clone();
248 fixTies(xa, ya);
249 } else {
250 xa = x;
251 ya = y;
252 }
253 if (lengthProduct < LARGE_SAMPLE_PRODUCT) {
254 return exactP(kolmogorovSmirnovStatistic(xa, ya), x.length, y.length, strict);
255 }
256 return approximateP(kolmogorovSmirnovStatistic(x, y), x.length, y.length);
257 }
258
259 /**
260 * Computes the <i>p-value</i>, or <i>observed significance level</i>, of a two-sample <a
261 * href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test"> Kolmogorov-Smirnov test</a>
262 * evaluating the null hypothesis that {@code x} and {@code y} are samples drawn from the same
263 * probability distribution. Assumes the strict form of the inequality used to compute the
264 * p-value. See {@link #kolmogorovSmirnovTest(RealDistribution, double[], boolean)}.
265 *
266 * @param x first sample dataset
267 * @param y second sample dataset
268 * @return p-value associated with the null hypothesis that {@code x} and {@code y} represent
269 * samples from the same distribution
270 * @throws MathIllegalArgumentException if either {@code x} or {@code y} does not have length at
271 * least 2
272 * @throws org.hipparchus.exception.NullArgumentException if either {@code x} or {@code y} is null
273 */
274 public double kolmogorovSmirnovTest(double[] x, double[] y) {
275 return kolmogorovSmirnovTest(x, y, true);
276 }
277
278 /**
279 * Computes the two-sample Kolmogorov-Smirnov test statistic, \(D_{n,m}=\sup_x |F_n(x)-F_m(x)|\)
280 * where \(n\) is the length of {@code x}, \(m\) is the length of {@code y}, \(F_n\) is the
281 * empirical distribution that puts mass \(1/n\) at each of the values in {@code x} and \(F_m\)
282 * is the empirical distribution of the {@code y} values.
283 *
284 * @param x first sample
285 * @param y second sample
286 * @return test statistic \(D_{n,m}\) used to evaluate the null hypothesis that {@code x} and
287 * {@code y} represent samples from the same underlying distribution
288 * @throws MathIllegalArgumentException if either {@code x} or {@code y} does not have length at
289 * least 2
290 * @throws org.hipparchus.exception.NullArgumentException if either {@code x} or {@code y} is null
291 */
292 public double kolmogorovSmirnovStatistic(double[] x, double[] y) {
293 return integralKolmogorovSmirnovStatistic(x, y)/((double)(x.length * (long)y.length));
294 }
295
296 /**
297 * Computes the two-sample Kolmogorov-Smirnov test statistic, \(D_{n,m}=\sup_x |F_n(x)-F_m(x)|\)
298 * where \(n\) is the length of {@code x}, \(m\) is the length of {@code y}, \(F_n\) is the
299 * empirical distribution that puts mass \(1/n\) at each of the values in {@code x} and \(F_m\)
300 * is the empirical distribution of the {@code y} values. Finally \(n m D_{n,m}\) is returned
301 * as long value.
302 *
303 * @param x first sample
304 * @param y second sample
305 * @return test statistic \(n m D_{n,m}\) used to evaluate the null hypothesis that {@code x} and
306 * {@code y} represent samples from the same underlying distribution
307 * @throws MathIllegalArgumentException if either {@code x} or {@code y} does not have length at
308 * least 2
309 * @throws org.hipparchus.exception.NullArgumentException if either {@code x} or {@code y} is null
310 */
311 private long integralKolmogorovSmirnovStatistic(double[] x, double[] y) {
312 checkArray(x);
313 checkArray(y);
314 // Copy and sort the sample arrays
315 final double[] sx = x.clone();
316 final double[] sy = y.clone();
317 Arrays.sort(sx);
318 Arrays.sort(sy);
319 final int n = sx.length;
320 final int m = sy.length;
321
322 int rankX = 0;
323 int rankY = 0;
324 long curD = 0l;
325
326 // Find the max difference between cdf_x and cdf_y
327 long supD = 0l;
328 do {
329 double z = Double.compare(sx[rankX], sy[rankY]) <= 0 ? sx[rankX] : sy[rankY];
330 while(rankX < n && Double.compare(sx[rankX], z) == 0) {
331 rankX += 1;
332 curD += m;
333 }
334 while(rankY < m && Double.compare(sy[rankY], z) == 0) {
335 rankY += 1;
336 curD -= n;
337 }
338 if (curD > supD) {
339 supD = curD;
340 }
341 else if (-curD > supD) {
342 supD = -curD;
343 }
344 } while(rankX < n && rankY < m);
345 return supD;
346 }
347
348 /**
349 * Computes the <i>p-value</i>, or <i>observed significance level</i>, of a one-sample <a
350 * href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test"> Kolmogorov-Smirnov test</a>
351 * evaluating the null hypothesis that {@code data} conforms to {@code distribution}.
352 *
353 * @param distribution reference distribution
354 * @param data sample being being evaluated
355 * @return the p-value associated with the null hypothesis that {@code data} is a sample from
356 * {@code distribution}
357 * @throws MathIllegalArgumentException if {@code data} does not have length at least 2
358 * @throws org.hipparchus.exception.NullArgumentException if {@code data} is null
359 */
360 public double kolmogorovSmirnovTest(RealDistribution distribution, double[] data) {
361 return kolmogorovSmirnovTest(distribution, data, false);
362 }
363
364 /**
365 * Performs a <a href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test"> Kolmogorov-Smirnov
366 * test</a> evaluating the null hypothesis that {@code data} conforms to {@code distribution}.
367 *
368 * @param distribution reference distribution
369 * @param data sample being being evaluated
370 * @param alpha significance level of the test
371 * @return true iff the null hypothesis that {@code data} is a sample from {@code distribution}
372 * can be rejected with confidence 1 - {@code alpha}
373 * @throws MathIllegalArgumentException if {@code data} does not have length at least 2
374 * @throws org.hipparchus.exception.NullArgumentException if {@code data} is null
375 */
376 public boolean kolmogorovSmirnovTest(RealDistribution distribution, double[] data, double alpha) {
377 if ((alpha <= 0) || (alpha > 0.5)) {
378 throw new MathIllegalArgumentException(LocalizedStatFormats.OUT_OF_BOUND_SIGNIFICANCE_LEVEL, alpha, 0, 0.5);
379 }
380 return kolmogorovSmirnovTest(distribution, data) < alpha;
381 }
382
383 /**
384 * Estimates the <i>p-value</i> of a two-sample
385 * <a href="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test"> Kolmogorov-Smirnov test</a>
386 * evaluating the null hypothesis that {@code x} and {@code y} are samples drawn from the same
387 * probability distribution. This method estimates the p-value by repeatedly sampling sets of size
388 * {@code x.length} and {@code y.length} from the empirical distribution of the combined sample.
389 * When {@code strict} is true, this is equivalent to the algorithm implemented in the R function
390 * {@code ks.boot}, described in <pre>
391 * Jasjeet S. Sekhon. 2011. 'Multivariate and Propensity Score Matching
392 * Software with Automated Balance Optimization: The Matching package for R.'
393 * Journal of Statistical Software, 42(7): 1-52.
394 * </pre>
395 * @param x first sample
396 * @param y second sample
397 * @param iterations number of bootstrap resampling iterations
398 * @param strict whether or not the null hypothesis is expressed as a strict inequality
399 * @return estimated p-value
400 */
401 public double bootstrap(double[] x, double[] y, int iterations, boolean strict) {
402 final int xLength = x.length;
403 final int yLength = y.length;
404 final double[] combined = new double[xLength + yLength];
405 System.arraycopy(x, 0, combined, 0, xLength);
406 System.arraycopy(y, 0, combined, xLength, yLength);
407 final long d = integralKolmogorovSmirnovStatistic(x, y);
408 int greaterCount = 0;
409 int equalCount = 0;
410 double[] curX;
411 double[] curY;
412 long curD;
413 for (int i = 0; i < iterations; i++) {
414 curX = resample(combined, xLength);
415 curY = resample(combined, yLength);
416 curD = integralKolmogorovSmirnovStatistic(curX, curY);
417 if (curD > d) {
418 greaterCount++;
419 } else if (curD == d) {
420 equalCount++;
421 }
422 }
423 return strict ? greaterCount / (double) iterations :
424 (greaterCount + equalCount) / (double) iterations;
425 }
426
427 /**
428 * Computes {@code bootstrap(x, y, iterations, true)}.
429 * This is equivalent to ks.boot(x,y, nboots=iterations) using the R Matching
430 * package function. See #bootstrap(double[], double[], int, boolean).
431 *
432 * @param x first sample
433 * @param y second sample
434 * @param iterations number of bootstrap resampling iterations
435 * @return estimated p-value
436 */
437 public double bootstrap(double[] x, double[] y, int iterations) {
438 return bootstrap(x, y, iterations, true);
439 }
440
441 /**
442 * Return a bootstrap sample (with replacement) of size k from sample.
443 *
444 * @param sample array to sample from
445 * @param k size of bootstrap sample
446 * @return bootstrap sample
447 */
448 private double[] resample(double[] sample, int k) {
449 final int len = sample.length;
450 final double[] out = new double[k];
451 for (int i = 0; i < k; i++) {
452 out[i] = gen.nextInt(len);
453 }
454 return out;
455 }
456
457 /**
458 * Calculates {@code P(D_n < d)} using the method described in [1] with quick decisions for extreme
459 * values given in [2] (see above). The result is not exact as with
460 * {@link #cdfExact(double, int)} because calculations are based on
461 * {@code double} rather than {@link org.hipparchus.fraction.BigFraction}.
462 *
463 * @param d statistic
464 * @param n sample size
465 * @return \(P(D_n < d)\)
466 * @throws MathRuntimeException if algorithm fails to convert {@code h} to a
467 * {@link org.hipparchus.fraction.BigFraction} in expressing {@code d} as \((k
468 * - h) / m\) for integer {@code k, m} and \(0 <= h < 1\)
469 */
470 public double cdf(double d, int n)
471 throws MathRuntimeException {
472 return cdf(d, n, false);
473 }
474
475 /**
476 * Calculates {@code P(D_n < d)}. The result is exact in the sense that BigFraction/BigReal is
477 * used everywhere at the expense of very slow execution time. Almost never choose this in real
478 * applications unless you are very sure; this is almost solely for verification purposes.
479 * Normally, you would choose {@link #cdf(double, int)}. See the class
480 * javadoc for definitions and algorithm description.
481 *
482 * @param d statistic
483 * @param n sample size
484 * @return \(P(D_n < d)\)
485 * @throws MathRuntimeException if the algorithm fails to convert {@code h} to a
486 * {@link org.hipparchus.fraction.BigFraction} in expressing {@code d} as \((k
487 * - h) / m\) for integer {@code k, m} and \(0 <= h < 1\)
488 */
489 public double cdfExact(double d, int n)
490 throws MathRuntimeException {
491 return cdf(d, n, true);
492 }
493
494 /**
495 * Calculates {@code P(D_n < d)} using method described in [1] with quick decisions for extreme
496 * values given in [2] (see above).
497 *
498 * @param d statistic
499 * @param n sample size
500 * @param exact whether the probability should be calculated exact using
501 * {@link org.hipparchus.fraction.BigFraction} everywhere at the expense of
502 * very slow execution time, or if {@code double} should be used convenient places to
503 * gain speed. Almost never choose {@code true} in real applications unless you are very
504 * sure; {@code true} is almost solely for verification purposes.
505 * @return \(P(D_n < d)\)
506 * @throws MathRuntimeException if algorithm fails to convert {@code h} to a
507 * {@link org.hipparchus.fraction.BigFraction} in expressing {@code d} as \((k
508 * - h) / m\) for integer {@code k, m} and \(0 \lt;= h < 1\).
509 */
510 public double cdf(double d, int n, boolean exact)
511 throws MathRuntimeException {
512
513 final double ninv = 1 / ((double) n);
514 final double ninvhalf = 0.5 * ninv;
515
516 if (d <= ninvhalf) {
517 return 0;
518 } else if (ninvhalf < d && d <= ninv) {
519 double res = 1;
520 final double f = 2 * d - ninv;
521 // n! f^n = n*f * (n-1)*f * ... * 1*x
522 for (int i = 1; i <= n; ++i) {
523 res *= i * f;
524 }
525 return res;
526 } else if (1 - ninv <= d && d < 1) {
527 return 1 - 2 * Math.pow(1 - d, n);
528 } else if (1 <= d) {
529 return 1;
530 }
531 if (exact) {
532 return exactK(d, n);
533 }
534 if (n <= 140) {
535 return roundedK(d, n);
536 }
537 return pelzGood(d, n);
538 }
539
540 /**
541 * Calculates the exact value of {@code P(D_n < d)} using the method described in [1] (reference
542 * in class javadoc above) and {@link org.hipparchus.fraction.BigFraction} (see
543 * above).
544 *
545 * @param d statistic
546 * @param n sample size
547 * @return the two-sided probability of \(P(D_n < d)\)
548 * @throws MathRuntimeException if algorithm fails to convert {@code h} to a
549 * {@link org.hipparchus.fraction.BigFraction} in expressing {@code d} as \((k
550 * - h) / m\) for integer {@code k, m} and \(0 \le h < 1\).
551 */
552 private double exactK(double d, int n)
553 throws MathRuntimeException {
554
555 final int k = (int) Math.ceil(n * d);
556
557 final FieldMatrix<BigFraction> H = this.createExactH(d, n);
558 final FieldMatrix<BigFraction> Hpower = H.power(n);
559
560 BigFraction pFrac = Hpower.getEntry(k - 1, k - 1);
561
562 for (int i = 1; i <= n; ++i) {
563 pFrac = pFrac.multiply(i).divide(n);
564 }
565
566 /*
567 * BigFraction.doubleValue converts numerator to double and the denominator to double and
568 * divides afterwards. That gives NaN quite easy. This does not (scale is the number of
569 * digits):
570 */
571 return pFrac.bigDecimalValue(20, RoundingMode.HALF_UP).doubleValue();
572 }
573
574 /**
575 * Calculates {@code P(D_n < d)} using method described in [1] and doubles (see above).
576 *
577 * @param d statistic
578 * @param n sample size
579 * @return \(P(D_n < d)\)
580 */
581 private double roundedK(double d, int n) {
582
583 final int k = (int) Math.ceil(n * d);
584 final RealMatrix H = this.createRoundedH(d, n);
585 final RealMatrix Hpower = H.power(n);
586
587 double pFrac = Hpower.getEntry(k - 1, k - 1);
588 for (int i = 1; i <= n; ++i) {
589 pFrac *= ((double) i) / n;
590 }
591
592 return pFrac;
593 }
594
595 /**
596 * Computes the Pelz-Good approximation for \(P(D_n < d)\) as described in [2] in the class javadoc.
597 *
598 * @param d value of d-statistic (x in [2])
599 * @param n sample size
600 * @return \(P(D_n < d)\)
601 */
602 public double pelzGood(double d, int n) {
603 // Change the variable since approximation is for the distribution evaluated at d / sqrt(n)
604 final double sqrtN = FastMath.sqrt(n);
605 final double z = d * sqrtN;
606 final double z2 = d * d * n;
607 final double z4 = z2 * z2;
608 final double z6 = z4 * z2;
609 final double z8 = z4 * z4;
610
611 // Compute K_0(z)
612 double sum = 0;
613 double z2Term = MathUtils.PI_SQUARED / (8 * z2);
614 int k = 1;
615 for (; k < MAXIMUM_PARTIAL_SUM_COUNT; k++) {
616 final double kTerm = 2 * k - 1;
617 final double increment = FastMath.exp(-z2Term * kTerm * kTerm);
618 sum += increment;
619 if (increment <= PG_SUM_RELATIVE_ERROR * sum) {
620 break;
621 }
622 }
623 if (k == MAXIMUM_PARTIAL_SUM_COUNT) {
624 throw new MathIllegalStateException(LocalizedCoreFormats.MAX_COUNT_EXCEEDED, MAXIMUM_PARTIAL_SUM_COUNT);
625 }
626 double ret = sum * FastMath.sqrt(2 * FastMath.PI) / z;
627
628 // K_1(z)
629 // Sum is -inf to inf, but k term is always (k + 1/2) ^ 2, so really have
630 // twice the sum from k = 0 to inf (k = -1 is same as 0, -2 same as 1, ...)
631 final double twoZ2 = 2 * z2;
632 sum = 0;
633 for (k = 0; k < MAXIMUM_PARTIAL_SUM_COUNT; k++) {
634 final double kTerm = k + 0.5;
635 final double kTerm2 = kTerm * kTerm;
636 final double increment = (MathUtils.PI_SQUARED * kTerm2 - z2) * FastMath.exp(-MathUtils.PI_SQUARED * kTerm2 / twoZ2);
637 sum += increment;
638 if (FastMath.abs(increment) < PG_SUM_RELATIVE_ERROR * FastMath.abs(sum)) {
639 break;
640 }
641 }
642 if (k == MAXIMUM_PARTIAL_SUM_COUNT) {
643 throw new MathIllegalStateException(LocalizedCoreFormats.MAX_COUNT_EXCEEDED, MAXIMUM_PARTIAL_SUM_COUNT);
644 }
645 final double sqrtHalfPi = FastMath.sqrt(FastMath.PI / 2);
646 // Instead of doubling sum, divide by 3 instead of 6
647 ret += sum * sqrtHalfPi / (3 * z4 * sqrtN);
648
649 // K_2(z)
650 // Same drill as K_1, but with two doubly infinite sums, all k terms are even powers.
651 final double z4Term = 2 * z4;
652 final double z6Term = 6 * z6;
653 z2Term = 5 * z2;
654 final double pi4 = MathUtils.PI_SQUARED * MathUtils.PI_SQUARED;
655 sum = 0;
656 for (k = 0; k < MAXIMUM_PARTIAL_SUM_COUNT; k++) {
657 final double kTerm = k + 0.5;
658 final double kTerm2 = kTerm * kTerm;
659 final double increment = (z6Term + z4Term + MathUtils.PI_SQUARED * (z4Term - z2Term) * kTerm2 +
660 pi4 * (1 - twoZ2) * kTerm2 * kTerm2) * FastMath.exp(-MathUtils.PI_SQUARED * kTerm2 / twoZ2);
661 sum += increment;
662 if (FastMath.abs(increment) < PG_SUM_RELATIVE_ERROR * FastMath.abs(sum)) {
663 break;
664 }
665 }
666 if (k == MAXIMUM_PARTIAL_SUM_COUNT) {
667 throw new MathIllegalStateException(LocalizedCoreFormats.MAX_COUNT_EXCEEDED, MAXIMUM_PARTIAL_SUM_COUNT);
668 }
669 double sum2 = 0;
670 for (k = 1; k < MAXIMUM_PARTIAL_SUM_COUNT; k++) {
671 final double kTerm2 = k * k;
672 final double increment = MathUtils.PI_SQUARED * kTerm2 * FastMath.exp(-MathUtils.PI_SQUARED * kTerm2 / twoZ2);
673 sum2 += increment;
674 if (FastMath.abs(increment) < PG_SUM_RELATIVE_ERROR * FastMath.abs(sum2)) {
675 break;
676 }
677 }
678 if (k == MAXIMUM_PARTIAL_SUM_COUNT) {
679 throw new MathIllegalStateException(LocalizedCoreFormats.MAX_COUNT_EXCEEDED, MAXIMUM_PARTIAL_SUM_COUNT);
680 }
681 // Again, adjust coefficients instead of doubling sum, sum2
682 ret += (sqrtHalfPi / n) * (sum / (36 * z2 * z2 * z2 * z) - sum2 / (18 * z2 * z));
683
684 // K_3(z) One more time with feeling - two doubly infinite sums, all k powers even.
685 // Multiply coefficient denominators by 2, so omit doubling sums.
686 final double pi6 = pi4 * MathUtils.PI_SQUARED;
687 sum = 0;
688 for (k = 0; k < MAXIMUM_PARTIAL_SUM_COUNT; k++) {
689 final double kTerm = k + 0.5;
690 final double kTerm2 = kTerm * kTerm;
691 final double kTerm4 = kTerm2 * kTerm2;
692 final double kTerm6 = kTerm4 * kTerm2;
693 final double increment = (pi6 * kTerm6 * (5 - 30 * z2) + pi4 * kTerm4 * (-60 * z2 + 212 * z4) +
694 MathUtils.PI_SQUARED * kTerm2 * (135 * z4 - 96 * z6) - 30 * z6 - 90 * z8) *
695 FastMath.exp(-MathUtils.PI_SQUARED * kTerm2 / twoZ2);
696 sum += increment;
697 if (FastMath.abs(increment) < PG_SUM_RELATIVE_ERROR * FastMath.abs(sum)) {
698 break;
699 }
700 }
701 if (k == MAXIMUM_PARTIAL_SUM_COUNT) {
702 throw new MathIllegalStateException(LocalizedCoreFormats.MAX_COUNT_EXCEEDED, MAXIMUM_PARTIAL_SUM_COUNT);
703 }
704 sum2 = 0;
705 for (k = 1; k < MAXIMUM_PARTIAL_SUM_COUNT; k++) {
706 final double kTerm2 = k * k;
707 final double kTerm4 = kTerm2 * kTerm2;
708 final double increment = (-pi4 * kTerm4 + 3 * MathUtils.PI_SQUARED * kTerm2 * z2) *
709 FastMath.exp(-MathUtils.PI_SQUARED * kTerm2 / twoZ2);
710 sum2 += increment;
711 if (FastMath.abs(increment) < PG_SUM_RELATIVE_ERROR * FastMath.abs(sum2)) {
712 break;
713 }
714 }
715 if (k == MAXIMUM_PARTIAL_SUM_COUNT) {
716 throw new MathIllegalStateException(LocalizedCoreFormats.MAX_COUNT_EXCEEDED, MAXIMUM_PARTIAL_SUM_COUNT);
717 }
718 return ret + (sqrtHalfPi / (sqrtN * n)) * (sum / (3240 * z6 * z4) +
719 + sum2 / (108 * z6));
720
721 }
722
723 /***
724 * Creates {@code H} of size {@code m x m} as described in [1] (see above).
725 *
726 * @param d statistic
727 * @param n sample size
728 * @return H matrix
729 * @throws MathIllegalArgumentException if fractional part is greater than 1
730 * @throws MathIllegalStateException if algorithm fails to convert {@code h} to a
731 * {@link org.hipparchus.fraction.BigFraction} in expressing {@code d} as \((k
732 * - h) / m\) for integer {@code k, m} and \(0 <= h < 1\).
733 */
734 private FieldMatrix<BigFraction> createExactH(double d, int n)
735 throws MathIllegalArgumentException, MathIllegalStateException {
736
737 final int k = (int) Math.ceil(n * d);
738 final int m = 2 * k - 1;
739 final double hDouble = k - n * d;
740 if (hDouble >= 1) {
741 throw new MathIllegalArgumentException(LocalizedCoreFormats.NUMBER_TOO_LARGE_BOUND_EXCLUDED,
742 hDouble, 1.0);
743 }
744 BigFraction h;
745 try {
746 h = new BigFraction(hDouble, 1.0e-20, 10000);
747 } catch (final MathIllegalStateException e1) {
748 try {
749 h = new BigFraction(hDouble, 1.0e-10, 10000);
750 } catch (final MathIllegalStateException e2) {
751 h = new BigFraction(hDouble, 1.0e-5, 10000);
752 }
753 }
754 final BigFraction[][] Hdata = new BigFraction[m][m];
755
756 /*
757 * Start by filling everything with either 0 or 1.
758 */
759 for (int i = 0; i < m; ++i) {
760 for (int j = 0; j < m; ++j) {
761 if (i - j + 1 < 0) {
762 Hdata[i][j] = BigFraction.ZERO;
763 } else {
764 Hdata[i][j] = BigFraction.ONE;
765 }
766 }
767 }
768
769 /*
770 * Setting up power-array to avoid calculating the same value twice: hPowers[0] = h^1 ...
771 * hPowers[m-1] = h^m
772 */
773 final BigFraction[] hPowers = new BigFraction[m];
774 hPowers[0] = h;
775 for (int i = 1; i < m; ++i) {
776 hPowers[i] = h.multiply(hPowers[i - 1]);
777 }
778
779 /*
780 * First column and last row has special values (each other reversed).
781 */
782 for (int i = 0; i < m; ++i) {
783 Hdata[i][0] = Hdata[i][0].subtract(hPowers[i]);
784 Hdata[m - 1][i] = Hdata[m - 1][i].subtract(hPowers[m - i - 1]);
785 }
786
787 /*
788 * [1] states: "For 1/2 < h < 1 the bottom left element of the matrix should be (1 - 2*h^m +
789 * (2h - 1)^m )/m!" Since 0 <= h < 1, then if h > 1/2 is sufficient to check:
790 */
791 if (h.compareTo(BigFraction.ONE_HALF) == 1) {
792 Hdata[m - 1][0] = Hdata[m - 1][0].add(h.multiply(2).subtract(1).pow(m));
793 }
794
795 /*
796 * Aside from the first column and last row, the (i, j)-th element is 1/(i - j + 1)! if i -
797 * j + 1 >= 0, else 0. 1's and 0's are already put, so only division with (i - j + 1)! is
798 * needed in the elements that have 1's. There is no need to calculate (i - j + 1)! and then
799 * divide - small steps avoid overflows. Note that i - j + 1 > 0 <=> i + 1 > j instead of
800 * j'ing all the way to m. Also note that it is started at g = 2 because dividing by 1 isn't
801 * really necessary.
802 */
803 for (int i = 0; i < m; ++i) {
804 for (int j = 0; j < i + 1; ++j) {
805 if (i - j + 1 > 0) {
806 for (int g = 2; g <= i - j + 1; ++g) {
807 Hdata[i][j] = Hdata[i][j].divide(g);
808 }
809 }
810 }
811 }
812 return new Array2DRowFieldMatrix<>(BigFractionField.getInstance(), Hdata);
813 }
814
815 /***
816 * Creates {@code H} of size {@code m x m} as described in [1] (see above)
817 * using double-precision.
818 *
819 * @param d statistic
820 * @param n sample size
821 * @return H matrix
822 * @throws MathIllegalArgumentException if fractional part is greater than 1
823 */
824 private RealMatrix createRoundedH(double d, int n)
825 throws MathIllegalArgumentException {
826
827 final int k = (int) Math.ceil(n * d);
828 final int m = 2 * k - 1;
829 final double h = k - n * d;
830 if (h >= 1) {
831 throw new MathIllegalArgumentException(LocalizedCoreFormats.NUMBER_TOO_LARGE_BOUND_EXCLUDED,
832 h, 1.0);
833 }
834 final double[][] Hdata = new double[m][m];
835
836 /*
837 * Start by filling everything with either 0 or 1.
838 */
839 for (int i = 0; i < m; ++i) {
840 for (int j = 0; j < m; ++j) {
841 if (i - j + 1 < 0) {
842 Hdata[i][j] = 0;
843 } else {
844 Hdata[i][j] = 1;
845 }
846 }
847 }
848
849 /*
850 * Setting up power-array to avoid calculating the same value twice: hPowers[0] = h^1 ...
851 * hPowers[m-1] = h^m
852 */
853 final double[] hPowers = new double[m];
854 hPowers[0] = h;
855 for (int i = 1; i < m; ++i) {
856 hPowers[i] = h * hPowers[i - 1];
857 }
858
859 /*
860 * First column and last row has special values (each other reversed).
861 */
862 for (int i = 0; i < m; ++i) {
863 Hdata[i][0] = Hdata[i][0] - hPowers[i];
864 Hdata[m - 1][i] -= hPowers[m - i - 1];
865 }
866
867 /*
868 * [1] states: "For 1/2 < h < 1 the bottom left element of the matrix should be (1 - 2*h^m +
869 * (2h - 1)^m )/m!" Since 0 <= h < 1, then if h > 1/2 is sufficient to check:
870 */
871 if (Double.compare(h, 0.5) > 0) {
872 Hdata[m - 1][0] += FastMath.pow(2 * h - 1, m);
873 }
874
875 /*
876 * Aside from the first column and last row, the (i, j)-th element is 1/(i - j + 1)! if i -
877 * j + 1 >= 0, else 0. 1's and 0's are already put, so only division with (i - j + 1)! is
878 * needed in the elements that have 1's. There is no need to calculate (i - j + 1)! and then
879 * divide - small steps avoid overflows. Note that i - j + 1 > 0 <=> i + 1 > j instead of
880 * j'ing all the way to m. Also note that it is started at g = 2 because dividing by 1 isn't
881 * really necessary.
882 */
883 for (int i = 0; i < m; ++i) {
884 for (int j = 0; j < i + 1; ++j) {
885 if (i - j + 1 > 0) {
886 for (int g = 2; g <= i - j + 1; ++g) {
887 Hdata[i][j] /= g;
888 }
889 }
890 }
891 }
892 return MatrixUtils.createRealMatrix(Hdata);
893 }
894
895 /**
896 * Verifies that {@code array} has length at least 2.
897 *
898 * @param array array to test
899 * @throws org.hipparchus.exception.NullArgumentException if array is null
900 * @throws MathIllegalArgumentException if array is too short
901 */
902 private void checkArray(double[] array) {
903 MathUtils.checkNotNull(array);
904 if (array.length < 2) {
905 throw new MathIllegalArgumentException(LocalizedCoreFormats.INSUFFICIENT_OBSERVED_POINTS_IN_SAMPLE, array.length,
906 2);
907 }
908 }
909
910 /**
911 * Computes \( 1 + 2 \sum_{i=1}^\infty (-1)^i e^{-2 i^2 t^2} \) stopping when successive partial
912 * sums are within {@code tolerance} of one another, or when {@code maxIterations} partial sums
913 * have been computed. If the sum does not converge before {@code maxIterations} iterations a
914 * {@link MathIllegalStateException} is thrown.
915 *
916 * @param t argument
917 * @param tolerance Cauchy criterion for partial sums
918 * @param maxIterations maximum number of partial sums to compute
919 * @return Kolmogorov sum evaluated at t
920 * @throws MathIllegalStateException if the series does not converge
921 */
922 public double ksSum(double t, double tolerance, int maxIterations) {
923 if (t == 0.0) {
924 return 0.0;
925 }
926
927 // TODO: for small t (say less than 1), the alternative expansion in part 3 of [1]
928 // from class javadoc should be used.
929
930 final double x = -2 * t * t;
931 int sign = -1;
932 long i = 1;
933 double partialSum = 0.5d;
934 double delta = 1;
935 while (delta > tolerance && i < maxIterations) {
936 delta = FastMath.exp(x * i * i);
937 partialSum += sign * delta;
938 sign *= -1;
939 i++;
940 }
941 if (i == maxIterations) {
942 throw new MathIllegalStateException(LocalizedCoreFormats.MAX_COUNT_EXCEEDED, maxIterations);
943 }
944 return partialSum * 2;
945 }
946
947 /**
948 * Computes \(P(D_{n,m} > d)\) if {@code strict} is {@code true}; otherwise \(P(D_{n,m} \ge
949 * d)\), where \(D_{n,m}\) is the 2-sample Kolmogorov-Smirnov statistic. See
950 * {@link #kolmogorovSmirnovStatistic(double[], double[])} for the definition of \(D_{n,m}\).
951 * <p>
952 * The returned probability is exact, implemented by unwinding the recursive function
953 * definitions presented in [4] from the class javadoc.
954 * </p>
955 *
956 * @param d D-statistic value
957 * @param n first sample size
958 * @param m second sample size
959 * @param strict whether or not the probability to compute is expressed as a strict inequality
960 * @return probability that a randomly selected m-n partition of m + n generates \(D_{n,m}\)
961 * greater than (resp. greater than or equal to) {@code d}
962 */
963 public double exactP(double d, int n, int m, boolean strict) {
964 if (d < 1 / (double)( m * n)) {
965 return 1.0;
966 } else if (d >= 1) {
967 return 0;
968 }
969 double normalizeD = normalizeD(d, n, m);
970 if (!strict) {
971 normalizeD -= 1 / ((double)n * m);
972 }
973 return exactPAtMeshpoint(normalizeD, n, m);
974 }
975
976 /**
977 * Normalizes a value to an integral multiple of 1/mn between 0 and 1.
978 * If d < 1/mn, 0 is returned; if d > 1, 1 is returned; if d is very close
979 * to an integral multiple of 1/mn, that value is returned; otherwise the
980 * returned value is the smallest multiple of 1/mn less than or equal to d.
981 *
982 * @param d d value
983 * @param n first sample size
984 * @param m second sample size
985 * @return d value suitable for input to exactPAtMeshpoint(d, m, n)
986 */
987 private double normalizeD(double d, int n, int m) {
988 final double resolution = 1 / ((double)n * m);
989 final double tol = 1e-12;
990
991 // If d is smaller that the first mesh point, return 0
992 // If greater than 1, return 1
993 if (d < resolution) {
994 return 0;
995 } else if (d > 1) {
996 return 1;
997 }
998
999 // Normalize d to the smallest mesh point less than or equal to d;
1000 // except if d is less than tol less than the next mesh point, bump it up
1001 final double resolutions = d / resolution;
1002 final double ceil = FastMath.ceil(resolutions);
1003 if (ceil - resolutions < tol) {
1004 return ceil * resolution;
1005 } else {
1006 return FastMath.floor(resolutions) * resolution;
1007 }
1008
1009 }
1010
    /**
     * Computes \(P(D_{n,m} > d)\) where \(D_{n,m}\) is the 2-sample Kolmogorov-Smirnov statistic. See
     * {@link #kolmogorovSmirnovStatistic(double[], double[])} for the definition of \(D_{n,m}\).
     * <p>
     * The returned probability is exact, implemented by unwinding the recursive function
     * definitions presented in [4].
     *
     * @param d D-statistic value (must be a "meshpoint" - i.e., a possible actual value of D(m,n)).
     * @param n first sample size
     * @param m second sample size
     * @return probability that a randomly selected m-n partition of m + n generates \(D_{n,m}\)
     *         greater than (resp. greater than or equal to) {@code d}
     */
    private double exactPAtMeshpoint(double d, int n, int m) {
        // Order the sample sizes so the inner loop runs over the larger one.
        final int nn = FastMath.max(n, m);
        final int mm = FastMath.min(n, m);
        // Dynamic-programming row, indexed from 1 with u[0] as an implicit 0 sentinel
        // (u[0] is read but never written, so it stays 0).
        final double[] u = new double[nn + 2];
        // Band half-width on |nn*i - mm*j|; the +0.5 guards against roundoff since
        // d is assumed to be an exact multiple of 1/(mm*nn) (see normalizeD).
        final double k = mm * nn * d + 0.5;
        u[1] = 1d;
        // First row: entries start at 1 and are zeroed once mm*j exceeds the band.
        for (int j = 1; j < nn + 1; j++) {
            u[j + 1] = 1;
            if (mm * j > k) {
                u[j + 1] = 0;
            }
        }
        // Sweep the remaining rows in place. The factor w = i/(i + nn) rescales the
        // recursion so the final entry is a probability rather than a raw count;
        // this statement order (u[1] first, then left-to-right) is essential.
        for (int i = 1; i < mm + 1; i++) {
            final double w = ((double) i) / (i + nn);
            u[1] = w * u[1];
            if (nn * i > k) {
                u[1] = 0;
            }
            for (int j = 1; j < nn + 1; j++) {
                u[j + 1] = u[j] + u[j + 1] * w;
                // Zero contributions that leave the band |nn*i - mm*j| <= k.
                if (FastMath.abs(nn * i - mm * j) > k) {
                    u[j + 1] = 0;
                }
            }
        }
        // u[nn + 1] is P(D <= d), so return the complement.
        return 1 - u[nn + 1];
    }
1051
1052 /**
1053 * Uses the Kolmogorov-Smirnov distribution to approximate \(P(D_{n,m} > d)\) where \(D_{n,m}\)
1054 * is the 2-sample Kolmogorov-Smirnov statistic. See
1055 * {@link #kolmogorovSmirnovStatistic(double[], double[])} for the definition of \(D_{n,m}\).
1056 * <p>
1057 * Specifically, what is returned is \(1 - k(d \sqrt{mn / (m + n)})\) where \(k(t) = 1 + 2
1058 * \sum_{i=1}^\infty (-1)^i e^{-2 i^2 t^2}\). See {@link #ksSum(double, double, int)} for
1059 * details on how convergence of the sum is determined. This implementation passes {@code ksSum}
1060 * {@link #KS_SUM_CAUCHY_CRITERION} as {@code tolerance} and
1061 * {@link #MAXIMUM_PARTIAL_SUM_COUNT} as {@code maxIterations}.
1062 * </p>
1063 *
1064 * @param d D-statistic value
1065 * @param n first sample size
1066 * @param m second sample size
1067 * @return approximate probability that a randomly selected m-n partition of m + n generates
1068 * \(D_{n,m}\) greater than {@code d}
1069 */
1070 public double approximateP(double d, int n, int m) {
1071 final double dm = m;
1072 final double dn = n;
1073 return 1 - ksSum(d * FastMath.sqrt((dm * dn) / (dm + dn)),
1074 KS_SUM_CAUCHY_CRITERION, MAXIMUM_PARTIAL_SUM_COUNT);
1075 }
1076
1077 /**
1078 * Fills a boolean array randomly with a fixed number of {@code true} values.
1079 * The method uses a simplified version of the Fisher-Yates shuffle algorithm.
1080 * By processing first the {@code true} values followed by the remaining {@code false} values
1081 * less random numbers need to be generated. The method is optimized for the case
1082 * that the number of {@code true} values is larger than or equal to the number of
1083 * {@code false} values.
1084 *
1085 * @param b boolean array
1086 * @param numberOfTrueValues number of {@code true} values the boolean array should finally have
1087 * @param rng random data generator
1088 */
1089 static void fillBooleanArrayRandomlyWithFixedNumberTrueValues(final boolean[] b, final int numberOfTrueValues, final RandomGenerator rng) {
1090 Arrays.fill(b, true);
1091 for (int k = numberOfTrueValues; k < b.length; k++) {
1092 final int r = rng.nextInt(k + 1);
1093 b[(b[r]) ? r : k] = false;
1094 }
1095 }
1096
1097 /**
1098 * If there are no ties in the combined dataset formed from x and y, this
1099 * method is a no-op. If there are ties, a uniform random deviate in
1100 * (-minDelta / 2, minDelta / 2) - {0} is added to each value in x and y, where
1101 * minDelta is the minimum difference between unequal values in the combined
1102 * sample. A fixed seed is used to generate the jitter, so repeated activations
1103 * with the same input arrays result in the same values.
1104 *
1105 * NOTE: if there are ties in the data, this method overwrites the data in
1106 * x and y with the jittered values.
1107 *
1108 * @param x first sample
1109 * @param y second sample
1110 */
1111 private void fixTies(double[] x, double[] y) {
1112 final double[] values = MathArrays.unique(MathArrays.concatenate(x,y));
1113 if (values.length == x.length + y.length) {
1114 return; // There are no ties
1115 }
1116
1117 // Find the smallest difference between values, or 1 if all values are the same
1118 double minDelta = 1;
1119 double prev = values[0];
1120 for (int i = 1; i < values.length; i++) {
1121 final double delta = prev - values[i];
1122 if (delta < minDelta) {
1123 minDelta = delta;
1124 }
1125 prev = values[i];
1126 }
1127 minDelta /= 2;
1128
1129 // Add jitter using a fixed seed (so same arguments always give same results),
1130 // low-initialization-overhead generator
1131 gen.setSeed(100);
1132
1133 // It is theoretically possible that jitter does not break ties, so repeat
1134 // until all ties are gone. Bound the loop and throw MIE if bound is exceeded.
1135 int ct = 0;
1136 boolean ties;
1137 do {
1138 jitter(x, minDelta);
1139 jitter(y, minDelta);
1140 ties = hasTies(x, y);
1141 ct++;
1142 } while (ties && ct < 1000);
1143 if (ties) {
1144 throw MathRuntimeException.createInternalError(); // Should never happen
1145 }
1146 }
1147
1148 /**
1149 * Returns true iff there are ties in the combined sample
1150 * formed from x and y.
1151 *
1152 * @param x first sample
1153 * @param y second sample
1154 * @return true if x and y together contain ties
1155 */
1156 private static boolean hasTies(double[] x, double[] y) {
1157 final HashSet<Double> values = new HashSet<>();
1158 for (double value : x) {
1159 if (!values.add(value)) {
1160 return true;
1161 }
1162 }
1163 for (double v : y) {
1164 if (!values.add(v)) {
1165 return true;
1166 }
1167 }
1168 return false;
1169 }
1170
1171 /**
1172 * Adds random jitter to {@code data} using uniform deviates between {@code -delta} and {@code delta}.
1173 * <p>
1174 * Note that jitter is applied in-place - i.e., the array
1175 * values are overwritten with the result of applying jitter.</p>
1176 *
1177 * @param data input/output data array - entries overwritten by the method
1178 * @param delta max magnitude of jitter
1179 * @throws NullPointerException if either of the parameters is null
1180 */
1181 private void jitter(double[] data, double delta) {
1182 for (int i = 0; i < data.length; i++) {
1183 data[i] += gen.nextUniform(-delta, delta);
1184 }
1185 }
1186
1187 }