OneWayAnova.java

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      https://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*
 * This is not the original file distributed by the Apache Software Foundation
 * It has been modified by the Hipparchus project
 */
package org.hipparchus.stat.inference;

import java.util.ArrayList;
import java.util.Collection;

import org.hipparchus.distribution.continuous.FDistribution;
import org.hipparchus.exception.MathIllegalArgumentException;
import org.hipparchus.exception.MathIllegalStateException;
import org.hipparchus.exception.NullArgumentException;
import org.hipparchus.stat.LocalizedStatFormats;
import org.hipparchus.stat.descriptive.StreamingStatistics;
import org.hipparchus.util.MathUtils;

/**
 * Implements one-way ANOVA (analysis of variance) statistics.
 *
 * <p> Tests for differences between two or more categories of univariate data
 * (for example, the body mass index of accountants, lawyers, doctors and
 * computer programmers).  When two categories are given, this is equivalent to
 * the {@link TTest}.
 * </p><p>
 * Uses the {@link FDistribution
 * Hipparchus F Distribution implementation} to estimate exact p-values.</p>
 * <p>This implementation is based on a description at
 * <a href="http://faculty.vassar.edu/lowry/ch13pt1.html">One way Anova (dead link)</a></p>
 * <pre>
 * Abbreviations: bg = between groups,
 *                wg = within groups,
 *                ss = sum squared deviations
 * </pre>
 *
 */
public class OneWayAnova {

    /** Empty constructor.
     * <p>
     * This constructor is not strictly necessary, but it prevents spurious
     * javadoc warnings with JDK 18 and later.
     * </p>
     * @since 3.0
     */
    public OneWayAnova() { // NOPMD - unnecessary constructor added intentionally to make javadoc happy
        // nothing to do
    }

    /**
     * Computes the ANOVA F-value for a collection of <code>double[]</code>
     * arrays.
     *
     * <p><strong>Preconditions</strong>:</p>
     * <ul>
     * <li>The categoryData <code>Collection</code> must contain
     * <code>double[]</code> arrays.</li>
     * <li> There must be at least two <code>double[]</code> arrays in the
     * <code>categoryData</code> collection and each of these arrays must
     * contain at least two values.</li></ul>
     * <p>
     * This implementation computes the F statistic using the definitional
     * formula</p><pre>
     *   F = msbg/mswg</pre>
     * <p>where</p><pre>
     *  msbg = between group mean square
     *  mswg = within group mean square</pre>
     *  <p>
     * are as defined <a href="http://faculty.vassar.edu/lowry/ch13pt1.html">
     * here</a></p>
     *
     * @param categoryData <code>Collection</code> of <code>double[]</code>
     * arrays each containing data for one category
     * @return Fvalue
     * @throws NullArgumentException if <code>categoryData</code> is <code>null</code>
     * @throws MathIllegalArgumentException if the length of the <code>categoryData</code>
     * array is less than 2 or a contained <code>double[]</code> array does not have
     * at least two values
     */
    public double anovaFValue(final Collection<double[]> categoryData)
        throws MathIllegalArgumentException, NullArgumentException {

        return anovaStats(categoryData).F;

    }

    /**
     * Computes the ANOVA P-value for a collection of <code>double[]</code>
     * arrays.
     *
     * <p><strong>Preconditions</strong>:</p>
     * <ul>
     * <li>The categoryData <code>Collection</code> must contain
     * <code>double[]</code> arrays.</li>
     * <li> There must be at least two <code>double[]</code> arrays in the
     * <code>categoryData</code> collection and each of these arrays must
     * contain at least two values.</li></ul>
     * <p>
     * This implementation uses the
     * {@link org.hipparchus.distribution.continuous.FDistribution
     * Hipparchus F Distribution implementation} to estimate the exact
     * p-value, using the formula</p><pre>
     *   p = 1 - cumulativeProbability(F)</pre>
     * <p>where <code>F</code> is the F value and <code>cumulativeProbability</code>
     * is the Hipparchus implementation of the F distribution.</p>
     *
     * @param categoryData <code>Collection</code> of <code>double[]</code>
     * arrays each containing data for one category
     * @return Pvalue
     * @throws NullArgumentException if <code>categoryData</code> is <code>null</code>
     * @throws MathIllegalArgumentException if the length of the <code>categoryData</code>
     * array is less than 2 or a contained <code>double[]</code> array does not have
     * at least two values
     * @throws MathIllegalStateException if the p-value can not be computed due to a convergence error
     * @throws MathIllegalStateException if the maximum number of iterations is exceeded
     */
    public double anovaPValue(final Collection<double[]> categoryData)
        throws MathIllegalArgumentException, NullArgumentException,
        MathIllegalStateException {

        final AnovaStats a = anovaStats(categoryData);
        // No try-catch or advertised exception because args are valid
        final FDistribution fdist = new FDistribution(a.dfbg, a.dfwg);
        return 1.0 - fdist.cumulativeProbability(a.F);

    }

    /**
     * Computes the ANOVA P-value for a collection of {@link StreamingStatistics}.
     *
     * <p><strong>Preconditions</strong>:</p>
     * <ul>
     * <li>The categoryData <code>Collection</code> must contain
     * {@link StreamingStatistics}.</li>
     * <li> There must be at least two {@link StreamingStatistics} in the
     * <code>categoryData</code> collection and each of these statistics must
     * contain at least two values.</li></ul>
     * <p>
     * This implementation uses the
     * {@link org.hipparchus.distribution.continuous.FDistribution
     * Hipparchus F Distribution implementation} to estimate the exact
     * p-value, using the formula</p><pre>
     *   p = 1 - cumulativeProbability(F)</pre>
     * <p>where <code>F</code> is the F value and <code>cumulativeProbability</code>
     * is the Hipparchus implementation of the F distribution.</p>
     *
     * @param categoryData <code>Collection</code> of {@link StreamingStatistics}
     * each containing data for one category
     * @param allowOneElementData if true, allow computation for one catagory
     * only or for one data element per category
     * @return Pvalue
     * @throws NullArgumentException if <code>categoryData</code> is <code>null</code>
     * @throws MathIllegalArgumentException if the length of the <code>categoryData</code>
     * array is less than 2 or a contained {@link StreamingStatistics} does not have
     * at least two values
     * @throws MathIllegalStateException if the p-value can not be computed due to a convergence error
     * @throws MathIllegalStateException if the maximum number of iterations is exceeded
     */
    public double anovaPValue(final Collection<StreamingStatistics> categoryData,
                              final boolean allowOneElementData)
        throws MathIllegalArgumentException, NullArgumentException,
        MathIllegalStateException {

        final AnovaStats a = anovaStats(categoryData, allowOneElementData);
        final FDistribution fdist = new FDistribution(a.dfbg, a.dfwg);
        return 1.0 - fdist.cumulativeProbability(a.F);

    }

    /**
     * This method calls the method that actually does the calculations (except
     * P-value).
     *
     * @param categoryData
     *            <code>Collection</code> of <code>double[]</code> arrays each
     *            containing data for one category
     * @return computed AnovaStats
     * @throws NullArgumentException
     *             if <code>categoryData</code> is <code>null</code>
     * @throws MathIllegalArgumentException
     *             if the length of the <code>categoryData</code> array is less
     *             than 2 or a contained <code>double[]</code> array does not
     *             contain at least two values
     */
    private AnovaStats anovaStats(final Collection<double[]> categoryData)
        throws MathIllegalArgumentException, NullArgumentException {

        MathUtils.checkNotNull(categoryData);

        final Collection<StreamingStatistics> categoryDataSummaryStatistics =
                new ArrayList<>(categoryData.size());

        // convert arrays to SummaryStatistics
        for (final double[] data : categoryData) {
            final StreamingStatistics dataSummaryStatistics = new StreamingStatistics();
            categoryDataSummaryStatistics.add(dataSummaryStatistics);
            for (final double val : data) {
                dataSummaryStatistics.addValue(val);
            }
        }

        return anovaStats(categoryDataSummaryStatistics, false);

    }

    /**
     * Performs an ANOVA test, evaluating the null hypothesis that there
     * is no difference among the means of the data categories.
     *
     * <p><strong>Preconditions</strong>:</p>
     * <ul>
     * <li>The categoryData <code>Collection</code> must contain
     * <code>double[]</code> arrays.</li>
     * <li> There must be at least two <code>double[]</code> arrays in the
     * <code>categoryData</code> collection and each of these arrays must
     * contain at least two values.</li>
     * <li>alpha must be strictly greater than 0 and less than or equal to 0.5.
     * </li></ul>
     * <p>
     * This implementation uses the
     * {@link org.hipparchus.distribution.continuous.FDistribution
     * Hipparchus F Distribution implementation} to estimate the exact
     * p-value, using the formula</p><pre>
     *   p = 1 - cumulativeProbability(F)</pre>
     * <p>where <code>F</code> is the F value and <code>cumulativeProbability</code>
     * is the Hipparchus implementation of the F distribution.</p>
     * <p>True is returned iff the estimated p-value is less than alpha.</p>
     *
     * @param categoryData <code>Collection</code> of <code>double[]</code>
     * arrays each containing data for one category
     * @param alpha significance level of the test
     * @return true if the null hypothesis can be rejected with
     * confidence 1 - alpha
     * @throws NullArgumentException if <code>categoryData</code> is <code>null</code>
     * @throws MathIllegalArgumentException if the length of the <code>categoryData</code>
     * array is less than 2 or a contained <code>double[]</code> array does not have
     * at least two values
     * @throws MathIllegalArgumentException if <code>alpha</code> is not in the range (0, 0.5]
     * @throws MathIllegalStateException if the p-value can not be computed due to a convergence error
     * @throws MathIllegalStateException if the maximum number of iterations is exceeded
     */
    public boolean anovaTest(final Collection<double[]> categoryData,
                             final double alpha)
        throws MathIllegalArgumentException, NullArgumentException, MathIllegalStateException {

        if ((alpha <= 0) || (alpha > 0.5)) {
            throw new MathIllegalArgumentException(
                    LocalizedStatFormats.OUT_OF_BOUND_SIGNIFICANCE_LEVEL,
                    alpha, 0, 0.5);
        }
        return anovaPValue(categoryData) < alpha;
    }

    /**
     * This method actually does the calculations (except P-value).
     *
     * @param categoryData <code>Collection</code> of <code>double[]</code>
     * arrays each containing data for one category
     * @param allowOneElementData if true, allow computation for one category
     * only or for one data element per category
     * @return computed AnovaStats
     * @throws NullArgumentException if <code>categoryData</code> is <code>null</code>
     * @throws MathIllegalArgumentException if <code>allowOneElementData</code> is false and the number of
     * categories is less than 2 or a contained SummaryStatistics does not contain
     * at least two values
     */
    private AnovaStats anovaStats(final Collection<StreamingStatistics> categoryData,
                                  final boolean allowOneElementData)
        throws MathIllegalArgumentException, NullArgumentException {

        MathUtils.checkNotNull(categoryData);

        if (!allowOneElementData) {
            // check if we have enough categories
            if (categoryData.size() < 2) {
                throw new MathIllegalArgumentException(LocalizedStatFormats.TWO_OR_MORE_CATEGORIES_REQUIRED,
                                                       categoryData.size(), 2);
            }

            // check if each category has enough data
            for (final StreamingStatistics array : categoryData) {
                if (array.getN() <= 1) {
                    throw new MathIllegalArgumentException(LocalizedStatFormats.TWO_OR_MORE_VALUES_IN_CATEGORY_REQUIRED,
                                                           (int) array.getN(), 2);
                }
            }
        }

        int dfwg = 0;
        double sswg = 0;
        double totsum = 0;
        double totsumsq = 0;
        int totnum = 0;

        for (final StreamingStatistics data : categoryData) {

            final double sum = data.getSum();
            final double sumsq = data.getSumOfSquares();
            final int num = (int) data.getN();
            totnum += num;
            totsum += sum;
            totsumsq += sumsq;

            dfwg += num - 1;
            final double ss = sumsq - ((sum * sum) / num);
            sswg += ss;
        }

        final double sst = totsumsq - ((totsum * totsum) / totnum);
        final double ssbg = sst - sswg;
        final int dfbg = categoryData.size() - 1;
        final double msbg = ssbg / dfbg;
        final double mswg = sswg / dfwg;
        final double F = msbg / mswg;

        return new AnovaStats(dfbg, dfwg, F);

    }

    /**
     * Convenience class to pass dfbg,dfwg,F values around within OneWayAnova.
     * No get/set methods provided.
     */
    private static class AnovaStats {

        /** Degrees of freedom in numerator (between groups). */
        private final int dfbg;

        /** Degrees of freedom in denominator (within groups). */
        private final int dfwg;

        /** Statistic. */
        private final double F;

        /**
         * Constructor
         * @param dfbg degrees of freedom in numerator (between groups)
         * @param dfwg degrees of freedom in denominator (within groups)
         * @param F statistic
         */
        private AnovaStats(int dfbg, int dfwg, double F) {
            this.dfbg = dfbg;
            this.dfwg = dfwg;
            this.F = F;
        }
    }

}