View Javadoc
1   /*
2    * Licensed to the Hipparchus project under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The Hipparchus project licenses this file to You under the Apache License,
6    * Version 2.0 (the "License"); you may not use this file except in compliance
7    * with the License.  You may obtain a copy of the License at
8    *
9    *      https://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.hipparchus.stat.projection;
18  
19  import org.hipparchus.exception.MathIllegalStateException;
20  import org.hipparchus.stat.LocalizedStatFormats;
21  import org.junit.Assert;
22  import org.junit.Test;
23  
24  public class PCATest {
25  
26      // example from:
27      // https://towardsdatascience.com/the-mathematics-behind-principal-component-analysis-fff2d7f4b643
28      // https://stattrek.com/matrix-algebra/covariance-matrix
29      private static final double[][] SCORES = {
30              {90, 60, 90},
31              {90, 90, 30},
32              {60, 60, 60},
33              {60, 60, 90},
34              {30, 30, 30},
35      };
36  
37      public static final double[] EXPECTED_MEAN = {66, 60, 60};
38      public static final double[] EXPECTED_VARIANCE = {1137.587441, 786.387983, 56.024575};
39  
40      private static final double[][] EXPECTED_COMPONENTS = {
41              {  0.6558023,  0.385999 },
42              {  0.4291978,  0.516366 },
43              {  0.6210577, -0.7644414 }
44      };
45  
46      /**
47       * This is the expected value (give or take sign) when centering (covariance)
48       * but no scaling is applied. In general, components with the same values but
49       * differing sign are
50       * <a href="https://stats.stackexchange.com/questions/30348/is-it-acceptable-to-reverse-a-sign-of-a-principal-component-score">equivalent</a>.
51       *
52       * The result has been cross-checked with
53       * <a href="https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html">sklearn.decomposition.PCA</a>
54       * <a href="https://javadoc.io/static/nz.ac.waikato.cms.weka/weka-dev/3.9.4/weka/attributeSelection/PrincipalComponents.html>weka.attributeSelection.PrincipalComponents</a>
55       * (with the <code>centerData</code> option set to <code>true</code>)
56       * <a href="https://github.com/datumbox/datumbox-framework/blob/develop/datumbox-framework-core/src/main/java/com/datumbox/framework/core/machinelearning/featureselection/PCA.java">com.datumbox.framework.core.machinelearning.featureselection.PCA</a>
57       * (but for whatever reason datumbox does the transform on the unnormalized original data rather than normalized data - normalizing manually achieves the result below)
58       * <a href="https://www.javadoc.io/doc/com.github.haifengl/smile-core/latest/smile/feature/extraction/PCA.html">smile.feature.extraction.PCA</a>
59       * (using the PCA.fit method)
60       * <a href="https://au.mathworks.com/help/stats/pca.html">pca from matlab</a>
61       */
62      private static final double[][] EXPECTED_COV = {
63              {  34.3709848, -13.6692708 },
64              {   9.9834573,  47.6882055 },
65              {  -3.9348135,  -2.3159927 },
66              {  14.6969171, -25.2492347 },
67              { -55.1165457,  -6.4537072 },
68      };
69  
70      /**
71       * This is the expected value give or take sign when centering and scaling (correlation).
72       *
73       * The result has been cross-checked with
74       * <a href="https://javadoc.io/static/nz.ac.waikato.cms.weka/weka-dev/3.9.4/weka/attributeSelection/PrincipalComponents.html></a>
75       * <a href="https://au.mathworks.com/help/stats/pca.html">pca from matlab</a>
76       * (using the 'VariableWeights','variance' option)
77       */
78      private static final double[][] EXPECTED_COR = {
79              {  0.9118256, -0.942809  },
80              {  1.3832302,  1.4142136 },
81              { -0.1690309,  0.0       },
82              {  0.0666714, -0.942809  },
83              { -2.1926964,  0.4714045 },
84      };
85  
86      /**
87       * This is the expected value give or take sign when centering and scaling (correlation)
88       * is applied with no bias adjustment. In general, bias adjustment is more accurate but
89       * alters most PCA machine learning models by an insignificant amount so is often not accounted for.
90       *
91       * The result has been cross-checked with
92       * <a href="https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html">sklearn.decomposition.PCA</a>
93       * (but with prior preprocessing using <a href="https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html">sklearn.preprocessing.StandardScaler</a>)
94       * <a href="https://www.javadoc.io/doc/com.github.haifengl/smile-core/latest/smile/feature/extraction/PCA.html">smile.feature.extraction.PCA</a>
95       * (using the PCA.cor method)
96       */
97      public static final double[][] EXPECTED_COR_NO_BIAS = {
98              {  1.0194521, -1.0540926 },
99              {  1.5464984,  1.5811388 },
100             { -0.1889822,  0.0       },
101             {  0.0745409, -1.0540926 },
102             { -2.451509,   0.5270463 },
103     };
104     public static final double DELTA = 0.000001;
105 
106     @Test
107     public void defaultSettings() {
108         PCA pca = new PCA(2);
109         Assert.assertEquals(2, pca.getNumComponents());
110         Assert.assertFalse(pca.isScale());
111         Assert.assertTrue(pca.isBiasCorrection());
112     }
113 
114     @Test
115     public void covariance() {
116         PCA pca = new PCA(2);
117         double[][] actual = pca.fitAndTransform(SCORES);
118         Assert.assertArrayEquals(EXPECTED_MEAN, pca.getCenter(), DELTA);
119         Assert.assertArrayEquals(EXPECTED_VARIANCE, pca.getVariance(), DELTA);
120         assertExpected(EXPECTED_COMPONENTS, pca.getComponents());
121         assertExpected(EXPECTED_COV, actual);
122 
123         // calling fit and transform individually should be the same as combo method
124         pca = new PCA(2);
125         actual = pca.fit(SCORES).transform(SCORES);
126         Assert.assertArrayEquals(EXPECTED_MEAN, pca.getCenter(), DELTA);
127         assertExpected(EXPECTED_COV, actual);
128     }
129 
130     @Test
131     public void correlation() {
132         PCA pca = new PCA(2, true, true);
133         double[][] actual = pca.fitAndTransform(SCORES);
134         assertExpected(EXPECTED_COR, actual);
135     }
136 
137     @Test
138     public void correlationNoBias() {
139         PCA pca = new PCA(2, true, false);
140         double[][] actual = pca.fitAndTransform(SCORES);
141         assertExpected(EXPECTED_COR_NO_BIAS, actual);
142     }
143 
144     @Test
145     public void transformWithoutFit() {
146         PCA pca = new PCA(2);
147         try {
148             pca.transform(SCORES);
149             Assert.fail("an exception should have been thrown");
150         } catch (MathIllegalStateException mise) {
151             Assert.assertEquals(LocalizedStatFormats.ILLEGAL_STATE_PCA, mise.getSpecifier());
152             Assert.assertEquals("transform", mise.getParts()[0]);
153         }
154     }
155 
156     private static void assertExpected(double[][] expected, double[][] actual) {
157         for (int i = 0; i < expected.length; i++) {
158             double[] e = expected[i];
159             double[] t = actual[i];
160             Assert.assertArrayEquals(e, t, DELTA);
161         }
162     }
163 }