1 /*
2 * Licensed to the Hipparchus project under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The Hipparchus project licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * https://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17 package org.hipparchus.stat.projection;
18
19 import org.hipparchus.exception.MathIllegalStateException;
20 import org.hipparchus.linear.EigenDecompositionSymmetric;
21 import org.hipparchus.linear.MatrixUtils;
22 import org.hipparchus.linear.RealMatrix;
23 import org.hipparchus.stat.LocalizedStatFormats;
24 import org.hipparchus.stat.StatUtils;
25 import org.hipparchus.stat.correlation.Covariance;
26 import org.hipparchus.stat.descriptive.moment.StandardDeviation;
27
28 /**
29 * Principal component analysis (PCA) is a statistical technique for reducing the dimensionality of a dataset.
30 * <a href="https://en.wikipedia.org/wiki/Principal_component_analysis">PCA</a> can be thought of as a
31 * projection or scaling of the data to reduce the number of dimensions but done in a way
32 * that preserves as much information as possible.
33 * @since 3.0
34 */
35 public class PCA {
36 /**
37 * The number of components (reduced dimensions) for this projection.
38 */
39 private final int numC;
40
41 /**
42 * Whether to scale (standardize) the input data as well as center (normalize).
43 */
44 private final boolean scale;
45
46 /**
47 * Whether to correct for bias when standardizing. Ignored when only centering.
48 */
49 private final boolean biasCorrection;
50
51 /**
52 * The by column (feature) averages (means) from the fitted data.
53 */
54 private double[] center;
55
56 /**
57 * The by column (feature) standard deviates from the fitted data.
58 */
59 private double[] std;
60
61 /**
62 * The eigenValues (variance) of our projection model.
63 */
64 private double[] eigenValues;
65
66 /**
67 * The eigenVectors (components) of our projection model.
68 */
69 private RealMatrix principalComponents;
70
71 /**
72 * Utility class when scaling.
73 */
74 private final StandardDeviation sd;
75
76 /**
77 * Create a PCA with the ability to adjust scaling parameters.
78 *
79 * @param numC the number of components
80 * @param scale whether to also scale (correlation) rather than just center (covariance)
81 * @param biasCorrection whether to adjust for bias when scaling
82 */
83 public PCA(int numC, boolean scale, boolean biasCorrection) {
84 this.numC = numC;
85 this.scale = scale;
86 this.biasCorrection = biasCorrection;
87 sd = scale ? new StandardDeviation(biasCorrection) : null;
88 }
89
90 /**
91 * A default PCA will center but not scale.
92 *
93 * @param numC the number of components
94 */
95 public PCA(int numC) {
96 this(numC, false, true);
97 }
98
99 /** GEt number of components.
100 * @return the number of components
101 */
102 public int getNumComponents() {
103 return numC;
104 }
105
106 /** Check whether scaling (correlation) or no scaling (covariance) is used.
107 * @return whether scaling (correlation) or no scaling (covariance) is used
108 */
109 public boolean isScale() {
110 return scale;
111 }
112
113 /** Check whether scaling (correlation), if in use, adjusts for bias.
114 * @return whether scaling (correlation), if in use, adjusts for bias
115 */
116 public boolean isBiasCorrection() {
117 return biasCorrection;
118 }
119
120 /** Get principal component variances.
121 * @return the principal component variances, ordered from largest to smallest, which are the eigenvalues of the covariance or correlation matrix of the fitted data
122 */
123 public double[] getVariance() {
124 validateState("getVariance");
125 return eigenValues.clone();
126 }
127
128 /** Get by column center (or mean) of the fitted data.
129 * @return the by column center (or mean) of the fitted data
130 */
131 public double[] getCenter() {
132 validateState("getCenter");
133 return center.clone();
134 }
135
136 /**
137 * Returns the principal components of our projection model.
138 * These are the eigenvectors of our covariance/correlation matrix.
139 *
140 * @return the principal components
141 */
142 public double[][] getComponents() {
143 validateState("getComponents");
144 return principalComponents.getData();
145 }
146
147 /**
148 * Fit our model to the data and then transform it to the reduced dimensions.
149 *
150 * @param data the input data
151 * @return the fitted data
152 */
153 public double[][] fitAndTransform(double[][] data) {
154 center = null;
155 RealMatrix normalizedM = getNormalizedMatrix(data);
156 calculatePrincipalComponents(normalizedM);
157 return normalizedM.multiply(principalComponents).getData();
158 }
159
160 /**
161 * Transform the supplied data using our projection model.
162 *
163 * @param data the input data
164 * @return the fitted data
165 */
166 public double[][] transform(double[][] data) {
167 validateState("transform");
168 RealMatrix normalizedM = getNormalizedMatrix(data);
169 return normalizedM.multiply(principalComponents).getData();
170 }
171
172 /**
173 * Fit our model to the data, ready for subsequence transforms.
174 *
175 * @param data the input data
176 * @return this
177 */
178 public PCA fit(double[][] data) {
179 center = null;
180 RealMatrix normalized = getNormalizedMatrix(data);
181 calculatePrincipalComponents(normalized);
182 return this;
183 }
184
185 /** Check if the state allows an operation to be performed.
186 * @param from name of the operation
187 * @exception MathIllegalStateException if the state does not allows operation
188 */
189 private void validateState(String from) {
190 if (center == null) {
191 throw new MathIllegalStateException(LocalizedStatFormats.ILLEGAL_STATE_PCA, from);
192 }
193
194 }
195
196 /** Compute eigenvalues and principal components.
197 * <p>
198 * The results are stored in the instance itself
199 * <p>
200 * @param normalizedM normalized matrix
201 */
202 private void calculatePrincipalComponents(RealMatrix normalizedM) {
203 RealMatrix covarianceM = new Covariance(normalizedM).getCovarianceMatrix();
204 EigenDecompositionSymmetric decomposition = new EigenDecompositionSymmetric(covarianceM);
205 eigenValues = decomposition.getEigenvalues();
206 principalComponents = MatrixUtils.createRealMatrix(eigenValues.length, numC);
207 for (int c = 0; c < numC; c++) {
208 for (int f = 0; f < eigenValues.length; f++) {
209 principalComponents.setEntry(f, c, decomposition.getEigenvector(c).getEntry(f));
210 }
211 }
212 }
213
214 /**
215 * This will either normalize (center) or
216 * standardize (center plus scale) the input data.
217 *
218 * @param input the input data
219 * @return the normalized (or standardized) matrix
220 */
221 private RealMatrix getNormalizedMatrix(double[][] input) {
222 int numS = input.length;
223 int numF = input[0].length;
224 boolean calculating = center == null;
225 if (calculating) {
226 center = new double[numF];
227 if (scale) {
228 std = new double[numF];
229 }
230 }
231
232 double[][] normalized = new double[numS][numF];
233 for (int f = 0; f < numF; f++) {
234 if (calculating) {
235 calculateNormalizeParameters(input, numS, f);
236 }
237 for (int s = 0; s < numS; s++) {
238 normalized[s][f] = input[s][f] - center[f];
239 }
240 if (scale) {
241 for (int s = 0; s < numS; s++) {
242 normalized[s][f] /= std[f];
243 }
244 }
245 }
246
247 return MatrixUtils.createRealMatrix(normalized);
248 }
249
250 /** compute normalized parameters.
251 * @param input the input data
252 * @param numS number of data rows
253 * @param f index of the component
254 */
255 private void calculateNormalizeParameters(double[][] input, int numS, int f) {
256 double[] column = new double[numS];
257 for (int s = 0; s < numS; s++) {
258 column[s] = input[s][f];
259 }
260 center[f] = StatUtils.mean(column);
261 if (scale) {
262 std[f] = sd.evaluate(column, center[f]);
263 }
264 }
265 }