View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      https://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  /*
19   * This is not the original file distributed by the Apache Software Foundation
20   * It has been modified by the Hipparchus project
21   */
22  
23  package org.hipparchus.stat.regression;
24  import java.io.Serializable;
25  
26  import org.hipparchus.distribution.continuous.TDistribution;
27  import org.hipparchus.exception.LocalizedCoreFormats;
28  import org.hipparchus.exception.MathIllegalArgumentException;
29  import org.hipparchus.stat.LocalizedStatFormats;
30  import org.hipparchus.util.FastMath;
31  import org.hipparchus.util.MathUtils;
32  import org.hipparchus.util.Precision;
33  
34  /**
35   * Estimates an ordinary least squares regression model
36   * with one independent variable.
37   * <p>
38   * <code> y = intercept + slope * x  </code></p>
39   * <p>
40   * Standard errors for <code>intercept</code> and <code>slope</code> are
41   * available as well as ANOVA, r-square and Pearson's r statistics.</p>
42   * <p>
43   * Observations (x,y pairs) can be added to the model one at a time or they
44   * can be provided in a 2-dimensional array.  The observations are not stored
45   * in memory, so there is no limit to the number of observations that can be
46   * added to the model.</p>
 * <p><strong>Usage Notes</strong>:</p>
48   * <ul>
49   * <li> When there are fewer than two observations in the model, or when
50   * there is no variation in the x values (i.e. all x values are the same)
51   * all statistics return <code>NaN</code>. At least two observations with
52   * different x coordinates are required to estimate a bivariate regression
53   * model.
54   * </li>
55   * <li> Getters for the statistics always compute values based on the current
56   * set of observations -- i.e., you can get statistics, then add more data
57   * and get updated statistics without using a new instance.  There is no
58   * "compute" method that updates all statistics.  Each of the getters performs
59   * the necessary computations to return the requested statistic.
60   * </li>
61   * <li> The intercept term may be suppressed by passing {@code false} to
62   * the {@link #SimpleRegression(boolean)} constructor.  When the
63   * {@code hasIntercept} property is false, the model is estimated without a
64   * constant term and {@link #getIntercept()} returns {@code 0}.</li>
65   * </ul>
66   *
67   */
68  public class SimpleRegression implements Serializable, UpdatingMultipleLinearRegression {
69  
70      /** Serializable version identifier */
71      private static final long serialVersionUID = -3004689053607543335L;
72  
73      /** sum of x values */
74      private double sumX;
75  
76      /** total variation in x (sum of squared deviations from xbar) */
77      private double sumXX;
78  
79      /** sum of y values */
80      private double sumY;
81  
82      /** total variation in y (sum of squared deviations from ybar) */
83      private double sumYY;
84  
85      /** sum of products */
86      private double sumXY;
87  
88      /** number of observations */
89      private long n;
90  
91      /** mean of accumulated x values, used in updating formulas */
92      private double xbar;
93  
94      /** mean of accumulated y values, used in updating formulas */
95      private double ybar;
96  
97      /** include an intercept or not */
98      private final boolean hasIntercept;
99      // ---------------------Public methods--------------------------------------
100 
101     /**
102      * Create an empty SimpleRegression instance
103      */
104     public SimpleRegression() {
105         this(true);
106     }
107     /**
108     * Create a SimpleRegression instance, specifying whether or not to estimate
109     * an intercept.
110     *
111     * <p>Use {@code false} to estimate a model with no intercept.  When the
112     * {@code hasIntercept} property is false, the model is estimated without a
113     * constant term and {@link #getIntercept()} returns {@code 0}.</p>
114     *
115     * @param includeIntercept whether or not to include an intercept term in
116     * the regression model
117     */
118     public SimpleRegression(boolean includeIntercept) {
119         super();
120         hasIntercept = includeIntercept;
121     }
122 
123     /**
124      * Adds the observation (x,y) to the regression data set.
125      * <p>
126      * Uses updating formulas for means and sums of squares defined in
127      * "Algorithms for Computing the Sample Variance: Analysis and
128      * Recommendations", Chan, T.F., Golub, G.H., and LeVeque, R.J.
129      * 1983, American Statistician, vol. 37, pp. 242-247, referenced in
130      * Weisberg, S. "Applied Linear Regression". 2nd Ed. 1985.</p>
131      *
132      *
133      * @param x independent variable value
134      * @param y dependent variable value
135      */
136     public void addData(final double x,final double y) {
137         if (n == 0) {
138             xbar = x;
139             ybar = y;
140         } else {
141             if( hasIntercept ){
142                 final double fact1 = 1.0 + n;
143                 final double fact2 = n / (1.0 + n);
144                 final double dx = x - xbar;
145                 final double dy = y - ybar;
146                 sumXX += dx * dx * fact2;
147                 sumYY += dy * dy * fact2;
148                 sumXY += dx * dy * fact2;
149                 xbar += dx / fact1;
150                 ybar += dy / fact1;
151             }
152          }
153         if( !hasIntercept ){
154             sumXX += x * x ;
155             sumYY += y * y ;
156             sumXY += x * y ;
157         }
158         sumX += x;
159         sumY += y;
160         n++;
161     }
162 
163     /**
164      * Appends data from another regression calculation to this one.
165      *
166      * <p>The mean update formulae are based on a paper written by Philippe
167      * P&eacute;bay:
168      * <a
169      * href="http://prod.sandia.gov/techlib/access-control.cgi/2008/086212.pdf">
170      * Formulas for Robust, One-Pass Parallel Computation of Covariances and
171      * Arbitrary-Order Statistical Moments</a>, 2008, Technical Report
172      * SAND2008-6212, Sandia National Laboratories.</p>
173      *
174      * @param reg model to append data from
175      */
176     public void append(SimpleRegression reg) {
177         if (n == 0) {
178             xbar = reg.xbar;
179             ybar = reg.ybar;
180             sumXX = reg.sumXX;
181             sumYY = reg.sumYY;
182             sumXY = reg.sumXY;
183         } else {
184             if (hasIntercept) {
185                 final double fact1 = reg.n / (double) (reg.n + n);
186                 final double fact2 = n * reg.n / (double) (reg.n + n);
187                 final double dx = reg.xbar - xbar;
188                 final double dy = reg.ybar - ybar;
189                 sumXX += reg.sumXX + dx * dx * fact2;
190                 sumYY += reg.sumYY + dy * dy * fact2;
191                 sumXY += reg.sumXY + dx * dy * fact2;
192                 xbar += dx * fact1;
193                 ybar += dy * fact1;
194             }else{
195                 sumXX += reg.sumXX;
196                 sumYY += reg.sumYY;
197                 sumXY += reg.sumXY;
198             }
199         }
200         sumX += reg.sumX;
201         sumY += reg.sumY;
202         n += reg.n;
203     }
204 
205     /**
206      * Removes the observation (x,y) from the regression data set.
207      * <p>
208      * Mirrors the addData method.  This method permits the use of
209      * SimpleRegression instances in streaming mode where the regression
210      * is applied to a sliding "window" of observations, however the caller is
211      * responsible for maintaining the set of observations in the window.</p>
212      *
213      * The method has no effect if there are no points of data (i.e. n=0)
214      *
215      * @param x independent variable value
216      * @param y dependent variable value
217      */
218     public void removeData(final double x,final double y) {
219         if (n > 0) {
220             if (hasIntercept) {
221                 final double fact1 = n - 1.0;
222                 final double fact2 = n / (n - 1.0);
223                 final double dx = x - xbar;
224                 final double dy = y - ybar;
225                 sumXX -= dx * dx * fact2;
226                 sumYY -= dy * dy * fact2;
227                 sumXY -= dx * dy * fact2;
228                 xbar -= dx / fact1;
229                 ybar -= dy / fact1;
230             } else {
231                 final double fact1 = n - 1.0;
232                 sumXX -= x * x;
233                 sumYY -= y * y;
234                 sumXY -= x * y;
235                 xbar -= x / fact1;
236                 ybar -= y / fact1;
237             }
238              sumX -= x;
239              sumY -= y;
240              n--;
241         }
242     }
243 
244     /**
245      * Adds the observations represented by the elements in
246      * <code>data</code>.
247      * <p>
248      * <code>(data[0][0],data[0][1])</code> will be the first observation, then
249      * <code>(data[1][0],data[1][1])</code>, etc.</p>
250      * <p>
251      * This method does not replace data that has already been added.  The
252      * observations represented by <code>data</code> are added to the existing
253      * dataset.</p>
254      * <p>
255      * To replace all data, use <code>clear()</code> before adding the new
256      * data.</p>
257      *
258      * @param data array of observations to be added
259      * @throws MathIllegalArgumentException if the length of {@code data[i]} is not
260      * greater than or equal to 2
261      */
262     public void addData(final double[][] data) throws MathIllegalArgumentException {
263         for (int i = 0; i < data.length; i++) {
264             if( data[i].length < 2 ){
265                throw new MathIllegalArgumentException(LocalizedStatFormats.INVALID_REGRESSION_OBSERVATION,
266                     data[i].length, 2);
267             }
268             addData(data[i][0], data[i][1]);
269         }
270     }
271 
272     /**
273      * Adds one observation to the regression model.
274      *
275      * @param x the independent variables which form the design matrix
276      * @param y the dependent or response variable
277      * @throws MathIllegalArgumentException if the length of {@code x} does not equal
278      * the number of independent variables in the model
279      */
280     @Override
281     public void addObservation(final double[] x,final double y)
282             throws MathIllegalArgumentException {
283         if( x == null || x.length == 0 ){
284             throw new MathIllegalArgumentException(LocalizedStatFormats.INVALID_REGRESSION_OBSERVATION,x!=null?x.length:0, 1);
285         }
286         addData( x[0], y );
287     }
288 
289     /**
290      * Adds a series of observations to the regression model. The lengths of
291      * x and y must be the same and x must be rectangular.
292      *
293      * @param x a series of observations on the independent variables
294      * @param y a series of observations on the dependent variable
295      * The length of x and y must be the same
296      * @throws MathIllegalArgumentException if {@code x} is not rectangular, does not match
297      * the length of {@code y} or does not contain sufficient data to estimate the model
298      */
299     @Override
300     public void addObservations(final double[][] x,final double[] y) throws MathIllegalArgumentException {
301         MathUtils.checkNotNull(x, LocalizedCoreFormats.INPUT_ARRAY);
302         MathUtils.checkNotNull(y, LocalizedCoreFormats.INPUT_ARRAY);
303         MathUtils.checkDimension(x.length, y.length);
304         boolean obsOk = true;
305         for( int i = 0 ; i < x.length; i++){
306             if( x[i] == null || x[i].length == 0 ){
307                 obsOk = false;
308             }
309         }
310         if( !obsOk ){
311             throw new MathIllegalArgumentException(
312                   LocalizedStatFormats.NOT_ENOUGH_DATA_FOR_NUMBER_OF_PREDICTORS,
313                   0, 1);
314         }
315         for( int i = 0 ; i < x.length ; i++){
316             addData( x[i][0], y[i] );
317         }
318     }
319 
320     /**
321      * Removes observations represented by the elements in <code>data</code>.
322       * <p>
323      * If the array is larger than the current n, only the first n elements are
324      * processed.  This method permits the use of SimpleRegression instances in
325      * streaming mode where the regression is applied to a sliding "window" of
326      * observations, however the caller is responsible for maintaining the set
327      * of observations in the window.</p>
328      * <p>
329      * To remove all data, use <code>clear()</code>.</p>
330      *
331      * @param data array of observations to be removed
332      */
333     public void removeData(double[][] data) {
334         for (int i = 0; i < data.length && n > 0; i++) {
335             removeData(data[i][0], data[i][1]);
336         }
337     }
338 
339     /**
340      * Clears all data from the model.
341      */
342     @Override
343     public void clear() {
344         sumX = 0d;
345         sumXX = 0d;
346         sumY = 0d;
347         sumYY = 0d;
348         sumXY = 0d;
349         n = 0;
350     }
351 
352     /**
353      * Returns the number of observations that have been added to the model.
354      *
355      * @return n number of observations that have been added.
356      */
357     @Override
358     public long getN() {
359         return n;
360     }
361 
362     /**
363      * Returns the "predicted" <code>y</code> value associated with the
364      * supplied <code>x</code> value,  based on the data that has been
365      * added to the model when this method is activated.
366      * <p>
367      * <code> predict(x) = intercept + slope * x </code></p>
368      * <p>* <strong>Preconditions</strong>:</p>
369      * <ul>
370      * <li>At least two observations (with at least two different x values)
371      * must have been added before invoking this method. If this method is
372      * invoked before a model can be estimated, <code>Double,NaN</code> is
373      * returned.
374      * </li></ul>
375      *
376      * @param x input <code>x</code> value
377      * @return predicted <code>y</code> value
378      */
379     public double predict(final double x) {
380         final double b1 = getSlope();
381         if (hasIntercept) {
382             return getIntercept(b1) + b1 * x;
383         }
384         return b1 * x;
385     }
386 
387     /**
388      * Returns the intercept of the estimated regression line, if
389      * {@link #hasIntercept()} is true; otherwise 0.
390      * <p>
391      * The least squares estimate of the intercept is computed using the
392      * <a href="http://www.xycoon.com/estimation4.htm">normal equations</a>.
393      * The intercept is sometimes denoted b0.</p>
394      * <p><strong>Preconditions</strong>:</p>
395      * <ul>
396      * <li>At least two observations (with at least two different x values)
397      * must have been added before invoking this method. If this method is
398      * invoked before a model can be estimated, <code>Double,NaN</code> is
399      * returned.
400      * </li></ul>
401      *
402      * @return the intercept of the regression line if the model includes an
403      * intercept; 0 otherwise
404      * @see #SimpleRegression(boolean)
405      */
406     public double getIntercept() {
407         return hasIntercept ? getIntercept(getSlope()) : 0.0;
408     }
409 
410     /**
411      * Returns true if the model includes an intercept term.
412      *
413      * @return true if the regression includes an intercept; false otherwise
414      * @see #SimpleRegression(boolean)
415      */
416     @Override
417     public boolean hasIntercept() {
418         return hasIntercept;
419     }
420 
421     /**
422     * Returns the slope of the estimated regression line.
423     * <p>
424     * The least squares estimate of the slope is computed using the
425     * <a href="http://www.xycoon.com/estimation4.htm">normal equations</a>.
426     * The slope is sometimes denoted b1.</p>
427     * <p>* <strong>Preconditions</strong>:</p>
428     * <ul>
429     * <li>At least two observations (with at least two different x values)
430     * must have been added before invoking this method. If this method is
431     * invoked before a model can be estimated, <code>Double.NaN</code> is
432     * returned.
433     * </li></ul>
434     *
435     * @return the slope of the regression line
436     */
437     public double getSlope() {
438         if (n < 2) {
439             return Double.NaN; //not enough data
440         }
441         if (FastMath.abs(sumXX) < 10 * Double.MIN_VALUE) {
442             return Double.NaN; //not enough variation in x
443         }
444         return sumXY / sumXX;
445     }
446 
447     /**
448      * Returns the <a href="http://www.xycoon.com/SumOfSquares.htm">
449      * sum of squared errors</a> (SSE) associated with the regression
450      * model.
451      * <p>
452      * The sum is computed using the computational formula</p>
453      * <p>
454      * <code>SSE = SYY - (SXY * SXY / SXX)</code></p>
455      * <p>
456      * where <code>SYY</code> is the sum of the squared deviations of the y
457      * values about their mean, <code>SXX</code> is similarly defined and
458      * <code>SXY</code> is the sum of the products of x and y mean deviations.
459      * </p><p>
460      * The sums are accumulated using the updating algorithm referenced in
461      * {@link #addData}.</p>
462      * <p>
463      * The return value is constrained to be non-negative - i.e., if due to
464      * rounding errors the computational formula returns a negative result,
465      * 0 is returned.</p>
466      * <p>* <strong>Preconditions</strong>:</p>
467      * <ul>
468      * <li>At least two observations (with at least two different x values)
469      * must have been added before invoking this method. If this method is
470      * invoked before a model can be estimated, <code>Double,NaN</code> is
471      * returned.
472      * </li></ul>
473      *
474      * @return sum of squared errors associated with the regression model
475      */
476     public double getSumSquaredErrors() {
477         return FastMath.max(0d, sumYY - sumXY * sumXY / sumXX);
478     }
479 
480     /**
481      * Returns the sum of squared deviations of the y values about their mean.
482      * <p>
483      * This is defined as SSTO
484      * <a href="http://www.xycoon.com/SumOfSquares.htm">here</a>.</p>
485      * <p>
486      * If {@code n < 2}, this returns <code>Double.NaN</code>.</p>
487      *
488      * @return sum of squared deviations of y values
489      */
490     public double getTotalSumSquares() {
491         if (n < 2) {
492             return Double.NaN;
493         }
494         return sumYY;
495     }
496 
497     /**
498      * Returns the sum of squared deviations of the x values about their mean.
499      *
500      * <p>If {@code n < 2}, this returns <code>Double.NaN</code>.</p>
501      *
502      * @return sum of squared deviations of x values
503      */
504     public double getXSumSquares() {
505         if (n < 2) {
506             return Double.NaN;
507         }
508         return sumXX;
509     }
510 
511     /**
512      * Returns the sum of crossproducts, x<sub>i</sub>*y<sub>i</sub>.
513      *
514      * @return sum of cross products
515      */
516     public double getSumOfCrossProducts() {
517         return sumXY;
518     }
519 
520     /**
521      * Returns the sum of squared deviations of the predicted y values about
522      * their mean (which equals the mean of y).
523      * <p>
524      * This is usually abbreviated SSR or SSM.  It is defined as SSM
525      * <a href="http://www.xycoon.com/SumOfSquares.htm">here</a></p>
526      * <p>* <strong>Preconditions</strong>:</p>
527      * <ul>
528      * <li>At least two observations (with at least two different x values)
529      * must have been added before invoking this method. If this method is
530      * invoked before a model can be estimated, <code>Double.NaN</code> is
531      * returned.
532      * </li></ul>
533      *
534      * @return sum of squared deviations of predicted y values
535      */
536     public double getRegressionSumSquares() {
537         return getRegressionSumSquares(getSlope());
538     }
539 
540     /**
541      * Returns the sum of squared errors divided by the degrees of freedom,
542      * usually abbreviated MSE.
543      * <p>
544      * If there are fewer than <strong>three</strong> data pairs in the model,
545      * or if there is no variation in <code>x</code>, this returns
546      * <code>Double.NaN</code>.</p>
547      *
548      * @return sum of squared deviations of y values
549      */
550     public double getMeanSquareError() {
551         if (n < 3) {
552             return Double.NaN;
553         }
554         return hasIntercept ? (getSumSquaredErrors() / (n - 2)) : (getSumSquaredErrors() / (n - 1));
555     }
556 
557     /**
558      * Returns <a href="http://mathworld.wolfram.com/CorrelationCoefficient.html">
559      * Pearson's product moment correlation coefficient</a>,
560      * usually denoted r.
561      * <p>* <strong>Preconditions</strong>:</p>
562      * <ul>
563      * <li>At least two observations (with at least two different x values)
564      * must have been added before invoking this method. If this method is
565      * invoked before a model can be estimated, <code>Double,NaN</code> is
566      * returned.
567      * </li></ul>
568      *
569      * @return Pearson's r
570      */
571     public double getR() {
572         double b1 = getSlope();
573         double result = FastMath.sqrt(getRSquare());
574         if (b1 < 0) {
575             result = -result;
576         }
577         return result;
578     }
579 
580     /**
581      * Returns the <a href="http://www.xycoon.com/coefficient1.htm">
582      * coefficient of determination</a>,
583      * usually denoted r-square.
584      * <p>* <strong>Preconditions</strong>:</p>
585      * <ul>
586      * <li>At least two observations (with at least two different x values)
587      * must have been added before invoking this method. If this method is
588      * invoked before a model can be estimated, <code>Double,NaN</code> is
589      * returned.
590      * </li></ul>
591      *
592      * @return r-square
593      */
594     public double getRSquare() {
595         double ssto = getTotalSumSquares();
596         return (ssto - getSumSquaredErrors()) / ssto;
597     }
598 
599     /**
600      * Returns the <a href="http://www.xycoon.com/standarderrorb0.htm">
601      * standard error of the intercept estimate</a>,
602      * usually denoted s(b0).
603      * <p>
604      * If there are fewer that <strong>three</strong> observations in the
605      * model, or if there is no variation in x, this returns
606      * <code>Double.NaN</code>.</p> Additionally, a <code>Double.NaN</code> is
607      * returned when the intercept is constrained to be zero
608      *
609      * @return standard error associated with intercept estimate
610      */
611     public double getInterceptStdErr() {
612         if( !hasIntercept ){
613             return Double.NaN;
614         }
615         return FastMath.sqrt(
616             getMeanSquareError() * ((1d / n) + (xbar * xbar) / sumXX));
617     }
618 
619     /**
620      * Returns the <a href="http://www.xycoon.com/standerrorb(1).htm">standard
621      * error of the slope estimate</a>,
622      * usually denoted s(b1).
623      * <p>
624      * If there are fewer that <strong>three</strong> data pairs in the model,
625      * or if there is no variation in x, this returns <code>Double.NaN</code>.
626      * </p>
627      *
628      * @return standard error associated with slope estimate
629      */
630     public double getSlopeStdErr() {
631         return FastMath.sqrt(getMeanSquareError() / sumXX);
632     }
633 
634     /**
635      * Returns the half-width of a 95% confidence interval for the slope
636      * estimate.
637      * <p>
638      * The 95% confidence interval is</p>
639      * <p>
640      * <code>(getSlope() - getSlopeConfidenceInterval(),
641      * getSlope() + getSlopeConfidenceInterval())</code></p>
642      * <p>
643      * If there are fewer that <strong>three</strong> observations in the
644      * model, or if there is no variation in x, this returns
645      * <code>Double.NaN</code>.</p>
646      * <p>* <strong>Usage Note</strong>:</p>
647      * <p>
648      * The validity of this statistic depends on the assumption that the
649      * observations included in the model are drawn from a
650      * <a href="http://mathworld.wolfram.com/BivariateNormalDistribution.html">
651      * Bivariate Normal Distribution</a>.</p>
652      *
653      * @return half-width of 95% confidence interval for the slope estimate
654      * @throws MathIllegalArgumentException if the confidence interval can not be computed.
655      */
656     public double getSlopeConfidenceInterval() throws MathIllegalArgumentException {
657         return getSlopeConfidenceInterval(0.05d);
658     }
659 
660     /**
661      * Returns the half-width of a (100-100*alpha)% confidence interval for
662      * the slope estimate.
663      * <p>
664      * The (100-100*alpha)% confidence interval is </p>
665      * <p>
666      * <code>(getSlope() - getSlopeConfidenceInterval(),
667      * getSlope() + getSlopeConfidenceInterval())</code></p>
668      * <p>
669      * To request, for example, a 99% confidence interval, use
670      * <code>alpha = .01</code></p>
671      * <p>
672      * <strong>Usage Note</strong>:<br>
673      * The validity of this statistic depends on the assumption that the
674      * observations included in the model are drawn from a
675      * <a href="http://mathworld.wolfram.com/BivariateNormalDistribution.html">
676      * Bivariate Normal Distribution</a>.</p>
677      * <p>
678      * <strong> Preconditions:</strong></p><ul>
679      * <li>If there are fewer that <strong>three</strong> observations in the
680      * model, or if there is no variation in x, this returns
681      * <code>Double.NaN</code>.
682      * </li>
683      * <li>{@code (0 < alpha < 1)}; otherwise an
684      * <code>MathIllegalArgumentException</code> is thrown.
685      * </li></ul>
686      *
687      * @param alpha the desired significance level
688      * @return half-width of 95% confidence interval for the slope estimate
689      * @throws MathIllegalArgumentException if the confidence interval can not be computed.
690      */
691     public double getSlopeConfidenceInterval(final double alpha)
692     throws MathIllegalArgumentException {
693         if (n < 3) {
694             return Double.NaN;
695         }
696         if (alpha >= 1 || alpha <= 0) {
697             throw new MathIllegalArgumentException(LocalizedStatFormats.SIGNIFICANCE_LEVEL,
698                                           alpha, 0, 1);
699         }
700         // No advertised MathIllegalArgumentException here - will return NaN above
701         TDistribution distribution = new TDistribution(n - 2);
702         return getSlopeStdErr() *
703             distribution.inverseCumulativeProbability(1d - alpha / 2d);
704     }
705 
706     /**
707      * Returns the significance level of the slope (equiv) correlation.
708      * <p>
709      * Specifically, the returned value is the smallest <code>alpha</code>
710      * such that the slope confidence interval with significance level
711      * equal to <code>alpha</code> does not include <code>0</code>.
712      * On regression output, this is often denoted <code>Prob(|t| &gt; 0)</code>
713      * </p><p>
714      * <strong>Usage Note</strong>:<br>
715      * The validity of this statistic depends on the assumption that the
716      * observations included in the model are drawn from a
717      * <a href="http://mathworld.wolfram.com/BivariateNormalDistribution.html">
718      * Bivariate Normal Distribution</a>.</p>
719      * <p>
720      * If there are fewer that <strong>three</strong> observations in the
721      * model, or if there is no variation in x, this returns
722      * <code>Double.NaN</code>.</p>
723      *
724      * @return significance level for slope/correlation
725      * @throws org.hipparchus.exception.MathIllegalStateException
726      * if the significance level can not be computed.
727      */
728     public double getSignificance() {
729         if (n < 3) {
730             return Double.NaN;
731         }
732         // No advertised MathIllegalArgumentException here - will return NaN above
733         TDistribution distribution = new TDistribution(n - 2);
734         return 2d * (1.0 - distribution.cumulativeProbability(
735                     FastMath.abs(getSlope()) / getSlopeStdErr()));
736     }
737 
738     // ---------------------Private methods-----------------------------------
739 
740     /**
741     * Returns the intercept of the estimated regression line, given the slope.
742     * <p>
743     * Will return <code>NaN</code> if slope is <code>NaN</code>.</p>
744     *
745     * @param slope current slope
746     * @return the intercept of the regression line
747     */
748     private double getIntercept(final double slope) {
749       if( hasIntercept){
750         return (sumY - slope * sumX) / n;
751       }
752       return 0.0;
753     }
754 
755     /**
756      * Computes SSR from b1.
757      *
758      * @param slope regression slope estimate
759      * @return sum of squared deviations of predicted y values
760      */
761     private double getRegressionSumSquares(final double slope) {
762         return slope * slope * sumXX;
763     }
764 
765     /**
766      * Performs a regression on data present in buffers and outputs a RegressionResults object.
767      *
768      * <p>If there are fewer than 3 observations in the model and {@code hasIntercept} is true
769      * a {@code MathIllegalArgumentException} is thrown.  If there is no intercept term, the model must
770      * contain at least 2 observations.</p>
771      *
772      * @return RegressionResults acts as a container of regression output
773      * @throws MathIllegalArgumentException if the model is not correctly specified
774      * @throws MathIllegalArgumentException if there is not sufficient data in the model to
775      * estimate the regression parameters
776      */
777     @Override
778     public RegressionResults regress() throws MathIllegalArgumentException {
779         if (hasIntercept) {
780             if (n < 3) {
781                 throw new MathIllegalArgumentException(LocalizedStatFormats.NOT_ENOUGH_DATA_REGRESSION);
782             }
783             if (FastMath.abs(sumXX) > Precision.SAFE_MIN) {
784                 final double[] params = { getIntercept(), getSlope() };
785                 final double mse = getMeanSquareError();
786                 final double _syy = sumYY + sumY * sumY / n;
787                 final double[] vcv = { mse * (xbar * xbar / sumXX + 1.0 / n), -xbar * mse / sumXX, mse / sumXX };
788                 return new RegressionResults(params, new double[][] { vcv }, true, n, 2, sumY, _syy, getSumSquaredErrors(), true,
789                         false);
790             } else {
791                 final double[] params = { sumY / n, Double.NaN };
792                 // final double mse = getMeanSquareError();
793                 final double[] vcv = { ybar / (n - 1.0), Double.NaN, Double.NaN };
794                 return new RegressionResults(params, new double[][] { vcv }, true, n, 1, sumY, sumYY, getSumSquaredErrors(), true,
795                         false);
796             }
797         } else {
798             if (n < 2) {
799                 throw new MathIllegalArgumentException(LocalizedStatFormats.NOT_ENOUGH_DATA_REGRESSION);
800             }
801             if (!Double.isNaN(sumXX)) {
802                 final double[] vcv = { getMeanSquareError() / sumXX };
803                 final double[] params = { sumXY / sumXX };
804                 return new RegressionResults(params, new double[][] { vcv }, true, n, 1, sumY, sumYY, getSumSquaredErrors(), false,
805                         false);
806             } else {
807                 final double[] vcv = { Double.NaN };
808                 final double[] params = { Double.NaN };
809                 return new RegressionResults(params, new double[][] { vcv }, true, n, 1, Double.NaN, Double.NaN, Double.NaN, false,
810                         false);
811             }
812         }
813     }
814 
815     /**
816      * Performs a regression on data present in buffers including only regressors
817      * indexed in variablesToInclude and outputs a RegressionResults object
818      * @param variablesToInclude an array of indices of regressors to include
819      * @return RegressionResults acts as a container of regression output
820      * @throws MathIllegalArgumentException if the variablesToInclude array is null or zero length
821      * @throws MathIllegalArgumentException if a requested variable is not present in model
822      */
823     @Override
824     public RegressionResults regress(int[] variablesToInclude) throws MathIllegalArgumentException {
825         if (variablesToInclude == null || variablesToInclude.length == 0) {
826           throw new MathIllegalArgumentException(LocalizedCoreFormats.ARRAY_ZERO_LENGTH_OR_NULL_NOT_ALLOWED);
827         }
828         if (variablesToInclude.length > 2 || (variablesToInclude.length > 1 && !hasIntercept)) {
829             throw new MathIllegalArgumentException(
830                     LocalizedCoreFormats.ARRAY_SIZE_EXCEEDS_MAX_VARIABLES,
831                     (variablesToInclude.length > 1 && !hasIntercept) ? 1 : 2);
832         }
833 
834         if (hasIntercept) {
835             if (variablesToInclude.length == 2) {
836                 if (variablesToInclude[0] == 1) {
837                     throw new MathIllegalArgumentException(LocalizedCoreFormats.NOT_INCREASING_SEQUENCE);
838                 } else if (variablesToInclude[0] != 0) {
839                     throw new MathIllegalArgumentException(LocalizedCoreFormats.OUT_OF_RANGE_SIMPLE,
840                                                            variablesToInclude[0], 0, 1);
841                 }
842                 if (variablesToInclude[1] != 1) {
843                      throw new MathIllegalArgumentException(LocalizedCoreFormats.OUT_OF_RANGE_SIMPLE,
844                                                             variablesToInclude[0], 0, 1);
845                 }
846                 return regress();
847             }else{
848                 if( variablesToInclude[0] != 1 && variablesToInclude[0] != 0 ) {
849                      throw new MathIllegalArgumentException(LocalizedCoreFormats.OUT_OF_RANGE_SIMPLE,
850                                                             variablesToInclude[0], 0, 1);
851                 }
852                 final double _mean = sumY * sumY / n;
853                 final double _syy = sumYY + _mean;
854                 if( variablesToInclude[0] == 0 ){
855                     //just the mean
856                     final double[] vcv = { sumYY/(((n-1)*n)) };
857                     final double[] params = { ybar };
858                     return new RegressionResults(
859                       params, new double[][] {vcv}, true, n, 1,
860                       sumY, _syy+_mean, sumYY,true,false);
861 
862                 }else if( variablesToInclude[0] == 1){
863                     //final double _syy = sumYY + sumY * sumY / ((double) n);
864                     final double _sxx = sumXX + sumX * sumX / n;
865                     final double _sxy = sumXY + sumX * sumY / n;
866                     final double _sse = FastMath.max(0d, _syy - _sxy * _sxy / _sxx);
867                     final double _mse = _sse/((n-1));
868                     if( !Double.isNaN(_sxx) ){
869                         final double[] vcv = { _mse / _sxx };
870                         final double[] params = { _sxy/_sxx };
871                         return new RegressionResults(
872                                     params, new double[][] {vcv}, true, n, 1,
873                                     sumY, _syy, _sse,false,false);
874                     }else{
875                         final double[] vcv = {Double.NaN };
876                         final double[] params = { Double.NaN };
877                         return new RegressionResults(
878                                     params, new double[][] {vcv}, true, n, 1,
879                                     Double.NaN, Double.NaN, Double.NaN,false,false);
880                     }
881                 }
882             }
883         } else {
884             if (variablesToInclude[0] != 0) {
885                 throw new MathIllegalArgumentException(LocalizedCoreFormats.OUT_OF_RANGE_SIMPLE,
886                                                        variablesToInclude[0], 0, 0);
887             }
888             return regress();
889         }
890 
891         return null;
892     }
893 }