I found that the regress function runs out of memory when given
moderately large problems, such as

y = randn(1E5, 1);
X = randn(1E5, 2);
[b, bint, r, rint, stats] = regress(y, X);

This is because a large (1E5 x 1E5) intermediate matrix, the hat matrix
H = X * pinv(X), is computed; at 8 bytes per double that alone is about
80 GB. A slight modification of the code, given below, avoids forming
this matrix and lets problems in this size range be solved quickly.
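
In outline, the workaround keeps only the p x n pseudoinverse obtained
from an economy QR factorization and reads the leverages off it row by
row. A minimal sketch (variable names match the patched code below):

[Xq, Xr] = qr (X, 0);       # economy QR: Xq is n x p, Xr is p x p
pinv_X = Xr \ Xq';          # p x n pseudoinverse of X
b = pinv_X * y;             # regression coefficients
r = y - X * b;              # residuals, without any n x n matrix
h = sum (X .* pinv_X', 2);  # leverages, equal to diag (X * pinv_X)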

Nir




## Copyright (C) 2005, 2006 William Poetra Yoga Hadisoeseno
## Nir Krakauer, 2011: Modified to avoid computing X * pinv(X), which
## will lead to running out of memory when the number of data points is
## around the square root of the memory size
##
## This program is free software; you can redistribute it and/or modify it
## under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2, or (at your option)
## any later version.
##
## This software is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with this software; see the file COPYING.  If not, see
## <http://www.gnu.org/licenses/>.

## -*- texinfo -*-
## @deftypefn {Function File} {[@var{b}, @var{bint}, @var{r}, @var{rint}, @var{stats}] =} regress (@var{y}, @var{X}, [@var{alpha}])
## Multiple Linear Regression using Least Squares Fit of @var{y} on @var{X}
## with the model @code{y = X * beta + e}.
##
## Here,
##
## @itemize
## @item
## @code{y} is a column vector of observed values
## @item
## @code{X} is a matrix of regressors, with the first column filled with
## the constant value 1
## @item
## @code{beta} is a column vector of regression parameters
## @item
## @code{e} is a column vector of random errors
## @end itemize
##
## Arguments are
##
## @itemize
## @item
## @var{y} is the @code{y} in the model
## @item
## @var{X} is the @code{X} in the model
## @item
## @var{alpha} is the significance level used to calculate the confidence
## intervals @var{bint} and @var{rint} (see `Return values' below).  If not
## specified, @var{alpha} defaults to 0.05
## @end itemize
##
## Return values are
##
## @itemize
## @item
## @var{b} is the @code{beta} in the model
## @item
## @var{bint} is the confidence interval for @var{b}
## @item
## @var{r} is a column vector of residuals
## @item
## @var{rint} is the confidence interval for @var{r}
## @item
## @var{stats} is a row vector containing:
##
##   @itemize
##   @item The R^2 statistic
##   @item The F statistic
##   @item The p value for the full model
##   @item The estimated error variance
##   @end itemize
## @end itemize
##
## @var{r} and @var{rint} can be passed to @code{rcoplot} to visualize
## the residual intervals and identify outliers.
##
## NaN values in @var{y} and @var{X} are removed before calculation begins.
##
## @end deftypefn
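
## A usage sketch (synthetic data, so exact outputs vary; the true
## coefficient vector [1; 2; 3] is illustrative):
##
##   n = 100;
##   X = [ones(n,1), randn(n,2)];            # first column is the constant
##   y = X * [1; 2; 3] + 0.1 * randn (n, 1); # observations with noise
##   [b, bint, r, rint, stats] = regress (y, X);
##   # b approximates [1; 2; 3]; stats = [R^2, F, p-value, error variance]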

## References:
## - Matlab 7.0 documentation (pdf)
## - 《大学数学实验》 (University Mathematics Experiments), Jiang Qiyuan et al. (textbook)
## - http://www.netnam.vn/unescocourse/statistics/12_5.htm
## - wsolve.m in octave-forge
## - http://www.stanford.edu/class/ee263/ls_ln_matlab.pdf

function [b, bint, r, rint, stats] = regress (y, X, alpha)

  if (nargin < 2 || nargin > 3)
    usage ("[b, bint, r, rint] = regress (y, X, [alpha])");
  endif

  if (! ismatrix (y) || ! isnumeric (y))
    error ("regress: y must be a numeric matrix");
  endif
  if (! ismatrix (X) || ! isnumeric (X))
    error ("regress: X must be a numeric matrix");
  endif

  if (columns (y) != 1)
    error ("regress: y must be a column vector");
  endif

  if (rows (y) != rows (X))
    error ("regress: y and X must contain the same number of rows");
  endif

  if (nargin < 3)
    alpha = 0.05;
  elseif (! isscalar (alpha))
    error ("regress: alpha must be a scalar value")
  endif

  notnans = ! any (isnan ([y X]), 2);
  y = y(notnans);
  X = X(notnans,:);

  # economy QR: X = Xq * Xr, where Xq has orthonormal columns,
  # so pinv (X) = Xr \ Xq' without forming any n x n matrix
  [Xq, Xr] = qr (X, 0);
  pinv_X = Xr \ Xq';

  b = pinv_X * y;

  if (nargout > 1)

    n = rows (X);
    p = columns (X);
    dof = n - p;
    t_alpha_2 = tinv (alpha / 2, dof);
#    H = X * pinv_X; #commented out -- Nir

#    r = (eye (n) - H) * y; #commented out -- Nir
    r = y - X * b; #added -- Nir
    SSE = sum (r .^ 2);
    v = SSE / dof;

    # c = diag (inv (X' * X)) using the (economy) QR decomposition:
    # X' * X == Xr' * (Xq' * Xq) * Xr == Xr' * Xr, since Xq has
    # orthonormal columns, so only Xr is needed
    c = diag (inv (Xr' * Xr));

    # t_alpha_2 is negative, so b + db is the lower confidence bound
    db = t_alpha_2 * sqrt (v * c);

    bint = [b + db, b - db];

  endif

  if (nargout > 3)

    dof1 = n - p - 1;
#    h = diag (H); #commented out -- Nir
    h = sum (X .* pinv_X', 2); #added -- Nir (same as diag (X * pinv_X),
                               #without doing the matrix multiply)

    # From Matlab's documentation on Multiple Linear Regression,
    #   sigmaihat2 = norm (r) ^ 2 / dof1 - r .^ 2 / (dof1 * (1 - h));
    #   dr = -tinv (1 - alpha / 2, dof) * sqrt (sigmaihat2 .* (1 - h));
    # Substitute
    #   norm (r) ^ 2 == sum (r .^ 2) == SSE
    #   -tinv (1 - alpha / 2, dof) == tinv (alpha / 2, dof) == t_alpha_2
    # We get
    #   sigmaihat2 = (SSE - r .^ 2 / (1 - h)) / dof1;
    #   dr = t_alpha_2 * sqrt (sigmaihat2 .* (1 - h));
    # Combining, we get
    #   dr = t_alpha_2 * sqrt ((SSE * (1 - h) - (r .^ 2)) / dof1);

    dr = t_alpha_2 * sqrt ((SSE * (1 - h) - (r .^ 2)) / dof1);

    rint = [r + dr, r - dr];

  endif

  if (nargout > 4)

    R2 = 1 - SSE / sum ((y - mean (y)) .^ 2);
#    F = (R2 / (p - 1)) / ((1 - R2) / dof);
    F = dof / (p - 1) / (1 / R2 - 1); # algebraically equivalent to the above
    pval = 1 - fcdf (F, p - 1, dof);

    stats = [R2 F pval v];

  endif

endfunction
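
%!demo
%! ## straight-line fit with an intercept; the data is synthetic, so the
%! ## recovered coefficients are only approximately [2; 3]
%! x = (1:50)';
%! y = 2 + 3 * x + randn (50, 1);
%! [b, bint] = regress (y, [ones(size(x)), x]);
%! b, bint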


%!test
%! % Longley data from the NIST Statistical Reference Dataset
%! Z = [  60323    83.0   234289   2356     1590    107608  1947
%!        61122    88.5   259426   2325     1456    108632  1948
%!        60171    88.2   258054   3682     1616    109773  1949
%!        61187    89.5   284599   3351     1650    110929  1950
%!        63221    96.2   328975   2099     3099    112075  1951
%!        63639    98.1   346999   1932     3594    113270  1952
%!        64989    99.0   365385   1870     3547    115094  1953
%!        63761   100.0   363112   3578     3350    116219  1954
%!        66019   101.2   397469   2904     3048    117388  1955
%!        67857   104.6   419180   2822     2857    118734  1956
%!        68169   108.4   442769   2936     2798    120445  1957
%!        66513   110.8   444546   4681     2637    121950  1958
%!        68655   112.6   482704   3813     2552    123366  1959
%!        69564   114.2   502601   3931     2514    125368  1960
%!        69331   115.7   518173   4806     2572    127852  1961
%!        70551   116.9   554894   4007     2827    130081  1962 ];
%! % Results certified by NIST using 500 digit arithmetic
%! % b and standard error in b
%! V = [  -3482258.63459582         890420.383607373
%!         15.0618722713733         84.9149257747669
%!        -0.358191792925910E-01    0.334910077722432E-01
%!        -2.02022980381683         0.488399681651699
%!        -1.03322686717359         0.214274163161675
%!        -0.511041056535807E-01    0.226073200069370
%!         1829.15146461355         455.478499142212 ];
%! Rsq = 0.995479004577296;
%! F = 330.285339234588;
%! y = Z(:,1); X = [ones(rows(Z),1), Z(:,2:end)];
%! alpha = 0.05;
%! [b, bint, r, rint, stats] = regress (y, X, alpha);
%! assert(b,V(:,1),3e-6);
%! assert(stats(1),Rsq,1e-12);
%! assert(stats(2),F,3e-8);
%! assert(((bint(:,1)-bint(:,2))/2)/tinv(alpha/2,9),V(:,2),-1.e-5);
