Blob Blame History Raw
/* multirobust.c
 * 
 * Copyright (C) 2013 Patrick Alken
 * 
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3 of the License, or (at
 * your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 *
 * This module contains routines related to robust linear least squares. The
 * algorithm used closely follows the publications:
 *
 * [1] DuMouchel, W. and F. O'Brien (1989), "Integrating a robust
 * option into a multiple regression computing environment,"
 * Computer Science and Statistics:  Proceedings of the 21st
 * Symposium on the Interface, American Statistical Association
 *
 * [2] Street, J.O., R.J. Carroll, and D. Ruppert (1988), "A note on
 * computing robust regression estimates via iteratively
 * reweighted least squares," The American Statistician, v. 42, 
 * pp. 152-154.
 */

#include <config.h>
#include <gsl/gsl_math.h>
#include <gsl/gsl_multifit.h>
#include <gsl/gsl_vector.h>
#include <gsl/gsl_matrix.h>
#include <gsl/gsl_statistics.h>
#include <gsl/gsl_sort.h>
#include <gsl/gsl_sort_vector.h>
#include <gsl/gsl_blas.h>
#include <gsl/gsl_linalg.h>

static int robust_test_convergence(const gsl_vector *c_prev, const gsl_vector *c,
                                   const double tol);
static double robust_madsigma(const gsl_vector *r, const size_t p, gsl_vector *workn);
static double robust_robsigma(const gsl_vector *r, const double s,
                              const double tune, gsl_multifit_robust_workspace *w);
static double robust_sigma(const double s_ols, const double s_rob,
                           gsl_multifit_robust_workspace *w);
static int robust_covariance(const double sigma, gsl_matrix *cov,
                             gsl_multifit_robust_workspace *w);

/*
gsl_multifit_robust_alloc
  Allocate a robust workspace

Inputs: T - robust weighting algorithm
        n - number of observations
        p - number of model parameters

Return: pointer to workspace
*/

gsl_multifit_robust_workspace *
gsl_multifit_robust_alloc(const gsl_multifit_robust_type *T,
                          const size_t n, const size_t p)
{
  gsl_multifit_robust_workspace *w;

  if (n < p)
    {
      GSL_ERROR_VAL("observations n must be >= p", GSL_EINVAL, 0);
    }

  w = calloc(1, sizeof(gsl_multifit_robust_workspace));
  if (w == 0)
    {
      GSL_ERROR_VAL("failed to allocate space for multifit_robust struct",
                    GSL_ENOMEM, 0);
    }

  w->n = n;
  w->p = p;
  w->type = T;
  w->maxiter = 100; /* maximum iterations */
  w->tune = w->type->tuning_default;

  w->multifit_p = gsl_multifit_linear_alloc(n, p);
  if (w->multifit_p == 0)
    {
      gsl_multifit_robust_free(w);
      GSL_ERROR_VAL("failed to allocate space for multifit_linear struct",
                    GSL_ENOMEM, 0);
    }

  w->r = gsl_vector_alloc(n);
  if (w->r == 0)
    {
      gsl_multifit_robust_free(w);
      GSL_ERROR_VAL("failed to allocate space for residuals",
                    GSL_ENOMEM, 0);
    }

  w->weights = gsl_vector_alloc(n);
  if (w->weights == 0)
    {
      gsl_multifit_robust_free(w);
      GSL_ERROR_VAL("failed to allocate space for weights", GSL_ENOMEM, 0);
    }

  w->c_prev = gsl_vector_alloc(p);
  if (w->c_prev == 0)
    {
      gsl_multifit_robust_free(w);
      GSL_ERROR_VAL("failed to allocate space for c_prev", GSL_ENOMEM, 0);
    }

  w->resfac = gsl_vector_alloc(n);
  if (w->resfac == 0)
    {
      gsl_multifit_robust_free(w);
      GSL_ERROR_VAL("failed to allocate space for residual factors",
                    GSL_ENOMEM, 0);
    }

  w->psi = gsl_vector_alloc(n);
  if (w->psi == 0)
    {
      gsl_multifit_robust_free(w);
      GSL_ERROR_VAL("failed to allocate space for psi", GSL_ENOMEM, 0);
    }

  w->dpsi = gsl_vector_alloc(n);
  if (w->dpsi == 0)
    {
      gsl_multifit_robust_free(w);
      GSL_ERROR_VAL("failed to allocate space for dpsi", GSL_ENOMEM, 0);
    }

  w->QSI = gsl_matrix_alloc(p, p);
  if (w->QSI == 0)
    {
      gsl_multifit_robust_free(w);
      GSL_ERROR_VAL("failed to allocate space for QSI", GSL_ENOMEM, 0);
    }

  w->D = gsl_vector_alloc(p);
  if (w->D == 0)
    {
      gsl_multifit_robust_free(w);
      GSL_ERROR_VAL("failed to allocate space for D", GSL_ENOMEM, 0);
    }

  w->workn = gsl_vector_alloc(n);
  if (w->workn == 0)
    {
      gsl_multifit_robust_free(w);
      GSL_ERROR_VAL("failed to allocate space for workn", GSL_ENOMEM, 0);
    }

  w->stats.sigma_ols = 0.0;
  w->stats.sigma_mad = 0.0;
  w->stats.sigma_rob = 0.0;
  w->stats.sigma = 0.0;
  w->stats.Rsq = 0.0;
  w->stats.adj_Rsq = 0.0;
  w->stats.rmse = 0.0;
  w->stats.sse = 0.0;
  w->stats.dof = n - p;
  w->stats.weights = w->weights;
  w->stats.r = w->r;

  return w;
} /* gsl_multifit_robust_alloc() */

/*
gsl_multifit_robust_free()
  Free memory associated with robust workspace
*/

void
gsl_multifit_robust_free(gsl_multifit_robust_workspace *w)
{
  if (w->multifit_p)
    gsl_multifit_linear_free(w->multifit_p);

  if (w->r)
    gsl_vector_free(w->r);

  if (w->weights)
    gsl_vector_free(w->weights);

  if (w->c_prev)
    gsl_vector_free(w->c_prev);

  if (w->resfac)
    gsl_vector_free(w->resfac);

  if (w->psi)
    gsl_vector_free(w->psi);

  if (w->dpsi)
    gsl_vector_free(w->dpsi);

  if (w->QSI)
    gsl_matrix_free(w->QSI);

  if (w->D)
    gsl_vector_free(w->D);

  if (w->workn)
    gsl_vector_free(w->workn);

  free(w);
} /* gsl_multifit_robust_free() */

int
gsl_multifit_robust_tune(const double tune, gsl_multifit_robust_workspace *w)
{
  w->tune = tune;
  return GSL_SUCCESS;
}

int
gsl_multifit_robust_maxiter(const size_t maxiter,
                            gsl_multifit_robust_workspace *w)
{
  if (w->maxiter == 0)
    {
      GSL_ERROR("maxiter must be greater than 0", GSL_EINVAL);
    }
  else
    {
      w->maxiter = maxiter;
      return GSL_SUCCESS;
    }
}

const char *
gsl_multifit_robust_name(const gsl_multifit_robust_workspace *w)
{
  return w->type->name;
}

gsl_multifit_robust_stats
gsl_multifit_robust_statistics(const gsl_multifit_robust_workspace *w)
{
  return w->stats;
}

/*
gsl_multifit_robust_weights()
  Compute iterative weights for given residuals

Inputs: r   - residuals
        wts - (output) where to store weights
              w_i = r_i / (sigma_mad * tune)
        w   - workspace

Return: success/error

Notes:
1) Sizes of r and wts must be equal
2) Size of r/wts may be less than or equal to w->n, to allow
for computing weights of a subset of data
*/

int
gsl_multifit_robust_weights(const gsl_vector *r, gsl_vector *wts,
                            gsl_multifit_robust_workspace *w)
{
  if (r->size != wts->size)
    {
      GSL_ERROR("residual vector does not match weight vector size", GSL_EBADLEN);
    }
  else
    {
      int s;
      double sigma;

      sigma = robust_madsigma(r, w->p, wts);

      /* scale residuals by sigma and tuning factor */
      gsl_vector_memcpy(wts, r);
      gsl_vector_scale(wts, 1.0 / (sigma * w->tune));

      /* compute weights in-place */
      s = w->type->wfun(wts, wts);

      return s;
    }
} /* gsl_multifit_robust_weights() */

/*
gsl_multifit_robust()
  Perform robust iteratively reweighted linear least squares
fit

Inputs: X     - design matrix of basis functions
        y     - right hand side vector
        c     - (output) model coefficients
        cov   - (output) covariance matrix
        w     - workspace
*/

int
gsl_multifit_robust(const gsl_matrix * X,
                    const gsl_vector * y,
                    gsl_vector * c,
                    gsl_matrix * cov,
                    gsl_multifit_robust_workspace *w)
{
  /* check matrix and vector sizes */
  if (X->size1 != y->size)
    {
      GSL_ERROR
        ("number of observations in y does not match rows of matrix X",
         GSL_EBADLEN);
    }
  else if (X->size2 != c->size)
    {
      GSL_ERROR ("number of parameters c does not match columns of matrix X",
                 GSL_EBADLEN);
    }
  else if (cov->size1 != cov->size2)
    {   
      GSL_ERROR ("covariance matrix is not square", GSL_ENOTSQR);
    }   
  else if (c->size != cov->size1)
    {   
      GSL_ERROR
        ("number of parameters does not match size of covariance matrix",
         GSL_EBADLEN);
    }
  else if (X->size1 != w->n || X->size2 != w->p)
    {
      GSL_ERROR
        ("size of workspace does not match size of observation matrix",
         GSL_EBADLEN);
    }
  else
    {
      int s;
      double chisq;
      const double tol = GSL_SQRT_DBL_EPSILON;
      int converged = 0;
      size_t numit = 0;
      const size_t n = y->size;
      double sigy = gsl_stats_sd(y->data, y->stride, n);
      double sig_lower;
      size_t i;

      /*
       * if the initial fit is very good, then finding outliers by comparing
       * them to the residual standard deviation is difficult. Therefore we
       * set a lower bound on the standard deviation estimate that is a small
       * fraction of the standard deviation of the data values
       */
      sig_lower = 1.0e-6 * sigy;
      if (sig_lower == 0.0)
        sig_lower = 1.0;

      /* compute initial estimates using ordinary least squares */
      s = gsl_multifit_linear(X, y, c, cov, &chisq, w->multifit_p);
      if (s)
        return s;

      /* save Q S^{-1} of original matrix */
      gsl_matrix_memcpy(w->QSI, w->multifit_p->QSI);
      gsl_vector_memcpy(w->D, w->multifit_p->D);

      /* compute statistical leverage of each data point */
      s = gsl_linalg_SV_leverage(w->multifit_p->A, w->resfac);
      if (s)
        return s;

      /* correct residuals with factor 1 / sqrt(1 - h) */
      for (i = 0; i < n; ++i)
        {
          double h = gsl_vector_get(w->resfac, i);

          if (h > 0.9999)
            h = 0.9999;

          gsl_vector_set(w->resfac, i, 1.0 / sqrt(1.0 - h));
        }

      /* compute residuals from OLS fit r = y - X c */
      s = gsl_multifit_linear_residuals(X, y, c, w->r);
      if (s)
        return s;

      /* compute estimate of sigma from ordinary least squares */
      w->stats.sigma_ols = gsl_blas_dnrm2(w->r) / sqrt((double) w->stats.dof);

      while (!converged && ++numit <= w->maxiter)
        {
          double sig;

          /* adjust residuals by statistical leverage (see DuMouchel and O'Brien) */
          s = gsl_vector_mul(w->r, w->resfac);
          if (s)
            return s;

          /* compute estimate of standard deviation using MAD */
          sig = robust_madsigma(w->r, w->p, w->workn);

          /* scale residuals by standard deviation and tuning parameter */
          gsl_vector_scale(w->r, 1.0 / (GSL_MAX(sig, sig_lower) * w->tune));

          /* compute weights using these residuals */
          s = w->type->wfun(w->r, w->weights);
          if (s)
            return s;

          gsl_vector_memcpy(w->c_prev, c);

          /* solve weighted least squares with new weights */
          s = gsl_multifit_wlinear(X, w->weights, y, c, cov, &chisq, w->multifit_p);
          if (s)
            return s;

          /* compute new residuals r = y - X c */
          s = gsl_multifit_linear_residuals(X, y, c, w->r);
          if (s)
            return s;

          converged = robust_test_convergence(w->c_prev, c, tol);
        }

      /* compute final MAD sigma */
      w->stats.sigma_mad = robust_madsigma(w->r, w->p, w->workn);

      /* compute robust estimate of sigma */
      w->stats.sigma_rob = robust_robsigma(w->r, w->stats.sigma_mad, w->tune, w);

      /* compute final estimate of sigma */
      w->stats.sigma = robust_sigma(w->stats.sigma_ols, w->stats.sigma_rob, w);

      /* store number of iterations */
      w->stats.numit = numit;

      {
        double dof = (double) w->stats.dof;
        double rnorm = w->stats.sigma * sqrt(dof); /* see DuMouchel, sec 4.2 */
        double ss_err = rnorm * rnorm;
        double ss_tot = gsl_stats_tss(y->data, y->stride, n);

        /* compute R^2 */
        w->stats.Rsq = 1.0 - ss_err / ss_tot;

        /* compute adjusted R^2 */
        w->stats.adj_Rsq = 1.0 - (1.0 - w->stats.Rsq) * ((double)n - 1.0) / dof;

        /* compute rmse */
        w->stats.rmse = sqrt(ss_err / dof);

        /* store SSE */
        w->stats.sse = ss_err;
      }

      /* calculate covariance matrix = sigma^2 (X^T X)^{-1} */
      s = robust_covariance(w->stats.sigma, cov, w);
      if (s)
        return s;

      /* raise an error if not converged */
      if (numit > w->maxiter)
        {
          GSL_ERROR("maximum iterations exceeded", GSL_EMAXITER);
        }

      return s;
    }
} /* gsl_multifit_robust() */

/* Estimation of values for given x */
int
gsl_multifit_robust_est(const gsl_vector * x, const gsl_vector * c,
                        const gsl_matrix * cov, double *y, double *y_err)
{
  int s = gsl_multifit_linear_est(x, c, cov, y, y_err);

  return s;
}

/*
gsl_multifit_robust_residuals()
  Compute robust / studentized residuals from fit

r_i = (y_i - Y_i) / (sigma * sqrt(1 - h_i))

Inputs: X - design matrix
        y - rhs vector
        c - fit coefficients
        r - (output) studentized residuals
        w - workspace

Notes:
1) gsl_multifit_robust() must first be called to compute the coefficients
c, the leverage factors in w->resfac, and sigma in w->stats.sigma
*/

int
gsl_multifit_robust_residuals(const gsl_matrix * X, const gsl_vector * y,
                              const gsl_vector * c, gsl_vector * r,
                              gsl_multifit_robust_workspace * w)
{
  if (X->size1 != y->size)
    {
      GSL_ERROR
        ("number of observations in y does not match rows of matrix X",
         GSL_EBADLEN);
    }
  else if (X->size2 != c->size)
    {
      GSL_ERROR ("number of parameters c does not match columns of matrix X",
                 GSL_EBADLEN);
    }
  else if (y->size != r->size)
    {
      GSL_ERROR ("number of observations in y does not match number of residuals",
                 GSL_EBADLEN);
    }
  else
    {
      const double sigma = w->stats.sigma; /* previously calculated sigma */
      int s;
      size_t i;

      /* compute r = y - X c */
      s = gsl_multifit_linear_residuals(X, y, c, r);
      if (s)
        return s;

      for (i = 0; i < r->size; ++i)
        {
          double hfac = gsl_vector_get(w->resfac, i); /* 1/sqrt(1 - h_i) */
          double *ri = gsl_vector_ptr(r, i);

          /* multiply residual by 1 / (sigma * sqrt(1 - h_i)) */
          *ri *= hfac / sigma;
        }

      return s;
    }
} /* gsl_multifit_robust_residuals() */

/***********************************
 * INTERNAL ROUTINES               *
 ***********************************/

/*
robust_test_convergence()
  Test for convergence in robust least squares

Convergence criteria:

|c_i^(k) - c_i^(k-1)| <= tol * max(|c_i^(k)|, |c_i^(k-1)|)

for all i. k refers to iteration number.

Inputs: c_prev - coefficients from previous iteration
        c      - coefficients from current iteration
        tol    - tolerance

Return: 1 if converged, 0 if not
*/

static int
robust_test_convergence(const gsl_vector *c_prev, const gsl_vector *c,
                        const double tol)
{
  size_t p = c->size;
  size_t i;

  for (i = 0; i < p; ++i)
    {
      double ai = gsl_vector_get(c_prev, i);
      double bi = gsl_vector_get(c, i);

      if (fabs(bi - ai) > tol * GSL_MAX(fabs(ai), fabs(bi)))
        return 0; /* not yet converged */
    }

  /* converged */
  return 1;
} /* robust_test_convergence() */

/*
robust_madsigma()
  Estimate the standard deviation of the residuals using
the Median-Absolute-Deviation (MAD) of the residuals,
throwing away the smallest p residuals.

See: Street et al, 1988

Inputs: r     - vector of residuals
        p     - number of model coefficients (smallest p residuals are
                ignored)
        workn - workspace of size n = length(r)
*/

static double
robust_madsigma(const gsl_vector *r, const size_t p, gsl_vector *workn)
{
  size_t n = r->size;
  double sigma;
  size_t i;

  /* allow for the possibility that r->size < w->n */
  gsl_vector_view v1 = gsl_vector_subvector(workn, 0, n);
  gsl_vector_view v2;

  /* copy |r| into workn */
  for (i = 0; i < n; ++i)
    {
      gsl_vector_set(&v1.vector, i, fabs(gsl_vector_get(r, i)));
    }

  gsl_sort_vector(&v1.vector);

  /*
   * ignore the smallest p residuals when computing the median
   * (see Street et al 1988)
   */
  v2 = gsl_vector_subvector(&v1.vector, p - 1, n - p + 1);
  sigma = gsl_stats_median_from_sorted_data(v2.vector.data, v2.vector.stride, v2.vector.size) / 0.6745;

  return sigma;
} /* robust_madsigma() */

/*
robust_robsigma()
  Compute robust estimate of sigma so that
sigma^2 * inv(X' * X) is a reasonable estimate of
the covariance for robust regression. Based heavily
on the equations of Street et al, 1988.

Inputs: r    - vector of residuals y - X c
        s    - sigma estimate using MAD
        tune - tuning constant
        w    - workspace
*/

static double
robust_robsigma(const gsl_vector *r, const double s,
                const double tune, gsl_multifit_robust_workspace *w)
{
  double sigma;
  size_t i;
  const size_t n = w->n;
  const size_t p = w->p;
  const double st = s * tune;
  double a, b, lambda;

  /* compute u = r / sqrt(1 - h) / st */
  gsl_vector_memcpy(w->workn, r);
  gsl_vector_mul(w->workn, w->resfac);
  gsl_vector_scale(w->workn, 1.0 / st);

  /* compute w(u) and psi'(u) */
  w->type->wfun(w->workn, w->psi);
  w->type->psi_deriv(w->workn, w->dpsi);

  /* compute psi(u) = u*w(u) */
  gsl_vector_mul(w->psi, w->workn);

  /* Street et al, Eq (3) */
  a = gsl_stats_mean(w->dpsi->data, w->dpsi->stride, n);

  /* Street et al, Eq (5) */
  b = 0.0;
  for (i = 0; i < n; ++i)
    {
      double psi_i = gsl_vector_get(w->psi, i);
      double resfac = gsl_vector_get(w->resfac, i);
      double fac = 1.0 / (resfac*resfac); /* 1 - h */

      b += fac * psi_i * psi_i;
    }
  b /= (double) (n - p);

  /* Street et al, Eq (5) */
  lambda = 1.0 + ((double)p)/((double)n) * (1.0 - a) / a;

  sigma = lambda * sqrt(b) * st / a;

  return sigma;
} /* robust_robsigma() */

/*
robust_sigma()
  Compute final estimate of residual standard deviation, using
the OLS and robust sigma estimates.

This equation is taken from DuMouchel and O'Brien, sec 4.1:
\hat{\sigma_R}

Inputs: s_ols - OLS sigma
        s_rob - robust sigma
        w     - workspace

Return: final estimate of sigma
*/

static double
robust_sigma(const double s_ols, const double s_rob,
             gsl_multifit_robust_workspace *w)
{
  double sigma;
  const double p = (double) w->p;
  const double n = (double) w->n;

  /* see DuMouchel and O'Brien, sec 4.1 */
  sigma = GSL_MAX(s_rob,
                  sqrt((s_ols*s_ols*p*p + s_rob*s_rob*n) /
                       (p*p + n)));

  return sigma;
} /* robust_sigma() */

/*
robust_covariance()
  Calculate final covariance matrix, defined as:

  sigma * (X^T X)^{-1}

Inputs: sigma - residual standard deviation
        cov   - (output) covariance matrix
        w     - workspace
*/

static int
robust_covariance(const double sigma, gsl_matrix *cov,
                  gsl_multifit_robust_workspace *w)
{
  int status = 0;
  const size_t p = w->p;
  const double s2 = sigma * sigma;
  size_t i, j;
  gsl_matrix *QSI = w->QSI;
  gsl_vector *D = w->D;

  /* Form variance-covariance matrix cov = s2 * (Q S^-1) (Q S^-1)^T */

  for (i = 0; i < p; i++)
    {
      gsl_vector_view row_i = gsl_matrix_row (QSI, i);
      double d_i = gsl_vector_get (D, i);

      for (j = i; j < p; j++)
        {
          gsl_vector_view row_j = gsl_matrix_row (QSI, j);
          double d_j = gsl_vector_get (D, j);
          double s;

          gsl_blas_ddot (&row_i.vector, &row_j.vector, &s);

          gsl_matrix_set (cov, i, j, s * s2 / (d_i * d_j));
          gsl_matrix_set (cov, j, i, s * s2 / (d_i * d_j));
        }
    }

  return status;
} /* robust_covariance() */