/* multirobust.c * * Copyright (C) 2013 Patrick Alken * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 3 of the License, or (at * your option) any later version. * * This program is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. * * This module contains routines related to robust linear least squares. The * algorithm used closely follows the publications: * * [1] DuMouchel, W. and F. O'Brien (1989), "Integrating a robust * option into a multiple regression computing environment," * Computer Science and Statistics: Proceedings of the 21st * Symposium on the Interface, American Statistical Association * * [2] Street, J.O., R.J. Carroll, and D. Ruppert (1988), "A note on * computing robust regression estimates via iteratively * reweighted least squares," The American Statistician, v. 42, * pp. 152-154. */ #include #include #include #include #include #include #include #include #include #include static int robust_test_convergence(const gsl_vector *c_prev, const gsl_vector *c, const double tol); static double robust_madsigma(const gsl_vector *r, const size_t p, gsl_vector *workn); static double robust_robsigma(const gsl_vector *r, const double s, const double tune, gsl_multifit_robust_workspace *w); static double robust_sigma(const double s_ols, const double s_rob, gsl_multifit_robust_workspace *w); static int robust_covariance(const double sigma, gsl_matrix *cov, gsl_multifit_robust_workspace *w); /* gsl_multifit_robust_alloc Allocate a robust workspace Inputs: T - robust weighting algorithm n - number of observations p - number of model parameters Return: pointer to workspace */ gsl_multifit_robust_workspace * gsl_multifit_robust_alloc(const gsl_multifit_robust_type *T, const size_t n, const size_t p) { gsl_multifit_robust_workspace *w; if (n < p) { GSL_ERROR_VAL("observations n must be >= p", GSL_EINVAL, 0); } w = calloc(1, sizeof(gsl_multifit_robust_workspace)); if (w == 0) { GSL_ERROR_VAL("failed to allocate space for multifit_robust struct", GSL_ENOMEM, 0); } w->n = n; w->p = p; w->type = T; w->maxiter = 100; /* maximum iterations */ w->tune = w->type->tuning_default; w->multifit_p = gsl_multifit_linear_alloc(n, p); if (w->multifit_p == 0) { gsl_multifit_robust_free(w); GSL_ERROR_VAL("failed to allocate space for multifit_linear struct", GSL_ENOMEM, 0); } w->r = gsl_vector_alloc(n); if (w->r == 0) { gsl_multifit_robust_free(w); GSL_ERROR_VAL("failed to allocate space for residuals", GSL_ENOMEM, 0); } w->weights = gsl_vector_alloc(n); if (w->weights == 0) { gsl_multifit_robust_free(w); GSL_ERROR_VAL("failed to allocate space for weights", GSL_ENOMEM, 0); } w->c_prev = gsl_vector_alloc(p); if (w->c_prev == 0) { gsl_multifit_robust_free(w); GSL_ERROR_VAL("failed to allocate space for c_prev", GSL_ENOMEM, 0); } w->resfac = gsl_vector_alloc(n); if (w->resfac == 0) { gsl_multifit_robust_free(w); GSL_ERROR_VAL("failed to allocate space for residual factors", GSL_ENOMEM, 0); } w->psi = gsl_vector_alloc(n); if (w->psi == 0) { gsl_multifit_robust_free(w); GSL_ERROR_VAL("failed to allocate space for psi", GSL_ENOMEM, 0); } w->dpsi = gsl_vector_alloc(n); if (w->dpsi == 0) { gsl_multifit_robust_free(w); GSL_ERROR_VAL("failed to allocate space for dpsi", GSL_ENOMEM, 0); } w->QSI = gsl_matrix_alloc(p, p); if (w->QSI == 0) { gsl_multifit_robust_free(w); GSL_ERROR_VAL("failed to allocate space for QSI", GSL_ENOMEM, 0); } w->D = gsl_vector_alloc(p); if (w->D == 0) { gsl_multifit_robust_free(w); GSL_ERROR_VAL("failed to allocate space for D", GSL_ENOMEM, 0); } w->workn = gsl_vector_alloc(n); if (w->workn == 0) { gsl_multifit_robust_free(w); GSL_ERROR_VAL("failed to allocate space for workn", GSL_ENOMEM, 0); } w->stats.sigma_ols = 0.0; w->stats.sigma_mad = 0.0; w->stats.sigma_rob = 0.0; w->stats.sigma = 0.0; w->stats.Rsq = 0.0; w->stats.adj_Rsq = 0.0; w->stats.rmse = 0.0; w->stats.sse = 0.0; w->stats.dof = n - p; w->stats.weights = w->weights; w->stats.r = w->r; return w; } /* gsl_multifit_robust_alloc() */ /* gsl_multifit_robust_free() Free memory associated with robust workspace */ void gsl_multifit_robust_free(gsl_multifit_robust_workspace *w) { if (w->multifit_p) gsl_multifit_linear_free(w->multifit_p); if (w->r) gsl_vector_free(w->r); if (w->weights) gsl_vector_free(w->weights); if (w->c_prev) gsl_vector_free(w->c_prev); if (w->resfac) gsl_vector_free(w->resfac); if (w->psi) gsl_vector_free(w->psi); if (w->dpsi) gsl_vector_free(w->dpsi); if (w->QSI) gsl_matrix_free(w->QSI); if (w->D) gsl_vector_free(w->D); if (w->workn) gsl_vector_free(w->workn); free(w); } /* gsl_multifit_robust_free() */ int gsl_multifit_robust_tune(const double tune, gsl_multifit_robust_workspace *w) { w->tune = tune; return GSL_SUCCESS; } int gsl_multifit_robust_maxiter(const size_t maxiter, gsl_multifit_robust_workspace *w) { if (w->maxiter == 0) { GSL_ERROR("maxiter must be greater than 0", GSL_EINVAL); } else { w->maxiter = maxiter; return GSL_SUCCESS; } } const char * gsl_multifit_robust_name(const gsl_multifit_robust_workspace *w) { return w->type->name; } gsl_multifit_robust_stats gsl_multifit_robust_statistics(const gsl_multifit_robust_workspace *w) { return w->stats; } /* gsl_multifit_robust_weights() Compute iterative weights for given residuals Inputs: r - residuals wts - (output) where to store weights w_i = r_i / (sigma_mad * tune) w - workspace Return: success/error Notes: 1) Sizes of r and wts must be equal 2) Size of r/wts may be less than or equal to w->n, to allow for computing weights of a subset of data */ int gsl_multifit_robust_weights(const gsl_vector *r, gsl_vector *wts, gsl_multifit_robust_workspace *w) { if (r->size != wts->size) { GSL_ERROR("residual vector does not match weight vector size", GSL_EBADLEN); } else { int s; double sigma; sigma = robust_madsigma(r, w->p, wts); /* scale residuals by sigma and tuning factor */ gsl_vector_memcpy(wts, r); gsl_vector_scale(wts, 1.0 / (sigma * w->tune)); /* compute weights in-place */ s = w->type->wfun(wts, wts); return s; } } /* gsl_multifit_robust_weights() */ /* gsl_multifit_robust() Perform robust iteratively reweighted linear least squares fit Inputs: X - design matrix of basis functions y - right hand side vector c - (output) model coefficients cov - (output) covariance matrix w - workspace */ int gsl_multifit_robust(const gsl_matrix * X, const gsl_vector * y, gsl_vector * c, gsl_matrix * cov, gsl_multifit_robust_workspace *w) { /* check matrix and vector sizes */ if (X->size1 != y->size) { GSL_ERROR ("number of observations in y does not match rows of matrix X", GSL_EBADLEN); } else if (X->size2 != c->size) { GSL_ERROR ("number of parameters c does not match columns of matrix X", GSL_EBADLEN); } else if (cov->size1 != cov->size2) { GSL_ERROR ("covariance matrix is not square", GSL_ENOTSQR); } else if (c->size != cov->size1) { GSL_ERROR ("number of parameters does not match size of covariance matrix", GSL_EBADLEN); } else if (X->size1 != w->n || X->size2 != w->p) { GSL_ERROR ("size of workspace does not match size of observation matrix", GSL_EBADLEN); } else { int s; double chisq; const double tol = GSL_SQRT_DBL_EPSILON; int converged = 0; size_t numit = 0; const size_t n = y->size; double sigy = gsl_stats_sd(y->data, y->stride, n); double sig_lower; size_t i; /* * if the initial fit is very good, then finding outliers by comparing * them to the residual standard deviation is difficult. Therefore we * set a lower bound on the standard deviation estimate that is a small * fraction of the standard deviation of the data values */ sig_lower = 1.0e-6 * sigy; if (sig_lower == 0.0) sig_lower = 1.0; /* compute initial estimates using ordinary least squares */ s = gsl_multifit_linear(X, y, c, cov, &chisq, w->multifit_p); if (s) return s; /* save Q S^{-1} of original matrix */ gsl_matrix_memcpy(w->QSI, w->multifit_p->QSI); gsl_vector_memcpy(w->D, w->multifit_p->D); /* compute statistical leverage of each data point */ s = gsl_linalg_SV_leverage(w->multifit_p->A, w->resfac); if (s) return s; /* correct residuals with factor 1 / sqrt(1 - h) */ for (i = 0; i < n; ++i) { double h = gsl_vector_get(w->resfac, i); if (h > 0.9999) h = 0.9999; gsl_vector_set(w->resfac, i, 1.0 / sqrt(1.0 - h)); } /* compute residuals from OLS fit r = y - X c */ s = gsl_multifit_linear_residuals(X, y, c, w->r); if (s) return s; /* compute estimate of sigma from ordinary least squares */ w->stats.sigma_ols = gsl_blas_dnrm2(w->r) / sqrt((double) w->stats.dof); while (!converged && ++numit <= w->maxiter) { double sig; /* adjust residuals by statistical leverage (see DuMouchel and O'Brien) */ s = gsl_vector_mul(w->r, w->resfac); if (s) return s; /* compute estimate of standard deviation using MAD */ sig = robust_madsigma(w->r, w->p, w->workn); /* scale residuals by standard deviation and tuning parameter */ gsl_vector_scale(w->r, 1.0 / (GSL_MAX(sig, sig_lower) * w->tune)); /* compute weights using these residuals */ s = w->type->wfun(w->r, w->weights); if (s) return s; gsl_vector_memcpy(w->c_prev, c); /* solve weighted least squares with new weights */ s = gsl_multifit_wlinear(X, w->weights, y, c, cov, &chisq, w->multifit_p); if (s) return s; /* compute new residuals r = y - X c */ s = gsl_multifit_linear_residuals(X, y, c, w->r); if (s) return s; converged = robust_test_convergence(w->c_prev, c, tol); } /* compute final MAD sigma */ w->stats.sigma_mad = robust_madsigma(w->r, w->p, w->workn); /* compute robust estimate of sigma */ w->stats.sigma_rob = robust_robsigma(w->r, w->stats.sigma_mad, w->tune, w); /* compute final estimate of sigma */ w->stats.sigma = robust_sigma(w->stats.sigma_ols, w->stats.sigma_rob, w); /* store number of iterations */ w->stats.numit = numit; { double dof = (double) w->stats.dof; double rnorm = w->stats.sigma * sqrt(dof); /* see DuMouchel, sec 4.2 */ double ss_err = rnorm * rnorm; double ss_tot = gsl_stats_tss(y->data, y->stride, n); /* compute R^2 */ w->stats.Rsq = 1.0 - ss_err / ss_tot; /* compute adjusted R^2 */ w->stats.adj_Rsq = 1.0 - (1.0 - w->stats.Rsq) * ((double)n - 1.0) / dof; /* compute rmse */ w->stats.rmse = sqrt(ss_err / dof); /* store SSE */ w->stats.sse = ss_err; } /* calculate covariance matrix = sigma^2 (X^T X)^{-1} */ s = robust_covariance(w->stats.sigma, cov, w); if (s) return s; /* raise an error if not converged */ if (numit > w->maxiter) { GSL_ERROR("maximum iterations exceeded", GSL_EMAXITER); } return s; } } /* gsl_multifit_robust() */ /* Estimation of values for given x */ int gsl_multifit_robust_est(const gsl_vector * x, const gsl_vector * c, const gsl_matrix * cov, double *y, double *y_err) { int s = gsl_multifit_linear_est(x, c, cov, y, y_err); return s; } /* gsl_multifit_robust_residuals() Compute robust / studentized residuals from fit r_i = (y_i - Y_i) / (sigma * sqrt(1 - h_i)) Inputs: X - design matrix y - rhs vector c - fit coefficients r - (output) studentized residuals w - workspace Notes: 1) gsl_multifit_robust() must first be called to compute the coefficients c, the leverage factors in w->resfac, and sigma in w->stats.sigma */ int gsl_multifit_robust_residuals(const gsl_matrix * X, const gsl_vector * y, const gsl_vector * c, gsl_vector * r, gsl_multifit_robust_workspace * w) { if (X->size1 != y->size) { GSL_ERROR ("number of observations in y does not match rows of matrix X", GSL_EBADLEN); } else if (X->size2 != c->size) { GSL_ERROR ("number of parameters c does not match columns of matrix X", GSL_EBADLEN); } else if (y->size != r->size) { GSL_ERROR ("number of observations in y does not match number of residuals", GSL_EBADLEN); } else { const double sigma = w->stats.sigma; /* previously calculated sigma */ int s; size_t i; /* compute r = y - X c */ s = gsl_multifit_linear_residuals(X, y, c, r); if (s) return s; for (i = 0; i < r->size; ++i) { double hfac = gsl_vector_get(w->resfac, i); /* 1/sqrt(1 - h_i) */ double *ri = gsl_vector_ptr(r, i); /* multiply residual by 1 / (sigma * sqrt(1 - h_i)) */ *ri *= hfac / sigma; } return s; } } /* gsl_multifit_robust_residuals() */ /*********************************** * INTERNAL ROUTINES * ***********************************/ /* robust_test_convergence() Test for convergence in robust least squares Convergence criteria: |c_i^(k) - c_i^(k-1)| <= tol * max(|c_i^(k)|, |c_i^(k-1)|) for all i. k refers to iteration number. Inputs: c_prev - coefficients from previous iteration c - coefficients from current iteration tol - tolerance Return: 1 if converged, 0 if not */ static int robust_test_convergence(const gsl_vector *c_prev, const gsl_vector *c, const double tol) { size_t p = c->size; size_t i; for (i = 0; i < p; ++i) { double ai = gsl_vector_get(c_prev, i); double bi = gsl_vector_get(c, i); if (fabs(bi - ai) > tol * GSL_MAX(fabs(ai), fabs(bi))) return 0; /* not yet converged */ } /* converged */ return 1; } /* robust_test_convergence() */ /* robust_madsigma() Estimate the standard deviation of the residuals using the Median-Absolute-Deviation (MAD) of the residuals, throwing away the smallest p residuals. See: Street et al, 1988 Inputs: r - vector of residuals p - number of model coefficients (smallest p residuals are ignored) workn - workspace of size n = length(r) */ static double robust_madsigma(const gsl_vector *r, const size_t p, gsl_vector *workn) { size_t n = r->size; double sigma; size_t i; /* allow for the possibility that r->size < w->n */ gsl_vector_view v1 = gsl_vector_subvector(workn, 0, n); gsl_vector_view v2; /* copy |r| into workn */ for (i = 0; i < n; ++i) { gsl_vector_set(&v1.vector, i, fabs(gsl_vector_get(r, i))); } gsl_sort_vector(&v1.vector); /* * ignore the smallest p residuals when computing the median * (see Street et al 1988) */ v2 = gsl_vector_subvector(&v1.vector, p - 1, n - p + 1); sigma = gsl_stats_median_from_sorted_data(v2.vector.data, v2.vector.stride, v2.vector.size) / 0.6745; return sigma; } /* robust_madsigma() */ /* robust_robsigma() Compute robust estimate of sigma so that sigma^2 * inv(X' * X) is a reasonable estimate of the covariance for robust regression. Based heavily on the equations of Street et al, 1988. Inputs: r - vector of residuals y - X c s - sigma estimate using MAD tune - tuning constant w - workspace */ static double robust_robsigma(const gsl_vector *r, const double s, const double tune, gsl_multifit_robust_workspace *w) { double sigma; size_t i; const size_t n = w->n; const size_t p = w->p; const double st = s * tune; double a, b, lambda; /* compute u = r / sqrt(1 - h) / st */ gsl_vector_memcpy(w->workn, r); gsl_vector_mul(w->workn, w->resfac); gsl_vector_scale(w->workn, 1.0 / st); /* compute w(u) and psi'(u) */ w->type->wfun(w->workn, w->psi); w->type->psi_deriv(w->workn, w->dpsi); /* compute psi(u) = u*w(u) */ gsl_vector_mul(w->psi, w->workn); /* Street et al, Eq (3) */ a = gsl_stats_mean(w->dpsi->data, w->dpsi->stride, n); /* Street et al, Eq (5) */ b = 0.0; for (i = 0; i < n; ++i) { double psi_i = gsl_vector_get(w->psi, i); double resfac = gsl_vector_get(w->resfac, i); double fac = 1.0 / (resfac*resfac); /* 1 - h */ b += fac * psi_i * psi_i; } b /= (double) (n - p); /* Street et al, Eq (5) */ lambda = 1.0 + ((double)p)/((double)n) * (1.0 - a) / a; sigma = lambda * sqrt(b) * st / a; return sigma; } /* robust_robsigma() */ /* robust_sigma() Compute final estimate of residual standard deviation, using the OLS and robust sigma estimates. This equation is taken from DuMouchel and O'Brien, sec 4.1: \hat{\sigma_R} Inputs: s_ols - OLS sigma s_rob - robust sigma w - workspace Return: final estimate of sigma */ static double robust_sigma(const double s_ols, const double s_rob, gsl_multifit_robust_workspace *w) { double sigma; const double p = (double) w->p; const double n = (double) w->n; /* see DuMouchel and O'Brien, sec 4.1 */ sigma = GSL_MAX(s_rob, sqrt((s_ols*s_ols*p*p + s_rob*s_rob*n) / (p*p + n))); return sigma; } /* robust_sigma() */ /* robust_covariance() Calculate final covariance matrix, defined as: sigma * (X^T X)^{-1} Inputs: sigma - residual standard deviation cov - (output) covariance matrix w - workspace */ static int robust_covariance(const double sigma, gsl_matrix *cov, gsl_multifit_robust_workspace *w) { int status = 0; const size_t p = w->p; const double s2 = sigma * sigma; size_t i, j; gsl_matrix *QSI = w->QSI; gsl_vector *D = w->D; /* Form variance-covariance matrix cov = s2 * (Q S^-1) (Q S^-1)^T */ for (i = 0; i < p; i++) { gsl_vector_view row_i = gsl_matrix_row (QSI, i); double d_i = gsl_vector_get (D, i); for (j = i; j < p; j++) { gsl_vector_view row_j = gsl_matrix_row (QSI, j); double d_j = gsl_vector_get (D, j); double s; gsl_blas_ddot (&row_i.vector, &row_j.vector, &s); gsl_matrix_set (cov, i, j, s * s2 / (d_i * d_j)); gsl_matrix_set (cov, j, i, s * s2 / (d_i * d_j)); } } return status; } /* robust_covariance() */