/* multifit_nlinear/subspace2D.c
 * 
 * Copyright (C) 2016 Patrick Alken
 * 
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3 of the License, or (at
 * your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 */

#include <config.h>
#include <gsl/gsl_math.h>
#include <gsl/gsl_vector.h>
#include <gsl/gsl_matrix.h>
#include <gsl/gsl_linalg.h>
#include <gsl/gsl_multifit_nlinear.h>
#include <gsl/gsl_errno.h>
#include <gsl/gsl_blas.h>
#include <gsl/gsl_poly.h>

/*
 * This module implements a 2D subspace trust region subproblem method,
 * as outlined in
 *
 * [1] G. A. Shultz, R. B. Schnabel, and R. H. Byrd
 *     A Family of Trust-Region-Based Algorithms for Unconstrained
 *     Minimization with Strong Global Convergence Properties,
 *     SIAM Journal on Numerical Analysis 1985 22:1, 47-67 
 *
 * [2] R. H. Byrd, R. B. Schnabel, G. A. Shultz,
 *     Approximate solution of the trust region problem by
 *     minimization over two-dimensional subspaces,
 *     Mathematical Programming, January 1988, Volume 40,
 *     Issue 1, pp 247-263
 *
 * The idea is to solve:
 *
 * min_{dx} g^T dx + 1/2 dx^T B dx
 * 
 * with constraints:
 *
 * ||D dx|| <= delta
 * dx \in span{dx_sd, dx_gn}
 *
 * where B is the Hessian matrix, B = J^T J
 *
 * The steps are as follows:
 *
 * 1. preloop:
 *    a. Compute Gauss-Newton and steepest descent vectors,
 *       dx_gn, dx_sd
 *    b. Compute an orthonormal basis for span(D dx_sd, D dx_gn) by
 *       constructing W = [ D dx_sd, D dx_gn ] and performing a QR
 *       decomposition of W. The 2 columns of the Q matrix
 *       will then span the column space of W. W should have rank 2
 *       unless D*dx_sd and D*dx_gn are parallel, in which case it will
 *       have rank 1.
 *    c. Precompute various quantities needed for the step calculation
 *
 * 2. step:
 *    a. If the Gauss-Newton step is inside the trust region, use it
 *    b. if W has rank 1, we cannot form a 2D subspace, so in this case
 *       follow the steepest descent direction to the trust region boundary
 *       and use that as the step.
 *    c. In the full rank 2 case, if the GN point is outside the trust region,
 *       then the minimizer of the objective function lies on the trust
 *       region boundary. Therefore the minimization problem becomes:
 *
 *       min_{dx} g^T dx + 1/2 dx^T B dx, with ||dx|| = delta, dx = Q * x
 *
 *       where x is a 2-vector to be determined and the columns of Q are
 *       the orthonormal basis vectors of the subspace. Note the equality
 *       constraint now instead of <=. In terms of the new variable x,
 *       the minimization problem becomes:
 *
 *       min_x subg^T x + 1/2 x^T subB x, with ||Q*x|| = ||x|| = delta
 *
 *       where:
 *         subg = Q^T g   (2-by-1)
 *         subB = Q^T B Q (2-by-2)
 *
 *       This equality constrained 2D minimization problem can be solved
 *       with a Lagrangian multiplier, which results in a 4th degree polynomial
 *       equation to be solved. The equation is:
 *
 *         lambda^4  1
 *       + lambda^3  2 tr(B)
 *       + lambda^2  (tr(B)^2 + 2 det(B) - g^T g / delta^2)
 *       + lambda^1  (2 det(B) tr(B) - 2 g^T adj(B)^T g / delta^2)
 *       + lambda^0  (det(B)^2 - g^T adj(B)^T adj(B) g / delta^2)
 *
 *       where adj(B) is the adjugate matrix of B.
 *
 *       We then check each of the 4 solutions for lambda to determine which
 *       lambda results in the smallest objective function value. This x
 *       is then used to construct the final step: dx = Q*x
 */

typedef struct
{
  size_t n;                  /* number of observations */
  size_t p;                  /* number of parameters */
  gsl_vector *dx_gn;         /* Gauss-Newton step, size p */
  gsl_vector *dx_sd;         /* steepest descent step, size p */
  double norm_Dgn;           /* || D dx_gn || */
  double norm_Dsd;           /* || D dx_sd || */
  gsl_vector *workp;         /* workspace, length p */
  gsl_vector *workn;         /* workspace, length n */
  gsl_matrix *W;             /* orthonormal basis for 2D subspace, p-by-2 */
  gsl_matrix *JQ;            /* J * Q, n-by-p */
  gsl_vector *tau;           /* Householder scalars */
  gsl_vector *subg;          /* subspace gradient = W^T g, 2-by-1 */
  gsl_matrix *subB;          /* subspace Hessian = W^T B W, 2-by-2 */
  gsl_permutation *perm;     /* permutation matrix */

  double trB;                /* Tr(subB) */
  double detB;               /* det(subB) */
  double normg;              /* || subg || */
  double term0;              /* g^T adj(B)^T adj(B) g */
  double term1;              /* g^T adj(B)^T g */

  size_t rank;               /* rank of [ dx_sd, dx_gn ] matrix */

  gsl_poly_complex_workspace *poly_p;

  /* tunable parameters */
  gsl_multifit_nlinear_parameters params;
} subspace2D_state_t;

#include "common.c"

static void * subspace2D_alloc (const void * params, const size_t n, const size_t p);
static void subspace2D_free(void *vstate);
static int subspace2D_init(const void *vtrust_state, void *vstate);
static int subspace2D_preloop(const void * vtrust_state, void * vstate);
static int subspace2D_step(const void * vtrust_state, const double delta,
                           gsl_vector * dx, void * vstate);
static int subspace2D_preduction(const void * vtrust_state, const gsl_vector * dx,
                                 double * pred, void * vstate);
static int subspace2D_solution(const double lambda, gsl_vector * x,
                               subspace2D_state_t * state);
static double subspace2D_objective(const gsl_vector * x, subspace2D_state_t * state);
static int subspace2D_calc_gn(const gsl_multifit_nlinear_trust_state * trust_state, gsl_vector * dx);
static int subspace2D_calc_sd(const gsl_multifit_nlinear_trust_state * trust_state, gsl_vector * dx,
                              subspace2D_state_t * state);

static void *
subspace2D_alloc (const void * params, const size_t n, const size_t p)
{
  const gsl_multifit_nlinear_parameters *par = (const gsl_multifit_nlinear_parameters *) params;
  subspace2D_state_t *state;
  
  state = calloc(1, sizeof(subspace2D_state_t));
  if (state == NULL)
    {
      GSL_ERROR_NULL ("failed to allocate subspace2D state", GSL_ENOMEM);
    }

  state->dx_gn = gsl_vector_alloc(p);
  if (state->dx_gn == NULL)
    {
      GSL_ERROR_NULL ("failed to allocate space for dx_gn", GSL_ENOMEM);
    }

  state->dx_sd = gsl_vector_alloc(p);
  if (state->dx_sd == NULL)
    {
      GSL_ERROR_NULL ("failed to allocate space for dx_sd", GSL_ENOMEM);
    }

  state->workp = gsl_vector_alloc(p);
  if (state->workp == NULL)
    {
      GSL_ERROR_NULL ("failed to allocate space for workp", GSL_ENOMEM);
    }

  state->workn = gsl_vector_alloc(n);
  if (state->workn == NULL)
    {
      GSL_ERROR_NULL ("failed to allocate space for workn", GSL_ENOMEM);
    }

  state->W = gsl_matrix_alloc(p, 2);
  if (state->W == NULL)
    {
      GSL_ERROR_NULL ("failed to allocate space for W", GSL_ENOMEM);
    }

  state->JQ = gsl_matrix_alloc(n, p);
  if (state->JQ == NULL)
    {
      GSL_ERROR_NULL ("failed to allocate space for JQ", GSL_ENOMEM);
    }

  state->tau = gsl_vector_alloc(2);
  if (state->tau == NULL)
    {
      GSL_ERROR_NULL ("failed to allocate space for tau", GSL_ENOMEM);
    }

  state->subg = gsl_vector_alloc(2);
  if (state->subg == NULL)
    {
      GSL_ERROR_NULL ("failed to allocate space for subg", GSL_ENOMEM);
    }

  state->subB = gsl_matrix_alloc(2, 2);
  if (state->subB == NULL)
    {
      GSL_ERROR_NULL ("failed to allocate space for subB", GSL_ENOMEM);
    }

  state->perm = gsl_permutation_alloc(2);
  if (state->perm == NULL)
    {
      GSL_ERROR_NULL ("failed to allocate space for perm", GSL_ENOMEM);
    }

  state->poly_p = gsl_poly_complex_workspace_alloc(5);
  if (state->poly_p == NULL)
    {
      GSL_ERROR_NULL ("failed to allocate space for poly workspace", GSL_ENOMEM);
    }

  state->n = n;
  state->p = p;
  state->rank = 0;
  state->params = *par;

  return state;
}

static void
subspace2D_free(void *vstate)
{
  subspace2D_state_t *state = (subspace2D_state_t *) vstate;

  if (state->dx_gn)
    gsl_vector_free(state->dx_gn);

  if (state->dx_sd)
    gsl_vector_free(state->dx_sd);

  if (state->workp)
    gsl_vector_free(state->workp);

  if (state->workn)
    gsl_vector_free(state->workn);

  if (state->W)
    gsl_matrix_free(state->W);

  if (state->JQ)
    gsl_matrix_free(state->JQ);

  if (state->tau)
    gsl_vector_free(state->tau);

  if (state->subg)
    gsl_vector_free(state->subg);

  if (state->subB)
    gsl_matrix_free(state->subB);

  if (state->perm)
    gsl_permutation_free(state->perm);

  if (state->poly_p)
    gsl_poly_complex_workspace_free(state->poly_p);

  free(state);
}

/*
subspace2D_init()
  Initialize subspace2D solver

Inputs: vtrust_state - trust state
        vstate       - workspace

Return: success/error
*/

static int
subspace2D_init(const void *vtrust_state, void *vstate)
{
  (void)vtrust_state;
  (void)vstate;

  return GSL_SUCCESS;
}

/*
subspace2D_preloop()
  Initialize subspace2D method prior to iteration loop.
This involves computing the Gauss-Newton step and
steepest descent step

Notes: on output,
1) state->dx_gn contains Gauss-Newton step
2) state->dx_sd contains steepest descent step
3) state->rank contains the rank([dx_sd, dx_gn])
4) if full rank subspace (rank = 2), then:
   state->trB = Tr(subB)
   state->detB = det(subB)
   state->normg = || subg ||
*/

static int
subspace2D_preloop(const void * vtrust_state, void * vstate)
{
  int status;
  const gsl_multifit_nlinear_trust_state *trust_state =
    (const gsl_multifit_nlinear_trust_state *) vtrust_state;
  subspace2D_state_t *state = (subspace2D_state_t *) vstate;
  gsl_vector_view v;
  double work_data[2];
  gsl_vector_view work = gsl_vector_view_array(work_data, 2);
  int signum;

  /* calculate Gauss-Newton step */
  status = subspace2D_calc_gn(trust_state, state->dx_gn);
  if (status)
    return status;

  /* now calculate the steepest descent step */
  status = subspace2D_calc_sd(trust_state, state->dx_sd, state);
  if (status)
    return status;

  /* store norms */
  state->norm_Dgn = scaled_enorm(trust_state->diag, state->dx_gn);
  state->norm_Dsd = scaled_enorm(trust_state->diag, state->dx_sd);

  /*
   * now compute orthonormal basis for span(D dx_sd, D dx_gn) using
   * QR decomposition; set W = [ D dx_sd, D dx_gn ] and normalize each
   * column to unit magnitude. Then the Q matrix will form a basis for Col(W)
   */

  v = gsl_matrix_column(state->W, 0);
  gsl_vector_memcpy(&v.vector, state->dx_sd);
  gsl_vector_mul(&v.vector, trust_state->diag);
  if (state->norm_Dsd != 0)
    gsl_vector_scale(&v.vector, 1.0 / state->norm_Dsd);

  v = gsl_matrix_column(state->W, 1);
  gsl_vector_memcpy(&v.vector, state->dx_gn);
  gsl_vector_mul(&v.vector, trust_state->diag);
  if (state->norm_Dgn != 0)
    gsl_vector_scale(&v.vector, 1.0 / state->norm_Dgn);

  /* use a rank revealing QR decomposition in case dx_sd and dx_gn
   * are parallel */
  gsl_linalg_QRPT_decomp(state->W, state->tau, state->perm, &signum, &work.vector);

  /* check for parallel dx_sd, dx_gn, in which case rank will be 1 */
  state->rank = gsl_linalg_QRPT_rank(state->W, -1.0);

  if (state->rank == 2)
    {
      /*
       * full rank subspace, compute:
       * subg = Q^T D^{-1} g
       * subB = Q^T D^{-1} B D^{-1} Q where B = J^T J
       */
      const size_t p = state->p;
      size_t i;
      gsl_matrix_view JQ = gsl_matrix_submatrix(state->JQ, 0, 0, state->n, GSL_MIN(2, p));
      double B00, B10, B11, g0, g1;

      /* compute subg */
      gsl_vector_memcpy(state->workp, trust_state->g);
      gsl_vector_div(state->workp, trust_state->diag);
      gsl_linalg_QR_QTvec(state->W, state->tau, state->workp);

      g0 = gsl_vector_get(state->workp, 0);
      g1 = gsl_vector_get(state->workp, 1);

      gsl_vector_set(state->subg, 0, g0);
      gsl_vector_set(state->subg, 1, g1);

      /* compute subB */

      /* compute J D^{-1} */
      gsl_matrix_memcpy(state->JQ, trust_state->J);

      for (i = 0; i < p; ++i)
        {
          gsl_vector_view c = gsl_matrix_column(state->JQ, i);
          double di = gsl_vector_get(trust_state->diag, i);
          gsl_vector_scale(&c.vector, 1.0 / di);
        }

      /* compute J D^{-1} Q */
      gsl_linalg_QR_matQ(state->W, state->tau, state->JQ);

      /* compute subB = Q^T D^{-1} J^T J D^{-1} Q */
      gsl_blas_dsyrk(CblasLower, CblasTrans, 1.0, &JQ.matrix, 0.0, state->subB);

      B00 = gsl_matrix_get(state->subB, 0, 0);
      B10 = gsl_matrix_get(state->subB, 1, 0);
      B11 = gsl_matrix_get(state->subB, 1, 1);

      state->trB = B00 + B11;
      state->detB = B00*B11 - B10*B10;
      state->normg = gsl_blas_dnrm2(state->subg);

      /* g^T adj(B)^T adj(B) g */
      state->term0 = (B10*B10 + B11*B11)*g0*g0 -
                     2*B10*(B00 + B11)*g0*g1 +
                     (B00*B00 + B10*B10)*g1*g1;

      /* g^T adj(B)^T g */
      state->term1 = B11 * g0 * g0 + g1 * (B00*g1 - 2*B10*g0);

    }

  return GSL_SUCCESS;
}


/*
subspace2D_step()
  Calculate a new step with 2D subspace method. Based on [1]. We
seek a vector dx in span{dx_gn, dx_sd} which minimizes the model
function subject to ||dx|| <= delta
*/

static int
subspace2D_step(const void * vtrust_state, const double delta,
                gsl_vector * dx, void * vstate)
{
  const gsl_multifit_nlinear_trust_state *trust_state =
    (const gsl_multifit_nlinear_trust_state *) vtrust_state;
  subspace2D_state_t *state = (subspace2D_state_t *) vstate;

  if (state->norm_Dgn <= delta)
    {
      /* Gauss-Newton step is inside trust region, use it as final step
       * since it is the global minimizer of the quadratic model function */
      gsl_vector_memcpy(dx, state->dx_gn);
    }
  else if (state->rank < 2)
    {
      /* rank of [dx_sd, dx_gn] is 1, meaning dx_sd and dx_gn
       * are parallel so we can't form a 2D subspace. Follow the steepest
       * descent direction to the trust region boundary as our step */
      gsl_vector_memcpy(dx, state->dx_sd);
      gsl_vector_scale(dx, delta / state->norm_Dsd);
    }
  else
    {
      int status;
      const double delta_sq = delta * delta;
      double u = state->normg / delta;
      double a[5];
      double z[8];

#if 1
      a[0] = state->detB * state->detB - state->term0 / delta_sq;
      a[1] = 2 * state->detB * state->trB - 2 * state->term1 / delta_sq;
      a[2] = state->trB * state->trB + 2 * state->detB - u * u;
      a[3] = 2 * state->trB;
      a[4] = 1.0;
#else
      double TrB_D = state->trB * delta;
      double detB_D = state->detB * delta;
      double normg_sq = state->normg * state->normg;

      a[0] = detB_D * detB_D - state->term0;
      a[1] = 2 * state->detB * state->trB * delta_sq - 2 * state->term1;
      a[2] = TrB_D * TrB_D + 2 * state->detB * delta_sq - normg_sq;
      a[3] = 2 * state->trB * delta_sq;
      a[4] = delta_sq;
#endif

      status = gsl_poly_complex_solve(a, 5, state->poly_p, z);
      if (status == GSL_SUCCESS)
        {
          size_t i;
          double min = 0.0;
          int mini = -1;
          double x_data[2];
          gsl_vector_view x = gsl_vector_view_array(x_data, 2);

          /*
           * loop through all four values of the Lagrange multiplier
           * lambda. For each lambda, evaluate the objective function
           * with Re(lambda) to determine which lambda minimizes the
           * function
           */
          for (i = 0; i < 4; ++i)
            {
              double cost, normx;

              /*fprintf(stderr, "root: %.12e + %.12e i\n",
                      z[2*i], z[2*i+1]);*/

              status = subspace2D_solution(z[2*i], &x.vector, state);
              if (status != GSL_SUCCESS)
                continue; /* singular matrix system */

              /* ensure ||x|| = delta */

              normx = gsl_blas_dnrm2(&x.vector);
              if (normx == 0.0)
                continue;

              gsl_vector_scale(&x.vector, delta / normx);

              /* evaluate objective function to determine minimizer */
              cost = subspace2D_objective(&x.vector, state);
              if (mini < 0 || cost < min)
                {
                  mini = (int) i;
                  min = cost;
                }
            }

          if (mini < 0)
            {
              /* did not find minimizer - should not get here */
              return GSL_FAILURE;
            }
          else
            {
              /* compute x which minimizes objective function */
              subspace2D_solution(z[2*mini], &x.vector, state);

              /* dx = Q * x */
              gsl_vector_set_zero(dx);
              gsl_vector_set(dx, 0, gsl_vector_get(&x.vector, 0));
              gsl_vector_set(dx, 1, gsl_vector_get(&x.vector, 1));
              gsl_linalg_QR_Qvec(state->W, state->tau, dx);

              /* compute final dx by multiplying by D^{-1} */
              gsl_vector_div(dx, trust_state->diag);
            }
        }
      else
        {
          GSL_ERROR ("gsl_poly_complex_solve failed", status);
        }
    }

  return GSL_SUCCESS;
}

static int
subspace2D_preduction(const void * vtrust_state, const gsl_vector * dx,
                  double * pred, void * vstate)
{
  const gsl_multifit_nlinear_trust_state *trust_state =
    (const gsl_multifit_nlinear_trust_state *) vtrust_state;
  subspace2D_state_t *state = (subspace2D_state_t *) vstate;

  *pred = quadratic_preduction(trust_state->f, trust_state->J, dx, state->workn);

  return GSL_SUCCESS;
}

/* solve 2D subspace problem: (B + lambda*I) x = -g */
static int
subspace2D_solution(const double lambda, gsl_vector * x,
                    subspace2D_state_t * state)
{
  int status = GSL_SUCCESS;
  double C_data[4];
  gsl_matrix_view C = gsl_matrix_view_array(C_data, 2, 2);
  double B00 = gsl_matrix_get(state->subB, 0, 0);
  double B10 = gsl_matrix_get(state->subB, 1, 0);
  double B11 = gsl_matrix_get(state->subB, 1, 1);

  /* construct C = B + lambda*I */
  gsl_matrix_set(&C.matrix, 0, 0, B00 + lambda);
  gsl_matrix_set(&C.matrix, 1, 0, B10);
  gsl_matrix_set(&C.matrix, 0, 1, B10);
  gsl_matrix_set(&C.matrix, 1, 1, B11 + lambda);

  /* use modified Cholesky in case C is not positive definite */
  gsl_linalg_mcholesky_decomp(&C.matrix, state->perm, NULL);
  gsl_linalg_mcholesky_solve(&C.matrix, state->perm, state->subg, x);

  gsl_vector_scale(x, -1.0);

  return status;
}

/* evaluate 2D objective function: f(x) = g^T x + 1/2 x^T B x */
static double
subspace2D_objective(const gsl_vector * x, subspace2D_state_t * state)
{
  double f;
  double y_data[2];
  gsl_vector_view y = gsl_vector_view_array(y_data, 2);

  /* compute: y = g + 1/2 B x */
  gsl_vector_memcpy(&y.vector, state->subg);
  gsl_blas_dsymv(CblasLower, 0.5, state->subB, x, 1.0, &y.vector);

  /* compute: f = x^T ( g + 1/2 B x ) */
  gsl_blas_ddot(x, &y.vector, &f);

  return f;
}

/*
subspace2D_calc_gn()
  Calculate Gauss-Newton step which satisfies:

J dx_gn = -f

Inputs: trust_state - trust state variables
        dx          - (output) Gauss-Newton step

Return: success/error
*/

static int
subspace2D_calc_gn(const gsl_multifit_nlinear_trust_state * trust_state, gsl_vector * dx)
{
  int status;
  const gsl_multifit_nlinear_parameters *params = trust_state->params;

  /* initialize linear least squares solver */
  status = (params->solver->init)(trust_state, trust_state->solver_state);
  if (status)
    return status;

  /* prepare the linear solver to compute Gauss-Newton step */
  status = (params->solver->presolve)(0.0, trust_state, trust_state->solver_state);
  if (status)
    return status;

  /* solve: J dx_gn = -f for Gauss-Newton step */
  status = (params->solver->solve)(trust_state->f,
                                   dx,
                                   trust_state,
                                   trust_state->solver_state);
  if (status)
    return status;

  return GSL_SUCCESS;
}

/*
subspace2D_calc_sd()
  Calculate steepest descent step,

dx_sd = - || D^{-1} g ||^2 / || J D^{-2} g ||^2 D^{-2} g

Inputs: trust_state - trust state variables
        dx          - (output) steepest descent vector
        state       - workspace

Return: success/error
*/

static int
subspace2D_calc_sd(const gsl_multifit_nlinear_trust_state * trust_state, gsl_vector * dx,
                   subspace2D_state_t * state)
{
  double norm_Dinvg;   /* || D^{-1} g || */
  double norm_JDinv2g; /* || J D^{-2} g || */
  double alpha;        /* || D^{-1} g ||^2 / || J D^{-2} g ||^2 */
  double u;

  /* compute workp = D^{-1} g and its norm */
  gsl_vector_memcpy(state->workp, trust_state->g);
  gsl_vector_div(state->workp, trust_state->diag);
  norm_Dinvg = gsl_blas_dnrm2(state->workp);

  /* compute workp = D^{-2} g */
  gsl_vector_div(state->workp, trust_state->diag);

  /* compute: workn = J D^{-2} g */
  gsl_blas_dgemv(CblasNoTrans, 1.0, trust_state->J, state->workp, 0.0, state->workn);
  norm_JDinv2g = gsl_blas_dnrm2(state->workn);

  u = norm_Dinvg / norm_JDinv2g;
  alpha = u * u;

  /* dx_sd = -alpha D^{-2} g */
  gsl_vector_memcpy(dx, state->workp);
  gsl_vector_scale(dx, -alpha);

  return GSL_SUCCESS;
}

static const gsl_multifit_nlinear_trs subspace2D_type =
{
  "2D-subspace",
  subspace2D_alloc,
  subspace2D_init,
  subspace2D_preloop,
  subspace2D_step,
  subspace2D_preduction,
  subspace2D_free
};

const gsl_multifit_nlinear_trs *gsl_multifit_nlinear_trs_subspace2D = &subspace2D_type;