/* multilarge_nlinear/cgst.c
 * 
 * Copyright (C) 2016 Patrick Alken
 * 
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3 of the License, or (at
 * your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 */

#include <config.h>
#include <gsl/gsl_math.h>
#include <gsl/gsl_vector.h>
#include <gsl/gsl_matrix.h>
#include <gsl/gsl_linalg.h>
#include <gsl/gsl_multilarge_nlinear.h>
#include <gsl/gsl_errno.h>
#include <gsl/gsl_blas.h>

/*
 * This module contains an implementation of the Steihaug-Toint
 * conjugate gradient algorithm for nonlinear optimization problems.
 * This implementation closely follows the following works:
 *
 * [1] T. Steihaug, The conjugate gradient method and trust regions
 *     in large scale optimization, SIAM J. Num. Anal., 20(3) 1983.
 *
 * In the below algorithm, the Jacobian and gradient are scaled
 * according to:
 *
 * J~ = J D^{-1}
 * g~ = D^{-1}
 *
 * prior to any calculations which results in better numerical
 * stability when solving for the Gauss-Newton step. The resulting
 * step vector is then backtransformed as:
 *
 * dx = D^{-1} dx~
 */

typedef struct
{
  size_t n;                  /* number of observations */
  size_t p;                  /* number of parameters */
  gsl_vector *z;             /* Gauss-Newton step, size p */
  gsl_vector *r;             /* steepest descent step, size p */
  gsl_vector *d;             /* steepest descent step, size p */
  gsl_vector *workp;         /* workspace, length p */
  gsl_vector *workn;         /* workspace, length n */
  double norm_g;             /* || g~ || */

  double cgtol;              /* tolerance for CG solution */
  size_t cgmaxit;            /* maximum CG iterations */
} cgst_state_t;

#include "common.c"

static void * cgst_alloc (const void * params, const size_t n, const size_t p);
static void cgst_free(void *vstate);
static int cgst_init(const void *vtrust_state, void *vstate);
static int cgst_preloop(const void * vtrust_state, void * vstate);
static int cgst_step(const void * vtrust_state, const double delta,
                     gsl_vector * dx, void * vstate);
static int cgst_preduction(const void * vtrust_state, const gsl_vector * dx,
                           double * pred, void * vstate);
static double cgst_calc_tau(const gsl_vector * p, const gsl_vector * d,
                            const double delta);

static void *
cgst_alloc (const void * params, const size_t n, const size_t p)
{
  const gsl_multilarge_nlinear_parameters *par = (const gsl_multilarge_nlinear_parameters *) params;
  cgst_state_t *state;
  
  state = calloc(1, sizeof(cgst_state_t));
  if (state == NULL)
    {
      GSL_ERROR_NULL ("failed to allocate st state", GSL_ENOMEM);
    }

  state->z = gsl_vector_alloc(p);
  if (state->z == NULL)
    {
      GSL_ERROR_NULL ("failed to allocate space for z", GSL_ENOMEM);
    }

  state->r = gsl_vector_alloc(p);
  if (state->r == NULL)
    {
      GSL_ERROR_NULL ("failed to allocate space for r", GSL_ENOMEM);
    }

  state->d = gsl_vector_alloc(p);
  if (state->d == NULL)
    {
      GSL_ERROR_NULL ("failed to allocate space for d", GSL_ENOMEM);
    }

  state->workp = gsl_vector_alloc(p);
  if (state->workp == NULL)
    {
      GSL_ERROR_NULL ("failed to allocate space for workp", GSL_ENOMEM);
    }

  state->workn = gsl_vector_alloc(n);
  if (state->workn == NULL)
    {
      GSL_ERROR_NULL ("failed to allocate space for workn", GSL_ENOMEM);
    }

  state->n = n;
  state->p = p;

  state->cgmaxit = par->max_iter;
  if (state->cgmaxit == 0)
    state->cgmaxit = n;

  state->cgtol = par->tol;

  return state;
}

static void
cgst_free(void *vstate)
{
  cgst_state_t *state = (cgst_state_t *) vstate;

  if (state->z)
    gsl_vector_free(state->z);

  if (state->r)
    gsl_vector_free(state->r);

  if (state->d)
    gsl_vector_free(state->d);

  if (state->workp)
    gsl_vector_free(state->workp);

  if (state->workn)
    gsl_vector_free(state->workn);

  free(state);
}

/*
cgst_init()
  Initialize cgst solver

Inputs: vtrust_state - trust state
        vstate       - workspace

Return: success/error
*/

static int
cgst_init(const void *vtrust_state, void *vstate)
{
  /* nothing to do */

  (void)vtrust_state;
  (void)vstate;

  return GSL_SUCCESS;
}

static int
cgst_preloop(const void * vtrust_state, void * vstate)
{
  /* nothing to do */

  (void)vtrust_state;
  (void)vstate;

  return GSL_SUCCESS;
}

/*
cgst_step()
  Calculate a new step vector

Return:
GSL_SUCCESS if CG solution found
GSL_EMAXITER if no solution found
*/

static int
cgst_step(const void * vtrust_state, const double delta,
          gsl_vector * dx, void * vstate)
{
  int status;
  const gsl_multilarge_nlinear_trust_state *trust_state =
    (const gsl_multilarge_nlinear_trust_state *) vtrust_state;
  cgst_state_t *state = (cgst_state_t *) vstate;
  const gsl_vector * x = trust_state->x;
  const gsl_vector * f = trust_state->f;
  const gsl_vector * swts = trust_state->sqrt_wts;
  const gsl_vector * diag = trust_state->diag;
  const gsl_multilarge_nlinear_parameters * params = trust_state->params;
  gsl_multilarge_nlinear_fdf * fdf = trust_state->fdf;
  double alpha, beta, u;
  double norm_Jd;   /* || J D^{-1} d_i || */
  double norm_r;    /* || r_i || */
  double norm_rp1;  /* || r_{i+1} || */
  size_t i;

  /* Step 1 of [1], section 2; scale gradient as
   *
   * g~ = D^{-1} g
   *
   * for better numerical stability
   */

  for (i = 0; i < state->p; ++i)
    {
      double gi = gsl_vector_get(trust_state->g, i);
      double di = gsl_vector_get(trust_state->diag, i);

      gsl_vector_set(state->z, i, 0.0);
      gsl_vector_set(state->r, i, -gi / di);
      gsl_vector_set(state->d, i, -gi / di);
      gsl_vector_set(state->workp, i, gi / di);
    }

  /* compute || g~ || */
  state->norm_g = gsl_blas_dnrm2(state->workp);

  for (i = 0; i < state->cgmaxit; ++i)
    {
      /* workp := D^{-1} d_i */
      gsl_vector_memcpy(state->workp, state->d);
      gsl_vector_div(state->workp, trust_state->diag);

      /* workn := J D^{-1} d_i */
      status = gsl_multilarge_nlinear_eval_df(CblasNoTrans, x, f, state->workp,
                                              swts, params->h_df, params->fdtype,
                                              fdf, state->workn, NULL, NULL);
      if (status)
        return status;

      /* compute || J D^{-1} d_i || */
      norm_Jd = gsl_blas_dnrm2(state->workn);

      /* Step 2 of [1], section 2 */
      if (norm_Jd == 0.0)
        {
          double tau = cgst_calc_tau(state->z, state->d, delta);

          /* dx = z_i + tau*d_i */
          scaled_addition(1.0, state->z, tau, state->d, dx);
          gsl_vector_div(dx, diag);

          return GSL_SUCCESS;
        }

      /* Step 3 of [1], section 2 */

      norm_r = gsl_blas_dnrm2(state->r);
      u = norm_r / norm_Jd;
      alpha = u * u;

      /* workp <= z_{i+1} = z_i + alpha_i*d_i */
      scaled_addition(1.0, state->z, alpha, state->d, state->workp);

      u = gsl_blas_dnrm2(state->workp);
      if (u >= delta)
        {
          double tau = cgst_calc_tau(state->z, state->d, delta);

          /* dx = z_i + tau*d_i */
          scaled_addition(1.0, state->z, tau, state->d, dx);
          gsl_vector_div(dx, diag);

          return GSL_SUCCESS;
        }

      /* store z_{i+1} */
      gsl_vector_memcpy(state->z, state->workp);

      /* Step 4 of [1], section 2 */

      /* compute: workp := alpha B d_i = alpha D^{-1} J^T J D^{-1} d_i,
       * where J D^{-1} d_i is already stored in workn */
      status = gsl_multilarge_nlinear_eval_df(CblasTrans, x, f, state->workn,
                                              swts, params->h_df, params->fdtype,
                                              fdf, state->workp, NULL, NULL);
      if (status)
        return status;

      gsl_vector_div(state->workp, trust_state->diag);
      gsl_vector_scale(state->workp, alpha);

      /* r_{i+1} = r_i - alpha*B*d_i */
      gsl_vector_sub(state->r, state->workp);
      norm_rp1 = gsl_blas_dnrm2(state->r);

      u = norm_rp1 / state->norm_g;
      if (u < state->cgtol)
        {
          gsl_vector_memcpy(dx, state->z);
          gsl_vector_div(dx, diag);
          return GSL_SUCCESS;
        }

      /* Step 5 of [1], section 2 */

      /* compute u = ||r_{i+1}|| / || r_i|| */
      u = norm_rp1 / norm_r;
      beta = u * u;

      /* compute: d_{i+1} = rt_{i+1} + beta*d_i */
      scaled_addition(1.0, state->r, beta, state->d, state->d);
    }

  /* failed to converge, return current estimate */
  gsl_vector_memcpy(dx, state->z);
  gsl_vector_div(dx, diag);

  return GSL_EMAXITER;
}

static int
cgst_preduction(const void * vtrust_state, const gsl_vector * dx,
                double * pred, void * vstate)
{
  const gsl_multilarge_nlinear_trust_state *trust_state =
    (const gsl_multilarge_nlinear_trust_state *) vtrust_state;
  cgst_state_t *state = (cgst_state_t *) vstate;

  *pred = quadratic_preduction(trust_state, dx, state->workn);

  return GSL_SUCCESS;
}

/*
cgst_calc_tau()
  Compute tau > 0 such that:

|| p + tau*d || = delta
*/

static double
cgst_calc_tau(const gsl_vector * p, const gsl_vector * d,
              const double delta)
{
  double norm_p, norm_d, u;
  double t1, t2, tau;

  norm_p = gsl_blas_dnrm2(p);
  norm_d = gsl_blas_dnrm2(d);

  /* compute (p, d) */
  gsl_blas_ddot(p, d, &u);

  t1 = u / (norm_d * norm_d);
  t2 = t1*u + (delta + norm_p) * (delta - norm_p);
  tau = -t1 + sqrt(t2) / norm_d;

  return tau;
}

static const gsl_multilarge_nlinear_trs cgst_type =
{
  "steihaug-toint",
  cgst_alloc,
  cgst_init,
  cgst_preloop,
  cgst_step,
  cgst_preduction,
  cgst_free
};

const gsl_multilarge_nlinear_trs *gsl_multilarge_nlinear_trs_cgst = &cgst_type;