/***************************************************************************
**
**  This file is part of QGpCoreMath.
**
**  This library is free software; you can redistribute it and/or
**  modify it under the terms of the GNU Lesser General Public
**  License as published by the Free Software Foundation; either
**  version 2.1 of the License, or (at your option) any later version.
**
**  This file is distributed in the hope that it will be useful, but WITHOUT
**  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
**  FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
**  License for more details.
**
**  You should have received a copy of the GNU Lesser General Public
**  License along with this library; if not, write to the Free Software
**  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
**
**  See http://www.geopsy.org for more information.
**
**  Created: 2019-08-19
**  Copyright: 2019
**    Marc Wathelet (ISTerre, Grenoble, France)
**
***************************************************************************/

#include "OptimizationBFGS.h"

namespace QGpCoreMath {

  /*!
    \class OptimizationBFGS::Step OptimizationBFGS.h
    \brief Useful values describing a step

  */

  inline void OptimizationBFGS::Step::setDerivative(const Vector<double>& searchDirection)
  {
    // Derivative of phi at this step: scalar product of the gradient stored in the
    // function properties of this step with the search direction.
    auto& stepGradient=_f->gradient;
    _dphi=stepGradient.scalarProduct(searchDirection);
  }

  /*!
    \class OptimizationBFGS::Phi OptimizationBFGS.h
    \brief Phi function

    'Numerical Optimization' chapter 3.

    The phi function is defined by \phi(\alpha)=f(x_k+\alpha p_k) where
    p_k is a direction of ascent. \alpha is the step length.

    The objective is to find a maximum to this function, under the Wolfe conditions:
    \li Armijo condition ensures a sufficient increase
    \li The curvature condition. To tend to a maximum the slope must decrease.
  */

  /*!
    \fn OptimizationBFGS::Phi::Phi(int dimensionCount)

    Constructs a phi function for \a dimensionCount parameters.
    The origin (_x0) and the search direction (_p) are both constructed
    with \a dimensionCount and an initial value of 0.0.
  */

  inline OptimizationBFGS::Phi::Phi(int dimensionCount)
    : _x0(dimensionCount, 0.0),
      _p(dimensionCount, 0.0)
  {
  }

  /*!
    Sets the search direction as minus the product of the inverse Hessian approximation \a h
    and the gradient stored in the function properties at the origin (equation 6.2 p 136).

    The constant terms of the two (strong) Wolfe conditions are computed at the same time.
  */
  inline void OptimizationBFGS::Phi::setSearchDirection(const DoubleMatrix& h)
  {
    // Coefficient c1 must be between 0 and 0.5, usually chosen to be quite small.
    const double c1=1e-4;
    // Coefficient c2 must be between c1 and 1, 0.9 for Newton or quasi-Newton methods.
    const double c2=0.9;

    // p=-h*gradient
    _p.multiply(h, _f0->gradient);
    _p.changeSign();

    // Slope of phi at the origin. Its sign is not guaranteed because
    // _p and the gradient can be in drastically distinct directions.
    const double slope0=_f0->gradient.scalarProduct(_p);

    _c1k0=c1*slope0;
    // Absolute value for the strong Wolfe condition
    _c2k0=c2*fabs(slope0);
  }

  /*!
    Calculates the argument of the function used to evaluate phi function.

    \a arg is set to x_0+\alpha p, where x_0 is the origin (_x0), p the search
    direction (_p) and \a alpha the step length. The previous content of \a arg
    is overwritten.
  */
  inline void OptimizationBFGS::Phi::setArgument(double alpha, Vector<double>& arg) const
  {
    arg.copyValues(_p);   // arg=p
    arg*=alpha;           // arg=alpha*p
    arg+=_x0;             // arg=x_0+alpha*p
  }

  /*!
    First Wolfe condition: sufficient decrease.

    Returns true if the value of step \a s is not greater than the value at the origin
    plus the step length of \a s multiplied by the term computed in setSearchDirection().

    Section 3.1 on page 33.
  */
  inline bool OptimizationBFGS::Phi::isDecreaseCondition(const Step &s) const
  {
    const double upperBound=_f0->value+_c1k0*s.alpha();
    return s.value()<=upperBound;
  }

  /*!
    First Wolfe condition: sufficient increase (maximization).

    Returns true if the value of step \a s is not less than the value at the origin
    plus the step length of \a s multiplied by the term computed in setSearchDirection().

    Section 3.1 on page 33.
  */
  inline bool OptimizationBFGS::Phi::isIncreaseCondition(const Step &s) const
  {
    const double lowerBound=_f0->value+_c1k0*s.alpha();
    return s.value()>=lowerBound;
  }

  /*!
    Strong second Wolfe condition: curvature condition.

    Returns true if the absolute derivative of step \a s is not greater than
    the curvature term computed in setSearchDirection().

    Section 3.1 on page 34, eq. 3.7.
  */
  inline bool OptimizationBFGS::Phi::isCurvatureCondition(const Step& s) const
  {
    const double absoluteSlope=fabs(s.derivative());
    return absoluteSlope<=_c2k0;
  }

  /*!
    \class OptimizationBFGS OptimizationBFGS.h
    \brief Numerical optimizer based on BFGS method

    Implementation of the BFGS method described in
    'Numerical Optimization' by Nocedal and Wright.

    Section 6.1 on page 136.
  */

  /*!
    Constructs an optimizer for \a dimensionCount parameters.

    Four sets of function properties are allocated: the first three are attached
    to the three steps used by the line search, the last one to the phi function.
    All of them are released by the destructor.
  */
  OptimizationBFGS::OptimizationBFGS(int dimensionCount)
    : FunctionSearch(dimensionCount),
      _precision(dimensionCount, 0.0),
      _phi(dimensionCount),
      _absStep(dimensionCount, 0.0),
      _absShift(dimensionCount, 0.0),
      _sk(dimensionCount, 0.0),
      _yk(dimensionCount, 0.0),
      _x(dimensionCount, 0.0),
      _x0(dimensionCount, 0.0),
      _hk(dimensionCount, dimensionCount),
      _tk(dimensionCount, dimensionCount),
      _skskt(dimensionCount, dimensionCount)
  {
    TRACE;
    const int propertiesCount=4;
    const int stepCount=3;

    _iterationCount=0;
    for(int iProp=0; iProp<propertiesCount; ++iProp) {
      _fValues[iProp]=new AbstractFunction::Properties(dimensionCount);
    }
    // Each step works on its own set of function properties
    for(int iStep=0; iStep<stepCount; ++iStep) {
      Step * s=new Step;
      _steps[iStep]=s;
      s->setFunctionProperties(_fValues[iStep]);
    }
    // The remaining set of properties belongs to the phi function
    _phi.setFunctionProperties(_fValues[propertiesCount-1]);
  }

  /*!
    Deletes the three steps and the four sets of function properties
    allocated by the constructor.
  */
  OptimizationBFGS::~OptimizationBFGS()
  {
    TRACE;
    const int stepCount=3;
    const int propertiesCount=4;

    for(int iStep=0; iStep<stepCount; ++iStep) {
      delete _steps[iStep];
    }
    for(int iProp=0; iProp<propertiesCount; ++iProp) {
      delete _fValues[iProp];
    }
  }

  /*!
    Prepares a new optimization starting at \a p: the values of \a p are copied to the
    origin of the phi function and the matrix _hk is reset to identity.
  */
  void OptimizationBFGS::reset(const Vector<double>& p)
  {
    // With an identity matrix the first iterations follow the gradient direction.
    // Progressively, the approximation stored in _hk is refined by minimize()
    // when approaching the minimum.
    _hk.identity();
    _phi.x0().copyValues(p);
  }

  /*!
    Minimizes the function with the BFGS method.
    Algorithm 6.1 in 'Numerical Optimization', p. 140.

    Iterations are started from the current origin of the phi function (see reset()).

    \param maxShift maximum shift from the starting position, tested with
           hasElementGreaterThan() on the absolute shift after each step.
    \param maxIterationCount maximum number of iterations.

    Returns false if the maximum shift or the maximum number of iterations is exceeded.
    Returns true if iterations stop on a null step or on a succession of steps that do
    not exceed the precision.
  */
  bool OptimizationBFGS::minimize(const Vector<double>& maxShift, int maxIterationCount)
  {
    // Keep original position to track maximum shift TODO, test with reset to 0
    _x0.copyValues(_phi.x0());

    APP_LOG(1, tr("Starting at %1\n").arg(_phi.x0().toString()));
    _phi.functionProperties()->value=_function->value(_phi.x0());
    _function->gradient(_phi.x0(), _phi.functionProperties()->gradient);

    _iterationCount=0;
    int smallStepCount=0;
    while(true) {
      // The iteration counter is incremented only once per iteration, after a successful
      // step (see below). It must not be modified inside the log statement: the counter
      // would be incremented twice per iteration, or would depend on the verbosity level
      // if APP_LOG skips the evaluation of its message (NOTE(review): confirm the macro).
      APP_LOG(5, tr("Starting BFGS iteration %1\n").arg(_iterationCount+1));
      _phi.setSearchDirection(_hk);
      // Store the gradient of current function
      // Bracketing will overwrite this function.
      _yk.copyValues(_phi.functionProperties()->gradient);
      _yk.changeSign();       // New gradient will be added once the best step is found.
      // Function properties of _phi may be altered in bracketing() but it is not used
      // before being assigned to the function properties of bestStep.
      Step * bestStep=bracketing();
      // Equation 6.5, definition of sk and yk
      _sk.copyValues(_phi.searchDirection());
      _sk*=bestStep->alpha();
      _yk+=bestStep->functionProperties()->gradient;
      // Get H_{k+1} according to BFGS method (equation 6.17)
      double rk=_yk.scalarProduct(_sk);
      if(bestStep->alpha()>0.0 && rk!=0.0) {
        rk=1.0/rk;
        _tk.multiply(_yk, _sk, -rk);     // -\rho_k y_k s_k^T
        _tk.addIdentity();
        _hk=_tk.transposed()*_hk*_tk;
        _skskt.multiply(_sk, _sk, rk);
        _hk+=_skskt;

        // The best step becomes the new origin of the phi function
        _phi.setFunctionProperties(*bestStep);
        _phi.move(_sk);
        APP_LOG(1, tr("Step to %1\n").arg(_phi.x0().toString()));
      } else {
        APP_LOG(1, tr("Null step in BFGS optimization after %1 iterations\n").arg(_iterationCount));
        break;
      }

      _iterationCount++;
      APP_LOG(7, tr("it %1: x0=%2 => %3\n")
               .arg(_iterationCount, 3, 10, QChar(' '))
               .arg(_phi.x0().toUserString())
               .arg(_phi.functionProperties()->value));

      // Control of loop termination
      _absStep.copyValues(_sk);
      _absStep.abs();
      _absShift.copyValues(_phi.x0());
      _absShift-=_x0;
      _absShift.abs();
      if(_absShift.hasElementGreaterThan(maxShift)) {
        APP_LOG(1, tr("Maximum shift reached in BFGS optimization after %1 iterations\n").arg(_iterationCount));
        return false;
      }
      if(_iterationCount>maxIterationCount) {
        APP_LOG(1, tr("Maximum number of BFGS iteration reached (%1)\n").arg(_iterationCount));
        return false;
      }
      if(_absStep.hasElementGreaterThan(_precision)) {
        smallStepCount=0;
      } else {
        // Stop on a succession of small steps that do not exceed the precision:
        // the loop is left when a small step occurs while the counter is already above 5.
        if(smallStepCount>5) {
          APP_LOG(1, tr("At least 5 successive tiny steps, stop BFGS iteration after %1 iterations.\n")
                  .arg(_iterationCount));
          break;
        } else {
          smallStepCount++;
        }
      }
    }
    return true;
  }

  /*!
    Two step linear search: bracketing and then zooming

    Described in details in 'Numerical Optimization' by Nocedal and Wright
    In particular algorithms 3.5 page 60.

    The search is run along the current search direction of the phi function.
    Returns the selected step. A returned step with a null alpha means that no
    progress can be made (handled as a null step in minimize()).
  */
  OptimizationBFGS::Step * OptimizationBFGS::bracketing()
  {
    // References to the pointers stored in _steps: swapping them exchanges the roles
    // of the two steps without copying their content.
    Step *& step=_steps[0];         // alpha_i
    Step *& lastStep=_steps[1];     // alpha_{i-1}

    lastStep->setAlpha(0.0);
    // Second part of first condition is set to false for the first step i>1
    lastStep->setValue(std::numeric_limits<double>::infinity());
    // Initial alpha set to avoid too small steps that may lead to numerical error.
    // For instance if the function value is reaching the arithmetic precision.
    _dalpha=_precision.minimumAbsRatio(_phi.searchDirection());
    double maxAlpha=_function->admissibleStep(_phi.x0(), _phi.searchDirection());
    if(_dalpha>maxAlpha) {
      // The admissible step is less than the resolution in alpha: null step returned
      return lastStep;
    }
    // First trial: alpha=1, limited to the maximum admissible step
    if(maxAlpha>1.0) {
      step->setAlpha(1.0);
    } else {
      step->setAlpha(maxAlpha);
    }
    while(true) {
      // Calculates the value of f at current alpha. Required for the first condition.
      _phi.setArgument(step->alpha(), _x);
      step->setValue(_function->value(_x));
      // Calculates the gradient required for the curvature condition. Not required for
      // the decrease condition, but if step returned, its gradient might not up-to-date.
      _function->gradient(_x, step->functionProperties()->gradient);
      if(!_phi.isDecreaseCondition(*step) ||
         step->value()>=lastStep->value()) {
        // lastStep is still the origin (alpha=0): fill it with the properties of phi
        // at the origin before zooming, its value was set to infinity above.
        if(lastStep->alpha()==0.0) {
          lastStep->setValue(_phi.functionProperties()->value);
          lastStep->functionProperties()->gradient.copyValues(_phi.functionProperties()->gradient);
          lastStep->setDerivative(_phi.searchDirection());
        }
        // Zoom with lastStep as low bound and step as high bound
        return zoom(lastStep, step);
      }
      step->setDerivative(_phi.searchDirection());
      if(_phi.isCurvatureCondition(*step)) {
        // Both Wolfe conditions are fulfilled
        return step;
      }
      if(step->derivative()>=0) {
        // Same initialization of the origin as above
        if(lastStep->alpha()==0.0) {
          lastStep->setValue(_phi.functionProperties()->value);
          lastStep->functionProperties()->gradient.copyValues(_phi.functionProperties()->gradient);
          lastStep->setDerivative(_phi.searchDirection());
        }
        // Zoom with step as low bound and lastStep as high bound
        return zoom(step, lastStep);
      }
      // Extend the interval: alpha is doubled, limited to the maximum admissible step
      double nextAlpha=2.0*step->alpha();
      // NOTE(review): alpha is initialized to at most maxAlpha and nextAlpha is clamped to
      // maxAlpha below, hence this strict comparison looks like it can never be true.
      // Confirm whether '>=' was intended.
      if(step->alpha()>maxAlpha) {
        APP_LOG(1, tr("Maximum alpha reached in BFGS bracketing\n"));
        step->setAlpha(0.0);
        return step;
      }
      if(nextAlpha>maxAlpha) {
        nextAlpha=maxAlpha;
      }
      // Current step becomes the last step, the former last step is recycled for the next trial
      qSwap(step, lastStep);
      step->setAlpha(nextAlpha);
    }
  }

  /*!
    Returns the abscissa of the extremum of the parabola

    \f[ y=a x^2+b x+c \f]
    passing by \f$(x_1, y_1)\f$ and \f$(x_2, y_2)\f$ with \f$y'(x_1)=dy_1\f$

    We are looking for the extremum of this function at \f$x_0\f$ between \f$x_1\f$ and \f$x_2\f$, hence

    \f[ y'(x_0)=2a x_0+b=0 \f]

    and we can write

    \f[ y=a x^2-2 a x_0 x+c \f]

    \f{eqnarray*}{
      \frac{y_2-y_1}{a} &=& (x_2^2-x_1^2) - 2 x_0 (x_2-x_1) \\
      dy_1              &=& 2 a x_1 - 2 a x_0 \\
      a                 &=& \frac{dy_1}{2 (x_1-x_0)} \\
      (y_2-y_1)(x_1-x_0) \frac{2}{dy_1} &=&  (x_2^2-x_1^2) - 2 x_0 (x_2-x_1) \\
      x_0 ( 2 (x_2-x_1) - (y_2-y_1)\frac{2}{dy_1}) &=& (x_2^2-x_1^2) - x_1 (y_2-y_1) \frac{2}{dy_1} \\
      x_0 ( dy_1 (x_2-x_1) - (y_2-y_1) ) &=& \frac{dy_1}{2}(x_2^2-x_1^2) - x_1 (y_2-y_1) \\
    \f}

    The returned value is always forced between \a x1 and \a x2. If the extremum is
    undetermined (both sides of the last equation are null, e.g. for a flat function
    or if \a x1 equals \a x2), the middle of the interval is returned.
  */
  inline double OptimizationBFGS::quadratic(double x1, double y1, double dy1,
                                            double x2, double y2)
  {
    double my21=y2-y1;
    double mx21=x2-x1;
    double px21=x2+x1;
    double numerator=0.5*dy1*mx21*px21-x1*my21;
    double denominator=dy1*mx21-my21;
    if(numerator==0.0 && denominator==0.0) {
      // 0/0 would give a NaN which is not corrected by the comparisons below
      // (all comparisons with a NaN are false).
      return 0.5*(x1+x2);
    }
    // A null denominator alone gives an infinite value, forced to one of the bounds below.
    double x=numerator/denominator;
    // Make sure that arithmetic errors do not lead to a new x outside x1 and x2
    if(x1<x2) {
      if(x<x1) {
        x=x1;
      } else if(x>x2) {
        x=x2;
      }
    } else {
      if(x<x2) {
        x=x2;
      } else if(x>x1) {
        x=x1;
      }
    }
    return x;
  }

  /*!
    Returns the middle of the interval delimited by \a x1 and \a x2.
  */
  inline double OptimizationBFGS::bisection(double x1, double x2)
  {
    const double sum=x1+x2;
    return 0.5*sum;
  }

  /*!
    Two step linear search: bracketing and then zooming

    Described in details in 'Numerical Optimization' by Nocedal and Wright
    In particular algorithms 3.6 page 61.

    \a low and \a high are the bounds of the interval of alpha to explore, as provided by
    bracketing(). They are references to pointers: the pointers are exchanged with the third
    step of _steps at each iteration, without copying the content of the steps.

    Returns the first trial step that fulfills the decrease and the curvature conditions.
    If the interval becomes smaller than _dalpha or if 100 iterations are not sufficient,
    the bound with the lowest value is returned.
  */
  OptimizationBFGS::Step * OptimizationBFGS::zoom(Step *& low, Step *& high)
  {
    Step *& mid=_steps[2];

    for(int i=0; i<100; i++) {
      if(fabs(low->alpha()-high->alpha())<_dalpha) {
        return low->value()<high->value() ? low : high;
      }
      //mid->setAlpha(quadratic(low->alpha(), low->value(), low->derivative(),
      //                        high->alpha(), high->value()));
      mid->setAlpha(bisection(low->alpha(), high->alpha()));
      _phi.setArgument(mid->alpha(), _x);
      mid->setValue(_function->value(_x));
      // Calculates the gradient required for the curvature condition. Not required for
      // the decrease condition, but if step returned, its gradient might not up-to-date.
      _function->gradient(_x, mid->functionProperties()->gradient);
      if(!_phi.isDecreaseCondition(*mid) || mid->value()>=low->value()) {
        if(fabs(mid->alpha()-high->alpha())<_dalpha) {
          return mid->value()<high->value() ? mid : high;
        }
        // The trial step becomes the new high bound, the former high bound is recycled
        qSwap(high, mid);
      } else {
        mid->setDerivative(_phi.searchDirection());
        if(_phi.isCurvatureCondition(*mid)) {
          return mid;
        }
        if(mid->derivative()*(high->alpha()-low->alpha())>=0) {
          // The former low bound becomes the new high bound
          qSwap(high, low);
        }
        // The trial step becomes the new low bound
        qSwap(low, mid);
      }
    }
    // Maximum number of iterations reached. After the last exchange, mid points to a
    // discarded step and not to the last trial step. Hence the best of the two current
    // bounds is returned, like for an interval smaller than _dalpha.
    return low->value()<high->value() ? low : high;
  }

  void OptimizationBFGS::debugAlpha(double min, double max)
  {
    Step *& mid=_steps[2];
    QFile fval("/tmp/alpha-val");
    fval.open(QIODevice::WriteOnly);
    QTextStream sval(&fval);
    QFile fder("/tmp/alpha-der");
    fder.open(QIODevice::WriteOnly);
    QTextStream sder(&fder);
    sval << "# val" << Qt::endl;
    sder << "# der" << Qt::endl;
    double delta=(max-min)/30.0;
    for(double alpha=min; alpha<max; alpha+=delta) {
      mid->setAlpha(alpha);
      _phi.setArgument(mid->alpha(), _x);
      mid->setValue(_function->value(_x));
      _function->gradient(_x, mid->functionProperties()->gradient);
      mid->setDerivative(_phi.searchDirection());
      sval << QString::number(alpha) << " " << mid->value() << Qt::endl;
      sder << QString::number(alpha) << " " << mid->derivative() << Qt::endl;
    }
    fval.close();
    fder.close();
  }

  void OptimizationBFGS::debugFunction(double range)
  {
    int n=_x.count();
    double delta=range/15.0;
    _x.copyValues(_phi.x0());
    PrivateVector<double> grad(_x);
    for(int i=0; i<n; i++) {
      QFile fval(QString("/tmp/f_%1").arg(i, 2, 10, QChar('0')));
      fval.open(QIODevice::WriteOnly);
      QTextStream sval(&fval);
      //QFile fder(QString("/tmp/df_%1").arg(i, 2, 10, QChar('0')));
      //fder.open(QIODevice::WriteOnly);
      //QTextStream sder(&fder);
      for(double x=-range; x<range; x+=delta) {
        _x.at(i)+=x;
        sval << Angle::radiansToDegrees(x) << " " << _function->value(_x) << Qt::endl;
        //_function->gradient(_x, grad);
        //sder << x << " " << grad.at(axis) << Qt::endl;
        _x.at(i)-=x;
      }
      fval.close();
      //fder.close();
    }
  }

} // namespace QGpCoreMath

