// matrix/optimization.h // Copyright 2012 Johns Hopkins University (author: Daniel Povey) // // See ../../COPYING for clarification regarding multiple authors // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, // MERCHANTABLITY OR NON-INFRINGEMENT. // See the Apache 2 License for the specific language governing permissions and // limitations under the License. // // (*) incorporates, with permission, FFT code from his book // "Signal Processing with Lapped Transforms", Artech, 1992. #ifndef KALDI_MATRIX_OPTIMIZATION_H_ #define KALDI_MATRIX_OPTIMIZATION_H_ #include "matrix/kaldi-vector.h" #include "matrix/kaldi-matrix.h" namespace kaldi { /// @addtogroup matrix_optimization /// @{ struct LinearCgdOptions { int32 max_iters; // Maximum number of iters (if >= 0). BaseFloat max_error; // Maximum 2-norm of the residual A x - b (convergence // test) // Every time the residual 2-norm decreases by this recompute_residual_factor // since the last time it was computed from scratch, recompute it from // scratch. This helps to keep the computed residual accurate even in the // presence of roundoff. BaseFloat recompute_residual_factor; LinearCgdOptions(): max_iters(-1), max_error(0.0), recompute_residual_factor(0.01) { } }; /* This function uses linear conjugate gradient descent to approximately solve the system A x = b. The value of x at entry corresponds to the initial guess of x. The algorithm continues until the number of iterations equals b.Dim(), or until the 2-norm of (A x - b) is <= max_error, or until the number of iterations equals max_iter, whichever happens sooner. It is a requirement that A be positive definite. It returns the number of iterations that were actually executed (this is useful for testing purposes). */ template int32 LinearCgd(const LinearCgdOptions &opts, const SpMatrix &A, const VectorBase &b, VectorBase *x); /** This is an implementation of L-BFGS. It pushes responsibility for determining when to stop, onto the user. There is no call-back here: everything is done via calls to the class itself (see the example in matrix-lib-test.cc). This does not implement constrained L-BFGS, but it will handle constrained problems correctly as long as the function approaches +infinity (or -infinity for maximization problems) when it gets close to the bound of the constraint. In these types of problems, you just let the function value be +infinity for minimization problems, or -infinity for maximization problems, outside these bounds). */ struct LbfgsOptions { bool minimize; // if true, we're minimizing, else maximizing. int m; // m is the number of stored vectors L-BFGS keeps. float first_step_learning_rate; // The very first step of L-BFGS is // like gradient descent. If you want to configure the size of that step, // you can do it using this variable. float first_step_length; // If this variable is >0.0, it overrides // first_step_learning_rate; on the first step we choose an approximate // Hessian that is the multiple of the identity that would generate this // step-length, or 1.0 if the gradient is zero. float first_step_impr; // If this variable is >0.0, it overrides // first_step_learning_rate; on the first step we choose an approximate // Hessian that is the multiple of the identity that would generate this // amount of objective function improvement (assuming the "real" objf // was linear). float c1; // A constant in Armijo rule = Wolfe condition i) float c2; // A constant in Wolfe condition ii) float d; // An amount > 1.0 (default 2.0) that we initially multiply or // divide the step length by, in the line search. int max_line_search_iters; // after this many iters we restart L-BFGS. int avg_step_length; // number of iters to avg step length over, in // RecentStepLength(). LbfgsOptions (bool minimize = true): minimize(minimize), m(10), first_step_learning_rate(1.0), first_step_length(0.0), first_step_impr(0.0), c1(1.0e-04), c2(0.9), d(2.0), max_line_search_iters(50), avg_step_length(4) { } }; template class OptimizeLbfgs { public: /// Initializer takes the starting value of x. OptimizeLbfgs(const VectorBase &x, const LbfgsOptions &opts); /// This returns the value of the variable x that has the best objective /// function so far, and the corresponding objective function value if /// requested. This would typically be called only at the end. const VectorBase& GetValue(Real *objf_value = NULL) const; /// This returns the value at which the function wants us /// to compute the objective function and gradient. const VectorBase& GetProposedValue() const { return new_x_; } /// Returns the average magnitude of the last n steps (but not /// more than the number we have stored). Before we have taken /// any steps, returns +infinity. Note: if the most recent /// step length was 0, it returns 0, regardless of the other /// step lengths. This makes it suitable as a convergence test /// (else we'd generate NaN's). Real RecentStepLength() const; /// The user calls this function to provide the class with the /// function and gradient info at the point GetProposedValue(). /// If this point is outside the constraints you can set function_value /// to {+infinity,-infinity} for {minimization,maximization} problems. /// In this case the gradient, and also the second derivative (if you call /// the second overloaded version of this function) will be ignored. void DoStep(Real function_value, const VectorBase &gradient); /// The user can call this version of DoStep() if it is desired to set some /// kind of approximate Hessian on this iteration. Note: it is a prerequisite /// that diag_approx_2nd_deriv must be strictly positive (minimizing), or /// negative (maximizing). void DoStep(Real function_value, const VectorBase &gradient, const VectorBase &diag_approx_2nd_deriv); private: KALDI_DISALLOW_COPY_AND_ASSIGN(OptimizeLbfgs); // The following variable says what stage of the computation we're at. // Refer to Algorithm 7.5 (L-BFGS) of Nodecdal & Wright, "Numerical // Optimization", 2nd edition. // kBeforeStep means we're about to do /// "compute p_k <-- - H_k \delta f_k" (i.e. Algorithm 7.4). // kWithinStep means we're at some point within line search; note // that line search is iterative so we can stay in this state more // than one time on each iteration. enum ComputationState { kBeforeStep, kWithinStep, // This means we're within the step-size computation, and // have not yet done the 1st function evaluation. }; inline MatrixIndexT Dim() { return x_.Dim(); } inline MatrixIndexT M() { return opts_.m; } SubVector Y(MatrixIndexT i) { return SubVector(data_, (i % M()) * 2); // vector y_i } SubVector S(MatrixIndexT i) { return SubVector(data_, (i % M()) * 2 + 1); // vector s_i } // The following are subroutines within DoStep(): bool AcceptStep(Real function_value, const VectorBase &gradient); void Restart(const VectorBase &x, Real function_value, const VectorBase &gradient); void ComputeNewDirection(Real function_value, const VectorBase &gradient); void ComputeHifNeeded(const VectorBase &gradient); void StepSizeIteration(Real function_value, const VectorBase &gradient); void RecordStepLength(Real s); LbfgsOptions opts_; SignedMatrixIndexT k_; // Iteration number, starts from zero. Gets set back to zero // when we restart. ComputationState computation_state_; bool H_was_set_; // True if the user specified H_; if false, // we'll use a heuristic to estimate it. Vector x_; // current x. Vector new_x_; // the x proposed in the line search. Vector best_x_; // the x with the best objective function so far // (either the same as x_ or something in the current line search.) Vector deriv_; // The most recently evaluated derivative-- at x_k. Vector temp_; Real f_; // The function evaluated at x_k. Real best_f_; // the best objective function so far. Real d_; // a number d > 1.0, but during an iteration we may decrease this, when // we switch between armijo and wolfe failures. int num_wolfe_i_failures_; // the num times we decreased step size. int num_wolfe_ii_failures_; // the num times we increased step size. enum { kWolfeI, kWolfeII, kNone } last_failure_type_; // last type of step-search // failure on this iter. Vector H_; // Current inverse-Hessian estimate. May be computed by this class itself, // or provided by user using 2nd form of SetGradientInfo(). Matrix data_; // dimension (m*2) x dim. Even rows store // gradients y_i, odd rows store steps s_i. Vector rho_; // dimension m; rho_(m) = 1/(y_m^T s_m), Eq. 7.17. std::vector step_lengths_; // The step sizes we took on the last // (up to m) iterations; these are not stored in a rotating buffer but // are shifted by one each time (this is more convenient when we // restart, as we keep this info past restarting). }; /// @} } // end namespace kaldi #endif