diff options
Diffstat (limited to 'kaldi_io/src/kaldi/matrix/optimization.h')
-rw-r--r-- | kaldi_io/src/kaldi/matrix/optimization.h | 248 |
1 files changed, 0 insertions, 248 deletions
diff --git a/kaldi_io/src/kaldi/matrix/optimization.h b/kaldi_io/src/kaldi/matrix/optimization.h
deleted file mode 100644
index 66309ac..0000000
--- a/kaldi_io/src/kaldi/matrix/optimization.h
+++ /dev/null
@@ -1,248 +0,0 @@
-// matrix/optimization.h
-
-// Copyright 2012  Johns Hopkins University (author: Daniel Povey)
-//
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABILITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-//
-// (*) incorporates, with permission, FFT code from his book
-// "Signal Processing with Lapped Transforms", Artech, 1992.
-
-
-#ifndef KALDI_MATRIX_OPTIMIZATION_H_
-#define KALDI_MATRIX_OPTIMIZATION_H_
-
-#include "matrix/kaldi-vector.h"
-#include "matrix/kaldi-matrix.h"
-
-namespace kaldi {
-
-
-/// @addtogroup matrix_optimization
-/// @{
-
-struct LinearCgdOptions {
-  int32 max_iters;  // Maximum number of iterations (if >= 0).
-  BaseFloat max_error;  // Maximum 2-norm of the residual A x - b
-                        // (convergence test).
-  // Every time the residual 2-norm decreases by this recompute_residual_factor
-  // since the last time it was computed from scratch, recompute it from
-  // scratch.  This helps to keep the computed residual accurate even in the
-  // presence of roundoff.
-  BaseFloat recompute_residual_factor;
-
-  LinearCgdOptions(): max_iters(-1),
-                      max_error(0.0),
-                      recompute_residual_factor(0.01) { }
-};
-
-/*
-  This function uses linear conjugate gradient descent to approximately solve
-  the system A x = b.  The value of x at entry corresponds to the initial
-  guess of x.  The algorithm continues until the number of iterations equals
-  b.Dim(), or until the 2-norm of (A x - b) is <= max_error, or until the
-  number of iterations equals max_iters, whichever happens sooner.  It is a
-  requirement that A be positive definite.
-  It returns the number of iterations that were actually executed (this is
-  useful for testing purposes).
-*/
-template<typename Real>
-int32 LinearCgd(const LinearCgdOptions &opts,
-                const SpMatrix<Real> &A, const VectorBase<Real> &b,
-                VectorBase<Real> *x);
-
-
-/**
-   This is an implementation of L-BFGS.  It pushes responsibility for
-   determining when to stop onto the user.  There is no call-back here:
-   everything is done via calls to the class itself (see the example in
-   matrix-lib-test.cc).  This does not implement constrained L-BFGS, but it
-   will handle constrained problems correctly as long as the function
-   approaches +infinity (or -infinity for maximization problems) as it gets
-   close to the bound of the constraint.  In these types of problems, just
-   let the function value be +infinity for minimization problems, or
-   -infinity for maximization problems, outside these bounds.
-*/
-
-struct LbfgsOptions {
-  bool minimize;  // if true, we're minimizing, else maximizing.
-  int m;  // m is the number of stored vectors L-BFGS keeps.
-  float first_step_learning_rate;  // The very first step of L-BFGS is
-  // like gradient descent.
-  // If you want to configure the size of that step, you can do it using
-  // this variable.
-  float first_step_length;  // If this variable is >0.0, it overrides
-  // first_step_learning_rate; on the first step we choose an approximate
-  // Hessian that is the multiple of the identity that would generate this
-  // step-length, or 1.0 if the gradient is zero.
-  float first_step_impr;  // If this variable is >0.0, it overrides
-  // first_step_learning_rate; on the first step we choose an approximate
-  // Hessian that is the multiple of the identity that would generate this
-  // amount of objective-function improvement (assuming the "real" objf
-  // were linear).
-  float c1;  // A constant in the Armijo rule = Wolfe condition i).
-  float c2;  // A constant in Wolfe condition ii).
-  float d;  // An amount > 1.0 (default 2.0) that we initially multiply or
-  // divide the step length by, in the line search.
-  int max_line_search_iters;  // after this many iters we restart L-BFGS.
-  int avg_step_length;  // number of iters to average the step length over, in
-  // RecentStepLength().
-
-  LbfgsOptions (bool minimize = true):
-      minimize(minimize),
-      m(10),
-      first_step_learning_rate(1.0),
-      first_step_length(0.0),
-      first_step_impr(0.0),
-      c1(1.0e-04),
-      c2(0.9),
-      d(2.0),
-      max_line_search_iters(50),
-      avg_step_length(4) { }
-};
-
-template<typename Real>
-class OptimizeLbfgs {
- public:
-  /// Initializer takes the starting value of x.
-  OptimizeLbfgs(const VectorBase<Real> &x,
-                const LbfgsOptions &opts);
-
-  /// This returns the value of the variable x that has the best objective
-  /// function so far, and the corresponding objective function value if
-  /// requested.  This would typically be called only at the end.
-  const VectorBase<Real>& GetValue(Real *objf_value = NULL) const;
-
-  /// This returns the value at which the function wants us
-  /// to compute the objective function and gradient.
-  const VectorBase<Real>& GetProposedValue() const { return new_x_; }
-
-  /// Returns the average magnitude of the last n steps (but not
-  /// more than the number we have stored).  Before we have taken
-  /// any steps, returns +infinity.  Note: if the most recent
-  /// step length was 0, it returns 0, regardless of the other
-  /// step lengths.  This makes it suitable as a convergence test
-  /// (else we'd generate NaNs).
-  Real RecentStepLength() const;
-
-  /// The user calls this function to provide the class with the
-  /// function and gradient info at the point GetProposedValue().
-  /// If this point is outside the constraints, you can set function_value
-  /// to {+infinity,-infinity} for {minimization,maximization} problems.
-  /// In this case the gradient, and also the second derivative (if you call
-  /// the second overloaded version of this function), will be ignored.
-  void DoStep(Real function_value,
-              const VectorBase<Real> &gradient);
-
-  /// The user can call this version of DoStep() if it is desired to set some
-  /// kind of approximate Hessian on this iteration.  Note: it is a
-  /// prerequisite that diag_approx_2nd_deriv must be strictly positive
-  /// (minimizing), or strictly negative (maximizing).
-  void DoStep(Real function_value,
-              const VectorBase<Real> &gradient,
-              const VectorBase<Real> &diag_approx_2nd_deriv);
-
- private:
-  KALDI_DISALLOW_COPY_AND_ASSIGN(OptimizeLbfgs);
-
-  // The following variable says what stage of the computation we're at.
-  // Refer to Algorithm 7.5 (L-BFGS) of Nocedal & Wright, "Numerical
-  // Optimization", 2nd edition.
-  // kBeforeStep means we're about to do
-  // "compute p_k <-- - H_k \nabla f_k" (i.e. Algorithm 7.4).
-  // kWithinStep means we're at some point within the line search; note
-  // that the line search is iterative, so we can stay in this state more
-  // than once on each iteration.
-  enum ComputationState {
-    kBeforeStep,
-    kWithinStep,  // This means we're within the step-size computation, and
-                  // have not yet done the 1st function evaluation.
-  };
-
-  inline MatrixIndexT Dim() { return x_.Dim(); }
-  inline MatrixIndexT M() { return opts_.m; }
-  SubVector<Real> Y(MatrixIndexT i) {
-    return SubVector<Real>(data_, (i % M()) * 2);  // vector y_i
-  }
-  SubVector<Real> S(MatrixIndexT i) {
-    return SubVector<Real>(data_, (i % M()) * 2 + 1);  // vector s_i
-  }
-  // The following are subroutines within DoStep():
-  bool AcceptStep(Real function_value,
-                  const VectorBase<Real> &gradient);
-  void Restart(const VectorBase<Real> &x,
-               Real function_value,
-               const VectorBase<Real> &gradient);
-  void ComputeNewDirection(Real function_value,
-                           const VectorBase<Real> &gradient);
-  void ComputeHifNeeded(const VectorBase<Real> &gradient);
-  void StepSizeIteration(Real function_value,
-                         const VectorBase<Real> &gradient);
-  void RecordStepLength(Real s);
-
-  LbfgsOptions opts_;
-  SignedMatrixIndexT k_;  // Iteration number; starts from zero and gets set
-                          // back to zero when we restart.
-
-  ComputationState computation_state_;
-  bool H_was_set_;  // True if the user specified H_; if false,
-                    // we'll use a heuristic to estimate it.
-
-  Vector<Real> x_;  // current x.
-  Vector<Real> new_x_;  // the x proposed in the line search.
-  Vector<Real> best_x_;  // the x with the best objective function so far
-                         // (either the same as x_, or something in the
-                         // current line search).
-  Vector<Real> deriv_;  // The most recently evaluated derivative, at x_k.
-  Vector<Real> temp_;
-  Real f_;  // The function evaluated at x_k.
-  Real best_f_;  // the best objective function value so far.
-  Real d_;  // a number d > 1.0; during an iteration we may decrease this
-            // when we switch between Armijo and Wolfe failures.
-
-  int num_wolfe_i_failures_;  // the number of times we decreased the step size.
-  int num_wolfe_ii_failures_;  // the number of times we increased the step size.
-  enum { kWolfeI, kWolfeII, kNone } last_failure_type_;  // last type of
-  // step-search failure on this iteration.
-
-  Vector<Real> H_;  // Current inverse-Hessian estimate.  May be computed by
-                    // this class itself, or provided by the user via the
-                    // second form of DoStep().
-  Matrix<Real> data_;  // dimension (m*2) x dim.  Even rows store
-                       // gradients y_i, odd rows store steps s_i.
-  Vector<Real> rho_;  // dimension m; rho_(m) = 1/(y_m^T s_m), Eq. 7.17.
-
-  std::vector<Real> step_lengths_;  // The step sizes we took on the last
-  // (up to m) iterations; these are not stored in a rotating buffer but
-  // are shifted by one each time (this is more convenient when we
-  // restart, as we keep this info past restarting).
-};
-
-/// @}
-
-}  // end namespace kaldi
-
-#endif
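
For reference, a minimal sketch of how the LinearCgd interface declared in this (now-deleted) header is typically driven. The dimension, the way A is made positive definite, and the tolerance are illustrative assumptions, not taken from the file:

// Sketch: solve A x = b with LinearCgd for a small symmetric positive
// definite A.  All setup values here are illustrative.
#include "matrix/optimization.h"

using namespace kaldi;

void LinearCgdSketch() {
  int32 dim = 10;
  Matrix<double> M(dim, dim);
  M.SetRandn();
  SpMatrix<double> A(dim);
  A.AddMat2(1.0, M, kNoTrans, 0.0);   // A = M M^T (positive semi-definite) ...
  for (int32 i = 0; i < dim; i++)
    A(i, i) += 1.0;                   // ... shifted to be strictly positive definite.

  Vector<double> b(dim), x(dim);
  b.SetRandn();
  x.SetZero();                        // x at entry is the initial guess.

  LinearCgdOptions opts;
  opts.max_error = 1.0e-05;           // stop once ||A x - b||_2 <= 1e-5.
  int32 iters_used = LinearCgd(opts, A, b, &x);  // returns iterations executed.
}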
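Similarly, a sketch of the GetProposedValue()/DoStep() driver loop that the class comment describes (the actual example it points to lives in matrix-lib-test.cc). The quadratic objective f(x) = 0.5 x^T x, the iteration cap, and the convergence threshold are assumptions for illustration only:

// Sketch: minimize f(x) = 0.5 x^T x (whose gradient is x) with OptimizeLbfgs.
#include "matrix/optimization.h"

using namespace kaldi;

void LbfgsSketch() {
  int32 dim = 10;
  Vector<float> x(dim);
  x.SetRandn();                            // starting point.

  LbfgsOptions opts(true);                 // true => minimize.
  OptimizeLbfgs<float> lbfgs(x, opts);

  for (int32 iter = 0; iter < 100; iter++) {
    const VectorBase<float> &cur = lbfgs.GetProposedValue();
    float f = 0.5 * VecVec(cur, cur);      // objective at the proposed point.
    Vector<float> grad(cur);               // gradient of 0.5 x^T x is x itself.
    lbfgs.DoStep(f, grad);                 // report value and gradient back.
    if (lbfgs.RecentStepLength() < 1.0e-06)
      break;                               // the convergence test the header suggests.
  }
  float best_f;
  const VectorBase<float> &best_x = lbfgs.GetValue(&best_f);  // best x and f(x).
}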