Diffstat (limited to 'kaldi_io/src/kaldi/matrix/optimization.h')
-rw-r--r--  kaldi_io/src/kaldi/matrix/optimization.h  248
1 file changed, 248 insertions, 0 deletions
diff --git a/kaldi_io/src/kaldi/matrix/optimization.h b/kaldi_io/src/kaldi/matrix/optimization.h
new file mode 100644
index 0000000..66309ac
--- /dev/null
+++ b/kaldi_io/src/kaldi/matrix/optimization.h
@@ -0,0 +1,248 @@
+// matrix/optimization.h
+
+// Copyright 2012 Johns Hopkins University (author: Daniel Povey)
+//
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED ON AN *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS
+// OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY
+// IMPLIED WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR
+// PURPOSE, MERCHANTABILITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+//
+// (*) incorporates, with permission, FFT code from Henrique Malvar's book
+// "Signal Processing with Lapped Transforms", Artech House, 1992.
+
+
+
+#ifndef KALDI_MATRIX_OPTIMIZATION_H_
+#define KALDI_MATRIX_OPTIMIZATION_H_
+
+#include "matrix/kaldi-vector.h"
+#include "matrix/kaldi-matrix.h"
+
+namespace kaldi {
+
+
+/// @addtogroup matrix_optimization
+/// @{
+
+struct LinearCgdOptions {
+  int32 max_iters;  // Maximum number of iterations (no limit if < 0).
+  BaseFloat max_error;  // Maximum 2-norm of the residual A x - b
+                        // (convergence test).
+  // Every time the residual 2-norm decreases by this factor
+  // (recompute_residual_factor) since the last time it was computed from
+  // scratch, recompute it from scratch.  This helps to keep the computed
+  // residual accurate even in the presence of roundoff.
+  BaseFloat recompute_residual_factor;
+
+ LinearCgdOptions(): max_iters(-1),
+ max_error(0.0),
+ recompute_residual_factor(0.01) { }
+};
+
+/*
+  This function uses linear conjugate gradient descent to approximately solve
+  the system A x = b.  The value of x at entry is used as the initial guess.
+  The algorithm runs until the number of iterations equals b.Dim(), or until
+  the 2-norm of (A x - b) is <= max_error, or until the number of iterations
+  equals max_iters, whichever happens sooner.  It is a requirement that A be
+  positive definite.  The function returns the number of iterations actually
+  executed (this is useful for testing purposes).
+*/
+template<typename Real>
+int32 LinearCgd(const LinearCgdOptions &opts,
+ const SpMatrix<Real> &A, const VectorBase<Real> &b,
+ VectorBase<Real> *x);
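+
+// Example (an illustrative sketch, not part of the original header; the
+// matrix values are made up).  Solve A x = b for a small symmetric
+// positive-definite A, starting from x = 0:
+//
+//   SpMatrix<double> A(2);
+//   A(0, 0) = 4.0;  A(1, 0) = 1.0;  A(1, 1) = 3.0;  // SPD by construction.
+//   Vector<double> b(2);
+//   b(0) = 1.0;  b(1) = 2.0;
+//   Vector<double> x(2);               // Initial guess: the zero vector.
+//   LinearCgdOptions opts;
+//   opts.max_error = 1.0e-05;          // Stop once ||A x - b||_2 <= 1e-5.
+//   int32 iters = LinearCgd(opts, A, b, &x);  // x now approximates A^{-1} b.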
+
+
+
+
+
+
+/**
+   This is an implementation of L-BFGS.  It pushes responsibility for
+   determining when to stop onto the user.  There is no callback here:
+   everything is done via calls to the class itself (see the example in
+   matrix-lib-test.cc, and the condensed sketch below).  This does not
+   implement constrained L-BFGS, but it will handle constrained problems
+   correctly as long as the function approaches +infinity (for minimization
+   problems) or -infinity (for maximization problems) as it gets close to the
+   boundary of the constraint set.  In such problems, simply let the function
+   value be +infinity (minimization) or -infinity (maximization) outside the
+   bounds.
+*/
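+//
+// A condensed sketch of the calling protocol (illustrative only;
+// ComputeObjfAndGradient is an assumed user-supplied routine, and the
+// dimension, iteration limit, and convergence threshold are made up):
+//
+//   LbfgsOptions lbfgs_opts;                // Defaults: minimize, m = 10.
+//   Vector<double> x(10);                   // Starting point (here, zero).
+//   OptimizeLbfgs<double> lbfgs(x, lbfgs_opts);
+//   for (int32 iter = 0; iter < 100; iter++) {
+//     const VectorBase<double> &cur_x = lbfgs.GetProposedValue();
+//     double f;
+//     Vector<double> grad(x.Dim());
+//     ComputeObjfAndGradient(cur_x, &f, &grad);  // User-supplied (assumed).
+//     lbfgs.DoStep(f, grad);                  // Hand the results back.
+//     if (lbfgs.RecentStepLength() < 1.0e-05) break;
+//   }
+//   double best_f;
+//   const VectorBase<double> &best_x = lbfgs.GetValue(&best_f);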
+
+struct LbfgsOptions {
+ bool minimize; // if true, we're minimizing, else maximizing.
+ int m; // m is the number of stored vectors L-BFGS keeps.
+ float first_step_learning_rate; // The very first step of L-BFGS is
+ // like gradient descent. If you want to configure the size of that step,
+ // you can do it using this variable.
+ float first_step_length; // If this variable is >0.0, it overrides
+ // first_step_learning_rate; on the first step we choose an approximate
+ // Hessian that is the multiple of the identity that would generate this
+ // step-length, or 1.0 if the gradient is zero.
+ float first_step_impr; // If this variable is >0.0, it overrides
+ // first_step_learning_rate; on the first step we choose an approximate
+ // Hessian that is the multiple of the identity that would generate this
+ // amount of objective function improvement (assuming the "real" objf
+ // was linear).
+  float c1;  // The constant in the Armijo (sufficient-decrease) rule,
+             // i.e. Wolfe condition (i); see the sketch below.
+  float c2;  // The constant in the curvature rule, i.e. Wolfe condition (ii).
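+  // For reference, a sketch of the standard Wolfe conditions for step length
+  // alpha and search direction p (minimization case):
+  //   (i)  f(x + alpha p) <= f(x) + c1 * alpha * grad f(x)^T p   (Armijo)
+  //   (ii) grad f(x + alpha p)^T p >= c2 * grad f(x)^T p         (curvature)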
+ float d; // An amount > 1.0 (default 2.0) that we initially multiply or
+ // divide the step length by, in the line search.
+  int max_line_search_iters;  // After this many line-search iterations we
+  // restart L-BFGS.
+  int avg_step_length;  // Number of iterations to average the step length
+  // over, in RecentStepLength().
+
+ LbfgsOptions (bool minimize = true):
+ minimize(minimize),
+ m(10),
+ first_step_learning_rate(1.0),
+ first_step_length(0.0),
+ first_step_impr(0.0),
+ c1(1.0e-04),
+ c2(0.9),
+ d(2.0),
+ max_line_search_iters(50),
+ avg_step_length(4) { }
+};
+
+template<typename Real>
+class OptimizeLbfgs {
+ public:
+ /// Initializer takes the starting value of x.
+ OptimizeLbfgs(const VectorBase<Real> &x,
+ const LbfgsOptions &opts);
+
+ /// This returns the value of the variable x that has the best objective
+ /// function so far, and the corresponding objective function value if
+ /// requested. This would typically be called only at the end.
+ const VectorBase<Real>& GetValue(Real *objf_value = NULL) const;
+
+  /// This returns the value of x at which the optimizer wants the user
+  /// to compute the objective function and gradient.
+ const VectorBase<Real>& GetProposedValue() const { return new_x_; }
+
+  /// Returns the average magnitude of the last n steps, where n is
+  /// opts.avg_step_length (or fewer, if we have not yet stored that many).
+  /// Before we have taken any steps, returns +infinity.  Note: if the most
+  /// recent step length was 0, it returns 0, regardless of the other
+  /// step lengths.  This makes it suitable as a convergence test
+  /// (else we'd generate NaN's).
+ Real RecentStepLength() const;
+
+  /// The user calls this function to provide the class with the
+  /// function and gradient info at the point GetProposedValue().
+  /// If this point is outside the constraints, you can set function_value
+  /// to {+infinity,-infinity} for {minimization,maximization} problems.
+  /// In this case the gradient (and also the second derivative, if you call
+  /// the second overloaded version of this function) will be ignored.
+ void DoStep(Real function_value,
+ const VectorBase<Real> &gradient);
+
+  /// The user can call this version of DoStep() to supply some kind of
+  /// diagonal approximation of the Hessian on this iteration.  Note: it is a
+  /// prerequisite that diag_approx_2nd_deriv be strictly positive (when
+  /// minimizing) or strictly negative (when maximizing).
+ void DoStep(Real function_value,
+ const VectorBase<Real> &gradient,
+ const VectorBase<Real> &diag_approx_2nd_deriv);
+
+ private:
+ KALDI_DISALLOW_COPY_AND_ASSIGN(OptimizeLbfgs);
+
+
+  // The following variable says what stage of the computation we're at.
+  // Refer to Algorithm 7.5 (L-BFGS) of Nocedal & Wright, "Numerical
+  // Optimization", 2nd edition.
+  // kBeforeStep means we're about to do
+  // "compute p_k <-- - H_k \nabla f_k" (i.e. Algorithm 7.4).
+  // kWithinStep means we're at some point within the line search; note
+  // that the line search is iterative, so we can stay in this state more
+  // than once on each iteration.
+ enum ComputationState {
+ kBeforeStep,
+ kWithinStep, // This means we're within the step-size computation, and
+ // have not yet done the 1st function evaluation.
+ };
+
+ inline MatrixIndexT Dim() { return x_.Dim(); }
+ inline MatrixIndexT M() { return opts_.m; }
+ SubVector<Real> Y(MatrixIndexT i) {
+ return SubVector<Real>(data_, (i % M()) * 2); // vector y_i
+ }
+ SubVector<Real> S(MatrixIndexT i) {
+ return SubVector<Real>(data_, (i % M()) * 2 + 1); // vector s_i
+ }
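+  // E.g. with m = 10, Y(12) is row (12 % 10) * 2 = 4 of data_ and S(12) is
+  // row 5: data_ holds only the last m (y_i, s_i) pairs, as a rotating buffer.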
+ // The following are subroutines within DoStep():
+ bool AcceptStep(Real function_value,
+ const VectorBase<Real> &gradient);
+ void Restart(const VectorBase<Real> &x,
+ Real function_value,
+ const VectorBase<Real> &gradient);
+ void ComputeNewDirection(Real function_value,
+ const VectorBase<Real> &gradient);
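+  // For reference, a sketch of the two-loop recursion (Algorithm 7.4 of
+  // Nocedal & Wright) that evaluates p_k = -H_k \nabla f_k:
+  //   q <- \nabla f_k
+  //   for i = k-1, ..., k-m:  alpha_i <- rho_i s_i^T q;  q <- q - alpha_i y_i
+  //   r <- H_k^0 q            (H_k^0: initial inverse-Hessian approximation)
+  //   for i = k-m, ..., k-1:  beta <- rho_i y_i^T r;  r <- r + (alpha_i - beta) s_i
+  //   p_k <- -r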
+ void ComputeHifNeeded(const VectorBase<Real> &gradient);
+ void StepSizeIteration(Real function_value,
+ const VectorBase<Real> &gradient);
+ void RecordStepLength(Real s);
+
+
+ LbfgsOptions opts_;
+ SignedMatrixIndexT k_; // Iteration number, starts from zero. Gets set back to zero
+ // when we restart.
+
+ ComputationState computation_state_;
+ bool H_was_set_; // True if the user specified H_; if false,
+ // we'll use a heuristic to estimate it.
+
+
+ Vector<Real> x_; // current x.
+ Vector<Real> new_x_; // the x proposed in the line search.
+ Vector<Real> best_x_; // the x with the best objective function so far
+ // (either the same as x_ or something in the current line search.)
+ Vector<Real> deriv_; // The most recently evaluated derivative-- at x_k.
+ Vector<Real> temp_;
+ Real f_; // The function evaluated at x_k.
+ Real best_f_; // the best objective function so far.
+  Real d_;  // A number d > 1.0; during an iteration we may decrease it when
+  // we switch between Armijo (Wolfe i) and curvature (Wolfe ii) failures.
+
+  int num_wolfe_i_failures_;  // the number of times we decreased the step size.
+  int num_wolfe_ii_failures_;  // the number of times we increased the step size.
+  enum { kWolfeI, kWolfeII, kNone } last_failure_type_;  // type of the last
+  // step-search failure on this iteration.
+
+  Vector<Real> H_;  // Current inverse-Hessian estimate.  May be computed by
+  // this class itself, or provided by the user via the second overload of
+  // DoStep().
+  Matrix<Real> data_;  // dimension (m*2) x dim.  Even rows store
+  // gradient differences y_i, odd rows store steps s_i.
+  Vector<Real> rho_;  // dimension m; rho_(i) = 1/(y_i^T s_i) (Eq. 7.17 of
+  // Nocedal & Wright).
+
+ std::vector<Real> step_lengths_; // The step sizes we took on the last
+ // (up to m) iterations; these are not stored in a rotating buffer but
+ // are shifted by one each time (this is more convenient when we
+ // restart, as we keep this info past restarting).
+
+
+};
+
+/// @}
+
+
+} // end namespace kaldi
+
+
+
+#endif  // KALDI_MATRIX_OPTIMIZATION_H_
+