20 files changed, 5813 insertions, 0 deletions
diff --git a/kaldi_io/src/kaldi/matrix/cblas-wrappers.h b/kaldi_io/src/kaldi/matrix/cblas-wrappers.h
new file mode 100644
index 0000000..ebec0a3
--- /dev/null
+++ b/kaldi_io/src/kaldi/matrix/cblas-wrappers.h
@@ -0,0 +1,491 @@
+// matrix/cblas-wrappers.h
+
+// Copyright 2012  Johns Hopkins University (author: Daniel Povey);
+//                 Haihua Xu; Wei Shi
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+//  http://www.apache.org/licenses/LICENSE-2.0
+
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+#ifndef KALDI_MATRIX_CBLAS_WRAPPERS_H_
+#define KALDI_MATRIX_CBLAS_WRAPPERS_H_ 1
+
+
+#include <limits>
+#include "matrix/sp-matrix.h"
+#include "matrix/kaldi-vector.h"
+#include "matrix/kaldi-matrix.h"
+#include "matrix/matrix-functions.h"
+
+// Do not include this file directly.  It is to be included
+// by .cc files in this directory.
+
+namespace kaldi {
+
+
+inline void cblas_Xcopy(const int N, const float *X, const int incX, float *Y,
+                        const int incY) {
+  cblas_scopy(N, X, incX, Y, incY);
+}
+
+inline void cblas_Xcopy(const int N, const double *X, const int incX, double *Y,
+                        const int incY) {
+  cblas_dcopy(N, X, incX, Y, incY);
+}
+
+
+inline float cblas_Xasum(const int N, const float *X, const int incX) {
+  return cblas_sasum(N, X, incX);
+}
+
+inline double cblas_Xasum(const int N, const double *X, const int incX) {
+  return cblas_dasum(N, X, incX);
+}
+
+inline void cblas_Xrot(const int N, float *X, const int incX, float *Y,
+                       const int incY, const float c, const float s) {
+  cblas_srot(N, X, incX, Y, incY, c, s);
+}
+inline void cblas_Xrot(const int N, double *X, const int incX, double *Y,
+                       const int incY, const double c, const double s) {
+  cblas_drot(N, X, incX, Y, incY, c, s);
+}
+inline float cblas_Xdot(const int N, const float *const X,
+                        const int incX, const float *const Y,
+                        const int incY) {
+  return cblas_sdot(N, X, incX, Y, incY);
+}
+inline double cblas_Xdot(const int N, const double *const X,
+                        const int incX, const double *const Y,
+                        const int incY) {
+  return cblas_ddot(N, X, incX, Y, incY);
+}
+inline void cblas_Xaxpy(const int N, const float alpha, const float *X,
+                        const int incX, float *Y, const int incY) {
+  cblas_saxpy(N, alpha, X, incX, Y, incY);
+}
+inline void cblas_Xaxpy(const int N, const double alpha, const double *X,
+                        const int incX, double *Y, const int incY) {
+  cblas_daxpy(N, alpha, X, incX, Y, incY);
+}
+inline void cblas_Xscal(const int N, const float alpha, float *data,
+                        const int inc) {
+  cblas_sscal(N, alpha, data, inc);
+}
+inline void cblas_Xscal(const int N, const double alpha, double *data, 
+                        const int inc) {
+  cblas_dscal(N, alpha, data, inc);
+}
+inline void cblas_Xspmv(const float alpha, const int num_rows, const float *Mdata,
+                        const float *v, const int v_inc,
+                        const float beta, float *y, const int y_inc) {
+  cblas_sspmv(CblasRowMajor, CblasLower, num_rows, alpha, Mdata, v, v_inc, beta, y, y_inc);
+}
+inline void cblas_Xspmv(const double alpha, const int num_rows, const double *Mdata,
+                        const double *v, const int v_inc,
+                        const double beta, double *y, const int y_inc) {
+  cblas_dspmv(CblasRowMajor, CblasLower, num_rows, alpha, Mdata, v, v_inc, beta, y, y_inc);
+}
+inline void cblas_Xtpmv(MatrixTransposeType trans, const float *Mdata,
+                        const int num_rows, float *y, const int y_inc) {
+  cblas_stpmv(CblasRowMajor, CblasLower, static_cast<CBLAS_TRANSPOSE>(trans),
+              CblasNonUnit, num_rows, Mdata, y, y_inc);
+}
+inline void cblas_Xtpmv(MatrixTransposeType trans, const double *Mdata,
+                        const int num_rows, double *y, const int y_inc) {
+  cblas_dtpmv(CblasRowMajor, CblasLower, static_cast<CBLAS_TRANSPOSE>(trans),
+              CblasNonUnit, num_rows, Mdata, y, y_inc);
+}
+
+
+inline void cblas_Xtpsv(MatrixTransposeType trans, const float *Mdata,
+                        const int num_rows, float *y, const int y_inc) {
+  cblas_stpsv(CblasRowMajor, CblasLower, static_cast<CBLAS_TRANSPOSE>(trans),
+              CblasNonUnit, num_rows, Mdata, y, y_inc);
+}
+inline void cblas_Xtpsv(MatrixTransposeType trans, const double *Mdata,
+                        const int num_rows, double *y, const int y_inc) {
+  cblas_dtpsv(CblasRowMajor, CblasLower, static_cast<CBLAS_TRANSPOSE>(trans),
+              CblasNonUnit, num_rows, Mdata, y, y_inc);
+}
+
+// x = alpha * M * y + beta * x
+inline void cblas_Xspmv(MatrixIndexT dim, float alpha, const float *Mdata,
+                        const float *ydata, MatrixIndexT ystride,
+                        float beta, float *xdata, MatrixIndexT xstride) {
+  cblas_sspmv(CblasRowMajor, CblasLower, dim, alpha, Mdata,
+              ydata, ystride, beta, xdata, xstride);
+}
+inline void cblas_Xspmv(MatrixIndexT dim, double alpha, const double *Mdata,
+                        const double *ydata, MatrixIndexT ystride,
+                        double beta, double *xdata, MatrixIndexT xstride) {
+  cblas_dspmv(CblasRowMajor, CblasLower, dim, alpha, Mdata,
+              ydata, ystride, beta, xdata, xstride);
+}
+
+// Implements  A += alpha * (x y'  + y x'); A is symmetric matrix.
+inline void cblas_Xspr2(MatrixIndexT dim, float alpha, const float *Xdata,
+                        MatrixIndexT incX, const float *Ydata, MatrixIndexT incY,
+                          float *Adata) {
+  cblas_sspr2(CblasRowMajor, CblasLower, dim, alpha, Xdata,
+              incX, Ydata, incY, Adata);
+}
+inline void cblas_Xspr2(MatrixIndexT dim, double alpha, const double *Xdata,
+                        MatrixIndexT incX, const double *Ydata, MatrixIndexT incY,
+                        double *Adata) {
+  cblas_dspr2(CblasRowMajor, CblasLower, dim, alpha, Xdata,
+              incX, Ydata, incY, Adata);
+}
+
+// Implements  A += alpha * (x x'); A is symmetric matrix.
+inline void cblas_Xspr(MatrixIndexT dim, float alpha, const float *Xdata,
+                       MatrixIndexT incX, float *Adata) {
+  cblas_sspr(CblasRowMajor, CblasLower, dim, alpha, Xdata, incX, Adata);
+}
+inline void cblas_Xspr(MatrixIndexT dim, double alpha, const double *Xdata,
+                       MatrixIndexT incX, double *Adata) {
+  cblas_dspr(CblasRowMajor, CblasLower, dim, alpha, Xdata, incX, Adata);
+}
+
+// sgemv,dgemv: y = alpha M x + beta y.
+inline void cblas_Xgemv(MatrixTransposeType trans, MatrixIndexT num_rows,
+                        MatrixIndexT num_cols, float alpha, const float *Mdata,
+                        MatrixIndexT stride, const float *xdata,
+                        MatrixIndexT incX, float beta, float *ydata, MatrixIndexT incY) {
+  cblas_sgemv(CblasRowMajor, static_cast<CBLAS_TRANSPOSE>(trans), num_rows,
+              num_cols, alpha, Mdata, stride, xdata, incX, beta, ydata, incY);
+}
+inline void cblas_Xgemv(MatrixTransposeType trans, MatrixIndexT num_rows,
+                        MatrixIndexT num_cols, double alpha, const double *Mdata,
+                        MatrixIndexT stride, const double *xdata,
+                        MatrixIndexT incX, double beta, double *ydata, MatrixIndexT incY) {
+  cblas_dgemv(CblasRowMajor, static_cast<CBLAS_TRANSPOSE>(trans), num_rows,
+              num_cols, alpha, Mdata, stride, xdata, incX, beta, ydata, incY);
+}
+
+// sgbmv, dgmmv: y = alpha M x +  + beta * y.
+inline void cblas_Xgbmv(MatrixTransposeType trans, MatrixIndexT num_rows,
+                        MatrixIndexT num_cols, MatrixIndexT num_below,
+                        MatrixIndexT num_above, float alpha, const float *Mdata,
+                        MatrixIndexT stride, const float *xdata,
+                        MatrixIndexT incX, float beta, float *ydata, MatrixIndexT incY) {
+  cblas_sgbmv(CblasRowMajor, static_cast<CBLAS_TRANSPOSE>(trans), num_rows,
+              num_cols, num_below, num_above, alpha, Mdata, stride, xdata,
+              incX, beta, ydata, incY);
+}
+inline void cblas_Xgbmv(MatrixTransposeType trans, MatrixIndexT num_rows,
+                        MatrixIndexT num_cols, MatrixIndexT num_below,
+                        MatrixIndexT num_above, double alpha, const double *Mdata,
+                        MatrixIndexT stride, const double *xdata,
+                        MatrixIndexT incX, double beta, double *ydata, MatrixIndexT incY) {
+  cblas_dgbmv(CblasRowMajor, static_cast<CBLAS_TRANSPOSE>(trans), num_rows,
+              num_cols, num_below, num_above, alpha, Mdata, stride, xdata,
+              incX, beta, ydata, incY);
+}
+
+
+template<typename Real>
+inline void Xgemv_sparsevec(MatrixTransposeType trans, MatrixIndexT num_rows,
+                            MatrixIndexT num_cols, Real alpha, const Real *Mdata,
+                            MatrixIndexT stride, const Real *xdata,
+                            MatrixIndexT incX, Real beta, Real *ydata,
+                            MatrixIndexT incY) {
+  if (trans == kNoTrans) {
+    if (beta != 1.0) cblas_Xscal(num_rows, beta, ydata, incY);
+    for (MatrixIndexT i = 0; i < num_cols; i++) {
+      Real x_i = xdata[i * incX];
+      if (x_i == 0.0) continue;
+      // Add to ydata, the i'th column of M, times alpha * x_i
+      cblas_Xaxpy(num_rows, x_i * alpha, Mdata + i, stride, ydata, incY);
+    }    
+  } else {
+    if (beta != 1.0) cblas_Xscal(num_cols, beta, ydata, incY);
+    for (MatrixIndexT i = 0; i < num_rows; i++) {
+      Real x_i = xdata[i * incX];
+      if (x_i == 0.0) continue;
+      // Add to ydata, the i'th row of M, times alpha * x_i
+      cblas_Xaxpy(num_cols, x_i * alpha,
+                  Mdata + (i * stride), 1, ydata, incY);
+    }
+  }
+}
+
+inline void cblas_Xgemm(const float alpha,
+                        MatrixTransposeType transA,
+                        const float *Adata,
+                        MatrixIndexT a_num_rows, MatrixIndexT a_num_cols, MatrixIndexT a_stride,
+                        MatrixTransposeType transB, 
+                        const float *Bdata, MatrixIndexT b_stride,
+                        const float beta,
+                        float *Mdata, 
+                        MatrixIndexT num_rows, MatrixIndexT num_cols,MatrixIndexT stride) {
+  cblas_sgemm(CblasRowMajor, static_cast<CBLAS_TRANSPOSE>(transA), 
+              static_cast<CBLAS_TRANSPOSE>(transB),
+              num_rows, num_cols, transA == kNoTrans ? a_num_cols : a_num_rows,
+              alpha, Adata, a_stride, Bdata, b_stride,
+              beta, Mdata, stride); 
+}
+inline void cblas_Xgemm(const double alpha,
+                        MatrixTransposeType transA,
+                        const double *Adata,
+                        MatrixIndexT a_num_rows, MatrixIndexT a_num_cols, MatrixIndexT a_stride,
+                        MatrixTransposeType transB, 
+                        const double *Bdata, MatrixIndexT b_stride,
+                        const double beta,
+                        double *Mdata, 
+                        MatrixIndexT num_rows, MatrixIndexT num_cols,MatrixIndexT stride) {
+  cblas_dgemm(CblasRowMajor, static_cast<CBLAS_TRANSPOSE>(transA), 
+              static_cast<CBLAS_TRANSPOSE>(transB),
+              num_rows, num_cols, transA == kNoTrans ? a_num_cols : a_num_rows,
+              alpha, Adata, a_stride, Bdata, b_stride,
+              beta, Mdata, stride); 
+}
+
+
+inline void cblas_Xsymm(const float alpha,
+                        MatrixIndexT sz,
+                        const float *Adata,MatrixIndexT a_stride,
+                        const float *Bdata,MatrixIndexT b_stride,
+                        const float beta,
+                        float *Mdata, MatrixIndexT stride) {
+  cblas_ssymm(CblasRowMajor, CblasLeft, CblasLower, sz, sz, alpha, Adata,
+              a_stride, Bdata, b_stride, beta, Mdata, stride);
+}
+inline void cblas_Xsymm(const double alpha,
+                        MatrixIndexT sz,
+                        const double *Adata,MatrixIndexT a_stride,
+                        const double *Bdata,MatrixIndexT b_stride,
+                        const double beta,
+                        double *Mdata, MatrixIndexT stride) {
+  cblas_dsymm(CblasRowMajor, CblasLeft, CblasLower, sz, sz, alpha, Adata,
+              a_stride, Bdata, b_stride, beta, Mdata, stride);
+}
+// ger: M += alpha x y^T.
+inline void cblas_Xger(MatrixIndexT num_rows, MatrixIndexT num_cols, float alpha,
+                       const float *xdata, MatrixIndexT incX, const float *ydata,
+                       MatrixIndexT incY, float *Mdata, MatrixIndexT stride) {
+  cblas_sger(CblasRowMajor, num_rows, num_cols, alpha, xdata, 1, ydata, 1,
+             Mdata, stride);
+}
+inline void cblas_Xger(MatrixIndexT num_rows, MatrixIndexT num_cols, double alpha,
+                       const double *xdata, MatrixIndexT incX, const double *ydata,
+                       MatrixIndexT incY, double *Mdata, MatrixIndexT stride) {
+  cblas_dger(CblasRowMajor, num_rows, num_cols, alpha, xdata, 1, ydata, 1,
+             Mdata, stride);
+}
+
+// syrk: symmetric rank-k update.
+// if trans==kNoTrans, then C = alpha A A^T + beta C
+// else C = alpha A^T A + beta C.
+// note: dim_c is dim(C), other_dim_a is the "other" dimension of A, i.e.
+// num-cols(A) if kNoTrans, or num-rows(A) if kTrans.
+// We only need the row-major and lower-triangular option of this, and this
+// is hard-coded.
+inline void cblas_Xsyrk (
+    const MatrixTransposeType trans, const MatrixIndexT dim_c,
+    const MatrixIndexT other_dim_a, const float alpha, const float *A,
+    const MatrixIndexT a_stride, const float beta, float *C,
+    const MatrixIndexT c_stride) {
+  cblas_ssyrk(CblasRowMajor, CblasLower, static_cast<CBLAS_TRANSPOSE>(trans),
+              dim_c, other_dim_a, alpha, A, a_stride, beta, C, c_stride);
+}
+
+inline void cblas_Xsyrk(
+    const MatrixTransposeType trans, const MatrixIndexT dim_c,
+    const MatrixIndexT other_dim_a, const double alpha, const double *A,
+    const MatrixIndexT a_stride, const double beta, double *C,
+    const MatrixIndexT c_stride) {
+  cblas_dsyrk(CblasRowMajor, CblasLower, static_cast<CBLAS_TRANSPOSE>(trans),
+              dim_c, other_dim_a, alpha, A, a_stride, beta, C, c_stride);
+}
+
+/// matrix-vector multiply using a banded matrix; we always call this
+/// with b = 1 meaning we're multiplying by a diagonal matrix.  This is used for
+/// elementwise multiplication.  We miss some of the arguments out of this
+/// wrapper.
+inline void cblas_Xsbmv1(
+    const MatrixIndexT dim,
+    const double *A,
+    const double alpha,
+    const double *x,
+    const double beta,
+    double *y) {
+  cblas_dsbmv(CblasRowMajor, CblasLower, dim, 0, alpha, A,
+              1, x, 1, beta, y, 1);
+}
+
+inline void cblas_Xsbmv1(
+    const MatrixIndexT dim,
+    const float *A,
+    const float alpha,
+    const float *x,
+    const float beta,
+    float *y) {
+  cblas_ssbmv(CblasRowMajor, CblasLower, dim, 0, alpha, A,
+              1, x, 1, beta, y, 1);
+}
+
+
+/// This is not really a wrapper for CBLAS as CBLAS does not have this; in future we could
+/// extend this somehow.
+inline void mul_elements(
+    const MatrixIndexT dim,
+    const double *a,
+    double *b) { // does b *= a, elementwise.
+  double c1, c2, c3, c4;
+  MatrixIndexT i;
+  for (i = 0; i + 4 <= dim; i += 4) {
+    c1 = a[i] * b[i];
+    c2 = a[i+1] * b[i+1];
+    c3 = a[i+2] * b[i+2];
+    c4 = a[i+3] * b[i+3];
+    b[i] = c1;
+    b[i+1] = c2;
+    b[i+2] = c3;
+    b[i+3] = c4;
+  }
+  for (; i < dim; i++)
+    b[i] *= a[i];
+}
+
+inline void mul_elements(
+    const MatrixIndexT dim,
+    const float *a,
+    float *b) { // does b *= a, elementwise.
+  float c1, c2, c3, c4;
+  MatrixIndexT i;
+  for (i = 0; i + 4 <= dim; i += 4) {
+    c1 = a[i] * b[i];
+    c2 = a[i+1] * b[i+1];
+    c3 = a[i+2] * b[i+2];
+    c4 = a[i+3] * b[i+3];
+    b[i] = c1;
+    b[i+1] = c2;
+    b[i+2] = c3;
+    b[i+3] = c4;
+  }
+  for (; i < dim; i++)
+    b[i] *= a[i];
+}
+
+
+
+// add clapack here
+#if !defined(HAVE_ATLAS)
+inline void clapack_Xtptri(KaldiBlasInt *num_rows, float *Mdata, KaldiBlasInt *result) {
+  stptri_(const_cast<char *>("U"), const_cast<char *>("N"), num_rows, Mdata, result);
+}
+inline void clapack_Xtptri(KaldiBlasInt *num_rows, double *Mdata, KaldiBlasInt *result) {
+  dtptri_(const_cast<char *>("U"), const_cast<char *>("N"), num_rows, Mdata, result);
+}
+// 
+inline void clapack_Xgetrf2(KaldiBlasInt *num_rows, KaldiBlasInt *num_cols, 
+                            float *Mdata, KaldiBlasInt *stride, KaldiBlasInt *pivot, 
+                            KaldiBlasInt *result) {
+  sgetrf_(num_rows, num_cols, Mdata, stride, pivot, result);
+}
+inline void clapack_Xgetrf2(KaldiBlasInt *num_rows, KaldiBlasInt *num_cols, 
+                            double *Mdata, KaldiBlasInt *stride, KaldiBlasInt *pivot, 
+                            KaldiBlasInt *result) {
+  dgetrf_(num_rows, num_cols, Mdata, stride, pivot, result);
+}
+
+// 
+inline void clapack_Xgetri2(KaldiBlasInt *num_rows, float *Mdata, KaldiBlasInt *stride,
+                           KaldiBlasInt *pivot, float *p_work, 
+                           KaldiBlasInt *l_work, KaldiBlasInt *result) {
+  sgetri_(num_rows, Mdata, stride, pivot, p_work, l_work, result);
+}
+inline void clapack_Xgetri2(KaldiBlasInt *num_rows, double *Mdata, KaldiBlasInt *stride,
+                           KaldiBlasInt *pivot, double *p_work, 
+                           KaldiBlasInt *l_work, KaldiBlasInt *result) {
+  dgetri_(num_rows, Mdata, stride, pivot, p_work, l_work, result);
+}
+//
+inline void clapack_Xgesvd(char *v, char *u, KaldiBlasInt *num_cols,
+                           KaldiBlasInt *num_rows, float *Mdata, KaldiBlasInt *stride,
+                           float *sv, float *Vdata, KaldiBlasInt *vstride,
+                           float *Udata, KaldiBlasInt *ustride, float *p_work,
+                           KaldiBlasInt *l_work, KaldiBlasInt *result) {
+  sgesvd_(v, u,
+          num_cols, num_rows, Mdata, stride,
+          sv, Vdata, vstride, Udata, ustride, 
+          p_work, l_work, result); 
+}
+inline void clapack_Xgesvd(char *v, char *u, KaldiBlasInt *num_cols,
+                           KaldiBlasInt *num_rows, double *Mdata, KaldiBlasInt *stride,
+                           double *sv, double *Vdata, KaldiBlasInt *vstride,
+                           double *Udata, KaldiBlasInt *ustride, double *p_work,
+                           KaldiBlasInt *l_work, KaldiBlasInt *result) {
+  dgesvd_(v, u,
+          num_cols, num_rows, Mdata, stride,
+          sv, Vdata, vstride, Udata, ustride,
+          p_work, l_work, result); 
+}
+//
+void inline clapack_Xsptri(KaldiBlasInt *num_rows, float *Mdata, 
+                           KaldiBlasInt *ipiv, float *work, KaldiBlasInt *result) {
+  ssptri_(const_cast<char *>("U"), num_rows, Mdata, ipiv, work, result);
+}
+void inline clapack_Xsptri(KaldiBlasInt *num_rows, double *Mdata, 
+                           KaldiBlasInt *ipiv, double *work, KaldiBlasInt *result) {
+  dsptri_(const_cast<char *>("U"), num_rows, Mdata, ipiv, work, result);
+}
+//
+void inline clapack_Xsptrf(KaldiBlasInt *num_rows, float *Mdata,
+                           KaldiBlasInt *ipiv, KaldiBlasInt *result) {
+  ssptrf_(const_cast<char *>("U"), num_rows, Mdata, ipiv, result);
+}
+void inline clapack_Xsptrf(KaldiBlasInt *num_rows, double *Mdata,
+                           KaldiBlasInt *ipiv, KaldiBlasInt *result) {
+  dsptrf_(const_cast<char *>("U"), num_rows, Mdata, ipiv, result);
+}
+#else
+inline void clapack_Xgetrf(MatrixIndexT num_rows, MatrixIndexT num_cols,
+                           float *Mdata, MatrixIndexT stride, 
+                           int *pivot, int *result) {
+  *result = clapack_sgetrf(CblasColMajor, num_rows, num_cols,
+                              Mdata, stride, pivot);
+}
+
+inline void clapack_Xgetrf(MatrixIndexT num_rows, MatrixIndexT num_cols,
+                           double *Mdata, MatrixIndexT stride, 
+                           int *pivot, int *result) {
+  *result = clapack_dgetrf(CblasColMajor, num_rows, num_cols,
+                              Mdata, stride, pivot);
+}
+//
+inline int clapack_Xtrtri(int num_rows, float *Mdata, MatrixIndexT stride) {
+  return  clapack_strtri(CblasColMajor, CblasUpper, CblasNonUnit, num_rows,
+                              Mdata, stride);
+}
+
+inline int clapack_Xtrtri(int num_rows, double *Mdata, MatrixIndexT stride) {
+  return  clapack_dtrtri(CblasColMajor, CblasUpper, CblasNonUnit, num_rows,
+                              Mdata, stride);
+}
+//
+inline void clapack_Xgetri(MatrixIndexT num_rows, float *Mdata, MatrixIndexT stride,
+                      int *pivot, int *result) {
+  *result = clapack_sgetri(CblasColMajor, num_rows, Mdata, stride, pivot);
+}
+inline void clapack_Xgetri(MatrixIndexT num_rows, double *Mdata, MatrixIndexT stride,
+                      int *pivot, int *result) {
+  *result = clapack_dgetri(CblasColMajor, num_rows, Mdata, stride, pivot);
+}
+#endif
+
+}
+// namespace kaldi
+
+#endif
diff --git a/kaldi_io/src/kaldi/matrix/compressed-matrix.h b/kaldi_io/src/kaldi/matrix/compressed-matrix.h
new file mode 100644
index 0000000..746cab3
--- /dev/null
+++ b/kaldi_io/src/kaldi/matrix/compressed-matrix.h
@@ -0,0 +1,179 @@
+// matrix/compressed-matrix.h
+
+// Copyright 2012  Johns Hopkins University (author: Daniel Povey)
+//                 Frantisek Skala, Wei Shi
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef KALDI_MATRIX_COMPRESSED_MATRIX_H_
+#define KALDI_MATRIX_COMPRESSED_MATRIX_H_ 1
+
+#include "kaldi-matrix.h"
+
+namespace kaldi {
+
+/// \addtogroup matrix_group
+/// @{
+
+/// This class does lossy compression of a matrix.  It only
+/// supports copying to-from a KaldiMatrix.  For large matrices,
+/// each element is compressed into about one byte, but there
+/// is a little overhead on top of that (globally, and also per
+/// column).
+
+/// The basic idea is for each column (in the normal configuration)
+/// we work out the values at the 0th, 25th, 50th and 100th percentiles
+/// and store them as 16-bit integers; we then encode each value in
+/// the column as a single byte, in 3 separate ranges with different
+/// linear encodings (0-25th, 25-50th, 50th-100th).
+/// If the matrix has 8 rows or fewer, we simply store all values as
+/// uint16.
+
+class CompressedMatrix {
+ public:
+  CompressedMatrix(): data_(NULL) { }
+
+  ~CompressedMatrix() { Destroy(); }
+  
+  template<typename Real>
+  CompressedMatrix(const MatrixBase<Real> &mat): data_(NULL) { CopyFromMat(mat); }
+
+  /// Initializer that can be used to select part of an existing
+  /// CompressedMatrix without un-compressing and re-compressing (note: unlike
+  /// similar initializers for class Matrix, it doesn't point to the same memory
+  /// location).
+  CompressedMatrix(const CompressedMatrix &mat,
+                   const MatrixIndexT row_offset,
+                   const MatrixIndexT num_rows,
+                   const MatrixIndexT col_offset,
+                   const MatrixIndexT num_cols);
+
+  void *Data() const { return this->data_; }
+
+  /// This will resize *this and copy the contents of mat to *this.
+  template<typename Real>
+  void CopyFromMat(const MatrixBase<Real> &mat);
+
+  CompressedMatrix(const CompressedMatrix &mat);
+
+  CompressedMatrix &operator = (const CompressedMatrix &mat); // assignment operator.
+
+  template<typename Real>
+  CompressedMatrix &operator = (const MatrixBase<Real> &mat); // assignment operator.
+  
+  /// Copies contents to matrix.  Note: mat must have the correct size,
+  /// CopyToMat no longer attempts to resize it.
+  template<typename Real>
+  void CopyToMat(MatrixBase<Real> *mat) const;
+
+  void Write(std::ostream &os, bool binary) const;
+  
+  void Read(std::istream &is, bool binary);
+
+  /// Returns number of rows (or zero for emtpy matrix).
+  inline MatrixIndexT NumRows() const { return (data_ == NULL) ? 0 :
+      (*reinterpret_cast<GlobalHeader*>(data_)).num_rows; }
+
+  /// Returns number of columns (or zero for emtpy matrix).
+  inline MatrixIndexT NumCols() const { return (data_ == NULL) ? 0 :
+      (*reinterpret_cast<GlobalHeader*>(data_)).num_cols; }
+
+  /// Copies row #row of the matrix into vector v.
+  /// Note: v must have same size as #cols.
+  template<typename Real>
+  void CopyRowToVec(MatrixIndexT row, VectorBase<Real> *v) const;
+
+  /// Copies column #col of the matrix into vector v.
+  /// Note: v must have same size as #rows.
+  template<typename Real>
+  void CopyColToVec(MatrixIndexT col, VectorBase<Real> *v) const;
+
+  /// Copies submatrix of compressed matrix into matrix dest.
+  /// Submatrix starts at row row_offset and column column_offset and its size
+  /// is defined by size of provided matrix dest
+  template<typename Real>
+  void CopyToMat(int32 row_offset,
+                 int32 column_offset,
+                 MatrixBase<Real> *dest) const;
+
+  void Swap(CompressedMatrix *other) { std::swap(data_, other->data_); }
+  
+  friend class Matrix<float>;
+  friend class Matrix<double>;
+ private:
+
+  // allocates data using new [], ensures byte alignment
+  // sufficient for float.
+  static void *AllocateData(int32 num_bytes);
+
+  // the "format" will be 1 for the original format where each column has a
+  // PerColHeader, and 2 for the format now used for matrices with 8 or fewer
+  // rows, where everything is represented as 16-bit integers.
+  struct GlobalHeader {
+    int32 format;
+    float min_value;
+    float range;
+    int32 num_rows;
+    int32 num_cols;
+  };
+
+  static MatrixIndexT DataSize(const GlobalHeader &header);
+
+  struct PerColHeader {
+    uint16 percentile_0;
+    uint16 percentile_25;
+    uint16 percentile_75;
+    uint16 percentile_100;
+  };
+
+  template<typename Real>
+  static void CompressColumn(const GlobalHeader &global_header,
+                             const Real *data, MatrixIndexT stride,
+                             int32 num_rows, PerColHeader *header,
+                             unsigned char *byte_data);
+  template<typename Real>
+  static void ComputeColHeader(const GlobalHeader &global_header,
+                               const Real *data, MatrixIndexT stride,
+                               int32 num_rows, PerColHeader *header);
+
+  static inline uint16 FloatToUint16(const GlobalHeader &global_header,
+                                     float value);
+
+  static inline float Uint16ToFloat(const GlobalHeader &global_header,
+                                    uint16 value);
+  static inline unsigned char FloatToChar(float p0, float p25,
+                                          float p75, float p100,
+                                          float value);
+  static inline float CharToFloat(float p0, float p25,
+                                  float p75, float p100,
+                                  unsigned char value);
+  
+  void Destroy();
+  
+  void *data_; // first GlobalHeader, then PerColHeader (repeated), then
+  // the byte data for each column (repeated).  Note: don't intersperse
+  // the byte data with the PerColHeaders, because of alignment issues.
+
+};
+
+
+/// @} end of \addtogroup matrix_group
+
+
+}  // namespace kaldi
+
+
+#endif  // KALDI_MATRIX_COMPRESSED_MATRIX_H_
diff --git a/kaldi_io/src/kaldi/matrix/jama-eig.h b/kaldi_io/src/kaldi/matrix/jama-eig.h
new file mode 100644
index 0000000..c7278bc
--- /dev/null
+++ b/kaldi_io/src/kaldi/matrix/jama-eig.h
@@ -0,0 +1,924 @@
+// matrix/jama-eig.h
+
+// Copyright 2009-2011 Microsoft Corporation 
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+//  http://www.apache.org/licenses/LICENSE-2.0
+
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+// This file consists of a port and modification of materials from
+//   JAMA: A Java Matrix Package
+// under the following notice: This software is a cooperative product of
+// The MathWorks and the National Institute of Standards and Technology (NIST)
+// which has been released to the public.  This notice and the original code are
+// available at http://math.nist.gov/javanumerics/jama/domain.notice
+
+
+
+#ifndef KALDI_MATRIX_JAMA_EIG_H_
+#define KALDI_MATRIX_JAMA_EIG_H_ 1
+
+#include "matrix/kaldi-matrix.h"
+
+namespace kaldi {
+
+// This class is not to be used externally.  See the Eig function in the Matrix
+// class in kaldi-matrix.h.  This is the external interface.
+
+template<typename Real> class EigenvalueDecomposition {
+  // This class is based on the EigenvalueDecomposition class from the JAMA
+  // library (version 1.0.2).
+ public:
+  EigenvalueDecomposition(const MatrixBase<Real> &A);
+
+  ~EigenvalueDecomposition();  // free memory.
+
+  void GetV(MatrixBase<Real> *V_out) {  // V is what we call P externally; it's the matrix of
+    // eigenvectors.
+    KALDI_ASSERT(V_out->NumRows() == static_cast<MatrixIndexT>(n_)
+                 && V_out->NumCols() == static_cast<MatrixIndexT>(n_));
+    for (int i = 0; i < n_; i++)
+      for (int j = 0; j < n_; j++)
+        (*V_out)(i, j) = V(i, j);  // V(i, j) is member function.
+  }
+  void GetRealEigenvalues(VectorBase<Real> *r_out) {
+    // returns real part of eigenvalues.
+    KALDI_ASSERT(r_out->Dim() == static_cast<MatrixIndexT>(n_));
+    for (int i = 0; i < n_; i++)
+      (*r_out)(i) = d_[i];
+  }
+  void GetImagEigenvalues(VectorBase<Real> *i_out) {
+    // returns imaginary part of eigenvalues.
+    KALDI_ASSERT(i_out->Dim() == static_cast<MatrixIndexT>(n_));
+    for (int i = 0; i < n_; i++)
+      (*i_out)(i) = e_[i];
+  }
+ private:
+
+  inline Real &H(int r, int c) { return H_[r*n_ + c]; }
+  inline Real &V(int r, int c) { return V_[r*n_ + c]; }
+
+  // complex division
+  inline static void cdiv(Real xr, Real xi, Real yr, Real yi, Real *cdivr, Real *cdivi) {
+    Real r, d;
+    if (std::abs(yr) > std::abs(yi)) {
+      r = yi/yr;
+      d = yr + r*yi;
+      *cdivr = (xr + r*xi)/d;
+      *cdivi = (xi - r*xr)/d;
+    } else {
+      r = yr/yi;
+      d = yi + r*yr;
+      *cdivr = (r*xr + xi)/d;
+      *cdivi = (r*xi - xr)/d;
+    }
+  }
+
+  // Nonsymmetric reduction from Hessenberg to real Schur form.
+  void Hqr2 ();
+
+
+  int n_;  // matrix dimension.
+
+  Real *d_, *e_;  // real and imaginary parts of eigenvalues.
+  Real *V_;  // the eigenvectors (P in our external notation)
+  Real *H_;  // the nonsymmetric Hessenberg form.
+  Real *ort_;  // working storage for nonsymmetric algorithm.
+
+  // Symmetric Householder reduction to tridiagonal form.
+  void Tred2 ();
+
+  // Symmetric tridiagonal QL algorithm.
+  void Tql2 ();
+
+  // Nonsymmetric reduction to Hessenberg form.
+  void Orthes ();
+
+};
+
+template class EigenvalueDecomposition<float>;  // force instantiation.
+template class EigenvalueDecomposition<double>;  // force instantiation.
+
+template<typename Real> void  EigenvalueDecomposition<Real>::Tred2() {
+  //  This is derived from the Algol procedures tred2 by
+  //  Bowdler, Martin, Reinsch, and Wilkinson, Handbook for
+  //  Auto. Comp., Vol.ii-Linear Algebra, and the corresponding
+  //  Fortran subroutine in EISPACK.
+
+  for (int j = 0; j < n_; j++) {
+    d_[j] = V(n_-1, j);
+  }
+
+  // Householder reduction to tridiagonal form.
+
+  for (int i = n_-1; i > 0; i--) {
+
+    // Scale to avoid under/overflow.
+
+    Real scale = 0.0;
+    Real h = 0.0;
+    for (int k = 0; k < i; k++) {
+      scale = scale + std::abs(d_[k]);
+    }
+    if (scale == 0.0) {
+      e_[i] = d_[i-1];
+      for (int j = 0; j < i; j++) {
+        d_[j] = V(i-1, j);
+        V(i, j) = 0.0;
+        V(j, i) = 0.0;
+      }
+    } else {
+
+      // Generate Householder vector.
+
+      for (int k = 0; k < i; k++) {
+        d_[k] /= scale;
+        h += d_[k] * d_[k];
+      }
+      Real f = d_[i-1];
+      Real g = std::sqrt(h);
+      if (f > 0) {
+        g = -g;
+      }
+      e_[i] = scale * g;
+      h = h - f * g;
+      d_[i-1] = f - g;
+      for (int j = 0; j < i; j++) {
+        e_[j] = 0.0;
+      }
+
+      // Apply similarity transformation to remaining columns.
+
+      for (int j = 0; j < i; j++) {
+        f = d_[j];
+        V(j, i) = f;
+        g =e_[j] + V(j, j) * f;
+        for (int k = j+1; k <= i-1; k++) {
+          g += V(k, j) * d_[k];
+          e_[k] += V(k, j) * f;
+        }
+        e_[j] = g;
+      }
+      f = 0.0;
+      for (int j = 0; j < i; j++) {
+        e_[j] /= h;
+        f += e_[j] * d_[j];
+      }
+      Real hh = f / (h + h);
+      for (int j = 0; j < i; j++) {
+        e_[j] -= hh * d_[j];
+      }
+      for (int j = 0; j < i; j++) {
+        f = d_[j];
+        g = e_[j];
+        for (int k = j; k <= i-1; k++) {
+          V(k, j) -= (f * e_[k] + g * d_[k]);
+        }
+        d_[j] = V(i-1, j);
+        V(i, j) = 0.0;
+      }
+    }
+    d_[i] = h;
+  }
+
+  // Accumulate transformations.
+
+  for (int i = 0; i < n_-1; i++) {
+    V(n_-1, i) = V(i, i);
+    V(i, i) = 1.0;
+    Real h = d_[i+1];
+    if (h != 0.0) {
+      for (int k = 0; k <= i; k++) {
+        d_[k] = V(k, i+1) / h;
+      }
+      for (int j = 0; j <= i; j++) {
+        Real g = 0.0;
+        for (int k = 0; k <= i; k++) {
+          g += V(k, i+1) * V(k, j);
+        }
+        for (int k = 0; k <= i; k++) {
+          V(k, j) -= g * d_[k];
+        }
+      }
+    }
+    for (int k = 0; k <= i; k++) {
+      V(k, i+1) = 0.0;
+    }
+  }
+  for (int j = 0; j < n_; j++) {
+    d_[j] = V(n_-1, j);
+    V(n_-1, j) = 0.0;
+  }
+  V(n_-1, n_-1) = 1.0;
+   e_[0] = 0.0;
+}
+
+template<typename Real> void EigenvalueDecomposition<Real>::Tql2() {
+  //  This is derived from the Algol procedures tql2, by
+  //  Bowdler, Martin, Reinsch, and Wilkinson, Handbook for
+  //  Auto. Comp., Vol.ii-Linear Algebra, and the corresponding
+  //  Fortran subroutine in EISPACK.
+
+  for (int i = 1; i < n_; i++) {
+     e_[i-1] = e_[i];
+  }
+   e_[n_-1] = 0.0;
+
+  Real f = 0.0;
+  Real tst1 = 0.0;
+  Real eps = std::numeric_limits<Real>::epsilon();
+  for (int l = 0; l < n_; l++) {
+
+    // Find small subdiagonal element
+
+    tst1 = std::max(tst1, std::abs(d_[l]) + std::abs(e_[l]));
+    int m = l;
+    while (m < n_) {
+      if (std::abs(e_[m]) <= eps*tst1) {
+        break;
+      }
+      m++;
+    }
+
+    // If m == l, d_[l] is an eigenvalue,
+    // otherwise, iterate.
+
+    if (m > l) {
+      int iter = 0;
+      do {
+        iter = iter + 1;  // (Could check iteration count here.)
+
+        // Compute implicit shift
+
+        Real g = d_[l];
+        Real p = (d_[l+1] - g) / (2.0 *e_[l]);
+        Real r = Hypot(p, static_cast<Real>(1.0));  // This is a Kaldi version of hypot that works with templates.
+        if (p < 0) {
+          r = -r;
+        }
+        d_[l] =e_[l] / (p + r);
+        d_[l+1] =e_[l] * (p + r);
+        Real dl1 = d_[l+1];
+        Real h = g - d_[l];
+        for (int i = l+2; i < n_; i++) {
+          d_[i] -= h;
+        }
+        f = f + h;
+
+        // Implicit QL transformation.
+
+        p = d_[m];
+        Real c = 1.0;
+        Real c2 = c;
+        Real c3 = c;
+        Real el1 =e_[l+1];
+        Real s = 0.0;
+        Real s2 = 0.0;
+        for (int i = m-1; i >= l; i--) {
+          c3 = c2;
+          c2 = c;
+          s2 = s;
+          g = c *e_[i];
+          h = c * p;
+          r = Hypot(p, e_[i]);  // This is a Kaldi version of Hypot that works with templates.
+          e_[i+1] = s * r;
+          s =e_[i] / r;
+          c = p / r;
+          p = c * d_[i] - s * g;
+          d_[i+1] = h + s * (c * g + s * d_[i]);
+
+          // Accumulate transformation.
+
+          for (int k = 0; k < n_; k++) {
+            h = V(k, i+1);
+            V(k, i+1) = s * V(k, i) + c * h;
+            V(k, i) = c * V(k, i) - s * h;
+          }
+        }
+        p = -s * s2 * c3 * el1 *e_[l] / dl1;
+        e_[l] = s * p;
+        d_[l] = c * p;
+
+        // Check for convergence.
+
+      } while (std::abs(e_[l]) > eps*tst1);
+    }
+    d_[l] = d_[l] + f;
+    e_[l] = 0.0;
+  }
+
+  // Sort eigenvalues and corresponding vectors.
+
+  for (int i = 0; i < n_-1; i++) {
+    int k = i;
+    Real p = d_[i];
+    for (int j = i+1; j < n_; j++) {
+      if (d_[j] < p) {
+        k = j;
+        p = d_[j];
+      }
+    }
+    if (k != i) {
+      d_[k] = d_[i];
+      d_[i] = p;
+      for (int j = 0; j < n_; j++) {
+        p = V(j, i);
+        V(j, i) = V(j, k);
+        V(j, k) = p;
+      }
+    }
+  }
+}
+
+template<typename Real>
+void EigenvalueDecomposition<Real>::Orthes() {
+
+  //  This is derived from the Algol procedures orthes and ortran,
+  //  by Martin and Wilkinson, Handbook for Auto. Comp.,
+  //  Vol.ii-Linear Algebra, and the corresponding
+  //  Fortran subroutines in EISPACK.
+
+  int low = 0;
+  int high = n_-1;
+
+  for (int m = low+1; m <= high-1; m++) {
+
+    // Scale column.
+
+    Real scale = 0.0;
+    for (int i = m; i <= high; i++) {
+      scale = scale + std::abs(H(i, m-1));
+    }
+    if (scale != 0.0) {
+
+      // Compute Householder transformation.
+
+      Real h = 0.0;
+      for (int i = high; i >= m; i--) {
+        ort_[i] = H(i, m-1)/scale;
+        h += ort_[i] * ort_[i];
+      }
+      Real g = std::sqrt(h);
+      if (ort_[m] > 0) {
+        g = -g;
+      }
+      h = h - ort_[m] * g;
+      ort_[m] = ort_[m] - g;
+
+      // Apply Householder similarity transformation
+      // H = (I-u*u'/h)*H*(I-u*u')/h)
+
+      for (int j = m; j < n_; j++) {
+        Real f = 0.0;
+        for (int i = high; i >= m; i--) {
+          f += ort_[i]*H(i, j);
+        }
+        f = f/h;
+        for (int i = m; i <= high; i++) {
+          H(i, j) -= f*ort_[i];
+        }
+      }
+
+      for (int i = 0; i <= high; i++) {
+        Real f = 0.0;
+        for (int j = high; j >= m; j--) {
+          f += ort_[j]*H(i, j);
+        }
+        f = f/h;
+        for (int j = m; j <= high; j++) {
+          H(i, j) -= f*ort_[j];
+        }
+      }
+      ort_[m] = scale*ort_[m];
+      H(m, m-1) = scale*g;
+    }
+  }
+
+  // Accumulate transformations (Algol's ortran).
+
+  for (int i = 0; i < n_; i++) {
+    for (int j = 0; j < n_; j++) {
+      V(i, j) = (i == j ? 1.0 : 0.0);
+    }
+  }
+
+  for (int m = high-1; m >= low+1; m--) {
+    if (H(m, m-1) != 0.0) {
+      for (int i = m+1; i <= high; i++) {
+        ort_[i] = H(i, m-1);
+      }
+      for (int j = m; j <= high; j++) {
+        Real g = 0.0;
+        for (int i = m; i <= high; i++) {
+          g += ort_[i] * V(i, j);
+        }
+        // Double division avoids possible underflow
+        g = (g / ort_[m]) / H(m, m-1);
+        for (int i = m; i <= high; i++) {
+          V(i, j) += g * ort_[i];
+        }
+      }
+    }
+  }
+}
+
+template<typename Real> void  EigenvalueDecomposition<Real>::Hqr2() {
+  //  This is derived from the Algol procedure hqr2,
+  //  by Martin and Wilkinson, Handbook for Auto. Comp.,
+  //  Vol.ii-Linear Algebra, and the corresponding
+  //  Fortran subroutine in EISPACK.
+
+  int nn = n_;
+  int n = nn-1;
+  int low = 0;
+  int high = nn-1;
+  Real eps = std::numeric_limits<Real>::epsilon();
+  Real exshift = 0.0;
+  Real p = 0, q = 0, r = 0, s = 0, z=0, t, w, x, y;
+
+  // Store roots isolated by balanc and compute matrix norm
+
+  Real norm = 0.0;
+  for (int i = 0; i < nn; i++) {
+    if (i < low || i > high) {
+      d_[i] = H(i, i);
+      e_[i] = 0.0;
+    }
+    for (int j = std::max(i-1, 0); j < nn; j++) {
+      norm = norm + std::abs(H(i, j));
+    }
+  }
+
+  // Outer loop over eigenvalue index
+
+  int iter = 0;
+  while (n >= low) {
+
+    // Look for single small sub-diagonal element
+
+    int l = n;
+    while (l > low) {
+      s = std::abs(H(l-1, l-1)) + std::abs(H(l, l));
+      if (s == 0.0) {
+        s = norm;
+      }
+      if (std::abs(H(l, l-1)) < eps * s) {
+        break;
+      }
+      l--;
+    }
+
+    // Check for convergence
+    // One root found
+
+    if (l == n) {
+      H(n, n) = H(n, n) + exshift;
+      d_[n] = H(n, n);
+      e_[n] = 0.0;
+      n--;
+      iter = 0;
+
+      // Two roots found
+
+    } else if (l == n-1) {
+      w = H(n, n-1) * H(n-1, n);
+      p = (H(n-1, n-1) - H(n, n)) / 2.0;
+      q = p * p + w;
+      z = std::sqrt(std::abs(q));
+      H(n, n) = H(n, n) + exshift;
+      H(n-1, n-1) = H(n-1, n-1) + exshift;
+      x = H(n, n);
+
+      // Real pair
+
+      if (q >= 0) {
+        if (p >= 0) {
+          z = p + z;
+        } else {
+          z = p - z;
+        }
+        d_[n-1] = x + z;
+        d_[n] = d_[n-1];
+        if (z != 0.0) {
+          d_[n] = x - w / z;
+        }
+        e_[n-1] = 0.0;
+        e_[n] = 0.0;
+        x = H(n, n-1);
+        s = std::abs(x) + std::abs(z);
+        p = x / s;
+        q = z / s;
+        r = std::sqrt(p * p+q * q);
+        p = p / r;
+        q = q / r;
+
+        // Row modification
+
+        for (int j = n-1; j < nn; j++) {
+          z = H(n-1, j);
+          H(n-1, j) = q * z + p * H(n, j);
+          H(n, j) = q * H(n, j) - p * z;
+        }
+
+        // Column modification
+
+        for (int i = 0; i <= n; i++) {
+          z = H(i, n-1);
+          H(i, n-1) = q * z + p * H(i, n);
+          H(i, n) = q * H(i, n) - p * z;
+        }
+
+        // Accumulate transformations
+
+        for (int i = low; i <= high; i++) {
+          z = V(i, n-1);
+          V(i, n-1) = q * z + p * V(i, n);
+          V(i, n) = q * V(i, n) - p * z;
+        }
+
+        // Complex pair
+
+      } else {
+        d_[n-1] = x + p;
+        d_[n] = x + p;
+        e_[n-1] = z;
+        e_[n] = -z;
+      }
+      n = n - 2;
+      iter = 0;
+
+      // No convergence yet
+
+    } else {
+
+      // Form shift
+
+      x = H(n, n);
+      y = 0.0;
+      w = 0.0;
+      if (l < n) {
+        y = H(n-1, n-1);
+        w = H(n, n-1) * H(n-1, n);
+      }
+
+      // Wilkinson's original ad hoc shift
+
+      if (iter == 10) {
+        exshift += x;
+        for (int i = low; i <= n; i++) {
+          H(i, i) -= x;
+        }
+        s = std::abs(H(n, n-1)) + std::abs(H(n-1, n-2));
+        x = y = 0.75 * s;
+        w = -0.4375 * s * s;
+      }
+
+      // MATLAB's new ad hoc shift
+
+      if (iter == 30) {
+        s = (y - x) / 2.0;
+        s = s * s + w;
+        if (s > 0) {
+          s = std::sqrt(s);
+          if (y < x) {
+            s = -s;
+          }
+          s = x - w / ((y - x) / 2.0 + s);
+          for (int i = low; i <= n; i++) {
+            H(i, i) -= s;
+          }
+          exshift += s;
+          x = y = w = 0.964;
+        }
+      }
+
+      iter = iter + 1;   // (Could check iteration count here.)
+
+      // Look for two consecutive small sub-diagonal elements
+
+      int m = n-2;
+      while (m >= l) {
+        z = H(m, m);
+        r = x - z;
+        s = y - z;
+        p = (r * s - w) / H(m+1, m) + H(m, m+1);
+        q = H(m+1, m+1) - z - r - s;
+        r = H(m+2, m+1);
+        s = std::abs(p) + std::abs(q) + std::abs(r);
+        p = p / s;
+        q = q / s;
+        r = r / s;
+        if (m == l) {
+          break;
+        }
+        if (std::abs(H(m, m-1)) * (std::abs(q) + std::abs(r)) <
+            eps * (std::abs(p) * (std::abs(H(m-1, m-1)) + std::abs(z) +
+                                  std::abs(H(m+1, m+1))))) {
+          break;
+        }
+        m--;
+      }
+
+      for (int i = m+2; i <= n; i++) {
+        H(i, i-2) = 0.0;
+        if (i > m+2) {
+          H(i, i-3) = 0.0;
+        }
+      }
+
+      // Double QR step involving rows l:n and columns m:n
+
+      for (int k = m; k <= n-1; k++) {
+        bool notlast = (k != n-1);
+        if (k != m) {
+          p = H(k, k-1);
+          q = H(k+1, k-1);
+          r = (notlast ? H(k+2, k-1) : 0.0);
+          x = std::abs(p) + std::abs(q) + std::abs(r);
+          if (x != 0.0) {
+            p = p / x;
+            q = q / x;
+            r = r / x;
+          }
+        }
+        if (x == 0.0) {
+          break;
+        }
+        s = std::sqrt(p * p + q * q + r * r);
+        if (p < 0) {
+          s = -s;
+        }
+        if (s != 0) {
+          if (k != m) {
+            H(k, k-1) = -s * x;
+          } else if (l != m) {
+            H(k, k-1) = -H(k, k-1);
+          }
+          p = p + s;
+          x = p / s;
+          y = q / s;
+          z = r / s;
+          q = q / p;
+          r = r / p;
+
+          // Row modification
+
+          for (int j = k; j < nn; j++) {
+            p = H(k, j) + q * H(k+1, j);
+            if (notlast) {
+              p = p + r * H(k+2, j);
+              H(k+2, j) = H(k+2, j) - p * z;
+            }
+            H(k, j) = H(k, j) - p * x;
+            H(k+1, j) = H(k+1, j) - p * y;
+          }
+
+          // Column modification
+
+          for (int i = 0; i <= std::min(n, k+3); i++) {
+            p = x * H(i, k) + y * H(i, k+1);
+            if (notlast) {
+              p = p + z * H(i, k+2);
+              H(i, k+2) = H(i, k+2) - p * r;
+            }
+            H(i, k) = H(i, k) - p;
+            H(i, k+1) = H(i, k+1) - p * q;
+          }
+
+          // Accumulate transformations
+
+          for (int i = low; i <= high; i++) {
+            p = x * V(i, k) + y * V(i, k+1);
+            if (notlast) {
+              p = p + z * V(i, k+2);
+              V(i, k+2) = V(i, k+2) - p * r;
+            }
+            V(i, k) = V(i, k) - p;
+            V(i, k+1) = V(i, k+1) - p * q;
+          }
+        }  // (s != 0)
+      }  // k loop
+    }  // check convergence
+  }  // while (n >= low)
+
+  // Backsubstitute to find vectors of upper triangular form
+
+  if (norm == 0.0) {
+    return;
+  }
+
+  for (n = nn-1; n >= 0; n--) {
+    p = d_[n];
+    q = e_[n];
+
+    // Real vector
+
+    if (q == 0) {
+      int l = n;
+      H(n, n) = 1.0;
+      for (int i = n-1; i >= 0; i--) {
+        w = H(i, i) - p;
+        r = 0.0;
+        for (int j = l; j <= n; j++) {
+          r = r + H(i, j) * H(j, n);
+        }
+        if (e_[i] < 0.0) {
+          z = w;
+          s = r;
+        } else {
+          l = i;
+          if (e_[i] == 0.0) {
+            if (w != 0.0) {
+              H(i, n) = -r / w;
+            } else {
+              H(i, n) = -r / (eps * norm);
+            }
+
+            // Solve real equations
+
+          } else {
+            x = H(i, i+1);
+            y = H(i+1, i);
+            q = (d_[i] - p) * (d_[i] - p) +e_[i] *e_[i];
+            t = (x * s - z * r) / q;
+            H(i, n) = t;
+            if (std::abs(x) > std::abs(z)) {
+              H(i+1, n) = (-r - w * t) / x;
+            } else {
+              H(i+1, n) = (-s - y * t) / z;
+            }
+          }
+
+          // Overflow control
+
+          t = std::abs(H(i, n));
+          if ((eps * t) * t > 1) {
+            for (int j = i; j <= n; j++) {
+              H(j, n) = H(j, n) / t;
+            }
+          }
+        }
+      }
+
+      // Complex vector
+
+    } else if (q < 0) {
+      int l = n-1;
+
+      // Last vector component imaginary so matrix is triangular
+
+      if (std::abs(H(n, n-1)) > std::abs(H(n-1, n))) {
+        H(n-1, n-1) = q / H(n, n-1);
+        H(n-1, n) = -(H(n, n) - p) / H(n, n-1);
+      } else {
+        Real cdivr, cdivi;
+        cdiv(0.0, -H(n-1, n), H(n-1, n-1)-p, q, &cdivr, &cdivi);
+        H(n-1, n-1) = cdivr;
+        H(n-1, n) = cdivi;
+      }
+      H(n, n-1) = 0.0;
+      H(n, n) = 1.0;
+      for (int i = n-2; i >= 0; i--) {
+        Real ra, sa, vr, vi;
+        ra = 0.0;
+        sa = 0.0;
+        for (int j = l; j <= n; j++) {
+          ra = ra + H(i, j) * H(j, n-1);
+          sa = sa + H(i, j) * H(j, n);
+        }
+        w = H(i, i) - p;
+
+        if (e_[i] < 0.0) {
+          z = w;
+          r = ra;
+          s = sa;
+        } else {
+          l = i;
+          if (e_[i] == 0) {
+            Real cdivr, cdivi;
+            cdiv(-ra, -sa, w, q, &cdivr, &cdivi);
+            H(i, n-1) = cdivr;
+            H(i, n) = cdivi;
+          } else {
+            Real cdivr, cdivi;
+            // Solve complex equations
+
+            x = H(i, i+1);
+            y = H(i+1, i);
+            vr = (d_[i] - p) * (d_[i] - p) +e_[i] *e_[i] - q * q;
+            vi = (d_[i] - p) * 2.0 * q;
+            if (vr == 0.0 && vi == 0.0) {
+              vr = eps * norm * (std::abs(w) + std::abs(q) +
+                                 std::abs(x) + std::abs(y) + std::abs(z));
+            }
+            cdiv(x*r-z*ra+q*sa, x*s-z*sa-q*ra, vr, vi, &cdivr, &cdivi);
+            H(i, n-1) = cdivr;
+            H(i, n) = cdivi;
+            if (std::abs(x) > (std::abs(z) + std::abs(q))) {
+              H(i+1, n-1) = (-ra - w * H(i, n-1) + q * H(i, n)) / x;
+              H(i+1, n) = (-sa - w * H(i, n) - q * H(i, n-1)) / x;
+            } else {
+              cdiv(-r-y*H(i, n-1), -s-y*H(i, n), z, q, &cdivr, &cdivi);
+              H(i+1, n-1) = cdivr;
+              H(i+1, n) = cdivi;
+            }
+          }
+
+          // Overflow control
+
+          t = std::max(std::abs(H(i, n-1)), std::abs(H(i, n)));
+          if ((eps * t) * t > 1) {
+            for (int j = i; j <= n; j++) {
+              H(j, n-1) = H(j, n-1) / t;
+              H(j, n) = H(j, n) / t;
+            }
+          }
+        }
+      }
+    }
+  }
+
+  // Vectors of isolated roots
+
+  for (int i = 0; i < nn; i++) {
+    if (i < low || i > high) {
+      for (int j = i; j < nn; j++) {
+        V(i, j) = H(i, j);
+      }
+    }
+  }
+
+  // Back transformation to get eigenvectors of original matrix
+
+  for (int j = nn-1; j >= low; j--) {
+    for (int i = low; i <= high; i++) {
+      z = 0.0;
+      for (int k = low; k <= std::min(j, high); k++) {
+        z = z + V(i, k) * H(k, j);
+      }
+      V(i, j) = z;
+    }
+  }
+}
+
+template<typename Real>
+EigenvalueDecomposition<Real>::EigenvalueDecomposition(const MatrixBase<Real> &A) {
+  KALDI_ASSERT(A.NumCols() == A.NumRows() && A.NumCols() >= 1);
+  n_ = A.NumRows();
+  V_ = new Real[n_*n_];
+  d_ = new Real[n_];
+  e_ = new Real[n_];
+  H_ = NULL;
+  ort_ = NULL;
+  if (A.IsSymmetric(0.0)) {
+
+    for (int i = 0; i < n_; i++)
+      for (int j = 0; j < n_; j++)
+        V(i, j) = A(i, j);  // Note that V(i, j) is a member function; A(i, j) is an operator
+    // of the matrix A.
+    // Tridiagonalize.
+    Tred2();
+
+    // Diagonalize.
+    Tql2();
+  } else {
+    H_ = new Real[n_*n_];
+    ort_ = new Real[n_];
+    for (int i = 0; i < n_; i++)
+      for (int j = 0; j < n_; j++)
+        H(i, j) = A(i, j);  // as before: H is member function, A(i, j) is operator of matrix.
+
+    // Reduce to Hessenberg form.
+    Orthes();
+
+    // Reduce Hessenberg to real Schur form.
+    Hqr2();
+  }
+}
+
+template<typename Real>
+EigenvalueDecomposition<Real>::~EigenvalueDecomposition() {
+  delete [] d_;
+  delete [] e_;
+  delete [] V_;
+  if (H_) delete [] H_;
+  if (ort_) delete [] ort_;
+}
+
+// see function MatrixBase<Real>::Eig in kaldi-matrix.cc
+
+
+} // namespace kaldi
+
+#endif // KALDI_MATRIX_JAMA_EIG_H_
diff --git a/kaldi_io/src/kaldi/matrix/jama-svd.h b/kaldi_io/src/kaldi/matrix/jama-svd.h
new file mode 100644
index 0000000..8304dac
--- /dev/null
+++ b/kaldi_io/src/kaldi/matrix/jama-svd.h
@@ -0,0 +1,531 @@
+// matrix/jama-svd.h
+
+// Copyright 2009-2011 Microsoft Corporation
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+// This file consists of a port and modification of materials from
+//   JAMA: A Java Matrix Package
+// under the following notice: This software is a cooperative product of
+// The MathWorks and the National Institute of Standards and Technology (NIST)
+// which has been released to the public.  This notice and the original code are
+// available at http://math.nist.gov/javanumerics/jama/domain.notice
+
+
+#ifndef KALDI_MATRIX_JAMA_SVD_H_
+#define KALDI_MATRIX_JAMA_SVD_H_ 1
+
+
+#include "matrix/kaldi-matrix.h"
+#include "matrix/sp-matrix.h"
+#include "matrix/cblas-wrappers.h"
+
+namespace kaldi {
+
+#if defined(HAVE_ATLAS) || defined(USE_KALDI_SVD)
+// using ATLAS as our math library, which doesn't have SVD -> need
+// to implement it.
+
+// This routine is a modified form of jama_svd.h which is part of the TNT distribution.
+// (originally comes from JAMA).
+
+/** Singular Value Decomposition.
+ * <P>
+ * For an m-by-n matrix A with m >= n, the singular value decomposition is
+ * an m-by-n orthogonal matrix U, an n-by-n diagonal matrix S, and
+ * an n-by-n orthogonal matrix V so that A = U*S*V'.
+ * <P>
+ * The singular values, sigma[k] = S(k, k), are ordered so that
+ * sigma[0] >= sigma[1] >= ... >= sigma[n-1].
+ * <P>
+ * The singular value decompostion always exists, so the constructor will
+ * never fail.  The matrix condition number and the effective numerical
+ * rank can be computed from this decomposition.
+
+ * <p>
+ *     (Adapted from JAMA, a Java Matrix Library, developed by jointly
+ *     by the Mathworks and NIST; see  http://math.nist.gov/javanumerics/jama).
+ */
+
+
+template<typename Real>
+bool MatrixBase<Real>::JamaSvd(VectorBase<Real> *s_in,
+                               MatrixBase<Real> *U_in,
+                               MatrixBase<Real> *V_in) {  //  Destructive!
+  KALDI_ASSERT(s_in != NULL && U_in != this && V_in != this);
+  int wantu = (U_in != NULL), wantv = (V_in != NULL);
+  Matrix<Real> Utmp, Vtmp;
+  MatrixBase<Real> &U = (U_in ? *U_in : Utmp), &V = (V_in ? *V_in : Vtmp);
+  VectorBase<Real> &s = *s_in;
+
+  int m = num_rows_, n = num_cols_;
+  KALDI_ASSERT(m>=n && m != 0 && n != 0);
+  if (wantu) KALDI_ASSERT((int)U.num_rows_ == m && (int)U.num_cols_ == n);
+  if (wantv) KALDI_ASSERT((int)V.num_rows_ == n && (int)V.num_cols_ == n);
+  KALDI_ASSERT((int)s.Dim() == n);  // n<=m so n is min.
+
+  int nu = n;
+  U.SetZero();  // make sure all zero.
+  Vector<Real> e(n);
+  Vector<Real> work(m);
+  MatrixBase<Real> &A(*this);
+  Real *adata = A.Data(), *workdata = work.Data(), *edata = e.Data(),
+      *udata = U.Data(), *vdata = V.Data();
+  int astride = static_cast<int>(A.Stride()),
+      ustride = static_cast<int>(U.Stride()),
+      vstride = static_cast<int>(V.Stride());
+  int i = 0, j = 0, k = 0;
+
+  // Reduce A to bidiagonal form, storing the diagonal elements
+  // in s and the super-diagonal elements in e.
+
+  int nct = std::min(m-1, n);
+  int nrt = std::max(0, std::min(n-2, m));
+  for (k = 0; k < std::max(nct, nrt); k++) {
+    if (k < nct) {
+
+      // Compute the transformation for the k-th column and
+      // place the k-th diagonal in s(k).
+      // Compute 2-norm of k-th column without under/overflow.
+      s(k) = 0;
+      for (i = k; i < m; i++) {
+        s(k) = hypot(s(k), A(i, k));
+      }
+      if (s(k) != 0.0) {
+        if (A(k, k) < 0.0) {
+          s(k) = -s(k);
+        }
+        for (i = k; i < m; i++) {
+          A(i, k) /= s(k);
+        }
+        A(k, k) += 1.0;
+      }
+      s(k) = -s(k);
+    }
+    for (j = k+1; j < n; j++) {
+      if ((k < nct) && (s(k) != 0.0))  {
+
+        // Apply the transformation.
+
+        Real t = cblas_Xdot(m - k, adata + astride*k + k, astride,
+                            adata + astride*k + j, astride);
+        /*for (i = k; i < m; i++) {
+          t += adata[i*astride + k]*adata[i*astride + j];  //   A(i, k)*A(i, j); // 3
+          }*/
+        t = -t/A(k, k);
+        cblas_Xaxpy(m - k, t, adata + k*astride + k, astride,
+                    adata + k*astride + j, astride);
+        /*for (i = k; i < m; i++) {
+          adata[i*astride + j] += t*adata[i*astride + k];  // A(i, j) += t*A(i, k); // 5
+          }*/
+      }
+
+      // Place the k-th row of A into e for the
+      // subsequent calculation of the row transformation.
+
+      e(j) = A(k, j);
+    }
+    if (wantu & (k < nct)) {
+
+      // Place the transformation in U for subsequent back
+      // multiplication.
+
+      for (i = k; i < m; i++) {
+        U(i, k) = A(i, k);
+      }
+    }
+    if (k < nrt) {
+
+      // Compute the k-th row transformation and place the
+      // k-th super-diagonal in e(k).
+      // Compute 2-norm without under/overflow.
+      e(k) = 0;
+      for (i = k+1; i < n; i++) {
+        e(k) = hypot(e(k), e(i));
+      }
+      if (e(k) != 0.0) {
+        if (e(k+1) < 0.0) {
+          e(k) = -e(k);
+        }
+        for (i = k+1; i < n; i++) {
+          e(i) /= e(k);
+        }
+        e(k+1) += 1.0;
+      }
+      e(k) = -e(k);
+      if ((k+1 < m) & (e(k) != 0.0)) {
+
+        // Apply the transformation.
+
+        for (i = k+1; i < m; i++) {
+          work(i) = 0.0;
+        }
+        for (j = k+1; j < n; j++) {
+          for (i = k+1; i < m; i++) {
+            workdata[i] += edata[j] * adata[i*astride + j];  // work(i) += e(j)*A(i, j); // 5
+          }
+        }
+        for (j = k+1; j < n; j++) {
+          Real t(-e(j)/e(k+1));
+          cblas_Xaxpy(m - (k+1), t, workdata + (k+1), 1,
+                      adata + (k+1)*astride + j, astride);
+          /*
+          for (i = k+1; i < m; i++) {
+            adata[i*astride + j] += t*workdata[i];  // A(i, j) += t*work(i); // 5
+            }*/
+        }
+      }
+      if (wantv) {
+
+        // Place the transformation in V for subsequent
+        // back multiplication.
+
+        for (i = k+1; i < n; i++) {
+          V(i, k) = e(i);
+        }
+      }
+    }
+  }
+
+  // Set up the final bidiagonal matrix or order p.
+
+  int p = std::min(n, m+1);
+  if (nct < n) {
+    s(nct) = A(nct, nct);
+  }
+  if (m < p) {
+    s(p-1) = 0.0;
+  }
+  if (nrt+1 < p) {
+    e(nrt) = A(nrt, p-1);
+  }
+  e(p-1) = 0.0;
+
+  // If required, generate U.
+
+  if (wantu) {
+    for (j = nct; j < nu; j++) {
+      for (i = 0; i < m; i++) {
+        U(i, j) = 0.0;
+      }
+      U(j, j) = 1.0;
+    }
+    for (k = nct-1; k >= 0; k--) {
+      if (s(k) != 0.0) {
+        for (j = k+1; j < nu; j++) {
+          Real t = cblas_Xdot(m - k, udata + k*ustride + k, ustride, udata + k*ustride + j, ustride);
+          //for (i = k; i < m; i++) {
+          //  t += udata[i*ustride + k]*udata[i*ustride + j];  // t += U(i, k)*U(i, j); // 8
+          // }
+          t = -t/U(k, k);
+          cblas_Xaxpy(m - k, t, udata + ustride*k + k, ustride,
+                      udata + k*ustride + j, ustride);
+          /*for (i = k; i < m; i++) {
+            udata[i*ustride + j] += t*udata[i*ustride + k];  // U(i, j) += t*U(i, k); // 4
+            }*/
+        }
+        for (i = k; i < m; i++ ) {
+          U(i, k) = -U(i, k);
+        }
+        U(k, k) = 1.0 + U(k, k);
+        for (i = 0; i < k-1; i++) {
+          U(i, k) = 0.0;
+        }
+      } else {
+        for (i = 0; i < m; i++) {
+          U(i, k) = 0.0;
+        }
+        U(k, k) = 1.0;
+      }
+    }
+  }
+
+  // If required, generate V.
+
+  if (wantv) {
+    for (k = n-1; k >= 0; k--) {
+      if ((k < nrt) & (e(k) != 0.0)) {
+        for (j = k+1; j < nu; j++) {
+          Real t = cblas_Xdot(n - (k+1), vdata + (k+1)*vstride + k, vstride,
+                              vdata + (k+1)*vstride + j, vstride); 
+          /*Real t (0.0);
+          for (i = k+1; i < n; i++) {
+            t += vdata[i*vstride + k]*vdata[i*vstride + j];  // t += V(i, k)*V(i, j); // 7
+            }*/
+          t = -t/V(k+1, k);
+          cblas_Xaxpy(n - (k+1), t, vdata + (k+1)*vstride + k, vstride,
+                      vdata + (k+1)*vstride + j, vstride);
+          /*for (i = k+1; i < n; i++) {
+            vdata[i*vstride + j] += t*vdata[i*vstride + k];  // V(i, j) += t*V(i, k); // 7
+            }*/
+        }
+      }
+      for (i = 0; i < n; i++) {
+        V(i, k) = 0.0;
+      }
+      V(k, k) = 1.0;
+    }
+  }
+
+  // Main iteration loop for the singular values.
+
+  int pp = p-1;
+  int iter = 0;
+  // note: -52.0 is from Jama code; the -23 is the extension
+  // to float, because mantissa length in (double, float)
+  // is (52, 23) bits respectively.
+  Real eps(pow(2.0, sizeof(Real) == 4 ? -23.0 : -52.0));
+  // Note: the -966 was taken from Jama code, but the -120 is a guess
+  // of how to extend this to float... the exponent in double goes
+  // from -1022 .. 1023, and in float from -126..127.  I'm not sure
+  // what the significance of 966 is, so -120 just represents a number
+  // that's a bit less negative than -126.  If we get convergence
+  // failure in float only, this may mean that we have to make the
+  // -120 value less negative.
+  Real tiny(pow(2.0, sizeof(Real) == 4 ? -120.0: -966.0 ));
+  
+  while (p > 0) {
+    int k = 0;
+    int kase = 0;
+
+    if (iter == 500 || iter == 750) {
+      KALDI_WARN << "Svd taking a long time: making convergence criterion less exact.";
+      eps = pow(static_cast<Real>(0.8), eps);
+      tiny = pow(static_cast<Real>(0.8), tiny);
+    }
+    if (iter > 1000) {
+      KALDI_WARN << "Svd not converging on matrix of size " << m << " by " <<n;
+      return false;
+    }
+
+    // This section of the program inspects for
+    // negligible elements in the s and e arrays.  On
+    // completion the variables kase and k are set as follows.
+
+    // kase = 1     if s(p) and e(k-1) are negligible and k < p
+    // kase = 2     if s(k) is negligible and k < p
+    // kase = 3     if e(k-1) is negligible, k < p, and
+    //              s(k), ..., s(p) are not negligible (qr step).
+    // kase = 4     if e(p-1) is negligible (convergence).
+
+    for (k = p-2; k >= -1; k--) {
+      if (k == -1) {
+        break;
+      }
+      if (std::abs(e(k)) <=
+          tiny + eps*(std::abs(s(k)) + std::abs(s(k+1)))) {
+        e(k) = 0.0;
+        break;
+      }
+    }
+    if (k == p-2) {
+      kase = 4;
+    } else {
+      int ks;
+      for (ks = p-1; ks >= k; ks--) {
+        if (ks == k) {
+          break;
+        }
+        Real t( (ks != p ? std::abs(e(ks)) : 0.) +
+                (ks != k+1 ? std::abs(e(ks-1)) : 0.));
+        if (std::abs(s(ks)) <= tiny + eps*t)  {
+          s(ks) = 0.0;
+          break;
+        }
+      }
+      if (ks == k) {
+        kase = 3;
+      } else if (ks == p-1) {
+        kase = 1;
+      } else {
+        kase = 2;
+        k = ks;
+      }
+    }
+    k++;
+
+    // Perform the task indicated by kase.
+
+    switch (kase) {
+
+      // Deflate negligible s(p).
+
+      case 1: {
+        Real f(e(p-2));
+        e(p-2) = 0.0;
+        for (j = p-2; j >= k; j--) {
+          Real t( hypot(s(j), f));
+          Real cs(s(j)/t);
+          Real sn(f/t);
+          s(j) = t;
+          if (j != k) {
+            f = -sn*e(j-1);
+            e(j-1) = cs*e(j-1);
+          }
+          if (wantv) {
+            for (i = 0; i < n; i++) {
+              t = cs*V(i, j) + sn*V(i, p-1);
+              V(i, p-1) = -sn*V(i, j) + cs*V(i, p-1);
+              V(i, j) = t;
+            }
+          }
+        }
+      }
+        break;
+
+        // Split at negligible s(k).
+
+      case 2: {
+        Real f(e(k-1));
+        e(k-1) = 0.0;
+        for (j = k; j < p; j++) {
+          Real t(hypot(s(j), f));
+          Real cs( s(j)/t);
+          Real sn(f/t);
+          s(j) = t;
+          f = -sn*e(j);
+          e(j) = cs*e(j);
+          if (wantu) {
+            for (i = 0; i < m; i++) {
+              t = cs*U(i, j) + sn*U(i, k-1);
+              U(i, k-1) = -sn*U(i, j) + cs*U(i, k-1);
+              U(i, j) = t;
+            }
+          }
+        }
+      }
+        break;
+
+        // Perform one qr step.
+
+      case 3: {
+
+        // Calculate the shift.
+
+        Real scale = std::max(std::max(std::max(std::max(
+            std::abs(s(p-1)), std::abs(s(p-2))), std::abs(e(p-2))),
+                                       std::abs(s(k))), std::abs(e(k)));
+        Real sp = s(p-1)/scale;
+        Real spm1 = s(p-2)/scale;
+        Real epm1 = e(p-2)/scale;
+        Real sk = s(k)/scale;
+        Real ek = e(k)/scale;
+        Real b = ((spm1 + sp)*(spm1 - sp) + epm1*epm1)/2.0;
+        Real c = (sp*epm1)*(sp*epm1);
+        Real shift = 0.0;
+        if ((b != 0.0) || (c != 0.0)) {
+          shift = std::sqrt(b*b + c);
+          if (b < 0.0) {
+            shift = -shift;
+          }
+          shift = c/(b + shift);
+        }
+        Real f = (sk + sp)*(sk - sp) + shift;
+        Real g = sk*ek;
+
+        // Chase zeros.
+
+        for (j = k; j < p-1; j++) {
+          Real t = hypot(f, g);
+          Real cs = f/t;
+          Real sn = g/t;
+          if (j != k) {
+            e(j-1) = t;
+          }
+          f = cs*s(j) + sn*e(j);
+          e(j) = cs*e(j) - sn*s(j);
+          g = sn*s(j+1);
+          s(j+1) = cs*s(j+1);
+          if (wantv) {
+            cblas_Xrot(n, vdata + j, vstride, vdata + j+1, vstride, cs, sn);
+            /*for (i = 0; i < n; i++) {
+              t = cs*vdata[i*vstride + j] + sn*vdata[i*vstride + j+1];  // t = cs*V(i, j) + sn*V(i, j+1);         // 13
+              vdata[i*vstride + j+1] = -sn*vdata[i*vstride + j] + cs*vdata[i*vstride + j+1];  // V(i, j+1) = -sn*V(i, j) + cs*V(i, j+1); // 5
+              vdata[i*vstride + j] = t;  // V(i, j) = t; // 4
+              }*/
+          }
+          t = hypot(f, g);
+          cs = f/t;
+          sn = g/t;
+          s(j) = t;
+          f = cs*e(j) + sn*s(j+1);
+          s(j+1) = -sn*e(j) + cs*s(j+1);
+          g = sn*e(j+1);
+          e(j+1) = cs*e(j+1);
+          if (wantu && (j < m-1)) {
+            cblas_Xrot(m, udata + j, ustride, udata + j+1, ustride, cs, sn);
+            /*for (i = 0; i < m; i++) {
+              t = cs*udata[i*ustride + j] + sn*udata[i*ustride + j+1];  // t = cs*U(i, j) + sn*U(i, j+1); // 7
+              udata[i*ustride + j+1] = -sn*udata[i*ustride + j] +cs*udata[i*ustride + j+1];  // U(i, j+1) = -sn*U(i, j) + cs*U(i, j+1); // 8
+              udata[i*ustride + j] = t;  // U(i, j) = t; // 1
+              }*/
+          }
+        }
+        e(p-2) = f;
+        iter = iter + 1;
+      }
+        break;
+
+        // Convergence.
+
+      case 4: {
+
+        // Make the singular values positive.
+
+        if (s(k) <= 0.0) {
+          s(k) = (s(k) < 0.0 ? -s(k) : 0.0);
+          if (wantv) {
+            for (i = 0; i <= pp; i++) {
+              V(i, k) = -V(i, k);
+            }
+          }
+        }
+
+        // Order the singular values.
+
+        while (k < pp) {
+          if (s(k) >= s(k+1)) {
+            break;
+          }
+          Real t = s(k);
+          s(k) = s(k+1);
+          s(k+1) = t;
+          if (wantv && (k < n-1)) {
+            for (i = 0; i < n; i++) {
+              t = V(i, k+1); V(i, k+1) = V(i, k); V(i, k) = t;
+            }
+          }
+          if (wantu && (k < m-1)) {
+            for (i = 0; i < m; i++) {
+              t = U(i, k+1); U(i, k+1) = U(i, k); U(i, k) = t;
+            }
+          }
+          k++;
+        }
+        iter = 0;
+        p--;
+      }
+        break;
+    }
+  }
+  return true;
+}
+
+#endif // defined(HAVE_ATLAS) || defined(USE_KALDI_SVD)
+
+} // namespace kaldi
+
+#endif // KALDI_MATRIX_JAMA_SVD_H_
diff --git a/kaldi_io/src/kaldi/matrix/kaldi-blas.h b/kaldi_io/src/kaldi/matrix/kaldi-blas.h
new file mode 100644
index 0000000..5d25ab8
--- /dev/null
+++ b/kaldi_io/src/kaldi/matrix/kaldi-blas.h
@@ -0,0 +1,132 @@
+// matrix/kaldi-blas.h
+
+// Copyright 2009-2011  Ondrej Glembek;  Microsoft Corporation
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+//  http://www.apache.org/licenses/LICENSE-2.0
+
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+#ifndef KALDI_MATRIX_KALDI_BLAS_H_
+#define KALDI_MATRIX_KALDI_BLAS_H_
+
+// This file handles the #includes for BLAS, LAPACK and so on.
+// It manipulates the declarations into a common format that kaldi can handle.
+// However, the kaldi code will check whether HAVE_ATLAS is defined as that
+// code is called a bit differently from CLAPACK that comes from other sources.
+
+// There are three alternatives:
+//   (i) you have ATLAS, which includes the ATLAS implementation of CBLAS
+//   plus a subset of CLAPACK (but with clapack_ in the function declarations).
+//   In this case, define HAVE_ATLAS and make sure the relevant directories are
+//   in the include path.
+
+//   (ii) you have CBLAS (some implementation thereof) plus CLAPACK.
+//   In this case, define HAVE_CLAPACK.
+//   [Since CLAPACK depends on BLAS, the presence of BLAS is implicit].
+
+//  (iii) you have the MKL library, which includes CLAPACK and CBLAS.
+
+// Note that if we are using ATLAS, no Svd implementation is supplied,
+// so we define HAVE_Svd to be zero and this directs our implementation to
+// supply its own "by hand" implementation which is based on TNT code.
+
+
+
+
+#if (defined(HAVE_CLAPACK) && (defined(HAVE_ATLAS) || defined(HAVE_MKL))) \
+    || (defined(HAVE_ATLAS) && defined(HAVE_MKL))
+#error "Do not define more than one of HAVE_CLAPACK, HAVE_ATLAS and HAVE_MKL"
+#endif
+
+#ifdef HAVE_ATLAS
+  extern "C" {
+    #include <cblas.h>
+    #include <clapack.h>
+  }
+#elif defined(HAVE_CLAPACK)
+  #ifdef __APPLE__
+    #ifndef __has_extension
+    #define __has_extension(x) 0
+    #endif
+    #define vImage_Utilities_h
+    #define vImage_CVUtilities_h
+    #include <Accelerate/Accelerate.h>
+    typedef __CLPK_integer          integer;
+    typedef __CLPK_logical          logical;
+    typedef __CLPK_real             real;
+    typedef __CLPK_doublereal       doublereal;
+    typedef __CLPK_complex          complex;
+    typedef __CLPK_doublecomplex    doublecomplex;
+    typedef __CLPK_ftnlen           ftnlen;
+  #else
+    extern "C" {
+      // May be in /usr/[local]/include if installed; else this uses the one
+      // from the tools/CLAPACK_include directory.
+      #include <cblas.h>
+      #include <f2c.h>
+      #include <clapack.h>  
+
+      // get rid of macros from f2c.h -- these are dangerous.
+      #undef abs
+      #undef dabs
+      #undef min
+      #undef max
+      #undef dmin
+      #undef dmax
+      #undef bit_test
+      #undef bit_clear
+      #undef bit_set
+    }
+  #endif
+#elif defined(HAVE_MKL)
+  extern "C" {
+    #include <mkl.h>
+  }
+#elif defined(HAVE_OPENBLAS)
+  // getting cblas.h and lapacke.h from <openblas-install-dir>/.
+  // putting in "" not <> to search -I before system libraries.
+  #include "cblas.h"
+  #include "lapacke.h"
+  #undef I
+  #undef complex
+  // get rid of macros from f2c.h -- these are dangerous.
+  #undef abs
+  #undef dabs
+  #undef min
+  #undef max
+  #undef dmin
+  #undef dmax
+  #undef bit_test
+  #undef bit_clear
+  #undef bit_set
+#else
+  #error "You need to define (using the preprocessor) either HAVE_CLAPACK or HAVE_ATLAS or HAVE_MKL (but not more than one)"  
+#endif
+
+#ifdef HAVE_OPENBLAS
+typedef int KaldiBlasInt; // try int.
+#endif
+#ifdef HAVE_CLAPACK
+typedef integer KaldiBlasInt;
+#endif
+#ifdef HAVE_MKL
+typedef MKL_INT KaldiBlasInt;
+#endif
+
+#ifdef HAVE_ATLAS
+// in this case there is no need for KaldiBlasInt-- this typedef is only needed
+// for Svd code which is not included in ATLAS (we re-implement it).
+#endif
+
+
+#endif  // KALDI_MATRIX_KALDI_BLAS_H_
diff --git a/kaldi_io/src/kaldi/matrix/kaldi-gpsr.h b/kaldi_io/src/kaldi/matrix/kaldi-gpsr.h
new file mode 100644
index 0000000..c294bdd
--- /dev/null
+++ b/kaldi_io/src/kaldi/matrix/kaldi-gpsr.h
@@ -0,0 +1,166 @@
+// matrix/kaldi-gpsr.h
+
+// Copyright 2012  Arnab Ghoshal
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef KALDI_MATRIX_KALDI_GPSR_H_
+#define KALDI_MATRIX_KALDI_GPSR_H_
+
+#include <string>
+#include <vector>
+
+#include "base/kaldi-common.h"
+#include "matrix/matrix-lib.h"
+#include "itf/options-itf.h"
+
+namespace kaldi {
+
+/// This is an implementation of the GPSR algorithm. See, Figueiredo, Nowak and
+/// Wright, "Gradient Projection for Sparse Reconstruction: Application to
+/// Compressed Sensing and Other Inverse Problems," IEEE Journal of Selected
+/// Topics in Signal Processing, vol. 1, no. 4, pp. 586-597, 2007.
+/// http://dx.doi.org/10.1109/JSTSP.2007.910281
+
+/// The GPSR algorithm, described in Figueiredo, et al., 2007, solves:
+/// \f[ \min_x 0.5 * ||y - Ax||_2^2 + \tau ||x||_1, \f]
+/// where \f$ x \in R^n, y \in R^k \f$, and \f$ A \in R^{n \times k} \f$.
+/// In this implementation, we solve:
+/// \f[ \min_x 0.5 * x^T H x - g^T x + \tau ||x||_1, \f]
+/// which is the more natural form in which such problems arise in our case.
+/// Here, \f$ H = A^T A \in R^{n \times n} \f$ and \f$ g = A^T y \in R^n \f$.
+
+
+/** \struct GpsrConfig
+ *  Configuration variables needed in the GPSR algorithm.
+ */
+struct GpsrConfig {
+  bool use_gpsr_bb;  ///< Use the Barzilai-Borwein gradient projection method
+
+  /// The following options are common to both the basic & Barzilai-Borwein
+  /// versions of GPSR
+  double stop_thresh;  ///< Stopping threshold
+  int32 max_iters;  ///< Maximum number of iterations
+  double gpsr_tau;  ///< Regularization scale
+  double alpha_min;  ///< Minimum step size in the feasible direction
+  double alpha_max;  ///< Maximum step size in the feasible direction
+  double max_sparsity;  ///< Maximum percentage of dimensions set to 0
+  double tau_reduction;  ///< Multiply tau by this if max_sparsity reached
+
+  /// The following options are for the backtracking line search in basic GPSR.
+  /// Step size reduction factor in backtracking line search. 0 < beta < 1
+  double gpsr_beta;
+  /// Improvement factor in backtracking line search, i.e. the new objective
+  /// function must be less than the old one by mu times the gradient in the
+  /// direction of the change in x. 0 < mu < 1
+  double gpsr_mu;
+  int32 max_iters_backtrak;  ///< Max iterations for backtracking line search
+
+  bool debias;  ///< Do debiasing, i.e. unconstrained optimization at the end
+  double stop_thresh_debias;  ///< Stopping threshold for debiasing stage
+  int32 max_iters_debias;  ///< Maximum number of iterations for debiasing stage
+
+  GpsrConfig() {
+    use_gpsr_bb = true;
+
+    stop_thresh = 0.005;
+    max_iters = 100;
+    gpsr_tau = 10;
+    alpha_min = 1.0e-10;
+    alpha_max = 1.0e+20;
+    max_sparsity = 0.9;
+    tau_reduction = 0.8;
+
+    gpsr_beta = 0.5;
+    gpsr_mu = 0.1;
+    max_iters_backtrak = 50;
+
+    debias = false;
+    stop_thresh_debias = 0.001;
+    max_iters_debias = 50;
+  }
+
+  void Register(OptionsItf *po);
+};
+
+inline void GpsrConfig::Register(OptionsItf *po) {
+  std::string module = "GpsrConfig: ";
+  po->Register("use-gpsr-bb", &use_gpsr_bb, module+
+               "Use the Barzilai-Borwein gradient projection method.");
+
+  po->Register("stop-thresh", &stop_thresh, module+
+               "Stopping threshold for GPSR.");
+  po->Register("max-iters", &max_iters, module+
+               "Maximum number of iterations of GPSR.");
+  po->Register("gpsr-tau", &gpsr_tau, module+
+               "Regularization scale for GPSR.");
+  po->Register("alpha-min", &alpha_min, module+
+               "Minimum step size in feasible direction.");
+  po->Register("alpha-max", &alpha_max, module+
+               "Maximum step size in feasible direction.");
+  po->Register("max-sparsity", &max_sparsity, module+
+               "Maximum percentage of dimensions set to 0.");
+  po->Register("tau-reduction", &tau_reduction, module+
+               "Multiply tau by this if maximum sparsity is reached.");
+
+  po->Register("gpsr-beta", &gpsr_beta, module+
+               "Step size reduction factor in backtracking line search (0<beta<1).");
+  po->Register("gpsr-mu", &gpsr_mu, module+
+               "Improvement factor in backtracking line search (0<mu<1).");
+  po->Register("max-iters-backtrack", &max_iters_backtrak, module+
+               "Maximum number of iterations of backtracking line search.");
+
+  po->Register("debias", &debias, module+
+               "Do final debiasing step.");
+  po->Register("stop-thresh-debias", &stop_thresh_debias, module+
+               "Stopping threshold for debiaisng step.");
+  po->Register("max-iters-debias", &max_iters_debias, module+
+               "Maximum number of iterations of debiasing.");
+}
+
+/// Solves a quadratic program in \f$ x \f$, with L_1 regularization:
+/// \f[ \min_x 0.5 * x^T H x - g^T x + \tau ||x||_1. \f]
+/// This is similar to SolveQuadraticProblem() in sp-matrix.h with an added
+/// L_1 term.
+template<typename Real>
+Real Gpsr(const GpsrConfig &opts, const SpMatrix<Real> &H,
+          const Vector<Real> &g, Vector<Real> *x,
+          const char *debug_str = "[unknown]") {
+  if (opts.use_gpsr_bb)
+    return GpsrBB(opts, H, g, x, debug_str);
+  else
+    return GpsrBasic(opts, H, g, x, debug_str);
+}
+
+/// This is the basic GPSR algorithm, where the step size is determined by a
+/// backtracking line search. The line search is called "Armijo rule along the
+/// projection arc" in Bertsekas, Nonlinear Programming, 2nd ed. page 230.
+template<typename Real>
+Real GpsrBasic(const GpsrConfig &opts, const SpMatrix<Real> &H,
+               const Vector<Real> &g, Vector<Real> *x,
+               const char *debug_str = "[unknown]");
+
+/// This is the paper calls the Barzilai-Borwein variant. This is a constrained
+/// Netwon's method where the Hessian is approximated by scaled identity matrix
+template<typename Real>
+Real GpsrBB(const GpsrConfig &opts, const SpMatrix<Real> &H,
+            const Vector<Real> &g, Vector<Real> *x,
+            const char *debug_str = "[unknown]");
+
+
+}  // namespace kaldi
+
+#endif  // KALDI_MATRIX_KALDI_GPSR_H_
diff --git a/kaldi_io/src/kaldi/matrix/kaldi-matrix-inl.h b/kaldi_io/src/kaldi/matrix/kaldi-matrix-inl.h
new file mode 100644
index 0000000..8bc4749
--- /dev/null
+++ b/kaldi_io/src/kaldi/matrix/kaldi-matrix-inl.h
@@ -0,0 +1,62 @@
+// matrix/kaldi-matrix-inl.h
+
+// Copyright 2009-2011  Microsoft Corporation;  Haihua Xu
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef KALDI_MATRIX_KALDI_MATRIX_INL_H_
+#define KALDI_MATRIX_KALDI_MATRIX_INL_H_ 1
+
+#include "matrix/kaldi-vector.h"
+
+namespace kaldi {
+
+/// Empty constructor
+template<typename Real>
+Matrix<Real>::Matrix(): MatrixBase<Real>(NULL, 0, 0, 0) { }
+
+
+template<>
+template<>
+void MatrixBase<float>::AddVecVec(const float alpha, const VectorBase<float> &ra, const VectorBase<float> &rb);
+
+template<>
+template<>
+void MatrixBase<double>::AddVecVec(const double alpha, const VectorBase<double> &ra, const VectorBase<double> &rb); 
+
+template<typename Real>
+inline std::ostream & operator << (std::ostream & os, const MatrixBase<Real> & M) {
+  M.Write(os, false);
+  return os;
+}
+
+template<typename Real>
+inline std::istream & operator >> (std::istream & is, Matrix<Real> & M) {
+  M.Read(is, false);
+  return is;
+}
+
+
+template<typename Real>
+inline std::istream & operator >> (std::istream & is, MatrixBase<Real> & M) {
+  M.Read(is, false);
+  return is;
+}
+
+}// namespace kaldi
+
+
+#endif  // KALDI_MATRIX_KALDI_MATRIX_INL_H_
diff --git a/kaldi_io/src/kaldi/matrix/kaldi-matrix.h b/kaldi_io/src/kaldi/matrix/kaldi-matrix.h
new file mode 100644
index 0000000..e6829e0
--- /dev/null
+++ b/kaldi_io/src/kaldi/matrix/kaldi-matrix.h
@@ -0,0 +1,983 @@
+// matrix/kaldi-matrix.h
+
+// Copyright 2009-2011  Ondrej Glembek;  Microsoft Corporation;  Lukas Burget;
+//                      Saarland University;  Petr Schwarz;  Yanmin Qian;
+//                      Karel Vesely;  Go Vivace Inc.;  Haihua Xu
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef KALDI_MATRIX_KALDI_MATRIX_H_
+#define KALDI_MATRIX_KALDI_MATRIX_H_ 1
+
+#include "matrix/matrix-common.h"
+
+namespace kaldi {
+
+/// @{ \addtogroup matrix_funcs_scalar
+
+/// We need to declare this here as it will be a friend function.
+/// tr(A B), or tr(A B^T).
+template<typename Real>
+Real TraceMatMat(const MatrixBase<Real> &A, const MatrixBase<Real> &B,
+                 MatrixTransposeType trans = kNoTrans);
+/// @}
+
+/// \addtogroup matrix_group
+/// @{
+
+/// Base class which provides matrix operations not involving resizing
+/// or allocation.   Classes Matrix and SubMatrix inherit from it and take care
+/// of allocation and resizing.
+template<typename Real>
+class MatrixBase {
+ public:
+  // so this child can access protected members of other instances.
+  friend class Matrix<Real>;
+  // friend declarations for CUDA matrices (see ../cudamatrix/)
+  friend class CuMatrixBase<Real>;
+  friend class CuMatrix<Real>;
+  friend class CuSubMatrix<Real>;
+  friend class CuPackedMatrix<Real>;
+  
+  friend class PackedMatrix<Real>;
+
+  /// Returns number of rows (or zero for emtpy matrix).
+  inline MatrixIndexT  NumRows() const { return num_rows_; }
+
+  /// Returns number of columns (or zero for emtpy matrix).
+  inline MatrixIndexT NumCols() const { return num_cols_; }
+
+  /// Stride (distance in memory between each row).  Will be >= NumCols.
+  inline MatrixIndexT Stride() const {  return stride_; }
+
+  /// Returns size in bytes of the data held by the matrix.
+  size_t  SizeInBytes() const {
+    return static_cast<size_t>(num_rows_) * static_cast<size_t>(stride_) *
+        sizeof(Real);
+  }
+
+  /// Gives pointer to raw data (const).
+  inline const Real* Data() const {
+    return data_;
+  }
+
+  /// Gives pointer to raw data (non-const).
+  inline Real* Data() { return data_; }
+
+  /// Returns pointer to data for one row (non-const)
+  inline  Real* RowData(MatrixIndexT i) {
+    KALDI_ASSERT(static_cast<UnsignedMatrixIndexT>(i) <
+                 static_cast<UnsignedMatrixIndexT>(num_rows_));
+    return data_ + i * stride_;
+  }
+
+  /// Returns pointer to data for one row (const)
+  inline const Real* RowData(MatrixIndexT i) const {
+    KALDI_ASSERT(static_cast<UnsignedMatrixIndexT>(i) <
+                 static_cast<UnsignedMatrixIndexT>(num_rows_));
+    return data_ + i * stride_;
+  }
+
+  /// Indexing operator, non-const
+  /// (only checks sizes if compiled with -DKALDI_PARANOID)
+  inline Real&  operator() (MatrixIndexT r, MatrixIndexT c) {
+    KALDI_PARANOID_ASSERT(static_cast<UnsignedMatrixIndexT>(r) <
+                          static_cast<UnsignedMatrixIndexT>(num_rows_) &&
+                          static_cast<UnsignedMatrixIndexT>(c) <
+                          static_cast<UnsignedMatrixIndexT>(num_cols_));
+    return *(data_ + r * stride_ + c);
+  }
+  /// Indexing operator, provided for ease of debugging (gdb doesn't work
+  /// with parenthesis operator).
+  Real &Index (MatrixIndexT r, MatrixIndexT c) {  return (*this)(r, c); }
+  
+  /// Indexing operator, const
+  /// (only checks sizes if compiled with -DKALDI_PARANOID)
+  inline const Real operator() (MatrixIndexT r, MatrixIndexT c) const {
+    KALDI_PARANOID_ASSERT(static_cast<UnsignedMatrixIndexT>(r) <
+                          static_cast<UnsignedMatrixIndexT>(num_rows_) &&
+                          static_cast<UnsignedMatrixIndexT>(c) <
+                          static_cast<UnsignedMatrixIndexT>(num_cols_));
+    return *(data_ + r * stride_ + c);
+  }
+
+  /*   Basic setting-to-special values functions. */
+
+  /// Sets matrix to zero.
+  void SetZero();
+  /// Sets all elements to a specific value.
+  void Set(Real);
+  /// Sets to zero, except ones along diagonal [for non-square matrices too]
+  void SetUnit();
+  /// Sets to random values of a normal distribution
+  void SetRandn();
+  /// Sets to numbers uniformly distributed on (0, 1)
+  void SetRandUniform();
+
+  /*  Copying functions.  These do not resize the matrix! */
+
+
+  /// Copy given matrix. (no resize is done).
+  template<typename OtherReal>
+  void CopyFromMat(const MatrixBase<OtherReal> & M,
+                   MatrixTransposeType trans = kNoTrans);
+
+  /// Copy from compressed matrix.
+  void CopyFromMat(const CompressedMatrix &M);
+  
+  /// Copy given spmatrix. (no resize is done).
+  template<typename OtherReal>
+  void CopyFromSp(const SpMatrix<OtherReal> &M);
+
+  /// Copy given tpmatrix. (no resize is done).
+  template<typename OtherReal>
+  void CopyFromTp(const TpMatrix<OtherReal> &M,
+                  MatrixTransposeType trans = kNoTrans);
+  
+  /// Copy from CUDA matrix.  Implemented in ../cudamatrix/cu-matrix.h
+  template<typename OtherReal>  
+  void CopyFromMat(const CuMatrixBase<OtherReal> &M,
+                   MatrixTransposeType trans = kNoTrans);
+
+  /// Inverse of vec() operator. Copies vector into matrix, row-by-row.
+  /// Note that rv.Dim() must either equal NumRows()*NumCols() or
+  /// NumCols()-- this has two modes of operation.
+  void CopyRowsFromVec(const VectorBase<Real> &v);
+
+  /// This version of CopyRowsFromVec is implemented in ../cudamatrix/cu-vector.cc
+  void CopyRowsFromVec(const CuVectorBase<Real> &v);
+  
+  template<typename OtherReal>
+  void CopyRowsFromVec(const VectorBase<OtherReal> &v);
+
+  /// Copies vector into matrix, column-by-column.
+  /// Note that rv.Dim() must either equal NumRows()*NumCols() or NumRows();
+  /// this has two modes of operation.
+  void CopyColsFromVec(const VectorBase<Real> &v);
+  
+  /// Copy vector into specific column of matrix.
+  void CopyColFromVec(const VectorBase<Real> &v, const MatrixIndexT col);
+  /// Copy vector into specific row of matrix.
+  void CopyRowFromVec(const VectorBase<Real> &v, const MatrixIndexT row);
+  /// Copy vector into diagonal of matrix.
+  void CopyDiagFromVec(const VectorBase<Real> &v);
+
+  /* Accessing of sub-parts of the matrix. */
+
+  /// Return specific row of matrix [const].
+  inline const SubVector<Real> Row(MatrixIndexT i) const {
+    KALDI_ASSERT(static_cast<UnsignedMatrixIndexT>(i) <
+                 static_cast<UnsignedMatrixIndexT>(num_rows_));
+    return SubVector<Real>(data_ + (i * stride_), NumCols());
+  }
+
+  /// Return specific row of matrix.
+  inline SubVector<Real> Row(MatrixIndexT i) {
+    KALDI_ASSERT(static_cast<UnsignedMatrixIndexT>(i) <
+                 static_cast<UnsignedMatrixIndexT>(num_rows_));
+    return SubVector<Real>(data_ + (i * stride_), NumCols());
+  }
+
+  /// Return a sub-part of matrix.
+  inline SubMatrix<Real> Range(const MatrixIndexT row_offset,
+                               const MatrixIndexT num_rows,
+                               const MatrixIndexT col_offset,
+                               const MatrixIndexT num_cols) const {
+    return SubMatrix<Real>(*this, row_offset, num_rows,
+                           col_offset, num_cols);
+  }
+  inline SubMatrix<Real> RowRange(const MatrixIndexT row_offset,
+                                  const MatrixIndexT num_rows) const {
+    return SubMatrix<Real>(*this, row_offset, num_rows, 0, num_cols_);
+  }  
+  inline SubMatrix<Real> ColRange(const MatrixIndexT col_offset,
+                                  const MatrixIndexT num_cols) const {
+    return SubMatrix<Real>(*this, 0, num_rows_, col_offset, num_cols);
+  }  
+
+  /* Various special functions. */
+  /// Returns sum of all elements in matrix.
+  Real Sum() const;
+  /// Returns trace of matrix.
+  Real Trace(bool check_square = true) const;
+  // If check_square = true, will crash if matrix is not square.
+
+  /// Returns maximum element of matrix.
+  Real Max() const;
+  /// Returns minimum element of matrix.
+  Real Min() const;
+
+  /// Element by element multiplication with a given matrix.
+  void MulElements(const MatrixBase<Real> &A);
+
+  /// Divide each element by the corresponding element of a given matrix.
+  void DivElements(const MatrixBase<Real> &A);
+
+  /// Multiply each element with a scalar value.
+  void Scale(Real alpha);
+
+  /// Set, element-by-element, *this = max(*this, A)
+  void Max(const MatrixBase<Real> &A);
+
+  /// Equivalent to (*this) = (*this) * diag(scale).  Scaling
+  /// each column by a scalar taken from that dimension of the vector.
+  void MulColsVec(const VectorBase<Real> &scale);
+
+  /// Equivalent to (*this) = diag(scale) * (*this).  Scaling
+  /// each row by a scalar taken from that dimension of the vector.
+  void MulRowsVec(const VectorBase<Real> &scale);
+
+  /// Divide each row into src.NumCols() equal groups, and then scale i'th row's
+  /// j'th group of elements by src(i, j).  Requires src.NumRows() ==
+  /// this->NumRows() and this->NumCols() % src.NumCols() == 0.
+  void MulRowsGroupMat(const MatrixBase<Real> &src);
+    
+  /// Returns logdet of matrix.
+  Real LogDet(Real *det_sign = NULL) const;
+  
+  /// matrix inverse.
+  /// if inverse_needed = false, will fill matrix with garbage.
+  /// (only useful if logdet wanted).
+  void Invert(Real *log_det = NULL, Real *det_sign = NULL,
+              bool inverse_needed = true);
+  /// matrix inverse [double].
+  /// if inverse_needed = false, will fill matrix with garbage
+  /// (only useful if logdet wanted).
+  /// Does inversion in double precision even if matrix was not double.
+  void InvertDouble(Real *LogDet = NULL, Real *det_sign = NULL,
+                      bool inverse_needed = true);
+
+  /// Inverts all the elements of the matrix
+  void InvertElements();
+
+  /// Transpose the matrix.  This one is only
+  /// applicable to square matrices (the one in the
+  /// Matrix child class works also for non-square.
+  void Transpose();
+
+  /// Copies column r from column indices[r] of src.
+  /// As a special case, if indexes[i] == -1, sets column i to zero
+  /// indices.size() must equal this->NumCols(),
+  /// all elements of "reorder" must be in [-1, src.NumCols()-1],
+  /// and src.NumRows() must equal this.NumRows()
+  void CopyCols(const MatrixBase<Real> &src,
+                const std::vector<MatrixIndexT> &indices);
+
+  /// Copies row r from row indices[r] of src.
+  /// As a special case, if indexes[i] == -1, sets row i to zero
+  /// "reorder".size() must equal this->NumRows(),
+  /// all elements of "reorder" must be in [-1, src.NumRows()-1],
+  /// and src.NumCols() must equal this.NumCols()
+  void CopyRows(const MatrixBase<Real> &src,
+                const std::vector<MatrixIndexT> &indices);
+  
+  /// Applies floor to all matrix elements
+  void ApplyFloor(Real floor_val);
+
+  /// Applies floor to all matrix elements
+  void ApplyCeiling(Real ceiling_val);
+
+  /// Calculates log of all the matrix elemnts
+  void ApplyLog();
+
+  /// Exponentiate each of the elements.
+  void ApplyExp();
+
+  /// Applies power to all matrix elements
+  void ApplyPow(Real power);
+
+  /// Apply power to the absolute value of each element. 
+  /// Include the sign of the input element if include_sign == true.
+  /// If the power is negative and the input to the power is zero,
+  /// The output will be set zero.
+  void ApplyPowAbs(Real power, bool include_sign=false);
+  
+  /// Applies the Heaviside step function (x > 0 ? 1 : 0) to all matrix elements
+  /// Note: in general you can make different choices for x = 0, but for now
+  /// please leave it as it (i.e. returning zero) because it affects the
+  /// RectifiedLinearComponent in the neural net code.
+  void ApplyHeaviside();
+  
+  /// Eigenvalue Decomposition of a square NxN matrix into the form (*this) = P D
+  /// P^{-1}.  Be careful: the relationship of D to the eigenvalues we output is
+  /// slightly complicated, due to the need for P to be real.  In the symmetric
+  /// case D is diagonal and real, but in
+  /// the non-symmetric case there may be complex-conjugate pairs of eigenvalues.
+  /// In this case, for the equation (*this) = P D P^{-1} to hold, D must actually
+  /// be block diagonal, with 2x2 blocks corresponding to any such pairs.  If a
+  /// pair is lambda +- i*mu, D will have a corresponding 2x2 block
+  /// [lambda, mu; -mu, lambda].
+  /// Note that if the input matrix (*this) is non-invertible, P may not be invertible
+  /// so in this case instead of the equation (*this) = P D P^{-1} holding, we have
+  /// instead (*this) P = P D.
+  ///
+  /// The non-member function CreateEigenvalueMatrix creates D from eigs_real and eigs_imag.
+  void Eig(MatrixBase<Real> *P,
+           VectorBase<Real> *eigs_real,
+           VectorBase<Real> *eigs_imag) const;
+
+  /// The Power method attempts to take the matrix to a power using a method that
+  /// works in general for fractional and negative powers.  The input matrix must
+  /// be invertible and have reasonable condition (or we don't guarantee the
+  /// results.  The method is based on the eigenvalue decomposition.  It will
+  /// return false and leave the matrix unchanged, if at entry the matrix had
+  /// real negative eigenvalues (or if it had zero eigenvalues and the power was
+  /// negative).
+  bool Power(Real pow);
+
+  /** Singular value decomposition
+     Major limitations:
+     For nonsquare matrices, we assume m>=n (NumRows >= NumCols), and we return
+     the "skinny" Svd, i.e. the matrix in the middle is diagonal, and the
+     one on the left is rectangular.
+
+     In Svd, *this = U*diag(S)*Vt.
+     Null pointers for U and/or Vt at input mean we do not want that output.  We
+     expect that S.Dim() == m, U is either NULL or m by n,
+     and v is either NULL or n by n.
+     The singular values are not sorted (use SortSvd for that).  */
+  void DestructiveSvd(VectorBase<Real> *s, MatrixBase<Real> *U,
+                      MatrixBase<Real> *Vt);  // Destroys calling matrix.
+
+  /// Compute SVD (*this) = U diag(s) Vt.   Note that the V in the call is already
+  /// transposed; the normal formulation is U diag(s) V^T.
+  /// Null pointers for U or V mean we don't want that output (this saves
+  /// compute).  The singular values are not sorted (use SortSvd for that).
+  void Svd(VectorBase<Real> *s, MatrixBase<Real> *U,
+           MatrixBase<Real> *Vt) const;
+  /// Compute SVD but only retain the singular values.
+  void Svd(VectorBase<Real> *s) const { Svd(s, NULL, NULL); }
+
+
+  /// Returns smallest singular value.
+  Real MinSingularValue() const {
+    Vector<Real> tmp(std::min(NumRows(), NumCols()));
+    Svd(&tmp);
+    return tmp.Min();
+  }
+
+  void TestUninitialized() const; // This function is designed so that if any element
+  // if the matrix is uninitialized memory, valgrind will complain.
+  
+  /// Returns condition number by computing Svd.  Works even if cols > rows.
+  /// Returns infinity if all singular values are zero.
+  Real Cond() const;
+
+  /// Returns true if matrix is Symmetric.
+  bool IsSymmetric(Real cutoff = 1.0e-05) const;  // replace magic number
+
+  /// Returns true if matrix is Diagonal.
+  bool IsDiagonal(Real cutoff = 1.0e-05) const;  // replace magic number
+
+  /// Returns true if the matrix is all zeros, except for ones on diagonal.  (it
+  /// does not have to be square).  More specifically, this function returns
+  /// false if for any i, j, (*this)(i, j) differs by more than cutoff from the
+  /// expression (i == j ? 1 : 0).
+  bool IsUnit(Real cutoff = 1.0e-05) const;     // replace magic number
+
+  /// Returns true if matrix is all zeros.
+  bool IsZero(Real cutoff = 1.0e-05) const;     // replace magic number
+
+  /// Frobenius norm, which is the sqrt of sum of square elements.  Same as Schatten 2-norm,
+  /// or just "2-norm".
+  Real FrobeniusNorm() const;
+
+  /// Returns true if ((*this)-other).FrobeniusNorm()
+  /// <= tol * (*this).FrobeniusNorm().
+  bool ApproxEqual(const MatrixBase<Real> &other, float tol = 0.01) const;
+
+  /// Tests for exact equality.  It's usually preferable to use ApproxEqual.
+  bool Equal(const MatrixBase<Real> &other) const;
+
+  /// largest absolute value.
+  Real LargestAbsElem() const;  // largest absolute value.
+
+  /// Returns log(sum(exp())) without exp overflow
+  /// If prune > 0.0, it uses a pruning beam, discarding
+  /// terms less than (max - prune).  Note: in future
+  /// we may change this so that if prune = 0.0, it takes
+  /// the max, so use -1 if you don't want to prune.
+  Real LogSumExp(Real prune = -1.0) const;
+
+  /// Apply soft-max to the collection of all elements of the
+  /// matrix and return normalizer (log sum of exponentials).
+  Real ApplySoftMax();
+  
+  /// Set each element to the sigmoid of the corresponding element of "src".
+  void Sigmoid(const MatrixBase<Real> &src);
+
+  /// Set each element to y = log(1 + exp(x))
+  void SoftHinge(const MatrixBase<Real> &src);
+  
+  /// Apply the function y(i) = (sum_{j = i*G}^{(i+1)*G-1} x_j^(power))^(1 / p).
+  /// Requires src.NumRows() == this->NumRows() and  src.NumCols() % this->NumCols() == 0.
+  void GroupPnorm(const MatrixBase<Real> &src, Real power);
+
+
+  /// Calculate derivatives for the GroupPnorm function above...
+  /// if "input" is the input to the GroupPnorm function above (i.e. the "src" variable),
+  /// and "output" is the result of the computation (i.e. the "this" of that function
+  /// call), and *this has the same dimension as "input", then it sets each element
+  /// of *this to the derivative d(output-elem)/d(input-elem) for each element of "input", where
+  /// "output-elem" is whichever element of output depends on that input element.
+  void GroupPnormDeriv(const MatrixBase<Real> &input, const MatrixBase<Real> &output,
+                       Real power);
+
+
+  /// Set each element to the tanh of the corresponding element of "src".
+  void Tanh(const MatrixBase<Real> &src);
+
+  // Function used in backpropagating derivatives of the sigmoid function:
+  // element-by-element, set *this = diff * value * (1.0 - value).
+  void DiffSigmoid(const MatrixBase<Real> &value,
+                   const MatrixBase<Real> &diff);
+
+  // Function used in backpropagating derivatives of the tanh function:
+  // element-by-element, set *this = diff * (1.0 - value^2).
+  void DiffTanh(const MatrixBase<Real> &value,
+                const MatrixBase<Real> &diff);
+  
+  /** Uses Svd to compute the eigenvalue decomposition of a symmetric positive
+   * semi-definite matrix: (*this) = rP * diag(rS) * rP^T, with rP an
+   * orthogonal matrix so rP^{-1} = rP^T.   Throws exception if input was not
+   * positive semi-definite (check_thresh controls how stringent the check is;
+   * set it to 2 to ensure it won't ever complain, but it will zero out negative
+   * dimensions in your matrix.
+  */
+  void SymPosSemiDefEig(VectorBase<Real> *s, MatrixBase<Real> *P,
+                        Real check_thresh = 0.001);
+
+  friend Real kaldi::TraceMatMat<Real>(const MatrixBase<Real> &A,
+      const MatrixBase<Real> &B, MatrixTransposeType trans);  // tr (A B)
+
+  // so it can get around const restrictions on the pointer to data_.
+  friend class SubMatrix<Real>;
+
+  /// Add a scalar to each element
+  void Add(const Real alpha);
+
+  /// Add a scalar to each diagonal element.
+  void AddToDiag(const Real alpha);
+
+  /// *this += alpha * a * b^T
+  template<typename OtherReal>
+  void AddVecVec(const Real alpha, const VectorBase<OtherReal> &a,
+                 const VectorBase<OtherReal> &b);
+
+  /// [each row of *this] += alpha * v
+  template<typename OtherReal>
+  void AddVecToRows(const Real alpha, const VectorBase<OtherReal> &v);
+  
+  /// [each col of *this] += alpha * v
+  template<typename OtherReal>
+  void AddVecToCols(const Real alpha, const VectorBase<OtherReal> &v);      
+  
+  /// *this += alpha * M [or M^T]
+  void AddMat(const Real alpha, const MatrixBase<Real> &M,
+              MatrixTransposeType transA = kNoTrans);
+
+  /// *this = beta * *this + alpha * M M^T, for symmetric matrices.  It only
+  /// updates the lower triangle of *this.  It will leave the matrix asymmetric;
+  /// if you need it symmetric as a regular matrix, do CopyLowerToUpper().
+  void SymAddMat2(const Real alpha, const MatrixBase<Real> &M,
+                  MatrixTransposeType transA, Real beta);
+
+  /// *this = beta * *this + alpha * diag(v) * M [or M^T].
+  /// The same as adding M but scaling each row M_i by v(i).
+  void AddDiagVecMat(const Real alpha, VectorBase<Real> &v,
+                     const MatrixBase<Real> &M, MatrixTransposeType transM, 
+                     Real beta = 1.0);
+ 
+  /// *this = beta * *this + alpha * M [or M^T] * diag(v) 
+  /// The same as adding M but scaling each column M_j by v(j).
+  void AddMatDiagVec(const Real alpha, 
+                     const MatrixBase<Real> &M, MatrixTransposeType transM, 
+                     VectorBase<Real> &v,
+                     Real beta = 1.0);
+
+  /// *this = beta * *this + alpha * A .* B (.* element by element multiplication)
+  void AddMatMatElements(const Real alpha,
+                         const MatrixBase<Real>& A,
+                         const MatrixBase<Real>& B,
+                         const Real beta);
+  
+  /// *this += alpha * S
+  template<typename OtherReal>
+  void AddSp(const Real alpha, const SpMatrix<OtherReal> &S);
+
+  void AddMatMat(const Real alpha,
+                 const MatrixBase<Real>& A, MatrixTransposeType transA,
+                 const MatrixBase<Real>& B, MatrixTransposeType transB,
+                 const Real beta);
+ 
+  /// *this = a * b / c (by element; when c = 0, *this = a)
+  void AddMatMatDivMat(const MatrixBase<Real>& A,
+                        const MatrixBase<Real>& B,
+                       const MatrixBase<Real>& C);
+
+  /// A version of AddMatMat specialized for when the second argument
+  /// contains a lot of zeroes.
+  void AddMatSmat(const Real alpha,
+                  const MatrixBase<Real>& A, MatrixTransposeType transA,
+                  const MatrixBase<Real>& B, MatrixTransposeType transB,
+                  const Real beta);
+
+  /// A version of AddMatMat specialized for when the first argument
+  /// contains a lot of zeroes.  
+  void AddSmatMat(const Real alpha,
+                  const MatrixBase<Real>& A, MatrixTransposeType transA,
+                  const MatrixBase<Real>& B, MatrixTransposeType transB,
+                  const Real beta);
+
+  /// this <-- beta*this + alpha*A*B*C.
+  void AddMatMatMat(const Real alpha,
+                    const MatrixBase<Real>& A, MatrixTransposeType transA,
+                    const MatrixBase<Real>& B, MatrixTransposeType transB,
+                    const MatrixBase<Real>& C, MatrixTransposeType transC,
+                    const Real beta);
+
+  /// this <-- beta*this + alpha*SpA*B.
+  // This and the routines below are really
+  // stubs that need to be made more efficient.
+  void AddSpMat(const Real alpha,
+                const SpMatrix<Real>& A,
+                const MatrixBase<Real>& B, MatrixTransposeType transB,
+                const Real beta) {
+    Matrix<Real> M(A);
+    return AddMatMat(alpha, M, kNoTrans, B, transB, beta);
+  }
+  /// this <-- beta*this + alpha*A*B.
+  void AddTpMat(const Real alpha,
+                const TpMatrix<Real>& A, MatrixTransposeType transA,
+                const MatrixBase<Real>& B, MatrixTransposeType transB,
+                const Real beta) {
+    Matrix<Real> M(A);
+    return AddMatMat(alpha, M, transA, B, transB, beta);
+  }
+  /// this <-- beta*this + alpha*A*B.
+  void AddMatSp(const Real alpha,
+                const MatrixBase<Real>& A, MatrixTransposeType transA,
+                const SpMatrix<Real>& B,
+                const Real beta) {
+    Matrix<Real> M(B);
+    return AddMatMat(alpha, A, transA, M, kNoTrans, beta);
+  }
+  /// this <-- beta*this + alpha*A*B*C.
+  void AddSpMatSp(const Real alpha,
+                  const SpMatrix<Real> &A,
+                  const MatrixBase<Real>& B, MatrixTransposeType transB,
+                  const SpMatrix<Real>& C,
+                const Real beta) {
+    Matrix<Real> M(A), N(C);
+    return AddMatMatMat(alpha, M, kNoTrans, B, transB, N, kNoTrans, beta);
+  }
+  /// this <-- beta*this + alpha*A*B.
+  void AddMatTp(const Real alpha,
+                const MatrixBase<Real>& A, MatrixTransposeType transA,
+                const TpMatrix<Real>& B, MatrixTransposeType transB,
+                const Real beta) {
+    Matrix<Real> M(B);
+    return AddMatMat(alpha, A, transA, M, transB, beta);
+  }
+
+  /// this <-- beta*this + alpha*A*B.
+  void AddTpTp(const Real alpha,
+               const TpMatrix<Real>& A, MatrixTransposeType transA,
+               const TpMatrix<Real>& B, MatrixTransposeType transB,
+               const Real beta) {
+    Matrix<Real> M(A), N(B);
+    return AddMatMat(alpha, M, transA, N, transB, beta);
+  }
+
+  /// this <-- beta*this + alpha*A*B.
+  // This one is more efficient, not like the others above.
+  void AddSpSp(const Real alpha,
+               const SpMatrix<Real>& A, const SpMatrix<Real>& B,
+               const Real beta);
+
+  /// Copy lower triangle to upper triangle (symmetrize)
+  void CopyLowerToUpper();
+
+  /// Copy upper triangle to lower triangle (symmetrize)
+  void CopyUpperToLower();
+  
+  /// This function orthogonalizes the rows of a matrix using the Gram-Schmidt
+  /// process.  It is only applicable if NumRows() <= NumCols().  It will use
+  /// random number generation to fill in rows with something nonzero, in cases
+  /// where the original matrix was of deficient row rank.
+  void OrthogonalizeRows();
+
+  /// stream read.
+  /// Use instead of stream<<*this, if you want to add to existing contents.
+  // Will throw exception on failure.
+  void Read(std::istream & in, bool binary, bool add = false);
+  /// write to stream.
+  void Write(std::ostream & out, bool binary) const;
+
+  // Below is internal methods for Svd, user does not have to know about this.
+#if !defined(HAVE_ATLAS) && !defined(USE_KALDI_SVD)
+  // protected:
+  // Should be protected but used directly in testing routine.
+  // destroys *this!
+  void LapackGesvd(VectorBase<Real> *s, MatrixBase<Real> *U,
+                     MatrixBase<Real> *Vt);
+#else
+ protected:
+  // destroys *this!
+  bool JamaSvd(VectorBase<Real> *s, MatrixBase<Real> *U,
+               MatrixBase<Real> *V);
+
+#endif
+ protected:
+
+  ///  Initializer, callable only from child.
+  explicit MatrixBase(Real *data, MatrixIndexT cols, MatrixIndexT rows, MatrixIndexT stride) :
+    data_(data), num_cols_(cols), num_rows_(rows), stride_(stride) {
+    KALDI_ASSERT_IS_FLOATING_TYPE(Real);
+  }
+
+  ///  Initializer, callable only from child.
+  /// Empty initializer, for un-initialized matrix.
+  explicit MatrixBase(): data_(NULL) {
+    KALDI_ASSERT_IS_FLOATING_TYPE(Real);
+  }
+
+  // Make sure pointers to MatrixBase cannot be deleted.
+  ~MatrixBase() { }
+
+  /// A workaround that allows SubMatrix to get a pointer to non-const data
+  /// for const Matrix. Unfortunately C++ does not allow us to declare a
+  /// "public const" inheritance or anything like that, so it would require
+  /// a lot of work to make the SubMatrix class totally const-correct--
+  /// we would have to override many of the Matrix functions.
+  inline Real*  Data_workaround() const {
+    return data_;
+  }
+
+  /// data memory area
+  Real*   data_;
+
+  /// these atributes store the real matrix size as it is stored in memory
+  /// including memalignment
+  MatrixIndexT    num_cols_;   /// < Number of columns
+  MatrixIndexT    num_rows_;   /// < Number of rows
+  /** True number of columns for the internal matrix. This number may differ
+   * from num_cols_ as memory alignment might be used. */
+  MatrixIndexT    stride_;
+ private:
+  KALDI_DISALLOW_COPY_AND_ASSIGN(MatrixBase);
+};
+
+/// A class for storing matrices.
+template<typename Real>
+class Matrix : public MatrixBase<Real> {
+ public:
+
+  /// Empty constructor.
+  Matrix();
+
+  /// Basic constructor.  Sets to zero by default.
+  /// if set_zero == false, memory contents are undefined.
+  Matrix(const MatrixIndexT r, const MatrixIndexT c,
+         MatrixResizeType resize_type = kSetZero):
+      MatrixBase<Real>() { Resize(r, c, resize_type); }
+  
+  /// Copy constructor from CUDA matrix
+  /// This is defined in ../cudamatrix/cu-matrix.h
+  template<typename OtherReal>
+  explicit Matrix(const CuMatrixBase<OtherReal> &cu,
+                  MatrixTransposeType trans = kNoTrans);
+
+
+  /// Swaps the contents of *this and *other.  Shallow swap.
+  void Swap(Matrix<Real> *other);
+
+  /// Defined in ../cudamatrix/cu-matrix.cc
+  void Swap(CuMatrix<Real> *mat);
+
+  /// Constructor from any MatrixBase. Can also copy with transpose.
+  /// Allocates new memory.
+  explicit Matrix(const MatrixBase<Real> & M,
+                  MatrixTransposeType trans = kNoTrans);
+  
+  /// Same as above, but need to avoid default copy constructor.
+  Matrix(const Matrix<Real> & M);  //  (cannot make explicit)
+
+  /// Copy constructor: as above, but from another type.
+  template<typename OtherReal>
+  explicit Matrix(const MatrixBase<OtherReal> & M,
+                    MatrixTransposeType trans = kNoTrans);
+
+  /// Copy constructor taking SpMatrix...
+  /// It is symmetric, so no option for transpose, and NumRows == Cols
+  template<typename OtherReal>
+  explicit Matrix(const SpMatrix<OtherReal> & M) : MatrixBase<Real>() {
+    Resize(M.NumRows(), M.NumRows(), kUndefined);
+    this->CopyFromSp(M);
+  }
+
+  /// Constructor from CompressedMatrix
+  explicit Matrix(const CompressedMatrix &C);
+  
+  /// Copy constructor taking TpMatrix...
+  template <typename OtherReal>
+  explicit Matrix(const TpMatrix<OtherReal> & M,
+                  MatrixTransposeType trans = kNoTrans) : MatrixBase<Real>() {
+    if (trans == kNoTrans) {
+      Resize(M.NumRows(), M.NumCols(), kUndefined);
+      this->CopyFromTp(M);
+    } else {
+      Resize(M.NumCols(), M.NumRows(), kUndefined);
+      this->CopyFromTp(M, kTrans);
+    }
+  }
+
+  /// read from stream.
+  // Unlike one in base, allows resizing.
+  void Read(std::istream & in, bool binary, bool add = false);
+
+  /// Remove a specified row.
+  void RemoveRow(MatrixIndexT i);
+  
+  /// Transpose the matrix.  Works for non-square
+  /// matrices as well as square ones.
+  void Transpose();
+
+  /// Distructor to free matrices.
+  ~Matrix() { Destroy(); }
+
+  /// Sets matrix to a specified size (zero is OK as long as both r and c are
+  /// zero).  The value of the new data depends on resize_type:
+  ///   -if kSetZero, the new data will be zero
+  ///   -if kUndefined, the new data will be undefined
+  ///   -if kCopyData, the new data will be the same as the old data in any
+  ///      shared positions, and zero elsewhere.
+  /// This function takes time proportional to the number of data elements.
+  void Resize(const MatrixIndexT r,
+              const MatrixIndexT c,
+              MatrixResizeType resize_type = kSetZero);
+
+  /// Assignment operator that takes MatrixBase.
+  Matrix<Real> &operator = (const MatrixBase<Real> &other) {
+    if (MatrixBase<Real>::NumRows() != other.NumRows() ||
+        MatrixBase<Real>::NumCols() != other.NumCols())
+      Resize(other.NumRows(), other.NumCols(), kUndefined);
+    MatrixBase<Real>::CopyFromMat(other);
+    return *this;
+  }
+
+  /// Assignment operator. Needed for inclusion in std::vector.
+  Matrix<Real> &operator = (const Matrix<Real> &other) {
+    if (MatrixBase<Real>::NumRows() != other.NumRows() ||
+        MatrixBase<Real>::NumCols() != other.NumCols())
+      Resize(other.NumRows(), other.NumCols(), kUndefined);
+    MatrixBase<Real>::CopyFromMat(other);
+    return *this;
+  }
+  
+
+ private:
+  /// Deallocates memory and sets to empty matrix (dimension 0, 0).
+  void Destroy();
+  
+  /// Init assumes the current class contents are invalid (i.e. junk or have
+  /// already been freed), and it sets the matrix to newly allocated memory with
+  /// the specified number of rows and columns.  r == c == 0 is acceptable.  The data
+  /// memory contents will be undefined.
+  void Init(const MatrixIndexT r,
+            const MatrixIndexT c);
+
+};
+/// @} end "addtogroup matrix_group"
+
+/// \addtogroup matrix_funcs_io
+/// @{
+
+/// A structure containing the HTK header.
+/// [TODO: change the style of the variables to Kaldi-compliant]
+struct HtkHeader {
+  /// Number of samples.
+  int32    mNSamples;
+  /// Sample period.
+  int32    mSamplePeriod;
+  /// Sample size
+  int16    mSampleSize;
+  /// Sample kind.
+  uint16   mSampleKind;
+};
+
+// Read HTK formatted features from file into matrix.
+template<typename Real>
+bool ReadHtk(std::istream &is, Matrix<Real> *M, HtkHeader *header_ptr);
+
+// Write (HTK format) features to file from matrix.
+template<typename Real>
+bool WriteHtk(std::ostream &os, const MatrixBase<Real> &M, HtkHeader htk_hdr);
+
+// Write (CMUSphinx format) features to file from matrix.
+template<typename Real>
+bool WriteSphinx(std::ostream &os, const MatrixBase<Real> &M);
+
+/// @} end of "addtogroup matrix_funcs_io"
+
+/**
+  Sub-matrix representation.
+  Can work with sub-parts of a matrix using this class.
+  Note that SubMatrix is not very const-correct-- it allows you to
+  change the contents of a const Matrix.  Be careful!
+*/
+
+template<typename Real>
+class SubMatrix : public MatrixBase<Real> {
+ public:
+  // Initialize a SubMatrix from part of a matrix; this is
+  // a bit like A(b:c, d:e) in Matlab.
+  // This initializer is against the proper semantics of "const", since
+  // SubMatrix can change its contents.  It would be hard to implement
+  // a "const-safe" version of this class.
+  SubMatrix(const MatrixBase<Real>& T,
+            const MatrixIndexT ro,  // row offset, 0 < ro < NumRows()
+            const MatrixIndexT r,   // number of rows, r > 0
+            const MatrixIndexT co,  // column offset, 0 < co < NumCols()
+            const MatrixIndexT c);   // number of columns, c > 0
+  
+  // This initializer is mostly intended for use in CuMatrix and related
+  // classes.  Be careful!
+  SubMatrix(Real *data,
+            MatrixIndexT num_rows,
+            MatrixIndexT num_cols,
+            MatrixIndexT stride);
+  
+  ~SubMatrix<Real>() {}
+  
+  /// This type of constructor is needed for Range() to work [in Matrix base
+  /// class]. Cannot make it explicit.
+  SubMatrix<Real> (const SubMatrix &other):
+  MatrixBase<Real> (other.data_, other.num_cols_, other.num_rows_,
+                    other.stride_) {}
+
+ private:
+  /// Disallow assignment.
+  SubMatrix<Real> &operator = (const SubMatrix<Real> &other);
+};
+/// @} End of "addtogroup matrix_funcs_io".
+
+/// \addtogroup matrix_funcs_scalar
+/// @{
+
+// Some declarations.  These are traces of products.
+
+
+template<typename Real>
+bool ApproxEqual(const MatrixBase<Real> &A,
+                 const MatrixBase<Real> &B, Real tol = 0.01) {
+  return A.ApproxEqual(B, tol);
+}
+
+template<typename Real>
+inline void AssertEqual(const MatrixBase<Real> &A, const MatrixBase<Real> &B,
+                        float tol = 0.01) {
+  KALDI_ASSERT(A.ApproxEqual(B, tol));
+}
+
+/// Returns trace of matrix.
+template <typename Real>
+double TraceMat(const MatrixBase<Real> &A) { return A.Trace(); }
+
+
+/// Returns tr(A B C)
+template <typename Real>
+Real TraceMatMatMat(const MatrixBase<Real> &A, MatrixTransposeType transA,
+                      const MatrixBase<Real> &B, MatrixTransposeType transB,
+                      const MatrixBase<Real> &C, MatrixTransposeType transC);
+
+/// Returns tr(A B C D)
+template <typename Real>
+Real TraceMatMatMatMat(const MatrixBase<Real> &A, MatrixTransposeType transA,
+                         const MatrixBase<Real> &B, MatrixTransposeType transB,
+                         const MatrixBase<Real> &C, MatrixTransposeType transC,
+                         const MatrixBase<Real> &D, MatrixTransposeType transD);
+
+/// @} end "addtogroup matrix_funcs_scalar"
+
+
+/// \addtogroup matrix_funcs_misc
+/// @{
+
+
+/// Function to ensure that SVD is sorted.  This function is made as generic as
+/// possible, to be applicable to other types of problems.  s->Dim() should be
+/// the same as U->NumCols(), and we sort s from greatest to least absolute
+/// value (if sort_on_absolute_value == true) or greatest to least value
+/// otherwise, moving the columns of U, if it exists, and the rows of Vt, if it
+/// exists, around in the same way.  Note: the "absolute value" part won't matter
+/// if this is an actual SVD, since singular values are non-negative.
+template<typename Real> void SortSvd(VectorBase<Real> *s, MatrixBase<Real> *U,
+                                     MatrixBase<Real>* Vt = NULL,
+                                     bool sort_on_absolute_value = true);
+
+/// Creates the eigenvalue matrix D that is part of the decomposition used Matrix::Eig.
+/// D will be block-diagonal with blocks of size 1 (for real eigenvalues) or 2x2
+/// for complex pairs.  If a complex pair is lambda +- i*mu, D will have a corresponding
+/// 2x2 block [lambda, mu; -mu, lambda].
+/// This function will throw if any complex eigenvalues are not in complex conjugate
+/// pairs (or the members of such pairs are not consecutively numbered).
+template<typename Real>
+void CreateEigenvalueMatrix(const VectorBase<Real> &real, const VectorBase<Real> &imag,
+                            MatrixBase<Real> *D);
+
+/// The following function is used in Matrix::Power, and separately tested, so we
+/// declare it here mainly for the testing code to see.  It takes a complex value to
+/// a power using a method that will work for noninteger powers (but will fail if the
+/// complex value is real and negative).
+template<typename Real>
+bool AttemptComplexPower(Real *x_re, Real *x_im, Real power);
+
+
+
+/// @} end of addtogroup matrix_funcs_misc
+
+/// \addtogroup matrix_funcs_io
+/// @{
+template<typename Real>
+std::ostream & operator << (std::ostream & Out, const MatrixBase<Real> & M);
+
+template<typename Real>
+std::istream & operator >> (std::istream & In, MatrixBase<Real> & M);
+
+// The Matrix read allows resizing, so we override the MatrixBase one.
+template<typename Real>
+std::istream & operator >> (std::istream & In, Matrix<Real> & M);
+
+
+template<typename Real>
+bool SameDim(const MatrixBase<Real> &M, const MatrixBase<Real> &N) {
+  return (M.NumRows() == N.NumRows() && M.NumCols() == N.NumCols());
+}
+
+/// @} end of \addtogroup matrix_funcs_io
+
+
+}  // namespace kaldi
+
+
+
+// we need to include the implementation and some
+// template specializations.
+#include "matrix/kaldi-matrix-inl.h"
+
+
+#endif  // KALDI_MATRIX_KALDI_MATRIX_H_
diff --git a/kaldi_io/src/kaldi/matrix/kaldi-vector-inl.h b/kaldi_io/src/kaldi/matrix/kaldi-vector-inl.h
new file mode 100644
index 0000000..c3a4f52
--- /dev/null
+++ b/kaldi_io/src/kaldi/matrix/kaldi-vector-inl.h
@@ -0,0 +1,58 @@
+// matrix/kaldi-vector-inl.h
+
+// Copyright 2009-2011   Ondrej Glembek;  Microsoft Corporation;
+//                       Haihua Xu
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+// This is an internal header file, included by other library headers.
+// You should not attempt to use it directly.
+
+#ifndef KALDI_MATRIX_KALDI_VECTOR_INL_H_
+#define KALDI_MATRIX_KALDI_VECTOR_INL_H_ 1
+
+namespace kaldi {
+
+template<typename Real>
+std::ostream & operator << (std::ostream &os, const VectorBase<Real> &rv) {
+  rv.Write(os, false);
+  return os;
+}
+
+template<typename Real>
+std::istream &operator >> (std::istream &is, VectorBase<Real> &rv) {
+  rv.Read(is, false);
+  return is;
+}
+
+template<typename Real>
+std::istream &operator >> (std::istream &is, Vector<Real> &rv) {
+  rv.Read(is, false);
+  return is;
+}
+
+template<>
+template<>
+void VectorBase<float>::AddVec(const float alpha, const VectorBase<float> &rv);
+
+template<>
+template<>
+void VectorBase<double>::AddVec<double>(const double alpha,
+                                        const VectorBase<double> &rv);
+
+}  // namespace kaldi
+
+#endif  // KALDI_MATRIX_KALDI_VECTOR_INL_H_
diff --git a/kaldi_io/src/kaldi/matrix/kaldi-vector.h b/kaldi_io/src/kaldi/matrix/kaldi-vector.h
new file mode 100644
index 0000000..2b3395b
--- /dev/null
+++ b/kaldi_io/src/kaldi/matrix/kaldi-vector.h
@@ -0,0 +1,585 @@
+// matrix/kaldi-vector.h
+
+// Copyright 2009-2012   Ondrej Glembek;  Microsoft Corporation;  Lukas Burget;
+//                       Saarland University (Author: Arnab Ghoshal);
+//                       Ariya Rastrow;  Petr Schwarz;  Yanmin Qian;
+//                       Karel Vesely;  Go Vivace Inc.;  Arnab Ghoshal
+//                       Wei Shi;
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef KALDI_MATRIX_KALDI_VECTOR_H_
+#define KALDI_MATRIX_KALDI_VECTOR_H_ 1
+
+#include "matrix/matrix-common.h"
+
+namespace kaldi {
+
+/// \addtogroup matrix_group
+/// @{
+
+///  Provides a vector abstraction class.
+///  This class provides a way to work with vectors in kaldi.
+///  It encapsulates basic operations and memory optimizations.
+template<typename Real>
+class VectorBase {
+ public:
+  /// Set vector to all zeros.
+  void SetZero();
+
+  /// Returns true if matrix is all zeros.
+  bool IsZero(Real cutoff = 1.0e-06) const;     // replace magic number
+
+  /// Set all members of a vector to a specified value.
+  void Set(Real f);
+
+  /// Set vector to random normally-distributed noise.
+  void SetRandn();
+
+  /// This function returns a random index into this vector,
+  /// chosen with probability proportional to the corresponding
+  /// element.  Requires that this->Min() >= 0 and this->Sum() > 0.
+  MatrixIndexT RandCategorical() const;
+  
+  /// Returns the  dimension of the vector.
+  inline MatrixIndexT Dim() const { return dim_; }
+
+  /// Returns the size in memory of the vector, in bytes.
+  inline MatrixIndexT SizeInBytes() const { return (dim_*sizeof(Real)); }
+
+  /// Returns a pointer to the start of the vector's data.
+  inline Real* Data() { return data_; }
+
+  /// Returns a pointer to the start of the vector's data (const).
+  inline const Real* Data() const { return data_; }
+
+  /// Indexing  operator (const).
+  inline Real operator() (MatrixIndexT i) const {
+    KALDI_PARANOID_ASSERT(static_cast<UnsignedMatrixIndexT>(i) <
+                 static_cast<UnsignedMatrixIndexT>(dim_));
+    return *(data_ + i);
+  }
+
+  /// Indexing operator (non-const).
+  inline Real & operator() (MatrixIndexT i) {
+    KALDI_PARANOID_ASSERT(static_cast<UnsignedMatrixIndexT>(i) <
+                 static_cast<UnsignedMatrixIndexT>(dim_));
+    return *(data_ + i);
+  }
+
+  /** @brief Returns a sub-vector of a vector (a range of elements).
+   *  @param o [in] Origin, 0 < o < Dim()
+   *  @param l [in] Length 0 < l < Dim()-o
+   *  @return A SubVector object that aliases the data of the Vector object.
+   *  See @c SubVector class for details   */
+  SubVector<Real> Range(const MatrixIndexT o, const MatrixIndexT l) {
+    return SubVector<Real>(*this, o, l);
+  }
+
+  /** @brief Returns a const sub-vector of a vector (a range of elements).
+   *  @param o [in] Origin, 0 < o < Dim()
+   *  @param l [in] Length 0 < l < Dim()-o
+   *  @return A SubVector object that aliases the data of the Vector object.
+   *  See @c SubVector class for details   */
+  const SubVector<Real> Range(const MatrixIndexT o,
+                              const MatrixIndexT l) const {
+    return SubVector<Real>(*this, o, l);
+  }
+
+  /// Copy data from another vector (must match own size).
+  void CopyFromVec(const VectorBase<Real> &v);
+
+  /// Copy data from a SpMatrix or TpMatrix (must match own size).
+  template<typename OtherReal>
+  void CopyFromPacked(const PackedMatrix<OtherReal> &M);
+  
+  /// Copy data from another vector of different type (double vs. float)
+  template<typename OtherReal>
+  void CopyFromVec(const VectorBase<OtherReal> &v);
+
+  /// Copy from CuVector.  This is defined in ../cudamatrix/cu-vector.h
+  template<typename OtherReal>
+  void CopyFromVec(const CuVectorBase<OtherReal> &v);
+
+  
+  /// Apply natural log to all elements.  Throw if any element of
+  /// the vector is negative (but doesn't complain about zero; the
+  /// log will be -infinity
+  void ApplyLog();
+
+  /// Apply natural log to another vector and put result in *this.
+  void ApplyLogAndCopy(const VectorBase<Real> &v);
+
+  /// Apply exponential to each value in vector.
+  void ApplyExp();
+
+  /// Take absolute value of each of the elements
+  void ApplyAbs();
+
+  /// Applies floor to all elements. Returns number of elements floored.
+  MatrixIndexT ApplyFloor(Real floor_val);
+
+  /// Applies ceiling to all elements. Returns number of elements changed.
+  MatrixIndexT ApplyCeiling(Real ceil_val);
+  
+  /// Applies floor to all elements. Returns number of elements floored.
+  MatrixIndexT ApplyFloor(const VectorBase<Real> &floor_vec);
+
+  /// Apply soft-max to vector and return normalizer (log sum of exponentials).
+  /// This is the same as: \f$ x(i) = exp(x(i)) / \sum_i exp(x(i)) \f$
+  Real ApplySoftMax();
+
+  /// Sets each element of *this to the tanh of the corresponding element of "src".
+  void Tanh(const VectorBase<Real> &src);
+
+  /// Sets each element of *this to the sigmoid function of the corresponding
+  /// element of "src".
+  void Sigmoid(const VectorBase<Real> &src);
+  
+  /// Take all  elements of vector to a power.
+  void ApplyPow(Real power);
+
+  /// Take the absolute value of all elements of a vector to a power.
+  /// Include the sign of the input element if include_sign == true.
+  /// If power is negative and the input value is zero, the output is set zero.
+  void ApplyPowAbs(Real power, bool include_sign=false);
+  
+  /// Compute the p-th norm of the vector.
+  Real Norm(Real p) const;
+  
+  /// Returns true if ((*this)-other).Norm(2.0) <= tol * (*this).Norm(2.0).
+  bool ApproxEqual(const VectorBase<Real> &other, float tol = 0.01) const;
+  
+  /// Invert all elements.
+  void InvertElements();
+
+  /// Add vector : *this = *this + alpha * rv (with casting between floats and
+  /// doubles)
+  template<typename OtherReal>
+  void AddVec(const Real alpha, const VectorBase<OtherReal> &v);
+
+  /// Add vector : *this = *this + alpha * rv^2  [element-wise squaring].
+  void AddVec2(const Real alpha, const VectorBase<Real> &v);
+
+  /// Add vector : *this = *this + alpha * rv^2  [element-wise squaring],
+  /// with casting between floats and doubles.
+  template<typename OtherReal>
+  void AddVec2(const Real alpha, const VectorBase<OtherReal> &v);
+
+  /// Add matrix times vector : this <-- beta*this + alpha*M*v.
+  /// Calls BLAS GEMV.
+  void AddMatVec(const Real alpha, const MatrixBase<Real> &M,
+                 const MatrixTransposeType trans,  const VectorBase<Real> &v,
+                 const Real beta); // **beta previously defaulted to 0.0**
+
+  /// This is as AddMatVec, except optimized for where v contains a lot
+  /// of zeros.
+  void AddMatSvec(const Real alpha, const MatrixBase<Real> &M,
+                  const MatrixTransposeType trans,  const VectorBase<Real> &v,
+                  const Real beta); // **beta previously defaulted to 0.0**
+
+  
+  /// Add symmetric positive definite matrix times vector:
+  ///  this <-- beta*this + alpha*M*v.   Calls BLAS SPMV.
+  void AddSpVec(const Real alpha, const SpMatrix<Real> &M,
+                const VectorBase<Real> &v, const Real beta);  // **beta previously defaulted to 0.0**
+
+  /// Add triangular matrix times vector: this <-- beta*this + alpha*M*v.
+  /// Works even if rv == *this.
+  void AddTpVec(const Real alpha, const TpMatrix<Real> &M,
+                const MatrixTransposeType trans, const VectorBase<Real> &v,
+                const Real beta);  // **beta previously defaulted to 0.0**
+
+  /// Set each element to y = (x == orig ? changed : x).
+  void ReplaceValue(Real orig, Real changed);
+
+  /// Multipy element-by-element by another vector.
+  void MulElements(const VectorBase<Real> &v);
+  /// Multipy element-by-element by another vector of different type.
+  template<typename OtherReal>
+  void MulElements(const VectorBase<OtherReal> &v);
+
+  /// Divide element-by-element by a vector.
+  void DivElements(const VectorBase<Real> &v);
+  /// Divide element-by-element by a vector of different type.
+  template<typename OtherReal>
+  void DivElements(const VectorBase<OtherReal> &v);
+
+  /// Add a constant to each element of a vector.
+  void Add(Real c);
+
+  /// Add element-by-element product of vectlrs:
+  //  this <-- alpha * v .* r + beta*this .
+  void AddVecVec(Real alpha, const VectorBase<Real> &v,
+                 const VectorBase<Real> &r, Real beta);
+
+  /// Add element-by-element quotient of two vectors.
+  ///  this <---- alpha*v/r + beta*this
+  void AddVecDivVec(Real alpha, const VectorBase<Real> &v,
+                    const VectorBase<Real> &r, Real beta);
+
+  /// Multiplies all elements by this constant.
+  void Scale(Real alpha);
+
+  /// Multiplies this vector by lower-triangular marix:  *this <-- *this *M
+  void MulTp(const TpMatrix<Real> &M, const MatrixTransposeType trans);
+
+  /// If trans == kNoTrans, solves M x = b, where b is the value of *this at input
+  /// and x is the value of *this at output.
+  /// If trans == kTrans, solves M' x = b.
+  /// Does not test for M being singular or near-singular, so test it before
+  /// calling this routine.
+  void Solve(const TpMatrix<Real> &M, const MatrixTransposeType trans);
+
+  /// Performs a row stack of the matrix M
+  void CopyRowsFromMat(const MatrixBase<Real> &M);
+  template<typename OtherReal>
+  void CopyRowsFromMat(const MatrixBase<OtherReal> &M);
+
+  /// The following is implemented in ../cudamatrix/cu-matrix.cc
+  void CopyRowsFromMat(const CuMatrixBase<Real> &M);
+
+  /// Performs a column stack of the matrix M
+  void CopyColsFromMat(const MatrixBase<Real> &M);
+
+  /// Extracts a row of the matrix M.  Could also do this with
+  /// this->Copy(M[row]).
+  void CopyRowFromMat(const MatrixBase<Real> &M, MatrixIndexT row);
+  /// Extracts a row of the matrix M with type conversion.
+  template<typename OtherReal>
+  void CopyRowFromMat(const MatrixBase<OtherReal> &M, MatrixIndexT row);
+
+  /// Extracts a row of the symmetric matrix S.
+  template<typename OtherReal>
+  void CopyRowFromSp(const SpMatrix<OtherReal> &S, MatrixIndexT row);
+  
+  /// Extracts a column of the matrix M.
+  template<typename OtherReal>
+  void CopyColFromMat(const MatrixBase<OtherReal> &M , MatrixIndexT col);
+
+  /// Extracts the diagonal of the matrix M.
+  void CopyDiagFromMat(const MatrixBase<Real> &M);
+
+  /// Extracts the diagonal of a packed matrix M; works for Sp or Tp.
+  void CopyDiagFromPacked(const PackedMatrix<Real> &M);
+
+
+  /// Extracts the diagonal of a symmetric matrix.
+  inline void CopyDiagFromSp(const SpMatrix<Real> &M) { CopyDiagFromPacked(M); }
+
+  /// Extracts the diagonal of a triangular matrix.
+  inline void CopyDiagFromTp(const TpMatrix<Real> &M) { CopyDiagFromPacked(M); }
+
+  /// Returns the maximum value of any element, or -infinity for the empty vector.
+  Real Max() const;
+
+  /// Returns the maximum value of any element, and the associated index.
+  /// Error if vector is empty.
+  Real Max(MatrixIndexT *index) const;
+  
+  /// Returns the minimum value of any element, or +infinity for the empty vector.
+  Real Min() const;
+
+  /// Returns the minimum value of any element, and the associated index.
+  /// Error if vector is empty.
+  Real Min(MatrixIndexT *index) const;
+  
+  /// Returns sum of the elements
+  Real Sum() const;
+
+  /// Returns sum of the logs of the elements.  More efficient than
+  /// just taking log of each.  Will return NaN if any elements are
+  /// negative.
+  Real SumLog() const;
+
+  /// Does *this = alpha * (sum of rows of M) + beta * *this.
+  void AddRowSumMat(Real alpha, const MatrixBase<Real> &M, Real beta = 1.0);
+  
+  /// Does *this = alpha * (sum of columns of M) + beta * *this.
+  void AddColSumMat(Real alpha, const MatrixBase<Real> &M, Real beta = 1.0);
+
+  /// Add the diagonal of a matrix times itself:
+  /// *this = diag(M M^T) +  beta * *this (if trans == kNoTrans), or
+  /// *this = diag(M^T M) +  beta * *this (if trans == kTrans).
+  void AddDiagMat2(Real alpha, const MatrixBase<Real> &M,
+                   MatrixTransposeType trans = kNoTrans, Real beta = 1.0);
+
+  /// Add the diagonal of a matrix product: *this = diag(M N), assuming the
+  /// "trans" arguments are both kNoTrans; for transpose arguments, it behaves
+  /// as you would expect.
+  void AddDiagMatMat(Real alpha, const MatrixBase<Real> &M, MatrixTransposeType transM,
+                     const MatrixBase<Real> &N, MatrixTransposeType transN,
+                     Real beta = 1.0);  
+
+  /// Returns log(sum(exp())) without exp overflow
+  /// If prune > 0.0, ignores terms less than the max - prune.
+  /// [Note: in future, if prune = 0.0, it will take the max.
+  /// For now, use -1 if you don't want it to prune.]
+  Real LogSumExp(Real prune = -1.0) const;
+
+  /// Reads from C++ stream (option to add to existing contents).
+  /// Throws exception on failure
+  void Read(std::istream & in, bool binary, bool add = false);
+
+  /// Writes to C++ stream (option to write in binary).
+  void Write(std::ostream &Out, bool binary) const;
+
+  friend class VectorBase<double>;
+  friend class VectorBase<float>;
+  friend class CuVectorBase<Real>;
+  friend class CuVector<Real>;
+ protected:
+  /// Destructor;  does not deallocate memory, this is handled by child classes.
+  /// This destructor is protected so this object so this object can only be
+  /// deleted via a child.
+  ~VectorBase() {}
+
+  /// Empty initializer, corresponds to vector of zero size.
+  explicit VectorBase(): data_(NULL), dim_(0) {
+    KALDI_ASSERT_IS_FLOATING_TYPE(Real);
+  }
+
+// Took this out since it is not currently used, and it is possible to create
+// objects where the allocated memory is not the same size as dim_ : Arnab
+//  /// Initializer from a pointer and a size; keeps the pointer internally
+//  /// (ownership or non-ownership depends on the child class).
+//  explicit VectorBase(Real* data, MatrixIndexT dim)
+//      : data_(data), dim_(dim) {}
+
+  // Arnab : made this protected since it is unsafe too.
+  /// Load data into the vector: sz must match own size.
+  void CopyFromPtr(const Real* Data, MatrixIndexT sz);
+
+  /// data memory area
+  Real* data_;
+  /// dimension of vector
+  MatrixIndexT dim_;
+  KALDI_DISALLOW_COPY_AND_ASSIGN(VectorBase);
+}; // class VectorBase
+
+/** @brief A class representing a vector.
+ *
+ *  This class provides a way to work with vectors in kaldi.
+ *  It encapsulates basic operations and memory optimizations.  */
+template<typename Real>
+class Vector: public VectorBase<Real> {
+ public:
+  /// Constructor that takes no arguments.  Initializes to empty.
+  Vector(): VectorBase<Real>() {}
+
+  /// Constructor with specific size.  Sets to all-zero by default
+  /// if set_zero == false, memory contents are undefined.
+  explicit Vector(const MatrixIndexT s,
+                  MatrixResizeType resize_type = kSetZero)
+      : VectorBase<Real>() {  Resize(s, resize_type);  }
+
+  /// Copy constructor from CUDA vector
+  /// This is defined in ../cudamatrix/cu-vector.h
+  template<typename OtherReal>
+  explicit Vector(const CuVectorBase<OtherReal> &cu);
+
+  /// Copy constructor.  The need for this is controversial.
+  Vector(const Vector<Real> &v) : VectorBase<Real>()  { //  (cannot be explicit)
+    Resize(v.Dim(), kUndefined);
+    this->CopyFromVec(v);
+  }
+
+  /// Copy-constructor from base-class, needed to copy from SubVector.
+  explicit Vector(const VectorBase<Real> &v) : VectorBase<Real>() {
+    Resize(v.Dim(), kUndefined);
+    this->CopyFromVec(v);
+  }
+
+  /// Type conversion constructor.
+  template<typename OtherReal>
+  explicit Vector(const VectorBase<OtherReal> &v): VectorBase<Real>() {
+    Resize(v.Dim(), kUndefined);
+    this->CopyFromVec(v);
+  }
+
+// Took this out since it is unsafe : Arnab
+//  /// Constructor from a pointer and a size; copies the data to a location
+//  /// it owns.
+//  Vector(const Real* Data, const MatrixIndexT s): VectorBase<Real>() {
+//    Resize(s);
+  //    CopyFromPtr(Data, s);
+//  }
+
+
+  /// Swaps the contents of *this and *other.  Shallow swap.
+  void Swap(Vector<Real> *other);
+
+  /// Destructor.  Deallocates memory.
+  ~Vector() { Destroy(); }
+
+  /// Read function using C++ streams.  Can also add to existing contents
+  /// of matrix.
+  void Read(std::istream & in, bool binary, bool add = false);
+
+  /// Set vector to a specified size (can be zero).
+  /// The value of the new data depends on resize_type:
+  ///   -if kSetZero, the new data will be zero
+  ///   -if kUndefined, the new data will be undefined
+  ///   -if kCopyData, the new data will be the same as the old data in any
+  ///      shared positions, and zero elsewhere.
+  /// This function takes time proportional to the number of data elements.
+  void Resize(MatrixIndexT length, MatrixResizeType resize_type = kSetZero);
+
+  /// Remove one element and shifts later elements down.
+  void RemoveElement(MatrixIndexT i);
+
+  /// Assignment operator, protected so it can only be used by std::vector
+  Vector<Real> &operator = (const Vector<Real> &other) {
+    Resize(other.Dim(), kUndefined);
+    this->CopyFromVec(other);
+    return *this;
+  }
+
+  /// Assignment operator that takes VectorBase.
+  Vector<Real> &operator = (const VectorBase<Real> &other) {
+    Resize(other.Dim(), kUndefined);
+    this->CopyFromVec(other);
+    return *this;
+  }
+ private:
+  /// Init assumes the current contents of the class are invalid (i.e. junk or
+  /// has already been freed), and it sets the vector to newly allocated memory
+  /// with the specified dimension.  dim == 0 is acceptable.  The memory contents
+  /// pointed to by data_ will be undefined.
+  void Init(const MatrixIndexT dim);
+
+  /// Destroy function, called internally.
+  void Destroy();
+
+};
+
+
+/// Represents a non-allocating general vector which can be defined
+/// as a sub-vector of higher-level vector [or as the row of a matrix].
+template<typename Real>
+class SubVector : public VectorBase<Real> {
+ public:
+  /// Constructor from a Vector or SubVector.
+  /// SubVectors are not const-safe and it's very hard to make them
+  /// so for now we just give up.  This function contains const_cast.
+  SubVector(const VectorBase<Real> &t, const MatrixIndexT origin,
+            const MatrixIndexT length) : VectorBase<Real>() {
+    // following assert equiv to origin>=0 && length>=0 &&
+    // origin+length <= rt.dim_
+    KALDI_ASSERT(static_cast<UnsignedMatrixIndexT>(origin)+
+                 static_cast<UnsignedMatrixIndexT>(length) <=
+                 static_cast<UnsignedMatrixIndexT>(t.Dim()));
+    VectorBase<Real>::data_ = const_cast<Real*> (t.Data()+origin);
+    VectorBase<Real>::dim_   = length;
+  }
+
+  /// This constructor initializes the vector to point at the contents
+  /// of this packed matrix (SpMatrix or TpMatrix).
+  SubVector(const PackedMatrix<Real> &M) {
+    VectorBase<Real>::data_ = const_cast<Real*> (M.Data());
+    VectorBase<Real>::dim_   = (M.NumRows()*(M.NumRows()+1))/2;
+  }
+  
+  /// Copy constructor
+  SubVector(const SubVector &other) : VectorBase<Real> () {
+    // this copy constructor needed for Range() to work in base class.
+    VectorBase<Real>::data_ = other.data_;
+    VectorBase<Real>::dim_ = other.dim_;
+  }
+
+  /// Constructor from a pointer to memory and a length.  Keeps a pointer
+  /// to the data but does not take ownership (will never delete).
+  SubVector(Real *data, MatrixIndexT length) : VectorBase<Real> () {
+    VectorBase<Real>::data_ = data;
+    VectorBase<Real>::dim_   = length;
+  }
+
+
+  /// This operation does not preserve const-ness, so be careful.
+  SubVector(const MatrixBase<Real> &matrix, MatrixIndexT row) {
+    VectorBase<Real>::data_ = const_cast<Real*>(matrix.RowData(row));
+    VectorBase<Real>::dim_   = matrix.NumCols();
+  }
+
+  ~SubVector() {}  ///< Destructor (does nothing; no pointers are owned here).
+
+ private:
+  /// Disallow assignment operator.
+  SubVector & operator = (const SubVector &other) {}
+};
+
+/// @} end of "addtogroup matrix_group"
+/// \addtogroup matrix_funcs_io
+/// @{
+/// Output to a C++ stream.  Non-binary by default (use Write for
+/// binary output).
+template<typename Real>
+std::ostream & operator << (std::ostream & out, const VectorBase<Real> & v);
+
+/// Input from a C++ stream.  Will automatically read text or
+/// binary data from the stream.
+template<typename Real>
+std::istream & operator >> (std::istream & in, VectorBase<Real> & v);
+
+/// Input from a C++ stream. Will automatically read text or
+/// binary data from the stream.
+template<typename Real>
+std::istream & operator >> (std::istream & in, Vector<Real> & v);
+/// @} end of \addtogroup matrix_funcs_io
+
+/// \addtogroup matrix_funcs_scalar
+/// @{
+
+
+template<typename Real>
+bool ApproxEqual(const VectorBase<Real> &a,
+                 const VectorBase<Real> &b, Real tol = 0.01) {
+  return a.ApproxEqual(b, tol);
+}
+
+template<typename Real>
+inline void AssertEqual(VectorBase<Real> &a, VectorBase<Real> &b,
+                        float tol = 0.01) {
+  KALDI_ASSERT(a.ApproxEqual(b, tol));
+}
+
+
+/// Returns dot product between v1 and v2.
+template<typename Real>
+Real VecVec(const VectorBase<Real> &v1, const VectorBase<Real> &v2);
+
+template<typename Real, typename OtherReal>
+Real VecVec(const VectorBase<Real> &v1, const VectorBase<OtherReal> &v2);
+
+
+/// Returns \f$ v_1^T M v_2  \f$ .
+/// Not as efficient as it could be where v1 == v2.
+template<typename Real>
+Real VecMatVec(const VectorBase<Real> &v1, const MatrixBase<Real> &M,
+               const VectorBase<Real> &v2);
+
+/// @} End of "addtogroup matrix_funcs_scalar"
+
+
+}  // namespace kaldi
+
+// we need to include the implementation
+#include "matrix/kaldi-vector-inl.h"
+
+
+
+#endif  // KALDI_MATRIX_KALDI_VECTOR_H_
+
diff --git a/kaldi_io/src/kaldi/matrix/matrix-common.h b/kaldi_io/src/kaldi/matrix/matrix-common.h
new file mode 100644
index 0000000..d202b2e
--- /dev/null
+++ b/kaldi_io/src/kaldi/matrix/matrix-common.h
@@ -0,0 +1,100 @@
+// matrix/matrix-common.h
+
+// Copyright 2009-2011  Microsoft Corporation
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+//  http://www.apache.org/licenses/LICENSE-2.0
+
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+#ifndef KALDI_MATRIX_MATRIX_COMMON_H_
+#define KALDI_MATRIX_MATRIX_COMMON_H_
+
+// This file contains some #includes, forward declarations
+// and typedefs that are needed by all the main header
+// files in this directory.
+
+#include "base/kaldi-common.h"
+#include "matrix/kaldi-blas.h"
+
+namespace kaldi {
+typedef enum {
+  kTrans    = CblasTrans,
+  kNoTrans = CblasNoTrans
+} MatrixTransposeType;
+
+typedef enum {
+  kSetZero,
+  kUndefined,
+  kCopyData
+} MatrixResizeType;
+
+typedef enum {
+  kTakeLower,
+  kTakeUpper,
+  kTakeMean,
+  kTakeMeanAndCheck
+} SpCopyType;
+
+template<typename Real> class VectorBase;
+template<typename Real> class Vector;
+template<typename Real> class SubVector;
+template<typename Real> class MatrixBase;
+template<typename Real> class SubMatrix;
+template<typename Real> class Matrix;
+template<typename Real> class SpMatrix;
+template<typename Real> class TpMatrix;
+template<typename Real> class PackedMatrix;
+
+// these are classes that won't be defined in this
+// directory; they're mostly needed for friend declarations.
+template<typename Real> class CuMatrixBase;
+template<typename Real> class CuSubMatrix;
+template<typename Real> class CuMatrix;
+template<typename Real> class CuVectorBase;
+template<typename Real> class CuSubVector;
+template<typename Real> class CuVector;
+template<typename Real> class CuPackedMatrix;
+template<typename Real> class CuSpMatrix;
+template<typename Real> class CuTpMatrix;
+
+class CompressedMatrix;
+
+/// This class provides a way for switching between double and float types.
+template<typename T> class OtherReal { };  // useful in reading+writing routines
+                                           // to switch double and float.
+/// A specialized class for switching from float to double.
+template<> class OtherReal<float> {
+ public:
+  typedef double Real;
+};
+/// A specialized class for switching from double to float.
+template<> class OtherReal<double> {
+ public:
+  typedef float Real;
+};
+
+
+typedef int32 MatrixIndexT;
+typedef int32 SignedMatrixIndexT;
+typedef uint32 UnsignedMatrixIndexT;
+
+// If you want to use size_t for the index type, do as follows instead:
+//typedef size_t MatrixIndexT;
+//typedef ssize_t SignedMatrixIndexT;
+//typedef size_t UnsignedMatrixIndexT;
+
+}
+
+
+
+#endif  // KALDI_MATRIX_MATRIX_COMMON_H_
diff --git a/kaldi_io/src/kaldi/matrix/matrix-functions-inl.h b/kaldi_io/src/kaldi/matrix/matrix-functions-inl.h
new file mode 100644
index 0000000..9fac851
--- /dev/null
+++ b/kaldi_io/src/kaldi/matrix/matrix-functions-inl.h
@@ -0,0 +1,56 @@
+// matrix/matrix-functions-inl.h
+
+// Copyright 2009-2011 Microsoft Corporation
+//
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+//
+// (*) incorporates, with permission, FFT code from his book
+// "Signal Processing with Lapped Transforms", Artech, 1992.
+
+
+
+#ifndef KALDI_MATRIX_MATRIX_FUNCTIONS_INL_H_
+#define KALDI_MATRIX_MATRIX_FUNCTIONS_INL_H_
+
+namespace kaldi {
+
+//! ComplexMul implements, inline, the complex multiplication b *= a.
+template<typename Real> inline void ComplexMul(const Real &a_re, const Real &a_im,
+                                            Real *b_re, Real *b_im) {
+  Real tmp_re = (*b_re * a_re) - (*b_im * a_im);
+  *b_im = *b_re * a_im + *b_im * a_re;
+  *b_re = tmp_re;
+}
+
+template<typename Real> inline void ComplexAddProduct(const Real &a_re, const Real &a_im,
+                                                   const Real &b_re, const Real &b_im,
+                                                   Real *c_re, Real *c_im) {
+  *c_re += b_re*a_re - b_im*a_im;
+  *c_im += b_re*a_im + b_im*a_re;
+}
+
+
+template<typename Real> inline void ComplexImExp(Real x, Real *a_re, Real *a_im) {
+  *a_re = std::cos(x);
+  *a_im = std::sin(x);
+}
+
+
+} // end namespace kaldi
+
+
+#endif // KALDI_MATRIX_MATRIX_FUNCTIONS_INL_H_
+
diff --git a/kaldi_io/src/kaldi/matrix/matrix-functions.h b/kaldi_io/src/kaldi/matrix/matrix-functions.h
new file mode 100644
index 0000000..b70ca56
--- /dev/null
+++ b/kaldi_io/src/kaldi/matrix/matrix-functions.h
@@ -0,0 +1,235 @@
+// matrix/matrix-functions.h
+
+// Copyright 2009-2011  Microsoft Corporation;  Go Vivace Inc.;  Jan Silovsky;
+//                      Yanmin Qian;   1991 Henrique (Rico) Malvar (*)
+//
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+//
+// (*) incorporates, with permission, FFT code from his book
+// "Signal Processing with Lapped Transforms", Artech, 1992.
+
+
+
+#ifndef KALDI_MATRIX_MATRIX_FUNCTIONS_H_
+#define KALDI_MATRIX_MATRIX_FUNCTIONS_H_
+
+#include "matrix/kaldi-vector.h"
+#include "matrix/kaldi-matrix.h"
+
+namespace kaldi {
+
+/// @addtogroup matrix_funcs_misc
+/// @{
+
+/** The function ComplexFft does an Fft on the vector argument v.
+   v is a vector of even dimension, interpreted for both input
+   and output as a vector of complex numbers i.e.
+   \f[ v = ( re_0, im_0, re_1, im_1, ... )    \f]
+   The dimension of v must be a power of 2.
+
+   If "forward == true" this routine does the Discrete Fourier Transform
+   (DFT), i.e.:
+   \f[   vout[m] \leftarrow \sum_{n = 0}^{N-1} vin[i] exp( -2pi m n / N )  \f]
+
+   If "backward" it does the Inverse Discrete Fourier Transform (IDFT)
+   *WITHOUT THE FACTOR 1/N*,
+   i.e.:
+   \f[   vout[m] <-- \sum_{n = 0}^{N-1} vin[i] exp(  2pi m n / N )   \f]
+   [note the sign difference on the 2 pi for the backward one.]
+
+   Note that this is the definition of the FT given in most texts, but
+   it differs from the Numerical Recipes version in which the forward
+   and backward algorithms are flipped.
+
+   Note that you would have to multiply by 1/N after the IDFT to get
+   back to where you started from.  We don't do this because
+   in some contexts, the transform is made symmetric by multiplying
+   by sqrt(N) in both passes.   The user can do this by themselves.
+
+   See also SplitRadixComplexFft, declared in srfft.h, which is more efficient
+   but only works if the length of the input is a power of 2.
+ */
+template<typename Real> void ComplexFft (VectorBase<Real> *v, bool forward, Vector<Real> *tmp_work = NULL);
+
+/// ComplexFt is the same as ComplexFft but it implements the Fourier
+/// transform in an inefficient way.  It is mainly included for testing purposes.
+/// See comment for ComplexFft to describe the input and outputs and what it does.
+template<typename Real> void ComplexFt (const VectorBase<Real> &in,
+                                     VectorBase<Real> *out, bool forward);
+
+/// RealFft is a fourier transform of real inputs.  Internally it uses
+/// ComplexFft.  The input dimension N must be even.  If forward == true,
+/// it transforms from a sequence of N real points to its complex fourier
+/// transform; otherwise it goes in the reverse direction.  If you call it
+/// in the forward and then reverse direction and multiply by 1.0/N, you
+/// will get back the original data.
+/// The interpretation of the complex-FFT data is as follows: the array
+/// is a sequence of complex numbers C_n of length N/2 with (real, im) format,
+/// i.e. [real0, real_{N/2}, real1, im1, real2, im2, real3, im3, ...].
+/// See also SplitRadixRealFft, declared in srfft.h, which is more efficient
+/// but only works if the length of the input is a power of 2.
+
+template<typename Real> void RealFft (VectorBase<Real> *v, bool forward);
+
+
+/// RealFt has the same input and output format as RealFft above, but it is
+/// an inefficient implementation included for testing purposes.
+template<typename Real> void RealFftInefficient (VectorBase<Real> *v, bool forward);
+
+/// ComputeDctMatrix computes a matrix corresponding to the DCT, such that
+/// M * v equals the DCT of vector v.  M must be square at input.
+/// This is the type = III DCT with normalization, corresponding to the
+/// following equations, where x is the signal and X is the DCT:
+/// X_0 = 1/sqrt(2*N) \sum_{n = 0}^{N-1} x_n
+/// X_k = 1/sqrt(N) \sum_{n = 0}^{N-1} x_n cos( \pi/N (n + 1/2) k )
+/// This matrix's transpose is its own inverse, so transposing this
+/// matrix will give the inverse DCT.
+/// Caution: the type III DCT is generally known as the "inverse DCT" (with the
+/// type II being the actual DCT), so this function is somewhatd mis-named.  It
+/// was probably done this way for HTK compatibility.  We don't change it
+/// because it was this way from the start and changing it would affect the
+/// feature generation.
+
+template<typename Real> void ComputeDctMatrix(Matrix<Real> *M);
+
+
+/// ComplexMul implements, inline, the complex multiplication b *= a.
+template<typename Real> inline void ComplexMul(const Real &a_re, const Real &a_im,
+                                            Real *b_re, Real *b_im);
+
+/// ComplexMul implements, inline, the complex operation c += (a * b).
+template<typename Real> inline void ComplexAddProduct(const Real &a_re, const Real &a_im,
+                                                   const Real &b_re, const Real &b_im,
+                                                   Real *c_re, Real *c_im);
+
+
+/// ComplexImExp implements a <-- exp(i x), inline.
+template<typename Real> inline void ComplexImExp(Real x, Real *a_re, Real *a_im);
+
+
+// This class allows you to compute the matrix exponential function
+// B = I + A + 1/2! A^2 + 1/3! A^3 + ...
+// This method is most accurate where the result is of the same order of
+// magnitude as the unit matrix (it will typically not work well when
+// the answer has almost-zero eigenvalues or is close to zero).
+// It also provides a function that allows you do back-propagate the
+// derivative of a scalar function through this calculation.
+// The
+template<typename Real>
+class MatrixExponential {
+ public:
+  MatrixExponential() { }
+
+  void Compute(const MatrixBase<Real> &M, MatrixBase<Real> *X);  // does *X = exp(M)
+
+  // Version for symmetric matrices (it just copies to full matrix).
+  void Compute(const SpMatrix<Real> &M, SpMatrix<Real> *X);  // does *X = exp(M)
+
+  void Backprop(const MatrixBase<Real> &hX, MatrixBase<Real> *hM) const;  // Propagates
+  // the gradient of a scalar function f backwards through this operation, i.e.:
+  // if the parameter dX represents df/dX (with no transpose, so element i, j of dX
+  // is the derivative of f w.r.t. E(i, j)), it sets dM to df/dM, again with no
+  // transpose (of course, only the part thereof that comes through the effect of
+  // A on B).  This applies to the values of A and E that were called most recently
+  // with Compute().
+
+  // Version for symmetric matrices (it just copies to full matrix).
+  void Backprop(const SpMatrix<Real> &hX, SpMatrix<Real> *hM) const;
+  
+ private:
+  void Clear();
+
+  static MatrixIndexT ComputeN(const MatrixBase<Real> &M);
+
+  // This is intended for matrices P with small norms: compute B_0 = exp(P) - I.
+  // Keeps adding terms in the Taylor series till there is no further
+  // change in the result.  Stores some of the powers of A in powers_,
+  // and the number of terms K as K_.
+  void ComputeTaylor(const MatrixBase<Real> &P, MatrixBase<Real> *B0);
+
+  // Backprop through the Taylor-series computation above.
+  // note: hX is \hat{X} in the math; hM is \hat{M} in the math.
+  void BackpropTaylor(const MatrixBase<Real> &hX,
+                      MatrixBase<Real> *hM) const;
+
+  Matrix<Real> P_;  // Equals M * 2^(-N_)
+  std::vector<Matrix<Real> > B_;  // B_[0] = exp(P_) - I,
+                                 //  B_[k] = 2 B_[k-1] + B_[k-1]^2   [k > 0],
+                                 //  ( = exp(P_)^k - I )
+                                 // goes from 0..N_ [size N_+1].
+
+  std::vector<Matrix<Real> > powers_;  // powers (>1) of P_ stored here,
+  // up to all but the last one used in the Taylor expansion (this is the
+  // last one we need in the backprop).  The index is the power minus 2.
+
+  MatrixIndexT N_;  // Power N_ >=0 such that P_ = A * 2^(-N_),
+  // we choose it so that P_ has a sufficiently small norm
+  // that the Taylor series will converge fast.
+};
+
+
+/**
+    ComputePCA does a PCA computation, using either outer products
+    or inner products, whichever is more efficient.  Let D be
+    the dimension of the data points, N be the number of data
+    points, and G be the PCA dimension we want to retain.  We assume
+    G <= N and G <= D.
+
+    @param X [in]  An N x D matrix.  Each row of X is a point x_i.
+    @param U [out] A G x D matrix.  Each row of U is a basis element u_i.
+    @param A [out] An N x D matrix, or NULL.  Each row of A is a set of coefficients
+         in the basis for a point x_i, so A(i, g) is the coefficient of u_i
+         in x_i.
+    @param print_eigs [in] If true, prints out diagnostic information about the
+         eigenvalues.
+    @param exact [in] If true, does the exact computation; if false, does
+         a much faster (but almost exact) computation based on the Lanczos
+         method.
+*/
+
+template<typename Real>
+void ComputePca(const MatrixBase<Real> &X,
+                MatrixBase<Real> *U,
+                MatrixBase<Real> *A,
+                bool print_eigs = false,
+                bool exact = true);
+
+
+
+// This function does: *plus += max(0, a b^T),
+// *minus += max(0, -(a b^T)).
+template<typename Real>
+void AddOuterProductPlusMinus(Real alpha,
+                              const VectorBase<Real> &a,
+                              const VectorBase<Real> &b,
+                              MatrixBase<Real> *plus, 
+                              MatrixBase<Real> *minus);
+
+template<typename Real1, typename Real2>
+inline void AssertSameDim(const MatrixBase<Real1> &mat1, const MatrixBase<Real2> &mat2) {
+  KALDI_ASSERT(mat1.NumRows() == mat2.NumRows()
+               && mat1.NumCols() == mat2.NumCols());
+}
+
+
+/// @} end of "addtogroup matrix_funcs_misc"
+
+} // end namespace kaldi
+
+#include "matrix/matrix-functions-inl.h"
+
+
+#endif
diff --git a/kaldi_io/src/kaldi/matrix/matrix-lib.h b/kaldi_io/src/kaldi/matrix/matrix-lib.h
new file mode 100644
index 0000000..39acec5
--- /dev/null
+++ b/kaldi_io/src/kaldi/matrix/matrix-lib.h
@@ -0,0 +1,37 @@
+// matrix/matrix-lib.h
+
+// Copyright 2009-2011  Ondrej Glembek;  Microsoft Corporation;  Haihua Xu
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+//  http://www.apache.org/licenses/LICENSE-2.0
+
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+// Include everything from this directory.
+// These files include other stuff that we need.
+#ifndef KALDI_MATRIX_MATRIX_LIB_H_
+#define KALDI_MATRIX_MATRIX_LIB_H_
+
+#include "matrix/cblas-wrappers.h"
+#include "base/kaldi-common.h"
+#include "matrix/kaldi-vector.h"
+#include "matrix/kaldi-matrix.h"
+#include "matrix/sp-matrix.h"
+#include "matrix/tp-matrix.h"
+#include "matrix/matrix-functions.h"
+#include "matrix/srfft.h"
+#include "matrix/compressed-matrix.h"
+#include "matrix/optimization.h"
+
+#endif
+
diff --git a/kaldi_io/src/kaldi/matrix/optimization.h b/kaldi_io/src/kaldi/matrix/optimization.h
new file mode 100644
index 0000000..66309ac
--- /dev/null
+++ b/kaldi_io/src/kaldi/matrix/optimization.h
@@ -0,0 +1,248 @@
+// matrix/optimization.h
+
+// Copyright 2012  Johns Hopkins University (author: Daniel Povey)
+//
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+//
+// (*) incorporates, with permission, FFT code from his book
+// "Signal Processing with Lapped Transforms", Artech, 1992.
+
+
+
+#ifndef KALDI_MATRIX_OPTIMIZATION_H_
+#define KALDI_MATRIX_OPTIMIZATION_H_
+
+#include "matrix/kaldi-vector.h"
+#include "matrix/kaldi-matrix.h"
+
+namespace kaldi {
+
+
+/// @addtogroup matrix_optimization
+/// @{
+
+struct LinearCgdOptions {
+  int32 max_iters;  //  Maximum number of iters (if >= 0).
+  BaseFloat max_error;  // Maximum 2-norm of the residual A x - b (convergence
+                        // test)
+  // Every time the residual 2-norm decreases by this recompute_residual_factor
+  // since the last time it was computed from scratch, recompute it from
+  // scratch.  This helps to keep the computed residual accurate even in the
+  // presence of roundoff.
+  BaseFloat recompute_residual_factor;
+  
+  LinearCgdOptions(): max_iters(-1),
+                      max_error(0.0),
+                      recompute_residual_factor(0.01) { }
+};
+  
+/*
+  This function uses linear conjugate gradient descent to approximately solve
+  the system A x = b.  The value of x at entry corresponds to the initial guess
+  of x.  The algorithm continues until the number of iterations equals b.Dim(),
+  or until the 2-norm of (A x - b) is <= max_error, or until the number of
+  iterations equals max_iter, whichever happens sooner.  It is a requirement
+  that A be positive definite.
+  It returns the number of iterations that were actually executed (this is
+  useful for testing purposes).
+*/
+template<typename Real>
+int32 LinearCgd(const LinearCgdOptions &opts,
+                const SpMatrix<Real> &A, const VectorBase<Real> &b,
+                VectorBase<Real> *x);
+
+
+
+
+
+
+/**
+   This is an implementation of L-BFGS.  It pushes responsibility for
+   determining when to stop, onto the user.  There is no call-back here:
+   everything is done via calls to the class itself (see the example in
+   matrix-lib-test.cc).  This does not implement constrained L-BFGS, but it will
+   handle constrained problems correctly as long as the function approaches
+   +infinity (or -infinity for maximization problems) when it gets close to the
+   bound of the constraint.  In these types of problems, you just let the
+   function value be +infinity for minimization problems, or -infinity for
+   maximization problems, outside these bounds).
+*/
+
+struct LbfgsOptions {
+  bool minimize; // if true, we're minimizing, else maximizing.
+  int m; // m is the number of stored vectors L-BFGS keeps.
+  float first_step_learning_rate; // The very first step of L-BFGS is
+  // like gradient descent.  If you want to configure the size of that step,
+  // you can do it using this variable.
+  float first_step_length; // If this variable is >0.0, it overrides
+  // first_step_learning_rate; on the first step we choose an approximate
+  // Hessian that is the multiple of the identity that would generate this
+  // step-length, or 1.0 if the gradient is zero.
+  float first_step_impr; // If this variable is >0.0, it overrides
+  // first_step_learning_rate; on the first step we choose an approximate
+  // Hessian that is the multiple of the identity that would generate this
+  // amount of objective function improvement (assuming the "real" objf
+  // was linear).
+  float c1; // A constant in Armijo rule = Wolfe condition i)
+  float c2; // A constant in Wolfe condition ii)
+  float d; // An amount > 1.0 (default 2.0) that we initially multiply or
+  // divide the step length by, in the line search.
+  int max_line_search_iters; // after this many iters we restart L-BFGS.
+  int avg_step_length; // number of iters to avg step length over, in
+  // RecentStepLength().
+  
+  LbfgsOptions (bool minimize = true):
+      minimize(minimize),
+      m(10),
+      first_step_learning_rate(1.0),
+      first_step_length(0.0),
+      first_step_impr(0.0),
+      c1(1.0e-04),
+      c2(0.9),
+      d(2.0),
+      max_line_search_iters(50),
+      avg_step_length(4) { }
+};
+
+template<typename Real>
+class OptimizeLbfgs {
+ public:
+  /// Initializer takes the starting value of x.
+  OptimizeLbfgs(const VectorBase<Real> &x,
+                const LbfgsOptions &opts);
+  
+  /// This returns the value of the variable x that has the best objective
+  /// function so far, and the corresponding objective function value if
+  /// requested.  This would typically be called only at the end.
+  const VectorBase<Real>& GetValue(Real *objf_value = NULL) const;
+  
+  /// This returns the value at which the function wants us
+  /// to compute the objective function and gradient.
+  const VectorBase<Real>& GetProposedValue() const { return new_x_; }
+  
+  /// Returns the average magnitude of the last n steps (but not
+  /// more than the number we have stored).  Before we have taken
+  /// any steps, returns +infinity.  Note: if the most recent
+  /// step length was 0, it returns 0, regardless of the other
+  /// step lengths.  This makes it suitable as a convergence test
+  /// (else we'd generate NaN's).
+  Real RecentStepLength() const;
+  
+  /// The user calls this function to provide the class with the
+  /// function and gradient info at the point GetProposedValue().
+  /// If this point is outside the constraints you can set function_value
+  /// to {+infinity,-infinity} for {minimization,maximization} problems.
+  /// In this case the gradient, and also the second derivative (if you call
+  /// the second overloaded version of this function) will be ignored.
+  void DoStep(Real function_value,
+              const VectorBase<Real> &gradient);
+  
+  /// The user can call this version of DoStep() if it is desired to set some
+  /// kind of approximate Hessian on this iteration.  Note: it is a prerequisite
+  /// that diag_approx_2nd_deriv must be strictly positive (minimizing), or
+  /// negative (maximizing).
+  void DoStep(Real function_value,
+              const VectorBase<Real> &gradient,
+              const VectorBase<Real> &diag_approx_2nd_deriv);
+  
+ private:
+  KALDI_DISALLOW_COPY_AND_ASSIGN(OptimizeLbfgs);
+
+
+  // The following variable says what stage of the computation we're at.
+  // Refer to Algorithm 7.5 (L-BFGS) of Nodecdal & Wright, "Numerical
+  // Optimization", 2nd edition.
+  // kBeforeStep means we're about to do
+  /// "compute p_k <-- - H_k \delta f_k" (i.e. Algorithm 7.4).
+  // kWithinStep means we're at some point within line search; note
+  // that line search is iterative so we can stay in this state more
+  // than one time on each iteration.
+  enum ComputationState {
+    kBeforeStep,
+    kWithinStep, // This means we're within the step-size computation, and
+    // have not yet done the 1st function evaluation.
+  };
+  
+  inline MatrixIndexT Dim() { return x_.Dim(); }
+  inline MatrixIndexT M() { return opts_.m; }
+  SubVector<Real> Y(MatrixIndexT i) {
+    return SubVector<Real>(data_, (i % M()) * 2); // vector y_i
+  }
+  SubVector<Real> S(MatrixIndexT i) {
+    return SubVector<Real>(data_, (i % M()) * 2 + 1); // vector s_i
+  }
+  // The following are subroutines within DoStep():
+  bool AcceptStep(Real function_value,
+                  const VectorBase<Real> &gradient);
+  void Restart(const VectorBase<Real> &x,
+               Real function_value,
+               const VectorBase<Real> &gradient);
+  void ComputeNewDirection(Real function_value,
+                           const VectorBase<Real> &gradient);
+  void ComputeHifNeeded(const VectorBase<Real> &gradient);
+  void StepSizeIteration(Real function_value,
+                         const VectorBase<Real> &gradient);
+  void RecordStepLength(Real s);
+  
+  
+  LbfgsOptions opts_;
+  SignedMatrixIndexT k_; // Iteration number, starts from zero.  Gets set back to zero
+  // when we restart.
+  
+  ComputationState computation_state_;
+  bool H_was_set_; // True if the user specified H_; if false,
+  // we'll use a heuristic to estimate it.
+
+
+  Vector<Real> x_; // current x.
+  Vector<Real> new_x_; // the x proposed in the line search.
+  Vector<Real> best_x_; // the x with the best objective function so far
+                        // (either the same as x_ or something in the current line search.)
+  Vector<Real> deriv_; // The most recently evaluated derivative-- at x_k.
+  Vector<Real> temp_;
+  Real f_; // The function evaluated at x_k.
+  Real best_f_; // the best objective function so far.
+  Real d_; // a number d > 1.0, but during an iteration we may decrease this, when
+  // we switch between armijo and wolfe failures.
+
+  int num_wolfe_i_failures_; // the num times we decreased step size.
+  int num_wolfe_ii_failures_; // the num times we increased step size.
+  enum { kWolfeI, kWolfeII, kNone } last_failure_type_; // last type of step-search
+  // failure on this iter.
+  
+  Vector<Real> H_; // Current inverse-Hessian estimate.  May be computed by this class itself,
+  // or provided by user using 2nd form of SetGradientInfo().
+  Matrix<Real> data_; // dimension (m*2) x dim.  Even rows store
+  // gradients y_i, odd rows store steps s_i.
+  Vector<Real> rho_; // dimension m; rho_(m) = 1/(y_m^T s_m), Eq. 7.17.
+
+  std::vector<Real> step_lengths_; // The step sizes we took on the last
+  // (up to m) iterations; these are not stored in a rotating buffer but
+  // are shifted by one each time (this is more convenient when we
+  // restart, as we keep this info past restarting).
+  
+
+};
+  
+/// @} 
+
+
+} // end namespace kaldi
+
+
+
+#endif
+
diff --git a/kaldi_io/src/kaldi/matrix/packed-matrix.h b/kaldi_io/src/kaldi/matrix/packed-matrix.h
new file mode 100644
index 0000000..722d932
--- /dev/null
+++ b/kaldi_io/src/kaldi/matrix/packed-matrix.h
@@ -0,0 +1,197 @@
+// matrix/packed-matrix.h
+
+// Copyright 2009-2013  Ondrej Glembek;  Lukas Burget;  Microsoft Corporation;
+//                      Saarland University;  Yanmin Qian;
+//                      Johns Hopkins University (Author: Daniel Povey)
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+//  http://www.apache.org/licenses/LICENSE-2.0
+
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef KALDI_MATRIX_PACKED_MATRIX_H_
+#define KALDI_MATRIX_PACKED_MATRIX_H_
+
+#include "matrix/matrix-common.h"
+#include <algorithm>
+
+namespace kaldi {
+
+/// \addtogroup matrix_funcs_io
+// we need to declare the friend << operator here
+template<typename Real>
+std::ostream & operator <<(std::ostream & out, const PackedMatrix<Real>& M);
+
+
+/// \addtogroup matrix_group
+/// @{
+
+/// @brief Packed matrix: base class for triangular and symmetric matrices.
+template<typename Real> class PackedMatrix {
+  friend class CuPackedMatrix<Real>;
+ public:
+  //friend class CuPackedMatrix<Real>;
+
+  PackedMatrix() : data_(NULL), num_rows_(0) {}
+
+  explicit PackedMatrix(MatrixIndexT r, MatrixResizeType resize_type = kSetZero):
+      data_(NULL) {  Resize(r, resize_type);  }
+
+  explicit PackedMatrix(const PackedMatrix<Real> &orig) : data_(NULL) {
+    Resize(orig.num_rows_, kUndefined);
+    CopyFromPacked(orig);
+  }
+
+  template<typename OtherReal>
+  explicit PackedMatrix(const PackedMatrix<OtherReal> &orig) : data_(NULL) {
+    Resize(orig.NumRows(), kUndefined);
+    CopyFromPacked(orig);
+  }
+  
+  void SetZero();  /// < Set to zero
+  void SetUnit();  /// < Set to unit matrix.
+  void SetRandn(); /// < Set to random values of a normal distribution
+
+  Real Trace() const;
+
+  // Needed for inclusion in std::vector
+  PackedMatrix<Real> & operator =(const PackedMatrix<Real> &other) {
+    Resize(other.NumRows());
+    CopyFromPacked(other);
+    return *this;
+  }
+
+  ~PackedMatrix() {
+    Destroy();
+  }
+
+  /// Set packed matrix to a specified size (can be zero).
+  /// The value of the new data depends on resize_type:
+  ///   -if kSetZero, the new data will be zero
+  ///   -if kUndefined, the new data will be undefined
+  ///   -if kCopyData, the new data will be the same as the old data in any
+  ///      shared positions, and zero elsewhere.
+  /// This function takes time proportional to the number of data elements.
+  void Resize(MatrixIndexT nRows, MatrixResizeType resize_type = kSetZero);
+
+  void AddToDiag(const Real r); // Adds r to diaginal
+
+  void ScaleDiag(const Real alpha);  // Scales diagonal by alpha.
+
+  void SetDiag(const Real alpha);  // Sets diagonal to this value.
+
+  template<typename OtherReal>
+  void CopyFromPacked(const PackedMatrix<OtherReal> &orig);
+  
+  /// CopyFromVec just interprets the vector as having the same layout
+  /// as the packed matrix.  Must have the same dimension, i.e.
+  /// orig.Dim() == (NumRows()*(NumRows()+1)) / 2;
+  template<typename OtherReal>
+  void CopyFromVec(const SubVector<OtherReal> &orig);
+  
+  Real* Data() { return data_; }
+  const Real* Data() const { return data_; }
+  inline MatrixIndexT NumRows() const { return num_rows_; }
+  inline MatrixIndexT NumCols() const { return num_rows_; }
+  size_t SizeInBytes() const {
+    size_t nr = static_cast<size_t>(num_rows_);
+    return ((nr * (nr+1)) / 2) * sizeof(Real);
+  }
+
+  //MatrixIndexT Stride() const { return stride_; }
+
+  // This code is duplicated in child classes to avoid extra levels of calls.
+  Real operator() (MatrixIndexT r, MatrixIndexT c) const {
+    KALDI_ASSERT(static_cast<UnsignedMatrixIndexT>(r) <
+                 static_cast<UnsignedMatrixIndexT>(num_rows_) &&
+                 static_cast<UnsignedMatrixIndexT>(c) <
+                 static_cast<UnsignedMatrixIndexT>(num_rows_)
+                 && c <= r);
+    return *(data_ + (r * (r + 1)) / 2 + c);
+  }
+
+  // This code is duplicated in child classes to avoid extra levels of calls.
+  Real &operator() (MatrixIndexT r, MatrixIndexT c) {
+    KALDI_ASSERT(static_cast<UnsignedMatrixIndexT>(r) <
+                 static_cast<UnsignedMatrixIndexT>(num_rows_) &&
+                 static_cast<UnsignedMatrixIndexT>(c) <
+                 static_cast<UnsignedMatrixIndexT>(num_rows_)
+                 && c <= r);
+    return *(data_ + (r * (r + 1)) / 2 + c);
+  }
+
+  Real Max() const {
+    KALDI_ASSERT(num_rows_ > 0);
+    return * (std::max_element(data_, data_ + ((num_rows_*(num_rows_+1))/2) ));
+  }
+
+  Real Min() const {
+    KALDI_ASSERT(num_rows_ > 0);
+    return * (std::min_element(data_, data_ + ((num_rows_*(num_rows_+1))/2) ));
+  }
+
+  void Scale(Real c);
+
+  friend std::ostream & operator << <> (std::ostream & out,
+                                     const PackedMatrix<Real> &m);
+  // Use instead of stream<<*this, if you want to add to existing contents.
+  // Will throw exception on failure.
+  void Read(std::istream &in, bool binary, bool add = false);
+
+  void Write(std::ostream &out, bool binary) const;
+  
+  void Destroy();
+
+  /// Swaps the contents of *this and *other.  Shallow swap.
+  void Swap(PackedMatrix<Real> *other);
+  void Swap(Matrix<Real> *other);
+
+
+ protected:
+  // Will only be called from this class or derived classes.
+  void AddPacked(const Real alpha, const PackedMatrix<Real>& M);
+  Real *data_;
+  MatrixIndexT num_rows_;
+  //MatrixIndexT stride_;
+ private:
+  /// Init assumes the current contents of the class are is invalid (i.e. junk or
+  /// has already been freed), and it sets the matrixd to newly allocated memory
+  /// with the specified dimension.  dim == 0 is acceptable.  The memory contents
+  /// pointed to by data_ will be undefined.
+  void Init(MatrixIndexT dim);
+
+};
+/// @} end "addtogroup matrix_group"
+
+
+/// \addtogroup matrix_funcs_io
+/// @{
+
+template<typename Real>
+std::ostream & operator << (std::ostream & os, const PackedMatrix<Real>& M) {
+  M.Write(os, false);
+  return os;
+}
+
+template<typename Real>
+std::istream & operator >> (std::istream &is, PackedMatrix<Real> &M) {
+  M.Read(is, false);
+  return is;
+}
+
+/// @}
+
+}  // namespace kaldi
+
+#endif
+
diff --git a/kaldi_io/src/kaldi/matrix/sp-matrix-inl.h b/kaldi_io/src/kaldi/matrix/sp-matrix-inl.h
new file mode 100644
index 0000000..1579592
--- /dev/null
+++ b/kaldi_io/src/kaldi/matrix/sp-matrix-inl.h
@@ -0,0 +1,42 @@
+// matrix/sp-matrix-inl.h
+
+// Copyright 2009-2011  Ondrej Glembek;  Microsoft Corporation;  Haihua Xu
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+//  http://www.apache.org/licenses/LICENSE-2.0
+
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef KALDI_MATRIX_SP_MATRIX_INL_H_
+#define KALDI_MATRIX_SP_MATRIX_INL_H_
+
+#include "matrix/tp-matrix.h"
+
+namespace kaldi {
+
+// All the lines in this file seem to be declaring template specializations.
+// These tell the compiler that we'll implement the templated function
+// separately for the different template arguments (float, double).
+
+template<>
+double SolveQuadraticProblem(const SpMatrix<double> &H, const VectorBase<double> &g,
+                             const SolverOptions &opts, VectorBase<double> *x);
+
+template<>
+float SolveQuadraticProblem(const SpMatrix<float> &H, const VectorBase<float> &g,
+                            const SolverOptions &opts, VectorBase<float> *x);
+
+}  // namespace kaldi
+
+
+#endif  // KALDI_MATRIX_SP_MATRIX_INL_H_
diff --git a/kaldi_io/src/kaldi/matrix/sp-matrix.h b/kaldi_io/src/kaldi/matrix/sp-matrix.h
new file mode 100644
index 0000000..209d24a
--- /dev/null
+++ b/kaldi_io/src/kaldi/matrix/sp-matrix.h
@@ -0,0 +1,524 @@
+// matrix/sp-matrix.h
+
+// Copyright 2009-2011   Ondrej Glembek;  Microsoft Corporation;  Lukas Burget;
+//                       Saarland University;  Ariya Rastrow;  Yanmin Qian;
+//                       Jan Silovsky
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+//  http://www.apache.org/licenses/LICENSE-2.0
+
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+#ifndef KALDI_MATRIX_SP_MATRIX_H_
+#define KALDI_MATRIX_SP_MATRIX_H_
+
+#include <algorithm>
+#include <vector>
+
+#include "matrix/packed-matrix.h"
+
+namespace kaldi {
+
+
+/// \addtogroup matrix_group
+/// @{
+template<typename Real> class SpMatrix;
+
+
+/**
+ * @brief Packed symetric matrix class
+*/
+template<typename Real>
+class SpMatrix : public PackedMatrix<Real> {
+  friend class CuSpMatrix<Real>;
+ public:
+  // so it can use our assignment operator.
+  friend class std::vector<Matrix<Real> >;
+
+  SpMatrix(): PackedMatrix<Real>() {}
+
+  /// Copy constructor from CUDA version of SpMatrix
+  /// This is defined in ../cudamatrix/cu-sp-matrix.h
+  
+  explicit SpMatrix(const CuSpMatrix<Real> &cu);
+ 
+  explicit SpMatrix(MatrixIndexT r, MatrixResizeType resize_type = kSetZero)
+      : PackedMatrix<Real>(r, resize_type) {}
+
+  SpMatrix(const SpMatrix<Real> &orig)
+      : PackedMatrix<Real>(orig) {}
+
+  template<typename OtherReal>
+  explicit SpMatrix(const SpMatrix<OtherReal> &orig)
+      : PackedMatrix<Real>(orig) {}
+
+#ifdef KALDI_PARANOID
+  explicit SpMatrix(const MatrixBase<Real> & orig,
+                    SpCopyType copy_type = kTakeMeanAndCheck)
+      : PackedMatrix<Real>(orig.NumRows(), kUndefined) {
+    CopyFromMat(orig, copy_type);
+  }
+#else
+  explicit SpMatrix(const MatrixBase<Real> & orig,
+                    SpCopyType copy_type = kTakeMean)
+      : PackedMatrix<Real>(orig.NumRows(), kUndefined) {
+    CopyFromMat(orig, copy_type);
+  }
+#endif
+
+  /// Shallow swap.
+  void Swap(SpMatrix *other);
+
+  inline void Resize(MatrixIndexT nRows, MatrixResizeType resize_type = kSetZero) {
+    PackedMatrix<Real>::Resize(nRows, resize_type);
+  }
+
+  void CopyFromSp(const SpMatrix<Real> &other) {
+    PackedMatrix<Real>::CopyFromPacked(other);
+  }
+
+  template<typename OtherReal>
+  void CopyFromSp(const SpMatrix<OtherReal> &other) {
+    PackedMatrix<Real>::CopyFromPacked(other);
+  }
+
+#ifdef KALDI_PARANOID
+  void CopyFromMat(const MatrixBase<Real> &orig,
+                   SpCopyType copy_type = kTakeMeanAndCheck);
+#else  // different default arg if non-paranoid mode.
+  void CopyFromMat(const MatrixBase<Real> &orig,
+                   SpCopyType copy_type = kTakeMean);
+#endif
+
+  inline Real operator() (MatrixIndexT r, MatrixIndexT c) const {
+    // if column is less than row, then swap these as matrix is stored
+    // as upper-triangular...  only allowed for const matrix object.
+    if (static_cast<UnsignedMatrixIndexT>(c) >
+        static_cast<UnsignedMatrixIndexT>(r))
+      std::swap(c, r);
+    // c<=r now so don't have to check c.
+    KALDI_ASSERT(static_cast<UnsignedMatrixIndexT>(r) <
+                 static_cast<UnsignedMatrixIndexT>(this->num_rows_));
+    return *(this->data_ + (r*(r+1)) / 2 + c);
+    // Duplicating code from PackedMatrix.h
+  }
+
+  inline Real &operator() (MatrixIndexT r, MatrixIndexT c) {
+    if (static_cast<UnsignedMatrixIndexT>(c) >
+        static_cast<UnsignedMatrixIndexT>(r))
+      std::swap(c, r);
+    // c<=r now so don't have to check c.
+    KALDI_ASSERT(static_cast<UnsignedMatrixIndexT>(r) <
+                 static_cast<UnsignedMatrixIndexT>(this->num_rows_));
+    return *(this->data_ + (r * (r + 1)) / 2 + c);
+    // Duplicating code from PackedMatrix.h
+  }
+
+  using PackedMatrix<Real>::operator =;
+  using PackedMatrix<Real>::Scale;
+
+  /// matrix inverse.
+  /// if inverse_needed = false, will fill matrix with garbage.
+  /// (only useful if logdet wanted).
+  void Invert(Real *logdet = NULL, Real *det_sign= NULL,
+              bool inverse_needed = true);
+
+  // Below routine does inversion in double precision,
+  // even for single-precision object.
+  void InvertDouble(Real *logdet = NULL, Real *det_sign = NULL,
+                    bool inverse_needed = true);
+
+  /// Returns maximum ratio of singular values.
+  inline Real Cond() const {
+    Matrix<Real> tmp(*this);
+    return tmp.Cond();
+  }
+
+  /// Takes matrix to a fraction power via Svd.
+  /// Will throw exception if matrix is not positive semidefinite
+  /// (to within a tolerance)
+  void ApplyPow(Real exponent);
+
+  /// This is the version of SVD that we implement for symmetric positive
+  /// definite matrices.  This exists for historical reasons; right now its
+  /// internal implementation is the same as Eig().  It computes the eigenvalue
+  /// decomposition (*this) = P * diag(s) * P^T with P orthogonal.  Will throw
+  /// exception if input is not positive semidefinite to within a tolerance.
+  void SymPosSemiDefEig(VectorBase<Real> *s, MatrixBase<Real> *P,
+                        Real tolerance = 0.001) const;
+
+  /// Solves the symmetric eigenvalue problem: at end we should have (*this) = P
+  /// * diag(s) * P^T.  We solve the problem using the symmetric QR method.
+  /// P may be NULL.
+  /// Implemented in qr.cc.
+  /// If you need the eigenvalues sorted, the function SortSvd declared in
+  /// kaldi-matrix is suitable.
+  void Eig(VectorBase<Real> *s, MatrixBase<Real> *P = NULL) const;
+  
+  /// This function gives you, approximately, the largest eigenvalues of the
+  /// symmetric matrix and the corresponding eigenvectors.  (largest meaning,
+  /// further from zero).  It does this by doing a SVD within the Krylov
+  /// subspace generated by this matrix and a random vector.  This is
+  /// a form of the Lanczos method with complete reorthogonalization, followed
+  /// by SVD within a smaller dimension ("lanczos_dim").
+  ///
+  /// If *this is m by m, s should be of dimension n and P should be of
+  /// dimension m by n, with n <= m.  The *columns* of P are the approximate
+  /// eigenvectors; P * diag(s) * P^T would be a low-rank reconstruction of
+  /// *this.  The columns of P will be orthogonal, and the elements of s will be
+  /// the eigenvalues of *this projected into that subspace, but beyond that
+  /// there are no exact guarantees.  (This is because the convergence of this
+  /// method is statistical).  Note: it only makes sense to use this
+  /// method if you are in very high dimension and n is substantially smaller
+  /// than m: for example, if you want the 100 top eigenvalues of a 10k by 10k
+  /// matrix.  This function calls Rand() to initialize the lanczos
+  /// iterations and also for restarting.
+  /// If lanczos_dim is zero, it will default to the greater of:
+  /// s->Dim() + 50 or s->Dim() + s->Dim()/2, but not more than this->Dim().
+  /// If lanczos_dim == this->Dim(), you might as well just call the function
+  /// Eig() since the result will be the same, and Eig() would be faster; the
+  /// whole point of this function is to reduce the dimension of the SVD
+  /// computation.
+  void TopEigs(VectorBase<Real> *s, MatrixBase<Real> *P,
+               MatrixIndexT lanczos_dim = 0) const;
+
+
+  
+  /// Takes log of the matrix (does eigenvalue decomposition then takes
+  /// log of eigenvalues and reconstructs).  Will throw of not +ve definite.
+  void Log();
+
+
+  // Takes exponential of the matrix (equivalent to doing eigenvalue
+  // decomposition then taking exp of eigenvalues and reconstructing).
+  void Exp();
+
+  /// Returns the maximum of the absolute values of any of the
+  /// eigenvalues.
+  Real MaxAbsEig() const;
+
+  void PrintEigs(const char *name) {
+    Vector<Real> s((*this).NumRows());
+    Matrix<Real> P((*this).NumRows(), (*this).NumCols());
+    SymPosSemiDefEig(&s, &P);
+    KALDI_LOG << "PrintEigs: " << name << ": " << s;
+  }
+
+  bool IsPosDef() const;  // returns true if Cholesky succeeds.
+  void AddSp(const Real alpha, const SpMatrix<Real> &Ma) {
+    this->AddPacked(alpha, Ma);
+  }
+
+  /// Computes log determinant but only for +ve-def matrices
+  /// (it uses Cholesky).
+  /// If matrix is not +ve-def, it will throw an exception
+  /// was LogPDDeterminant()
+  Real LogPosDefDet() const;
+
+  Real LogDet(Real *det_sign = NULL) const;
+
+  /// rank-one update, this <-- this + alpha v v'
+  template<typename OtherReal>
+  void AddVec2(const Real alpha, const VectorBase<OtherReal> &v);
+
+  /// rank-two update, this <-- this + alpha (v w' + w v').
+  void AddVecVec(const Real alpha, const VectorBase<Real> &v,
+                 const VectorBase<Real> &w);
+
+  /// Does *this = beta * *thi + alpha * diag(v) * S * diag(v)
+  void AddVec2Sp(const Real alpha, const VectorBase<Real> &v,
+                 const SpMatrix<Real> &S, const Real beta);
+  
+  /// diagonal update, this <-- this + diag(v)
+  template<typename OtherReal>
+  void AddDiagVec(const Real alpha, const VectorBase<OtherReal> &v);
+
+  /// rank-N update:
+  /// if (transM == kNoTrans)
+  /// (*this) = beta*(*this) + alpha * M * M^T,
+  /// or  (if transM == kTrans)
+  ///  (*this) = beta*(*this) + alpha * M^T * M
+  /// Note: beta used to default to 0.0.
+  void AddMat2(const Real alpha, const MatrixBase<Real> &M,
+               MatrixTransposeType transM, const Real beta);
+
+  /// Extension of rank-N update:
+  /// this <-- beta*this  +  alpha * M * A * M^T.
+  /// (*this) and A are allowed to be the same.
+  /// If transM == kTrans, then we do it as M^T * A * M.
+  void AddMat2Sp(const Real alpha, const MatrixBase<Real> &M,
+                 MatrixTransposeType transM, const SpMatrix<Real> &A,
+                 const Real beta = 0.0);
+
+  /// This is a version of AddMat2Sp specialized for when M is fairly sparse.
+  /// This was required for making the raw-fMLLR code efficient.
+  void AddSmat2Sp(const Real alpha, const MatrixBase<Real> &M,
+                  MatrixTransposeType transM, const SpMatrix<Real> &A,
+                  const Real beta = 0.0);
+
+  /// The following function does:
+  /// this <-- beta*this  +  alpha * T * A * T^T.
+  /// (*this) and A are allowed to be the same.
+  /// If transM == kTrans, then we do it as alpha * T^T * A * T.
+  /// Currently it just calls AddMat2Sp, but if needed we
+  /// can implement it more efficiently.
+  void AddTp2Sp(const Real alpha, const TpMatrix<Real> &T,
+                MatrixTransposeType transM, const SpMatrix<Real> &A,
+                const Real beta = 0.0);
+
+  /// The following function does:
+  /// this <-- beta*this  +  alpha * T * T^T.
+  /// (*this) and A are allowed to be the same.
+  /// If transM == kTrans, then we do it as alpha * T^T *  T
+  /// Currently it just calls AddMat2, but if needed we
+  /// can implement it more efficiently.
+  void AddTp2(const Real alpha, const TpMatrix<Real> &T,
+              MatrixTransposeType transM, const Real beta = 0.0);
+
+  /// Extension of rank-N update:
+  /// this <-- beta*this + alpha * M * diag(v) * M^T.
+  /// if transM == kTrans, then
+  /// this <-- beta*this + alpha * M^T * diag(v) * M.
+  void AddMat2Vec(const Real alpha, const MatrixBase<Real> &M,
+                  MatrixTransposeType transM, const VectorBase<Real> &v,
+                  const Real beta = 0.0);
+
+
+  ///  Floors this symmetric matrix to the matrix
+  /// alpha * Floor, where the matrix Floor is positive
+  /// definite.
+  /// It is floored in the sense that after flooring,
+  ///  x^T (*this) x  >= x^T (alpha*Floor) x.
+  /// This is accomplished using an Svd.  It will crash
+  /// if Floor is not positive definite. Returns the number of
+  /// elements that were floored.
+  int ApplyFloor(const SpMatrix<Real> &Floor, Real alpha = 1.0,
+                 bool verbose = false);
+
+  /// Floor: Given a positive semidefinite matrix, floors the eigenvalues
+  /// to the specified quantity.  A previous version of this function had
+  /// a tolerance which is now no longer needed since we have code to
+  /// do the symmetric eigenvalue decomposition and no longer use the SVD
+  /// code for that purose.
+  int ApplyFloor(Real floor);
+  
+  bool IsDiagonal(Real cutoff = 1.0e-05) const;
+  bool IsUnit(Real cutoff = 1.0e-05) const;
+  bool IsZero(Real cutoff = 1.0e-05) const;
+  bool IsTridiagonal(Real cutoff = 1.0e-05) const;
+
+  /// sqrt of sum of square elements.
+  Real FrobeniusNorm() const;
+
+  /// Returns true if ((*this)-other).FrobeniusNorm() <=
+  ///   tol*(*this).FrobeniusNorma()
+  bool ApproxEqual(const SpMatrix<Real> &other, float tol = 0.01) const;
+
+  // LimitCond:
+  // Limits the condition of symmetric positive semidefinite matrix to
+  // a specified value
+  // by flooring all eigenvalues to a positive number which is some multiple
+  // of the largest one (or zero if there are no positive eigenvalues).
+  // Takes the condition number we are willing to accept, and floors
+  // eigenvalues to the largest eigenvalue divided by this.
+  //  Returns #eigs floored or already equal to the floor. 
+  // Throws exception if input is not positive definite.
+  // returns #floored.
+  MatrixIndexT LimitCond(Real maxCond = 1.0e+5, bool invert = false);
+
+  // as LimitCond but all done in double precision. // returns #floored.
+  MatrixIndexT LimitCondDouble(Real maxCond = 1.0e+5, bool invert = false) {
+    SpMatrix<double> dmat(*this);
+    MatrixIndexT ans = dmat.LimitCond(maxCond, invert);
+    (*this).CopyFromSp(dmat);
+    return ans;
+  }
+  Real Trace() const;
+
+  /// Tridiagonalize the matrix with an orthogonal transformation.  If
+  /// *this starts as S, produce T (and Q, if non-NULL) such that
+  /// T = Q A Q^T, i.e. S = Q^T T Q.  Caution: this is the other way
+  /// round from most authors (it's more efficient in row-major indexing).
+  void Tridiagonalize(MatrixBase<Real> *Q);
+
+  /// The symmetric QR algorithm.  This will mostly be useful in internal code.
+  /// Typically, you will call this after Tridiagonalize(), on the same object.
+  /// When called, *this (call it A at this point) must be tridiagonal; at exit,
+  /// *this will be a diagonal matrix D that is similar to A via orthogonal
+  /// transformations.  This algorithm right-multiplies Q by orthogonal
+  /// transformations.  It turns *this from a tridiagonal into a diagonal matrix
+  /// while maintaining that (Q *this Q^T) has the same value at entry and exit.
+  /// At entry Q should probably be either NULL or orthogonal, but we don't check
+  /// this.
+  void Qr(MatrixBase<Real> *Q);
+  
+ private:
+ void EigInternal(VectorBase<Real> *s, MatrixBase<Real> *P,
+                   Real tolerance, int recurse) const;
+};
+
+/// @} end of "addtogroup matrix_group"
+
+/// \addtogroup matrix_funcs_scalar
+/// @{
+
+
+/// Returns tr(A B).
+float TraceSpSp(const SpMatrix<float> &A, const SpMatrix<float> &B);
+double TraceSpSp(const SpMatrix<double> &A, const SpMatrix<double> &B);
+
+
+template<typename Real>
+inline bool ApproxEqual(const SpMatrix<Real> &A,
+                        const SpMatrix<Real> &B, Real tol = 0.01) {
+  return  A.ApproxEqual(B, tol);
+}
+
+template<typename Real>
+inline void AssertEqual(const SpMatrix<Real> &A,
+                        const SpMatrix<Real> &B, Real tol = 0.01) {
+  KALDI_ASSERT(ApproxEqual(A, B, tol));
+}
+
+
+
+/// Returns tr(A B).
+template<typename Real, typename OtherReal>
+Real TraceSpSp(const SpMatrix<Real> &A, const SpMatrix<OtherReal> &B);
+
+
+
+// TraceSpSpLower is the same as Trace(A B) except the lower-diagonal elements
+// are counted only once not twice as they should be.  It is useful in certain
+// optimizations.
+template<typename Real>
+Real TraceSpSpLower(const SpMatrix<Real> &A, const SpMatrix<Real> &B);
+
+
+/// Returns tr(A B).
+/// No option to transpose B because would make no difference.
+template<typename Real>
+Real TraceSpMat(const SpMatrix<Real> &A, const MatrixBase<Real> &B);
+
+/// Returns tr(A B C)
+/// (A and C may be transposed as specified by transA and transC).
+template<typename Real>
+Real TraceMatSpMat(const MatrixBase<Real> &A, MatrixTransposeType transA,
+                   const SpMatrix<Real> &B, const MatrixBase<Real> &C,
+                   MatrixTransposeType transC);
+
+/// Returns tr (A B C D)
+/// (A and C may be transposed as specified by transA and transB).
+template<typename Real>
+Real TraceMatSpMatSp(const MatrixBase<Real> &A, MatrixTransposeType transA,
+                     const SpMatrix<Real> &B, const MatrixBase<Real> &C,
+                     MatrixTransposeType transC, const SpMatrix<Real> &D);
+
+/** Computes v1^T * M * v2.  Not as efficient as it could be where v1 == v2
+ * (but no suitable blas routines available).
+ */
+
+/// Returns \f$ v_1^T M v_2 \f$
+/// Not as efficient as it could be where v1 == v2.
+template<typename Real>
+Real VecSpVec(const VectorBase<Real> &v1, const SpMatrix<Real> &M,
+               const VectorBase<Real> &v2);
+
+
+/// @} \addtogroup matrix_funcs_scalar
+
+/// \addtogroup matrix_funcs_misc
+/// @{
+
+
+/// This class describes the options for maximizing various quadratic objective
+/// functions.  It's mostly as described in the SGMM paper "the subspace
+/// Gaussian mixture model -- a structured model for speech recognition", but
+/// the diagonal_precondition option is newly added, to handle problems where
+/// different dimensions have very different scaling (we recommend to use the
+/// option but it's set false for back compatibility).
+struct SolverOptions {
+  BaseFloat K; // maximum condition number
+  BaseFloat eps; 
+  std::string name;
+  bool optimize_delta;
+  bool diagonal_precondition;
+  bool print_debug_output;
+  explicit SolverOptions(const std::string &name):
+      K(1.0e+4), eps(1.0e-40), name(name),
+      optimize_delta(true), diagonal_precondition(false),
+      print_debug_output(true) { }
+  SolverOptions(): K(1.0e+4), eps(1.0e-40), name("[unknown]"),
+                   optimize_delta(true), diagonal_precondition(false),
+                   print_debug_output(true) { }
+  void Check() const;
+};
+
+
+/// Maximizes the auxiliary function
+/// \f[    Q(x) = x.g - 0.5 x^T H x     \f]
+/// using a numerically stable method. Like a numerically stable version of
+/// \f$  x := Q^{-1} g.    \f$
+/// Assumes H positive semidefinite.
+/// Returns the objective-function change.
+
+template<typename Real>
+Real SolveQuadraticProblem(const SpMatrix<Real> &H,
+                           const VectorBase<Real> &g,
+                           const SolverOptions &opts,
+                           VectorBase<Real> *x);
+                           
+
+
+/// Maximizes the auxiliary function :
+/// \f[   Q(x) = tr(M^T P Y) - 0.5 tr(P M Q M^T)        \f]
+/// Like a numerically stable version of  \f$  M := Y Q^{-1}   \f$.
+/// Assumes Q and P positive semidefinite, and matrix dimensions match
+/// enough to make expressions meaningful.
+/// This is mostly as described in the SGMM paper "the subspace Gaussian mixture
+/// model -- a structured model for speech recognition", but the
+/// diagonal_precondition option is newly added, to handle problems
+/// where different dimensions have very different scaling (we recommend to use
+/// the option but it's set false for back compatibility).
+template<typename Real>
+Real SolveQuadraticMatrixProblem(const SpMatrix<Real> &Q,
+                                 const MatrixBase<Real> &Y,
+                                 const SpMatrix<Real> &P,
+                                 const SolverOptions &opts,
+                                 MatrixBase<Real> *M);
+
+/// Maximizes the auxiliary function :
+/// \f[   Q(M) =  tr(M^T G) -0.5 tr(P_1 M Q_1 M^T) -0.5 tr(P_2 M Q_2 M^T).   \f]
+/// Encountered in matrix update with a prior. We also apply a limit on the
+/// condition but it should be less frequently necessary, and can be set larger.
+template<typename Real>
+Real SolveDoubleQuadraticMatrixProblem(const MatrixBase<Real> &G,
+                                       const SpMatrix<Real> &P1,
+                                       const SpMatrix<Real> &P2,
+                                       const SpMatrix<Real> &Q1,
+                                       const SpMatrix<Real> &Q2,
+                                       const SolverOptions &opts,
+                                       MatrixBase<Real> *M);
+
+
+/// @} End of "addtogroup matrix_funcs_misc"
+
+}  // namespace kaldi
+
+
+// Including the implementation (now actually just includes some
+// template specializations).
+#include "matrix/sp-matrix-inl.h"
+
+
+#endif  // KALDI_MATRIX_SP_MATRIX_H_
+
diff --git a/kaldi_io/src/kaldi/matrix/srfft.h b/kaldi_io/src/kaldi/matrix/srfft.h
new file mode 100644
index 0000000..c0d36af
--- /dev/null
+++ b/kaldi_io/src/kaldi/matrix/srfft.h
@@ -0,0 +1,132 @@
+// matrix/srfft.h
+
+// Copyright 2009-2011  Microsoft Corporation;  Go Vivace Inc.
+//                2014  Daniel Povey
+//
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+//
+// This file includes a modified version of code originally published in Malvar,
+// H., "Signal processing with lapped transforms, " Artech House, Inc., 1992.  The
+// current copyright holder of the original code, Henrique S. Malvar, has given
+// his permission for the release of this modified version under the Apache
+// License v2.0.
+
+#ifndef KALDI_MATRIX_SRFFT_H_
+#define KALDI_MATRIX_SRFFT_H_
+
+#include "matrix/kaldi-vector.h"
+#include "matrix/kaldi-matrix.h"
+
+namespace kaldi {
+
+/// @addtogroup matrix_funcs_misc
+/// @{
+
+
+// This class is based on code by Henrique (Rico) Malvar, from his book
+// "Signal Processing with Lapped Transforms" (1992).  Copied with
+// permission, optimized by Go Vivace Inc., and converted into C++ by
+// Microsoft Corporation
+// This is a more efficient way of doing the complex FFT than ComplexFft
+// (declared in matrix-functios.h), but it only works for powers of 2.
+// Note: in multi-threaded code, you would need to have one of these objects per
+// thread, because multiple calls to Compute in parallel would not work.
+template<typename Real>
+class SplitRadixComplexFft {
+ public:
+  typedef MatrixIndexT Integer;
+
+  // N is the number of complex points (must be a power of two, or this
+  // will crash).  Note that the constructor does some work so it's best to
+  // initialize the object once and do the computation many times.
+  SplitRadixComplexFft(Integer N);
+
+  // Does the FFT computation, given pointers to the real and
+  // imaginary parts.  If "forward", do the forward FFT; else
+  // do the inverse FFT (without the 1/N factor).
+  // xr and xi are pointers to zero-based arrays of size N,
+  // containing the real and imaginary parts
+  // respectively.
+  void Compute(Real *xr, Real *xi, bool forward) const;
+
+  // This version of Compute takes a single array of size N*2,
+  // containing [ r0 im0 r1 im1 ... ].  Otherwise its behavior is  the
+  // same as the version above.
+  void Compute(Real *x, bool forward);
+
+
+  // This version of Compute is const; it operates on an array of size N*2
+  // containing [ r0 im0 r1 im1 ... ], but it uses the argument "temp_buffer" as
+  // temporary storage instead of a class-member variable.  It will allocate it if
+  // needed.
+  void Compute(Real *x, bool forward, std::vector<Real> *temp_buffer) const;
+
+  ~SplitRadixComplexFft();
+
+ protected:
+  // temp_buffer_ is allocated only if someone calls Compute with only one Real*
+  // argument and we need a temporary buffer while creating interleaved data.
+  std::vector<Real> temp_buffer_;
+ private:
+  void ComputeTables();
+  void ComputeRecursive(Real *xr, Real *xi, Integer logn) const;
+  void BitReversePermute(Real *x, Integer logn) const;
+
+  Integer N_;
+  Integer logn_;  // log(N)
+
+  Integer *brseed_;
+  // brseed is Evans' seed table, ref:  (Ref: D. M. W.
+  // Evans, "An improved digit-reversal permutation algorithm ...",
+  // IEEE Trans. ASSP, Aug. 1987, pp. 1120-1125).
+  Real **tab_;       // Tables of butterfly coefficients.
+
+  KALDI_DISALLOW_COPY_AND_ASSIGN(SplitRadixComplexFft);
+};
+
+template<typename Real>
+class SplitRadixRealFft: private SplitRadixComplexFft<Real> {
+ public:
+  SplitRadixRealFft(MatrixIndexT N):  // will fail unless N>=4 and N is a power of 2.
+      SplitRadixComplexFft<Real> (N/2), N_(N) { }
+  
+  /// If forward == true, this function transforms from a sequence of N real points to its complex fourier
+  /// transform; otherwise it goes in the reverse direction.  If you call it
+  /// in the forward and then reverse direction and multiply by 1.0/N, you
+  /// will get back the original data.
+  /// The interpretation of the complex-FFT data is as follows: the array
+  /// is a sequence of complex numbers C_n of length N/2 with (real, im) format,
+  /// i.e. [real0, real_{N/2}, real1, im1, real2, im2, real3, im3, ...].
+  void Compute(Real *x, bool forward);
+
+
+  /// This is as the other Compute() function, but it is a const version that
+  /// uses a user-supplied buffer.
+  void Compute(Real *x, bool forward, std::vector<Real> *temp_buffer) const;
+
+ private:
+  KALDI_DISALLOW_COPY_AND_ASSIGN(SplitRadixRealFft);  
+  int N_;
+};
+
+
+/// @} end of "addtogroup matrix_funcs_misc"
+
+} // end namespace kaldi
+
+
+#endif
+
diff --git a/kaldi_io/src/kaldi/matrix/tp-matrix.h b/kaldi_io/src/kaldi/matrix/tp-matrix.h
new file mode 100644
index 0000000..f43e86c
--- /dev/null
+++ b/kaldi_io/src/kaldi/matrix/tp-matrix.h
@@ -0,0 +1,131 @@
+// matrix/tp-matrix.h
+
+// Copyright 2009-2011  Ondrej Glembek;  Lukas Burget;  Microsoft Corporation;
+//                      Saarland University;  Yanmin Qian;   Haihua Xu
+//                2013  Johns Hopkins Universith (author: Daniel Povey)
+
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+//  http://www.apache.org/licenses/LICENSE-2.0
+
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+#ifndef KALDI_MATRIX_TP_MATRIX_H_
+#define KALDI_MATRIX_TP_MATRIX_H_
+
+
+#include "matrix/packed-matrix.h"
+
+namespace kaldi {
+/// \addtogroup matrix_group
+/// @{
+
+template<typename Real> class TpMatrix;
+
+/// @brief Packed symetric matrix class
+template<typename Real>
+class TpMatrix : public PackedMatrix<Real> {
+  friend class CuTpMatrix<float>;
+  friend class CuTpMatrix<double>;
+ public:
+  TpMatrix() : PackedMatrix<Real>() {}
+  explicit TpMatrix(MatrixIndexT r, MatrixResizeType resize_type = kSetZero)
+      : PackedMatrix<Real>(r, resize_type) {}
+  TpMatrix(const TpMatrix<Real>& orig) : PackedMatrix<Real>(orig) {}
+
+  /// Copy constructor from CUDA TpMatrix
+  /// This is defined in ../cudamatrix/cu-tp-matrix.cc
+  explicit TpMatrix(const CuTpMatrix<Real> &cu);
+  
+  
+  template<typename OtherReal> explicit TpMatrix(const TpMatrix<OtherReal>& orig)
+      : PackedMatrix<Real>(orig) {}
+  
+  Real operator() (MatrixIndexT r, MatrixIndexT c) const {
+    if (static_cast<UnsignedMatrixIndexT>(c) >
+        static_cast<UnsignedMatrixIndexT>(r)) {
+      KALDI_ASSERT(static_cast<UnsignedMatrixIndexT>(c) <
+                   static_cast<UnsignedMatrixIndexT>(this->num_rows_));
+      return 0;
+    }
+    KALDI_ASSERT(static_cast<UnsignedMatrixIndexT>(r) <
+                 static_cast<UnsignedMatrixIndexT>(this->num_rows_));
+    // c<=r now so don't have to check c.
+    return *(this->data_ + (r*(r+1)) / 2 + c);
+    // Duplicating code from PackedMatrix.h
+  }
+
+  Real &operator() (MatrixIndexT r, MatrixIndexT c) {
+    KALDI_ASSERT(static_cast<UnsignedMatrixIndexT>(r) <
+                 static_cast<UnsignedMatrixIndexT>(this->num_rows_));
+    KALDI_ASSERT(static_cast<UnsignedMatrixIndexT>(c) <=
+                 static_cast<UnsignedMatrixIndexT>(r) &&
+                 "you cannot access the upper triangle of TpMatrix using "
+                 "a non-const matrix object.");
+    return *(this->data_ + (r*(r+1)) / 2 + c);
+    // Duplicating code from PackedMatrix.h
+  }
+  // Note: Cholesky may throw std::runtime_error
+  void Cholesky(const SpMatrix<Real>& orig);
+  
+  void Invert();
+
+  // Inverts in double precision.
+  void InvertDouble() {
+    TpMatrix<double> dmat(*this);
+    dmat.Invert();
+    (*this).CopyFromTp(dmat);
+  }
+
+  /// Shallow swap
+  void Swap(TpMatrix<Real> *other);
+
+  /// Returns the determinant of the matrix (product of diagonals)
+  Real Determinant();
+
+  /// CopyFromMat copies the lower triangle of M into *this
+  /// (or the upper triangle, if Trans == kTrans).
+  void CopyFromMat(const MatrixBase<Real> &M,
+                   MatrixTransposeType Trans = kNoTrans);
+
+  /// This is implemented in ../cudamatrix/cu-tp-matrix.cc
+  void CopyFromMat(const CuTpMatrix<Real> &other);
+  
+  /// CopyFromTp copies another triangular matrix into this one.
+  void CopyFromTp(const TpMatrix<Real> &other) {
+    PackedMatrix<Real>::CopyFromPacked(other);
+  }
+
+  template<typename OtherReal> void CopyFromTp(const TpMatrix<OtherReal> &other) {
+    PackedMatrix<Real>::CopyFromPacked(other);
+  }
+
+  /// AddTp does *this += alpha * M.
+  void AddTp(const Real alpha, const TpMatrix<Real> &M) {
+    this->AddPacked(alpha, M);
+  }
+
+  using PackedMatrix<Real>::operator =;
+  using PackedMatrix<Real>::Scale;
+
+  void Resize(MatrixIndexT nRows, MatrixResizeType resize_type = kSetZero) {
+    PackedMatrix<Real>::Resize(nRows, resize_type);
+  }
+};
+
+/// @} end of "addtogroup matrix_group".
+
+}  // namespace kaldi
+
+
+#endif
+