14 files changed, 1396 insertions, 0 deletions
diff --git a/kaldi_seq/.valgrind b/kaldi_seq/.valgrind
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/kaldi_seq/.valgrind
diff --git a/kaldi_seq/Makefile b/kaldi_seq/Makefile
new file mode 100644
index 0000000..e76eea8
--- /dev/null
+++ b/kaldi_seq/Makefile
@@ -0,0 +1,47 @@
+# Set KDIR to the path of your `kaldi-trunk' checkout (Kaldi must be configured with --shared)
+KDIR := /slfs6/users/ymz09/kaldi/
+
+SHELL := /bin/bash
+BUILD_DIR := $(CURDIR)/build
+INC_PATH := $(LUA_BINDIR)/../include/
+OBJS := init.o src/kaldi_mpe.o src/kaldi_mmi.o src/init.o
+LIBS := libkaldiseq.so
+LUA_LIBS := init.lua layer/mpe.lua layer/mmi.lua
+INCLUDE := -I $(LUA_INCDIR) -I $(INC_PATH) -DLUA_USE_APICHECK
+
+SUBDIR := src layer
+OBJ_DIR := $(BUILD_DIR)/objs
+LUA_DIR := $(INST_LUADIR)/kaldi_seq
+KALDIINCLUDE := -I $(KDIR)/tools/ATLAS/include/ -I $(KDIR)/tools/openfst/include/ -I $(KDIR)/src/
+
+OBJS := $(addprefix $(OBJ_DIR)/,$(OBJS))
+LIBS := $(addprefix $(INST_LIBDIR)/,$(LIBS))
+OBJ_SUBDIR := $(addprefix $(OBJ_DIR)/,$(SUBDIR))
+LUA_SUBDIR := $(addprefix $(LUA_DIR)/,$(SUBDIR))
+LUA_LIBS := $(addprefix $(LUA_DIR)/,$(LUA_LIBS))
+LIB_PATH := $(LUA_BINDIR)/../lib
+
+build: $(OBJ_DIR) $(OBJ_SUBDIR) $(OBJS)
+install: $(LUA_DIR) $(LUA_SUBDIR) $(LUA_LIBS) $(LIBS)
+
+include $(KDIR)/src/kaldi.mk
+
+KL1 := -rdynamic -Wl,-rpath=$(KDIR)/tools/openfst/lib -L/usr/local/cuda/lib64 -Wl,-rpath,/usr/local/cuda/lib64 -Wl,-rpath=$(KDIR)/src/lib -L.  -L$(KDIR)/src/nnet/   -L$(KDIR)/src/cudamatrix/   -L$(KDIR)/src/lat/   -L$(KDIR)/src/hmm/   -L$(KDIR)/src/tree/   -L$(KDIR)/src/matrix/   -L$(KDIR)/src/util/   -L$(KDIR)/src/base/   $(KDIR)/src/nnet//libkaldi-nnet.so $(KDIR)/src/cudamatrix//libkaldi-cudamatrix.so $(KDIR)/src/lat//libkaldi-lat.so $(KDIR)/src/hmm//libkaldi-hmm.so $(KDIR)/src/tree//libkaldi-tree.so $(KDIR)/src/matrix//libkaldi-matrix.so $(KDIR)/src/util//libkaldi-util.so $(KDIR)/src/base//libkaldi-base.so   -L$(KDIR)/tools/openfst/lib -lfst /usr/lib/liblapack.so /usr/lib/libcblas.so /usr/lib/libatlas.so /usr/lib/libf77blas.so -lm -lpthread -ldl -lcublas -lcudart   -lkaldi-nnet   -lkaldi-cudamatrix   -lkaldi-lat   -lkaldi-hmm   -lkaldi-tree   -lkaldi-matrix   -lkaldi-util   -lkaldi-base
+
+KL2 := -msse -msse2 -Wall -pthread -DKALDI_DOUBLEPRECISION=0 -DHAVE_POSIX_MEMALIGN -Wno-sign-compare -Wno-unused-local-typedefs -Winit-self -DHAVE_EXECINFO_H=1 -rdynamic -DHAVE_CXXABI_H -DHAVE_ATLAS -I$(KDIR)/tools/ATLAS/include -I$(KDIR)/tools/openfst/include -Wno-sign-compare -g  -fPIC -I/usr/local/cuda/include -L/usr/local/cuda/lib64 -DKALDI_NO_EXPF
+
+$(OBJ_DIR) $(LUA_DIR) $(OBJ_SUBDIR) $(LUA_SUBDIR):
+	-mkdir -p $@
+$(LUA_DIR)/%.lua: %.lua
+	cp $< $@
+$(LIBS): $(OBJ_DIR)/src/kaldi_mpe.o $(OBJ_DIR)/src/kaldi_mmi.o $(OBJ_DIR)/init.o $(OBJ_DIR)/src/init.o
+	gcc -shared -fPIC -o $@ $(OBJ_DIR)/src/kaldi_mpe.o $(OBJ_DIR)/src/kaldi_mmi.o $(OBJ_DIR)/init.o $(OBJ_DIR)/src/init.o -lstdc++ -Wl,-rpath=$(LIB_PATH) -L$(LIB_PATH) -lnervcore -lluaT $(KL1)
+$(OBJ_DIR)/src/kaldi_mpe.o: src/kaldi_mpe.cpp
+	g++ -o $@ -c $< $(KALDIINCLUDE) -g -fPIC $(INCLUDE) $(KL2)
+$(OBJ_DIR)/src/kaldi_mmi.o: src/kaldi_mmi.cpp
+	g++ -o $@ -c $< $(KALDIINCLUDE) -g -fPIC $(INCLUDE) $(KL2)
+$(OBJ_DIR)/%.o: %.c
+	gcc -o $@ -c $< -g $(INCLUDE) -fPIC
+clean: # remove every object file built under OBJ_DIR (the top-level init.o as well as src/*.o)
+	-rm -f $(OBJ_DIR)/*.o $(OBJ_DIR)/src/*.o
+
diff --git a/kaldi_seq/init.c b/kaldi_seq/init.c
new file mode 100644
index 0000000..ed89473
--- /dev/null
+++ b/kaldi_seq/init.c
@@ -0,0 +1,8 @@
+#include "../nerv/common.h"
+#include <stdio.h>
+
+extern void kaldi_seq_init(lua_State *L);
+int luaopen_libkaldiseq(lua_State *L) { /* entry point looked up by require 'libkaldiseq' */
+    kaldi_seq_init(L); /* registers the nerv.KaldiMPE and nerv.KaldiMMI types */
+    return 0; /* no value was pushed for the caller, so report zero results */
+}
diff --git a/kaldi_seq/init.lua b/kaldi_seq/init.lua
new file mode 100644
index 0000000..39f4cb3
--- /dev/null
+++ b/kaldi_seq/init.lua
@@ -0,0 +1,2 @@
+nerv.include('layer/mpe.lua')
+nerv.include('layer/mmi.lua')
diff --git a/kaldi_seq/kaldi_seq-scm-1.rockspec b/kaldi_seq/kaldi_seq-scm-1.rockspec
new file mode 100644
index 0000000..41e34f0
--- /dev/null
+++ b/kaldi_seq/kaldi_seq-scm-1.rockspec
@@ -0,0 +1,36 @@
+package = "kaldi_seq"
+version = "scm-1"
+source = {
+    url = "https://github.com/Nerv-SJTU/nerv-speech.git"
+}
+description = {
+    summary = "Kaldi sequence training support for Nerv",
+    detailed = [[
+    ]],
+    homepage = "https://github.com/Nerv-SJTU/nerv-speech",
+    license = "BSD"
+}
+dependencies = {
+    "nerv >= scm-1",
+    "lua >= 5.1"
+}
+build = {
+    type = "make",
+    build_variables = {
+        CFLAGS="$(CFLAGS)",
+        LIBFLAG="$(LIBFLAG)",
+        LUA_LIBDIR="$(LUA_LIBDIR)",
+        LUA_BINDIR="$(LUA_BINDIR)",
+        LUA_INCDIR="$(LUA_INCDIR)",
+        INST_PREFIX="$(PREFIX)",
+        LUA="$(LUA)",
+    },
+    install_variables = {
+        LUA_BINDIR="$(LUA_BINDIR)",
+        INST_PREFIX="$(PREFIX)",
+        INST_BINDIR="$(BINDIR)",
+        INST_LIBDIR="$(LIBDIR)",
+        INST_LUADIR="$(LUADIR)",
+        INST_CONFDIR="$(CONFDIR)",
+    },
+}
diff --git a/kaldi_seq/layer/mmi.lua b/kaldi_seq/layer/mmi.lua
new file mode 100644
index 0000000..ecc7f48
--- /dev/null
+++ b/kaldi_seq/layer/mmi.lua
@@ -0,0 +1,50 @@
+require 'libkaldiseq'
+-- MMI sequence-training layer; the lattice work is delegated to nerv.KaldiMMI (libkaldiseq).
+local MMILayer = nerv.class("nerv.MMILayer", "nerv.Layer")
+
+-- Copies ids, dimensions and the four Kaldi strings (layer_conf.cmd.{arg,mdl,lat,ali}) onto the layer.
+function MMILayer:__init(id, global_conf, layer_conf)
+    local cmd = layer_conf.cmd
+    self.id, self.gconf = id, global_conf
+    self.dim_in, self.dim_out = layer_conf.dim_in, layer_conf.dim_out
+    self.arg, self.mdl = cmd.arg, cmd.mdl
+    self.lat, self.ali = cmd.lat, cmd.ali
+    self:check_dim_len(2, -1) -- two inputs: nn output and utt key
+end
+
+-- Zeroes the frame counter and builds the Kaldi object; reports the arguments if none came back.
+function MMILayer:init(batch_size)
+    self.total_frames = 0
+    self.kaldi_mmi = nerv.KaldiMMI(self.arg, self.mdl, self.lat, self.ali)
+    if self.kaldi_mmi ~= nil then return end
+    nerv.error("kaldi arguments is expected: %s %s %s %s", self.arg, self.mdl, self.lat, self.ali)
+end
+
+function MMILayer:batch_resize(batch_size)
+    -- intentionally empty
+end
+
+function MMILayer:update(bp_err, input, output)
+    -- intentionally empty: the layer owns no parameters
+end
+
+-- Asks Kaldi whether the utterance input[2] can be trained on; the answer is cached in self.valid.
+function MMILayer:propagate(input, output)
+    self.valid = false
+    local usable = self.kaldi_mmi:check(input[1], input[2])
+    self.valid = usable
+    return usable
+end
+
+-- Writes the derivative computed by Kaldi into next_bp_err[1] and adds the utterance's frames to the total.
+function MMILayer:back_propagate(bp_err, next_bp_err, input, output)
+    if self.valid ~= true then nerv.error("kaldi sequence training back_propagate fail") end
+    local kaldi = self.kaldi_mmi
+    local host_out = input[1]:new_to_host()
+    next_bp_err[1]:copy_fromh(kaldi:calc_diff(host_out, input[2]))
+    self.total_frames = self.total_frames + kaldi:get_num_frames()
+end
+
+function MMILayer:get_params()
+    return nerv.ParamRepo({}) -- empty repository
+end
diff --git a/kaldi_seq/layer/mpe.lua b/kaldi_seq/layer/mpe.lua
new file mode 100644
index 0000000..ec8a8f3
--- /dev/null
+++ b/kaldi_seq/layer/mpe.lua
@@ -0,0 +1,52 @@
+require 'libkaldiseq'
+-- MPE sequence-training layer; the lattice work is delegated to nerv.KaldiMPE (libkaldiseq).
+local MPELayer = nerv.class("nerv.MPELayer", "nerv.Layer")
+
+-- Copies ids, dimensions and the four Kaldi strings (layer_conf.cmd.{arg,mdl,lat,ali}) onto the layer.
+function MPELayer:__init(id, global_conf, layer_conf)
+    local cmd = layer_conf.cmd
+    self.id, self.gconf = id, global_conf
+    self.dim_in, self.dim_out = layer_conf.dim_in, layer_conf.dim_out
+    self.arg, self.mdl = cmd.arg, cmd.mdl
+    self.lat, self.ali = cmd.lat, cmd.ali
+    self:check_dim_len(2, -1) -- two inputs: nn output and utt key
+end
+
+-- Zeroes both running counters and builds the Kaldi object; reports the arguments if none came back.
+function MPELayer:init(batch_size)
+    self.total_correct = 0
+    self.total_frames = 0
+    self.kaldi_mpe = nerv.KaldiMPE(self.arg, self.mdl, self.lat, self.ali)
+    if self.kaldi_mpe ~= nil then return end
+    nerv.error("kaldi arguments is expected: %s %s %s %s", self.arg, self.mdl, self.lat, self.ali)
+end
+
+function MPELayer:batch_resize(batch_size)
+    -- intentionally empty
+end
+
+function MPELayer:update(bp_err, input, output)
+    -- intentionally empty: the layer owns no parameters
+end
+
+-- Stores the result of the Kaldi-side check for utterance input[2] in self.valid and returns it.
+function MPELayer:propagate(input, output)
+    self.valid = false
+    local usable = self.kaldi_mpe:check(input[1], input[2])
+    self.valid = usable
+    return usable
+end
+
+-- Writes the derivative computed by Kaldi into next_bp_err[1], then updates the frame and accuracy totals.
+function MPELayer:back_propagate(bp_err, next_bp_err, input, output)
+    if self.valid ~= true then nerv.error("kaldi sequence training back_propagate fail") end
+    local kaldi = self.kaldi_mpe
+    local host_out = input[1]:new_to_host()
+    next_bp_err[1]:copy_fromh(kaldi:calc_diff(host_out, input[2]))
+    self.total_frames = self.total_frames + kaldi:get_num_frames()
+    self.total_correct = self.total_correct + kaldi:get_utt_frame_acc()
+end
+
+function MPELayer:get_params()
+    return nerv.ParamRepo({}) -- empty repository
+end
diff --git a/kaldi_seq/src/init.c b/kaldi_seq/src/init.c
new file mode 100644
index 0000000..9b38056
--- /dev/null
+++ b/kaldi_seq/src/init.c
@@ -0,0 +1,131 @@
+#include "nerv/common.h"
+#include "kaldi_mpe.h"
+#include "kaldi_mmi.h"
+#include <stdio.h>
+
+const char *nerv_kaldi_mpe_tname = "nerv.KaldiMPE";
+const char *nerv_kaldi_mmi_tname = "nerv.KaldiMMI";
+const char *nerv_matrix_cuda_float_tname = "nerv.CuMatrixFloat";
+const char *nerv_matrix_host_float_tname = "nerv.MMatrixFloat";
+
+/* Constructor handed to luaT_newmetatable in mpe_init: takes four strings, returns a nerv.KaldiMPE. */
+static int mpe_new(lua_State *L) {
+    const char *opt_str = luaL_checkstring(L, 1);
+    const char *model_spec = luaL_checkstring(L, 2);
+    const char *lat_spec = luaL_checkstring(L, 3);
+    const char *ali_spec = luaL_checkstring(L, 4);
+    luaT_pushudata(L, new_KaldiMPE(opt_str, model_spec, lat_spec, ali_spec), nerv_kaldi_mpe_tname);
+    return 1;
+}
+
+static int mpe_destroy(lua_State *L) { /* destructor handed to luaT_newmetatable in mpe_init */
+    KaldiMPE *self = luaT_checkudata(L, 1, nerv_kaldi_mpe_tname);
+    destroy_KaldiMPE(self); /* release the C++ side of the object */
+    return 0; /* nothing goes back to Lua */
+}
+
+/* Lua method check(cumat, utt): pushes check_mpe()'s result as a boolean. */
+static int mpe_check(lua_State *L) {
+    KaldiMPE *self = luaT_checkudata(L, 1, nerv_kaldi_mpe_tname);
+    const Matrix *nn_out = luaT_checkudata(L, 2, nerv_matrix_cuda_float_tname);
+    const char *utt_key = luaL_checkstring(L, 3);
+    lua_pushboolean(L, check_mpe(self, nn_out, utt_key));
+    return 1;
+}
+
+static int mpe_calc_diff(lua_State *L) { /* Lua method calc_diff(host_mat, utt) -> nerv.MMatrixFloat */
+    KaldiMPE *mpe = luaT_checkudata(L, 1, nerv_kaldi_mpe_tname);
+    Matrix *mat = luaT_checkudata(L, 2, nerv_matrix_host_float_tname);
+    const char *utt = luaL_checkstring(L, 3);
+    Matrix *diff = calc_diff_mpe(mpe, mat, utt);
+    /* If the result is the argument itself, return the existing userdata: a second wrapper around the same Matrix* would presumably be finalized twice (TODO confirm against nerv's matrix destructor). */
+    if (diff == mat) lua_pushvalue(L, 2);
+    else luaT_pushudata(L, diff, nerv_matrix_host_float_tname);
+    return 1;
+}
+
+static int mpe_get_num_frames(lua_State *L) { /* Lua method get_num_frames() -> number */
+    KaldiMPE *self = luaT_checkudata(L, 1, nerv_kaldi_mpe_tname);
+    lua_pushnumber(L, get_num_frames_mpe(self)); /* value comes straight from the C++ object */
+    return 1;
+}
+
+static int mpe_get_utt_frame_acc(lua_State *L) { /* Lua method get_utt_frame_acc() -> number */
+    KaldiMPE *self = luaT_checkudata(L, 1, nerv_kaldi_mpe_tname);
+    lua_pushnumber(L, get_utt_frame_acc_mpe(self)); /* value comes straight from the C++ object */
+    return 1;
+}
+
+static const luaL_Reg mpe_methods[] = {
+    {"check", mpe_check},
+    {"calc_diff", mpe_calc_diff},
+    {"get_num_frames", mpe_get_num_frames},
+    {"get_utt_frame_acc", mpe_get_utt_frame_acc},
+    {NULL, NULL}
+};
+
+/* Creates the nerv.KaldiMPE metatable (constructor mpe_new, destructor mpe_destroy) and adds its methods. */
+static void mpe_init(lua_State *L) {
+    luaT_newmetatable(L, nerv_kaldi_mpe_tname, NULL, mpe_new, mpe_destroy, NULL);
+    luaL_register(L, NULL, mpe_methods); /* NULL libname: register into the table on top of the stack */
+    lua_pop(L, 1); /* remove that table again */
+}
+
+/* Constructor handed to luaT_newmetatable in mmi_init: takes four strings, returns a nerv.KaldiMMI. */
+static int mmi_new(lua_State *L) {
+    const char *opt_str = luaL_checkstring(L, 1);
+    const char *model_spec = luaL_checkstring(L, 2);
+    const char *lat_spec = luaL_checkstring(L, 3);
+    const char *ali_spec = luaL_checkstring(L, 4);
+    luaT_pushudata(L, new_KaldiMMI(opt_str, model_spec, lat_spec, ali_spec), nerv_kaldi_mmi_tname);
+    return 1;
+}
+
+static int mmi_destroy(lua_State *L) { /* destructor handed to luaT_newmetatable in mmi_init */
+    KaldiMMI *self = luaT_checkudata(L, 1, nerv_kaldi_mmi_tname);
+    destroy_KaldiMMI(self); /* release the C++ side of the object */
+    return 0; /* nothing goes back to Lua */
+}
+
+/* Lua method check(cumat, utt): pushes check_mmi()'s 0/1 result as a boolean. */
+static int mmi_check(lua_State *L) {
+    KaldiMMI *self = luaT_checkudata(L, 1, nerv_kaldi_mmi_tname);
+    const Matrix *nn_out = luaT_checkudata(L, 2, nerv_matrix_cuda_float_tname);
+    const char *utt_key = luaL_checkstring(L, 3);
+    lua_pushboolean(L, check_mmi(self, nn_out, utt_key));
+    return 1;
+}
+
+static int mmi_calc_diff(lua_State *L) { /* Lua method calc_diff(host_mat, utt) -> nerv.MMatrixFloat */
+    KaldiMMI *mmi = luaT_checkudata(L, 1, nerv_kaldi_mmi_tname);
+    Matrix *mat = luaT_checkudata(L, 2, nerv_matrix_host_float_tname);
+    const char *utt = luaL_checkstring(L, 3);
+    Matrix *diff = calc_diff_mmi(mmi, mat, utt);
+    /* calc_diff_mmi returns its argument; return the existing userdata, since a second wrapper around the same Matrix* would presumably be finalized twice (TODO confirm against nerv's matrix destructor). */
+    if (diff == mat) lua_pushvalue(L, 2);
+    else luaT_pushudata(L, diff, nerv_matrix_host_float_tname);
+    return 1;
+}
+
+static int mmi_get_num_frames(lua_State *L) { /* Lua method get_num_frames() -> number */
+    KaldiMMI *self = luaT_checkudata(L, 1, nerv_kaldi_mmi_tname);
+    lua_pushnumber(L, get_num_frames_mmi(self)); /* value comes straight from the C++ object */
+    return 1;
+}
+
+static const luaL_Reg mmi_methods[] = {
+    {"check", mmi_check},
+    {"calc_diff", mmi_calc_diff},
+    {"get_num_frames", mmi_get_num_frames},
+    {NULL, NULL}
+};
+
+/* Creates the nerv.KaldiMMI metatable (constructor mmi_new, destructor mmi_destroy) and adds its methods. */
+static void mmi_init(lua_State *L) {
+    luaT_newmetatable(L, nerv_kaldi_mmi_tname, NULL, mmi_new, mmi_destroy, NULL);
+    luaL_register(L, NULL, mmi_methods); /* NULL libname: register into the table on top of the stack */
+    lua_pop(L, 1); /* remove that table again */
+}
+
+void kaldi_seq_init(lua_State *L) { /* called from luaopen_libkaldiseq in the top-level init.c */
+    mpe_init(L); /* registers nerv.KaldiMPE */
+    mmi_init(L); /* registers nerv.KaldiMMI */
+}
diff --git a/kaldi_seq/src/kaldi_mmi.cpp b/kaldi_seq/src/kaldi_mmi.cpp
new file mode 100644
index 0000000..ea9b4f1
--- /dev/null
+++ b/kaldi_seq/src/kaldi_mmi.cpp
@@ -0,0 +1,427 @@
+#include <string>
+#include "base/kaldi-common.h"
+#include "util/common-utils.h"
+#include "tree/context-dep.h"
+#include "hmm/transition-model.h"
+#include "fstext/fstext-lib.h"
+#include "decoder/faster-decoder.h"
+#include "decoder/decodable-matrix.h"
+#include "lat/kaldi-lattice.h"
+#include "lat/lattice-functions.h"
+
+#include "nnet/nnet-trnopts.h"
+#include "nnet/nnet-component.h"
+#include "nnet/nnet-activation.h"
+#include "nnet/nnet-nnet.h"
+#include "nnet/nnet-pdf-prior.h"
+#include "nnet/nnet-utils.h"
+#include "base/timer.h"
+#include "cudamatrix/cu-device.h"
+
+#include <iomanip>
+
+typedef kaldi::BaseFloat BaseFloat;
+typedef struct Matrix NervMatrix;
+
+namespace kaldi{
+    namespace nnet1{
+        void LatticeAcousticRescore(const kaldi::Matrix<BaseFloat> &log_like,
+                const TransitionModel &trans_model,
+                const std::vector<int32> &state_times,
+                Lattice *lat);
+    }
+}
+
+extern "C" {
+#include "kaldi_mmi.h"
+#include "string.h"
+#include "assert.h"
+#include "nerv/common.h"
+
+    extern NervMatrix *nerv_matrix_host_float_create(long nrow, long ncol, Status *status);
+    extern void nerv_matrix_host_float_copy_fromd(NervMatrix *mat, const NervMatrix *cumat, int, int, int, Status *);
+    using namespace kaldi;
+    using namespace kaldi::nnet1;
+    typedef kaldi::int32 int32;
+
+    struct KaldiMMI { // per-layer state shared by new_KaldiMMI, check_mmi and calc_diff_mmi
+        TransitionModel *trans_model; // read from the transition-model file in new_KaldiMMI
+        RandomAccessLatticeReader *den_lat_reader; // opened on the den-lat rspecifier
+        RandomAccessInt32VectorReader *ref_ali_reader; // opened on the ali rspecifier
+
+        Lattice den_lat; // loaded by check_mmi for the current utterance, rescored by calc_diff_mmi
+        vector<int32> state_times; // filled by LatticeStateTimes() in check_mmi
+
+        PdfPriorOptions *prior_opts; // registered with the option parser in new_KaldiMMI
+        PdfPrior *log_prior; // built from prior_opts
+
+        std::vector<int32> ref_ali; // reference alignment loaded by check_mmi
+
+        Timer *time; // created in new_KaldiMMI
+        double time_now; // time->Elapsed(), sampled every 100 utterances in calc_diff_mmi
+
+        int32 num_done, num_no_ref_ali, num_no_den_lat, num_other_error; // utterance counters
+        int32 num_frm_drop; // frames zeroed by drop_frames so far
+
+        kaldi::int64 total_frames; // sum of num_frames over processed utterances
+        double lat_like; // total likelihood of the lattice
+        double lat_ac_like; // acoustic likelihood weighted by posterior.
+        double total_mmi_obj, mmi_obj; // running total / last utterance
+        double total_post_on_ali, post_on_ali; // running total / last utterance
+
+        int32 num_frames; // row count of the matrix last passed to calc_diff_mmi
+
+        bool binary; // --binary option
+        BaseFloat acoustic_scale, lm_scale, old_acoustic_scale; // --acoustic-scale, --lm-scale, --old-acoustic-scale
+        kaldi::int32 max_frames; // --max-frames option; longer utterances are rejected by check_mmi
+        bool drop_frames; // --drop-frames option
+        std::string use_gpu; // --use-gpu option
+    };
+
+    KaldiMMI * new_KaldiMMI(const char* arg, const char* mdl, const char* lat, const char* ali)
+    {
+        KaldiMMI * mmi = new KaldiMMI;
+        // arg: space-separated option string; mdl/lat/ali: transition model, den-lattice and alignment specifiers.
+        const char *usage =
+            "Perform one iteration of DNN-MMI training by stochastic "
+            "gradient descent.\n"
+            "The network weights are updated on each utterance.\n"
+            "Usage:  nnet-train-mmi-sequential [options] <model-in> <transition-model-in> "
+            "<feature-rspecifier> <den-lat-rspecifier> <ali-rspecifier> [<model-out>]\n"
+            "e.g.: \n"
+            " nnet-train-mmi-sequential nnet.init trans.mdl scp:train.scp scp:denlats.scp ark:train.ali "
+            "nnet.iter1\n";
+
+        ParseOptions po(usage);
+
+        NnetTrainOptions trn_opts; trn_opts.learn_rate=0.00001;
+        trn_opts.Register(&po);
+
+        mmi->binary = true;
+        po.Register("binary", &(mmi->binary), "Write output in binary mode");
+
+        std::string feature_transform;
+        po.Register("feature-transform", &feature_transform,
+                "Feature transform in Nnet format");
+
+        mmi->prior_opts = new PdfPriorOptions;
+        PdfPriorOptions &prior_opts = *(mmi->prior_opts);
+        prior_opts.Register(&po);
+
+        mmi->acoustic_scale = 1.0,
+            mmi->lm_scale = 1.0,
+            mmi->old_acoustic_scale = 0.0;
+        po.Register("acoustic-scale", &(mmi->acoustic_scale),
+                "Scaling factor for acoustic likelihoods");
+        po.Register("lm-scale", &(mmi->lm_scale),
+                "Scaling factor for \"graph costs\" (including LM costs)");
+        po.Register("old-acoustic-scale", &(mmi->old_acoustic_scale),
+                "Add in the scores in the input lattices with this scale, rather "
+                "than discarding them.");
+        mmi->max_frames = 6000; // Allow segments maximum of one minute by default
+        po.Register("max-frames",&(mmi->max_frames), "Maximum number of frames a segment can have to be processed");
+
+        mmi->drop_frames = true;
+        po.Register("drop-frames", &(mmi->drop_frames),
+                "Drop frames, where is zero den-posterior under numerator path "
+                "(ie. path not in lattice)");
+
+        mmi->use_gpu=std::string("yes");
+        po.Register("use-gpu", &(mmi->use_gpu), "yes|no|optional, only has effect if compiled with CUDA");
+
+        // Assemble the argv that nnet-train-mmi-sequential would receive.  Every piece lives
+        // in a std::string, so long option strings or rspecifiers cannot overrun a fixed buffer.
+        std::vector<std::string> args;
+        args.push_back("nnet-train-mmi-sequential");
+        std::istringstream arg_stream(arg);
+        std::string token;
+        while (std::getline(arg_stream, token, ' '))
+            if (!token.empty()) // skip runs of spaces, as strtok_r did
+                args.push_back(token);
+        args.push_back("0.nnet"); // placeholder for <model-in>, never read back
+        args.push_back(mdl);
+        args.push_back("feat");   // placeholder for <feature-rspecifier>, never read back
+        args.push_back(lat);
+        args.push_back(ali);
+        args.push_back("1.nnet"); // placeholder for <model-out>, never read back
+
+        std::vector<const char *> argsv;
+        for (size_t i = 0; i < args.size(); i++)
+            argsv.push_back(args[i].c_str());
+
+        po.Read(static_cast<int>(argsv.size()), &argsv[0]);
+
+        if (po.NumArgs() != 6) {
+            po.PrintUsage();
+            exit(1);
+        }
+
+        std::string transition_model_filename = po.GetArg(2),
+            den_lat_rspecifier = po.GetArg(4),
+            ref_ali_rspecifier = po.GetArg(5);
+
+        // Select the GPU
+#if HAVE_CUDA == 1
+        CuDevice::Instantiate().SelectGpuId(mmi->use_gpu);
+#endif
+
+        // Read the class-frame-counts, compute priors
+        mmi->log_prior = new PdfPrior(prior_opts);
+
+        // Read transition model
+        mmi->trans_model = new TransitionModel;
+        ReadKaldiObject(transition_model_filename, mmi->trans_model);
+
+        mmi->den_lat_reader = new RandomAccessLatticeReader(den_lat_rspecifier);
+        mmi->ref_ali_reader = new RandomAccessInt32VectorReader(ref_ali_rspecifier);
+
+        if (mmi->drop_frames) {
+            KALDI_LOG << "--drop-frames=true :"
+                " we will zero gradient for frames with total den/num mismatch."
+                " The mismatch is likely to be caused by missing correct path "
+                " from den-lattice due wrong annotation or search error."
+                " Leaving such frames out stabilizes the training.";
+        }
+
+        mmi->time = new Timer;
+        mmi->time_now = 0;
+        mmi->num_done =0;
+        mmi->num_no_ref_ali = 0;
+        mmi->num_no_den_lat = 0;
+        mmi->num_other_error = 0;
+        mmi->total_frames = 0;
+        mmi->num_frm_drop = 0;
+
+        mmi->total_mmi_obj = 0.0, mmi->mmi_obj = 0.0;
+        mmi->total_post_on_ali = 0.0, mmi->post_on_ali = 0.0;
+        mmi->lat_like = 0.0, mmi->lat_ac_like = 0.0; // previously left uninitialized
+        mmi->num_frames = 0; // so get_num_frames_mmi() is defined before the first calc_diff_mmi()
+        return mmi;
+    }
+
+    void destroy_KaldiMMI(KaldiMMI *mmi) { // frees everything new_KaldiMMI allocated, including mmi itself
+        delete mmi->trans_model;
+        delete mmi->den_lat_reader;
+        delete mmi->ref_ali_reader;
+        delete mmi->time;
+        delete mmi->prior_opts;
+        delete mmi->log_prior;
+        delete mmi; // the object was created with `new KaldiMMI` and was never released before
+    }
+
+    int check_mmi(KaldiMMI *mmi, const NervMatrix* mat, const char *key)
+    {   // Returns 1 if utterance `key` has an alignment and a lattice whose lengths equal mat->nrow (both get cached in mmi), else 0.
+        std::string utt(key);
+        if (!mmi->den_lat_reader->HasKey(utt)) {
+            KALDI_WARN << "Utterance " << utt << ": found no lattice.";
+            mmi->num_no_den_lat++;
+            return 0;
+        }
+        if (!mmi->ref_ali_reader->HasKey(utt)) {
+            KALDI_WARN << "Utterance " << utt << ": found no reference alignment.";
+            mmi->num_no_ref_ali++;
+            return 0;
+        }
+
+        assert(sizeof(BaseFloat) == sizeof(float));
+        // 1) get the numerator alignment
+        mmi->ref_ali = mmi->ref_ali_reader->Value(utt);
+        long mat_nrow = mat->nrow; // only the row count is needed here (unused mat_ncol removed)
+        // check for temporal length of numerator alignments
+        if (static_cast<MatrixIndexT>(mmi->ref_ali.size()) != mat_nrow) {
+            KALDI_WARN << "Numerator alignment has wrong length "
+                << mmi->ref_ali.size() << " vs. "<< mat_nrow;
+            mmi->num_other_error++;
+            return 0;
+        }
+        if (mat_nrow > mmi->max_frames) {
+            KALDI_WARN << "Utterance " << utt << ": Skipped because it has " << mat_nrow <<
+                " frames, which is more than " << mmi->max_frames << ".";
+            mmi->num_other_error++;
+            return 0;
+        }
+        // 2) get the denominator lattice, preprocess
+        mmi->den_lat = mmi->den_lat_reader->Value(utt);
+        Lattice &den_lat = mmi->den_lat;
+        if (den_lat.Start() == -1) {
+            KALDI_WARN << "Empty lattice for utt " << utt;
+            mmi->num_other_error++;
+            return 0;
+        }
+        if (mmi->old_acoustic_scale != 1.0) {
+            fst::ScaleLattice(fst::AcousticLatticeScale(mmi->old_acoustic_scale),
+                    &den_lat);
+        }
+        // sort it topologically unless it is already flagged as sorted
+        kaldi::uint64 props = den_lat.Properties(fst::kFstProperties, false);
+        if (!(props & fst::kTopSorted)) {
+            if (fst::TopSort(&den_lat) == false)
+                KALDI_ERR << "Cycles detected in lattice.";
+        }
+        // get the lattice length and times of states
+        mmi->state_times.clear();
+        vector<int32> &state_times = mmi->state_times;
+        int32 max_time = kaldi::LatticeStateTimes(den_lat, &state_times);
+        // check for temporal length of denominator lattices
+        if (max_time != mat_nrow) {
+            KALDI_WARN << "Denominator lattice has wrong length "
+                << max_time << " vs. " << mat_nrow;
+            mmi->num_other_error++;
+            return 0;
+        }
+
+        return 1;
+    }
+
+    NervMatrix * calc_diff_mmi(KaldiMMI * mmi, NervMatrix * mat, const char * key)
+    {   // Overwrites mat in place with the MMI derivative for utterance `key` and returns mat; uses the ref_ali/den_lat/state_times that check_mmi() stored in mmi.
+        std::string utt(key);
+        assert(sizeof(BaseFloat) == sizeof(float));
+
+        kaldi::Matrix<BaseFloat> nnet_out_h, nnet_diff_h;
+        nnet_out_h.Resize(mat->nrow, mat->ncol, kUndefined);
+
+        size_t stride = mat->stride; // added to a char* below, i.e. used as a byte offset between rows
+        for (int i = 0; i < mat->nrow; i++)
+        {
+            const BaseFloat *nerv_row = (BaseFloat *)((char *)mat->data.f + i * stride);
+            BaseFloat *row = nnet_out_h.RowData(i);
+            memmove(row, nerv_row, sizeof(BaseFloat) * mat->ncol);
+        }
+
+        mmi->num_frames = nnet_out_h.NumRows();
+
+        PdfPriorOptions &prior_opts = *(mmi->prior_opts);
+        if (prior_opts.class_frame_counts != "") { // prior subtraction only when class_frame_counts is set
+            CuMatrix<BaseFloat> nnet_out;
+            nnet_out.Resize(mat->nrow, mat->ncol, kUndefined);
+            nnet_out.CopyFromMat(nnet_out_h);
+            mmi->log_prior->SubtractOnLogpost(&nnet_out);
+            nnet_out.CopyToMat(&nnet_out_h);
+            nnet_out.Resize(0,0);
+        }
+
+        // 4) rescore the lattice
+        LatticeAcousticRescore(nnet_out_h, *(mmi->trans_model), mmi->state_times, &(mmi->den_lat));
+        if (mmi->acoustic_scale != 1.0 || mmi->lm_scale != 1.0)
+            fst::ScaleLattice(fst::LatticeScale(mmi->lm_scale, mmi->acoustic_scale), &(mmi->den_lat));
+
+        kaldi::Posterior post;
+        mmi->lat_like = kaldi::LatticeForwardBackward(mmi->den_lat, &post, &(mmi->lat_ac_like));
+
+        nnet_diff_h.Resize(mat->nrow, mat->ncol, kSetZero); // accumulates lattice posteriors per (frame, pdf)
+        for (int32 t = 0; t < post.size(); t++) {
+            for (int32 arc = 0; arc < post[t].size(); arc++) {
+                int32 pdf = mmi->trans_model->TransitionIdToPdf(post[t][arc].first);
+                nnet_diff_h(t, pdf) += post[t][arc].second;
+            }
+        }
+
+        double path_ac_like = 0.0; // sum of network outputs along the reference alignment
+        for(int32 t=0; t<mmi->num_frames; t++) {
+            int32 pdf = mmi->trans_model->TransitionIdToPdf(mmi->ref_ali[t]);
+            path_ac_like += nnet_out_h(t,pdf);
+        }
+        path_ac_like *= mmi->acoustic_scale;
+        mmi->mmi_obj = path_ac_like - mmi->lat_like;
+
+        mmi->post_on_ali = 0.0; // lattice posterior mass that falls on the reference alignment
+        for(int32 t=0; t<mmi->num_frames; t++) {
+            int32 pdf = mmi->trans_model->TransitionIdToPdf(mmi->ref_ali[t]);
+            double posterior = nnet_diff_h(t, pdf);
+            mmi->post_on_ali += posterior;
+        }
+
+        KALDI_VLOG(1) << "Lattice #" << mmi->num_done + 1 << " processed"
+            << " (" << utt << "): found " << mmi->den_lat.NumStates()
+            << " states and " << fst::NumArcs(mmi->den_lat) << " arcs.";
+
+        KALDI_VLOG(1) << "Utterance " << utt << ": Average MMI obj. value = "
+            << (mmi->mmi_obj/mmi->num_frames) << " over " << mmi->num_frames
+            << " frames,"
+            << " (Avg. den-posterior on ali " << mmi->post_on_ali/mmi->num_frames << ")";
+
+        // 7a) Search for the frames with num/den mismatch
+        int32 frm_drop = 0;
+        std::vector<int32> frm_drop_vec;
+        for(int32 t=0; t<mmi->num_frames; t++) {
+            int32 pdf = mmi->trans_model->TransitionIdToPdf(mmi->ref_ali[t]);
+            double posterior = nnet_diff_h(t, pdf);
+            if(posterior < 1e-20) {
+                frm_drop++;
+                frm_drop_vec.push_back(t);
+            }
+        }
+
+        // 8) subtract the pdf-Viterbi-path
+        for(int32 t=0; t<nnet_diff_h.NumRows(); t++) {
+            int32 pdf = mmi->trans_model->TransitionIdToPdf(mmi->ref_ali[t]);
+            nnet_diff_h(t, pdf) -= 1.0;
+        }
+
+        // 9) Drop mismatched frames from the training by zeroing the derivative
+        if(mmi->drop_frames) {
+            for(int32 i=0; i<frm_drop_vec.size(); i++) {
+                nnet_diff_h.Row(frm_drop_vec[i]).Set(0.0);
+            }
+            mmi->num_frm_drop += frm_drop;
+        }
+
+        // Report the frame dropping
+        if (frm_drop > 0) {
+            std::stringstream ss;
+            ss << (mmi->drop_frames?"Dropped":"[dropping disabled] Would drop")
+                << " frames in " << utt << " " << frm_drop << "/" << mmi->num_frames << ",";
+            // get frame intervals from frm_drop_vec
+            ss << " intervals :";
+            // search for streaks of consecutive frame numbers:
+            int32 beg_streak=frm_drop_vec[0];
+            int32 len_streak=0;
+            int32 i;
+            for(i=0; i<frm_drop_vec.size(); i++,len_streak++) {
+                if(beg_streak + len_streak != frm_drop_vec[i]) {
+                    ss << " " << beg_streak << ".." << frm_drop_vec[i-1] << "frm";
+                    beg_streak = frm_drop_vec[i];
+                    len_streak = 0;
+                }
+            }
+            ss << " " << beg_streak << ".." << frm_drop_vec[i-1] << "frm";
+            // emit the collected intervals as one warning
+            KALDI_WARN << ss.str();
+        }
+
+        assert(mat->nrow == nnet_diff_h.NumRows() && mat->ncol == nnet_diff_h.NumCols());
+        stride = mat->stride; // copy the derivative back over the input matrix, row by row
+        for (int i = 0; i < mat->nrow; i++)
+        {
+            const BaseFloat *row = nnet_diff_h.RowData(i);
+            BaseFloat *nerv_row = (BaseFloat *)((char *)mat->data.f + i * stride);
+            memmove(nerv_row, row, sizeof(BaseFloat) * mat->ncol);
+        }
+        nnet_diff_h.Resize(0,0);
+
+        // update the running totals
+        mmi->total_mmi_obj += mmi->mmi_obj;
+        mmi->total_post_on_ali += mmi->post_on_ali;
+        mmi->total_frames += mmi->num_frames;
+        mmi->num_done++;
+
+        if (mmi->num_done % 100 == 0) {
+            mmi->time_now = mmi->time->Elapsed();
+            KALDI_VLOG(1) << "After " << mmi->num_done << " utterances: time elapsed = "
+                << mmi->time_now/60 << " min; processed " << mmi->total_frames/mmi->time_now
+                << " frames per second.";
+#if HAVE_CUDA==1
+            // check the GPU is not overheated
+            CuDevice::Instantiate().CheckGpuHealth();
+#endif
+        }
+        return mat;
+    }
+
+    double get_num_frames_mmi(const KaldiMMI *mmi)
+    {   // Row count recorded by the most recent calc_diff_mmi() call, widened to double.
+        return static_cast<double>(mmi->num_frames);
+    }
+
+}
diff --git a/kaldi_seq/src/kaldi_mmi.h b/kaldi_seq/src/kaldi_mmi.h
new file mode 100644
index 0000000..ce6787c
--- /dev/null
+++ b/kaldi_seq/src/kaldi_mmi.h
@@ -0,0 +1,20 @@
+#ifndef NERV_kaldi_KALDI_MMI
+#define NERV_kaldi_KALDI_MMI
+#include "nerv/matrix/matrix.h"
+#include "nerv/common.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+    typedef struct KaldiMMI KaldiMMI;
+
+    KaldiMMI * new_KaldiMMI(const char*, const char*, const char*, const char*);
+    void destroy_KaldiMMI(KaldiMMI *);
+    int check_mmi(KaldiMMI *, const Matrix*, const char *);
+    Matrix * calc_diff_mmi(KaldiMMI *, Matrix *, const char *);
+    double get_num_frames_mmi(const KaldiMMI *);
+
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/kaldi_seq/src/kaldi_mpe.cpp b/kaldi_seq/src/kaldi_mpe.cpp
new file mode 100644
index 0000000..60384e2
--- /dev/null
+++ b/kaldi_seq/src/kaldi_mpe.cpp
@@ -0,0 +1,411 @@
+#include <string>
+#include "base/kaldi-common.h"
+#include "util/common-utils.h"
+#include "tree/context-dep.h"
+#include "hmm/transition-model.h"
+#include "fstext/fstext-lib.h"
+#include "decoder/faster-decoder.h"
+#include "decoder/decodable-matrix.h"
+#include "lat/kaldi-lattice.h"
+#include "lat/lattice-functions.h"
+
+#include "nnet/nnet-trnopts.h"
+#include "nnet/nnet-component.h"
+#include "nnet/nnet-activation.h"
+#include "nnet/nnet-nnet.h"
+#include "nnet/nnet-pdf-prior.h"
+#include "nnet/nnet-utils.h"
+#include "base/timer.h"
+#include "cudamatrix/cu-device.h"
+
+typedef kaldi::BaseFloat BaseFloat;
+typedef struct Matrix NervMatrix;
+
+namespace kaldi {
+    namespace nnet1 {
+
+        // Adds the negated per-frame log-likelihoods to the second weight
+        // component of every non-epsilon arc of a topologically sorted lattice.
+        //   log_like    : matrix indexed as (frame, pdf-id)
+        //   trans_model : maps the arc's input label (transition-id) to a pdf-id
+        //   state_times : frame index of each lattice state
+        //   lat         : lattice modified in place
+        // Raises KALDI_ERR when the lattice is not topologically sorted.
+        void LatticeAcousticRescore(const Matrix<BaseFloat> &log_like,
+                const TransitionModel &trans_model,
+                const std::vector<int32> &state_times,
+                Lattice *lat) {
+            const kaldi::uint64 lat_props = lat->Properties(fst::kFstProperties, false);
+            if ((lat_props & fst::kTopSorted) == 0)
+                KALDI_ERR << "Input lattice must be topologically sorted.";
+
+            KALDI_ASSERT(!state_times.empty());
+
+            // Bucket the lattice states by the frame they start at.
+            std::vector<std::vector<int32> > time_to_state(log_like.NumRows());
+            for (size_t i = 0; i < state_times.size(); i++) {
+                KALDI_ASSERT(state_times[i] >= 0);
+                if (state_times[i] >= log_like.NumRows()) {
+                    // Only the end state may sit exactly one past the last frame.
+                    KALDI_ASSERT(state_times[i] == log_like.NumRows()
+                            && "There appears to be lattice/feature mismatch.");
+                    continue;
+                }
+                time_to_state[state_times[i]].push_back(i);
+            }
+
+            for (int32 t = 0; t < log_like.NumRows(); t++) {
+                const std::vector<int32> &states_here = time_to_state[t];
+                for (size_t k = 0; k < states_here.size(); k++) {
+                    fst::MutableArcIterator<Lattice> aiter(lat, states_here[k]);
+                    while (!aiter.Done()) {
+                        LatticeArc arc = aiter.Value();
+                        if (arc.ilabel != 0) {  // skip epsilon input labels
+                            const int32 pdf_id = trans_model.TransitionIdToPdf(arc.ilabel);
+                            arc.weight.SetValue2(-log_like(t, pdf_id) + arc.weight.Value2());
+                            aiter.SetValue(arc);
+                        }
+                        aiter.Next();
+                    }
+                }
+            }
+        }
+
+    }  // namespace nnet1
+}  // namespace kaldi
+
+
+extern "C" {
+#include "kaldi_mpe.h"
+#include "string.h"
+#include "assert.h"
+#include "nerv/common.h"
+
+    extern NervMatrix *nerv_matrix_host_float_create(long nrow, long ncol, Status *status);
+    extern void nerv_matrix_host_float_copy_fromd(NervMatrix *mat, const NervMatrix *cumat, int, int, int, Status *);
+    using namespace kaldi;
+    using namespace kaldi::nnet1;
+    typedef kaldi::int32 int32;
+
+    /* State shared by the MPE/sMBR entry points below: long-lived resources
+     * created in new_KaldiMPE, per-utterance scratch data filled by check_mpe
+     * and consumed by calc_diff_mpe, and running statistics. */
+    struct KaldiMPE {
+        // Long-lived resources allocated in new_KaldiMPE.
+        TransitionModel *trans_model;                  // read from the transition-model file
+        RandomAccessLatticeReader *den_lat_reader;     // opened on the lattice rspecifier
+        RandomAccessInt32VectorReader *ref_ali_reader; // opened on the alignment rspecifier
+
+        // Per-utterance data: loaded and preprocessed by check_mpe.
+        Lattice den_lat;
+        vector<int32> state_times;
+
+        PdfPriorOptions *prior_opts;
+        PdfPrior *log_prior;
+
+        std::vector<int32> silence_phones; // parsed from --silence-phones, sorted and de-duplicated
+        std::vector<int32> ref_ali;        // reference alignment of the current utterance
+
+        // Wall-clock timer used for the periodic throughput log in calc_diff_mpe.
+        Timer *time;
+        double time_now;
+
+        // Counters: processed utterances and the reasons utterances were rejected.
+        int32 num_done, num_no_ref_ali, num_no_den_lat, num_other_error;
+
+        kaldi::int64 total_frames;  // sum of num_frames over processed utterances
+        int32 num_frames;           // row count of the matrix last passed to calc_diff_mpe
+        double total_frame_acc, utt_frame_acc; // accumulated / last-utterance frame accuracy
+
+        // Values bound to command-line style options in new_KaldiMPE.
+        bool binary;
+        bool one_silence_class;
+        BaseFloat acoustic_scale, lm_scale, old_acoustic_scale;
+        kaldi::int32 max_frames;    // utterances with more rows than this are rejected
+        bool do_smbr;               // true: "smbr" criterion, false: "mpfe"
+        std::string use_gpu;
+    };
+
+    /* Creates and configures the state used for MPE/sMBR sequence training.
+     *
+     *   arg : space-separated option string (e.g. "--do-smbr=true ..."); it is
+     *         tokenised on ' ' and parsed with Kaldi's ParseOptions. NULL is
+     *         treated as an empty option string.
+     *   mdl : transition-model file name
+     *   lat : denominator-lattice rspecifier
+     *   ali : reference-alignment rspecifier
+     *
+     * Returns a heap-allocated KaldiMPE that must be released with
+     * destroy_KaldiMPE. Prints the usage and calls exit(1) when the parsed
+     * command line does not contain exactly six positional arguments.
+     *
+     * The synthetic command line is built from std::string objects, so neither
+     * the length of the inputs nor the number of option tokens is bounded by a
+     * fixed-size stack buffer.
+     */
+    KaldiMPE * new_KaldiMPE(const char* arg, const char* mdl, const char* lat, const char* ali)
+    {
+        KaldiMPE * mpe = new KaldiMPE;
+
+        const char *usage =
+            "Perform iteration of Neural Network MPE/sMBR training by stochastic "
+            "gradient descent.\n"
+            "The network weights are updated on each utterance.\n"
+            "Usage:  nnet-train-mpe-sequential [options] <model-in> <transition-model-in> "
+            "<feature-rspecifier> <den-lat-rspecifier> <ali-rspecifier> [<model-out>]\n"
+            "e.g.: \n"
+            " nnet-train-mpe-sequential nnet.init trans.mdl scp:train.scp scp:denlats.scp ark:train.ali "
+            "nnet.iter1\n";
+
+        ParseOptions po(usage);
+
+        // Registered only so that the corresponding options are accepted.
+        NnetTrainOptions trn_opts; trn_opts.learn_rate=0.00001;
+        trn_opts.Register(&po);
+
+        mpe->binary = true;
+        po.Register("binary", &(mpe->binary), "Write output in binary mode");
+
+        std::string feature_transform;
+        po.Register("feature-transform", &feature_transform,
+                "Feature transform in Nnet format");
+        std::string silence_phones_str;
+        po.Register("silence-phones", &silence_phones_str, "Colon-separated list "
+                "of integer id's of silence phones, e.g. 46:47");
+
+        mpe->prior_opts = new PdfPriorOptions;
+        PdfPriorOptions &prior_opts = *(mpe->prior_opts);
+        prior_opts.Register(&po);
+
+        mpe->one_silence_class = false;
+        mpe->acoustic_scale = 1.0;
+        mpe->lm_scale = 1.0;
+        mpe->old_acoustic_scale = 0.0;
+        po.Register("acoustic-scale", &(mpe->acoustic_scale),
+                "Scaling factor for acoustic likelihoods");
+        po.Register("lm-scale", &(mpe->lm_scale),
+                "Scaling factor for \"graph costs\" (including LM costs)");
+        po.Register("old-acoustic-scale", &(mpe->old_acoustic_scale),
+                "Add in the scores in the input lattices with this scale, rather "
+                "than discarding them.");
+        po.Register("one-silence-class", &(mpe->one_silence_class), "If true, newer "
+                "behavior which will tend to reduce insertions.");
+        mpe->max_frames = 6000; // Allow segments maximum of one minute by default
+        po.Register("max-frames",&(mpe->max_frames), "Maximum number of frames a segment can have to be processed");
+        mpe->do_smbr = false;
+        po.Register("do-smbr", &(mpe->do_smbr), "Use state-level accuracies instead of "
+                "phone accuracies.");
+
+        mpe->use_gpu=std::string("yes");
+        po.Register("use-gpu", &(mpe->use_gpu), "yes|no|optional, only has effect if compiled with CUDA");
+
+        // Build the synthetic command line: program name, the option tokens,
+        // then the six positional arguments expected by the usage string.
+        std::vector<std::string> args;
+        args.push_back("nnet-train-mpe-sequential");
+
+        // Split on single spaces, skipping empty tokens (same result as the
+        // strtok_r(" ") loop this replaces).
+        const std::string opt_str(arg ? arg : "");
+        std::string::size_type pos = opt_str.find_first_not_of(' ');
+        while (pos != std::string::npos) {
+            const std::string::size_type end = opt_str.find(' ', pos);
+            if (end == std::string::npos) {
+                args.push_back(opt_str.substr(pos));
+                break;
+            }
+            args.push_back(opt_str.substr(pos, end - pos));
+            pos = opt_str.find_first_not_of(' ', end);
+        }
+
+        args.push_back("0.nnet");   // placeholder <model-in>
+        args.push_back(mdl);
+        args.push_back("feat");     // placeholder <feature-rspecifier>
+        args.push_back(lat);
+        args.push_back(ali);
+        args.push_back("1.nnet");   // placeholder <model-out>
+
+        std::vector<const char *> argsv(args.size());
+        for (size_t i = 0; i < args.size(); i++)
+            argsv[i] = args[i].c_str();
+
+        po.Read(static_cast<int>(argsv.size()), &argsv[0]);
+
+        if (po.NumArgs() != 6) {
+            po.PrintUsage();
+            exit(1);
+        }
+
+        std::string transition_model_filename = po.GetArg(2),
+            den_lat_rspecifier = po.GetArg(4),
+            ref_ali_rspecifier = po.GetArg(5);
+
+        std::vector<int32> &silence_phones = mpe->silence_phones;
+        if (!kaldi::SplitStringToIntegers(silence_phones_str, ":", false,
+                    &silence_phones))
+            KALDI_ERR << "Invalid silence-phones string " << silence_phones_str;
+        kaldi::SortAndUniq(&silence_phones);
+        if (silence_phones.empty())
+            KALDI_LOG << "No silence phones specified.";
+
+        // Select the GPU
+#if HAVE_CUDA == 1
+        CuDevice::Instantiate().SelectGpuId(mpe->use_gpu);
+#endif
+
+        // Read the class-frame-counts, compute priors
+        mpe->log_prior = new PdfPrior(prior_opts);
+
+        // Read transition model
+        mpe->trans_model = new TransitionModel;
+        ReadKaldiObject(transition_model_filename, mpe->trans_model);
+
+        mpe->den_lat_reader = new RandomAccessLatticeReader(den_lat_rspecifier);
+        mpe->ref_ali_reader = new RandomAccessInt32VectorReader(ref_ali_rspecifier);
+
+        mpe->time = new Timer;
+        mpe->time_now = 0;
+        mpe->num_done =0;
+        mpe->num_no_ref_ali = 0;
+        mpe->num_no_den_lat = 0;
+        mpe->num_other_error = 0;
+        mpe->total_frames = 0;
+        // Initialised so get_num_frames_mpe is well defined before the first
+        // call to calc_diff_mpe.
+        mpe->num_frames = 0;
+        mpe->total_frame_acc = 0.0;
+        mpe->utt_frame_acc = 0.0;
+
+        return mpe;
+    }
+
+    /* Releases everything owned by a state created with new_KaldiMPE,
+     * including the KaldiMPE object itself. The pointer must not be used
+     * afterwards. Passing NULL is a no-op. */
+    void destroy_KaldiMPE(KaldiMPE *mpe)
+    {
+        if (mpe == NULL)
+            return;
+        delete mpe->trans_model;
+        delete mpe->den_lat_reader;
+        delete mpe->ref_ali_reader;
+        delete mpe->time;
+        delete mpe->prior_opts;
+        delete mpe->log_prior;
+        // The struct was allocated with `new` in new_KaldiMPE; without this
+        // the object and its by-value members (lattice, vectors, string) leak.
+        delete mpe;
+    }
+
+    /* Validates utterance `key` against matrix `mat` and loads the
+     * per-utterance data (reference alignment, denominator lattice, state
+     * times) into `mpe` for a following calc_diff_mpe call.
+     *
+     * Returns 1 when the utterance can be processed, 0 when it has to be
+     * skipped; every skip emits a warning and bumps one of the error counters.
+     * Raises KALDI_ERR when the lattice contains cycles. */
+    int check_mpe(KaldiMPE *mpe, const NervMatrix* mat, const char *key)
+    {
+        const std::string utt(key);
+
+        // Both the lattice and the alignment must exist for this key.
+        const bool has_lattice = mpe->den_lat_reader->HasKey(utt);
+        if (!has_lattice) {
+            KALDI_WARN << "Utterance " << utt << ": found no lattice.";
+            mpe->num_no_den_lat++;
+            return 0;
+        }
+        const bool has_alignment = mpe->ref_ali_reader->HasKey(utt);
+        if (!has_alignment) {
+            KALDI_WARN << "Utterance " << utt << ": found no reference alignment.";
+            mpe->num_no_ref_ali++;
+            return 0;
+        }
+
+        // 1) numerator alignment: one entry per matrix row is required
+        mpe->ref_ali = mpe->ref_ali_reader->Value(utt);
+        const long num_rows = mat->nrow;
+        if (static_cast<MatrixIndexT>(mpe->ref_ali.size()) != num_rows) {
+            KALDI_WARN << "Numerator alignment has wrong length "
+                << mpe->ref_ali.size() << " vs. "<< num_rows;
+            mpe->num_other_error++;
+            return 0;
+        }
+        if (num_rows > mpe->max_frames) {
+            KALDI_WARN << "Utterance " << utt << ": Skipped because it has " << num_rows <<
+                " frames, which is more than " << mpe->max_frames << ".";
+            mpe->num_other_error++;
+            return 0;
+        }
+
+        // 2) denominator lattice: load, rescale, sort
+        mpe->den_lat = mpe->den_lat_reader->Value(utt);
+        if (mpe->den_lat.Start() == -1) {
+            KALDI_WARN << "Empty lattice for utt " << utt;
+            mpe->num_other_error++;
+            return 0;
+        }
+        if (mpe->old_acoustic_scale != 1.0) {
+            fst::ScaleLattice(fst::AcousticLatticeScale(mpe->old_acoustic_scale),
+                    &(mpe->den_lat));
+        }
+        // sort topologically unless the lattice already is
+        const kaldi::uint64 lat_props = mpe->den_lat.Properties(fst::kFstProperties, false);
+        if ((lat_props & fst::kTopSorted) == 0) {
+            if (!fst::TopSort(&(mpe->den_lat)))
+                KALDI_ERR << "Cycles detected in lattice.";
+        }
+
+        // 3) state times; the lattice length must match the matrix
+        mpe->state_times.clear();
+        const int32 lattice_frames = kaldi::LatticeStateTimes(mpe->den_lat, &(mpe->state_times));
+        if (lattice_frames != num_rows) {
+            KALDI_WARN << "Denominator lattice has wrong length "
+                << lattice_frames << " vs. " << num_rows;
+            mpe->num_other_error++;
+            return 0;
+        }
+
+        return 1;
+    }
+
+    /* Computes the MPE/sMBR derivative for utterance `key` and writes it into
+     * `mat` in place.
+     *
+     * `mat` supplies the input values on entry (one row per frame) and holds
+     * the negated posterior matrix on return. The lattice, state times and
+     * reference alignment are taken from `mpe`, where check_mpe stored them.
+     * Also updates the running statistics in `mpe`.
+     *
+     * Returns `mat` itself.
+     */
+    NervMatrix * calc_diff_mpe(KaldiMPE * mpe, NervMatrix * mat, const char * key)
+    {
+        std::string utt(key);
+        //assert(sizeof(BaseFloat) == sizeof(float));
+
+        CuMatrix<BaseFloat> nnet_diff;
+        kaldi::Matrix<BaseFloat> nnet_out_h;
+        nnet_out_h.Resize(mat->nrow, mat->ncol, kUndefined);
+
+        // Copy the input row by row; `stride` is applied as a byte offset
+        // between consecutive rows of `mat`.
+        size_t stride = mat->stride;
+        for (int i = 0; i < mat->nrow; i++)
+        {
+            const BaseFloat *nerv_row = (BaseFloat *)((char *)mat->data.f + i * stride);
+            BaseFloat *row = nnet_out_h.RowData(i);
+            memmove(row, nerv_row, sizeof(BaseFloat) * mat->ncol);
+        }
+
+        mpe->num_frames = nnet_out_h.NumRows();
+
+        // Only when class frame counts were configured: round-trip through a
+        // CuMatrix to subtract the log priors.
+        PdfPriorOptions &prior_opts = *(mpe->prior_opts);
+        if (prior_opts.class_frame_counts != "") {
+            CuMatrix<BaseFloat> nnet_out;
+            nnet_out.Resize(nnet_out_h.NumRows(), nnet_out_h.NumCols(), kUndefined);
+            nnet_out.CopyFromMat(nnet_out_h);
+            mpe->log_prior->SubtractOnLogpost(&nnet_out);
+            nnet_out_h.Resize(nnet_out.NumRows(), nnet_out.NumCols(), kUndefined);
+            nnet_out.CopyToMat(&nnet_out_h);
+            nnet_out.Resize(0,0);
+        }
+
+        // 4) rescore the lattice
+        LatticeAcousticRescore(nnet_out_h, *(mpe->trans_model), mpe->state_times, &(mpe->den_lat));
+        if (mpe->acoustic_scale != 1.0 || mpe->lm_scale != 1.0)
+            fst::ScaleLattice(fst::LatticeScale(mpe->lm_scale, mpe->acoustic_scale), &(mpe->den_lat));
+
+        kaldi::Posterior post;
+        std::vector<int32> &silence_phones = mpe->silence_phones;
+
+        // 5) forward-backward over the lattice; fills `post` and yields the
+        //    utterance frame accuracy
+        if (mpe->do_smbr) {  // use state-level accuracies, i.e. sMBR estimation
+            mpe->utt_frame_acc = LatticeForwardBackwardMpeVariants(
+                    *(mpe->trans_model), silence_phones, mpe->den_lat, mpe->ref_ali, "smbr",
+                    mpe->one_silence_class, &post);
+        } else {  // use phone-level accuracies, i.e. MPFE (minimum phone frame error)
+            mpe->utt_frame_acc = LatticeForwardBackwardMpeVariants(
+                    *(mpe->trans_model), silence_phones, mpe->den_lat, mpe->ref_ali, "mpfe",
+                    mpe->one_silence_class, &post);
+        }
+
+        // 6) convert the Posterior to a matrix,
+        PosteriorToMatrixMapped(post, *(mpe->trans_model), &nnet_diff);
+        nnet_diff.Scale(-1.0); // need to flip the sign of derivative,
+
+        KALDI_VLOG(1) << "Lattice #" << mpe->num_done + 1 << " processed"
+            << " (" << utt << "): found " << mpe->den_lat.NumStates()
+            << " states and " << fst::NumArcs(mpe->den_lat) << " arcs.";
+
+        KALDI_VLOG(1) << "Utterance " << utt << ": Average frame accuracy = "
+            << (mpe->utt_frame_acc/mpe->num_frames) << " over " << mpe->num_frames
+            << " frames,"
+            << " diff-range(" << nnet_diff.Min() << "," << nnet_diff.Max() << ")";
+
+        // Reuse the host matrix as the staging buffer for the derivative.
+        nnet_out_h.Resize(nnet_diff.NumRows(), nnet_diff.NumCols(), kUndefined);
+        nnet_diff.CopyToMat(&nnet_out_h);
+        nnet_diff.Resize(0,0); // release GPU memory,
+
+        // NOTE(review): this dimension check is a plain assert(); if the code is
+        // built with NDEBUG the copy below runs unchecked -- confirm build flags.
+        assert(mat->nrow == nnet_out_h.NumRows() && mat->ncol == nnet_out_h.NumCols());
+        stride = mat->stride;
+        for (int i = 0; i < mat->nrow; i++)
+        {
+            const BaseFloat *row = nnet_out_h.RowData(i);
+            BaseFloat *nerv_row = (BaseFloat *)((char *)mat->data.f + i * stride);
+            memmove(nerv_row, row, sizeof(BaseFloat) * mat->ncol);
+        }
+        nnet_out_h.Resize(0,0);
+
+        // update the running statistics
+        mpe->total_frame_acc += mpe->utt_frame_acc;
+        mpe->total_frames += mpe->num_frames;
+        mpe->num_done++;
+
+        // Every 100 utterances: log throughput and, with CUDA, check the GPU.
+        if (mpe->num_done % 100 == 0) {
+            mpe->time_now = mpe->time->Elapsed();
+            KALDI_VLOG(1) << "After " << mpe->num_done << " utterances: time elapsed = "
+                << mpe->time_now/60 << " min; processed " << mpe->total_frames/mpe->time_now
+                << " frames per second.";
+#if HAVE_CUDA==1
+            // check the GPU is not overheated
+            CuDevice::Instantiate().CheckGpuHealth();
+#endif
+        }
+        return mat;
+    }
+
+    /* Returns the number of frames (matrix rows) recorded by the most recent
+     * calc_diff_mpe call, converted to double. */
+    double get_num_frames_mpe(const KaldiMPE *mpe)
+    {
+        const double frames = static_cast<double>(mpe->num_frames);
+        return frames;
+    }
+
+    /* Returns the frame accuracy stored for the last utterance handled by
+     * calc_diff_mpe. */
+    double get_utt_frame_acc_mpe(const KaldiMPE *mpe)
+    {
+        const double acc = mpe->utt_frame_acc;
+        return acc;
+    }
+
+}
diff --git a/kaldi_seq/src/kaldi_mpe.h b/kaldi_seq/src/kaldi_mpe.h
new file mode 100644
index 0000000..fd09574
--- /dev/null
+++ b/kaldi_seq/src/kaldi_mpe.h
@@ -0,0 +1,21 @@
+/* C interface to the Kaldi-based MPE/sMBR sequence-training helper.
+ * The KaldiMPE state is opaque to C callers; it is only manipulated
+ * through the functions declared below. */
+#ifndef NERV_kaldi_KALDI_MPE
+#define NERV_kaldi_KALDI_MPE
+#include "nerv/matrix/matrix.h"
+#include "nerv/common.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+    /* Opaque handle; the definition lives in kaldi_mpe.cpp. */
+    typedef struct KaldiMPE KaldiMPE;
+
+    /* Creates a new state. Arguments, in order: space-separated option
+     * string, transition-model file, denominator-lattice rspecifier,
+     * reference-alignment rspecifier. */
+    KaldiMPE * new_KaldiMPE(const char*, const char*, const char*, const char*);
+    /* Releases the resources held by a state created with new_KaldiMPE. */
+    void destroy_KaldiMPE(KaldiMPE *);
+    /* Loads and validates the lattice/alignment for the utterance key against
+     * the matrix; returns 1 if the utterance can be processed, 0 to skip it. */
+    int check_mpe(KaldiMPE *, const Matrix*, const char *);
+    /* Overwrites the matrix with the MPE/sMBR derivative for the utterance
+     * key and returns that same matrix pointer. Call check_mpe first: it
+     * fills the per-utterance data this function reads. */
+    Matrix * calc_diff_mpe(KaldiMPE *, Matrix *, const char *);
+    /* Number of frames of the utterance last handled by calc_diff_mpe. */
+    double get_num_frames_mpe(const KaldiMPE *);
+    /* Frame accuracy of the utterance last handled by calc_diff_mpe. */
+    double get_utt_frame_acc_mpe(const KaldiMPE *);
+
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/kaldi_seq/tools/net_kaldi2nerv.cpp b/kaldi_seq/tools/net_kaldi2nerv.cpp
new file mode 100644
index 0000000..bbac3db
--- /dev/null
+++ b/kaldi_seq/tools/net_kaldi2nerv.cpp
@@ -0,0 +1,85 @@
+#include <iostream>
+#include <cstdio>
+#include <cstring>
+#include <cstdlib>
+#include <cassert>
+using namespace std;
+
+const char fmt[] = "[%013d]\n";
+
+// Prints `msg` to stderr and terminates with a failure status.
+static void die(const char *msg)
+{
+    fprintf(stderr, "%s\n", msg);
+    exit(1);
+}
+
+// Reads one whitespace-delimited token from `fp` and returns a heap copy
+// sized to the token (release with delete []). The read is width-limited, so
+// an over-long token is reported as an error instead of overflowing a buffer.
+static char *read_token(FILE *fp)
+{
+    char tok[64];
+    if(fscanf(fp, "%63s", tok) != 1)
+        die("unexpected end of input");
+    if(strlen(tok) == sizeof(tok) - 1)
+        die("token too long");
+    char *copy = new char[strlen(tok) + 1];
+    strcpy(copy, tok);
+    return copy;
+}
+
+// Converts the <AffineTransform> components of a text-format Kaldi nnet file
+// into NERV parameter chunks.
+//
+// Usage: <prog> kaldi_nnet nerv_output
+//
+// For every "<AffineTransform> B A" header the tool reads B*A weight tokens,
+// writes them transposed as an A x B "nerv.LinearTransParam" chunk, then
+// writes the following B bias tokens as a 1 x B "nerv.BiasParam" chunk. Each
+// chunk starts with a fixed-width "[%013d]" header that is back-patched with
+// the chunk's byte length. Values are copied as text, never re-parsed.
+//
+// Exits with status 1 on a usage error, an fopen failure or malformed input.
+int main(int argc, char *argv[])
+{
+    if(argc < 3){
+        printf("Usage: %s kaldi_nnet nerv_output\n", argv[0]);
+        exit(1);    // non-zero, like transf_kaldi2nerv: a usage error is a failure
+    }
+
+    FILE *fin = fopen(argv[1], "r");
+    FILE *fout = fopen(argv[2], "w");
+
+    if(!fin || !fout){
+        printf("fopen error\n");
+        exit(1);
+    }
+
+    char buf[1024], tag[64];
+    int a, b;
+    long start, size;
+    int affine_ltp = 0, affine_bp = 0;
+
+    while(fgets(buf, sizeof(buf), fin)){
+        // %63s keeps the tag inside tag[64]
+        if(sscanf(buf, "%63s%d%d", tag, &b, &a) != 3 || strcmp(tag, "<AffineTransform>") != 0)
+            continue;
+        if(a <= 0 || b <= 0)
+            die("invalid <AffineTransform> dimensions");
+
+        // skip the line that follows the component header
+        if(!fgets(buf, sizeof(buf), fin))
+            die("unexpected end of input");
+
+        // The input holds b rows of a tokens; store them transposed.
+        char ***arr = new char**[a];
+        for(int i = 0; i < a; i++)
+            arr[i] = new char*[b];
+        for(int j = 0; j < b; j++)
+            for(int i = 0; i < a; i++)
+                arr[i][j] = read_token(fin);
+
+        start = ftell(fout);
+        fprintf(fout, fmt, 0);      // placeholder, patched below
+        fprintf(fout, "{type=\"nerv.LinearTransParam\",id=\"affine%d_ltp\"}\n", affine_ltp++);
+        fprintf(fout, "%d %d\n", a, b);
+        for(int i = 0; i < a; i++){
+            for(int j = 0; j < b; j++){
+                fprintf(fout, "%s ", arr[i][j]);
+                delete [] arr[i][j];
+            }
+            fprintf(fout, "\n");
+            delete [] arr[i];
+        }
+        delete [] arr;
+
+        size = ftell(fout) - start;
+        fseek(fout, start, SEEK_SET);
+        fprintf(fout, fmt, (int)size);
+        fseek(fout, 0, SEEK_END);
+
+        // drop the rest of the last weight line, then one token before the bias
+        if(!fgets(buf, sizeof(buf), fin))
+            die("unexpected end of input");
+        if(fscanf(fin, "%*s") == EOF)
+            die("unexpected end of input");
+
+        start = ftell(fout);
+        fprintf(fout, fmt, 0);
+        fprintf(fout, "{type=\"nerv.BiasParam\",id=\"affine%d_bp\"}\n", affine_bp++);
+        fprintf(fout, "%d %d\n", 1, b);
+        for(int i = 0; i < b; i++){
+            if(fscanf(fin, "%1023s", buf) != 1)
+                die("unexpected end of input");
+            fprintf(fout, "%s ", buf);
+        }
+        fputs("\n", fout);
+        size = ftell(fout) - start;
+        fseek(fout, start, SEEK_SET);
+        fprintf(fout, fmt, (int)size);
+        fseek(fout, 0, SEEK_END);
+    }
+
+    fclose(fin);
+    fclose(fout);
+
+    return 0;
+}
diff --git a/kaldi_seq/tools/transf_kaldi2nerv.cpp b/kaldi_seq/tools/transf_kaldi2nerv.cpp
new file mode 100644
index 0000000..525bcda
--- /dev/null
+++ b/kaldi_seq/tools/transf_kaldi2nerv.cpp
@@ -0,0 +1,106 @@
+#include <iostream>
+#include <cstdio>
+#include <cstring>
+#include <cstdlib>
+#include <cassert>
+using namespace std;
+
+const char fmt[] = "[%013d]\n";
+
+// Prints `msg` to stderr and terminates with a failure status.
+static void die(const char *msg)
+{
+    fprintf(stderr, "%s\n", msg);
+    exit(1);
+}
+
+// Reads one whitespace-delimited token from `fp` and returns a heap copy
+// sized to the token (release with delete []). The read is width-limited, so
+// an over-long token is reported as an error instead of overflowing a buffer.
+static char *read_token(FILE *fp)
+{
+    char tok[64];
+    if(fscanf(fp, "%63s", tok) != 1)
+        die("unexpected end of input");
+    if(strlen(tok) == sizeof(tok) - 1)
+        die("token too long");
+    char *copy = new char[strlen(tok) + 1];
+    strcpy(copy, tok);
+    return copy;
+}
+
+// Frees an array of `n` tokens produced by read_token; NULL is accepted.
+static void free_tokens(char **tokens, int n)
+{
+    if(!tokens)
+        return;
+    for(int i = 0; i < n; i++)
+        delete [] tokens[i];
+    delete [] tokens;
+}
+
+// Writes one 1 x n "nerv.MatrixParam" chunk named `id`. Each element is
+// values[i] when `values` is non-NULL, otherwise the constant text `fill`.
+// The chunk starts with a fixed-width header that is back-patched with the
+// chunk's byte length.
+static void write_row_param(FILE *fout, const char *id, int n, char **values, const char *fill)
+{
+    long start = ftell(fout);
+    fprintf(fout, fmt, 0);      // placeholder, patched below
+    fprintf(fout, "{id = \"%s\", type = \"nerv.MatrixParam\"}\n", id);
+    fprintf(fout, "1 %d\n", n);
+    for(int i = 0; i < n; i++)
+        fprintf(fout, "%s ", values ? values[i] : fill);
+    fputs("\n", fout);
+    long size = ftell(fout) - start;
+    fseek(fout, start, SEEK_SET);
+    fprintf(fout, fmt, (int)size);
+    fseek(fout, 0, SEEK_END);
+}
+
+// Converts a text-format Kaldi feature transform into NERV parameter chunks.
+//
+// Usage: <prog> kaldi_transf nerv_output
+//
+// The values of the <AddShift> and <Rescale> components are collected as
+// text (the last occurrence of each tag wins), then four chunks are written
+// in this order: "bias1" (all 0), "window1" (all 1), "bias2" (the AddShift
+// values) and "window2" (the Rescale values).
+//
+// Exits with status 1 on a usage error, an fopen failure, malformed input,
+// or when either component is missing from the input.
+int main(int argc, char *argv[])
+{
+    if(argc < 3){
+        printf("Usage: %s kaldi_transf nerv_output\n", argv[0]);
+        exit(1);
+    }
+
+    FILE *fin = fopen(argv[1], "r");
+    FILE *fout = fopen(argv[2], "w");
+    if(!fin || !fout){
+        puts("fopen error");
+        exit(1);
+    }
+
+    char buf[1024], tag[64];
+    int a, b;
+    // Initialised so a missing component is detected instead of reading
+    // indeterminate pointers and sizes.
+    int size_window = 0, size_bias = 0;
+    char **window = NULL, **bias = NULL;
+
+    while(fgets(buf, sizeof(buf), fin))
+    {
+        // %63s keeps the tag inside tag[64]
+        if(sscanf(buf, "%63s%d%d", tag, &a, &b) != 3)
+            continue;
+
+        char ***dest;
+        int *dest_size;
+        if(strcmp(tag, "<AddShift>") == 0){
+            dest = &bias;
+            dest_size = &size_bias;
+        } else if(strcmp(tag, "<Rescale>") == 0){
+            dest = &window;
+            dest_size = &size_window;
+        } else
+            continue;
+
+        assert(a == b);
+        if(a <= 0)
+            die("invalid component dimension");
+
+        // skip the three tokens that precede the values
+        if(fscanf(fin, "%*s%*s%*s") == EOF)
+            die("unexpected end of input");
+
+        // a repeated tag replaces (and releases) the earlier values
+        free_tokens(*dest, *dest_size);
+        *dest = new char*[a];
+        *dest_size = a;
+        for(int i = 0; i < a; i++)
+            (*dest)[i] = read_token(fin);
+    }
+
+    if(!bias || !window)
+        die("input must contain both <AddShift> and <Rescale>");
+
+    write_row_param(fout, "bias1", size_bias, NULL, "0");
+    write_row_param(fout, "window1", size_window, NULL, "1");
+    write_row_param(fout, "bias2", size_bias, bias, NULL);
+    write_row_param(fout, "window2", size_window, window, NULL);
+
+    free_tokens(bias, size_bias);
+    free_tokens(window, size_window);
+
+    fclose(fin);
+    fclose(fout);
+
+    return 0;
+}