44 files changed, 1174 insertions(+), 722 deletions(-)
diff --git a/.gitmodules b/.gitmodules index 1432de9..9f556c5 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,12 +1,6 @@ [submodule "luajit-2.0"] path = luajit-2.0 url = http://luajit.org/git/luajit-2.0.git -[submodule "nerv-speech"] - path = nerv-speech - url = https://github.com/Determinant/nerv-speech.git -[submodule "speech"] - path = speech - url = https://github.com/Determinant/nerv-speech.git [submodule "luarocks"] path = luarocks url = https://github.com/keplerproject/luarocks.git @@ -9,6 +9,7 @@ luarocks: install: cd nerv; $(PREFIX)/bin/luarocks make speech: + cd speech/speech_utils; $(PREFIX)/bin/luarocks make cd speech/htk_io; $(PREFIX)/bin/luarocks make clean: cd nerv && make clean @@ -8,10 +8,12 @@ First make sure you have __lua__ and __CUDA__ installed on your computer. __Nerv__ is currently developed via github.You can download and make __Nerv__ by doing the following: ``` cd ~ -git clone https://github.com/Determinant/nerv.git +git clone https://github.com/Nerv-SJTU/nerv.git cd nerv +git clone https://github.com/Nerv-SJTU/nerv-speech.git speech git submodule init && git submodule update make +make speech ``` The `git submodule` command is for the __luajit__ repository inside __Nerv__. Now, you can try to run some example scripts. diff --git a/embedding_example/.gitignore b/embedding_example/.gitignore new file mode 100644 index 0000000..8e68213 --- /dev/null +++ b/embedding_example/.gitignore @@ -0,0 +1,2 @@ +main +main.o diff --git a/embedding_example/Makefile b/embedding_example/Makefile new file mode 100644 index 0000000..73287f4 --- /dev/null +++ b/embedding_example/Makefile @@ -0,0 +1,20 @@ +CFLAG += -I ../install/include/luajit-2.0/ -I ../install/include/nerv/ +LDFLAG += -L../install/lib/ -lluajit-5.1 -Wl,-rpath=../install/lib/ -lluaT -lnervcore +GCC := gcc + +.PHONY: FORCE + +all: main FORCE +clean: + -rm -f *.o + -rm main + +FORCE: ../install/bin/luarocks + echo "#!/bin/bash" > run.sh + $< path >> run.sh + echo "./main" >> run.sh +main: main.o + $(GCC) -o $@ $< $(LDFLAG) + +main.o: main.c + $(GCC) $(CFLAG) -o $@ $< -c diff --git a/embedding_example/main.c b/embedding_example/main.c new file mode 100644 index 0000000..8856d58 --- /dev/null +++ b/embedding_example/main.c @@ -0,0 +1,93 @@ +#include "lua.h" +#include "lauxlib.h" +#include "lualib.h" +#include "matrix/matrix.h" +#include "common.h" +#include "luaT/luaT.h" +#include <stdio.h> + +const char *nerv_matrix_host_float_tname = "nerv.MMatrixFloat"; +const char *input_name = "_nerv_embed_input"; +const char *output_name = "_nerv_embed_output"; +extern Matrix *nerv_matrix_host_float_create(long nrow, long ncol, Status *status); +extern void nerv_matrix_host_float_data_retain(Matrix *self); +extern void nerv_matrix_host_float_data_free(Matrix *self, Status *status); + +lua_State *L; +Matrix *input, *output; +Status status; + +void setup_nerv() { + L = lua_open(); + luaL_openlibs(L); + luaL_loadfile(L, "setup_nerv.lua"); + /* network configuration */ + lua_pushstring(L, "../nerv/examples/swb_baseline.lua"); + if (lua_pcall(L, 1, LUA_MULTRET, 0)) + { + printf("%s\n", luaL_checkstring(L, 1)); + exit(1); + } + /* lua stack now: input width, output width, propagator */ + input = nerv_matrix_host_float_create(1, luaL_checkinteger(L, 1), &status); + NERV_LUA_CHECK_STATUS(L, status); + output = nerv_matrix_host_float_create(1, luaL_checkinteger(L, 2), &status); + NERV_LUA_CHECK_STATUS(L, status); + /* add reference to avoid gc */ + luaT_pushudata(L, output, nerv_matrix_host_float_tname); + luaT_pushudata(L, input, 
nerv_matrix_host_float_tname); + lua_setfield(L, LUA_GLOBALSINDEX, input_name); + lua_setfield(L, LUA_GLOBALSINDEX, output_name); +} + + +void propagate(float for_fun) { + int i, j; + printf("ok: %d\n", lua_gettop(L)); + lua_pushvalue(L, 3); + /* lua stack now: input width, output width, propagator, propagator */ + for (i = 0; i < input->nrow; i++) /* nrow is actually 1 */ + { + float *nerv_row = (float *)((char *)input->data.f + i * input->stride); + for (j = 0; j < input->ncol; j++) + { + nerv_row[j] = j * for_fun; + } + } + lua_getfield(L, LUA_GLOBALSINDEX, input_name); + lua_getfield(L, LUA_GLOBALSINDEX, output_name); + /* lua stack now: input width, output width, propagator, propagator, input, output */ + if (lua_pcall(L, 2, 0, 0)) /* call propagator with two parameters */ + { + printf("%s\n", luaL_checkstring(L, -1)); + exit(-1); + } + /* lua stack now: input width, output width, propagator */ + printf("## output: %ld %ld ##\n", output->nrow, output->ncol); + for (i = 0; i < output->nrow; i++) /* nrow is actually 1 */ + { + float *nerv_row = (float *)((char *)output->data.f + i * output->stride); + for (j = 0; j < output->ncol; j++) + { + printf("%.8f ", nerv_row[j]); + } + } +} + +void teardown_nerv() { + lua_pushnil(L); + lua_pushnil(L); + lua_setfield(L, LUA_GLOBALSINDEX, input_name); + lua_setfield(L, LUA_GLOBALSINDEX, output_name); + lua_gc(L, LUA_GCCOLLECT, 0); +} + +int main() { + setup_nerv(); + propagate(1.0); + propagate(2.0); + propagate(2.0); + propagate(3.0); + teardown_nerv(); + return 0; +} diff --git a/embedding_example/run.sh b/embedding_example/run.sh new file mode 100755 index 0000000..e919263 --- /dev/null +++ b/embedding_example/run.sh @@ -0,0 +1,4 @@ +#!/bin/bash +export LUA_PATH='/home/slhome/mfy43/.luarocks/share/lua/5.1/?.lua;/home/slhome/mfy43/.luarocks/share/lua/5.1/?/init.lua;/home/slhome/mfy43/nerv/install/share/lua/5.1/?.lua;/home/slhome/mfy43/nerv/install/share/lua/5.1/?/init.lua;./?.lua;/usr/local/share/luajit-2.0.4/?.lua;/usr/local/share/lua/5.1/?.lua;/usr/local/share/lua/5.1/?/init.lua' +export LUA_CPATH='/home/slhome/mfy43/.luarocks/lib/lua/5.1/?.so;/home/slhome/mfy43/nerv/install/lib/lua/5.1/?.so;./?.so;/usr/local/lib/lua/5.1/?.so;/usr/local/lib/lua/5.1/loadall.so' +./main diff --git a/embedding_example/setup_nerv.lua b/embedding_example/setup_nerv.lua new file mode 100644 index 0000000..d80c306 --- /dev/null +++ b/embedding_example/setup_nerv.lua @@ -0,0 +1,25 @@ +local k,l,_=pcall(require,"luarocks.loader") _=k and l.add_context("nerv","scm-1") +require 'nerv' +local arg = {...} +dofile(arg[1]) +local param_repo = nerv.ParamRepo() +param_repo:import(gconf.initialized_param, nil, gconf) +local layer_repo = make_layer_repo(param_repo) +local network = get_decode_network(layer_repo) +local global_transf = get_global_transf(layer_repo) +local batch_size = 1 +network:init(batch_size) + +function propagator(input, output) + local transformed = nerv.speech_utils.global_transf( + gconf.cumat_type.new_from_host(input), + global_transf, 0, 0, gconf) -- preprocessing + local gpu_input = transformed + local gpu_output = nerv.CuMatrixFloat(output:nrow(), output:ncol()) + network:propagate({gpu_input}, {gpu_output}) + gpu_output:copy_toh(output) + -- collect garbage in-time to save GPU memory + collectgarbage("collect") +end + +return network.dim_in[1], network.dim_out[1], propagator diff --git a/nerv/Makefile b/nerv/Makefile index 022e2fb..b449f82 100644 --- a/nerv/Makefile +++ b/nerv/Makefile @@ -30,14 +30,14 @@ LUAT_OBJS := $(addprefix 
$(OBJ_DIR)/,$(LUAT_OBJS)) OBJS := $(CORE_OBJS) $(NERV_OBJS) $(LUAT_OBJS) LIBS := $(INST_LIBDIR)/libnerv.so $(LIB_PATH)/libnervcore.so $(LIB_PATH)/libluaT.so LUA_LIBS := matrix/init.lua io/init.lua init.lua \ - layer/init.lua layer/affine.lua layer/sigmoid.lua layer/softmax_ce.lua \ - layer/window.lua layer/bias.lua layer/combiner.lua layer/mse.lua layer/affine_recurrent.lua\ + layer/init.lua layer/affine.lua layer/sigmoid.lua layer/softmax_ce.lua layer/softmax.lua \ + layer/window.lua layer/bias.lua layer/combiner.lua layer/mse.lua layer/affine_recurrent.lua \ nn/init.lua nn/layer_repo.lua nn/param_repo.lua nn/layer_dag.lua \ io/sgd_buffer.lua INCLUDE := -I $(LUA_INCDIR) -DLUA_USE_APICHECK -CUDA_BASE := /usr/local/cuda-6.5 -#CUDA_BASE := /usr/local/cuda-5.0 +#CUDA_BASE := /usr/local/cuda-7.0 +CUDA_BASE := /usr/local/cuda CUDA_INCLUDE := -I $(CUDA_BASE)/include/ INCLUDE += $(CUDA_INCLUDE) @@ -66,7 +66,7 @@ $(LIB_PATH)/libluaT.so: $(LUAT_OBJS) $(INST_LIBDIR)/libnerv.so: $(NERV_OBJS) $(LIB_PATH)/libnervcore.so $(LIB_PATH)/libluaT.so gcc -shared -o $@ $(NERV_OBJS) $(LDFLAGS) -Wl,-rpath=$(LIB_PATH) -L$(LIB_PATH) -lnervcore -lluaT -$(OBJ_DIR)/matrix/cumatrix.o: matrix/generic/cumatrix.c matrix/generic/matrix.c matrix/generic/cukernel.cu +$(OBJ_DIR)/matrix/cumatrix.o: matrix/generic/cumatrix.c matrix/generic/matrix.c $(OBJ_DIR)/matrix/mmatrix.o: matrix/generic/mmatrix.c matrix/generic/matrix.c $(OBJ_DIR)/lib/matrix/cumatrix.o: lib/matrix/generic/cumatrix.c lib/matrix/generic/matrix.c lib/matrix/generic/cukernel.cu diff --git a/nerv/examples/asr_trainer.lua b/nerv/examples/asr_trainer.lua index 4fa4096..69cfeed 100644 --- a/nerv/examples/asr_trainer.lua +++ b/nerv/examples/asr_trainer.lua @@ -1,9 +1,9 @@ function build_trainer(ifname) local param_repo = nerv.ParamRepo() param_repo:import(ifname, nil, gconf) - local sublayer_repo = make_sublayer_repo(param_repo) - local layer_repo = make_layer_repo(sublayer_repo, param_repo) + local layer_repo = make_layer_repo(param_repo) local network = get_network(layer_repo) + local global_transf = get_global_transf(layer_repo) local input_order = get_input_order() local iterative_trainer = function (prefix, scp_file, bp) gconf.randomize = bp @@ -12,28 +12,41 @@ function build_trainer(ifname) -- initialize the network network:init(gconf.batch_size) gconf.cnt = 0 - err_input = {nerv.CuMatrixFloat(256, 1)} + err_input = {nerv.CuMatrixFloat(gconf.batch_size, 1)} err_input[1]:fill(1) for data in buffer.get_data, buffer do -- prine stat periodically gconf.cnt = gconf.cnt + 1 if gconf.cnt == 1000 then - print_stat(sublayer_repo) + print_stat(layer_repo) nerv.CuMatrix.print_profile() nerv.CuMatrix.clear_profile() gconf.cnt = 0 -- break end local input = {} --- if gconf.cnt == 100 then break end - for i, id in ipairs(input_order) do +-- if gconf.cnt == 1000 then break end + for i, e in ipairs(input_order) do + local id = e.id if data[id] == nil then nerv.error("input data %s not found", id) end - table.insert(input, data[id]) + local transformed + if e.global_transf then + transformed = nerv.speech_utils.global_transf(data[id], + global_transf, + gconf.frm_ext or 0, 0, + gconf) + else + transformed = data[id] + end + table.insert(input, transformed) + end + local output = {nerv.CuMatrixFloat(gconf.batch_size, 1)} + err_output = {} + for i = 1, #input do + table.insert(err_output, input[i]:create()) end - local output = {nerv.CuMatrixFloat(256, 1)} - err_output = {input[1]:create()} network:propagate(input, output) if bp then network:back_propagate(err_input, 
err_output, input, output) @@ -42,16 +55,16 @@ function build_trainer(ifname) -- collect garbage in-time to save GPU memory collectgarbage("collect") end - print_stat(sublayer_repo) + print_stat(layer_repo) nerv.CuMatrix.print_profile() nerv.CuMatrix.clear_profile() if (not bp) and prefix ~= nil then nerv.info("writing back...") local fname = string.format("%s_cv%.3f.nerv", - prefix, get_accuracy(sublayer_repo)) + prefix, get_accuracy(layer_repo)) network:get_params():export(fname, nil) end - return get_accuracy(sublayer_repo) + return get_accuracy(layer_repo) end return iterative_trainer end diff --git a/nerv/examples/mmi_chime3.lua b/nerv/examples/mmi_chime3.lua new file mode 100644 index 0000000..6ac7f28 --- /dev/null +++ b/nerv/examples/mmi_chime3.lua @@ -0,0 +1,183 @@ +require 'kaldi_io' +require 'kaldi_seq' +gconf = {lrate = 0.00001, wcost = 0, momentum = 0.0, + cumat_type = nerv.CuMatrixFloat, + mmat_type = nerv.MMatrixFloat, + frm_ext = 5, + tr_scp = "ark,o:/slfs6/users/ymz09/kaldi/src/featbin/copy-feats scp:/slfs5/users/ymz09/chime/baseline/ASR/exp/tri4a_dnn_tr05_multi_enhanced_mmi/train.scp ark:- |", + initialized_param = {"/slfs6/users/ymz09/nerv-project/nerv/nerv-speech/kaldi_seq/test/chime3_init_mmi.nerv", + "/slfs6/users/ymz09/nerv-project/nerv/nerv-speech/kaldi_seq/test/chime3_global_transf_mmi.nerv"}, + debug = false} + +function make_layer_repo(param_repo) + local layer_repo = nerv.LayerRepo( + { + -- global transf + ["nerv.BiasLayer"] = + { + blayer1 = {{bias = "bias1"}, {dim_in = {440}, dim_out = {440}}}, + blayer2 = {{bias = "bias2"}, {dim_in = {440}, dim_out = {440}}} + }, + ["nerv.WindowLayer"] = + { + wlayer1 = {{window = "window1"}, {dim_in = {440}, dim_out = {440}}}, + wlayer2 = {{window = "window2"}, {dim_in = {440}, dim_out = {440}}} + }, + -- biased linearity + ["nerv.AffineLayer"] = + { + affine0 = {{ltp = "affine0_ltp", bp = "affine0_bp"}, + {dim_in = {440}, dim_out = {2048}}}, + affine1 = {{ltp = "affine1_ltp", bp = "affine1_bp"}, + {dim_in = {2048}, dim_out = {2048}}}, + affine2 = {{ltp = "affine2_ltp", bp = "affine2_bp"}, + {dim_in = {2048}, dim_out = {2048}}}, + affine3 = {{ltp = "affine3_ltp", bp = "affine3_bp"}, + {dim_in = {2048}, dim_out = {2048}}}, + affine4 = {{ltp = "affine4_ltp", bp = "affine4_bp"}, + {dim_in = {2048}, dim_out = {2048}}}, + affine5 = {{ltp = "affine5_ltp", bp = "affine5_bp"}, + {dim_in = {2048}, dim_out = {2048}}}, + affine6 = {{ltp = "affine6_ltp", bp = "affine6_bp"}, + {dim_in = {2048}, dim_out = {2048}}}, + affine7 = {{ltp = "affine7_ltp", bp = "affine7_bp"}, + {dim_in = {2048}, dim_out = {2011}}} + }, + ["nerv.SigmoidLayer"] = + { + sigmoid0 = {{}, {dim_in = {2048}, dim_out = {2048}}}, + sigmoid1 = {{}, {dim_in = {2048}, dim_out = {2048}}}, + sigmoid2 = {{}, {dim_in = {2048}, dim_out = {2048}}}, + sigmoid3 = {{}, {dim_in = {2048}, dim_out = {2048}}}, + sigmoid4 = {{}, {dim_in = {2048}, dim_out = {2048}}}, + sigmoid5 = {{}, {dim_in = {2048}, dim_out = {2048}}}, + sigmoid6 = {{}, {dim_in = {2048}, dim_out = {2048}}} + }, + ["nerv.MMILayer"] = + { + mmi_crit = {{}, {dim_in = {2011, -1}, dim_out = {1}, + cmd = { + arg = "--class-frame-counts=/slfs5/users/ymz09/chime/baseline/ASR/exp/tri4a_dnn_tr05_multi_enhanced/ali_train_pdf.counts --acoustic-scale=0.1 --lm-scale=1.0 --learn-rate=0.00001 --drop-frames=true --verbose=1", + mdl = "/slfs5/users/ymz09/chime/baseline/ASR/exp/tri4a_dnn_tr05_multi_enhanced_ali/final.mdl", + lat = "scp:/slfs5/users/ymz09/chime/baseline/ASR/exp/tri4a_dnn_tr05_multi_enhanced_denlats/lat.scp", + ali = 
"ark:gunzip -c /slfs5/users/ymz09/chime/baseline/ASR/exp/tri4a_dnn_tr05_multi_enhanced_ali/ali.*.gz |" + } + } + } + }, + ["nerv.SoftmaxLayer"] = -- softmax for decode output + { + softmax = {{}, {dim_in = {2011}, dim_out = {2011}}} + } + }, param_repo, gconf) + + layer_repo:add_layers( + { + ["nerv.DAGLayer"] = + { + global_transf = {{}, { + dim_in = {440}, dim_out = {440}, + sub_layers = layer_repo, + connections = { + ["<input>[1]"] = "blayer1[1]", + ["blayer1[1]"] = "wlayer1[1]", + ["wlayer1[1]"] = "blayer2[1]", + ["blayer2[1]"] = "wlayer2[1]", + ["wlayer2[1]"] = "<output>[1]" + } + }}, + main = {{}, { + dim_in = {440}, dim_out = {2011}, + sub_layers = layer_repo, + connections = { + ["<input>[1]"] = "affine0[1]", + ["affine0[1]"] = "sigmoid0[1]", + ["sigmoid0[1]"] = "affine1[1]", + ["affine1[1]"] = "sigmoid1[1]", + ["sigmoid1[1]"] = "affine2[1]", + ["affine2[1]"] = "sigmoid2[1]", + ["sigmoid2[1]"] = "affine3[1]", + ["affine3[1]"] = "sigmoid3[1]", + ["sigmoid3[1]"] = "affine4[1]", + ["affine4[1]"] = "sigmoid4[1]", + ["sigmoid4[1]"] = "affine5[1]", + ["affine5[1]"] = "sigmoid5[1]", + ["sigmoid5[1]"] = "affine6[1]", + ["affine6[1]"] = "sigmoid6[1]", + ["sigmoid6[1]"] = "affine7[1]", + ["affine7[1]"] = "<output>[1]" + } + }} + } + }, param_repo, gconf) + + layer_repo:add_layers( + { + ["nerv.DAGLayer"] = + { + mmi_output = {{}, { + dim_in = {440, -1}, dim_out = {1}, + sub_layers = layer_repo, + connections = { + ["<input>[1]"] = "main[1]", + ["main[1]"] = "mmi_crit[1]", + ["<input>[2]"] = "mmi_crit[2]", + ["mmi_crit[1]"] = "<output>[1]" + } + }}, + softmax_output = {{}, { + dim_in = {440}, dim_out = {2011}, + sub_layers = layer_repo, + connections = { + ["<input>[1]"] = "main[1]", + ["main[1]"] = "softmax[1]", + ["softmax[1]"] = "<output>[1]" + } + }} + } + }, param_repo, gconf) + + return layer_repo +end + +function get_network(layer_repo) + return layer_repo:get_layer("mmi_output") +end + +function get_decode_network(layer_repo) + return layer_repo:get_layer("softmax_output") +end + +function get_global_transf(layer_repo) + return layer_repo:get_layer("global_transf") +end + +function make_readers(feature_rspecifier, layer_repo) + return { + {reader = nerv.KaldiReader(gconf, + { + id = "main_scp", + feature_rspecifier = feature_rspecifier, + frm_ext = gconf.frm_ext, + global_transf = layer_repo:get_layer("global_transf"), + mlfs = {} + }) + } + } +end + +function get_input_order() + return {{id = "main_scp", global_transf = true}, + {id = "key"}} +end + +function get_accuracy(layer_repo) + return 0 +end + +function print_stat(layer_repo) + local mmi_crit = layer_repo:get_layer("mmi_crit") + nerv.info("*** training stat begin ***") + nerv.printf("frames:\t\t\t%d\n", mmi_crit.total_frames) + nerv.info("*** training stat end ***") +end diff --git a/nerv/examples/mpe_chime3.lua b/nerv/examples/mpe_chime3.lua new file mode 100644 index 0000000..ec095b0 --- /dev/null +++ b/nerv/examples/mpe_chime3.lua @@ -0,0 +1,186 @@ +require 'kaldi_io' +require 'kaldi_seq' +gconf = {lrate = 0.00001, wcost = 0, momentum = 0.0, + cumat_type = nerv.CuMatrixFloat, + mmat_type = nerv.MMatrixFloat, + frm_ext = 5, + tr_scp = "ark,s,cs:/slfs6/users/ymz09/kaldi/src/featbin/copy-feats scp:/slfs5/users/ymz09/chime/baseline/ASR/exp/tri4a_dnn_tr05_multi_enhanced_smbr/train.scp ark:- |", + initialized_param = {"/slfs6/users/ymz09/nerv-project/nerv/nerv-speech/kaldi_seq/test/chime3_init.nerv", + "/slfs6/users/ymz09/nerv-project/nerv/nerv-speech/kaldi_seq/test/chime3_global_transf.nerv"}, + debug = false} + +function 
make_layer_repo(param_repo) + local layer_repo = nerv.LayerRepo( + { + -- global transf + ["nerv.BiasLayer"] = + { + blayer1 = {{bias = "bias1"}, {dim_in = {440}, dim_out = {440}}}, + blayer2 = {{bias = "bias2"}, {dim_in = {440}, dim_out = {440}}} + }, + ["nerv.WindowLayer"] = + { + wlayer1 = {{window = "window1"}, {dim_in = {440}, dim_out = {440}}}, + wlayer2 = {{window = "window2"}, {dim_in = {440}, dim_out = {440}}} + }, + -- biased linearity + ["nerv.AffineLayer"] = + { + affine0 = {{ltp = "affine0_ltp", bp = "affine0_bp"}, + {dim_in = {440}, dim_out = {2048}}}, + affine1 = {{ltp = "affine1_ltp", bp = "affine1_bp"}, + {dim_in = {2048}, dim_out = {2048}}}, + affine2 = {{ltp = "affine2_ltp", bp = "affine2_bp"}, + {dim_in = {2048}, dim_out = {2048}}}, + affine3 = {{ltp = "affine3_ltp", bp = "affine3_bp"}, + {dim_in = {2048}, dim_out = {2048}}}, + affine4 = {{ltp = "affine4_ltp", bp = "affine4_bp"}, + {dim_in = {2048}, dim_out = {2048}}}, + affine5 = {{ltp = "affine5_ltp", bp = "affine5_bp"}, + {dim_in = {2048}, dim_out = {2048}}}, + affine6 = {{ltp = "affine6_ltp", bp = "affine6_bp"}, + {dim_in = {2048}, dim_out = {2048}}}, + affine7 = {{ltp = "affine7_ltp", bp = "affine7_bp"}, + {dim_in = {2048}, dim_out = {2011}}} + }, + ["nerv.SigmoidLayer"] = + { + sigmoid0 = {{}, {dim_in = {2048}, dim_out = {2048}}}, + sigmoid1 = {{}, {dim_in = {2048}, dim_out = {2048}}}, + sigmoid2 = {{}, {dim_in = {2048}, dim_out = {2048}}}, + sigmoid3 = {{}, {dim_in = {2048}, dim_out = {2048}}}, + sigmoid4 = {{}, {dim_in = {2048}, dim_out = {2048}}}, + sigmoid5 = {{}, {dim_in = {2048}, dim_out = {2048}}}, + sigmoid6 = {{}, {dim_in = {2048}, dim_out = {2048}}} + }, + ["nerv.MPELayer"] = + { + mpe_crit = {{}, {dim_in = {2011, -1}, dim_out = {1}, + cmd = { + arg = "--class-frame-counts=/slfs5/users/ymz09/chime/baseline/ASR/exp/tri4a_dnn_tr05_multi_enhanced/ali_train_pdf.counts --acoustic-scale=0.1 --lm-scale=1.0 --learn-rate=0.00001 --do-smbr=true --verbose=1", + mdl = "/slfs5/users/ymz09/chime/baseline/ASR/exp/tri4a_dnn_tr05_multi_enhanced_ali/final.mdl", + lat = "scp:/slfs5/users/ymz09/chime/baseline/ASR/exp/tri4a_dnn_tr05_multi_enhanced_denlats/lat.scp", + ali = "ark:gunzip -c /slfs5/users/ymz09/chime/baseline/ASR/exp/tri4a_dnn_tr05_multi_enhanced_ali/ali.*.gz |" + } + } + } + }, + ["nerv.SoftmaxLayer"] = -- softmax for decode output + { + softmax = {{}, {dim_in = {2011}, dim_out = {2011}}} + } + }, param_repo, gconf) + + layer_repo:add_layers( + { + ["nerv.DAGLayer"] = + { + global_transf = {{}, { + dim_in = {440}, dim_out = {440}, + sub_layers = layer_repo, + connections = { + ["<input>[1]"] = "blayer1[1]", + ["blayer1[1]"] = "wlayer1[1]", + ["wlayer1[1]"] = "blayer2[1]", + ["blayer2[1]"] = "wlayer2[1]", + ["wlayer2[1]"] = "<output>[1]" + } + }}, + main = {{}, { + dim_in = {440}, dim_out = {2011}, + sub_layers = layer_repo, + connections = { + ["<input>[1]"] = "affine0[1]", + ["affine0[1]"] = "sigmoid0[1]", + ["sigmoid0[1]"] = "affine1[1]", + ["affine1[1]"] = "sigmoid1[1]", + ["sigmoid1[1]"] = "affine2[1]", + ["affine2[1]"] = "sigmoid2[1]", + ["sigmoid2[1]"] = "affine3[1]", + ["affine3[1]"] = "sigmoid3[1]", + ["sigmoid3[1]"] = "affine4[1]", + ["affine4[1]"] = "sigmoid4[1]", + ["sigmoid4[1]"] = "affine5[1]", + ["affine5[1]"] = "sigmoid5[1]", + ["sigmoid5[1]"] = "affine6[1]", + ["affine6[1]"] = "sigmoid6[1]", + ["sigmoid6[1]"] = "affine7[1]", + ["affine7[1]"] = "<output>[1]" + } + }} + } + }, param_repo, gconf) + + layer_repo:add_layers( + { + ["nerv.DAGLayer"] = + { + mpe_output = {{}, { + dim_in = {440, -1}, 
dim_out = {1}, + sub_layers = layer_repo, + connections = { + ["<input>[1]"] = "main[1]", + ["main[1]"] = "mpe_crit[1]", + ["<input>[2]"] = "mpe_crit[2]", + ["mpe_crit[1]"] = "<output>[1]" + } + }}, + softmax_output = {{}, { + dim_in = {440}, dim_out = {2011}, + sub_layers = layer_repo, + connections = { + ["<input>[1]"] = "main[1]", + ["main[1]"] = "softmax[1]", + ["softmax[1]"] = "<output>[1]" + } + }} + } + }, param_repo, gconf) + + return layer_repo +end + +function get_network(layer_repo) + return layer_repo:get_layer("mpe_output") +end + +function get_decode_network(layer_repo) + return layer_repo:get_layer("softmax_output") +end + +function get_global_transf(layer_repo) + return layer_repo:get_layer("global_transf") +end + +function make_readers(feature_rspecifier, layer_repo) + return { + {reader = nerv.KaldiReader(gconf, + { + id = "main_scp", + feature_rspecifier = feature_rspecifier, + frm_ext = gconf.frm_ext, + global_transf = layer_repo:get_layer("global_transf"), + mlfs = {} + }) + } + } +end + +function get_input_order() + return {{id = "main_scp", global_transf = true}, + {id = "key"}} +end + +function get_accuracy(layer_repo) + local mpe_crit = layer_repo:get_layer("mpe_crit") + return mpe_crit.total_correct / mpe_crit.total_frames * 100 +end + +function print_stat(layer_repo) + local mpe_crit = layer_repo:get_layer("mpe_crit") + nerv.info("*** training stat begin ***") + nerv.printf("correct:\t\t%d\n", mpe_crit.total_correct) + nerv.printf("frames:\t\t\t%d\n", mpe_crit.total_frames) + nerv.printf("accuracy:\t\t%.3f%%\n", get_accuracy(layer_repo)) + nerv.info("*** training stat end ***") +end diff --git a/nerv/examples/seq_trainer.lua b/nerv/examples/seq_trainer.lua new file mode 100644 index 0000000..b8ed3eb --- /dev/null +++ b/nerv/examples/seq_trainer.lua @@ -0,0 +1,87 @@ +function build_trainer(ifname) + local param_repo = nerv.ParamRepo() + param_repo:import(ifname, nil, gconf) + local layer_repo = make_layer_repo(param_repo) + local network = get_network(layer_repo) + local global_transf = get_global_transf(layer_repo) + local input_order = get_input_order() + local iterative_trainer = function (prefix, scp_file, bp) + local readers = make_readers(scp_file, layer_repo) + -- initialize the network + network:init(1) + gconf.cnt = 0 + for ri = 1, #readers, 1 do + while true do + local data = readers[ri].reader:get_data() + if data == nil then + break + end + -- prine stat periodically + gconf.cnt = gconf.cnt + 1 + if gconf.cnt == 1000 then + print_stat(layer_repo) + nerv.CuMatrix.print_profile() + nerv.CuMatrix.clear_profile() + gconf.cnt = 0 + -- break + end + local input = {} + -- if gconf.cnt == 1000 then break end + for i, e in ipairs(input_order) do + local id = e.id + if data[id] == nil then + nerv.error("input data %s not found", id) + end + local transformed + if e.global_transf then + local batch = gconf.cumat_type(data[id]:nrow(), data[id]:ncol()) + batch:copy_fromh(data[id]) + transformed = nerv.speech_utils.global_transf(batch, + global_transf, + gconf.frm_ext or 0, 0, + gconf) + else + transformed = data[id] + end + table.insert(input, transformed) + end + err_output = {input[1]:create()} + network:batch_resize(input[1]:nrow()) + if network:propagate(input, {{}}) == true then + network:back_propagate({{}}, err_output, input, {{}}) + gconf.batch_size = 1.0 - gconf.momentum -- important!!! 
+ network:update({{}}, input, {{}}) + end + -- collect garbage in-time to save GPU memory + collectgarbage("collect") + end + end + print_stat(layer_repo) + nerv.CuMatrix.print_profile() + nerv.CuMatrix.clear_profile() + if prefix ~= nil then + nerv.info("writing back...") + local fname = string.format("%s_tr%.3f.nerv", + prefix, get_accuracy(layer_repo)) + network:get_params():export(fname, nil) + end + return get_accuracy(layer_repo) + end + return iterative_trainer +end + +dofile(arg[1]) + +local pf0 = gconf.initialized_param +local trainer = build_trainer(pf0) + +local i = 1 +nerv.info("[NN] begin iteration %d with lrate = %.6f", i, gconf.lrate) +local accu_tr = trainer(string.format("%s_%s_iter_%d_lr%f", +string.gsub( +(string.gsub(pf0[1], "(.*/)(.*)", "%2")), +"(.*)%..*", "%1"), +os.date("%Y%m%d%H%M%S"), +i, gconf.lrate), gconf.tr_scp, true) +nerv.info("[TR] training set %d: %.3f", i, accu_tr) + diff --git a/nerv/examples/swb_baseline.lua b/nerv/examples/swb_baseline.lua index 7783f2a..8f72200 100644 --- a/nerv/examples/swb_baseline.lua +++ b/nerv/examples/swb_baseline.lua @@ -2,7 +2,9 @@ require 'htk_io' gconf = {lrate = 0.8, wcost = 1e-6, momentum = 0.9, cumat_type = nerv.CuMatrixFloat, mmat_type = nerv.MMatrixFloat, + direct_update = true, frm_ext = 5, + frm_trim = 5, tr_scp = "/slfs1/users/mfy43/swb_ivec/train_bp.scp", cv_scp = "/slfs1/users/mfy43/swb_ivec/train_cv.scp", htk_conf = "/slfs1/users/mfy43/swb_ivec/plp_0_d_a.conf", @@ -10,8 +12,8 @@ gconf = {lrate = 0.8, wcost = 1e-6, momentum = 0.9, "/slfs1/users/mfy43/swb_global_transf.nerv"}, debug = false} -function make_sublayer_repo(param_repo) - return nerv.LayerRepo( +function make_layer_repo(param_repo) + local layer_repo = nerv.LayerRepo( { -- global transf ["nerv.BiasLayer"] = @@ -54,21 +56,23 @@ function make_sublayer_repo(param_repo) sigmoid5 = {{}, {dim_in = {2048}, dim_out = {2048}}}, sigmoid6 = {{}, {dim_in = {2048}, dim_out = {2048}}} }, - ["nerv.SoftmaxCELayer"] = + ["nerv.SoftmaxCELayer"] = -- softmax + ce criterion layer for finetune output { ce_crit = {{}, {dim_in = {3001, 1}, dim_out = {1}, compressed = true}} + }, + ["nerv.SoftmaxLayer"] = -- softmax for decode output + { + softmax = {{}, {dim_in = {3001}, dim_out = {3001}}} } }, param_repo, gconf) -end -function make_layer_repo(sublayer_repo, param_repo) - return nerv.LayerRepo( + layer_repo:add_layers( { ["nerv.DAGLayer"] = { global_transf = {{}, { dim_in = {429}, dim_out = {429}, - sub_layers = sublayer_repo, + sub_layers = layer_repo, connections = { ["<input>[1]"] = "blayer1[1]", ["blayer1[1]"] = "wlayer1[1]", @@ -78,8 +82,8 @@ function make_layer_repo(sublayer_repo, param_repo) } }}, main = {{}, { - dim_in = {429, 1}, dim_out = {1}, - sub_layers = sublayer_repo, + dim_in = {429}, dim_out = {3001}, + sub_layers = layer_repo, connections = { ["<input>[1]"] = "affine0[1]", ["affine0[1]"] = "sigmoid0[1]", @@ -96,17 +100,51 @@ function make_layer_repo(sublayer_repo, param_repo) ["sigmoid5[1]"] = "affine6[1]", ["affine6[1]"] = "sigmoid6[1]", ["sigmoid6[1]"] = "affine7[1]", - ["affine7[1]"] = "ce_crit[1]", + ["affine7[1]"] = "<output>[1]" + } + }} + } + }, param_repo, gconf) + + layer_repo:add_layers( + { + ["nerv.DAGLayer"] = + { + ce_output = {{}, { + dim_in = {429, 1}, dim_out = {1}, + sub_layers = layer_repo, + connections = { + ["<input>[1]"] = "main[1]", + ["main[1]"] = "ce_crit[1]", ["<input>[2]"] = "ce_crit[2]", ["ce_crit[1]"] = "<output>[1]" } + }}, + softmax_output = {{}, { + dim_in = {429}, dim_out = {3001}, + sub_layers = layer_repo, + connections = { 
+ ["<input>[1]"] = "main[1]", + ["main[1]"] = "softmax[1]", + ["softmax[1]"] = "<output>[1]" + } }} } }, param_repo, gconf) + + return layer_repo end function get_network(layer_repo) - return layer_repo:get_layer("main") + return layer_repo:get_layer("ce_output") +end + +function get_decode_network(layer_repo) + return layer_repo:get_layer("softmax_output") +end + +function get_global_transf(layer_repo) + return layer_repo:get_layer("global_transf") end function make_readers(scp_file, layer_repo) @@ -125,8 +163,7 @@ function make_readers(scp_file, layer_repo) dir = "*/", ext = "lab" } - }, - global_transf = layer_repo:get_layer("global_transf") + } }), data = {main_scp = 429, phone_state = 1}} } @@ -137,26 +174,28 @@ function make_buffer(readers) { buffer_size = gconf.buffer_size, randomize = gconf.randomize, - readers = readers + readers = readers, + use_gpu = true }) end function get_input_order() - return {"main_scp", "phone_state"} + return {{id = "main_scp", global_transf = true}, + {id = "phone_state"}} end -function get_accuracy(sublayer_repo) - local ce_crit = sublayer_repo:get_layer("ce_crit") +function get_accuracy(layer_repo) + local ce_crit = layer_repo:get_layer("ce_crit") return ce_crit.total_correct / ce_crit.total_frames * 100 end -function print_stat(sublayer_repo) - local ce_crit = sublayer_repo:get_layer("ce_crit") +function print_stat(layer_repo) + local ce_crit = layer_repo:get_layer("ce_crit") nerv.info("*** training stat begin ***") nerv.printf("cross entropy:\t\t%.8f\n", ce_crit.total_ce) nerv.printf("correct:\t\t%d\n", ce_crit.total_correct) nerv.printf("frames:\t\t\t%d\n", ce_crit.total_frames) nerv.printf("err/frm:\t\t%.8f\n", ce_crit.total_ce / ce_crit.total_frames) - nerv.printf("accuracy:\t\t%.3f%%\n", get_accuracy(sublayer_repo)) + nerv.printf("accuracy:\t\t%.3f%%\n", get_accuracy(layer_repo)) nerv.info("*** training stat end ***") end diff --git a/nerv/examples/swb_baseline_basic.lua b/nerv/examples/swb_baseline_basic.lua new file mode 100644 index 0000000..71f04a3 --- /dev/null +++ b/nerv/examples/swb_baseline_basic.lua @@ -0,0 +1,162 @@ +require 'htk_io' +gconf = {lrate = 0.8, wcost = 1e-6, momentum = 0.9, + cumat_type = nerv.CuMatrixFloat, + mmat_type = nerv.MMatrixFloat, + frm_ext = 5, + frm_trim = 5, + tr_scp = "/slfs1/users/mfy43/swb_ivec/train_bp.scp", + cv_scp = "/slfs1/users/mfy43/swb_ivec/train_cv.scp", + htk_conf = "/slfs1/users/mfy43/swb_ivec/plp_0_d_a.conf", + initialized_param = {"/slfs1/users/mfy43/swb_init.nerv", + "/slfs1/users/mfy43/swb_global_transf.nerv"}, + debug = false} + +function make_layer_repo(param_repo) + local layer_repo = nerv.LayerRepo( + { + -- global transf + ["nerv.BiasLayer"] = + { + blayer1 = {{bias = "bias1"}, {dim_in = {429}, dim_out = {429}}}, + blayer2 = {{bias = "bias2"}, {dim_in = {429}, dim_out = {429}}} + }, + ["nerv.WindowLayer"] = + { + wlayer1 = {{window = "window1"}, {dim_in = {429}, dim_out = {429}}}, + wlayer2 = {{window = "window2"}, {dim_in = {429}, dim_out = {429}}} + }, + -- biased linearity + ["nerv.AffineLayer"] = + { + affine0 = {{ltp = "affine0_ltp", bp = "affine0_bp"}, + {dim_in = {429}, dim_out = {2048}}}, + affine1 = {{ltp = "affine1_ltp", bp = "affine1_bp"}, + {dim_in = {2048}, dim_out = {2048}}}, + affine2 = {{ltp = "affine2_ltp", bp = "affine2_bp"}, + {dim_in = {2048}, dim_out = {2048}}}, + affine3 = {{ltp = "affine3_ltp", bp = "affine3_bp"}, + {dim_in = {2048}, dim_out = {2048}}}, + affine4 = {{ltp = "affine4_ltp", bp = "affine4_bp"}, + {dim_in = {2048}, dim_out = {2048}}}, + affine5 = {{ltp = 
"affine5_ltp", bp = "affine5_bp"}, + {dim_in = {2048}, dim_out = {2048}}}, + affine6 = {{ltp = "affine6_ltp", bp = "affine6_bp"}, + {dim_in = {2048}, dim_out = {2048}}}, + affine7 = {{ltp = "affine7_ltp", bp = "affine7_bp"}, + {dim_in = {2048}, dim_out = {3001}}} + }, + ["nerv.SigmoidLayer"] = + { + sigmoid0 = {{}, {dim_in = {2048}, dim_out = {2048}}}, + sigmoid1 = {{}, {dim_in = {2048}, dim_out = {2048}}}, + sigmoid2 = {{}, {dim_in = {2048}, dim_out = {2048}}}, + sigmoid3 = {{}, {dim_in = {2048}, dim_out = {2048}}}, + sigmoid4 = {{}, {dim_in = {2048}, dim_out = {2048}}}, + sigmoid5 = {{}, {dim_in = {2048}, dim_out = {2048}}}, + sigmoid6 = {{}, {dim_in = {2048}, dim_out = {2048}}} + }, + ["nerv.SoftmaxCELayer"] = + { + ce_crit = {{}, {dim_in = {3001, 1}, dim_out = {1}, compressed = true}} + } + }, param_repo, gconf) + + layer_repo:add_layers( + { + ["nerv.DAGLayer"] = + { + global_transf = {{}, { + dim_in = {429}, dim_out = {429}, + sub_layers = layer_repo, + connections = { + ["<input>[1]"] = "blayer1[1]", + ["blayer1[1]"] = "wlayer1[1]", + ["wlayer1[1]"] = "blayer2[1]", + ["blayer2[1]"] = "wlayer2[1]", + ["wlayer2[1]"] = "<output>[1]" + } + }}, + main = {{}, { + dim_in = {429, 1}, dim_out = {1}, + sub_layers = layer_repo, + connections = { + ["<input>[1]"] = "affine0[1]", + ["affine0[1]"] = "sigmoid0[1]", + ["sigmoid0[1]"] = "affine1[1]", + ["affine1[1]"] = "sigmoid1[1]", + ["sigmoid1[1]"] = "affine2[1]", + ["affine2[1]"] = "sigmoid2[1]", + ["sigmoid2[1]"] = "affine3[1]", + ["affine3[1]"] = "sigmoid3[1]", + ["sigmoid3[1]"] = "affine4[1]", + ["affine4[1]"] = "sigmoid4[1]", + ["sigmoid4[1]"] = "affine5[1]", + ["affine5[1]"] = "sigmoid5[1]", + ["sigmoid5[1]"] = "affine6[1]", + ["affine6[1]"] = "sigmoid6[1]", + ["sigmoid6[1]"] = "affine7[1]", + ["affine7[1]"] = "ce_crit[1]", + ["<input>[2]"] = "ce_crit[2]", + ["ce_crit[1]"] = "<output>[1]" + } + }} + } + }, param_repo, gconf) + return layer_repo +end + +function get_network(layer_repo) + return layer_repo:get_layer("main") +end + +function make_readers(scp_file, layer_repo) + return { + {reader = nerv.TNetReader(gconf, + { + id = "main_scp", + scp_file = scp_file, + conf_file = gconf.htk_conf, + frm_ext = gconf.frm_ext, + mlfs = { + phone_state = { + file = "/slfs1/users/mfy43/swb_ivec/ref.mlf", + format = "map", + format_arg = "/slfs1/users/mfy43/swb_ivec/dict", + dir = "*/", + ext = "lab" + } + } + }), + data = {main_scp = 429, phone_state = 1}} + } +end + +function make_buffer(readers) + return nerv.SGDBuffer(gconf, + { + buffer_size = gconf.buffer_size, + randomize = gconf.randomize, + readers = readers + }) +end + +function get_input_order() + return {{id = "main_scp", global_transf = true}, + {id = "phone_state"}} +end + +function get_accuracy(layer_repo) + local ce_crit = layer_repo:get_layer("ce_crit") + return ce_crit.total_correct / ce_crit.total_frames * 100 +end + +function print_stat(layer_repo) + local ce_crit = layer_repo:get_layer("ce_crit") + nerv.info("*** training stat begin ***") + nerv.printf("cross entropy:\t\t%.8f\n", ce_crit.total_ce) + nerv.printf("correct:\t\t%d\n", ce_crit.total_correct) + nerv.printf("frames:\t\t\t%d\n", ce_crit.total_frames) + nerv.printf("err/frm:\t\t%.8f\n", ce_crit.total_ce / ce_crit.total_frames) + nerv.printf("accuracy:\t\t%.3f%%\n", get_accuracy(layer_repo)) + nerv.info("*** training stat end ***") +end diff --git a/nerv/init.lua b/nerv/init.lua index 89010a7..9c1a5c8 100644 --- a/nerv/init.lua +++ b/nerv/init.lua @@ -1,15 +1,19 @@ require 'libnerv' -function nerv.error(fmt, ...) 
- error(nerv.printf("[nerv] internal error: " .. fmt .. "\n", ...)) -end - function nerv.error_method_not_implemented() nerv.error("method not implemented"); end +function nerv.sprintf(fmt, ...) + return string.format(fmt, ...) +end + function nerv.printf(fmt, ...) - io.write(string.format(fmt, ...)) + io.write(nerv.sprintf(fmt, ...)) +end + +function nerv.error(fmt, ...) + error(nerv.sprintf("[nerv] internal error: " .. fmt .. "\n", ...)) end function nerv.mesg_with_timestamp(fmt, ...) diff --git a/nerv/io/sgd_buffer.lua b/nerv/io/sgd_buffer.lua index f4f7dfe..74c4934 100644 --- a/nerv/io/sgd_buffer.lua +++ b/nerv/io/sgd_buffer.lua @@ -5,8 +5,22 @@ function SGDBuffer:__init(global_conf, buffer_conf) self.buffer_size = math.floor(buffer_conf.buffer_size / global_conf.batch_size) * global_conf.batch_size self.randomize = buffer_conf.randomize - if self.randomize == nil then - self.randomize = false + self.consume = buffer_conf.consume + local cumat_type = global_conf.cumat_type + if buffer_conf.use_gpu then + self.mat_type = cumat_type + self.copy_rows_from_by_idx = cumat_type.copy_rows_fromd_by_idx + self.copy_from = cumat_type.copy_fromd + self.copy_from_reader = cumat_type.copy_fromh + self.perm_gen = function (x) + return cumat_type.new_from_host(nerv.MMatrixFloat.perm_gen(x)) + end + else + self.mat_type = global_conf.mmat_type + self.copy_rows_from_by_idx = cumat_type.copy_rows_fromh_by_idx + self.copy_from = cumat_type.copy_fromh + self.perm_gen = nerv.MMatrixFloat.perm_gen + self.copy_from_reader = self.mat_type.copy_from end self.head = 0 self.tail = 0 @@ -14,7 +28,7 @@ function SGDBuffer:__init(global_conf, buffer_conf) for i, reader_spec in ipairs(buffer_conf.readers) do local buffs = {} for id, width in pairs(reader_spec.data) do - buffs[id] = {data = global_conf.mmat_type(self.buffer_size, width), + buffs[id] = {data = self.mat_type(self.buffer_size, width), leftover = nil, width = width} end @@ -41,7 +55,7 @@ function SGDBuffer:saturate() buff.data:copy_from(buff.leftover, 0, lrow) buff.leftover = nil end - nerv.printf("leftover: %d\n", lrow) + nerv.info("buffer leftover: %d\n", lrow) reader.tail = lrow reader.has_leftover = false end @@ -65,21 +79,21 @@ function SGDBuffer:saturate() if d == nil then nerv.error("reader does not provide data for %s", id) end - buff.leftover = self.gconf.mmat_type(drow - remain, - buff.width) - buff.leftover:copy_from(d, remain, drow) + buff.leftover = self.mat_type(drow - remain, + buff.width) + self.copy_from_reader(buff.leftover, d, remain, drow) end drow = remain reader.has_leftover = true end for id, buff in pairs(reader.buffs) do - buff.data:copy_from(data[id], 0, drow, reader.tail) + self.copy_from_reader(buff.data, data[id], 0, drow, reader.tail) end reader.tail = reader.tail + drow end self.tail = math.min(self.tail, reader.tail) end - self.rand_map = nerv.MMatrixInt.perm_gen(self.tail) -- generate shuffled index + self.rand_map = self.perm_gen(self.tail) -- generate shuffled index collectgarbage("collect") return self.tail >= self.gconf.batch_size end @@ -87,25 +101,31 @@ end function SGDBuffer:get_data() local batch_size = self.gconf.batch_size if self.head >= self.tail then -- buffer is empty - if not self:saturate() then + local t = os.clock() + if (not self:saturate()) and (not self.consume) then return nil -- the remaining data cannot build a batch end + if self.tail == self.head then + return nil -- nothing left + end + nerv.info("%.3fs to fill the buffer", os.clock() - t) end - if self.head + batch_size > self.tail then + if 
self.head + batch_size > self.tail and (not self.consume) then return nil -- the remaining data cannot build a batch end + actual_batch_size = math.min(batch_size, self.tail - self.head) local res = {} for i, reader in ipairs(self.readers) do for id, buff in pairs(reader.buffs) do - local batch = self.gconf.cumat_type(batch_size, buff.width) + local batch = self.gconf.cumat_type(actual_batch_size, buff.width) if self.randomize then - batch:copy_rows_fromh_by_idx(buff.data, self.rand_map, self.head) + self.copy_rows_from_by_idx(batch, buff.data, self.rand_map, self.head) else - batch:copy_fromh(buff.data, self.head, self.head + batch_size) + self.copy_from(batch, buff.data, self.head, self.head + actual_batch_size) end res[id] = batch end end - self.head = self.head + batch_size + self.head = self.head + actual_batch_size return res end diff --git a/nerv/layer/affine.lua b/nerv/layer/affine.lua index 00cbcfb..015ec3f 100644 --- a/nerv/layer/affine.lua +++ b/nerv/layer/affine.lua @@ -42,7 +42,7 @@ function AffineLayer:__init(id, global_conf, layer_conf) self.dim_out = layer_conf.dim_out self.gconf = global_conf self:check_dim_len(1, 1) -- exactly one input and one output - self.direct_update = layer_conf.direct_update + self.direct_update = layer_conf.direct_update or global_conf.direct_update end function AffineLayer:init(batch_size) @@ -60,6 +60,10 @@ function AffineLayer:init(batch_size) self.bp:train_init() end +function AffineLayer:batch_resize(batch_size) + -- do nothing +end + function AffineLayer:update(bp_err, input, output) if self.direct_update then self.ltp.correction:mul(input[1], bp_err[1], 1.0, gconf.momentum, 'T', 'N') diff --git a/nerv/layer/affine_recurrent.lua b/nerv/layer/affine_recurrent.lua index 59d259c..92d98e2 100644 --- a/nerv/layer/affine_recurrent.lua +++ b/nerv/layer/affine_recurrent.lua @@ -37,6 +37,10 @@ function Recurrent:init(batch_size) self.bp:train_init() end +function Recurrent:batch_resize(batch_size) + -- do nothing +end + function Recurrent:update(bp_err, input, output) if (self.direct_update == true) then local ltp_hh = self.ltp_hh.trans diff --git a/nerv/layer/bias.lua b/nerv/layer/bias.lua index c99274d..7e9fd46 100644 --- a/nerv/layer/bias.lua +++ b/nerv/layer/bias.lua @@ -18,6 +18,10 @@ function BiasLayer:init() end end +function BiasLayer:batch_resize(batch_size) + -- do nothing +end + function BiasLayer:propagate(input, output) output[1]:copy_fromd(input[1]) output[1]:add_row(self.bias.trans, 1.0) diff --git a/nerv/layer/combiner.lua b/nerv/layer/combiner.lua index 7bd7617..1bcfdfb 100644 --- a/nerv/layer/combiner.lua +++ b/nerv/layer/combiner.lua @@ -30,6 +30,12 @@ function CombinerLayer:init(batch_size) self.sum = self.gconf.cumat_type(batch_size, dim) end +function CombinerLayer:batch_resize(batch_size) + if self.sum:nrow() ~= batch_size then + self.sum = self.gconf.cumat_type(batch_size, self.dim_in[1]) + end +end + function CombinerLayer:update(bp_err, input, output) end diff --git a/nerv/layer/init.lua b/nerv/layer/init.lua index 7172f99..6861b0e 100644 --- a/nerv/layer/init.lua +++ b/nerv/layer/init.lua @@ -78,3 +78,4 @@ nerv.include('window.lua') nerv.include('mse.lua') nerv.include('combiner.lua') nerv.include('affine_recurrent.lua') +nerv.include('softmax.lua') diff --git a/nerv/layer/mse.lua b/nerv/layer/mse.lua index 9a97add..0ee3080 100644 --- a/nerv/layer/mse.lua +++ b/nerv/layer/mse.lua @@ -20,6 +20,14 @@ function MSELayer:init(batch_size) self.diff = self.mse:create() end +function MSELayer:batch_resize(batch_size) + if 
self.mse:nrow() ~= batch_resize then + self.mse = self.gconf.cumat_type(batch_size, self.dim_in[1]) + self.mse_sum = self.gconf.cumat_type(batch_size, 1) + self.diff = self.mse:create() + end +end + function MSELayer:update(bp_err, input, output) -- no params, therefore do nothing end @@ -34,7 +42,7 @@ function MSELayer:propagate(input, output) if output[1] ~= nil then output[1]:copy_fromd(mse_sum) end - self.total_mse = self.total_mse + mse_sum:colsum()[0] + self.total_mse = self.total_mse + mse_sum:colsum()[0][0] self.total_frames = self.total_frames + mse_sum:nrow() end diff --git a/nerv/layer/sigmoid.lua b/nerv/layer/sigmoid.lua index dfd09eb..0a8bcdc 100644 --- a/nerv/layer/sigmoid.lua +++ b/nerv/layer/sigmoid.lua @@ -14,6 +14,10 @@ function SigmoidLayer:init() end end +function SigmoidLayer:batch_resize(batch_size) + -- do nothing +end + function SigmoidLayer:update(bp_err, input, output) -- no params, therefore do nothing end diff --git a/nerv/layer/softmax.lua b/nerv/layer/softmax.lua new file mode 100644 index 0000000..4205b66 --- /dev/null +++ b/nerv/layer/softmax.lua @@ -0,0 +1,35 @@ +local SoftmaxLayer = nerv.class("nerv.SoftmaxLayer", "nerv.Layer") + +function SoftmaxLayer:__init(id, global_conf, layer_conf) + self.id = id + self.gconf = global_conf + self.dim_in = layer_conf.dim_in + self.dim_out = layer_conf.dim_out + self:check_dim_len(1, 1) -- two inputs: nn output and label +end + +function SoftmaxLayer:init(batch_size) + if self.dim_in[1] ~= self.dim_out[1] then + nerv.error("mismatching dimensions of input and output") + end +end + +function SoftmaxLayer:batch_resize(batch_size) + -- do nothing +end + +function SoftmaxLayer:update(bp_err, input, output) + -- no params, therefore do nothing +end + +function SoftmaxLayer:propagate(input, output) + output[1]:softmax(input[1]) +end + +function SoftmaxLayer:back_propagate(bp_err, next_bp_err, input, output) + nerv.error_method_not_implemented() +end + +function SoftmaxLayer:get_params() + return nerv.ParamRepo({}) +end diff --git a/nerv/layer/softmax_ce.lua b/nerv/layer/softmax_ce.lua index f878a2f..9071e86 100644 --- a/nerv/layer/softmax_ce.lua +++ b/nerv/layer/softmax_ce.lua @@ -23,6 +23,13 @@ function SoftmaxCELayer:init(batch_size) self.ce = self.softmax:create() end +function SoftmaxCELayer:batch_resize(batch_size) + if self.softmax:nrow() ~= batch_resize then + self.softmax = self.gconf.cumat_type(batch_size, self.dim_in[1]) + self.ce = self.softmax:create() + end +end + function SoftmaxCELayer:update(bp_err, input, output) -- no params, therefore do nothing end diff --git a/nerv/layer/window.lua b/nerv/layer/window.lua index 4e9a3b1..8eed352 100644 --- a/nerv/layer/window.lua +++ b/nerv/layer/window.lua @@ -18,6 +18,10 @@ function WindowLayer:init() end end +function WindowLayer:batch_resize(batch_size) + -- do nothing +end + function WindowLayer:propagate(input, output) output[1]:copy_fromd(input[1]) output[1]:scale_rows_by_row(self.window.trans) diff --git a/nerv/lib/matrix/cukernel.h b/nerv/lib/matrix/cukernel.h index 7bb4c2c..2126c6f 100644 --- a/nerv/lib/matrix/cukernel.h +++ b/nerv/lib/matrix/cukernel.h @@ -18,4 +18,6 @@ void cudak_(cuda_rearrange_frm)(const Matrix *a, Matrix *b, int step); void cudak_(cuda_scale_rows_by_row)(const Matrix *a, Matrix *b); void cudak_(cuda_scale_rows_by_col)(const Matrix *a, Matrix *b); void cudak_(cuda_decompress)(const Matrix *a, Matrix *b); +void cudak_(cuda_copy_rows_by_idx)(const Matrix *a, Matrix *b, + const Matrix *idx, int b_begin); #endif diff --git 
a/nerv/lib/matrix/cumatrix.c b/nerv/lib/matrix/cumatrix.c index aa81bfc..ff1168d 100644 --- a/nerv/lib/matrix/cumatrix.c +++ b/nerv/lib/matrix/cumatrix.c @@ -49,6 +49,7 @@ void nerv_cumatrix_init() { #define NERV_CUBLAS_(NAME) cublasS##NAME #define MATRIX_CUMATRIX_HOST_TNAME nerv_matrix_host_float_tname #include "generic/cumatrix.c" + #undef NERV_CUBLAS_ #undef cudak_ #undef nerv_matrix_ diff --git a/nerv/lib/matrix/cumatrix.h b/nerv/lib/matrix/cumatrix.h index 9f71507..e6def66 100644 --- a/nerv/lib/matrix/cumatrix.h +++ b/nerv/lib/matrix/cumatrix.h @@ -1,5 +1,6 @@ #ifndef NERV_CUMATRIX_H #define NERV_CUMATRIX_H +#include "matrix.h" void nerv_cumatrix_print_profile(); void nerv_cumatrix_clear_profile(); void nerv_cumatrix_init(); diff --git a/nerv/lib/matrix/generic/cukernel.cu b/nerv/lib/matrix/generic/cukernel.cu index e337798..08feb59 100644 --- a/nerv/lib/matrix/generic/cukernel.cu +++ b/nerv/lib/matrix/generic/cukernel.cu @@ -284,6 +284,15 @@ __global__ void cudak_(gen_col_idx)(MATRIX_ELEM *b, b[j + i * stride] = j; } +__global__ void cudak_(copy_rows_by_idx)(const MATRIX_ELEM *a, MATRIX_ELEM *b, + const MATRIX_ELEM *idx, int b_begin, + int nrow, int ncol, int stride) { + int j = blockIdx.x * blockDim.x + threadIdx.x; + int i = blockIdx.y * blockDim.y + threadIdx.y; + if (i >= nrow || j >= ncol) return; + b[j + i * stride] = a[j + lrintf(idx[i]) * stride]; +} + extern "C" { #include "../cukernel.h" void cudak_(cuda_log_elem)(const Matrix *a, Matrix *b) { @@ -589,5 +598,16 @@ extern "C" { b->stride / sizeof(MATRIX_ELEM)); cudaStreamSynchronize(0); } + + void cudak_(cuda_copy_rows_by_idx)(const Matrix *a, Matrix *b, + const Matrix *idx, int b_begin) { + dim3 threadsPerBlock(CUDA_THREADS_NN, 1); + dim3 numBlocks(CEIL_DIV(b->ncol, threadsPerBlock.x), b->nrow); + cudak_(copy_rows_by_idx)<<<numBlocks, threadsPerBlock>>> \ + (MATRIX_ELEM_PTR(a), MATRIX_ELEM_PTR(b), + MATRIX_ELEM_PTR(idx) + b_begin, + b_begin, b->nrow, b->ncol, b->stride / sizeof(MATRIX_ELEM)); + cudaStreamSynchronize(0); + } } #endif diff --git a/nerv/lib/matrix/generic/cumatrix.c b/nerv/lib/matrix/generic/cumatrix.c index 40a0030..770e503 100644 --- a/nerv/lib/matrix/generic/cumatrix.c +++ b/nerv/lib/matrix/generic/cumatrix.c @@ -315,16 +315,17 @@ void nerv_matrix_(copy_rows_fromh_by_idx)(Matrix *a, const Matrix *b, long nrow = a->nrow; if (!(0 <= b_begin && b_begin + nrow <= idx->ncol)) NERV_EXIT_STATUS(status, MAT_INVALID_COPY_INTERVAL, 0); - long *idx_ptr = idx->data.i; + float *idx_ptr = idx->data.f; int i; if (idx->nrow != 1) NERV_EXIT_STATUS(status, MAT_IDX_VECTOR_EXP, 0); if (a->ncol != b->ncol) NERV_EXIT_STATUS(status, MAT_MISMATCH_DIM, 0); + PROFILE_START cudaStream_t *streams = (cudaStream_t*)malloc(sizeof(cudaStream_t) * nrow); for (i = 0; i < nrow; i++) { - int src_row = idx_ptr[b_begin + i]; + int src_row = (int)idx_ptr[b_begin + i]; if (!(0 <= src_row && src_row < b->nrow)) NERV_EXIT_STATUS(status, MAT_INVALID_IDX, 0); CUDA_SAFE_CALL(cudaStreamCreate(streams + i), status); @@ -339,6 +340,22 @@ void nerv_matrix_(copy_rows_fromh_by_idx)(Matrix *a, const Matrix *b, CUDA_SAFE_CALL(cudaStreamDestroy(streams[i]), status); } free(streams); + PROFILE_STOP + NERV_SET_STATUS(status, NERV_NORMAL, 0); +} + +void nerv_matrix_(copy_rows_fromd_by_idx)(Matrix *a, const Matrix *b, + const Matrix *idx, int b_begin, Status *status) { + long nrow = a->nrow; + if (!(0 <= b_begin && b_begin + nrow <= idx->ncol)) + NERV_EXIT_STATUS(status, MAT_INVALID_COPY_INTERVAL, 0); + if (idx->nrow != 1) + NERV_EXIT_STATUS(status, 
MAT_IDX_VECTOR_EXP, 0); + if (a->ncol != b->ncol) + NERV_EXIT_STATUS(status, MAT_MISMATCH_DIM, 0); + PROFILE_START + cudak_(cuda_copy_rows_by_idx)(b, a, idx, b_begin); + PROFILE_STOP NERV_SET_STATUS(status, NERV_NORMAL, 0); } diff --git a/nerv/lib/matrix/generic/cumatrix.h b/nerv/lib/matrix/generic/cumatrix.h index 3f1f8a3..04e8c5a 100644 --- a/nerv/lib/matrix/generic/cumatrix.h +++ b/nerv/lib/matrix/generic/cumatrix.h @@ -40,6 +40,8 @@ void nerv_matrix_(log_elem)(Matrix *b, const Matrix *a, Status *status); Matrix *nerv_matrix_(decompress)(const Matrix *a, int orig_col, Status *status); void nerv_matrix_(copy_rows_fromh_by_idx)(Matrix *a, const Matrix *b, const Matrix *idx, int b_begin, Status *status); +void nerv_matrix_(copy_rows_fromd_by_idx)(Matrix *a, const Matrix *b, + const Matrix *idx, int b_begin, Status *status); void nerv_matrix_(expand_frm)(Matrix *a, const Matrix *b, int context, Status *status); diff --git a/nerv/lib/matrix/generic/matrix.c b/nerv/lib/matrix/generic/matrix.c index e4afa37..4246751 100644 --- a/nerv/lib/matrix/generic/matrix.c +++ b/nerv/lib/matrix/generic/matrix.c @@ -3,7 +3,8 @@ #include "matrix.h" /* FIXME: malloc failure detection */ -static void nerv_matrix_(data_free)(Matrix *self, Status *status) { +void nerv_matrix_(data_free)(Matrix *self, Status *status) { + if(*self->data_ref == 0) return; /* FIXME: repeat free memory */ assert(*self->data_ref > 0); if (--(*self->data_ref) == 0) { @@ -18,7 +19,7 @@ static void nerv_matrix_(data_free)(Matrix *self, Status *status) { } } -static void nerv_matrix_(data_retain)(Matrix *self) { +void nerv_matrix_(data_retain)(Matrix *self) { (*self->data_ref)++; } diff --git a/nerv/lib/matrix/generic/matrix.h b/nerv/lib/matrix/generic/matrix.h index 9d44e6d..69b4e6d 100644 --- a/nerv/lib/matrix/generic/matrix.h +++ b/nerv/lib/matrix/generic/matrix.h @@ -2,3 +2,5 @@ Matrix *nerv_matrix_(create)(long nrow, long ncol, Status *status); void nerv_matrix_(destroy)(Matrix *self, Status *status); Matrix *nerv_matrix_(getrow)(Matrix *self, int row); +void nerv_matrix_(data_free)(Matrix *self, Status *status); +void nerv_matrix_(data_retain)(Matrix *self); diff --git a/nerv/lib/matrix/mmatrix.c b/nerv/lib/matrix/mmatrix.c index 94f1ea8..b8157eb 100644 --- a/nerv/lib/matrix/mmatrix.c +++ b/nerv/lib/matrix/mmatrix.c @@ -7,6 +7,25 @@ #define nerv_matrix_(NAME) nerv_matrix_host_float_##NAME #include "generic/matrix.h" #include "generic/mmatrix.c" + +Matrix *nerv_matrix_(perm_gen)(int ncol, Status *status) { + int i; + Matrix *self = nerv_matrix_(create)(1, ncol, status); + if (status->err_code != NERV_NORMAL) + return NULL; + float *prow = self->data.f; + for (i = 0; i < ncol; i++) + prow[i] = i; + for (i = ncol - 1; i >= 0; i--) + { + size_t j = rand() % (i + 1); + float tmp = prow[i]; + prow[i] = prow[j]; + prow[j] = tmp; + } + return self; +} + #undef nerv_matrix_ #undef host_matrix_ #undef MATRIX_USE_FLOAT @@ -33,21 +52,3 @@ #define host_matrix_(NAME) host_matrix_int_##NAME #define nerv_matrix_(NAME) nerv_matrix_host_int_##NAME #include "generic/mmatrix.c" - -Matrix *nerv_matrix_(perm_gen)(int ncol, Status *status) { - int i; - Matrix *self = nerv_matrix_(create)(1, ncol, status); - if (status->err_code != NERV_NORMAL) - return NULL; - long *prow = self->data.i; - for (i = 0; i < ncol; i++) - prow[i] = i; - for (i = ncol - 1; i >= 0; i--) - { - size_t j = rand() % (i + 1); - long tmp = prow[i]; - prow[i] = prow[j]; - prow[j] = tmp; - } - return self; -} diff --git a/nerv/lib/matrix/mmatrix.h b/nerv/lib/matrix/mmatrix.h index 
df91e4c..31e7984 100644 --- a/nerv/lib/matrix/mmatrix.h +++ b/nerv/lib/matrix/mmatrix.h @@ -1,4 +1,5 @@ #ifndef NERV_MMATRIX_H #define NERV_MMATRIX_H -Matrix *nerv_matrix_(perm_gen)(int ncol, Status *status); +#include "matrix.h" +Matrix *nerv_matrix_host_float_perm_gen(int ncol, Status *status); #endif diff --git a/nerv/matrix/generic/cukernel.cu b/nerv/matrix/generic/cukernel.cu deleted file mode 100644 index 2ae5e62..0000000 --- a/nerv/matrix/generic/cukernel.cu +++ /dev/null @@ -1,592 +0,0 @@ -#ifdef NERV_GENERIC_CUKERNEL -#include <assert.h> -#include <stdio.h> -#include "matrix.h" -#include "cuda.h" -#include "float.h" -#define CUDA_THREADS_N 16 -#define CUDA_THREADS_NN ((CUDA_THREADS_N) * (CUDA_THREADS_N)) -#define CEIL_DIV(a, b) (((a) + (b) - 1) / (b)) -__global__ void cudak_(log_elem)(const MATRIX_ELEM *a, MATRIX_ELEM *b, - int nrow, int ncol, int stride) { - int j = blockIdx.x * blockDim.x + threadIdx.x; - int i = blockIdx.y * blockDim.y + threadIdx.y; - long idx; - MATRIX_ELEM tmp; - if (i >= nrow || j >= ncol) return; - idx = j + i * stride; - tmp = a[idx]; - if(tmp < FLT_MIN) tmp = FLT_MIN; - b[idx] = log(tmp); -} - -__global__ void cudak_(mul_elem)(const MATRIX_ELEM *a, const MATRIX_ELEM *b, - MATRIX_ELEM *c, - int nrow, int ncol, int stride) { - int j = blockIdx.x * blockDim.x + threadIdx.x; - int i = blockIdx.y * blockDim.y + threadIdx.y; - long idx; - if (i >= nrow || j >= ncol) return; - idx = j + i * stride; - c[idx] = a[idx] * b[idx]; -} - -__global__ void cudak_(sigmoid)(const MATRIX_ELEM *a, MATRIX_ELEM *b, - int nrow, int ncol, int stride) { - int j = blockIdx.x * blockDim.x + threadIdx.x; - int i = blockIdx.y * blockDim.y + threadIdx.y; - long idx; - if (i >= nrow || j >= ncol) return; - idx = j + i * stride; - b[idx] = 1.0 / (1.0 + exp(-a[idx])); -} - -__global__ void cudak_(sigmoid_grad)(const MATRIX_ELEM *output, - const MATRIX_ELEM *err, - MATRIX_ELEM *nerr, - int nrow, int ncol, int stride) { - int j = blockIdx.x * blockDim.x + threadIdx.x; - int i = blockIdx.y * blockDim.y + threadIdx.y; - long idx; - if (i >= nrow || j >= ncol) return; - idx = j + i * stride; - nerr[idx] = output[idx] * (1.0 - output[idx]) * err[idx]; -} - -__global__ void cudak_(softmax_final)(const MATRIX_ELEM *a, MATRIX_ELEM *b, - const MATRIX_ELEM *max, const MATRIX_ELEM *deno, - int nrow, int ncol, int stride, int mstride) { - int j = blockIdx.x * blockDim.x + threadIdx.x; - int i = blockIdx.y * blockDim.y + threadIdx.y; - long idx; - if (i >= nrow || j >= ncol) return; - idx = j + i * stride; - b[idx] = exp(a[idx] - max[0 + i * mstride]) / deno[0 + i * mstride]; -} - -__global__ void cudak_(block_reduce_rowsum)(const MATRIX_ELEM *input, - MATRIX_ELEM *output, - const int istride, const int ostride, - const int n) { - extern __shared__ MATRIX_ELEM cudak_(arr)[]; - int j = blockIdx.x * blockDim.x + threadIdx.x; - cudak_(arr)[threadIdx.x] = j < n ? input[j + istride * blockIdx.y] : 0; - __syncthreads(); - for (int offset = blockDim.x >> 1; offset; offset >>= 1) - { - if (threadIdx.x < offset) - cudak_(arr)[threadIdx.x] += cudak_(arr)[threadIdx.x + offset]; - __syncthreads(); - } - if (threadIdx.x == 0) - output[blockIdx.x + ostride * blockIdx.y] = cudak_(arr)[0]; -} - -__global__ void cudak_(block_reduce_colsum)(const MATRIX_ELEM *input, - MATRIX_ELEM *output, - const int istride, const int ostride, - const int n) { - extern __shared__ MATRIX_ELEM cudak_(arr)[]; - int i = blockIdx.y * blockDim.y + threadIdx.y; - cudak_(arr)[threadIdx.y] = i < n ? 
input[blockIdx.x + istride * i] : 0; - __syncthreads(); - for (int offset = blockDim.y >> 1; offset; offset >>= 1) - { - if (threadIdx.y < offset) - cudak_(arr)[threadIdx.y] += cudak_(arr)[threadIdx.y + offset]; - __syncthreads(); - } - if (threadIdx.y == 0) - output[blockIdx.x + ostride * blockIdx.y] = cudak_(arr)[0]; -} - -__global__ void cudak_(block_reduce_colsame)(const MATRIX_ELEM *input, - const MATRIX_ELEM *ref_input, - MATRIX_ELEM *output, - const int istride, const int ostride, - const int n) { - extern __shared__ MATRIX_ELEM cudak_(arr)[]; - int i = blockIdx.y * blockDim.y + threadIdx.y; - cudak_(arr)[threadIdx.y] = (i < n && input[blockIdx.x + istride * i] == \ - ref_input[blockIdx.x + istride * i]) ? 1.0 : 0; - __syncthreads(); - for (int offset = blockDim.y >> 1; offset; offset >>= 1) - { - if (threadIdx.y < offset) - cudak_(arr)[threadIdx.y] += cudak_(arr)[threadIdx.y + offset]; - __syncthreads(); - } - if (threadIdx.y == 0) - output[blockIdx.x + ostride * blockIdx.y] = cudak_(arr)[0]; -} - -__global__ void cudak_(block_reduce_softmax_rowsum)(const MATRIX_ELEM *input, - MATRIX_ELEM *output, - const MATRIX_ELEM *max, - const int istride, const int ostride, - const int mstride, const int n) { - extern __shared__ MATRIX_ELEM cudak_(arr)[]; - int j = blockIdx.x * blockDim.x + threadIdx.x; - cudak_(arr)[threadIdx.x] = j < n ? exp(input[j + istride * blockIdx.y] - \ - max[0 + mstride * blockIdx.y]) : 0; - __syncthreads(); - for (int offset = blockDim.x >> 1; offset; offset >>= 1) - { - if (threadIdx.x < offset) - cudak_(arr)[threadIdx.x] += cudak_(arr)[threadIdx.x + offset]; - __syncthreads(); - } - if (threadIdx.x == 0) - output[blockIdx.x + ostride * blockIdx.y] = cudak_(arr)[0]; -} - -__global__ void cudak_(block_reduce_rowmax)(const MATRIX_ELEM *input, - MATRIX_ELEM *output, - const int istride, const int ostride, - const int n) { - extern __shared__ MATRIX_ELEM cudak_(arr)[]; - int j = blockIdx.x * blockDim.x + threadIdx.x; - cudak_(arr)[threadIdx.x] = j < n ? input[j + istride * blockIdx.y] : -FLT_MAX; - __syncthreads(); - for (int offset = blockDim.x >> 1; offset; offset >>= 1) - { - if (threadIdx.x < offset) - { - MATRIX_ELEM l = cudak_(arr)[threadIdx.x], - r = cudak_(arr)[threadIdx.x + offset]; - if (r > l) - cudak_(arr)[threadIdx.x] = r; - } - __syncthreads(); - } - if (threadIdx.x == 0) - output[blockIdx.x + ostride * blockIdx.y] = cudak_(arr)[0]; -} - -__global__ void cudak_(block_reduce_rowmax_idx)(const MATRIX_ELEM *input, - const MATRIX_ELEM *idx_input, - MATRIX_ELEM *output, - MATRIX_ELEM *idx_output, - const int istride, const int ostride, - const int n) { - extern __shared__ MATRIX_ELEM cudak_(arr)[]; - MATRIX_ELEM *arr_val = cudak_(arr); - MATRIX_ELEM *arr_idx = arr_val + blockDim.x; - int j = blockIdx.x * blockDim.x + threadIdx.x; - arr_val[threadIdx.x] = j < n ? input[j + istride * blockIdx.y] : -FLT_MAX; - arr_idx[threadIdx.x] = j < n ? 
idx_input[j + istride * blockIdx.y] : 0; - __syncthreads(); - for (int offset = blockDim.x >> 1; offset; offset >>= 1) - { - if (threadIdx.x < offset) - { - MATRIX_ELEM l = arr_val[threadIdx.x], - r = arr_val[threadIdx.x + offset]; - if (r > l) - { - arr_val[threadIdx.x] = r; - arr_idx[threadIdx.x] = arr_idx[threadIdx.x + offset]; - } - } - __syncthreads(); - } - if (threadIdx.x == 0) - { - output[blockIdx.x + ostride * blockIdx.y] = arr_val[0]; - idx_output[blockIdx.x + ostride * blockIdx.y] = arr_idx[0]; - } -} - -__global__ void cudak_(add_row)(const MATRIX_ELEM *a, MATRIX_ELEM *b, - int nrow, int ncol, int stride, double beta) { - int j = blockIdx.x * blockDim.x + threadIdx.x; - int i = blockIdx.y * blockDim.y + threadIdx.y; - if (i >= nrow || j >= ncol) return; - b[j + i * stride] += beta * a[j]; -} - -__global__ void cudak_(fill)(MATRIX_ELEM *a, - int nrow, int ncol, int stride, double val) { - int j = blockIdx.x * blockDim.x + threadIdx.x; - int i = blockIdx.y * blockDim.y + threadIdx.y; - if (i >= nrow || j >= ncol) return; - a[j + i * stride] = val; -} - -__global__ void cudak_(clip)(MATRIX_ELEM *a, - int nrow, int ncol, int stride, double val_1, double val_2) { - int j = blockIdx.x * blockDim.x + threadIdx.x; - int i = blockIdx.y * blockDim.y + threadIdx.y; - if (i >= nrow || j >= ncol) return; - if (a[j + i * stride] > val_2) - a[j + i * stride] = val_2; - else if (a[j + i * stride] < val_1) - a[j + i * stride] = val_1; -} - -__global__ void cudak_(expand_frm)(const MATRIX_ELEM *a, MATRIX_ELEM *b, - int nrow, int ncol, - int enrow, int encol, - int stride, int estride, - int context) { - int j = blockIdx.x * blockDim.x + threadIdx.x; - int i = blockIdx.y * blockDim.y + threadIdx.y; - int ridx; - if (i >= enrow || j >= encol) return; - ridx = i + j / ncol - context; - if (ridx < 0) ridx = 0; - else if (ridx >= nrow) ridx = nrow - 1; - b[j + i * estride] = a[j % ncol + ridx * stride]; -} - -__global__ void cudak_(rearrange_frm)(const MATRIX_ELEM *a, MATRIX_ELEM *b, - int nrow, int ncol, - int stride, int step, int orig_dim) { - int j = blockIdx.x * blockDim.x + threadIdx.x; - int i = blockIdx.y * blockDim.y + threadIdx.y; - if (i >= nrow || j >= ncol) return; - b[j + i * stride] = a[j / step + (j % step) * orig_dim + i * stride]; -} - -__global__ void cudak_(scale_rows_by_col)(const MATRIX_ELEM *a, MATRIX_ELEM *b, - int nrow, int ncol, - int astride, int bstride) { - int j = blockIdx.x * blockDim.x + threadIdx.x; - int i = blockIdx.y * blockDim.y + threadIdx.y; - if (i >= nrow || j >= ncol) return; - b[j + i * bstride] *= a[i * astride]; -} - -__global__ void cudak_(scale_rows_by_row)(const MATRIX_ELEM *a, MATRIX_ELEM *b, - int nrow, int ncol, - int stride) { - int j = blockIdx.x * blockDim.x + threadIdx.x; - int i = blockIdx.y * blockDim.y + threadIdx.y; - if (i >= nrow || j >= ncol) return; - b[j + i * stride] *= a[j]; -} - -__global__ void cudak_(decompress)(const MATRIX_ELEM *a, MATRIX_ELEM *b, - int nrow, int ncol, - int stride_a, int stride_b) { - int j = blockIdx.x * blockDim.x + threadIdx.x; - int i = blockIdx.y * blockDim.y + threadIdx.y; - if (i >= nrow || j >= ncol) return; - b[lrintf(a[j + i * stride_a]) + i * stride_b] = 1.0; -} - -__global__ void cudak_(gen_col_idx)(MATRIX_ELEM *b, - int nrow, int ncol, int stride) { - int j = blockIdx.x * blockDim.x + threadIdx.x; - int i = blockIdx.y * blockDim.y + threadIdx.y; - if (i >= nrow || j >= ncol) return; - b[j + i * stride] = j; -} - -extern "C" { -#include "../cukernel.h" - void cudak_(cuda_log_elem)(const Matrix *a, 
Matrix *b) { - dim3 threadsPerBlock(CUDA_THREADS_N, CUDA_THREADS_N); - dim3 numBlocks(CEIL_DIV(b->ncol, threadsPerBlock.x), - CEIL_DIV(b->nrow, threadsPerBlock.y)); - cudak_(log_elem)<<<numBlocks, threadsPerBlock>>> \ - (MATRIX_ELEM_PTR(a), MATRIX_ELEM_PTR(b), - b->nrow, b->ncol, b->stride / sizeof(MATRIX_ELEM)); - cudaStreamSynchronize(0); - } - - void cudak_(cuda_mul_elem)(const Matrix *a, const Matrix *b, - Matrix *c) { - dim3 threadsPerBlock(CUDA_THREADS_N, CUDA_THREADS_N); - dim3 numBlocks(CEIL_DIV(b->ncol, threadsPerBlock.x), - CEIL_DIV(b->nrow, threadsPerBlock.y)); - cudak_(mul_elem)<<<numBlocks, threadsPerBlock>>> \ - (MATRIX_ELEM_PTR(a), MATRIX_ELEM_PTR(b), - MATRIX_ELEM_PTR(c), - b->nrow, b->ncol, b->stride / sizeof(MATRIX_ELEM)); - cudaStreamSynchronize(0); - } - - void cudak_(cuda_sigmoid)(const Matrix *a, Matrix *b) { - dim3 threadsPerBlock(CUDA_THREADS_N, CUDA_THREADS_N); - dim3 numBlocks(CEIL_DIV(b->ncol, threadsPerBlock.x), - CEIL_DIV(b->nrow, threadsPerBlock.y)); - cudak_(sigmoid)<<<numBlocks, threadsPerBlock>>> \ - (MATRIX_ELEM_PTR(a), MATRIX_ELEM_PTR(b), b->nrow, b->ncol, - b->stride / sizeof(MATRIX_ELEM)); - cudaStreamSynchronize(0); - } - - void cudak_(cuda_sigmoid_grad)(const Matrix *output, - const Matrix *err, Matrix *nerr) { - dim3 threadsPerBlock(CUDA_THREADS_N, CUDA_THREADS_N); - dim3 numBlocks(CEIL_DIV(nerr->ncol, threadsPerBlock.x), - CEIL_DIV(nerr->nrow, threadsPerBlock.y)); - cudak_(sigmoid_grad)<<<numBlocks, threadsPerBlock>>> \ - (MATRIX_ELEM_PTR(output), MATRIX_ELEM_PTR(err), - MATRIX_ELEM_PTR(nerr), - nerr->nrow, nerr->ncol, - nerr->stride / sizeof(MATRIX_ELEM)); - cudaStreamSynchronize(0); - } - - void cudak_(cuda_rowsum)(const Matrix *a, Matrix *b) { - dim3 block(CUDA_THREADS_NN, 1); - int ncol = a->ncol; - int blocks_per_row = CEIL_DIV(ncol, block.x); - dim3 grid(blocks_per_row, a->nrow); - MATRIX_ELEM *res; - size_t stride; - cudaMallocPitch(&res, &stride, blocks_per_row * sizeof(MATRIX_ELEM), a->nrow); - cudak_(block_reduce_rowsum)<<<grid, block, block.x * sizeof(MATRIX_ELEM)>>> \ - (MATRIX_ELEM_PTR(a), res, - a->stride / sizeof(MATRIX_ELEM), stride / sizeof(MATRIX_ELEM), - ncol); - ncol = blocks_per_row; - assert((unsigned long)ncol <= block.x); - grid.x = 1; - cudaStreamSynchronize(0); - cudak_(block_reduce_rowsum)<<<grid, block, block.x * sizeof(MATRIX_ELEM)>>> \ - (res, MATRIX_ELEM_PTR(b), - stride / sizeof(MATRIX_ELEM), b->stride / sizeof(MATRIX_ELEM), - ncol); - cudaStreamSynchronize(0); - cudaFree(res); - } - - void cudak_(cuda_colsame)(const Matrix *a, const Matrix *ref, Matrix *b) { - dim3 block(1, CUDA_THREADS_NN); - int nrow = a->nrow; - int blocks_per_col = CEIL_DIV(nrow, block.y); - dim3 grid(a->ncol, blocks_per_col); - MATRIX_ELEM *res; - size_t stride; - cudaMallocPitch(&res, &stride, a->ncol * sizeof(MATRIX_ELEM), blocks_per_col); - cudak_(block_reduce_colsame)<<<grid, block, block.y * sizeof(MATRIX_ELEM)>>> \ - (MATRIX_ELEM_PTR(a), MATRIX_ELEM_PTR(ref), res, - a->stride / sizeof(MATRIX_ELEM), stride / sizeof(MATRIX_ELEM), - nrow); - nrow = blocks_per_col; - assert((unsigned long)nrow <= block.y); - grid.y = 1; - cudaStreamSynchronize(0); - cudak_(block_reduce_colsum)<<<grid, block, block.y * sizeof(MATRIX_ELEM)>>> \ - (res, MATRIX_ELEM_PTR(b), - stride / sizeof(MATRIX_ELEM), b->stride / sizeof(MATRIX_ELEM), - nrow); - cudaStreamSynchronize(0); - cudaFree(res); - } - - void cudak_(cuda_colsum)(const Matrix *a, Matrix *b) { - dim3 block(1, CUDA_THREADS_NN); - int nrow = a->nrow; - int blocks_per_col = CEIL_DIV(nrow, block.y); - dim3 
grid(a->ncol, blocks_per_col); - MATRIX_ELEM *res; - size_t stride; - cudaMallocPitch(&res, &stride, a->ncol * sizeof(MATRIX_ELEM), blocks_per_col); - cudak_(block_reduce_colsum)<<<grid, block, block.y * sizeof(MATRIX_ELEM)>>> \ - (MATRIX_ELEM_PTR(a), res, - a->stride / sizeof(MATRIX_ELEM), stride / sizeof(MATRIX_ELEM), - nrow); - nrow = blocks_per_col; - assert((unsigned long)nrow <= block.y); - grid.y = 1; - cudaStreamSynchronize(0); - cudak_(block_reduce_colsum)<<<grid, block, block.y * sizeof(MATRIX_ELEM)>>> \ - (res, MATRIX_ELEM_PTR(b), - stride / sizeof(MATRIX_ELEM), b->stride / sizeof(MATRIX_ELEM), - nrow); - cudaStreamSynchronize(0); - cudaFree(res); - } - - void cudak_(cuda_softmax_final)(const Matrix *a, const Matrix *max, - const Matrix *deno, Matrix *b) { - dim3 threadsPerBlock(CUDA_THREADS_N, CUDA_THREADS_N); - dim3 numBlocks(CEIL_DIV(b->ncol, threadsPerBlock.x), - CEIL_DIV(b->nrow, threadsPerBlock.y)); - cudak_(softmax_final)<<<numBlocks, threadsPerBlock>>> \ - (MATRIX_ELEM_PTR(a), MATRIX_ELEM_PTR(b), - MATRIX_ELEM_PTR(max), MATRIX_ELEM_PTR(deno), - b->nrow, b->ncol, - b->stride / sizeof(MATRIX_ELEM), - max->stride / sizeof(MATRIX_ELEM)); - cudaStreamSynchronize(0); - } - - void cudak_(cuda_softmax_denominator)(const Matrix *a, const Matrix *max, Matrix *b) { - dim3 block(CUDA_THREADS_NN, 1); - int ncol = a->ncol; - int blocks_per_row = CEIL_DIV(ncol, block.x); - dim3 grid(blocks_per_row, a->nrow); - MATRIX_ELEM *res; - size_t stride; - assert(max->ncol == 1); - cudaMallocPitch(&res, &stride, blocks_per_row * sizeof(MATRIX_ELEM), a->nrow); - cudak_(block_reduce_softmax_rowsum) \ - <<<grid, block, block.x * sizeof(MATRIX_ELEM)>>> \ - (MATRIX_ELEM_PTR(a), res, MATRIX_ELEM_PTR(max), - a->stride / sizeof(MATRIX_ELEM), stride / sizeof(MATRIX_ELEM), - max->stride / sizeof(MATRIX_ELEM), - ncol); - ncol = blocks_per_row; - assert((unsigned long)ncol <= block.x); - grid.x = 1; - cudaStreamSynchronize(0); - cudak_(block_reduce_rowsum) \ - <<<grid, block, block.x * sizeof(MATRIX_ELEM)>>> \ - (res, MATRIX_ELEM_PTR(b), - stride / sizeof(MATRIX_ELEM), b->stride / sizeof(MATRIX_ELEM), - ncol); - cudaStreamSynchronize(0); - cudaFree(res); - } - - void cudak_(cuda_rowmax)(const Matrix *a, Matrix *b) { - dim3 block(CUDA_THREADS_NN, 1); - int ncol = a->ncol; - int blocks_per_row = CEIL_DIV(ncol, block.x); - dim3 grid(blocks_per_row, a->nrow); - MATRIX_ELEM *res; - size_t stride; - cudaMallocPitch(&res, &stride, blocks_per_row * sizeof(MATRIX_ELEM), a->nrow); - cudak_(block_reduce_rowmax)<<<grid, block, block.x * sizeof(MATRIX_ELEM)>>> \ - (MATRIX_ELEM_PTR(a), res, - a->stride / sizeof(MATRIX_ELEM), stride / sizeof(MATRIX_ELEM), - ncol); - ncol = blocks_per_row; - assert((unsigned long)ncol <= block.x); - grid.x = 1; - cudaStreamSynchronize(0); - cudak_(block_reduce_rowmax)<<<grid, block, block.x * sizeof(MATRIX_ELEM)>>> \ - (res, MATRIX_ELEM_PTR(b), - stride / sizeof(MATRIX_ELEM), b->stride / sizeof(MATRIX_ELEM), - ncol); - cudaStreamSynchronize(0); - cudaFree(res); - } - - void cudak_(cuda_rowmax_idx)(const Matrix *a, Matrix *b, Matrix *b_idx) { - dim3 block(CUDA_THREADS_NN, 1); - int ncol = a->ncol; - int blocks_per_row = CEIL_DIV(ncol, block.x); - dim3 grid(blocks_per_row, a->nrow); - MATRIX_ELEM *a_idx, *res, *res_idx; - size_t stride; - cudaMallocPitch(&a_idx, &stride, a->stride, a->nrow); - cudak_(gen_col_idx)<<<grid, block>>>(a_idx, a->nrow, ncol, stride / sizeof(MATRIX_ELEM)); - cudaMallocPitch(&res, &stride, blocks_per_row * sizeof(MATRIX_ELEM), a->nrow); - cudaMallocPitch(&res_idx, 
&stride, blocks_per_row * sizeof(MATRIX_ELEM), a->nrow); - cudaStreamSynchronize(0); - cudak_(block_reduce_rowmax_idx)<<<grid, block, - 2 * block.x * sizeof(MATRIX_ELEM)>>> \ - (MATRIX_ELEM_PTR(a), a_idx, res, res_idx, - a->stride / sizeof(MATRIX_ELEM), stride / sizeof(MATRIX_ELEM), - ncol); - ncol = blocks_per_row; - assert((unsigned long)ncol <= block.x); - grid.x = 1; - cudaStreamSynchronize(0); - cudak_(block_reduce_rowmax_idx)<<<grid, block, - 2 * block.x * sizeof(MATRIX_ELEM)>>> \ - (res, res_idx, MATRIX_ELEM_PTR(b), MATRIX_ELEM_PTR(b_idx), - stride / sizeof(MATRIX_ELEM), b->stride / sizeof(MATRIX_ELEM), - ncol); - cudaStreamSynchronize(0); - cudaFree(a_idx); - cudaFree(res); - cudaFree(res_idx); - } - - /* in-place calc */ - void cudak_(cuda_add_row)(const Matrix *a, Matrix *b, double beta) { - dim3 threadsPerBlock(CUDA_THREADS_N, CUDA_THREADS_N); - dim3 numBlocks(CEIL_DIV(b->ncol, threadsPerBlock.x), - CEIL_DIV(b->nrow, threadsPerBlock.y)); - cudak_(add_row)<<<numBlocks, threadsPerBlock>>> \ - (MATRIX_ELEM_PTR(a), MATRIX_ELEM_PTR(b), b->nrow, b->ncol, - b->stride / sizeof(MATRIX_ELEM), beta); - cudaStreamSynchronize(0); - } - - void cudak_(cuda_fill)(Matrix *a, double val) { - dim3 threadsPerBlock(CUDA_THREADS_N, CUDA_THREADS_N); - dim3 numBlocks(CEIL_DIV(a->ncol, threadsPerBlock.x), - CEIL_DIV(a->nrow, threadsPerBlock.y)); - cudak_(fill)<<<numBlocks, threadsPerBlock>>> \ - (MATRIX_ELEM_PTR(a), a->nrow, a->ncol, - a->stride / sizeof(MATRIX_ELEM), val); - cudaStreamSynchronize(0); - } - - void cudak_(cuda_clip)(Matrix *a, double val_1, double val_2) { - dim3 threadsPerBlock(CUDA_THREADS_N, CUDA_THREADS_N); - dim3 numBlocks(CEIL_DIV(a->ncol, threadsPerBlock.x), - CEIL_DIV(a->nrow, threadsPerBlock.y)); - cudak_(clip)<<<numBlocks, threadsPerBlock>>> \ - (MATRIX_ELEM_PTR(a), a->nrow, a->ncol, - a->stride / sizeof(MATRIX_ELEM), val_1, val_2); - cudaStreamSynchronize(0); - } - - void cudak_(cuda_expand_frm)(const Matrix *a, Matrix *b, int context) { - dim3 threadsPerBlock(CUDA_THREADS_N, CUDA_THREADS_N); - dim3 numBlocks(CEIL_DIV(b->ncol, threadsPerBlock.x), - CEIL_DIV(b->nrow, threadsPerBlock.y)); - cudak_(expand_frm)<<<numBlocks, threadsPerBlock>>> \ - (MATRIX_ELEM_PTR(a), MATRIX_ELEM_PTR(b), - a->nrow, a->ncol, - b->nrow, b->ncol, - a->stride / sizeof(MATRIX_ELEM), - b->stride / sizeof(MATRIX_ELEM), - context); - cudaStreamSynchronize(0); - } - - void cudak_(cuda_rearrange_frm)(const Matrix *a, Matrix *b, int step) { - dim3 threadsPerBlock(CUDA_THREADS_N, CUDA_THREADS_N); - dim3 numBlocks(CEIL_DIV(b->ncol, threadsPerBlock.x), - CEIL_DIV(b->nrow, threadsPerBlock.y)); - cudak_(rearrange_frm)<<<numBlocks, threadsPerBlock>>> \ - (MATRIX_ELEM_PTR(a), MATRIX_ELEM_PTR(b), - b->nrow, b->ncol, b->stride / sizeof(MATRIX_ELEM), - step, b->ncol / step); - cudaStreamSynchronize(0); - } - - void cudak_(cuda_scale_rows_by_col)(const Matrix *a, Matrix *b) { - dim3 threadsPerBlock(CUDA_THREADS_N, CUDA_THREADS_N); - dim3 numBlocks(CEIL_DIV(b->ncol, threadsPerBlock.x), - CEIL_DIV(b->nrow, threadsPerBlock.y)); - cudak_(scale_rows_by_col)<<<numBlocks, threadsPerBlock>>> \ - (MATRIX_ELEM_PTR(a), MATRIX_ELEM_PTR(b), - b->nrow, b->ncol, - a->stride / sizeof(MATRIX_ELEM), - b->stride / sizeof(MATRIX_ELEM)); - cudaStreamSynchronize(0); - } - - void cudak_(cuda_scale_rows_by_row)(const Matrix *a, Matrix *b) { - dim3 threadsPerBlock(CUDA_THREADS_N, CUDA_THREADS_N); - dim3 numBlocks(CEIL_DIV(b->ncol, threadsPerBlock.x), - CEIL_DIV(b->nrow, threadsPerBlock.y)); - cudak_(scale_rows_by_row)<<<numBlocks, 
threadsPerBlock>>> \ - (MATRIX_ELEM_PTR(a), MATRIX_ELEM_PTR(b), - b->nrow, b->ncol, b->stride / sizeof(MATRIX_ELEM)); - cudaStreamSynchronize(0); - } - - void cudak_(cuda_decompress)(const Matrix *a, Matrix *b) { - dim3 threadsPerBlock(1, CUDA_THREADS_NN); - dim3 numBlocks(1, CEIL_DIV(a->nrow, threadsPerBlock.y)); - cudak_(decompress)<<<numBlocks, threadsPerBlock>>> \ - (MATRIX_ELEM_PTR(a), MATRIX_ELEM_PTR(b), - a->nrow, a->ncol, - a->stride / sizeof(MATRIX_ELEM), - b->stride / sizeof(MATRIX_ELEM)); - cudaStreamSynchronize(0); - } -} -#endif diff --git a/nerv/matrix/generic/cumatrix.c b/nerv/matrix/generic/cumatrix.c index 4bdf5f0..08cb4c2 100644 --- a/nerv/matrix/generic/cumatrix.c +++ b/nerv/matrix/generic/cumatrix.c @@ -43,15 +43,6 @@ static int nerv_matrix_(lua_mul)(lua_State *L) { return 0; } -static int nerv_matrix_(lua_create)(lua_State *L) { - Status status; - Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname)); - Matrix *b = nerv_matrix_(create)(a->nrow, a->ncol, &status); - NERV_LUA_CHECK_STATUS(L, status); - luaT_pushudata(L, b, nerv_matrix_(tname)); - return 1; -} - static int nerv_matrix_(lua_sigmoid)(lua_State *L) { Status status; Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname)); @@ -237,12 +228,12 @@ static int nerv_matrix_(lua_decompress)(lua_State *L) { return 1; } -extern const char *nerv_matrix_host_int_tname; +extern const char *nerv_matrix_host_float_tname; static int nerv_matrix_(lua_copy_rows_fromh_by_idx)(lua_State *L) { Status status; Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname)); const Matrix *b = luaT_checkudata(L, 2, MATRIX_CUMATRIX_HOST_TNAME); - const Matrix *idx = luaT_checkudata(L, 3, nerv_matrix_host_int_tname); + const Matrix *idx = luaT_checkudata(L, 3, nerv_matrix_host_float_tname); long nrow = a->nrow; int b_begin = lua_gettop(L) > 3 ? luaL_checkinteger(L, 4) : 0; nerv_matrix_(copy_rows_fromh_by_idx)(a, b, idx, b_begin, &status); @@ -250,6 +241,18 @@ static int nerv_matrix_(lua_copy_rows_fromh_by_idx)(lua_State *L) { return 0; } +static int nerv_matrix_(lua_copy_rows_fromd_by_idx)(lua_State *L) { + Status status; + Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname)); + const Matrix *b = luaT_checkudata(L, 2, nerv_matrix_(tname)); + const Matrix *idx = luaT_checkudata(L, 3, nerv_matrix_(tname)); + long nrow = a->nrow; + int b_begin = lua_gettop(L) > 3 ? 
luaL_checkinteger(L, 4) : 0; + nerv_matrix_(copy_rows_fromd_by_idx)(a, b, idx, b_begin, &status); + NERV_LUA_CHECK_STATUS(L, status); + return 0; +} + static int nerv_matrix_(lua_expand_frm)(lua_State *L) { Status status; Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname)); @@ -289,7 +292,6 @@ static int nerv_matrix_(lua_scale_rows_by_row)(lua_State *L) { } static const luaL_Reg nerv_matrix_(extra_methods)[] = { - {"create", nerv_matrix_(lua_create)}, {"colsum", nerv_matrix_(lua_colsum)}, {"colsame", nerv_matrix_(lua_colsame)}, {"rowsum", nerv_matrix_(lua_rowsum)}, @@ -300,6 +302,8 @@ static const luaL_Reg nerv_matrix_(extra_methods)[] = { /* in-place calc */ {"copy_fromh", nerv_matrix_(lua_copy_fromh)}, {"copy_fromd", nerv_matrix_(lua_copy_fromd)}, + /* alias for copy_fromd */ + {"copy_from", nerv_matrix_(lua_copy_fromd)}, {"copy_toh", nerv_matrix_(lua_copy_toh)}, {"add", nerv_matrix_(lua_add)}, {"mul", nerv_matrix_(lua_mul)}, @@ -312,6 +316,7 @@ static const luaL_Reg nerv_matrix_(extra_methods)[] = { {"mul_elem", nerv_matrix_(lua_mul_elem)}, {"log_elem", nerv_matrix_(lua_log_elem)}, {"copy_rows_fromh_by_idx", nerv_matrix_(lua_copy_rows_fromh_by_idx)}, + {"copy_rows_fromd_by_idx", nerv_matrix_(lua_copy_rows_fromd_by_idx)}, {"expand_frm", nerv_matrix_(lua_expand_frm)}, {"rearrange_frm", nerv_matrix_(lua_rearrange_frm)}, {"scale_rows_by_row", nerv_matrix_(lua_scale_rows_by_row)}, @@ -321,6 +326,9 @@ static const luaL_Reg nerv_matrix_(extra_methods)[] = { static void cuda_matrix_(init)(lua_State *L) { luaN_append_methods(L, nerv_matrix_(extra_methods)); +#ifdef CUMATRIX_INIT + CUMATRIX_INIT(L); +#endif } int nerv_matrix_(lua_get_elem)(lua_State *L) { diff --git a/nerv/matrix/init.lua b/nerv/matrix/init.lua index f230e9f..1091d7e 100644 --- a/nerv/matrix/init.lua +++ b/nerv/matrix/init.lua @@ -45,6 +45,10 @@ function nerv.Matrix:generate(gen) end end +function nerv.Matrix:create(nrow, ncol) + return self.__constructor(nrow or self:nrow(), ncol or self:ncol()) +end + nerv.MMatrixInt.fmt = "%d " function nerv.CuMatrix:__add__(b) diff --git a/nerv/matrix/mmatrix.c b/nerv/matrix/mmatrix.c index 5561572..961059c 100644 --- a/nerv/matrix/mmatrix.c +++ b/nerv/matrix/mmatrix.c @@ -16,7 +16,30 @@ void nerv_lua_mmatrix_init(lua_State *L) { #define host_matrix_(NAME) host_matrix_float_##NAME #define nerv_matrix_(NAME) nerv_matrix_host_float_##NAME const char *nerv_matrix_(tname) = "nerv.MMatrixFloat"; +#define MMATRIX_INIT(L) host_matrix_(init_extra)(L) + +static const luaL_Reg nerv_matrix_(extra_methods_int)[]; +static void host_matrix_(init_extra)(lua_State *L) { + luaN_append_methods(L, nerv_matrix_(extra_methods_int)); +} + #include "generic/mmatrix.c" +#include "../lib/matrix/mmatrix.h" + +static int nerv_matrix_(lua_perm_gen)(lua_State *L) { + Status status; + int i, ncol = luaL_checkinteger(L, 1); + Matrix *self = nerv_matrix_(perm_gen)(ncol, &status); + NERV_LUA_CHECK_STATUS(L, status); + luaT_pushudata(L, self, nerv_matrix_(tname)); + return 1; +} + +static const luaL_Reg nerv_matrix_(extra_methods_int)[] = { + {"perm_gen", nerv_matrix_(lua_perm_gen)}, + {NULL, NULL} +}; + #undef nerv_matrix_ #undef host_matrix_ #undef MATRIX_USE_FLOAT @@ -24,6 +47,7 @@ const char *nerv_matrix_(tname) = "nerv.MMatrixFloat"; #undef MATRIX_ELEM_PTR #undef MATRIX_ELEM_FMT #undef MATRIX_ELEM_WRITE_FMT +#undef MMATRIX_INIT #define NERV_GENERIC_MMATRIX #define MATRIX_USE_DOUBLE @@ -44,26 +68,4 @@ const char *nerv_matrix_(tname) = "nerv.MMatrixDouble"; #define host_matrix_(NAME) host_matrix_int_##NAME #define 
nerv_matrix_(NAME) nerv_matrix_host_int_##NAME const char *nerv_matrix_(tname) = "nerv.MMatrixInt"; -#define MMATRIX_INIT(L) host_matrix_(init_extra)(L) - -static const luaL_Reg nerv_matrix_(extra_methods_int)[]; -static void host_matrix_(init_extra)(lua_State *L) { - luaN_append_methods(L, nerv_matrix_(extra_methods_int)); -} - #include "generic/mmatrix.c" -#include "../lib/matrix/mmatrix.h" - -static int nerv_matrix_(lua_perm_gen)(lua_State *L) { - Status status; - int i, ncol = luaL_checkinteger(L, 1); - Matrix *self = nerv_matrix_(perm_gen)(ncol, &status); - NERV_LUA_CHECK_STATUS(L, status); - luaT_pushudata(L, self, nerv_matrix_(tname)); - return 1; -} - -static const luaL_Reg nerv_matrix_(extra_methods_int)[] = { - {"perm_gen", nerv_matrix_(lua_perm_gen)}, - {NULL, NULL} -}; diff --git a/nerv/nn/layer_dag.lua b/nerv/nn/layer_dag.lua index 8e30216..73bb77d 100644 --- a/nerv/nn/layer_dag.lua +++ b/nerv/nn/layer_dag.lua @@ -79,7 +79,7 @@ function DAGLayer:__init(id, global_conf, layer_conf) end table.insert(parsed_conn, - {{ref_from, port_from}, {ref_to, port_to}}) + {{ref_from, port_from}, {ref_to, port_to}}) table.insert(ref_from.next_layers, ref_to) -- add edge ref_to.in_deg = ref_to.in_deg + 1 -- increase the in-degree of the target layer end @@ -112,7 +112,7 @@ function DAGLayer:__init(id, global_conf, layer_conf) end end for i = 1, #queue do - nerv.info("enqueued layer: %s", queue[i].layer.id) + nerv.info("enqueued layer: %s %s", queue[i].layer, queue[i].layer.id) end for id, ref in pairs(layers) do @@ -125,6 +125,7 @@ function DAGLayer:__init(id, global_conf, layer_conf) self.layers = layers self.inputs = inputs self.outputs = outputs + self.id = id self.dim_in = dim_in self.dim_out = dim_out self.parsed_conn = parsed_conn @@ -139,8 +140,11 @@ function DAGLayer:init(batch_size) ref_from, port_from = unpack(conn[1]) ref_to, port_to = unpack(conn[2]) _, output_dim = ref_from.layer:get_dim() - local mid = self.gconf.cumat_type(batch_size, - output_dim[port_from]) + local dim = 1 + if output_dim[port_from] > 0 then + dim = output_dim[port_from] + end + local mid = self.gconf.cumat_type(batch_size, dim) local err_mid = mid:create() ref_from.outputs[port_from] = mid @@ -175,8 +179,38 @@ function DAGLayer:init(batch_size) end end +function DAGLayer:batch_resize(batch_size) + self.gconf.batch_size = batch_size + + for i, conn in ipairs(self.parsed_conn) do + local _, output_dim + local ref_from, port_from, ref_to, port_to + ref_from, port_from = unpack(conn[1]) + ref_to, port_to = unpack(conn[2]) + _, output_dim = ref_from.layer:get_dim() + + if ref_from.outputs[port_from]:nrow() ~= batch_size and output_dim[port_from] > 0 then + local mid = self.gconf.cumat_type(batch_size, output_dim[port_from]) + local err_mid = mid:create() + + ref_from.outputs[port_from] = mid + ref_to.inputs[port_to] = mid + + ref_from.err_inputs[port_from] = err_mid + ref_to.err_outputs[port_to] = err_mid + end + end + for id, ref in pairs(self.layers) do + ref.layer:batch_resize(batch_size) + end + collectgarbage("collect") +end + function DAGLayer:set_inputs(input) for i = 1, #self.dim_in do + if input[i] == nil then + nerv.error("some input is not provided"); + end local layer = self.inputs[i][1] local port = self.inputs[i][2] layer.inputs[port] = input[i] @@ -185,6 +219,9 @@ end function DAGLayer:set_outputs(output) for i = 1, #self.dim_out do + if output[i] == nil then + nerv.error("some output is not provided"); + end local layer = self.outputs[i][1] local port = self.outputs[i][2] layer.outputs[port] = 
output[i] @@ -221,11 +258,13 @@ end function DAGLayer:propagate(input, output) self:set_inputs(input) self:set_outputs(output) + local ret = false for i = 1, #self.queue do local ref = self.queue[i] -- print(ref.layer.id) - ref.layer:propagate(ref.inputs, ref.outputs) + ret = ref.layer:propagate(ref.inputs, ref.outputs) end + return ret end function DAGLayer:back_propagate(bp_err, next_bp_err, input, output) @@ -247,3 +286,30 @@ function DAGLayer:get_params() end return nerv.ParamRepo.merge(param_repos) end + +DAGLayer.PORT_TYPES = { + INPUT = {}, + OUTPUT = {}, + ERR_INPUT = {}, + ERR_OUTPUT = {} +} + +function DAGLayer:get_intermediate(id, port_type) + if id == "<input>" or id == "<output>" then + nerv.error("an actual real layer id is expected") + end + local layer = self.layers[id] + if layer == nil then + nerv.error("layer id %s not found", id) + end + if port_type == DAGLayer.PORT_TYPES.INPUT then + return layer.inputs + elseif port_type == DAGLayer.PORT_TYPES.OUTPUT then + return layer.outputs + elseif port_type == DAGLayer.PORT_TYPES.ERR_INPUT then + return layer.err_inputs + elseif port_type == DAGLayer.PORT_TYPES.ERR_OUTPUT then + return layer.err_outputs + end + nerv.error("unrecognized port type") +end diff --git a/nerv/nn/layer_repo.lua b/nerv/nn/layer_repo.lua index 602c37c..ef333a7 100644 --- a/nerv/nn/layer_repo.lua +++ b/nerv/nn/layer_repo.lua @@ -1,7 +1,12 @@ local LayerRepo = nerv.class("nerv.LayerRepo") function LayerRepo:__init(layer_spec, param_repo, global_conf) - local layers = {} + self.layers = {} + self:add_layers(layer_spec, param_repo, global_conf) +end + +function LayerRepo:add_layers(layer_spec, param_repo, global_conf) + local layers = self.layers for ltype, llist in pairs(layer_spec) do local layer_type = nerv.get_type(ltype) for id, spec in pairs(llist) do @@ -22,7 +27,6 @@ function LayerRepo:__init(layer_spec, param_repo, global_conf) layers[id] = layer_type(id, global_conf, layer_config) end end - self.layers = layers end function LayerRepo:get_layer(lid) diff --git a/speech b/speech deleted file mode 160000 -Subproject 08e33afa533af1f026ac271446a0c873fe0bb5c |
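The hunks above rework the Lua-facing matrix API: `create` becomes a generic `nerv.Matrix:create(nrow, ncol)` with optional dimensions, CuMatrix gains a `copy_from` alias for `copy_fromd` and a device-side `copy_rows_fromd_by_idx`, and `perm_gen` moves from `MMatrixInt` to `MMatrixFloat`. What follows is only a minimal usage sketch of those additions, not part of the patch; it assumes a CUDA-enabled build and the usual `nerv.CuMatrixFloat` constructor.

-- illustrative only: exercises the new matrix helpers introduced by this patch
local cumat_type = nerv.CuMatrixFloat
local a = cumat_type(8, 4)                    -- 8 x 4 source matrix on the device
local b = a:create()                          -- same shape, via the generic Matrix:create
local row = a:create(1, 4)                    -- explicit dimensions are also accepted
b:copy_from(a)                                -- new alias for copy_fromd (device-to-device copy)

-- perm_gen now lives on MMatrixFloat, so its result can be used as a float index matrix
local perm = nerv.MMatrixFloat.perm_gen(8)    -- 1 x 8 host row permutation
local idx = cumat_type(1, 8)
idx:copy_fromh(perm)                          -- the index matrix must live on the device
b:copy_rows_fromd_by_idx(a, idx)              -- gather rows of a into b without a host round-trip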
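The `layer_dag.lua` and `layer_repo.lua` changes add `DAGLayer:batch_resize`, `DAGLayer:get_intermediate` with the `PORT_TYPES` markers, a return value for `DAGLayer:propagate`, and an incremental `LayerRepo:add_layers`. Another hedged sketch follows: the layer ids, specs and `gconf` fields are made up for illustration, and `dag` is assumed to be a `nerv.DAGLayer` built elsewhere from the repo.

-- illustrative only: layer_spec, extra_spec, param_repo, gconf, input, output, dag are assumed to exist
local repo = nerv.LayerRepo(layer_spec, param_repo, gconf)
repo:add_layers(extra_spec, param_repo, gconf)   -- layers can now be registered after construction

dag:init(gconf.batch_size)
local ret = dag:propagate(input, output)         -- propagate now returns the last queued layer's result

dag:batch_resize(128)                            -- reallocates intermediate buffers for the new batch size
-- peek at a hidden layer's output buffers (one matrix per port); "affine0" is a hypothetical layer id
local outs = dag:get_intermediate("affine0", nerv.DAGLayer.PORT_TYPES.OUTPUT)
local first = outs[1]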