From 5b16335a903551ffef4fafa88d67146b9131a74e Mon Sep 17 00:00:00 2001
From: Determinant <ted.sybil@gmail.com>
Date: Tue, 4 Aug 2015 11:11:50 +0800
Subject: use gconf.batch_size in asr_trainer; check for missing DAGLayer inputs/outputs

---
 nerv/examples/asr_trainer.lua |  4 ++--
 nerv/init.lua                 |  2 +-
 nerv/nerv                     | 10 +++++-----
 nerv/nn/layer_dag.lua         |  6 ++++++
 4 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/nerv/examples/asr_trainer.lua b/nerv/examples/asr_trainer.lua
index 4fa4096..8dfb2ac 100644
--- a/nerv/examples/asr_trainer.lua
+++ b/nerv/examples/asr_trainer.lua
@@ -12,7 +12,7 @@ function build_trainer(ifname)
         -- initialize the network
         network:init(gconf.batch_size)
         gconf.cnt = 0
-        err_input = {nerv.CuMatrixFloat(256, 1)}
+        err_input = {nerv.CuMatrixFloat(gconf.batch_size, 1)}
         err_input[1]:fill(1)
         for data in buffer.get_data, buffer do
             -- prine stat periodically
@@ -32,7 +32,7 @@ function build_trainer(ifname)
                 end
                 table.insert(input, data[id])
             end
-            local output = {nerv.CuMatrixFloat(256, 1)}
+            local output = {nerv.CuMatrixFloat(gconf.batch_size, 1)}
             err_output = {input[1]:create()}
             network:propagate(input, output)
             if bp then
diff --git a/nerv/init.lua b/nerv/init.lua
index 89010a7..183ae6d 100644
--- a/nerv/init.lua
+++ b/nerv/init.lua
@@ -1,7 +1,7 @@
 require 'libnerv'
 
 function nerv.error(fmt, ...)
-    error(nerv.printf("[nerv] internal error: " .. fmt .. "\n", ...))
+    error(string.format("[nerv] internal error: " .. fmt .. "\n", ...)) -- format first: error()'s 2nd argument is a stack level, not format args
 end
 
 function nerv.error_method_not_implemented()
diff --git a/nerv/nerv b/nerv/nerv
index 7571659..e5943aa 100644
--- a/nerv/nerv
+++ b/nerv/nerv
@@ -1,13 +1,13 @@
 #! /usr/bin/env luajit
 require 'nerv'
 print("Greetings")
-if #arg < 1 then
+if #args < 1 then
     return
 end
-local script = arg[1]
+local script = args[1]
 local script_arg = {}
-for i = 2, #arg do
-    table.insert(script_arg, arg[i])
+for i = 2, #args do
+    table.insert(script_arg, args[i])
 end
-arg = script_arg
+args = script_arg
 dofile(script)
diff --git a/nerv/nn/layer_dag.lua b/nerv/nn/layer_dag.lua
index 8e30216..e5c1ac7 100644
--- a/nerv/nn/layer_dag.lua
+++ b/nerv/nn/layer_dag.lua
@@ -177,6 +177,9 @@ end
 
 function DAGLayer:set_inputs(input)
     for i = 1, #self.dim_in do
+        if input[i] == nil then
+            nerv.error("some input is not provided");
+        end
         local layer = self.inputs[i][1]
         local port = self.inputs[i][2]
         layer.inputs[port] = input[i]
@@ -185,6 +188,9 @@ end
 
 function DAGLayer:set_outputs(output)
     for i = 1, #self.dim_out do
+        if output[i] == nil then
+            nerv.error("some output is not provided");
+        end
         local layer = self.outputs[i][1]
         local port = self.outputs[i][2]
         layer.outputs[port] = output[i]
-- 
cgit v1.2.3-70-g09d2


From 462d1982f299c8be4ae9a746e6ae7d04c04faa33 Mon Sep 17 00:00:00 2001
From: Determinant <ted.sybil@gmail.com>
Date: Tue, 4 Aug 2015 14:41:22 +0800
Subject: add embedding_example

---
 embedding_example/.gitignore              |   2 +
 embedding_example/Makefile                |   9 +++
 embedding_example/main.c                  |  95 ++++++++++++++++++++++++++
 embedding_example/setup_nerv.lua          |  26 +++++++
 embedding_example/swb_baseline_decode.lua | 109 ++++++++++++++++++++++++++++++
 nerv/Makefile                             |   2 +-
 nerv/layer/init.lua                       |   1 +
 nerv/lib/matrix/generic/matrix.c          |   4 +-
 nerv/lib/matrix/generic/matrix.h          |   2 +
 9 files changed, 247 insertions(+), 3 deletions(-)
 create mode 100644 embedding_example/.gitignore
 create mode 100644 embedding_example/Makefile
 create mode 100644 embedding_example/main.c
 create mode 100644 embedding_example/setup_nerv.lua
 create mode 100644 embedding_example/swb_baseline_decode.lua

diff --git a/embedding_example/.gitignore b/embedding_example/.gitignore
new file mode 100644
index 0000000..8e68213
--- /dev/null
+++ b/embedding_example/.gitignore
@@ -0,0 +1,2 @@
+main
+main.o
diff --git a/embedding_example/Makefile b/embedding_example/Makefile
new file mode 100644
index 0000000..e4ee314
--- /dev/null
+++ b/embedding_example/Makefile
@@ -0,0 +1,9 @@
+CFLAG += -I ../install/include/luajit-2.0/  -I ../install/include/nerv/
+LDFLAG += -L../install/lib/ -lluajit-5.1 -Wl,-rpath=../install/lib/ -lluaT -lnervcore
+GCC := gcc
+
+main: main.o
+	$(GCC) -o $@ $<  $(LDFLAG)
+
+main.o: main.c
+	$(GCC) $(CFLAG) -o $@ $< -c
diff --git a/embedding_example/main.c b/embedding_example/main.c
new file mode 100644
index 0000000..4e70892
--- /dev/null
+++ b/embedding_example/main.c
@@ -0,0 +1,95 @@
+#include "lua.h"
+#include "lauxlib.h"
+#include "lualib.h"
+#include "matrix/matrix.h"
+#include "common.h"
+#include "luaT/luaT.h"
+#include <stdio.h>
+
+const char *nerv_matrix_host_float_tname = "nerv.MMatrixFloat";
+extern Matrix *nerv_matrix_host_float_create(long nrow, long ncol, Status *status);
+extern void nerv_matrix_host_float_data_retain(Matrix *self);
+extern void nerv_matrix_host_float_data_free(Matrix *self, Status *status);
+
+lua_State *L;
+Matrix *input, *output;
+Status status;
+
+void setup_nerv() {
+    L = lua_open();
+    luaL_openlibs(L);
+    luaL_loadfile(L, "setup_nerv.lua");
+    /* network configuration */
+    lua_pushstring(L, "swb_baseline_decode.lua");
+    if (lua_pcall(L, 1, LUA_MULTRET, 0))
+    {
+        printf("%s\n", luaL_checkstring(L, 1));
+        exit(1);
+    }
+    /* lua stack now: input width, output width, propagator */
+    input = nerv_matrix_host_float_create(1, luaL_checkinteger(L, 1), &status);
+    NERV_LUA_CHECK_STATUS(L, status);
+    output = nerv_matrix_host_float_create(1, luaL_checkinteger(L, 2), &status);
+    NERV_LUA_CHECK_STATUS(L, status);
+}
+
+
+void propagate(float for_fun) {
+    int i, j;
+    printf("ok: %d\n", lua_gettop(L));
+    lua_pushvalue(L, 3);
+    /* lua stack now: input width, output width, propagator, propagator */
+    for (i = 0; i < input->nrow; i++) /* nrow is actually 1 */
+    {
+        float *nerv_row = (float *)((char *)input->data.f + i * input->stride);
+        for (j = 0; j < input->ncol; j++)
+        {
+            nerv_row[j] = j * for_fun;
+        }
+    }
+    /* avoid gc */
+    nerv_matrix_host_float_data_retain(input);
+    nerv_matrix_host_float_data_retain(input);
+    nerv_matrix_host_float_data_retain(input);
+    nerv_matrix_host_float_data_retain(input);
+    nerv_matrix_host_float_data_retain(output);
+    nerv_matrix_host_float_data_retain(output);
+    nerv_matrix_host_float_data_retain(output);
+    nerv_matrix_host_float_data_retain(output);
+
+    luaT_pushudata(L, input, nerv_matrix_host_float_tname);
+    luaT_pushudata(L, output, nerv_matrix_host_float_tname);
+    /* lua stack now: input width, output width, propagator, propagator, input, output */
+    if (lua_pcall(L, 2, 0, 0)) /* call propagator with two parameters */
+    {
+        printf("%s\n", luaL_checkstring(L, -1));
+        exit(-1);
+    }
+    /* lua stack now: input width, output width, propagator */
+    printf("## caller ##\n");
+    for (i = 0; i < output->nrow; i++) /* nrow is actually 1 */
+    {
+        float *nerv_row = (float *)((char *)output->data.f + i * output->stride);
+        for (j = 0; j < output->ncol; j++)
+        {
+            printf("%.8f ", nerv_row[j]);
+        }
+        printf("\n");
+    }
+}
+
+void teardown_nerv() {
+    nerv_matrix_host_float_data_free(input, &status);
+    NERV_LUA_CHECK_STATUS(L, status);
+    nerv_matrix_host_float_data_free(output, &status);
+    NERV_LUA_CHECK_STATUS(L, status);
+}
+
+int main() {
+    setup_nerv();
+    propagate(1.0);
+    propagate(2.0);
+    propagate(3.0);
+    teardown_nerv();
+    return 0;
+}
diff --git a/embedding_example/setup_nerv.lua b/embedding_example/setup_nerv.lua
new file mode 100644
index 0000000..e33a1e7
--- /dev/null
+++ b/embedding_example/setup_nerv.lua
@@ -0,0 +1,26 @@
+package.path="/home/slhome/mfy43/.luarocks/share/lua/5.1/?.lua;/home/slhome/mfy43/.luarocks/share/lua/5.1/?/init.lua;/home/slhome/mfy43/nerv/install/share/lua/5.1/?.lua;/home/slhome/mfy43/nerv/install/share/lua/5.1/?/init.lua;"..package.path
+package.cpath="/home/slhome/mfy43/.luarocks/lib/lua/5.1/?.so;/home/slhome/mfy43/nerv/install/lib/lua/5.1/?.so;"..package.cpath
+local k,l,_=pcall(require,"luarocks.loader") _=k and l.add_context("nerv","scm-1")
+
+local args = {...}
+require 'nerv'
+dofile(args[1])
+local param_repo = nerv.ParamRepo()
+param_repo:import(gconf.initialized_param, nil, gconf)
+local sublayer_repo = make_sublayer_repo(param_repo)
+local layer_repo = make_layer_repo(sublayer_repo, param_repo)
+local network = get_network(layer_repo)
+local batch_size = 1
+network:init(batch_size)
+function propagator(input, output)
+    local gpu_input = nerv.CuMatrixFloat(input:nrow(), input:ncol())
+    local gpu_output = nerv.CuMatrixFloat(output:nrow(), output:ncol())
+    gpu_input:copy_fromh(input)
+    print(gpu_input)
+    network:propagate({gpu_input}, {gpu_output})
+    gpu_output:copy_toh(output)
+    print(output)
+    -- collect garbage in-time to save GPU memory
+    collectgarbage("collect")
+end
+return network.dim_in[1], network.dim_out[1], propagator
diff --git a/embedding_example/swb_baseline_decode.lua b/embedding_example/swb_baseline_decode.lua
new file mode 100644
index 0000000..14a463b
--- /dev/null
+++ b/embedding_example/swb_baseline_decode.lua
@@ -0,0 +1,109 @@
+require 'htk_io'
+gconf = {lrate = 0.8, wcost = 1e-6, momentum = 0.9,
+        cumat_type = nerv.CuMatrixFloat,
+        mmat_type = nerv.MMatrixFloat,
+        frm_ext = 5,
+        tr_scp = "/slfs1/users/mfy43/swb_ivec/train_bp.scp",
+        cv_scp = "/slfs1/users/mfy43/swb_ivec/train_cv.scp",
+        htk_conf = "/slfs1/users/mfy43/swb_ivec/plp_0_d_a.conf",
+        initialized_param = {"/slfs1/users/mfy43/swb_init.nerv",
+                "/slfs1/users/mfy43/swb_global_transf.nerv"},
+        debug = false}
+
+function make_sublayer_repo(param_repo) -- builds the repo of primitive layers that make_layer_repo wires into DAGs
+    return nerv.LayerRepo(
+    {
+        -- global transformation layers (bias and window), all 429 -> 429
+        ["nerv.BiasLayer"] =
+        {
+            blayer1 = {{bias = "bias1"}, {dim_in = {429}, dim_out = {429}}},
+            blayer2 = {{bias = "bias2"}, {dim_in = {429}, dim_out = {429}}}
+        },
+        ["nerv.WindowLayer"] =
+        {
+            wlayer1 = {{window = "window1"}, {dim_in = {429}, dim_out = {429}}},
+            wlayer2 = {{window = "window2"}, {dim_in = {429}, dim_out = {429}}}
+        },
+        -- biased linearity: affine0 (429 -> 2048), affine1..6 (2048 -> 2048), affine7 (2048 -> 3001)
+        ["nerv.AffineLayer"] =
+        {
+            affine0 = {{ltp = "affine0_ltp", bp = "affine0_bp"},
+            {dim_in = {429}, dim_out = {2048}}},
+            affine1 = {{ltp = "affine1_ltp", bp = "affine1_bp"},
+            {dim_in = {2048}, dim_out = {2048}}},
+            affine2 = {{ltp = "affine2_ltp", bp = "affine2_bp"},
+            {dim_in = {2048}, dim_out = {2048}}},
+            affine3 = {{ltp = "affine3_ltp", bp = "affine3_bp"},
+            {dim_in = {2048}, dim_out = {2048}}},
+            affine4 = {{ltp = "affine4_ltp", bp = "affine4_bp"},
+            {dim_in = {2048}, dim_out = {2048}}},
+            affine5 = {{ltp = "affine5_ltp", bp = "affine5_bp"},
+            {dim_in = {2048}, dim_out = {2048}}},
+            affine6 = {{ltp = "affine6_ltp", bp = "affine6_bp"},
+            {dim_in = {2048}, dim_out = {2048}}},
+            affine7 = {{ltp = "affine7_ltp", bp = "affine7_bp"},
+            {dim_in = {2048}, dim_out = {3001}}}
+        },
+        ["nerv.SigmoidLayer"] =
+        {
+            sigmoid0 = {{}, {dim_in = {2048}, dim_out = {2048}}},
+            sigmoid1 = {{}, {dim_in = {2048}, dim_out = {2048}}},
+            sigmoid2 = {{}, {dim_in = {2048}, dim_out = {2048}}},
+            sigmoid3 = {{}, {dim_in = {2048}, dim_out = {2048}}},
+            sigmoid4 = {{}, {dim_in = {2048}, dim_out = {2048}}},
+            sigmoid5 = {{}, {dim_in = {2048}, dim_out = {2048}}},
+            sigmoid6 = {{}, {dim_in = {2048}, dim_out = {2048}}}
+        },
+        ["nerv.SoftmaxLayer"] =
+        {
+            soutput = {{}, {dim_in = {3001}, dim_out = {3001}}}
+        }
+    }, param_repo, gconf)
+end
+
+function make_layer_repo(sublayer_repo, param_repo) -- builds the repo holding the two DAGLayers below
+    return nerv.LayerRepo(
+    {
+        ["nerv.DAGLayer"] =
+        {
+            global_transf = {{}, { -- 429 -> 429 chain: blayer1, wlayer1, blayer2, wlayer2
+                dim_in = {429}, dim_out = {429},
+                sub_layers = sublayer_repo,
+                connections = {
+                    ["<input>[1]"] = "blayer1[1]",
+                    ["blayer1[1]"] = "wlayer1[1]",
+                    ["wlayer1[1]"] = "blayer2[1]",
+                    ["blayer2[1]"] = "wlayer2[1]",
+                    ["wlayer2[1]"] = "<output>[1]"
+                }
+            }},
+            main = {{}, { -- 429 -> 3001 chain: affine0..7 with sigmoid0..6 in between, then soutput
+                dim_in = {429}, dim_out = {3001},
+                sub_layers = sublayer_repo,
+                connections = {
+                    ["<input>[1]"] = "affine0[1]",
+                    ["affine0[1]"] = "sigmoid0[1]",
+                    ["sigmoid0[1]"] = "affine1[1]",
+                    ["affine1[1]"] = "sigmoid1[1]",
+                    ["sigmoid1[1]"] = "affine2[1]",
+                    ["affine2[1]"] = "sigmoid2[1]",
+                    ["sigmoid2[1]"] = "affine3[1]",
+                    ["affine3[1]"] = "sigmoid3[1]",
+                    ["sigmoid3[1]"] = "affine4[1]",
+                    ["affine4[1]"] = "sigmoid4[1]",
+                    ["sigmoid4[1]"] = "affine5[1]",
+                    ["affine5[1]"] = "sigmoid5[1]",
+                    ["sigmoid5[1]"] = "affine6[1]",
+                    ["affine6[1]"] = "sigmoid6[1]",
+                    ["sigmoid6[1]"] = "affine7[1]",
+                    ["affine7[1]"] = "soutput[1]",
+                    ["soutput[1]"] = "<output>[1]"
+                }
+            }}
+        }
+    }, param_repo, gconf)
+end
+
+function get_network(layer_repo)
+    return layer_repo:get_layer("main") -- "main" is the DAGLayer id declared in make_layer_repo
+end
diff --git a/nerv/Makefile b/nerv/Makefile
index 022e2fb..fdffd12 100644
--- a/nerv/Makefile
+++ b/nerv/Makefile
@@ -30,7 +30,7 @@ LUAT_OBJS := $(addprefix $(OBJ_DIR)/,$(LUAT_OBJS))
 OBJS := $(CORE_OBJS) $(NERV_OBJS) $(LUAT_OBJS)
 LIBS := $(INST_LIBDIR)/libnerv.so $(LIB_PATH)/libnervcore.so $(LIB_PATH)/libluaT.so
 LUA_LIBS := matrix/init.lua io/init.lua init.lua \
-			layer/init.lua layer/affine.lua layer/sigmoid.lua layer/softmax_ce.lua \
+			layer/init.lua layer/affine.lua layer/sigmoid.lua layer/softmax_ce.lua layer/softmax.lua \
 			layer/window.lua layer/bias.lua layer/combiner.lua layer/mse.lua layer/affine_recurrent.lua\
 			nn/init.lua nn/layer_repo.lua nn/param_repo.lua nn/layer_dag.lua \
 			io/sgd_buffer.lua 
diff --git a/nerv/layer/init.lua b/nerv/layer/init.lua
index 7172f99..6861b0e 100644
--- a/nerv/layer/init.lua
+++ b/nerv/layer/init.lua
@@ -78,3 +78,4 @@ nerv.include('window.lua')
 nerv.include('mse.lua')
 nerv.include('combiner.lua')
 nerv.include('affine_recurrent.lua')
+nerv.include('softmax.lua')
diff --git a/nerv/lib/matrix/generic/matrix.c b/nerv/lib/matrix/generic/matrix.c
index e4afa37..6cb3dc0 100644
--- a/nerv/lib/matrix/generic/matrix.c
+++ b/nerv/lib/matrix/generic/matrix.c
@@ -3,7 +3,7 @@
 #include "matrix.h"
 /* FIXME: malloc failure detection */
 
-static void nerv_matrix_(data_free)(Matrix *self, Status *status) {
+void nerv_matrix_(data_free)(Matrix *self, Status *status) {
     assert(*self->data_ref > 0);
     if (--(*self->data_ref) == 0)
     {
@@ -18,7 +18,7 @@ static void nerv_matrix_(data_free)(Matrix *self, Status *status) {
     }
 }
 
-static void nerv_matrix_(data_retain)(Matrix *self) {
+void nerv_matrix_(data_retain)(Matrix *self) {
     (*self->data_ref)++;
 }
 
diff --git a/nerv/lib/matrix/generic/matrix.h b/nerv/lib/matrix/generic/matrix.h
index 9d44e6d..69b4e6d 100644
--- a/nerv/lib/matrix/generic/matrix.h
+++ b/nerv/lib/matrix/generic/matrix.h
@@ -2,3 +2,5 @@
 Matrix *nerv_matrix_(create)(long nrow, long ncol, Status *status);
 void nerv_matrix_(destroy)(Matrix *self, Status *status);
 Matrix *nerv_matrix_(getrow)(Matrix *self, int row);
+void nerv_matrix_(data_free)(Matrix *self, Status *status);
+void nerv_matrix_(data_retain)(Matrix *self);
-- 
cgit v1.2.3-70-g09d2


From 0223b6b88620d9235fc47818aaa5c86ae81d38d9 Mon Sep 17 00:00:00 2001
From: Determinant <ted.sybil@gmail.com>
Date: Tue, 4 Aug 2015 14:42:08 +0800
Subject: embedding_example: drop redundant data_retain calls

---
 embedding_example/main.c | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/embedding_example/main.c b/embedding_example/main.c
index 4e70892..4c6459c 100644
--- a/embedding_example/main.c
+++ b/embedding_example/main.c
@@ -49,12 +49,6 @@ void propagate(float for_fun) {
     }
     /* avoid gc */
     nerv_matrix_host_float_data_retain(input);
-    nerv_matrix_host_float_data_retain(input);
-    nerv_matrix_host_float_data_retain(input);
-    nerv_matrix_host_float_data_retain(input);
-    nerv_matrix_host_float_data_retain(output);
-    nerv_matrix_host_float_data_retain(output);
-    nerv_matrix_host_float_data_retain(output);
     nerv_matrix_host_float_data_retain(output);
 
     luaT_pushudata(L, input, nerv_matrix_host_float_tname);
-- 
cgit v1.2.3-70-g09d2


From e20b60f659b08c46b9da0591ee489803f3f3d300 Mon Sep 17 00:00:00 2001
From: Determinant <ted.sybil@gmail.com>
Date: Tue, 4 Aug 2015 15:51:53 +0800
Subject: add SoftmaxLayer; revert arg -> args rename in nerv launcher

---
 nerv/layer/softmax.lua | 31 +++++++++++++++++++++++++++++++
 nerv/nerv              | 10 +++++-----
 2 files changed, 36 insertions(+), 5 deletions(-)
 create mode 100644 nerv/layer/softmax.lua

diff --git a/nerv/layer/softmax.lua b/nerv/layer/softmax.lua
new file mode 100644
index 0000000..e979ebf
--- /dev/null
+++ b/nerv/layer/softmax.lua
@@ -0,0 +1,31 @@
+local SoftmaxLayer = nerv.class("nerv.SoftmaxLayer", "nerv.Layer")
+
+function SoftmaxLayer:__init(id, global_conf, layer_conf)
+    self.id = id
+    self.gconf = global_conf
+    self.dim_in = layer_conf.dim_in
+    self.dim_out = layer_conf.dim_out
+    self:check_dim_len(1, 1) -- presumably one input port and one output port (no label input); confirm against check_dim_len
+end
+
+function SoftmaxLayer:init(batch_size)
+    if self.dim_in[1] ~= self.dim_out[1] then
+        nerv.error("mismatching dimensions of input and output")
+    end
+end
+
+function SoftmaxLayer:update(bp_err, input, output)
+    -- no params, therefore do nothing
+end
+
+function SoftmaxLayer:propagate(input, output)
+    output[1]:softmax(input[1]) -- presumably writes the softmax of input[1] into output[1]; confirm in the matrix API
+end
+
+function SoftmaxLayer:back_propagate(bp_err, next_bp_err, input, output)
+    nerv.error_method_not_implemented() -- back-propagation is not supported by this layer
+end
+
+function SoftmaxLayer:get_params()
+    return nerv.ParamRepo({}) -- empty repo: this layer has no params
+end
diff --git a/nerv/nerv b/nerv/nerv
index e5943aa..7571659 100644
--- a/nerv/nerv
+++ b/nerv/nerv
@@ -1,13 +1,13 @@
 #! /usr/bin/env luajit
 require 'nerv'
 print("Greetings")
-if #args < 1 then
+if #arg < 1 then
     return
 end
-local script = args[1]
+local script = arg[1]
 local script_arg = {}
-for i = 2, #args do
-    table.insert(script_arg, args[i])
+for i = 2, #arg do
+    table.insert(script_arg, arg[i])
 end
-args = script_arg
+arg = script_arg
 dofile(script)
-- 
cgit v1.2.3-70-g09d2


From c3effaac9e9965371a73f9c84c2a4e0880f32138 Mon Sep 17 00:00:00 2001
From: Determinant <ted.sybil@gmail.com>
Date: Tue, 4 Aug 2015 17:27:09 +0800
Subject: fix gc issues

---
 embedding_example/Makefile       |  6 ++++++
 embedding_example/main.c         | 28 ++++++++++++++++------------
 embedding_example/run.sh         |  4 ++++
 embedding_example/setup_nerv.lua |  9 ++++-----
 4 files changed, 30 insertions(+), 17 deletions(-)
 create mode 100755 embedding_example/run.sh

diff --git a/embedding_example/Makefile b/embedding_example/Makefile
index e4ee314..3420b30 100644
--- a/embedding_example/Makefile
+++ b/embedding_example/Makefile
@@ -2,6 +2,12 @@ CFLAG += -I ../install/include/luajit-2.0/  -I ../install/include/nerv/
 LDFLAG += -L../install/lib/ -lluajit-5.1 -Wl,-rpath=../install/lib/ -lluaT -lnervcore
 GCC := gcc
 
+.PHONY: FORCE
+
+FORCE: ../install/bin/luarocks
+	echo "#!/bin/bash" > run.sh
+	$< path >> run.sh
+	echo "./main" >> run.sh
 main: main.o
 	$(GCC) -o $@ $<  $(LDFLAG)
 
diff --git a/embedding_example/main.c b/embedding_example/main.c
index 4c6459c..b3c9bf2 100644
--- a/embedding_example/main.c
+++ b/embedding_example/main.c
@@ -7,6 +7,8 @@
 #include <stdio.h>
 
 const char *nerv_matrix_host_float_tname = "nerv.MMatrixFloat";
+const char *input_name = "_nerv_embed_input";
+const char *output_name = "_nerv_embed_output";
 extern Matrix *nerv_matrix_host_float_create(long nrow, long ncol, Status *status);
 extern void nerv_matrix_host_float_data_retain(Matrix *self);
 extern void nerv_matrix_host_float_data_free(Matrix *self, Status *status);
@@ -31,6 +33,11 @@ void setup_nerv() {
     NERV_LUA_CHECK_STATUS(L, status);
     output = nerv_matrix_host_float_create(1, luaL_checkinteger(L, 2), &status);
     NERV_LUA_CHECK_STATUS(L, status);
+    /* add reference to avoid gc */
+    luaT_pushudata(L, output, nerv_matrix_host_float_tname);
+    luaT_pushudata(L, input, nerv_matrix_host_float_tname);
+    lua_setfield(L, LUA_GLOBALSINDEX, input_name);
+    lua_setfield(L, LUA_GLOBALSINDEX, output_name);
 }
 
 
@@ -47,12 +54,8 @@ void propagate(float for_fun) {
             nerv_row[j] = j * for_fun;
         }
     }
-    /* avoid gc */
-    nerv_matrix_host_float_data_retain(input);
-    nerv_matrix_host_float_data_retain(output);
-
-    luaT_pushudata(L, input, nerv_matrix_host_float_tname);
-    luaT_pushudata(L, output, nerv_matrix_host_float_tname);
+    lua_getfield(L, LUA_GLOBALSINDEX, input_name);
+    lua_getfield(L, LUA_GLOBALSINDEX, output_name);
     /* lua stack now: input width, output width, propagator, propagator, input, output */
     if (lua_pcall(L, 2, 0, 0)) /* call propagator with two parameters */
     {
@@ -60,7 +63,7 @@ void propagate(float for_fun) {
         exit(-1);
     }
     /* lua stack now: input width, output width, propagator */
-    printf("## caller ##\n");
+    printf("## output: %ld %ld ##\n", output->nrow, output->ncol);
     for (i = 0; i < output->nrow; i++) /* nrow is actually 1 */
     {
         float *nerv_row = (float *)((char *)output->data.f + i * output->stride);
@@ -68,21 +71,22 @@ void propagate(float for_fun) {
         {
             printf("%.8f ", nerv_row[j]);
         }
-        printf("\n");
     }
 }
 
 void teardown_nerv() {
-    nerv_matrix_host_float_data_free(input, &status);
-    NERV_LUA_CHECK_STATUS(L, status);
-    nerv_matrix_host_float_data_free(output, &status);
-    NERV_LUA_CHECK_STATUS(L, status);
+    lua_pushnil(L);
+    lua_pushnil(L);
+    lua_setfield(L, LUA_GLOBALSINDEX, input_name);
+    lua_setfield(L, LUA_GLOBALSINDEX, output_name);
+    lua_gc(L, LUA_GCCOLLECT, 0);
 }
 
 int main() {
     setup_nerv();
     propagate(1.0);
     propagate(2.0);
+    propagate(2.0);
     propagate(3.0);
     teardown_nerv();
     return 0;
diff --git a/embedding_example/run.sh b/embedding_example/run.sh
new file mode 100755
index 0000000..e919263
--- /dev/null
+++ b/embedding_example/run.sh
@@ -0,0 +1,4 @@
+#!/bin/bash
+export LUA_PATH='/home/slhome/mfy43/.luarocks/share/lua/5.1/?.lua;/home/slhome/mfy43/.luarocks/share/lua/5.1/?/init.lua;/home/slhome/mfy43/nerv/install/share/lua/5.1/?.lua;/home/slhome/mfy43/nerv/install/share/lua/5.1/?/init.lua;./?.lua;/usr/local/share/luajit-2.0.4/?.lua;/usr/local/share/lua/5.1/?.lua;/usr/local/share/lua/5.1/?/init.lua'
+export LUA_CPATH='/home/slhome/mfy43/.luarocks/lib/lua/5.1/?.so;/home/slhome/mfy43/nerv/install/lib/lua/5.1/?.so;./?.so;/usr/local/lib/lua/5.1/?.so;/usr/local/lib/lua/5.1/loadall.so'
+./main
diff --git a/embedding_example/setup_nerv.lua b/embedding_example/setup_nerv.lua
index e33a1e7..3ae878d 100644
--- a/embedding_example/setup_nerv.lua
+++ b/embedding_example/setup_nerv.lua
@@ -1,10 +1,7 @@
-package.path="/home/slhome/mfy43/.luarocks/share/lua/5.1/?.lua;/home/slhome/mfy43/.luarocks/share/lua/5.1/?/init.lua;/home/slhome/mfy43/nerv/install/share/lua/5.1/?.lua;/home/slhome/mfy43/nerv/install/share/lua/5.1/?/init.lua;"..package.path
-package.cpath="/home/slhome/mfy43/.luarocks/lib/lua/5.1/?.so;/home/slhome/mfy43/nerv/install/lib/lua/5.1/?.so;"..package.cpath
 local k,l,_=pcall(require,"luarocks.loader") _=k and l.add_context("nerv","scm-1")
-
-local args = {...}
 require 'nerv'
-dofile(args[1])
+local arg = {...}
+dofile(arg[1])
 local param_repo = nerv.ParamRepo()
 param_repo:import(gconf.initialized_param, nil, gconf)
 local sublayer_repo = make_sublayer_repo(param_repo)
@@ -12,6 +9,7 @@ local layer_repo = make_layer_repo(sublayer_repo, param_repo)
 local network = get_network(layer_repo)
 local batch_size = 1
 network:init(batch_size)
+
 function propagator(input, output)
     local gpu_input = nerv.CuMatrixFloat(input:nrow(), input:ncol())
     local gpu_output = nerv.CuMatrixFloat(output:nrow(), output:ncol())
@@ -23,4 +21,5 @@ function propagator(input, output)
     -- collect garbage in-time to save GPU memory
     collectgarbage("collect")
 end
+
 return network.dim_in[1], network.dim_out[1], propagator
-- 
cgit v1.2.3-70-g09d2


From 6aa0bb7b1ca5c92e50d6c7eeb1e3db2c16e71c3b Mon Sep 17 00:00:00 2001
From: Determinant <ted.sybil@gmail.com>
Date: Tue, 4 Aug 2015 17:32:30 +0800
Subject: embedding_example: add all and clean targets to Makefile

---
 embedding_example/Makefile | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/embedding_example/Makefile b/embedding_example/Makefile
index 3420b30..73287f4 100644
--- a/embedding_example/Makefile
+++ b/embedding_example/Makefile
@@ -4,6 +4,11 @@ GCC := gcc
 
 .PHONY: FORCE
 
+all: main FORCE
+clean:
+	-rm -f *.o
+	-rm main
+
 FORCE: ../install/bin/luarocks
 	echo "#!/bin/bash" > run.sh
 	$< path >> run.sh
-- 
cgit v1.2.3-70-g09d2


From e935fd6d49b81b0c83d5ad112bfd0a8a68a67175 Mon Sep 17 00:00:00 2001
From: Determinant <ted.sybil@gmail.com>
Date: Wed, 5 Aug 2015 08:04:40 +0800
Subject: put global transformation into a separate library

---
 Makefile                        |   1 +
 nerv/Makefile                   |   2 +-
 nerv/matrix/generic/cukernel.cu | 592 ----------------------------------------
 speech                          |   2 +-
 4 files changed, 3 insertions(+), 594 deletions(-)
 delete mode 100644 nerv/matrix/generic/cukernel.cu

diff --git a/Makefile b/Makefile
index fa888c3..664a83b 100644
--- a/Makefile
+++ b/Makefile
@@ -9,6 +9,7 @@ luarocks:
 install:
 	cd nerv; $(PREFIX)/bin/luarocks make
 speech:
+	cd speech/speech_utils; $(PREFIX)/bin/luarocks make
 	cd speech/htk_io; $(PREFIX)/bin/luarocks make
 clean:
 	cd nerv && make clean
diff --git a/nerv/Makefile b/nerv/Makefile
index fdffd12..728d010 100644
--- a/nerv/Makefile
+++ b/nerv/Makefile
@@ -66,7 +66,7 @@ $(LIB_PATH)/libluaT.so: $(LUAT_OBJS)
 $(INST_LIBDIR)/libnerv.so: $(NERV_OBJS) $(LIB_PATH)/libnervcore.so $(LIB_PATH)/libluaT.so
 	gcc -shared -o $@ $(NERV_OBJS) $(LDFLAGS) -Wl,-rpath=$(LIB_PATH) -L$(LIB_PATH) -lnervcore -lluaT
 
-$(OBJ_DIR)/matrix/cumatrix.o: matrix/generic/cumatrix.c matrix/generic/matrix.c matrix/generic/cukernel.cu
+$(OBJ_DIR)/matrix/cumatrix.o: matrix/generic/cumatrix.c matrix/generic/matrix.c
 $(OBJ_DIR)/matrix/mmatrix.o: matrix/generic/mmatrix.c matrix/generic/matrix.c
 
 $(OBJ_DIR)/lib/matrix/cumatrix.o: lib/matrix/generic/cumatrix.c lib/matrix/generic/matrix.c lib/matrix/generic/cukernel.cu
diff --git a/nerv/matrix/generic/cukernel.cu b/nerv/matrix/generic/cukernel.cu
deleted file mode 100644
index 2ae5e62..0000000
--- a/nerv/matrix/generic/cukernel.cu
+++ /dev/null
@@ -1,592 +0,0 @@
-#ifdef NERV_GENERIC_CUKERNEL
-#include <assert.h>
-#include <stdio.h>
-#include "matrix.h"
-#include "cuda.h"
-#include "float.h"
-#define CUDA_THREADS_N 16
-#define CUDA_THREADS_NN ((CUDA_THREADS_N) * (CUDA_THREADS_N))
-#define CEIL_DIV(a, b) (((a) + (b) - 1) / (b))
-__global__ void cudak_(log_elem)(const MATRIX_ELEM *a, MATRIX_ELEM *b, 
-                                int nrow, int ncol, int stride) {
-    int j = blockIdx.x * blockDim.x + threadIdx.x;
-    int i = blockIdx.y * blockDim.y + threadIdx.y;
-    long idx;
-    MATRIX_ELEM tmp;
-    if (i >= nrow || j >= ncol) return;
-    idx = j + i * stride;
-    tmp = a[idx];
-    if(tmp < FLT_MIN) tmp = FLT_MIN;
-    b[idx] = log(tmp);
-}
-
-__global__ void cudak_(mul_elem)(const MATRIX_ELEM *a, const MATRIX_ELEM *b,
-                                MATRIX_ELEM *c, 
-                                int nrow, int ncol, int stride) {
-    int j = blockIdx.x * blockDim.x + threadIdx.x;
-    int i = blockIdx.y * blockDim.y + threadIdx.y;
-    long idx;
-    if (i >= nrow || j >= ncol) return;
-    idx = j + i * stride;
-    c[idx] = a[idx] * b[idx];
-}
-
-__global__ void cudak_(sigmoid)(const MATRIX_ELEM *a, MATRIX_ELEM *b,
-                        int nrow, int ncol, int stride) {
-    int j = blockIdx.x * blockDim.x + threadIdx.x;
-    int i = blockIdx.y * blockDim.y + threadIdx.y;
-    long idx;
-    if (i >= nrow || j >= ncol) return;
-    idx = j + i * stride;
-    b[idx] = 1.0 / (1.0 + exp(-a[idx]));
-}
-
-__global__ void cudak_(sigmoid_grad)(const MATRIX_ELEM *output,
-                                    const MATRIX_ELEM *err,
-                                    MATRIX_ELEM *nerr,
-                                    int nrow, int ncol, int stride) {
-    int j = blockIdx.x * blockDim.x + threadIdx.x;
-    int i = blockIdx.y * blockDim.y + threadIdx.y;
-    long idx;
-    if (i >= nrow || j >= ncol) return;
-    idx = j + i * stride;
-    nerr[idx] = output[idx] * (1.0 - output[idx]) * err[idx];
-}
-
-__global__ void cudak_(softmax_final)(const MATRIX_ELEM *a, MATRIX_ELEM *b,
-                        const MATRIX_ELEM *max, const MATRIX_ELEM *deno,
-                        int nrow, int ncol, int stride, int mstride) {
-    int j = blockIdx.x * blockDim.x + threadIdx.x;
-    int i = blockIdx.y * blockDim.y + threadIdx.y;
-    long idx;
-    if (i >= nrow || j >= ncol) return;
-    idx = j + i * stride;
-    b[idx] = exp(a[idx] - max[0 + i * mstride]) / deno[0 + i * mstride];
-}
-
-__global__ void cudak_(block_reduce_rowsum)(const MATRIX_ELEM *input,
-                                            MATRIX_ELEM *output,
-                                            const int istride, const int ostride,
-                                            const int n) {
-    extern __shared__ MATRIX_ELEM cudak_(arr)[];
-    int j = blockIdx.x * blockDim.x + threadIdx.x;
-    cudak_(arr)[threadIdx.x] = j < n ? input[j + istride * blockIdx.y] : 0;
-    __syncthreads();
-    for (int offset = blockDim.x >> 1;  offset; offset >>= 1)
-    {
-        if (threadIdx.x < offset)
-            cudak_(arr)[threadIdx.x] += cudak_(arr)[threadIdx.x + offset];
-        __syncthreads();
-    }
-    if (threadIdx.x == 0)
-        output[blockIdx.x + ostride * blockIdx.y] = cudak_(arr)[0];
-}
-
-__global__ void cudak_(block_reduce_colsum)(const MATRIX_ELEM *input,
-                                MATRIX_ELEM *output,
-                                const int istride, const int ostride,
-                                const int n) {
-    extern __shared__ MATRIX_ELEM cudak_(arr)[];
-    int i = blockIdx.y * blockDim.y + threadIdx.y;
-    cudak_(arr)[threadIdx.y] = i < n ? input[blockIdx.x + istride * i] : 0;
-    __syncthreads();
-    for (int offset = blockDim.y >> 1;  offset; offset >>= 1)
-    {
-        if (threadIdx.y < offset)
-            cudak_(arr)[threadIdx.y] += cudak_(arr)[threadIdx.y + offset];
-        __syncthreads();
-    }
-    if (threadIdx.y == 0)
-        output[blockIdx.x + ostride * blockIdx.y] = cudak_(arr)[0];
-}
-
-__global__ void cudak_(block_reduce_colsame)(const MATRIX_ELEM *input,
-                                            const MATRIX_ELEM *ref_input,
-                                            MATRIX_ELEM *output,
-                                            const int istride, const int ostride,
-                                            const int n) {
-    extern __shared__ MATRIX_ELEM cudak_(arr)[];
-    int i = blockIdx.y * blockDim.y + threadIdx.y;
-    cudak_(arr)[threadIdx.y] = (i < n && input[blockIdx.x + istride * i] == \
-                                        ref_input[blockIdx.x + istride * i]) ? 1.0 : 0;
-    __syncthreads();
-    for (int offset = blockDim.y >> 1;  offset; offset >>= 1)
-    {
-        if (threadIdx.y < offset)
-            cudak_(arr)[threadIdx.y] += cudak_(arr)[threadIdx.y + offset];
-        __syncthreads();
-    }
-    if (threadIdx.y == 0)
-        output[blockIdx.x + ostride * blockIdx.y] = cudak_(arr)[0];
-}
-
-__global__ void cudak_(block_reduce_softmax_rowsum)(const MATRIX_ELEM *input,
-                                        MATRIX_ELEM *output,
-                                        const MATRIX_ELEM *max,
-                                        const int istride, const int ostride,
-                                        const int mstride, const int n) {
-    extern __shared__ MATRIX_ELEM cudak_(arr)[];
-    int j = blockIdx.x * blockDim.x + threadIdx.x;
-    cudak_(arr)[threadIdx.x] = j < n ? exp(input[j + istride * blockIdx.y] - \
-                                    max[0 + mstride * blockIdx.y]) : 0;
-    __syncthreads();
-    for (int offset = blockDim.x >> 1;  offset; offset >>= 1)
-    {
-        if (threadIdx.x < offset)
-            cudak_(arr)[threadIdx.x] += cudak_(arr)[threadIdx.x + offset];
-        __syncthreads();
-    }
-    if (threadIdx.x == 0)
-        output[blockIdx.x + ostride * blockIdx.y] = cudak_(arr)[0];
-}
-
-__global__ void cudak_(block_reduce_rowmax)(const MATRIX_ELEM *input,
-                                            MATRIX_ELEM *output,
-                                            const int istride, const int ostride,
-                                            const int n) {
-    extern __shared__ MATRIX_ELEM cudak_(arr)[];
-    int j = blockIdx.x * blockDim.x + threadIdx.x;
-    cudak_(arr)[threadIdx.x] = j < n ? input[j + istride * blockIdx.y] : -FLT_MAX;
-    __syncthreads();
-    for (int offset = blockDim.x >> 1;  offset; offset >>= 1)
-    {
-        if (threadIdx.x < offset)
-        {
-            MATRIX_ELEM l = cudak_(arr)[threadIdx.x],
-                        r = cudak_(arr)[threadIdx.x + offset];
-            if (r > l)
-                cudak_(arr)[threadIdx.x] = r;
-        }
-        __syncthreads();
-    }
-    if (threadIdx.x == 0)
-        output[blockIdx.x + ostride * blockIdx.y] = cudak_(arr)[0];
-}
-
-__global__ void cudak_(block_reduce_rowmax_idx)(const MATRIX_ELEM *input,
-                                                const MATRIX_ELEM *idx_input,
-                                                MATRIX_ELEM *output,
-                                                MATRIX_ELEM *idx_output,
-                                                const int istride, const int ostride,
-                                                const int n) {
-    extern __shared__ MATRIX_ELEM cudak_(arr)[];
-    MATRIX_ELEM *arr_val = cudak_(arr);
-    MATRIX_ELEM *arr_idx = arr_val + blockDim.x;
-    int j = blockIdx.x * blockDim.x + threadIdx.x;
-    arr_val[threadIdx.x] = j < n ? input[j + istride * blockIdx.y] : -FLT_MAX;
-    arr_idx[threadIdx.x] = j < n ? idx_input[j + istride * blockIdx.y] : 0;
-    __syncthreads();
-    for (int offset = blockDim.x >> 1;  offset; offset >>= 1)
-    {
-        if (threadIdx.x < offset)
-        {
-            MATRIX_ELEM l = arr_val[threadIdx.x],
-                        r = arr_val[threadIdx.x + offset];
-            if (r > l)
-            {
-                arr_val[threadIdx.x] = r;
-                arr_idx[threadIdx.x] = arr_idx[threadIdx.x + offset];
-            }
-        }
-        __syncthreads();
-    }
-    if (threadIdx.x == 0)
-    {
-        output[blockIdx.x + ostride * blockIdx.y] = arr_val[0];
-        idx_output[blockIdx.x + ostride * blockIdx.y] = arr_idx[0];
-    }
-}
-
-__global__ void cudak_(add_row)(const MATRIX_ELEM *a, MATRIX_ELEM *b,
-                                int nrow, int ncol, int stride, double beta) {
-    int j = blockIdx.x * blockDim.x + threadIdx.x;
-    int i = blockIdx.y * blockDim.y + threadIdx.y;
-    if (i >= nrow || j >= ncol) return;
-    b[j + i * stride] += beta * a[j];
-}
-
-__global__ void cudak_(fill)(MATRIX_ELEM *a,
-                            int nrow, int ncol, int stride, double val) {
-    int j = blockIdx.x * blockDim.x + threadIdx.x;
-    int i = blockIdx.y * blockDim.y + threadIdx.y;
-    if (i >= nrow || j >= ncol) return;
-    a[j + i * stride] = val;
-}
-
-__global__ void cudak_(clip)(MATRIX_ELEM *a,
-                            int nrow, int ncol, int stride, double val_1, double val_2) {
-    int j = blockIdx.x * blockDim.x + threadIdx.x;
-    int i = blockIdx.y * blockDim.y + threadIdx.y;
-    if (i >= nrow || j >= ncol) return;
-    if (a[j + i * stride] > val_2)
-        a[j + i * stride] = val_2;
-    else if (a[j + i * stride] < val_1)
-        a[j + i * stride] = val_1;
-}
-
-__global__ void cudak_(expand_frm)(const MATRIX_ELEM *a, MATRIX_ELEM *b,
-                                    int nrow, int ncol,
-                                    int enrow, int encol,
-                                    int stride, int estride,
-                                    int context) {
-    int j = blockIdx.x * blockDim.x + threadIdx.x;
-    int i = blockIdx.y * blockDim.y + threadIdx.y;
-    int ridx;
-    if (i >= enrow || j >= encol) return;
-    ridx = i + j / ncol - context;
-    if (ridx < 0) ridx = 0;
-    else if (ridx >= nrow) ridx = nrow - 1;
-    b[j + i * estride] = a[j % ncol + ridx * stride];
-}
-
-__global__ void cudak_(rearrange_frm)(const MATRIX_ELEM *a, MATRIX_ELEM *b,
-                                    int nrow, int ncol,
-                                    int stride, int step, int orig_dim) {
-    int j = blockIdx.x * blockDim.x + threadIdx.x;
-    int i = blockIdx.y * blockDim.y + threadIdx.y;
-    if (i >= nrow || j >= ncol) return;
-    b[j + i * stride] = a[j / step + (j % step) * orig_dim + i * stride];
-}
-
-__global__ void cudak_(scale_rows_by_col)(const MATRIX_ELEM *a, MATRIX_ELEM *b,
-                                        int nrow, int ncol,
-                                        int astride, int bstride) {
-    int j = blockIdx.x * blockDim.x + threadIdx.x;
-    int i = blockIdx.y * blockDim.y + threadIdx.y;
-    if (i >= nrow || j >= ncol) return;
-    b[j + i * bstride] *= a[i * astride];
-}
-
-__global__ void cudak_(scale_rows_by_row)(const MATRIX_ELEM *a, MATRIX_ELEM *b,
-                                        int nrow, int ncol,
-                                        int stride) {
-    int j = blockIdx.x * blockDim.x + threadIdx.x;
-    int i = blockIdx.y * blockDim.y + threadIdx.y;
-    if (i >= nrow || j >= ncol) return;
-    b[j + i * stride] *= a[j];
-}
-
-__global__ void cudak_(decompress)(const MATRIX_ELEM *a, MATRIX_ELEM *b,
-                                    int nrow, int ncol,
-                                    int stride_a, int stride_b) {
-    int j = blockIdx.x * blockDim.x + threadIdx.x;
-    int i = blockIdx.y * blockDim.y + threadIdx.y;
-    if (i >= nrow || j >= ncol) return;
-    b[lrintf(a[j + i * stride_a]) + i * stride_b] = 1.0;
-}
-
-__global__ void cudak_(gen_col_idx)(MATRIX_ELEM *b,
-                                    int nrow, int ncol, int stride) {
-    int j = blockIdx.x * blockDim.x + threadIdx.x;
-    int i = blockIdx.y * blockDim.y + threadIdx.y;
-    if (i >= nrow || j >= ncol) return;
-    b[j + i * stride] = j;
-}
-
-extern "C" {
-#include "../cukernel.h"
-    void cudak_(cuda_log_elem)(const Matrix *a, Matrix *b) {
-        dim3 threadsPerBlock(CUDA_THREADS_N, CUDA_THREADS_N);
-        dim3 numBlocks(CEIL_DIV(b->ncol, threadsPerBlock.x),
-                CEIL_DIV(b->nrow, threadsPerBlock.y));
-        cudak_(log_elem)<<<numBlocks, threadsPerBlock>>> \
-            (MATRIX_ELEM_PTR(a), MATRIX_ELEM_PTR(b),
-             b->nrow, b->ncol, b->stride / sizeof(MATRIX_ELEM));
-        cudaStreamSynchronize(0);
-    }
-
-    void cudak_(cuda_mul_elem)(const Matrix *a, const Matrix *b,
-                                Matrix *c) {
-        dim3 threadsPerBlock(CUDA_THREADS_N, CUDA_THREADS_N);
-        dim3 numBlocks(CEIL_DIV(b->ncol, threadsPerBlock.x),
-                CEIL_DIV(b->nrow, threadsPerBlock.y));
-        cudak_(mul_elem)<<<numBlocks, threadsPerBlock>>> \
-            (MATRIX_ELEM_PTR(a), MATRIX_ELEM_PTR(b),
-             MATRIX_ELEM_PTR(c),
-             b->nrow, b->ncol, b->stride / sizeof(MATRIX_ELEM));
-        cudaStreamSynchronize(0);
-    }
-
-    void cudak_(cuda_sigmoid)(const Matrix *a, Matrix *b) {
-        dim3 threadsPerBlock(CUDA_THREADS_N, CUDA_THREADS_N);
-        dim3 numBlocks(CEIL_DIV(b->ncol, threadsPerBlock.x),
-                CEIL_DIV(b->nrow, threadsPerBlock.y));
-        cudak_(sigmoid)<<<numBlocks, threadsPerBlock>>> \
-            (MATRIX_ELEM_PTR(a), MATRIX_ELEM_PTR(b), b->nrow, b->ncol,
-            b->stride / sizeof(MATRIX_ELEM));
-        cudaStreamSynchronize(0);
-    }
-
-    void cudak_(cuda_sigmoid_grad)(const Matrix *output,
-                                    const Matrix *err, Matrix *nerr) {
-        dim3 threadsPerBlock(CUDA_THREADS_N, CUDA_THREADS_N);
-        dim3 numBlocks(CEIL_DIV(nerr->ncol, threadsPerBlock.x),
-                CEIL_DIV(nerr->nrow, threadsPerBlock.y));
-        cudak_(sigmoid_grad)<<<numBlocks, threadsPerBlock>>> \
-            (MATRIX_ELEM_PTR(output), MATRIX_ELEM_PTR(err),
-             MATRIX_ELEM_PTR(nerr),
-             nerr->nrow, nerr->ncol,
-             nerr->stride / sizeof(MATRIX_ELEM));
-        cudaStreamSynchronize(0);
-    }
-
-    void cudak_(cuda_rowsum)(const Matrix *a, Matrix *b) {
-        dim3 block(CUDA_THREADS_NN, 1);
-        int ncol = a->ncol;
-        int blocks_per_row = CEIL_DIV(ncol, block.x);
-        dim3 grid(blocks_per_row, a->nrow);
-        MATRIX_ELEM *res;
-        size_t stride;
-        cudaMallocPitch(&res, &stride, blocks_per_row * sizeof(MATRIX_ELEM), a->nrow);
-        cudak_(block_reduce_rowsum)<<<grid, block, block.x * sizeof(MATRIX_ELEM)>>> \
-            (MATRIX_ELEM_PTR(a), res,
-             a->stride / sizeof(MATRIX_ELEM), stride / sizeof(MATRIX_ELEM),
-             ncol);
-        ncol = blocks_per_row;
-        assert((unsigned long)ncol <= block.x);
-        grid.x = 1;
-        cudaStreamSynchronize(0);
-        cudak_(block_reduce_rowsum)<<<grid, block, block.x * sizeof(MATRIX_ELEM)>>> \
-            (res, MATRIX_ELEM_PTR(b),
-             stride / sizeof(MATRIX_ELEM), b->stride / sizeof(MATRIX_ELEM),
-             ncol);
-        cudaStreamSynchronize(0);
-        cudaFree(res);
-    }
-
-    void cudak_(cuda_colsame)(const Matrix *a, const Matrix *ref, Matrix *b) {
-        dim3 block(1, CUDA_THREADS_NN);
-        int nrow = a->nrow;
-        int blocks_per_col = CEIL_DIV(nrow, block.y);
-        dim3 grid(a->ncol, blocks_per_col);
-        MATRIX_ELEM *res;
-        size_t stride;
-        cudaMallocPitch(&res, &stride, a->ncol * sizeof(MATRIX_ELEM), blocks_per_col);
-        cudak_(block_reduce_colsame)<<<grid, block, block.y * sizeof(MATRIX_ELEM)>>> \
-            (MATRIX_ELEM_PTR(a), MATRIX_ELEM_PTR(ref), res,
-             a->stride / sizeof(MATRIX_ELEM), stride / sizeof(MATRIX_ELEM),
-             nrow);
-        nrow = blocks_per_col;
-        assert((unsigned long)nrow <= block.y);
-        grid.y = 1;
-        cudaStreamSynchronize(0);
-        cudak_(block_reduce_colsum)<<<grid, block, block.y * sizeof(MATRIX_ELEM)>>> \
-            (res, MATRIX_ELEM_PTR(b),
-             stride / sizeof(MATRIX_ELEM), b->stride / sizeof(MATRIX_ELEM),
-             nrow);
-        cudaStreamSynchronize(0);
-        cudaFree(res);
-    }
-
-    void cudak_(cuda_colsum)(const Matrix *a, Matrix *b) {
-        dim3 block(1, CUDA_THREADS_NN);
-        int nrow = a->nrow;
-        int blocks_per_col = CEIL_DIV(nrow, block.y);
-        dim3 grid(a->ncol, blocks_per_col);
-        MATRIX_ELEM *res;
-        size_t stride;
-        cudaMallocPitch(&res, &stride, a->ncol * sizeof(MATRIX_ELEM), blocks_per_col);
-        cudak_(block_reduce_colsum)<<<grid, block, block.y * sizeof(MATRIX_ELEM)>>> \
-            (MATRIX_ELEM_PTR(a), res,
-             a->stride / sizeof(MATRIX_ELEM), stride / sizeof(MATRIX_ELEM),
-             nrow);
-        nrow = blocks_per_col;
-        assert((unsigned long)nrow <= block.y);
-        grid.y = 1;
-        cudaStreamSynchronize(0);
-        cudak_(block_reduce_colsum)<<<grid, block, block.y * sizeof(MATRIX_ELEM)>>> \
-            (res, MATRIX_ELEM_PTR(b),
-             stride / sizeof(MATRIX_ELEM), b->stride / sizeof(MATRIX_ELEM),
-             nrow);
-        cudaStreamSynchronize(0);
-        cudaFree(res);
-    }
-
-    void cudak_(cuda_softmax_final)(const Matrix *a, const Matrix *max,
-                            const Matrix *deno, Matrix *b) {
-        dim3 threadsPerBlock(CUDA_THREADS_N, CUDA_THREADS_N);
-        dim3 numBlocks(CEIL_DIV(b->ncol, threadsPerBlock.x),
-                CEIL_DIV(b->nrow, threadsPerBlock.y));
-        cudak_(softmax_final)<<<numBlocks, threadsPerBlock>>> \
-                (MATRIX_ELEM_PTR(a), MATRIX_ELEM_PTR(b),
-                MATRIX_ELEM_PTR(max), MATRIX_ELEM_PTR(deno),
-                b->nrow, b->ncol,
-                b->stride / sizeof(MATRIX_ELEM),
-                max->stride / sizeof(MATRIX_ELEM));
-        cudaStreamSynchronize(0);
-    }
-
-    void cudak_(cuda_softmax_denominator)(const Matrix *a, const Matrix *max, Matrix *b) {
-        dim3 block(CUDA_THREADS_NN, 1);
-        int ncol = a->ncol;
-        int blocks_per_row = CEIL_DIV(ncol, block.x);
-        dim3 grid(blocks_per_row, a->nrow);
-        MATRIX_ELEM *res;
-        size_t stride;
-        assert(max->ncol == 1);
-        cudaMallocPitch(&res, &stride, blocks_per_row * sizeof(MATRIX_ELEM), a->nrow);
-        cudak_(block_reduce_softmax_rowsum) \
-            <<<grid, block, block.x * sizeof(MATRIX_ELEM)>>> \
-            (MATRIX_ELEM_PTR(a), res, MATRIX_ELEM_PTR(max),
-             a->stride / sizeof(MATRIX_ELEM), stride / sizeof(MATRIX_ELEM),
-             max->stride / sizeof(MATRIX_ELEM),
-             ncol);
-        ncol = blocks_per_row;
-        assert((unsigned long)ncol <= block.x);
-        grid.x = 1;
-        cudaStreamSynchronize(0);
-        cudak_(block_reduce_rowsum) \
-            <<<grid, block, block.x * sizeof(MATRIX_ELEM)>>> \
-            (res, MATRIX_ELEM_PTR(b),
-             stride / sizeof(MATRIX_ELEM), b->stride / sizeof(MATRIX_ELEM),
-             ncol);
-        cudaStreamSynchronize(0);
-        cudaFree(res);
-    }
-
-    void cudak_(cuda_rowmax)(const Matrix *a, Matrix *b) {
-        dim3 block(CUDA_THREADS_NN, 1);
-        int ncol = a->ncol;
-        int blocks_per_row = CEIL_DIV(ncol, block.x);
-        dim3 grid(blocks_per_row, a->nrow);
-        MATRIX_ELEM *res;
-        size_t stride;
-        cudaMallocPitch(&res, &stride, blocks_per_row * sizeof(MATRIX_ELEM), a->nrow);
-        cudak_(block_reduce_rowmax)<<<grid, block, block.x * sizeof(MATRIX_ELEM)>>> \
-            (MATRIX_ELEM_PTR(a), res,
-             a->stride / sizeof(MATRIX_ELEM), stride / sizeof(MATRIX_ELEM),
-             ncol);
-        ncol = blocks_per_row;
-        assert((unsigned long)ncol <= block.x);
-        grid.x = 1;
-        cudaStreamSynchronize(0);
-        cudak_(block_reduce_rowmax)<<<grid, block, block.x * sizeof(MATRIX_ELEM)>>> \
-            (res, MATRIX_ELEM_PTR(b),
-             stride / sizeof(MATRIX_ELEM), b->stride / sizeof(MATRIX_ELEM),
-             ncol);
-        cudaStreamSynchronize(0);
-        cudaFree(res);
-    }
-
-    void cudak_(cuda_rowmax_idx)(const Matrix *a, Matrix *b, Matrix *b_idx) {
-        dim3 block(CUDA_THREADS_NN, 1);
-        int ncol = a->ncol;
-        int blocks_per_row = CEIL_DIV(ncol, block.x);
-        dim3 grid(blocks_per_row, a->nrow);
-        MATRIX_ELEM *a_idx, *res, *res_idx;
-        size_t stride;
-        cudaMallocPitch(&a_idx, &stride, a->stride, a->nrow);
-        cudak_(gen_col_idx)<<<grid, block>>>(a_idx, a->nrow, ncol, stride / sizeof(MATRIX_ELEM));
-        cudaMallocPitch(&res, &stride, blocks_per_row * sizeof(MATRIX_ELEM), a->nrow);
-        cudaMallocPitch(&res_idx, &stride, blocks_per_row * sizeof(MATRIX_ELEM), a->nrow);
-        cudaStreamSynchronize(0);
-        cudak_(block_reduce_rowmax_idx)<<<grid, block,
-                                        2 * block.x * sizeof(MATRIX_ELEM)>>> \
-            (MATRIX_ELEM_PTR(a), a_idx, res, res_idx,
-             a->stride / sizeof(MATRIX_ELEM), stride / sizeof(MATRIX_ELEM),
-             ncol);
-        ncol = blocks_per_row;
-        assert((unsigned long)ncol <= block.x);
-        grid.x = 1;
-        cudaStreamSynchronize(0);
-        cudak_(block_reduce_rowmax_idx)<<<grid, block,
-                                        2 * block.x * sizeof(MATRIX_ELEM)>>> \
-            (res, res_idx, MATRIX_ELEM_PTR(b), MATRIX_ELEM_PTR(b_idx),
-             stride / sizeof(MATRIX_ELEM), b->stride / sizeof(MATRIX_ELEM),
-             ncol);
-        cudaStreamSynchronize(0);
-        cudaFree(a_idx);
-        cudaFree(res);
-        cudaFree(res_idx);
-    }
-
-    /* in-place calc */
-    void cudak_(cuda_add_row)(const Matrix *a, Matrix *b, double beta) {
-        dim3 threadsPerBlock(CUDA_THREADS_N, CUDA_THREADS_N);
-        dim3 numBlocks(CEIL_DIV(b->ncol, threadsPerBlock.x),
-                CEIL_DIV(b->nrow, threadsPerBlock.y));
-        cudak_(add_row)<<<numBlocks, threadsPerBlock>>> \
-            (MATRIX_ELEM_PTR(a), MATRIX_ELEM_PTR(b), b->nrow, b->ncol,
-            b->stride / sizeof(MATRIX_ELEM), beta);
-        cudaStreamSynchronize(0);
-    }
-
-    void cudak_(cuda_fill)(Matrix *a, double val) {
-        dim3 threadsPerBlock(CUDA_THREADS_N, CUDA_THREADS_N);
-        dim3 numBlocks(CEIL_DIV(a->ncol, threadsPerBlock.x),
-                CEIL_DIV(a->nrow, threadsPerBlock.y));
-        cudak_(fill)<<<numBlocks, threadsPerBlock>>> \
-            (MATRIX_ELEM_PTR(a), a->nrow, a->ncol,
-            a->stride / sizeof(MATRIX_ELEM), val);
-        cudaStreamSynchronize(0);
-    }
-
-    void cudak_(cuda_clip)(Matrix *a, double val_1, double val_2) {
-        dim3 threadsPerBlock(CUDA_THREADS_N, CUDA_THREADS_N);
-        dim3 numBlocks(CEIL_DIV(a->ncol, threadsPerBlock.x),
-                CEIL_DIV(a->nrow, threadsPerBlock.y));
-        cudak_(clip)<<<numBlocks, threadsPerBlock>>> \
-            (MATRIX_ELEM_PTR(a), a->nrow, a->ncol,
-            a->stride / sizeof(MATRIX_ELEM), val_1, val_2);
-        cudaStreamSynchronize(0);
-    }
-
-    void cudak_(cuda_expand_frm)(const Matrix *a, Matrix *b, int context) {
-        dim3 threadsPerBlock(CUDA_THREADS_N, CUDA_THREADS_N);
-        dim3 numBlocks(CEIL_DIV(b->ncol, threadsPerBlock.x),
-                CEIL_DIV(b->nrow, threadsPerBlock.y));
-        cudak_(expand_frm)<<<numBlocks, threadsPerBlock>>> \
-            (MATRIX_ELEM_PTR(a), MATRIX_ELEM_PTR(b),
-             a->nrow, a->ncol,
-             b->nrow, b->ncol,
-             a->stride / sizeof(MATRIX_ELEM),
-             b->stride / sizeof(MATRIX_ELEM),
-             context);
-        cudaStreamSynchronize(0);
-    }
-
-    void cudak_(cuda_rearrange_frm)(const Matrix *a, Matrix *b, int step) {
-        dim3 threadsPerBlock(CUDA_THREADS_N, CUDA_THREADS_N);
-        dim3 numBlocks(CEIL_DIV(b->ncol, threadsPerBlock.x),
-                CEIL_DIV(b->nrow, threadsPerBlock.y));
-        cudak_(rearrange_frm)<<<numBlocks, threadsPerBlock>>> \
-            (MATRIX_ELEM_PTR(a), MATRIX_ELEM_PTR(b),
-             b->nrow, b->ncol, b->stride / sizeof(MATRIX_ELEM),
-             step, b->ncol / step);
-        cudaStreamSynchronize(0);
-    }
-
-    void cudak_(cuda_scale_rows_by_col)(const Matrix *a, Matrix *b) {
-        dim3 threadsPerBlock(CUDA_THREADS_N, CUDA_THREADS_N);
-        dim3 numBlocks(CEIL_DIV(b->ncol, threadsPerBlock.x),
-                CEIL_DIV(b->nrow, threadsPerBlock.y));
-        cudak_(scale_rows_by_col)<<<numBlocks, threadsPerBlock>>> \
-            (MATRIX_ELEM_PTR(a), MATRIX_ELEM_PTR(b),
-             b->nrow, b->ncol,
-             a->stride / sizeof(MATRIX_ELEM),
-             b->stride / sizeof(MATRIX_ELEM));
-        cudaStreamSynchronize(0);
-    }
-
-    void cudak_(cuda_scale_rows_by_row)(const Matrix *a, Matrix *b) {
-        dim3 threadsPerBlock(CUDA_THREADS_N, CUDA_THREADS_N);
-        dim3 numBlocks(CEIL_DIV(b->ncol, threadsPerBlock.x),
-                CEIL_DIV(b->nrow, threadsPerBlock.y));
-        cudak_(scale_rows_by_row)<<<numBlocks, threadsPerBlock>>> \
-            (MATRIX_ELEM_PTR(a), MATRIX_ELEM_PTR(b),
-             b->nrow, b->ncol, b->stride / sizeof(MATRIX_ELEM));
-        cudaStreamSynchronize(0);
-    }
-
-    void cudak_(cuda_decompress)(const Matrix *a, Matrix *b) {
-        dim3 threadsPerBlock(1, CUDA_THREADS_NN);
-        dim3 numBlocks(1, CEIL_DIV(a->nrow, threadsPerBlock.y));
-        cudak_(decompress)<<<numBlocks, threadsPerBlock>>> \
-            (MATRIX_ELEM_PTR(a), MATRIX_ELEM_PTR(b),
-             a->nrow, a->ncol,
-             a->stride / sizeof(MATRIX_ELEM),
-             b->stride / sizeof(MATRIX_ELEM));
-        cudaStreamSynchronize(0);
-    }
-}
-#endif
diff --git a/speech b/speech
index 08e33af..acd1bc3 160000
--- a/speech
+++ b/speech
@@ -1 +1 @@
-Subproject commit 08e33afa533af1f026ac271446a0c873fe0bb5cb
+Subproject commit acd1bc3cf812f69a6260179b584f2a3f0e6d6b80
-- 
cgit v1.2.3-70-g09d2


From bca6cfa05af1dfc898bdb35f9f481048840e9217 Mon Sep 17 00:00:00 2001
From: Determinant <ted.sybil@gmail.com>
Date: Wed, 5 Aug 2015 08:10:24 +0800
Subject: keep up with speech repo

---
 speech | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/speech b/speech
index acd1bc3..261aed4 160000
--- a/speech
+++ b/speech
@@ -1 +1 @@
-Subproject commit acd1bc3cf812f69a6260179b584f2a3f0e6d6b80
+Subproject commit 261aed43b863c85ecf709dcac2739c388ec491d2
-- 
cgit v1.2.3-70-g09d2


From 00b02761747caf09756c7dccf11a7236777cbb42 Mon Sep 17 00:00:00 2001
From: Determinant <ted.sybil@gmail.com>
Date: Wed, 5 Aug 2015 08:17:59 +0800
Subject: ...

---
 .gitmodules | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/.gitmodules b/.gitmodules
index 1432de9..16785f1 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -3,10 +3,7 @@
 	url = http://luajit.org/git/luajit-2.0.git
 [submodule "nerv-speech"]
 	path = nerv-speech
-	url = https://github.com/Determinant/nerv-speech.git
-[submodule "speech"]
-	path = speech
-	url = https://github.com/Determinant/nerv-speech.git
+	url = https://github.com/Nerv-SJTU/nerv-speech.git
 [submodule "luarocks"]
 	path = luarocks
 	url = https://github.com/keplerproject/luarocks.git
-- 
cgit v1.2.3-70-g09d2


From 30701effba795635c9dcfa86882486f1f184e243 Mon Sep 17 00:00:00 2001
From: Determinant <ted.sybil@gmail.com>
Date: Wed, 5 Aug 2015 08:19:03 +0800
Subject: ...

---
 .gitmodules | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.gitmodules b/.gitmodules
index 16785f1..afad59e 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,7 +1,7 @@
 [submodule "luajit-2.0"]
 	path = luajit-2.0
 	url = http://luajit.org/git/luajit-2.0.git
-[submodule "nerv-speech"]
+[submodule "speech"]
 	path = nerv-speech
 	url = https://github.com/Nerv-SJTU/nerv-speech.git
 [submodule "luarocks"]
-- 
cgit v1.2.3-70-g09d2


From 7ae89059d68850e12826bc6812e4a6d521e45b53 Mon Sep 17 00:00:00 2001
From: Determinant <ted.sybil@gmail.com>
Date: Wed, 5 Aug 2015 08:20:18 +0800
Subject: ...

---
 .gitmodules | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.gitmodules b/.gitmodules
index afad59e..217ded3 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,8 +1,8 @@
 [submodule "luajit-2.0"]
 	path = luajit-2.0
 	url = http://luajit.org/git/luajit-2.0.git
-[submodule "speech"]
-	path = nerv-speech
+[submodule "nerv-speech"]
+	path = speech
 	url = https://github.com/Nerv-SJTU/nerv-speech.git
 [submodule "luarocks"]
 	path = luarocks
-- 
cgit v1.2.3-70-g09d2


From 7579ff4941d7019d4e911978879ec07b62a4e523 Mon Sep 17 00:00:00 2001
From: Determinant <ted.sybil@gmail.com>
Date: Wed, 5 Aug 2015 09:29:24 +0800
Subject: use expanded features and do global transf in embedding_example

---
 embedding_example/setup_nerv.lua          | 10 ++++++----
 embedding_example/swb_baseline_decode.lua |  5 +++++
 speech                                    |  2 +-
 3 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/embedding_example/setup_nerv.lua b/embedding_example/setup_nerv.lua
index 3ae878d..49a5dd6 100644
--- a/embedding_example/setup_nerv.lua
+++ b/embedding_example/setup_nerv.lua
@@ -7,17 +7,19 @@ param_repo:import(gconf.initialized_param, nil, gconf)
 local sublayer_repo = make_sublayer_repo(param_repo)
 local layer_repo = make_layer_repo(sublayer_repo, param_repo)
 local network = get_network(layer_repo)
+local global_transf = get_global_transf(layer_repo)
 local batch_size = 1
 network:init(batch_size)
 
 function propagator(input, output)
-    local gpu_input = nerv.CuMatrixFloat(input:nrow(), input:ncol())
+    local transformed = nerv.speech_utils.global_transf(input,
+                            global_transf, 0, gconf) -- preprocessing
+    local gpu_input = nerv.CuMatrixFloat(transformed:nrow(), transformed:ncol())
     local gpu_output = nerv.CuMatrixFloat(output:nrow(), output:ncol())
-    gpu_input:copy_fromh(input)
-    print(gpu_input)
+    print(transformed)
+    gpu_input:copy_fromh(transformed)
     network:propagate({gpu_input}, {gpu_output})
     gpu_output:copy_toh(output)
-    print(output)
     -- collect garbage in-time to save GPU memory
     collectgarbage("collect")
 end
diff --git a/embedding_example/swb_baseline_decode.lua b/embedding_example/swb_baseline_decode.lua
index 14a463b..8cdb320 100644
--- a/embedding_example/swb_baseline_decode.lua
+++ b/embedding_example/swb_baseline_decode.lua
@@ -107,3 +107,8 @@ end
 function get_network(layer_repo)
     return layer_repo:get_layer("main")
 end
+
+
+function get_global_transf(layer_repo)
+    return layer_repo:get_layer("global_transf")
+end
diff --git a/speech b/speech
index 261aed4..c6c6442 160000
--- a/speech
+++ b/speech
@@ -1 +1 @@
-Subproject commit 261aed43b863c85ecf709dcac2739c388ec491d2
+Subproject commit c6c644223dc9168befd189bf6f33243390671c99
-- 
cgit v1.2.3-70-g09d2


From 156e2b5d97ac7a702e655dc3a1e260c72eecaa9c Mon Sep 17 00:00:00 2001
From: Determinant <ted.sybil@gmail.com>
Date: Thu, 6 Aug 2015 08:56:18 +0800
Subject: fix typos in nerv.error

---
 nerv/init.lua | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/nerv/init.lua b/nerv/init.lua
index 183ae6d..9c1a5c8 100644
--- a/nerv/init.lua
+++ b/nerv/init.lua
@@ -1,15 +1,19 @@
 require 'libnerv'
 
-function nerv.error(fmt, ...)
-    error("[nerv] internal error: " .. fmt .. "\n", ...)
-end
-
 function nerv.error_method_not_implemented()
     nerv.error("method not implemented");
 end
 
+function nerv.sprintf(fmt, ...)
+    return string.format(fmt, ...)
+end
+
 function nerv.printf(fmt, ...)
-    io.write(string.format(fmt, ...))
+    io.write(nerv.sprintf(fmt, ...))
+end
+
+function nerv.error(fmt, ...)
+    error(nerv.sprintf("[nerv] internal error: " .. fmt .. "\n", ...))
 end
 
 function nerv.mesg_with_timestamp(fmt, ...)
-- 
cgit v1.2.3-70-g09d2


From ddc52b089a29dbe805ccbe499460b3e5d5b060c7 Mon Sep 17 00:00:00 2001
From: Determinant <ted.sybil@gmail.com>
Date: Thu, 6 Aug 2015 10:25:26 +0800
Subject: do not use submodule for nerv-speech

---
 .gitmodules | 3 ---
 README.md   | 4 ++--
 speech      | 1 -
 3 files changed, 2 insertions(+), 6 deletions(-)
 delete mode 160000 speech

diff --git a/.gitmodules b/.gitmodules
index 217ded3..9f556c5 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,9 +1,6 @@
 [submodule "luajit-2.0"]
 	path = luajit-2.0
 	url = http://luajit.org/git/luajit-2.0.git
-[submodule "nerv-speech"]
-	path = speech
-	url = https://github.com/Nerv-SJTU/nerv-speech.git
 [submodule "luarocks"]
 	path = luarocks
 	url = https://github.com/keplerproject/luarocks.git
diff --git a/README.md b/README.md
index 10d531c..efec2c4 100644
--- a/README.md
+++ b/README.md
@@ -8,9 +8,9 @@ First make sure you have __lua__ and __CUDA__ installed on your computer.
 __Nerv__ is currently developed via github.You can download and make __Nerv__ by doing the following:
 ```
 cd ~
-git clone https://github.com/Determinant/nerv.git
+git clone https://github.com/Nerv-SJTU/nerv.git
 cd nerv
-git submodule init && git submodule update
+git clone https://github.com/Nerv-SJTU/nerv-speech.git
 make
 ```
 The `git submodule` command is for the __luajit__ repository inside __Nerv__.  
diff --git a/speech b/speech
deleted file mode 160000
index c6c6442..0000000
--- a/speech
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit c6c644223dc9168befd189bf6f33243390671c99
-- 
cgit v1.2.3-70-g09d2


From 2dc87bc02a1242dd5e029d0baaf4e0ae7173184f Mon Sep 17 00:00:00 2001
From: Determinant <ted.sybil@gmail.com>
Date: Thu, 6 Aug 2015 10:34:43 +0800
Subject: ...

---
 README.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index efec2c4..be8e5b0 100644
--- a/README.md
+++ b/README.md
@@ -10,7 +10,8 @@ __Nerv__ is currently developed via github.You can download and make __Nerv__ by
 cd ~
 git clone https://github.com/Nerv-SJTU/nerv.git
 cd nerv
-git clone https://github.com/Nerv-SJTU/nerv-speech.git
+git clone https://github.com/Nerv-SJTU/nerv-speech.git speech
+git submodule init && git submodule update
 make
 ```
 The `git submodule` command is for the __luajit__ repository inside __Nerv__.  
-- 
cgit v1.2.3-70-g09d2


From b4d9cfa8e3a4735687311577dded97d889340134 Mon Sep 17 00:00:00 2001
From: Determinant <ted.sybil@gmail.com>
Date: Thu, 6 Aug 2015 14:08:26 +0800
Subject: make network configuration example file clearer

---
 embedding_example/main.c                  |   2 +-
 embedding_example/setup_nerv.lua          |   5 +-
 embedding_example/swb_baseline_decode.lua | 114 ---------------------
 nerv/doc/gh-pages                         |   1 +
 nerv/examples/asr_trainer.lua             |  11 +-
 nerv/examples/swb_baseline.lua            |  68 ++++++++++---
 nerv/examples/swb_baseline_basic.lua      | 161 ++++++++++++++++++++++++++++++
 nerv/nn/layer_dag.lua                     |   3 +-
 nerv/nn/layer_repo.lua                    |   8 +-
 9 files changed, 230 insertions(+), 143 deletions(-)
 delete mode 100644 embedding_example/swb_baseline_decode.lua
 create mode 160000 nerv/doc/gh-pages
 create mode 100644 nerv/examples/swb_baseline_basic.lua

diff --git a/embedding_example/main.c b/embedding_example/main.c
index b3c9bf2..8856d58 100644
--- a/embedding_example/main.c
+++ b/embedding_example/main.c
@@ -22,7 +22,7 @@ void setup_nerv() {
     luaL_openlibs(L);
     luaL_loadfile(L, "setup_nerv.lua");
     /* network configuration */
-    lua_pushstring(L, "swb_baseline_decode.lua");
+    lua_pushstring(L, "../nerv/examples/swb_baseline.lua");
     if (lua_pcall(L, 1, LUA_MULTRET, 0))
     {
         printf("%s\n", luaL_checkstring(L, 1));
diff --git a/embedding_example/setup_nerv.lua b/embedding_example/setup_nerv.lua
index 49a5dd6..5ade950 100644
--- a/embedding_example/setup_nerv.lua
+++ b/embedding_example/setup_nerv.lua
@@ -4,9 +4,8 @@ local arg = {...}
 dofile(arg[1])
 local param_repo = nerv.ParamRepo()
 param_repo:import(gconf.initialized_param, nil, gconf)
-local sublayer_repo = make_sublayer_repo(param_repo)
-local layer_repo = make_layer_repo(sublayer_repo, param_repo)
-local network = get_network(layer_repo)
+local layer_repo = make_layer_repo(param_repo)
+local network = get_decode_network(layer_repo)
 local global_transf = get_global_transf(layer_repo)
 local batch_size = 1
 network:init(batch_size)
diff --git a/embedding_example/swb_baseline_decode.lua b/embedding_example/swb_baseline_decode.lua
deleted file mode 100644
index 8cdb320..0000000
--- a/embedding_example/swb_baseline_decode.lua
+++ /dev/null
@@ -1,114 +0,0 @@
-require 'htk_io'
-gconf = {lrate = 0.8, wcost = 1e-6, momentum = 0.9,
-        cumat_type = nerv.CuMatrixFloat,
-        mmat_type = nerv.MMatrixFloat,
-        frm_ext = 5,
-        tr_scp = "/slfs1/users/mfy43/swb_ivec/train_bp.scp",
-        cv_scp = "/slfs1/users/mfy43/swb_ivec/train_cv.scp",
-        htk_conf = "/slfs1/users/mfy43/swb_ivec/plp_0_d_a.conf",
-        initialized_param = {"/slfs1/users/mfy43/swb_init.nerv",
-                "/slfs1/users/mfy43/swb_global_transf.nerv"},
-        debug = false}
-
-function make_sublayer_repo(param_repo)
-    return nerv.LayerRepo(
-    {
-        -- global transf
-        ["nerv.BiasLayer"] =
-        {
-            blayer1 = {{bias = "bias1"}, {dim_in = {429}, dim_out = {429}}},
-            blayer2 = {{bias = "bias2"}, {dim_in = {429}, dim_out = {429}}}
-        },
-        ["nerv.WindowLayer"] =
-        {
-            wlayer1 = {{window = "window1"}, {dim_in = {429}, dim_out = {429}}},
-            wlayer2 = {{window = "window2"}, {dim_in = {429}, dim_out = {429}}}
-        },
-        -- biased linearity
-        ["nerv.AffineLayer"] =
-        {
-            affine0 = {{ltp = "affine0_ltp", bp = "affine0_bp"},
-            {dim_in = {429}, dim_out = {2048}}},
-            affine1 = {{ltp = "affine1_ltp", bp = "affine1_bp"},
-            {dim_in = {2048}, dim_out = {2048}}},
-            affine2 = {{ltp = "affine2_ltp", bp = "affine2_bp"},
-            {dim_in = {2048}, dim_out = {2048}}},
-            affine3 = {{ltp = "affine3_ltp", bp = "affine3_bp"},
-            {dim_in = {2048}, dim_out = {2048}}},
-            affine4 = {{ltp = "affine4_ltp", bp = "affine4_bp"},
-            {dim_in = {2048}, dim_out = {2048}}},
-            affine5 = {{ltp = "affine5_ltp", bp = "affine5_bp"},
-            {dim_in = {2048}, dim_out = {2048}}},
-            affine6 = {{ltp = "affine6_ltp", bp = "affine6_bp"},
-            {dim_in = {2048}, dim_out = {2048}}},
-            affine7 = {{ltp = "affine7_ltp", bp = "affine7_bp"},
-            {dim_in = {2048}, dim_out = {3001}}}
-        },
-        ["nerv.SigmoidLayer"] =
-        {
-            sigmoid0 = {{}, {dim_in = {2048}, dim_out = {2048}}},
-            sigmoid1 = {{}, {dim_in = {2048}, dim_out = {2048}}},
-            sigmoid2 = {{}, {dim_in = {2048}, dim_out = {2048}}},
-            sigmoid3 = {{}, {dim_in = {2048}, dim_out = {2048}}},
-            sigmoid4 = {{}, {dim_in = {2048}, dim_out = {2048}}},
-            sigmoid5 = {{}, {dim_in = {2048}, dim_out = {2048}}},
-            sigmoid6 = {{}, {dim_in = {2048}, dim_out = {2048}}}
-        },
-        ["nerv.SoftmaxLayer"] =
-        {
-            soutput = {{}, {dim_in = {3001}, dim_out = {3001}}}
-        }
-    }, param_repo, gconf)
-end
-
-function make_layer_repo(sublayer_repo, param_repo)
-    return nerv.LayerRepo(
-    {
-        ["nerv.DAGLayer"] =
-        {
-            global_transf = {{}, {
-                dim_in = {429}, dim_out = {429},
-                sub_layers = sublayer_repo,
-                connections = {
-                    ["<input>[1]"] = "blayer1[1]",
-                    ["blayer1[1]"] = "wlayer1[1]",
-                    ["wlayer1[1]"] = "blayer2[1]",
-                    ["blayer2[1]"] = "wlayer2[1]",
-                    ["wlayer2[1]"] = "<output>[1]"
-                }
-            }},
-            main = {{}, {
-                dim_in = {429}, dim_out = {3001},
-                sub_layers = sublayer_repo,
-                connections = {
-                    ["<input>[1]"] = "affine0[1]",
-                    ["affine0[1]"] = "sigmoid0[1]",
-                    ["sigmoid0[1]"] = "affine1[1]",
-                    ["affine1[1]"] = "sigmoid1[1]",
-                    ["sigmoid1[1]"] = "affine2[1]",
-                    ["affine2[1]"] = "sigmoid2[1]",
-                    ["sigmoid2[1]"] = "affine3[1]",
-                    ["affine3[1]"] = "sigmoid3[1]",
-                    ["sigmoid3[1]"] = "affine4[1]",
-                    ["affine4[1]"] = "sigmoid4[1]",
-                    ["sigmoid4[1]"] = "affine5[1]",
-                    ["affine5[1]"] = "sigmoid5[1]",
-                    ["sigmoid5[1]"] = "affine6[1]",
-                    ["affine6[1]"] = "sigmoid6[1]",
-                    ["sigmoid6[1]"] = "affine7[1]",
-                    ["affine7[1]"] = "soutput[1]",
-                    ["soutput[1]"] = "<output>[1]"
-                }
-            }}
-        }
-    }, param_repo, gconf)
-end
-
-function get_network(layer_repo)
-    return layer_repo:get_layer("main")
-end
-
-
-function get_global_transf(layer_repo)
-    return layer_repo:get_layer("global_transf")
-end
diff --git a/nerv/doc/gh-pages b/nerv/doc/gh-pages
new file mode 160000
index 0000000..195d95b
--- /dev/null
+++ b/nerv/doc/gh-pages
@@ -0,0 +1 @@
+Subproject commit 195d95bb663258e3b1c3962f946db9c374018a8c
diff --git a/nerv/examples/asr_trainer.lua b/nerv/examples/asr_trainer.lua
index 8dfb2ac..dcadfa3 100644
--- a/nerv/examples/asr_trainer.lua
+++ b/nerv/examples/asr_trainer.lua
@@ -1,8 +1,7 @@
 function build_trainer(ifname)
     local param_repo = nerv.ParamRepo()
     param_repo:import(ifname, nil, gconf)
-    local sublayer_repo = make_sublayer_repo(param_repo)
-    local layer_repo = make_layer_repo(sublayer_repo, param_repo)
+    local layer_repo = make_layer_repo(param_repo)
     local network = get_network(layer_repo)
     local input_order = get_input_order()
     local iterative_trainer = function (prefix, scp_file, bp)
@@ -18,7 +17,7 @@ function build_trainer(ifname)
             -- prine stat periodically
             gconf.cnt = gconf.cnt + 1
             if gconf.cnt == 1000 then
-                print_stat(sublayer_repo)
+                print_stat(layer_repo)
                 nerv.CuMatrix.print_profile()
                 nerv.CuMatrix.clear_profile()
                 gconf.cnt = 0
@@ -42,16 +41,16 @@ function build_trainer(ifname)
             -- collect garbage in-time to save GPU memory
             collectgarbage("collect")
         end
-        print_stat(sublayer_repo)
+        print_stat(layer_repo)
         nerv.CuMatrix.print_profile()
         nerv.CuMatrix.clear_profile()
         if (not bp) and prefix ~= nil then
             nerv.info("writing back...")
             local fname = string.format("%s_cv%.3f.nerv",
-                            prefix, get_accuracy(sublayer_repo))
+                            prefix, get_accuracy(layer_repo))
             network:get_params():export(fname, nil)
         end
-        return get_accuracy(sublayer_repo)
+        return get_accuracy(layer_repo)
     end
     return iterative_trainer
 end
diff --git a/nerv/examples/swb_baseline.lua b/nerv/examples/swb_baseline.lua
index 7783f2a..0e9f897 100644
--- a/nerv/examples/swb_baseline.lua
+++ b/nerv/examples/swb_baseline.lua
@@ -10,8 +10,8 @@ gconf = {lrate = 0.8, wcost = 1e-6, momentum = 0.9,
                 "/slfs1/users/mfy43/swb_global_transf.nerv"},
         debug = false}
 
-function make_sublayer_repo(param_repo)
-    return nerv.LayerRepo(
+function make_layer_repo(param_repo)
+    local layer_repo = nerv.LayerRepo(
     {
         -- global transf
         ["nerv.BiasLayer"] =
@@ -54,21 +54,23 @@ function make_sublayer_repo(param_repo)
             sigmoid5 = {{}, {dim_in = {2048}, dim_out = {2048}}},
             sigmoid6 = {{}, {dim_in = {2048}, dim_out = {2048}}}
         },
-        ["nerv.SoftmaxCELayer"] =
+        ["nerv.SoftmaxCELayer"] = -- softmax + ce criterion layer for finetune output
         {
             ce_crit = {{}, {dim_in = {3001, 1}, dim_out = {1}, compressed = true}}
+        },
+        ["nerv.SoftmaxLayer"] = -- softmax for decode output
+        {
+            softmax = {{}, {dim_in = {3001}, dim_out = {3001}}}
         }
     }, param_repo, gconf)
-end
 
-function make_layer_repo(sublayer_repo, param_repo)
-    return nerv.LayerRepo(
+    layer_repo:add_layers(
     {
         ["nerv.DAGLayer"] =
         {
             global_transf = {{}, {
                 dim_in = {429}, dim_out = {429},
-                sub_layers = sublayer_repo,
+                sub_layers = layer_repo,
                 connections = {
                     ["<input>[1]"] = "blayer1[1]",
                     ["blayer1[1]"] = "wlayer1[1]",
@@ -78,8 +80,8 @@ function make_layer_repo(sublayer_repo, param_repo)
                 }
             }},
             main = {{}, {
-                dim_in = {429, 1}, dim_out = {1},
-                sub_layers = sublayer_repo,
+                dim_in = {429}, dim_out = {3001},
+                sub_layers = layer_repo,
                 connections = {
                     ["<input>[1]"] = "affine0[1]",
                     ["affine0[1]"] = "sigmoid0[1]",
@@ -96,17 +98,51 @@ function make_layer_repo(sublayer_repo, param_repo)
                     ["sigmoid5[1]"] = "affine6[1]",
                     ["affine6[1]"] = "sigmoid6[1]",
                     ["sigmoid6[1]"] = "affine7[1]",
-                    ["affine7[1]"] = "ce_crit[1]",
+                    ["affine7[1]"] = "<output>[1]"
+                }
+            }}
+        }
+    }, param_repo, gconf)
+
+    layer_repo:add_layers(
+    {
+        ["nerv.DAGLayer"] =
+        {
+            ce_output = {{}, {
+                dim_in = {429, 1}, dim_out = {1},
+                sub_layers = layer_repo,
+                connections = {
+                    ["<input>[1]"] = "main[1]",
+                    ["main[1]"] = "ce_crit[1]",
                     ["<input>[2]"] = "ce_crit[2]",
                     ["ce_crit[1]"] = "<output>[1]"
                 }
+            }},
+            softmax_output = {{}, {
+                dim_in = {429}, dim_out = {3001},
+                sub_layers = layer_repo,
+                connections = {
+                    ["<input>[1]"] = "main[1]",
+                    ["main[1]"] = "softmax[1]",
+                    ["softmax[1]"] = "<output>[1]"
+                }
             }}
         }
     }, param_repo, gconf)
+
+    return layer_repo
 end
 
 function get_network(layer_repo)
-    return layer_repo:get_layer("main")
+    return layer_repo:get_layer("ce_output")
+end
+
+function get_decode_network(layer_repo)
+    return layer_repo:get_layer("softmax_output")
+end
+
+function get_global_transf(layer_repo)
+    return layer_repo:get_layer("global_transf")
 end
 
 function make_readers(scp_file, layer_repo)
@@ -145,18 +181,18 @@ function get_input_order()
     return {"main_scp", "phone_state"}
 end
 
-function get_accuracy(sublayer_repo)
-    local ce_crit = sublayer_repo:get_layer("ce_crit")
+function get_accuracy(layer_repo)
+    local ce_crit = layer_repo:get_layer("ce_crit")
     return ce_crit.total_correct / ce_crit.total_frames * 100
 end
 
-function print_stat(sublayer_repo)
-    local ce_crit = sublayer_repo:get_layer("ce_crit")
+function print_stat(layer_repo)
+    local ce_crit = layer_repo:get_layer("ce_crit")
     nerv.info("*** training stat begin ***")
     nerv.printf("cross entropy:\t\t%.8f\n", ce_crit.total_ce)
     nerv.printf("correct:\t\t%d\n", ce_crit.total_correct)
     nerv.printf("frames:\t\t\t%d\n", ce_crit.total_frames)
     nerv.printf("err/frm:\t\t%.8f\n", ce_crit.total_ce / ce_crit.total_frames)
-    nerv.printf("accuracy:\t\t%.3f%%\n", get_accuracy(sublayer_repo))
+    nerv.printf("accuracy:\t\t%.3f%%\n", get_accuracy(layer_repo))
     nerv.info("*** training stat end ***")
 end
diff --git a/nerv/examples/swb_baseline_basic.lua b/nerv/examples/swb_baseline_basic.lua
new file mode 100644
index 0000000..c47ec3e
--- /dev/null
+++ b/nerv/examples/swb_baseline_basic.lua
@@ -0,0 +1,161 @@
+require 'htk_io'
+gconf = {lrate = 0.8, wcost = 1e-6, momentum = 0.9,
+        cumat_type = nerv.CuMatrixFloat,
+        mmat_type = nerv.MMatrixFloat,
+        frm_ext = 5,
+        tr_scp = "/slfs1/users/mfy43/swb_ivec/train_bp.scp",
+        cv_scp = "/slfs1/users/mfy43/swb_ivec/train_cv.scp",
+        htk_conf = "/slfs1/users/mfy43/swb_ivec/plp_0_d_a.conf",
+        initialized_param = {"/slfs1/users/mfy43/swb_init.nerv",
+                "/slfs1/users/mfy43/swb_global_transf.nerv"},
+        debug = false}
+
+function make_layer_repo(param_repo)
+    local layer_repo = nerv.LayerRepo(
+    {
+        -- global transf
+        ["nerv.BiasLayer"] =
+        {
+            blayer1 = {{bias = "bias1"}, {dim_in = {429}, dim_out = {429}}},
+            blayer2 = {{bias = "bias2"}, {dim_in = {429}, dim_out = {429}}}
+        },
+        ["nerv.WindowLayer"] =
+        {
+            wlayer1 = {{window = "window1"}, {dim_in = {429}, dim_out = {429}}},
+            wlayer2 = {{window = "window2"}, {dim_in = {429}, dim_out = {429}}}
+        },
+        -- biased linearity
+        ["nerv.AffineLayer"] =
+        {
+            affine0 = {{ltp = "affine0_ltp", bp = "affine0_bp"},
+            {dim_in = {429}, dim_out = {2048}}},
+            affine1 = {{ltp = "affine1_ltp", bp = "affine1_bp"},
+            {dim_in = {2048}, dim_out = {2048}}},
+            affine2 = {{ltp = "affine2_ltp", bp = "affine2_bp"},
+            {dim_in = {2048}, dim_out = {2048}}},
+            affine3 = {{ltp = "affine3_ltp", bp = "affine3_bp"},
+            {dim_in = {2048}, dim_out = {2048}}},
+            affine4 = {{ltp = "affine4_ltp", bp = "affine4_bp"},
+            {dim_in = {2048}, dim_out = {2048}}},
+            affine5 = {{ltp = "affine5_ltp", bp = "affine5_bp"},
+            {dim_in = {2048}, dim_out = {2048}}},
+            affine6 = {{ltp = "affine6_ltp", bp = "affine6_bp"},
+            {dim_in = {2048}, dim_out = {2048}}},
+            affine7 = {{ltp = "affine7_ltp", bp = "affine7_bp"},
+            {dim_in = {2048}, dim_out = {3001}}}
+        },
+        ["nerv.SigmoidLayer"] =
+        {
+            sigmoid0 = {{}, {dim_in = {2048}, dim_out = {2048}}},
+            sigmoid1 = {{}, {dim_in = {2048}, dim_out = {2048}}},
+            sigmoid2 = {{}, {dim_in = {2048}, dim_out = {2048}}},
+            sigmoid3 = {{}, {dim_in = {2048}, dim_out = {2048}}},
+            sigmoid4 = {{}, {dim_in = {2048}, dim_out = {2048}}},
+            sigmoid5 = {{}, {dim_in = {2048}, dim_out = {2048}}},
+            sigmoid6 = {{}, {dim_in = {2048}, dim_out = {2048}}}
+        },
+        ["nerv.SoftmaxCELayer"] =
+        {
+            ce_crit = {{}, {dim_in = {3001, 1}, dim_out = {1}, compressed = true}}
+        }
+    }, param_repo, gconf)
+
+    layer_repo:add_layers(
+    {
+        ["nerv.DAGLayer"] =
+        {
+            global_transf = {{}, {
+                dim_in = {429}, dim_out = {429},
+                sub_layers = layer_repo,
+                connections = {
+                    ["<input>[1]"] = "blayer1[1]",
+                    ["blayer1[1]"] = "wlayer1[1]",
+                    ["wlayer1[1]"] = "blayer2[1]",
+                    ["blayer2[1]"] = "wlayer2[1]",
+                    ["wlayer2[1]"] = "<output>[1]"
+                }
+            }},
+            main = {{}, {
+                dim_in = {429, 1}, dim_out = {1},
+                sub_layers = layer_repo,
+                connections = {
+                    ["<input>[1]"] = "affine0[1]",
+                    ["affine0[1]"] = "sigmoid0[1]",
+                    ["sigmoid0[1]"] = "affine1[1]",
+                    ["affine1[1]"] = "sigmoid1[1]",
+                    ["sigmoid1[1]"] = "affine2[1]",
+                    ["affine2[1]"] = "sigmoid2[1]",
+                    ["sigmoid2[1]"] = "affine3[1]",
+                    ["affine3[1]"] = "sigmoid3[1]",
+                    ["sigmoid3[1]"] = "affine4[1]",
+                    ["affine4[1]"] = "sigmoid4[1]",
+                    ["sigmoid4[1]"] = "affine5[1]",
+                    ["affine5[1]"] = "sigmoid5[1]",
+                    ["sigmoid5[1]"] = "affine6[1]",
+                    ["affine6[1]"] = "sigmoid6[1]",
+                    ["sigmoid6[1]"] = "affine7[1]",
+                    ["affine7[1]"] = "ce_crit[1]",
+                    ["<input>[2]"] = "ce_crit[2]",
+                    ["ce_crit[1]"] = "<output>[1]"
+                }
+            }}
+        }
+    }, param_repo, gconf)
+    return layer_repo
+end
+
+function get_network(layer_repo)
+    return layer_repo:get_layer("main")
+end
+
+function make_readers(scp_file, layer_repo)
+    return {
+                {reader = nerv.TNetReader(gconf,
+                    {
+                        id = "main_scp",
+                        scp_file = scp_file,
+                        conf_file = gconf.htk_conf,
+                        frm_ext = gconf.frm_ext,
+                        mlfs = {
+                            phone_state = {
+                                file = "/slfs1/users/mfy43/swb_ivec/ref.mlf",
+                                format = "map",
+                                format_arg = "/slfs1/users/mfy43/swb_ivec/dict",
+                                dir = "*/",
+                                ext = "lab"
+                            }
+                        },
+                        global_transf = layer_repo:get_layer("global_transf")
+                    }),
+                data = {main_scp = 429, phone_state = 1}}
+            }
+end
+
+function make_buffer(readers)
+    return nerv.SGDBuffer(gconf,
+        {
+            buffer_size = gconf.buffer_size,
+            randomize = gconf.randomize,
+            readers = readers
+        })
+end
+
+function get_input_order()
+    return {"main_scp", "phone_state"}
+end
+
+function get_accuracy(layer_repo)
+    local ce_crit = layer_repo:get_layer("ce_crit")
+    return ce_crit.total_correct / ce_crit.total_frames * 100
+end
+
+function print_stat(layer_repo)
+    local ce_crit = layer_repo:get_layer("ce_crit")
+    nerv.info("*** training stat begin ***")
+    nerv.printf("cross entropy:\t\t%.8f\n", ce_crit.total_ce)
+    nerv.printf("correct:\t\t%d\n", ce_crit.total_correct)
+    nerv.printf("frames:\t\t\t%d\n", ce_crit.total_frames)
+    nerv.printf("err/frm:\t\t%.8f\n", ce_crit.total_ce / ce_crit.total_frames)
+    nerv.printf("accuracy:\t\t%.3f%%\n", get_accuracy(layer_repo))
+    nerv.info("*** training stat end ***")
+end
diff --git a/nerv/nn/layer_dag.lua b/nerv/nn/layer_dag.lua
index e5c1ac7..e9d4d86 100644
--- a/nerv/nn/layer_dag.lua
+++ b/nerv/nn/layer_dag.lua
@@ -112,7 +112,7 @@ function DAGLayer:__init(id, global_conf, layer_conf)
         end
     end
     for i = 1, #queue do
-        nerv.info("enqueued layer: %s", queue[i].layer.id)
+        nerv.info("enqueued layer: %s %s", queue[i].layer, queue[i].layer.id)
     end
 
     for id, ref in pairs(layers) do
@@ -125,6 +125,7 @@ function DAGLayer:__init(id, global_conf, layer_conf)
     self.layers = layers
     self.inputs = inputs
     self.outputs = outputs
+    self.id = id
     self.dim_in = dim_in
     self.dim_out = dim_out
     self.parsed_conn = parsed_conn
diff --git a/nerv/nn/layer_repo.lua b/nerv/nn/layer_repo.lua
index 602c37c..ef333a7 100644
--- a/nerv/nn/layer_repo.lua
+++ b/nerv/nn/layer_repo.lua
@@ -1,7 +1,12 @@
 local LayerRepo = nerv.class("nerv.LayerRepo")
 
 function LayerRepo:__init(layer_spec, param_repo, global_conf)
-    local layers = {}
+    self.layers = {}
+    self:add_layers(layer_spec, param_repo, global_conf)
+end
+
+function LayerRepo:add_layers(layer_spec, param_repo, global_conf)
+    local layers = self.layers
     for ltype, llist in pairs(layer_spec) do
         local layer_type = nerv.get_type(ltype)
         for id, spec in pairs(llist) do
@@ -22,7 +27,6 @@ function LayerRepo:__init(layer_spec, param_repo, global_conf)
             layers[id] = layer_type(id, global_conf, layer_config)
         end
     end
-    self.layers = layers
 end
 
 function LayerRepo:get_layer(lid)
-- 
cgit v1.2.3-70-g09d2


From 7082ba094be6ccbf97cfaf893ea437f31ced197b Mon Sep 17 00:00:00 2001
From: Determinant <ted.sybil@gmail.com>
Date: Thu, 6 Aug 2015 19:47:59 +0800
Subject: ...

---
 nerv/doc/gh-pages | 1 -
 1 file changed, 1 deletion(-)
 delete mode 160000 nerv/doc/gh-pages

diff --git a/nerv/doc/gh-pages b/nerv/doc/gh-pages
deleted file mode 160000
index 195d95b..0000000
--- a/nerv/doc/gh-pages
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit 195d95bb663258e3b1c3962f946db9c374018a8c
-- 
cgit v1.2.3-70-g09d2


From 4b3e8591816e553a4409f5fa95f5983e59ff711f Mon Sep 17 00:00:00 2001
From: Determinant <ted.sybil@gmail.com>
Date: Fri, 14 Aug 2015 15:02:34 +0800
Subject: add profiling for copy_rows_fromh_by_idx

---
 nerv/Makefile                      | 2 +-
 nerv/io/sgd_buffer.lua             | 4 +++-
 nerv/lib/matrix/generic/cumatrix.c | 2 ++
 3 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/nerv/Makefile b/nerv/Makefile
index 728d010..0b433d5 100644
--- a/nerv/Makefile
+++ b/nerv/Makefile
@@ -36,7 +36,7 @@ LUA_LIBS := matrix/init.lua io/init.lua init.lua \
 			io/sgd_buffer.lua 
 
 INCLUDE := -I $(LUA_INCDIR) -DLUA_USE_APICHECK
-CUDA_BASE := /usr/local/cuda-6.5
+CUDA_BASE := /usr/local/cuda-7.0
 #CUDA_BASE := /usr/local/cuda-5.0
 CUDA_INCLUDE := -I $(CUDA_BASE)/include/
 INCLUDE += $(CUDA_INCLUDE)
diff --git a/nerv/io/sgd_buffer.lua b/nerv/io/sgd_buffer.lua
index f4f7dfe..604fa07 100644
--- a/nerv/io/sgd_buffer.lua
+++ b/nerv/io/sgd_buffer.lua
@@ -41,7 +41,7 @@ function SGDBuffer:saturate()
                 buff.data:copy_from(buff.leftover, 0, lrow)
                 buff.leftover = nil
             end
-            nerv.printf("leftover: %d\n", lrow)
+            nerv.printf("buffer leftover: %d\n", lrow)
             reader.tail = lrow
             reader.has_leftover = false
         end
@@ -87,9 +87,11 @@ end
 function SGDBuffer:get_data()
     local batch_size = self.gconf.batch_size
     if self.head >= self.tail then -- buffer is empty
+        local t = os.clock()
         if not self:saturate() then
             return nil -- the remaining data cannot build a batch
         end
+        nerv.info("%.3fs to fill the buffer\n", os.clock() - t)
     end
     if self.head + batch_size > self.tail then
         return nil -- the remaining data cannot build a batch
diff --git a/nerv/lib/matrix/generic/cumatrix.c b/nerv/lib/matrix/generic/cumatrix.c
index 40a0030..2cb3563 100644
--- a/nerv/lib/matrix/generic/cumatrix.c
+++ b/nerv/lib/matrix/generic/cumatrix.c
@@ -321,6 +321,7 @@ void nerv_matrix_(copy_rows_fromh_by_idx)(Matrix *a, const Matrix *b,
         NERV_EXIT_STATUS(status, MAT_IDX_VECTOR_EXP, 0);
     if (a->ncol != b->ncol)
         NERV_EXIT_STATUS(status, MAT_MISMATCH_DIM, 0);
+    PROFILE_START
     cudaStream_t *streams = (cudaStream_t*)malloc(sizeof(cudaStream_t) * nrow);
     for (i = 0; i < nrow; i++)
     {
@@ -339,6 +340,7 @@ void nerv_matrix_(copy_rows_fromh_by_idx)(Matrix *a, const Matrix *b,
         CUDA_SAFE_CALL(cudaStreamDestroy(streams[i]), status);
     }
     free(streams);
+    PROFILE_STOP
     NERV_SET_STATUS(status, NERV_NORMAL, 0);
 }
 
-- 
cgit v1.2.3-70-g09d2


From 6cad1b1947fb2ba237b0e843cb7900cdc1653294 Mon Sep 17 00:00:00 2001
From: Determinant <ted.sybil@gmail.com>
Date: Fri, 14 Aug 2015 15:22:07 +0800
Subject: use default cuda library path

---
 nerv/Makefile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/nerv/Makefile b/nerv/Makefile
index 0b433d5..b5d26bd 100644
--- a/nerv/Makefile
+++ b/nerv/Makefile
@@ -36,8 +36,8 @@ LUA_LIBS := matrix/init.lua io/init.lua init.lua \
 			io/sgd_buffer.lua 
 
 INCLUDE := -I $(LUA_INCDIR) -DLUA_USE_APICHECK
-CUDA_BASE := /usr/local/cuda-7.0
-#CUDA_BASE := /usr/local/cuda-5.0
+#CUDA_BASE := /usr/local/cuda-7.0
+CUDA_BASE := /usr/local/cuda
 CUDA_INCLUDE := -I $(CUDA_BASE)/include/
 INCLUDE += $(CUDA_INCLUDE)
 
-- 
cgit v1.2.3-70-g09d2


From 47dba09eeba2463a804e89c9d0aed7b30cc92b4e Mon Sep 17 00:00:00 2001
From: Determinant <ted.sybil@gmail.com>
Date: Tue, 25 Aug 2015 11:38:57 +0800
Subject: use more general implementation for mat:create

---
 nerv/matrix/generic/cumatrix.c | 10 ----------
 nerv/matrix/init.lua           |  4 ++++
 2 files changed, 4 insertions(+), 10 deletions(-)

diff --git a/nerv/matrix/generic/cumatrix.c b/nerv/matrix/generic/cumatrix.c
index 4bdf5f0..ab7f7c4 100644
--- a/nerv/matrix/generic/cumatrix.c
+++ b/nerv/matrix/generic/cumatrix.c
@@ -43,15 +43,6 @@ static int nerv_matrix_(lua_mul)(lua_State *L) {
     return 0;
 }
 
-static int nerv_matrix_(lua_create)(lua_State *L) {
-    Status status;
-    Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname));
-    Matrix *b = nerv_matrix_(create)(a->nrow, a->ncol, &status);
-    NERV_LUA_CHECK_STATUS(L, status);
-    luaT_pushudata(L, b, nerv_matrix_(tname));
-    return 1;
-}
-
 static int nerv_matrix_(lua_sigmoid)(lua_State *L) {
     Status status;
     Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname));
@@ -289,7 +280,6 @@ static int nerv_matrix_(lua_scale_rows_by_row)(lua_State *L) {
 }
 
 static const luaL_Reg nerv_matrix_(extra_methods)[] = {
-    {"create", nerv_matrix_(lua_create)},
     {"colsum", nerv_matrix_(lua_colsum)},
     {"colsame", nerv_matrix_(lua_colsame)},
     {"rowsum", nerv_matrix_(lua_rowsum)},
diff --git a/nerv/matrix/init.lua b/nerv/matrix/init.lua
index f230e9f..ae9b884 100644
--- a/nerv/matrix/init.lua
+++ b/nerv/matrix/init.lua
@@ -45,6 +45,10 @@ function nerv.Matrix:generate(gen)
     end
 end
 
+function nerv.Matrix:create()
+    return self.__constructor(self:nrow(), self:ncol())
+end
+
 nerv.MMatrixInt.fmt = "%d "
 
 function nerv.CuMatrix:__add__(b)
-- 
cgit v1.2.3-70-g09d2


From ed2a4148dbb9c18f428571b3e2970d7b2adfb058 Mon Sep 17 00:00:00 2001
From: Determinant <ted.sybil@gmail.com>
Date: Tue, 25 Aug 2015 11:47:47 +0800
Subject: add optional parameters to mat:create

---
 nerv/matrix/init.lua | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/nerv/matrix/init.lua b/nerv/matrix/init.lua
index ae9b884..1091d7e 100644
--- a/nerv/matrix/init.lua
+++ b/nerv/matrix/init.lua
@@ -45,8 +45,8 @@ function nerv.Matrix:generate(gen)
     end
 end
 
-function nerv.Matrix:create()
-    return self.__constructor(self:nrow(), self:ncol())
+function nerv.Matrix:create(nrow, ncol)
+    return self.__constructor(nrow or self:nrow(), ncol or self:ncol())
 end
 
 nerv.MMatrixInt.fmt = "%d "
-- 
cgit v1.2.3-70-g09d2


From e81e9832ec4f2ad031fd42b5018cea134e8cda7e Mon Sep 17 00:00:00 2001
From: Determinant <ted.sybil@gmail.com>
Date: Wed, 26 Aug 2015 14:26:54 +0800
Subject: move global_transf to asr_trainer.lua

---
 nerv/examples/asr_trainer.lua        | 23 +++++++++++++++++++----
 nerv/examples/swb_baseline.lua       |  7 ++++---
 nerv/examples/swb_baseline_basic.lua |  7 ++++---
 nerv/io/sgd_buffer.lua               |  2 +-
 nerv/layer/mse.lua                   |  2 +-
 nerv/nn/layer_dag.lua                | 27 +++++++++++++++++++++++++++
 6 files changed, 56 insertions(+), 12 deletions(-)

diff --git a/nerv/examples/asr_trainer.lua b/nerv/examples/asr_trainer.lua
index dcadfa3..5a50542 100644
--- a/nerv/examples/asr_trainer.lua
+++ b/nerv/examples/asr_trainer.lua
@@ -3,6 +3,7 @@ function build_trainer(ifname)
     param_repo:import(ifname, nil, gconf)
     local layer_repo = make_layer_repo(param_repo)
     local network = get_network(layer_repo)
+    local global_transf = get_global_transf(layer_repo)
     local input_order = get_input_order()
     local iterative_trainer = function (prefix, scp_file, bp)
         gconf.randomize = bp
@@ -24,15 +25,29 @@ function build_trainer(ifname)
                 -- break
             end
             local input = {}
---            if gconf.cnt == 100 then break end
-            for i, id in ipairs(input_order) do
+--            if gconf.cnt == 1000 then break end
+            for i, e in ipairs(input_order) do
+                local id = e.id
                 if data[id] == nil then
                     nerv.error("input data %s not found", id)
                 end
-                table.insert(input, data[id])
+                local transformed
+                if e.global_transf then
+                    transformed = nerv.speech_utils.global_transf(data[id],
+                                        global_transf,
+                                        gconf.frm_ext or 0,
+                                        gconf.frm_trim or 0,
+                                        gconf)
+                else
+                    transformed = data[id]
+                end
+                table.insert(input, transformed)
             end
             local output = {nerv.CuMatrixFloat(gconf.batch_size, 1)}
-            err_output = {input[1]:create()}
+            err_output = {}
+            for i = 1, #input do
+                table.insert(err_output, input[i]:create())
+            end
             network:propagate(input, output)
             if bp then
                 network:back_propagate(err_input, err_output, input, output)
diff --git a/nerv/examples/swb_baseline.lua b/nerv/examples/swb_baseline.lua
index 0e9f897..bbc6467 100644
--- a/nerv/examples/swb_baseline.lua
+++ b/nerv/examples/swb_baseline.lua
@@ -3,6 +3,7 @@ gconf = {lrate = 0.8, wcost = 1e-6, momentum = 0.9,
         cumat_type = nerv.CuMatrixFloat,
         mmat_type = nerv.MMatrixFloat,
         frm_ext = 5,
+        frm_trim = 5,
         tr_scp = "/slfs1/users/mfy43/swb_ivec/train_bp.scp",
         cv_scp = "/slfs1/users/mfy43/swb_ivec/train_cv.scp",
         htk_conf = "/slfs1/users/mfy43/swb_ivec/plp_0_d_a.conf",
@@ -161,8 +162,7 @@ function make_readers(scp_file, layer_repo)
                                 dir = "*/",
                                 ext = "lab"
                             }
-                        },
-                        global_transf = layer_repo:get_layer("global_transf")
+                        }
                     }),
                 data = {main_scp = 429, phone_state = 1}}
             }
@@ -178,7 +178,8 @@ function make_buffer(readers)
 end
 
 function get_input_order()
-    return {"main_scp", "phone_state"}
+    return {{id = "main_scp", global_transf = true},
+            {id = "phone_state"}}
 end
 
 function get_accuracy(layer_repo)
diff --git a/nerv/examples/swb_baseline_basic.lua b/nerv/examples/swb_baseline_basic.lua
index c47ec3e..71f04a3 100644
--- a/nerv/examples/swb_baseline_basic.lua
+++ b/nerv/examples/swb_baseline_basic.lua
@@ -3,6 +3,7 @@ gconf = {lrate = 0.8, wcost = 1e-6, momentum = 0.9,
         cumat_type = nerv.CuMatrixFloat,
         mmat_type = nerv.MMatrixFloat,
         frm_ext = 5,
+        frm_trim = 5,
         tr_scp = "/slfs1/users/mfy43/swb_ivec/train_bp.scp",
         cv_scp = "/slfs1/users/mfy43/swb_ivec/train_cv.scp",
         htk_conf = "/slfs1/users/mfy43/swb_ivec/plp_0_d_a.conf",
@@ -124,8 +125,7 @@ function make_readers(scp_file, layer_repo)
                                 dir = "*/",
                                 ext = "lab"
                             }
-                        },
-                        global_transf = layer_repo:get_layer("global_transf")
+                        }
                     }),
                 data = {main_scp = 429, phone_state = 1}}
             }
@@ -141,7 +141,8 @@ function make_buffer(readers)
 end
 
 function get_input_order()
-    return {"main_scp", "phone_state"}
+    return {{id = "main_scp", global_transf = true},
+            {id = "phone_state"}}
 end
 
 function get_accuracy(layer_repo)
diff --git a/nerv/io/sgd_buffer.lua b/nerv/io/sgd_buffer.lua
index 604fa07..f9d281c 100644
--- a/nerv/io/sgd_buffer.lua
+++ b/nerv/io/sgd_buffer.lua
@@ -91,7 +91,7 @@ function SGDBuffer:get_data()
         if not self:saturate() then
             return nil -- the remaining data cannot build a batch
         end
-        nerv.info("%.3fs to fill the buffer\n", os.clock() - t)
+        nerv.info("%.3fs to fill the buffer", os.clock() - t)
     end
     if self.head + batch_size > self.tail then
         return nil -- the remaining data cannot build a batch
diff --git a/nerv/layer/mse.lua b/nerv/layer/mse.lua
index 9a97add..2516998 100644
--- a/nerv/layer/mse.lua
+++ b/nerv/layer/mse.lua
@@ -34,7 +34,7 @@ function MSELayer:propagate(input, output)
     if output[1] ~= nil then
         output[1]:copy_fromd(mse_sum)
     end
-    self.total_mse = self.total_mse + mse_sum:colsum()[0]
+    self.total_mse = self.total_mse + mse_sum:colsum()[0][0]
     self.total_frames = self.total_frames + mse_sum:nrow()
 end
 
diff --git a/nerv/nn/layer_dag.lua b/nerv/nn/layer_dag.lua
index e9d4d86..25297c2 100644
--- a/nerv/nn/layer_dag.lua
+++ b/nerv/nn/layer_dag.lua
@@ -254,3 +254,30 @@ function DAGLayer:get_params()
     end
     return nerv.ParamRepo.merge(param_repos)
 end
+
+DAGLayer.PORT_TYPES = {
+    INPUT = {},
+    OUTPUT = {},
+    ERR_INPUT = {},
+    ERR_OUTPUT = {}
+}
+
+function DAGLayer:get_intermediate(id, port_type)
+    if id == "<input>" or id == "<output>" then
+        nerv.error("an actual real layer id is expected")
+    end
+    local layer = layers[id]
+    if layer == nil then
+        nerv.error("layer id %s not found", id)
+    end
+    if port_type == DAGLayer.PORT_TYPES.INPUT then
+        return layer.inputs
+    elseif port_type == DAGLayer.PORT_TYPES.OUTPUT then
+        return layer.outputs
+    elseif port_type == DAGLayer.PORT_TYPES.ERR_INPUT then
+        return layer.err_inputs
+    elseif port_type == DAGLayer.PORT_TYPES.ERR_OUTPUT then
+        return layer.err_outputs
+    end
+    nerv.error("unrecognized port type")
+end
-- 
cgit v1.2.3-70-g09d2


From e97b97e4c684e7f26064bcc0a6440ac5d6cddc47 Mon Sep 17 00:00:00 2001
From: Determinant <ted.sybil@gmail.com>
Date: Wed, 26 Aug 2015 15:43:00 +0800
Subject: ...

---
 nerv/examples/asr_trainer.lua | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/nerv/examples/asr_trainer.lua b/nerv/examples/asr_trainer.lua
index 5a50542..69cfeed 100644
--- a/nerv/examples/asr_trainer.lua
+++ b/nerv/examples/asr_trainer.lua
@@ -35,8 +35,7 @@ function build_trainer(ifname)
                 if e.global_transf then
                     transformed = nerv.speech_utils.global_transf(data[id],
                                         global_transf,
-                                        gconf.frm_ext or 0,
-                                        gconf.frm_trim or 0,
+                                        gconf.frm_ext or 0, 0,
                                         gconf)
                 else
                     transformed = data[id]
-- 
cgit v1.2.3-70-g09d2


From 29d1c98bd5dc7608919071efd430fd3a9ac6dc58 Mon Sep 17 00:00:00 2001
From: Yimmon Zhuang <yimmon.zhuang@gmail.com>
Date: Thu, 27 Aug 2015 13:29:29 +0800
Subject: .

---
 nerv/Makefile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/nerv/Makefile b/nerv/Makefile
index 728d010..7ed140d 100644
--- a/nerv/Makefile
+++ b/nerv/Makefile
@@ -36,8 +36,8 @@ LUA_LIBS := matrix/init.lua io/init.lua init.lua \
 			io/sgd_buffer.lua 
 
 INCLUDE := -I $(LUA_INCDIR) -DLUA_USE_APICHECK
-CUDA_BASE := /usr/local/cuda-6.5
-#CUDA_BASE := /usr/local/cuda-5.0
+#CUDA_BASE := /usr/local/cuda-6.5
+CUDA_BASE := /usr/local/cuda-5.0
 CUDA_INCLUDE := -I $(CUDA_BASE)/include/
 INCLUDE += $(CUDA_INCLUDE)
 
-- 
cgit v1.2.3-70-g09d2


From 8bf9c7575ffeeabb3924e9e02a35afe187071fe2 Mon Sep 17 00:00:00 2001
From: Determinant <ted.sybil@gmail.com>
Date: Thu, 27 Aug 2015 17:38:09 +0800
Subject: update embedding example to use new global_transf()

---
 embedding_example/setup_nerv.lua | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/embedding_example/setup_nerv.lua b/embedding_example/setup_nerv.lua
index 5ade950..d80c306 100644
--- a/embedding_example/setup_nerv.lua
+++ b/embedding_example/setup_nerv.lua
@@ -11,12 +11,11 @@ local batch_size = 1
 network:init(batch_size)
 
 function propagator(input, output)
-    local transformed = nerv.speech_utils.global_transf(input,
-                            global_transf, 0, gconf) -- preprocessing
-    local gpu_input = nerv.CuMatrixFloat(transformed:nrow(), transformed:ncol())
+    local transformed = nerv.speech_utils.global_transf(
+                            gconf.cumat_type.new_from_host(input),
+                            global_transf, 0, 0, gconf) -- preprocessing
+    local gpu_input = transformed
     local gpu_output = nerv.CuMatrixFloat(output:nrow(), output:ncol())
-    print(transformed)
-    gpu_input:copy_fromh(transformed)
     network:propagate({gpu_input}, {gpu_output})
     gpu_output:copy_toh(output)
     -- collect garbage in-time to save GPU memory
-- 
cgit v1.2.3-70-g09d2


From 1a9f63e351582f54fec7817927168cb1dbb0c1d6 Mon Sep 17 00:00:00 2001
From: Determinant <ted.sybil@gmail.com>
Date: Fri, 28 Aug 2015 13:21:52 +0800
Subject: support gpu buffering

---
 nerv/examples/swb_baseline.lua      |  3 ++-
 nerv/io/sgd_buffer.lua              | 34 +++++++++++++++++++--------
 nerv/lib/matrix/cukernel.h          |  2 ++
 nerv/lib/matrix/cumatrix.c          |  1 +
 nerv/lib/matrix/cumatrix.h          |  1 +
 nerv/lib/matrix/generic/cukernel.cu | 20 ++++++++++++++++
 nerv/lib/matrix/generic/cumatrix.c  | 19 +++++++++++++--
 nerv/lib/matrix/generic/cumatrix.h  |  2 ++
 nerv/lib/matrix/mmatrix.c           | 37 ++++++++++++++---------------
 nerv/lib/matrix/mmatrix.h           |  3 ++-
 nerv/matrix/generic/cumatrix.c      | 22 ++++++++++++++++--
 nerv/matrix/mmatrix.c               | 46 +++++++++++++++++++------------------
 12 files changed, 135 insertions(+), 55 deletions(-)

diff --git a/nerv/examples/swb_baseline.lua b/nerv/examples/swb_baseline.lua
index bbc6467..8015884 100644
--- a/nerv/examples/swb_baseline.lua
+++ b/nerv/examples/swb_baseline.lua
@@ -173,7 +173,8 @@ function make_buffer(readers)
         {
             buffer_size = gconf.buffer_size,
             randomize = gconf.randomize,
-            readers = readers
+            readers = readers,
+            use_gpu = true
         })
 end
 
diff --git a/nerv/io/sgd_buffer.lua b/nerv/io/sgd_buffer.lua
index f9d281c..3f854f0 100644
--- a/nerv/io/sgd_buffer.lua
+++ b/nerv/io/sgd_buffer.lua
@@ -8,13 +8,29 @@ function SGDBuffer:__init(global_conf, buffer_conf)
     if self.randomize == nil then
         self.randomize = false
     end
+    local cumat_type = global_conf.cumat_type
+    if buffer_conf.use_gpu then
+        self.mat_type = cumat_type
+        self.copy_rows_from_by_idx = cumat_type.copy_rows_fromd_by_idx
+        self.copy_from = cumat_type.copy_fromd
+        self.copy_from_reader = cumat_type.copy_fromh
+        self.perm_gen = function (x)
+            return cumat_type.new_from_host(nerv.MMatrixFloat.perm_gen(x))
+        end
+    else
+        self.mat_type = global_conf.mmat_type
+        self.copy_rows_from_by_idx = cumat_type.copy_rows_fromh_by_idx
+        self.copy_from = cumat_type.copy_fromh
+        self.perm_gen = nerv.MMatrixFloat.perm_gen
+        self.copy_from_reader = self.mat_type.copy_from
+    end
     self.head = 0
     self.tail = 0
     self.readers = {}
     for i, reader_spec in ipairs(buffer_conf.readers) do
         local buffs = {}
         for id, width in pairs(reader_spec.data) do
-            buffs[id] = {data = global_conf.mmat_type(self.buffer_size, width),
+            buffs[id] = {data = self.mat_type(self.buffer_size, width),
                         leftover = nil,
                         width = width}
         end
@@ -41,7 +57,7 @@ function SGDBuffer:saturate()
                 buff.data:copy_from(buff.leftover, 0, lrow)
                 buff.leftover = nil
             end
-            nerv.printf("buffer leftover: %d\n", lrow)
+            nerv.info("buffer leftover: %d\n", lrow)
             reader.tail = lrow
             reader.has_leftover = false
         end
@@ -65,21 +81,21 @@ function SGDBuffer:saturate()
                     if d == nil then
                         nerv.error("reader does not provide data for %s", id)
                     end
-                    buff.leftover = self.gconf.mmat_type(drow - remain,
-                                                        buff.width)
-                    buff.leftover:copy_from(d, remain, drow)
+                    buff.leftover = self.mat_type(drow - remain,
+                                                  buff.width)
+                    self.copy_from_reader(buff.leftover, d, remain, drow)
                 end
                 drow = remain
                 reader.has_leftover = true
             end
             for id, buff in pairs(reader.buffs) do
-                buff.data:copy_from(data[id], 0, drow, reader.tail)
+                self.copy_from_reader(buff.data, data[id], 0, drow, reader.tail)
             end
             reader.tail = reader.tail + drow
         end
         self.tail = math.min(self.tail, reader.tail)
     end
-    self.rand_map = nerv.MMatrixInt.perm_gen(self.tail) -- generate shuffled index
+    self.rand_map = self.perm_gen(self.tail) -- generate shuffled index
     collectgarbage("collect")
     return self.tail >= self.gconf.batch_size
 end
@@ -101,9 +117,9 @@ function SGDBuffer:get_data()
         for id, buff in pairs(reader.buffs) do
             local batch = self.gconf.cumat_type(batch_size, buff.width)
             if self.randomize then
-                batch:copy_rows_fromh_by_idx(buff.data, self.rand_map, self.head)
+                self.copy_rows_from_by_idx(batch, buff.data, self.rand_map, self.head)
             else
-                batch:copy_fromh(buff.data, self.head, self.head + batch_size)
+                self.copy_from(batch, buff.data, self.head, self.head + batch_size)
             end
             res[id] = batch
         end
diff --git a/nerv/lib/matrix/cukernel.h b/nerv/lib/matrix/cukernel.h
index 7bb4c2c..2126c6f 100644
--- a/nerv/lib/matrix/cukernel.h
+++ b/nerv/lib/matrix/cukernel.h
@@ -18,4 +18,6 @@ void cudak_(cuda_rearrange_frm)(const Matrix *a, Matrix *b, int step);
 void cudak_(cuda_scale_rows_by_row)(const Matrix *a, Matrix *b);
 void cudak_(cuda_scale_rows_by_col)(const Matrix *a, Matrix *b);
 void cudak_(cuda_decompress)(const Matrix *a, Matrix *b);
+void cudak_(cuda_copy_rows_by_idx)(const Matrix *a, Matrix *b,
+                                    const Matrix *idx, int b_begin);
 #endif
diff --git a/nerv/lib/matrix/cumatrix.c b/nerv/lib/matrix/cumatrix.c
index aa81bfc..ff1168d 100644
--- a/nerv/lib/matrix/cumatrix.c
+++ b/nerv/lib/matrix/cumatrix.c
@@ -49,6 +49,7 @@ void nerv_cumatrix_init() {
 #define NERV_CUBLAS_(NAME) cublasS##NAME
 #define MATRIX_CUMATRIX_HOST_TNAME nerv_matrix_host_float_tname
 #include "generic/cumatrix.c"
+
 #undef NERV_CUBLAS_
 #undef cudak_
 #undef nerv_matrix_
diff --git a/nerv/lib/matrix/cumatrix.h b/nerv/lib/matrix/cumatrix.h
index 9f71507..e6def66 100644
--- a/nerv/lib/matrix/cumatrix.h
+++ b/nerv/lib/matrix/cumatrix.h
@@ -1,5 +1,6 @@
 #ifndef NERV_CUMATRIX_H
 #define NERV_CUMATRIX_H
+#include "matrix.h"
 void nerv_cumatrix_print_profile();
 void nerv_cumatrix_clear_profile();
 void nerv_cumatrix_init();
diff --git a/nerv/lib/matrix/generic/cukernel.cu b/nerv/lib/matrix/generic/cukernel.cu
index e337798..08feb59 100644
--- a/nerv/lib/matrix/generic/cukernel.cu
+++ b/nerv/lib/matrix/generic/cukernel.cu
@@ -284,6 +284,15 @@ __global__ void cudak_(gen_col_idx)(MATRIX_ELEM *b,
     b[j + i * stride] = j;
 }
 
+__global__ void cudak_(copy_rows_by_idx)(const MATRIX_ELEM *a, MATRIX_ELEM *b,
+                                    const MATRIX_ELEM *idx, int b_begin,
+                                    int nrow, int ncol, int stride) {
+    int j = blockIdx.x * blockDim.x + threadIdx.x;
+    int i = blockIdx.y * blockDim.y + threadIdx.y;
+    if (i >= nrow || j >= ncol) return;
+    b[j + i * stride] = a[j + lrintf(idx[i]) * stride];
+}
+
 extern "C" {
 #include "../cukernel.h"
     void cudak_(cuda_log_elem)(const Matrix *a, Matrix *b) {
@@ -589,5 +598,16 @@ extern "C" {
              b->stride / sizeof(MATRIX_ELEM));
         cudaStreamSynchronize(0);
     }
+
+    void cudak_(cuda_copy_rows_by_idx)(const Matrix *a, Matrix *b,
+                                        const Matrix *idx, int b_begin) {
+        dim3 threadsPerBlock(CUDA_THREADS_NN, 1);
+        dim3 numBlocks(CEIL_DIV(b->ncol, threadsPerBlock.x), b->nrow);
+        cudak_(copy_rows_by_idx)<<<numBlocks, threadsPerBlock>>> \
+            (MATRIX_ELEM_PTR(a), MATRIX_ELEM_PTR(b),
+            MATRIX_ELEM_PTR(idx) + b_begin,
+             b_begin, b->nrow, b->ncol, b->stride / sizeof(MATRIX_ELEM));
+        cudaStreamSynchronize(0);
+    }
 }
 #endif
diff --git a/nerv/lib/matrix/generic/cumatrix.c b/nerv/lib/matrix/generic/cumatrix.c
index 2cb3563..770e503 100644
--- a/nerv/lib/matrix/generic/cumatrix.c
+++ b/nerv/lib/matrix/generic/cumatrix.c
@@ -315,7 +315,7 @@ void nerv_matrix_(copy_rows_fromh_by_idx)(Matrix *a, const Matrix *b,
     long nrow = a->nrow;
     if (!(0 <= b_begin && b_begin + nrow <= idx->ncol))
         NERV_EXIT_STATUS(status, MAT_INVALID_COPY_INTERVAL, 0);
-    long *idx_ptr = idx->data.i;
+    float *idx_ptr = idx->data.f;
     int i;
     if (idx->nrow != 1)
         NERV_EXIT_STATUS(status, MAT_IDX_VECTOR_EXP, 0);
@@ -325,7 +325,7 @@ void nerv_matrix_(copy_rows_fromh_by_idx)(Matrix *a, const Matrix *b,
     cudaStream_t *streams = (cudaStream_t*)malloc(sizeof(cudaStream_t) * nrow);
     for (i = 0; i < nrow; i++)
     {
-        int src_row = idx_ptr[b_begin + i];
+        int src_row = (int)idx_ptr[b_begin + i];
         if (!(0 <= src_row && src_row < b->nrow))
             NERV_EXIT_STATUS(status, MAT_INVALID_IDX, 0);
         CUDA_SAFE_CALL(cudaStreamCreate(streams + i), status);
@@ -344,6 +344,21 @@ void nerv_matrix_(copy_rows_fromh_by_idx)(Matrix *a, const Matrix *b,
     NERV_SET_STATUS(status, NERV_NORMAL, 0);
 }
 
+void nerv_matrix_(copy_rows_fromd_by_idx)(Matrix *a, const Matrix *b,
+                            const Matrix *idx, int b_begin, Status *status) {
+    long nrow = a->nrow;
+    if (!(0 <= b_begin && b_begin + nrow <= idx->ncol))
+        NERV_EXIT_STATUS(status, MAT_INVALID_COPY_INTERVAL, 0);
+    if (idx->nrow != 1)
+        NERV_EXIT_STATUS(status, MAT_IDX_VECTOR_EXP, 0);
+    if (a->ncol != b->ncol)
+        NERV_EXIT_STATUS(status, MAT_MISMATCH_DIM, 0);
+    PROFILE_START
+    cudak_(cuda_copy_rows_by_idx)(b, a, idx, b_begin);
+    PROFILE_STOP
+    NERV_SET_STATUS(status, NERV_NORMAL, 0);
+}
+
 void nerv_matrix_(expand_frm)(Matrix *a, const Matrix *b,
                             int context, Status *status) {
     if (a->nrow != b->nrow)
diff --git a/nerv/lib/matrix/generic/cumatrix.h b/nerv/lib/matrix/generic/cumatrix.h
index 3f1f8a3..04e8c5a 100644
--- a/nerv/lib/matrix/generic/cumatrix.h
+++ b/nerv/lib/matrix/generic/cumatrix.h
@@ -40,6 +40,8 @@ void nerv_matrix_(log_elem)(Matrix *b, const Matrix *a, Status *status);
 Matrix *nerv_matrix_(decompress)(const Matrix *a, int orig_col, Status *status);
 void nerv_matrix_(copy_rows_fromh_by_idx)(Matrix *a, const Matrix *b,
                             const Matrix *idx, int b_begin, Status *status);
+void nerv_matrix_(copy_rows_fromd_by_idx)(Matrix *a, const Matrix *b,
+                            const Matrix *idx, int b_begin, Status *status);
 
 void nerv_matrix_(expand_frm)(Matrix *a, const Matrix *b,
                             int context, Status *status);
diff --git a/nerv/lib/matrix/mmatrix.c b/nerv/lib/matrix/mmatrix.c
index 94f1ea8..b8157eb 100644
--- a/nerv/lib/matrix/mmatrix.c
+++ b/nerv/lib/matrix/mmatrix.c
@@ -7,6 +7,25 @@
 #define nerv_matrix_(NAME) nerv_matrix_host_float_##NAME
 #include "generic/matrix.h"
 #include "generic/mmatrix.c"
+
+Matrix *nerv_matrix_(perm_gen)(int ncol, Status *status) {
+    int i;
+    Matrix *self = nerv_matrix_(create)(1, ncol, status);
+    if (status->err_code != NERV_NORMAL)
+        return NULL;
+    float *prow = self->data.f;
+    for (i = 0; i < ncol; i++)
+        prow[i] = i;
+    for (i = ncol - 1; i >= 0; i--)
+    {
+        size_t j = rand() % (i + 1);
+        float tmp = prow[i];
+        prow[i] = prow[j];
+        prow[j] = tmp;
+    }
+    return self;
+}
+
 #undef nerv_matrix_
 #undef host_matrix_
 #undef MATRIX_USE_FLOAT
@@ -33,21 +52,3 @@
 #define host_matrix_(NAME) host_matrix_int_##NAME
 #define nerv_matrix_(NAME) nerv_matrix_host_int_##NAME
 #include "generic/mmatrix.c"
-
-Matrix *nerv_matrix_(perm_gen)(int ncol, Status *status) {
-    int i;
-    Matrix *self = nerv_matrix_(create)(1, ncol, status);
-    if (status->err_code != NERV_NORMAL)
-        return NULL;
-    long *prow = self->data.i;
-    for (i = 0; i < ncol; i++)
-        prow[i] = i;
-    for (i = ncol - 1; i >= 0; i--)
-    {
-        size_t j = rand() % (i + 1);
-        long tmp = prow[i];
-        prow[i] = prow[j];
-        prow[j] = tmp;
-    }
-    return self;
-}
diff --git a/nerv/lib/matrix/mmatrix.h b/nerv/lib/matrix/mmatrix.h
index df91e4c..31e7984 100644
--- a/nerv/lib/matrix/mmatrix.h
+++ b/nerv/lib/matrix/mmatrix.h
@@ -1,4 +1,5 @@
 #ifndef NERV_MMATRIX_H
 #define NERV_MMATRIX_H
-Matrix *nerv_matrix_(perm_gen)(int ncol, Status *status);
+#include "matrix.h"
+Matrix *nerv_matrix_host_float_perm_gen(int ncol, Status *status);
 #endif
diff --git a/nerv/matrix/generic/cumatrix.c b/nerv/matrix/generic/cumatrix.c
index ab7f7c4..08cb4c2 100644
--- a/nerv/matrix/generic/cumatrix.c
+++ b/nerv/matrix/generic/cumatrix.c
@@ -228,12 +228,12 @@ static int nerv_matrix_(lua_decompress)(lua_State *L) {
     return 1;
 }
 
-extern const char *nerv_matrix_host_int_tname;
+extern const char *nerv_matrix_host_float_tname;
 static int nerv_matrix_(lua_copy_rows_fromh_by_idx)(lua_State *L) {
     Status status;
     Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname));
     const Matrix *b = luaT_checkudata(L, 2, MATRIX_CUMATRIX_HOST_TNAME);
-    const Matrix *idx = luaT_checkudata(L, 3, nerv_matrix_host_int_tname);
+    const Matrix *idx = luaT_checkudata(L, 3, nerv_matrix_host_float_tname);
     long nrow = a->nrow;
     int b_begin = lua_gettop(L) > 3 ? luaL_checkinteger(L, 4) : 0;
     nerv_matrix_(copy_rows_fromh_by_idx)(a, b, idx, b_begin, &status);
@@ -241,6 +241,18 @@ static int nerv_matrix_(lua_copy_rows_fromh_by_idx)(lua_State *L) {
     return 0;
 }
 
+static int nerv_matrix_(lua_copy_rows_fromd_by_idx)(lua_State *L) {
+    Status status;
+    Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname));
+    const Matrix *b = luaT_checkudata(L, 2, nerv_matrix_(tname));
+    const Matrix *idx = luaT_checkudata(L, 3, nerv_matrix_(tname));
+    long nrow = a->nrow;
+    int b_begin = lua_gettop(L) > 3 ? luaL_checkinteger(L, 4) : 0;
+    nerv_matrix_(copy_rows_fromd_by_idx)(a, b, idx, b_begin, &status);
+    NERV_LUA_CHECK_STATUS(L, status);
+    return 0;
+}
+
 static int nerv_matrix_(lua_expand_frm)(lua_State *L) {
     Status status;
     Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname));
@@ -290,6 +302,8 @@ static const luaL_Reg nerv_matrix_(extra_methods)[] = {
     /* in-place calc */
     {"copy_fromh", nerv_matrix_(lua_copy_fromh)},
     {"copy_fromd", nerv_matrix_(lua_copy_fromd)},
+    /* alias for copy_fromd */
+    {"copy_from", nerv_matrix_(lua_copy_fromd)},
     {"copy_toh", nerv_matrix_(lua_copy_toh)},
     {"add", nerv_matrix_(lua_add)},
     {"mul", nerv_matrix_(lua_mul)},
@@ -302,6 +316,7 @@ static const luaL_Reg nerv_matrix_(extra_methods)[] = {
     {"mul_elem", nerv_matrix_(lua_mul_elem)},
     {"log_elem", nerv_matrix_(lua_log_elem)},
     {"copy_rows_fromh_by_idx", nerv_matrix_(lua_copy_rows_fromh_by_idx)},
+    {"copy_rows_fromd_by_idx", nerv_matrix_(lua_copy_rows_fromd_by_idx)},
     {"expand_frm", nerv_matrix_(lua_expand_frm)},
     {"rearrange_frm", nerv_matrix_(lua_rearrange_frm)},
     {"scale_rows_by_row", nerv_matrix_(lua_scale_rows_by_row)},
@@ -311,6 +326,9 @@ static const luaL_Reg nerv_matrix_(extra_methods)[] = {
 
 static void cuda_matrix_(init)(lua_State *L) {
     luaN_append_methods(L, nerv_matrix_(extra_methods));
+#ifdef CUMATRIX_INIT
+    CUMATRIX_INIT(L);
+#endif
 }
 
 int nerv_matrix_(lua_get_elem)(lua_State *L) {
diff --git a/nerv/matrix/mmatrix.c b/nerv/matrix/mmatrix.c
index 5561572..961059c 100644
--- a/nerv/matrix/mmatrix.c
+++ b/nerv/matrix/mmatrix.c
@@ -16,7 +16,30 @@ void nerv_lua_mmatrix_init(lua_State *L) {
 #define host_matrix_(NAME) host_matrix_float_##NAME
 #define nerv_matrix_(NAME) nerv_matrix_host_float_##NAME
 const char *nerv_matrix_(tname) = "nerv.MMatrixFloat";
+#define MMATRIX_INIT(L) host_matrix_(init_extra)(L)
+
+static const luaL_Reg nerv_matrix_(extra_methods_int)[];
+static void host_matrix_(init_extra)(lua_State *L) {
+    luaN_append_methods(L, nerv_matrix_(extra_methods_int));
+}
+
 #include "generic/mmatrix.c"
+#include "../lib/matrix/mmatrix.h"
+
+static int nerv_matrix_(lua_perm_gen)(lua_State *L) {
+    Status status;
+    int i, ncol = luaL_checkinteger(L, 1);
+    Matrix *self = nerv_matrix_(perm_gen)(ncol, &status);
+    NERV_LUA_CHECK_STATUS(L, status);
+    luaT_pushudata(L, self, nerv_matrix_(tname));
+    return 1;
+}
+
+static const luaL_Reg nerv_matrix_(extra_methods_int)[] = {
+    {"perm_gen", nerv_matrix_(lua_perm_gen)},
+    {NULL, NULL}
+};
+
 #undef nerv_matrix_
 #undef host_matrix_
 #undef MATRIX_USE_FLOAT
@@ -24,6 +47,7 @@ const char *nerv_matrix_(tname) = "nerv.MMatrixFloat";
 #undef MATRIX_ELEM_PTR
 #undef MATRIX_ELEM_FMT
 #undef MATRIX_ELEM_WRITE_FMT
+#undef MMATRIX_INIT
 
 #define NERV_GENERIC_MMATRIX
 #define MATRIX_USE_DOUBLE
@@ -44,26 +68,4 @@ const char *nerv_matrix_(tname) = "nerv.MMatrixDouble";
 #define host_matrix_(NAME) host_matrix_int_##NAME
 #define nerv_matrix_(NAME) nerv_matrix_host_int_##NAME
 const char *nerv_matrix_(tname) = "nerv.MMatrixInt";
-#define MMATRIX_INIT(L) host_matrix_(init_extra)(L)
-
-static const luaL_Reg nerv_matrix_(extra_methods_int)[];
-static void host_matrix_(init_extra)(lua_State *L) {
-    luaN_append_methods(L, nerv_matrix_(extra_methods_int));
-}
-
 #include "generic/mmatrix.c"
-#include "../lib/matrix/mmatrix.h"
-
-static int nerv_matrix_(lua_perm_gen)(lua_State *L) {
-    Status status;
-    int i, ncol = luaL_checkinteger(L, 1);
-    Matrix *self = nerv_matrix_(perm_gen)(ncol, &status);
-    NERV_LUA_CHECK_STATUS(L, status);
-    luaT_pushudata(L, self, nerv_matrix_(tname));
-    return 1;
-}
-
-static const luaL_Reg nerv_matrix_(extra_methods_int)[] = {
-    {"perm_gen", nerv_matrix_(lua_perm_gen)},
-    {NULL, NULL}
-};
-- 
cgit v1.2.3-70-g09d2


From cad144243b898a7bed91c18572bf42944e9db3b3 Mon Sep 17 00:00:00 2001
From: Determinant <ted.sybil@gmail.com>
Date: Sun, 30 Aug 2015 15:14:36 +0800
Subject: ...

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index be8e5b0..8c21bd9 100644
--- a/README.md
+++ b/README.md
@@ -13,6 +13,7 @@ cd nerv
 git clone https://github.com/Nerv-SJTU/nerv-speech.git speech
 git submodule init && git submodule update
 make
+make speech
 ```
 The `git submodule` command is for the __luajit__ repository inside __Nerv__.  
 Now, you can try to run some example scripts.  
-- 
cgit v1.2.3-70-g09d2


From 3721c74d56ffdea43851489617f33cd13b87ab76 Mon Sep 17 00:00:00 2001
From: Determinant <ted.sybil@gmail.com>
Date: Mon, 31 Aug 2015 18:59:22 +0800
Subject: ...

---
 nerv/io/sgd_buffer.lua | 18 ++++++++++--------
 nerv/nn/layer_dag.lua  |  2 +-
 2 files changed, 11 insertions(+), 9 deletions(-)

diff --git a/nerv/io/sgd_buffer.lua b/nerv/io/sgd_buffer.lua
index 3f854f0..74c4934 100644
--- a/nerv/io/sgd_buffer.lua
+++ b/nerv/io/sgd_buffer.lua
@@ -5,9 +5,7 @@ function SGDBuffer:__init(global_conf, buffer_conf)
     self.buffer_size = math.floor(buffer_conf.buffer_size /
                                 global_conf.batch_size) * global_conf.batch_size
     self.randomize = buffer_conf.randomize
-    if self.randomize == nil then
-        self.randomize = false
-    end
+    self.consume = buffer_conf.consume
     local cumat_type = global_conf.cumat_type
     if buffer_conf.use_gpu then
         self.mat_type = cumat_type
@@ -104,26 +102,30 @@ function SGDBuffer:get_data()
     local batch_size = self.gconf.batch_size
     if self.head >= self.tail then -- buffer is empty
         local t = os.clock()
-        if not self:saturate() then
+        if (not self:saturate()) and (not self.consume) then
             return nil -- the remaining data cannot build a batch
         end
+        if self.tail == self.head then
+            return nil -- nothing left
+        end
         nerv.info("%.3fs to fill the buffer", os.clock() - t)
     end
-    if self.head + batch_size > self.tail then
+    if self.head + batch_size > self.tail and (not self.consume) then
         return nil -- the remaining data cannot build a batch
     end
+    actual_batch_size = math.min(batch_size, self.tail - self.head)
     local res = {}
     for i, reader in ipairs(self.readers) do
         for id, buff in pairs(reader.buffs) do
-            local batch = self.gconf.cumat_type(batch_size, buff.width)
+            local batch = self.gconf.cumat_type(actual_batch_size, buff.width)
             if self.randomize then
                 self.copy_rows_from_by_idx(batch, buff.data, self.rand_map, self.head)
             else
-                self.copy_from(batch, buff.data, self.head, self.head + batch_size)
+                self.copy_from(batch, buff.data, self.head, self.head + actual_batch_size)
             end
             res[id] = batch
         end
     end
-    self.head = self.head + batch_size
+    self.head = self.head + actual_batch_size
     return res
 end
diff --git a/nerv/nn/layer_dag.lua b/nerv/nn/layer_dag.lua
index 25297c2..f69d31c 100644
--- a/nerv/nn/layer_dag.lua
+++ b/nerv/nn/layer_dag.lua
@@ -266,7 +266,7 @@ function DAGLayer:get_intermediate(id, port_type)
     if id == "<input>" or id == "<output>" then
         nerv.error("an actual real layer id is expected")
     end
-    local layer = layers[id]
+    local layer = self.layers[id]
     if layer == nil then
         nerv.error("layer id %s not found", id)
     end
-- 
cgit v1.2.3-70-g09d2


From 37286a08b40f68b544983d8dde4a77ac0b488397 Mon Sep 17 00:00:00 2001
From: Yimmon Zhuang <yimmon.zhuang@gmail.com>
Date: Fri, 18 Sep 2015 22:17:25 +0800
Subject: kaldi mpe training support

---
 nerv/Makefile                    |   2 +-
 nerv/examples/seq_chime.lua      | 185 +++++++++++++++++++++++++++++++++++++++
 nerv/examples/seq_trainer.lua    |  86 ++++++++++++++++++
 nerv/layer/affine.lua            |   4 +
 nerv/layer/affine_recurrent.lua  |   4 +
 nerv/layer/bias.lua              |   4 +
 nerv/layer/combiner.lua          |   6 ++
 nerv/layer/init.lua              |   1 +
 nerv/layer/mpe.lua               |  52 +++++++++++
 nerv/layer/mse.lua               |   8 ++
 nerv/layer/sigmoid.lua           |   4 +
 nerv/layer/softmax.lua           |   4 +
 nerv/layer/softmax_ce.lua        |   7 ++
 nerv/layer/window.lua            |   4 +
 nerv/lib/matrix/generic/matrix.c |   1 +
 nerv/nn/layer_dag.lua            |  40 ++++++++-
 16 files changed, 407 insertions(+), 5 deletions(-)
 create mode 100644 nerv/examples/seq_chime.lua
 create mode 100644 nerv/examples/seq_trainer.lua
 create mode 100644 nerv/layer/mpe.lua

diff --git a/nerv/Makefile b/nerv/Makefile
index b5d26bd..b874a94 100644
--- a/nerv/Makefile
+++ b/nerv/Makefile
@@ -31,7 +31,7 @@ OBJS := $(CORE_OBJS) $(NERV_OBJS) $(LUAT_OBJS)
 LIBS := $(INST_LIBDIR)/libnerv.so $(LIB_PATH)/libnervcore.so $(LIB_PATH)/libluaT.so
 LUA_LIBS := matrix/init.lua io/init.lua init.lua \
 			layer/init.lua layer/affine.lua layer/sigmoid.lua layer/softmax_ce.lua layer/softmax.lua \
-			layer/window.lua layer/bias.lua layer/combiner.lua layer/mse.lua layer/affine_recurrent.lua\
+			layer/window.lua layer/bias.lua layer/combiner.lua layer/mse.lua layer/affine_recurrent.lua layer/mpe.lua \
 			nn/init.lua nn/layer_repo.lua nn/param_repo.lua nn/layer_dag.lua \
 			io/sgd_buffer.lua 
 
diff --git a/nerv/examples/seq_chime.lua b/nerv/examples/seq_chime.lua
new file mode 100644
index 0000000..be723ca
--- /dev/null
+++ b/nerv/examples/seq_chime.lua
@@ -0,0 +1,185 @@
+require 'kaldi_io'
+gconf = {lrate = 0.00001, wcost = 0, momentum = 0.0,
+        cumat_type = nerv.CuMatrixFloat,
+        mmat_type = nerv.MMatrixFloat,
+        frm_ext = 5,
+        tr_scp = "ark,s,cs:/slfs6/users/ymz09/kaldi/src/featbin/copy-feats scp:/slfs5/users/ymz09/chime/baseline/ASR/exp/tri4a_dnn_tr05_multi_enhanced_smbr/train.scp ark:- |",
+        initialized_param = {"/slfs6/users/ymz09/nerv-project/nerv/nerv-speech/kaldi_seq/test/chime3_init.nerv",
+        "/slfs6/users/ymz09/nerv-project/nerv/nerv-speech/kaldi_seq/test/chime3_global_transf.nerv"},
+        debug = false}
+
+function make_layer_repo(param_repo)
+    local layer_repo = nerv.LayerRepo(
+    {
+        -- global transf
+        ["nerv.BiasLayer"] =
+        {
+            blayer1 = {{bias = "bias1"}, {dim_in = {440}, dim_out = {440}}},
+            blayer2 = {{bias = "bias2"}, {dim_in = {440}, dim_out = {440}}}
+        },
+        ["nerv.WindowLayer"] =
+        {
+            wlayer1 = {{window = "window1"}, {dim_in = {440}, dim_out = {440}}},
+            wlayer2 = {{window = "window2"}, {dim_in = {440}, dim_out = {440}}}
+        },
+        -- biased linearity
+        ["nerv.AffineLayer"] =
+        {
+            affine0 = {{ltp = "affine0_ltp", bp = "affine0_bp"},
+            {dim_in = {440}, dim_out = {2048}}},
+            affine1 = {{ltp = "affine1_ltp", bp = "affine1_bp"},
+            {dim_in = {2048}, dim_out = {2048}}},
+            affine2 = {{ltp = "affine2_ltp", bp = "affine2_bp"},
+            {dim_in = {2048}, dim_out = {2048}}},
+            affine3 = {{ltp = "affine3_ltp", bp = "affine3_bp"},
+            {dim_in = {2048}, dim_out = {2048}}},
+            affine4 = {{ltp = "affine4_ltp", bp = "affine4_bp"},
+            {dim_in = {2048}, dim_out = {2048}}},
+            affine5 = {{ltp = "affine5_ltp", bp = "affine5_bp"},
+            {dim_in = {2048}, dim_out = {2048}}},
+            affine6 = {{ltp = "affine6_ltp", bp = "affine6_bp"},
+            {dim_in = {2048}, dim_out = {2048}}},
+            affine7 = {{ltp = "affine7_ltp", bp = "affine7_bp"},
+            {dim_in = {2048}, dim_out = {2011}}}
+        },
+        ["nerv.SigmoidLayer"] =
+        {
+            sigmoid0 = {{}, {dim_in = {2048}, dim_out = {2048}}},
+            sigmoid1 = {{}, {dim_in = {2048}, dim_out = {2048}}},
+            sigmoid2 = {{}, {dim_in = {2048}, dim_out = {2048}}},
+            sigmoid3 = {{}, {dim_in = {2048}, dim_out = {2048}}},
+            sigmoid4 = {{}, {dim_in = {2048}, dim_out = {2048}}},
+            sigmoid5 = {{}, {dim_in = {2048}, dim_out = {2048}}},
+            sigmoid6 = {{}, {dim_in = {2048}, dim_out = {2048}}}
+        },
+        ["nerv.MPELayer"] =
+        {
+            mpe_crit = {{}, {dim_in = {2011, -1}, dim_out = {1},
+                        cmd = {
+                            arg = "--class-frame-counts=/slfs5/users/ymz09/chime/baseline/ASR/exp/tri4a_dnn_tr05_multi_enhanced/ali_train_pdf.counts --acoustic-scale=0.1 --lm-scale=1.0 --learn-rate=0.00001 --do-smbr=true --verbose=1",
+                            mdl = "/slfs5/users/ymz09/chime/baseline/ASR/exp/tri4a_dnn_tr05_multi_enhanced_ali/final.mdl",
+                            lat = "scp:/slfs5/users/ymz09/chime/baseline/ASR/exp/tri4a_dnn_tr05_multi_enhanced_denlats/lat.scp",
+                            ali = "ark:gunzip -c /slfs5/users/ymz09/chime/baseline/ASR/exp/tri4a_dnn_tr05_multi_enhanced_ali/ali.*.gz |"
+                        }
+                    }
+                }
+        },
+        ["nerv.SoftmaxLayer"] = -- softmax for decode output
+        {
+            softmax = {{}, {dim_in = {2011}, dim_out = {2011}}}
+        }
+    }, param_repo, gconf)
+
+    layer_repo:add_layers(
+    {
+        ["nerv.DAGLayer"] =
+        {
+            global_transf = {{}, {
+                dim_in = {440}, dim_out = {440},
+                sub_layers = layer_repo,
+                connections = {
+                    ["<input>[1]"] = "blayer1[1]",
+                    ["blayer1[1]"] = "wlayer1[1]",
+                    ["wlayer1[1]"] = "blayer2[1]",
+                    ["blayer2[1]"] = "wlayer2[1]",
+                    ["wlayer2[1]"] = "<output>[1]"
+                }
+            }},
+            main = {{}, {
+                dim_in = {440}, dim_out = {2011},
+                sub_layers = layer_repo,
+                connections = {
+                    ["<input>[1]"] = "affine0[1]",
+                    ["affine0[1]"] = "sigmoid0[1]",
+                    ["sigmoid0[1]"] = "affine1[1]",
+                    ["affine1[1]"] = "sigmoid1[1]",
+                    ["sigmoid1[1]"] = "affine2[1]",
+                    ["affine2[1]"] = "sigmoid2[1]",
+                    ["sigmoid2[1]"] = "affine3[1]",
+                    ["affine3[1]"] = "sigmoid3[1]",
+                    ["sigmoid3[1]"] = "affine4[1]",
+                    ["affine4[1]"] = "sigmoid4[1]",
+                    ["sigmoid4[1]"] = "affine5[1]",
+                    ["affine5[1]"] = "sigmoid5[1]",
+                    ["sigmoid5[1]"] = "affine6[1]",
+                    ["affine6[1]"] = "sigmoid6[1]",
+                    ["sigmoid6[1]"] = "affine7[1]",
+                    ["affine7[1]"] = "<output>[1]"
+                }
+            }}
+        }
+    }, param_repo, gconf)
+
+    layer_repo:add_layers(
+    {
+        ["nerv.DAGLayer"] =
+        {
+            mpe_output = {{}, {
+                dim_in = {440, -1}, dim_out = {1},
+                sub_layers = layer_repo,
+                connections = {
+                    ["<input>[1]"] = "main[1]",
+                    ["main[1]"] = "mpe_crit[1]",
+                    ["<input>[2]"] = "mpe_crit[2]",
+                    ["mpe_crit[1]"] = "<output>[1]"
+                }
+            }},
+            softmax_output = {{}, {
+                dim_in = {440}, dim_out = {2011},
+                sub_layers = layer_repo,
+                connections = {
+                    ["<input>[1]"] = "main[1]",
+                    ["main[1]"] = "softmax[1]",
+                    ["softmax[1]"] = "<output>[1]"
+                }
+            }}
+        }
+    }, param_repo, gconf)
+
+    return layer_repo
+end
+
+function get_network(layer_repo)
+    return layer_repo:get_layer("mpe_output")
+end
+
+function get_decode_network(layer_repo)
+    return layer_repo:get_layer("softmax_output")
+end
+
+function get_global_transf(layer_repo)
+    return layer_repo:get_layer("global_transf")
+end
+
+function make_readers(feature_rspecifier, layer_repo)
+    return {
+                {reader = nerv.KaldiReader(gconf,
+                    {
+                        id = "main_scp",
+                        feature_rspecifier = feature_rspecifier,
+                        frm_ext = gconf.frm_ext,
+                        global_transf = layer_repo:get_layer("global_transf"),
+                        mlfs = {}
+                    })
+                }
+            }
+end
+
+function get_input_order()
+    return {{id = "main_scp", global_transf = true},
+            {id = "key"}}
+end
+
+function get_accuracy(layer_repo)
+    local mpe_crit = layer_repo:get_layer("mpe_crit")
+    return mpe_crit.total_correct / mpe_crit.total_frames * 100
+end
+
+function print_stat(layer_repo)
+    local mpe_crit = layer_repo:get_layer("mpe_crit")
+    nerv.info("*** training stat begin ***")
+    nerv.printf("correct:\t\t%d\n", mpe_crit.total_correct)
+    nerv.printf("frames:\t\t\t%d\n", mpe_crit.total_frames)
+    nerv.printf("accuracy:\t\t%.3f%%\n", get_accuracy(layer_repo))
+    nerv.info("*** training stat end ***")
+end
diff --git a/nerv/examples/seq_trainer.lua b/nerv/examples/seq_trainer.lua
new file mode 100644
index 0000000..df96e68
--- /dev/null
+++ b/nerv/examples/seq_trainer.lua
@@ -0,0 +1,86 @@
+function build_trainer(ifname) -- builds the network from `ifname` and returns a one-pass trainer closure
+    local param_repo = nerv.ParamRepo()
+    param_repo:import(ifname, nil, gconf)
+    local layer_repo = make_layer_repo(param_repo)
+    local network = get_network(layer_repo)
+    local global_transf = get_global_transf(layer_repo)
+    local input_order = get_input_order()
+    local iterative_trainer = function (prefix, scp_file, bp) -- NOTE(review): `bp` is accepted but never read below
+        local readers = make_readers(scp_file, layer_repo)
+        -- initialize the network
+        network:init(1) -- start with one row; resized per chunk via batch_resize below
+        gconf.cnt = 0
+        for ri = 1, #readers, 1 do
+            while true do
+                local data = readers[ri].reader:get_data()
+                if data == nil then -- reader exhausted, move on to the next one
+                    break
+                end
+                -- print stat periodically
+                gconf.cnt = gconf.cnt + 1
+                if gconf.cnt == 1000 then
+                    print_stat(layer_repo)
+                    nerv.CuMatrix.print_profile()
+                    nerv.CuMatrix.clear_profile()
+                    gconf.cnt = 0
+                    -- break
+                end
+                local input = {}
+                --            if gconf.cnt == 1000 then break end
+                for i, e in ipairs(input_order) do
+                    local id = e.id
+                    if data[id] == nil then
+                        nerv.error("input data %s not found", id)
+                    end
+                    local transformed
+                    if e.global_transf then
+                        local batch = gconf.cumat_type(data[id]:nrow(), data[id]:ncol())
+                        batch:copy_fromh(data[id])
+                        transformed = nerv.speech_utils.global_transf(batch,
+                        global_transf,
+                        gconf.frm_ext or 0, 0,
+                        gconf)
+                    else
+                        transformed = data[id] -- passed through untouched (e.g. the "key" entry)
+                    end
+                    table.insert(input, transformed)
+                end
+                local err_output = {input[1]:create()} -- kept local: only consumed by back_propagate below
+                network:batch_resize(input[1]:nrow())
+                if network:propagate(input, {{}}) == true then -- only back-propagate/update when propagate reports success
+                    network:back_propagate({{}}, err_output, input, {{}})
+                    network:update({{}}, input, {{}})
+                end
+                -- collect garbage in-time to save GPU memory
+                collectgarbage("collect")
+            end
+        end
+        print_stat(layer_repo)
+        nerv.CuMatrix.print_profile()
+        nerv.CuMatrix.clear_profile()
+        if prefix ~= nil then -- a nil prefix skips exporting the parameters
+            nerv.info("writing back...")
+            local fname = string.format("%s_tr%.3f.nerv",
+            prefix, get_accuracy(layer_repo))
+            network:get_params():export(fname, nil)
+        end
+        return get_accuracy(layer_repo)
+    end
+    return iterative_trainer
+end
+
+dofile(arg[1])
+
+local pf0 = gconf.initialized_param
+local trainer = build_trainer(pf0)
+
+local i = 1
+nerv.info("[NN] begin iteration %d with lrate = %.6f", i, gconf.lrate)
+local accu_tr = trainer(string.format("%s_%s_iter_%d_lr%f",
+string.gsub(
+(string.gsub(pf0[1], "(.*/)(.*)", "%2")),
+"(.*)%..*", "%1"),
+os.date("%Y%m%d%H%M%S"),
+i, gconf.lrate), gconf.tr_scp, true)
+nerv.info("[TR] training set %d: %.3f", i, accu_tr)
+
diff --git a/nerv/layer/affine.lua b/nerv/layer/affine.lua
index 00cbcfb..6c90e3e 100644
--- a/nerv/layer/affine.lua
+++ b/nerv/layer/affine.lua
@@ -60,6 +60,10 @@ function AffineLayer:init(batch_size)
     self.bp:train_init()
 end
 
+function AffineLayer:batch_resize(batch_size)
+    -- do nothing
+end
+
 function AffineLayer:update(bp_err, input, output)
     if self.direct_update then
         self.ltp.correction:mul(input[1], bp_err[1], 1.0, gconf.momentum, 'T', 'N')
diff --git a/nerv/layer/affine_recurrent.lua b/nerv/layer/affine_recurrent.lua
index 59d259c..92d98e2 100644
--- a/nerv/layer/affine_recurrent.lua
+++ b/nerv/layer/affine_recurrent.lua
@@ -37,6 +37,10 @@ function Recurrent:init(batch_size)
     self.bp:train_init()
 end
 
+function Recurrent:batch_resize(batch_size)
+    -- do nothing
+end
+
 function Recurrent:update(bp_err, input, output)
     if (self.direct_update == true) then
         local ltp_hh = self.ltp_hh.trans
diff --git a/nerv/layer/bias.lua b/nerv/layer/bias.lua
index c99274d..7e9fd46 100644
--- a/nerv/layer/bias.lua
+++ b/nerv/layer/bias.lua
@@ -18,6 +18,10 @@ function BiasLayer:init()
     end
 end
 
+function BiasLayer:batch_resize(batch_size)
+    -- do nothing
+end
+
 function BiasLayer:propagate(input, output)
     output[1]:copy_fromd(input[1])
     output[1]:add_row(self.bias.trans, 1.0)
diff --git a/nerv/layer/combiner.lua b/nerv/layer/combiner.lua
index 7bd7617..1bcfdfb 100644
--- a/nerv/layer/combiner.lua
+++ b/nerv/layer/combiner.lua
@@ -30,6 +30,12 @@ function CombinerLayer:init(batch_size)
     self.sum = self.gconf.cumat_type(batch_size, dim)
 end
 
+function CombinerLayer:batch_resize(batch_size) -- re-create self.sum when its row count differs from batch_size
+    if self.sum:nrow() ~= batch_size then -- leave the existing buffer alone when the row count already matches
+        self.sum = self.gconf.cumat_type(batch_size, self.dim_in[1]) -- NOTE(review): init() sizes this with `dim`; presumably equal to dim_in[1] - confirm
+    end
+end
+
 function CombinerLayer:update(bp_err, input, output)
 end
 
diff --git a/nerv/layer/init.lua b/nerv/layer/init.lua
index 6861b0e..b74422f 100644
--- a/nerv/layer/init.lua
+++ b/nerv/layer/init.lua
@@ -79,3 +79,4 @@ nerv.include('mse.lua')
 nerv.include('combiner.lua')
 nerv.include('affine_recurrent.lua')
 nerv.include('softmax.lua')
+nerv.include('mpe.lua')
diff --git a/nerv/layer/mpe.lua b/nerv/layer/mpe.lua
new file mode 100644
index 0000000..ec8a8f3
--- /dev/null
+++ b/nerv/layer/mpe.lua
@@ -0,0 +1,52 @@
+require 'libkaldiseq'
+local MPELayer = nerv.class("nerv.MPELayer", "nerv.Layer")
+
+function MPELayer:__init(id, global_conf, layer_conf)
+    self.id = id
+    self.gconf = global_conf
+    self.dim_in = layer_conf.dim_in
+    self.dim_out = layer_conf.dim_out
+    self.arg = layer_conf.cmd.arg
+    self.mdl = layer_conf.cmd.mdl
+    self.lat = layer_conf.cmd.lat
+    self.ali = layer_conf.cmd.ali
+    self:check_dim_len(2, -1) -- two inputs: nn output and utt key
+end
+
+function MPELayer:init(batch_size)
+    self.total_correct = 0
+    self.total_frames = 0
+    self.kaldi_mpe = nerv.KaldiMPE(self.arg, self.mdl, self.lat, self.ali)
+    if self.kaldi_mpe == nil then
+        nerv.error("kaldi arguments is expected: %s %s %s %s", self.arg,
+        self.mdl, self.lat, self.ali)
+    end
+end
+
+function MPELayer:batch_resize(batch_size)
+    -- do nothing
+end
+
+function MPELayer:update(bp_err, input, output)
+    -- no params, therefore do nothing
+end
+
+function MPELayer:propagate(input, output) -- records and returns the result of kaldi_mpe:check; `output` is not written
+    self.valid = false -- reset first so a failing check() call cannot leave a stale `true` behind
+    self.valid = self.kaldi_mpe:check(input[1], input[2]) -- input[2] is presumably the utterance key (see __init) - confirm
+    return self.valid -- DAGLayer:propagate hands the last layer's return value back to its caller
+end
+
+function MPELayer:back_propagate(bp_err, next_bp_err, input, output) -- fills next_bp_err[1] from kaldi_mpe:calc_diff; bp_err/output unused
+    if self.valid ~= true then -- propagate() must have succeeded for this input first
+        nerv.error("kaldi sequence training back_propagate fail")
+    end
+    local mmat = input[1]:new_to_host() -- presumably a host-side copy of the input matrix - confirm
+    next_bp_err[1]:copy_fromh(self.kaldi_mpe:calc_diff(mmat, input[2]))
+    self.total_frames = self.total_frames + self.kaldi_mpe:get_num_frames() -- running totals read by the example print_stat/get_accuracy
+    self.total_correct = self.total_correct + self.kaldi_mpe:get_utt_frame_acc()
+end
+
+function MPELayer:get_params()
+    return nerv.ParamRepo({})
+end
diff --git a/nerv/layer/mse.lua b/nerv/layer/mse.lua
index 2516998..0ee3080 100644
--- a/nerv/layer/mse.lua
+++ b/nerv/layer/mse.lua
@@ -20,6 +20,14 @@ function MSELayer:init(batch_size)
     self.diff = self.mse:create()
 end
 
+function MSELayer:batch_resize(batch_size) -- re-create the work buffers when the requested row count changes
+    if self.mse:nrow() ~= batch_size then -- nothing to do when the buffers already have batch_size rows
+        self.mse = self.gconf.cumat_type(batch_size, self.dim_in[1])
+        self.mse_sum = self.gconf.cumat_type(batch_size, 1) -- single-column buffer, one row per batch row
+        self.diff = self.mse:create() -- created from the new self.mse, as in init()
+    end
+end
+
 function MSELayer:update(bp_err, input, output)
     -- no params, therefore do nothing
 end
diff --git a/nerv/layer/sigmoid.lua b/nerv/layer/sigmoid.lua
index dfd09eb..0a8bcdc 100644
--- a/nerv/layer/sigmoid.lua
+++ b/nerv/layer/sigmoid.lua
@@ -14,6 +14,10 @@ function SigmoidLayer:init()
     end
 end
 
+function SigmoidLayer:batch_resize(batch_size)
+    -- do nothing
+end
+
 function SigmoidLayer:update(bp_err, input, output)
     -- no params, therefore do nothing
 end
diff --git a/nerv/layer/softmax.lua b/nerv/layer/softmax.lua
index e979ebf..4205b66 100644
--- a/nerv/layer/softmax.lua
+++ b/nerv/layer/softmax.lua
@@ -14,6 +14,10 @@ function SoftmaxLayer:init(batch_size)
     end
 end
 
+function SoftmaxLayer:batch_resize(batch_size)
+    -- do nothing
+end
+
 function SoftmaxLayer:update(bp_err, input, output)
     -- no params, therefore do nothing
 end
diff --git a/nerv/layer/softmax_ce.lua b/nerv/layer/softmax_ce.lua
index f878a2f..9071e86 100644
--- a/nerv/layer/softmax_ce.lua
+++ b/nerv/layer/softmax_ce.lua
@@ -23,6 +23,13 @@ function SoftmaxCELayer:init(batch_size)
     self.ce = self.softmax:create()
 end
 
+function SoftmaxCELayer:batch_resize(batch_size) -- re-create the work buffers when the requested row count changes
+    if self.softmax:nrow() ~= batch_size then -- nothing to do when the buffers already have batch_size rows
+        self.softmax = self.gconf.cumat_type(batch_size, self.dim_in[1])
+        self.ce = self.softmax:create() -- created from the new self.softmax, as in init()
+    end
+end
+
 function SoftmaxCELayer:update(bp_err, input, output)
     -- no params, therefore do nothing
 end
diff --git a/nerv/layer/window.lua b/nerv/layer/window.lua
index 4e9a3b1..8eed352 100644
--- a/nerv/layer/window.lua
+++ b/nerv/layer/window.lua
@@ -18,6 +18,10 @@ function WindowLayer:init()
     end
 end
 
+function WindowLayer:batch_resize(batch_size)
+    -- do nothing
+end
+
 function WindowLayer:propagate(input, output)
     output[1]:copy_fromd(input[1])
     output[1]:scale_rows_by_row(self.window.trans)
diff --git a/nerv/lib/matrix/generic/matrix.c b/nerv/lib/matrix/generic/matrix.c
index 6cb3dc0..4319e13 100644
--- a/nerv/lib/matrix/generic/matrix.c
+++ b/nerv/lib/matrix/generic/matrix.c
@@ -4,6 +4,7 @@
 /* FIXME: malloc failure detection */
 
 void nerv_matrix_(data_free)(Matrix *self, Status *status) {
+    if(*self->data_ref == 0) return;
     assert(*self->data_ref > 0);
     if (--(*self->data_ref) == 0)
     {
diff --git a/nerv/nn/layer_dag.lua b/nerv/nn/layer_dag.lua
index f69d31c..73bb77d 100644
--- a/nerv/nn/layer_dag.lua
+++ b/nerv/nn/layer_dag.lua
@@ -79,7 +79,7 @@ function DAGLayer:__init(id, global_conf, layer_conf)
             end
 
             table.insert(parsed_conn,
-                {{ref_from, port_from}, {ref_to, port_to}})
+            {{ref_from, port_from}, {ref_to, port_to}})
             table.insert(ref_from.next_layers, ref_to) -- add edge
             ref_to.in_deg = ref_to.in_deg + 1          -- increase the in-degree of the target layer
         end
@@ -140,8 +140,11 @@ function DAGLayer:init(batch_size)
         ref_from, port_from = unpack(conn[1])
         ref_to, port_to = unpack(conn[2])
         _, output_dim = ref_from.layer:get_dim()
-        local mid = self.gconf.cumat_type(batch_size,
-                                        output_dim[port_from])
+        local dim = 1
+        if output_dim[port_from] > 0 then
+            dim = output_dim[port_from]
+        end
+        local mid = self.gconf.cumat_type(batch_size, dim)
         local err_mid = mid:create()
 
         ref_from.outputs[port_from] = mid
@@ -176,6 +179,33 @@ function DAGLayer:init(batch_size)
     end
 end
 
+function DAGLayer:batch_resize(batch_size) -- resize the buffers between sub-layers, then forward the resize to every sub-layer
+    self.gconf.batch_size = batch_size -- note: writes the new size into the shared global configuration
+
+    for i, conn in ipairs(self.parsed_conn) do
+        local _, output_dim
+        local ref_from, port_from, ref_to, port_to
+        ref_from, port_from = unpack(conn[1]) -- source layer ref and its output port
+        ref_to, port_to = unpack(conn[2]) -- destination layer ref and its input port
+        _, output_dim = ref_from.layer:get_dim()
+
+        if ref_from.outputs[port_from]:nrow() ~= batch_size and output_dim[port_from] > 0 then -- ports with non-positive dim keep their buffer
+            local mid = self.gconf.cumat_type(batch_size, output_dim[port_from])
+            local err_mid = mid:create()
+
+            ref_from.outputs[port_from] = mid -- the same matrix is the producer's output ...
+            ref_to.inputs[port_to] = mid -- ... and the consumer's input
+
+            ref_from.err_inputs[port_from] = err_mid -- likewise one shared matrix for the error flowing back
+            ref_to.err_outputs[port_to] = err_mid
+        end
+    end
+    for id, ref in pairs(self.layers) do
+        ref.layer:batch_resize(batch_size) -- every sub-layer is expected to implement batch_resize
+    end
+    collectgarbage("collect") -- run a full GC cycle now that old buffers may be unreferenced
+end
+
 function DAGLayer:set_inputs(input)
     for i = 1, #self.dim_in do
         if input[i] == nil then
@@ -228,11 +258,13 @@ end
 function DAGLayer:propagate(input, output)
     self:set_inputs(input)
     self:set_outputs(output)
+    local ret = false
     for i = 1, #self.queue do
         local ref = self.queue[i]
         -- print(ref.layer.id)
-        ref.layer:propagate(ref.inputs, ref.outputs)
+        ret = ref.layer:propagate(ref.inputs, ref.outputs)
     end
+    return ret
 end
 
 function DAGLayer:back_propagate(bp_err, next_bp_err, input, output)
-- 
cgit v1.2.3-70-g09d2


From 7975592b94d65b6f356093694a76201de62a7a6a Mon Sep 17 00:00:00 2001
From: Yimmon Zhuang <yimmon.zhuang@gmail.com>
Date: Thu, 8 Oct 2015 22:27:58 +0800
Subject: MMI support

---
 nerv/Makefile                |   3 +-
 nerv/examples/mmi_chime3.lua | 182 ++++++++++++++++++++++++++++++++++++++++++
 nerv/examples/mpe_chime3.lua | 185 +++++++++++++++++++++++++++++++++++++++++++
 nerv/examples/seq_chime.lua  | 185 -------------------------------------------
 nerv/layer/init.lua          |   1 +
 nerv/layer/mmi.lua           |  50 ++++++++++++
 6 files changed, 420 insertions(+), 186 deletions(-)
 create mode 100644 nerv/examples/mmi_chime3.lua
 create mode 100644 nerv/examples/mpe_chime3.lua
 delete mode 100644 nerv/examples/seq_chime.lua
 create mode 100644 nerv/layer/mmi.lua

diff --git a/nerv/Makefile b/nerv/Makefile
index b874a94..ce178a0 100644
--- a/nerv/Makefile
+++ b/nerv/Makefile
@@ -31,7 +31,8 @@ OBJS := $(CORE_OBJS) $(NERV_OBJS) $(LUAT_OBJS)
 LIBS := $(INST_LIBDIR)/libnerv.so $(LIB_PATH)/libnervcore.so $(LIB_PATH)/libluaT.so
 LUA_LIBS := matrix/init.lua io/init.lua init.lua \
 			layer/init.lua layer/affine.lua layer/sigmoid.lua layer/softmax_ce.lua layer/softmax.lua \
-			layer/window.lua layer/bias.lua layer/combiner.lua layer/mse.lua layer/affine_recurrent.lua layer/mpe.lua \
+			layer/window.lua layer/bias.lua layer/combiner.lua layer/mse.lua layer/affine_recurrent.lua \
+			layer/mpe.lua layer/mmi.lua \
 			nn/init.lua nn/layer_repo.lua nn/param_repo.lua nn/layer_dag.lua \
 			io/sgd_buffer.lua 
 
diff --git a/nerv/examples/mmi_chime3.lua b/nerv/examples/mmi_chime3.lua
new file mode 100644
index 0000000..a7ad268
--- /dev/null
+++ b/nerv/examples/mmi_chime3.lua
@@ -0,0 +1,182 @@
+require 'kaldi_io'
+gconf = {lrate = 0.00001, wcost = 0, momentum = 0.0,
+        cumat_type = nerv.CuMatrixFloat,
+        mmat_type = nerv.MMatrixFloat,
+        frm_ext = 5,
+        tr_scp = "ark,o:/slfs6/users/ymz09/kaldi/src/featbin/copy-feats scp:/slfs5/users/ymz09/chime/baseline/ASR/exp/tri4a_dnn_tr05_multi_enhanced_mmi/train.scp ark:- |",
+        initialized_param = {"/slfs6/users/ymz09/nerv-project/nerv/nerv-speech/kaldi_seq/test/chime3_init_mmi.nerv",
+        "/slfs6/users/ymz09/nerv-project/nerv/nerv-speech/kaldi_seq/test/chime3_global_transf_mmi.nerv"},
+        debug = false}
+
+function make_layer_repo(param_repo)
+    local layer_repo = nerv.LayerRepo(
+    {
+        -- global transf
+        ["nerv.BiasLayer"] =
+        {
+            blayer1 = {{bias = "bias1"}, {dim_in = {440}, dim_out = {440}}},
+            blayer2 = {{bias = "bias2"}, {dim_in = {440}, dim_out = {440}}}
+        },
+        ["nerv.WindowLayer"] =
+        {
+            wlayer1 = {{window = "window1"}, {dim_in = {440}, dim_out = {440}}},
+            wlayer2 = {{window = "window2"}, {dim_in = {440}, dim_out = {440}}}
+        },
+        -- biased linearity
+        ["nerv.AffineLayer"] =
+        {
+            affine0 = {{ltp = "affine0_ltp", bp = "affine0_bp"},
+            {dim_in = {440}, dim_out = {2048}}},
+            affine1 = {{ltp = "affine1_ltp", bp = "affine1_bp"},
+            {dim_in = {2048}, dim_out = {2048}}},
+            affine2 = {{ltp = "affine2_ltp", bp = "affine2_bp"},
+            {dim_in = {2048}, dim_out = {2048}}},
+            affine3 = {{ltp = "affine3_ltp", bp = "affine3_bp"},
+            {dim_in = {2048}, dim_out = {2048}}},
+            affine4 = {{ltp = "affine4_ltp", bp = "affine4_bp"},
+            {dim_in = {2048}, dim_out = {2048}}},
+            affine5 = {{ltp = "affine5_ltp", bp = "affine5_bp"},
+            {dim_in = {2048}, dim_out = {2048}}},
+            affine6 = {{ltp = "affine6_ltp", bp = "affine6_bp"},
+            {dim_in = {2048}, dim_out = {2048}}},
+            affine7 = {{ltp = "affine7_ltp", bp = "affine7_bp"},
+            {dim_in = {2048}, dim_out = {2011}}}
+        },
+        ["nerv.SigmoidLayer"] =
+        {
+            sigmoid0 = {{}, {dim_in = {2048}, dim_out = {2048}}},
+            sigmoid1 = {{}, {dim_in = {2048}, dim_out = {2048}}},
+            sigmoid2 = {{}, {dim_in = {2048}, dim_out = {2048}}},
+            sigmoid3 = {{}, {dim_in = {2048}, dim_out = {2048}}},
+            sigmoid4 = {{}, {dim_in = {2048}, dim_out = {2048}}},
+            sigmoid5 = {{}, {dim_in = {2048}, dim_out = {2048}}},
+            sigmoid6 = {{}, {dim_in = {2048}, dim_out = {2048}}}
+        },
+        ["nerv.MMILayer"] =
+        {
+            mmi_crit = {{}, {dim_in = {2011, -1}, dim_out = {1},
+                        cmd = {
+                            arg = "--class-frame-counts=/slfs5/users/ymz09/chime/baseline/ASR/exp/tri4a_dnn_tr05_multi_enhanced/ali_train_pdf.counts --acoustic-scale=0.1 --lm-scale=1.0 --learn-rate=0.00001 --drop-frames=true --verbose=1",
+                            mdl = "/slfs5/users/ymz09/chime/baseline/ASR/exp/tri4a_dnn_tr05_multi_enhanced_ali/final.mdl",
+                            lat = "scp:/slfs5/users/ymz09/chime/baseline/ASR/exp/tri4a_dnn_tr05_multi_enhanced_denlats/lat.scp",
+                            ali = "ark:gunzip -c /slfs5/users/ymz09/chime/baseline/ASR/exp/tri4a_dnn_tr05_multi_enhanced_ali/ali.*.gz |"
+                        }
+                    }
+                }
+        },
+        ["nerv.SoftmaxLayer"] = -- softmax for decode output
+        {
+            softmax = {{}, {dim_in = {2011}, dim_out = {2011}}}
+        }
+    }, param_repo, gconf)
+
+    layer_repo:add_layers(
+    {
+        ["nerv.DAGLayer"] =
+        {
+            global_transf = {{}, {
+                dim_in = {440}, dim_out = {440},
+                sub_layers = layer_repo,
+                connections = {
+                    ["<input>[1]"] = "blayer1[1]",
+                    ["blayer1[1]"] = "wlayer1[1]",
+                    ["wlayer1[1]"] = "blayer2[1]",
+                    ["blayer2[1]"] = "wlayer2[1]",
+                    ["wlayer2[1]"] = "<output>[1]"
+                }
+            }},
+            main = {{}, {
+                dim_in = {440}, dim_out = {2011},
+                sub_layers = layer_repo,
+                connections = {
+                    ["<input>[1]"] = "affine0[1]",
+                    ["affine0[1]"] = "sigmoid0[1]",
+                    ["sigmoid0[1]"] = "affine1[1]",
+                    ["affine1[1]"] = "sigmoid1[1]",
+                    ["sigmoid1[1]"] = "affine2[1]",
+                    ["affine2[1]"] = "sigmoid2[1]",
+                    ["sigmoid2[1]"] = "affine3[1]",
+                    ["affine3[1]"] = "sigmoid3[1]",
+                    ["sigmoid3[1]"] = "affine4[1]",
+                    ["affine4[1]"] = "sigmoid4[1]",
+                    ["sigmoid4[1]"] = "affine5[1]",
+                    ["affine5[1]"] = "sigmoid5[1]",
+                    ["sigmoid5[1]"] = "affine6[1]",
+                    ["affine6[1]"] = "sigmoid6[1]",
+                    ["sigmoid6[1]"] = "affine7[1]",
+                    ["affine7[1]"] = "<output>[1]"
+                }
+            }}
+        }
+    }, param_repo, gconf)
+
+    layer_repo:add_layers(
+    {
+        ["nerv.DAGLayer"] =
+        {
+            mmi_output = {{}, {
+                dim_in = {440, -1}, dim_out = {1},
+                sub_layers = layer_repo,
+                connections = {
+                    ["<input>[1]"] = "main[1]",
+                    ["main[1]"] = "mmi_crit[1]",
+                    ["<input>[2]"] = "mmi_crit[2]",
+                    ["mmi_crit[1]"] = "<output>[1]"
+                }
+            }},
+            softmax_output = {{}, {
+                dim_in = {440}, dim_out = {2011},
+                sub_layers = layer_repo,
+                connections = {
+                    ["<input>[1]"] = "main[1]",
+                    ["main[1]"] = "softmax[1]",
+                    ["softmax[1]"] = "<output>[1]"
+                }
+            }}
+        }
+    }, param_repo, gconf)
+
+    return layer_repo
+end
+
+function get_network(layer_repo)
+    return layer_repo:get_layer("mmi_output")
+end
+
+function get_decode_network(layer_repo)
+    return layer_repo:get_layer("softmax_output")
+end
+
+function get_global_transf(layer_repo)
+    return layer_repo:get_layer("global_transf")
+end
+
+function make_readers(feature_rspecifier, layer_repo)
+    return {
+                {reader = nerv.KaldiReader(gconf,
+                    {
+                        id = "main_scp",
+                        feature_rspecifier = feature_rspecifier,
+                        frm_ext = gconf.frm_ext,
+                        global_transf = layer_repo:get_layer("global_transf"),
+                        mlfs = {}
+                    })
+                }
+            }
+end
+
+function get_input_order()
+    return {{id = "main_scp", global_transf = true},
+            {id = "key"}}
+end
+
+function get_accuracy(layer_repo)
+    return 0
+end
+
+function print_stat(layer_repo)
+    local mmi_crit = layer_repo:get_layer("mmi_crit")
+    nerv.info("*** training stat begin ***")
+    nerv.printf("frames:\t\t\t%d\n", mmi_crit.total_frames)
+    nerv.info("*** training stat end ***")
+end
diff --git a/nerv/examples/mpe_chime3.lua b/nerv/examples/mpe_chime3.lua
new file mode 100644
index 0000000..be723ca
--- /dev/null
+++ b/nerv/examples/mpe_chime3.lua
@@ -0,0 +1,185 @@
+require 'kaldi_io'
+gconf = {lrate = 0.00001, wcost = 0, momentum = 0.0,
+        cumat_type = nerv.CuMatrixFloat,
+        mmat_type = nerv.MMatrixFloat,
+        frm_ext = 5, -- forwarded to the reader as frm_ext in make_readers
+        tr_scp = "ark,s,cs:/slfs6/users/ymz09/kaldi/src/featbin/copy-feats scp:/slfs5/users/ymz09/chime/baseline/ASR/exp/tri4a_dnn_tr05_multi_enhanced_smbr/train.scp ark:- |",
+        initialized_param = {"/slfs6/users/ymz09/nerv-project/nerv/nerv-speech/kaldi_seq/test/chime3_init.nerv",
+        "/slfs6/users/ymz09/nerv-project/nerv/nerv-speech/kaldi_seq/test/chime3_global_transf.nerv"},
+        debug = false} -- NOTE(review): lrate is repeated as --learn-rate=0.00001 in mpe_crit's cmd.arg below; presumably the two should be kept in sync
+
+function make_layer_repo(param_repo) -- builds a nerv.LayerRepo with the primitive layers, adds the DAG layers wired from them, and returns it
+    local layer_repo = nerv.LayerRepo(
+    {
+        -- global transf: bias/window layers chained by the "global_transf" DAG below
+        ["nerv.BiasLayer"] =
+        {
+            blayer1 = {{bias = "bias1"}, {dim_in = {440}, dim_out = {440}}},
+            blayer2 = {{bias = "bias2"}, {dim_in = {440}, dim_out = {440}}}
+        },
+        ["nerv.WindowLayer"] =
+        {
+            wlayer1 = {{window = "window1"}, {dim_in = {440}, dim_out = {440}}},
+            wlayer2 = {{window = "window2"}, {dim_in = {440}, dim_out = {440}}}
+        },
+        -- biased linearity
+        ["nerv.AffineLayer"] =
+        {
+            affine0 = {{ltp = "affine0_ltp", bp = "affine0_bp"},
+            {dim_in = {440}, dim_out = {2048}}},
+            affine1 = {{ltp = "affine1_ltp", bp = "affine1_bp"},
+            {dim_in = {2048}, dim_out = {2048}}},
+            affine2 = {{ltp = "affine2_ltp", bp = "affine2_bp"},
+            {dim_in = {2048}, dim_out = {2048}}},
+            affine3 = {{ltp = "affine3_ltp", bp = "affine3_bp"},
+            {dim_in = {2048}, dim_out = {2048}}},
+            affine4 = {{ltp = "affine4_ltp", bp = "affine4_bp"},
+            {dim_in = {2048}, dim_out = {2048}}},
+            affine5 = {{ltp = "affine5_ltp", bp = "affine5_bp"},
+            {dim_in = {2048}, dim_out = {2048}}},
+            affine6 = {{ltp = "affine6_ltp", bp = "affine6_bp"},
+            {dim_in = {2048}, dim_out = {2048}}},
+            affine7 = {{ltp = "affine7_ltp", bp = "affine7_bp"},
+            {dim_in = {2048}, dim_out = {2011}}}
+        },
+        ["nerv.SigmoidLayer"] =
+        {
+            sigmoid0 = {{}, {dim_in = {2048}, dim_out = {2048}}},
+            sigmoid1 = {{}, {dim_in = {2048}, dim_out = {2048}}},
+            sigmoid2 = {{}, {dim_in = {2048}, dim_out = {2048}}},
+            sigmoid3 = {{}, {dim_in = {2048}, dim_out = {2048}}},
+            sigmoid4 = {{}, {dim_in = {2048}, dim_out = {2048}}},
+            sigmoid5 = {{}, {dim_in = {2048}, dim_out = {2048}}},
+            sigmoid6 = {{}, {dim_in = {2048}, dim_out = {2048}}}
+        },
+        ["nerv.MPELayer"] = -- criterion layer; cmd holds the Kaldi-side settings (presumably option string, model, lattices, alignments, judging by the paths)
+        {
+            mpe_crit = {{}, {dim_in = {2011, -1}, dim_out = {1},
+                        cmd = {
+                            arg = "--class-frame-counts=/slfs5/users/ymz09/chime/baseline/ASR/exp/tri4a_dnn_tr05_multi_enhanced/ali_train_pdf.counts --acoustic-scale=0.1 --lm-scale=1.0 --learn-rate=0.00001 --do-smbr=true --verbose=1",
+                            mdl = "/slfs5/users/ymz09/chime/baseline/ASR/exp/tri4a_dnn_tr05_multi_enhanced_ali/final.mdl",
+                            lat = "scp:/slfs5/users/ymz09/chime/baseline/ASR/exp/tri4a_dnn_tr05_multi_enhanced_denlats/lat.scp",
+                            ali = "ark:gunzip -c /slfs5/users/ymz09/chime/baseline/ASR/exp/tri4a_dnn_tr05_multi_enhanced_ali/ali.*.gz |"
+                        }
+                    }
+                }
+        },
+        ["nerv.SoftmaxLayer"] = -- softmax for decode output
+        {
+            softmax = {{}, {dim_in = {2011}, dim_out = {2011}}}
+        }
+    }, param_repo, gconf)
+
+    layer_repo:add_layers( -- first batch of DAGs: wired only from the primitive layers created above
+    {
+        ["nerv.DAGLayer"] =
+        {
+            global_transf = {{}, {
+                dim_in = {440}, dim_out = {440},
+                sub_layers = layer_repo,
+                connections = {
+                    ["<input>[1]"] = "blayer1[1]",
+                    ["blayer1[1]"] = "wlayer1[1]",
+                    ["wlayer1[1]"] = "blayer2[1]",
+                    ["blayer2[1]"] = "wlayer2[1]",
+                    ["wlayer2[1]"] = "<output>[1]"
+                }
+            }},
+            main = {{}, {
+                dim_in = {440}, dim_out = {2011},
+                sub_layers = layer_repo,
+                connections = {
+                    ["<input>[1]"] = "affine0[1]",
+                    ["affine0[1]"] = "sigmoid0[1]",
+                    ["sigmoid0[1]"] = "affine1[1]",
+                    ["affine1[1]"] = "sigmoid1[1]",
+                    ["sigmoid1[1]"] = "affine2[1]",
+                    ["affine2[1]"] = "sigmoid2[1]",
+                    ["sigmoid2[1]"] = "affine3[1]",
+                    ["affine3[1]"] = "sigmoid3[1]",
+                    ["sigmoid3[1]"] = "affine4[1]",
+                    ["affine4[1]"] = "sigmoid4[1]",
+                    ["sigmoid4[1]"] = "affine5[1]",
+                    ["affine5[1]"] = "sigmoid5[1]",
+                    ["sigmoid5[1]"] = "affine6[1]",
+                    ["affine6[1]"] = "sigmoid6[1]",
+                    ["sigmoid6[1]"] = "affine7[1]",
+                    ["affine7[1]"] = "<output>[1]"
+                }
+            }}
+        }
+    }, param_repo, gconf)
+
+    layer_repo:add_layers( -- second batch: both DAGs reference "main", presumably why they are added in a separate call after it
+    {
+        ["nerv.DAGLayer"] =
+        {
+            mpe_output = {{}, {
+                dim_in = {440, -1}, dim_out = {1},
+                sub_layers = layer_repo,
+                connections = {
+                    ["<input>[1]"] = "main[1]",
+                    ["main[1]"] = "mpe_crit[1]",
+                    ["<input>[2]"] = "mpe_crit[2]", -- the second DAG input bypasses "main" and goes straight to the criterion
+                    ["mpe_crit[1]"] = "<output>[1]"
+                }
+            }},
+            softmax_output = {{}, {
+                dim_in = {440}, dim_out = {2011},
+                sub_layers = layer_repo,
+                connections = {
+                    ["<input>[1]"] = "main[1]",
+                    ["main[1]"] = "softmax[1]",
+                    ["softmax[1]"] = "<output>[1]"
+                }
+            }}
+        }
+    }, param_repo, gconf)
+
+    return layer_repo
+end
+
+function get_network(layer_repo) -- looks up "mpe_output": the DAG that feeds "main" into the "mpe_crit" criterion
+    return layer_repo:get_layer("mpe_output")
+end
+
+function get_decode_network(layer_repo) -- looks up "softmax_output": the DAG that feeds "main" into "softmax", with no criterion layer
+    return layer_repo:get_layer("softmax_output")
+end
+
+function get_global_transf(layer_repo) -- looks up the "global_transf" DAG (the blayer1 -> wlayer1 -> blayer2 -> wlayer2 chain)
+    return layer_repo:get_layer("global_transf")
+end
+
+function make_readers(feature_rspecifier, layer_repo) -- returns a one-element list whose entry wraps a nerv.KaldiReader for feature_rspecifier
+    return {
+                {reader = nerv.KaldiReader(gconf,
+                    {
+                        id = "main_scp", -- same id that get_input_order lists first
+                        feature_rspecifier = feature_rspecifier,
+                        frm_ext = gconf.frm_ext,
+                        global_transf = layer_repo:get_layer("global_transf"),
+                        mlfs = {}
+                    })
+                }
+            }
+end
+
+function get_input_order() -- ordered list of input descriptors, matching mpe_output's two inputs: "main_scp" first, then "key"
+    return {{id = "main_scp", global_transf = true}, -- NOTE(review): presumably global_transf = true asks the caller to apply the global transform to this stream; confirm in the trainer
+            {id = "key"}}
+end
+
+function get_accuracy(layer_repo) -- frame accuracy of "mpe_crit" in percent; 0 while no frame has been counted
+    local crit = layer_repo:get_layer("mpe_crit")
+    return crit.total_frames > 0 and crit.total_correct / crit.total_frames * 100 or 0 -- guard: 0/0 would yield NaN
+end
+
+function print_stat(layer_repo) -- prints the correct/frames counters of "mpe_crit" and the percentage from get_accuracy, between begin/end markers
+    local mpe_crit = layer_repo:get_layer("mpe_crit")
+    nerv.info("*** training stat begin ***")
+    nerv.printf("correct:\t\t%d\n", mpe_crit.total_correct)
+    nerv.printf("frames:\t\t\t%d\n", mpe_crit.total_frames)
+    nerv.printf("accuracy:\t\t%.3f%%\n", get_accuracy(layer_repo))
+    nerv.info("*** training stat end ***")
+end
diff --git a/nerv/examples/seq_chime.lua b/nerv/examples/seq_chime.lua
deleted file mode 100644
index be723ca..0000000
--- a/nerv/examples/seq_chime.lua
+++ /dev/null
@@ -1,185 +0,0 @@
-require 'kaldi_io'
-gconf = {lrate = 0.00001, wcost = 0, momentum = 0.0,
-        cumat_type = nerv.CuMatrixFloat,
-        mmat_type = nerv.MMatrixFloat,
-        frm_ext = 5,
-        tr_scp = "ark,s,cs:/slfs6/users/ymz09/kaldi/src/featbin/copy-feats scp:/slfs5/users/ymz09/chime/baseline/ASR/exp/tri4a_dnn_tr05_multi_enhanced_smbr/train.scp ark:- |",
-        initialized_param = {"/slfs6/users/ymz09/nerv-project/nerv/nerv-speech/kaldi_seq/test/chime3_init.nerv",
-        "/slfs6/users/ymz09/nerv-project/nerv/nerv-speech/kaldi_seq/test/chime3_global_transf.nerv"},
-        debug = false}
-
-function make_layer_repo(param_repo)
-    local layer_repo = nerv.LayerRepo(
-    {
-        -- global transf
-        ["nerv.BiasLayer"] =
-        {
-            blayer1 = {{bias = "bias1"}, {dim_in = {440}, dim_out = {440}}},
-            blayer2 = {{bias = "bias2"}, {dim_in = {440}, dim_out = {440}}}
-        },
-        ["nerv.WindowLayer"] =
-        {
-            wlayer1 = {{window = "window1"}, {dim_in = {440}, dim_out = {440}}},
-            wlayer2 = {{window = "window2"}, {dim_in = {440}, dim_out = {440}}}
-        },
-        -- biased linearity
-        ["nerv.AffineLayer"] =
-        {
-            affine0 = {{ltp = "affine0_ltp", bp = "affine0_bp"},
-            {dim_in = {440}, dim_out = {2048}}},
-            affine1 = {{ltp = "affine1_ltp", bp = "affine1_bp"},
-            {dim_in = {2048}, dim_out = {2048}}},
-            affine2 = {{ltp = "affine2_ltp", bp = "affine2_bp"},
-            {dim_in = {2048}, dim_out = {2048}}},
-            affine3 = {{ltp = "affine3_ltp", bp = "affine3_bp"},
-            {dim_in = {2048}, dim_out = {2048}}},
-            affine4 = {{ltp = "affine4_ltp", bp = "affine4_bp"},
-            {dim_in = {2048}, dim_out = {2048}}},
-            affine5 = {{ltp = "affine5_ltp", bp = "affine5_bp"},
-            {dim_in = {2048}, dim_out = {2048}}},
-            affine6 = {{ltp = "affine6_ltp", bp = "affine6_bp"},
-            {dim_in = {2048}, dim_out = {2048}}},
-            affine7 = {{ltp = "affine7_ltp", bp = "affine7_bp"},
-            {dim_in = {2048}, dim_out = {2011}}}
-        },
-        ["nerv.SigmoidLayer"] =
-        {
-            sigmoid0 = {{}, {dim_in = {2048}, dim_out = {2048}}},
-            sigmoid1 = {{}, {dim_in = {2048}, dim_out = {2048}}},
-            sigmoid2 = {{}, {dim_in = {2048}, dim_out = {2048}}},
-            sigmoid3 = {{}, {dim_in = {2048}, dim_out = {2048}}},
-            sigmoid4 = {{}, {dim_in = {2048}, dim_out = {2048}}},
-            sigmoid5 = {{}, {dim_in = {2048}, dim_out = {2048}}},
-            sigmoid6 = {{}, {dim_in = {2048}, dim_out = {2048}}}
-        },
-        ["nerv.MPELayer"] =
-        {
-            mpe_crit = {{}, {dim_in = {2011, -1}, dim_out = {1},
-                        cmd = {
-                            arg = "--class-frame-counts=/slfs5/users/ymz09/chime/baseline/ASR/exp/tri4a_dnn_tr05_multi_enhanced/ali_train_pdf.counts --acoustic-scale=0.1 --lm-scale=1.0 --learn-rate=0.00001 --do-smbr=true --verbose=1",
-                            mdl = "/slfs5/users/ymz09/chime/baseline/ASR/exp/tri4a_dnn_tr05_multi_enhanced_ali/final.mdl",
-                            lat = "scp:/slfs5/users/ymz09/chime/baseline/ASR/exp/tri4a_dnn_tr05_multi_enhanced_denlats/lat.scp",
-                            ali = "ark:gunzip -c /slfs5/users/ymz09/chime/baseline/ASR/exp/tri4a_dnn_tr05_multi_enhanced_ali/ali.*.gz |"
-                        }
-                    }
-                }
-        },
-        ["nerv.SoftmaxLayer"] = -- softmax for decode output
-        {
-            softmax = {{}, {dim_in = {2011}, dim_out = {2011}}}
-        }
-    }, param_repo, gconf)
-
-    layer_repo:add_layers(
-    {
-        ["nerv.DAGLayer"] =
-        {
-            global_transf = {{}, {
-                dim_in = {440}, dim_out = {440},
-                sub_layers = layer_repo,
-                connections = {
-                    ["<input>[1]"] = "blayer1[1]",
-                    ["blayer1[1]"] = "wlayer1[1]",
-                    ["wlayer1[1]"] = "blayer2[1]",
-                    ["blayer2[1]"] = "wlayer2[1]",
-                    ["wlayer2[1]"] = "<output>[1]"
-                }
-            }},
-            main = {{}, {
-                dim_in = {440}, dim_out = {2011},
-                sub_layers = layer_repo,
-                connections = {
-                    ["<input>[1]"] = "affine0[1]",
-                    ["affine0[1]"] = "sigmoid0[1]",
-                    ["sigmoid0[1]"] = "affine1[1]",
-                    ["affine1[1]"] = "sigmoid1[1]",
-                    ["sigmoid1[1]"] = "affine2[1]",
-                    ["affine2[1]"] = "sigmoid2[1]",
-                    ["sigmoid2[1]"] = "affine3[1]",
-                    ["affine3[1]"] = "sigmoid3[1]",
-                    ["sigmoid3[1]"] = "affine4[1]",
-                    ["affine4[1]"] = "sigmoid4[1]",
-                    ["sigmoid4[1]"] = "affine5[1]",
-                    ["affine5[1]"] = "sigmoid5[1]",
-                    ["sigmoid5[1]"] = "affine6[1]",
-                    ["affine6[1]"] = "sigmoid6[1]",
-                    ["sigmoid6[1]"] = "affine7[1]",
-                    ["affine7[1]"] = "<output>[1]"
-                }
-            }}
-        }
-    }, param_repo, gconf)
-
-    layer_repo:add_layers(
-    {
-        ["nerv.DAGLayer"] =
-        {
-            mpe_output = {{}, {
-                dim_in = {440, -1}, dim_out = {1},
-                sub_layers = layer_repo,
-                connections = {
-                    ["<input>[1]"] = "main[1]",
-                    ["main[1]"] = "mpe_crit[1]",
-                    ["<input>[2]"] = "mpe_crit[2]",
-                    ["mpe_crit[1]"] = "<output>[1]"
-                }
-            }},
-            softmax_output = {{}, {
-                dim_in = {440}, dim_out = {2011},
-                sub_layers = layer_repo,
-                connections = {
-                    ["<input>[1]"] = "main[1]",
-                    ["main[1]"] = "softmax[1]",
-                    ["softmax[1]"] = "<output>[1]"
-                }
-            }}
-        }
-    }, param_repo, gconf)
-
-    return layer_repo
-end
-
-function get_network(layer_repo)
-    return layer_repo:get_layer("mpe_output")
-end
-
-function get_decode_network(layer_repo)
-    return layer_repo:get_layer("softmax_output")
-end
-
-function get_global_transf(layer_repo)
-    return layer_repo:get_layer("global_transf")
-end
-
-function make_readers(feature_rspecifier, layer_repo)
-    return {
-                {reader = nerv.KaldiReader(gconf,
-                    {
-                        id = "main_scp",
-                        feature_rspecifier = feature_rspecifier,
-                        frm_ext = gconf.frm_ext,
-                        global_transf = layer_repo:get_layer("global_transf"),
-                        mlfs = {}
-                    })
-                }
-            }
-end
-
-function get_input_order()
-    return {{id = "main_scp", global_transf = true},
-            {id = "key"}}
-end
-
-function get_accuracy(layer_repo)
-    local mpe_crit = layer_repo:get_layer("mpe_crit")
-    return mpe_crit.total_correct / mpe_crit.total_frames * 100
-end
-
-function print_stat(layer_repo)
-    local mpe_crit = layer_repo:get_layer("mpe_crit")
-    nerv.info("*** training stat begin ***")
-    nerv.printf("correct:\t\t%d\n", mpe_crit.total_correct)
-    nerv.printf("frames:\t\t\t%d\n", mpe_crit.total_frames)
-    nerv.printf("accuracy:\t\t%.3f%%\n", get_accuracy(layer_repo))
-    nerv.info("*** training stat end ***")
-end
diff --git a/nerv/layer/init.lua b/nerv/layer/init.lua
index b74422f..25dfebb 100644
--- a/nerv/layer/init.lua
+++ b/nerv/layer/init.lua
@@ -80,3 +80,4 @@ nerv.include('combiner.lua')
 nerv.include('affine_recurrent.lua')
 nerv.include('softmax.lua')
 nerv.include('mpe.lua')
+nerv.include('mmi.lua')
diff --git a/nerv/layer/mmi.lua b/nerv/layer/mmi.lua
new file mode 100644
index 0000000..ecc7f48
--- /dev/null
+++ b/nerv/layer/mmi.lua
@@ -0,0 +1,50 @@
+require 'libkaldiseq'
+local MMILayer = nerv.class("nerv.MMILayer", "nerv.Layer")
+
+function MMILayer:__init(id, global_conf, layer_conf)
+    self.id = id
+    self.gconf = global_conf
+    self.dim_in = layer_conf.dim_in
+    self.dim_out = layer_conf.dim_out
+    self.arg = layer_conf.cmd.arg
+    self.mdl = layer_conf.cmd.mdl
+    self.lat = layer_conf.cmd.lat
+    self.ali = layer_conf.cmd.ali
+    self:check_dim_len(2, -1) -- two inputs: nn output and utt key
+end
+
+function MMILayer:init(batch_size)
+    self.total_frames = 0
+    self.kaldi_mmi = nerv.KaldiMMI(self.arg, self.mdl, self.lat, self.ali)
+    if self.kaldi_mmi == nil then
+        nerv.error("kaldi arguments is expected: %s %s %s %s", self.arg,
+        self.mdl, self.lat, self.ali)
+    end
+end
+
+function MMILayer:batch_resize(batch_size)
+    -- do nothing
+end
+
+function MMILayer:update(bp_err, input, output)
+    -- no params, therefore do nothing
+end
+
+function MMILayer:propagate(input, output)
+    self.valid = false
+    self.valid = self.kaldi_mmi:check(input[1], input[2])
+    return self.valid
+end
+
+function MMILayer:back_propagate(bp_err, next_bp_err, input, output)
+    if self.valid ~= true then
+        nerv.error("kaldi sequence training back_propagate fail")
+    end
+    local mmat = input[1]:new_to_host()
+    next_bp_err[1]:copy_fromh(self.kaldi_mmi:calc_diff(mmat, input[2]))
+    self.total_frames = self.total_frames + self.kaldi_mmi:get_num_frames()
+end
+
+function MMILayer:get_params()
+    return nerv.ParamRepo({})
+end
-- 
cgit v1.2.3-70-g09d2


From bd6d0d8b72ec656dd8fa0c13aa602f4f6e022391 Mon Sep 17 00:00:00 2001
From: Determinant <ted.sybil@gmail.com>
Date: Fri, 9 Oct 2015 18:42:09 +0800
Subject: enable user to set direct_update in gconf; use direct_update in the
 example

---
 nerv/examples/swb_baseline.lua | 1 +
 nerv/layer/affine.lua          | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/nerv/examples/swb_baseline.lua b/nerv/examples/swb_baseline.lua
index 8015884..8f72200 100644
--- a/nerv/examples/swb_baseline.lua
+++ b/nerv/examples/swb_baseline.lua
@@ -2,6 +2,7 @@ require 'htk_io'
 gconf = {lrate = 0.8, wcost = 1e-6, momentum = 0.9,
         cumat_type = nerv.CuMatrixFloat,
         mmat_type = nerv.MMatrixFloat,
+        direct_update = true,
         frm_ext = 5,
         frm_trim = 5,
         tr_scp = "/slfs1/users/mfy43/swb_ivec/train_bp.scp",
diff --git a/nerv/layer/affine.lua b/nerv/layer/affine.lua
index 00cbcfb..b81b2a4 100644
--- a/nerv/layer/affine.lua
+++ b/nerv/layer/affine.lua
@@ -42,7 +42,7 @@ function AffineLayer:__init(id, global_conf, layer_conf)
     self.dim_out = layer_conf.dim_out
     self.gconf = global_conf
     self:check_dim_len(1, 1) -- exactly one input and one output
-    self.direct_update = layer_conf.direct_update
+    self.direct_update = layer_conf.direct_update or global_conf.direct_update
 end
 
 function AffineLayer:init(batch_size)
-- 
cgit v1.2.3-70-g09d2


From 60e7718f4db981557ac48d9d375d8e23b1cd39d1 Mon Sep 17 00:00:00 2001
From: Yimmon Zhuang <yimmon.zhuang@gmail.com>
Date: Sat, 10 Oct 2015 19:06:53 +0800
Subject: disable batch when sequence training

---
 nerv/examples/seq_trainer.lua | 1 +
 1 file changed, 1 insertion(+)

diff --git a/nerv/examples/seq_trainer.lua b/nerv/examples/seq_trainer.lua
index df96e68..b8ed3eb 100644
--- a/nerv/examples/seq_trainer.lua
+++ b/nerv/examples/seq_trainer.lua
@@ -49,6 +49,7 @@ function build_trainer(ifname)
                 network:batch_resize(input[1]:nrow())
                 if network:propagate(input, {{}}) == true then
                     network:back_propagate({{}}, err_output, input, {{}})
+                    gconf.batch_size = 1.0 - gconf.momentum  -- important!!!
                     network:update({{}}, input, {{}})
                 end
                 -- collect garbage in-time to save GPU memory
-- 
cgit v1.2.3-70-g09d2


From 473eb9c082224be19f147697ba951ae5bac4b4b4 Mon Sep 17 00:00:00 2001
From: Yimmon Zhuang <yimmon.zhuang@gmail.com>
Date: Sat, 10 Oct 2015 22:32:51 +0800
Subject: move sequence related layers to kaldi_seq

---
 nerv/Makefile                    |  1 -
 nerv/examples/mmi_chime3.lua     |  1 +
 nerv/examples/mpe_chime3.lua     |  1 +
 nerv/layer/init.lua              |  2 --
 nerv/layer/mmi.lua               | 50 --------------------------------------
 nerv/layer/mpe.lua               | 52 ----------------------------------------
 nerv/lib/matrix/generic/matrix.c |  2 +-
 7 files changed, 3 insertions(+), 106 deletions(-)
 delete mode 100644 nerv/layer/mmi.lua
 delete mode 100644 nerv/layer/mpe.lua

diff --git a/nerv/Makefile b/nerv/Makefile
index ce178a0..b449f82 100644
--- a/nerv/Makefile
+++ b/nerv/Makefile
@@ -32,7 +32,6 @@ LIBS := $(INST_LIBDIR)/libnerv.so $(LIB_PATH)/libnervcore.so $(LIB_PATH)/libluaT
 LUA_LIBS := matrix/init.lua io/init.lua init.lua \
 			layer/init.lua layer/affine.lua layer/sigmoid.lua layer/softmax_ce.lua layer/softmax.lua \
 			layer/window.lua layer/bias.lua layer/combiner.lua layer/mse.lua layer/affine_recurrent.lua \
-			layer/mpe.lua layer/mmi.lua \
 			nn/init.lua nn/layer_repo.lua nn/param_repo.lua nn/layer_dag.lua \
 			io/sgd_buffer.lua 
 
diff --git a/nerv/examples/mmi_chime3.lua b/nerv/examples/mmi_chime3.lua
index a7ad268..6ac7f28 100644
--- a/nerv/examples/mmi_chime3.lua
+++ b/nerv/examples/mmi_chime3.lua
@@ -1,4 +1,5 @@
 require 'kaldi_io'
+require 'kaldi_seq'
 gconf = {lrate = 0.00001, wcost = 0, momentum = 0.0,
         cumat_type = nerv.CuMatrixFloat,
         mmat_type = nerv.MMatrixFloat,
diff --git a/nerv/examples/mpe_chime3.lua b/nerv/examples/mpe_chime3.lua
index be723ca..ec095b0 100644
--- a/nerv/examples/mpe_chime3.lua
+++ b/nerv/examples/mpe_chime3.lua
@@ -1,4 +1,5 @@
 require 'kaldi_io'
+require 'kaldi_seq'
 gconf = {lrate = 0.00001, wcost = 0, momentum = 0.0,
         cumat_type = nerv.CuMatrixFloat,
         mmat_type = nerv.MMatrixFloat,
diff --git a/nerv/layer/init.lua b/nerv/layer/init.lua
index 25dfebb..6861b0e 100644
--- a/nerv/layer/init.lua
+++ b/nerv/layer/init.lua
@@ -79,5 +79,3 @@ nerv.include('mse.lua')
 nerv.include('combiner.lua')
 nerv.include('affine_recurrent.lua')
 nerv.include('softmax.lua')
-nerv.include('mpe.lua')
-nerv.include('mmi.lua')
diff --git a/nerv/layer/mmi.lua b/nerv/layer/mmi.lua
deleted file mode 100644
index ecc7f48..0000000
--- a/nerv/layer/mmi.lua
+++ /dev/null
@@ -1,50 +0,0 @@
-require 'libkaldiseq'
-local MMILayer = nerv.class("nerv.MMILayer", "nerv.Layer")
-
-function MMILayer:__init(id, global_conf, layer_conf)
-    self.id = id
-    self.gconf = global_conf
-    self.dim_in = layer_conf.dim_in
-    self.dim_out = layer_conf.dim_out
-    self.arg = layer_conf.cmd.arg
-    self.mdl = layer_conf.cmd.mdl
-    self.lat = layer_conf.cmd.lat
-    self.ali = layer_conf.cmd.ali
-    self:check_dim_len(2, -1) -- two inputs: nn output and utt key
-end
-
-function MMILayer:init(batch_size)
-    self.total_frames = 0
-    self.kaldi_mmi = nerv.KaldiMMI(self.arg, self.mdl, self.lat, self.ali)
-    if self.kaldi_mmi == nil then
-        nerv.error("kaldi arguments is expected: %s %s %s %s", self.arg,
-        self.mdl, self.lat, self.ali)
-    end
-end
-
-function MMILayer:batch_resize(batch_size)
-    -- do nothing
-end
-
-function MMILayer:update(bp_err, input, output)
-    -- no params, therefore do nothing
-end
-
-function MMILayer:propagate(input, output)
-    self.valid = false
-    self.valid = self.kaldi_mmi:check(input[1], input[2])
-    return self.valid
-end
-
-function MMILayer:back_propagate(bp_err, next_bp_err, input, output)
-    if self.valid ~= true then
-        nerv.error("kaldi sequence training back_propagate fail")
-    end
-    local mmat = input[1]:new_to_host()
-    next_bp_err[1]:copy_fromh(self.kaldi_mmi:calc_diff(mmat, input[2]))
-    self.total_frames = self.total_frames + self.kaldi_mmi:get_num_frames()
-end
-
-function MMILayer:get_params()
-    return nerv.ParamRepo({})
-end
diff --git a/nerv/layer/mpe.lua b/nerv/layer/mpe.lua
deleted file mode 100644
index ec8a8f3..0000000
--- a/nerv/layer/mpe.lua
+++ /dev/null
@@ -1,52 +0,0 @@
-require 'libkaldiseq'
-local MPELayer = nerv.class("nerv.MPELayer", "nerv.Layer")
-
-function MPELayer:__init(id, global_conf, layer_conf)
-    self.id = id
-    self.gconf = global_conf
-    self.dim_in = layer_conf.dim_in
-    self.dim_out = layer_conf.dim_out
-    self.arg = layer_conf.cmd.arg
-    self.mdl = layer_conf.cmd.mdl
-    self.lat = layer_conf.cmd.lat
-    self.ali = layer_conf.cmd.ali
-    self:check_dim_len(2, -1) -- two inputs: nn output and utt key
-end
-
-function MPELayer:init(batch_size)
-    self.total_correct = 0
-    self.total_frames = 0
-    self.kaldi_mpe = nerv.KaldiMPE(self.arg, self.mdl, self.lat, self.ali)
-    if self.kaldi_mpe == nil then
-        nerv.error("kaldi arguments is expected: %s %s %s %s", self.arg,
-        self.mdl, self.lat, self.ali)
-    end
-end
-
-function MPELayer:batch_resize(batch_size)
-    -- do nothing
-end
-
-function MPELayer:update(bp_err, input, output)
-    -- no params, therefore do nothing
-end
-
-function MPELayer:propagate(input, output)
-    self.valid = false
-    self.valid = self.kaldi_mpe:check(input[1], input[2])
-    return self.valid
-end
-
-function MPELayer:back_propagate(bp_err, next_bp_err, input, output)
-    if self.valid ~= true then
-        nerv.error("kaldi sequence training back_propagate fail")
-    end
-    local mmat = input[1]:new_to_host()
-    next_bp_err[1]:copy_fromh(self.kaldi_mpe:calc_diff(mmat, input[2]))
-    self.total_frames = self.total_frames + self.kaldi_mpe:get_num_frames()
-    self.total_correct = self.total_correct + self.kaldi_mpe:get_utt_frame_acc()
-end
-
-function MPELayer:get_params()
-    return nerv.ParamRepo({})
-end
diff --git a/nerv/lib/matrix/generic/matrix.c b/nerv/lib/matrix/generic/matrix.c
index 4319e13..4246751 100644
--- a/nerv/lib/matrix/generic/matrix.c
+++ b/nerv/lib/matrix/generic/matrix.c
@@ -4,7 +4,7 @@
 /* FIXME: malloc failure detection */
 
 void nerv_matrix_(data_free)(Matrix *self, Status *status) {
-    if(*self->data_ref == 0) return;
+    if(*self->data_ref == 0) return; /* FIXME: repeat free memory */
     assert(*self->data_ref > 0);
     if (--(*self->data_ref) == 0)
     {
-- 
cgit v1.2.3-70-g09d2