 .gitmodules                          |   6
 Makefile                             |   1
 README.md                            |   4
 embedding_example/.gitignore         |   2
 embedding_example/Makefile           |  20
 embedding_example/main.c             |  93
 embedding_example/run.sh             |   4
 embedding_example/setup_nerv.lua     |  25
 nerv/Makefile                        |  10
 nerv/examples/asr_trainer.lua        |  37
 nerv/examples/mmi_chime3.lua         | 183
 nerv/examples/mpe_chime3.lua         | 186
 nerv/examples/seq_trainer.lua        |  87
 nerv/examples/swb_baseline.lua       |  79
 nerv/examples/swb_baseline_basic.lua | 162
 nerv/init.lua                        |  14
 nerv/io/sgd_buffer.lua               |  50
 nerv/layer/affine.lua                |   6
 nerv/layer/affine_recurrent.lua      |   4
 nerv/layer/bias.lua                  |   4
 nerv/layer/combiner.lua              |   6
 nerv/layer/init.lua                  |   1
 nerv/layer/mse.lua                   |  10
 nerv/layer/sigmoid.lua               |   4
 nerv/layer/softmax.lua               |  35
 nerv/layer/softmax_ce.lua            |   7
 nerv/layer/window.lua                |   4
 nerv/lib/matrix/cukernel.h           |   2
 nerv/lib/matrix/cumatrix.c           |   1
 nerv/lib/matrix/cumatrix.h           |   1
 nerv/lib/matrix/generic/cukernel.cu  |  20
 nerv/lib/matrix/generic/cumatrix.c   |  21
 nerv/lib/matrix/generic/cumatrix.h   |   2
 nerv/lib/matrix/generic/matrix.c     |   5
 nerv/lib/matrix/generic/matrix.h     |   2
 nerv/lib/matrix/mmatrix.c            |  37
 nerv/lib/matrix/mmatrix.h            |   3
 nerv/matrix/generic/cukernel.cu      | 592
 nerv/matrix/generic/cumatrix.c       |  32
 nerv/matrix/init.lua                 |   4
 nerv/matrix/mmatrix.c                |  46
 nerv/nn/layer_dag.lua                |  76
 nerv/nn/layer_repo.lua               |   8
 speech (submodule)                   |   0
 44 files changed, 1174 insertions(+), 722 deletions(-)
diff --git a/.gitmodules b/.gitmodules
index 1432de9..9f556c5 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,12 +1,6 @@
[submodule "luajit-2.0"]
path = luajit-2.0
url = http://luajit.org/git/luajit-2.0.git
-[submodule "nerv-speech"]
- path = nerv-speech
- url = https://github.com/Determinant/nerv-speech.git
-[submodule "speech"]
- path = speech
- url = https://github.com/Determinant/nerv-speech.git
[submodule "luarocks"]
path = luarocks
url = https://github.com/keplerproject/luarocks.git
diff --git a/Makefile b/Makefile
index fa888c3..664a83b 100644
--- a/Makefile
+++ b/Makefile
@@ -9,6 +9,7 @@ luarocks:
install:
cd nerv; $(PREFIX)/bin/luarocks make
speech:
+ cd speech/speech_utils; $(PREFIX)/bin/luarocks make
cd speech/htk_io; $(PREFIX)/bin/luarocks make
clean:
cd nerv && make clean
diff --git a/README.md b/README.md
index 10d531c..8c21bd9 100644
--- a/README.md
+++ b/README.md
@@ -8,10 +8,12 @@ First make sure you have __lua__ and __CUDA__ installed on your computer.
__Nerv__ is currently developed via GitHub. You can download and make __Nerv__ by doing the following:
```
cd ~
-git clone https://github.com/Determinant/nerv.git
+git clone https://github.com/Nerv-SJTU/nerv.git
cd nerv
+git clone https://github.com/Nerv-SJTU/nerv-speech.git speech
git submodule init && git submodule update
make
+make speech
```
The `git submodule` command is for the __luajit__ repository inside __Nerv__.
Now, you can try to run some example scripts.
diff --git a/embedding_example/.gitignore b/embedding_example/.gitignore
new file mode 100644
index 0000000..8e68213
--- /dev/null
+++ b/embedding_example/.gitignore
@@ -0,0 +1,2 @@
+main
+main.o
diff --git a/embedding_example/Makefile b/embedding_example/Makefile
new file mode 100644
index 0000000..73287f4
--- /dev/null
+++ b/embedding_example/Makefile
@@ -0,0 +1,20 @@
+CFLAG += -I ../install/include/luajit-2.0/ -I ../install/include/nerv/
+LDFLAG += -L../install/lib/ -lluajit-5.1 -Wl,-rpath=../install/lib/ -lluaT -lnervcore
+GCC := gcc
+
+.PHONY: FORCE
+
+all: main FORCE
+clean:
+ -rm -f *.o
+ -rm main
+
+FORCE: ../install/bin/luarocks
+ echo "#!/bin/bash" > run.sh
+ $< path >> run.sh
+ echo "./main" >> run.sh
+main: main.o
+ $(GCC) -o $@ $< $(LDFLAG)
+
+main.o: main.c
+ $(GCC) $(CFLAG) -o $@ $< -c
diff --git a/embedding_example/main.c b/embedding_example/main.c
new file mode 100644
index 0000000..8856d58
--- /dev/null
+++ b/embedding_example/main.c
@@ -0,0 +1,93 @@
+#include "lua.h"
+#include "lauxlib.h"
+#include "lualib.h"
+#include "matrix/matrix.h"
+#include "common.h"
+#include "luaT/luaT.h"
+#include <stdio.h>
+
+const char *nerv_matrix_host_float_tname = "nerv.MMatrixFloat";
+const char *input_name = "_nerv_embed_input";
+const char *output_name = "_nerv_embed_output";
+extern Matrix *nerv_matrix_host_float_create(long nrow, long ncol, Status *status);
+extern void nerv_matrix_host_float_data_retain(Matrix *self);
+extern void nerv_matrix_host_float_data_free(Matrix *self, Status *status);
+
+lua_State *L;
+Matrix *input, *output;
+Status status;
+
+void setup_nerv() {
+ L = lua_open();
+ luaL_openlibs(L);
+ luaL_loadfile(L, "setup_nerv.lua");
+ /* network configuration */
+ lua_pushstring(L, "../nerv/examples/swb_baseline.lua");
+ if (lua_pcall(L, 1, LUA_MULTRET, 0))
+ {
+ printf("%s\n", luaL_checkstring(L, 1));
+ exit(1);
+ }
+ /* lua stack now: input width, output width, propagator */
+ input = nerv_matrix_host_float_create(1, luaL_checkinteger(L, 1), &status);
+ NERV_LUA_CHECK_STATUS(L, status);
+ output = nerv_matrix_host_float_create(1, luaL_checkinteger(L, 2), &status);
+ NERV_LUA_CHECK_STATUS(L, status);
+ /* add reference to avoid gc */
+ luaT_pushudata(L, output, nerv_matrix_host_float_tname);
+ luaT_pushudata(L, input, nerv_matrix_host_float_tname);
+ lua_setfield(L, LUA_GLOBALSINDEX, input_name);
+ lua_setfield(L, LUA_GLOBALSINDEX, output_name);
+}
+
+
+void propagate(float for_fun) {
+ int i, j;
+ printf("ok: %d\n", lua_gettop(L));
+ lua_pushvalue(L, 3);
+ /* lua stack now: input width, output width, propagator, propagator */
+ for (i = 0; i < input->nrow; i++) /* nrow is actually 1 */
+ {
+ float *nerv_row = (float *)((char *)input->data.f + i * input->stride);
+ for (j = 0; j < input->ncol; j++)
+ {
+ nerv_row[j] = j * for_fun;
+ }
+ }
+ lua_getfield(L, LUA_GLOBALSINDEX, input_name);
+ lua_getfield(L, LUA_GLOBALSINDEX, output_name);
+ /* lua stack now: input width, output width, propagator, propagator, input, output */
+ if (lua_pcall(L, 2, 0, 0)) /* call propagator with two parameters */
+ {
+ printf("%s\n", luaL_checkstring(L, -1));
+ exit(-1);
+ }
+ /* lua stack now: input width, output width, propagator */
+ printf("## output: %ld %ld ##\n", output->nrow, output->ncol);
+ for (i = 0; i < output->nrow; i++) /* nrow is actually 1 */
+ {
+ float *nerv_row = (float *)((char *)output->data.f + i * output->stride);
+ for (j = 0; j < output->ncol; j++)
+ {
+ printf("%.8f ", nerv_row[j]);
+ }
+ }
+}
+
+void teardown_nerv() {
+ lua_pushnil(L);
+ lua_pushnil(L);
+ lua_setfield(L, LUA_GLOBALSINDEX, input_name);
+ lua_setfield(L, LUA_GLOBALSINDEX, output_name);
+ lua_gc(L, LUA_GCCOLLECT, 0);
+}
+
+int main() {
+ setup_nerv();
+ propagate(1.0);
+ propagate(2.0);
+ propagate(2.0);
+ propagate(3.0);
+ teardown_nerv();
+ return 0;
+}
diff --git a/embedding_example/run.sh b/embedding_example/run.sh
new file mode 100755
index 0000000..e919263
--- /dev/null
+++ b/embedding_example/run.sh
@@ -0,0 +1,4 @@
+#!/bin/bash
+export LUA_PATH='/home/slhome/mfy43/.luarocks/share/lua/5.1/?.lua;/home/slhome/mfy43/.luarocks/share/lua/5.1/?/init.lua;/home/slhome/mfy43/nerv/install/share/lua/5.1/?.lua;/home/slhome/mfy43/nerv/install/share/lua/5.1/?/init.lua;./?.lua;/usr/local/share/luajit-2.0.4/?.lua;/usr/local/share/lua/5.1/?.lua;/usr/local/share/lua/5.1/?/init.lua'
+export LUA_CPATH='/home/slhome/mfy43/.luarocks/lib/lua/5.1/?.so;/home/slhome/mfy43/nerv/install/lib/lua/5.1/?.so;./?.so;/usr/local/lib/lua/5.1/?.so;/usr/local/lib/lua/5.1/loadall.so'
+./main
diff --git a/embedding_example/setup_nerv.lua b/embedding_example/setup_nerv.lua
new file mode 100644
index 0000000..d80c306
--- /dev/null
+++ b/embedding_example/setup_nerv.lua
@@ -0,0 +1,25 @@
+local k,l,_=pcall(require,"luarocks.loader") _=k and l.add_context("nerv","scm-1")
+require 'nerv'
+local arg = {...}
+dofile(arg[1])
+local param_repo = nerv.ParamRepo()
+param_repo:import(gconf.initialized_param, nil, gconf)
+local layer_repo = make_layer_repo(param_repo)
+local network = get_decode_network(layer_repo)
+local global_transf = get_global_transf(layer_repo)
+local batch_size = 1
+network:init(batch_size)
+
+function propagator(input, output)
+ local transformed = nerv.speech_utils.global_transf(
+ gconf.cumat_type.new_from_host(input),
+ global_transf, 0, 0, gconf) -- preprocessing
+ local gpu_input = transformed
+ local gpu_output = nerv.CuMatrixFloat(output:nrow(), output:ncol())
+ network:propagate({gpu_input}, {gpu_output})
+ gpu_output:copy_toh(output)
+ -- collect garbage in-time to save GPU memory
+ collectgarbage("collect")
+end
+
+return network.dim_in[1], network.dim_out[1], propagator
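
The same propagator can also be driven from Lua without the C host in main.c. Below is a hypothetical pure-Lua driver for illustration only: the paths, the use of `perm_gen` as a stand-in for real features, and element reads via `output[0][j]` are assumptions, not part of this commit.

```
-- hypothetical pure-Lua driver mirroring embedding_example/main.c;
-- run with the same LUA_PATH/LUA_CPATH that run.sh exports
local setup = assert(loadfile("setup_nerv.lua"))
-- pass the network configuration, as main.c does via the Lua stack
local in_dim, out_dim, propagator = setup("../nerv/examples/swb_baseline.lua")
-- perm_gen gives a defined 1 x in_dim float row (0..in_dim-1 shuffled),
-- standing in for the dummy features main.c fills by hand
local input  = nerv.MMatrixFloat.perm_gen(in_dim)
local output = nerv.MMatrixFloat(1, out_dim)
propagator(input, output)
for j = 0, out_dim - 1 do
    nerv.printf("%.8f ", output[0][j])  -- element reads on host matrices assumed supported
end
nerv.printf("\n")
```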
diff --git a/nerv/Makefile b/nerv/Makefile
index 022e2fb..b449f82 100644
--- a/nerv/Makefile
+++ b/nerv/Makefile
@@ -30,14 +30,14 @@ LUAT_OBJS := $(addprefix $(OBJ_DIR)/,$(LUAT_OBJS))
OBJS := $(CORE_OBJS) $(NERV_OBJS) $(LUAT_OBJS)
LIBS := $(INST_LIBDIR)/libnerv.so $(LIB_PATH)/libnervcore.so $(LIB_PATH)/libluaT.so
LUA_LIBS := matrix/init.lua io/init.lua init.lua \
- layer/init.lua layer/affine.lua layer/sigmoid.lua layer/softmax_ce.lua \
- layer/window.lua layer/bias.lua layer/combiner.lua layer/mse.lua layer/affine_recurrent.lua\
+ layer/init.lua layer/affine.lua layer/sigmoid.lua layer/softmax_ce.lua layer/softmax.lua \
+ layer/window.lua layer/bias.lua layer/combiner.lua layer/mse.lua layer/affine_recurrent.lua \
nn/init.lua nn/layer_repo.lua nn/param_repo.lua nn/layer_dag.lua \
io/sgd_buffer.lua
INCLUDE := -I $(LUA_INCDIR) -DLUA_USE_APICHECK
-CUDA_BASE := /usr/local/cuda-6.5
-#CUDA_BASE := /usr/local/cuda-5.0
+#CUDA_BASE := /usr/local/cuda-7.0
+CUDA_BASE := /usr/local/cuda
CUDA_INCLUDE := -I $(CUDA_BASE)/include/
INCLUDE += $(CUDA_INCLUDE)
@@ -66,7 +66,7 @@ $(LIB_PATH)/libluaT.so: $(LUAT_OBJS)
$(INST_LIBDIR)/libnerv.so: $(NERV_OBJS) $(LIB_PATH)/libnervcore.so $(LIB_PATH)/libluaT.so
gcc -shared -o $@ $(NERV_OBJS) $(LDFLAGS) -Wl,-rpath=$(LIB_PATH) -L$(LIB_PATH) -lnervcore -lluaT
-$(OBJ_DIR)/matrix/cumatrix.o: matrix/generic/cumatrix.c matrix/generic/matrix.c matrix/generic/cukernel.cu
+$(OBJ_DIR)/matrix/cumatrix.o: matrix/generic/cumatrix.c matrix/generic/matrix.c
$(OBJ_DIR)/matrix/mmatrix.o: matrix/generic/mmatrix.c matrix/generic/matrix.c
$(OBJ_DIR)/lib/matrix/cumatrix.o: lib/matrix/generic/cumatrix.c lib/matrix/generic/matrix.c lib/matrix/generic/cukernel.cu
diff --git a/nerv/examples/asr_trainer.lua b/nerv/examples/asr_trainer.lua
index 4fa4096..69cfeed 100644
--- a/nerv/examples/asr_trainer.lua
+++ b/nerv/examples/asr_trainer.lua
@@ -1,9 +1,9 @@
function build_trainer(ifname)
local param_repo = nerv.ParamRepo()
param_repo:import(ifname, nil, gconf)
- local sublayer_repo = make_sublayer_repo(param_repo)
- local layer_repo = make_layer_repo(sublayer_repo, param_repo)
+ local layer_repo = make_layer_repo(param_repo)
local network = get_network(layer_repo)
+ local global_transf = get_global_transf(layer_repo)
local input_order = get_input_order()
local iterative_trainer = function (prefix, scp_file, bp)
gconf.randomize = bp
@@ -12,28 +12,41 @@ function build_trainer(ifname)
-- initialize the network
network:init(gconf.batch_size)
gconf.cnt = 0
- err_input = {nerv.CuMatrixFloat(256, 1)}
+ err_input = {nerv.CuMatrixFloat(gconf.batch_size, 1)}
err_input[1]:fill(1)
for data in buffer.get_data, buffer do
-- print stat periodically
gconf.cnt = gconf.cnt + 1
if gconf.cnt == 1000 then
- print_stat(sublayer_repo)
+ print_stat(layer_repo)
nerv.CuMatrix.print_profile()
nerv.CuMatrix.clear_profile()
gconf.cnt = 0
-- break
end
local input = {}
--- if gconf.cnt == 100 then break end
- for i, id in ipairs(input_order) do
+-- if gconf.cnt == 1000 then break end
+ for i, e in ipairs(input_order) do
+ local id = e.id
if data[id] == nil then
nerv.error("input data %s not found", id)
end
- table.insert(input, data[id])
+ local transformed
+ if e.global_transf then
+ transformed = nerv.speech_utils.global_transf(data[id],
+ global_transf,
+ gconf.frm_ext or 0, 0,
+ gconf)
+ else
+ transformed = data[id]
+ end
+ table.insert(input, transformed)
+ end
+ local output = {nerv.CuMatrixFloat(gconf.batch_size, 1)}
+ err_output = {}
+ for i = 1, #input do
+ table.insert(err_output, input[i]:create())
end
- local output = {nerv.CuMatrixFloat(256, 1)}
- err_output = {input[1]:create()}
network:propagate(input, output)
if bp then
network:back_propagate(err_input, err_output, input, output)
@@ -42,16 +55,16 @@ function build_trainer(ifname)
-- collect garbage in-time to save GPU memory
collectgarbage("collect")
end
- print_stat(sublayer_repo)
+ print_stat(layer_repo)
nerv.CuMatrix.print_profile()
nerv.CuMatrix.clear_profile()
if (not bp) and prefix ~= nil then
nerv.info("writing back...")
local fname = string.format("%s_cv%.3f.nerv",
- prefix, get_accuracy(sublayer_repo))
+ prefix, get_accuracy(layer_repo))
network:get_params():export(fname, nil)
end
- return get_accuracy(sublayer_repo)
+ return get_accuracy(layer_repo)
end
return iterative_trainer
end
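
Under the revised trainer, each `get_input_order` entry names a reader stream and states whether it goes through the global transform before being fed to the network. A minimal config-side sketch of that contract (the ids are the ones used by swb_baseline.lua further down; other configs would substitute their own):

```
-- hypothetical config fragment for the revised input interface
function get_input_order()
    -- the trainer looks up data[e.id] for each entry and, when
    -- e.global_transf is set, runs it through get_global_transf(layer_repo)
    return {{id = "main_scp", global_transf = true},   -- acoustic features
            {id = "phone_state"}}                      -- frame labels, left untouched
end
```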
diff --git a/nerv/examples/mmi_chime3.lua b/nerv/examples/mmi_chime3.lua
new file mode 100644
index 0000000..6ac7f28
--- /dev/null
+++ b/nerv/examples/mmi_chime3.lua
@@ -0,0 +1,183 @@
+require 'kaldi_io'
+require 'kaldi_seq'
+gconf = {lrate = 0.00001, wcost = 0, momentum = 0.0,
+ cumat_type = nerv.CuMatrixFloat,
+ mmat_type = nerv.MMatrixFloat,
+ frm_ext = 5,
+ tr_scp = "ark,o:/slfs6/users/ymz09/kaldi/src/featbin/copy-feats scp:/slfs5/users/ymz09/chime/baseline/ASR/exp/tri4a_dnn_tr05_multi_enhanced_mmi/train.scp ark:- |",
+ initialized_param = {"/slfs6/users/ymz09/nerv-project/nerv/nerv-speech/kaldi_seq/test/chime3_init_mmi.nerv",
+ "/slfs6/users/ymz09/nerv-project/nerv/nerv-speech/kaldi_seq/test/chime3_global_transf_mmi.nerv"},
+ debug = false}
+
+function make_layer_repo(param_repo)
+ local layer_repo = nerv.LayerRepo(
+ {
+ -- global transf
+ ["nerv.BiasLayer"] =
+ {
+ blayer1 = {{bias = "bias1"}, {dim_in = {440}, dim_out = {440}}},
+ blayer2 = {{bias = "bias2"}, {dim_in = {440}, dim_out = {440}}}
+ },
+ ["nerv.WindowLayer"] =
+ {
+ wlayer1 = {{window = "window1"}, {dim_in = {440}, dim_out = {440}}},
+ wlayer2 = {{window = "window2"}, {dim_in = {440}, dim_out = {440}}}
+ },
+ -- biased linearity
+ ["nerv.AffineLayer"] =
+ {
+ affine0 = {{ltp = "affine0_ltp", bp = "affine0_bp"},
+ {dim_in = {440}, dim_out = {2048}}},
+ affine1 = {{ltp = "affine1_ltp", bp = "affine1_bp"},
+ {dim_in = {2048}, dim_out = {2048}}},
+ affine2 = {{ltp = "affine2_ltp", bp = "affine2_bp"},
+ {dim_in = {2048}, dim_out = {2048}}},
+ affine3 = {{ltp = "affine3_ltp", bp = "affine3_bp"},
+ {dim_in = {2048}, dim_out = {2048}}},
+ affine4 = {{ltp = "affine4_ltp", bp = "affine4_bp"},
+ {dim_in = {2048}, dim_out = {2048}}},
+ affine5 = {{ltp = "affine5_ltp", bp = "affine5_bp"},
+ {dim_in = {2048}, dim_out = {2048}}},
+ affine6 = {{ltp = "affine6_ltp", bp = "affine6_bp"},
+ {dim_in = {2048}, dim_out = {2048}}},
+ affine7 = {{ltp = "affine7_ltp", bp = "affine7_bp"},
+ {dim_in = {2048}, dim_out = {2011}}}
+ },
+ ["nerv.SigmoidLayer"] =
+ {
+ sigmoid0 = {{}, {dim_in = {2048}, dim_out = {2048}}},
+ sigmoid1 = {{}, {dim_in = {2048}, dim_out = {2048}}},
+ sigmoid2 = {{}, {dim_in = {2048}, dim_out = {2048}}},
+ sigmoid3 = {{}, {dim_in = {2048}, dim_out = {2048}}},
+ sigmoid4 = {{}, {dim_in = {2048}, dim_out = {2048}}},
+ sigmoid5 = {{}, {dim_in = {2048}, dim_out = {2048}}},
+ sigmoid6 = {{}, {dim_in = {2048}, dim_out = {2048}}}
+ },
+ ["nerv.MMILayer"] =
+ {
+ mmi_crit = {{}, {dim_in = {2011, -1}, dim_out = {1},
+ cmd = {
+ arg = "--class-frame-counts=/slfs5/users/ymz09/chime/baseline/ASR/exp/tri4a_dnn_tr05_multi_enhanced/ali_train_pdf.counts --acoustic-scale=0.1 --lm-scale=1.0 --learn-rate=0.00001 --drop-frames=true --verbose=1",
+ mdl = "/slfs5/users/ymz09/chime/baseline/ASR/exp/tri4a_dnn_tr05_multi_enhanced_ali/final.mdl",
+ lat = "scp:/slfs5/users/ymz09/chime/baseline/ASR/exp/tri4a_dnn_tr05_multi_enhanced_denlats/lat.scp",
+ ali = "ark:gunzip -c /slfs5/users/ymz09/chime/baseline/ASR/exp/tri4a_dnn_tr05_multi_enhanced_ali/ali.*.gz |"
+ }
+ }
+ }
+ },
+ ["nerv.SoftmaxLayer"] = -- softmax for decode output
+ {
+ softmax = {{}, {dim_in = {2011}, dim_out = {2011}}}
+ }
+ }, param_repo, gconf)
+
+ layer_repo:add_layers(
+ {
+ ["nerv.DAGLayer"] =
+ {
+ global_transf = {{}, {
+ dim_in = {440}, dim_out = {440},
+ sub_layers = layer_repo,
+ connections = {
+ ["<input>[1]"] = "blayer1[1]",
+ ["blayer1[1]"] = "wlayer1[1]",
+ ["wlayer1[1]"] = "blayer2[1]",
+ ["blayer2[1]"] = "wlayer2[1]",
+ ["wlayer2[1]"] = "<output>[1]"
+ }
+ }},
+ main = {{}, {
+ dim_in = {440}, dim_out = {2011},
+ sub_layers = layer_repo,
+ connections = {
+ ["<input>[1]"] = "affine0[1]",
+ ["affine0[1]"] = "sigmoid0[1]",
+ ["sigmoid0[1]"] = "affine1[1]",
+ ["affine1[1]"] = "sigmoid1[1]",
+ ["sigmoid1[1]"] = "affine2[1]",
+ ["affine2[1]"] = "sigmoid2[1]",
+ ["sigmoid2[1]"] = "affine3[1]",
+ ["affine3[1]"] = "sigmoid3[1]",
+ ["sigmoid3[1]"] = "affine4[1]",
+ ["affine4[1]"] = "sigmoid4[1]",
+ ["sigmoid4[1]"] = "affine5[1]",
+ ["affine5[1]"] = "sigmoid5[1]",
+ ["sigmoid5[1]"] = "affine6[1]",
+ ["affine6[1]"] = "sigmoid6[1]",
+ ["sigmoid6[1]"] = "affine7[1]",
+ ["affine7[1]"] = "<output>[1]"
+ }
+ }}
+ }
+ }, param_repo, gconf)
+
+ layer_repo:add_layers(
+ {
+ ["nerv.DAGLayer"] =
+ {
+ mmi_output = {{}, {
+ dim_in = {440, -1}, dim_out = {1},
+ sub_layers = layer_repo,
+ connections = {
+ ["<input>[1]"] = "main[1]",
+ ["main[1]"] = "mmi_crit[1]",
+ ["<input>[2]"] = "mmi_crit[2]",
+ ["mmi_crit[1]"] = "<output>[1]"
+ }
+ }},
+ softmax_output = {{}, {
+ dim_in = {440}, dim_out = {2011},
+ sub_layers = layer_repo,
+ connections = {
+ ["<input>[1]"] = "main[1]",
+ ["main[1]"] = "softmax[1]",
+ ["softmax[1]"] = "<output>[1]"
+ }
+ }}
+ }
+ }, param_repo, gconf)
+
+ return layer_repo
+end
+
+function get_network(layer_repo)
+ return layer_repo:get_layer("mmi_output")
+end
+
+function get_decode_network(layer_repo)
+ return layer_repo:get_layer("softmax_output")
+end
+
+function get_global_transf(layer_repo)
+ return layer_repo:get_layer("global_transf")
+end
+
+function make_readers(feature_rspecifier, layer_repo)
+ return {
+ {reader = nerv.KaldiReader(gconf,
+ {
+ id = "main_scp",
+ feature_rspecifier = feature_rspecifier,
+ frm_ext = gconf.frm_ext,
+ global_transf = layer_repo:get_layer("global_transf"),
+ mlfs = {}
+ })
+ }
+ }
+end
+
+function get_input_order()
+ return {{id = "main_scp", global_transf = true},
+ {id = "key"}}
+end
+
+function get_accuracy(layer_repo)
+ return 0
+end
+
+function print_stat(layer_repo)
+ local mmi_crit = layer_repo:get_layer("mmi_crit")
+ nerv.info("*** training stat begin ***")
+ nerv.printf("frames:\t\t\t%d\n", mmi_crit.total_frames)
+ nerv.info("*** training stat end ***")
+end
diff --git a/nerv/examples/mpe_chime3.lua b/nerv/examples/mpe_chime3.lua
new file mode 100644
index 0000000..ec095b0
--- /dev/null
+++ b/nerv/examples/mpe_chime3.lua
@@ -0,0 +1,186 @@
+require 'kaldi_io'
+require 'kaldi_seq'
+gconf = {lrate = 0.00001, wcost = 0, momentum = 0.0,
+ cumat_type = nerv.CuMatrixFloat,
+ mmat_type = nerv.MMatrixFloat,
+ frm_ext = 5,
+ tr_scp = "ark,s,cs:/slfs6/users/ymz09/kaldi/src/featbin/copy-feats scp:/slfs5/users/ymz09/chime/baseline/ASR/exp/tri4a_dnn_tr05_multi_enhanced_smbr/train.scp ark:- |",
+ initialized_param = {"/slfs6/users/ymz09/nerv-project/nerv/nerv-speech/kaldi_seq/test/chime3_init.nerv",
+ "/slfs6/users/ymz09/nerv-project/nerv/nerv-speech/kaldi_seq/test/chime3_global_transf.nerv"},
+ debug = false}
+
+function make_layer_repo(param_repo)
+ local layer_repo = nerv.LayerRepo(
+ {
+ -- global transf
+ ["nerv.BiasLayer"] =
+ {
+ blayer1 = {{bias = "bias1"}, {dim_in = {440}, dim_out = {440}}},
+ blayer2 = {{bias = "bias2"}, {dim_in = {440}, dim_out = {440}}}
+ },
+ ["nerv.WindowLayer"] =
+ {
+ wlayer1 = {{window = "window1"}, {dim_in = {440}, dim_out = {440}}},
+ wlayer2 = {{window = "window2"}, {dim_in = {440}, dim_out = {440}}}
+ },
+ -- biased linearity
+ ["nerv.AffineLayer"] =
+ {
+ affine0 = {{ltp = "affine0_ltp", bp = "affine0_bp"},
+ {dim_in = {440}, dim_out = {2048}}},
+ affine1 = {{ltp = "affine1_ltp", bp = "affine1_bp"},
+ {dim_in = {2048}, dim_out = {2048}}},
+ affine2 = {{ltp = "affine2_ltp", bp = "affine2_bp"},
+ {dim_in = {2048}, dim_out = {2048}}},
+ affine3 = {{ltp = "affine3_ltp", bp = "affine3_bp"},
+ {dim_in = {2048}, dim_out = {2048}}},
+ affine4 = {{ltp = "affine4_ltp", bp = "affine4_bp"},
+ {dim_in = {2048}, dim_out = {2048}}},
+ affine5 = {{ltp = "affine5_ltp", bp = "affine5_bp"},
+ {dim_in = {2048}, dim_out = {2048}}},
+ affine6 = {{ltp = "affine6_ltp", bp = "affine6_bp"},
+ {dim_in = {2048}, dim_out = {2048}}},
+ affine7 = {{ltp = "affine7_ltp", bp = "affine7_bp"},
+ {dim_in = {2048}, dim_out = {2011}}}
+ },
+ ["nerv.SigmoidLayer"] =
+ {
+ sigmoid0 = {{}, {dim_in = {2048}, dim_out = {2048}}},
+ sigmoid1 = {{}, {dim_in = {2048}, dim_out = {2048}}},
+ sigmoid2 = {{}, {dim_in = {2048}, dim_out = {2048}}},
+ sigmoid3 = {{}, {dim_in = {2048}, dim_out = {2048}}},
+ sigmoid4 = {{}, {dim_in = {2048}, dim_out = {2048}}},
+ sigmoid5 = {{}, {dim_in = {2048}, dim_out = {2048}}},
+ sigmoid6 = {{}, {dim_in = {2048}, dim_out = {2048}}}
+ },
+ ["nerv.MPELayer"] =
+ {
+ mpe_crit = {{}, {dim_in = {2011, -1}, dim_out = {1},
+ cmd = {
+ arg = "--class-frame-counts=/slfs5/users/ymz09/chime/baseline/ASR/exp/tri4a_dnn_tr05_multi_enhanced/ali_train_pdf.counts --acoustic-scale=0.1 --lm-scale=1.0 --learn-rate=0.00001 --do-smbr=true --verbose=1",
+ mdl = "/slfs5/users/ymz09/chime/baseline/ASR/exp/tri4a_dnn_tr05_multi_enhanced_ali/final.mdl",
+ lat = "scp:/slfs5/users/ymz09/chime/baseline/ASR/exp/tri4a_dnn_tr05_multi_enhanced_denlats/lat.scp",
+ ali = "ark:gunzip -c /slfs5/users/ymz09/chime/baseline/ASR/exp/tri4a_dnn_tr05_multi_enhanced_ali/ali.*.gz |"
+ }
+ }
+ }
+ },
+ ["nerv.SoftmaxLayer"] = -- softmax for decode output
+ {
+ softmax = {{}, {dim_in = {2011}, dim_out = {2011}}}
+ }
+ }, param_repo, gconf)
+
+ layer_repo:add_layers(
+ {
+ ["nerv.DAGLayer"] =
+ {
+ global_transf = {{}, {
+ dim_in = {440}, dim_out = {440},
+ sub_layers = layer_repo,
+ connections = {
+ ["<input>[1]"] = "blayer1[1]",
+ ["blayer1[1]"] = "wlayer1[1]",
+ ["wlayer1[1]"] = "blayer2[1]",
+ ["blayer2[1]"] = "wlayer2[1]",
+ ["wlayer2[1]"] = "<output>[1]"
+ }
+ }},
+ main = {{}, {
+ dim_in = {440}, dim_out = {2011},
+ sub_layers = layer_repo,
+ connections = {
+ ["<input>[1]"] = "affine0[1]",
+ ["affine0[1]"] = "sigmoid0[1]",
+ ["sigmoid0[1]"] = "affine1[1]",
+ ["affine1[1]"] = "sigmoid1[1]",
+ ["sigmoid1[1]"] = "affine2[1]",
+ ["affine2[1]"] = "sigmoid2[1]",
+ ["sigmoid2[1]"] = "affine3[1]",
+ ["affine3[1]"] = "sigmoid3[1]",
+ ["sigmoid3[1]"] = "affine4[1]",
+ ["affine4[1]"] = "sigmoid4[1]",
+ ["sigmoid4[1]"] = "affine5[1]",
+ ["affine5[1]"] = "sigmoid5[1]",
+ ["sigmoid5[1]"] = "affine6[1]",
+ ["affine6[1]"] = "sigmoid6[1]",
+ ["sigmoid6[1]"] = "affine7[1]",
+ ["affine7[1]"] = "<output>[1]"
+ }
+ }}
+ }
+ }, param_repo, gconf)
+
+ layer_repo:add_layers(
+ {
+ ["nerv.DAGLayer"] =
+ {
+ mpe_output = {{}, {
+ dim_in = {440, -1}, dim_out = {1},
+ sub_layers = layer_repo,
+ connections = {
+ ["<input>[1]"] = "main[1]",
+ ["main[1]"] = "mpe_crit[1]",
+ ["<input>[2]"] = "mpe_crit[2]",
+ ["mpe_crit[1]"] = "<output>[1]"
+ }
+ }},
+ softmax_output = {{}, {
+ dim_in = {440}, dim_out = {2011},
+ sub_layers = layer_repo,
+ connections = {
+ ["<input>[1]"] = "main[1]",
+ ["main[1]"] = "softmax[1]",
+ ["softmax[1]"] = "<output>[1]"
+ }
+ }}
+ }
+ }, param_repo, gconf)
+
+ return layer_repo
+end
+
+function get_network(layer_repo)
+ return layer_repo:get_layer("mpe_output")
+end
+
+function get_decode_network(layer_repo)
+ return layer_repo:get_layer("softmax_output")
+end
+
+function get_global_transf(layer_repo)
+ return layer_repo:get_layer("global_transf")
+end
+
+function make_readers(feature_rspecifier, layer_repo)
+ return {
+ {reader = nerv.KaldiReader(gconf,
+ {
+ id = "main_scp",
+ feature_rspecifier = feature_rspecifier,
+ frm_ext = gconf.frm_ext,
+ global_transf = layer_repo:get_layer("global_transf"),
+ mlfs = {}
+ })
+ }
+ }
+end
+
+function get_input_order()
+ return {{id = "main_scp", global_transf = true},
+ {id = "key"}}
+end
+
+function get_accuracy(layer_repo)
+ local mpe_crit = layer_repo:get_layer("mpe_crit")
+ return mpe_crit.total_correct / mpe_crit.total_frames * 100
+end
+
+function print_stat(layer_repo)
+ local mpe_crit = layer_repo:get_layer("mpe_crit")
+ nerv.info("*** training stat begin ***")
+ nerv.printf("correct:\t\t%d\n", mpe_crit.total_correct)
+ nerv.printf("frames:\t\t\t%d\n", mpe_crit.total_frames)
+ nerv.printf("accuracy:\t\t%.3f%%\n", get_accuracy(layer_repo))
+ nerv.info("*** training stat end ***")
+end
diff --git a/nerv/examples/seq_trainer.lua b/nerv/examples/seq_trainer.lua
new file mode 100644
index 0000000..b8ed3eb
--- /dev/null
+++ b/nerv/examples/seq_trainer.lua
@@ -0,0 +1,87 @@
+function build_trainer(ifname)
+ local param_repo = nerv.ParamRepo()
+ param_repo:import(ifname, nil, gconf)
+ local layer_repo = make_layer_repo(param_repo)
+ local network = get_network(layer_repo)
+ local global_transf = get_global_transf(layer_repo)
+ local input_order = get_input_order()
+ local iterative_trainer = function (prefix, scp_file, bp)
+ local readers = make_readers(scp_file, layer_repo)
+ -- initialize the network
+ network:init(1)
+ gconf.cnt = 0
+ for ri = 1, #readers, 1 do
+ while true do
+ local data = readers[ri].reader:get_data()
+ if data == nil then
+ break
+ end
+ -- print stat periodically
+ gconf.cnt = gconf.cnt + 1
+ if gconf.cnt == 1000 then
+ print_stat(layer_repo)
+ nerv.CuMatrix.print_profile()
+ nerv.CuMatrix.clear_profile()
+ gconf.cnt = 0
+ -- break
+ end
+ local input = {}
+ -- if gconf.cnt == 1000 then break end
+ for i, e in ipairs(input_order) do
+ local id = e.id
+ if data[id] == nil then
+ nerv.error("input data %s not found", id)
+ end
+ local transformed
+ if e.global_transf then
+ local batch = gconf.cumat_type(data[id]:nrow(), data[id]:ncol())
+ batch:copy_fromh(data[id])
+ transformed = nerv.speech_utils.global_transf(batch,
+ global_transf,
+ gconf.frm_ext or 0, 0,
+ gconf)
+ else
+ transformed = data[id]
+ end
+ table.insert(input, transformed)
+ end
+ err_output = {input[1]:create()}
+ network:batch_resize(input[1]:nrow())
+ if network:propagate(input, {{}}) == true then
+ network:back_propagate({{}}, err_output, input, {{}})
+ gconf.batch_size = 1.0 - gconf.momentum -- important!!!
+ network:update({{}}, input, {{}})
+ end
+ -- collect garbage in-time to save GPU memory
+ collectgarbage("collect")
+ end
+ end
+ print_stat(layer_repo)
+ nerv.CuMatrix.print_profile()
+ nerv.CuMatrix.clear_profile()
+ if prefix ~= nil then
+ nerv.info("writing back...")
+ local fname = string.format("%s_tr%.3f.nerv",
+ prefix, get_accuracy(layer_repo))
+ network:get_params():export(fname, nil)
+ end
+ return get_accuracy(layer_repo)
+ end
+ return iterative_trainer
+end
+
+dofile(arg[1])
+
+local pf0 = gconf.initialized_param
+local trainer = build_trainer(pf0)
+
+local i = 1
+nerv.info("[NN] begin iteration %d with lrate = %.6f", i, gconf.lrate)
+local accu_tr = trainer(string.format("%s_%s_iter_%d_lr%f",
+string.gsub(
+(string.gsub(pf0[1], "(.*/)(.*)", "%2")),
+"(.*)%..*", "%1"),
+os.date("%Y%m%d%H%M%S"),
+i, gconf.lrate), gconf.tr_scp, true)
+nerv.info("[TR] training set %d: %.3f", i, accu_tr)
+
diff --git a/nerv/examples/swb_baseline.lua b/nerv/examples/swb_baseline.lua
index 7783f2a..8f72200 100644
--- a/nerv/examples/swb_baseline.lua
+++ b/nerv/examples/swb_baseline.lua
@@ -2,7 +2,9 @@ require 'htk_io'
gconf = {lrate = 0.8, wcost = 1e-6, momentum = 0.9,
cumat_type = nerv.CuMatrixFloat,
mmat_type = nerv.MMatrixFloat,
+ direct_update = true,
frm_ext = 5,
+ frm_trim = 5,
tr_scp = "/slfs1/users/mfy43/swb_ivec/train_bp.scp",
cv_scp = "/slfs1/users/mfy43/swb_ivec/train_cv.scp",
htk_conf = "/slfs1/users/mfy43/swb_ivec/plp_0_d_a.conf",
@@ -10,8 +12,8 @@ gconf = {lrate = 0.8, wcost = 1e-6, momentum = 0.9,
"/slfs1/users/mfy43/swb_global_transf.nerv"},
debug = false}
-function make_sublayer_repo(param_repo)
- return nerv.LayerRepo(
+function make_layer_repo(param_repo)
+ local layer_repo = nerv.LayerRepo(
{
-- global transf
["nerv.BiasLayer"] =
@@ -54,21 +56,23 @@ function make_sublayer_repo(param_repo)
sigmoid5 = {{}, {dim_in = {2048}, dim_out = {2048}}},
sigmoid6 = {{}, {dim_in = {2048}, dim_out = {2048}}}
},
- ["nerv.SoftmaxCELayer"] =
+ ["nerv.SoftmaxCELayer"] = -- softmax + ce criterion layer for finetune output
{
ce_crit = {{}, {dim_in = {3001, 1}, dim_out = {1}, compressed = true}}
+ },
+ ["nerv.SoftmaxLayer"] = -- softmax for decode output
+ {
+ softmax = {{}, {dim_in = {3001}, dim_out = {3001}}}
}
}, param_repo, gconf)
-end
-function make_layer_repo(sublayer_repo, param_repo)
- return nerv.LayerRepo(
+ layer_repo:add_layers(
{
["nerv.DAGLayer"] =
{
global_transf = {{}, {
dim_in = {429}, dim_out = {429},
- sub_layers = sublayer_repo,
+ sub_layers = layer_repo,
connections = {
["<input>[1]"] = "blayer1[1]",
["blayer1[1]"] = "wlayer1[1]",
@@ -78,8 +82,8 @@ function make_layer_repo(sublayer_repo, param_repo)
}
}},
main = {{}, {
- dim_in = {429, 1}, dim_out = {1},
- sub_layers = sublayer_repo,
+ dim_in = {429}, dim_out = {3001},
+ sub_layers = layer_repo,
connections = {
["<input>[1]"] = "affine0[1]",
["affine0[1]"] = "sigmoid0[1]",
@@ -96,17 +100,51 @@ function make_layer_repo(sublayer_repo, param_repo)
["sigmoid5[1]"] = "affine6[1]",
["affine6[1]"] = "sigmoid6[1]",
["sigmoid6[1]"] = "affine7[1]",
- ["affine7[1]"] = "ce_crit[1]",
+ ["affine7[1]"] = "<output>[1]"
+ }
+ }}
+ }
+ }, param_repo, gconf)
+
+ layer_repo:add_layers(
+ {
+ ["nerv.DAGLayer"] =
+ {
+ ce_output = {{}, {
+ dim_in = {429, 1}, dim_out = {1},
+ sub_layers = layer_repo,
+ connections = {
+ ["<input>[1]"] = "main[1]",
+ ["main[1]"] = "ce_crit[1]",
["<input>[2]"] = "ce_crit[2]",
["ce_crit[1]"] = "<output>[1]"
}
+ }},
+ softmax_output = {{}, {
+ dim_in = {429}, dim_out = {3001},
+ sub_layers = layer_repo,
+ connections = {
+ ["<input>[1]"] = "main[1]",
+ ["main[1]"] = "softmax[1]",
+ ["softmax[1]"] = "<output>[1]"
+ }
}}
}
}, param_repo, gconf)
+
+ return layer_repo
end
function get_network(layer_repo)
- return layer_repo:get_layer("main")
+ return layer_repo:get_layer("ce_output")
+end
+
+function get_decode_network(layer_repo)
+ return layer_repo:get_layer("softmax_output")
+end
+
+function get_global_transf(layer_repo)
+ return layer_repo:get_layer("global_transf")
end
function make_readers(scp_file, layer_repo)
@@ -125,8 +163,7 @@ function make_readers(scp_file, layer_repo)
dir = "*/",
ext = "lab"
}
- },
- global_transf = layer_repo:get_layer("global_transf")
+ }
}),
data = {main_scp = 429, phone_state = 1}}
}
@@ -137,26 +174,28 @@ function make_buffer(readers)
{
buffer_size = gconf.buffer_size,
randomize = gconf.randomize,
- readers = readers
+ readers = readers,
+ use_gpu = true
})
end
function get_input_order()
- return {"main_scp", "phone_state"}
+ return {{id = "main_scp", global_transf = true},
+ {id = "phone_state"}}
end
-function get_accuracy(sublayer_repo)
- local ce_crit = sublayer_repo:get_layer("ce_crit")
+function get_accuracy(layer_repo)
+ local ce_crit = layer_repo:get_layer("ce_crit")
return ce_crit.total_correct / ce_crit.total_frames * 100
end
-function print_stat(sublayer_repo)
- local ce_crit = sublayer_repo:get_layer("ce_crit")
+function print_stat(layer_repo)
+ local ce_crit = layer_repo:get_layer("ce_crit")
nerv.info("*** training stat begin ***")
nerv.printf("cross entropy:\t\t%.8f\n", ce_crit.total_ce)
nerv.printf("correct:\t\t%d\n", ce_crit.total_correct)
nerv.printf("frames:\t\t\t%d\n", ce_crit.total_frames)
nerv.printf("err/frm:\t\t%.8f\n", ce_crit.total_ce / ce_crit.total_frames)
- nerv.printf("accuracy:\t\t%.3f%%\n", get_accuracy(sublayer_repo))
+ nerv.printf("accuracy:\t\t%.3f%%\n", get_accuracy(layer_repo))
nerv.info("*** training stat end ***")
end
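
With the repo now split into a `ce_output` graph for training and a `softmax_output` graph for decoding, a caller selects the network by role. A minimal sketch, assuming the NERV runtime is already on the package path and `gconf.batch_size` is set as asr_trainer.lua expects:

```
-- hypothetical: picking a graph by role from this config
dofile("swb_baseline.lua")                          -- defines gconf, make_layer_repo, ...
local param_repo = nerv.ParamRepo()
param_repo:import(gconf.initialized_param, nil, gconf)
local layer_repo = make_layer_repo(param_repo)
local train_net  = get_network(layer_repo)          -- "ce_output": main net + softmax-CE criterion
local decode_net = get_decode_network(layer_repo)   -- "softmax_output": main net + plain softmax
local transf     = get_global_transf(layer_repo)    -- shared feature normalization DAG
train_net:init(gconf.batch_size)                    -- batch_size assumed set in gconf
```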
diff --git a/nerv/examples/swb_baseline_basic.lua b/nerv/examples/swb_baseline_basic.lua
new file mode 100644
index 0000000..71f04a3
--- /dev/null
+++ b/nerv/examples/swb_baseline_basic.lua
@@ -0,0 +1,162 @@
+require 'htk_io'
+gconf = {lrate = 0.8, wcost = 1e-6, momentum = 0.9,
+ cumat_type = nerv.CuMatrixFloat,
+ mmat_type = nerv.MMatrixFloat,
+ frm_ext = 5,
+ frm_trim = 5,
+ tr_scp = "/slfs1/users/mfy43/swb_ivec/train_bp.scp",
+ cv_scp = "/slfs1/users/mfy43/swb_ivec/train_cv.scp",
+ htk_conf = "/slfs1/users/mfy43/swb_ivec/plp_0_d_a.conf",
+ initialized_param = {"/slfs1/users/mfy43/swb_init.nerv",
+ "/slfs1/users/mfy43/swb_global_transf.nerv"},
+ debug = false}
+
+function make_layer_repo(param_repo)
+ local layer_repo = nerv.LayerRepo(
+ {
+ -- global transf
+ ["nerv.BiasLayer"] =
+ {
+ blayer1 = {{bias = "bias1"}, {dim_in = {429}, dim_out = {429}}},
+ blayer2 = {{bias = "bias2"}, {dim_in = {429}, dim_out = {429}}}
+ },
+ ["nerv.WindowLayer"] =
+ {
+ wlayer1 = {{window = "window1"}, {dim_in = {429}, dim_out = {429}}},
+ wlayer2 = {{window = "window2"}, {dim_in = {429}, dim_out = {429}}}
+ },
+ -- biased linearity
+ ["nerv.AffineLayer"] =
+ {
+ affine0 = {{ltp = "affine0_ltp", bp = "affine0_bp"},
+ {dim_in = {429}, dim_out = {2048}}},
+ affine1 = {{ltp = "affine1_ltp", bp = "affine1_bp"},
+ {dim_in = {2048}, dim_out = {2048}}},
+ affine2 = {{ltp = "affine2_ltp", bp = "affine2_bp"},
+ {dim_in = {2048}, dim_out = {2048}}},
+ affine3 = {{ltp = "affine3_ltp", bp = "affine3_bp"},
+ {dim_in = {2048}, dim_out = {2048}}},
+ affine4 = {{ltp = "affine4_ltp", bp = "affine4_bp"},
+ {dim_in = {2048}, dim_out = {2048}}},
+ affine5 = {{ltp = "affine5_ltp", bp = "affine5_bp"},
+ {dim_in = {2048}, dim_out = {2048}}},
+ affine6 = {{ltp = "affine6_ltp", bp = "affine6_bp"},
+ {dim_in = {2048}, dim_out = {2048}}},
+ affine7 = {{ltp = "affine7_ltp", bp = "affine7_bp"},
+ {dim_in = {2048}, dim_out = {3001}}}
+ },
+ ["nerv.SigmoidLayer"] =
+ {
+ sigmoid0 = {{}, {dim_in = {2048}, dim_out = {2048}}},
+ sigmoid1 = {{}, {dim_in = {2048}, dim_out = {2048}}},
+ sigmoid2 = {{}, {dim_in = {2048}, dim_out = {2048}}},
+ sigmoid3 = {{}, {dim_in = {2048}, dim_out = {2048}}},
+ sigmoid4 = {{}, {dim_in = {2048}, dim_out = {2048}}},
+ sigmoid5 = {{}, {dim_in = {2048}, dim_out = {2048}}},
+ sigmoid6 = {{}, {dim_in = {2048}, dim_out = {2048}}}
+ },
+ ["nerv.SoftmaxCELayer"] =
+ {
+ ce_crit = {{}, {dim_in = {3001, 1}, dim_out = {1}, compressed = true}}
+ }
+ }, param_repo, gconf)
+
+ layer_repo:add_layers(
+ {
+ ["nerv.DAGLayer"] =
+ {
+ global_transf = {{}, {
+ dim_in = {429}, dim_out = {429},
+ sub_layers = layer_repo,
+ connections = {
+ ["<input>[1]"] = "blayer1[1]",
+ ["blayer1[1]"] = "wlayer1[1]",
+ ["wlayer1[1]"] = "blayer2[1]",
+ ["blayer2[1]"] = "wlayer2[1]",
+ ["wlayer2[1]"] = "<output>[1]"
+ }
+ }},
+ main = {{}, {
+ dim_in = {429, 1}, dim_out = {1},
+ sub_layers = layer_repo,
+ connections = {
+ ["<input>[1]"] = "affine0[1]",
+ ["affine0[1]"] = "sigmoid0[1]",
+ ["sigmoid0[1]"] = "affine1[1]",
+ ["affine1[1]"] = "sigmoid1[1]",
+ ["sigmoid1[1]"] = "affine2[1]",
+ ["affine2[1]"] = "sigmoid2[1]",
+ ["sigmoid2[1]"] = "affine3[1]",
+ ["affine3[1]"] = "sigmoid3[1]",
+ ["sigmoid3[1]"] = "affine4[1]",
+ ["affine4[1]"] = "sigmoid4[1]",
+ ["sigmoid4[1]"] = "affine5[1]",
+ ["affine5[1]"] = "sigmoid5[1]",
+ ["sigmoid5[1]"] = "affine6[1]",
+ ["affine6[1]"] = "sigmoid6[1]",
+ ["sigmoid6[1]"] = "affine7[1]",
+ ["affine7[1]"] = "ce_crit[1]",
+ ["<input>[2]"] = "ce_crit[2]",
+ ["ce_crit[1]"] = "<output>[1]"
+ }
+ }}
+ }
+ }, param_repo, gconf)
+ return layer_repo
+end
+
+function get_network(layer_repo)
+ return layer_repo:get_layer("main")
+end
+
+function make_readers(scp_file, layer_repo)
+ return {
+ {reader = nerv.TNetReader(gconf,
+ {
+ id = "main_scp",
+ scp_file = scp_file,
+ conf_file = gconf.htk_conf,
+ frm_ext = gconf.frm_ext,
+ mlfs = {
+ phone_state = {
+ file = "/slfs1/users/mfy43/swb_ivec/ref.mlf",
+ format = "map",
+ format_arg = "/slfs1/users/mfy43/swb_ivec/dict",
+ dir = "*/",
+ ext = "lab"
+ }
+ }
+ }),
+ data = {main_scp = 429, phone_state = 1}}
+ }
+end
+
+function make_buffer(readers)
+ return nerv.SGDBuffer(gconf,
+ {
+ buffer_size = gconf.buffer_size,
+ randomize = gconf.randomize,
+ readers = readers
+ })
+end
+
+function get_input_order()
+ return {{id = "main_scp", global_transf = true},
+ {id = "phone_state"}}
+end
+
+function get_accuracy(layer_repo)
+ local ce_crit = layer_repo:get_layer("ce_crit")
+ return ce_crit.total_correct / ce_crit.total_frames * 100
+end
+
+function print_stat(layer_repo)
+ local ce_crit = layer_repo:get_layer("ce_crit")
+ nerv.info("*** training stat begin ***")
+ nerv.printf("cross entropy:\t\t%.8f\n", ce_crit.total_ce)
+ nerv.printf("correct:\t\t%d\n", ce_crit.total_correct)
+ nerv.printf("frames:\t\t\t%d\n", ce_crit.total_frames)
+ nerv.printf("err/frm:\t\t%.8f\n", ce_crit.total_ce / ce_crit.total_frames)
+ nerv.printf("accuracy:\t\t%.3f%%\n", get_accuracy(layer_repo))
+ nerv.info("*** training stat end ***")
+end
diff --git a/nerv/init.lua b/nerv/init.lua
index 89010a7..9c1a5c8 100644
--- a/nerv/init.lua
+++ b/nerv/init.lua
@@ -1,15 +1,19 @@
require 'libnerv'
-function nerv.error(fmt, ...)
- error(nerv.printf("[nerv] internal error: " .. fmt .. "\n", ...))
-end
-
function nerv.error_method_not_implemented()
nerv.error("method not implemented");
end
+function nerv.sprintf(fmt, ...)
+ return string.format(fmt, ...)
+end
+
function nerv.printf(fmt, ...)
- io.write(string.format(fmt, ...))
+ io.write(nerv.sprintf(fmt, ...))
+end
+
+function nerv.error(fmt, ...)
+ error(nerv.sprintf("[nerv] internal error: " .. fmt .. "\n", ...))
end
function nerv.mesg_with_timestamp(fmt, ...)
diff --git a/nerv/io/sgd_buffer.lua b/nerv/io/sgd_buffer.lua
index f4f7dfe..74c4934 100644
--- a/nerv/io/sgd_buffer.lua
+++ b/nerv/io/sgd_buffer.lua
@@ -5,8 +5,22 @@ function SGDBuffer:__init(global_conf, buffer_conf)
self.buffer_size = math.floor(buffer_conf.buffer_size /
global_conf.batch_size) * global_conf.batch_size
self.randomize = buffer_conf.randomize
- if self.randomize == nil then
- self.randomize = false
+ self.consume = buffer_conf.consume
+ local cumat_type = global_conf.cumat_type
+ if buffer_conf.use_gpu then
+ self.mat_type = cumat_type
+ self.copy_rows_from_by_idx = cumat_type.copy_rows_fromd_by_idx
+ self.copy_from = cumat_type.copy_fromd
+ self.copy_from_reader = cumat_type.copy_fromh
+ self.perm_gen = function (x)
+ return cumat_type.new_from_host(nerv.MMatrixFloat.perm_gen(x))
+ end
+ else
+ self.mat_type = global_conf.mmat_type
+ self.copy_rows_from_by_idx = cumat_type.copy_rows_fromh_by_idx
+ self.copy_from = cumat_type.copy_fromh
+ self.perm_gen = nerv.MMatrixFloat.perm_gen
+ self.copy_from_reader = self.mat_type.copy_from
end
self.head = 0
self.tail = 0
@@ -14,7 +28,7 @@ function SGDBuffer:__init(global_conf, buffer_conf)
for i, reader_spec in ipairs(buffer_conf.readers) do
local buffs = {}
for id, width in pairs(reader_spec.data) do
- buffs[id] = {data = global_conf.mmat_type(self.buffer_size, width),
+ buffs[id] = {data = self.mat_type(self.buffer_size, width),
leftover = nil,
width = width}
end
@@ -41,7 +55,7 @@ function SGDBuffer:saturate()
buff.data:copy_from(buff.leftover, 0, lrow)
buff.leftover = nil
end
- nerv.printf("leftover: %d\n", lrow)
+ nerv.info("buffer leftover: %d\n", lrow)
reader.tail = lrow
reader.has_leftover = false
end
@@ -65,21 +79,21 @@ function SGDBuffer:saturate()
if d == nil then
nerv.error("reader does not provide data for %s", id)
end
- buff.leftover = self.gconf.mmat_type(drow - remain,
- buff.width)
- buff.leftover:copy_from(d, remain, drow)
+ buff.leftover = self.mat_type(drow - remain,
+ buff.width)
+ self.copy_from_reader(buff.leftover, d, remain, drow)
end
drow = remain
reader.has_leftover = true
end
for id, buff in pairs(reader.buffs) do
- buff.data:copy_from(data[id], 0, drow, reader.tail)
+ self.copy_from_reader(buff.data, data[id], 0, drow, reader.tail)
end
reader.tail = reader.tail + drow
end
self.tail = math.min(self.tail, reader.tail)
end
- self.rand_map = nerv.MMatrixInt.perm_gen(self.tail) -- generate shuffled index
+ self.rand_map = self.perm_gen(self.tail) -- generate shuffled index
collectgarbage("collect")
return self.tail >= self.gconf.batch_size
end
@@ -87,25 +101,31 @@ end
function SGDBuffer:get_data()
local batch_size = self.gconf.batch_size
if self.head >= self.tail then -- buffer is empty
- if not self:saturate() then
+ local t = os.clock()
+ if (not self:saturate()) and (not self.consume) then
return nil -- the remaining data cannot build a batch
end
+ if self.tail == self.head then
+ return nil -- nothing left
+ end
+ nerv.info("%.3fs to fill the buffer", os.clock() - t)
end
- if self.head + batch_size > self.tail then
+ if self.head + batch_size > self.tail and (not self.consume) then
return nil -- the remaining data cannot build a batch
end
+ actual_batch_size = math.min(batch_size, self.tail - self.head)
local res = {}
for i, reader in ipairs(self.readers) do
for id, buff in pairs(reader.buffs) do
- local batch = self.gconf.cumat_type(batch_size, buff.width)
+ local batch = self.gconf.cumat_type(actual_batch_size, buff.width)
if self.randomize then
- batch:copy_rows_fromh_by_idx(buff.data, self.rand_map, self.head)
+ self.copy_rows_from_by_idx(batch, buff.data, self.rand_map, self.head)
else
- batch:copy_fromh(buff.data, self.head, self.head + batch_size)
+ self.copy_from(batch, buff.data, self.head, self.head + actual_batch_size)
end
res[id] = batch
end
end
- self.head = self.head + batch_size
+ self.head = self.head + actual_batch_size
return res
end
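
The buffer can now keep its staging storage on the GPU (`use_gpu`) and emit a final, shorter batch instead of dropping it (`consume`). A minimal construction sketch modeled on `make_buffer` in swb_baseline.lua; the `readers` table is assumed to come from a config's `make_readers`:

```
-- hypothetical buffer setup using the new use_gpu / consume options
local buffer = nerv.SGDBuffer(gconf,
    {
        buffer_size = gconf.buffer_size,
        randomize = gconf.randomize,
        readers = readers,      -- e.g. from make_readers(gconf.tr_scp, layer_repo)
        use_gpu = true,         -- stage data in CuMatrixFloat, shuffle indices on the device
        consume = true          -- also return the final, smaller-than-batch_size chunk
    })
for data in buffer.get_data, buffer do
    -- data[id] holds a batch of up to gconf.batch_size rows per reader stream
end
```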
diff --git a/nerv/layer/affine.lua b/nerv/layer/affine.lua
index 00cbcfb..015ec3f 100644
--- a/nerv/layer/affine.lua
+++ b/nerv/layer/affine.lua
@@ -42,7 +42,7 @@ function AffineLayer:__init(id, global_conf, layer_conf)
self.dim_out = layer_conf.dim_out
self.gconf = global_conf
self:check_dim_len(1, 1) -- exactly one input and one output
- self.direct_update = layer_conf.direct_update
+ self.direct_update = layer_conf.direct_update or global_conf.direct_update
end
function AffineLayer:init(batch_size)
@@ -60,6 +60,10 @@ function AffineLayer:init(batch_size)
self.bp:train_init()
end
+function AffineLayer:batch_resize(batch_size)
+ -- do nothing
+end
+
function AffineLayer:update(bp_err, input, output)
if self.direct_update then
self.ltp.correction:mul(input[1], bp_err[1], 1.0, gconf.momentum, 'T', 'N')
diff --git a/nerv/layer/affine_recurrent.lua b/nerv/layer/affine_recurrent.lua
index 59d259c..92d98e2 100644
--- a/nerv/layer/affine_recurrent.lua
+++ b/nerv/layer/affine_recurrent.lua
@@ -37,6 +37,10 @@ function Recurrent:init(batch_size)
self.bp:train_init()
end
+function Recurrent:batch_resize(batch_size)
+ -- do nothing
+end
+
function Recurrent:update(bp_err, input, output)
if (self.direct_update == true) then
local ltp_hh = self.ltp_hh.trans
diff --git a/nerv/layer/bias.lua b/nerv/layer/bias.lua
index c99274d..7e9fd46 100644
--- a/nerv/layer/bias.lua
+++ b/nerv/layer/bias.lua
@@ -18,6 +18,10 @@ function BiasLayer:init()
end
end
+function BiasLayer:batch_resize(batch_size)
+ -- do nothing
+end
+
function BiasLayer:propagate(input, output)
output[1]:copy_fromd(input[1])
output[1]:add_row(self.bias.trans, 1.0)
diff --git a/nerv/layer/combiner.lua b/nerv/layer/combiner.lua
index 7bd7617..1bcfdfb 100644
--- a/nerv/layer/combiner.lua
+++ b/nerv/layer/combiner.lua
@@ -30,6 +30,12 @@ function CombinerLayer:init(batch_size)
self.sum = self.gconf.cumat_type(batch_size, dim)
end
+function CombinerLayer:batch_resize(batch_size)
+ if self.sum:nrow() ~= batch_size then
+ self.sum = self.gconf.cumat_type(batch_size, self.dim_in[1])
+ end
+end
+
function CombinerLayer:update(bp_err, input, output)
end
diff --git a/nerv/layer/init.lua b/nerv/layer/init.lua
index 7172f99..6861b0e 100644
--- a/nerv/layer/init.lua
+++ b/nerv/layer/init.lua
@@ -78,3 +78,4 @@ nerv.include('window.lua')
nerv.include('mse.lua')
nerv.include('combiner.lua')
nerv.include('affine_recurrent.lua')
+nerv.include('softmax.lua')
diff --git a/nerv/layer/mse.lua b/nerv/layer/mse.lua
index 9a97add..0ee3080 100644
--- a/nerv/layer/mse.lua
+++ b/nerv/layer/mse.lua
@@ -20,6 +20,14 @@ function MSELayer:init(batch_size)
self.diff = self.mse:create()
end
+function MSELayer:batch_resize(batch_size)
+ if self.mse:nrow() ~= batch_size then
+ self.mse = self.gconf.cumat_type(batch_size, self.dim_in[1])
+ self.mse_sum = self.gconf.cumat_type(batch_size, 1)
+ self.diff = self.mse:create()
+ end
+end
+
function MSELayer:update(bp_err, input, output)
-- no params, therefore do nothing
end
@@ -34,7 +42,7 @@ function MSELayer:propagate(input, output)
if output[1] ~= nil then
output[1]:copy_fromd(mse_sum)
end
- self.total_mse = self.total_mse + mse_sum:colsum()[0]
+ self.total_mse = self.total_mse + mse_sum:colsum()[0][0]
self.total_frames = self.total_frames + mse_sum:nrow()
end
diff --git a/nerv/layer/sigmoid.lua b/nerv/layer/sigmoid.lua
index dfd09eb..0a8bcdc 100644
--- a/nerv/layer/sigmoid.lua
+++ b/nerv/layer/sigmoid.lua
@@ -14,6 +14,10 @@ function SigmoidLayer:init()
end
end
+function SigmoidLayer:batch_resize(batch_size)
+ -- do nothing
+end
+
function SigmoidLayer:update(bp_err, input, output)
-- no params, therefore do nothing
end
diff --git a/nerv/layer/softmax.lua b/nerv/layer/softmax.lua
new file mode 100644
index 0000000..4205b66
--- /dev/null
+++ b/nerv/layer/softmax.lua
@@ -0,0 +1,35 @@
+local SoftmaxLayer = nerv.class("nerv.SoftmaxLayer", "nerv.Layer")
+
+function SoftmaxLayer:__init(id, global_conf, layer_conf)
+ self.id = id
+ self.gconf = global_conf
+ self.dim_in = layer_conf.dim_in
+ self.dim_out = layer_conf.dim_out
+ self:check_dim_len(1, 1) -- exactly one input and one output
+end
+
+function SoftmaxLayer:init(batch_size)
+ if self.dim_in[1] ~= self.dim_out[1] then
+ nerv.error("mismatching dimensions of input and output")
+ end
+end
+
+function SoftmaxLayer:batch_resize(batch_size)
+ -- do nothing
+end
+
+function SoftmaxLayer:update(bp_err, input, output)
+ -- no params, therefore do nothing
+end
+
+function SoftmaxLayer:propagate(input, output)
+ output[1]:softmax(input[1])
+end
+
+function SoftmaxLayer:back_propagate(bp_err, next_bp_err, input, output)
+ nerv.error_method_not_implemented()
+end
+
+function SoftmaxLayer:get_params()
+ return nerv.ParamRepo({})
+end
diff --git a/nerv/layer/softmax_ce.lua b/nerv/layer/softmax_ce.lua
index f878a2f..9071e86 100644
--- a/nerv/layer/softmax_ce.lua
+++ b/nerv/layer/softmax_ce.lua
@@ -23,6 +23,13 @@ function SoftmaxCELayer:init(batch_size)
self.ce = self.softmax:create()
end
+function SoftmaxCELayer:batch_resize(batch_size)
+ if self.softmax:nrow() ~= batch_size then
+ self.softmax = self.gconf.cumat_type(batch_size, self.dim_in[1])
+ self.ce = self.softmax:create()
+ end
+end
+
function SoftmaxCELayer:update(bp_err, input, output)
-- no params, therefore do nothing
end
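
The `batch_resize` hooks added across the layer classes follow one convention: stateless layers do nothing, while layers holding per-batch buffers reallocate them when the row count changes. A hedged sketch of that contract for a custom layer (names are placeholders; the buffer sizing mirrors CombinerLayer.sum and MSELayer.mse above):

```
-- hypothetical fragment showing the batch_resize contract for a custom layer
local MyLayer = nerv.class("nerv.MyLayer", "nerv.Layer")

function MyLayer:init(batch_size)
    -- per-batch working buffer, allocated at the initial batch size
    self.buf = self.gconf.cumat_type(batch_size, self.dim_in[1])
end

function MyLayer:batch_resize(batch_size)
    -- stateless layers leave this empty; stateful ones reallocate on size change
    if self.buf:nrow() ~= batch_size then
        self.buf = self.gconf.cumat_type(batch_size, self.dim_in[1])
    end
end
```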
diff --git a/nerv/layer/window.lua b/nerv/layer/window.lua
index 4e9a3b1..8eed352 100644
--- a/nerv/layer/window.lua
+++ b/nerv/layer/window.lua
@@ -18,6 +18,10 @@ function WindowLayer:init()
end
end
+function WindowLayer:batch_resize(batch_size)
+ -- do nothing
+end
+
function WindowLayer:propagate(input, output)
output[1]:copy_fromd(input[1])
output[1]:scale_rows_by_row(self.window.trans)
diff --git a/nerv/lib/matrix/cukernel.h b/nerv/lib/matrix/cukernel.h
index 7bb4c2c..2126c6f 100644
--- a/nerv/lib/matrix/cukernel.h
+++ b/nerv/lib/matrix/cukernel.h
@@ -18,4 +18,6 @@ void cudak_(cuda_rearrange_frm)(const Matrix *a, Matrix *b, int step);
void cudak_(cuda_scale_rows_by_row)(const Matrix *a, Matrix *b);
void cudak_(cuda_scale_rows_by_col)(const Matrix *a, Matrix *b);
void cudak_(cuda_decompress)(const Matrix *a, Matrix *b);
+void cudak_(cuda_copy_rows_by_idx)(const Matrix *a, Matrix *b,
+ const Matrix *idx, int b_begin);
#endif
diff --git a/nerv/lib/matrix/cumatrix.c b/nerv/lib/matrix/cumatrix.c
index aa81bfc..ff1168d 100644
--- a/nerv/lib/matrix/cumatrix.c
+++ b/nerv/lib/matrix/cumatrix.c
@@ -49,6 +49,7 @@ void nerv_cumatrix_init() {
#define NERV_CUBLAS_(NAME) cublasS##NAME
#define MATRIX_CUMATRIX_HOST_TNAME nerv_matrix_host_float_tname
#include "generic/cumatrix.c"
+
#undef NERV_CUBLAS_
#undef cudak_
#undef nerv_matrix_
diff --git a/nerv/lib/matrix/cumatrix.h b/nerv/lib/matrix/cumatrix.h
index 9f71507..e6def66 100644
--- a/nerv/lib/matrix/cumatrix.h
+++ b/nerv/lib/matrix/cumatrix.h
@@ -1,5 +1,6 @@
#ifndef NERV_CUMATRIX_H
#define NERV_CUMATRIX_H
+#include "matrix.h"
void nerv_cumatrix_print_profile();
void nerv_cumatrix_clear_profile();
void nerv_cumatrix_init();
diff --git a/nerv/lib/matrix/generic/cukernel.cu b/nerv/lib/matrix/generic/cukernel.cu
index e337798..08feb59 100644
--- a/nerv/lib/matrix/generic/cukernel.cu
+++ b/nerv/lib/matrix/generic/cukernel.cu
@@ -284,6 +284,15 @@ __global__ void cudak_(gen_col_idx)(MATRIX_ELEM *b,
b[j + i * stride] = j;
}
+__global__ void cudak_(copy_rows_by_idx)(const MATRIX_ELEM *a, MATRIX_ELEM *b,
+ const MATRIX_ELEM *idx, int b_begin,
+ int nrow, int ncol, int stride) {
+ int j = blockIdx.x * blockDim.x + threadIdx.x;
+ int i = blockIdx.y * blockDim.y + threadIdx.y;
+ if (i >= nrow || j >= ncol) return;
+ b[j + i * stride] = a[j + lrintf(idx[i]) * stride];
+}
+
extern "C" {
#include "../cukernel.h"
void cudak_(cuda_log_elem)(const Matrix *a, Matrix *b) {
@@ -589,5 +598,16 @@ extern "C" {
b->stride / sizeof(MATRIX_ELEM));
cudaStreamSynchronize(0);
}
+
+ void cudak_(cuda_copy_rows_by_idx)(const Matrix *a, Matrix *b,
+ const Matrix *idx, int b_begin) {
+ dim3 threadsPerBlock(CUDA_THREADS_NN, 1);
+ dim3 numBlocks(CEIL_DIV(b->ncol, threadsPerBlock.x), b->nrow);
+ cudak_(copy_rows_by_idx)<<<numBlocks, threadsPerBlock>>> \
+ (MATRIX_ELEM_PTR(a), MATRIX_ELEM_PTR(b),
+ MATRIX_ELEM_PTR(idx) + b_begin,
+ b_begin, b->nrow, b->ncol, b->stride / sizeof(MATRIX_ELEM));
+ cudaStreamSynchronize(0);
+ }
}
#endif
diff --git a/nerv/lib/matrix/generic/cumatrix.c b/nerv/lib/matrix/generic/cumatrix.c
index 40a0030..770e503 100644
--- a/nerv/lib/matrix/generic/cumatrix.c
+++ b/nerv/lib/matrix/generic/cumatrix.c
@@ -315,16 +315,17 @@ void nerv_matrix_(copy_rows_fromh_by_idx)(Matrix *a, const Matrix *b,
long nrow = a->nrow;
if (!(0 <= b_begin && b_begin + nrow <= idx->ncol))
NERV_EXIT_STATUS(status, MAT_INVALID_COPY_INTERVAL, 0);
- long *idx_ptr = idx->data.i;
+ float *idx_ptr = idx->data.f;
int i;
if (idx->nrow != 1)
NERV_EXIT_STATUS(status, MAT_IDX_VECTOR_EXP, 0);
if (a->ncol != b->ncol)
NERV_EXIT_STATUS(status, MAT_MISMATCH_DIM, 0);
+ PROFILE_START
cudaStream_t *streams = (cudaStream_t*)malloc(sizeof(cudaStream_t) * nrow);
for (i = 0; i < nrow; i++)
{
- int src_row = idx_ptr[b_begin + i];
+ int src_row = (int)idx_ptr[b_begin + i];
if (!(0 <= src_row && src_row < b->nrow))
NERV_EXIT_STATUS(status, MAT_INVALID_IDX, 0);
CUDA_SAFE_CALL(cudaStreamCreate(streams + i), status);
@@ -339,6 +340,22 @@ void nerv_matrix_(copy_rows_fromh_by_idx)(Matrix *a, const Matrix *b,
CUDA_SAFE_CALL(cudaStreamDestroy(streams[i]), status);
}
free(streams);
+ PROFILE_STOP
+ NERV_SET_STATUS(status, NERV_NORMAL, 0);
+}
+
+void nerv_matrix_(copy_rows_fromd_by_idx)(Matrix *a, const Matrix *b,
+ const Matrix *idx, int b_begin, Status *status) {
+ long nrow = a->nrow;
+ if (!(0 <= b_begin && b_begin + nrow <= idx->ncol))
+ NERV_EXIT_STATUS(status, MAT_INVALID_COPY_INTERVAL, 0);
+ if (idx->nrow != 1)
+ NERV_EXIT_STATUS(status, MAT_IDX_VECTOR_EXP, 0);
+ if (a->ncol != b->ncol)
+ NERV_EXIT_STATUS(status, MAT_MISMATCH_DIM, 0);
+ PROFILE_START
+ cudak_(cuda_copy_rows_by_idx)(b, a, idx, b_begin);
+ PROFILE_STOP
NERV_SET_STATUS(status, NERV_NORMAL, 0);
}
diff --git a/nerv/lib/matrix/generic/cumatrix.h b/nerv/lib/matrix/generic/cumatrix.h
index 3f1f8a3..04e8c5a 100644
--- a/nerv/lib/matrix/generic/cumatrix.h
+++ b/nerv/lib/matrix/generic/cumatrix.h
@@ -40,6 +40,8 @@ void nerv_matrix_(log_elem)(Matrix *b, const Matrix *a, Status *status);
Matrix *nerv_matrix_(decompress)(const Matrix *a, int orig_col, Status *status);
void nerv_matrix_(copy_rows_fromh_by_idx)(Matrix *a, const Matrix *b,
const Matrix *idx, int b_begin, Status *status);
+void nerv_matrix_(copy_rows_fromd_by_idx)(Matrix *a, const Matrix *b,
+ const Matrix *idx, int b_begin, Status *status);
void nerv_matrix_(expand_frm)(Matrix *a, const Matrix *b,
int context, Status *status);
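
From the Lua side, the new device-to-device gather sits next to the existing host variant and is what SGDBuffer's `use_gpu` path (above) binds to. A minimal sketch of the call, assuming `gconf` as in the example configs; the index row holds float-encoded row numbers produced by `perm_gen`:

```
-- hypothetical: shuffled minibatch gather done entirely on the device
local src   = gconf.cumat_type(1024, 429)                 -- staged rows (cf. SGDBuffer's buff.data)
local batch = gconf.cumat_type(gconf.batch_size, 429)     -- batch_size assumed <= 1024
-- perm_gen yields a float host row; upload it once as the gather index
local idx = gconf.cumat_type.new_from_host(nerv.MMatrixFloat.perm_gen(1024))
-- copies rows src[idx[0 + i]] -> batch[i], the device analogue of copy_rows_fromh_by_idx
batch:copy_rows_fromd_by_idx(src, idx, 0)
```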
diff --git a/nerv/lib/matrix/generic/matrix.c b/nerv/lib/matrix/generic/matrix.c
index e4afa37..4246751 100644
--- a/nerv/lib/matrix/generic/matrix.c
+++ b/nerv/lib/matrix/generic/matrix.c
@@ -3,7 +3,8 @@
#include "matrix.h"
/* FIXME: malloc failure detection */
-static void nerv_matrix_(data_free)(Matrix *self, Status *status) {
+void nerv_matrix_(data_free)(Matrix *self, Status *status) {
+ if(*self->data_ref == 0) return; /* FIXME: guard against double free */
assert(*self->data_ref > 0);
if (--(*self->data_ref) == 0)
{
@@ -18,7 +19,7 @@ static void nerv_matrix_(data_free)(Matrix *self, Status *status) {
}
}
-static void nerv_matrix_(data_retain)(Matrix *self) {
+void nerv_matrix_(data_retain)(Matrix *self) {
(*self->data_ref)++;
}
diff --git a/nerv/lib/matrix/generic/matrix.h b/nerv/lib/matrix/generic/matrix.h
index 9d44e6d..69b4e6d 100644
--- a/nerv/lib/matrix/generic/matrix.h
+++ b/nerv/lib/matrix/generic/matrix.h
@@ -2,3 +2,5 @@
Matrix *nerv_matrix_(create)(long nrow, long ncol, Status *status);
void nerv_matrix_(destroy)(Matrix *self, Status *status);
Matrix *nerv_matrix_(getrow)(Matrix *self, int row);
+void nerv_matrix_(data_free)(Matrix *self, Status *status);
+void nerv_matrix_(data_retain)(Matrix *self);
diff --git a/nerv/lib/matrix/mmatrix.c b/nerv/lib/matrix/mmatrix.c
index 94f1ea8..b8157eb 100644
--- a/nerv/lib/matrix/mmatrix.c
+++ b/nerv/lib/matrix/mmatrix.c
@@ -7,6 +7,25 @@
#define nerv_matrix_(NAME) nerv_matrix_host_float_##NAME
#include "generic/matrix.h"
#include "generic/mmatrix.c"
+
+Matrix *nerv_matrix_(perm_gen)(int ncol, Status *status) {
+ int i;
+ Matrix *self = nerv_matrix_(create)(1, ncol, status);
+ if (status->err_code != NERV_NORMAL)
+ return NULL;
+ float *prow = self->data.f;
+ for (i = 0; i < ncol; i++)
+ prow[i] = i;
+ for (i = ncol - 1; i >= 0; i--)
+ {
+ size_t j = rand() % (i + 1);
+ float tmp = prow[i];
+ prow[i] = prow[j];
+ prow[j] = tmp;
+ }
+ return self;
+}
+
#undef nerv_matrix_
#undef host_matrix_
#undef MATRIX_USE_FLOAT
@@ -33,21 +52,3 @@
#define host_matrix_(NAME) host_matrix_int_##NAME
#define nerv_matrix_(NAME) nerv_matrix_host_int_##NAME
#include "generic/mmatrix.c"
-
-Matrix *nerv_matrix_(perm_gen)(int ncol, Status *status) {
- int i;
- Matrix *self = nerv_matrix_(create)(1, ncol, status);
- if (status->err_code != NERV_NORMAL)
- return NULL;
- long *prow = self->data.i;
- for (i = 0; i < ncol; i++)
- prow[i] = i;
- for (i = ncol - 1; i >= 0; i--)
- {
- size_t j = rand() % (i + 1);
- long tmp = prow[i];
- prow[i] = prow[j];
- prow[j] = tmp;
- }
- return self;
-}
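
`perm_gen` has moved from the int to the float host matrix, so one permutation can feed both the CPU shuffle and, via `new_from_host`, the GPU gather above. A short hypothetical usage sketch (element reads via `perm[0][j]` are assumed supported on host matrices):

```
-- hypothetical: inspecting a generated permutation (indices stored as floats)
local perm = nerv.MMatrixFloat.perm_gen(8)   -- 1 x 8 row, values 0..7 shuffled
for j = 0, perm:ncol() - 1 do
    nerv.printf("%d ", perm[0][j])
end
nerv.printf("\n")
```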
diff --git a/nerv/lib/matrix/mmatrix.h b/nerv/lib/matrix/mmatrix.h
index df91e4c..31e7984 100644
--- a/nerv/lib/matrix/mmatrix.h
+++ b/nerv/lib/matrix/mmatrix.h
@@ -1,4 +1,5 @@
#ifndef NERV_MMATRIX_H
#define NERV_MMATRIX_H
-Matrix *nerv_matrix_(perm_gen)(int ncol, Status *status);
+#include "matrix.h"
+Matrix *nerv_matrix_host_float_perm_gen(int ncol, Status *status);
#endif
diff --git a/nerv/matrix/generic/cukernel.cu b/nerv/matrix/generic/cukernel.cu
deleted file mode 100644
index 2ae5e62..0000000
--- a/nerv/matrix/generic/cukernel.cu
+++ /dev/null
@@ -1,592 +0,0 @@
-#ifdef NERV_GENERIC_CUKERNEL
-#include <assert.h>
-#include <stdio.h>
-#include "matrix.h"
-#include "cuda.h"
-#include "float.h"
-#define CUDA_THREADS_N 16
-#define CUDA_THREADS_NN ((CUDA_THREADS_N) * (CUDA_THREADS_N))
-#define CEIL_DIV(a, b) (((a) + (b) - 1) / (b))
-__global__ void cudak_(log_elem)(const MATRIX_ELEM *a, MATRIX_ELEM *b,
- int nrow, int ncol, int stride) {
- int j = blockIdx.x * blockDim.x + threadIdx.x;
- int i = blockIdx.y * blockDim.y + threadIdx.y;
- long idx;
- MATRIX_ELEM tmp;
- if (i >= nrow || j >= ncol) return;
- idx = j + i * stride;
- tmp = a[idx];
- if(tmp < FLT_MIN) tmp = FLT_MIN;
- b[idx] = log(tmp);
-}
-
-__global__ void cudak_(mul_elem)(const MATRIX_ELEM *a, const MATRIX_ELEM *b,
- MATRIX_ELEM *c,
- int nrow, int ncol, int stride) {
- int j = blockIdx.x * blockDim.x + threadIdx.x;
- int i = blockIdx.y * blockDim.y + threadIdx.y;
- long idx;
- if (i >= nrow || j >= ncol) return;
- idx = j + i * stride;
- c[idx] = a[idx] * b[idx];
-}
-
-__global__ void cudak_(sigmoid)(const MATRIX_ELEM *a, MATRIX_ELEM *b,
- int nrow, int ncol, int stride) {
- int j = blockIdx.x * blockDim.x + threadIdx.x;
- int i = blockIdx.y * blockDim.y + threadIdx.y;
- long idx;
- if (i >= nrow || j >= ncol) return;
- idx = j + i * stride;
- b[idx] = 1.0 / (1.0 + exp(-a[idx]));
-}
-
-__global__ void cudak_(sigmoid_grad)(const MATRIX_ELEM *output,
- const MATRIX_ELEM *err,
- MATRIX_ELEM *nerr,
- int nrow, int ncol, int stride) {
- int j = blockIdx.x * blockDim.x + threadIdx.x;
- int i = blockIdx.y * blockDim.y + threadIdx.y;
- long idx;
- if (i >= nrow || j >= ncol) return;
- idx = j + i * stride;
- nerr[idx] = output[idx] * (1.0 - output[idx]) * err[idx];
-}
-
-__global__ void cudak_(softmax_final)(const MATRIX_ELEM *a, MATRIX_ELEM *b,
- const MATRIX_ELEM *max, const MATRIX_ELEM *deno,
- int nrow, int ncol, int stride, int mstride) {
- int j = blockIdx.x * blockDim.x + threadIdx.x;
- int i = blockIdx.y * blockDim.y + threadIdx.y;
- long idx;
- if (i >= nrow || j >= ncol) return;
- idx = j + i * stride;
- b[idx] = exp(a[idx] - max[0 + i * mstride]) / deno[0 + i * mstride];
-}
-
-__global__ void cudak_(block_reduce_rowsum)(const MATRIX_ELEM *input,
- MATRIX_ELEM *output,
- const int istride, const int ostride,
- const int n) {
- extern __shared__ MATRIX_ELEM cudak_(arr)[];
- int j = blockIdx.x * blockDim.x + threadIdx.x;
- cudak_(arr)[threadIdx.x] = j < n ? input[j + istride * blockIdx.y] : 0;
- __syncthreads();
- for (int offset = blockDim.x >> 1; offset; offset >>= 1)
- {
- if (threadIdx.x < offset)
- cudak_(arr)[threadIdx.x] += cudak_(arr)[threadIdx.x + offset];
- __syncthreads();
- }
- if (threadIdx.x == 0)
- output[blockIdx.x + ostride * blockIdx.y] = cudak_(arr)[0];
-}
-
-__global__ void cudak_(block_reduce_colsum)(const MATRIX_ELEM *input,
- MATRIX_ELEM *output,
- const int istride, const int ostride,
- const int n) {
- extern __shared__ MATRIX_ELEM cudak_(arr)[];
- int i = blockIdx.y * blockDim.y + threadIdx.y;
- cudak_(arr)[threadIdx.y] = i < n ? input[blockIdx.x + istride * i] : 0;
- __syncthreads();
- for (int offset = blockDim.y >> 1; offset; offset >>= 1)
- {
- if (threadIdx.y < offset)
- cudak_(arr)[threadIdx.y] += cudak_(arr)[threadIdx.y + offset];
- __syncthreads();
- }
- if (threadIdx.y == 0)
- output[blockIdx.x + ostride * blockIdx.y] = cudak_(arr)[0];
-}
-
-__global__ void cudak_(block_reduce_colsame)(const MATRIX_ELEM *input,
- const MATRIX_ELEM *ref_input,
- MATRIX_ELEM *output,
- const int istride, const int ostride,
- const int n) {
- extern __shared__ MATRIX_ELEM cudak_(arr)[];
- int i = blockIdx.y * blockDim.y + threadIdx.y;
- cudak_(arr)[threadIdx.y] = (i < n && input[blockIdx.x + istride * i] == \
- ref_input[blockIdx.x + istride * i]) ? 1.0 : 0;
- __syncthreads();
- for (int offset = blockDim.y >> 1; offset; offset >>= 1)
- {
- if (threadIdx.y < offset)
- cudak_(arr)[threadIdx.y] += cudak_(arr)[threadIdx.y + offset];
- __syncthreads();
- }
- if (threadIdx.y == 0)
- output[blockIdx.x + ostride * blockIdx.y] = cudak_(arr)[0];
-}
-
-__global__ void cudak_(block_reduce_softmax_rowsum)(const MATRIX_ELEM *input,
- MATRIX_ELEM *output,
- const MATRIX_ELEM *max,
- const int istride, const int ostride,
- const int mstride, const int n) {
- extern __shared__ MATRIX_ELEM cudak_(arr)[];
- int j = blockIdx.x * blockDim.x + threadIdx.x;
- cudak_(arr)[threadIdx.x] = j < n ? exp(input[j + istride * blockIdx.y] - \
- max[0 + mstride * blockIdx.y]) : 0;
- __syncthreads();
- for (int offset = blockDim.x >> 1; offset; offset >>= 1)
- {
- if (threadIdx.x < offset)
- cudak_(arr)[threadIdx.x] += cudak_(arr)[threadIdx.x + offset];
- __syncthreads();
- }
- if (threadIdx.x == 0)
- output[blockIdx.x + ostride * blockIdx.y] = cudak_(arr)[0];
-}
-
-__global__ void cudak_(block_reduce_rowmax)(const MATRIX_ELEM *input,
- MATRIX_ELEM *output,
- const int istride, const int ostride,
- const int n) {
- extern __shared__ MATRIX_ELEM cudak_(arr)[];
- int j = blockIdx.x * blockDim.x + threadIdx.x;
- cudak_(arr)[threadIdx.x] = j < n ? input[j + istride * blockIdx.y] : -FLT_MAX;
- __syncthreads();
- for (int offset = blockDim.x >> 1; offset; offset >>= 1)
- {
- if (threadIdx.x < offset)
- {
- MATRIX_ELEM l = cudak_(arr)[threadIdx.x],
- r = cudak_(arr)[threadIdx.x + offset];
- if (r > l)
- cudak_(arr)[threadIdx.x] = r;
- }
- __syncthreads();
- }
- if (threadIdx.x == 0)
- output[blockIdx.x + ostride * blockIdx.y] = cudak_(arr)[0];
-}
-
-__global__ void cudak_(block_reduce_rowmax_idx)(const MATRIX_ELEM *input,
- const MATRIX_ELEM *idx_input,
- MATRIX_ELEM *output,
- MATRIX_ELEM *idx_output,
- const int istride, const int ostride,
- const int n) {
- extern __shared__ MATRIX_ELEM cudak_(arr)[];
- MATRIX_ELEM *arr_val = cudak_(arr);
- MATRIX_ELEM *arr_idx = arr_val + blockDim.x;
- int j = blockIdx.x * blockDim.x + threadIdx.x;
- arr_val[threadIdx.x] = j < n ? input[j + istride * blockIdx.y] : -FLT_MAX;
- arr_idx[threadIdx.x] = j < n ? idx_input[j + istride * blockIdx.y] : 0;
- __syncthreads();
- for (int offset = blockDim.x >> 1; offset; offset >>= 1)
- {
- if (threadIdx.x < offset)
- {
- MATRIX_ELEM l = arr_val[threadIdx.x],
- r = arr_val[threadIdx.x + offset];
- if (r > l)
- {
- arr_val[threadIdx.x] = r;
- arr_idx[threadIdx.x] = arr_idx[threadIdx.x + offset];
- }
- }
- __syncthreads();
- }
- if (threadIdx.x == 0)
- {
- output[blockIdx.x + ostride * blockIdx.y] = arr_val[0];
- idx_output[blockIdx.x + ostride * blockIdx.y] = arr_idx[0];
- }
-}
-
-__global__ void cudak_(add_row)(const MATRIX_ELEM *a, MATRIX_ELEM *b,
- int nrow, int ncol, int stride, double beta) {
- int j = blockIdx.x * blockDim.x + threadIdx.x;
- int i = blockIdx.y * blockDim.y + threadIdx.y;
- if (i >= nrow || j >= ncol) return;
- b[j + i * stride] += beta * a[j];
-}
-
-__global__ void cudak_(fill)(MATRIX_ELEM *a,
- int nrow, int ncol, int stride, double val) {
- int j = blockIdx.x * blockDim.x + threadIdx.x;
- int i = blockIdx.y * blockDim.y + threadIdx.y;
- if (i >= nrow || j >= ncol) return;
- a[j + i * stride] = val;
-}
-
-__global__ void cudak_(clip)(MATRIX_ELEM *a,
- int nrow, int ncol, int stride, double val_1, double val_2) {
- int j = blockIdx.x * blockDim.x + threadIdx.x;
- int i = blockIdx.y * blockDim.y + threadIdx.y;
- if (i >= nrow || j >= ncol) return;
- if (a[j + i * stride] > val_2)
- a[j + i * stride] = val_2;
- else if (a[j + i * stride] < val_1)
- a[j + i * stride] = val_1;
-}
-
-__global__ void cudak_(expand_frm)(const MATRIX_ELEM *a, MATRIX_ELEM *b,
- int nrow, int ncol,
- int enrow, int encol,
- int stride, int estride,
- int context) {
- int j = blockIdx.x * blockDim.x + threadIdx.x;
- int i = blockIdx.y * blockDim.y + threadIdx.y;
- int ridx;
- if (i >= enrow || j >= encol) return;
- ridx = i + j / ncol - context;
- if (ridx < 0) ridx = 0;
- else if (ridx >= nrow) ridx = nrow - 1;
- b[j + i * estride] = a[j % ncol + ridx * stride];
-}
-
-__global__ void cudak_(rearrange_frm)(const MATRIX_ELEM *a, MATRIX_ELEM *b,
- int nrow, int ncol,
- int stride, int step, int orig_dim) {
- int j = blockIdx.x * blockDim.x + threadIdx.x;
- int i = blockIdx.y * blockDim.y + threadIdx.y;
- if (i >= nrow || j >= ncol) return;
- b[j + i * stride] = a[j / step + (j % step) * orig_dim + i * stride];
-}
-
-__global__ void cudak_(scale_rows_by_col)(const MATRIX_ELEM *a, MATRIX_ELEM *b,
- int nrow, int ncol,
- int astride, int bstride) {
- int j = blockIdx.x * blockDim.x + threadIdx.x;
- int i = blockIdx.y * blockDim.y + threadIdx.y;
- if (i >= nrow || j >= ncol) return;
- b[j + i * bstride] *= a[i * astride];
-}
-
-__global__ void cudak_(scale_rows_by_row)(const MATRIX_ELEM *a, MATRIX_ELEM *b,
- int nrow, int ncol,
- int stride) {
- int j = blockIdx.x * blockDim.x + threadIdx.x;
- int i = blockIdx.y * blockDim.y + threadIdx.y;
- if (i >= nrow || j >= ncol) return;
- b[j + i * stride] *= a[j];
-}
-
-__global__ void cudak_(decompress)(const MATRIX_ELEM *a, MATRIX_ELEM *b,
- int nrow, int ncol,
- int stride_a, int stride_b) {
- int j = blockIdx.x * blockDim.x + threadIdx.x;
- int i = blockIdx.y * blockDim.y + threadIdx.y;
- if (i >= nrow || j >= ncol) return;
- b[lrintf(a[j + i * stride_a]) + i * stride_b] = 1.0;
-}
-
-__global__ void cudak_(gen_col_idx)(MATRIX_ELEM *b,
- int nrow, int ncol, int stride) {
- int j = blockIdx.x * blockDim.x + threadIdx.x;
- int i = blockIdx.y * blockDim.y + threadIdx.y;
- if (i >= nrow || j >= ncol) return;
- b[j + i * stride] = j;
-}
-
-extern "C" {
-#include "../cukernel.h"
- void cudak_(cuda_log_elem)(const Matrix *a, Matrix *b) {
- dim3 threadsPerBlock(CUDA_THREADS_N, CUDA_THREADS_N);
- dim3 numBlocks(CEIL_DIV(b->ncol, threadsPerBlock.x),
- CEIL_DIV(b->nrow, threadsPerBlock.y));
- cudak_(log_elem)<<<numBlocks, threadsPerBlock>>> \
- (MATRIX_ELEM_PTR(a), MATRIX_ELEM_PTR(b),
- b->nrow, b->ncol, b->stride / sizeof(MATRIX_ELEM));
- cudaStreamSynchronize(0);
- }
-
- void cudak_(cuda_mul_elem)(const Matrix *a, const Matrix *b,
- Matrix *c) {
- dim3 threadsPerBlock(CUDA_THREADS_N, CUDA_THREADS_N);
- dim3 numBlocks(CEIL_DIV(b->ncol, threadsPerBlock.x),
- CEIL_DIV(b->nrow, threadsPerBlock.y));
- cudak_(mul_elem)<<<numBlocks, threadsPerBlock>>> \
- (MATRIX_ELEM_PTR(a), MATRIX_ELEM_PTR(b),
- MATRIX_ELEM_PTR(c),
- b->nrow, b->ncol, b->stride / sizeof(MATRIX_ELEM));
- cudaStreamSynchronize(0);
- }
-
- void cudak_(cuda_sigmoid)(const Matrix *a, Matrix *b) {
- dim3 threadsPerBlock(CUDA_THREADS_N, CUDA_THREADS_N);
- dim3 numBlocks(CEIL_DIV(b->ncol, threadsPerBlock.x),
- CEIL_DIV(b->nrow, threadsPerBlock.y));
- cudak_(sigmoid)<<<numBlocks, threadsPerBlock>>> \
- (MATRIX_ELEM_PTR(a), MATRIX_ELEM_PTR(b), b->nrow, b->ncol,
- b->stride / sizeof(MATRIX_ELEM));
- cudaStreamSynchronize(0);
- }
-
- void cudak_(cuda_sigmoid_grad)(const Matrix *output,
- const Matrix *err, Matrix *nerr) {
- dim3 threadsPerBlock(CUDA_THREADS_N, CUDA_THREADS_N);
- dim3 numBlocks(CEIL_DIV(nerr->ncol, threadsPerBlock.x),
- CEIL_DIV(nerr->nrow, threadsPerBlock.y));
- cudak_(sigmoid_grad)<<<numBlocks, threadsPerBlock>>> \
- (MATRIX_ELEM_PTR(output), MATRIX_ELEM_PTR(err),
- MATRIX_ELEM_PTR(nerr),
- nerr->nrow, nerr->ncol,
- nerr->stride / sizeof(MATRIX_ELEM));
- cudaStreamSynchronize(0);
- }
-
- void cudak_(cuda_rowsum)(const Matrix *a, Matrix *b) {
- dim3 block(CUDA_THREADS_NN, 1);
- int ncol = a->ncol;
- int blocks_per_row = CEIL_DIV(ncol, block.x);
- dim3 grid(blocks_per_row, a->nrow);
- MATRIX_ELEM *res;
- size_t stride;
- cudaMallocPitch(&res, &stride, blocks_per_row * sizeof(MATRIX_ELEM), a->nrow);
- cudak_(block_reduce_rowsum)<<<grid, block, block.x * sizeof(MATRIX_ELEM)>>> \
- (MATRIX_ELEM_PTR(a), res,
- a->stride / sizeof(MATRIX_ELEM), stride / sizeof(MATRIX_ELEM),
- ncol);
- ncol = blocks_per_row;
- assert((unsigned long)ncol <= block.x);
- grid.x = 1;
- cudaStreamSynchronize(0);
- cudak_(block_reduce_rowsum)<<<grid, block, block.x * sizeof(MATRIX_ELEM)>>> \
- (res, MATRIX_ELEM_PTR(b),
- stride / sizeof(MATRIX_ELEM), b->stride / sizeof(MATRIX_ELEM),
- ncol);
- cudaStreamSynchronize(0);
- cudaFree(res);
- }
-
- void cudak_(cuda_colsame)(const Matrix *a, const Matrix *ref, Matrix *b) {
- dim3 block(1, CUDA_THREADS_NN);
- int nrow = a->nrow;
- int blocks_per_col = CEIL_DIV(nrow, block.y);
- dim3 grid(a->ncol, blocks_per_col);
- MATRIX_ELEM *res;
- size_t stride;
- cudaMallocPitch(&res, &stride, a->ncol * sizeof(MATRIX_ELEM), blocks_per_col);
- cudak_(block_reduce_colsame)<<<grid, block, block.y * sizeof(MATRIX_ELEM)>>> \
- (MATRIX_ELEM_PTR(a), MATRIX_ELEM_PTR(ref), res,
- a->stride / sizeof(MATRIX_ELEM), stride / sizeof(MATRIX_ELEM),
- nrow);
- nrow = blocks_per_col;
- assert((unsigned long)nrow <= block.y);
- grid.y = 1;
- cudaStreamSynchronize(0);
- cudak_(block_reduce_colsum)<<<grid, block, block.y * sizeof(MATRIX_ELEM)>>> \
- (res, MATRIX_ELEM_PTR(b),
- stride / sizeof(MATRIX_ELEM), b->stride / sizeof(MATRIX_ELEM),
- nrow);
- cudaStreamSynchronize(0);
- cudaFree(res);
- }
-
- void cudak_(cuda_colsum)(const Matrix *a, Matrix *b) {
- dim3 block(1, CUDA_THREADS_NN);
- int nrow = a->nrow;
- int blocks_per_col = CEIL_DIV(nrow, block.y);
- dim3 grid(a->ncol, blocks_per_col);
- MATRIX_ELEM *res;
- size_t stride;
- cudaMallocPitch(&res, &stride, a->ncol * sizeof(MATRIX_ELEM), blocks_per_col);
- cudak_(block_reduce_colsum)<<<grid, block, block.y * sizeof(MATRIX_ELEM)>>> \
- (MATRIX_ELEM_PTR(a), res,
- a->stride / sizeof(MATRIX_ELEM), stride / sizeof(MATRIX_ELEM),
- nrow);
- nrow = blocks_per_col;
- assert((unsigned long)nrow <= block.y);
- grid.y = 1;
- cudaStreamSynchronize(0);
- cudak_(block_reduce_colsum)<<<grid, block, block.y * sizeof(MATRIX_ELEM)>>> \
- (res, MATRIX_ELEM_PTR(b),
- stride / sizeof(MATRIX_ELEM), b->stride / sizeof(MATRIX_ELEM),
- nrow);
- cudaStreamSynchronize(0);
- cudaFree(res);
- }
-
- void cudak_(cuda_softmax_final)(const Matrix *a, const Matrix *max,
- const Matrix *deno, Matrix *b) {
- dim3 threadsPerBlock(CUDA_THREADS_N, CUDA_THREADS_N);
- dim3 numBlocks(CEIL_DIV(b->ncol, threadsPerBlock.x),
- CEIL_DIV(b->nrow, threadsPerBlock.y));
- cudak_(softmax_final)<<<numBlocks, threadsPerBlock>>> \
- (MATRIX_ELEM_PTR(a), MATRIX_ELEM_PTR(b),
- MATRIX_ELEM_PTR(max), MATRIX_ELEM_PTR(deno),
- b->nrow, b->ncol,
- b->stride / sizeof(MATRIX_ELEM),
- max->stride / sizeof(MATRIX_ELEM));
- cudaStreamSynchronize(0);
- }
-
- void cudak_(cuda_softmax_denominator)(const Matrix *a, const Matrix *max, Matrix *b) {
- dim3 block(CUDA_THREADS_NN, 1);
- int ncol = a->ncol;
- int blocks_per_row = CEIL_DIV(ncol, block.x);
- dim3 grid(blocks_per_row, a->nrow);
- MATRIX_ELEM *res;
- size_t stride;
- assert(max->ncol == 1);
- cudaMallocPitch(&res, &stride, blocks_per_row * sizeof(MATRIX_ELEM), a->nrow);
- cudak_(block_reduce_softmax_rowsum) \
- <<<grid, block, block.x * sizeof(MATRIX_ELEM)>>> \
- (MATRIX_ELEM_PTR(a), res, MATRIX_ELEM_PTR(max),
- a->stride / sizeof(MATRIX_ELEM), stride / sizeof(MATRIX_ELEM),
- max->stride / sizeof(MATRIX_ELEM),
- ncol);
- ncol = blocks_per_row;
- assert((unsigned long)ncol <= block.x);
- grid.x = 1;
- cudaStreamSynchronize(0);
- cudak_(block_reduce_rowsum) \
- <<<grid, block, block.x * sizeof(MATRIX_ELEM)>>> \
- (res, MATRIX_ELEM_PTR(b),
- stride / sizeof(MATRIX_ELEM), b->stride / sizeof(MATRIX_ELEM),
- ncol);
- cudaStreamSynchronize(0);
- cudaFree(res);
- }
-
- void cudak_(cuda_rowmax)(const Matrix *a, Matrix *b) {
- dim3 block(CUDA_THREADS_NN, 1);
- int ncol = a->ncol;
- int blocks_per_row = CEIL_DIV(ncol, block.x);
- dim3 grid(blocks_per_row, a->nrow);
- MATRIX_ELEM *res;
- size_t stride;
- cudaMallocPitch(&res, &stride, blocks_per_row * sizeof(MATRIX_ELEM), a->nrow);
- cudak_(block_reduce_rowmax)<<<grid, block, block.x * sizeof(MATRIX_ELEM)>>> \
- (MATRIX_ELEM_PTR(a), res,
- a->stride / sizeof(MATRIX_ELEM), stride / sizeof(MATRIX_ELEM),
- ncol);
- ncol = blocks_per_row;
- assert((unsigned long)ncol <= block.x);
- grid.x = 1;
- cudaStreamSynchronize(0);
- cudak_(block_reduce_rowmax)<<<grid, block, block.x * sizeof(MATRIX_ELEM)>>> \
- (res, MATRIX_ELEM_PTR(b),
- stride / sizeof(MATRIX_ELEM), b->stride / sizeof(MATRIX_ELEM),
- ncol);
- cudaStreamSynchronize(0);
- cudaFree(res);
- }
-
- void cudak_(cuda_rowmax_idx)(const Matrix *a, Matrix *b, Matrix *b_idx) {
- dim3 block(CUDA_THREADS_NN, 1);
- int ncol = a->ncol;
- int blocks_per_row = CEIL_DIV(ncol, block.x);
- dim3 grid(blocks_per_row, a->nrow);
- MATRIX_ELEM *a_idx, *res, *res_idx;
- size_t stride;
- cudaMallocPitch(&a_idx, &stride, a->stride, a->nrow);
- cudak_(gen_col_idx)<<<grid, block>>>(a_idx, a->nrow, ncol, stride / sizeof(MATRIX_ELEM));
- cudaMallocPitch(&res, &stride, blocks_per_row * sizeof(MATRIX_ELEM), a->nrow);
- cudaMallocPitch(&res_idx, &stride, blocks_per_row * sizeof(MATRIX_ELEM), a->nrow);
- cudaStreamSynchronize(0);
- cudak_(block_reduce_rowmax_idx)<<<grid, block,
- 2 * block.x * sizeof(MATRIX_ELEM)>>> \
- (MATRIX_ELEM_PTR(a), a_idx, res, res_idx,
- a->stride / sizeof(MATRIX_ELEM), stride / sizeof(MATRIX_ELEM),
- ncol);
- ncol = blocks_per_row;
- assert((unsigned long)ncol <= block.x);
- grid.x = 1;
- cudaStreamSynchronize(0);
- cudak_(block_reduce_rowmax_idx)<<<grid, block,
- 2 * block.x * sizeof(MATRIX_ELEM)>>> \
- (res, res_idx, MATRIX_ELEM_PTR(b), MATRIX_ELEM_PTR(b_idx),
- stride / sizeof(MATRIX_ELEM), b->stride / sizeof(MATRIX_ELEM),
- ncol);
- cudaStreamSynchronize(0);
- cudaFree(a_idx);
- cudaFree(res);
- cudaFree(res_idx);
- }
-
- /* in-place calc */
- void cudak_(cuda_add_row)(const Matrix *a, Matrix *b, double beta) {
- dim3 threadsPerBlock(CUDA_THREADS_N, CUDA_THREADS_N);
- dim3 numBlocks(CEIL_DIV(b->ncol, threadsPerBlock.x),
- CEIL_DIV(b->nrow, threadsPerBlock.y));
- cudak_(add_row)<<<numBlocks, threadsPerBlock>>> \
- (MATRIX_ELEM_PTR(a), MATRIX_ELEM_PTR(b), b->nrow, b->ncol,
- b->stride / sizeof(MATRIX_ELEM), beta);
- cudaStreamSynchronize(0);
- }
-
- void cudak_(cuda_fill)(Matrix *a, double val) {
- dim3 threadsPerBlock(CUDA_THREADS_N, CUDA_THREADS_N);
- dim3 numBlocks(CEIL_DIV(a->ncol, threadsPerBlock.x),
- CEIL_DIV(a->nrow, threadsPerBlock.y));
- cudak_(fill)<<<numBlocks, threadsPerBlock>>> \
- (MATRIX_ELEM_PTR(a), a->nrow, a->ncol,
- a->stride / sizeof(MATRIX_ELEM), val);
- cudaStreamSynchronize(0);
- }
-
- void cudak_(cuda_clip)(Matrix *a, double val_1, double val_2) {
- dim3 threadsPerBlock(CUDA_THREADS_N, CUDA_THREADS_N);
- dim3 numBlocks(CEIL_DIV(a->ncol, threadsPerBlock.x),
- CEIL_DIV(a->nrow, threadsPerBlock.y));
- cudak_(clip)<<<numBlocks, threadsPerBlock>>> \
- (MATRIX_ELEM_PTR(a), a->nrow, a->ncol,
- a->stride / sizeof(MATRIX_ELEM), val_1, val_2);
- cudaStreamSynchronize(0);
- }
-
- void cudak_(cuda_expand_frm)(const Matrix *a, Matrix *b, int context) {
- dim3 threadsPerBlock(CUDA_THREADS_N, CUDA_THREADS_N);
- dim3 numBlocks(CEIL_DIV(b->ncol, threadsPerBlock.x),
- CEIL_DIV(b->nrow, threadsPerBlock.y));
- cudak_(expand_frm)<<<numBlocks, threadsPerBlock>>> \
- (MATRIX_ELEM_PTR(a), MATRIX_ELEM_PTR(b),
- a->nrow, a->ncol,
- b->nrow, b->ncol,
- a->stride / sizeof(MATRIX_ELEM),
- b->stride / sizeof(MATRIX_ELEM),
- context);
- cudaStreamSynchronize(0);
- }
-
- void cudak_(cuda_rearrange_frm)(const Matrix *a, Matrix *b, int step) {
- dim3 threadsPerBlock(CUDA_THREADS_N, CUDA_THREADS_N);
- dim3 numBlocks(CEIL_DIV(b->ncol, threadsPerBlock.x),
- CEIL_DIV(b->nrow, threadsPerBlock.y));
- cudak_(rearrange_frm)<<<numBlocks, threadsPerBlock>>> \
- (MATRIX_ELEM_PTR(a), MATRIX_ELEM_PTR(b),
- b->nrow, b->ncol, b->stride / sizeof(MATRIX_ELEM),
- step, b->ncol / step);
- cudaStreamSynchronize(0);
- }
-
- void cudak_(cuda_scale_rows_by_col)(const Matrix *a, Matrix *b) {
- dim3 threadsPerBlock(CUDA_THREADS_N, CUDA_THREADS_N);
- dim3 numBlocks(CEIL_DIV(b->ncol, threadsPerBlock.x),
- CEIL_DIV(b->nrow, threadsPerBlock.y));
- cudak_(scale_rows_by_col)<<<numBlocks, threadsPerBlock>>> \
- (MATRIX_ELEM_PTR(a), MATRIX_ELEM_PTR(b),
- b->nrow, b->ncol,
- a->stride / sizeof(MATRIX_ELEM),
- b->stride / sizeof(MATRIX_ELEM));
- cudaStreamSynchronize(0);
- }
-
- void cudak_(cuda_scale_rows_by_row)(const Matrix *a, Matrix *b) {
- dim3 threadsPerBlock(CUDA_THREADS_N, CUDA_THREADS_N);
- dim3 numBlocks(CEIL_DIV(b->ncol, threadsPerBlock.x),
- CEIL_DIV(b->nrow, threadsPerBlock.y));
- cudak_(scale_rows_by_row)<<<numBlocks, threadsPerBlock>>> \
- (MATRIX_ELEM_PTR(a), MATRIX_ELEM_PTR(b),
- b->nrow, b->ncol, b->stride / sizeof(MATRIX_ELEM));
- cudaStreamSynchronize(0);
- }
-
- void cudak_(cuda_decompress)(const Matrix *a, Matrix *b) {
- dim3 threadsPerBlock(1, CUDA_THREADS_NN);
- dim3 numBlocks(1, CEIL_DIV(a->nrow, threadsPerBlock.y));
- cudak_(decompress)<<<numBlocks, threadsPerBlock>>> \
- (MATRIX_ELEM_PTR(a), MATRIX_ELEM_PTR(b),
- a->nrow, a->ncol,
- a->stride / sizeof(MATRIX_ELEM),
- b->stride / sizeof(MATRIX_ELEM));
- cudaStreamSynchronize(0);
- }
-}
-#endif
diff --git a/nerv/matrix/generic/cumatrix.c b/nerv/matrix/generic/cumatrix.c
index 4bdf5f0..08cb4c2 100644
--- a/nerv/matrix/generic/cumatrix.c
+++ b/nerv/matrix/generic/cumatrix.c
@@ -43,15 +43,6 @@ static int nerv_matrix_(lua_mul)(lua_State *L) {
return 0;
}
-static int nerv_matrix_(lua_create)(lua_State *L) {
- Status status;
- Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname));
- Matrix *b = nerv_matrix_(create)(a->nrow, a->ncol, &status);
- NERV_LUA_CHECK_STATUS(L, status);
- luaT_pushudata(L, b, nerv_matrix_(tname));
- return 1;
-}
-
static int nerv_matrix_(lua_sigmoid)(lua_State *L) {
Status status;
Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname));
@@ -237,12 +228,12 @@ static int nerv_matrix_(lua_decompress)(lua_State *L) {
return 1;
}
-extern const char *nerv_matrix_host_int_tname;
+extern const char *nerv_matrix_host_float_tname;
static int nerv_matrix_(lua_copy_rows_fromh_by_idx)(lua_State *L) {
Status status;
Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname));
const Matrix *b = luaT_checkudata(L, 2, MATRIX_CUMATRIX_HOST_TNAME);
- const Matrix *idx = luaT_checkudata(L, 3, nerv_matrix_host_int_tname);
+ const Matrix *idx = luaT_checkudata(L, 3, nerv_matrix_host_float_tname);
long nrow = a->nrow;
int b_begin = lua_gettop(L) > 3 ? luaL_checkinteger(L, 4) : 0;
nerv_matrix_(copy_rows_fromh_by_idx)(a, b, idx, b_begin, &status);
@@ -250,6 +241,18 @@ static int nerv_matrix_(lua_copy_rows_fromh_by_idx)(lua_State *L) {
return 0;
}
+static int nerv_matrix_(lua_copy_rows_fromd_by_idx)(lua_State *L) {
+ Status status;
+ Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname));
+ const Matrix *b = luaT_checkudata(L, 2, nerv_matrix_(tname));
+ const Matrix *idx = luaT_checkudata(L, 3, nerv_matrix_(tname));
+ long nrow = a->nrow;
+ int b_begin = lua_gettop(L) > 3 ? luaL_checkinteger(L, 4) : 0;
+ nerv_matrix_(copy_rows_fromd_by_idx)(a, b, idx, b_begin, &status);
+ NERV_LUA_CHECK_STATUS(L, status);
+ return 0;
+}
+
static int nerv_matrix_(lua_expand_frm)(lua_State *L) {
Status status;
Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname));
@@ -289,7 +292,6 @@ static int nerv_matrix_(lua_scale_rows_by_row)(lua_State *L) {
}
static const luaL_Reg nerv_matrix_(extra_methods)[] = {
- {"create", nerv_matrix_(lua_create)},
{"colsum", nerv_matrix_(lua_colsum)},
{"colsame", nerv_matrix_(lua_colsame)},
{"rowsum", nerv_matrix_(lua_rowsum)},
@@ -300,6 +302,8 @@ static const luaL_Reg nerv_matrix_(extra_methods)[] = {
/* in-place calc */
{"copy_fromh", nerv_matrix_(lua_copy_fromh)},
{"copy_fromd", nerv_matrix_(lua_copy_fromd)},
+ /* alias for copy_fromd */
+ {"copy_from", nerv_matrix_(lua_copy_fromd)},
{"copy_toh", nerv_matrix_(lua_copy_toh)},
{"add", nerv_matrix_(lua_add)},
{"mul", nerv_matrix_(lua_mul)},
@@ -312,6 +316,7 @@ static const luaL_Reg nerv_matrix_(extra_methods)[] = {
{"mul_elem", nerv_matrix_(lua_mul_elem)},
{"log_elem", nerv_matrix_(lua_log_elem)},
{"copy_rows_fromh_by_idx", nerv_matrix_(lua_copy_rows_fromh_by_idx)},
+ {"copy_rows_fromd_by_idx", nerv_matrix_(lua_copy_rows_fromd_by_idx)},
{"expand_frm", nerv_matrix_(lua_expand_frm)},
{"rearrange_frm", nerv_matrix_(lua_rearrange_frm)},
{"scale_rows_by_row", nerv_matrix_(lua_scale_rows_by_row)},
@@ -321,6 +326,9 @@ static const luaL_Reg nerv_matrix_(extra_methods)[] = {
static void cuda_matrix_(init)(lua_State *L) {
luaN_append_methods(L, nerv_matrix_(extra_methods));
+#ifdef CUMATRIX_INIT
+ CUMATRIX_INIT(L);
+#endif
}
int nerv_matrix_(lua_get_elem)(lua_State *L) {
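
On the Lua binding side, copy_rows_fromh_by_idx now expects its index matrix to be a host float matrix, a device-side copy_rows_fromd_by_idx is added, and copy_from becomes an alias for copy_fromd; the per-type lua_create binding is dropped in favour of the generic Lua-level create shown in the next hunk. A hedged sketch of the row-gather calls, with every matrix name purely illustrative:

    local hb   = nerv.MMatrixFloat(4, 10)          -- host rows to gather
    local a    = nerv.CuMatrixFloat(4, 10)
    local idx  = nerv.MMatrixFloat.perm_gen(4)     -- 0-based indices as floats
    a:copy_rows_fromh_by_idx(hb, idx)              -- host -> device gather
    local b    = nerv.CuMatrixFloat(4, 10)
    local didx = nerv.CuMatrixFloat(1, 4)
    didx:copy_fromh(idx)                           -- index row on the device
    b:copy_rows_fromd_by_idx(a, didx)              -- device -> device gather
    b:copy_from(a)                                 -- same as b:copy_fromd(a)
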
diff --git a/nerv/matrix/init.lua b/nerv/matrix/init.lua
index f230e9f..1091d7e 100644
--- a/nerv/matrix/init.lua
+++ b/nerv/matrix/init.lua
@@ -45,6 +45,10 @@ function nerv.Matrix:generate(gen)
end
end
+function nerv.Matrix:create(nrow, ncol)
+ return self.__constructor(nrow or self:nrow(), ncol or self:ncol())
+end
+
nerv.MMatrixInt.fmt = "%d "
function nerv.CuMatrix:__add__(b)
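
The generic nerv.Matrix:create replaces the removed C binding: with no arguments it allocates a fresh matrix of the caller's type and shape, and explicit dimensions override either default. A small sketch, assuming a CUDA float matrix:

    local m    = nerv.CuMatrixFloat(16, 429)
    local same = m:create()          -- 16 x 429, same class as m
    local col  = m:create(16, 1)     -- same class, explicit shape
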
diff --git a/nerv/matrix/mmatrix.c b/nerv/matrix/mmatrix.c
index 5561572..961059c 100644
--- a/nerv/matrix/mmatrix.c
+++ b/nerv/matrix/mmatrix.c
@@ -16,7 +16,30 @@ void nerv_lua_mmatrix_init(lua_State *L) {
#define host_matrix_(NAME) host_matrix_float_##NAME
#define nerv_matrix_(NAME) nerv_matrix_host_float_##NAME
const char *nerv_matrix_(tname) = "nerv.MMatrixFloat";
+#define MMATRIX_INIT(L) host_matrix_(init_extra)(L)
+
+static const luaL_Reg nerv_matrix_(extra_methods_int)[];
+static void host_matrix_(init_extra)(lua_State *L) {
+ luaN_append_methods(L, nerv_matrix_(extra_methods_int));
+}
+
#include "generic/mmatrix.c"
+#include "../lib/matrix/mmatrix.h"
+
+static int nerv_matrix_(lua_perm_gen)(lua_State *L) {
+ Status status;
+ int ncol = luaL_checkinteger(L, 1);
+ Matrix *self = nerv_matrix_(perm_gen)(ncol, &status);
+ NERV_LUA_CHECK_STATUS(L, status);
+ luaT_pushudata(L, self, nerv_matrix_(tname));
+ return 1;
+}
+
+static const luaL_Reg nerv_matrix_(extra_methods_int)[] = {
+ {"perm_gen", nerv_matrix_(lua_perm_gen)},
+ {NULL, NULL}
+};
+
#undef nerv_matrix_
#undef host_matrix_
#undef MATRIX_USE_FLOAT
@@ -24,6 +47,7 @@ const char *nerv_matrix_(tname) = "nerv.MMatrixFloat";
#undef MATRIX_ELEM_PTR
#undef MATRIX_ELEM_FMT
#undef MATRIX_ELEM_WRITE_FMT
+#undef MMATRIX_INIT
#define NERV_GENERIC_MMATRIX
#define MATRIX_USE_DOUBLE
@@ -44,26 +68,4 @@ const char *nerv_matrix_(tname) = "nerv.MMatrixDouble";
#define host_matrix_(NAME) host_matrix_int_##NAME
#define nerv_matrix_(NAME) nerv_matrix_host_int_##NAME
const char *nerv_matrix_(tname) = "nerv.MMatrixInt";
-#define MMATRIX_INIT(L) host_matrix_(init_extra)(L)
-
-static const luaL_Reg nerv_matrix_(extra_methods_int)[];
-static void host_matrix_(init_extra)(lua_State *L) {
- luaN_append_methods(L, nerv_matrix_(extra_methods_int));
-}
-
#include "generic/mmatrix.c"
-#include "../lib/matrix/mmatrix.h"
-
-static int nerv_matrix_(lua_perm_gen)(lua_State *L) {
- Status status;
- int i, ncol = luaL_checkinteger(L, 1);
- Matrix *self = nerv_matrix_(perm_gen)(ncol, &status);
- NERV_LUA_CHECK_STATUS(L, status);
- luaT_pushudata(L, self, nerv_matrix_(tname));
- return 1;
-}
-
-static const luaL_Reg nerv_matrix_(extra_methods_int)[] = {
- {"perm_gen", nerv_matrix_(lua_perm_gen)},
- {NULL, NULL}
-};
diff --git a/nerv/nn/layer_dag.lua b/nerv/nn/layer_dag.lua
index 8e30216..73bb77d 100644
--- a/nerv/nn/layer_dag.lua
+++ b/nerv/nn/layer_dag.lua
@@ -79,7 +79,7 @@ function DAGLayer:__init(id, global_conf, layer_conf)
end
table.insert(parsed_conn,
- {{ref_from, port_from}, {ref_to, port_to}})
+ {{ref_from, port_from}, {ref_to, port_to}})
table.insert(ref_from.next_layers, ref_to) -- add edge
ref_to.in_deg = ref_to.in_deg + 1 -- increase the in-degree of the target layer
end
@@ -112,7 +112,7 @@ function DAGLayer:__init(id, global_conf, layer_conf)
end
end
for i = 1, #queue do
- nerv.info("enqueued layer: %s", queue[i].layer.id)
+ nerv.info("enqueued layer: %s %s", queue[i].layer, queue[i].layer.id)
end
for id, ref in pairs(layers) do
@@ -125,6 +125,7 @@ function DAGLayer:__init(id, global_conf, layer_conf)
self.layers = layers
self.inputs = inputs
self.outputs = outputs
+ self.id = id
self.dim_in = dim_in
self.dim_out = dim_out
self.parsed_conn = parsed_conn
@@ -139,8 +140,11 @@ function DAGLayer:init(batch_size)
ref_from, port_from = unpack(conn[1])
ref_to, port_to = unpack(conn[2])
_, output_dim = ref_from.layer:get_dim()
- local mid = self.gconf.cumat_type(batch_size,
- output_dim[port_from])
+ local dim = 1
+ if output_dim[port_from] > 0 then
+ dim = output_dim[port_from]
+ end
+ local mid = self.gconf.cumat_type(batch_size, dim)
local err_mid = mid:create()
ref_from.outputs[port_from] = mid
@@ -175,8 +179,38 @@ function DAGLayer:init(batch_size)
end
end
+function DAGLayer:batch_resize(batch_size)
+ self.gconf.batch_size = batch_size
+
+ for i, conn in ipairs(self.parsed_conn) do
+ local _, output_dim
+ local ref_from, port_from, ref_to, port_to
+ ref_from, port_from = unpack(conn[1])
+ ref_to, port_to = unpack(conn[2])
+ _, output_dim = ref_from.layer:get_dim()
+
+ if ref_from.outputs[port_from]:nrow() ~= batch_size and output_dim[port_from] > 0 then
+ local mid = self.gconf.cumat_type(batch_size, output_dim[port_from])
+ local err_mid = mid:create()
+
+ ref_from.outputs[port_from] = mid
+ ref_to.inputs[port_to] = mid
+
+ ref_from.err_inputs[port_from] = err_mid
+ ref_to.err_outputs[port_to] = err_mid
+ end
+ end
+ for id, ref in pairs(self.layers) do
+ ref.layer:batch_resize(batch_size)
+ end
+ collectgarbage("collect")
+end
+
function DAGLayer:set_inputs(input)
for i = 1, #self.dim_in do
+ if input[i] == nil then
+ nerv.error("some input is not provided");
+ end
local layer = self.inputs[i][1]
local port = self.inputs[i][2]
layer.inputs[port] = input[i]
@@ -185,6 +219,9 @@ end
function DAGLayer:set_outputs(output)
for i = 1, #self.dim_out do
+ if output[i] == nil then
+ nerv.error("some output is not provided");
+ end
local layer = self.outputs[i][1]
local port = self.outputs[i][2]
layer.outputs[port] = output[i]
@@ -221,11 +258,13 @@ end
function DAGLayer:propagate(input, output)
self:set_inputs(input)
self:set_outputs(output)
+ local ret = false
for i = 1, #self.queue do
local ref = self.queue[i]
-- print(ref.layer.id)
- ref.layer:propagate(ref.inputs, ref.outputs)
+ ret = ref.layer:propagate(ref.inputs, ref.outputs)
end
+ return ret
end
function DAGLayer:back_propagate(bp_err, next_bp_err, input, output)
@@ -247,3 +286,30 @@ function DAGLayer:get_params()
end
return nerv.ParamRepo.merge(param_repos)
end
+
+DAGLayer.PORT_TYPES = {
+ INPUT = {},
+ OUTPUT = {},
+ ERR_INPUT = {},
+ ERR_OUTPUT = {}
+}
+
+function DAGLayer:get_intermediate(id, port_type)
+ if id == "<input>" or id == "<output>" then
+ nerv.error("an actual real layer id is expected")
+ end
+ local layer = self.layers[id]
+ if layer == nil then
+ nerv.error("layer id %s not found", id)
+ end
+ if port_type == DAGLayer.PORT_TYPES.INPUT then
+ return layer.inputs
+ elseif port_type == DAGLayer.PORT_TYPES.OUTPUT then
+ return layer.outputs
+ elseif port_type == DAGLayer.PORT_TYPES.ERR_INPUT then
+ return layer.err_inputs
+ elseif port_type == DAGLayer.PORT_TYPES.ERR_OUTPUT then
+ return layer.err_outputs
+ end
+ nerv.error("unrecognized port type")
+end
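
Taken together, the DAG layer changes add checks that every input and output slot is supplied, make propagate return the result of the last layer in the topological queue, allow the intermediate buffers to be rebuilt when the minibatch size changes, and expose those buffers through get_intermediate keyed by the PORT_TYPES sentinels. A hedged usage sketch, with dag, gconf and the layer id purely illustrative:

    -- resize for a short final minibatch before propagating
    if input[1]:nrow() ~= gconf.batch_size then
        dag:batch_resize(input[1]:nrow())
    end
    local ret = dag:propagate(input, output)   -- value returned by the last layer
    -- inspect a named layer's activations, e.g. for debugging
    local outs = dag:get_intermediate("affine0",
                                      nerv.DAGLayer.PORT_TYPES.OUTPUT)
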
diff --git a/nerv/nn/layer_repo.lua b/nerv/nn/layer_repo.lua
index 602c37c..ef333a7 100644
--- a/nerv/nn/layer_repo.lua
+++ b/nerv/nn/layer_repo.lua
@@ -1,7 +1,12 @@
local LayerRepo = nerv.class("nerv.LayerRepo")
function LayerRepo:__init(layer_spec, param_repo, global_conf)
- local layers = {}
+ self.layers = {}
+ self:add_layers(layer_spec, param_repo, global_conf)
+end
+
+function LayerRepo:add_layers(layer_spec, param_repo, global_conf)
+ local layers = self.layers
for ltype, llist in pairs(layer_spec) do
local layer_type = nerv.get_type(ltype)
for id, spec in pairs(llist) do
@@ -22,7 +27,6 @@ function LayerRepo:__init(layer_spec, param_repo, global_conf)
layers[id] = layer_type(id, global_conf, layer_config)
end
end
- self.layers = layers
end
function LayerRepo:get_layer(lid)
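
Moving the construction logic into add_layers lets a repository be extended after it is created, for example to register task-specific layers on top of a shared base set. A minimal hedged sketch, where base_spec and extra_spec are ordinary layer_spec tables and "affine0" is an illustrative id:

    local repo = nerv.LayerRepo(base_spec, param_repo, gconf)   -- builds the base layers
    repo:add_layers(extra_spec, param_repo, gconf)               -- merge more layers in later
    local affine = repo:get_layer("affine0")
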
diff --git a/speech b/speech
deleted file mode 160000
-Subproject 08e33afa533af1f026ac271446a0c873fe0bb5c