44 files changed, 1174 insertions(+), 722 deletions(-)
diff --git a/.gitmodules b/.gitmodules index 1432de9..9f556c5 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,12 +1,6 @@ [submodule "luajit-2.0"] path = luajit-2.0 url = http://luajit.org/git/luajit-2.0.git -[submodule "nerv-speech"] - path = nerv-speech - url = https://github.com/Determinant/nerv-speech.git -[submodule "speech"] - path = speech - url = https://github.com/Determinant/nerv-speech.git [submodule "luarocks"] path = luarocks url = https://github.com/keplerproject/luarocks.git @@ -9,6 +9,7 @@ luarocks: install: cd nerv; $(PREFIX)/bin/luarocks make speech: + cd speech/speech_utils; $(PREFIX)/bin/luarocks make cd speech/htk_io; $(PREFIX)/bin/luarocks make clean: cd nerv && make clean @@ -8,10 +8,12 @@ First make sure you have __lua__ and __CUDA__ installed on your computer. __Nerv__ is currently developed via github.You can download and make __Nerv__ by doing the following: ``` cd ~ -git clone https://github.com/Determinant/nerv.git +git clone https://github.com/Nerv-SJTU/nerv.git cd nerv +git clone https://github.com/Nerv-SJTU/nerv-speech.git speech git submodule init && git submodule update make +make speech ``` The `git submodule` command is for the __luajit__ repository inside __Nerv__. Now, you can try to run some example scripts. diff --git a/embedding_example/.gitignore b/embedding_example/.gitignore new file mode 100644 index 0000000..8e68213 --- /dev/null +++ b/embedding_example/.gitignore @@ -0,0 +1,2 @@ +main +main.o diff --git a/embedding_example/Makefile b/embedding_example/Makefile new file mode 100644 index 0000000..73287f4 --- /dev/null +++ b/embedding_example/Makefile @@ -0,0 +1,20 @@ +CFLAG += -I ../install/include/luajit-2.0/ -I ../install/include/nerv/ +LDFLAG += -L../install/lib/ -lluajit-5.1 -Wl,-rpath=../install/lib/ -lluaT -lnervcore +GCC := gcc + +.PHONY: FORCE + +all: main FORCE +clean: + -rm -f *.o + -rm main + +FORCE: ../install/bin/luarocks + echo "#!/bin/bash" > run.sh + $< path >> run.sh + echo "./main" >> run.sh +main: main.o + $(GCC) -o $@ $< $(LDFLAG) + +main.o: main.c + $(GCC) $(CFLAG) -o $@ $< -c diff --git a/embedding_example/main.c b/embedding_example/main.c new file mode 100644 index 0000000..8856d58 --- /dev/null +++ b/embedding_example/main.c @@ -0,0 +1,93 @@ +#include "lua.h" +#include "lauxlib.h" +#include "lualib.h" +#include "matrix/matrix.h" +#include "common.h" +#include "luaT/luaT.h" +#include <stdio.h> + +const char *nerv_matrix_host_float_tname = "nerv.MMatrixFloat"; +const char *input_name = "_nerv_embed_input"; +const char *output_name = "_nerv_embed_output"; +extern Matrix *nerv_matrix_host_float_create(long nrow, long ncol, Status *status); +extern void nerv_matrix_host_float_data_retain(Matrix *self); +extern void nerv_matrix_host_float_data_free(Matrix *self, Status *status); + +lua_State *L; +Matrix *input, *output; +Status status; + +void setup_nerv() { + L = lua_open(); + luaL_openlibs(L); + luaL_loadfile(L, "setup_nerv.lua"); + /* network configuration */ + lua_pushstring(L, "../nerv/examples/swb_baseline.lua"); + if (lua_pcall(L, 1, LUA_MULTRET, 0)) + { + printf("%s\n", luaL_checkstring(L, 1)); + exit(1); + } + /* lua stack now: input width, output width, propagator */ + input = nerv_matrix_host_float_create(1, luaL_checkinteger(L, 1), &status); + NERV_LUA_CHECK_STATUS(L, status); + output = nerv_matrix_host_float_create(1, luaL_checkinteger(L, 2), &status); + NERV_LUA_CHECK_STATUS(L, status); + /* add reference to avoid gc */ + luaT_pushudata(L, output, nerv_matrix_host_float_tname); + luaT_pushudata(L, input, 
nerv_matrix_host_float_tname); + lua_setfield(L, LUA_GLOBALSINDEX, input_name); + lua_setfield(L, LUA_GLOBALSINDEX, output_name); +} + + +void propagate(float for_fun) { + int i, j; + printf("ok: %d\n", lua_gettop(L)); + lua_pushvalue(L, 3); + /* lua stack now: input width, output width, propagator, propagator */ + for (i = 0; i < input->nrow; i++) /* nrow is actually 1 */ + { + float *nerv_row = (float *)((char *)input->data.f + i * input->stride); + for (j = 0; j < input->ncol; j++) + { + nerv_row[j] = j * for_fun; + } + } + lua_getfield(L, LUA_GLOBALSINDEX, input_name); + lua_getfield(L, LUA_GLOBALSINDEX, output_name); + /* lua stack now: input width, output width, propagator, propagator, input, output */ + if (lua_pcall(L, 2, 0, 0)) /* call propagator with two parameters */ + { + printf("%s\n", luaL_checkstring(L, -1)); + exit(-1); + } + /* lua stack now: input width, output width, propagator */ + printf("## output: %ld %ld ##\n", output->nrow, output->ncol); + for (i = 0; i < output->nrow; i++) /* nrow is actually 1 */ + { + float *nerv_row = (float *)((char *)output->data.f + i * output->stride); + for (j = 0; j < output->ncol; j++) + { + printf("%.8f ", nerv_row[j]); + } + } +} + +void teardown_nerv() { + lua_pushnil(L); + lua_pushnil(L); + lua_setfield(L, LUA_GLOBALSINDEX, input_name); + lua_setfield(L, LUA_GLOBALSINDEX, output_name); + lua_gc(L, LUA_GCCOLLECT, 0); +} + +int main() { + setup_nerv(); + propagate(1.0); + propagate(2.0); + propagate(2.0); + propagate(3.0); + teardown_nerv(); + return 0; +} diff --git a/embedding_example/run.sh b/embedding_example/run.sh new file mode 100755 index 0000000..e919263 --- /dev/null +++ b/embedding_example/run.sh @@ -0,0 +1,4 @@ +#!/bin/bash +export LUA_PATH='/home/slhome/mfy43/.luarocks/share/lua/5.1/?.lua;/home/slhome/mfy43/.luarocks/share/lua/5.1/?/init.lua;/home/slhome/mfy43/nerv/install/share/lua/5.1/?.lua;/home/slhome/mfy43/nerv/install/share/lua/5.1/?/init.lua;./?.lua;/usr/local/share/luajit-2.0.4/?.lua;/usr/local/share/lua/5.1/?.lua;/usr/local/share/lua/5.1/?/init.lua' +export LUA_CPATH='/home/slhome/mfy43/.luarocks/lib/lua/5.1/?.so;/home/slhome/mfy43/nerv/install/lib/lua/5.1/?.so;./?.so;/usr/local/lib/lua/5.1/?.so;/usr/local/lib/lua/5.1/loadall.so' +./main diff --git a/embedding_example/setup_nerv.lua b/embedding_example/setup_nerv.lua new file mode 100644 index 0000000..d80c306 --- /dev/null +++ b/embedding_example/setup_nerv.lua @@ -0,0 +1,25 @@ +local k,l,_=pcall(require,"luarocks.loader") _=k and l.add_context("nerv","scm-1") +require 'nerv' +local arg = {...} +dofile(arg[1]) +local param_repo = nerv.ParamRepo() +param_repo:import(gconf.initialized_param, nil, gconf) +local layer_repo = make_layer_repo(param_repo) +local network = get_decode_network(layer_repo) +local global_transf = get_global_transf(layer_repo) +local batch_size = 1 +network:init(batch_size) + +function propagator(input, output) + local transformed = nerv.speech_utils.global_transf( + gconf.cumat_type.new_from_host(input), + global_transf, 0, 0, gconf) -- preprocessing + local gpu_input = transformed + local gpu_output = nerv.CuMatrixFloat(output:nrow(), output:ncol()) + network:propagate({gpu_input}, {gpu_output}) + gpu_output:copy_toh(output) + -- collect garbage in-time to save GPU memory + collectgarbage("collect") +end + +return network.dim_in[1], network.dim_out[1], propagator diff --git a/nerv/Makefile b/nerv/Makefile index 022e2fb..b449f82 100644 --- a/nerv/Makefile +++ b/nerv/Makefile @@ -30,14 +30,14 @@ LUAT_OBJS := $(addprefix 
$(OBJ_DIR)/,$(LUAT_OBJS)) OBJS := $(CORE_OBJS) $(NERV_OBJS) $(LUAT_OBJS) LIBS := $(INST_LIBDIR)/libnerv.so $(LIB_PATH)/libnervcore.so $(LIB_PATH)/libluaT.so LUA_LIBS := matrix/init.lua io/init.lua init.lua \ - layer/init.lua layer/affine.lua layer/sigmoid.lua layer/softmax_ce.lua \ - layer/window.lua layer/bias.lua layer/combiner.lua layer/mse.lua layer/affine_recurrent.lua\ + layer/init.lua layer/affine.lua layer/sigmoid.lua layer/softmax_ce.lua layer/softmax.lua \ + layer/window.lua layer/bias.lua layer/combiner.lua layer/mse.lua layer/affine_recurrent.lua \ nn/init.lua nn/layer_repo.lua nn/param_repo.lua nn/layer_dag.lua \ io/sgd_buffer.lua INCLUDE := -I $(LUA_INCDIR) -DLUA_USE_APICHECK -CUDA_BASE := /usr/local/cuda-6.5 -#CUDA_BASE := /usr/local/cuda-5.0 +#CUDA_BASE := /usr/local/cuda-7.0 +CUDA_BASE := /usr/local/cuda CUDA_INCLUDE := -I $(CUDA_BASE)/include/ INCLUDE += $(CUDA_INCLUDE) @@ -66,7 +66,7 @@ $(LIB_PATH)/libluaT.so: $(LUAT_OBJS) $(INST_LIBDIR)/libnerv.so: $(NERV_OBJS) $(LIB_PATH)/libnervcore.so $(LIB_PATH)/libluaT.so gcc -shared -o $@ $(NERV_OBJS) $(LDFLAGS) -Wl,-rpath=$(LIB_PATH) -L$(LIB_PATH) -lnervcore -lluaT -$(OBJ_DIR)/matrix/cumatrix.o: matrix/generic/cumatrix.c matrix/generic/matrix.c matrix/generic/cukernel.cu +$(OBJ_DIR)/matrix/cumatrix.o: matrix/generic/cumatrix.c matrix/generic/matrix.c $(OBJ_DIR)/matrix/mmatrix.o: matrix/generic/mmatrix.c matrix/generic/matrix.c $(OBJ_DIR)/lib/matrix/cumatrix.o: lib/matrix/generic/cumatrix.c lib/matrix/generic/matrix.c lib/matrix/generic/cukernel.cu diff --git a/nerv/examples/asr_trainer.lua b/nerv/examples/asr_trainer.lua index 4fa4096..69cfeed 100644 --- a/nerv/examples/asr_trainer.lua +++ b/nerv/examples/asr_trainer.lua @@ -1,9 +1,9 @@ function build_trainer(ifname) local param_repo = nerv.ParamRepo() param_repo:import(ifname, nil, gconf) - local sublayer_repo = make_sublayer_repo(param_repo) - local layer_repo = make_layer_repo(sublayer_repo, param_repo) + local layer_repo = make_layer_repo(param_repo) local network = get_network(layer_repo) + local global_transf = get_global_transf(layer_repo) local input_order = get_input_order() local iterative_trainer = function (prefix, scp_file, bp) gconf.randomize = bp @@ -12,28 +12,41 @@ function build_trainer(ifname) -- initialize the network network:init(gconf.batch_size) gconf.cnt = 0 - err_input = {nerv.CuMatrixFloat(256, 1)} + err_input = {nerv.CuMatrixFloat(gconf.batch_size, 1)} err_input[1]:fill(1) for data in buffer.get_data, buffer do -- prine stat periodically gconf.cnt = gconf.cnt + 1 if gconf.cnt == 1000 then - print_stat(sublayer_repo) + print_stat(layer_repo) nerv.CuMatrix.print_profile() nerv.CuMatrix.clear_profile() gconf.cnt = 0 -- break end local input = {} --- if gconf.cnt == 100 then break end - for i, id in ipairs(input_order) do +-- if gconf.cnt == 1000 then break end + for i, e in ipairs(input_order) do + local id = e.id if data[id] == nil then nerv.error("input data %s not found", id) end - table.insert(input, data[id]) + local transformed + if e.global_transf then + transformed = nerv.speech_utils.global_transf(data[id], + global_transf, + gconf.frm_ext or 0, 0, + gconf) + else + transformed = data[id] + end + table.insert(input, transformed) + end + local output = {nerv.CuMatrixFloat(gconf.batch_size, 1)} + err_output = {} + for i = 1, #input do + table.insert(err_output, input[i]:create()) end - local output = {nerv.CuMatrixFloat(256, 1)} - err_output = {input[1]:create()} network:propagate(input, output) if bp then network:back_propagate(err_input, 
err_output, input, output) @@ -42,16 +55,16 @@ function build_trainer(ifname) -- collect garbage in-time to save GPU memory collectgarbage("collect") end - print_stat(sublayer_repo) + print_stat(layer_repo) nerv.CuMatrix.print_profile() nerv.CuMatrix.clear_profile() if (not bp) and prefix ~= nil then nerv.info("writing back...") local fname = string.format("%s_cv%.3f.nerv", - prefix, get_accuracy(sublayer_repo)) + prefix, get_accuracy(layer_repo)) network:get_params():export(fname, nil) end - return get_accuracy(sublayer_repo) + return get_accuracy(layer_repo) end return iterative_trainer end diff --git a/nerv/examples/mmi_chime3.lua b/nerv/examples/mmi_chime3.lua new file mode 100644 index 0000000..6ac7f28 --- /dev/null +++ b/nerv/examples/mmi_chime3.lua @@ -0,0 +1,183 @@ +require 'kaldi_io' +require 'kaldi_seq' +gconf = {lrate = 0.00001, wcost = 0, momentum = 0.0, + cumat_type = nerv.CuMatrixFloat, + mmat_type = nerv.MMatrixFloat, + frm_ext = 5, + tr_scp = "ark,o:/slfs6/users/ymz09/kaldi/src/featbin/copy-feats scp:/slfs5/users/ymz09/chime/baseline/ASR/exp/tri4a_dnn_tr05_multi_enhanced_mmi/train.scp ark:- |", + initialized_param = {"/slfs6/users/ymz09/nerv-project/nerv/nerv-speech/kaldi_seq/test/chime3_init_mmi.nerv", + "/slfs6/users/ymz09/nerv-project/nerv/nerv-speech/kaldi_seq/test/chime3_global_transf_mmi.nerv"}, + debug = false} + +function make_layer_repo(param_repo) + local layer_repo = nerv.LayerRepo( + { + -- global transf + ["nerv.BiasLayer"] = + { + blayer1 = {{bias = "bias1"}, {dim_in = {440}, dim_out = {440}}}, + blayer2 = {{bias = "bias2"}, {dim_in = {440}, dim_out = {440}}} + }, + ["nerv.WindowLayer"] = + { + wlayer1 = {{window = "window1"}, {dim_in = {440}, dim_out = {440}}}, + wlayer2 = {{window = "window2"}, {dim_in = {440}, dim_out = {440}}} + }, + -- biased linearity + ["nerv.AffineLayer"] = + { + affine0 = {{ltp = "affine0_ltp", bp = "affine0_bp"}, + {dim_in = {440}, dim_out = {2048}}}, + affine1 = {{ltp = "affine1_ltp", bp = "affine1_bp"}, + {dim_in = {2048}, dim_out = {2048}}}, + affine2 = {{ltp = "affine2_ltp", bp = "affine2_bp"}, + {dim_in = {2048}, dim_out = {2048}}}, + affine3 = {{ltp = "affine3_ltp", bp = "affine3_bp"}, + {dim_in = {2048}, dim_out = {2048}}}, + affine4 = {{ltp = "affine4_ltp", bp = "affine4_bp"}, + {dim_in = {2048}, dim_out = {2048}}}, + affine5 = {{ltp = "affine5_ltp", bp = "affine5_bp"}, + {dim_in = {2048}, dim_out = {2048}}}, + affine6 = {{ltp = "affine6_ltp", bp = "affine6_bp"}, + {dim_in = {2048}, dim_out = {2048}}}, + affine7 = {{ltp = "affine7_ltp", bp = "affine7_bp"}, + {dim_in = {2048}, dim_out = {2011}}} + }, + ["nerv.SigmoidLayer"] = + { + sigmoid0 = {{}, {dim_in = {2048}, dim_out = {2048}}}, + sigmoid1 = {{}, {dim_in = {2048}, dim_out = {2048}}}, + sigmoid2 = {{}, {dim_in = {2048}, dim_out = {2048}}}, + sigmoid3 = {{}, {dim_in = {2048}, dim_out = {2048}}}, + sigmoid4 = {{}, {dim_in = {2048}, dim_out = {2048}}}, + sigmoid5 = {{}, {dim_in = {2048}, dim_out = {2048}}}, + sigmoid6 = {{}, {dim_in = {2048}, dim_out = {2048}}} + }, + ["nerv.MMILayer"] = + { + mmi_crit = {{}, {dim_in = {2011, -1}, dim_out = {1}, + cmd = { + arg = "--class-frame-counts=/slfs5/users/ymz09/chime/baseline/ASR/exp/tri4a_dnn_tr05_multi_enhanced/ali_train_pdf.counts --acoustic-scale=0.1 --lm-scale=1.0 --learn-rate=0.00001 --drop-frames=true --verbose=1", + mdl = "/slfs5/users/ymz09/chime/baseline/ASR/exp/tri4a_dnn_tr05_multi_enhanced_ali/final.mdl", + lat = "scp:/slfs5/users/ymz09/chime/baseline/ASR/exp/tri4a_dnn_tr05_multi_enhanced_denlats/lat.scp", + ali = 
"ark:gunzip -c /slfs5/users/ymz09/chime/baseline/ASR/exp/tri4a_dnn_tr05_multi_enhanced_ali/ali.*.gz |" + } + } + } + }, + ["nerv.SoftmaxLayer"] = -- softmax for decode output + { + softmax = {{}, {dim_in = {2011}, dim_out = {2011}}} + } + }, param_repo, gconf) + + layer_repo:add_layers( + { + ["nerv.DAGLayer"] = + { + global_transf = {{}, { + dim_in = {440}, dim_out = {440}, + sub_layers = layer_repo, + connections = { + ["<input>[1]"] = "blayer1[1]", + ["blayer1[1]"] = "wlayer1[1]", + ["wlayer1[1]"] = "blayer2[1]", + ["blayer2[1]"] = "wlayer2[1]", + ["wlayer2[1]"] = "<output>[1]" + } + }}, + main = {{}, { + dim_in = {440}, dim_out = {2011}, + sub_layers = layer_repo, + connections = { + ["<input>[1]"] = "affine0[1]", + ["affine0[1]"] = "sigmoid0[1]", + ["sigmoid0[1]"] = "affine1[1]", + ["affine1[1]"] = "sigmoid1[1]", + ["sigmoid1[1]"] = "affine2[1]", + ["affine2[1]"] = "sigmoid2[1]", + ["sigmoid2[1]"] = "affine3[1]", + ["affine3[1]"] = "sigmoid3[1]", + ["sigmoid3[1]"] = "affine4[1]", + ["affine4[1]"] = "sigmoid4[1]", + ["sigmoid4[1]"] = "affine5[1]", + ["affine5[1]"] = "sigmoid5[1]", + ["sigmoid5[1]"] = "affine6[1]", + ["affine6[1]"] = "sigmoid6[1]", + ["sigmoid6[1]"] = "affine7[1]", + ["affine7[1]"] = "<output>[1]" + } + }} + } + }, param_repo, gconf) + + layer_repo:add_layers( + { + ["nerv.DAGLayer"] = + { + mmi_output = {{}, { + dim_in = {440, -1}, dim_out = {1}, + sub_layers = layer_repo, + connections = { + ["<input>[1]"] = "main[1]", + ["main[1]"] = "mmi_crit[1]", + ["<input>[2]"] = "mmi_crit[2]", + ["mmi_crit[1]"] = "<output>[1]" + } + }}, + softmax_output = {{}, { + dim_in = {440}, dim_out = {2011}, + sub_layers = layer_repo, + connections = { + ["<input>[1]"] = "main[1]", + ["main[1]"] = "softmax[1]", + ["softmax[1]"] = "<output>[1]" + } + }} + } + }, param_repo, gconf) + + return layer_repo +end + +function get_network(layer_repo) + return layer_repo:get_layer("mmi_output") +end + +function get_decode_network(layer_repo) + return layer_repo:get_layer("softmax_output") +end + +function get_global_transf(layer_repo) + return layer_repo:get_layer("global_transf") +end + +function make_readers(feature_rspecifier, layer_repo) + return { + {reader = nerv.KaldiReader(gconf, + { + id = "main_scp", + feature_rspecifier = feature_rspecifier, + frm_ext = gconf.frm_ext, + global_transf = layer_repo:get_layer("global_transf"), + mlfs = {} + }) + } + } +end + +function get_input_order() + return {{id = "main_scp", global_transf = true}, + {id = "key"}} +end + +function get_accuracy(layer_repo) + return 0 +end + +function print_stat(layer_repo) + local mmi_crit = layer_repo:get_layer("mmi_crit") + nerv.info("*** training stat begin ***") + nerv.printf("frames:\t\t\t%d\n", mmi_crit.total_frames) + nerv.info("*** training stat end ***") +end diff --git a/nerv/examples/mpe_chime3.lua b/nerv/examples/mpe_chime3.lua new file mode 100644 index 0000000..ec095b0 --- /dev/null +++ b/nerv/examples/mpe_chime3.lua @@ -0,0 +1,186 @@ +require 'kaldi_io' +require 'kaldi_seq' +gconf = {lrate = 0.00001, wcost = 0, momentum = 0.0, + cumat_type = nerv.CuMatrixFloat, + mmat_type = nerv.MMatrixFloat, + frm_ext = 5, + tr_scp = "ark,s,cs:/slfs6/users/ymz09/kaldi/src/featbin/copy-feats scp:/slfs5/users/ymz09/chime/baseline/ASR/exp/tri4a_dnn_tr05_multi_enhanced_smbr/train.scp ark:- |", + initialized_param = {"/slfs6/users/ymz09/nerv-project/nerv/nerv-speech/kaldi_seq/test/chime3_init.nerv", + "/slfs6/users/ymz09/nerv-project/nerv/nerv-speech/kaldi_seq/test/chime3_global_transf.nerv"}, + debug = false} + +function 
make_layer_repo(param_repo) + local layer_repo = nerv.LayerRepo( + { + -- global transf + ["nerv.BiasLayer"] = + { + blayer1 = {{bias = "bias1"}, {dim_in = {440}, dim_out = {440}}}, + blayer2 = {{bias = "bias2"}, {dim_in = {440}, dim_out = {440}}} + }, + ["nerv.WindowLayer"] = + { + wlayer1 = {{window = "window1"}, {dim_in = {440}, dim_out = {440}}}, + wlayer2 = {{window = "window2"}, {dim_in = {440}, dim_out = {440}}} + }, + -- biased linearity + ["nerv.AffineLayer"] = + { + affine0 = {{ltp = "affine0_ltp", bp = "affine0_bp"}, + {dim_in = {440}, dim_out = {2048}}}, + affine1 = {{ltp = "affine1_ltp", bp = "affine1_bp"}, + {dim_in = {2048}, dim_out = {2048}}}, + affine2 = {{ltp = "affine2_ltp", bp = "affine2_bp"}, + {dim_in = {2048}, dim_out = {2048}}}, + affine3 = {{ltp = "affine3_ltp", bp = "affine3_bp"}, + {dim_in = {2048}, dim_out = {2048}}}, + affine4 = {{ltp = "affine4_ltp", bp = "affine4_bp"}, + {dim_in = {2048}, dim_out = {2048}}}, + affine5 = {{ltp = "affine5_ltp", bp = "affine5_bp"}, + {dim_in = {2048}, dim_out = {2048}}}, + affine6 = {{ltp = "affine6_ltp", bp = "affine6_bp"}, + {dim_in = {2048}, dim_out = {2048}}}, + affine7 = {{ltp = "affine7_ltp", bp = "affine7_bp"}, + {dim_in = {2048}, dim_out = {2011}}} + }, + ["nerv.SigmoidLayer"] = + { + sigmoid0 = {{}, {dim_in = {2048}, dim_out = {2048}}}, + sigmoid1 = {{}, {dim_in = {2048}, dim_out = {2048}}}, + sigmoid2 = {{}, {dim_in = {2048}, dim_out = {2048}}}, + sigmoid3 = {{}, {dim_in = {2048}, dim_out = {2048}}}, + sigmoid4 = {{}, {dim_in = {2048}, dim_out = {2048}}}, + sigmoid5 = {{}, {dim_in = {2048}, dim_out = {2048}}}, + sigmoid6 = {{}, {dim_in = {2048}, dim_out = {2048}}} + }, + ["nerv.MPELayer"] = + { + mpe_crit = {{}, {dim_in = {2011, -1}, dim_out = {1}, + cmd = { + arg = "--class-frame-counts=/slfs5/users/ymz09/chime/baseline/ASR/exp/tri4a_dnn_tr05_multi_enhanced/ali_train_pdf.counts --acoustic-scale=0.1 --lm-scale=1.0 --learn-rate=0.00001 --do-smbr=true --verbose=1", + mdl = "/slfs5/users/ymz09/chime/baseline/ASR/exp/tri4a_dnn_tr05_multi_enhanced_ali/final.mdl", + lat = "scp:/slfs5/users/ymz09/chime/baseline/ASR/exp/tri4a_dnn_tr05_multi_enhanced_denlats/lat.scp", + ali = "ark:gunzip -c /slfs5/users/ymz09/chime/baseline/ASR/exp/tri4a_dnn_tr05_multi_enhanced_ali/ali.*.gz |" + } + } + } + }, + ["nerv.SoftmaxLayer"] = -- softmax for decode output + { + softmax = {{}, {dim_in = {2011}, dim_out = {2011}}} + } + }, param_repo, gconf) + + layer_repo:add_layers( + { + ["nerv.DAGLayer"] = + { + global_transf = {{}, { + dim_in = {440}, dim_out = {440}, + sub_layers = layer_repo, + connections = { + ["<input>[1]"] = "blayer1[1]", + ["blayer1[1]"] = "wlayer1[1]", + ["wlayer1[1]"] = "blayer2[1]", + ["blayer2[1]"] = "wlayer2[1]", + ["wlayer2[1]"] = "<output>[1]" + } + }}, + main = {{}, { + dim_in = {440}, dim_out = {2011}, + sub_layers = layer_repo, + connections = { + ["<input>[1]"] = "affine0[1]", + ["affine0[1]"] = "sigmoid0[1]", + ["sigmoid0[1]"] = "affine1[1]", + ["affine1[1]"] = "sigmoid1[1]", + ["sigmoid1[1]"] = "affine2[1]", + ["affine2[1]"] = "sigmoid2[1]", + ["sigmoid2[1]"] = "affine3[1]", + ["affine3[1]"] = "sigmoid3[1]", + ["sigmoid3[1]"] = "affine4[1]", + ["affine4[1]"] = "sigmoid4[1]", + ["sigmoid4[1]"] = "affine5[1]", + ["affine5[1]"] = "sigmoid5[1]", + ["sigmoid5[1]"] = "affine6[1]", + ["affine6[1]"] = "sigmoid6[1]", + ["sigmoid6[1]"] = "affine7[1]", + ["affine7[1]"] = "<output>[1]" + } + }} + } + }, param_repo, gconf) + + layer_repo:add_layers( + { + ["nerv.DAGLayer"] = + { + mpe_output = {{}, { + dim_in = {440, -1}, 
dim_out = {1}, + sub_layers = layer_repo, + connections = { + ["<input>[1]"] = "main[1]", + ["main[1]"] = "mpe_crit[1]", + ["<input>[2]"] = "mpe_crit[2]", + ["mpe_crit[1]"] = "<output>[1]" + } + }}, + softmax_output = {{}, { + dim_in = {440}, dim_out = {2011}, + sub_layers = layer_repo, + connections = { + ["<input>[1]"] = "main[1]", + ["main[1]"] = "softmax[1]", + ["softmax[1]"] = "<output>[1]" + } + }} + } + }, param_repo, gconf) + + return layer_repo +end + +function get_network(layer_repo) + return layer_repo:get_layer("mpe_output") +end + +function get_decode_network(layer_repo) + return layer_repo:get_layer("softmax_output") +end + +function get_global_transf(layer_repo) + return layer_repo:get_layer("global_transf") +end + +function make_readers(feature_rspecifier, layer_repo) + return { + {reader = nerv.KaldiReader(gconf, + { + id = "main_scp", + feature_rspecifier = feature_rspecifier, + frm_ext = gconf.frm_ext, + global_transf = layer_repo:get_layer("global_transf"), + mlfs = {} + }) + } + } +end + +function get_input_order() + return {{id = "main_scp", global_transf = true}, + {id = "key"}} +end + +function get_accuracy(layer_repo) + local mpe_crit = layer_repo:get_layer("mpe_crit") + return mpe_crit.total_correct / mpe_crit.total_frames * 100 +end + +function print_stat(layer_repo) + local mpe_crit = layer_repo:get_layer("mpe_crit") + nerv.info("*** training stat begin ***") + nerv.printf("correct:\t\t%d\n", mpe_crit.total_correct) + nerv.printf("frames:\t\t\t%d\n", mpe_crit.total_frames) + nerv.printf("accuracy:\t\t%.3f%%\n", get_accuracy(layer_repo)) + nerv.info("*** training stat end ***") +end diff --git a/nerv/examples/seq_trainer.lua b/nerv/examples/seq_trainer.lua new file mode 100644 index 0000000..b8ed3eb --- /dev/null +++ b/nerv/examples/seq_trainer.lua @@ -0,0 +1,87 @@ +function build_trainer(ifname) + local param_repo = nerv.ParamRepo() + param_repo:import(ifname, nil, gconf) + local layer_repo = make_layer_repo(param_repo) + local network = get_network(layer_repo) + local global_transf = get_global_transf(layer_repo) + local input_order = get_input_order() + local iterative_trainer = function (prefix, scp_file, bp) + local readers = make_readers(scp_file, layer_repo) + -- initialize the network + network:init(1) + gconf.cnt = 0 + for ri = 1, #readers, 1 do + while true do + local data = readers[ri].reader:get_data() + if data == nil then + break + end + -- prine stat periodically + gconf.cnt = gconf.cnt + 1 + if gconf.cnt == 1000 then + print_stat(layer_repo) + nerv.CuMatrix.print_profile() + nerv.CuMatrix.clear_profile() + gconf.cnt = 0 + -- break + end + local input = {} + -- if gconf.cnt == 1000 then break end + for i, e in ipairs(input_order) do + local id = e.id + if data[id] == nil then + nerv.error("input data %s not found", id) + end + local transformed + if e.global_transf then + local batch = gconf.cumat_type(data[id]:nrow(), data[id]:ncol()) + batch:copy_fromh(data[id]) + transformed = nerv.speech_utils.global_transf(batch, + global_transf, + gconf.frm_ext or 0, 0, + gconf) + else + transformed = data[id] + end + table.insert(input, transformed) + end + err_output = {input[1]:create()} + network:batch_resize(input[1]:nrow()) + if network:propagate(input, {{}}) == true then + network:back_propagate({{}}, err_output, input, {{}}) + gconf.batch_size = 1.0 - gconf.momentum -- important!!! 
+ network:update({{}}, input, {{}}) + end + -- collect garbage in-time to save GPU memory + collectgarbage("collect") + end + end + print_stat(layer_repo) + nerv.CuMatrix.print_profile() + nerv.CuMatrix.clear_profile() + if prefix ~= nil then + nerv.info("writing back...") + local fname = string.format("%s_tr%.3f.nerv", + prefix, get_accuracy(layer_repo)) + network:get_params():export(fname, nil) + end + return get_accuracy(layer_repo) + end + return iterative_trainer +end + +dofile(arg[1]) + +local pf0 = gconf.initialized_param +local trainer = build_trainer(pf0) + +local i = 1 +nerv.info("[NN] begin iteration %d with lrate = %.6f", i, gconf.lrate) +local accu_tr = trainer(string.format("%s_%s_iter_%d_lr%f", +string.gsub( +(string.gsub(pf0[1], "(.*/)(.*)", "%2")), +"(.*)%..*", "%1"), +os.date("%Y%m%d%H%M%S"), +i, gconf.lrate), gconf.tr_scp, true) +nerv.info("[TR] training set %d: %.3f", i, accu_tr) + diff --git a/nerv/examples/swb_baseline.lua b/nerv/examples/swb_baseline.lua index 7783f2a..8f72200 100644 --- a/nerv/examples/swb_baseline.lua +++ b/nerv/examples/swb_baseline.lua @@ -2,7 +2,9 @@ require 'htk_io' gconf = {lrate = 0.8, wcost = 1e-6, momentum = 0.9, cumat_type = nerv.CuMatrixFloat, mmat_type = nerv.MMatrixFloat, + direct_update = true, frm_ext = 5, + frm_trim = 5, tr_scp = "/slfs1/users/mfy43/swb_ivec/train_bp.scp", cv_scp = "/slfs1/users/mfy43/swb_ivec/train_cv.scp", htk_conf = "/slfs1/users/mfy43/swb_ivec/plp_0_d_a.conf", @@ -10,8 +12,8 @@ gconf = {lrate = 0.8, wcost = 1e-6, momentum = 0.9, "/slfs1/users/mfy43/swb_global_transf.nerv"}, debug = false} -function make_sublayer_repo(param_repo) - return nerv.LayerRepo( +function make_layer_repo(param_repo) + local layer_repo = nerv.LayerRepo( { -- global transf ["nerv.BiasLayer"] = @@ -54,21 +56,23 @@ function make_sublayer_repo(param_repo) sigmoid5 = {{}, {dim_in = {2048}, dim_out = {2048}}}, sigmoid6 = {{}, {dim_in = {2048}, dim_out = {2048}}} }, - ["nerv.SoftmaxCELayer"] = + ["nerv.SoftmaxCELayer"] = -- softmax + ce criterion layer for finetune output { ce_crit = {{}, {dim_in = {3001, 1}, dim_out = {1}, compressed = true}} + }, + ["nerv.SoftmaxLayer"] = -- softmax for decode output + { + softmax = {{}, {dim_in = {3001}, dim_out = {3001}}} } }, param_repo, gconf) -end -function make_layer_repo(sublayer_repo, param_repo) - return nerv.LayerRepo( + layer_repo:add_layers( { ["nerv.DAGLayer"] = { global_transf = {{}, { dim_in = {429}, dim_out = {429}, - sub_layers = sublayer_repo, + sub_layers = layer_repo, connections = { ["<input>[1]"] = "blayer1[1]", ["blayer1[1]"] = "wlayer1[1]", @@ -78,8 +82,8 @@ function make_layer_repo(sublayer_repo, param_repo) } }}, main = {{}, { - dim_in = {429, 1}, dim_out = {1}, - sub_layers = sublayer_repo, + dim_in = {429}, dim_out = {3001}, + sub_layers = layer_repo, connections = { ["<input>[1]"] = "affine0[1]", ["affine0[1]"] = "sigmoid0[1]", @@ -96,17 +100,51 @@ function make_layer_repo(sublayer_repo, param_repo) ["sigmoid5[1]"] = "affine6[1]", ["affine6[1]"] = "sigmoid6[1]", ["sigmoid6[1]"] = "affine7[1]", - ["affine7[1]"] = "ce_crit[1]", + ["affine7[1]"] = "<output>[1]" + } + }} + } + }, param_repo, gconf) + + layer_repo:add_layers( + { + ["nerv.DAGLayer"] = + { + ce_output = {{}, { + dim_in = {429, 1}, dim_out = {1}, + sub_layers = layer_repo, + connections = { + ["<input>[1]"] = "main[1]", + ["main[1]"] = "ce_crit[1]", ["<input>[2]"] = "ce_crit[2]", ["ce_crit[1]"] = "<output>[1]" } + }}, + softmax_output = {{}, { + dim_in = {429}, dim_out = {3001}, + sub_layers = layer_repo, + connections = { 
+ ["<input>[1]"] = "main[1]", + ["main[1]"] = "softmax[1]", + ["softmax[1]"] = "<output>[1]" + } }} } }, param_repo, gconf) + + return layer_repo end function get_network(layer_repo) - return layer_repo:get_layer("main") + return layer_repo:get_layer("ce_output") +end + +function get_decode_network(layer_repo) + return layer_repo:get_layer("softmax_output") +end + +function get_global_transf(layer_repo) + return layer_repo:get_layer("global_transf") end function make_readers(scp_file, layer_repo) @@ -125,8 +163,7 @@ function make_readers(scp_file, layer_repo) dir = "*/", ext = "lab" } - }, - global_transf = layer_repo:get_layer("global_transf") + } }), data = {main_scp = 429, phone_state = 1}} } @@ -137,26 +174,28 @@ function make_buffer(readers) { buffer_size = gconf.buffer_size, randomize = gconf.randomize, - readers = readers + readers = readers, + use_gpu = true }) end function get_input_order() - return {"main_scp", "phone_state"} + return {{id = "main_scp", global_transf = true}, + {id = "phone_state"}} end -function get_accuracy(sublayer_repo) - local ce_crit = sublayer_repo:get_layer("ce_crit") +function get_accuracy(layer_repo) + local ce_crit = layer_repo:get_layer("ce_crit") return ce_crit.total_correct / ce_crit.total_frames * 100 end -function print_stat(sublayer_repo) - local ce_crit = sublayer_repo:get_layer("ce_crit") +function print_stat(layer_repo) + local ce_crit = layer_repo:get_layer("ce_crit") nerv.info("*** training stat begin ***") nerv.printf("cross entropy:\t\t%.8f\n", ce_crit.total_ce) nerv.printf("correct:\t\t%d\n", ce_crit.total_correct) nerv.printf("frames:\t\t\t%d\n", ce_crit.total_frames) nerv.printf("err/frm:\t\t%.8f\n", ce_crit.total_ce / ce_crit.total_frames) - nerv.printf("accuracy:\t\t%.3f%%\n", get_accuracy(sublayer_repo)) + nerv.printf("accuracy:\t\t%.3f%%\n", get_accuracy(layer_repo)) nerv.info("*** training stat end ***") end diff --git a/nerv/examples/swb_baseline_basic.lua b/nerv/examples/swb_baseline_basic.lua new file mode 100644 index 0000000..71f04a3 --- /dev/null +++ b/nerv/examples/swb_baseline_basic.lua @@ -0,0 +1,162 @@ +require 'htk_io' +gconf = {lrate = 0.8, wcost = 1e-6, momentum = 0.9, + cumat_type = nerv.CuMatrixFloat, + mmat_type = nerv.MMatrixFloat, + frm_ext = 5, + frm_trim = 5, + tr_scp = "/slfs1/users/mfy43/swb_ivec/train_bp.scp", + cv_scp = "/slfs1/users/mfy43/swb_ivec/train_cv.scp", + htk_conf = "/slfs1/users/mfy43/swb_ivec/plp_0_d_a.conf", + initialized_param = {"/slfs1/users/mfy43/swb_init.nerv", + "/slfs1/users/mfy43/swb_global_transf.nerv"}, + debug = false} + +function make_layer_repo(param_repo) + local layer_repo = nerv.LayerRepo( + { + -- global transf + ["nerv.BiasLayer"] = + { + blayer1 = {{bias = "bias1"}, {dim_in = {429}, dim_out = {429}}}, + blayer2 = {{bias = "bias2"}, {dim_in = {429}, dim_out = {429}}} + }, + ["nerv.WindowLayer"] = + { + wlayer1 = {{window = "window1"}, {dim_in = {429}, dim_out = {429}}}, + wlayer2 = {{window = "window2"}, {dim_in = {429}, dim_out = {429}}} + }, + -- biased linearity + ["nerv.AffineLayer"] = + { + affine0 = {{ltp = "affine0_ltp", bp = "affine0_bp"}, + {dim_in = {429}, dim_out = {2048}}}, + affine1 = {{ltp = "affine1_ltp", bp = "affine1_bp"}, + {dim_in = {2048}, dim_out = {2048}}}, + affine2 = {{ltp = "affine2_ltp", bp = "affine2_bp"}, + {dim_in = {2048}, dim_out = {2048}}}, + affine3 = {{ltp = "affine3_ltp", bp = "affine3_bp"}, + {dim_in = {2048}, dim_out = {2048}}}, + affine4 = {{ltp = "affine4_ltp", bp = "affine4_bp"}, + {dim_in = {2048}, dim_out = {2048}}}, + affine5 = {{ltp = 
"affine5_ltp", bp = "affine5_bp"}, + {dim_in = {2048}, dim_out = {2048}}}, + affine6 = {{ltp = "affine6_ltp", bp = "affine6_bp"}, + {dim_in = {2048}, dim_out = {2048}}}, + affine7 = {{ltp = "affine7_ltp", bp = "affine7_bp"}, + {dim_in = {2048}, dim_out = {3001}}} + }, + ["nerv.SigmoidLayer"] = + { + sigmoid0 = {{}, {dim_in = {2048}, dim_out = {2048}}}, + sigmoid1 = {{}, {dim_in = {2048}, dim_out = {2048}}}, + sigmoid2 = {{}, {dim_in = {2048}, dim_out = {2048}}}, + sigmoid3 = {{}, {dim_in = {2048}, dim_out = {2048}}}, + sigmoid4 = {{}, {dim_in = {2048}, dim_out = {2048}}}, + sigmoid5 = {{}, {dim_in = {2048}, dim_out = {2048}}}, + sigmoid6 = {{}, {dim_in = {2048}, dim_out = {2048}}} + }, + ["nerv.SoftmaxCELayer"] = + { + ce_crit = {{}, {dim_in = {3001, 1}, dim_out = {1}, compressed = true}} + } + }, param_repo, gconf) + + layer_repo:add_layers( + { + ["nerv.DAGLayer"] = + { + global_transf = {{}, { + dim_in = {429}, dim_out = {429}, + sub_layers = layer_repo, + connections = { + ["<input>[1]"] = "blayer1[1]", + ["blayer1[1]"] = "wlayer1[1]", + ["wlayer1[1]"] = "blayer2[1]", + ["blayer2[1]"] = "wlayer2[1]", + ["wlayer2[1]"] = "<output>[1]" + } + }}, + main = {{}, { + dim_in = {429, 1}, dim_out = {1}, + sub_layers = layer_repo, + connections = { + ["<input>[1]"] = "affine0[1]", + ["affine0[1]"] = "sigmoid0[1]", + ["sigmoid0[1]"] = "affine1[1]", + ["affine1[1]"] = "sigmoid1[1]", + ["sigmoid1[1]"] = "affine2[1]", + ["affine2[1]"] = "sigmoid2[1]", + ["sigmoid2[1]"] = "affine3[1]", + ["affine3[1]"] = "sigmoid3[1]", + ["sigmoid3[1]"] = "affine4[1]", + ["affine4[1]"] = "sigmoid4[1]", + ["sigmoid4[1]"] = "affine5[1]", + ["affine5[1]"] = "sigmoid5[1]", + ["sigmoid5[1]"] = "affine6[1]", + ["affine6[1]"] = "sigmoid6[1]", + ["sigmoid6[1]"] = "affine7[1]", + ["affine7[1]"] = "ce_crit[1]", + ["<input>[2]"] = "ce_crit[2]", + ["ce_crit[1]"] = "<output>[1]" + } + }} + } + }, param_repo, gconf) + return layer_repo +end + +function get_network(layer_repo) + return layer_repo:get_layer("main") +end + +function make_readers(scp_file, layer_repo) + return { + {reader = nerv.TNetReader(gconf, + { + id = "main_scp", + scp_file = scp_file, + conf_file = gconf.htk_conf, + frm_ext = gconf.frm_ext, + mlfs = { + phone_state = { + file = "/slfs1/users/mfy43/swb_ivec/ref.mlf", + format = "map", + format_arg = "/slfs1/users/mfy43/swb_ivec/dict", + dir = "*/", + ext = "lab" + } + } + }), + data = {main_scp = 429, phone_state = 1}} + } +end + +function make_buffer(readers) + return nerv.SGDBuffer(gconf, + { + buffer_size = gconf.buffer_size, + randomize = gconf.randomize, + readers = readers + }) +end + +function get_input_order() + return {{id = "main_scp", global_transf = true}, + {id = "phone_state"}} +end + +function get_accuracy(layer_repo) + local ce_crit = layer_repo:get_layer("ce_crit") + return ce_crit.total_correct / ce_crit.total_frames * 100 +end + +function print_stat(layer_repo) + local ce_crit = layer_repo:get_layer("ce_crit") + nerv.info("*** training stat begin ***") + nerv.printf("cross entropy:\t\t%.8f\n", ce_crit.total_ce) + nerv.printf("correct:\t\t%d\n", ce_crit.total_correct) + nerv.printf("frames:\t\t\t%d\n", ce_crit.total_frames) + nerv.printf("err/frm:\t\t%.8f\n", ce_crit.total_ce / ce_crit.total_frames) + nerv.printf("accuracy:\t\t%.3f%%\n", get_accuracy(layer_repo)) + nerv.info("*** training stat end ***") +end diff --git a/nerv/init.lua b/nerv/init.lua index 89010a7..9c1a5c8 100644 --- a/nerv/init.lua +++ b/nerv/init.lua @@ -1,15 +1,19 @@ require 'libnerv' -function nerv.error(fmt, ...) 
- error(nerv.printf("[nerv] internal error: " .. fmt .. "\n", ...)) -end - function nerv.error_method_not_implemented() nerv.error("method not implemented"); end +function nerv.sprintf(fmt, ...) + return string.format(fmt, ...) +end + function nerv.printf(fmt, ...) - io.write(string.format(fmt, ...)) + io.write(nerv.sprintf(fmt, ...)) +end + +function nerv.error(fmt, ...) + error(nerv.sprintf("[nerv] internal error: " .. fmt .. "\n", ...)) end function nerv.mesg_with_timestamp(fmt, ...) diff --git a/nerv/io/sgd_buffer.lua b/nerv/io/sgd_buffer.lua index f4f7dfe..74c4934 100644 --- a/nerv/io/sgd_buffer.lua +++ b/nerv/io/sgd_buffer.lua @@ -5,8 +5,22 @@ function SGDBuffer:__init(global_conf, buffer_conf) self.buffer_size = math.floor(buffer_conf.buffer_size / global_conf.batch_size) * global_conf.batch_size self.randomize = buffer_conf.randomize - if self.randomize == nil then - self.randomize = false + self.consume = buffer_conf.consume + local cumat_type = global_conf.cumat_type + if buffer_conf.use_gpu then + self.mat_type = cumat_type + self.copy_rows_from_by_idx = cumat_type.copy_rows_fromd_by_idx + self.copy_from = cumat_type.copy_fromd + self.copy_from_reader = cumat_type.copy_fromh + self.perm_gen = function (x) + return cumat_type.new_from_host(nerv.MMatrixFloat.perm_gen(x)) + end + else + self.mat_type = global_conf.mmat_type + self.copy_rows_from_by_idx = cumat_type.copy_rows_fromh_by_idx + self.copy_from = cumat_type.copy_fromh + self.perm_gen = nerv.MMatrixFloat.perm_gen + self.copy_from_reader = self.mat_type.copy_from end self.head = 0 self.tail = 0 @@ -14,7 +28,7 @@ function SGDBuffer:__init(global_conf, buffer_conf) for i, reader_spec in ipairs(buffer_conf.readers) do local buffs = {} for id, width in pairs(reader_spec.data) do - buffs[id] = {data = global_conf.mmat_type(self.buffer_size, width), + buffs[id] = {data = self.mat_type(self.buffer_size, width), leftover = nil, width = width} end @@ -41,7 +55,7 @@ function SGDBuffer:saturate() buff.data:copy_from(buff.leftover, 0, lrow) buff.leftover = nil end - nerv.printf("leftover: %d\n", lrow) + nerv.info("buffer leftover: %d\n", lrow) reader.tail = lrow reader.has_leftover = false end @@ -65,21 +79,21 @@ function SGDBuffer:saturate() if d == nil then nerv.error("reader does not provide data for %s", id) end - buff.leftover = self.gconf.mmat_type(drow - remain, - buff.width) - buff.leftover:copy_from(d, remain, drow) + buff.leftover = self.mat_type(drow - remain, + buff.width) + self.copy_from_reader(buff.leftover, d, remain, drow) end drow = remain reader.has_leftover = true end for id, buff in pairs(reader.buffs) do - buff.data:copy_from(data[id], 0, drow, reader.tail) + self.copy_from_reader(buff.data, data[id], 0, drow, reader.tail) end reader.tail = reader.tail + drow end self.tail = math.min(self.tail, reader.tail) end - self.rand_map = nerv.MMatrixInt.perm_gen(self.tail) -- generate shuffled index + self.rand_map = self.perm_gen(self.tail) -- generate shuffled index collectgarbage("collect") return self.tail >= self.gconf.batch_size end @@ -87,25 +101,31 @@ end function SGDBuffer:get_data() local batch_size = self.gconf.batch_size if self.head >= self.tail then -- buffer is empty - if not self:saturate() then + local t = os.clock() + if (not self:saturate()) and (not self.consume) then return nil -- the remaining data cannot build a batch end + if self.tail == self.head then + return nil -- nothing left + end + nerv.info("%.3fs to fill the buffer", os.clock() - t) end - if self.head + batch_size > self.tail then + if 
self.head + batch_size > self.tail and (not self.consume) then return nil -- the remaining data cannot build a batch end + actual_batch_size = math.min(batch_size, self.tail - self.head) local res = {} for i, reader in ipairs(self.readers) do for id, buff in pairs(reader.buffs) do - local batch = self.gconf.cumat_type(batch_size, buff.width) + local batch = self.gconf.cumat_type(actual_batch_size, buff.width) if self.randomize then - batch:copy_rows_fromh_by_idx(buff.data, self.rand_map, self.head) + self.copy_rows_from_by_idx(batch, buff.data, self.rand_map, self.head) else - batch:copy_fromh(buff.data, self.head, self.head + batch_size) + self.copy_from(batch, buff.data, self.head, self.head + actual_batch_size) end res[id] = batch end end - self.head = self.head + batch_size + self.head = self.head + actual_batch_size return res end diff --git a/nerv/layer/affine.lua b/nerv/layer/affine.lua index 00cbcfb..015ec3f 100644 --- a/nerv/layer/affine.lua +++ b/nerv/layer/affine.lua @@ -42,7 +42,7 @@ function AffineLayer:__init(id, global_conf, layer_conf) self.dim_out = layer_conf.dim_out self.gconf = global_conf self:check_dim_len(1, 1) -- exactly one input and one output - self.direct_update = layer_conf.direct_update + self.direct_update = layer_conf.direct_update or global_conf.direct_update end function AffineLayer:init(batch_size) @@ -60,6 +60,10 @@ function AffineLayer:init(batch_size) self.bp:train_init() end +function AffineLayer:batch_resize(batch_size) + -- do nothing +end + function AffineLayer:update(bp_err, input, output) if self.direct_update then self.ltp.correction:mul(input[1], bp_err[1], 1.0, gconf.momentum, 'T', 'N') diff --git a/nerv/layer/affine_recurrent.lua b/nerv/layer/affine_recurrent.lua index 59d259c..92d98e2 100644 --- a/nerv/layer/affine_recurrent.lua +++ b/nerv/layer/affine_recurrent.lua @@ -37,6 +37,10 @@ function Recurrent:init(batch_size) self.bp:train_init() end +function Recurrent:batch_resize(batch_size) + -- do nothing +end + function Recurrent:update(bp_err, input, output) if (self.direct_update == true) then local ltp_hh = self.ltp_hh.trans diff --git a/nerv/layer/bias.lua b/nerv/layer/bias.lua index c99274d..7e9fd46 100644 --- a/nerv/layer/bias.lua +++ b/nerv/layer/bias.lua @@ -18,6 +18,10 @@ function BiasLayer:init() end end +function BiasLayer:batch_resize(batch_size) + -- do nothing +end + function BiasLayer:propagate(input, output) output[1]:copy_fromd(input[1]) output[1]:add_row(self.bias.trans, 1.0) diff --git a/nerv/layer/combiner.lua b/nerv/layer/combiner.lua index 7bd7617..1bcfdfb 100644 --- a/nerv/layer/combiner.lua +++ b/nerv/layer/combiner.lua @@ -30,6 +30,12 @@ function CombinerLayer:init(batch_size) self.sum = self.gconf.cumat_type(batch_size, dim) end +function CombinerLayer:batch_resize(batch_size) + if self.sum:nrow() ~= batch_size then + self.sum = self.gconf.cumat_type(batch_size, self.dim_in[1]) + end +end + function CombinerLayer:update(bp_err, input, output) end diff --git a/nerv/layer/init.lua b/nerv/layer/init.lua index 7172f99..6861b0e 100644 --- a/nerv/layer/init.lua +++ b/nerv/layer/init.lua @@ -78,3 +78,4 @@ nerv.include('window.lua') nerv.include('mse.lua') nerv.include('combiner.lua') nerv.include('affine_recurrent.lua') +nerv.include('softmax.lua') diff --git a/nerv/layer/mse.lua b/nerv/layer/mse.lua index 9a97add..0ee3080 100644 --- a/nerv/layer/mse.lua +++ b/nerv/layer/mse.lua @@ -20,6 +20,14 @@ function MSELayer:init(batch_size) self.diff = self.mse:create() end +function MSELayer:batch_resize(batch_size) + if 
self.mse:nrow() ~= batch_resize then + self.mse = self.gconf.cumat_type(batch_size, self.dim_in[1]) + self.mse_sum = self.gconf.cumat_type(batch_size, 1) + self.diff = self.mse:create() + end +end + function MSELayer:update(bp_err, input, output) -- no params, therefore do nothing end @@ -34,7 +42,7 @@ function MSELayer:propagate(input, output) if output[1] ~= nil then output[1]:copy_fromd(mse_sum) end - self.total_mse = self.total_mse + mse_sum:colsum()[0] + self.total_mse = self.total_mse + mse_sum:colsum()[0][0] self.total_frames = self.total_frames + mse_sum:nrow() end diff --git a/nerv/layer/sigmoid.lua b/nerv/layer/sigmoid.lua index dfd09eb..0a8bcdc 100644 --- a/nerv/layer/sigmoid.lua +++ b/nerv/layer/sigmoid.lua @@ -14,6 +14,10 @@ function SigmoidLayer:init() end end +function SigmoidLayer:batch_resize(batch_size) + -- do nothing +end + function SigmoidLayer:update(bp_err, input, output) -- no params, therefore do nothing end diff --git a/nerv/layer/softmax.lua b/nerv/layer/softmax.lua new file mode 100644 index 0000000..4205b66 --- /dev/null +++ b/nerv/layer/softmax.lua @@ -0,0 +1,35 @@ +local SoftmaxLayer = nerv.class("nerv.SoftmaxLayer", "nerv.Layer") + +function SoftmaxLayer:__init(id, global_conf, layer_conf) + self.id = id + self.gconf = global_conf + self.dim_in = layer_conf.dim_in + self.dim_out = layer_conf.dim_out + self:check_dim_len(1, 1) -- two inputs: nn output and label +end + +function SoftmaxLayer:init(batch_size) + if self.dim_in[1] ~= self.dim_out[1] then + nerv.error("mismatching dimensions of input and output") + end +end + +function SoftmaxLayer:batch_resize(batch_size) + -- do nothing +end + +function SoftmaxLayer:update(bp_err, input, output) + -- no params, therefore do nothing +end + +function SoftmaxLayer:propagate(input, output) + output[1]:softmax(input[1]) +end + +function SoftmaxLayer:back_propagate(bp_err, next_bp_err, input, output) + nerv.error_method_not_implemented() +end + +function SoftmaxLayer:get_params() + return nerv.ParamRepo({}) +end diff --git a/nerv/layer/softmax_ce.lua b/nerv/layer/softmax_ce.lua index f878a2f..9071e86 100644 --- a/nerv/layer/softmax_ce.lua +++ b/nerv/layer/softmax_ce.lua @@ -23,6 +23,13 @@ function SoftmaxCELayer:init(batch_size) self.ce = self.softmax:create() end +function SoftmaxCELayer:batch_resize(batch_size) + if self.softmax:nrow() ~= batch_resize then + self.softmax = self.gconf.cumat_type(batch_size, self.dim_in[1]) + self.ce = self.softmax:create() + end +end + function SoftmaxCELayer:update(bp_err, input, output) -- no params, therefore do nothing end diff --git a/nerv/layer/window.lua b/nerv/layer/window.lua index 4e9a3b1..8eed352 100644 --- a/nerv/layer/window.lua +++ b/nerv/layer/window.lua @@ -18,6 +18,10 @@ function WindowLayer:init() end end +function WindowLayer:batch_resize(batch_size) + -- do nothing +end + function WindowLayer:propagate(input, output) output[1]:copy_fromd(input[1]) output[1]:scale_rows_by_row(self.window.trans) diff --git a/nerv/lib/matrix/cukernel.h b/nerv/lib/matrix/cukernel.h index 7bb4c2c..2126c6f 100644 --- a/nerv/lib/matrix/cukernel.h +++ b/nerv/lib/matrix/cukernel.h @@ -18,4 +18,6 @@ void cudak_(cuda_rearrange_frm)(const Matrix *a, Matrix *b, int step); void cudak_(cuda_scale_rows_by_row)(const Matrix *a, Matrix *b); void cudak_(cuda_scale_rows_by_col)(const Matrix *a, Matrix *b); void cudak_(cuda_decompress)(const Matrix *a, Matrix *b); +void cudak_(cuda_copy_rows_by_idx)(const Matrix *a, Matrix *b, + const Matrix *idx, int b_begin); #endif diff --git 
a/nerv/lib/matrix/cumatrix.c b/nerv/lib/matrix/cumatrix.c index aa81bfc..ff1168d 100644 --- a/nerv/lib/matrix/cumatrix.c +++ b/nerv/lib/matrix/cumatrix.c @@ -49,6 +49,7 @@ void nerv_cumatrix_init() { #define NERV_CUBLAS_(NAME) cublasS##NAME #define MATRIX_CUMATRIX_HOST_TNAME nerv_matrix_host_float_tname #include "generic/cumatrix.c" + #undef NERV_CUBLAS_ #undef cudak_ #undef nerv_matrix_ diff --git a/nerv/lib/matrix/cumatrix.h b/nerv/lib/matrix/cumatrix.h index 9f71507..e6def66 100644 --- a/nerv/lib/matrix/cumatrix.h +++ b/nerv/lib/matrix/cumatrix.h @@ -1,5 +1,6 @@ #ifndef NERV_CUMATRIX_H #define NERV_CUMATRIX_H +#include "matrix.h" void nerv_cumatrix_print_profile(); void nerv_cumatrix_clear_profile(); void nerv_cumatrix_init(); diff --git a/nerv/lib/matrix/generic/cukernel.cu b/nerv/lib/matrix/generic/cukernel.cu index e337798..08feb59 100644 --- a/nerv/lib/matrix/generic/cukernel.cu +++ b/nerv/lib/matrix/generic/cukernel.cu @@ -284,6 +284,15 @@ __global__ void cudak_(gen_col_idx)(MATRIX_ELEM *b, b[j + i * stride] = j; } +__global__ void cudak_(copy_rows_by_idx)(const MATRIX_ELEM *a, MATRIX_ELEM *b, + const MATRIX_ELEM *idx, int b_begin, + int nrow, int ncol, int stride) { + int j = blockIdx.x * blockDim.x + threadIdx.x; + int i = blockIdx.y * blockDim.y + threadIdx.y; + if (i >= nrow || j >= ncol) return; + b[j + i * stride] = a[j + lrintf(idx[i]) * stride]; +} + extern "C" { #include "../cukernel.h" void cudak_(cuda_log_elem)(const Matrix *a, Matrix *b) { @@ -589,5 +598,16 @@ extern "C" { b->stride / sizeof(MATRIX_ELEM)); cudaStreamSynchronize(0); } + + void cudak_(cuda_copy_rows_by_idx)(const Matrix *a, Matrix *b, + const Matrix *idx, int b_begin) { + dim3 threadsPerBlock(CUDA_THREADS_NN, 1); + dim3 numBlocks(CEIL_DIV(b->ncol, threadsPerBlock.x), b->nrow); + cudak_(copy_rows_by_idx)<<<numBlocks, threadsPerBlock>>> \ + (MATRIX_ELEM_PTR(a), MATRIX_ELEM_PTR(b), + MATRIX_ELEM_PTR(idx) + b_begin, + b_begin, b->nrow, b->ncol, b->stride / sizeof(MATRIX_ELEM)); + cudaStreamSynchronize(0); + } } #endif diff --git a/nerv/lib/matrix/generic/cumatrix.c b/nerv/lib/matrix/generic/cumatrix.c index 40a0030..770e503 100644 --- a/nerv/lib/matrix/generic/cumatrix.c +++ b/nerv/lib/matrix/generic/cumatrix.c @@ -315,16 +315,17 @@ void nerv_matrix_(copy_rows_fromh_by_idx)(Matrix *a, const Matrix *b, long nrow = a->nrow; if (!(0 <= b_begin && b_begin + nrow <= idx->ncol)) NERV_EXIT_STATUS(status, MAT_INVALID_COPY_INTERVAL, 0); - long *idx_ptr = idx->data.i; + float *idx_ptr = idx->data.f; int i; if (idx->nrow != 1) NERV_EXIT_STATUS(status, MAT_IDX_VECTOR_EXP, 0); if (a->ncol != b->ncol) NERV_EXIT_STATUS(status, MAT_MISMATCH_DIM, 0); + PROFILE_START cudaStream_t *streams = (cudaStream_t*)malloc(sizeof(cudaStream_t) * nrow); for (i = 0; i < nrow; i++) { - int src_row = idx_ptr[b_begin + i]; + int src_row = (int)idx_ptr[b_begin + i]; if (!(0 <= src_row && src_row < b->nrow)) NERV_EXIT_STATUS(status, MAT_INVALID_IDX, 0); CUDA_SAFE_CALL(cudaStreamCreate(streams + i), status); @@ -339,6 +340,22 @@ void nerv_matrix_(copy_rows_fromh_by_idx)(Matrix *a, const Matrix *b, CUDA_SAFE_CALL(cudaStreamDestroy(streams[i]), status); } free(streams); + PROFILE_STOP + NERV_SET_STATUS(status, NERV_NORMAL, 0); +} + +void nerv_matrix_(copy_rows_fromd_by_idx)(Matrix *a, const Matrix *b, + const Matrix *idx, int b_begin, Status *status) { + long nrow = a->nrow; + if (!(0 <= b_begin && b_begin + nrow <= idx->ncol)) + NERV_EXIT_STATUS(status, MAT_INVALID_COPY_INTERVAL, 0); + if (idx->nrow != 1) + NERV_EXIT_STATUS(status, 
MAT_IDX_VECTOR_EXP, 0); + if (a->ncol != b->ncol) + NERV_EXIT_STATUS(status, MAT_MISMATCH_DIM, 0); + PROFILE_START + cudak_(cuda_copy_rows_by_idx)(b, a, idx, b_begin); + PROFILE_STOP NERV_SET_STATUS(status, NERV_NORMAL, 0); } diff --git a/nerv/lib/matrix/generic/cumatrix.h b/nerv/lib/matrix/generic/cumatrix.h index 3f1f8a3..04e8c5a 100644 --- a/nerv/lib/matrix/generic/cumatrix.h +++ b/nerv/lib/matrix/generic/cumatrix.h @@ -40,6 +40,8 @@ void nerv_matrix_(log_elem)(Matrix *b, const Matrix *a, Status *status); Matrix *nerv_matrix_(decompress)(const Matrix *a, int orig_col, Status *status); void nerv_matrix_(copy_rows_fromh_by_idx)(Matrix *a, const Matrix *b, const Matrix *idx, int b_begin, Status *status); +void nerv_matrix_(copy_rows_fromd_by_idx)(Matrix *a, const Matrix *b, + const Matrix *idx, int b_begin, Status *status); void nerv_matrix_(expand_frm)(Matrix *a, const Matrix *b, int context, Status *status); diff --git a/nerv/lib/matrix/generic/matrix.c b/nerv/lib/matrix/generic/matrix.c index e4afa37..4246751 100644 --- a/nerv/lib/matrix/generic/matrix.c +++ b/nerv/lib/matrix/generic/matrix.c @@ -3,7 +3,8 @@ #include "matrix.h" /* FIXME: malloc failure detection */ -static void nerv_matrix_(data_free)(Matrix *self, Status *status) { +void nerv_matrix_(data_free)(Matrix *self, Status *status) { + if(*self->data_ref == 0) return; /* FIXME: repeat free memory */ assert(*self->data_ref > 0); if (--(*self->data_ref) == 0) { @@ -18,7 +19,7 @@ static void nerv_matrix_(data_free)(Matrix *self, Status *status) { } } -static void nerv_matrix_(data_retain)(Matrix *self) { +void nerv_matrix_(data_retain)(Matrix *self) { (*self->data_ref)++; } diff --git a/nerv/lib/matrix/generic/matrix.h b/nerv/lib/matrix/generic/matrix.h index 9d44e6d..69b4e6d 100644 --- a/nerv/lib/matrix/generic/matrix.h +++ b/nerv/lib/matrix/generic/matrix.h @@ -2,3 +2,5 @@ Matrix *nerv_matrix_(create)(long nrow, long ncol, Status *status); void nerv_matrix_(destroy)(Matrix *self, Status *status); Matrix *nerv_matrix_(getrow)(Matrix *self, int row); +void nerv_matrix_(data_free)(Matrix *self, Status *status); +void nerv_matrix_(data_retain)(Matrix *self); diff --git a/nerv/lib/matrix/mmatrix.c b/nerv/lib/matrix/mmatrix.c index 94f1ea8..b8157eb 100644 --- a/nerv/lib/matrix/mmatrix.c +++ b/nerv/lib/matrix/mmatrix.c @@ -7,6 +7,25 @@ #define nerv_matrix_(NAME) nerv_matrix_host_float_##NAME #include "generic/matrix.h" #include "generic/mmatrix.c" + +Matrix *nerv_matrix_(perm_gen)(int ncol, Status *status) { + int i; + Matrix *self = nerv_matrix_(create)(1, ncol, status); + if (status->err_code != NERV_NORMAL) + return NULL; + float *prow = self->data.f; + for (i = 0; i < ncol; i++) + prow[i] = i; + for (i = ncol - 1; i >= 0; i--) + { + size_t j = rand() % (i + 1); + float tmp = prow[i]; + prow[i] = prow[j]; + prow[j] = tmp; + } + return self; +} + #undef nerv_matrix_ #undef host_matrix_ #undef MATRIX_USE_FLOAT @@ -33,21 +52,3 @@ #define host_matrix_(NAME) host_matrix_int_##NAME #define nerv_matrix_(NAME) nerv_matrix_host_int_##NAME #include "generic/mmatrix.c" - -Matrix *nerv_matrix_(perm_gen)(int ncol, Status *status) { - int i; - Matrix *self = nerv_matrix_(create)(1, ncol, status); - if (status->err_code != NERV_NORMAL) - return NULL; - long *prow = self->data.i; - for (i = 0; i < ncol; i++) - prow[i] = i; - for (i = ncol - 1; i >= 0; i--) - { - size_t j = rand() % (i + 1); - long tmp = prow[i]; - prow[i] = prow[j]; - prow[j] = tmp; - } - return self; -} diff --git a/nerv/lib/matrix/mmatrix.h b/nerv/lib/matrix/mmatrix.h index 
df91e4c..31e7984 100644 --- a/nerv/lib/matrix/mmatrix.h +++ b/nerv/lib/matrix/mmatrix.h @@ -1,4 +1,5 @@ #ifndef NERV_MMATRIX_H #define NERV_MMATRIX_H -Matrix *nerv_matrix_(perm_gen)(int ncol, Status *status); +#include "matrix.h" +Matrix *nerv_matrix_host_float_perm_gen(int ncol, Status *status); #endif diff --git a/nerv/matrix/generic/cukernel.cu b/nerv/matrix/generic/cukernel.cu deleted file mode 100644 index 2ae5e62..0000000 --- a/nerv/matrix/generic/cukernel.cu +++ /dev/null @@ -1,592 +0,0 @@ -#ifdef NERV_GENERIC_CUKERNEL -#include <assert.h> -#include <stdio.h> -#include "matrix.h" -#include "cuda.h" -#include "float.h" -#define CUDA_THREADS_N 16 -#define CUDA_THREADS_NN ((CUDA_THREADS_N) * (CUDA_THREADS_N)) -#define CEIL_DIV(a, b) (((a) + (b) - 1) / (b)) -__global__ void cudak_(log_elem)(const MATRIX_ELEM *a, MATRIX_ELEM *b, - int nrow, int ncol, int stride) { - int j = blockIdx.x * blockDim.x + threadIdx.x; - int i = blockIdx.y * blockDim.y + threadIdx.y; - long idx; - MATRIX_ELEM tmp; - if (i >= nrow || j >= ncol) return; - idx = j + i * stride; - tmp = a[idx]; - if(tmp < FLT_MIN) tmp = FLT_MIN; - b[idx] = log(tmp); -} - -__global__ void cudak_(mul_elem)(const MATRIX_ELEM *a, const MATRIX_ELEM *b, - MATRIX_ELEM *c, - int nrow, int ncol, int stride) { - int j = blockIdx.x * blockDim.x + threadIdx.x; - int i = blockIdx.y * blockDim.y + threadIdx.y; - long idx; - if (i >= nrow || j >= ncol) return; - idx = j + i * stride; - c[idx] = a[idx] * b[idx]; -} - -__global__ void cudak_(sigmoid)(const MATRIX_ELEM *a, MATRIX_ELEM *b, - int nrow, int ncol, int stride) { - int j = blockIdx.x * blockDim.x + threadIdx.x; - int i = blockIdx.y * blockDim.y + threadIdx.y; - long idx; - if (i >= nrow || j >= ncol) return; - idx = j + i * stride; - b[idx] = 1.0 / (1.0 + exp(-a[idx])); -} - -__global__ void cudak_(sigmoid_grad)(const MATRIX_ELEM *output, - const MATRIX_ELEM *err, - MATRIX_ELEM *nerr, - int nrow, int ncol, int stride) { - int j = blockIdx.x * blockDim.x + threadIdx.x; - int i = blockIdx.y * blockDim.y + threadIdx.y; - long idx; - if (i >= nrow || j >= ncol) return; - idx = j + i * stride; - nerr[idx] = output[idx] * (1.0 - output[idx]) * err[idx]; -} - -__global__ void cudak_(softmax_final)(const MATRIX_ELEM *a, MATRIX_ELEM *b, - const MATRIX_ELEM *max, const MATRIX_ELEM *deno, - int nrow, int ncol, int stride, int mstride) { - int j = blockIdx.x * blockDim.x + threadIdx.x; - int i = blockIdx.y * blockDim.y + threadIdx.y; - long idx; - if (i >= nrow || j >= ncol) return; - idx = j + i * stride; - b[idx] = exp(a[idx] - max[0 + i * mstride]) / deno[0 + i * mstride]; -} - -__global__ void cudak_(block_reduce_rowsum)(const MATRIX_ELEM *input, - MATRIX_ELEM *output, - const int istride, const int ostride, - const int n) { - extern __shared__ MATRIX_ELEM cudak_(arr)[]; - int j = blockIdx.x * blockDim.x + threadIdx.x; - cudak_(arr)[threadIdx.x] = j < n ? input[j + istride * blockIdx.y] : 0; - __syncthreads(); - for (int offset = blockDim.x >> 1; offset; offset >>= 1) - { - if (threadIdx.x < offset) - cudak_(arr)[threadIdx.x] += cudak_(arr)[threadIdx.x + offset]; - __syncthreads(); - } - if (threadIdx.x == 0) - output[blockIdx.x + ostride * blockIdx.y] = cudak_(arr)[0]; -} - -__global__ void cudak_(block_reduce_colsum)(const MATRIX_ELEM *input, - MATRIX_ELEM *output, - const int istride, const int ostride, - const int n) { - extern __shared__ MATRIX_ELEM cudak_(arr)[]; - int i = blockIdx.y * blockDim.y + threadIdx.y; - cudak_(arr)[threadIdx.y] = i < n ? 
input[blockIdx.x + istride * i] : 0; - __syncthreads(); - for (int offset = blockDim.y >> 1; offset; offset >>= 1) - { - if (threadIdx.y < offset) - cudak_(arr)[threadIdx.y] += cudak_(arr)[threadIdx.y + offset]; - __syncthreads(); - } - if (threadIdx.y == 0) - output[blockIdx.x + ostride * blockIdx.y] = cudak_(arr)[0]; -} - -__global__ void cudak_(block_reduce_colsame)(const MATRIX_ELEM *input, - const MATRIX_ELEM *ref_input, - MATRIX_ELEM *output, - const int istride, const int ostride, - const int n) { - extern __shared__ MATRIX_ELEM cudak_(arr)[]; - int i = blockIdx.y * blockDim.y + threadIdx.y; - cudak_(arr)[threadIdx.y] = (i < n && input[blockIdx.x + istride * i] == \ - ref_input[blockIdx.x + istride * i]) ? 1.0 : 0; - __syncthreads(); - for (int offset = blockDim.y >> 1; offset; offset >>= 1) - { - if (threadIdx.y < offset) - cudak_(arr)[threadIdx.y] += cudak_(arr)[threadIdx.y + offset]; - __syncthreads(); - } - if (threadIdx.y == 0) - output[blockIdx.x + ostride * blockIdx.y] = cudak_(arr)[0]; -} - -__global__ void cudak_(block_reduce_softmax_rowsum)(const MATRIX_ELEM *input, - MATRIX_ELEM *output, - const MATRIX_ELEM *max, - const int istride, const int ostride, - const int mstride, const int n) { - extern __shared__ MATRIX_ELEM cudak_(arr)[]; - int j = blockIdx.x * blockDim.x + threadIdx.x; - cudak_(arr)[threadIdx.x] = j < n ? exp(input[j + istride * blockIdx.y] - \ - max[0 + mstride * blockIdx.y]) : 0; - __syncthreads(); - for (int offset = blockDim.x >> 1; offset; offset >>= 1) - { - if (threadIdx.x < offset) - cudak_(arr)[threadIdx.x] += cudak_(arr)[threadIdx.x + offset]; - __syncthreads(); - } - if (threadIdx.x == 0) - output[blockIdx.x + ostride * blockIdx.y] = cudak_(arr)[0]; -} - -__global__ void cudak_(block_reduce_rowmax)(const MATRIX_ELEM *input, - MATRIX_ELEM *output, - const int istride, const int ostride, - const int n) { - extern __shared__ MATRIX_ELEM cudak_(arr)[]; - int j = blockIdx.x * blockDim.x + threadIdx.x; - cudak_(arr)[threadIdx.x] = j < n ? input[j + istride * blockIdx.y] : -FLT_MAX; - __syncthreads(); - for (int offset = blockDim.x >> 1; offset; offset >>= 1) - { - if (threadIdx.x < offset) - { - MATRIX_ELEM l = cudak_(arr)[threadIdx.x], - r = cudak_(arr)[threadIdx.x + offset]; - if (r > l) - cudak_(arr)[threadIdx.x] = r; - } - __syncthreads(); - } - if (threadIdx.x == 0) - output[blockIdx.x + ostride * blockIdx.y] = cudak_(arr)[0]; -} - -__global__ void cudak_(block_reduce_rowmax_idx)(const MATRIX_ELEM *input, - const MATRIX_ELEM *idx_input, - MATRIX_ELEM *output, - MATRIX_ELEM *idx_output, - const int istride, const int ostride, - const int n) { - extern __shared__ MATRIX_ELEM cudak_(arr)[]; - MATRIX_ELEM *arr_val = cudak_(arr); - MATRIX_ELEM *arr_idx = arr_val + blockDim.x; - int j = blockIdx.x * blockDim.x + threadIdx.x; - arr_val[threadIdx.x] = j < n ? input[j + istride * blockIdx.y] : -FLT_MAX; - arr_idx[threadIdx.x] = j < n ? 
idx_input[j + istride * blockIdx.y] : 0; - __syncthreads(); - for (int offset = blockDim.x >> 1; offset; offset >>= 1) - { - if (threadIdx.x < offset) - { - MATRIX_ELEM l = arr_val[threadIdx.x], - r = arr_val[threadIdx.x + offset]; - if (r > l) - { - arr_val[threadIdx.x] = r; - arr_idx[threadIdx.x] = arr_idx[threadIdx.x + offset]; - } - } - __syncthreads(); - } - if (threadIdx.x == 0) - { - output[blockIdx.x + ostride * blockIdx.y] = arr_val[0]; - idx_output[blockIdx.x + ostride * blockIdx.y] = arr_idx[0]; - } -} - -__global__ void cudak_(add_row)(const MATRIX_ELEM *a, MATRIX_ELEM *b, - int nrow, int ncol, int stride, double beta) { - int j = blockIdx.x * blockDim.x + threadIdx.x; - int i = blockIdx.y * blockDim.y + threadIdx.y; - if (i >= nrow || j >= ncol) return; - b[j + i * stride] += beta * a[j]; -} - -__global__ void cudak_(fill)(MATRIX_ELEM *a, - int nrow, int ncol, int stride, double val) { - int j = blockIdx.x * blockDim.x + threadIdx.x; - int i = blockIdx.y * blockDim.y + threadIdx.y; - if (i >= nrow || j >= ncol) return; - a[j + i * stride] = val; -} - -__global__ void cudak_(clip)(MATRIX_ELEM *a, - int nrow, int ncol, int stride, double val_1, double val_2) { - int j = blockIdx.x * blockDim.x + threadIdx.x; - int i = blockIdx.y * blockDim.y + threadIdx.y; - if (i >= nrow || j >= ncol) return; - if (a[j + i * stride] > val_2) - a[j + i * stride] = val_2; - else if (a[j + i * stride] < val_1) - a[j + i * stride] = val_1; -} - -__global__ void cudak_(expand_frm)(const MATRIX_ELEM *a, MATRIX_ELEM *b, - int nrow, int ncol, - int enrow, int encol, - int stride, int estride, - int context) { - int j = blockIdx.x * blockDim.x + threadIdx.x; - int i = blockIdx.y * blockDim.y + threadIdx.y; - int ridx; - if (i >= enrow || j >= encol) return; - ridx = i + j / ncol - context; - if (ridx < 0) ridx = 0; - else if (ridx >= nrow) ridx = nrow - 1; - b[j + i * estride] = a[j % ncol + ridx * stride]; -} - -__global__ void cudak_(rearrange_frm)(const MATRIX_ELEM *a, MATRIX_ELEM *b, - int nrow, int ncol, - int stride, int step, int orig_dim) { - int j = blockIdx.x * blockDim.x + threadIdx.x; - int i = blockIdx.y * blockDim.y + threadIdx.y; - if (i >= nrow || j >= ncol) return; - b[j + i * stride] = a[j / step + (j % step) * orig_dim + i * stride]; -} - -__global__ void cudak_(scale_rows_by_col)(const MATRIX_ELEM *a, MATRIX_ELEM *b, - int nrow, int ncol, - int astride, int bstride) { - int j = blockIdx.x * blockDim.x + threadIdx.x; - int i = blockIdx.y * blockDim.y + threadIdx.y; - if (i >= nrow || j >= ncol) return; - b[j + i * bstride] *= a[i * astride]; -} - -__global__ void cudak_(scale_rows_by_row)(const MATRIX_ELEM *a, MATRIX_ELEM *b, - int nrow, int ncol, - int stride) { - int j = blockIdx.x * blockDim.x + threadIdx.x; - int i = blockIdx.y * blockDim.y + threadIdx.y; - if (i >= nrow || j >= ncol) return; - b[j + i * stride] *= a[j]; -} - -__global__ void cudak_(decompress)(const MATRIX_ELEM *a, MATRIX_ELEM *b, - int nrow, int ncol, - int stride_a, int stride_b) { - int j = blockIdx.x * blockDim.x + threadIdx.x; - int i = blockIdx.y * blockDim.y + threadIdx.y; - if (i >= nrow || j >= ncol) return; - b[lrintf(a[j + i * stride_a]) + i * stride_b] = 1.0; -} - -__global__ void cudak_(gen_col_idx)(MATRIX_ELEM *b, - int nrow, int ncol, int stride) { - int j = blockIdx.x * blockDim.x + threadIdx.x; - int i = blockIdx.y * blockDim.y + threadIdx.y; - if (i >= nrow || j >= ncol) return; - b[j + i * stride] = j; -} - -extern "C" { -#include "../cukernel.h" - void cudak_(cuda_log_elem)(const Matrix *a, 
Matrix *b) { - dim3 threadsPerBlock(CUDA_THREADS_N, CUDA_THREADS_N); - dim3 numBlocks(CEIL_DIV(b->ncol, threadsPerBlock.x), - CEIL_DIV(b->nrow, threadsPerBlock.y)); - cudak_(log_elem)<<<numBlocks, threadsPerBlock>>> \ - (MATRIX_ELEM_PTR(a), MATRIX_ELEM_PTR(b), - b->nrow, b->ncol, b->stride / sizeof(MATRIX_ELEM)); - cudaStreamSynchronize(0); - } - - void cudak_(cuda_mul_elem)(const Matrix *a, const Matrix *b, - Matrix *c) { - dim3 threadsPerBlock(CUDA_THREADS_N, CUDA_THREADS_N); - dim3 numBlocks(CEIL_DIV(b->ncol, threadsPerBlock.x), - CEIL_DIV(b->nrow, threadsPerBlock.y)); - cudak_(mul_elem)<<<numBlocks, threadsPerBlock>>> \ - (MATRIX_ELEM_PTR(a), MATRIX_ELEM_PTR(b), - MATRIX_ELEM_PTR(c), - b->nrow, b->ncol, b->stride / sizeof(MATRIX_ELEM)); - cudaStreamSynchronize(0); - } - - void cudak_(cuda_sigmoid)(const Matrix *a, Matrix *b) { - dim3 threadsPerBlock(CUDA_THREADS_N, CUDA_THREADS_N); - dim3 numBlocks(CEIL_DIV(b->ncol, threadsPerBlock.x), - CEIL_DIV(b->nrow, threadsPerBlock.y)); - cudak_(sigmoid)<<<numBlocks, threadsPerBlock>>> \ - (MATRIX_ELEM_PTR(a), MATRIX_ELEM_PTR(b), b->nrow, b->ncol, - b->stride / sizeof(MATRIX_ELEM)); - cudaStreamSynchronize(0); - } - - void cudak_(cuda_sigmoid_grad)(const Matrix *output, - const Matrix *err, Matrix *nerr) { - dim3 threadsPerBlock(CUDA_THREADS_N, CUDA_THREADS_N); - dim3 numBlocks(CEIL_DIV(nerr->ncol, threadsPerBlock.x), - CEIL_DIV(nerr->nrow, threadsPerBlock.y)); - cudak_(sigmoid_grad)<<<numBlocks, threadsPerBlock>>> \ - (MATRIX_ELEM_PTR(output), MATRIX_ELEM_PTR(err), - MATRIX_ELEM_PTR(nerr), - nerr->nrow, nerr->ncol, - nerr->stride / sizeof(MATRIX_ELEM)); - cudaStreamSynchronize(0); - } - - void cudak_(cuda_rowsum)(const Matrix *a, Matrix *b) { - dim3 block(CUDA_THREADS_NN, 1); - int ncol = a->ncol; - int blocks_per_row = CEIL_DIV(ncol, block.x); - dim3 grid(blocks_per_row, a->nrow); - MATRIX_ELEM *res; - size_t stride; - cudaMallocPitch(&res, &stride, blocks_per_row * sizeof(MATRIX_ELEM), a->nrow); - cudak_(block_reduce_rowsum)<<<grid, block, block.x * sizeof(MATRIX_ELEM)>>> \ - (MATRIX_ELEM_PTR(a), res, - a->stride / sizeof(MATRIX_ELEM), stride / sizeof(MATRIX_ELEM), - ncol); - ncol = blocks_per_row; - assert((unsigned long)ncol <= block.x); - grid.x = 1; - cudaStreamSynchronize(0); - cudak_(block_reduce_rowsum)<<<grid, block, block.x * sizeof(MATRIX_ELEM)>>> \ - (res, MATRIX_ELEM_PTR(b), - stride / sizeof(MATRIX_ELEM), b->stride / sizeof(MATRIX_ELEM), - ncol); - cudaStreamSynchronize(0); - cudaFree(res); - } - - void cudak_(cuda_colsame)(const Matrix *a, const Matrix *ref, Matrix *b) { - dim3 block(1, CUDA_THREADS_NN); - int nrow = a->nrow; - int blocks_per_col = CEIL_DIV(nrow, block.y); - dim3 grid(a->ncol, blocks_per_col); - MATRIX_ELEM *res; - size_t stride; - cudaMallocPitch(&res, &stride, a->ncol * sizeof(MATRIX_ELEM), blocks_per_col); - cudak_(block_reduce_colsame)<<<grid, block, block.y * sizeof(MATRIX_ELEM)>>> \ - (MATRIX_ELEM_PTR(a), MATRIX_ELEM_PTR(ref), res, - a->stride / sizeof(MATRIX_ELEM), stride / sizeof(MATRIX_ELEM), - nrow); - nrow = blocks_per_col; - assert((unsigned long)nrow <= block.y); - grid.y = 1; - cudaStreamSynchronize(0); - cudak_(block_reduce_colsum)<<<grid, block, block.y * sizeof(MATRIX_ELEM)>>> \ - (res, MATRIX_ELEM_PTR(b), - stride / sizeof(MATRIX_ELEM), b->stride / sizeof(MATRIX_ELEM), - nrow); - cudaStreamSynchronize(0); - cudaFree(res); - } - - void cudak_(cuda_colsum)(const Matrix *a, Matrix *b) { - dim3 block(1, CUDA_THREADS_NN); - int nrow = a->nrow; - int blocks_per_col = CEIL_DIV(nrow, block.y); - dim3 
grid(a->ncol, blocks_per_col); - MATRIX_ELEM *res; - size_t stride; - cudaMallocPitch(&res, &stride, a->ncol * sizeof(MATRIX_ELEM), blocks_per_col); - cudak_(block_reduce_colsum)<<<grid, block, block.y * sizeof(MATRIX_ELEM)>>> \ - (MATRIX_ELEM_PTR(a), res, - a->stride / sizeof(MATRIX_ELEM), stride / sizeof(MATRIX_ELEM), - nrow); - nrow = blocks_per_col; - assert((unsigned long)nrow <= block.y); - grid.y = 1; - cudaStreamSynchronize(0); - cudak_(block_reduce_colsum)<<<grid, block, block.y * sizeof(MATRIX_ELEM)>>> \ - (res, MATRIX_ELEM_PTR(b), - stride / sizeof(MATRIX_ELEM), b->stride / sizeof(MATRIX_ELEM), - nrow); - cudaStreamSynchronize(0); - cudaFree(res); - } - - void cudak_(cuda_softmax_final)(const Matrix *a, const Matrix *max, - const Matrix *deno, Matrix *b) { - dim3 threadsPerBlock(CUDA_THREADS_N, CUDA_THREADS_N); - dim3 numBlocks(CEIL_DIV(b->ncol, threadsPerBlock.x), - CEIL_DIV(b->nrow, threadsPerBlock.y)); - cudak_(softmax_final)<<<numBlocks, threadsPerBlock>>> \ - (MATRIX_ELEM_PTR(a), MATRIX_ELEM_PTR(b), - MATRIX_ELEM_PTR(max), MATRIX_ELEM_PTR(deno), - b->nrow, b->ncol, - b->stride / sizeof(MATRIX_ELEM), - max->stride / sizeof(MATRIX_ELEM)); - cudaStreamSynchronize(0); - } - - void cudak_(cuda_softmax_denominator)(const Matrix *a, const Matrix *max, Matrix *b) { - dim3 block(CUDA_THREADS_NN, 1); - int ncol = a->ncol; - int blocks_per_row = CEIL_DIV(ncol, block.x); - dim3 grid(blocks_per_row, a->nrow); - MATRIX_ELEM *res; - size_t stride; - assert(max->ncol == 1); - cudaMallocPitch(&res, &stride, blocks_per_row * sizeof(MATRIX_ELEM), a->nrow); - cudak_(block_reduce_softmax_rowsum) \ - <<<grid, block, block.x * sizeof(MATRIX_ELEM)>>> \ - (MATRIX_ELEM_PTR(a), res, MATRIX_ELEM_PTR(max), - a->stride / sizeof(MATRIX_ELEM), stride / sizeof(MATRIX_ELEM), - max->stride / sizeof(MATRIX_ELEM), - ncol); - ncol = blocks_per_row; - assert((unsigned long)ncol <= block.x); - grid.x = 1; - cudaStreamSynchronize(0); - cudak_(block_reduce_rowsum) \ - <<<grid, block, block.x * sizeof(MATRIX_ELEM)>>> \ - (res, MATRIX_ELEM_PTR(b), - stride / sizeof(MATRIX_ELEM), b->stride / sizeof(MATRIX_ELEM), - ncol); - cudaStreamSynchronize(0); - cudaFree(res); - } - - void cudak_(cuda_rowmax)(const Matrix *a, Matrix *b) { - dim3 block(CUDA_THREADS_NN, 1); - int ncol = a->ncol; - int blocks_per_row = CEIL_DIV(ncol, block.x); - dim3 grid(blocks_per_row, a->nrow); - MATRIX_ELEM *res; - size_t stride; - cudaMallocPitch(&res, &stride, blocks_per_row * sizeof(MATRIX_ELEM), a->nrow); - cudak_(block_reduce_rowmax)<<<grid, block, block.x * sizeof(MATRIX_ELEM)>>> \ - (MATRIX_ELEM_PTR(a), res, - a->stride / sizeof(MATRIX_ELEM), stride / sizeof(MATRIX_ELEM), - ncol); - ncol = blocks_per_row; - assert((unsigned long)ncol <= block.x); - grid.x = 1; - cudaStreamSynchronize(0); - cudak_(block_reduce_rowmax)<<<grid, block, block.x * sizeof(MATRIX_ELEM)>>> \ - (res, MATRIX_ELEM_PTR(b), - stride / sizeof(MATRIX_ELEM), b->stride / sizeof(MATRIX_ELEM), - ncol); - cudaStreamSynchronize(0); - cudaFree(res); - } - - void cudak_(cuda_rowmax_idx)(const Matrix *a, Matrix *b, Matrix *b_idx) { - dim3 block(CUDA_THREADS_NN, 1); - int ncol = a->ncol; - int blocks_per_row = CEIL_DIV(ncol, block.x); - dim3 grid(blocks_per_row, a->nrow); - MATRIX_ELEM *a_idx, *res, *res_idx; - size_t stride; - cudaMallocPitch(&a_idx, &stride, a->stride, a->nrow); - cudak_(gen_col_idx)<<<grid, block>>>(a_idx, a->nrow, ncol, stride / sizeof(MATRIX_ELEM)); - cudaMallocPitch(&res, &stride, blocks_per_row * sizeof(MATRIX_ELEM), a->nrow); - cudaMallocPitch(&res_idx, 
&stride, blocks_per_row * sizeof(MATRIX_ELEM), a->nrow); - cudaStreamSynchronize(0); - cudak_(block_reduce_rowmax_idx)<<<grid, block, - 2 * block.x * sizeof(MATRIX_ELEM)>>> \ - (MATRIX_ELEM_PTR(a), a_idx, res, res_idx, - a->stride / sizeof(MATRIX_ELEM), stride / sizeof(MATRIX_ELEM), - ncol); - ncol = blocks_per_row; - assert((unsigned long)ncol <= block.x); - grid.x = 1; - cudaStreamSynchronize(0); - cudak_(block_reduce_rowmax_idx)<<<grid, block, - 2 * block.x * sizeof(MATRIX_ELEM)>>> \ - (res, res_idx, MATRIX_ELEM_PTR(b), MATRIX_ELEM_PTR(b_idx), - stride / sizeof(MATRIX_ELEM), b->stride / sizeof(MATRIX_ELEM), - ncol); - cudaStreamSynchronize(0); - cudaFree(a_idx); - cudaFree(res); - cudaFree(res_idx); - } - - /* in-place calc */ - void cudak_(cuda_add_row)(const Matrix *a, Matrix *b, double beta) { - dim3 threadsPerBlock(CUDA_THREADS_N, CUDA_THREADS_N); - dim3 numBlocks(CEIL_DIV(b->ncol, threadsPerBlock.x), - CEIL_DIV(b->nrow, threadsPerBlock.y)); - cudak_(add_row)<<<numBlocks, threadsPerBlock>>> \ - (MATRIX_ELEM_PTR(a), MATRIX_ELEM_PTR(b), b->nrow, b->ncol, - b->stride / sizeof(MATRIX_ELEM), beta); - cudaStreamSynchronize(0); - } - - void cudak_(cuda_fill)(Matrix *a, double val) { - dim3 threadsPerBlock(CUDA_THREADS_N, CUDA_THREADS_N); - dim3 numBlocks(CEIL_DIV(a->ncol, threadsPerBlock.x), - CEIL_DIV(a->nrow, threadsPerBlock.y)); - cudak_(fill)<<<numBlocks, threadsPerBlock>>> \ - (MATRIX_ELEM_PTR(a), a->nrow, a->ncol, - a->stride / sizeof(MATRIX_ELEM), val); - cudaStreamSynchronize(0); - } - - void cudak_(cuda_clip)(Matrix *a, double val_1, double val_2) { - dim3 threadsPerBlock(CUDA_THREADS_N, CUDA_THREADS_N); - dim3 numBlocks(CEIL_DIV(a->ncol, threadsPerBlock.x), - CEIL_DIV(a->nrow, threadsPerBlock.y)); - cudak_(clip)<<<numBlocks, threadsPerBlock>>> \ - (MATRIX_ELEM_PTR(a), a->nrow, a->ncol, - a->stride / sizeof(MATRIX_ELEM), val_1, val_2); - cudaStreamSynchronize(0); - } - - void cudak_(cuda_expand_frm)(const Matrix *a, Matrix *b, int context) { - dim3 threadsPerBlock(CUDA_THREADS_N, CUDA_THREADS_N); - dim3 numBlocks(CEIL_DIV(b->ncol, threadsPerBlock.x), - CEIL_DIV(b->nrow, threadsPerBlock.y)); - cudak_(expand_frm)<<<numBlocks, threadsPerBlock>>> \ - (MATRIX_ELEM_PTR(a), MATRIX_ELEM_PTR(b), - a->nrow, a->ncol, - b->nrow, b->ncol, - a->stride / sizeof(MATRIX_ELEM), - b->stride / sizeof(MATRIX_ELEM), - context); - cudaStreamSynchronize(0); - } - - void cudak_(cuda_rearrange_frm)(const Matrix *a, Matrix *b, int step) { - dim3 threadsPerBlock(CUDA_THREADS_N, CUDA_THREADS_N); - dim3 numBlocks(CEIL_DIV(b->ncol, threadsPerBlock.x), - CEIL_DIV(b->nrow, threadsPerBlock.y)); - cudak_(rearrange_frm)<<<numBlocks, threadsPerBlock>>> \ - (MATRIX_ELEM_PTR(a), MATRIX_ELEM_PTR(b), - b->nrow, b->ncol, b->stride / sizeof(MATRIX_ELEM), - step, b->ncol / step); - cudaStreamSynchronize(0); - } - - void cudak_(cuda_scale_rows_by_col)(const Matrix *a, Matrix *b) { - dim3 threadsPerBlock(CUDA_THREADS_N, CUDA_THREADS_N); - dim3 numBlocks(CEIL_DIV(b->ncol, threadsPerBlock.x), - CEIL_DIV(b->nrow, threadsPerBlock.y)); - cudak_(scale_rows_by_col)<<<numBlocks, threadsPerBlock>>> \ - (MATRIX_ELEM_PTR(a), MATRIX_ELEM_PTR(b), - b->nrow, b->ncol, - a->stride / sizeof(MATRIX_ELEM), - b->stride / sizeof(MATRIX_ELEM)); - cudaStreamSynchronize(0); - } - - void cudak_(cuda_scale_rows_by_row)(const Matrix *a, Matrix *b) { - dim3 threadsPerBlock(CUDA_THREADS_N, CUDA_THREADS_N); - dim3 numBlocks(CEIL_DIV(b->ncol, threadsPerBlock.x), - CEIL_DIV(b->nrow, threadsPerBlock.y)); - cudak_(scale_rows_by_row)<<<numBlocks, 
threadsPerBlock>>> \ - (MATRIX_ELEM_PTR(a), MATRIX_ELEM_PTR(b), - b->nrow, b->ncol, b->stride / sizeof(MATRIX_ELEM)); - cudaStreamSynchronize(0); - } - - void cudak_(cuda_decompress)(const Matrix *a, Matrix *b) { - dim3 threadsPerBlock(1, CUDA_THREADS_NN); - dim3 numBlocks(1, CEIL_DIV(a->nrow, threadsPerBlock.y)); - cudak_(decompress)<<<numBlocks, threadsPerBlock>>> \ - (MATRIX_ELEM_PTR(a), MATRIX_ELEM_PTR(b), - a->nrow, a->ncol, - a->stride / sizeof(MATRIX_ELEM), - b->stride / sizeof(MATRIX_ELEM)); - cudaStreamSynchronize(0); - } -} -#endif diff --git a/nerv/matrix/generic/cumatrix.c b/nerv/matrix/generic/cumatrix.c index 4bdf5f0..08cb4c2 100644 --- a/nerv/matrix/generic/cumatrix.c +++ b/nerv/matrix/generic/cumatrix.c @@ -43,15 +43,6 @@ static int nerv_matrix_(lua_mul)(lua_State *L) { return 0; } -static int nerv_matrix_(lua_create)(lua_State *L) { - Status status; - Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname)); - Matrix *b = nerv_matrix_(create)(a->nrow, a->ncol, &status); - NERV_LUA_CHECK_STATUS(L, status); - luaT_pushudata(L, b, nerv_matrix_(tname)); - return 1; -} - static int nerv_matrix_(lua_sigmoid)(lua_State *L) { Status status; Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname)); @@ -237,12 +228,12 @@ static int nerv_matrix_(lua_decompress)(lua_State *L) { return 1; } -extern const char *nerv_matrix_host_int_tname; +extern const char *nerv_matrix_host_float_tname; static int nerv_matrix_(lua_copy_rows_fromh_by_idx)(lua_State *L) { Status status; Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname)); const Matrix *b = luaT_checkudata(L, 2, MATRIX_CUMATRIX_HOST_TNAME); - const Matrix *idx = luaT_checkudata(L, 3, nerv_matrix_host_int_tname); + const Matrix *idx = luaT_checkudata(L, 3, nerv_matrix_host_float_tname); long nrow = a->nrow; int b_begin = lua_gettop(L) > 3 ? luaL_checkinteger(L, 4) : 0; nerv_matrix_(copy_rows_fromh_by_idx)(a, b, idx, b_begin, &status); @@ -250,6 +241,18 @@ static int nerv_matrix_(lua_copy_rows_fromh_by_idx)(lua_State *L) { return 0; } +static int nerv_matrix_(lua_copy_rows_fromd_by_idx)(lua_State *L) { + Status status; + Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname)); + const Matrix *b = luaT_checkudata(L, 2, nerv_matrix_(tname)); + const Matrix *idx = luaT_checkudata(L, 3, nerv_matrix_(tname)); + long nrow = a->nrow; + int b_begin = lua_gettop(L) > 3 ? 
luaL_checkinteger(L, 4) : 0; + nerv_matrix_(copy_rows_fromd_by_idx)(a, b, idx, b_begin, &status); + NERV_LUA_CHECK_STATUS(L, status); + return 0; +} + static int nerv_matrix_(lua_expand_frm)(lua_State *L) { Status status; Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname)); @@ -289,7 +292,6 @@ static int nerv_matrix_(lua_scale_rows_by_row)(lua_State *L) { } static const luaL_Reg nerv_matrix_(extra_methods)[] = { - {"create", nerv_matrix_(lua_create)}, {"colsum", nerv_matrix_(lua_colsum)}, {"colsame", nerv_matrix_(lua_colsame)}, {"rowsum", nerv_matrix_(lua_rowsum)}, @@ -300,6 +302,8 @@ static const luaL_Reg nerv_matrix_(extra_methods)[] = { /* in-place calc */ {"copy_fromh", nerv_matrix_(lua_copy_fromh)}, {"copy_fromd", nerv_matrix_(lua_copy_fromd)}, + /* alias for copy_fromd */ + {"copy_from", nerv_matrix_(lua_copy_fromd)}, {"copy_toh", nerv_matrix_(lua_copy_toh)}, {"add", nerv_matrix_(lua_add)}, {"mul", nerv_matrix_(lua_mul)}, @@ -312,6 +316,7 @@ static const luaL_Reg nerv_matrix_(extra_methods)[] = { {"mul_elem", nerv_matrix_(lua_mul_elem)}, {"log_elem", nerv_matrix_(lua_log_elem)}, {"copy_rows_fromh_by_idx", nerv_matrix_(lua_copy_rows_fromh_by_idx)}, + {"copy_rows_fromd_by_idx", nerv_matrix_(lua_copy_rows_fromd_by_idx)}, {"expand_frm", nerv_matrix_(lua_expand_frm)}, {"rearrange_frm", nerv_matrix_(lua_rearrange_frm)}, {"scale_rows_by_row", nerv_matrix_(lua_scale_rows_by_row)}, @@ -321,6 +326,9 @@ static const luaL_Reg nerv_matrix_(extra_methods)[] = { static void cuda_matrix_(init)(lua_State *L) { luaN_append_methods(L, nerv_matrix_(extra_methods)); +#ifdef CUMATRIX_INIT + CUMATRIX_INIT(L); +#endif } int nerv_matrix_(lua_get_elem)(lua_State *L) { diff --git a/nerv/matrix/init.lua b/nerv/matrix/init.lua index f230e9f..1091d7e 100644 --- a/nerv/matrix/init.lua +++ b/nerv/matrix/init.lua @@ -45,6 +45,10 @@ function nerv.Matrix:generate(gen) end end +function nerv.Matrix:create(nrow, ncol) + return self.__constructor(nrow or self:nrow(), ncol or self:ncol()) +end + nerv.MMatrixInt.fmt = "%d " function nerv.CuMatrix:__add__(b) diff --git a/nerv/matrix/mmatrix.c b/nerv/matrix/mmatrix.c index 5561572..961059c 100644 --- a/nerv/matrix/mmatrix.c +++ b/nerv/matrix/mmatrix.c @@ -16,7 +16,30 @@ void nerv_lua_mmatrix_init(lua_State *L) { #define host_matrix_(NAME) host_matrix_float_##NAME #define nerv_matrix_(NAME) nerv_matrix_host_float_##NAME const char *nerv_matrix_(tname) = "nerv.MMatrixFloat"; +#define MMATRIX_INIT(L) host_matrix_(init_extra)(L) + +static const luaL_Reg nerv_matrix_(extra_methods_int)[]; +static void host_matrix_(init_extra)(lua_State *L) { + luaN_append_methods(L, nerv_matrix_(extra_methods_int)); +} + #include "generic/mmatrix.c" +#include "../lib/matrix/mmatrix.h" + +static int nerv_matrix_(lua_perm_gen)(lua_State *L) { + Status status; + int i, ncol = luaL_checkinteger(L, 1); + Matrix *self = nerv_matrix_(perm_gen)(ncol, &status); + NERV_LUA_CHECK_STATUS(L, status); + luaT_pushudata(L, self, nerv_matrix_(tname)); + return 1; +} + +static const luaL_Reg nerv_matrix_(extra_methods_int)[] = { + {"perm_gen", nerv_matrix_(lua_perm_gen)}, + {NULL, NULL} +}; + #undef nerv_matrix_ #undef host_matrix_ #undef MATRIX_USE_FLOAT @@ -24,6 +47,7 @@ const char *nerv_matrix_(tname) = "nerv.MMatrixFloat"; #undef MATRIX_ELEM_PTR #undef MATRIX_ELEM_FMT #undef MATRIX_ELEM_WRITE_FMT +#undef MMATRIX_INIT #define NERV_GENERIC_MMATRIX #define MATRIX_USE_DOUBLE @@ -44,26 +68,4 @@ const char *nerv_matrix_(tname) = "nerv.MMatrixDouble"; #define host_matrix_(NAME) host_matrix_int_##NAME #define 
nerv_matrix_(NAME) nerv_matrix_host_int_##NAME const char *nerv_matrix_(tname) = "nerv.MMatrixInt"; -#define MMATRIX_INIT(L) host_matrix_(init_extra)(L) - -static const luaL_Reg nerv_matrix_(extra_methods_int)[]; -static void host_matrix_(init_extra)(lua_State *L) { - luaN_append_methods(L, nerv_matrix_(extra_methods_int)); -} - #include "generic/mmatrix.c" -#include "../lib/matrix/mmatrix.h" - -static int nerv_matrix_(lua_perm_gen)(lua_State *L) { - Status status; - int i, ncol = luaL_checkinteger(L, 1); - Matrix *self = nerv_matrix_(perm_gen)(ncol, &status); - NERV_LUA_CHECK_STATUS(L, status); - luaT_pushudata(L, self, nerv_matrix_(tname)); - return 1; -} - -static const luaL_Reg nerv_matrix_(extra_methods_int)[] = { - {"perm_gen", nerv_matrix_(lua_perm_gen)}, - {NULL, NULL} -}; diff --git a/nerv/nn/layer_dag.lua b/nerv/nn/layer_dag.lua index 8e30216..73bb77d 100644 --- a/nerv/nn/layer_dag.lua +++ b/nerv/nn/layer_dag.lua @@ -79,7 +79,7 @@ function DAGLayer:__init(id, global_conf, layer_conf) end table.insert(parsed_conn, - {{ref_from, port_from}, {ref_to, port_to}}) + {{ref_from, port_from}, {ref_to, port_to}}) table.insert(ref_from.next_layers, ref_to) -- add edge ref_to.in_deg = ref_to.in_deg + 1 -- increase the in-degree of the target layer end @@ -112,7 +112,7 @@ function DAGLayer:__init(id, global_conf, layer_conf) end end for i = 1, #queue do - nerv.info("enqueued layer: %s", queue[i].layer.id) + nerv.info("enqueued layer: %s %s", queue[i].layer, queue[i].layer.id) end for id, ref in pairs(layers) do @@ -125,6 +125,7 @@ function DAGLayer:__init(id, global_conf, layer_conf) self.layers = layers self.inputs = inputs self.outputs = outputs + self.id = id self.dim_in = dim_in self.dim_out = dim_out self.parsed_conn = parsed_conn @@ -139,8 +140,11 @@ function DAGLayer:init(batch_size) ref_from, port_from = unpack(conn[1]) ref_to, port_to = unpack(conn[2]) _, output_dim = ref_from.layer:get_dim() - local mid = self.gconf.cumat_type(batch_size, - output_dim[port_from]) + local dim = 1 + if output_dim[port_from] > 0 then + dim = output_dim[port_from] + end + local mid = self.gconf.cumat_type(batch_size, dim) local err_mid = mid:create() ref_from.outputs[port_from] = mid @@ -175,8 +179,38 @@ function DAGLayer:init(batch_size) end end +function DAGLayer:batch_resize(batch_size) + self.gconf.batch_size = batch_size + + for i, conn in ipairs(self.parsed_conn) do + local _, output_dim + local ref_from, port_from, ref_to, port_to + ref_from, port_from = unpack(conn[1]) + ref_to, port_to = unpack(conn[2]) + _, output_dim = ref_from.layer:get_dim() + + if ref_from.outputs[port_from]:nrow() ~= batch_size and output_dim[port_from] > 0 then + local mid = self.gconf.cumat_type(batch_size, output_dim[port_from]) + local err_mid = mid:create() + + ref_from.outputs[port_from] = mid + ref_to.inputs[port_to] = mid + + ref_from.err_inputs[port_from] = err_mid + ref_to.err_outputs[port_to] = err_mid + end + end + for id, ref in pairs(self.layers) do + ref.layer:batch_resize(batch_size) + end + collectgarbage("collect") +end + function DAGLayer:set_inputs(input) for i = 1, #self.dim_in do + if input[i] == nil then + nerv.error("some input is not provided"); + end local layer = self.inputs[i][1] local port = self.inputs[i][2] layer.inputs[port] = input[i] @@ -185,6 +219,9 @@ end function DAGLayer:set_outputs(output) for i = 1, #self.dim_out do + if output[i] == nil then + nerv.error("some output is not provided"); + end local layer = self.outputs[i][1] local port = self.outputs[i][2] layer.outputs[port] = 
output[i] @@ -221,11 +258,13 @@ end function DAGLayer:propagate(input, output) self:set_inputs(input) self:set_outputs(output) + local ret = false for i = 1, #self.queue do local ref = self.queue[i] -- print(ref.layer.id) - ref.layer:propagate(ref.inputs, ref.outputs) + ret = ref.layer:propagate(ref.inputs, ref.outputs) end + return ret end function DAGLayer:back_propagate(bp_err, next_bp_err, input, output) @@ -247,3 +286,30 @@ function DAGLayer:get_params() end return nerv.ParamRepo.merge(param_repos) end + +DAGLayer.PORT_TYPES = { + INPUT = {}, + OUTPUT = {}, + ERR_INPUT = {}, + ERR_OUTPUT = {} +} + +function DAGLayer:get_intermediate(id, port_type) + if id == "<input>" or id == "<output>" then + nerv.error("an actual real layer id is expected") + end + local layer = self.layers[id] + if layer == nil then + nerv.error("layer id %s not found", id) + end + if port_type == DAGLayer.PORT_TYPES.INPUT then + return layer.inputs + elseif port_type == DAGLayer.PORT_TYPES.OUTPUT then + return layer.outputs + elseif port_type == DAGLayer.PORT_TYPES.ERR_INPUT then + return layer.err_inputs + elseif port_type == DAGLayer.PORT_TYPES.ERR_OUTPUT then + return layer.err_outputs + end + nerv.error("unrecognized port type") +end diff --git a/nerv/nn/layer_repo.lua b/nerv/nn/layer_repo.lua index 602c37c..ef333a7 100644 --- a/nerv/nn/layer_repo.lua +++ b/nerv/nn/layer_repo.lua @@ -1,7 +1,12 @@ local LayerRepo = nerv.class("nerv.LayerRepo") function LayerRepo:__init(layer_spec, param_repo, global_conf) - local layers = {} + self.layers = {} + self:add_layers(layer_spec, param_repo, global_conf) +end + +function LayerRepo:add_layers(layer_spec, param_repo, global_conf) + local layers = self.layers for ltype, llist in pairs(layer_spec) do local layer_type = nerv.get_type(ltype) for id, spec in pairs(llist) do @@ -22,7 +27,6 @@ function LayerRepo:__init(layer_spec, param_repo, global_conf) layers[id] = layer_type(id, global_conf, layer_config) end end - self.layers = layers end function LayerRepo:get_layer(lid) diff --git a/speech b/speech deleted file mode 160000 -Subproject 08e33afa533af1f026ac271446a0c873fe0bb5c |
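The hunks above rework the Lua-facing matrix API: `create` becomes a generic `nerv.Matrix:create(nrow, ncol)` with optional dimensions, CuMatrix gains a `copy_from` alias for `copy_fromd` and a device-side `copy_rows_fromd_by_idx`, and `perm_gen` moves from `MMatrixInt` to `MMatrixFloat`. What follows is only a minimal usage sketch of those additions, not part of the patch; it assumes a CUDA-enabled build and the usual `nerv.CuMatrixFloat` constructor.

-- illustrative only: exercises the new matrix helpers introduced by this patch
local cumat_type = nerv.CuMatrixFloat
local a = cumat_type(8, 4)                    -- 8 x 4 source matrix on the device
local b = a:create()                          -- same shape, via the generic Matrix:create
local row = a:create(1, 4)                    -- explicit dimensions are also accepted
b:copy_from(a)                                -- new alias for copy_fromd (device-to-device copy)

-- perm_gen now lives on MMatrixFloat, so its result can be used as a float index matrix
local perm = nerv.MMatrixFloat.perm_gen(8)    -- 1 x 8 host row permutation
local idx = cumat_type(1, 8)
idx:copy_fromh(perm)                          -- the index matrix must live on the device
b:copy_rows_fromd_by_idx(a, idx)              -- gather rows of a into b without a host round-trip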
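The `layer_dag.lua` and `layer_repo.lua` changes add `DAGLayer:batch_resize`, `DAGLayer:get_intermediate` with the `PORT_TYPES` markers, a return value for `DAGLayer:propagate`, and an incremental `LayerRepo:add_layers`. Another hedged sketch follows: the layer ids, specs and `gconf` fields are made up for illustration, and `dag` is assumed to be a `nerv.DAGLayer` built elsewhere from the repo.

-- illustrative only: layer_spec, extra_spec, param_repo, gconf, input, output, dag are assumed to exist
local repo = nerv.LayerRepo(layer_spec, param_repo, gconf)
repo:add_layers(extra_spec, param_repo, gconf)   -- layers can now be registered after construction

dag:init(gconf.batch_size)
local ret = dag:propagate(input, output)         -- propagate now returns the last queued layer's result

dag:batch_resize(128)                            -- reallocates intermediate buffers for the new batch size
-- peek at a hidden layer's output buffers (one matrix per port); "affine0" is a hypothetical layer id
local outs = dag:get_intermediate("affine0", nerv.DAGLayer.PORT_TYPES.OUTPUT)
local first = outs[1]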