From 5b16335a903551ffef4fafa88d67146b9131a74e Mon Sep 17 00:00:00 2001 From: Determinant Date: Tue, 4 Aug 2015 11:11:50 +0800 Subject: ... --- nerv/examples/asr_trainer.lua | 4 ++-- nerv/init.lua | 2 +- nerv/nerv | 10 +++++----- nerv/nn/layer_dag.lua | 6 ++++++ 4 files changed, 14 insertions(+), 8 deletions(-) diff --git a/nerv/examples/asr_trainer.lua b/nerv/examples/asr_trainer.lua index 4fa4096..8dfb2ac 100644 --- a/nerv/examples/asr_trainer.lua +++ b/nerv/examples/asr_trainer.lua @@ -12,7 +12,7 @@ function build_trainer(ifname) -- initialize the network network:init(gconf.batch_size) gconf.cnt = 0 - err_input = {nerv.CuMatrixFloat(256, 1)} + err_input = {nerv.CuMatrixFloat(gconf.batch_size, 1)} err_input[1]:fill(1) for data in buffer.get_data, buffer do -- prine stat periodically @@ -32,7 +32,7 @@ function build_trainer(ifname) end table.insert(input, data[id]) end - local output = {nerv.CuMatrixFloat(256, 1)} + local output = {nerv.CuMatrixFloat(gconf.batch_size, 1)} err_output = {input[1]:create()} network:propagate(input, output) if bp then diff --git a/nerv/init.lua b/nerv/init.lua index 89010a7..183ae6d 100644 --- a/nerv/init.lua +++ b/nerv/init.lua @@ -1,7 +1,7 @@ require 'libnerv' function nerv.error(fmt, ...) - error(nerv.printf("[nerv] internal error: " .. fmt .. "\n", ...)) + error("[nerv] internal error: " .. fmt .. "\n", ...) end function nerv.error_method_not_implemented() diff --git a/nerv/nerv b/nerv/nerv index 7571659..e5943aa 100644 --- a/nerv/nerv +++ b/nerv/nerv @@ -1,13 +1,13 @@ #! /usr/bin/env luajit require 'nerv' print("Greetings") -if #arg < 1 then +if #args < 1 then return end -local script = arg[1] +local script = args[1] local script_arg = {} -for i = 2, #arg do - table.insert(script_arg, arg[i]) +for i = 2, #args do + table.insert(script_arg, args[i]) end -arg = script_arg +args = script_arg dofile(script) diff --git a/nerv/nn/layer_dag.lua b/nerv/nn/layer_dag.lua index 8e30216..e5c1ac7 100644 --- a/nerv/nn/layer_dag.lua +++ b/nerv/nn/layer_dag.lua @@ -177,6 +177,9 @@ end function DAGLayer:set_inputs(input) for i = 1, #self.dim_in do + if input[i] == nil then + nerv.error("some input is not provided"); + end local layer = self.inputs[i][1] local port = self.inputs[i][2] layer.inputs[port] = input[i] @@ -185,6 +188,9 @@ end function DAGLayer:set_outputs(output) for i = 1, #self.dim_out do + if output[i] == nil then + nerv.error("some output is not provided"); + end local layer = self.outputs[i][1] local port = self.outputs[i][2] layer.outputs[port] = output[i] -- cgit v1.2.3 From 462d1982f299c8be4ae9a746e6ae7d04c04faa33 Mon Sep 17 00:00:00 2001 From: Determinant Date: Tue, 4 Aug 2015 14:41:22 +0800 Subject: add embedding_example --- embedding_example/.gitignore | 2 + embedding_example/Makefile | 9 +++ embedding_example/main.c | 95 ++++++++++++++++++++++++++ embedding_example/setup_nerv.lua | 26 +++++++ embedding_example/swb_baseline_decode.lua | 109 ++++++++++++++++++++++++++++++ nerv/Makefile | 2 +- nerv/layer/init.lua | 1 + nerv/lib/matrix/generic/matrix.c | 4 +- nerv/lib/matrix/generic/matrix.h | 2 + 9 files changed, 247 insertions(+), 3 deletions(-) create mode 100644 embedding_example/.gitignore create mode 100644 embedding_example/Makefile create mode 100644 embedding_example/main.c create mode 100644 embedding_example/setup_nerv.lua create mode 100644 embedding_example/swb_baseline_decode.lua diff --git a/embedding_example/.gitignore b/embedding_example/.gitignore new file mode 100644 index 0000000..8e68213 --- /dev/null +++ b/embedding_example/.gitignore @@ -0,0 +1,2 @@ +main +main.o diff --git a/embedding_example/Makefile b/embedding_example/Makefile new file mode 100644 index 0000000..e4ee314 --- /dev/null +++ b/embedding_example/Makefile @@ -0,0 +1,9 @@ +CFLAG += -I ../install/include/luajit-2.0/ -I ../install/include/nerv/ +LDFLAG += -L../install/lib/ -lluajit-5.1 -Wl,-rpath=../install/lib/ -lluaT -lnervcore +GCC := gcc + +main: main.o + $(GCC) -o $@ $< $(LDFLAG) + +main.o: main.c + $(GCC) $(CFLAG) -o $@ $< -c diff --git a/embedding_example/main.c b/embedding_example/main.c new file mode 100644 index 0000000..4e70892 --- /dev/null +++ b/embedding_example/main.c @@ -0,0 +1,95 @@ +#include "lua.h" +#include "lauxlib.h" +#include "lualib.h" +#include "matrix/matrix.h" +#include "common.h" +#include "luaT/luaT.h" +#include + +const char *nerv_matrix_host_float_tname = "nerv.MMatrixFloat"; +extern Matrix *nerv_matrix_host_float_create(long nrow, long ncol, Status *status); +extern void nerv_matrix_host_float_data_retain(Matrix *self); +extern void nerv_matrix_host_float_data_free(Matrix *self, Status *status); + +lua_State *L; +Matrix *input, *output; +Status status; + +void setup_nerv() { + L = lua_open(); + luaL_openlibs(L); + luaL_loadfile(L, "setup_nerv.lua"); + /* network configuration */ + lua_pushstring(L, "swb_baseline_decode.lua"); + if (lua_pcall(L, 1, LUA_MULTRET, 0)) + { + printf("%s\n", luaL_checkstring(L, 1)); + exit(1); + } + /* lua stack now: input width, output width, propagator */ + input = nerv_matrix_host_float_create(1, luaL_checkinteger(L, 1), &status); + NERV_LUA_CHECK_STATUS(L, status); + output = nerv_matrix_host_float_create(1, luaL_checkinteger(L, 2), &status); + NERV_LUA_CHECK_STATUS(L, status); +} + + +void propagate(float for_fun) { + int i, j; + printf("ok: %d\n", lua_gettop(L)); + lua_pushvalue(L, 3); + /* lua stack now: input width, output width, propagator, propagator */ + for (i = 0; i < input->nrow; i++) /* nrow is actually 1 */ + { + float *nerv_row = (float *)((char *)input->data.f + i * input->stride); + for (j = 0; j < input->ncol; j++) + { + nerv_row[j] = j * for_fun; + } + } + /* avoid gc */ + nerv_matrix_host_float_data_retain(input); + nerv_matrix_host_float_data_retain(input); + nerv_matrix_host_float_data_retain(input); + nerv_matrix_host_float_data_retain(input); + nerv_matrix_host_float_data_retain(output); + nerv_matrix_host_float_data_retain(output); + nerv_matrix_host_float_data_retain(output); + nerv_matrix_host_float_data_retain(output); + + luaT_pushudata(L, input, nerv_matrix_host_float_tname); + luaT_pushudata(L, output, nerv_matrix_host_float_tname); + /* lua stack now: input width, output width, propagator, propagator, input, output */ + if (lua_pcall(L, 2, 0, 0)) /* call propagator with two parameters */ + { + printf("%s\n", luaL_checkstring(L, -1)); + exit(-1); + } + /* lua stack now: input width, output width, propagator */ + printf("## caller ##\n"); + for (i = 0; i < output->nrow; i++) /* nrow is actually 1 */ + { + float *nerv_row = (float *)((char *)output->data.f + i * output->stride); + for (j = 0; j < output->ncol; j++) + { + printf("%.8f ", nerv_row[j]); + } + printf("\n"); + } +} + +void teardown_nerv() { + nerv_matrix_host_float_data_free(input, &status); + NERV_LUA_CHECK_STATUS(L, status); + nerv_matrix_host_float_data_free(output, &status); + NERV_LUA_CHECK_STATUS(L, status); +} + +int main() { + setup_nerv(); + propagate(1.0); + propagate(2.0); + propagate(3.0); + teardown_nerv(); + return 0; +} diff --git a/embedding_example/setup_nerv.lua b/embedding_example/setup_nerv.lua new file mode 100644 index 0000000..e33a1e7 --- /dev/null +++ b/embedding_example/setup_nerv.lua @@ -0,0 +1,26 @@ +package.path="/home/slhome/mfy43/.luarocks/share/lua/5.1/?.lua;/home/slhome/mfy43/.luarocks/share/lua/5.1/?/init.lua;/home/slhome/mfy43/nerv/install/share/lua/5.1/?.lua;/home/slhome/mfy43/nerv/install/share/lua/5.1/?/init.lua;"..package.path +package.cpath="/home/slhome/mfy43/.luarocks/lib/lua/5.1/?.so;/home/slhome/mfy43/nerv/install/lib/lua/5.1/?.so;"..package.cpath +local k,l,_=pcall(require,"luarocks.loader") _=k and l.add_context("nerv","scm-1") + +local args = {...} +require 'nerv' +dofile(args[1]) +local param_repo = nerv.ParamRepo() +param_repo:import(gconf.initialized_param, nil, gconf) +local sublayer_repo = make_sublayer_repo(param_repo) +local layer_repo = make_layer_repo(sublayer_repo, param_repo) +local network = get_network(layer_repo) +local batch_size = 1 +network:init(batch_size) +function propagator(input, output) + local gpu_input = nerv.CuMatrixFloat(input:nrow(), input:ncol()) + local gpu_output = nerv.CuMatrixFloat(output:nrow(), output:ncol()) + gpu_input:copy_fromh(input) + print(gpu_input) + network:propagate({gpu_input}, {gpu_output}) + gpu_output:copy_toh(output) + print(output) + -- collect garbage in-time to save GPU memory + collectgarbage("collect") +end +return network.dim_in[1], network.dim_out[1], propagator diff --git a/embedding_example/swb_baseline_decode.lua b/embedding_example/swb_baseline_decode.lua new file mode 100644 index 0000000..14a463b --- /dev/null +++ b/embedding_example/swb_baseline_decode.lua @@ -0,0 +1,109 @@ +require 'htk_io' +gconf = {lrate = 0.8, wcost = 1e-6, momentum = 0.9, + cumat_type = nerv.CuMatrixFloat, + mmat_type = nerv.MMatrixFloat, + frm_ext = 5, + tr_scp = "/slfs1/users/mfy43/swb_ivec/train_bp.scp", + cv_scp = "/slfs1/users/mfy43/swb_ivec/train_cv.scp", + htk_conf = "/slfs1/users/mfy43/swb_ivec/plp_0_d_a.conf", + initialized_param = {"/slfs1/users/mfy43/swb_init.nerv", + "/slfs1/users/mfy43/swb_global_transf.nerv"}, + debug = false} + +function make_sublayer_repo(param_repo) + return nerv.LayerRepo( + { + -- global transf + ["nerv.BiasLayer"] = + { + blayer1 = {{bias = "bias1"}, {dim_in = {429}, dim_out = {429}}}, + blayer2 = {{bias = "bias2"}, {dim_in = {429}, dim_out = {429}}} + }, + ["nerv.WindowLayer"] = + { + wlayer1 = {{window = "window1"}, {dim_in = {429}, dim_out = {429}}}, + wlayer2 = {{window = "window2"}, {dim_in = {429}, dim_out = {429}}} + }, + -- biased linearity + ["nerv.AffineLayer"] = + { + affine0 = {{ltp = "affine0_ltp", bp = "affine0_bp"}, + {dim_in = {429}, dim_out = {2048}}}, + affine1 = {{ltp = "affine1_ltp", bp = "affine1_bp"}, + {dim_in = {2048}, dim_out = {2048}}}, + affine2 = {{ltp = "affine2_ltp", bp = "affine2_bp"}, + {dim_in = {2048}, dim_out = {2048}}}, + affine3 = {{ltp = "affine3_ltp", bp = "affine3_bp"}, + {dim_in = {2048}, dim_out = {2048}}}, + affine4 = {{ltp = "affine4_ltp", bp = "affine4_bp"}, + {dim_in = {2048}, dim_out = {2048}}}, + affine5 = {{ltp = "affine5_ltp", bp = "affine5_bp"}, + {dim_in = {2048}, dim_out = {2048}}}, + affine6 = {{ltp = "affine6_ltp", bp = "affine6_bp"}, + {dim_in = {2048}, dim_out = {2048}}}, + affine7 = {{ltp = "affine7_ltp", bp = "affine7_bp"}, + {dim_in = {2048}, dim_out = {3001}}} + }, + ["nerv.SigmoidLayer"] = + { + sigmoid0 = {{}, {dim_in = {2048}, dim_out = {2048}}}, + sigmoid1 = {{}, {dim_in = {2048}, dim_out = {2048}}}, + sigmoid2 = {{}, {dim_in = {2048}, dim_out = {2048}}}, + sigmoid3 = {{}, {dim_in = {2048}, dim_out = {2048}}}, + sigmoid4 = {{}, {dim_in = {2048}, dim_out = {2048}}}, + sigmoid5 = {{}, {dim_in = {2048}, dim_out = {2048}}}, + sigmoid6 = {{}, {dim_in = {2048}, dim_out = {2048}}} + }, + ["nerv.SoftmaxLayer"] = + { + soutput = {{}, {dim_in = {3001}, dim_out = {3001}}} + } + }, param_repo, gconf) +end + +function make_layer_repo(sublayer_repo, param_repo) + return nerv.LayerRepo( + { + ["nerv.DAGLayer"] = + { + global_transf = {{}, { + dim_in = {429}, dim_out = {429}, + sub_layers = sublayer_repo, + connections = { + ["[1]"] = "blayer1[1]", + ["blayer1[1]"] = "wlayer1[1]", + ["wlayer1[1]"] = "blayer2[1]", + ["blayer2[1]"] = "wlayer2[1]", + ["wlayer2[1]"] = "[1]" + } + }}, + main = {{}, { + dim_in = {429}, dim_out = {3001}, + sub_layers = sublayer_repo, + connections = { + ["[1]"] = "affine0[1]", + ["affine0[1]"] = "sigmoid0[1]", + ["sigmoid0[1]"] = "affine1[1]", + ["affine1[1]"] = "sigmoid1[1]", + ["sigmoid1[1]"] = "affine2[1]", + ["affine2[1]"] = "sigmoid2[1]", + ["sigmoid2[1]"] = "affine3[1]", + ["affine3[1]"] = "sigmoid3[1]", + ["sigmoid3[1]"] = "affine4[1]", + ["affine4[1]"] = "sigmoid4[1]", + ["sigmoid4[1]"] = "affine5[1]", + ["affine5[1]"] = "sigmoid5[1]", + ["sigmoid5[1]"] = "affine6[1]", + ["affine6[1]"] = "sigmoid6[1]", + ["sigmoid6[1]"] = "affine7[1]", + ["affine7[1]"] = "soutput[1]", + ["soutput[1]"] = "[1]" + } + }} + } + }, param_repo, gconf) +end + +function get_network(layer_repo) + return layer_repo:get_layer("main") +end diff --git a/nerv/Makefile b/nerv/Makefile index 022e2fb..fdffd12 100644 --- a/nerv/Makefile +++ b/nerv/Makefile @@ -30,7 +30,7 @@ LUAT_OBJS := $(addprefix $(OBJ_DIR)/,$(LUAT_OBJS)) OBJS := $(CORE_OBJS) $(NERV_OBJS) $(LUAT_OBJS) LIBS := $(INST_LIBDIR)/libnerv.so $(LIB_PATH)/libnervcore.so $(LIB_PATH)/libluaT.so LUA_LIBS := matrix/init.lua io/init.lua init.lua \ - layer/init.lua layer/affine.lua layer/sigmoid.lua layer/softmax_ce.lua \ + layer/init.lua layer/affine.lua layer/sigmoid.lua layer/softmax_ce.lua layer/softmax.lua \ layer/window.lua layer/bias.lua layer/combiner.lua layer/mse.lua layer/affine_recurrent.lua\ nn/init.lua nn/layer_repo.lua nn/param_repo.lua nn/layer_dag.lua \ io/sgd_buffer.lua diff --git a/nerv/layer/init.lua b/nerv/layer/init.lua index 7172f99..6861b0e 100644 --- a/nerv/layer/init.lua +++ b/nerv/layer/init.lua @@ -78,3 +78,4 @@ nerv.include('window.lua') nerv.include('mse.lua') nerv.include('combiner.lua') nerv.include('affine_recurrent.lua') +nerv.include('softmax.lua') diff --git a/nerv/lib/matrix/generic/matrix.c b/nerv/lib/matrix/generic/matrix.c index e4afa37..6cb3dc0 100644 --- a/nerv/lib/matrix/generic/matrix.c +++ b/nerv/lib/matrix/generic/matrix.c @@ -3,7 +3,7 @@ #include "matrix.h" /* FIXME: malloc failure detection */ -static void nerv_matrix_(data_free)(Matrix *self, Status *status) { +void nerv_matrix_(data_free)(Matrix *self, Status *status) { assert(*self->data_ref > 0); if (--(*self->data_ref) == 0) { @@ -18,7 +18,7 @@ static void nerv_matrix_(data_free)(Matrix *self, Status *status) { } } -static void nerv_matrix_(data_retain)(Matrix *self) { +void nerv_matrix_(data_retain)(Matrix *self) { (*self->data_ref)++; } diff --git a/nerv/lib/matrix/generic/matrix.h b/nerv/lib/matrix/generic/matrix.h index 9d44e6d..69b4e6d 100644 --- a/nerv/lib/matrix/generic/matrix.h +++ b/nerv/lib/matrix/generic/matrix.h @@ -2,3 +2,5 @@ Matrix *nerv_matrix_(create)(long nrow, long ncol, Status *status); void nerv_matrix_(destroy)(Matrix *self, Status *status); Matrix *nerv_matrix_(getrow)(Matrix *self, int row); +void nerv_matrix_(data_free)(Matrix *self, Status *status); +void nerv_matrix_(data_retain)(Matrix *self); -- cgit v1.2.3 From 0223b6b88620d9235fc47818aaa5c86ae81d38d9 Mon Sep 17 00:00:00 2001 From: Determinant Date: Tue, 4 Aug 2015 14:42:08 +0800 Subject: ... --- embedding_example/main.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/embedding_example/main.c b/embedding_example/main.c index 4e70892..4c6459c 100644 --- a/embedding_example/main.c +++ b/embedding_example/main.c @@ -49,12 +49,6 @@ void propagate(float for_fun) { } /* avoid gc */ nerv_matrix_host_float_data_retain(input); - nerv_matrix_host_float_data_retain(input); - nerv_matrix_host_float_data_retain(input); - nerv_matrix_host_float_data_retain(input); - nerv_matrix_host_float_data_retain(output); - nerv_matrix_host_float_data_retain(output); - nerv_matrix_host_float_data_retain(output); nerv_matrix_host_float_data_retain(output); luaT_pushudata(L, input, nerv_matrix_host_float_tname); -- cgit v1.2.3 From e20b60f659b08c46b9da0591ee489803f3f3d300 Mon Sep 17 00:00:00 2001 From: Determinant Date: Tue, 4 Aug 2015 15:51:53 +0800 Subject: ... --- nerv/layer/softmax.lua | 31 +++++++++++++++++++++++++++++++ nerv/nerv | 10 +++++----- 2 files changed, 36 insertions(+), 5 deletions(-) create mode 100644 nerv/layer/softmax.lua diff --git a/nerv/layer/softmax.lua b/nerv/layer/softmax.lua new file mode 100644 index 0000000..e979ebf --- /dev/null +++ b/nerv/layer/softmax.lua @@ -0,0 +1,31 @@ +local SoftmaxLayer = nerv.class("nerv.SoftmaxLayer", "nerv.Layer") + +function SoftmaxLayer:__init(id, global_conf, layer_conf) + self.id = id + self.gconf = global_conf + self.dim_in = layer_conf.dim_in + self.dim_out = layer_conf.dim_out + self:check_dim_len(1, 1) -- two inputs: nn output and label +end + +function SoftmaxLayer:init(batch_size) + if self.dim_in[1] ~= self.dim_out[1] then + nerv.error("mismatching dimensions of input and output") + end +end + +function SoftmaxLayer:update(bp_err, input, output) + -- no params, therefore do nothing +end + +function SoftmaxLayer:propagate(input, output) + output[1]:softmax(input[1]) +end + +function SoftmaxLayer:back_propagate(bp_err, next_bp_err, input, output) + nerv.error_method_not_implemented() +end + +function SoftmaxLayer:get_params() + return nerv.ParamRepo({}) +end diff --git a/nerv/nerv b/nerv/nerv index e5943aa..7571659 100644 --- a/nerv/nerv +++ b/nerv/nerv @@ -1,13 +1,13 @@ #! /usr/bin/env luajit require 'nerv' print("Greetings") -if #args < 1 then +if #arg < 1 then return end -local script = args[1] +local script = arg[1] local script_arg = {} -for i = 2, #args do - table.insert(script_arg, args[i]) +for i = 2, #arg do + table.insert(script_arg, arg[i]) end -args = script_arg +arg = script_arg dofile(script) -- cgit v1.2.3 From c3effaac9e9965371a73f9c84c2a4e0880f32138 Mon Sep 17 00:00:00 2001 From: Determinant Date: Tue, 4 Aug 2015 17:27:09 +0800 Subject: fix gc issues --- embedding_example/Makefile | 6 ++++++ embedding_example/main.c | 28 ++++++++++++++++------------ embedding_example/run.sh | 4 ++++ embedding_example/setup_nerv.lua | 9 ++++----- 4 files changed, 30 insertions(+), 17 deletions(-) create mode 100755 embedding_example/run.sh diff --git a/embedding_example/Makefile b/embedding_example/Makefile index e4ee314..3420b30 100644 --- a/embedding_example/Makefile +++ b/embedding_example/Makefile @@ -2,6 +2,12 @@ CFLAG += -I ../install/include/luajit-2.0/ -I ../install/include/nerv/ LDFLAG += -L../install/lib/ -lluajit-5.1 -Wl,-rpath=../install/lib/ -lluaT -lnervcore GCC := gcc +.PHONY: FORCE + +FORCE: ../install/bin/luarocks + echo "#!/bin/bash" > run.sh + $< path >> run.sh + echo "./main" >> run.sh main: main.o $(GCC) -o $@ $< $(LDFLAG) diff --git a/embedding_example/main.c b/embedding_example/main.c index 4c6459c..b3c9bf2 100644 --- a/embedding_example/main.c +++ b/embedding_example/main.c @@ -7,6 +7,8 @@ #include const char *nerv_matrix_host_float_tname = "nerv.MMatrixFloat"; +const char *input_name = "_nerv_embed_input"; +const char *output_name = "_nerv_embed_output"; extern Matrix *nerv_matrix_host_float_create(long nrow, long ncol, Status *status); extern void nerv_matrix_host_float_data_retain(Matrix *self); extern void nerv_matrix_host_float_data_free(Matrix *self, Status *status); @@ -31,6 +33,11 @@ void setup_nerv() { NERV_LUA_CHECK_STATUS(L, status); output = nerv_matrix_host_float_create(1, luaL_checkinteger(L, 2), &status); NERV_LUA_CHECK_STATUS(L, status); + /* add reference to avoid gc */ + luaT_pushudata(L, output, nerv_matrix_host_float_tname); + luaT_pushudata(L, input, nerv_matrix_host_float_tname); + lua_setfield(L, LUA_GLOBALSINDEX, input_name); + lua_setfield(L, LUA_GLOBALSINDEX, output_name); } @@ -47,12 +54,8 @@ void propagate(float for_fun) { nerv_row[j] = j * for_fun; } } - /* avoid gc */ - nerv_matrix_host_float_data_retain(input); - nerv_matrix_host_float_data_retain(output); - - luaT_pushudata(L, input, nerv_matrix_host_float_tname); - luaT_pushudata(L, output, nerv_matrix_host_float_tname); + lua_getfield(L, LUA_GLOBALSINDEX, input_name); + lua_getfield(L, LUA_GLOBALSINDEX, output_name); /* lua stack now: input width, output width, propagator, propagator, input, output */ if (lua_pcall(L, 2, 0, 0)) /* call propagator with two parameters */ { @@ -60,7 +63,7 @@ void propagate(float for_fun) { exit(-1); } /* lua stack now: input width, output width, propagator */ - printf("## caller ##\n"); + printf("## output: %ld %ld ##\n", output->nrow, output->ncol); for (i = 0; i < output->nrow; i++) /* nrow is actually 1 */ { float *nerv_row = (float *)((char *)output->data.f + i * output->stride); @@ -68,21 +71,22 @@ void propagate(float for_fun) { { printf("%.8f ", nerv_row[j]); } - printf("\n"); } } void teardown_nerv() { - nerv_matrix_host_float_data_free(input, &status); - NERV_LUA_CHECK_STATUS(L, status); - nerv_matrix_host_float_data_free(output, &status); - NERV_LUA_CHECK_STATUS(L, status); + lua_pushnil(L); + lua_pushnil(L); + lua_setfield(L, LUA_GLOBALSINDEX, input_name); + lua_setfield(L, LUA_GLOBALSINDEX, output_name); + lua_gc(L, LUA_GCCOLLECT, 0); } int main() { setup_nerv(); propagate(1.0); propagate(2.0); + propagate(2.0); propagate(3.0); teardown_nerv(); return 0; diff --git a/embedding_example/run.sh b/embedding_example/run.sh new file mode 100755 index 0000000..e919263 --- /dev/null +++ b/embedding_example/run.sh @@ -0,0 +1,4 @@ +#!/bin/bash +export LUA_PATH='/home/slhome/mfy43/.luarocks/share/lua/5.1/?.lua;/home/slhome/mfy43/.luarocks/share/lua/5.1/?/init.lua;/home/slhome/mfy43/nerv/install/share/lua/5.1/?.lua;/home/slhome/mfy43/nerv/install/share/lua/5.1/?/init.lua;./?.lua;/usr/local/share/luajit-2.0.4/?.lua;/usr/local/share/lua/5.1/?.lua;/usr/local/share/lua/5.1/?/init.lua' +export LUA_CPATH='/home/slhome/mfy43/.luarocks/lib/lua/5.1/?.so;/home/slhome/mfy43/nerv/install/lib/lua/5.1/?.so;./?.so;/usr/local/lib/lua/5.1/?.so;/usr/local/lib/lua/5.1/loadall.so' +./main diff --git a/embedding_example/setup_nerv.lua b/embedding_example/setup_nerv.lua index e33a1e7..3ae878d 100644 --- a/embedding_example/setup_nerv.lua +++ b/embedding_example/setup_nerv.lua @@ -1,10 +1,7 @@ -package.path="/home/slhome/mfy43/.luarocks/share/lua/5.1/?.lua;/home/slhome/mfy43/.luarocks/share/lua/5.1/?/init.lua;/home/slhome/mfy43/nerv/install/share/lua/5.1/?.lua;/home/slhome/mfy43/nerv/install/share/lua/5.1/?/init.lua;"..package.path -package.cpath="/home/slhome/mfy43/.luarocks/lib/lua/5.1/?.so;/home/slhome/mfy43/nerv/install/lib/lua/5.1/?.so;"..package.cpath local k,l,_=pcall(require,"luarocks.loader") _=k and l.add_context("nerv","scm-1") - -local args = {...} require 'nerv' -dofile(args[1]) +local arg = {...} +dofile(arg[1]) local param_repo = nerv.ParamRepo() param_repo:import(gconf.initialized_param, nil, gconf) local sublayer_repo = make_sublayer_repo(param_repo) @@ -12,6 +9,7 @@ local layer_repo = make_layer_repo(sublayer_repo, param_repo) local network = get_network(layer_repo) local batch_size = 1 network:init(batch_size) + function propagator(input, output) local gpu_input = nerv.CuMatrixFloat(input:nrow(), input:ncol()) local gpu_output = nerv.CuMatrixFloat(output:nrow(), output:ncol()) @@ -23,4 +21,5 @@ function propagator(input, output) -- collect garbage in-time to save GPU memory collectgarbage("collect") end + return network.dim_in[1], network.dim_out[1], propagator -- cgit v1.2.3 From 6aa0bb7b1ca5c92e50d6c7eeb1e3db2c16e71c3b Mon Sep 17 00:00:00 2001 From: Determinant Date: Tue, 4 Aug 2015 17:32:30 +0800 Subject: ... --- embedding_example/Makefile | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/embedding_example/Makefile b/embedding_example/Makefile index 3420b30..73287f4 100644 --- a/embedding_example/Makefile +++ b/embedding_example/Makefile @@ -4,6 +4,11 @@ GCC := gcc .PHONY: FORCE +all: main FORCE +clean: + -rm -f *.o + -rm main + FORCE: ../install/bin/luarocks echo "#!/bin/bash" > run.sh $< path >> run.sh -- cgit v1.2.3 From e935fd6d49b81b0c83d5ad112bfd0a8a68a67175 Mon Sep 17 00:00:00 2001 From: Determinant Date: Wed, 5 Aug 2015 08:04:40 +0800 Subject: put global transformation into a separate library --- Makefile | 1 + nerv/Makefile | 2 +- nerv/matrix/generic/cukernel.cu | 592 ---------------------------------------- speech | 2 +- 4 files changed, 3 insertions(+), 594 deletions(-) delete mode 100644 nerv/matrix/generic/cukernel.cu diff --git a/Makefile b/Makefile index fa888c3..664a83b 100644 --- a/Makefile +++ b/Makefile @@ -9,6 +9,7 @@ luarocks: install: cd nerv; $(PREFIX)/bin/luarocks make speech: + cd speech/speech_utils; $(PREFIX)/bin/luarocks make cd speech/htk_io; $(PREFIX)/bin/luarocks make clean: cd nerv && make clean diff --git a/nerv/Makefile b/nerv/Makefile index fdffd12..728d010 100644 --- a/nerv/Makefile +++ b/nerv/Makefile @@ -66,7 +66,7 @@ $(LIB_PATH)/libluaT.so: $(LUAT_OBJS) $(INST_LIBDIR)/libnerv.so: $(NERV_OBJS) $(LIB_PATH)/libnervcore.so $(LIB_PATH)/libluaT.so gcc -shared -o $@ $(NERV_OBJS) $(LDFLAGS) -Wl,-rpath=$(LIB_PATH) -L$(LIB_PATH) -lnervcore -lluaT -$(OBJ_DIR)/matrix/cumatrix.o: matrix/generic/cumatrix.c matrix/generic/matrix.c matrix/generic/cukernel.cu +$(OBJ_DIR)/matrix/cumatrix.o: matrix/generic/cumatrix.c matrix/generic/matrix.c $(OBJ_DIR)/matrix/mmatrix.o: matrix/generic/mmatrix.c matrix/generic/matrix.c $(OBJ_DIR)/lib/matrix/cumatrix.o: lib/matrix/generic/cumatrix.c lib/matrix/generic/matrix.c lib/matrix/generic/cukernel.cu diff --git a/nerv/matrix/generic/cukernel.cu b/nerv/matrix/generic/cukernel.cu deleted file mode 100644 index 2ae5e62..0000000 --- a/nerv/matrix/generic/cukernel.cu +++ /dev/null @@ -1,592 +0,0 @@ -#ifdef NERV_GENERIC_CUKERNEL -#include -#include -#include "matrix.h" -#include "cuda.h" -#include "float.h" -#define CUDA_THREADS_N 16 -#define CUDA_THREADS_NN ((CUDA_THREADS_N) * (CUDA_THREADS_N)) -#define CEIL_DIV(a, b) (((a) + (b) - 1) / (b)) -__global__ void cudak_(log_elem)(const MATRIX_ELEM *a, MATRIX_ELEM *b, - int nrow, int ncol, int stride) { - int j = blockIdx.x * blockDim.x + threadIdx.x; - int i = blockIdx.y * blockDim.y + threadIdx.y; - long idx; - MATRIX_ELEM tmp; - if (i >= nrow || j >= ncol) return; - idx = j + i * stride; - tmp = a[idx]; - if(tmp < FLT_MIN) tmp = FLT_MIN; - b[idx] = log(tmp); -} - -__global__ void cudak_(mul_elem)(const MATRIX_ELEM *a, const MATRIX_ELEM *b, - MATRIX_ELEM *c, - int nrow, int ncol, int stride) { - int j = blockIdx.x * blockDim.x + threadIdx.x; - int i = blockIdx.y * blockDim.y + threadIdx.y; - long idx; - if (i >= nrow || j >= ncol) return; - idx = j + i * stride; - c[idx] = a[idx] * b[idx]; -} - -__global__ void cudak_(sigmoid)(const MATRIX_ELEM *a, MATRIX_ELEM *b, - int nrow, int ncol, int stride) { - int j = blockIdx.x * blockDim.x + threadIdx.x; - int i = blockIdx.y * blockDim.y + threadIdx.y; - long idx; - if (i >= nrow || j >= ncol) return; - idx = j + i * stride; - b[idx] = 1.0 / (1.0 + exp(-a[idx])); -} - -__global__ void cudak_(sigmoid_grad)(const MATRIX_ELEM *output, - const MATRIX_ELEM *err, - MATRIX_ELEM *nerr, - int nrow, int ncol, int stride) { - int j = blockIdx.x * blockDim.x + threadIdx.x; - int i = blockIdx.y * blockDim.y + threadIdx.y; - long idx; - if (i >= nrow || j >= ncol) return; - idx = j + i * stride; - nerr[idx] = output[idx] * (1.0 - output[idx]) * err[idx]; -} - -__global__ void cudak_(softmax_final)(const MATRIX_ELEM *a, MATRIX_ELEM *b, - const MATRIX_ELEM *max, const MATRIX_ELEM *deno, - int nrow, int ncol, int stride, int mstride) { - int j = blockIdx.x * blockDim.x + threadIdx.x; - int i = blockIdx.y * blockDim.y + threadIdx.y; - long idx; - if (i >= nrow || j >= ncol) return; - idx = j + i * stride; - b[idx] = exp(a[idx] - max[0 + i * mstride]) / deno[0 + i * mstride]; -} - -__global__ void cudak_(block_reduce_rowsum)(const MATRIX_ELEM *input, - MATRIX_ELEM *output, - const int istride, const int ostride, - const int n) { - extern __shared__ MATRIX_ELEM cudak_(arr)[]; - int j = blockIdx.x * blockDim.x + threadIdx.x; - cudak_(arr)[threadIdx.x] = j < n ? input[j + istride * blockIdx.y] : 0; - __syncthreads(); - for (int offset = blockDim.x >> 1; offset; offset >>= 1) - { - if (threadIdx.x < offset) - cudak_(arr)[threadIdx.x] += cudak_(arr)[threadIdx.x + offset]; - __syncthreads(); - } - if (threadIdx.x == 0) - output[blockIdx.x + ostride * blockIdx.y] = cudak_(arr)[0]; -} - -__global__ void cudak_(block_reduce_colsum)(const MATRIX_ELEM *input, - MATRIX_ELEM *output, - const int istride, const int ostride, - const int n) { - extern __shared__ MATRIX_ELEM cudak_(arr)[]; - int i = blockIdx.y * blockDim.y + threadIdx.y; - cudak_(arr)[threadIdx.y] = i < n ? input[blockIdx.x + istride * i] : 0; - __syncthreads(); - for (int offset = blockDim.y >> 1; offset; offset >>= 1) - { - if (threadIdx.y < offset) - cudak_(arr)[threadIdx.y] += cudak_(arr)[threadIdx.y + offset]; - __syncthreads(); - } - if (threadIdx.y == 0) - output[blockIdx.x + ostride * blockIdx.y] = cudak_(arr)[0]; -} - -__global__ void cudak_(block_reduce_colsame)(const MATRIX_ELEM *input, - const MATRIX_ELEM *ref_input, - MATRIX_ELEM *output, - const int istride, const int ostride, - const int n) { - extern __shared__ MATRIX_ELEM cudak_(arr)[]; - int i = blockIdx.y * blockDim.y + threadIdx.y; - cudak_(arr)[threadIdx.y] = (i < n && input[blockIdx.x + istride * i] == \ - ref_input[blockIdx.x + istride * i]) ? 1.0 : 0; - __syncthreads(); - for (int offset = blockDim.y >> 1; offset; offset >>= 1) - { - if (threadIdx.y < offset) - cudak_(arr)[threadIdx.y] += cudak_(arr)[threadIdx.y + offset]; - __syncthreads(); - } - if (threadIdx.y == 0) - output[blockIdx.x + ostride * blockIdx.y] = cudak_(arr)[0]; -} - -__global__ void cudak_(block_reduce_softmax_rowsum)(const MATRIX_ELEM *input, - MATRIX_ELEM *output, - const MATRIX_ELEM *max, - const int istride, const int ostride, - const int mstride, const int n) { - extern __shared__ MATRIX_ELEM cudak_(arr)[]; - int j = blockIdx.x * blockDim.x + threadIdx.x; - cudak_(arr)[threadIdx.x] = j < n ? exp(input[j + istride * blockIdx.y] - \ - max[0 + mstride * blockIdx.y]) : 0; - __syncthreads(); - for (int offset = blockDim.x >> 1; offset; offset >>= 1) - { - if (threadIdx.x < offset) - cudak_(arr)[threadIdx.x] += cudak_(arr)[threadIdx.x + offset]; - __syncthreads(); - } - if (threadIdx.x == 0) - output[blockIdx.x + ostride * blockIdx.y] = cudak_(arr)[0]; -} - -__global__ void cudak_(block_reduce_rowmax)(const MATRIX_ELEM *input, - MATRIX_ELEM *output, - const int istride, const int ostride, - const int n) { - extern __shared__ MATRIX_ELEM cudak_(arr)[]; - int j = blockIdx.x * blockDim.x + threadIdx.x; - cudak_(arr)[threadIdx.x] = j < n ? input[j + istride * blockIdx.y] : -FLT_MAX; - __syncthreads(); - for (int offset = blockDim.x >> 1; offset; offset >>= 1) - { - if (threadIdx.x < offset) - { - MATRIX_ELEM l = cudak_(arr)[threadIdx.x], - r = cudak_(arr)[threadIdx.x + offset]; - if (r > l) - cudak_(arr)[threadIdx.x] = r; - } - __syncthreads(); - } - if (threadIdx.x == 0) - output[blockIdx.x + ostride * blockIdx.y] = cudak_(arr)[0]; -} - -__global__ void cudak_(block_reduce_rowmax_idx)(const MATRIX_ELEM *input, - const MATRIX_ELEM *idx_input, - MATRIX_ELEM *output, - MATRIX_ELEM *idx_output, - const int istride, const int ostride, - const int n) { - extern __shared__ MATRIX_ELEM cudak_(arr)[]; - MATRIX_ELEM *arr_val = cudak_(arr); - MATRIX_ELEM *arr_idx = arr_val + blockDim.x; - int j = blockIdx.x * blockDim.x + threadIdx.x; - arr_val[threadIdx.x] = j < n ? input[j + istride * blockIdx.y] : -FLT_MAX; - arr_idx[threadIdx.x] = j < n ? idx_input[j + istride * blockIdx.y] : 0; - __syncthreads(); - for (int offset = blockDim.x >> 1; offset; offset >>= 1) - { - if (threadIdx.x < offset) - { - MATRIX_ELEM l = arr_val[threadIdx.x], - r = arr_val[threadIdx.x + offset]; - if (r > l) - { - arr_val[threadIdx.x] = r; - arr_idx[threadIdx.x] = arr_idx[threadIdx.x + offset]; - } - } - __syncthreads(); - } - if (threadIdx.x == 0) - { - output[blockIdx.x + ostride * blockIdx.y] = arr_val[0]; - idx_output[blockIdx.x + ostride * blockIdx.y] = arr_idx[0]; - } -} - -__global__ void cudak_(add_row)(const MATRIX_ELEM *a, MATRIX_ELEM *b, - int nrow, int ncol, int stride, double beta) { - int j = blockIdx.x * blockDim.x + threadIdx.x; - int i = blockIdx.y * blockDim.y + threadIdx.y; - if (i >= nrow || j >= ncol) return; - b[j + i * stride] += beta * a[j]; -} - -__global__ void cudak_(fill)(MATRIX_ELEM *a, - int nrow, int ncol, int stride, double val) { - int j = blockIdx.x * blockDim.x + threadIdx.x; - int i = blockIdx.y * blockDim.y + threadIdx.y; - if (i >= nrow || j >= ncol) return; - a[j + i * stride] = val; -} - -__global__ void cudak_(clip)(MATRIX_ELEM *a, - int nrow, int ncol, int stride, double val_1, double val_2) { - int j = blockIdx.x * blockDim.x + threadIdx.x; - int i = blockIdx.y * blockDim.y + threadIdx.y; - if (i >= nrow || j >= ncol) return; - if (a[j + i * stride] > val_2) - a[j + i * stride] = val_2; - else if (a[j + i * stride] < val_1) - a[j + i * stride] = val_1; -} - -__global__ void cudak_(expand_frm)(const MATRIX_ELEM *a, MATRIX_ELEM *b, - int nrow, int ncol, - int enrow, int encol, - int stride, int estride, - int context) { - int j = blockIdx.x * blockDim.x + threadIdx.x; - int i = blockIdx.y * blockDim.y + threadIdx.y; - int ridx; - if (i >= enrow || j >= encol) return; - ridx = i + j / ncol - context; - if (ridx < 0) ridx = 0; - else if (ridx >= nrow) ridx = nrow - 1; - b[j + i * estride] = a[j % ncol + ridx * stride]; -} - -__global__ void cudak_(rearrange_frm)(const MATRIX_ELEM *a, MATRIX_ELEM *b, - int nrow, int ncol, - int stride, int step, int orig_dim) { - int j = blockIdx.x * blockDim.x + threadIdx.x; - int i = blockIdx.y * blockDim.y + threadIdx.y; - if (i >= nrow || j >= ncol) return; - b[j + i * stride] = a[j / step + (j % step) * orig_dim + i * stride]; -} - -__global__ void cudak_(scale_rows_by_col)(const MATRIX_ELEM *a, MATRIX_ELEM *b, - int nrow, int ncol, - int astride, int bstride) { - int j = blockIdx.x * blockDim.x + threadIdx.x; - int i = blockIdx.y * blockDim.y + threadIdx.y; - if (i >= nrow || j >= ncol) return; - b[j + i * bstride] *= a[i * astride]; -} - -__global__ void cudak_(scale_rows_by_row)(const MATRIX_ELEM *a, MATRIX_ELEM *b, - int nrow, int ncol, - int stride) { - int j = blockIdx.x * blockDim.x + threadIdx.x; - int i = blockIdx.y * blockDim.y + threadIdx.y; - if (i >= nrow || j >= ncol) return; - b[j + i * stride] *= a[j]; -} - -__global__ void cudak_(decompress)(const MATRIX_ELEM *a, MATRIX_ELEM *b, - int nrow, int ncol, - int stride_a, int stride_b) { - int j = blockIdx.x * blockDim.x + threadIdx.x; - int i = blockIdx.y * blockDim.y + threadIdx.y; - if (i >= nrow || j >= ncol) return; - b[lrintf(a[j + i * stride_a]) + i * stride_b] = 1.0; -} - -__global__ void cudak_(gen_col_idx)(MATRIX_ELEM *b, - int nrow, int ncol, int stride) { - int j = blockIdx.x * blockDim.x + threadIdx.x; - int i = blockIdx.y * blockDim.y + threadIdx.y; - if (i >= nrow || j >= ncol) return; - b[j + i * stride] = j; -} - -extern "C" { -#include "../cukernel.h" - void cudak_(cuda_log_elem)(const Matrix *a, Matrix *b) { - dim3 threadsPerBlock(CUDA_THREADS_N, CUDA_THREADS_N); - dim3 numBlocks(CEIL_DIV(b->ncol, threadsPerBlock.x), - CEIL_DIV(b->nrow, threadsPerBlock.y)); - cudak_(log_elem)<<>> \ - (MATRIX_ELEM_PTR(a), MATRIX_ELEM_PTR(b), - b->nrow, b->ncol, b->stride / sizeof(MATRIX_ELEM)); - cudaStreamSynchronize(0); - } - - void cudak_(cuda_mul_elem)(const Matrix *a, const Matrix *b, - Matrix *c) { - dim3 threadsPerBlock(CUDA_THREADS_N, CUDA_THREADS_N); - dim3 numBlocks(CEIL_DIV(b->ncol, threadsPerBlock.x), - CEIL_DIV(b->nrow, threadsPerBlock.y)); - cudak_(mul_elem)<<>> \ - (MATRIX_ELEM_PTR(a), MATRIX_ELEM_PTR(b), - MATRIX_ELEM_PTR(c), - b->nrow, b->ncol, b->stride / sizeof(MATRIX_ELEM)); - cudaStreamSynchronize(0); - } - - void cudak_(cuda_sigmoid)(const Matrix *a, Matrix *b) { - dim3 threadsPerBlock(CUDA_THREADS_N, CUDA_THREADS_N); - dim3 numBlocks(CEIL_DIV(b->ncol, threadsPerBlock.x), - CEIL_DIV(b->nrow, threadsPerBlock.y)); - cudak_(sigmoid)<<>> \ - (MATRIX_ELEM_PTR(a), MATRIX_ELEM_PTR(b), b->nrow, b->ncol, - b->stride / sizeof(MATRIX_ELEM)); - cudaStreamSynchronize(0); - } - - void cudak_(cuda_sigmoid_grad)(const Matrix *output, - const Matrix *err, Matrix *nerr) { - dim3 threadsPerBlock(CUDA_THREADS_N, CUDA_THREADS_N); - dim3 numBlocks(CEIL_DIV(nerr->ncol, threadsPerBlock.x), - CEIL_DIV(nerr->nrow, threadsPerBlock.y)); - cudak_(sigmoid_grad)<<>> \ - (MATRIX_ELEM_PTR(output), MATRIX_ELEM_PTR(err), - MATRIX_ELEM_PTR(nerr), - nerr->nrow, nerr->ncol, - nerr->stride / sizeof(MATRIX_ELEM)); - cudaStreamSynchronize(0); - } - - void cudak_(cuda_rowsum)(const Matrix *a, Matrix *b) { - dim3 block(CUDA_THREADS_NN, 1); - int ncol = a->ncol; - int blocks_per_row = CEIL_DIV(ncol, block.x); - dim3 grid(blocks_per_row, a->nrow); - MATRIX_ELEM *res; - size_t stride; - cudaMallocPitch(&res, &stride, blocks_per_row * sizeof(MATRIX_ELEM), a->nrow); - cudak_(block_reduce_rowsum)<<>> \ - (MATRIX_ELEM_PTR(a), res, - a->stride / sizeof(MATRIX_ELEM), stride / sizeof(MATRIX_ELEM), - ncol); - ncol = blocks_per_row; - assert((unsigned long)ncol <= block.x); - grid.x = 1; - cudaStreamSynchronize(0); - cudak_(block_reduce_rowsum)<<>> \ - (res, MATRIX_ELEM_PTR(b), - stride / sizeof(MATRIX_ELEM), b->stride / sizeof(MATRIX_ELEM), - ncol); - cudaStreamSynchronize(0); - cudaFree(res); - } - - void cudak_(cuda_colsame)(const Matrix *a, const Matrix *ref, Matrix *b) { - dim3 block(1, CUDA_THREADS_NN); - int nrow = a->nrow; - int blocks_per_col = CEIL_DIV(nrow, block.y); - dim3 grid(a->ncol, blocks_per_col); - MATRIX_ELEM *res; - size_t stride; - cudaMallocPitch(&res, &stride, a->ncol * sizeof(MATRIX_ELEM), blocks_per_col); - cudak_(block_reduce_colsame)<<>> \ - (MATRIX_ELEM_PTR(a), MATRIX_ELEM_PTR(ref), res, - a->stride / sizeof(MATRIX_ELEM), stride / sizeof(MATRIX_ELEM), - nrow); - nrow = blocks_per_col; - assert((unsigned long)nrow <= block.y); - grid.y = 1; - cudaStreamSynchronize(0); - cudak_(block_reduce_colsum)<<>> \ - (res, MATRIX_ELEM_PTR(b), - stride / sizeof(MATRIX_ELEM), b->stride / sizeof(MATRIX_ELEM), - nrow); - cudaStreamSynchronize(0); - cudaFree(res); - } - - void cudak_(cuda_colsum)(const Matrix *a, Matrix *b) { - dim3 block(1, CUDA_THREADS_NN); - int nrow = a->nrow; - int blocks_per_col = CEIL_DIV(nrow, block.y); - dim3 grid(a->ncol, blocks_per_col); - MATRIX_ELEM *res; - size_t stride; - cudaMallocPitch(&res, &stride, a->ncol * sizeof(MATRIX_ELEM), blocks_per_col); - cudak_(block_reduce_colsum)<<>> \ - (MATRIX_ELEM_PTR(a), res, - a->stride / sizeof(MATRIX_ELEM), stride / sizeof(MATRIX_ELEM), - nrow); - nrow = blocks_per_col; - assert((unsigned long)nrow <= block.y); - grid.y = 1; - cudaStreamSynchronize(0); - cudak_(block_reduce_colsum)<<>> \ - (res, MATRIX_ELEM_PTR(b), - stride / sizeof(MATRIX_ELEM), b->stride / sizeof(MATRIX_ELEM), - nrow); - cudaStreamSynchronize(0); - cudaFree(res); - } - - void cudak_(cuda_softmax_final)(const Matrix *a, const Matrix *max, - const Matrix *deno, Matrix *b) { - dim3 threadsPerBlock(CUDA_THREADS_N, CUDA_THREADS_N); - dim3 numBlocks(CEIL_DIV(b->ncol, threadsPerBlock.x), - CEIL_DIV(b->nrow, threadsPerBlock.y)); - cudak_(softmax_final)<<>> \ - (MATRIX_ELEM_PTR(a), MATRIX_ELEM_PTR(b), - MATRIX_ELEM_PTR(max), MATRIX_ELEM_PTR(deno), - b->nrow, b->ncol, - b->stride / sizeof(MATRIX_ELEM), - max->stride / sizeof(MATRIX_ELEM)); - cudaStreamSynchronize(0); - } - - void cudak_(cuda_softmax_denominator)(const Matrix *a, const Matrix *max, Matrix *b) { - dim3 block(CUDA_THREADS_NN, 1); - int ncol = a->ncol; - int blocks_per_row = CEIL_DIV(ncol, block.x); - dim3 grid(blocks_per_row, a->nrow); - MATRIX_ELEM *res; - size_t stride; - assert(max->ncol == 1); - cudaMallocPitch(&res, &stride, blocks_per_row * sizeof(MATRIX_ELEM), a->nrow); - cudak_(block_reduce_softmax_rowsum) \ - <<>> \ - (MATRIX_ELEM_PTR(a), res, MATRIX_ELEM_PTR(max), - a->stride / sizeof(MATRIX_ELEM), stride / sizeof(MATRIX_ELEM), - max->stride / sizeof(MATRIX_ELEM), - ncol); - ncol = blocks_per_row; - assert((unsigned long)ncol <= block.x); - grid.x = 1; - cudaStreamSynchronize(0); - cudak_(block_reduce_rowsum) \ - <<>> \ - (res, MATRIX_ELEM_PTR(b), - stride / sizeof(MATRIX_ELEM), b->stride / sizeof(MATRIX_ELEM), - ncol); - cudaStreamSynchronize(0); - cudaFree(res); - } - - void cudak_(cuda_rowmax)(const Matrix *a, Matrix *b) { - dim3 block(CUDA_THREADS_NN, 1); - int ncol = a->ncol; - int blocks_per_row = CEIL_DIV(ncol, block.x); - dim3 grid(blocks_per_row, a->nrow); - MATRIX_ELEM *res; - size_t stride; - cudaMallocPitch(&res, &stride, blocks_per_row * sizeof(MATRIX_ELEM), a->nrow); - cudak_(block_reduce_rowmax)<<>> \ - (MATRIX_ELEM_PTR(a), res, - a->stride / sizeof(MATRIX_ELEM), stride / sizeof(MATRIX_ELEM), - ncol); - ncol = blocks_per_row; - assert((unsigned long)ncol <= block.x); - grid.x = 1; - cudaStreamSynchronize(0); - cudak_(block_reduce_rowmax)<<>> \ - (res, MATRIX_ELEM_PTR(b), - stride / sizeof(MATRIX_ELEM), b->stride / sizeof(MATRIX_ELEM), - ncol); - cudaStreamSynchronize(0); - cudaFree(res); - } - - void cudak_(cuda_rowmax_idx)(const Matrix *a, Matrix *b, Matrix *b_idx) { - dim3 block(CUDA_THREADS_NN, 1); - int ncol = a->ncol; - int blocks_per_row = CEIL_DIV(ncol, block.x); - dim3 grid(blocks_per_row, a->nrow); - MATRIX_ELEM *a_idx, *res, *res_idx; - size_t stride; - cudaMallocPitch(&a_idx, &stride, a->stride, a->nrow); - cudak_(gen_col_idx)<<>>(a_idx, a->nrow, ncol, stride / sizeof(MATRIX_ELEM)); - cudaMallocPitch(&res, &stride, blocks_per_row * sizeof(MATRIX_ELEM), a->nrow); - cudaMallocPitch(&res_idx, &stride, blocks_per_row * sizeof(MATRIX_ELEM), a->nrow); - cudaStreamSynchronize(0); - cudak_(block_reduce_rowmax_idx)<<>> \ - (MATRIX_ELEM_PTR(a), a_idx, res, res_idx, - a->stride / sizeof(MATRIX_ELEM), stride / sizeof(MATRIX_ELEM), - ncol); - ncol = blocks_per_row; - assert((unsigned long)ncol <= block.x); - grid.x = 1; - cudaStreamSynchronize(0); - cudak_(block_reduce_rowmax_idx)<<>> \ - (res, res_idx, MATRIX_ELEM_PTR(b), MATRIX_ELEM_PTR(b_idx), - stride / sizeof(MATRIX_ELEM), b->stride / sizeof(MATRIX_ELEM), - ncol); - cudaStreamSynchronize(0); - cudaFree(a_idx); - cudaFree(res); - cudaFree(res_idx); - } - - /* in-place calc */ - void cudak_(cuda_add_row)(const Matrix *a, Matrix *b, double beta) { - dim3 threadsPerBlock(CUDA_THREADS_N, CUDA_THREADS_N); - dim3 numBlocks(CEIL_DIV(b->ncol, threadsPerBlock.x), - CEIL_DIV(b->nrow, threadsPerBlock.y)); - cudak_(add_row)<<>> \ - (MATRIX_ELEM_PTR(a), MATRIX_ELEM_PTR(b), b->nrow, b->ncol, - b->stride / sizeof(MATRIX_ELEM), beta); - cudaStreamSynchronize(0); - } - - void cudak_(cuda_fill)(Matrix *a, double val) { - dim3 threadsPerBlock(CUDA_THREADS_N, CUDA_THREADS_N); - dim3 numBlocks(CEIL_DIV(a->ncol, threadsPerBlock.x), - CEIL_DIV(a->nrow, threadsPerBlock.y)); - cudak_(fill)<<>> \ - (MATRIX_ELEM_PTR(a), a->nrow, a->ncol, - a->stride / sizeof(MATRIX_ELEM), val); - cudaStreamSynchronize(0); - } - - void cudak_(cuda_clip)(Matrix *a, double val_1, double val_2) { - dim3 threadsPerBlock(CUDA_THREADS_N, CUDA_THREADS_N); - dim3 numBlocks(CEIL_DIV(a->ncol, threadsPerBlock.x), - CEIL_DIV(a->nrow, threadsPerBlock.y)); - cudak_(clip)<<>> \ - (MATRIX_ELEM_PTR(a), a->nrow, a->ncol, - a->stride / sizeof(MATRIX_ELEM), val_1, val_2); - cudaStreamSynchronize(0); - } - - void cudak_(cuda_expand_frm)(const Matrix *a, Matrix *b, int context) { - dim3 threadsPerBlock(CUDA_THREADS_N, CUDA_THREADS_N); - dim3 numBlocks(CEIL_DIV(b->ncol, threadsPerBlock.x), - CEIL_DIV(b->nrow, threadsPerBlock.y)); - cudak_(expand_frm)<<>> \ - (MATRIX_ELEM_PTR(a), MATRIX_ELEM_PTR(b), - a->nrow, a->ncol, - b->nrow, b->ncol, - a->stride / sizeof(MATRIX_ELEM), - b->stride / sizeof(MATRIX_ELEM), - context); - cudaStreamSynchronize(0); - } - - void cudak_(cuda_rearrange_frm)(const Matrix *a, Matrix *b, int step) { - dim3 threadsPerBlock(CUDA_THREADS_N, CUDA_THREADS_N); - dim3 numBlocks(CEIL_DIV(b->ncol, threadsPerBlock.x), - CEIL_DIV(b->nrow, threadsPerBlock.y)); - cudak_(rearrange_frm)<<>> \ - (MATRIX_ELEM_PTR(a), MATRIX_ELEM_PTR(b), - b->nrow, b->ncol, b->stride / sizeof(MATRIX_ELEM), - step, b->ncol / step); - cudaStreamSynchronize(0); - } - - void cudak_(cuda_scale_rows_by_col)(const Matrix *a, Matrix *b) { - dim3 threadsPerBlock(CUDA_THREADS_N, CUDA_THREADS_N); - dim3 numBlocks(CEIL_DIV(b->ncol, threadsPerBlock.x), - CEIL_DIV(b->nrow, threadsPerBlock.y)); - cudak_(scale_rows_by_col)<<>> \ - (MATRIX_ELEM_PTR(a), MATRIX_ELEM_PTR(b), - b->nrow, b->ncol, - a->stride / sizeof(MATRIX_ELEM), - b->stride / sizeof(MATRIX_ELEM)); - cudaStreamSynchronize(0); - } - - void cudak_(cuda_scale_rows_by_row)(const Matrix *a, Matrix *b) { - dim3 threadsPerBlock(CUDA_THREADS_N, CUDA_THREADS_N); - dim3 numBlocks(CEIL_DIV(b->ncol, threadsPerBlock.x), - CEIL_DIV(b->nrow, threadsPerBlock.y)); - cudak_(scale_rows_by_row)<<>> \ - (MATRIX_ELEM_PTR(a), MATRIX_ELEM_PTR(b), - b->nrow, b->ncol, b->stride / sizeof(MATRIX_ELEM)); - cudaStreamSynchronize(0); - } - - void cudak_(cuda_decompress)(const Matrix *a, Matrix *b) { - dim3 threadsPerBlock(1, CUDA_THREADS_NN); - dim3 numBlocks(1, CEIL_DIV(a->nrow, threadsPerBlock.y)); - cudak_(decompress)<<>> \ - (MATRIX_ELEM_PTR(a), MATRIX_ELEM_PTR(b), - a->nrow, a->ncol, - a->stride / sizeof(MATRIX_ELEM), - b->stride / sizeof(MATRIX_ELEM)); - cudaStreamSynchronize(0); - } -} -#endif diff --git a/speech b/speech index 08e33af..acd1bc3 160000 --- a/speech +++ b/speech @@ -1 +1 @@ -Subproject commit 08e33afa533af1f026ac271446a0c873fe0bb5cb +Subproject commit acd1bc3cf812f69a6260179b584f2a3f0e6d6b80 -- cgit v1.2.3 From bca6cfa05af1dfc898bdb35f9f481048840e9217 Mon Sep 17 00:00:00 2001 From: Determinant Date: Wed, 5 Aug 2015 08:10:24 +0800 Subject: keep up with speech repo --- speech | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/speech b/speech index acd1bc3..261aed4 160000 --- a/speech +++ b/speech @@ -1 +1 @@ -Subproject commit acd1bc3cf812f69a6260179b584f2a3f0e6d6b80 +Subproject commit 261aed43b863c85ecf709dcac2739c388ec491d2 -- cgit v1.2.3 From 00b02761747caf09756c7dccf11a7236777cbb42 Mon Sep 17 00:00:00 2001 From: Determinant Date: Wed, 5 Aug 2015 08:17:59 +0800 Subject: ... --- .gitmodules | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/.gitmodules b/.gitmodules index 1432de9..16785f1 100644 --- a/.gitmodules +++ b/.gitmodules @@ -3,10 +3,7 @@ url = http://luajit.org/git/luajit-2.0.git [submodule "nerv-speech"] path = nerv-speech - url = https://github.com/Determinant/nerv-speech.git -[submodule "speech"] - path = speech - url = https://github.com/Determinant/nerv-speech.git + url = https://github.com/Nerv-SJTU/nerv-speech.git [submodule "luarocks"] path = luarocks url = https://github.com/keplerproject/luarocks.git -- cgit v1.2.3 From 30701effba795635c9dcfa86882486f1f184e243 Mon Sep 17 00:00:00 2001 From: Determinant Date: Wed, 5 Aug 2015 08:19:03 +0800 Subject: ... --- .gitmodules | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitmodules b/.gitmodules index 16785f1..afad59e 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,7 +1,7 @@ [submodule "luajit-2.0"] path = luajit-2.0 url = http://luajit.org/git/luajit-2.0.git -[submodule "nerv-speech"] +[submodule "speech"] path = nerv-speech url = https://github.com/Nerv-SJTU/nerv-speech.git [submodule "luarocks"] -- cgit v1.2.3 From 7ae89059d68850e12826bc6812e4a6d521e45b53 Mon Sep 17 00:00:00 2001 From: Determinant Date: Wed, 5 Aug 2015 08:20:18 +0800 Subject: ... --- .gitmodules | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.gitmodules b/.gitmodules index afad59e..217ded3 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,8 +1,8 @@ [submodule "luajit-2.0"] path = luajit-2.0 url = http://luajit.org/git/luajit-2.0.git -[submodule "speech"] - path = nerv-speech +[submodule "nerv-speech"] + path = speech url = https://github.com/Nerv-SJTU/nerv-speech.git [submodule "luarocks"] path = luarocks -- cgit v1.2.3 From 7579ff4941d7019d4e911978879ec07b62a4e523 Mon Sep 17 00:00:00 2001 From: Determinant Date: Wed, 5 Aug 2015 09:29:24 +0800 Subject: use expanded features and do global transf in embedding_example --- embedding_example/setup_nerv.lua | 10 ++++++---- embedding_example/swb_baseline_decode.lua | 5 +++++ speech | 2 +- 3 files changed, 12 insertions(+), 5 deletions(-) diff --git a/embedding_example/setup_nerv.lua b/embedding_example/setup_nerv.lua index 3ae878d..49a5dd6 100644 --- a/embedding_example/setup_nerv.lua +++ b/embedding_example/setup_nerv.lua @@ -7,17 +7,19 @@ param_repo:import(gconf.initialized_param, nil, gconf) local sublayer_repo = make_sublayer_repo(param_repo) local layer_repo = make_layer_repo(sublayer_repo, param_repo) local network = get_network(layer_repo) +local global_transf = get_global_transf(layer_repo) local batch_size = 1 network:init(batch_size) function propagator(input, output) - local gpu_input = nerv.CuMatrixFloat(input:nrow(), input:ncol()) + local transformed = nerv.speech_utils.global_transf(input, + global_transf, 0, gconf) -- preprocessing + local gpu_input = nerv.CuMatrixFloat(transformed:nrow(), transformed:ncol()) local gpu_output = nerv.CuMatrixFloat(output:nrow(), output:ncol()) - gpu_input:copy_fromh(input) - print(gpu_input) + print(transformed) + gpu_input:copy_fromh(transformed) network:propagate({gpu_input}, {gpu_output}) gpu_output:copy_toh(output) - print(output) -- collect garbage in-time to save GPU memory collectgarbage("collect") end diff --git a/embedding_example/swb_baseline_decode.lua b/embedding_example/swb_baseline_decode.lua index 14a463b..8cdb320 100644 --- a/embedding_example/swb_baseline_decode.lua +++ b/embedding_example/swb_baseline_decode.lua @@ -107,3 +107,8 @@ end function get_network(layer_repo) return layer_repo:get_layer("main") end + + +function get_global_transf(layer_repo) + return layer_repo:get_layer("global_transf") +end diff --git a/speech b/speech index 261aed4..c6c6442 160000 --- a/speech +++ b/speech @@ -1 +1 @@ -Subproject commit 261aed43b863c85ecf709dcac2739c388ec491d2 +Subproject commit c6c644223dc9168befd189bf6f33243390671c99 -- cgit v1.2.3 From 156e2b5d97ac7a702e655dc3a1e260c72eecaa9c Mon Sep 17 00:00:00 2001 From: Determinant Date: Thu, 6 Aug 2015 08:56:18 +0800 Subject: fix typoes in nerv.error --- nerv/init.lua | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/nerv/init.lua b/nerv/init.lua index 183ae6d..9c1a5c8 100644 --- a/nerv/init.lua +++ b/nerv/init.lua @@ -1,15 +1,19 @@ require 'libnerv' -function nerv.error(fmt, ...) - error("[nerv] internal error: " .. fmt .. "\n", ...) -end - function nerv.error_method_not_implemented() nerv.error("method not implemented"); end +function nerv.sprintf(fmt, ...) + return string.format(fmt, ...) +end + function nerv.printf(fmt, ...) - io.write(string.format(fmt, ...)) + io.write(nerv.sprintf(fmt, ...)) +end + +function nerv.error(fmt, ...) + error(nerv.sprintf("[nerv] internal error: " .. fmt .. "\n", ...)) end function nerv.mesg_with_timestamp(fmt, ...) -- cgit v1.2.3 From ddc52b089a29dbe805ccbe499460b3e5d5b060c7 Mon Sep 17 00:00:00 2001 From: Determinant Date: Thu, 6 Aug 2015 10:25:26 +0800 Subject: do not use submodule for nerv-speech --- .gitmodules | 3 --- README.md | 4 ++-- speech | 1 - 3 files changed, 2 insertions(+), 6 deletions(-) delete mode 160000 speech diff --git a/.gitmodules b/.gitmodules index 217ded3..9f556c5 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,9 +1,6 @@ [submodule "luajit-2.0"] path = luajit-2.0 url = http://luajit.org/git/luajit-2.0.git -[submodule "nerv-speech"] - path = speech - url = https://github.com/Nerv-SJTU/nerv-speech.git [submodule "luarocks"] path = luarocks url = https://github.com/keplerproject/luarocks.git diff --git a/README.md b/README.md index 10d531c..efec2c4 100644 --- a/README.md +++ b/README.md @@ -8,9 +8,9 @@ First make sure you have __lua__ and __CUDA__ installed on your computer. __Nerv__ is currently developed via github.You can download and make __Nerv__ by doing the following: ``` cd ~ -git clone https://github.com/Determinant/nerv.git +git clone https://github.com/Nerv-SJTU/nerv.git cd nerv -git submodule init && git submodule update +git clone https://github.com/Nerv-SJTU/nerv-speech.git make ``` The `git submodule` command is for the __luajit__ repository inside __Nerv__. diff --git a/speech b/speech deleted file mode 160000 index c6c6442..0000000 --- a/speech +++ /dev/null @@ -1 +0,0 @@ -Subproject commit c6c644223dc9168befd189bf6f33243390671c99 -- cgit v1.2.3 From 2dc87bc02a1242dd5e029d0baaf4e0ae7173184f Mon Sep 17 00:00:00 2001 From: Determinant Date: Thu, 6 Aug 2015 10:34:43 +0800 Subject: ... --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index efec2c4..be8e5b0 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,8 @@ __Nerv__ is currently developed via github.You can download and make __Nerv__ by cd ~ git clone https://github.com/Nerv-SJTU/nerv.git cd nerv -git clone https://github.com/Nerv-SJTU/nerv-speech.git +git clone https://github.com/Nerv-SJTU/nerv-speech.git speech +git submodule init && git submodule update make ``` The `git submodule` command is for the __luajit__ repository inside __Nerv__. -- cgit v1.2.3 From b4d9cfa8e3a4735687311577dded97d889340134 Mon Sep 17 00:00:00 2001 From: Determinant Date: Thu, 6 Aug 2015 14:08:26 +0800 Subject: make network configuration example file clearer --- embedding_example/main.c | 2 +- embedding_example/setup_nerv.lua | 5 +- embedding_example/swb_baseline_decode.lua | 114 --------------------- nerv/doc/gh-pages | 1 + nerv/examples/asr_trainer.lua | 11 +- nerv/examples/swb_baseline.lua | 68 ++++++++++--- nerv/examples/swb_baseline_basic.lua | 161 ++++++++++++++++++++++++++++++ nerv/nn/layer_dag.lua | 3 +- nerv/nn/layer_repo.lua | 8 +- 9 files changed, 230 insertions(+), 143 deletions(-) delete mode 100644 embedding_example/swb_baseline_decode.lua create mode 160000 nerv/doc/gh-pages create mode 100644 nerv/examples/swb_baseline_basic.lua diff --git a/embedding_example/main.c b/embedding_example/main.c index b3c9bf2..8856d58 100644 --- a/embedding_example/main.c +++ b/embedding_example/main.c @@ -22,7 +22,7 @@ void setup_nerv() { luaL_openlibs(L); luaL_loadfile(L, "setup_nerv.lua"); /* network configuration */ - lua_pushstring(L, "swb_baseline_decode.lua"); + lua_pushstring(L, "../nerv/examples/swb_baseline.lua"); if (lua_pcall(L, 1, LUA_MULTRET, 0)) { printf("%s\n", luaL_checkstring(L, 1)); diff --git a/embedding_example/setup_nerv.lua b/embedding_example/setup_nerv.lua index 49a5dd6..5ade950 100644 --- a/embedding_example/setup_nerv.lua +++ b/embedding_example/setup_nerv.lua @@ -4,9 +4,8 @@ local arg = {...} dofile(arg[1]) local param_repo = nerv.ParamRepo() param_repo:import(gconf.initialized_param, nil, gconf) -local sublayer_repo = make_sublayer_repo(param_repo) -local layer_repo = make_layer_repo(sublayer_repo, param_repo) -local network = get_network(layer_repo) +local layer_repo = make_layer_repo(param_repo) +local network = get_decode_network(layer_repo) local global_transf = get_global_transf(layer_repo) local batch_size = 1 network:init(batch_size) diff --git a/embedding_example/swb_baseline_decode.lua b/embedding_example/swb_baseline_decode.lua deleted file mode 100644 index 8cdb320..0000000 --- a/embedding_example/swb_baseline_decode.lua +++ /dev/null @@ -1,114 +0,0 @@ -require 'htk_io' -gconf = {lrate = 0.8, wcost = 1e-6, momentum = 0.9, - cumat_type = nerv.CuMatrixFloat, - mmat_type = nerv.MMatrixFloat, - frm_ext = 5, - tr_scp = "/slfs1/users/mfy43/swb_ivec/train_bp.scp", - cv_scp = "/slfs1/users/mfy43/swb_ivec/train_cv.scp", - htk_conf = "/slfs1/users/mfy43/swb_ivec/plp_0_d_a.conf", - initialized_param = {"/slfs1/users/mfy43/swb_init.nerv", - "/slfs1/users/mfy43/swb_global_transf.nerv"}, - debug = false} - -function make_sublayer_repo(param_repo) - return nerv.LayerRepo( - { - -- global transf - ["nerv.BiasLayer"] = - { - blayer1 = {{bias = "bias1"}, {dim_in = {429}, dim_out = {429}}}, - blayer2 = {{bias = "bias2"}, {dim_in = {429}, dim_out = {429}}} - }, - ["nerv.WindowLayer"] = - { - wlayer1 = {{window = "window1"}, {dim_in = {429}, dim_out = {429}}}, - wlayer2 = {{window = "window2"}, {dim_in = {429}, dim_out = {429}}} - }, - -- biased linearity - ["nerv.AffineLayer"] = - { - affine0 = {{ltp = "affine0_ltp", bp = "affine0_bp"}, -