From ea6f2990f99dd9ded6a0e74d75a3ec84900a2518 Mon Sep 17 00:00:00 2001 From: Determinant Date: Wed, 3 Jun 2015 23:00:45 +0800 Subject: demo now works (without random shuffle) --- Makefile | 5 ++- examples/test_dnn_layers.lua | 2 +- examples/test_nn_lib.lua | 91 ++++++++++++++++++++++++++++++---------- io/init.lua | 22 ++++++++++ io/sgd_buffer.lua | 99 ++++++++++++++++++++++++++++++++++++++++++++ layer/affine.lua | 2 +- layer/init.lua | 24 +++++------ layer/softmax_ce.lua | 20 ++++++--- matrix/cuda_helper.h | 2 +- matrix/cukernel.h | 1 + matrix/generic/cukernel.cu | 19 +++++++++ matrix/generic/cumatrix.c | 87 +++++++++++++++++++++++++------------- matrix/generic/mmatrix.c | 20 ++++++++- matrix/init.lua | 3 +- nn/layer_dag.lua | 59 ++++++++++++++++---------- speech | 2 +- 16 files changed, 362 insertions(+), 96 deletions(-) create mode 100644 io/sgd_buffer.lua diff --git a/Makefile b/Makefile index 934235f..f0d319f 100644 --- a/Makefile +++ b/Makefile @@ -9,7 +9,8 @@ LUA_LIBS := matrix/init.lua io/init.lua nerv.lua \ pl/utils.lua pl/compat.lua \ layer/init.lua layer/affine.lua layer/sigmoid.lua layer/softmax_ce.lua \ layer/window.lua layer/bias.lua \ - nn/init.lua nn/layer_repo.lua nn/param_repo.lua nn/layer_dag.lua + nn/init.lua nn/layer_repo.lua nn/param_repo.lua nn/layer_dag.lua \ + io/sgd_buffer.lua INCLUDE := -I build/luajit-2.0/include/luajit-2.0/ -DLUA_USE_APICHECK CUDA_BASE := /usr/local/cuda-6.5 CUDA_INCLUDE := -I $(CUDA_BASE)/include/ @@ -53,7 +54,7 @@ $(OBJ_DIR)/matrix/cukernel.o: matrix/generic/cukernel.cu speech: -mkdir -p build/objs/speech/tnet_io - $(MAKE) -C speech/ BUILD_DIR=$(BUILD_DIR) LIB_DIR=$(LIB_DIR) OBJ_DIR=$(CURDIR)/build/objs/speech/ + $(MAKE) -C speech/ BUILD_DIR=$(BUILD_DIR) LIB_DIR=$(LIB_DIR) OBJ_DIR=$(CURDIR)/build/objs/speech/ LUA_DIR=$(LUA_DIR) clean: -rm -rf $(OBJ_DIR) diff --git a/examples/test_dnn_layers.lua b/examples/test_dnn_layers.lua index 6e4d98d..f306807 100644 --- a/examples/test_dnn_layers.lua +++ b/examples/test_dnn_layers.lua @@ -3,7 +3,7 @@ require 'layer.sigmoid' require 'layer.softmax_ce' global_conf = {lrate = 0.8, wcost = 1e-6, - momentum = 0.9, mat_type = nerv.CuMatrixFloat} + momentum = 0.9, cumat_type = nerv.CuMatrixFloat} pf = nerv.ChunkFile("affine.param", "r") ltp = pf:read_chunk("a", global_conf) diff --git a/examples/test_nn_lib.lua b/examples/test_nn_lib.lua index ec338fe..9600917 100644 --- a/examples/test_nn_lib.lua +++ b/examples/test_nn_lib.lua @@ -1,14 +1,24 @@ --- require 'layer.affine' --- require 'layer.sigmoid' --- require 'layer.softmax_ce' - +require 'speech.init' gconf = {lrate = 0.8, wcost = 1e-6, momentum = 0.9, - mat_type = nerv.CuMatrixFloat, - batch_size = 10} + cumat_type = nerv.CuMatrixFloat, + mmat_type = nerv.MMatrixFloat, + batch_size = 256} -param_repo = nerv.ParamRepo({"converted.nerv"}) +param_repo = nerv.ParamRepo({"converted.nerv", "global_transf.nerv"}) sublayer_repo = nerv.LayerRepo( { + -- global transf + ["nerv.BiasLayer"] = + { + blayer1 = {{bias = "bias1"}, {dim_in = {429}, dim_out = {429}}}, + blayer2 = {{bias = "bias2"}, {dim_in = {429}, dim_out = {429}}} + }, + ["nerv.WindowLayer"] = + { + wlayer1 = {{window = "window1"}, {dim_in = {429}, dim_out = {429}}}, + wlayer2 = {{window = "window2"}, {dim_in = {429}, dim_out = {429}}} + }, + -- biased linearity ["nerv.AffineLayer"] = { affine0 = {{ltp = "affine0_ltp", bp = "affine0_bp"}, @@ -40,7 +50,7 @@ sublayer_repo = nerv.LayerRepo( }, ["nerv.SoftmaxCELayer"] = { - softmax_ce0 = {{}, {dim_in = {3001, 3001}, dim_out = {}}} + softmax_ce0 = {{}, {dim_in = 
{3001, 1}, dim_out = {}, compressed = true}} } }, param_repo, gconf) @@ -48,8 +58,19 @@ layer_repo = nerv.LayerRepo( { ["nerv.DAGLayer"] = { + global_transf = {{}, { + dim_in = {429}, dim_out = {429}, + sub_layers = sublayer_repo, + connections = { + ["[1]"] = "blayer1[1]", + ["blayer1[1]"] = "wlayer1[1]", + ["wlayer1[1]"] = "blayer2[1]", + ["blayer2[1]"] = "wlayer2[1]", + ["wlayer2[1]"] = "[1]" + } + }}, main = {{}, { - dim_in = {429, 3001}, dim_out = {}, + dim_in = {429, 1}, dim_out = {}, sub_layers = sublayer_repo, connections = { ["[1]"] = "affine0[1]", @@ -74,24 +95,52 @@ layer_repo = nerv.LayerRepo( } }, param_repo, gconf) -df = nerv.ChunkFile("input.param", "r") -label = nerv.CuMatrixFloat(10, 3001) -label:fill(0) -for i = 0, 9 do - label[i][i] = 1.0 -end +tnet_reader = nerv.TNetReader(gconf, + { + id = "main_scp", + scp_file = "/slfs1/users/mfy43/swb_ivec/train_bp.scp", +-- scp_file = "t.scp", + conf_file = "/slfs1/users/mfy43/swb_ivec/plp_0_d_a.conf", + frm_ext = 5, + mlfs = { + ref = { + file = "/slfs1/users/mfy43/swb_ivec/ref.mlf", + format = "map", + format_arg = "/slfs1/users/mfy43/swb_ivec/dict", + dir = "*/", + ext = "lab" + } + }, + global_transf = layer_repo:get_layer("global_transf") + }) + +buffer = nerv.SGDBuffer(gconf, + { + buffer_size = 8192, + readers = { + { reader = tnet_reader, + data = {main_scp = 429, ref = 1}} + } + }) -input = {df:read_chunk("input", gconf).trans, label} -output = {} -err_input = {} -err_output = {input[1]:create()} sm = sublayer_repo:get_layer("softmax_ce0") main = layer_repo:get_layer("main") -main:init() -for i = 0, 3 do +main:init(gconf.batch_size) +cnt = 0 +for data in buffer.get_data, buffer do + if cnt == 1000 then break end + cnt = cnt + 1 + input = {data.main_scp, data.ref} + output = {} + err_input = {} + err_output = {input[1]:create()} + main:propagate(input, output) main:back_propagate(err_output, err_input, input, output) main:update(err_input, input, output) + nerv.utils.printf("cross entropy: %.8f\n", sm.total_ce) nerv.utils.printf("frames: %.8f\n", sm.total_frames) + nerv.utils.printf("err/frm: %.8f\n", sm.total_ce / sm.total_frames) + collectgarbage("collect") end diff --git a/io/init.lua b/io/init.lua index 4a663a7..9bbd51a 100644 --- a/io/init.lua +++ b/io/init.lua @@ -28,3 +28,25 @@ function nerv.ChunkFile:read_chunk(id, global_conf) chunk:read(self:get_chunkdata(id)) return chunk end + +local DataReader = nerv.class("nerv.DataReader") + +function DataReader:__init(global_conf, reader_conf) + nerv.error_method_not_implemented() +end + +function DataReader:get_data() + nerv.error_method_not_implemented() +end + +local DataBuffer = nerv.class("nerv.DataBuffer") + +function DataBuffer:__init(global_conf, buffer_conf) + nerv.error_method_not_implemented() +end + +function DataBuffer:get_batch() + nerv.error_method_not_implemented() +end + +require 'io.sgd_buffer' diff --git a/io/sgd_buffer.lua b/io/sgd_buffer.lua new file mode 100644 index 0000000..dadcf67 --- /dev/null +++ b/io/sgd_buffer.lua @@ -0,0 +1,99 @@ +local SGDBuffer = nerv.class("nerv.SGDBuffer", "nerv.DataBuffer") + +function SGDBuffer:__init(global_conf, buffer_conf) + self.gconf = global_conf + self.buffer_size = math.floor(buffer_conf.buffer_size / + global_conf.batch_size) * global_conf.batch_size + self.head = 0 + self.tail = 0 + self.readers = {} + for i, reader_spec in ipairs(buffer_conf.readers) do + local buffs = {} + for id, width in pairs(reader_spec.data) do + buffs[id] = {data = global_conf.mmat_type(self.buffer_size, width), + leftover = {}, + width 
= width} + end + table.insert(self.readers, {buffs = buffs, + reader = reader_spec.reader, + tail = 0, + has_leftover = false}) + end +end + +function SGDBuffer:saturate() + local buffer_size = self.buffer_size + self.head = 0 + self.tail = buffer_size + for i, reader in ipairs(self.readers) do + reader.tail = 0 + if reader.has_leftover then + local lrow + for id, buff in pairs(reader.buffs) do + lrow = buff.leftover:nrow() + if lrow > buffer_size then + nerv.error("buffer size is too small to contain leftovers") + end + buff.data:copy_from(buff.leftover, 0, lrow) + end + reader.tail = lrow + reader.has_leftover = false + end + while reader.tail < buffer_size do + local data = reader.reader:get_data() + if data == nil then + break + end + local drow = nil + for id, d in pairs(data) do + if drow == nil then + drow = d:nrow() + elseif d:nrow() ~= drow then + nerv.error("reader provides with inconsistent rows of data") + end + end + local remain = buffer_size - reader.tail + if drow > remain then + for id, buff in pairs(reader.buffs) do + local d = data[id] + if d == nil then + nerv.error("reader does not provide data for %s", id) + end + buff.leftover = self.gconf.mmat_type(drow - remain, + buff.width) + buff.leftover:copy_from(d, remain, drow) + end + drow = remain + reader.has_leftover = true + end + for id, buff in pairs(reader.buffs) do + buff.data:copy_from(data[id], 0, drow, reader.tail) + end + reader.tail = reader.tail + drow + end + self.tail = math.min(self.tail, reader.tail) + end + return self.tail >= self.gconf.batch_size +end + +function SGDBuffer:get_data() + local batch_size = self.gconf.batch_size + if self.head >= self.tail then -- buffer is empty + if not self:saturate() then + return nil -- the remaining data cannot build a batch + end + end + if self.head + batch_size > self.tail then + return nil -- the remaining data cannot build a batch + end + local res = {} + for i, reader in ipairs(self.readers) do + for id, buff in pairs(reader.buffs) do + local batch = self.gconf.cumat_type(batch_size, buff.width) + batch:copy_fromh(buff.data, self.head, self.head + batch_size) + res[id] = batch + end + end + self.head = self.head + batch_size + return res +end diff --git a/layer/affine.lua b/layer/affine.lua index 90a1d16..59a0e91 100644 --- a/layer/affine.lua +++ b/layer/affine.lua @@ -4,7 +4,7 @@ local BiasParam = nerv.class('nerv.BiasParam', 'nerv.MatrixParam') local AffineLayer = nerv.class('nerv.AffineLayer', 'nerv.Layer') function MatrixParam:read(pcdata) - self.trans = self.gconf.mat_type.new_from_host( + self.trans = self.gconf.cumat_type.new_from_host( nerv.MMatrixFloat.load(pcdata)) end diff --git a/layer/init.lua b/layer/init.lua index c8c691b..38bcd7f 100644 --- a/layer/init.lua +++ b/layer/init.lua @@ -2,50 +2,50 @@ local Param = nerv.class('nerv.Param') -function nerv.Param:__init(id, global_conf) +function Param:__init(id, global_conf) self.id = id self.gconf = global_conf end -function nerv.Param:get_info() +function Param:get_info() return self.info end -function nerv.Param:set_info(info) +function Param:set_info(info) self.info = info end -function nerv.Param:read(pfhandle) +function Param:read(pfhandle) nerv.error_method_not_implemented() end -function nerv.Param:write(pfhandle) +function Param:write(pfhandle) nerv.error_method_not_implemented() end local Layer = nerv.class('nerv.Layer') -function nerv.Layer:__init(id, global_conf, ...) 
+function Layer:__init(id, global_conf, layer_conf) nerv.error_method_not_implemented() end -function nerv.Layer:init(id) +function Layer:init(id) nerv.error_method_not_implemented() end -function nerv.Layer:update(bp_err, input, output) +function Layer:update(bp_err, input, output) nerv.error_method_not_implemented() end -function nerv.Layer:propagate(input, output) +function Layer:propagate(input, output) nerv.error_method_not_implemented() end -function nerv.Layer:back_propagate(next_bp_err, bp_err, input, output) +function Layer:back_propagate(next_bp_err, bp_err, input, output) nerv.error_method_not_implemented() end -function nerv.Layer:check_dim_len(len_in, len_out) +function Layer:check_dim_len(len_in, len_out) local expected_in = #self.dim_in local expected_out = #self.dim_out if len_in > 0 and expected_in ~= len_in then @@ -58,7 +58,7 @@ function nerv.Layer:check_dim_len(len_in, len_out) end end -function nerv.Layer:get_dim() +function Layer:get_dim() return self.dim_in, self.dim_out end diff --git a/layer/softmax_ce.lua b/layer/softmax_ce.lua index 09eb3a9..cf98c45 100644 --- a/layer/softmax_ce.lua +++ b/layer/softmax_ce.lua @@ -5,6 +5,10 @@ function SoftmaxCELayer:__init(id, global_conf, layer_conf) self.gconf = global_conf self.dim_in = layer_conf.dim_in self.dim_out = layer_conf.dim_out + self.compressed = layer_conf.compressed + if self.compressed == nil then + self.compressed = false + end self:check_dim_len(2, -1) -- two inputs: nn output and label end @@ -26,15 +30,21 @@ function SoftmaxCELayer:propagate(input, output) soutput:softmax(input[1]) local ce = soutput:create() ce:log_elem(soutput) - ce:mul_elem(ce, input[2]) --- print(input[1][0]) --- print(soutput[1][0]) - -- add total ce + local label = input[2] + if self.compressed then + label = label:decompress(input[1]:ncol()) + end + ce:mul_elem(ce, label) + -- add total ce self.total_ce = self.total_ce - ce:rowsum():colsum()[0] self.total_frames = self.total_frames + soutput:nrow() end function SoftmaxCELayer:back_propagate(next_bp_err, bp_err, input, output) -- softmax output - label - next_bp_err[1]:add(self.soutput, input[2], 1.0, -1.0) + local label = input[2] + if self.compressed then + label = label:decompress(input[1]:ncol()) + end + next_bp_err[1]:add(self.soutput, label, 1.0, -1.0) end diff --git a/matrix/cuda_helper.h b/matrix/cuda_helper.h index c0fa618..cedc643 100644 --- a/matrix/cuda_helper.h +++ b/matrix/cuda_helper.h @@ -23,7 +23,7 @@ #define CHECK_SAME_DIMENSION(a, b) \ do { \ if (!(a->nrow == b->nrow && a->ncol == b->ncol)) \ - nerv_error(L, "Matrices should be of the same dimension"); \ + nerv_error(L, "matrices should be of the same dimension"); \ } while (0) static const char *cublasGetErrorString(cublasStatus_t err) { diff --git a/matrix/cukernel.h b/matrix/cukernel.h index 178b7d3..7d2168e 100644 --- a/matrix/cukernel.h +++ b/matrix/cukernel.h @@ -13,4 +13,5 @@ void cudak_(cuda_fill)(Matrix *a, double val); void cudak_(cuda_expand_frm)(const Matrix *a, Matrix *b, int context); void cudak_(cuda_rearrange_frm)(const Matrix *a, Matrix *b, int step); void cudak_(cuda_scale_row)(const Matrix *a, Matrix *b); +void cudak_(cuda_decompress)(const Matrix *a, Matrix *b); #endif diff --git a/matrix/generic/cukernel.cu b/matrix/generic/cukernel.cu index 1d8b983..05a1e78 100644 --- a/matrix/generic/cukernel.cu +++ b/matrix/generic/cukernel.cu @@ -187,6 +187,15 @@ __global__ void cudak_(scale_row)(const MATRIX_ELEM *a, MATRIX_ELEM *b, b[j + i * stride] *= a[j]; } +__global__ void cudak_(decompress)(const 
MATRIX_ELEM *a, MATRIX_ELEM *b,
+                                   int nrow, int ncol,
+                                   int stride_a, int stride_b) {
+    int j = blockIdx.x * blockDim.x + threadIdx.x;
+    int i = blockIdx.y * blockDim.y + threadIdx.y;
+    if (i >= nrow || j >= ncol) return;
+    b[lrintf(a[j + i * stride_a]) + i * stride_b] = 1.0;
+}
+
 extern "C" {
 #include "../cukernel.h"
     void cudak_(cuda_log_elem)(const Matrix *a, Matrix *b) {
@@ -385,5 +394,15 @@ extern "C" {
             (MATRIX_ELEM_PTR(a), MATRIX_ELEM_PTR(b),
              b->nrow, b->ncol, b->stride / sizeof(MATRIX_ELEM));
     }
+
+    void cudak_(cuda_decompress)(const Matrix *a, Matrix *b) {
+        dim3 threadsPerBlock(1, CUDA_THREADS_NN);
+        dim3 numBlocks(1, CEIL_DIV(a->nrow, threadsPerBlock.y));
+        cudak_(decompress)<<<numBlocks, threadsPerBlock>>> \
+            (MATRIX_ELEM_PTR(a), MATRIX_ELEM_PTR(b),
+             a->nrow, a->ncol,
+             a->stride / sizeof(MATRIX_ELEM),
+             b->stride / sizeof(MATRIX_ELEM));
+    }
 }
 #endif
diff --git a/matrix/generic/cumatrix.c b/matrix/generic/cumatrix.c
index 0df1bd7..373fc42 100644
--- a/matrix/generic/cumatrix.c
+++ b/matrix/generic/cumatrix.c
@@ -74,7 +74,8 @@ static int nerv_matrix_(mul)(lua_State *L) {
     if (an != bm)
         nerv_error(L, "Wrong dimension of multipliers");
 /*    MATRIX_ELEM alpha = 1.0f, beta = 0.0f; */
-    CUBLAS_SAFE_CALL( //Because matrix in Nerv is row-major, here b comes first
+    /* Because matrix in Nerv is row-major, here b comes first */
+    CUBLAS_SAFE_CALL(
     NERV_CUBLAS_(gemm)(cublas_handle,
                 tb, ta,
                 bn, am, bm,
                 &alpha,
@@ -113,9 +114,11 @@ static int nerv_matrix_(sigmoid_grad)(lua_State *L) {
 static int nerv_matrix_(softmax)(lua_State *L) {
     Matrix *a = luaT_checkudata(L, 2, nerv_matrix_(tname));
     Matrix *b = luaT_checkudata(L, 1, nerv_matrix_(tname));
-    Matrix *max = nerv_matrix_(new_)(L, a->nrow, 1);
-    Matrix *dno = nerv_matrix_(new_)(L, a->nrow, 1);
+    Matrix *max;
+    Matrix *dno;
     CHECK_SAME_DIMENSION(a, b);
+    max = nerv_matrix_(new_)(L, a->nrow, 1);
+    dno = nerv_matrix_(new_)(L, a->nrow, 1);
     cudak_(cuda_rowmax)(a, max);
     cudak_(cuda_softmax_denominator)(a, max, dno);
     cudak_(cuda_softmax_final)(a, max, dno, b);
@@ -168,26 +171,22 @@ static int nerv_matrix_(fill)(lua_State *L) {
     return 0;
 }
 
-static int nerv_matrix_(copy_fromd)(lua_State *L) {
+static int nerv_matrix_(copy_fromd)(lua_State *L) {
     Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname));
     Matrix *b = luaT_checkudata(L, 2, nerv_matrix_(tname));
-    CHECK_SAME_DIMENSION(a, b);
-    CUDA_SAFE_SYNC_CALL(
-        cudaMemcpy2D(MATRIX_ELEM_PTR(a), a->stride,
-                    MATRIX_ELEM_PTR(b), b->stride,
-                    sizeof(MATRIX_ELEM) * b->ncol, b->nrow,
-                    cudaMemcpyDeviceToDevice));
-    return 0;
-}
-
-static int nerv_matrix_(copy_tod)(lua_State *L) {
-    Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname));
-    Matrix *b = luaT_checkudata(L, 2, nerv_matrix_(tname));
-    CHECK_SAME_DIMENSION(a, b);
+    int nargs = lua_gettop(L);
+    int b_begin = nargs > 2 ? luaL_checkinteger(L, 3) : 0;
+    int b_end = nargs > 3 ? luaL_checkinteger(L, 4) : b->nrow;
+    int a_begin = nargs > 4 ? 
luaL_checkinteger(L, 5) : 0; + if (!(0 <= b_begin && b_begin < b_end && b_end <= b->nrow && + a_begin + b_end - b_begin <= a->nrow)) + nerv_error(L, "invalid copy interval"); + if (a->ncol != b->ncol) + nerv_error(L, "matrices should be of the same dimension"); CUDA_SAFE_SYNC_CALL( - cudaMemcpy2D(MATRIX_ELEM_PTR(b), b->stride, - MATRIX_ELEM_PTR(a), a->stride, - sizeof(MATRIX_ELEM) * a->ncol, a->nrow, + cudaMemcpy2D(MATRIX_ROW_PTR(a, a_begin), a->stride, + MATRIX_ROW_PTR(b, b_begin), b->stride, + sizeof(MATRIX_ELEM) * b->ncol, b_end - b_begin, cudaMemcpyDeviceToDevice)); return 0; } @@ -196,11 +195,19 @@ extern const char *MATRIX_CUMATRIX_HOST_TNAME; static int nerv_matrix_(copy_fromh)(lua_State *L) { Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname)); Matrix *b = luaT_checkudata(L, 2, MATRIX_CUMATRIX_HOST_TNAME); - CHECK_SAME_DIMENSION(a, b); + int nargs = lua_gettop(L); + int b_begin = nargs > 2 ? luaL_checkinteger(L, 3) : 0; + int b_end = nargs > 3 ? luaL_checkinteger(L, 4) : b->nrow; + int a_begin = nargs > 4 ? luaL_checkinteger(L, 5) : 0; + if (!(0 <= b_begin && b_begin < b_end && b_end <= b->nrow && + a_begin + b_end - b_begin <= a->nrow)) + nerv_error(L, "invalid copy interval"); + if (a->ncol != b->ncol) + nerv_error(L, "matrices should be of the same dimension"); CUDA_SAFE_SYNC_CALL( - cudaMemcpy2D(MATRIX_ELEM_PTR(a), a->stride, - MATRIX_ELEM_PTR(b), b->stride, - sizeof(MATRIX_ELEM) * b->ncol, b->nrow, + cudaMemcpy2D(MATRIX_ROW_PTR(a, a_begin), a->stride, + MATRIX_ROW_PTR(b, b_begin), b->stride, + sizeof(MATRIX_ELEM) * b->ncol, b_end - b_begin, cudaMemcpyHostToDevice)); return 0; } @@ -208,11 +215,19 @@ static int nerv_matrix_(copy_fromh)(lua_State *L) { static int nerv_matrix_(copy_toh)(lua_State *L) { Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname)); Matrix *b = luaT_checkudata(L, 2, MATRIX_CUMATRIX_HOST_TNAME); - CHECK_SAME_DIMENSION(a, b); + int nargs = lua_gettop(L); + int a_begin = nargs > 2 ? luaL_checkinteger(L, 3) : 0; + int a_end = nargs > 3 ? luaL_checkinteger(L, 4) : a->nrow; + int b_begin = nargs > 4 ? 
luaL_checkinteger(L, 5) : 0; + if (!(0 <= a_begin && a_begin < a_end && a_end <= a->nrow && + b_begin + a_end - a_begin <= b->nrow)) + nerv_error(L, "invalid copy interval"); + if (b->ncol != a->ncol) + nerv_error(L, "matrices should be of the same dimension"); CUDA_SAFE_SYNC_CALL( - cudaMemcpy2D(MATRIX_ELEM_PTR(b), b->stride, - MATRIX_ELEM_PTR(a), a->stride, - sizeof(MATRIX_ELEM) * a->ncol, a->nrow, + cudaMemcpy2D(MATRIX_ROW_PTR(b, b_begin), b->stride, + MATRIX_ROW_PTR(a, a_begin), a->stride, + sizeof(MATRIX_ELEM) * a->ncol, a_end - a_begin, cudaMemcpyDeviceToHost)); return 0; } @@ -221,6 +236,7 @@ static int nerv_matrix_(trans)(lua_State *L) { Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname)); Matrix *b = nerv_matrix_(new_)(L, a->ncol, a->nrow); MATRIX_ELEM alpha = 1, beta = 0; + /* FIXME: possible memory leak when lua error is raised */ CUBLAS_SAFE_CALL( NERV_CUBLAS_(geam)(cublas_handle, CUBLAS_OP_T, CUBLAS_OP_T, a->nrow, a->ncol, @@ -251,6 +267,19 @@ static int nerv_matrix_(log_elem)(lua_State *L) { return 0; } +static int nerv_matrix_(decompress)(lua_State *L) { + Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname)); + Matrix *b; + int orig_col = luaL_checkinteger(L, 2); + if (a->ncol != 1) + nerv_error(L, "the compressed matrix must be a column vector"); + b = nerv_matrix_(new_)(L, a->nrow, orig_col); + cudak_(cuda_fill)(b, 0.0); + cudak_(cuda_decompress)(a, b); + luaT_pushudata(L, b, nerv_matrix_(tname)); + return 1; +} + extern const char *nerv_matrix_host_int_tname; static int nerv_matrix_(copy_rows_fromh_by_idx)(lua_State *L) { Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname)); @@ -322,11 +351,11 @@ static const luaL_Reg nerv_matrix_(extra_methods)[] = { {"rowsum", nerv_matrix_(rowsum)}, {"rowmax", nerv_matrix_(rowmax)}, {"trans", nerv_matrix_(trans)}, + {"decompress", nerv_matrix_(decompress)}, /* in-place calc */ {"copy_fromh", nerv_matrix_(copy_fromh)}, {"copy_fromd", nerv_matrix_(copy_fromd)}, {"copy_toh", nerv_matrix_(copy_toh)}, - {"copy_tod", nerv_matrix_(copy_tod)}, {"add", nerv_matrix_(add)}, {"mul", nerv_matrix_(mul)}, {"add_row", nerv_matrix_(add_row)}, diff --git a/matrix/generic/mmatrix.c b/matrix/generic/mmatrix.c index 3a9ae79..4b722f3 100644 --- a/matrix/generic/mmatrix.c +++ b/matrix/generic/mmatrix.c @@ -11,6 +11,7 @@ #define NERV_GENERIC_MATRIX #include "../../common.h" #include "../../io/chunk_file.h" +#include "string.h" static void host_matrix_(alloc)(lua_State *L, MATRIX_ELEM **dptr, size_t *stride, @@ -96,10 +97,27 @@ int nerv_matrix_(save)(lua_State *L) { return 0; } - +static int nerv_matrix_(copy_from)(lua_State *L) { + Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname)); + Matrix *b = luaT_checkudata(L, 2, nerv_matrix_(tname)); + int nargs = lua_gettop(L); + int b_begin = nargs > 2 ? luaL_checkinteger(L, 3) : 0; + int b_end = nargs > 3 ? luaL_checkinteger(L, 4) : b->nrow; + int a_begin = nargs > 4 ? 
luaL_checkinteger(L, 5) : 0; + if (!(0 <= b_begin && b_begin < b_end && b_end <= b->nrow && + a_begin + b_end - b_begin <= a->nrow)) + nerv_error(L, "invalid copy interval"); + if (a->ncol != b->ncol) + nerv_error(L, "matrices should be of the same dimension"); + memmove(MATRIX_ROW_PTR(a, a_begin), + MATRIX_ROW_PTR(b, b_begin), + sizeof(MATRIX_ELEM) * b->ncol * (b_end - b_begin)); + return 0; +} static const luaL_Reg nerv_matrix_(extra_methods)[] = { {"load", nerv_matrix_(load)}, {"save", nerv_matrix_(save)}, + {"copy_from", nerv_matrix_(copy_from)}, {NULL, NULL} }; diff --git a/matrix/init.lua b/matrix/init.lua index f309f81..9637391 100644 --- a/matrix/init.lua +++ b/matrix/init.lua @@ -22,7 +22,8 @@ function nerv.Matrix:__tostring__() table.insert(strt, "\n") end end - table.insert(strt, string.format("[Matrix %d x %d]", nrow, ncol)) + table.insert(strt, string.format( + "[%s %d x %d]", self.__typename, nrow, ncol)) return table.concat(strt) end diff --git a/nn/layer_dag.lua b/nn/layer_dag.lua index 1ab18fa..4ee829e 100644 --- a/nn/layer_dag.lua +++ b/nn/layer_dag.lua @@ -44,6 +44,7 @@ function nerv.DAGLayer:__init(id, global_conf, layer_conf) local outputs = {} local dim_in = layer_conf.dim_in local dim_out = layer_conf.dim_out + local parsed_conn = {} for from, to in pairs(layer_conf.connections) do local id_from, port_from = parse_id(from) local id_to, port_to = parse_id(to) @@ -76,32 +77,18 @@ function nerv.DAGLayer:__init(id, global_conf, layer_conf) if output_dim[port_from] ~= input_dim[port_to] then nerv.error("mismatching data dimension between %s and %s", from, to) end - local mid = global_conf.mat_type(global_conf.batch_size, - output_dim[port_from]) - local err_mid = mid:create() - - ref_from.outputs[port_from] = mid - ref_to.inputs[port_to] = mid - - ref_from.err_inputs[port_from] = err_mid - ref_to.err_outputs[port_to] = err_mid + table.insert(parsed_conn, + {{ref_from, port_from}, {ref_to, port_to}}) table.insert(ref_from.next_layers, ref_to) -- add edge ref_to.in_deg = ref_to.in_deg + 1 -- increase the in-degree of the target layer end end - self.layers = layers - self.inputs = inputs - self.outputs = outputs - self.dim_in = dim_in - self.dim_out = dim_out -end -function nerv.DAGLayer:init(id) -- topology sort local queue = {} local l = 1 local r = 1 - for id, ref in pairs(self.layers) do + for id, ref in pairs(layers) do if ref.in_deg == 0 then table.insert(queue, ref) nerv.utils.printf("adding source layer: %s\n", id) @@ -126,20 +113,50 @@ function nerv.DAGLayer:init(id) -- topology sort for i = 1, #queue do nerv.utils.printf("queued layer: %s\n", queue[i].layer.id) end - self.queue = queue - for id, ref in pairs(self.layers) do + + for id, ref in pairs(layers) do -- check wether the graph is connected if ref.visited == false then nerv.utils.printf("warning: layer %s is ignored\n", id) end + end + + self.layers = layers + self.inputs = inputs + self.outputs = outputs + self.dim_in = dim_in + self.dim_out = dim_out + self.parsed_conn = parsed_conn + self.queue = queue + self.gconf = global_conf +end + +function nerv.DAGLayer:init(batch_size) -- topology sort + for i, conn in ipairs(self.parsed_conn) do + local _, output_dim + local ref_from, port_from, ref_to, port_to + ref_from, port_from = unpack(conn[1]) + ref_to, port_to = unpack(conn[2]) + _, output_dim = ref_from.layer:get_dim() + local mid = self.gconf.cumat_type(batch_size, + output_dim[port_from]) + local err_mid = mid:create() + + ref_from.outputs[port_from] = mid + ref_to.inputs[port_to] = mid + + 
ref_from.err_inputs[port_from] = err_mid + ref_to.err_outputs[port_to] = err_mid + end + for id, ref in pairs(self.layers) do for i = 1, ref.input_len do if ref.inputs[i] == nil then - nerv.error("dangling port %d of layer %s", i, id) + nerv.error("dangling input port %d of layer %s", i, id) end end for i = 1, ref.output_len do if ref.outputs[i] == nil then - nerv.error("dangling port %d of layer %s", i, id) + nerv.error("dangling output port %d of layer %s", i, id) end end -- initialize sub layers diff --git a/speech b/speech index 0c6ca6a..a753eca 160000 --- a/speech +++ b/speech @@ -1 +1 @@ -Subproject commit 0c6ca6a17f06821cd5d612f489ca6cb68c2c4d5b +Subproject commit a753eca0121ac3ec81ed76bd719d3f1cb9522680 -- cgit v1.2.3
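
For orientation, here is how the pieces introduced by this commit fit together at the call site: a data reader is wrapped in a nerv.SGDBuffer, the buffer hands out batch_size-row CuMatrix minibatches keyed by data id, and the one-column class-index labels stay "compressed" until nerv.SoftmaxCELayer expands them through the new decompress method. The Lua below is an illustrative sketch condensed from examples/test_nn_lib.lua, not part of the patch: my_reader is a hypothetical nerv.DataReader subclass (anything whose get_data() returns {feat = <429-column mmatrix>, label = <1-column mmatrix>}, or nil at end of stream), the data ids feat/label are invented for the example, and main stands for the "main" DAGLayer built as in the example script.

-- illustrative sketch only (assumes the hypothetical reader `my_reader`
-- and the DAGLayer `main` described above)
gconf = {lrate = 0.8, wcost = 1e-6, momentum = 0.9,
         cumat_type = nerv.CuMatrixFloat,
         mmat_type = nerv.MMatrixFloat,
         batch_size = 256}

buffer = nerv.SGDBuffer(gconf,
    {
        buffer_size = 8192,                  -- rounded down to a multiple of batch_size
        readers = {
            {reader = my_reader,             -- hypothetical nerv.DataReader instance
             data = {feat = 429, label = 1}} -- data id -> number of columns
        }
    })

main:init(gconf.batch_size)                 -- DAGLayer:init(batch_size) now allocates the
                                            -- inter-layer matrices at this batch size
for data in buffer.get_data, buffer do      -- each field of `data` is a batch_size x width CuMatrix
    local input = {data.feat, data.label}   -- label is compressed: one class index per row
    local output = {}
    local err_input = {}
    local err_output = {input[1]:create()}
    main:propagate(input, output)           -- SoftmaxCELayer decompresses the label internally
    main:back_propagate(err_output, err_input, input, output)
    main:update(err_input, input, output)
end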