From 1a9f63e351582f54fec7817927168cb1dbb0c1d6 Mon Sep 17 00:00:00 2001 From: Determinant Date: Fri, 28 Aug 2015 13:21:52 +0800 Subject: support gpu buffering --- nerv/examples/swb_baseline.lua | 3 ++- nerv/io/sgd_buffer.lua | 34 +++++++++++++++++++-------- nerv/lib/matrix/cukernel.h | 2 ++ nerv/lib/matrix/cumatrix.c | 1 + nerv/lib/matrix/cumatrix.h | 1 + nerv/lib/matrix/generic/cukernel.cu | 20 ++++++++++++++++ nerv/lib/matrix/generic/cumatrix.c | 19 +++++++++++++-- nerv/lib/matrix/generic/cumatrix.h | 2 ++ nerv/lib/matrix/mmatrix.c | 37 ++++++++++++++--------------- nerv/lib/matrix/mmatrix.h | 3 ++- nerv/matrix/generic/cumatrix.c | 22 ++++++++++++++++-- nerv/matrix/mmatrix.c | 46 +++++++++++++++++++------------------ 12 files changed, 135 insertions(+), 55 deletions(-) diff --git a/nerv/examples/swb_baseline.lua b/nerv/examples/swb_baseline.lua index bbc6467..8015884 100644 --- a/nerv/examples/swb_baseline.lua +++ b/nerv/examples/swb_baseline.lua @@ -173,7 +173,8 @@ function make_buffer(readers) { buffer_size = gconf.buffer_size, randomize = gconf.randomize, - readers = readers + readers = readers, + use_gpu = true }) end diff --git a/nerv/io/sgd_buffer.lua b/nerv/io/sgd_buffer.lua index f9d281c..3f854f0 100644 --- a/nerv/io/sgd_buffer.lua +++ b/nerv/io/sgd_buffer.lua @@ -8,13 +8,29 @@ function SGDBuffer:__init(global_conf, buffer_conf) if self.randomize == nil then self.randomize = false end + local cumat_type = global_conf.cumat_type + if buffer_conf.use_gpu then + self.mat_type = cumat_type + self.copy_rows_from_by_idx = cumat_type.copy_rows_fromd_by_idx + self.copy_from = cumat_type.copy_fromd + self.copy_from_reader = cumat_type.copy_fromh + self.perm_gen = function (x) + return cumat_type.new_from_host(nerv.MMatrixFloat.perm_gen(x)) + end + else + self.mat_type = global_conf.mmat_type + self.copy_rows_from_by_idx = cumat_type.copy_rows_fromh_by_idx + self.copy_from = cumat_type.copy_fromh + self.perm_gen = nerv.MMatrixFloat.perm_gen + self.copy_from_reader = self.mat_type.copy_from + end self.head = 0 self.tail = 0 self.readers = {} for i, reader_spec in ipairs(buffer_conf.readers) do local buffs = {} for id, width in pairs(reader_spec.data) do - buffs[id] = {data = global_conf.mmat_type(self.buffer_size, width), + buffs[id] = {data = self.mat_type(self.buffer_size, width), leftover = nil, width = width} end @@ -41,7 +57,7 @@ function SGDBuffer:saturate() buff.data:copy_from(buff.leftover, 0, lrow) buff.leftover = nil end - nerv.printf("buffer leftover: %d\n", lrow) + nerv.info("buffer leftover: %d\n", lrow) reader.tail = lrow reader.has_leftover = false end @@ -65,21 +81,21 @@ function SGDBuffer:saturate() if d == nil then nerv.error("reader does not provide data for %s", id) end - buff.leftover = self.gconf.mmat_type(drow - remain, - buff.width) - buff.leftover:copy_from(d, remain, drow) + buff.leftover = self.mat_type(drow - remain, + buff.width) + self.copy_from_reader(buff.leftover, d, remain, drow) end drow = remain reader.has_leftover = true end for id, buff in pairs(reader.buffs) do - buff.data:copy_from(data[id], 0, drow, reader.tail) + self.copy_from_reader(buff.data, data[id], 0, drow, reader.tail) end reader.tail = reader.tail + drow end self.tail = math.min(self.tail, reader.tail) end - self.rand_map = nerv.MMatrixInt.perm_gen(self.tail) -- generate shuffled index + self.rand_map = self.perm_gen(self.tail) -- generate shuffled index collectgarbage("collect") return self.tail >= self.gconf.batch_size end @@ -101,9 +117,9 @@ function SGDBuffer:get_data() for id, buff in pairs(reader.buffs) do local batch = self.gconf.cumat_type(batch_size, buff.width) if self.randomize then - batch:copy_rows_fromh_by_idx(buff.data, self.rand_map, self.head) + self.copy_rows_from_by_idx(batch, buff.data, self.rand_map, self.head) else - batch:copy_fromh(buff.data, self.head, self.head + batch_size) + self.copy_from(batch, buff.data, self.head, self.head + batch_size) end res[id] = batch end diff --git a/nerv/lib/matrix/cukernel.h b/nerv/lib/matrix/cukernel.h index 7bb4c2c..2126c6f 100644 --- a/nerv/lib/matrix/cukernel.h +++ b/nerv/lib/matrix/cukernel.h @@ -18,4 +18,6 @@ void cudak_(cuda_rearrange_frm)(const Matrix *a, Matrix *b, int step); void cudak_(cuda_scale_rows_by_row)(const Matrix *a, Matrix *b); void cudak_(cuda_scale_rows_by_col)(const Matrix *a, Matrix *b); void cudak_(cuda_decompress)(const Matrix *a, Matrix *b); +void cudak_(cuda_copy_rows_by_idx)(const Matrix *a, Matrix *b, + const Matrix *idx, int b_begin); #endif diff --git a/nerv/lib/matrix/cumatrix.c b/nerv/lib/matrix/cumatrix.c index aa81bfc..ff1168d 100644 --- a/nerv/lib/matrix/cumatrix.c +++ b/nerv/lib/matrix/cumatrix.c @@ -49,6 +49,7 @@ void nerv_cumatrix_init() { #define NERV_CUBLAS_(NAME) cublasS##NAME #define MATRIX_CUMATRIX_HOST_TNAME nerv_matrix_host_float_tname #include "generic/cumatrix.c" + #undef NERV_CUBLAS_ #undef cudak_ #undef nerv_matrix_ diff --git a/nerv/lib/matrix/cumatrix.h b/nerv/lib/matrix/cumatrix.h index 9f71507..e6def66 100644 --- a/nerv/lib/matrix/cumatrix.h +++ b/nerv/lib/matrix/cumatrix.h @@ -1,5 +1,6 @@ #ifndef NERV_CUMATRIX_H #define NERV_CUMATRIX_H +#include "matrix.h" void nerv_cumatrix_print_profile(); void nerv_cumatrix_clear_profile(); void nerv_cumatrix_init(); diff --git a/nerv/lib/matrix/generic/cukernel.cu b/nerv/lib/matrix/generic/cukernel.cu index e337798..08feb59 100644 --- a/nerv/lib/matrix/generic/cukernel.cu +++ b/nerv/lib/matrix/generic/cukernel.cu @@ -284,6 +284,15 @@ __global__ void cudak_(gen_col_idx)(MATRIX_ELEM *b, b[j + i * stride] = j; } +__global__ void cudak_(copy_rows_by_idx)(const MATRIX_ELEM *a, MATRIX_ELEM *b, + const MATRIX_ELEM *idx, int b_begin, + int nrow, int ncol, int stride) { + int j = blockIdx.x * blockDim.x + threadIdx.x; + int i = blockIdx.y * blockDim.y + threadIdx.y; + if (i >= nrow || j >= ncol) return; + b[j + i * stride] = a[j + lrintf(idx[i]) * stride]; +} + extern "C" { #include "../cukernel.h" void cudak_(cuda_log_elem)(const Matrix *a, Matrix *b) { @@ -589,5 +598,16 @@ extern "C" { b->stride / sizeof(MATRIX_ELEM)); cudaStreamSynchronize(0); } + + void cudak_(cuda_copy_rows_by_idx)(const Matrix *a, Matrix *b, + const Matrix *idx, int b_begin) { + dim3 threadsPerBlock(CUDA_THREADS_NN, 1); + dim3 numBlocks(CEIL_DIV(b->ncol, threadsPerBlock.x), b->nrow); + cudak_(copy_rows_by_idx)<<>> \ + (MATRIX_ELEM_PTR(a), MATRIX_ELEM_PTR(b), + MATRIX_ELEM_PTR(idx) + b_begin, + b_begin, b->nrow, b->ncol, b->stride / sizeof(MATRIX_ELEM)); + cudaStreamSynchronize(0); + } } #endif diff --git a/nerv/lib/matrix/generic/cumatrix.c b/nerv/lib/matrix/generic/cumatrix.c index 2cb3563..770e503 100644 --- a/nerv/lib/matrix/generic/cumatrix.c +++ b/nerv/lib/matrix/generic/cumatrix.c @@ -315,7 +315,7 @@ void nerv_matrix_(copy_rows_fromh_by_idx)(Matrix *a, const Matrix *b, long nrow = a->nrow; if (!(0 <= b_begin && b_begin + nrow <= idx->ncol)) NERV_EXIT_STATUS(status, MAT_INVALID_COPY_INTERVAL, 0); - long *idx_ptr = idx->data.i; + float *idx_ptr = idx->data.f; int i; if (idx->nrow != 1) NERV_EXIT_STATUS(status, MAT_IDX_VECTOR_EXP, 0); @@ -325,7 +325,7 @@ void nerv_matrix_(copy_rows_fromh_by_idx)(Matrix *a, const Matrix *b, cudaStream_t *streams = (cudaStream_t*)malloc(sizeof(cudaStream_t) * nrow); for (i = 0; i < nrow; i++) { - int src_row = idx_ptr[b_begin + i]; + int src_row = (int)idx_ptr[b_begin + i]; if (!(0 <= src_row && src_row < b->nrow)) NERV_EXIT_STATUS(status, MAT_INVALID_IDX, 0); CUDA_SAFE_CALL(cudaStreamCreate(streams + i), status); @@ -344,6 +344,21 @@ void nerv_matrix_(copy_rows_fromh_by_idx)(Matrix *a, const Matrix *b, NERV_SET_STATUS(status, NERV_NORMAL, 0); } +void nerv_matrix_(copy_rows_fromd_by_idx)(Matrix *a, const Matrix *b, + const Matrix *idx, int b_begin, Status *status) { + long nrow = a->nrow; + if (!(0 <= b_begin && b_begin + nrow <= idx->ncol)) + NERV_EXIT_STATUS(status, MAT_INVALID_COPY_INTERVAL, 0); + if (idx->nrow != 1) + NERV_EXIT_STATUS(status, MAT_IDX_VECTOR_EXP, 0); + if (a->ncol != b->ncol) + NERV_EXIT_STATUS(status, MAT_MISMATCH_DIM, 0); + PROFILE_START + cudak_(cuda_copy_rows_by_idx)(b, a, idx, b_begin); + PROFILE_STOP + NERV_SET_STATUS(status, NERV_NORMAL, 0); +} + void nerv_matrix_(expand_frm)(Matrix *a, const Matrix *b, int context, Status *status) { if (a->nrow != b->nrow) diff --git a/nerv/lib/matrix/generic/cumatrix.h b/nerv/lib/matrix/generic/cumatrix.h index 3f1f8a3..04e8c5a 100644 --- a/nerv/lib/matrix/generic/cumatrix.h +++ b/nerv/lib/matrix/generic/cumatrix.h @@ -40,6 +40,8 @@ void nerv_matrix_(log_elem)(Matrix *b, const Matrix *a, Status *status); Matrix *nerv_matrix_(decompress)(const Matrix *a, int orig_col, Status *status); void nerv_matrix_(copy_rows_fromh_by_idx)(Matrix *a, const Matrix *b, const Matrix *idx, int b_begin, Status *status); +void nerv_matrix_(copy_rows_fromd_by_idx)(Matrix *a, const Matrix *b, + const Matrix *idx, int b_begin, Status *status); void nerv_matrix_(expand_frm)(Matrix *a, const Matrix *b, int context, Status *status); diff --git a/nerv/lib/matrix/mmatrix.c b/nerv/lib/matrix/mmatrix.c index 94f1ea8..b8157eb 100644 --- a/nerv/lib/matrix/mmatrix.c +++ b/nerv/lib/matrix/mmatrix.c @@ -7,6 +7,25 @@ #define nerv_matrix_(NAME) nerv_matrix_host_float_##NAME #include "generic/matrix.h" #include "generic/mmatrix.c" + +Matrix *nerv_matrix_(perm_gen)(int ncol, Status *status) { + int i; + Matrix *self = nerv_matrix_(create)(1, ncol, status); + if (status->err_code != NERV_NORMAL) + return NULL; + float *prow = self->data.f; + for (i = 0; i < ncol; i++) + prow[i] = i; + for (i = ncol - 1; i >= 0; i--) + { + size_t j = rand() % (i + 1); + float tmp = prow[i]; + prow[i] = prow[j]; + prow[j] = tmp; + } + return self; +} + #undef nerv_matrix_ #undef host_matrix_ #undef MATRIX_USE_FLOAT @@ -33,21 +52,3 @@ #define host_matrix_(NAME) host_matrix_int_##NAME #define nerv_matrix_(NAME) nerv_matrix_host_int_##NAME #include "generic/mmatrix.c" - -Matrix *nerv_matrix_(perm_gen)(int ncol, Status *status) { - int i; - Matrix *self = nerv_matrix_(create)(1, ncol, status); - if (status->err_code != NERV_NORMAL) - return NULL; - long *prow = self->data.i; - for (i = 0; i < ncol; i++) - prow[i] = i; - for (i = ncol - 1; i >= 0; i--) - { - size_t j = rand() % (i + 1); - long tmp = prow[i]; - prow[i] = prow[j]; - prow[j] = tmp; - } - return self; -} diff --git a/nerv/lib/matrix/mmatrix.h b/nerv/lib/matrix/mmatrix.h index df91e4c..31e7984 100644 --- a/nerv/lib/matrix/mmatrix.h +++ b/nerv/lib/matrix/mmatrix.h @@ -1,4 +1,5 @@ #ifndef NERV_MMATRIX_H #define NERV_MMATRIX_H -Matrix *nerv_matrix_(perm_gen)(int ncol, Status *status); +#include "matrix.h" +Matrix *nerv_matrix_host_float_perm_gen(int ncol, Status *status); #endif diff --git a/nerv/matrix/generic/cumatrix.c b/nerv/matrix/generic/cumatrix.c index ab7f7c4..08cb4c2 100644 --- a/nerv/matrix/generic/cumatrix.c +++ b/nerv/matrix/generic/cumatrix.c @@ -228,12 +228,12 @@ static int nerv_matrix_(lua_decompress)(lua_State *L) { return 1; } -extern const char *nerv_matrix_host_int_tname; +extern const char *nerv_matrix_host_float_tname; static int nerv_matrix_(lua_copy_rows_fromh_by_idx)(lua_State *L) { Status status; Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname)); const Matrix *b = luaT_checkudata(L, 2, MATRIX_CUMATRIX_HOST_TNAME); - const Matrix *idx = luaT_checkudata(L, 3, nerv_matrix_host_int_tname); + const Matrix *idx = luaT_checkudata(L, 3, nerv_matrix_host_float_tname); long nrow = a->nrow; int b_begin = lua_gettop(L) > 3 ? luaL_checkinteger(L, 4) : 0; nerv_matrix_(copy_rows_fromh_by_idx)(a, b, idx, b_begin, &status); @@ -241,6 +241,18 @@ static int nerv_matrix_(lua_copy_rows_fromh_by_idx)(lua_State *L) { return 0; } +static int nerv_matrix_(lua_copy_rows_fromd_by_idx)(lua_State *L) { + Status status; + Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname)); + const Matrix *b = luaT_checkudata(L, 2, nerv_matrix_(tname)); + const Matrix *idx = luaT_checkudata(L, 3, nerv_matrix_(tname)); + long nrow = a->nrow; + int b_begin = lua_gettop(L) > 3 ? luaL_checkinteger(L, 4) : 0; + nerv_matrix_(copy_rows_fromd_by_idx)(a, b, idx, b_begin, &status); + NERV_LUA_CHECK_STATUS(L, status); + return 0; +} + static int nerv_matrix_(lua_expand_frm)(lua_State *L) { Status status; Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname)); @@ -290,6 +302,8 @@ static const luaL_Reg nerv_matrix_(extra_methods)[] = { /* in-place calc */ {"copy_fromh", nerv_matrix_(lua_copy_fromh)}, {"copy_fromd", nerv_matrix_(lua_copy_fromd)}, + /* alias for copy_fromd */ + {"copy_from", nerv_matrix_(lua_copy_fromd)}, {"copy_toh", nerv_matrix_(lua_copy_toh)}, {"add", nerv_matrix_(lua_add)}, {"mul", nerv_matrix_(lua_mul)}, @@ -302,6 +316,7 @@ static const luaL_Reg nerv_matrix_(extra_methods)[] = { {"mul_elem", nerv_matrix_(lua_mul_elem)}, {"log_elem", nerv_matrix_(lua_log_elem)}, {"copy_rows_fromh_by_idx", nerv_matrix_(lua_copy_rows_fromh_by_idx)}, + {"copy_rows_fromd_by_idx", nerv_matrix_(lua_copy_rows_fromd_by_idx)}, {"expand_frm", nerv_matrix_(lua_expand_frm)}, {"rearrange_frm", nerv_matrix_(lua_rearrange_frm)}, {"scale_rows_by_row", nerv_matrix_(lua_scale_rows_by_row)}, @@ -311,6 +326,9 @@ static const luaL_Reg nerv_matrix_(extra_methods)[] = { static void cuda_matrix_(init)(lua_State *L) { luaN_append_methods(L, nerv_matrix_(extra_methods)); +#ifdef CUMATRIX_INIT + CUMATRIX_INIT(L); +#endif } int nerv_matrix_(lua_get_elem)(lua_State *L) { diff --git a/nerv/matrix/mmatrix.c b/nerv/matrix/mmatrix.c index 5561572..961059c 100644 --- a/nerv/matrix/mmatrix.c +++ b/nerv/matrix/mmatrix.c @@ -16,7 +16,30 @@ void nerv_lua_mmatrix_init(lua_State *L) { #define host_matrix_(NAME) host_matrix_float_##NAME #define nerv_matrix_(NAME) nerv_matrix_host_float_##NAME const char *nerv_matrix_(tname) = "nerv.MMatrixFloat"; +#define MMATRIX_INIT(L) host_matrix_(init_extra)(L) + +static const luaL_Reg nerv_matrix_(extra_methods_int)[]; +static void host_matrix_(init_extra)(lua_State *L) { + luaN_append_methods(L, nerv_matrix_(extra_methods_int)); +} + #include "generic/mmatrix.c" +#include "../lib/matrix/mmatrix.h" + +static int nerv_matrix_(lua_perm_gen)(lua_State *L) { + Status status; + int i, ncol = luaL_checkinteger(L, 1); + Matrix *self = nerv_matrix_(perm_gen)(ncol, &status); + NERV_LUA_CHECK_STATUS(L, status); + luaT_pushudata(L, self, nerv_matrix_(tname)); + return 1; +} + +static const luaL_Reg nerv_matrix_(extra_methods_int)[] = { + {"perm_gen", nerv_matrix_(lua_perm_gen)}, + {NULL, NULL} +}; + #undef nerv_matrix_ #undef host_matrix_ #undef MATRIX_USE_FLOAT @@ -24,6 +47,7 @@ const char *nerv_matrix_(tname) = "nerv.MMatrixFloat"; #undef MATRIX_ELEM_PTR #undef MATRIX_ELEM_FMT #undef MATRIX_ELEM_WRITE_FMT +#undef MMATRIX_INIT #define NERV_GENERIC_MMATRIX #define MATRIX_USE_DOUBLE @@ -44,26 +68,4 @@ const char *nerv_matrix_(tname) = "nerv.MMatrixDouble"; #define host_matrix_(NAME) host_matrix_int_##NAME #define nerv_matrix_(NAME) nerv_matrix_host_int_##NAME const char *nerv_matrix_(tname) = "nerv.MMatrixInt"; -#define MMATRIX_INIT(L) host_matrix_(init_extra)(L) - -static const luaL_Reg nerv_matrix_(extra_methods_int)[]; -static void host_matrix_(init_extra)(lua_State *L) { - luaN_append_methods(L, nerv_matrix_(extra_methods_int)); -} - #include "generic/mmatrix.c" -#include "../lib/matrix/mmatrix.h" - -static int nerv_matrix_(lua_perm_gen)(lua_State *L) { - Status status; - int i, ncol = luaL_checkinteger(L, 1); - Matrix *self = nerv_matrix_(perm_gen)(ncol, &status); - NERV_LUA_CHECK_STATUS(L, status); - luaT_pushudata(L, self, nerv_matrix_(tname)); - return 1; -} - -static const luaL_Reg nerv_matrix_(extra_methods_int)[] = { - {"perm_gen", nerv_matrix_(lua_perm_gen)}, - {NULL, NULL} -}; -- cgit v1.2.3-70-g09d2