From ea6f2990f99dd9ded6a0e74d75a3ec84900a2518 Mon Sep 17 00:00:00 2001 From: Determinant Date: Wed, 3 Jun 2015 23:00:45 +0800 Subject: demo now works (without random shuffle) --- matrix/cuda_helper.h | 2 +- matrix/cukernel.h | 1 + matrix/generic/cukernel.cu | 19 ++++++++++ matrix/generic/cumatrix.c | 87 ++++++++++++++++++++++++++++++---------------- matrix/generic/mmatrix.c | 20 ++++++++++- matrix/init.lua | 3 +- 6 files changed, 100 insertions(+), 32 deletions(-) (limited to 'matrix') diff --git a/matrix/cuda_helper.h b/matrix/cuda_helper.h index c0fa618..cedc643 100644 --- a/matrix/cuda_helper.h +++ b/matrix/cuda_helper.h @@ -23,7 +23,7 @@ #define CHECK_SAME_DIMENSION(a, b) \ do { \ if (!(a->nrow == b->nrow && a->ncol == b->ncol)) \ - nerv_error(L, "Matrices should be of the same dimension"); \ + nerv_error(L, "matrices should be of the same dimension"); \ } while (0) static const char *cublasGetErrorString(cublasStatus_t err) { diff --git a/matrix/cukernel.h b/matrix/cukernel.h index 178b7d3..7d2168e 100644 --- a/matrix/cukernel.h +++ b/matrix/cukernel.h @@ -13,4 +13,5 @@ void cudak_(cuda_fill)(Matrix *a, double val); void cudak_(cuda_expand_frm)(const Matrix *a, Matrix *b, int context); void cudak_(cuda_rearrange_frm)(const Matrix *a, Matrix *b, int step); void cudak_(cuda_scale_row)(const Matrix *a, Matrix *b); +void cudak_(cuda_decompress)(const Matrix *a, Matrix *b); #endif diff --git a/matrix/generic/cukernel.cu b/matrix/generic/cukernel.cu index 1d8b983..05a1e78 100644 --- a/matrix/generic/cukernel.cu +++ b/matrix/generic/cukernel.cu @@ -187,6 +187,15 @@ __global__ void cudak_(scale_row)(const MATRIX_ELEM *a, MATRIX_ELEM *b, b[j + i * stride] *= a[j]; } +__global__ void cudak_(decompress)(const MATRIX_ELEM *a, MATRIX_ELEM *b, + int nrow, int ncol, + int stride_a, int stride_b) { + int j = blockIdx.x * blockDim.x + threadIdx.x; + int i = blockIdx.y * blockDim.y + threadIdx.y; + if (i >= nrow || j >= ncol) return; + b[lrintf(a[j + i * stride_a]) + i * 
stride_b] = 1.0; +} + extern "C" { #include "../cukernel.h" void cudak_(cuda_log_elem)(const Matrix *a, Matrix *b) { @@ -385,5 +394,15 @@ extern "C" { (MATRIX_ELEM_PTR(a), MATRIX_ELEM_PTR(b), b->nrow, b->ncol, b->stride / sizeof(MATRIX_ELEM)); } + + void cudak_(cuda_decompress)(const Matrix *a, Matrix *b) { + dim3 threadsPerBlock(1, CUDA_THREADS_NN); + dim3 numBlocks(1, CEIL_DIV(a->nrow, threadsPerBlock.y)); + cudak_(decompress)<<<numBlocks, threadsPerBlock>>> \ + (MATRIX_ELEM_PTR(a), MATRIX_ELEM_PTR(b), + a->nrow, a->ncol, + a->stride / sizeof(MATRIX_ELEM), + b->stride / sizeof(MATRIX_ELEM)); + } } #endif diff --git a/matrix/generic/cumatrix.c b/matrix/generic/cumatrix.c index 0df1bd7..373fc42 100644 --- a/matrix/generic/cumatrix.c +++ b/matrix/generic/cumatrix.c @@ -74,7 +74,8 @@ static int nerv_matrix_(mul)(lua_State *L) { if (an != bm) nerv_error(L, "Wrong dimension of multipliers"); /* MATRIX_ELEM alpha = 1.0f, beta = 0.0f; */ - CUBLAS_SAFE_CALL( //Because matrix in Nerv is row-major, here b comes first + /* Because matrix in Nerv is row-major, here b comes first */ + CUBLAS_SAFE_CALL( NERV_CUBLAS_(gemm)(cublas_handle, tb, ta, bn, am, bm, &alpha, @@ -113,9 +114,11 @@ static int nerv_matrix_(sigmoid_grad)(lua_State *L) { static int nerv_matrix_(softmax)(lua_State *L) { Matrix *a = luaT_checkudata(L, 2, nerv_matrix_(tname)); Matrix *b = luaT_checkudata(L, 1, nerv_matrix_(tname)); - Matrix *max = nerv_matrix_(new_)(L, a->nrow, 1); - Matrix *dno = nerv_matrix_(new_)(L, a->nrow, 1); + Matrix *max; + Matrix *dno; CHECK_SAME_DIMENSION(a, b); + max = nerv_matrix_(new_)(L, a->nrow, 1); + dno = nerv_matrix_(new_)(L, a->nrow, 1); cudak_(cuda_rowmax)(a, max); cudak_(cuda_softmax_denominator)(a, max, dno); cudak_(cuda_softmax_final)(a, max, dno, b); @@ -168,26 +171,22 @@ static int nerv_matrix_(fill)(lua_State *L) { return 0; } -static int nerv_matrix_(copy_fromd)(lua_State *L) { +static int nerv_matrix_(copy_fromd)(lua_State *L) { Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname)); Matrix *b = 
luaT_checkudata(L, 2, nerv_matrix_(tname)); - CHECK_SAME_DIMENSION(a, b); - CUDA_SAFE_SYNC_CALL( - cudaMemcpy2D(MATRIX_ELEM_PTR(a), a->stride, - MATRIX_ELEM_PTR(b), b->stride, - sizeof(MATRIX_ELEM) * b->ncol, b->nrow, - cudaMemcpyDeviceToDevice)); - return 0; -} - -static int nerv_matrix_(copy_tod)(lua_State *L) { - Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname)); - Matrix *b = luaT_checkudata(L, 2, nerv_matrix_(tname)); - CHECK_SAME_DIMENSION(a, b); + int nargs = lua_gettop(L); + int b_begin = nargs > 2 ? luaL_checkinteger(L, 3) : 0; + int b_end = nargs > 3 ? luaL_checkinteger(L, 4) : b->nrow; + int a_begin = nargs > 4 ? luaL_checkinteger(L, 5) : 0; + if (!(0 <= b_begin && b_begin < b_end && b_end <= b->nrow && + a_begin + b_end - b_begin <= a->nrow)) + nerv_error(L, "invalid copy interval"); + if (a->ncol != b->ncol) + nerv_error(L, "matrices should be of the same dimension"); CUDA_SAFE_SYNC_CALL( - cudaMemcpy2D(MATRIX_ELEM_PTR(b), b->stride, - MATRIX_ELEM_PTR(a), a->stride, - sizeof(MATRIX_ELEM) * a->ncol, a->nrow, + cudaMemcpy2D(MATRIX_ROW_PTR(a, a_begin), a->stride, + MATRIX_ROW_PTR(b, b_begin), b->stride, + sizeof(MATRIX_ELEM) * b->ncol, b_end - b_begin, cudaMemcpyDeviceToDevice)); return 0; } @@ -196,11 +195,19 @@ extern const char *MATRIX_CUMATRIX_HOST_TNAME; static int nerv_matrix_(copy_fromh)(lua_State *L) { Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname)); Matrix *b = luaT_checkudata(L, 2, MATRIX_CUMATRIX_HOST_TNAME); - CHECK_SAME_DIMENSION(a, b); + int nargs = lua_gettop(L); + int b_begin = nargs > 2 ? luaL_checkinteger(L, 3) : 0; + int b_end = nargs > 3 ? luaL_checkinteger(L, 4) : b->nrow; + int a_begin = nargs > 4 ? 
luaL_checkinteger(L, 5) : 0; + if (!(0 <= b_begin && b_begin < b_end && b_end <= b->nrow && + a_begin + b_end - b_begin <= a->nrow)) + nerv_error(L, "invalid copy interval"); + if (a->ncol != b->ncol) + nerv_error(L, "matrices should be of the same dimension"); CUDA_SAFE_SYNC_CALL( - cudaMemcpy2D(MATRIX_ELEM_PTR(a), a->stride, - MATRIX_ELEM_PTR(b), b->stride, - sizeof(MATRIX_ELEM) * b->ncol, b->nrow, + cudaMemcpy2D(MATRIX_ROW_PTR(a, a_begin), a->stride, + MATRIX_ROW_PTR(b, b_begin), b->stride, + sizeof(MATRIX_ELEM) * b->ncol, b_end - b_begin, cudaMemcpyHostToDevice)); return 0; } @@ -208,11 +215,19 @@ static int nerv_matrix_(copy_fromh)(lua_State *L) { static int nerv_matrix_(copy_toh)(lua_State *L) { Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname)); Matrix *b = luaT_checkudata(L, 2, MATRIX_CUMATRIX_HOST_TNAME); - CHECK_SAME_DIMENSION(a, b); + int nargs = lua_gettop(L); + int a_begin = nargs > 2 ? luaL_checkinteger(L, 3) : 0; + int a_end = nargs > 3 ? luaL_checkinteger(L, 4) : a->nrow; + int b_begin = nargs > 4 ? 
luaL_checkinteger(L, 5) : 0; + if (!(0 <= a_begin && a_begin < a_end && a_end <= a->nrow && + b_begin + a_end - a_begin <= b->nrow)) + nerv_error(L, "invalid copy interval"); + if (b->ncol != a->ncol) + nerv_error(L, "matrices should be of the same dimension"); CUDA_SAFE_SYNC_CALL( - cudaMemcpy2D(MATRIX_ELEM_PTR(b), b->stride, - MATRIX_ELEM_PTR(a), a->stride, - sizeof(MATRIX_ELEM) * a->ncol, a->nrow, + cudaMemcpy2D(MATRIX_ROW_PTR(b, b_begin), b->stride, + MATRIX_ROW_PTR(a, a_begin), a->stride, + sizeof(MATRIX_ELEM) * a->ncol, a_end - a_begin, cudaMemcpyDeviceToHost)); return 0; } @@ -221,6 +236,7 @@ static int nerv_matrix_(trans)(lua_State *L) { Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname)); Matrix *b = nerv_matrix_(new_)(L, a->ncol, a->nrow); MATRIX_ELEM alpha = 1, beta = 0; + /* FIXME: possible memory leak when lua error is raised */ CUBLAS_SAFE_CALL( NERV_CUBLAS_(geam)(cublas_handle, CUBLAS_OP_T, CUBLAS_OP_T, a->nrow, a->ncol, @@ -251,6 +267,19 @@ static int nerv_matrix_(log_elem)(lua_State *L) { return 0; } +static int nerv_matrix_(decompress)(lua_State *L) { + Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname)); + Matrix *b; + int orig_col = luaL_checkinteger(L, 2); + if (a->ncol != 1) + nerv_error(L, "the compressed matrix must be a column vector"); + b = nerv_matrix_(new_)(L, a->nrow, orig_col); + cudak_(cuda_fill)(b, 0.0); + cudak_(cuda_decompress)(a, b); + luaT_pushudata(L, b, nerv_matrix_(tname)); + return 1; +} + extern const char *nerv_matrix_host_int_tname; static int nerv_matrix_(copy_rows_fromh_by_idx)(lua_State *L) { Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname)); @@ -322,11 +351,11 @@ static const luaL_Reg nerv_matrix_(extra_methods)[] = { {"rowsum", nerv_matrix_(rowsum)}, {"rowmax", nerv_matrix_(rowmax)}, {"trans", nerv_matrix_(trans)}, + {"decompress", nerv_matrix_(decompress)}, /* in-place calc */ {"copy_fromh", nerv_matrix_(copy_fromh)}, {"copy_fromd", nerv_matrix_(copy_fromd)}, {"copy_toh", nerv_matrix_(copy_toh)}, - 
{"copy_tod", nerv_matrix_(copy_tod)}, {"add", nerv_matrix_(add)}, {"mul", nerv_matrix_(mul)}, {"add_row", nerv_matrix_(add_row)}, diff --git a/matrix/generic/mmatrix.c b/matrix/generic/mmatrix.c index 3a9ae79..4b722f3 100644 --- a/matrix/generic/mmatrix.c +++ b/matrix/generic/mmatrix.c @@ -11,6 +11,7 @@ #define NERV_GENERIC_MATRIX #include "../../common.h" #include "../../io/chunk_file.h" +#include "string.h" static void host_matrix_(alloc)(lua_State *L, MATRIX_ELEM **dptr, size_t *stride, @@ -96,10 +97,27 @@ int nerv_matrix_(save)(lua_State *L) { return 0; } - +static int nerv_matrix_(copy_from)(lua_State *L) { + Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname)); + Matrix *b = luaT_checkudata(L, 2, nerv_matrix_(tname)); + int nargs = lua_gettop(L); + int b_begin = nargs > 2 ? luaL_checkinteger(L, 3) : 0; + int b_end = nargs > 3 ? luaL_checkinteger(L, 4) : b->nrow; + int a_begin = nargs > 4 ? luaL_checkinteger(L, 5) : 0; + if (!(0 <= b_begin && b_begin < b_end && b_end <= b->nrow && + a_begin + b_end - b_begin <= a->nrow)) + nerv_error(L, "invalid copy interval"); + if (a->ncol != b->ncol) + nerv_error(L, "matrices should be of the same dimension"); + memmove(MATRIX_ROW_PTR(a, a_begin), + MATRIX_ROW_PTR(b, b_begin), + sizeof(MATRIX_ELEM) * b->ncol * (b_end - b_begin)); + return 0; +} static const luaL_Reg nerv_matrix_(extra_methods)[] = { {"load", nerv_matrix_(load)}, {"save", nerv_matrix_(save)}, + {"copy_from", nerv_matrix_(copy_from)}, {NULL, NULL} }; diff --git a/matrix/init.lua b/matrix/init.lua index f309f81..9637391 100644 --- a/matrix/init.lua +++ b/matrix/init.lua @@ -22,7 +22,8 @@ function nerv.Matrix:__tostring__() table.insert(strt, "\n") end end - table.insert(strt, string.format("[Matrix %d x %d]", nrow, ncol)) + table.insert(strt, string.format( + "[%s %d x %d]", self.__typename, nrow, ncol)) return table.concat(strt) end -- cgit v1.2.3