From df737041e4a9f3f55978cc74db9a9cea27fa9fa0 Mon Sep 17 00:00:00 2001 From: Determinant Date: Fri, 5 Jun 2015 10:58:57 +0800 Subject: add profiling; add ce accurarcy; several other changes --- matrix/generic/cumatrix.c | 97 ++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 84 insertions(+), 13 deletions(-) (limited to 'matrix/generic/cumatrix.c') diff --git a/matrix/generic/cumatrix.c b/matrix/generic/cumatrix.c index 373fc42..8e7d34f 100644 --- a/matrix/generic/cumatrix.c +++ b/matrix/generic/cumatrix.c @@ -11,15 +11,11 @@ #define MATRIX_BASE_TNAME nerv_matrix_cuda_tname #define NERV_GENERIC_MATRIX #define NERV_GENERIC_CUKERNEL +#define PROFILE_HASHMAP_SIZE 123457 #include "../../common.h" #include "../cukernel.h" -#include "cuda.h" -#include "cuda_runtime.h" -#include "driver_types.h" -#include "cublas_v2.h" #include "../cuda_helper.h" - -static cublasHandle_t cublas_handle; +#include Matrix *nerv_matrix_(new_)(lua_State *L, long nrow, long ncol); void nerv_matrix_(data_free)(lua_State *L, Matrix *self); @@ -27,6 +23,7 @@ void nerv_matrix_(data_free)(lua_State *L, Matrix *self); static void nerv_matrix_(add_)(lua_State *L, const Matrix *a, const Matrix *b, const Matrix *c, MATRIX_ELEM alpha, MATRIX_ELEM beta) { + PROFILE_START CUBLAS_SAFE_CALL( NERV_CUBLAS_(geam)(cublas_handle, CUBLAS_OP_N, CUBLAS_OP_N, a->ncol, a->nrow, @@ -35,6 +32,7 @@ static void nerv_matrix_(add_)(lua_State *L, const Matrix *a, const Matrix *b, &beta, MATRIX_ELEM_PTR(b), b->stride / sizeof(MATRIX_ELEM), MATRIX_ELEM_PTR(c), c->stride / sizeof(MATRIX_ELEM))); + PROFILE_STOP } static int nerv_matrix_(add)(lua_State *L) { @@ -75,6 +73,7 @@ static int nerv_matrix_(mul)(lua_State *L) { nerv_error(L, "Wrong dimension of multipliers"); /* MATRIX_ELEM alpha = 1.0f, beta = 0.0f; */ /* Because matrix in Nerv is row-major, here b comes first */ + PROFILE_START CUBLAS_SAFE_CALL( NERV_CUBLAS_(gemm)(cublas_handle, tb, ta, bn, am, bm, @@ -83,6 +82,7 @@ static int nerv_matrix_(mul)(lua_State *L) { MATRIX_ELEM_PTR(a), a->stride / sizeof(MATRIX_ELEM), &beta, MATRIX_ELEM_PTR(c), c->stride / sizeof(MATRIX_ELEM))); + PROFILE_STOP return 0; } @@ -97,7 +97,9 @@ static int nerv_matrix_(sigmoid)(lua_State *L) { Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname)); Matrix *b = luaT_checkudata(L, 2, nerv_matrix_(tname)); CHECK_SAME_DIMENSION(a, b); + PROFILE_START cudak_(cuda_sigmoid)(b, a); + PROFILE_STOP return 0; } @@ -107,30 +109,38 @@ static int nerv_matrix_(sigmoid_grad)(lua_State *L) { Matrix *output = luaT_checkudata(L, 3, nerv_matrix_(tname)); CHECK_SAME_DIMENSION(nerr, err); CHECK_SAME_DIMENSION(nerr, output); + PROFILE_START cudak_(cuda_sigmoid_grad)(output, err, nerr); + PROFILE_STOP return 0; } static int nerv_matrix_(softmax)(lua_State *L) { Matrix *a = luaT_checkudata(L, 2, nerv_matrix_(tname)); Matrix *b = luaT_checkudata(L, 1, nerv_matrix_(tname)); - Matrix *max; + Matrix *max, *max_idx; Matrix *dno; CHECK_SAME_DIMENSION(a, b); max = nerv_matrix_(new_)(L, a->nrow, 1); + max_idx = nerv_matrix_(new_)(L, a->nrow, 1); dno = nerv_matrix_(new_)(L, a->nrow, 1); - cudak_(cuda_rowmax)(a, max); + PROFILE_START + cudak_(cuda_rowmax_idx)(a, max, max_idx); cudak_(cuda_softmax_denominator)(a, max, dno); cudak_(cuda_softmax_final)(a, max, dno, b); + PROFILE_STOP nerv_matrix_(data_free)(L, max); nerv_matrix_(data_free)(L, dno); - return 0; + luaT_pushudata(L, max_idx, nerv_matrix_(tname)); + return 1; } static int nerv_matrix_(rowsum)(lua_State *L) { Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname)); Matrix *b = nerv_matrix_(new_)(L, a->nrow, 1); + PROFILE_START cudak_(cuda_rowsum)(a, b); + PROFILE_STOP luaT_pushudata(L, b, nerv_matrix_(tname)); return 1; } @@ -138,7 +148,21 @@ static int nerv_matrix_(rowsum)(lua_State *L) { static int nerv_matrix_(colsum)(lua_State *L) { Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname)); Matrix *b = nerv_matrix_(new_)(L, 1, a->ncol); + PROFILE_START cudak_(cuda_colsum)(a, b); + PROFILE_STOP + luaT_pushudata(L, b, nerv_matrix_(tname)); + return 1; +} + +static int nerv_matrix_(colsame)(lua_State *L) { + Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname)); + Matrix *ref = luaT_checkudata(L, 2, nerv_matrix_(tname)); + Matrix *b = nerv_matrix_(new_)(L, 1, a->ncol); + CHECK_SAME_DIMENSION(a, ref); + PROFILE_START + cudak_(cuda_colsame)(a, ref, b); + PROFILE_STOP luaT_pushudata(L, b, nerv_matrix_(tname)); return 1; } @@ -146,11 +170,24 @@ static int nerv_matrix_(colsum)(lua_State *L) { static int nerv_matrix_(rowmax)(lua_State *L) { Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname)); Matrix *b = nerv_matrix_(new_)(L, a->nrow, 1); + PROFILE_START cudak_(cuda_rowmax)(a, b); + PROFILE_STOP luaT_pushudata(L, b, nerv_matrix_(tname)); return 1; } +static int nerv_matrix_(rowmax_idx)(lua_State *L) { + Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname)); + Matrix *b = nerv_matrix_(new_)(L, a->nrow, 1); + Matrix *idx = nerv_matrix_(new_)(L, a->nrow, 1); + PROFILE_START + cudak_(cuda_rowmax_idx)(a, b, idx); + PROFILE_STOP + luaT_pushudata(L, b, nerv_matrix_(tname)); + luaT_pushudata(L, idx, nerv_matrix_(tname)); + return 2; +} static int nerv_matrix_(add_row)(lua_State *L) { Matrix *a = luaT_checkudata(L, 2, nerv_matrix_(tname)); @@ -160,14 +197,18 @@ static int nerv_matrix_(add_row)(lua_State *L) { nerv_error(L, "the number of columns is not the same"); if (a->nrow != 1) nerv_error(L, "a row vector is expected"); + PROFILE_START cudak_(cuda_add_row)(a, b, beta); + PROFILE_STOP return 0; } static int nerv_matrix_(fill)(lua_State *L) { Matrix *self = luaT_checkudata(L, 1, nerv_matrix_(tname)); double val = luaL_checknumber(L, 2); + PROFILE_START cudak_(cuda_fill)(self, val); + PROFILE_STOP return 0; } @@ -183,11 +224,13 @@ static int nerv_matrix_(copy_fromd)(lua_State *L) { nerv_error(L, "invalid copy interval"); if (a->ncol != b->ncol) nerv_error(L, "matrices should be of the same dimension"); + PROFILE_START CUDA_SAFE_SYNC_CALL( cudaMemcpy2D(MATRIX_ROW_PTR(a, a_begin), a->stride, MATRIX_ROW_PTR(b, b_begin), b->stride, sizeof(MATRIX_ELEM) * b->ncol, b_end - b_begin, cudaMemcpyDeviceToDevice)); + PROFILE_STOP return 0; } @@ -204,11 +247,13 @@ static int nerv_matrix_(copy_fromh)(lua_State *L) { nerv_error(L, "invalid copy interval"); if (a->ncol != b->ncol) nerv_error(L, "matrices should be of the same dimension"); + PROFILE_START CUDA_SAFE_SYNC_CALL( cudaMemcpy2D(MATRIX_ROW_PTR(a, a_begin), a->stride, MATRIX_ROW_PTR(b, b_begin), b->stride, sizeof(MATRIX_ELEM) * b->ncol, b_end - b_begin, cudaMemcpyHostToDevice)); + PROFILE_STOP return 0; } @@ -224,11 +269,13 @@ static int nerv_matrix_(copy_toh)(lua_State *L) { nerv_error(L, "invalid copy interval"); if (b->ncol != a->ncol) nerv_error(L, "matrices should be of the same dimension"); + PROFILE_START CUDA_SAFE_SYNC_CALL( cudaMemcpy2D(MATRIX_ROW_PTR(b, b_begin), b->stride, MATRIX_ROW_PTR(a, a_begin), a->stride, sizeof(MATRIX_ELEM) * a->ncol, a_end - a_begin, cudaMemcpyDeviceToHost)); + PROFILE_STOP return 0; } @@ -237,6 +284,7 @@ static int nerv_matrix_(trans)(lua_State *L) { Matrix *b = nerv_matrix_(new_)(L, a->ncol, a->nrow); MATRIX_ELEM alpha = 1, beta = 0; /* FIXME: possible memory leak when lua error is raised */ + PROFILE_START CUBLAS_SAFE_CALL( NERV_CUBLAS_(geam)(cublas_handle, CUBLAS_OP_T, CUBLAS_OP_T, a->nrow, a->ncol, @@ -245,6 +293,7 @@ static int nerv_matrix_(trans)(lua_State *L) { &beta, MATRIX_ELEM_PTR(a), a->stride / sizeof(MATRIX_ELEM), MATRIX_ELEM_PTR(b), b->stride / sizeof(MATRIX_ELEM))); + PROFILE_STOP luaT_pushudata(L, b, nerv_matrix_(tname)); return 1; } @@ -255,7 +304,9 @@ static int nerv_matrix_(mul_elem)(lua_State *L) { Matrix *c = luaT_checkudata(L, 1, nerv_matrix_(tname)); CHECK_SAME_DIMENSION(a, b); CHECK_SAME_DIMENSION(a, c); + PROFILE_START cudak_(cuda_mul_elem)(a, b, c); + PROFILE_STOP return 0; } @@ -263,7 +314,9 @@ static int nerv_matrix_(log_elem)(lua_State *L) { Matrix *a = luaT_checkudata(L, 2, nerv_matrix_(tname)); Matrix *b = luaT_checkudata(L, 1, nerv_matrix_(tname)); CHECK_SAME_DIMENSION(a, b); + PROFILE_START cudak_(cuda_log_elem)(a, b); + PROFILE_STOP return 0; } @@ -274,8 +327,10 @@ static int nerv_matrix_(decompress)(lua_State *L) { if (a->ncol != 1) nerv_error(L, "the compressed matrix must be a column vector"); b = nerv_matrix_(new_)(L, a->nrow, orig_col); + PROFILE_START cudak_(cuda_fill)(b, 0.0); cudak_(cuda_decompress)(a, b); + PROFILE_STOP luaT_pushudata(L, b, nerv_matrix_(tname)); return 1; } @@ -285,21 +340,25 @@ static int nerv_matrix_(copy_rows_fromh_by_idx)(lua_State *L) { Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname)); Matrix *b = luaT_checkudata(L, 2, MATRIX_CUMATRIX_HOST_TNAME); Matrix *idx = luaT_checkudata(L, 3, nerv_matrix_host_int_tname); + long nrow = a->nrow; + int b_begin = lua_gettop(L) > 3 ? luaL_checkinteger(L, 4) : 0; + if (!(0 <= b_begin && b_begin + nrow <= idx->ncol)) + nerv_error(L, "invalid copy interval"); long *idx_ptr = idx->data.i; int i; - long nrow = a->nrow; if (idx->nrow != 1) nerv_error(L, "index should be a vector"); - if (idx->ncol != nrow) - nerv_error(L, "index dimension mismatch"); if (a->ncol != b->ncol) nerv_error(L, "source/destination dimension mismatch"); cudaStream_t *streams = (cudaStream_t*)malloc(sizeof(cudaStream_t) * nrow); for (i = 0; i < nrow; i++) { + int src_row = idx_ptr[b_begin + i]; + if (!(0 <= src_row && src_row < b->nrow)) + nerv_error(L, "invalid index"); CUDA_SAFE_CALL(cudaStreamCreate(streams + i)); CUDA_SAFE_CALL(cudaMemcpyAsync(MATRIX_ROW_PTR(a, i), - MATRIX_ROW_PTR(b, idx_ptr[i]), + MATRIX_ROW_PTR(b, src_row), b->stride, cudaMemcpyHostToDevice, streams[i])); } @@ -308,6 +367,7 @@ static int nerv_matrix_(copy_rows_fromh_by_idx)(lua_State *L) { CUDA_SAFE_CALL(cudaStreamSynchronize(streams[i])); CUDA_SAFE_CALL(cudaStreamDestroy(streams[i])); } + free(streams); return 0; } @@ -319,7 +379,9 @@ static int nerv_matrix_(expand_frm)(lua_State *L) { nerv_error(L, "mismatching number of frames"); if (a->ncol != b->ncol * (context * 2 + 1)) nerv_error(L, "the width should be 2 * context + 1"); + PROFILE_START cudak_(cuda_expand_frm)(b, a, context); + PROFILE_STOP return 0; } @@ -330,7 +392,9 @@ static int nerv_matrix_(rearrange_frm)(lua_State *L) { CHECK_SAME_DIMENSION(a, b); if (b->ncol % step) nerv_error(L, "the dimension of columns is not divisible by step"); + PROFILE_START cudak_(cuda_rearrange_frm)(b, a, step); + PROFILE_STOP return 0; } @@ -341,15 +405,19 @@ static int nerv_matrix_(scale_row)(lua_State *L) { nerv_error(L, "the number of columns is not the same"); if (b->nrow != 1) nerv_error(L, "a row vector is expected"); + PROFILE_START cudak_(cuda_scale_row)(b, a); + PROFILE_STOP return 0; } static const luaL_Reg nerv_matrix_(extra_methods)[] = { {"create", nerv_matrix_(create)}, {"colsum", nerv_matrix_(colsum)}, + {"colsame", nerv_matrix_(colsame)}, {"rowsum", nerv_matrix_(rowsum)}, {"rowmax", nerv_matrix_(rowmax)}, + {"rowmax_idx", nerv_matrix_(rowmax_idx)}, {"trans", nerv_matrix_(trans)}, {"decompress", nerv_matrix_(decompress)}, /* in-place calc */ @@ -375,6 +443,7 @@ static const luaL_Reg nerv_matrix_(extra_methods)[] = { static void cuda_matrix_(init)(lua_State *L) { luaN_append_methods(L, nerv_matrix_(extra_methods)); cublasCreate(&cublas_handle); + profile = hashmap_create(PROFILE_HASHMAP_SIZE, bkdr_hash, strcmp); } static void cuda_matrix_(free)(lua_State *L, MATRIX_ELEM *ptr) { @@ -383,7 +452,9 @@ static void cuda_matrix_(free)(lua_State *L, MATRIX_ELEM *ptr) { static void cuda_matrix_(alloc)(lua_State *L, MATRIX_ELEM **dptr, size_t *stride, long width, long height) { + PROFILE_START CUDA_SAFE_SYNC_CALL(cudaMallocPitch((void **)dptr, stride, width, height)); + PROFILE_STOP } static MATRIX_ELEM cuda_matrix_(read)(lua_State *L, MATRIX_ELEM *data, -- cgit v1.2.3