diff options
Diffstat (limited to 'matrix/generic')
-rw-r--r-- | matrix/generic/cukernel.cu | 20 | ||||
-rw-r--r-- | matrix/generic/cumatrix.c | 11 |
2 files changed, 31 insertions, 0 deletions
diff --git a/matrix/generic/cukernel.cu b/matrix/generic/cukernel.cu index 4b6af61..2e794b7 100644 --- a/matrix/generic/cukernel.cu +++ b/matrix/generic/cukernel.cu @@ -105,6 +105,15 @@ __global__ void cudak_(block_reduce_rowmax)(const MATRIX_ELEM *input, output[blockIdx.x + ostride * blockIdx.y] = cudak_(arr)[0]; } +__global__ void cudak_(add_row)(const MATRIX_ELEM *a, MATRIX_ELEM *b, + int nrow, int ncol, int stride, double beta) { + int j = blockIdx.x * blockDim.x + threadIdx.x; + int i = blockIdx.y * blockDim.y + threadIdx.y; + if (i >= nrow || j >= ncol) return; + b[j + i * stride] += beta * a[j]; +} + + extern "C" { #include "../cukernel.h" void cudak_(cuda_sigmoid)(const Matrix *a, Matrix *b) { @@ -222,5 +231,16 @@ extern "C" { ncol); cudaFree(res); } + + /* in-place calc */ + void cudak_(cuda_add_row)(const Matrix *a, Matrix *b, double beta) { + dim3 threadsPerBlock(CUDA_THREADS_N, + CUDA_THREADS_N); + dim3 numBlocks(CEIL_DIV(b->ncol, threadsPerBlock.x), + CEIL_DIV(b->nrow, threadsPerBlock.y)); + cudak_(add_row)<<<numBlocks, threadsPerBlock>>> \ + (MATRIX_ELEM_PTR(a), MATRIX_ELEM_PTR(b), b->nrow, b->ncol, + b->stride / sizeof(MATRIX_ELEM), beta); + } } #endif diff --git a/matrix/generic/cumatrix.c b/matrix/generic/cumatrix.c index 557e4c1..ae57b21 100644 --- a/matrix/generic/cumatrix.c +++ b/matrix/generic/cumatrix.c @@ -126,6 +126,15 @@ static int nerv_matrix_(rowmax)(lua_State *L) { return 1; } + +static int nerv_matrix_(add_row)(lua_State *L) { + Matrix *a = luaT_checkudata(L, 2, nerv_matrix_(tname)); + Matrix *b = luaT_checkudata(L, 1, nerv_matrix_(tname)); + double beta = luaL_checknumber(L, 3); + cudak_(cuda_add_row)(a, b, beta); + return 0; +} + extern const char *MATRIX_CUMATRIX_HOST_TNAME; static int nerv_matrix_(copy_from)(lua_State *L) { Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname)); @@ -163,6 +172,8 @@ static const luaL_Reg nerv_matrix_(extra_methods)[] = { {"rowmax", nerv_matrix_(rowmax)}, {"copy_from", nerv_matrix_(copy_from)}, {"copy_to", nerv_matrix_(copy_to)}, + /* in-place calc */ + {"add_row", nerv_matrix_(add_row)}, {NULL, NULL} }; |