-rw-r--r--  matrix/cuda_helper.h        |  3
-rw-r--r--  matrix/generic/cukernel.cu  | 30
-rw-r--r--  matrix/generic/cumatrix.c   |  6
3 files changed, 32 insertions(+), 7 deletions(-)
diff --git a/matrix/cuda_helper.h b/matrix/cuda_helper.h
index 5e5f2ad..88619fd 100644
--- a/matrix/cuda_helper.h
+++ b/matrix/cuda_helper.h
@@ -4,12 +4,13 @@
#include "cuda_runtime.h"
#include "driver_types.h"
#include "cublas_v2.h"
-#define CUBLAS_SAFE_CALL(call) \
+#define CUBLAS_SAFE_SYNC_CALL(call) \
do { \
cublasStatus_t err = (call); \
if (err != CUBLAS_STATUS_SUCCESS) \
nerv_error(L, "cumatrix cublas error: %s at %s:%d", \
cublasGetErrorString(err), __FILE__, __LINE__); \
+ cudaDeviceSynchronize(); \
} while (0)
#define CUDA_SAFE_CALL(call) \
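This hunk renames CUBLAS_SAFE_CALL to CUBLAS_SAFE_SYNC_CALL and appends a cudaDeviceSynchronize() after the status check. cuBLAS routines return as soon as their kernels are queued, so the status code alone only reflects argument validation and launch; the added synchronize makes every wrapped call block until the GPU has actually finished. A minimal sketch of the idea (illustrative only; the handle, buffer and function names below are not from this patch):

    #include <cuda_runtime.h>
    #include <cublas_v2.h>

    /* Sketch: the status of an asynchronous cuBLAS call says nothing about
     * whether the work has completed; only the synchronize does. */
    static void scale_on_gpu(cublasHandle_t handle, int n, float alpha, float *d_x)
    {
        cublasStatus_t st = cublasSscal(handle, n, &alpha, d_x, 1); /* queued, not necessarily run */
        if (st != CUBLAS_STATUS_SUCCESS) {
            /* argument or launch failure */
        }
        cudaDeviceSynchronize(); /* block until the queued work has finished executing */
    }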
diff --git a/matrix/generic/cukernel.cu b/matrix/generic/cukernel.cu
index fdab356..ffae5ed 100644
--- a/matrix/generic/cukernel.cu
+++ b/matrix/generic/cukernel.cu
@@ -146,7 +146,7 @@ __global__ void cudak_(block_reduce_rowmax)(const MATRIX_ELEM *input,
const int n) {
extern __shared__ MATRIX_ELEM cudak_(arr)[];
int j = blockIdx.x * blockDim.x + threadIdx.x;
- cudak_(arr)[threadIdx.x] = j < n ? input[j + istride * blockIdx.y] : 0;
+ cudak_(arr)[threadIdx.x] = j < n ? input[j + istride * blockIdx.y] : -FLT_MAX;
__syncthreads();
for (int offset = blockDim.x >> 1; offset; offset >>= 1)
{
@@ -173,7 +173,7 @@ __global__ void cudak_(block_reduce_rowmax_idx)(const MATRIX_ELEM *input,
MATRIX_ELEM *arr_val = cudak_(arr);
MATRIX_ELEM *arr_idx = arr_val + blockDim.x;
int j = blockIdx.x * blockDim.x + threadIdx.x;
- arr_val[threadIdx.x] = j < n ? input[j + istride * blockIdx.y] : 0;
+ arr_val[threadIdx.x] = j < n ? input[j + istride * blockIdx.y] : -FLT_MAX;
arr_idx[threadIdx.x] = j < n ? idx_input[j + istride * blockIdx.y] : 0;
__syncthreads();
for (int offset = blockDim.x >> 1; offset; offset >>= 1)
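Both reduce-rowmax kernels pad the out-of-range shared-memory slots before the tree reduction. The old padding value of 0 is not an identity for a max reduction: if every element in a row is negative, the padding wins and the row maximum is reported as 0. Padding with -FLT_MAX (the most negative finite float, from <float.h>) can never influence the result. The same point on the host side, as a minimal sketch (the helper name is illustrative):

    #include <float.h>

    /* Sketch: -FLT_MAX is the identity element for a max reduction over floats. */
    static float row_max(const float *row, int n)
    {
        float m = -FLT_MAX; /* seeding with 0 would clamp all-negative rows to 0 */
        for (int i = 0; i < n; i++)
            if (row[i] > m)
                m = row[i];
        return m;
    }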
@@ -272,6 +272,7 @@ extern "C" {
cudak_(log_elem)<<<numBlocks, threadsPerBlock>>> \
(MATRIX_ELEM_PTR(a), MATRIX_ELEM_PTR(b),
b->nrow, b->ncol, b->stride / sizeof(MATRIX_ELEM));
+ cudaStreamSynchronize(0);
}
void cudak_(cuda_mul_elem)(const Matrix *a, const Matrix *b,
@@ -283,6 +284,7 @@ extern "C" {
(MATRIX_ELEM_PTR(a), MATRIX_ELEM_PTR(b),
MATRIX_ELEM_PTR(c),
b->nrow, b->ncol, b->stride / sizeof(MATRIX_ELEM));
+ cudaStreamSynchronize(0);
}
void cudak_(cuda_sigmoid)(const Matrix *a, Matrix *b) {
@@ -292,6 +294,7 @@ extern "C" {
cudak_(sigmoid)<<<numBlocks, threadsPerBlock>>> \
(MATRIX_ELEM_PTR(a), MATRIX_ELEM_PTR(b), b->nrow, b->ncol,
b->stride / sizeof(MATRIX_ELEM));
+ cudaStreamSynchronize(0);
}
void cudak_(cuda_sigmoid_grad)(const Matrix *output,
@@ -304,6 +307,7 @@ extern "C" {
MATRIX_ELEM_PTR(nerr),
nerr->nrow, nerr->ncol,
nerr->stride / sizeof(MATRIX_ELEM));
+ cudaStreamSynchronize(0);
}
void cudak_(cuda_rowsum)(const Matrix *a, Matrix *b) {
@@ -321,10 +325,12 @@ extern "C" {
ncol = blocks_per_row;
assert((unsigned long)ncol <= block.x);
grid.x = 1;
+ cudaStreamSynchronize(0);
cudak_(block_reduce_rowsum)<<<grid, block, block.x * sizeof(MATRIX_ELEM)>>> \
(res, MATRIX_ELEM_PTR(b),
stride / sizeof(MATRIX_ELEM), b->stride / sizeof(MATRIX_ELEM),
ncol);
+ cudaStreamSynchronize(0);
cudaFree(res);
}
@@ -343,10 +349,12 @@ extern "C" {
nrow = blocks_per_col;
assert((unsigned long)nrow <= block.y);
grid.y = 1;
+ cudaStreamSynchronize(0);
cudak_(block_reduce_colsum)<<<grid, block, block.y * sizeof(MATRIX_ELEM)>>> \
(res, MATRIX_ELEM_PTR(b),
stride / sizeof(MATRIX_ELEM), b->stride / sizeof(MATRIX_ELEM),
nrow);
+ cudaStreamSynchronize(0);
cudaFree(res);
}
@@ -365,10 +373,12 @@ extern "C" {
nrow = blocks_per_col;
assert((unsigned long)nrow <= block.y);
grid.y = 1;
+ cudaStreamSynchronize(0);
cudak_(block_reduce_colsum)<<<grid, block, block.y * sizeof(MATRIX_ELEM)>>> \
(res, MATRIX_ELEM_PTR(b),
stride / sizeof(MATRIX_ELEM), b->stride / sizeof(MATRIX_ELEM),
nrow);
+ cudaStreamSynchronize(0);
cudaFree(res);
}
@@ -383,6 +393,7 @@ extern "C" {
b->nrow, b->ncol,
b->stride / sizeof(MATRIX_ELEM),
max->stride / sizeof(MATRIX_ELEM));
+ cudaStreamSynchronize(0);
}
void cudak_(cuda_softmax_denominator)(const Matrix *a, const Matrix *max, Matrix *b) {
@@ -403,11 +414,13 @@ extern "C" {
ncol = blocks_per_row;
assert((unsigned long)ncol <= block.x);
grid.x = 1;
+ cudaStreamSynchronize(0);
cudak_(block_reduce_rowsum) \
<<<grid, block, block.x * sizeof(MATRIX_ELEM)>>> \
(res, MATRIX_ELEM_PTR(b),
stride / sizeof(MATRIX_ELEM), b->stride / sizeof(MATRIX_ELEM),
ncol);
+ cudaStreamSynchronize(0);
cudaFree(res);
}
@@ -426,10 +439,12 @@ extern "C" {
ncol = blocks_per_row;
assert((unsigned long)ncol <= block.x);
grid.x = 1;
+ cudaStreamSynchronize(0);
cudak_(block_reduce_rowmax)<<<grid, block, block.x * sizeof(MATRIX_ELEM)>>> \
(res, MATRIX_ELEM_PTR(b),
stride / sizeof(MATRIX_ELEM), b->stride / sizeof(MATRIX_ELEM),
ncol);
+ cudaStreamSynchronize(0);
cudaFree(res);
}
@@ -444,20 +459,23 @@ extern "C" {
cudak_(gen_col_idx)<<<grid, block>>>(a_idx, a->nrow, ncol, stride / sizeof(MATRIX_ELEM));
cudaMallocPitch(&res, &stride, blocks_per_row * sizeof(MATRIX_ELEM), a->nrow);
cudaMallocPitch(&res_idx, &stride, blocks_per_row * sizeof(MATRIX_ELEM), a->nrow);
+ cudaStreamSynchronize(0);
cudak_(block_reduce_rowmax_idx)<<<grid, block,
2 * block.x * sizeof(MATRIX_ELEM)>>> \
(MATRIX_ELEM_PTR(a), a_idx, res, res_idx,
a->stride / sizeof(MATRIX_ELEM), stride / sizeof(MATRIX_ELEM),
ncol);
- cudaFree(a_idx);
ncol = blocks_per_row;
assert((unsigned long)ncol <= block.x);
grid.x = 1;
+ cudaStreamSynchronize(0);
cudak_(block_reduce_rowmax_idx)<<<grid, block,
2 * block.x * sizeof(MATRIX_ELEM)>>> \
(res, res_idx, MATRIX_ELEM_PTR(b), MATRIX_ELEM_PTR(b_idx),
stride / sizeof(MATRIX_ELEM), b->stride / sizeof(MATRIX_ELEM),
ncol);
+ cudaStreamSynchronize(0);
+ cudaFree(a_idx);
cudaFree(res);
cudaFree(res_idx);
}
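In cuda_rowmax_idx the first reduction kernel reads a_idx, but kernel launches are asynchronous, so freeing a_idx immediately after the launch (as the removed line did) leaves correctness to cudaFree's implicit synchronization. The hunk moves the free after the explicit cudaStreamSynchronize(0), making the launch, then synchronize, then free ordering unambiguous. A minimal self-contained sketch of that ordering (kernel and variable names invented for illustration):

    __global__ void double_kernel(float *buf, int n)
    {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n)
            buf[i] *= 2.0f;
    }

    static void double_and_free(float *d_buf, int n)
    {
        double_kernel<<<(n + 255) / 256, 256>>>(d_buf, n); /* returns immediately */
        cudaStreamSynchronize(0); /* wait for the default stream to drain */
        cudaFree(d_buf);          /* only now is the buffer certainly unused by the GPU */
    }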
@@ -470,6 +488,7 @@ extern "C" {
cudak_(add_row)<<<numBlocks, threadsPerBlock>>> \
(MATRIX_ELEM_PTR(a), MATRIX_ELEM_PTR(b), b->nrow, b->ncol,
b->stride / sizeof(MATRIX_ELEM), beta);
+ cudaStreamSynchronize(0);
}
void cudak_(cuda_fill)(Matrix *a, double val) {
@@ -479,6 +498,7 @@ extern "C" {
cudak_(fill)<<<numBlocks, threadsPerBlock>>> \
(MATRIX_ELEM_PTR(a), a->nrow, a->ncol,
a->stride / sizeof(MATRIX_ELEM), val);
+ cudaStreamSynchronize(0);
}
void cudak_(cuda_expand_frm)(const Matrix *a, Matrix *b, int context) {
@@ -492,6 +512,7 @@ extern "C" {
a->stride / sizeof(MATRIX_ELEM),
b->stride / sizeof(MATRIX_ELEM),
context);
+ cudaStreamSynchronize(0);
}
void cudak_(cuda_rearrange_frm)(const Matrix *a, Matrix *b, int step) {
@@ -502,6 +523,7 @@ extern "C" {
(MATRIX_ELEM_PTR(a), MATRIX_ELEM_PTR(b),
b->nrow, b->ncol, b->stride / sizeof(MATRIX_ELEM),
step, b->ncol / step);
+ cudaStreamSynchronize(0);
}
void cudak_(cuda_scale_row)(const Matrix *a, Matrix *b) {
@@ -511,6 +533,7 @@ extern "C" {
cudak_(scale_row)<<<numBlocks, threadsPerBlock>>> \
(MATRIX_ELEM_PTR(a), MATRIX_ELEM_PTR(b),
b->nrow, b->ncol, b->stride / sizeof(MATRIX_ELEM));
+ cudaStreamSynchronize(0);
}
void cudak_(cuda_decompress)(const Matrix *a, Matrix *b) {
@@ -521,6 +544,7 @@ extern "C" {
a->nrow, a->ncol,
a->stride / sizeof(MATRIX_ELEM),
b->stride / sizeof(MATRIX_ELEM));
+ cudaStreamSynchronize(0);
}
}
#endif
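Throughout cukernel.cu the pattern is the same: each wrapper now ends with cudaStreamSynchronize(0) after its kernel launch, and the two-pass reductions also synchronize between launches that share intermediate buffers. Stream 0 is the default stream, so every wrapper returns only once its work has completed; this trades some host/device overlap for deterministic ordering and easier error attribution. cudaStreamSynchronize() also returns error codes from previously queued asynchronous launches, so a caller could surface them as in this sketch (illustrative, not part of the patch):

    #include <stdio.h>
    #include <cuda_runtime.h>

    /* Sketch: the sync call also reports failures from kernels queued earlier
     * on the default stream. */
    static int sync_default_stream(void)
    {
        cudaError_t err = cudaStreamSynchronize(0);
        if (err != cudaSuccess)
            fprintf(stderr, "cuda kernel error: %s\n", cudaGetErrorString(err));
        return err == cudaSuccess;
    }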
diff --git a/matrix/generic/cumatrix.c b/matrix/generic/cumatrix.c
index 8e7d34f..956e1e6 100644
--- a/matrix/generic/cumatrix.c
+++ b/matrix/generic/cumatrix.c
@@ -24,7 +24,7 @@ static void nerv_matrix_(add_)(lua_State *L, const Matrix *a, const Matrix *b,
const Matrix *c,
MATRIX_ELEM alpha, MATRIX_ELEM beta) {
PROFILE_START
- CUBLAS_SAFE_CALL(
+ CUBLAS_SAFE_SYNC_CALL(
NERV_CUBLAS_(geam)(cublas_handle, CUBLAS_OP_N, CUBLAS_OP_N,
a->ncol, a->nrow,
&alpha,
@@ -74,7 +74,7 @@ static int nerv_matrix_(mul)(lua_State *L) {
/* MATRIX_ELEM alpha = 1.0f, beta = 0.0f; */
/* Because matrix in Nerv is row-major, here b comes first */
PROFILE_START
- CUBLAS_SAFE_CALL(
+ CUBLAS_SAFE_SYNC_CALL(
NERV_CUBLAS_(gemm)(cublas_handle, tb, ta,
bn, am, bm,
&alpha,
@@ -285,7 +285,7 @@ static int nerv_matrix_(trans)(lua_State *L) {
MATRIX_ELEM alpha = 1, beta = 0;
/* FIXME: possible memory leak when lua error is raised */
PROFILE_START
- CUBLAS_SAFE_CALL(
+ CUBLAS_SAFE_SYNC_CALL(
NERV_CUBLAS_(geam)(cublas_handle, CUBLAS_OP_T, CUBLAS_OP_T,
a->nrow, a->ncol,
&alpha,
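In cumatrix.c the three cuBLAS call sites (geam in add_ and trans, gemm in mul) simply switch to the new synchronizing macro. Because each call now blocks until the GPU finishes, wall-clock timing taken around the wrapped call, such as the PROFILE_START bracket above, reflects the actual kernel execution rather than just the launch. A minimal sketch of that effect (the helper below is illustrative, not from this codebase):

    #include <sys/time.h>
    #include <cuda_runtime.h>

    /* Sketch: without the synchronize, t1 - t0 would mostly measure launch overhead. */
    static double timed_sync_call(void (*queue_gpu_work)(void))
    {
        struct timeval t0, t1;
        gettimeofday(&t0, NULL);
        queue_gpu_work();           /* e.g. a cuBLAS call: returns once queued */
        cudaDeviceSynchronize();    /* wait for the GPU, as CUBLAS_SAFE_SYNC_CALL now does */
        gettimeofday(&t1, NULL);
        return (t1.tv_sec - t0.tv_sec) + (t1.tv_usec - t0.tv_usec) * 1e-6;
    }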