diff options
Diffstat (limited to 'nerv/lib/matrix/generic/cukernel.cu')
-rw-r--r-- | nerv/lib/matrix/generic/cukernel.cu | 12 |
1 file changed, 7 insertions, 5 deletions
diff --git a/nerv/lib/matrix/generic/cukernel.cu b/nerv/lib/matrix/generic/cukernel.cu
index 08feb59..707f8fd 100644
--- a/nerv/lib/matrix/generic/cukernel.cu
+++ b/nerv/lib/matrix/generic/cukernel.cu
@@ -229,14 +229,15 @@ __global__ void cudak_(expand_frm)(const MATRIX_ELEM *a, MATRIX_ELEM *b,
                                    int nrow, int ncol,
                                    int enrow, int encol,
                                    int stride, int estride,
-                                   int context) {
+                                   int context,
+                                   int a_begin, int a_end) {
     int j = blockIdx.x * blockDim.x + threadIdx.x;
     int i = blockIdx.y * blockDim.y + threadIdx.y;
     int ridx;
     if (i >= enrow || j >= encol) return;
     ridx = i + j / ncol - context;
-    if (ridx < 0) ridx = 0;
-    else if (ridx >= nrow) ridx = nrow - 1;
+    if (ridx < a_begin) ridx = a_begin;
+    else if (ridx >= a_end) ridx = a_end - 1;
     b[j + i * estride] = a[j % ncol + ridx * stride];
 }
 
@@ -541,7 +542,7 @@ extern "C" {
         cudaStreamSynchronize(0);
     }
 
-    void cudak_(cuda_expand_frm)(const Matrix *a, Matrix *b, int context) {
+    void cudak_(cuda_expand_frm)(const Matrix *a, Matrix *b, int context, int a_begin, int a_end) {
         dim3 threadsPerBlock(CUDA_THREADS_N, CUDA_THREADS_N);
         dim3 numBlocks(CEIL_DIV(b->ncol, threadsPerBlock.x),
                 CEIL_DIV(b->nrow, threadsPerBlock.y));
@@ -551,7 +552,8 @@ extern "C" {
              b->nrow, b->ncol,
              a->stride / sizeof(MATRIX_ELEM),
              b->stride / sizeof(MATRIX_ELEM),
-             context);
+             context,
+             a_begin, a_end);
         cudaStreamSynchronize(0);
     }
 