diff options
Diffstat (limited to 'nerv/lib/matrix')
-rw-r--r-- | nerv/lib/matrix/cukernel.cu | 1 | ||||
-rw-r--r-- | nerv/lib/matrix/cumatrix.c | 1 | ||||
-rw-r--r-- | nerv/lib/matrix/generic/cukernel.cu | 72 | ||||
-rw-r--r-- | nerv/lib/matrix/generic/cumatrix.c | 38 | ||||
-rw-r--r-- | nerv/lib/matrix/generic/cumatrix.h | 8 | ||||
-rw-r--r-- | nerv/lib/matrix/generic/elem_type.h | 13 | ||||
-rw-r--r-- | nerv/lib/matrix/generic/matrix.c | 9 | ||||
-rw-r--r-- | nerv/lib/matrix/matrix.h | 1 | ||||
-rw-r--r-- | nerv/lib/matrix/mmatrix.c | 5 |
9 files changed, 127 insertions, 21 deletions
diff --git a/nerv/lib/matrix/cukernel.cu b/nerv/lib/matrix/cukernel.cu index 210e6bf..c20e538 100644 --- a/nerv/lib/matrix/cukernel.cu +++ b/nerv/lib/matrix/cukernel.cu @@ -44,6 +44,7 @@ __device__ float atomicAdd_nvidia(float* address, float val) { #undef MATRIX_USE_FLOAT #undef MATRIX_ELEM #undef MATRIX_ELEM_PTR +#undef MATRIX_ELEM_PTR_BASE #undef MATRIX_ELEM_FMT #undef MATRIX_ELEM_WRITE_FMT diff --git a/nerv/lib/matrix/cumatrix.c b/nerv/lib/matrix/cumatrix.c index ff1168d..a8ed075 100644 --- a/nerv/lib/matrix/cumatrix.c +++ b/nerv/lib/matrix/cumatrix.c @@ -57,6 +57,7 @@ void nerv_cumatrix_init() { #undef MATRIX_USE_FLOAT #undef MATRIX_ELEM #undef MATRIX_ELEM_PTR +#undef MATRIX_ELEM_PTR_BASE #undef MATRIX_ELEM_FMT #undef MATRIX_ELEM_WRITE_FMT #undef MATRIX_CUMATRIX_HOST_TNAME diff --git a/nerv/lib/matrix/generic/cukernel.cu b/nerv/lib/matrix/generic/cukernel.cu index aa830b5..2b696d5 100644 --- a/nerv/lib/matrix/generic/cukernel.cu +++ b/nerv/lib/matrix/generic/cukernel.cu @@ -262,12 +262,29 @@ __global__ void cudak_(clip)(MATRIX_ELEM *a, } #ifdef __NERV_FUTURE_CUDA_7 -__global__ void cudak_(update_select_rows)(MATRIX_ELEM *c, const MATRIX_ELEM *a, const MATRIX_ELEM *idx, - int nrow_a, int ncol_a, int stride_c, int stride_a, double alpha, double beta) { +__global__ void cudak_(update_select_rows_by_rowidx)(MATRIX_ELEM *c, const MATRIX_ELEM *a, const MATRIX_ELEM *idx, + int nrow_a, int ncol_a, int nrow_c, int stride_c, int stride_a, double alpha, double beta) { int j = blockIdx.x * blockDim.x + threadIdx.x; int i = blockIdx.y * blockDim.y + threadIdx.y; if (i >= nrow_a || j >= ncol_a) return; int i_c = lrintf(idx[i]); + if (i_c < 0 || i_c >= nrow_c) { + printf("ERROR inside kernel update_select_rows, i_c(%d) out of range!", i_c); + } + //critical: i_c could conflict among threads(same index in the idx array), so atomicAdd is used + //c[j + i_c * stride_c] = c[j + i_c * stride_c] * (1 - beta * alpha) + a[j + i * stride_a] * alpha; + atomicAdd_nvidia(c + j + i_c * stride_c, c[j + i_c * stride_c] * (- beta * alpha) + a[j + i * stride_a] * alpha); +} + +__global__ void cudak_(update_select_rows_by_colidx)(MATRIX_ELEM *c, const MATRIX_ELEM *a, const MATRIX_ELEM *idx, + int nrow_a, int ncol_a, int nrow_c, int stride_c, int stride_a, int stride_idx, double alpha, double beta) { + int j = blockIdx.x * blockDim.x + threadIdx.x; + int i = blockIdx.y * blockDim.y + threadIdx.y; + if (i >= nrow_a || j >= ncol_a) return; + int i_c = lrintf(idx[stride_idx * i]); + if (i_c < 0 || i_c >= nrow_c) { + printf("ERROR inside kernel update_select_rows, i_c(%d) out of range!", i_c); + } //critical: i_c could conflict among threads(same index in the idx array), so atomicAdd is used //c[j + i_c * stride_c] = c[j + i_c * stride_c] * (1 - beta * alpha) + a[j + i * stride_a] * alpha; atomicAdd_nvidia(c + j + i_c * stride_c, c[j + i_c * stride_c] * (- beta * alpha) + a[j + i * stride_a] * alpha); @@ -335,13 +352,31 @@ __global__ void cudak_(gen_col_idx)(MATRIX_ELEM *b, __global__ void cudak_(copy_rows_by_idx)(const MATRIX_ELEM *a, MATRIX_ELEM *b, const MATRIX_ELEM *idx, - int nrow, int ncol, int stride) { + int nrow, int ncol, int a_nrow, int stride) { + int j = blockIdx.x * blockDim.x + threadIdx.x; + int i = blockIdx.y * blockDim.y + threadIdx.y; + if (i >= nrow || j >= ncol) return; + int k = lrintf(idx[i]); + if (k < 0 || k >= a_nrow) { + printf("error in kernel copy_rows_by_idx k(%d) out of range\n", k); + } + b[j + i * stride] = a[j + k * stride]; +} + +__global__ void cudak_(copy_rows_by_colidx)(const MATRIX_ELEM *a, MATRIX_ELEM *b, + const MATRIX_ELEM *idx, + int nrow, int ncol, int a_nrow, int stride, int idx_stride) { int j = blockIdx.x * blockDim.x + threadIdx.x; int i = blockIdx.y * blockDim.y + threadIdx.y; if (i >= nrow || j >= ncol) return; - b[j + i * stride] = a[j + lrintf(idx[i]) * stride]; + int k = lrintf(idx[i * idx_stride]); + if (k < 0 || k >= a_nrow) { + printf("error in kernel copy_rows_by_colidx k(%d) out of range\n", k); + } + b[j + i * stride] = a[j + k * stride]; } + extern "C" { #include "../cukernel.h" void cudak_(cuda_log_elem)(const Matrix *a, Matrix *b) { @@ -633,16 +668,26 @@ extern "C" { } #ifdef __NERV_FUTURE_CUDA_7 - void cudak_(cuda_update_select_rows)(Matrix *c, const Matrix *a, const Matrix *idx, double alpha, double beta) { + void cudak_(cuda_update_select_rows_by_rowidx)(Matrix *c, const Matrix *a, const Matrix *idx, double alpha, double beta) { dim3 threadsPerBlock(CUDA_THREADS_N, CUDA_THREADS_N); dim3 numBlocks(CEIL_DIV(a->ncol, threadsPerBlock.x), CEIL_DIV(a->nrow, threadsPerBlock.y)); - cudak_(update_select_rows)<<<numBlocks, threadsPerBlock>>> \ + cudak_(update_select_rows_by_rowidx)<<<numBlocks, threadsPerBlock>>> \ (MATRIX_ELEM_PTR(c), MATRIX_ELEM_PTR(a), MATRIX_ELEM_PTR(idx), - a->nrow, a->ncol, c->stride / sizeof(MATRIX_ELEM), + a->nrow, a->ncol, c->nrow, c->stride / sizeof(MATRIX_ELEM), a->stride / sizeof(MATRIX_ELEM), alpha, beta); cudaStreamSynchronize(0); } + void cudak_(cuda_update_select_rows_by_colidx)(Matrix *c, const Matrix *a, const Matrix *idx, double alpha, double beta) { + dim3 threadsPerBlock(CUDA_THREADS_N, CUDA_THREADS_N); + dim3 numBlocks(CEIL_DIV(a->ncol, threadsPerBlock.x), + CEIL_DIV(a->nrow, threadsPerBlock.y)); + cudak_(update_select_rows_by_colidx)<<<numBlocks, threadsPerBlock>>> \ + (MATRIX_ELEM_PTR(c), MATRIX_ELEM_PTR(a), MATRIX_ELEM_PTR(idx), + a->nrow, a->ncol, c->nrow, c->stride / sizeof(MATRIX_ELEM), + a->stride / sizeof(MATRIX_ELEM), idx->stride / sizeof(MATRIX_ELEM), alpha, beta); + cudaStreamSynchronize(0); + } #endif void cudak_(cuda_expand_frm)(const Matrix *a, Matrix *b, int context) { @@ -710,7 +755,18 @@ extern "C" { cudak_(copy_rows_by_idx)<<<numBlocks, threadsPerBlock>>> \ (MATRIX_ELEM_PTR(a), MATRIX_ELEM_PTR(b), MATRIX_ELEM_PTR(idx) + idx_begin, - b->nrow, b->ncol, b->stride / sizeof(MATRIX_ELEM)); + b->nrow, b->ncol, a->nrow, b->stride / sizeof(MATRIX_ELEM)); + cudaStreamSynchronize(0); + } + + void cudak_(cuda_copy_rows_by_colidx)(const Matrix *a, Matrix *b, + const Matrix *idx, int idx_begin) { + dim3 threadsPerBlock(CUDA_THREADS_NN, 1); + dim3 numBlocks(CEIL_DIV(b->ncol, threadsPerBlock.x), b->nrow); + cudak_(copy_rows_by_colidx)<<<numBlocks, threadsPerBlock>>> \ + (MATRIX_ELEM_PTR(a), MATRIX_ELEM_PTR(b), + MATRIX_ELEM_PTR(idx) + idx_begin, + b->nrow, b->ncol, a->nrow, b->stride / sizeof(MATRIX_ELEM), idx->stride / sizeof(MATRIX_ELEM)); cudaStreamSynchronize(0); } } diff --git a/nerv/lib/matrix/generic/cumatrix.c b/nerv/lib/matrix/generic/cumatrix.c index 65e0788..7582725 100644 --- a/nerv/lib/matrix/generic/cumatrix.c +++ b/nerv/lib/matrix/generic/cumatrix.c @@ -349,7 +349,7 @@ void nerv_matrix_(copy_rows_fromh_by_idx)(Matrix *a, const Matrix *b, long nrow = a->nrow; if (!(0 <= b_begin && b_begin + nrow <= idx->ncol)) NERV_EXIT_STATUS(status, MAT_INVALID_COPY_INTERVAL, 0); - float *idx_ptr = idx->data.f; + float *idx_ptr = MATRIX_ELEM_PTR_F(idx); int i; if (idx->nrow != 1) NERV_EXIT_STATUS(status, MAT_IDX_VECTOR_EXP, 0); @@ -393,15 +393,45 @@ void nerv_matrix_(copy_rows_fromd_by_idx)(Matrix *a, const Matrix *b, NERV_SET_STATUS(status, NERV_NORMAL, 0); } +void nerv_matrix_(copy_rows_fromd_by_colidx)(Matrix *a, const Matrix *b, + const Matrix *idx, int b_begin, Status *status) { + long nrow = a->nrow; + if (!(0 <= b_begin && b_begin + nrow <= idx->nrow)) + NERV_EXIT_STATUS(status, MAT_INVALID_COPY_INTERVAL, 0); + if (idx->ncol != 1) + NERV_EXIT_STATUS(status, MAT_IDX_VECTOR_EXP, 0); + if (a->ncol != b->ncol) { + printf("%d %d\n", a->ncol, b->ncol); + NERV_EXIT_STATUS(status, MAT_MISMATCH_DIM, 0); + } + PROFILE_START + cudak_(cuda_copy_rows_by_colidx)(b, a, idx, b_begin); + PROFILE_STOP + NERV_SET_STATUS(status, NERV_NORMAL, 0); +} + + #ifdef __NERV_FUTURE_CUDA_7 -void nerv_matrix_(update_select_rows)(Matrix *c, const Matrix *a, const Matrix *idx, double alpha, double beta, Status *status) { +void nerv_matrix_(update_select_rows_by_rowidx)(Matrix *c, const Matrix *a, const Matrix *idx, double alpha, double beta, Status *status) { long nrow = a->nrow; - if (idx->nrow != 1) + if (idx->nrow != 1 || idx->ncol != a->nrow) + NERV_EXIT_STATUS(status, MAT_IDX_VECTOR_EXP, 0); + if (a->ncol != c->ncol) + NERV_EXIT_STATUS(status, MAT_MISMATCH_DIM, 0); + PROFILE_START + cudak_(cuda_update_select_rows_by_rowidx)(c, a, idx, alpha, beta); + PROFILE_STOP + NERV_SET_STATUS(status, NERV_NORMAL, 0); +} + +void nerv_matrix_(update_select_rows_by_colidx)(Matrix *c, const Matrix *a, const Matrix *idx, double alpha, double beta, Status *status) { + long nrow = a->nrow; + if (idx->ncol != 1 || idx->nrow != a->nrow) NERV_EXIT_STATUS(status, MAT_IDX_VECTOR_EXP, 0); if (a->ncol != c->ncol) NERV_EXIT_STATUS(status, MAT_MISMATCH_DIM, 0); PROFILE_START - cudak_(cuda_update_select_rows)(c, a, idx, alpha, beta); + cudak_(cuda_update_select_rows_by_colidx)(c, a, idx, alpha, beta); PROFILE_STOP NERV_SET_STATUS(status, NERV_NORMAL, 0); } diff --git a/nerv/lib/matrix/generic/cumatrix.h b/nerv/lib/matrix/generic/cumatrix.h index aa8805a..e82dccd 100644 --- a/nerv/lib/matrix/generic/cumatrix.h +++ b/nerv/lib/matrix/generic/cumatrix.h @@ -45,7 +45,13 @@ void nerv_matrix_(copy_rows_fromh_by_idx)(Matrix *a, const Matrix *b, const Matrix *idx, int b_begin, Status *status); void nerv_matrix_(copy_rows_fromd_by_idx)(Matrix *a, const Matrix *b, const Matrix *idx, int b_begin, Status *status); -void nerv_matrix_(update_select_rows)(Matrix *c, const Matrix *a, const Matrix *idx, double alpha, double beta, Status *status); +void nerv_matrix_(copy_rows_fromd_by_colidx)(Matrix *a, const Matrix *b, + const Matrix *idx, int b_begin, Status *status); + +#ifdef __NERV_FUTURE_CUDA_7 +void nerv_matrix_(update_select_rows_by_rowidx)(Matrix *c, const Matrix *a, const Matrix *idx, double alpha, double beta, Status *status); +void nerv_matrix_(update_select_rows_by_colidx)(Matrix *c, const Matrix *a, const Matrix *idx, double alpha, double beta, Status *status); +#endif void nerv_matrix_(expand_frm)(Matrix *a, const Matrix *b, int context, Status *status); diff --git a/nerv/lib/matrix/generic/elem_type.h b/nerv/lib/matrix/generic/elem_type.h index bffe940..07f6355 100644 --- a/nerv/lib/matrix/generic/elem_type.h +++ b/nerv/lib/matrix/generic/elem_type.h @@ -1,22 +1,29 @@ +#define MATRIX_ELEM_PTR_F(self) ((float *)((char *)((self)->data.f) + (self)->offset)) +#define MATRIX_ELEM_PTR_D(self) ((double *)((char *)((self)->data.d) + (self)->offset)) +#define MATRIX_ELEM_PTR_I(self) ((long *)((char *)((self)->data.i) + (self)->offset)) + #ifdef MATRIX_USE_FLOAT #define MATRIX_ELEM float #define MATRIX_ELEM_FMT "%f" #define MATRIX_ELEM_WRITE_FMT "%.8f" -#define MATRIX_ELEM_PTR(self) ((self)->data.f) +#define MATRIX_ELEM_PTR(self) MATRIX_ELEM_PTR_F(self) +#define MATRIX_ELEM_PTR_BASE(self) ((self)->data.f) #elif defined(MATRIX_USE_DOUBLE) #define MATRIX_ELEM double #define MATRIX_ELEM_FMT "%lf" #define MATRIX_ELEM_WRITE_FMT "%.8lf" -#define MATRIX_ELEM_PTR(self) ((self)->data.d) +#define MATRIX_ELEM_PTR(self) MATRIX_ELEM_PTR_D(self) +#define MATRIX_ELEM_PTR_BASE(self) ((self)->data.d) #elif defined(MATRIX_USE_INT) #define MATRIX_ELEM long #define MATRIX_ELEM_FMT "%ld" #define MATRIX_ELEM_WRITE_FMT "%ld" -#define MATRIX_ELEM_PTR(self) ((self)->data.i) +#define MATRIX_ELEM_PTR(self) MATRIX_ELEM_PTR_I(self) +#define MATRIX_ELEM_PTR_BASE(self) ((self)->data.i) #endif diff --git a/nerv/lib/matrix/generic/matrix.c b/nerv/lib/matrix/generic/matrix.c index fd5d28f..004d9aa 100644 --- a/nerv/lib/matrix/generic/matrix.c +++ b/nerv/lib/matrix/generic/matrix.c @@ -4,12 +4,11 @@ /* FIXME: malloc failure detection */ void nerv_matrix_(data_free)(Matrix *self, Status *status) { - if(*self->data_ref == 0) return; /* FIXME: repeat free memory */ assert(*self->data_ref > 0); if (--(*self->data_ref) == 0) { /* free matrix data */ - MATRIX_DATA_FREE(MATRIX_ELEM_PTR(self), status); + MATRIX_DATA_FREE(MATRIX_ELEM_PTR_BASE(self), status); curandDestroyGenerator(*(self->curand_gen)); free(self->curand_gen); free(self->data_ref); @@ -31,7 +30,7 @@ Matrix *nerv_matrix_(create)(long nrow, long ncol, Status *status) { self->ncol = ncol; self->nmax = self->nrow * self->ncol; self->dim = 2; - MATRIX_DATA_ALLOC(&MATRIX_ELEM_PTR(self), &self->stride, + MATRIX_DATA_ALLOC(&MATRIX_ELEM_PTR_BASE(self), &self->stride, sizeof(MATRIX_ELEM) * self->ncol, self->nrow, status); if (status->err_code != NERV_NORMAL) @@ -46,6 +45,7 @@ Matrix *nerv_matrix_(create)(long nrow, long ncol, Status *status) { curandCreateGenerator(self->curand_gen, CURAND_RNG_PSEUDO_DEFAULT); curandSetPseudoRandomGeneratorSeed(*(self->curand_gen), time(NULL)); + self->offset = 0; nerv_matrix_(data_retain)(self); NERV_SET_STATUS(status, NERV_NORMAL, 0); return self; @@ -62,9 +62,10 @@ Matrix *nerv_matrix_(getrow)(Matrix *self, int row) { prow->dim = 1; prow->stride = self->stride; prow->nmax = prow->ncol; - MATRIX_ELEM_PTR(prow) = MATRIX_ROW_PTR(self, row); + prow->data = self->data; prow->data_ref = self->data_ref; prow->curand_gen = self->curand_gen; + prow->offset = row * self->stride; nerv_matrix_(data_retain)(prow); return prow; } diff --git a/nerv/lib/matrix/matrix.h b/nerv/lib/matrix/matrix.h index 5a85c08..a28fd97 100644 --- a/nerv/lib/matrix/matrix.h +++ b/nerv/lib/matrix/matrix.h @@ -13,6 +13,7 @@ typedef struct Matrix { double *d; long *i; } data; /* pointer to actual storage */ + unsigned long offset; /* the actual beginning of the matrix */ long *data_ref; curandGenerator_t *curand_gen; } Matrix; diff --git a/nerv/lib/matrix/mmatrix.c b/nerv/lib/matrix/mmatrix.c index b8157eb..b5670f2 100644 --- a/nerv/lib/matrix/mmatrix.c +++ b/nerv/lib/matrix/mmatrix.c @@ -6,6 +6,7 @@ #define host_matrix_(NAME) host_matrix_float_##NAME #define nerv_matrix_(NAME) nerv_matrix_host_float_##NAME #include "generic/matrix.h" +#include "generic/elem_type.h" #include "generic/mmatrix.c" Matrix *nerv_matrix_(perm_gen)(int ncol, Status *status) { @@ -13,7 +14,7 @@ Matrix *nerv_matrix_(perm_gen)(int ncol, Status *status) { Matrix *self = nerv_matrix_(create)(1, ncol, status); if (status->err_code != NERV_NORMAL) return NULL; - float *prow = self->data.f; + float *prow = MATRIX_ELEM_PTR_F(self); for (i = 0; i < ncol; i++) prow[i] = i; for (i = ncol - 1; i >= 0; i--) @@ -31,6 +32,7 @@ Matrix *nerv_matrix_(perm_gen)(int ncol, Status *status) { #undef MATRIX_USE_FLOAT #undef MATRIX_ELEM #undef MATRIX_ELEM_PTR +#undef MATRIX_ELEM_PTR_BASE #undef MATRIX_ELEM_FMT #undef MATRIX_ELEM_WRITE_FMT @@ -44,6 +46,7 @@ Matrix *nerv_matrix_(perm_gen)(int ncol, Status *status) { #undef MATRIX_USE_DOUBLE #undef MATRIX_ELEM #undef MATRIX_ELEM_PTR +#undef MATRIX_ELEM_PTR_BASE #undef MATRIX_ELEM_FMT #undef MATRIX_ELEM_WRITE_FMT |