aboutsummaryrefslogtreecommitdiff
path: root/matrix/cukernel.cu
diff options
context:
space:
mode:
Diffstat (limited to 'matrix/cukernel.cu')
-rw-r--r--matrix/cukernel.cu196
1 files changed, 19 insertions, 177 deletions
diff --git a/matrix/cukernel.cu b/matrix/cukernel.cu
index ee6d871..1f97b41 100644
--- a/matrix/cukernel.cu
+++ b/matrix/cukernel.cu
@@ -1,177 +1,19 @@
-#include <assert.h>
-#include <stdio.h>
-#include "generic/matrix.h"
-#include "cuda.h"
-#define CUDA_THREADS_N 16
-#define CUDA_THREADS_NN (16 * 16)
-#define CEIL_DIV(a, b) (((a) + (b) - 1) / (b))
-__global__ void sigmoid(const float *a, float *b,
- int nrow, int ncol, int stride) {
- int j = blockIdx.x * blockDim.x + threadIdx.x;
- int i = blockIdx.y * blockDim.y + threadIdx.y;
- long idx;
- if (i >= nrow || j >= ncol) return;
- idx = j + i * stride;
- b[idx] = 1.0 / (1.0 + exp(-a[idx]));
-}
-
-__global__ void softmax_final(const float *a, float *b,
- const float *max, const float *deno,
- int nrow, int ncol, int stride, int mstride) {
- int j = blockIdx.x * blockDim.x + threadIdx.x;
- int i = blockIdx.y * blockDim.y + threadIdx.y;
- long idx;
- if (i >= nrow || j >= ncol) return;
- idx = j + i * stride;
- b[idx] = exp(a[idx] - max[0 + i * mstride]) / deno[0 + i * mstride];
-}
-
-__global__ void block_reduce_sum(const float *input, float *output,
- const int istride, const int ostride,
- const int n) {
- extern __shared__ float arr[];
- int j = blockIdx.x * blockDim.x + threadIdx.x;
- arr[threadIdx.x] = j < n ? input[j + istride * blockIdx.y] : 0;
- __syncthreads();
- for (int offset = blockDim.x >> 1; offset; offset >>= 1)
- {
- if (threadIdx.x < offset)
- arr[threadIdx.x] += arr[threadIdx.x + offset];
- __syncthreads();
- }
- if (threadIdx.x == 0)
- output[blockIdx.x + ostride * blockIdx.y] = arr[0];
-}
-
-__global__ void block_reduce_softmax_sum(const float *input, float *output,
- const float *max,
- const int istride, const int ostride,
- const int mstride, const int n) {
- extern __shared__ float arr[];
- int j = blockIdx.x * blockDim.x + threadIdx.x;
- arr[threadIdx.x] = j < n ? exp(input[j + istride * blockIdx.y] - \
- max[0 + mstride * blockIdx.y]) : 0;
- __syncthreads();
- for (int offset = blockDim.x >> 1; offset; offset >>= 1)
- {
- if (threadIdx.x < offset)
- arr[threadIdx.x] += arr[threadIdx.x + offset];
- __syncthreads();
- }
- if (threadIdx.x == 0)
- output[blockIdx.x + ostride * blockIdx.y] = arr[0];
-}
-
-__global__ void block_reduce_max(const float *input, float *output,
- const int istride, const int ostride,
- const int n) {
- extern __shared__ float arr[];
- int j = blockIdx.x * blockDim.x + threadIdx.x;
- arr[threadIdx.x] = j < n ? input[j + istride * blockIdx.y] : 0;
- __syncthreads();
- for (int offset = blockDim.x >> 1; offset; offset >>= 1)
- {
- if (threadIdx.x < offset)
- {
- float l = arr[threadIdx.x],
- r = arr[threadIdx.x + offset];
- if (r > l) arr[threadIdx.x] = r;
- }
- __syncthreads();
- }
- if (threadIdx.x == 0)
- output[blockIdx.x + ostride * blockIdx.y] = arr[0];
-}
-
-extern "C" {
-#include "cukernel.h"
- void cuda_sigmoid(const Matrix *a, Matrix *b) {
- dim3 threadsPerBlock(CUDA_THREADS_N,
- CUDA_THREADS_N);
- dim3 numBlocks(CEIL_DIV(b->ncol, threadsPerBlock.x),
- CEIL_DIV(b->nrow, threadsPerBlock.y));
- sigmoid<<<numBlocks, threadsPerBlock>>>(a->data.f, b->data.f, b->nrow, b->ncol,
- b->stride / sizeof(float));
- }
-
- void cuda_colsum(const Matrix *a, Matrix *b) {
- dim3 block(CUDA_THREADS_NN, 1);
- int ncol = a->ncol;
- int blocks_per_row = CEIL_DIV(ncol, block.x);
- dim3 grid(blocks_per_row, a->nrow);
- float *res;
- size_t stride;
- cudaMallocPitch(&res, &stride, blocks_per_row * sizeof(float), a->nrow);
- block_reduce_sum<<<grid, block, block.x * sizeof(float)>>> \
- (a->data.f, res,
- a->stride / sizeof(float), stride / sizeof(float),
- ncol);
- ncol = blocks_per_row;
- assert((unsigned long)ncol <= block.x);
- grid.x = 1;
- block_reduce_sum<<<grid, block, block.x * sizeof(float)>>> \
- (res, b->data.f,
- stride / sizeof(float), b->stride / sizeof(float),
- ncol);
- cudaFree(res);
- }
-
- void cuda_softmax_final(const Matrix *a, const Matrix *max,
- const Matrix *deno, Matrix *b) {
- dim3 threadsPerBlock(CUDA_THREADS_N,
- CUDA_THREADS_N);
- dim3 numBlocks(CEIL_DIV(b->ncol, threadsPerBlock.x),
- CEIL_DIV(b->nrow, threadsPerBlock.y));
- softmax_final<<<numBlocks, threadsPerBlock>>>(a->data.f, b->data.f,
- max->data.f, deno->data.f,
- b->nrow, b->ncol,
- b->stride / sizeof(float),
- max->stride / sizeof(float));
- }
-
- void cuda_softmax_denominator(const Matrix *a, const Matrix *max, Matrix *b) {
- dim3 block(CUDA_THREADS_NN, 1);
- int ncol = a->ncol;
- int blocks_per_row = CEIL_DIV(ncol, block.x);
- dim3 grid(blocks_per_row, a->nrow);
- float *res;
- size_t stride;
- assert(max->ncol == 1);
- cudaMallocPitch(&res, &stride, blocks_per_row * sizeof(float), a->nrow);
- block_reduce_softmax_sum<<<grid, block, block.x * sizeof(float)>>> \
- (a->data.f, res, max->data.f,
- a->stride / sizeof(float), stride / sizeof(float),
- max->stride / sizeof(float),
- ncol);
- ncol = blocks_per_row;
- assert((unsigned long)ncol <= block.x);
- grid.x = 1;
- block_reduce_sum<<<grid, block, block.x * sizeof(float)>>> \
- (res, b->data.f,
- stride / sizeof(float), b->stride / sizeof(float),
- ncol);
- cudaFree(res);
- }
-
- void cuda_colmax(const Matrix *a, Matrix *b) {
- dim3 block(CUDA_THREADS_NN, 1);
- int ncol = a->ncol;
- int blocks_per_row = CEIL_DIV(ncol, block.x);
- dim3 grid(blocks_per_row, a->nrow);
- float *res;
- size_t stride;
- cudaMallocPitch(&res, &stride, blocks_per_row * sizeof(float), a->nrow);
- block_reduce_max<<<grid, block, block.x * sizeof(float)>>> \
- (a->data.f, res,
- a->stride / sizeof(float), stride / sizeof(float),
- ncol);
- ncol = blocks_per_row;
- assert((unsigned long)ncol <= block.x);
- grid.x = 1;
- block_reduce_max<<<grid, block, block.x * sizeof(float)>>> \
- (res, b->data.f,
- stride / sizeof(float), b->stride / sizeof(float),
- ncol);
- cudaFree(res);
- }
-}
+#define NERV_GENERIC_CUKERNEL
+
+#define cudak_(NAME) cudak_float_ ## NAME
+#define MATRIX_USE_FLOAT
+#include "generic/elem_type.h"
+#include "generic/cukernel.cu"
+#undef cudak_
+#undef MATRIX_USE_FLOAT
+#undef MATRIX_ELEM
+#undef MATRIX_ELEM_PTR
+
+#define cudak_(NAME) cudak_double_ ## NAME
+#define MATRIX_USE_DOUBLE
+#include "generic/elem_type.h"
+#include "generic/cukernel.cu"
+#undef cudak_
+#undef MATRIX_USE_DOUBLE
+#undef MATRIX_ELEM
+#undef MATRIX_ELEM_PTR