diff options
Diffstat (limited to 'matrix/cukernel.cu')
-rw-r--r-- | matrix/cukernel.cu | 119 |
1 files changed, 113 insertions, 6 deletions
diff --git a/matrix/cukernel.cu b/matrix/cukernel.cu index d6d7997..dd1ebfc 100644 --- a/matrix/cukernel.cu +++ b/matrix/cukernel.cu @@ -1,6 +1,6 @@ #include <assert.h> -#include "generic/matrix.h" #include <stdio.h> +#include "generic/matrix.h" #include "cuda.h" #define CUDA_THREADS_N 16 #define CUDA_THREADS_NN (16 * 16) @@ -15,7 +15,18 @@ __global__ void sigmoid(const float *a, float *b, b[idx] = 1.0 / (1.0 + exp(-a[idx])); } -__global__ void block_sum(const float *input, float *output, +__global__ void softmax_final(const float *a, float *b, + const float *max, const float *deno, + int nrow, int ncol, int stride, int mstride) { + int j = blockIdx.x * blockDim.x + threadIdx.x; + int i = blockIdx.y * blockDim.y + threadIdx.y; + long idx; + if (i >= nrow || j >= ncol) return; + idx = j + i * stride; + b[idx] = exp(a[idx] - max[0 + i * mstride]) / deno[0 + i * mstride]; +} + +__global__ void block_reduce_sum(const float *input, float *output, const int istride, const int ostride, const int n) { extern __shared__ float arr[]; @@ -29,10 +40,47 @@ __global__ void block_sum(const float *input, float *output, __syncthreads(); } if (threadIdx.x == 0) + output[blockIdx.x + ostride * blockIdx.y] = arr[0]; +} + +__global__ void block_reduce_softmax_sum(const float *input, float *output, + const float *max, + const int istride, const int ostride, + const int mstride, const int n) { + extern __shared__ float arr[]; + int j = blockIdx.x * blockDim.x + threadIdx.x; + arr[threadIdx.x] = j < n ? exp(input[j + istride * blockIdx.y] - \ + max[0 + mstride * blockIdx.y]) : 0; + __syncthreads(); + for (int offset = blockDim.x >> 1; offset; offset >>= 1) { - /* printf("bx: %d by: %d arr: %f\n", blockIdx.x, blockIdx.y, arr[0]); */ + if (threadIdx.x < offset) + arr[threadIdx.x] += arr[threadIdx.x + offset]; + __syncthreads(); + } + if (threadIdx.x == 0) output[blockIdx.x + ostride * blockIdx.y] = arr[0]; +} + +__global__ void block_reduce_max(const float *input, float *output, + const int istride, const int ostride, + const int n) { + extern __shared__ float arr[]; + int j = blockIdx.x * blockDim.x + threadIdx.x; + arr[threadIdx.x] = j < n ? input[j + istride * blockIdx.y] : 0; + __syncthreads(); + for (int offset = blockDim.x >> 1; offset; offset >>= 1) + { + if (threadIdx.x < offset) + { + float l = arr[threadIdx.x], + r = arr[threadIdx.x + offset]; + if (r > l) arr[threadIdx.x] = r; + } + __syncthreads(); } + if (threadIdx.x == 0) + output[blockIdx.x + ostride * blockIdx.y] = arr[0]; } extern "C" { @@ -45,7 +93,66 @@ extern "C" { b->stride / sizeof(float)); } - void cuda_rowsum(const Matrix *a, Matrix *b) { + void cuda_colsum(const Matrix *a, Matrix *b) { + dim3 block(CUDA_THREADS_NN, 1); + int ncol = a->ncol; + int blocks_per_row = CEIL_DIV(ncol, block.x); + dim3 grid(blocks_per_row, a->nrow); + float *res; + size_t stride; + cudaMallocPitch(&res, &stride, blocks_per_row * sizeof(float), a->nrow); + block_reduce_sum<<<grid, block, block.x * sizeof(float)>>> \ + (a->data.f, res, + a->stride / sizeof(float), stride / sizeof(float), + ncol); + ncol = blocks_per_row; + assert(ncol <= block.x); + grid.x = 1; + block_reduce_sum<<<grid, block, block.x * sizeof(float)>>> \ + (res, b->data.f, + stride / sizeof(float), b->stride / sizeof(float), + ncol); + cudaFree(res); + } + + void cuda_softmax_final(const Matrix *a, const Matrix *max, + const Matrix *deno, Matrix *b) { + dim3 threadsPerBlock(CUDA_THREADS_N, + CUDA_THREADS_N); + dim3 numBlocks(CEIL_DIV(b->ncol, threadsPerBlock.x), + CEIL_DIV(b->nrow, threadsPerBlock.y)); + softmax_final<<<numBlocks, threadsPerBlock>>>(a->data.f, b->data.f, + max->data.f, deno->data.f, + b->nrow, b->ncol, + b->stride / sizeof(float), + max->stride / sizeof(float)); + } + + void cuda_softmax_denominator(const Matrix *a, const Matrix *max, Matrix *b) { + dim3 block(CUDA_THREADS_NN, 1); + int ncol = a->ncol; + int blocks_per_row = CEIL_DIV(ncol, block.x); + dim3 grid(blocks_per_row, a->nrow); + float *res; + size_t stride; + assert(max->ncol == 1); + cudaMallocPitch(&res, &stride, blocks_per_row * sizeof(float), a->nrow); + block_reduce_softmax_sum<<<grid, block, block.x * sizeof(float)>>> \ + (a->data.f, res, max->data.f, + a->stride / sizeof(float), stride / sizeof(float), + max->stride / sizeof(float), + ncol); + ncol = blocks_per_row; + assert(ncol <= block.x); + grid.x = 1; + block_reduce_sum<<<grid, block, block.x * sizeof(float)>>> \ + (res, b->data.f, + stride / sizeof(float), b->stride / sizeof(float), + ncol); + cudaFree(res); + } + + void cuda_colmax(const Matrix *a, Matrix *b) { dim3 block(CUDA_THREADS_NN, 1); int ncol = a->ncol; int blocks_per_row = CEIL_DIV(ncol, block.x); @@ -53,14 +160,14 @@ extern "C" { float *res; size_t stride; cudaMallocPitch(&res, &stride, blocks_per_row * sizeof(float), a->nrow); - block_sum<<<grid, block, block.x * sizeof(float)>>> \ + block_reduce_max<<<grid, block, block.x * sizeof(float)>>> \ (a->data.f, res, a->stride / sizeof(float), stride / sizeof(float), ncol); ncol = blocks_per_row; assert(ncol <= block.x); grid.x = 1; - block_sum<<<grid, block, block.x * sizeof(float)>>> \ + block_reduce_max<<<grid, block, block.x * sizeof(float)>>> \ (res, b->data.f, stride / sizeof(float), b->stride / sizeof(float), ncol); |