aboutsummaryrefslogtreecommitdiff
path: root/matrix
diff options
context:
space:
mode:
Diffstat (limited to 'matrix')
-rw-r--r--matrix/cukernel.cu196
-rw-r--r--matrix/cukernel.h13
-rw-r--r--matrix/cumatrix.c163
-rw-r--r--matrix/generic/cukernel.cu184
-rw-r--r--matrix/generic/cumatrix.c143
-rw-r--r--matrix/generic/elem_type.h11
-rw-r--r--matrix/generic/matrix.c83
-rw-r--r--matrix/generic/matrix.h1
-rw-r--r--matrix/generic/mmatrix.c (renamed from matrix/matrix.c)25
-rw-r--r--matrix/init.c13
-rw-r--r--matrix/init.lua22
-rw-r--r--matrix/mmatrix.c5
12 files changed, 462 insertions, 397 deletions
diff --git a/matrix/cukernel.cu b/matrix/cukernel.cu
index ee6d871..1f97b41 100644
--- a/matrix/cukernel.cu
+++ b/matrix/cukernel.cu
@@ -1,177 +1,19 @@
-#include <assert.h>
-#include <stdio.h>
-#include "generic/matrix.h"
-#include "cuda.h"
-#define CUDA_THREADS_N 16
-#define CUDA_THREADS_NN (16 * 16)
-#define CEIL_DIV(a, b) (((a) + (b) - 1) / (b))
-__global__ void sigmoid(const float *a, float *b,
- int nrow, int ncol, int stride) {
- int j = blockIdx.x * blockDim.x + threadIdx.x;
- int i = blockIdx.y * blockDim.y + threadIdx.y;
- long idx;
- if (i >= nrow || j >= ncol) return;
- idx = j + i * stride;
- b[idx] = 1.0 / (1.0 + exp(-a[idx]));
-}
-
-__global__ void softmax_final(const float *a, float *b,
- const float *max, const float *deno,
- int nrow, int ncol, int stride, int mstride) {
- int j = blockIdx.x * blockDim.x + threadIdx.x;
- int i = blockIdx.y * blockDim.y + threadIdx.y;
- long idx;
- if (i >= nrow || j >= ncol) return;
- idx = j + i * stride;
- b[idx] = exp(a[idx] - max[0 + i * mstride]) / deno[0 + i * mstride];
-}
-
-__global__ void block_reduce_sum(const float *input, float *output,
- const int istride, const int ostride,
- const int n) {
- extern __shared__ float arr[];
- int j = blockIdx.x * blockDim.x + threadIdx.x;
- arr[threadIdx.x] = j < n ? input[j + istride * blockIdx.y] : 0;
- __syncthreads();
- for (int offset = blockDim.x >> 1; offset; offset >>= 1)
- {
- if (threadIdx.x < offset)
- arr[threadIdx.x] += arr[threadIdx.x + offset];
- __syncthreads();
- }
- if (threadIdx.x == 0)
- output[blockIdx.x + ostride * blockIdx.y] = arr[0];
-}
-
-__global__ void block_reduce_softmax_sum(const float *input, float *output,
- const float *max,
- const int istride, const int ostride,
- const int mstride, const int n) {
- extern __shared__ float arr[];
- int j = blockIdx.x * blockDim.x + threadIdx.x;
- arr[threadIdx.x] = j < n ? exp(input[j + istride * blockIdx.y] - \
- max[0 + mstride * blockIdx.y]) : 0;
- __syncthreads();
- for (int offset = blockDim.x >> 1; offset; offset >>= 1)
- {
- if (threadIdx.x < offset)
- arr[threadIdx.x] += arr[threadIdx.x + offset];
- __syncthreads();
- }
- if (threadIdx.x == 0)
- output[blockIdx.x + ostride * blockIdx.y] = arr[0];
-}
-
-__global__ void block_reduce_max(const float *input, float *output,
- const int istride, const int ostride,
- const int n) {
- extern __shared__ float arr[];
- int j = blockIdx.x * blockDim.x + threadIdx.x;
- arr[threadIdx.x] = j < n ? input[j + istride * blockIdx.y] : 0;
- __syncthreads();
- for (int offset = blockDim.x >> 1; offset; offset >>= 1)
- {
- if (threadIdx.x < offset)
- {
- float l = arr[threadIdx.x],
- r = arr[threadIdx.x + offset];
- if (r > l) arr[threadIdx.x] = r;
- }
- __syncthreads();
- }
- if (threadIdx.x == 0)
- output[blockIdx.x + ostride * blockIdx.y] = arr[0];
-}
-
-extern "C" {
-#include "cukernel.h"
- void cuda_sigmoid(const Matrix *a, Matrix *b) {
- dim3 threadsPerBlock(CUDA_THREADS_N,
- CUDA_THREADS_N);
- dim3 numBlocks(CEIL_DIV(b->ncol, threadsPerBlock.x),
- CEIL_DIV(b->nrow, threadsPerBlock.y));
- sigmoid<<<numBlocks, threadsPerBlock>>>(a->data.f, b->data.f, b->nrow, b->ncol,
- b->stride / sizeof(float));
- }
-
- void cuda_colsum(const Matrix *a, Matrix *b) {
- dim3 block(CUDA_THREADS_NN, 1);
- int ncol = a->ncol;
- int blocks_per_row = CEIL_DIV(ncol, block.x);
- dim3 grid(blocks_per_row, a->nrow);
- float *res;
- size_t stride;
- cudaMallocPitch(&res, &stride, blocks_per_row * sizeof(float), a->nrow);
- block_reduce_sum<<<grid, block, block.x * sizeof(float)>>> \
- (a->data.f, res,
- a->stride / sizeof(float), stride / sizeof(float),
- ncol);
- ncol = blocks_per_row;
- assert((unsigned long)ncol <= block.x);
- grid.x = 1;
- block_reduce_sum<<<grid, block, block.x * sizeof(float)>>> \
- (res, b->data.f,
- stride / sizeof(float), b->stride / sizeof(float),
- ncol);
- cudaFree(res);
- }
-
- void cuda_softmax_final(const Matrix *a, const Matrix *max,
- const Matrix *deno, Matrix *b) {
- dim3 threadsPerBlock(CUDA_THREADS_N,
- CUDA_THREADS_N);
- dim3 numBlocks(CEIL_DIV(b->ncol, threadsPerBlock.x),
- CEIL_DIV(b->nrow, threadsPerBlock.y));
- softmax_final<<<numBlocks, threadsPerBlock>>>(a->data.f, b->data.f,
- max->data.f, deno->data.f,
- b->nrow, b->ncol,
- b->stride / sizeof(float),
- max->stride / sizeof(float));
- }
-
- void cuda_softmax_denominator(const Matrix *a, const Matrix *max, Matrix *b) {
- dim3 block(CUDA_THREADS_NN, 1);
- int ncol = a->ncol;
- int blocks_per_row = CEIL_DIV(ncol, block.x);
- dim3 grid(blocks_per_row, a->nrow);
- float *res;
- size_t stride;
- assert(max->ncol == 1);
- cudaMallocPitch(&res, &stride, blocks_per_row * sizeof(float), a->nrow);
- block_reduce_softmax_sum<<<grid, block, block.x * sizeof(float)>>> \
- (a->data.f, res, max->data.f,
- a->stride / sizeof(float), stride / sizeof(float),
- max->stride / sizeof(float),
- ncol);
- ncol = blocks_per_row;
- assert((unsigned long)ncol <= block.x);
- grid.x = 1;
- block_reduce_sum<<<grid, block, block.x * sizeof(float)>>> \
- (res, b->data.f,
- stride / sizeof(float), b->stride / sizeof(float),
- ncol);
- cudaFree(res);
- }
-
- void cuda_colmax(const Matrix *a, Matrix *b) {
- dim3 block(CUDA_THREADS_NN, 1);
- int ncol = a->ncol;
- int blocks_per_row = CEIL_DIV(ncol, block.x);
- dim3 grid(blocks_per_row, a->nrow);
- float *res;
- size_t stride;
- cudaMallocPitch(&res, &stride, blocks_per_row * sizeof(float), a->nrow);
- block_reduce_max<<<grid, block, block.x * sizeof(float)>>> \
- (a->data.f, res,
- a->stride / sizeof(float), stride / sizeof(float),
- ncol);
- ncol = blocks_per_row;
- assert((unsigned long)ncol <= block.x);
- grid.x = 1;
- block_reduce_max<<<grid, block, block.x * sizeof(float)>>> \
- (res, b->data.f,
- stride / sizeof(float), b->stride / sizeof(float),
- ncol);
- cudaFree(res);
- }
-}
+#define NERV_GENERIC_CUKERNEL
+
+#define cudak_(NAME) cudak_float_ ## NAME
+#define MATRIX_USE_FLOAT
+#include "generic/elem_type.h"
+#include "generic/cukernel.cu"
+#undef cudak_
+#undef MATRIX_USE_FLOAT
+#undef MATRIX_ELEM
+#undef MATRIX_ELEM_PTR
+
+#define cudak_(NAME) cudak_double_ ## NAME
+#define MATRIX_USE_DOUBLE
+#include "generic/elem_type.h"
+#include "generic/cukernel.cu"
+#undef cudak_
+#undef MATRIX_USE_DOUBLE
+#undef MATRIX_ELEM
+#undef MATRIX_ELEM_PTR
diff --git a/matrix/cukernel.h b/matrix/cukernel.h
index 9c13558..ea81e5a 100644
--- a/matrix/cukernel.h
+++ b/matrix/cukernel.h
@@ -1,8 +1,7 @@
-#ifndef NERV_CUKERNEL_H
-#define NERV_CUKERNEL_H
-void cuda_sigmoid(const Matrix *a, Matrix *b);
-void cuda_colsum(const Matrix *a, Matrix *b);
-void cuda_colmax(const Matrix *a, Matrix *b);
-void cuda_softmax_denominator(const Matrix *a, const Matrix *max, Matrix *b);
-void cuda_softmax_final(const Matrix *a, const Matrix *max, const Matrix *deno, Matrix *b);
+#ifdef NERV_GENERIC_CUKERNEL
+void cudak_(cuda_sigmoid)(const Matrix *a, Matrix *b);
+void cudak_(cuda_colsum)(const Matrix *a, Matrix *b);
+void cudak_(cuda_colmax)(const Matrix *a, Matrix *b);
+void cudak_(cuda_softmax_denominator)(const Matrix *a, const Matrix *max, Matrix *b);
+void cudak_(cuda_softmax_final)(const Matrix *a, const Matrix *max, const Matrix *deno, Matrix *b);
#endif
diff --git a/matrix/cumatrix.c b/matrix/cumatrix.c
index aa10571..90a6703 100644
--- a/matrix/cumatrix.c
+++ b/matrix/cumatrix.c
@@ -1,139 +1,24 @@
-#define MATRIX_DATA_FREE(ptr) cuda_float_array_free(ptr)
-#define MATRIX_DATA_ALLOC(dptr, stride, width, height) cuda_float_array_alloc(dptr, stride, width, height)
-#define MATRIX_DATA_WRITE(data, idx, val) cuda_float_array_write(data, idx, val)
-#define MATRIX_DATA_READ(data, idx) cuda_float_array_read(data, idx)
-#define MATRIX_INIT(L) cuda_float_init(L)
-#define NERV_GENERIC_MATRIX
-#define nerv_float_matrix_(NAME) nerv_float_matrix_cuda_ ## NAME
-#include "../common.h"
-#include "generic/matrix.h"
-#include "cukernel.h"
-#include "cuda.h"
-#include "cuda_runtime.h"
-#include "driver_types.h"
-#include "cublas_v2.h"
-
-const char *nerv_float_matrix_(tname) = "nerv.FloatCuMatrix";
-static cublasHandle_t cublas_handle;
-
-Matrix *nerv_float_matrix_(new_)(long nrow, long ncol);
-static int nerv_float_matrix_(add)(lua_State *L) {
- Matrix *a = luaT_checkudata(L, 1, nerv_float_matrix_(tname));
- Matrix *b = luaT_checkudata(L, 2, nerv_float_matrix_(tname));
- Matrix *c;
- long nrow, ncol;
- if (!(a->nrow == b->nrow && a->ncol == b->ncol))
- nerv_error(L, "Matrices should be of the same dimension");
- nrow = a->nrow;
- ncol = a->ncol;
- c = nerv_float_matrix_(new_)(nrow, ncol);
- float alpha = 1.0f, beta = 1.0f;
- cublasSgeam(cublas_handle, CUBLAS_OP_N, CUBLAS_OP_N,
- ncol, nrow,
- &alpha,
- a->data.f, a->stride / sizeof(float),
- &beta,
- b->data.f, b->stride / sizeof(float),
- c->data.f, c->stride / sizeof(float));
- luaT_pushudata(L, c, nerv_float_matrix_(tname));
- return 1;
-}
-
-static int nerv_float_matrix_(mul)(lua_State *L) {
- Matrix *a = luaT_checkudata(L, 1, nerv_float_matrix_(tname));
- Matrix *b = luaT_checkudata(L, 2, nerv_float_matrix_(tname));
- Matrix *c;
- if (a->ncol != b->nrow)
- nerv_error(L, "Wrong dimension of multipliers");
- c = nerv_float_matrix_(new_)(a->nrow, b->ncol);
- float alpha = 1.0f, beta = 0.0f;
- cublasSgemm(cublas_handle, CUBLAS_OP_N, CUBLAS_OP_N,
- b->ncol, a->nrow, b->nrow,
- &alpha,
- b->data.f, b->stride / sizeof(float),
- a->data.f, a->stride / sizeof(float),
- &beta,
- c->data.f, c->stride / sizeof(float));
- luaT_pushudata(L, c, nerv_float_matrix_(tname));
- return 1;
-}
-
-static int nerv_float_matrix_(sigmoid)(lua_State *L) {
- Matrix *a = luaT_checkudata(L, 1, nerv_float_matrix_(tname));
- Matrix *b = nerv_float_matrix_(new_)(a->nrow, a->ncol);
- cuda_sigmoid(a, b);
- luaT_pushudata(L, b, nerv_float_matrix_(tname));
- return 1;
-}
-
-static int nerv_float_matrix_(softmax)(lua_State *L) {
- Matrix *a = luaT_checkudata(L, 1, nerv_float_matrix_(tname));
- Matrix *max = nerv_float_matrix_(new_)(a->nrow, 1);
- Matrix *dno = nerv_float_matrix_(new_)(a->nrow, 1);
- Matrix *b = nerv_float_matrix_(new_)(a->nrow, a->ncol);
- cuda_colmax(a, max);
- cuda_softmax_denominator(a, max, dno);
- cuda_softmax_final(a, max, dno, b);
- luaT_pushudata(L, b, nerv_float_matrix_(tname));
- return 1;
-}
-
-static int nerv_float_matrix_(colsum)(lua_State *L) {
- Matrix *a = luaT_checkudata(L, 1, nerv_float_matrix_(tname));
- Matrix *b = nerv_float_matrix_(new_)(a->nrow, 1);
- cuda_colsum(a, b);
- luaT_pushudata(L, b, nerv_float_matrix_(tname));
- return 1;
-}
-
-static int nerv_float_matrix_(colmax)(lua_State *L) {
- Matrix *a = luaT_checkudata(L, 1, nerv_float_matrix_(tname));
- Matrix *b = nerv_float_matrix_(new_)(a->nrow, 1);
- cuda_colmax(a, b);
- luaT_pushudata(L, b, nerv_float_matrix_(tname));
- return 1;
-}
-
-static const luaL_Reg nerv_float_matrix_(extra_methods)[] = {
- {"__add__", nerv_float_matrix_(add)},
- {"__mul__", nerv_float_matrix_(mul)},
- {"sigmoid", nerv_float_matrix_(sigmoid)},
- {"softmax", nerv_float_matrix_(softmax)},
- {"colsum", nerv_float_matrix_(colsum)},
- {"colmax", nerv_float_matrix_(colmax)},
- {NULL, NULL}
-};
-
-static void cuda_float_init(lua_State *L) {
- luaN_append_methods(L, nerv_float_matrix_(extra_methods));
- cublasCreate(&cublas_handle);
-}
-
-static void cuda_float_array_free(float *ptr) {
- cudaFree(ptr);
-}
-
-static void cuda_float_array_alloc(float **dptr, size_t *stride,
- long width, long height) {
- cudaMallocPitch((void **)dptr, stride, width, height);
-}
-
-static float cuda_float_array_read(float *data, int idx) {
- float res;
- cudaMemcpy(&res, data + idx, sizeof(float), cudaMemcpyDeviceToHost);
- return res;
-}
-
-static void cuda_float_array_write(float *data, int idx, float val) {
- cudaMemcpy(data + idx, &val, sizeof(float), cudaMemcpyHostToDevice);
-}
-
-int nerv_float_matrix_(get_elem)(lua_State *L) {
- return nerv_error_method_not_implemented(L);
-}
-
-int nerv_float_matrix_(set_elem)(lua_State *L) {
- return nerv_error_method_not_implemented(L);
-}
-
-#include "generic/matrix.c"
+#define NERV_GENERIC_CUMATRIX
+
+#define MATRIX_USE_FLOAT
+#define cuda_matrix_(NAME) cuda_matrix_float_ ## NAME
+#define nerv_matrix_(NAME) nerv_matrix_float_cuda_ ## NAME
+#define cudak_(NAME) cudak_float_ ## NAME
+#define NERV_CUBLAS_(NAME) cublasS##NAME
+const char *nerv_matrix_(tname) = "nerv.FloatCuMatrix";
+#include "generic/cumatrix.c"
+#undef NERV_CUBLAS_
+#undef cudak_
+#undef nerv_matrix_
+#undef cuda_matrix_
+#undef MATRIX_USE_FLOAT
+#undef MATRIX_ELEM
+#undef MATRIX_ELEM_PTR
+
+#define MATRIX_USE_DOUBLE
+#define cuda_matrix_(NAME) cuda_matrix_double_ ## NAME
+#define nerv_matrix_(NAME) nerv_matrix_double_cuda_ ## NAME
+#define cudak_(NAME) cudak_double_ ## NAME
+#define NERV_CUBLAS_(NAME) cublasD##NAME
+const char *nerv_matrix_(tname) = "nerv.DoubleCuMatrix";
+#include "generic/cumatrix.c"
diff --git a/matrix/generic/cukernel.cu b/matrix/generic/cukernel.cu
new file mode 100644
index 0000000..a37ccf4
--- /dev/null
+++ b/matrix/generic/cukernel.cu
@@ -0,0 +1,184 @@
+#ifdef NERV_GENERIC_CUKERNEL
+#include <assert.h>
+#include <stdio.h>
+#include "matrix.h"
+#include "cuda.h"
+#define CUDA_THREADS_N 16
+#define CUDA_THREADS_NN (16 * 16)
+#define CEIL_DIV(a, b) (((a) + (b) - 1) / (b))
+__global__ void cudak_(sigmoid)(const MATRIX_ELEM *a, MATRIX_ELEM *b,
+ int nrow, int ncol, int stride) {
+ int j = blockIdx.x * blockDim.x + threadIdx.x;
+ int i = blockIdx.y * blockDim.y + threadIdx.y;
+ long idx;
+ if (i >= nrow || j >= ncol) return;
+ idx = j + i * stride;
+ b[idx] = 1.0 / (1.0 + exp(-a[idx]));
+}
+
+__global__ void cudak_(softmax_final)(const MATRIX_ELEM *a, MATRIX_ELEM *b,
+ const MATRIX_ELEM *max, const MATRIX_ELEM *deno,
+ int nrow, int ncol, int stride, int mstride) {
+ int j = blockIdx.x * blockDim.x + threadIdx.x;
+ int i = blockIdx.y * blockDim.y + threadIdx.y;
+ long idx;
+ if (i >= nrow || j >= ncol) return;
+ idx = j + i * stride;
+ b[idx] = exp(a[idx] - max[0 + i * mstride]) / deno[0 + i * mstride];
+}
+
+__global__ void cudak_(block_reduce_sum)(const MATRIX_ELEM *input,
+ MATRIX_ELEM *output,
+ const int istride, const int ostride,
+ const int n) {
+ extern __shared__ MATRIX_ELEM cudak_(arr)[];
+ int j = blockIdx.x * blockDim.x + threadIdx.x;
+ cudak_(arr)[threadIdx.x] = j < n ? input[j + istride * blockIdx.y] : 0;
+ __syncthreads();
+ for (int offset = blockDim.x >> 1; offset; offset >>= 1)
+ {
+ if (threadIdx.x < offset)
+ cudak_(arr)[threadIdx.x] += cudak_(arr)[threadIdx.x + offset];
+ __syncthreads();
+ }
+ if (threadIdx.x == 0)
+ output[blockIdx.x + ostride * blockIdx.y] = cudak_(arr)[0];
+}
+
+__global__ void cudak_(block_reduce_softmax_sum)(const MATRIX_ELEM *input,
+ MATRIX_ELEM *output,
+ const MATRIX_ELEM *max,
+ const int istride, const int ostride,
+ const int mstride, const int n) {
+ extern __shared__ MATRIX_ELEM cudak_(arr)[];
+ int j = blockIdx.x * blockDim.x + threadIdx.x;
+ cudak_(arr)[threadIdx.x] = j < n ? exp(input[j + istride * blockIdx.y] - \
+ max[0 + mstride * blockIdx.y]) : 0;
+ __syncthreads();
+ for (int offset = blockDim.x >> 1; offset; offset >>= 1)
+ {
+ if (threadIdx.x < offset)
+ cudak_(arr)[threadIdx.x] += cudak_(arr)[threadIdx.x + offset];
+ __syncthreads();
+ }
+ if (threadIdx.x == 0)
+ output[blockIdx.x + ostride * blockIdx.y] = cudak_(arr)[0];
+}
+
+__global__ void cudak_(block_reduce_max)(const MATRIX_ELEM *input,
+ MATRIX_ELEM *output,
+ const int istride, const int ostride,
+ const int n) {
+ extern __shared__ MATRIX_ELEM cudak_(arr)[];
+ int j = blockIdx.x * blockDim.x + threadIdx.x;
+ cudak_(arr)[threadIdx.x] = j < n ? input[j + istride * blockIdx.y] : 0;
+ __syncthreads();
+ for (int offset = blockDim.x >> 1; offset; offset >>= 1)
+ {
+ if (threadIdx.x < offset)
+ {
+ MATRIX_ELEM l = cudak_(arr)[threadIdx.x],
+ r = cudak_(arr)[threadIdx.x + offset];
+ if (r > l) cudak_(arr)[threadIdx.x] = r;
+ }
+ __syncthreads();
+ }
+ if (threadIdx.x == 0)
+ output[blockIdx.x + ostride * blockIdx.y] = cudak_(arr)[0];
+}
+
+extern "C" {
+#include "../cukernel.h"
+ void cudak_(cuda_sigmoid)(const Matrix *a, Matrix *b) {
+ dim3 threadsPerBlock(CUDA_THREADS_N,
+ CUDA_THREADS_N);
+ dim3 numBlocks(CEIL_DIV(b->ncol, threadsPerBlock.x),
+ CEIL_DIV(b->nrow, threadsPerBlock.y));
+ cudak_(sigmoid)<<<numBlocks, threadsPerBlock>>> \
+ (MATRIX_ELEM_PTR(a), MATRIX_ELEM_PTR(b), b->nrow, b->ncol,
+ b->stride / sizeof(MATRIX_ELEM));
+ }
+
+ void cudak_(cuda_colsum)(const Matrix *a, Matrix *b) {
+ dim3 block(CUDA_THREADS_NN, 1);
+ int ncol = a->ncol;
+ int blocks_per_row = CEIL_DIV(ncol, block.x);
+ dim3 grid(blocks_per_row, a->nrow);
+ MATRIX_ELEM *res;
+ size_t stride;
+ cudaMallocPitch(&res, &stride, blocks_per_row * sizeof(MATRIX_ELEM), a->nrow);
+ cudak_(block_reduce_sum)<<<grid, block, block.x * sizeof(MATRIX_ELEM)>>> \
+ (MATRIX_ELEM_PTR(a), res,
+ a->stride / sizeof(MATRIX_ELEM), stride / sizeof(MATRIX_ELEM),
+ ncol);
+ ncol = blocks_per_row;
+ assert((unsigned long)ncol <= block.x);
+ grid.x = 1;
+ cudak_(block_reduce_sum)<<<grid, block, block.x * sizeof(MATRIX_ELEM)>>> \
+ (res, MATRIX_ELEM_PTR(b),
+ stride / sizeof(MATRIX_ELEM), b->stride / sizeof(MATRIX_ELEM),
+ ncol);
+ cudaFree(res);
+ }
+
+ void cudak_(cuda_softmax_final)(const Matrix *a, const Matrix *max,
+ const Matrix *deno, Matrix *b) {
+ dim3 threadsPerBlock(CUDA_THREADS_N,
+ CUDA_THREADS_N);
+ dim3 numBlocks(CEIL_DIV(b->ncol, threadsPerBlock.x),
+ CEIL_DIV(b->nrow, threadsPerBlock.y));
+ cudak_(softmax_final)<<<numBlocks, threadsPerBlock>>> \
+ (MATRIX_ELEM_PTR(a), MATRIX_ELEM_PTR(b),
+ MATRIX_ELEM_PTR(max), MATRIX_ELEM_PTR(deno),
+ b->nrow, b->ncol,
+ b->stride / sizeof(MATRIX_ELEM),
+ max->stride / sizeof(MATRIX_ELEM));
+ }
+
+ void cudak_(cuda_softmax_denominator)(const Matrix *a, const Matrix *max, Matrix *b) {
+ dim3 block(CUDA_THREADS_NN, 1);
+ int ncol = a->ncol;
+ int blocks_per_row = CEIL_DIV(ncol, block.x);
+ dim3 grid(blocks_per_row, a->nrow);
+ MATRIX_ELEM *res;
+ size_t stride;
+ assert(max->ncol == 1);
+ cudaMallocPitch(&res, &stride, blocks_per_row * sizeof(MATRIX_ELEM), a->nrow);
+ cudak_(block_reduce_softmax_sum)<<<grid, block, block.x * sizeof(MATRIX_ELEM)>>> \
+ (MATRIX_ELEM_PTR(a), res, MATRIX_ELEM_PTR(max),
+ a->stride / sizeof(MATRIX_ELEM), stride / sizeof(MATRIX_ELEM),
+ max->stride / sizeof(MATRIX_ELEM),
+ ncol);
+ ncol = blocks_per_row;
+ assert((unsigned long)ncol <= block.x);
+ grid.x = 1;
+ cudak_(block_reduce_sum)<<<grid, block, block.x * sizeof(MATRIX_ELEM)>>> \
+ (res, MATRIX_ELEM_PTR(b),
+ stride / sizeof(MATRIX_ELEM), b->stride / sizeof(MATRIX_ELEM),
+ ncol);
+ cudaFree(res);
+ }
+
+ void cudak_(cuda_colmax)(const Matrix *a, Matrix *b) {
+ dim3 block(CUDA_THREADS_NN, 1);
+ int ncol = a->ncol;
+ int blocks_per_row = CEIL_DIV(ncol, block.x);
+ dim3 grid(blocks_per_row, a->nrow);
+ MATRIX_ELEM *res;
+ size_t stride;
+ cudaMallocPitch(&res, &stride, blocks_per_row * sizeof(MATRIX_ELEM), a->nrow);
+ cudak_(block_reduce_max)<<<grid, block, block.x * sizeof(MATRIX_ELEM)>>> \
+ (MATRIX_ELEM_PTR(a), res,
+ a->stride / sizeof(MATRIX_ELEM), stride / sizeof(MATRIX_ELEM),
+ ncol);
+ ncol = blocks_per_row;
+ assert((unsigned long)ncol <= block.x);
+ grid.x = 1;
+ cudak_(block_reduce_max)<<<grid, block, block.x * sizeof(MATRIX_ELEM)>>> \
+ (res, MATRIX_ELEM_PTR(b),
+ stride / sizeof(MATRIX_ELEM), b->stride / sizeof(MATRIX_ELEM),
+ ncol);
+ cudaFree(res);
+ }
+}
+#endif
diff --git a/matrix/generic/cumatrix.c b/matrix/generic/cumatrix.c
new file mode 100644
index 0000000..f0ef99d
--- /dev/null
+++ b/matrix/generic/cumatrix.c
@@ -0,0 +1,143 @@
+#ifdef NERV_GENERIC_CUMATRIX
+#include "matrix.h"
+#include "elem_type.h"
+
+#define MATRIX_DATA_FREE(ptr) cuda_matrix_(free)(ptr)
+#define MATRIX_DATA_ALLOC(dptr, stride, width, height) \
+ cuda_matrix_(alloc)(dptr, stride, width, height)
+#define MATRIX_DATA_WRITE(data, idx, val) cuda_matrix_(write)(data, idx, val)
+#define MATRIX_DATA_READ(data, idx) cuda_matrix_(read)(data, idx)
+#define MATRIX_INIT(L) cuda_matrix_(init)(L)
+#define NERV_GENERIC_MATRIX
+#define NERV_GENERIC_CUKERNEL
+#include "../../common.h"
+#include "../cukernel.h"
+#include "cuda.h"
+#include "cuda_runtime.h"
+#include "driver_types.h"
+#include "cublas_v2.h"
+
+static cublasHandle_t cublas_handle;
+
+Matrix *nerv_matrix_(new_)(long nrow, long ncol);
+static int nerv_matrix_(add)(lua_State *L) {
+ Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname));
+ Matrix *b = luaT_checkudata(L, 2, nerv_matrix_(tname));
+ Matrix *c;
+ long nrow, ncol;
+ if (!(a->nrow == b->nrow && a->ncol == b->ncol))
+ nerv_error(L, "Matrices should be of the same dimension");
+ nrow = a->nrow;
+ ncol = a->ncol;
+ c = nerv_matrix_(new_)(nrow, ncol);
+ MATRIX_ELEM alpha = 1.0f, beta = 1.0f;
+ NERV_CUBLAS_(geam)(cublas_handle, CUBLAS_OP_N, CUBLAS_OP_N,
+ ncol, nrow,
+ &alpha,
+ MATRIX_ELEM_PTR(a), a->stride / sizeof(MATRIX_ELEM),
+ &beta,
+ MATRIX_ELEM_PTR(b), b->stride / sizeof(MATRIX_ELEM),
+ MATRIX_ELEM_PTR(c), c->stride / sizeof(MATRIX_ELEM));
+ luaT_pushudata(L, c, nerv_matrix_(tname));
+ return 1;
+}
+
+static int nerv_matrix_(mul)(lua_State *L) {
+ Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname));
+ Matrix *b = luaT_checkudata(L, 2, nerv_matrix_(tname));
+ Matrix *c;
+ if (a->ncol != b->nrow)
+ nerv_error(L, "Wrong dimension of multipliers");
+ c = nerv_matrix_(new_)(a->nrow, b->ncol);
+ MATRIX_ELEM alpha = 1.0f, beta = 0.0f;
+ NERV_CUBLAS_(gemm)(cublas_handle, CUBLAS_OP_N, CUBLAS_OP_N,
+ b->ncol, a->nrow, b->nrow,
+ &alpha,
+ MATRIX_ELEM_PTR(b), b->stride / sizeof(MATRIX_ELEM),
+ MATRIX_ELEM_PTR(a), a->stride / sizeof(MATRIX_ELEM),
+ &beta,
+ MATRIX_ELEM_PTR(c), c->stride / sizeof(MATRIX_ELEM));
+ luaT_pushudata(L, c, nerv_matrix_(tname));
+ return 1;
+}
+
+static int nerv_matrix_(sigmoid)(lua_State *L) {
+ Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname));
+ Matrix *b = nerv_matrix_(new_)(a->nrow, a->ncol);
+ cudak_(cuda_sigmoid)(a, b);
+ luaT_pushudata(L, b, nerv_matrix_(tname));
+ return 1;
+}
+
+static int nerv_matrix_(softmax)(lua_State *L) {
+ Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname));
+ Matrix *max = nerv_matrix_(new_)(a->nrow, 1);
+ Matrix *dno = nerv_matrix_(new_)(a->nrow, 1);
+ Matrix *b = nerv_matrix_(new_)(a->nrow, a->ncol);
+ cudak_(cuda_colmax)(a, max);
+ cudak_(cuda_softmax_denominator)(a, max, dno);
+ cudak_(cuda_softmax_final)(a, max, dno, b);
+ luaT_pushudata(L, b, nerv_matrix_(tname));
+ return 1;
+}
+
+static int nerv_matrix_(colsum)(lua_State *L) {
+ Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname));
+ Matrix *b = nerv_matrix_(new_)(a->nrow, 1);
+ cudak_(cuda_colsum)(a, b);
+ luaT_pushudata(L, b, nerv_matrix_(tname));
+ return 1;
+}
+
+static int nerv_matrix_(colmax)(lua_State *L) {
+ Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname));
+ Matrix *b = nerv_matrix_(new_)(a->nrow, 1);
+ cudak_(cuda_colmax)(a, b);
+ luaT_pushudata(L, b, nerv_matrix_(tname));
+ return 1;
+}
+
+static const luaL_Reg nerv_matrix_(extra_methods)[] = {
+ {"__add__", nerv_matrix_(add)},
+ {"__mul__", nerv_matrix_(mul)},
+ {"sigmoid", nerv_matrix_(sigmoid)},
+ {"softmax", nerv_matrix_(softmax)},
+ {"colsum", nerv_matrix_(colsum)},
+ {"colmax", nerv_matrix_(colmax)},
+ {NULL, NULL}
+};
+
+static void cuda_matrix_(init)(lua_State *L) {
+ luaN_append_methods(L, nerv_matrix_(extra_methods));
+ cublasCreate(&cublas_handle);
+}
+
+static void cuda_matrix_(free)(MATRIX_ELEM *ptr) {
+ cudaFree(ptr);
+}
+
+static void cuda_matrix_(alloc)(MATRIX_ELEM **dptr, size_t *stride,
+ long width, long height) {
+ cudaMallocPitch((void **)dptr, stride, width, height);
+}
+
+static MATRIX_ELEM cuda_matrix_(read)(MATRIX_ELEM *data, int idx) {
+ MATRIX_ELEM res;
+ cudaMemcpy(&res, data + idx, sizeof(MATRIX_ELEM), cudaMemcpyDeviceToHost);
+ return res;
+}
+
+static void cuda_matrix_(write)(MATRIX_ELEM *data, int idx, MATRIX_ELEM val) {
+ cudaMemcpy(data + idx, &val, sizeof(MATRIX_ELEM), cudaMemcpyHostToDevice);
+}
+
+int nerv_matrix_(get_elem)(lua_State *L) {
+ return nerv_error_method_not_implemented(L);
+}
+
+int nerv_matrix_(set_elem)(lua_State *L) {
+ return nerv_error_method_not_implemented(L);
+}
+
+#include "matrix.c"
+#endif
diff --git a/matrix/generic/elem_type.h b/matrix/generic/elem_type.h
new file mode 100644
index 0000000..8f80306
--- /dev/null
+++ b/matrix/generic/elem_type.h
@@ -0,0 +1,11 @@
+#ifdef MATRIX_USE_FLOAT
+
+#define MATRIX_ELEM float
+#define MATRIX_ELEM_PTR(self) ((self)->data.f)
+
+#elif defined(MATRIX_USE_DOUBLE)
+
+#define MATRIX_ELEM double
+#define MATRIX_ELEM_PTR(self) ((self)->data.d)
+
+#endif
diff --git a/matrix/generic/matrix.c b/matrix/generic/matrix.c
index 9ced397..f0f81a9 100644
--- a/matrix/generic/matrix.c
+++ b/matrix/generic/matrix.c
@@ -3,59 +3,61 @@
#include "matrix.h"
extern const char *nerv_matrix_tname;
-extern const char *nerv_float_matrix_(tname);
+extern const char *nerv_matrix_(tname);
-void nerv_float_matrix_(data_free)(Matrix *self) {
+void nerv_matrix_(data_free)(Matrix *self) {
if (--(*self->data_ref) == 0)
- MATRIX_DATA_FREE(self->data.f);
+ MATRIX_DATA_FREE(MATRIX_ELEM_PTR(self));
}
-void nerv_float_matrix_(data_retain)(Matrix *self) {
+void nerv_matrix_(data_retain)(Matrix *self) {
(*self->data_ref)++;
}
-Matrix *nerv_float_matrix_(new_)(long nrow, long ncol) {
+Matrix *nerv_matrix_(new_)(long nrow, long ncol) {
Matrix *self = (Matrix *)malloc(sizeof(Matrix));
self->nrow = nrow;
self->ncol = ncol;
self->nmax = self->nrow * self->ncol;
- MATRIX_DATA_ALLOC(&self->data.f, &self->stride, sizeof(float) * self->ncol, self->nrow);
+ MATRIX_DATA_ALLOC(&MATRIX_ELEM_PTR(self), &self->stride,
+ sizeof(MATRIX_ELEM) * self->ncol, self->nrow);
self->data_ref = (long *)malloc(sizeof(long));
*self->data_ref = 0;
- nerv_float_matrix_(data_retain)(self);
+ nerv_matrix_(data_retain)(self);
return self;
}
-int nerv_float_matrix_(new)(lua_State *L) {
- luaT_pushudata(L, nerv_float_matrix_(new_)(luaL_checkinteger(L, 1),
+int nerv_matrix_(new)(lua_State *L) {
+ luaT_pushudata(L, nerv_matrix_(new_)(luaL_checkinteger(L, 1),
luaL_checkinteger(L, 2)),
- nerv_float_matrix_(tname));
+ nerv_matrix_(tname));
return 1;
}
-int nerv_float_matrix_(destroy)(lua_State *L) {
- Matrix *self = luaT_checkudata(L, 1, nerv_float_matrix_(tname));
- nerv_float_matrix_(data_free)(self);
+int nerv_matrix_(destroy)(lua_State *L) {
+ Matrix *self = luaT_checkudata(L, 1, nerv_matrix_(tname));
+ nerv_matrix_(data_free)(self);
return 0;
}
-int nerv_float_matrix_(get_elem)(lua_State *L);
-int nerv_float_matrix_(set_elem)(lua_State *L);
+int nerv_matrix_(get_elem)(lua_State *L);
+int nerv_matrix_(set_elem)(lua_State *L);
-static Matrix *nerv_float_matrix_(getrow)(Matrix *self, int row) {
+static Matrix *nerv_matrix_(getrow)(Matrix *self, int row) {
Matrix *prow = (Matrix *)malloc(sizeof(Matrix));
prow->ncol = self->ncol;
prow->nrow = 1;
prow->stride = self->stride;
prow->nmax = prow->ncol;
- prow->data.f = (float *)((char *)self->data.f + row * self->stride);
+ MATRIX_ELEM_PTR(prow) = \
+ (MATRIX_ELEM *)((char *)MATRIX_ELEM_PTR(self) + row * self->stride);
prow->data_ref = self->data_ref;
- nerv_float_matrix_(data_retain)(self);
+ nerv_matrix_(data_retain)(self);
return prow;
}
-static int nerv_float_matrix_(newindex)(lua_State *L) {
- Matrix *self = luaT_checkudata(L, 1, nerv_float_matrix_(tname));
+static int nerv_matrix_(newindex)(lua_State *L) {
+ Matrix *self = luaT_checkudata(L, 1, nerv_matrix_(tname));
if (lua_isnumber(L, 2))
{
int idx = luaL_checkinteger(L, 2);
@@ -63,7 +65,8 @@ static int nerv_float_matrix_(newindex)(lua_State *L) {
{
if (idx < 0 || idx >= self->ncol)
nerv_error(L, "index must be within range [0, %d)", self->ncol);
- MATRIX_DATA_WRITE(self->data.f, idx, luaL_checknumber(L, 3));
+ MATRIX_DATA_WRITE(MATRIX_ELEM_PTR(self), idx,
+ luaL_checknumber(L, 3));
}
else
nerv_error(L, "cannot assign a scalar to row vector");
@@ -78,8 +81,8 @@ static int nerv_float_matrix_(newindex)(lua_State *L) {
}
-static int nerv_float_matrix_(index)(lua_State *L) {
- Matrix *self = luaT_checkudata(L, 1, nerv_float_matrix_(tname));
+static int nerv_matrix_(index)(lua_State *L) {
+ Matrix *self = luaT_checkudata(L, 1, nerv_matrix_(tname));
if (lua_isnumber(L, 2))
{
int idx = luaL_checkinteger(L, 2);
@@ -87,13 +90,13 @@ static int nerv_float_matrix_(index)(lua_State *L) {
{
if (idx < 0 || idx >= self->ncol)
nerv_error(L, "index must be within range [0, %d)", self->ncol);
- lua_pushnumber(L, MATRIX_DATA_READ(self->data.f, idx));
+ lua_pushnumber(L, MATRIX_DATA_READ(MATRIX_ELEM_PTR(self), idx));
}
else
{
if (idx < 0 || idx >= self->nrow)
nerv_error(L, "index must be within range [0, %d)", self->nrow);
- luaT_pushudata(L, nerv_float_matrix_(getrow)(self, idx), nerv_float_matrix_(tname));
+ luaT_pushudata(L, nerv_matrix_(getrow)(self, idx), nerv_matrix_(tname));
}
lua_pushboolean(L, 1);
return 2;
@@ -105,33 +108,33 @@ static int nerv_float_matrix_(index)(lua_State *L) {
}
}
-static int nerv_float_matrix_(ncol)(lua_State *L) {
- Matrix *self = luaT_checkudata(L, 1, nerv_float_matrix_(tname));
+static int nerv_matrix_(ncol)(lua_State *L) {
+ Matrix *self = luaT_checkudata(L, 1, nerv_matrix_(tname));
lua_pushinteger(L, self->ncol);
return 1;
}
-static int nerv_float_matrix_(nrow)(lua_State *L) {
- Matrix *self = luaT_checkudata(L, 1, nerv_float_matrix_(tname));
+static int nerv_matrix_(nrow)(lua_State *L) {
+ Matrix *self = luaT_checkudata(L, 1, nerv_matrix_(tname));
lua_pushinteger(L, self->nrow);
return 1;
}
-static const luaL_Reg nerv_float_matrix_(methods)[] = {
- {"get_elem", nerv_float_matrix_(get_elem)},
- {"set_elem", nerv_float_matrix_(set_elem)},
- {"ncol", nerv_float_matrix_(ncol)},
- {"nrow", nerv_float_matrix_(nrow)},
- {"__index__", nerv_float_matrix_(index)},
- {"__newindex__", nerv_float_matrix_(newindex)},
+static const luaL_Reg nerv_matrix_(methods)[] = {
+ {"get_elem", nerv_matrix_(get_elem)},
+ {"set_elem", nerv_matrix_(set_elem)},
+ {"ncol", nerv_matrix_(ncol)},
+ {"nrow", nerv_matrix_(nrow)},
+ {"__index__", nerv_matrix_(index)},
+ {"__newindex__", nerv_matrix_(newindex)},
{NULL, NULL}
};
-void nerv_float_matrix_(init)(lua_State *L) {
- luaT_newmetatable(L, nerv_float_matrix_(tname), nerv_matrix_tname,
- nerv_float_matrix_(new), nerv_float_matrix_(destroy), NULL);
- luaL_register(L, NULL, nerv_float_matrix_(methods));
+void nerv_matrix_(init)(lua_State *L) {
+ luaT_newmetatable(L, nerv_matrix_(tname), nerv_matrix_tname,
+ nerv_matrix_(new), nerv_matrix_(destroy), NULL);
+ luaL_register(L, NULL, nerv_matrix_(methods));
#ifdef MATRIX_INIT
MATRIX_INIT(L);
#endif
diff --git a/matrix/generic/matrix.h b/matrix/generic/matrix.h
index 264859b..276ca5c 100644
--- a/matrix/generic/matrix.h
+++ b/matrix/generic/matrix.h
@@ -1,6 +1,7 @@
#ifndef NERV_GENERIC_MATRIX_H
#define NERV_GENERIC_MATRIX_H
+#include <stddef.h>
typedef struct Matrix {
size_t stride; /* size of a row */
long ncol, nrow, nmax; /* dimension of the matrix */
diff --git a/matrix/matrix.c b/matrix/generic/mmatrix.c
index b392f56..ac71c3d 100644
--- a/matrix/matrix.c
+++ b/matrix/generic/mmatrix.c
@@ -1,23 +1,25 @@
+#ifdef NERV_GENERIC_MMATRIX
+#include "matrix.h"
+#include "elem_type.h"
#define MATRIX_DATA_FREE(ptr) free(ptr)
-#define MATRIX_DATA_ALLOC(dptr, stride, width, height) host_float_array_alloc(dptr, stride, width, height)
+#define MATRIX_DATA_ALLOC(dptr, stride, width, height) \
+ host_matrix_(alloc)(dptr, stride, width, height)
#define MATRIX_DATA_STRIDE(ncol) (sizeof(float) * (ncol))
#define MATRIX_DATA_WRITE(data, idx, val) (data[idx] = val)
#define MATRIX_DATA_READ(data, idx) (data[idx])
#define NERV_GENERIC_MATRIX
-#define nerv_float_matrix_(NAME) nerv_float_matrix_host_ ## NAME
-#include "../common.h"
-#include "generic/matrix.h"
+#include "../../common.h"
-const char *nerv_float_matrix_(tname) = "nerv.FloatMatrix";
+const char *nerv_matrix_(tname) = "nerv.FloatMMatrix";
-static void host_float_array_alloc(float **dptr, size_t *stride,
+static void host_matrix_(alloc)(float **dptr, size_t *stride,
long width, long height) {
*dptr = (float *)malloc(width * height);
*stride = width;
}
-int nerv_float_matrix_(get_elem)(lua_State *L) {
- Matrix *self = luaT_checkudata(L, 1, nerv_float_matrix_(tname));
+int nerv_matrix_(get_elem)(lua_State *L) {
+ Matrix *self = luaT_checkudata(L, 1, nerv_matrix_(tname));
int idx = luaL_checkinteger(L, 2);
if (idx < 0 || idx >= self->nmax)
nerv_error(L, "index must be within range [0, %d)", self->nmax);
@@ -25,8 +27,8 @@ int nerv_float_matrix_(get_elem)(lua_State *L) {
return 1;
}
-int nerv_float_matrix_(set_elem)(lua_State *L) {
- Matrix *self = luaT_checkudata(L, 1, nerv_float_matrix_(tname));
+int nerv_matrix_(set_elem)(lua_State *L) {
+ Matrix *self = luaT_checkudata(L, 1, nerv_matrix_(tname));
int idx = luaL_checkinteger(L, 2);
float v = luaL_checknumber(L, 3);
if (idx < 0 || idx >= self->nmax)
@@ -35,4 +37,5 @@ int nerv_float_matrix_(set_elem)(lua_State *L) {
return 0;
}
-#include "generic/matrix.c"
+#include "matrix.c"
+#endif
diff --git a/matrix/init.c b/matrix/init.c
index e723f55..fb1c287 100644
--- a/matrix/init.c
+++ b/matrix/init.c
@@ -2,8 +2,11 @@
#include "generic/matrix.h"
const char *nerv_matrix_tname = "nerv.Matrix";
-void nerv_float_matrix_host_init(lua_State *L);
-void nerv_float_matrix_cuda_init(lua_State *L);
+void nerv_matrix_float_host_init(lua_State *L);
+void nerv_matrix_float_cuda_init(lua_State *L);
+void nerv_matrix_double_host_init(lua_State *L);
+void nerv_matrix_double_cuda_init(lua_State *L);
+
static const luaL_Reg matrix_methods[] = {
{"__tostring__", nerv_error_method_not_implemented },
{"__add__", nerv_error_method_not_implemented },
@@ -17,6 +20,8 @@ void nerv_matrix_init(lua_State *L) {
luaT_newmetatable(L, nerv_matrix_tname, NULL, NULL, NULL, NULL);
luaL_register(L, NULL, matrix_methods);
lua_pop(L, 1);
- nerv_float_matrix_host_init(L);
- nerv_float_matrix_cuda_init(L);
+ nerv_matrix_float_host_init(L);
+ nerv_matrix_float_cuda_init(L);
+/* nerv_matrix_double_host_init(L); */
+ nerv_matrix_double_cuda_init(L);
}
diff --git a/matrix/init.lua b/matrix/init.lua
index 59b8384..d6aab73 100644
--- a/matrix/init.lua
+++ b/matrix/init.lua
@@ -1,4 +1,4 @@
-function nerv.FloatCuMatrix:__tostring__()
+function nerv.Matrix:__tostring__()
local ncol = self:ncol()
local nrow = self:nrow()
local strt = {}
@@ -12,27 +12,11 @@ function nerv.FloatCuMatrix:__tostring__()
for row = 0, nrow - 1 do
local rp = self[row]
for col = 0, ncol - 1 do
- table.insert(strt, string.format("%f ", rp[col]))
+ table.insert(strt, string.format("%.10f ", rp[col]))
end
table.insert(strt, "\n")
end
end
- table.insert(strt, string.format("[Float Matrix %d x %d]", nrow, ncol))
- return table.concat(strt)
-end
-
-function nerv.FloatMatrix:__tostring__()
- local ncol = self:ncol()
- local nrow = self:nrow()
- local i = 0
- local strt = {}
- for row = 0, nrow - 1 do
- for col = 0, ncol - 1 do
- table.insert(strt, string.format("%f ", self:get_elem(i)))
- i = i + 1
- end
- table.insert(strt, "\n")
- end
- table.insert(strt, string.format("[Float Matrix %d x %d]", nrow, ncol))
+ table.insert(strt, string.format("[Matrix %d x %d]", nrow, ncol))
return table.concat(strt)
end
diff --git a/matrix/mmatrix.c b/matrix/mmatrix.c
new file mode 100644
index 0000000..f616d51
--- /dev/null
+++ b/matrix/mmatrix.c
@@ -0,0 +1,5 @@
+#define NERV_GENERIC_MMATRIX
+#define MATRIX_USE_FLOAT
+#define host_matrix_(NAME) host_matrix_float_ ## NAME
+#define nerv_matrix_(NAME) nerv_matrix_float_host_ ## NAME
+#include "generic/mmatrix.c"
> 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233