From 83006367aeec856bf8e59231c78df5b1802e3138 Mon Sep 17 00:00:00 2001
From: Determinant <ted.sybil@gmail.com>
Date: Sun, 17 May 2015 23:40:25 +0800
Subject: add addition and multiplication for cumatrix

---
 .gitignore              |  3 +-
 Makefile                |  7 ++--
 common.c                |  8 +++++
 common.h                |  1 +
 cumatrix_example.lua    | 25 +++++++++++---
 matrix.c                |  0
 matrix/cumatrix.c       | 92 ++++++++++++++++++++++++++++++++++++++++++++++---
 matrix/generic/matrix.c | 26 +++++++++-----
 matrix/generic/matrix.h |  5 +++
 matrix/matrix.c         | 18 ++++++++--
 10 files changed, 161 insertions(+), 24 deletions(-)
 create mode 100644 matrix.c

diff --git a/.gitignore b/.gitignore
index d1c2910..d8e6051 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
 *.o
-libnerv.so
 build/
+*.swp
+*.swo
diff --git a/Makefile b/Makefile
index 33219ce..4514263 100644
--- a/Makefile
+++ b/Makefile
@@ -3,7 +3,10 @@ OBJS := oop_example.o nerv.o luaT.o common.o matrix/matrix.o matrix/cumatrix.o m
 LIBS := libnerv.so
 LUA_LIBS := matrix/init.lua nerv.lua
 INCLUDE := -I build/luajit-2.0/include/luajit-2.0/ -DLUA_USE_APICHECK
-LDFLAGS := -L luajit-2.0/build/lib/ -llua -lm
+CUDA_BASE := /usr/local/cuda-6.5
+CUDA_INCLUDE := -I $(CUDA_BASE)/include/
+INCLUDE += $(CUDA_INCLUDE)
+LDFLAGS := -L$(CUDA_BASE)/lib64/  -Wl,-rpath=$(CUDA_BASE)/lib64/ -lcudart -lcublas
 CFLAGS :=
 OBJ_DIR := build/objs
 LUA_DIR := build/lua
@@ -30,7 +33,7 @@ $(LUA_DIR)/%.lua: %.lua
 $(OBJ_DIR)/luaT.o:
 	gcc -c -o $@ luaT/luaT.c $(INCLUDE) -fPIC
 $(LIBS): $(OBJS)
-	gcc -shared -o $@ $(OBJS)
+	gcc -shared -o $@ $(OBJS) $(LDFLAGS)
 clean:
 	-rm -rf $(OBJ_DIR)
 	-rm -rf $(LUA_DIR)
diff --git a/common.c b/common.c
index f5521fd..c60c1ec 100644
--- a/common.c
+++ b/common.c
@@ -16,4 +16,12 @@ int nerv_error(lua_State *L, const char *err_mesg_fmt, ...) {
 int nerv_error_method_not_implemented(lua_State *L) {
     return nerv_error(L, "method not implemented"); 
 }
+
+void luaN_append_methods(lua_State *L, const luaL_Reg *mlist) {
+    for (; mlist->func; mlist++)
+    {
+        lua_pushcfunction(L, mlist->func);
+        lua_setfield(L, -2, mlist->name);
+    }
+}
 #endif
diff --git a/common.h b/common.h
index b316f20..51e90ee 100644
--- a/common.h
+++ b/common.h
@@ -7,3 +7,4 @@
 
 int nerv_error(lua_State *L, const char *err_mesg_fmt, ...); 
 int nerv_error_method_not_implemented(lua_State *L);
+void luaN_append_methods(lua_State *L, const luaL_Reg *mlist);
diff --git a/cumatrix_example.lua b/cumatrix_example.lua
index 4b6fb4a..ccd88b8 100644
--- a/cumatrix_example.lua
+++ b/cumatrix_example.lua
@@ -1,10 +1,27 @@
-t = nerv.FloatCuMatrix(10, 20)
+m = 2
+n = 3
+t = nerv.FloatCuMatrix(m, n)
+t2 = nerv.FloatCuMatrix(m, n)
 print(t)
 a = t[1]
-for i = 0, 9 do
-    for j = 0, 19 do
+for i = 0, m - 1 do
+    for j = 0, n - 1 do
         t[i][j] = i + j
+        t2[i][j] = t[i][j]
     end
 end
-print(t)
 print(a)
+print(t)
+print(t2)
+print(t + t2)
+d = nerv.FloatCuMatrix(3, 3)
+for i = 0, 2 do
+    for j = 0, 2 do
+        d[i][j] = 0
+    end
+end
+d[0][0] = 1
+d[1][1] = 2
+d[2][2] = 3
+print(d)
+print(t * d)
diff --git a/matrix.c b/matrix.c
new file mode 100644
index 0000000..e69de29
diff --git a/matrix/cumatrix.c b/matrix/cumatrix.c
index 87ad57b..9c2878a 100644
--- a/matrix/cumatrix.c
+++ b/matrix/cumatrix.c
@@ -1,11 +1,91 @@
-#define MATRIX_DATA_FREE(ptr) free(ptr)
-#define MATRIX_DATA_ALLOC(size) malloc(size)
-#define MATRIX_DATA_STRIDE(ncol) (sizeof(float) * (ncol))
-#define MATRIX_GENERIC
+#define MATRIX_DATA_FREE(ptr) cuda_float_array_free(ptr)
+#define MATRIX_DATA_ALLOC(dptr, stride, width, height) cuda_float_array_alloc(dptr, stride, width, height)
+#define MATRIX_DATA_WRITE(data, idx, val) cuda_float_array_write(data, idx, val)
+#define MATRIX_DATA_READ(data, idx) cuda_float_array_read(data, idx)
+#define MATRIX_INIT(L) cuda_float_init(L)
+#define NERV_GENERIC_MATRIX
 #define nerv_float_matrix_(NAME) nerv_float_matrix_cuda_ ## NAME
-#include "generic/matrix.c"
+#include "../common.h"
+#include "generic/matrix.h"
+#include "cuda.h"
+#include "driver_types.h"
+#include "cublas_v2.h"
 
 const char *nerv_float_matrix_(tname) = "nerv.FloatCuMatrix";
+static cublasHandle_t cublas_handle;
+
+Matrix *nerv_float_matrix_(new_)(long nrow, long ncol);
+static int nerv_float_matrix_(add)(lua_State *L) {
+    Matrix *a = luaT_checkudata(L, 1, nerv_float_matrix_(tname));
+    Matrix *b = luaT_checkudata(L, 2, nerv_float_matrix_(tname));
+    Matrix *c;
+    long nrow, ncol;
+    if (!(a->nrow == b->nrow && a->ncol == b->ncol))
+        nerv_error(L, "Matrices should be of the same dimension");
+    nrow = a->nrow;
+    ncol = a->ncol;
+    c = nerv_float_matrix_(new_)(nrow, ncol);
+    float alpha = 1.0f, beta = 1.0f;
+    cublasSgeam(cublas_handle, CUBLAS_OP_N, CUBLAS_OP_N,
+                ncol, nrow,
+                &alpha,
+                a->data.f, a->stride / sizeof(float),
+                &beta,
+                b->data.f, b->stride / sizeof(float),
+                c->data.f, c->stride / sizeof(float));
+    luaT_pushudata(L, c, nerv_float_matrix_(tname));
+    return 1;
+}
+
+static int nerv_float_matrix_(mul)(lua_State *L) {
+    Matrix *a = luaT_checkudata(L, 1, nerv_float_matrix_(tname));
+    Matrix *b = luaT_checkudata(L, 2, nerv_float_matrix_(tname));
+    Matrix *c;
+    if (a->ncol != b->nrow)
+        nerv_error(L, "Wrong dimension of multipliers");
+    c = nerv_float_matrix_(new_)(a->nrow, b->ncol);
+    float alpha = 1.0f, beta = 0.0f;
+    cublasSgemm(cublas_handle, CUBLAS_OP_N, CUBLAS_OP_N,
+                b->ncol, a->nrow, b->nrow,
+                &alpha,
+                b->data.f, b->stride / sizeof(float),
+                a->data.f, a->stride / sizeof(float),
+                &beta,
+                c->data.f, c->stride / sizeof(float));
+    luaT_pushudata(L, c, nerv_float_matrix_(tname));
+    return 1;
+}
+
+static const luaL_Reg nerv_float_matrix_(extra_methods)[] = {
+    {"__add__", nerv_float_matrix_(add)},
+    {"__mul__", nerv_float_matrix_(mul)},
+    {NULL, NULL}
+};
+
+static void cuda_float_init(lua_State *L) {
+    luaN_append_methods(L, nerv_float_matrix_(extra_methods));
+    cublasCreate(&cublas_handle);
+}
+
+static cuda_float_array_free(float *ptr) {
+    cudaFree(ptr);
+}
+
+static cuda_float_array_alloc(float **dptr, long *stride,
+                                long width, long height) {
+    cudaMallocPitch(dptr, stride, width, height);
+}
+
+static float cuda_float_array_read(float *data, int idx) {
+    float res;
+    cudaMemcpy(&res, data + idx, sizeof(float), cudaMemcpyDeviceToHost);
+    return res;
+}
+
+static void cuda_float_array_write(float *data, int idx, float val) {
+    cudaMemcpy(data + idx, &val, sizeof(float), cudaMemcpyHostToDevice);
+}
+
 int nerv_float_matrix_(get_elem)(lua_State *L) {
     return nerv_error_method_not_implemented(L);
 }
@@ -13,3 +93,5 @@ int nerv_float_matrix_(get_elem)(lua_State *L) {
 int nerv_float_matrix_(set_elem)(lua_State *L) {
     return nerv_error_method_not_implemented(L);
 }
+
+#include "generic/matrix.c"
diff --git a/matrix/generic/matrix.c b/matrix/generic/matrix.c
index 29919d8..9ced397 100644
--- a/matrix/generic/matrix.c
+++ b/matrix/generic/matrix.c
@@ -1,4 +1,4 @@
-#ifdef MATRIX_GENERIC
+#ifdef NERV_GENERIC_MATRIX
 #include "../../common.h"
 #include "matrix.h"
 
@@ -14,17 +14,22 @@ void nerv_float_matrix_(data_retain)(Matrix *self) {
     (*self->data_ref)++;
 }
 
-int nerv_float_matrix_(new)(lua_State *L) {
+Matrix *nerv_float_matrix_(new_)(long nrow, long ncol) {
     Matrix *self = (Matrix *)malloc(sizeof(Matrix));
-    self->nrow = luaL_checkinteger(L, 1);
-    self->ncol = luaL_checkinteger(L, 2);
+    self->nrow = nrow;
+    self->ncol = ncol;
     self->nmax = self->nrow * self->ncol;
-    self->stride = MATRIX_DATA_STRIDE(self->ncol);
-    self->data.f = MATRIX_DATA_ALLOC(self->stride * self->nrow);
+    MATRIX_DATA_ALLOC(&self->data.f, &self->stride, sizeof(float) * self->ncol, self->nrow);
     self->data_ref = (long *)malloc(sizeof(long));
     *self->data_ref = 0;
     nerv_float_matrix_(data_retain)(self);
-    luaT_pushudata(L, self, nerv_float_matrix_(tname));
+    return self;
+}
+
+int nerv_float_matrix_(new)(lua_State *L) {
+    luaT_pushudata(L, nerv_float_matrix_(new_)(luaL_checkinteger(L, 1),
+                                                luaL_checkinteger(L, 2)),
+                    nerv_float_matrix_(tname));
     return 1;
 }
 
@@ -58,7 +63,7 @@ static int nerv_float_matrix_(newindex)(lua_State *L) {
         {
             if (idx < 0 || idx >= self->ncol)
                 nerv_error(L, "index must be within range [0, %d)", self->ncol);
-            self->data.f[idx] = luaL_checknumber(L, 3);
+            MATRIX_DATA_WRITE(self->data.f, idx, luaL_checknumber(L, 3));
         }
         else
             nerv_error(L, "cannot assign a scalar to row vector");
@@ -82,7 +87,7 @@ static int nerv_float_matrix_(index)(lua_State *L) {
         {
             if (idx < 0 || idx >= self->ncol)
                 nerv_error(L, "index must be within range [0, %d)", self->ncol);
-            lua_pushnumber(L, self->data.f[idx]);
+            lua_pushnumber(L, MATRIX_DATA_READ(self->data.f, idx));
         }
         else
         {
@@ -127,6 +132,9 @@ void nerv_float_matrix_(init)(lua_State *L) {
     luaT_newmetatable(L, nerv_float_matrix_(tname), nerv_matrix_tname,
                         nerv_float_matrix_(new), nerv_float_matrix_(destroy), NULL);
     luaL_register(L, NULL, nerv_float_matrix_(methods));
+#ifdef MATRIX_INIT
+    MATRIX_INIT(L);
+#endif
     lua_pop(L, 1);
 }
 #endif
diff --git a/matrix/generic/matrix.h b/matrix/generic/matrix.h
index d02b56e..655ff3d 100644
--- a/matrix/generic/matrix.h
+++ b/matrix/generic/matrix.h
@@ -1,3 +1,6 @@
+#ifndef NERV_GENERIC_MATRIX_H
+#define NERV_GENERIC_MATRIX_H
+
 typedef struct Matrix {
     long stride;              /* size of a row */
     long ncol, nrow, nmax;    /* dimension of the matrix */
@@ -7,3 +10,5 @@ typedef struct Matrix {
     } data;                   /* pointer to actual storage */
     long *data_ref;
 } Matrix;
+
+#endif
diff --git a/matrix/matrix.c b/matrix/matrix.c
index 0e5f75f..ef311d6 100644
--- a/matrix/matrix.c
+++ b/matrix/matrix.c
@@ -1,11 +1,21 @@
 #define MATRIX_DATA_FREE(ptr) free(ptr)
-#define MATRIX_DATA_ALLOC(size) malloc(size)
+#define MATRIX_DATA_ALLOC(dptr, stride, width, height) host_float_array_alloc(dptr, stride, width, height)
 #define MATRIX_DATA_STRIDE(ncol) (sizeof(float) * (ncol))
-#define MATRIX_GENERIC
+#define MATRIX_DATA_WRITE(data, idx, val) (data[idx] = val)
+#define MATRIX_DATA_READ(data, idx) (data[idx])
+#define NERV_GENERIC_MATRIX
 #define nerv_float_matrix_(NAME) nerv_float_matrix_host_ ## NAME
-#include "generic/matrix.c"
+#include "../common.h"
+#include "generic/matrix.h"
 
 const char *nerv_float_matrix_(tname) = "nerv.FloatMatrix";
+
+static void host_float_array_alloc(float **dptr, long *stride,
+                                    long width, long height) {
+    *dptr = (float *)malloc(width * height);
+    *stride = width;
+}
+
 int nerv_float_matrix_(get_elem)(lua_State *L) {
     Matrix *self = luaT_checkudata(L, 1, nerv_float_matrix_(tname));
     int idx = luaL_checkinteger(L, 2);
@@ -25,3 +35,5 @@ int nerv_float_matrix_(set_elem)(lua_State *L) {
     self->data.f[idx] = v;
     return 0;
 }
+
+#include "generic/matrix.c"
-- 
cgit v1.2.3-70-g09d2