From ab12a9583bdd39884fde9bc2444e6fd1bc5f518e Mon Sep 17 00:00:00 2001
From: Determinant <ted.sybil@gmail.com>
Date: Sun, 31 May 2015 11:18:16 +0800
Subject: add async copy by index; add MMatrixInt

---
 matrix/cuda_helper.h       | 55 ++++++++++++++++++++++++++++++++++++++++++++++
 matrix/generic/cumatrix.c  | 32 +++++++++++++++++++++++++++
 matrix/generic/elem_type.h |  6 +++++
 matrix/generic/matrix.c    |  3 ---
 matrix/generic/matrix.h    |  4 ++++
 matrix/init.c              |  1 +
 matrix/init.lua            | 13 ++++++++---
 matrix/mmatrix.c           | 14 ++++++++++++
 8 files changed, 122 insertions(+), 6 deletions(-)
 create mode 100644 matrix/cuda_helper.h
diff --git a/matrix/cuda_helper.h b/matrix/cuda_helper.h
new file mode 100644
index 0000000..c0fa618
--- /dev/null
+++ b/matrix/cuda_helper.h
@@ -0,0 +1,55 @@
+#ifndef NERV_CUDA_HELPER_H
+#define NERV_CUDA_HELPER_H
+#define CUBLAS_SAFE_CALL(call) \
+    do { \
+        cublasStatus_t  err = (call); \
+        if (err != CUBLAS_STATUS_SUCCESS) \
+            nerv_error(L, "cumatrix cublas error: %s", cublasGetErrorString(err)); \
+    } while (0)
+
+#define CUDA_SAFE_CALL(call) \
+    do { \
+        cudaError_t err = (call); \
+        if (err != cudaSuccess) \
+            nerv_error(L, "cumatrix CUDA error: %s", cudaGetErrorString(err)); \
+    } while (0)
+
+#define CUDA_SAFE_SYNC_CALL(call) \
+    do { \
+        CUDA_SAFE_CALL(call); \
+        cudaDeviceSynchronize(); \
+    } while (0)
+
+#define CHECK_SAME_DIMENSION(a, b) \
+    do { \
+        if (!(a->nrow == b->nrow && a->ncol == b->ncol)) \
+            nerv_error(L, "Matrices should be of the same dimension"); \
+    } while (0)
+
+static const char *cublasGetErrorString(cublasStatus_t err) {
+    switch (err)
+    {
+        case CUBLAS_STATUS_SUCCESS:
+            return "CUBLAS_STATUS_SUCCESS";
+        case CUBLAS_STATUS_NOT_INITIALIZED:
+            return "CUBLAS_STATUS_NOT_INITIALIZED";
+        case CUBLAS_STATUS_ALLOC_FAILED:
+            return "CUBLAS_STATUS_ALLOC_FAILED";
+        case CUBLAS_STATUS_INVALID_VALUE:
+            return "CUBLAS_STATUS_INVALID_VALUE";
+        case CUBLAS_STATUS_ARCH_MISMATCH:
+            return "CUBLAS_STATUS_ARCH_MISMATCH";
+        case CUBLAS_STATUS_MAPPING_ERROR:
+            return "CUBLAS_STATUS_MAPPING_ERROR";
+        case CUBLAS_STATUS_EXECUTION_FAILED:
+            return "CUBLAS_STATUS_EXECUTION_FAILED";
+        case CUBLAS_STATUS_INTERNAL_ERROR:
+            return "CUBLAS_STATUS_INTERNAL_ERROR";
+        case CUBLAS_STATUS_NOT_SUPPORTED:
+            return "CUBLAS_STATUS_NOT_SUPPORTED";
+        case CUBLAS_STATUS_LICENSE_ERROR:
+            return "CUBLAS_STATUS_LICENSE_ERROR";
+    }
+    return "<unknown>";
+}
+#endif
diff --git a/matrix/generic/cumatrix.c b/matrix/generic/cumatrix.c
index 7b0aa2a..3bc58d7 100644
--- a/matrix/generic/cumatrix.c
+++ b/matrix/generic/cumatrix.c
@@ -251,6 +251,37 @@ static int nerv_matrix_(log_elem)(lua_State *L) {
     return 0;
 }
 
+extern const char *nerv_matrix_host_int_tname;
+static int nerv_matrix_(copy_rows_fromh_by_idx)(lua_State *L) {
+    Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname));
+    Matrix *b = luaT_checkudata(L, 2, MATRIX_CUMATRIX_HOST_TNAME);
+    Matrix *idx = luaT_checkudata(L, 3, nerv_matrix_host_int_tname);
+    long *idx_ptr = idx->data.i;
+    int i;
+    long nrow = a->nrow;
+    if (idx->nrow != 1)
+        nerv_error(L, "index should be a vector");
+    if (idx->ncol != nrow)
+        nerv_error(L, "index dimension mismatch");
+    if (a->ncol != b->ncol)
+        nerv_error(L, "source/destination dimension mismatch");
+    cudaStream_t *streams = (cudaStream_t*)malloc(sizeof(cudaStream_t) * nrow);
+    for (i = 0; i < nrow; i++)
+    {
+        CUDA_SAFE_CALL(cudaStreamCreate(streams + i));
+        CUDA_SAFE_CALL(cudaMemcpyAsync(MATRIX_ROW_PTR(a, i),
+                    MATRIX_ROW_PTR(b, idx_ptr[i]),
+                    b->stride,
+                    cudaMemcpyHostToDevice, streams[i]));
+    }
+    for (i = 0; i < nrow; i++)
+    {
+        CUDA_SAFE_CALL(cudaStreamSynchronize(streams[i]));
+        CUDA_SAFE_CALL(cudaStreamDestroy(streams[i]));
+    }
+    return 0;
+}
+
 static const luaL_Reg nerv_matrix_(extra_methods)[] = {
     {"create", nerv_matrix_(create)},
     {"colsum", nerv_matrix_(colsum)},
@@ -271,6 +302,7 @@ static const luaL_Reg nerv_matrix_(extra_methods)[] = {
     {"softmax", nerv_matrix_(softmax)},
     {"mul_elem", nerv_matrix_(mul_elem)},
     {"log_elem", nerv_matrix_(log_elem)},
+    {"copy_rows_fromh_by_idx", nerv_matrix_(copy_rows_fromh_by_idx)},
     {NULL, NULL}
 };
 
diff --git a/matrix/generic/elem_type.h b/matrix/generic/elem_type.h
index 78233a3..2a6ffa8 100644
--- a/matrix/generic/elem_type.h
+++ b/matrix/generic/elem_type.h
@@ -10,4 +10,10 @@
 #define MATRIX_ELEM_FMT "%lf"
 #define MATRIX_ELEM_PTR(self) ((self)->data.d)
 
+#elif defined(MATRIX_USE_INT)
+
+#define MATRIX_ELEM long
+#define MATRIX_ELEM_FMT "%ld"
+#define MATRIX_ELEM_PTR(self) ((self)->data.i)
+
 #endif
diff --git a/matrix/generic/matrix.c b/matrix/generic/matrix.c
index e0098de..a0f9ecf 100644
--- a/matrix/generic/matrix.c
+++ b/matrix/generic/matrix.c
@@ -2,9 +2,6 @@
 #include "../../common.h"
 #include "matrix.h"
 
-#define MATRIX_ROW_PTR(self, row) \
-    (MATRIX_ELEM *)((char *)MATRIX_ELEM_PTR(self) + (row) * (self)->stride)
-
 extern const char *nerv_matrix_(tname);
 extern const char *MATRIX_BASE_TNAME;
 
diff --git a/matrix/generic/matrix.h b/matrix/generic/matrix.h
index 276ca5c..833724b 100644
--- a/matrix/generic/matrix.h
+++ b/matrix/generic/matrix.h
@@ -8,8 +8,12 @@ typedef struct Matrix {
     union {
         float *f;
         double *d;
+        long *i;
     } data;                   /* pointer to actual storage */
     long *data_ref;
 } Matrix;
 
+#define MATRIX_ROW_PTR(self, row) \
+    (MATRIX_ELEM *)((char *)MATRIX_ELEM_PTR(self) + (row) * (self)->stride)
+
 #endif
diff --git a/matrix/init.c b/matrix/init.c
index e55558a..b54cd12 100644
--- a/matrix/init.c
+++ b/matrix/init.c
@@ -35,4 +35,5 @@ void nerv_matrix_init(lua_State *L) {
                             NULL, NULL, NULL);
     nerv_matrix_host_float_init(L);
     nerv_matrix_host_double_init(L);
+    nerv_matrix_host_int_init(L);
 }
diff --git a/matrix/init.lua b/matrix/init.lua
index 0075668..057b085 100644
--- a/matrix/init.lua
+++ b/matrix/init.lua
@@ -2,17 +2,22 @@ function nerv.Matrix:__tostring__()
     local ncol = self:ncol()
     local nrow = self:nrow()
     local strt = {}
-
+    local fmt
+    if self.fmt then
+        fmt = self.fmt
+    else
+        fmt = "%.10f "
+    end
     if nrow == 1 then
         for col = 0, ncol - 1 do
-            table.insert(strt, string.format("%f ", self[col]))
+            table.insert(strt, string.format(fmt, self[col]))
         end
         table.insert(strt, "\n")
     else
         for row = 0, nrow - 1 do
             local rp = self[row]
             for col = 0, ncol - 1 do
-                table.insert(strt, string.format("%.10f ", rp[col]))
+                table.insert(strt, string.format(fmt, rp[col]))
             end
             table.insert(strt, "\n")
         end
@@ -21,6 +26,8 @@ function nerv.Matrix:__tostring__()
     return table.concat(strt)
 end
 
+nerv.MMatrixInt.fmt = "%d "
+
 function nerv.CuMatrix:__add__(b)
     c = self:create()
     c:add(self, b, 1.0, 1.0)
diff --git a/matrix/mmatrix.c b/matrix/mmatrix.c
index b7d7dae..ab15197 100644
--- a/matrix/mmatrix.c
+++ b/matrix/mmatrix.c
@@ -17,3 +17,17 @@ const char *nerv_matrix_(tname) = "nerv.MMatrixFloat";
 #define nerv_matrix_(NAME) nerv_matrix_host_double_##NAME
 const char *nerv_matrix_(tname) = "nerv.MMatrixDouble";
 #include "generic/mmatrix.c"
+#undef nerv_matrix_
+#undef host_matrix_
+#undef MATRIX_USE_DOUBLE
+#undef MATRIX_ELEM
+#undef MATRIX_ELEM_PTR
+#undef MATRIX_ELEM_FMT
+
+#define NERV_GENERIC_MMATRIX
+#define MATRIX_USE_INT
+#define host_matrix_(NAME) host_matrix_int_##NAME
+#define nerv_matrix_(NAME) nerv_matrix_host_int_##NAME
+const char *nerv_matrix_(tname) = "nerv.MMatrixInt";
+#include "generic/mmatrix.c"
+
-- 
cgit v1.2.3-70-g09d2