From ea6f2990f99dd9ded6a0e74d75a3ec84900a2518 Mon Sep 17 00:00:00 2001
From: Determinant <ted.sybil@gmail.com>
Date: Wed, 3 Jun 2015 23:00:45 +0800
Subject: demo now works (without random shuffle)

---
 matrix/cuda_helper.h       |  2 +-
 matrix/cukernel.h          |  1 +
 matrix/generic/cukernel.cu | 19 ++++++++++
 matrix/generic/cumatrix.c  | 87 ++++++++++++++++++++++++++++++----------------
 matrix/generic/mmatrix.c   | 20 ++++++++++-
 matrix/init.lua            |  3 +-
 6 files changed, 100 insertions(+), 32 deletions(-)

(limited to 'matrix')

diff --git a/matrix/cuda_helper.h b/matrix/cuda_helper.h
index c0fa618..cedc643 100644
--- a/matrix/cuda_helper.h
+++ b/matrix/cuda_helper.h
@@ -23,7 +23,7 @@
 #define CHECK_SAME_DIMENSION(a, b) \
     do { \
         if (!(a->nrow == b->nrow && a->ncol == b->ncol)) \
-            nerv_error(L, "Matrices should be of the same dimension"); \
+            nerv_error(L, "matrices should be of the same dimension"); \
     } while (0)
 
 static const char *cublasGetErrorString(cublasStatus_t err) {
diff --git a/matrix/cukernel.h b/matrix/cukernel.h
index 178b7d3..7d2168e 100644
--- a/matrix/cukernel.h
+++ b/matrix/cukernel.h
@@ -13,4 +13,5 @@ void cudak_(cuda_fill)(Matrix *a, double val);
 void cudak_(cuda_expand_frm)(const Matrix *a, Matrix *b, int context);
 void cudak_(cuda_rearrange_frm)(const Matrix *a, Matrix *b, int step);
 void cudak_(cuda_scale_row)(const Matrix *a, Matrix *b);
+void cudak_(cuda_decompress)(const Matrix *a, Matrix *b);
 #endif
diff --git a/matrix/generic/cukernel.cu b/matrix/generic/cukernel.cu
index 1d8b983..05a1e78 100644
--- a/matrix/generic/cukernel.cu
+++ b/matrix/generic/cukernel.cu
@@ -187,6 +187,15 @@ __global__ void cudak_(scale_row)(const MATRIX_ELEM *a, MATRIX_ELEM *b,
     b[j + i * stride] *= a[j];
 }
 
+__global__ void cudak_(decompress)(const MATRIX_ELEM *a, MATRIX_ELEM *b,
+                                    int nrow, int ncol,
+                                    int stride_a, int stride_b) {
+    int j = blockIdx.x * blockDim.x + threadIdx.x;
+    int i = blockIdx.y * blockDim.y + threadIdx.y;
+    if (i >= nrow || j >= ncol) return;
+    b[lrintf(a[j + i * stride_a]) + i * stride_b] = 1.0;
+}
+
 extern "C" {
 #include "../cukernel.h"
     void cudak_(cuda_log_elem)(const Matrix *a, Matrix *b) {
@@ -385,5 +394,15 @@ extern "C" {
             (MATRIX_ELEM_PTR(a), MATRIX_ELEM_PTR(b),
              b->nrow, b->ncol, b->stride / sizeof(MATRIX_ELEM));
     }
+
+    void cudak_(cuda_decompress)(const Matrix *a, Matrix *b) {
+        dim3 threadsPerBlock(1, CUDA_THREADS_NN);
+        dim3 numBlocks(1, CEIL_DIV(a->nrow, threadsPerBlock.y));
+        cudak_(decompress)<<<numBlocks, threadsPerBlock>>> \
+            (MATRIX_ELEM_PTR(a), MATRIX_ELEM_PTR(b),
+             a->nrow, a->ncol,
+             a->stride / sizeof(MATRIX_ELEM),
+             b->stride / sizeof(MATRIX_ELEM));
+    }
 }
 #endif
diff --git a/matrix/generic/cumatrix.c b/matrix/generic/cumatrix.c
index 0df1bd7..373fc42 100644
--- a/matrix/generic/cumatrix.c
+++ b/matrix/generic/cumatrix.c
@@ -74,7 +74,8 @@ static int nerv_matrix_(mul)(lua_State *L) {
     if (an != bm)
         nerv_error(L, "Wrong dimension of multipliers");
 /*    MATRIX_ELEM alpha = 1.0f, beta = 0.0f; */
-    CUBLAS_SAFE_CALL( //Because matrix in Nerv is row-major, here b comes first
+    /* Because matrix in Nerv is row-major, here b comes first */
+    CUBLAS_SAFE_CALL(
             NERV_CUBLAS_(gemm)(cublas_handle, tb, ta,
                 bn, am, bm,
                 &alpha,
@@ -113,9 +114,11 @@ static int nerv_matrix_(sigmoid_grad)(lua_State *L) {
 static int nerv_matrix_(softmax)(lua_State *L) {
     Matrix *a = luaT_checkudata(L, 2, nerv_matrix_(tname));
     Matrix *b = luaT_checkudata(L, 1, nerv_matrix_(tname));
-    Matrix *max = nerv_matrix_(new_)(L, a->nrow, 1);
-    Matrix *dno = nerv_matrix_(new_)(L, a->nrow, 1);
+    Matrix *max;
+    Matrix *dno;
     CHECK_SAME_DIMENSION(a, b);
+    max = nerv_matrix_(new_)(L, a->nrow, 1);
+    dno = nerv_matrix_(new_)(L, a->nrow, 1);
     cudak_(cuda_rowmax)(a, max);
     cudak_(cuda_softmax_denominator)(a, max, dno);
     cudak_(cuda_softmax_final)(a, max, dno, b);
@@ -168,26 +171,22 @@ static int nerv_matrix_(fill)(lua_State *L) {
     return 0;
 }
 
-static int nerv_matrix_(copy_fromd)(lua_State *L) { 
+static int nerv_matrix_(copy_fromd)(lua_State *L) {
     Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname));
     Matrix *b = luaT_checkudata(L, 2, nerv_matrix_(tname));
-    CHECK_SAME_DIMENSION(a, b);
-    CUDA_SAFE_SYNC_CALL(
-            cudaMemcpy2D(MATRIX_ELEM_PTR(a), a->stride,
-                MATRIX_ELEM_PTR(b), b->stride,
-                sizeof(MATRIX_ELEM) * b->ncol, b->nrow,
-                cudaMemcpyDeviceToDevice));
-    return 0;
-}
-
-static int nerv_matrix_(copy_tod)(lua_State *L) {
-    Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname));
-    Matrix *b = luaT_checkudata(L, 2, nerv_matrix_(tname));
-    CHECK_SAME_DIMENSION(a, b);
+    int nargs = lua_gettop(L);
+    int b_begin = nargs > 2 ? luaL_checkinteger(L, 3) : 0;
+    int b_end = nargs > 3 ? luaL_checkinteger(L, 4) : b->nrow;
+    int a_begin = nargs > 4 ? luaL_checkinteger(L, 5) : 0;
+    if (!(0 <= b_begin && b_begin < b_end && b_end <= b->nrow &&
+            a_begin + b_end - b_begin <= a->nrow))
+        nerv_error(L, "invalid copy interval");
+    if (a->ncol != b->ncol)
+        nerv_error(L, "matrices should be of the same dimension");
     CUDA_SAFE_SYNC_CALL(
-            cudaMemcpy2D(MATRIX_ELEM_PTR(b), b->stride,
-                MATRIX_ELEM_PTR(a), a->stride,
-                sizeof(MATRIX_ELEM) * a->ncol, a->nrow,
+            cudaMemcpy2D(MATRIX_ROW_PTR(a, a_begin), a->stride,
+                MATRIX_ROW_PTR(b, b_begin), b->stride,
+                sizeof(MATRIX_ELEM) * b->ncol, b_end - b_begin,
                 cudaMemcpyDeviceToDevice));
     return 0;
 }
@@ -196,11 +195,19 @@ extern const char *MATRIX_CUMATRIX_HOST_TNAME;
 static int nerv_matrix_(copy_fromh)(lua_State *L) { 
     Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname));
     Matrix *b = luaT_checkudata(L, 2, MATRIX_CUMATRIX_HOST_TNAME);
-    CHECK_SAME_DIMENSION(a, b);
+    int nargs = lua_gettop(L);
+    int b_begin = nargs > 2 ? luaL_checkinteger(L, 3) : 0;
+    int b_end = nargs > 3 ? luaL_checkinteger(L, 4) : b->nrow;
+    int a_begin = nargs > 4 ? luaL_checkinteger(L, 5) : 0;
+    if (!(0 <= b_begin && b_begin < b_end && b_end <= b->nrow &&
+            a_begin + b_end - b_begin <= a->nrow))
+        nerv_error(L, "invalid copy interval");
+    if (a->ncol != b->ncol)
+        nerv_error(L, "matrices should be of the same dimension");
     CUDA_SAFE_SYNC_CALL(
-            cudaMemcpy2D(MATRIX_ELEM_PTR(a), a->stride,
-                MATRIX_ELEM_PTR(b), b->stride,
-                sizeof(MATRIX_ELEM) * b->ncol, b->nrow,
+            cudaMemcpy2D(MATRIX_ROW_PTR(a, a_begin), a->stride,
+                MATRIX_ROW_PTR(b, b_begin), b->stride,
+                sizeof(MATRIX_ELEM) * b->ncol, b_end - b_begin,
                 cudaMemcpyHostToDevice));
     return 0;
 }
@@ -208,11 +215,19 @@ static int nerv_matrix_(copy_fromh)(lua_State *L) {
 static int nerv_matrix_(copy_toh)(lua_State *L) {
     Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname));
     Matrix *b = luaT_checkudata(L, 2, MATRIX_CUMATRIX_HOST_TNAME);
-    CHECK_SAME_DIMENSION(a, b);
+    int nargs = lua_gettop(L);
+    int a_begin = nargs > 2 ? luaL_checkinteger(L, 3) : 0;
+    int a_end = nargs > 3 ? luaL_checkinteger(L, 4) : a->nrow;
+    int b_begin = nargs > 4 ? luaL_checkinteger(L, 5) : 0;
+    if (!(0 <= a_begin && a_begin < a_end && a_end <= a->nrow &&
+            b_begin + a_end - a_begin <= b->nrow))
+        nerv_error(L, "invalid copy interval");
+    if (b->ncol != a->ncol)
+        nerv_error(L, "matrices should be of the same dimension");
     CUDA_SAFE_SYNC_CALL(
-            cudaMemcpy2D(MATRIX_ELEM_PTR(b), b->stride,
-                MATRIX_ELEM_PTR(a), a->stride,
-                sizeof(MATRIX_ELEM) * a->ncol, a->nrow,
+            cudaMemcpy2D(MATRIX_ROW_PTR(b, b_begin), b->stride,
+                MATRIX_ROW_PTR(a, a_begin), a->stride,
+                sizeof(MATRIX_ELEM) * a->ncol, a_end - a_begin,
                 cudaMemcpyDeviceToHost));
     return 0;
 }
@@ -221,6 +236,7 @@ static int nerv_matrix_(trans)(lua_State *L) {
     Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname));
     Matrix *b = nerv_matrix_(new_)(L, a->ncol, a->nrow);
     MATRIX_ELEM alpha = 1, beta = 0;
+    /* FIXME: possible memory leak when lua error is raised */
     CUBLAS_SAFE_CALL(
             NERV_CUBLAS_(geam)(cublas_handle, CUBLAS_OP_T, CUBLAS_OP_T,
                 a->nrow, a->ncol,
@@ -251,6 +267,19 @@ static int nerv_matrix_(log_elem)(lua_State *L) {
     return 0;
 }
 
+static int nerv_matrix_(decompress)(lua_State *L) {
+    Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname));
+    Matrix *b;
+    int orig_col = luaL_checkinteger(L, 2);
+    if (a->ncol != 1)
+        nerv_error(L, "the compressed matrix must be a column vector");
+    b = nerv_matrix_(new_)(L, a->nrow, orig_col);
+    cudak_(cuda_fill)(b, 0.0);
+    cudak_(cuda_decompress)(a, b);
+    luaT_pushudata(L, b, nerv_matrix_(tname));
+    return 1;
+}
+
 extern const char *nerv_matrix_host_int_tname;
 static int nerv_matrix_(copy_rows_fromh_by_idx)(lua_State *L) {
     Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname));
@@ -322,11 +351,11 @@ static const luaL_Reg nerv_matrix_(extra_methods)[] = {
     {"rowsum", nerv_matrix_(rowsum)},
     {"rowmax", nerv_matrix_(rowmax)},
     {"trans", nerv_matrix_(trans)},
+    {"decompress", nerv_matrix_(decompress)},
     /* in-place calc */
     {"copy_fromh", nerv_matrix_(copy_fromh)},
     {"copy_fromd", nerv_matrix_(copy_fromd)},
     {"copy_toh", nerv_matrix_(copy_toh)},
-    {"copy_tod", nerv_matrix_(copy_tod)},
     {"add", nerv_matrix_(add)},
     {"mul", nerv_matrix_(mul)},
     {"add_row", nerv_matrix_(add_row)},
diff --git a/matrix/generic/mmatrix.c b/matrix/generic/mmatrix.c
index 3a9ae79..4b722f3 100644
--- a/matrix/generic/mmatrix.c
+++ b/matrix/generic/mmatrix.c
@@ -11,6 +11,7 @@
 #define NERV_GENERIC_MATRIX
 #include "../../common.h"
 #include "../../io/chunk_file.h"
+#include "string.h"
 
 static void host_matrix_(alloc)(lua_State *L,
                                 MATRIX_ELEM **dptr, size_t *stride,
@@ -96,10 +97,27 @@ int nerv_matrix_(save)(lua_State *L) {
     return 0;
 }
 
-
+static int nerv_matrix_(copy_from)(lua_State *L) {
+    Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname));
+    Matrix *b = luaT_checkudata(L, 2, nerv_matrix_(tname));
+    int nargs = lua_gettop(L);
+    int b_begin = nargs > 2 ? luaL_checkinteger(L, 3) : 0;
+    int b_end = nargs > 3 ? luaL_checkinteger(L, 4) : b->nrow;
+    int a_begin = nargs > 4 ? luaL_checkinteger(L, 5) : 0;
+    if (!(0 <= b_begin && b_begin < b_end && b_end <= b->nrow &&
+            a_begin + b_end - b_begin <= a->nrow))
+        nerv_error(L, "invalid copy interval");
+    if (a->ncol != b->ncol)
+        nerv_error(L, "matrices should be of the same dimension");
+    memmove(MATRIX_ROW_PTR(a, a_begin),
+            MATRIX_ROW_PTR(b, b_begin),
+            sizeof(MATRIX_ELEM) * b->ncol * (b_end - b_begin));
+    return 0;
+}
 static const luaL_Reg nerv_matrix_(extra_methods)[] = {
     {"load", nerv_matrix_(load)},
     {"save", nerv_matrix_(save)},
+    {"copy_from", nerv_matrix_(copy_from)},
     {NULL, NULL}
 };
 
diff --git a/matrix/init.lua b/matrix/init.lua
index f309f81..9637391 100644
--- a/matrix/init.lua
+++ b/matrix/init.lua
@@ -22,7 +22,8 @@ function nerv.Matrix:__tostring__()
             table.insert(strt, "\n")
         end
     end
-    table.insert(strt, string.format("[Matrix %d x %d]", nrow, ncol))
+    table.insert(strt, string.format(
+        "[%s %d x %d]", self.__typename, nrow, ncol))
     return table.concat(strt)
 end
 
-- 
cgit v1.2.3-70-g09d2