29 files changed, 1805 insertions, 537 deletions
diff --git a/nerv/Makefile b/nerv/Makefile
index 4008453..06a439a 100644
--- a/nerv/Makefile
+++ b/nerv/Makefile
@@ -1,58 +1,65 @@
 .PHONY: build install clean
 SHELL := /bin/bash
 BUILD_DIR := $(CURDIR)/build
-OBJS := nerv.o lib/luaT/luaT.o lib/io/chunk_file.o common.o \
-		matrix/mmatrix.o matrix/cumatrix.o matrix/init.o matrix/cukernel.o \
+LIB_PATH := $(LUA_BINDIR)/../lib
+OBJ_DIR := $(BUILD_DIR)/objs
+CORE_OBJS := common.o lib/io/chunk_file.o lib/matrix/mmatrix.o lib/matrix/cumatrix.o lib/matrix/cukernel.o
+NERV_OBJS := nerv.o \
+		matrix/mmatrix.o matrix/cumatrix.o matrix/init.o \
 		io/init.o io/chunk_file.o \
 		examples/oop_example.o
-LIBS := libnerv.so
+LUAT_OBJS := lib/luaT/luaT.o
+CORE_OBJS := $(addprefix $(OBJ_DIR)/,$(CORE_OBJS))
+NERV_OBJS := $(addprefix $(OBJ_DIR)/,$(NERV_OBJS))
+LUAT_OBJS := $(addprefix $(OBJ_DIR)/,$(LUAT_OBJS))
+OBJS := $(CORE_OBJS) $(NERV_OBJS) $(LUAT_OBJS)
+LIBS := $(INST_LIBDIR)/libnerv.so $(LIB_PATH)/libnervcore.so $(LIB_PATH)/libluaT.so
 LUA_LIBS := matrix/init.lua io/init.lua init.lua \
 			layer/init.lua layer/affine.lua layer/sigmoid.lua layer/softmax_ce.lua \
 			layer/window.lua layer/bias.lua layer/combiner.lua layer/mse.lua \
 			nn/init.lua nn/layer_repo.lua nn/param_repo.lua nn/layer_dag.lua \
 			io/sgd_buffer.lua
 INCLUDE := -I $(LUA_INCDIR) -DLUA_USE_APICHECK
-CUDA_BASE := /usr/local/cuda-6.5
-#CUDA_BASE := /usr/local/cuda-5.0
+#CUDA_BASE := /usr/local/cuda-6.5
+CUDA_BASE := /usr/local/cuda-5.0
 CUDA_INCLUDE := -I $(CUDA_BASE)/include/
 INCLUDE += $(CUDA_INCLUDE)
 LDFLAGS := -L$(CUDA_BASE)/lib64/  -Wl,-rpath=$(CUDA_BASE)/lib64/ -lcudart -lcublas
 CFLAGS := -Wall -Wextra -O2
-OBJ_DIR := $(BUILD_DIR)/objs
-SUBDIR := matrix io layer examples nn lib/io lib/luaT
+SUBDIR := matrix io layer examples nn lib/io lib/luaT lib/matrix
 NVCC := $(CUDA_BASE)/bin/nvcc
 NVCC_FLAGS := -Xcompiler -fPIC,-Wall,-Wextra
 
 LUA_DIR = $(INST_LUADIR)/nerv
-OBJS := $(addprefix $(OBJ_DIR)/,$(OBJS))
 OBJ_SUBDIR := $(addprefix $(OBJ_DIR)/,$(SUBDIR))
 LUA_SUBDIR := $(addprefix $(LUA_DIR)/,$(SUBDIR))
-LIBS := $(addprefix $(INST_LIBDIR)/,$(LIBS))
 LUA_LIBS := $(addprefix $(LUA_DIR)/,$(LUA_LIBS))
 
 build: $(OBJ_DIR) $(OBJ_SUBDIR) $(OBJS)
+
 $(OBJ_DIR) $(LUA_DIR) $(OBJ_SUBDIR) $(LUA_SUBDIR):
 	-mkdir -p $@
 $(OBJ_DIR)/%.o: %.c $(patsubst /%.o,/%.c,$@)
 	gcc -c -o $@ $< $(INCLUDE) -fPIC $(CFLAGS)
-$(OBJ_DIR)/matrix/cukernel.o: matrix/cukernel.cu
+$(OBJ_DIR)/lib/matrix/cukernel.o: lib/matrix/cukernel.cu
 	$(NVCC) -c -o $@ $< $(INCLUDE) $(NVCC_FLAGS)
 $(LUA_DIR)/%.lua: %.lua
 	cp $< $@
 #$(OBJ_DIR)/luaT.o:
 #	gcc -c -o $@ luaT/luaT.c $(INCLUDE) -fPIC
-$(LIBS): $(OBJS)
-	gcc -shared -o $@ $(OBJS) $(LDFLAGS)
+$(LIB_PATH)/libnervcore.so: $(CORE_OBJS)  # core library: common, lib/io and lib/matrix objects
+	gcc -shared $^ $(LDFLAGS) -o $@
+$(LIB_PATH)/libluaT.so: $(LUAT_OBJS)  # luaT helper library
+	gcc -shared $^ $(LDFLAGS) -o $@
+$(INST_LIBDIR)/libnerv.so: $(NERV_OBJS) $(LIB_PATH)/libnervcore.so $(LIB_PATH)/libluaT.so  # Lua module; links only the .o prerequisites, the two .so are pulled in via -l
+	gcc -shared $(filter %.o,$^) $(LDFLAGS) -Wl,-rpath=$(LIB_PATH) -L$(LIB_PATH) -lnervcore -lluaT -o $@
 
 $(OBJ_DIR)/matrix/cumatrix.o: matrix/generic/cumatrix.c matrix/generic/matrix.c matrix/generic/cukernel.cu
 $(OBJ_DIR)/matrix/mmatrix.o: matrix/generic/mmatrix.c matrix/generic/matrix.c
-$(OBJ_DIR)/matrix/cukernel.o: matrix/generic/cukernel.cu
-
-.PHONY: speech
 
-speech:
-	-mkdir -p build/objs/speech/tnet_io
-	$(MAKE) -C speech/ BUILD_DIR=$(BUILD_DIR) LIB_DIR=$(LIB_DIR) OBJ_DIR=$(CURDIR)/build/objs/speech/ LUA_DIR=$(LUA_DIR)
+$(OBJ_DIR)/lib/matrix/cumatrix.o: lib/matrix/generic/cumatrix.c lib/matrix/generic/matrix.c lib/matrix/generic/cukernel.cu
+$(OBJ_DIR)/lib/matrix/mmatrix.o: lib/matrix/generic/mmatrix.c lib/matrix/generic/matrix.c
+$(OBJ_DIR)/lib/matrix/cukernel.o: lib/matrix/generic/cukernel.cu
 
 clean:
 	-rm -rf $(OBJ_DIR)
diff --git a/nerv/common.c b/nerv/common.c
index b4e39e6..0584438 100644
--- a/nerv/common.c
+++ b/nerv/common.c
@@ -11,6 +11,31 @@ int nerv_error(lua_State *L, const char *err_mesg_fmt, ...) {
     return 0;
 }
 
+/* Raise a Lua error for `status`: "<code text>[: <msg>] @<file>:<line>"; returns nerv_error()'s result. */
+int nerv_error_status(lua_State *L, Status *status) {
+    const char *mmesg = "unknown error"; /* fallback so an unmapped code never passes NULL to %s */
+    switch (status->err_code)
+    {
+        case MAT_GENERAL_ERR: mmesg = "general error"; break;
+        case MAT_INSUF_MEM: mmesg = "insufficient memory"; break;
+        case MAT_INVALID_FORMAT: mmesg = "invalid matrix format"; break;
+        case MAT_WRITE_ERROR: mmesg = "error while writing matrix"; break;
+        case MAT_INVALID_COPY_INTERVAL: mmesg = "invalid copy interval"; break;
+        case MAT_MISMATCH_DIM: mmesg = "mismatching matrix dimension"; break;
+        case MAT_WRONG_MULT_DIM: mmesg = "wrong multiplier dimension"; break;
+        case MAT_ROW_VECTOR_EXP: mmesg = "row vector expected"; break;
+        case MAT_COL_VECTOR_EXP: mmesg = "column vector expected"; break;
+        case MAT_IDX_VECTOR_EXP: mmesg = "index vector expected"; break;
+        case MAT_INVALID_IDX: mmesg = "invalid index"; break;
+        case MAT_CUDA_ERR: mmesg = "cuda error"; break;
+        case MAT_CUBLAS_ERR: mmesg = "cublas error"; break;
+    }
+    if (status->msg) /* optional detail text supplied by the failing call site */
+        return nerv_error(L, "%s: %s @%s:%d", mmesg, status->msg,
+                                        status->file, status->lineno);
+    return nerv_error(L, "%s @%s:%d", mmesg, status->file, status->lineno); /* every path now returns a value */
+}
+
 int nerv_error_method_not_implemented(lua_State *L) {
     return nerv_error(L, "method not implemented"); 
 }
diff --git a/nerv/common.h b/nerv/common.h
index 6657dc4..413ca51 100644
--- a/nerv/common.h
+++ b/nerv/common.h
@@ -7,6 +7,50 @@
 #include <stdio.h>
 #include <stdlib.h>
 
+enum { /* status codes carried in Status.err_code */
+    MAT_NORMAL, /* 0: no error; NERV_LUA_CHECK_STATUS raises for any other value */
+    MAT_GENERAL_ERR, /* unspecified failure */
+    MAT_INSUF_MEM, /* insufficient memory */
+    MAT_INVALID_FORMAT, /* invalid matrix format */
+    MAT_WRITE_ERROR, /* error while writing a matrix */
+    MAT_INVALID_COPY_INTERVAL, /* invalid copy interval */
+    MAT_MISMATCH_DIM, /* matrix dimensions do not match */
+    MAT_WRONG_MULT_DIM, /* wrong multiplier dimension */
+    MAT_ROW_VECTOR_EXP, /* a row vector was expected */
+    MAT_COL_VECTOR_EXP, /* a column vector was expected */
+    MAT_IDX_VECTOR_EXP, /* an index vector was expected */
+    MAT_INVALID_IDX, /* invalid index */
+    MAT_CUDA_ERR, /* a CUDA runtime call did not return cudaSuccess */
+    MAT_CUBLAS_ERR /* a cuBLAS call did not return CUBLAS_STATUS_SUCCESS */
+};
+
+typedef struct Status { /* error report filled in by NERV_SET_STATUS */
+    int err_code; /* one of the MAT_* codes; MAT_NORMAL means success */
+    const char *file; /* __FILE__ of the site that set the status */
+    int lineno; /* __LINE__ of the site that set the status */
+    const char *msg; /* optional detail text; may be 0 (tested before printing) */
+} Status;
+
+#define NERV_SET_STATUS(status, code, m) /* status: Status*, code: MAT_* value, m: detail string or 0 */ \
+    do { \
+        (status)->err_code = code; \
+        (status)->msg = m; \
+        (status)->file = __FILE__; /* expands where the macro is used, i.e. the reporting site */ \
+        (status)->lineno = __LINE__; \
+    } while (0)
+
+#define NERV_EXIT_STATUS(status, code, msg) /* set the status, then leave the enclosing function */ \
+    do { \
+        NERV_SET_STATUS(status, code, msg); \
+        return; /* bare return: only usable inside functions returning void */ \
+    } while (0)
+
+#define NERV_LUA_CHECK_STATUS(L, status) /* status: a Status lvalue (not a pointer); raises a Lua error on failure */ \
+    do { \
+        if ((status).err_code != MAT_NORMAL) /* parenthesised so arguments such as *p expand correctly */ \
+            nerv_error_status(L, &(status)); \
+    } while (0)
+
 typedef struct HashNode {
     const char *key;
     void *val;
@@ -31,6 +75,7 @@ void hashmap_clear(HashMap *h);
 size_t bkdr_hash(const char *key);
 
 int nerv_error(lua_State *L, const char *err_mesg_fmt, ...); 
+int nerv_error_status(lua_State *L, Status *status); 
 int nerv_error_method_not_implemented(lua_State *L);
 void luaN_append_methods(lua_State *L, const luaL_Reg *mlist);
 #endif
diff --git a/nerv/lib/io/chunk_file.c b/nerv/lib/io/chunk_file.c
index a305962..e70ffc9 100644
--- a/nerv/lib/io/chunk_file.c
+++ b/nerv/lib/io/chunk_file.c
@@ -91,6 +91,7 @@ static ChunkFile *open_write(const char *fn, int *status) {
     }
     cfp = (ChunkFile *)malloc(sizeof(ChunkFile));
     cfp->fp = fp;
+    cfp->info = NULL;
     cfp->status = CF_WRITE;
     *status = CF_NORMAL;
     return cfp;
@@ -111,8 +112,6 @@ static ChunkFile *open_read(const char *fn, int *status) {
         return NULL;
     }
     cfp = (ChunkFile *)malloc(sizeof(ChunkFile));
-    cfp->fp = fp;
-    cfp->status = CF_READ;
     offset = ftello(fp);
     /* fprintf(stderr, "%d\n", (int)offset); */
     for (i = 0;; offset += chunk_len, i++)
@@ -144,7 +143,9 @@ static ChunkFile *open_read(const char *fn, int *status) {
         head = cip;
     }
     *status = CF_NORMAL;
+    cfp->fp = fp;
     cfp->info = head;
+    cfp->status = CF_READ;
     return cfp;
 }
 
@@ -208,13 +209,16 @@ void nerv_chunk_file_close(ChunkFile *cfp) {
 
 void nerv_chunk_file_destroy(ChunkFile *cfp) {
     ChunkInfo *i, *ni;
-    if (cfp->status != CF_CLOSED) fclose(cfp->fp);
-    for (i = cfp->info; i; i = ni)
+    if (cfp->info) /* info is NULL for files opened by open_write */
     {
-        ni = i->next;
-        free(i->metadata);
-        free(i);
+        for (i = cfp->info; i; i = ni)
+        {
+            ni = i->next; /* remember the successor before this node is freed */
+            free(i->metadata);
+            free(i);
+        }
     }
+    if (cfp->status != CF_CLOSED) fclose(cfp->fp); /* close the stream only if it is still open */
     free(cfp);
 }
 
diff --git a/nerv/matrix/cuda_helper.h b/nerv/lib/matrix/cuda_helper.h
index fde6f18..8041efb 100644
--- a/nerv/matrix/cuda_helper.h
+++ b/nerv/lib/matrix/cuda_helper.h
@@ -4,33 +4,68 @@
 #include "cuda_runtime.h"
 #include "driver_types.h"
 #include "cublas_v2.h"
-#define CUBLAS_SAFE_SYNC_CALL(call) \
+
+#define CUBLAS_SAFE_SYNC_CALL_RET(call, status) /* variant for value-returning functions: returns 0 on failure */ \
+    do { \
+        cublasStatus_t  err = (call); \
+        if (err != CUBLAS_STATUS_SUCCESS) \
+        { \
+            NERV_SET_STATUS(status, MAT_CUBLAS_ERR, cublasGetErrorString(err)); \
+            return 0; /* leaves the enclosing function */ \
+        } \
+        cudaDeviceSynchronize(); /* result of the synchronisation itself is not checked */ \
+    } while (0)
+
+#define CUBLAS_SAFE_SYNC_CALL(call, status) \
     do { \
         cublasStatus_t  err = (call); \
         if (err != CUBLAS_STATUS_SUCCESS) \
-            nerv_error(L, "cumatrix cublas error: %s at %s:%d", \
-                        cublasGetErrorString(err), __FILE__, __LINE__); \
+            NERV_EXIT_STATUS(status, MAT_CUBLAS_ERR, cublasGetErrorString(err)); \
         cudaDeviceSynchronize(); \
     } while (0)
 
-#define CUDA_SAFE_CALL(call) \
+#define CUDA_SAFE_CALL_RET(call, status) /* variant for value-returning functions: returns 0 on failure */ \
+    do { \
+        cudaError_t err = (call); \
+        if (err != cudaSuccess) \
+        { \
+            NERV_SET_STATUS(status, MAT_CUDA_ERR, cudaGetErrorString(err)); /* msg is the CUDA error string */ \
+            return 0; /* leaves the enclosing function */ \
+        } \
+    } while (0)
+
+#define CUDA_SAFE_CALL(call, status) \
     do { \
         cudaError_t err = (call); \
         if (err != cudaSuccess) \
-            nerv_error(L, "cumatrix CUDA error: %s at %s:%d", \
-                            cudaGetErrorString(err), __FILE__, __LINE__); \
+            NERV_EXIT_STATUS(status, MAT_CUDA_ERR, cudaGetErrorString(err)); \
+    } while (0)
+
+#define CUDA_SAFE_SYNC_CALL(call, status) /* CUDA_SAFE_CALL followed by a device-wide synchronisation */ \
+    do { \
+        CUDA_SAFE_CALL(call, status); /* on failure this returns from the enclosing function */ \
+        cudaDeviceSynchronize(); /* reached only on success; its own result is not checked */ \
+    } while (0)
 
-#define CUDA_SAFE_SYNC_CALL(call) \
+#define CUDA_SAFE_SYNC_CALL_RET(call, status) \
     do { \
-        CUDA_SAFE_CALL(call); \
+        CUDA_SAFE_CALL_RET(call, status); \
         cudaDeviceSynchronize(); \
     } while (0)
 
-#define CHECK_SAME_DIMENSION(a, b) \
+#define CHECK_SAME_DIMENSION(a, b, status) /* a, b: matrix pointers; exits via NERV_EXIT_STATUS on mismatch */ \
+    do { \
+        if (!((a)->nrow == (b)->nrow && (a)->ncol == (b)->ncol)) /* arguments parenthesised for safe expansion */ \
+            NERV_EXIT_STATUS(status, MAT_MISMATCH_DIM, 0); \
+    } while (0)
+
+#define CHECK_SAME_DIMENSION_RET(a, b, status) \
     do { \
         if (!(a->nrow == b->nrow && a->ncol == b->ncol)) \
-            nerv_error(L, "matrices should be of the same dimension"); \
+        { \
+            NERV_SET_STATUS(status, MAT_MISMATCH_DIM, 0); \
+            return 0; \
+        } \
     } while (0)
 
 static const char *cublasGetErrorString(cublasStatus_t err) {
diff --git a/nerv/matrix/cukernel.cu b/nerv/lib/matrix/cukernel.cu
index a19030a..a19030a 100644
--- a/nerv/matrix/cukernel.cu
+++ b/nerv/lib/matrix/cukernel.cu
diff --git a/nerv/matrix/cukernel.h b/nerv/lib/matrix/cukernel.h
index 8a1494f..8a1494f 100644
--- a/nerv/matrix/cukernel.h
+++ b/nerv/lib/matrix/cukernel.h
diff --git a/nerv/lib/matrix/cumatrix.c b/nerv/lib/matrix/cumatrix.c
new file mode 100644
index 0000000..9641197
--- /dev/null
+++ b/nerv/lib/matrix/cumatrix.c
@@ -0,0 +1,69 @@
+#define NERV_GENERIC_CUMATRIX
+#include "../../common.h"
+#include "cuda_helper.h"
+#include <string.h>
+#define PROFILE_HASHMAP_SIZE 123457
+static cublasHandle_t cublas_handle;
+static cudaEvent_t profile_start, profile_stop;
+static HashMap *profile;
+
+/* Dump every (key, accumulated float) pair of the profile table to stderr. */
+void nerv_cumatrix_print_profile() {
+    size_t slot = 0;
+    fprintf(stderr, "*** [nerv cumatrix profile] **\n");
+    while (slot < profile->size)
+    {
+        HashNode *node = profile->bucket[slot++];
+        /* walk the chain hanging off this bucket */
+        for (; node; node = node->next)
+            fprintf(stderr, "%s:\t%.6f\n", node->key, *(float *)node->val);
+    }
+}
+
+void nerv_cumatrix_clear_profile() { /* discard all accumulated profile entries */
+    hashmap_clear(profile); /* NOTE(review): values are malloc'd in accu_profile; confirm hashmap_clear releases them */
+}
+
+/* Add `delta` to the profile entry `name`, creating a zero-initialised entry on first use. */
+void accu_profile(const char *name, float delta) {
+    float *val = hashmap_getval(profile, name);
+    if (!val)
+    {
+        if (!(val = calloc(1, sizeof(float)))) return; /* out of memory: drop this sample instead of writing through NULL */
+        hashmap_setval(profile, name, val);
+    }
+    *val += delta;
+}
+
+void nerv_cumatrix_init() { /* set up the file-level cuBLAS handle, timing events and profile table */
+    cublasCreate(&cublas_handle); /* returned status is ignored */
+    cudaEventCreate(&profile_start); /* returned status is ignored */
+    cudaEventCreate(&profile_stop);
+    profile = hashmap_create(PROFILE_HASHMAP_SIZE, bkdr_hash, strcmp); /* string-keyed table */
+}
+
+#define MATRIX_USE_FLOAT /* ---- first pass: single-precision instantiation ---- */
+#define cuda_matrix_(NAME) cuda_matrix_float_##NAME
+#define nerv_matrix_(NAME) nerv_matrix_cuda_float_##NAME
+#define cudak_(NAME) cudak_float_ ## NAME /* name prefix of the float kernel wrappers */
+#define NERV_CUBLAS_(NAME) cublasS##NAME /* maps to the cublasS* entry points */
+#define MATRIX_CUMATRIX_HOST_TNAME nerv_matrix_host_float_tname
+#include "generic/cumatrix.c" /* shared template body, compiled once per element type */
+#undef NERV_CUBLAS_ /* reset the name-mangling macros before the double pass */
+#undef cudak_
+#undef nerv_matrix_
+#undef cuda_matrix_
+#undef MATRIX_USE_FLOAT
+#undef MATRIX_ELEM /* NOTE(review): presumably defined during the include above from MATRIX_USE_FLOAT -- confirm */
+#undef MATRIX_ELEM_PTR
+#undef MATRIX_ELEM_FMT
+#undef MATRIX_ELEM_WRITE_FMT
+#undef MATRIX_CUMATRIX_HOST_TNAME
+
+#define MATRIX_USE_DOUBLE /* ---- second pass: double-precision instantiation ---- */
+#define cuda_matrix_(NAME) cuda_matrix_double_##NAME
+#define nerv_matrix_(NAME) nerv_matrix_cuda_double_##NAME
+#define cudak_(NAME) cudak_double_ ## NAME /* name prefix of the double kernel wrappers */
+#define NERV_CUBLAS_(NAME) cublasD##NAME /* maps to the cublasD* entry points */
+#define MATRIX_CUMATRIX_HOST_TNAME nerv_matrix_host_double_tname
+#include "generic/cumatrix.c" /* same template body, now for double */
diff --git a/nerv/lib/matrix/cumatrix.h b/nerv/lib/matrix/cumatrix.h
new file mode 100644
index 0000000..9f71507
--- /dev/null
+++ b/nerv/lib/matrix/cumatrix.h
@@ -0,0 +1,6 @@
+#ifndef NERV_CUMATRIX_H
+#define NERV_CUMATRIX_H
+void nerv_cumatrix_print_profile(void); /* print the accumulated profile entries to stderr */
+void nerv_cumatrix_clear_profile(void); /* empty the profile table */
+void nerv_cumatrix_init(void);          /* create the cuBLAS handle, timing events and profile table */
+#endif
diff --git a/nerv/lib/matrix/generic/cukernel.cu b/nerv/lib/matrix/generic/cukernel.cu
new file mode 100644
index 0000000..6111193
--- /dev/null
+++ b/nerv/lib/matrix/generic/cukernel.cu
@@ -0,0 +1,571 @@
+#ifdef NERV_GENERIC_CUKERNEL
+#include <assert.h>
+#include <stdio.h>
+#include "../matrix.h"
+#include "cuda.h"
+#include "float.h"
+#define CUDA_THREADS_N 16
+#define CUDA_THREADS_NN ((CUDA_THREADS_N) * (CUDA_THREADS_N))
+#define CEIL_DIV(a, b) (((a) + (b) - 1) / (b))
+/* b = log(a) element-wise; inputs below FLT_MIN are clamped to FLT_MIN first. */
+__global__ void cudak_(log_elem)(const MATRIX_ELEM *a, MATRIX_ELEM *b,
+                                int nrow, int ncol, int stride) {
+    const int col = blockIdx.x * blockDim.x + threadIdx.x;
+    const int row = blockIdx.y * blockDim.y + threadIdx.y;
+    if (row >= nrow || col >= ncol) return; /* thread lies outside the matrix */
+    const long pos = col + row * stride;
+    MATRIX_ELEM v = a[pos];
+    /* keep the argument of log() at or above FLT_MIN */
+    if (v < FLT_MIN) v = FLT_MIN;
+    b[pos] = log(v);
+}
+
+/* Element-wise product: c[i][j] = a[i][j] * b[i][j], one thread per element. */
+__global__ void cudak_(mul_elem)(const MATRIX_ELEM *a, const MATRIX_ELEM *b,
+                                MATRIX_ELEM *c,
+                                int nrow, int ncol, int stride) {
+    const int col = blockIdx.x * blockDim.x + threadIdx.x;
+    const int row = blockIdx.y * blockDim.y + threadIdx.y;
+    if (row >= nrow || col >= ncol) return;
+    const long pos = col + row * stride;
+    c[pos] = a[pos] * b[pos];
+}
+
+/* Logistic sigmoid, element-wise: b = 1 / (1 + exp(-a)). */
+__global__ void cudak_(sigmoid)(const MATRIX_ELEM *a, MATRIX_ELEM *b,
+                        int nrow, int ncol, int stride) {
+    const int col = blockIdx.x * blockDim.x + threadIdx.x;
+    const int row = blockIdx.y * blockDim.y + threadIdx.y;
+    if (row >= nrow || col >= ncol) return;
+    const long pos = col + row * stride;
+    b[pos] = 1.0 / (1.0 + exp(-a[pos]));
+}
+
+/* Sigmoid gradient, element-wise: nerr = output * (1 - output) * err. */
+__global__ void cudak_(sigmoid_grad)(const MATRIX_ELEM *output,
+                                    const MATRIX_ELEM *err,
+                                    MATRIX_ELEM *nerr,
+                                    int nrow, int ncol, int stride) {
+    const int col = blockIdx.x * blockDim.x + threadIdx.x;
+    const int row = blockIdx.y * blockDim.y + threadIdx.y;
+    if (row >= nrow || col >= ncol) return;
+    const long pos = col + row * stride;
+    nerr[pos] = output[pos] * (1.0 - output[pos]) * err[pos];
+}
+
+/* b[i][j] = exp(a[i][j] - max[i]) / deno[i]; max and deno are read at column 0 with row stride mstride. */
+__global__ void cudak_(softmax_final)(const MATRIX_ELEM *a, MATRIX_ELEM *b,
+                        const MATRIX_ELEM *max, const MATRIX_ELEM *deno,
+                        int nrow, int ncol, int stride, int mstride) {
+    const int col = blockIdx.x * blockDim.x + threadIdx.x;
+    const int row = blockIdx.y * blockDim.y + threadIdx.y;
+    if (row >= nrow || col >= ncol) return;
+    const long pos = col + row * stride;
+    b[pos] = exp(a[pos] - max[0 + row * mstride]) / deno[0 + row * mstride];
+}
+
+__global__ void cudak_(block_reduce_rowsum)(const MATRIX_ELEM *input, /* sums up to blockDim.x columns of one row per block */
+                                            MATRIX_ELEM *output,
+                                            const int istride, const int ostride,
+                                            const int n) {
+    extern __shared__ MATRIX_ELEM cudak_(arr)[]; /* size supplied at launch; indexed by threadIdx.x */
+    int j = blockIdx.x * blockDim.x + threadIdx.x; /* column handled by this thread; blockIdx.y is the row */
+    cudak_(arr)[threadIdx.x] = j < n ? input[j + istride * blockIdx.y] : 0; /* columns at or past n contribute 0 */
+    __syncthreads();
+    for (int offset = blockDim.x >> 1;  offset; offset >>= 1) /* NOTE(review): halving covers every slot only if blockDim.x is a power of two */
+    {
+        if (threadIdx.x < offset)
+            cudak_(arr)[threadIdx.x] += cudak_(arr)[threadIdx.x + offset];
+        __syncthreads(); /* finish this step before the next one reads arr */
+    }
+    if (threadIdx.x == 0)
+        output[blockIdx.x + ostride * blockIdx.y] = cudak_(arr)[0]; /* one partial sum per (block, row) */
+}
+
+__global__ void cudak_(block_reduce_colsum)(const MATRIX_ELEM *input, /* sums up to blockDim.y rows of one column per block */
+                                MATRIX_ELEM *output,
+                                const int istride, const int ostride,
+                                const int n) {
+    extern __shared__ MATRIX_ELEM cudak_(arr)[]; /* size supplied at launch; indexed by threadIdx.y */
+    int i = blockIdx.y * blockDim.y + threadIdx.y; /* row handled by this thread; blockIdx.x is the column */
+    cudak_(arr)[threadIdx.y] = i < n ? input[blockIdx.x + istride * i] : 0; /* rows at or past n contribute 0 */
+    __syncthreads();
+    for (int offset = blockDim.y >> 1;  offset; offset >>= 1) /* NOTE(review): halving covers every slot only if blockDim.y is a power of two */
+    {
+        if (threadIdx.y < offset)
+            cudak_(arr)[threadIdx.y] += cudak_(arr)[threadIdx.y + offset];
+        __syncthreads(); /* finish this step before the next one reads arr */
+    }
+    if (threadIdx.y == 0)
+        output[blockIdx.x + ostride * blockIdx.y] = cudak_(arr)[0]; /* one partial sum per (column, block) */
+}
+
+__global__ void cudak_(block_reduce_colsame)(const MATRIX_ELEM *input, /* per column: counts rows where input equals ref_input */
+                                            const MATRIX_ELEM *ref_input,
+                                            MATRIX_ELEM *output,
+                                            const int istride, const int ostride,
+                                            const int n) {
+    extern __shared__ MATRIX_ELEM cudak_(arr)[]; /* size supplied at launch; indexed by threadIdx.y */
+    int i = blockIdx.y * blockDim.y + threadIdx.y; /* row handled by this thread; blockIdx.x is the column */
+    cudak_(arr)[threadIdx.y] = (i < n && input[blockIdx.x + istride * i] == \
+                                        ref_input[blockIdx.x + istride * i]) ? 1.0 : 0; /* both matrices are read with istride */
+    __syncthreads();
+    for (int offset = blockDim.y >> 1;  offset; offset >>= 1) /* NOTE(review): halving covers every slot only if blockDim.y is a power of two */
+    {
+        if (threadIdx.y < offset)
+            cudak_(arr)[threadIdx.y] += cudak_(arr)[threadIdx.y + offset];
+        __syncthreads(); /* finish this step before the next one reads arr */
+    }
+    if (threadIdx.y == 0)
+        output[blockIdx.x + ostride * blockIdx.y] = cudak_(arr)[0]; /* match count for this (column, block) */
+}
+
+__global__ void cudak_(block_reduce_softmax_rowsum)(const MATRIX_ELEM *input, /* per row: partial sums of exp(input - max[row]) */
+                                        MATRIX_ELEM *output,
+                                        const MATRIX_ELEM *max,
+                                        const int istride, const int ostride,
+                                        const int mstride, const int n) {
+    extern __shared__ MATRIX_ELEM cudak_(arr)[]; /* size supplied at launch; indexed by threadIdx.x */
+    int j = blockIdx.x * blockDim.x + threadIdx.x; /* column handled by this thread; blockIdx.y is the row */
+    cudak_(arr)[threadIdx.x] = j < n ? exp(input[j + istride * blockIdx.y] - \
+                                    max[0 + mstride * blockIdx.y]) : 0; /* max is read at column 0 of its row */
+    __syncthreads();
+    for (int offset = blockDim.x >> 1;  offset; offset >>= 1) /* NOTE(review): halving covers every slot only if blockDim.x is a power of two */
+    {
+        if (threadIdx.x < offset)
+            cudak_(arr)[threadIdx.x] += cudak_(arr)[threadIdx.x + offset];
+        __syncthreads(); /* finish this step before the next one reads arr */
+    }
+    if (threadIdx.x == 0)
+        output[blockIdx.x + ostride * blockIdx.y] = cudak_(arr)[0]; /* one partial sum per (block, row) */
+}
+
+__global__ void cudak_(block_reduce_rowmax)(const MATRIX_ELEM *input, /* per row: maximum over up to blockDim.x columns per block */
+                                            MATRIX_ELEM *output,
+                                            const int istride, const int ostride,
+                                            const int n) {
+    extern __shared__ MATRIX_ELEM cudak_(arr)[]; /* size supplied at launch; indexed by threadIdx.x */
+    int j = blockIdx.x * blockDim.x + threadIdx.x; /* column handled by this thread; blockIdx.y is the row */
+    cudak_(arr)[threadIdx.x] = j < n ? input[j + istride * blockIdx.y] : -FLT_MAX; /* padding uses -FLT_MAX, also in the double build */
+    __syncthreads();
+    for (int offset = blockDim.x >> 1;  offset; offset >>= 1) /* NOTE(review): halving covers every slot only if blockDim.x is a power of two */
+    {
+        if (threadIdx.x < offset)
+        {
+            MATRIX_ELEM l = cudak_(arr)[threadIdx.x],
+                        r = cudak_(arr)[threadIdx.x + offset];
+            if (r > l) /* keep the larger of the pair in the lower slot */
+                cudak_(arr)[threadIdx.x] = r;
+        }
+        __syncthreads(); /* finish this step before the next one reads arr */
+    }
+    if (threadIdx.x == 0)
+        output[blockIdx.x + ostride * blockIdx.y] = cudak_(arr)[0]; /* one partial maximum per (block, row) */
+}
+
+__global__ void cudak_(block_reduce_rowmax_idx)(const MATRIX_ELEM *input, /* row maximum together with its entry from idx_input */
+                                                const MATRIX_ELEM *idx_input,
+                                                MATRIX_ELEM *output,
+                                                MATRIX_ELEM *idx_output,
+                                                const int istride, const int ostride,
+                                                const int n) {
+    extern __shared__ MATRIX_ELEM cudak_(arr)[]; /* must hold 2 * blockDim.x elements: values, then indices */
+    MATRIX_ELEM *arr_val = cudak_(arr);
+    MATRIX_ELEM *arr_idx = arr_val + blockDim.x; /* second half of the shared buffer */
+    int j = blockIdx.x * blockDim.x + threadIdx.x; /* column handled by this thread; blockIdx.y is the row */
+    arr_val[threadIdx.x] = j < n ? input[j + istride * blockIdx.y] : -FLT_MAX;
+    arr_idx[threadIdx.x] = j < n ? idx_input[j + istride * blockIdx.y] : 0; /* idx_input is read with the same stride as input */
+    __syncthreads();
+    for (int offset = blockDim.x >> 1;  offset; offset >>= 1) /* NOTE(review): halving covers every slot only if blockDim.x is a power of two */
+    {
+        if (threadIdx.x < offset)
+        {
+            MATRIX_ELEM l = arr_val[threadIdx.x],
+                        r = arr_val[threadIdx.x + offset];
+            if (r > l) /* strict comparison: on ties the lower slot's value and index are kept */
+            {
+                arr_val[threadIdx.x] = r;
+                arr_idx[threadIdx.x] = arr_idx[threadIdx.x + offset];
+            }
+        }
+        __syncthreads(); /* finish this step before the next one reads the buffers */
+    }
+    if (threadIdx.x == 0)
+    {
+        output[blockIdx.x + ostride * blockIdx.y] = arr_val[0];
+        idx_output[blockIdx.x + ostride * blockIdx.y] = arr_idx[0]; /* both outputs share ostride */
+    }
+}
+
+/* Add beta times the single row a to every row of b. */
+__global__ void cudak_(add_row)(const MATRIX_ELEM *a, MATRIX_ELEM *b,
+                                int nrow, int ncol, int stride, double beta) {
+    const int col = blockIdx.x * blockDim.x + threadIdx.x;
+    const int row = blockIdx.y * blockDim.y + threadIdx.y;
+    if (row < nrow && col < ncol) b[col + row * stride] += beta * a[col];
+}
+
+/* Store val into every element of the nrow x ncol matrix a. */
+__global__ void cudak_(fill)(MATRIX_ELEM *a,
+                            int nrow, int ncol, int stride, double val) {
+    const int col = blockIdx.x * blockDim.x + threadIdx.x;
+    const int row = blockIdx.y * blockDim.y + threadIdx.y;
+    if (row < nrow && col < ncol) a[col + row * stride] = val;
+}
+
+/* b[i][j] = a[clamp(i + j / ncol - context)][j % ncol], with the source row clamped to [0, nrow - 1]. */
+__global__ void cudak_(expand_frm)(const MATRIX_ELEM *a, MATRIX_ELEM *b,
+                                    int nrow, int ncol,
+                                    int enrow, int encol,
+                                    int stride, int estride,
+                                    int context) {
+    const int col = blockIdx.x * blockDim.x + threadIdx.x;
+    const int row = blockIdx.y * blockDim.y + threadIdx.y;
+    if (row >= enrow || col >= encol) return; /* bounds are those of the expanded matrix b */
+    int src = row + col / ncol - context;
+    if (src < 0) src = 0;                 /* clamp to the first input row */
+    else if (src >= nrow) src = nrow - 1; /* clamp to the last input row */
+    b[col + row * estride] = a[col % ncol + src * stride];
+}
+
+/* Column permutation within each row: b[i][j] = a[i][j / step + (j % step) * orig_dim]. */
+__global__ void cudak_(rearrange_frm)(const MATRIX_ELEM *a, MATRIX_ELEM *b,
+                                    int nrow, int ncol, int stride, int step, int orig_dim) {
+    const int col = blockIdx.x * blockDim.x + threadIdx.x;
+    const int row = blockIdx.y * blockDim.y + threadIdx.y;
+    if (row >= nrow || col >= ncol) return;
+    b[col + row * stride] = a[col / step + (col % step) * orig_dim + row * stride];
+}
+
+/* Multiply every element of row i of b by a[i * astride], the first element of row i of a. */
+__global__ void cudak_(scale_rows_by_col)(const MATRIX_ELEM *a, MATRIX_ELEM *b,
+                                        int nrow, int ncol, int astride, int bstride) {
+    const int col = blockIdx.x * blockDim.x + threadIdx.x;
+    const int row = blockIdx.y * blockDim.y + threadIdx.y;
+    if (row >= nrow || col >= ncol) return;
+    b[col + row * bstride] *= a[row * astride];
+}
+
+/* Multiply every element of column j of b by a[j]; a is indexed as a single row. */
+__global__ void cudak_(scale_rows_by_row)(const MATRIX_ELEM *a, MATRIX_ELEM *b,
+                                        int nrow, int ncol, int stride) {
+    const int col = blockIdx.x * blockDim.x + threadIdx.x;
+    const int row = blockIdx.y * blockDim.y + threadIdx.y;
+    if (row >= nrow || col >= ncol) return;
+    b[col + row * stride] *= a[col];
+}
+
+/* For each a[i][j], set b[i][lrintf(a[i][j])] to 1.0; other elements of b are left untouched. */
+__global__ void cudak_(decompress)(const MATRIX_ELEM *a, MATRIX_ELEM *b,
+                                    int nrow, int ncol, int stride_a, int stride_b) {
+    const int col = blockIdx.x * blockDim.x + threadIdx.x;
+    const int row = blockIdx.y * blockDim.y + threadIdx.y;
+    if (row >= nrow || col >= ncol) return; /* bounds are those of a */
+    b[lrintf(a[col + row * stride_a]) + row * stride_b] = 1.0;
+}
+
+/* Fill b so that every element holds its own column index. */
+__global__ void cudak_(gen_col_idx)(MATRIX_ELEM *b,
+                                    int nrow, int ncol, int stride) {
+    const int col = blockIdx.x * blockDim.x + threadIdx.x;
+    const int row = blockIdx.y * blockDim.y + threadIdx.y;
+    if (row < nrow && col < ncol) b[col + row * stride] = col;
+}
+
+extern "C" {
+#include "../cukernel.h"
+    /* Launch log_elem over b's shape in CUDA_THREADS_N x CUDA_THREADS_N blocks, then wait on stream 0. */
+    void cudak_(cuda_log_elem)(const Matrix *a, Matrix *b) {
+        dim3 block(CUDA_THREADS_N, CUDA_THREADS_N);
+        dim3 grid(CEIL_DIV(b->ncol, block.x), CEIL_DIV(b->nrow, block.y));
+        cudak_(log_elem)<<<grid, block>>>(
+            MATRIX_ELEM_PTR(a), MATRIX_ELEM_PTR(b),
+            b->nrow, b->ncol, b->stride / sizeof(MATRIX_ELEM));
+        cudaStreamSynchronize(0);
+    }
+
+    /* Launch mul_elem (c = a .* b) using b's shape and stride for all three matrices, then wait on stream 0. */
+    void cudak_(cuda_mul_elem)(const Matrix *a, const Matrix *b,
+                                Matrix *c) {
+        dim3 block(CUDA_THREADS_N, CUDA_THREADS_N);
+        dim3 grid(CEIL_DIV(b->ncol, block.x), CEIL_DIV(b->nrow, block.y));
+        cudak_(mul_elem)<<<grid, block>>>(
+            MATRIX_ELEM_PTR(a), MATRIX_ELEM_PTR(b),
+            MATRIX_ELEM_PTR(c),
+            b->nrow, b->ncol, b->stride / sizeof(MATRIX_ELEM));
+        cudaStreamSynchronize(0);
+    }
+
+    /* Launch the sigmoid kernel over b's shape in CUDA_THREADS_N x CUDA_THREADS_N blocks, then wait on stream 0. */
+    void cudak_(cuda_sigmoid)(const Matrix *a, Matrix *b) {
+        dim3 block(CUDA_THREADS_N, CUDA_THREADS_N);
+        dim3 grid(CEIL_DIV(b->ncol, block.x), CEIL_DIV(b->nrow, block.y));
+        cudak_(sigmoid)<<<grid, block>>>(
+            MATRIX_ELEM_PTR(a), MATRIX_ELEM_PTR(b), b->nrow, b->ncol,
+            b->stride / sizeof(MATRIX_ELEM));
+        cudaStreamSynchronize(0);
+    }
+
+    /* Launch sigmoid_grad using nerr's shape and stride for all three matrices, then wait on stream 0. */
+    void cudak_(cuda_sigmoid_grad)(const Matrix *output,
+                                    const Matrix *err, Matrix *nerr) {
+        dim3 block(CUDA_THREADS_N, CUDA_THREADS_N);
+        dim3 grid(CEIL_DIV(nerr->ncol, block.x), CEIL_DIV(nerr->nrow, block.y));
+        cudak_(sigmoid_grad)<<<grid, block>>>(
+            MATRIX_ELEM_PTR(output), MATRIX_ELEM_PTR(err),
+            MATRIX_ELEM_PTR(nerr),
+            nerr->nrow, nerr->ncol,
+            nerr->stride / sizeof(MATRIX_ELEM));
+        cudaStreamSynchronize(0);
+    }
+
+    void cudak_(cuda_rowsum)(const Matrix *a, Matrix *b) { /* writes the sum of each row of a into column 0 of b, in two passes */
+        dim3 block(CUDA_THREADS_NN, 1);
+        int ncol = a->ncol;
+        int blocks_per_row = CEIL_DIV(ncol, block.x);
+        dim3 grid(blocks_per_row, a->nrow);
+        MATRIX_ELEM *res; /* scratch: one partial sum per (row, block) */
+        size_t stride; /* pitch of res in bytes, filled in by cudaMallocPitch */
+        cudaMallocPitch(&res, &stride, blocks_per_row * sizeof(MATRIX_ELEM), a->nrow); /* return code is not checked */
+        cudak_(block_reduce_rowsum)<<<grid, block, block.x * sizeof(MATRIX_ELEM)>>> \
+            (MATRIX_ELEM_PTR(a), res,
+             a->stride / sizeof(MATRIX_ELEM), stride / sizeof(MATRIX_ELEM),
+             ncol);
+        ncol = blocks_per_row; /* second pass reduces the partial sums */
+        assert((unsigned long)ncol <= block.x); /* the partials of a row must fit into a single block */
+        grid.x = 1;
+        cudaStreamSynchronize(0);
+        cudak_(block_reduce_rowsum)<<<grid, block, block.x * sizeof(MATRIX_ELEM)>>> \
+            (res, MATRIX_ELEM_PTR(b),
+             stride / sizeof(MATRIX_ELEM), b->stride / sizeof(MATRIX_ELEM),
+             ncol);
+        cudaStreamSynchronize(0); /* wait before the scratch buffer is released */
+        cudaFree(res);
+    }
+
+    void cudak_(cuda_colsame)(const Matrix *a, const Matrix *ref, Matrix *b) { /* per column: number of rows where a equals ref, written to row 0 of b */
+        dim3 block(1, CUDA_THREADS_NN);
+        int nrow = a->nrow;
+        int blocks_per_col = CEIL_DIV(nrow, block.y);
+        dim3 grid(a->ncol, blocks_per_col);
+        MATRIX_ELEM *res; /* scratch: one partial count per (block, column) */
+        size_t stride; /* pitch of res in bytes, filled in by cudaMallocPitch */
+        cudaMallocPitch(&res, &stride, a->ncol * sizeof(MATRIX_ELEM), blocks_per_col); /* return code is not checked */
+        cudak_(block_reduce_colsame)<<<grid, block, block.y * sizeof(MATRIX_ELEM)>>> \
+            (MATRIX_ELEM_PTR(a), MATRIX_ELEM_PTR(ref), res,
+             a->stride / sizeof(MATRIX_ELEM), stride / sizeof(MATRIX_ELEM),
+             nrow);
+        nrow = blocks_per_col; /* second pass sums the partial counts */
+        assert((unsigned long)nrow <= block.y); /* the partials of a column must fit into a single block */
+        grid.y = 1;
+        cudaStreamSynchronize(0);
+        cudak_(block_reduce_colsum)<<<grid, block, block.y * sizeof(MATRIX_ELEM)>>> \
+            (res, MATRIX_ELEM_PTR(b),
+             stride / sizeof(MATRIX_ELEM), b->stride / sizeof(MATRIX_ELEM),
+             nrow);
+        cudaStreamSynchronize(0); /* wait before the scratch buffer is released */
+        cudaFree(res);
+    }
+
+    void cudak_(cuda_colsum)(const Matrix *a, Matrix *b) { /* writes the sum of each column of a into row 0 of b, in two passes */
+        dim3 block(1, CUDA_THREADS_NN);
+        int nrow = a->nrow;
+        int blocks_per_col = CEIL_DIV(nrow, block.y);
+        dim3 grid(a->ncol, blocks_per_col);
+        MATRIX_ELEM *res; /* scratch: one partial sum per (block, column) */
+        size_t stride; /* pitch of res in bytes, filled in by cudaMallocPitch */
+        cudaMallocPitch(&res, &stride, a->ncol * sizeof(MATRIX_ELEM), blocks_per_col); /* return code is not checked */
+        cudak_(block_reduce_colsum)<<<grid, block, block.y * sizeof(MATRIX_ELEM)>>> \
+            (MATRIX_ELEM_PTR(a), res,
+             a->stride / sizeof(MATRIX_ELEM), stride / sizeof(MATRIX_ELEM),
+             nrow);
+        nrow = blocks_per_col; /* second pass reduces the partial sums */
+        assert((unsigned long)nrow <= block.y); /* the partials of a column must fit into a single block */
+        grid.y = 1;
+        cudaStreamSynchronize(0);
+        cudak_(block_reduce_colsum)<<<grid, block, block.y * sizeof(MATRIX_ELEM)>>> \
+            (res, MATRIX_ELEM_PTR(b),
+             stride / sizeof(MATRIX_ELEM), b->stride / sizeof(MATRIX_ELEM),
+             nrow);
+        cudaStreamSynchronize(0); /* wait before the scratch buffer is released */
+        cudaFree(res);
+    }
+
+    /* Launch softmax_final over b's shape; max's stride is used for both max and deno. Waits on stream 0. */
+    void cudak_(cuda_softmax_final)(const Matrix *a, const Matrix *max,
+                            const Matrix *deno, Matrix *b) {
+        dim3 block(CUDA_THREADS_N, CUDA_THREADS_N);
+        dim3 grid(CEIL_DIV(b->ncol, block.x), CEIL_DIV(b->nrow, block.y));
+        cudak_(softmax_final)<<<grid, block>>>(
+                MATRIX_ELEM_PTR(a), MATRIX_ELEM_PTR(b),
+                MATRIX_ELEM_PTR(max), MATRIX_ELEM_PTR(deno),
+                b->nrow, b->ncol,
+                b->stride / sizeof(MATRIX_ELEM),
+                max->stride / sizeof(MATRIX_ELEM));
+        cudaStreamSynchronize(0);
+    }
+
+    void cudak_(cuda_softmax_denominator)(const Matrix *a, const Matrix *max, Matrix *b) { /* column 0 of b: per-row sum of exp(a - max[row]) */
+        dim3 block(CUDA_THREADS_NN, 1);
+        int ncol = a->ncol;
+        int blocks_per_row = CEIL_DIV(ncol, block.x);
+        dim3 grid(blocks_per_row, a->nrow);
+        MATRIX_ELEM *res; /* scratch: one partial sum per (row, block) */
+        size_t stride; /* pitch of res in bytes, filled in by cudaMallocPitch */
+        assert(max->ncol == 1); /* max must be a single column */
+        cudaMallocPitch(&res, &stride, blocks_per_row * sizeof(MATRIX_ELEM), a->nrow); /* return code is not checked */
+        cudak_(block_reduce_softmax_rowsum) \
+            <<<grid, block, block.x * sizeof(MATRIX_ELEM)>>> \
+            (MATRIX_ELEM_PTR(a), res, MATRIX_ELEM_PTR(max),
+             a->stride / sizeof(MATRIX_ELEM), stride / sizeof(MATRIX_ELEM),
+             max->stride / sizeof(MATRIX_ELEM),
+             ncol);
+        ncol = blocks_per_row; /* second pass is a plain sum of the partials */
+        assert((unsigned long)ncol <= block.x); /* the partials of a row must fit into a single block */
+        grid.x = 1;
+        cudaStreamSynchronize(0);
+        cudak_(block_reduce_rowsum) \
+            <<<grid, block, block.x * sizeof(MATRIX_ELEM)>>> \
+            (res, MATRIX_ELEM_PTR(b),
+             stride / sizeof(MATRIX_ELEM), b->stride / sizeof(MATRIX_ELEM),
+             ncol);
+        cudaStreamSynchronize(0); /* wait before the scratch buffer is released */
+        cudaFree(res);
+    }
+
+    /* Row-wise reduction of `a` into `b` using two launches of the
+     * block_reduce_rowmax kernel: `a` is first reduced into a temporary
+     * pitched buffer with one column per block along x, which is then
+     * reduced into `b`. The temporary buffer is released before
+     * returning. */
+    void cudak_(cuda_rowmax)(const Matrix *a, Matrix *b) {
+        dim3 block(CUDA_THREADS_NN, 1);
+        int ncol = a->ncol;
+        int n_partial = CEIL_DIV(ncol, block.x);
+        dim3 first_grid(n_partial, a->nrow);
+        dim3 final_grid(1, a->nrow);
+        /* third launch parameter: bytes of dynamic shared memory */
+        size_t shmem_bytes = block.x * sizeof(MATRIX_ELEM);
+        MATRIX_ELEM *partial;
+        size_t partial_pitch;
+        cudaMallocPitch(&partial, &partial_pitch,
+                        n_partial * sizeof(MATRIX_ELEM), a->nrow);
+        /* pass 1: a -> partial */
+        cudak_(block_reduce_rowmax)<<<first_grid, block, shmem_bytes>>>
+            (MATRIX_ELEM_PTR(a), partial,
+             a->stride / sizeof(MATRIX_ELEM),
+             partial_pitch / sizeof(MATRIX_ELEM),
+             ncol);
+        /* pass 2 uses one block per row, so all partial results of a row
+         * have to fit into a single block */
+        assert((unsigned long)n_partial <= block.x);
+        cudaStreamSynchronize(0);
+        /* pass 2: partial -> b */
+        cudak_(block_reduce_rowmax)<<<final_grid, block, shmem_bytes>>>
+            (partial, MATRIX_ELEM_PTR(b),
+             partial_pitch / sizeof(MATRIX_ELEM),
+             b->stride / sizeof(MATRIX_ELEM),
+             n_partial);
+        cudaStreamSynchronize(0);
+        cudaFree(partial);
+    }
+
+    /* Row-wise reduction of `a` that produces both a value per row (`b`)
+     * and an accompanying index per row (`b_idx`).
+     *
+     * Steps: gen_col_idx fills a temporary index matrix `a_idx`; a first
+     * block_reduce_rowmax_idx launch reduces (a, a_idx) into the temporary
+     * buffers (res, res_idx); a second launch reduces those into
+     * (b, b_idx). All three temporaries are freed before returning.
+     * Return codes of the CUDA runtime calls are not checked here. */
+    void cudak_(cuda_rowmax_idx)(const Matrix *a, Matrix *b, Matrix *b_idx) {
+        dim3 block(CUDA_THREADS_NN, 1);
+        int ncol = a->ncol;
+        int blocks_per_row = CEIL_DIV(ncol, block.x);
+        dim3 grid(blocks_per_row, a->nrow);
+        MATRIX_ELEM *a_idx, *res, *res_idx;
+        /* NOTE: `stride` is overwritten by each cudaMallocPitch call below,
+         * so its value depends on statement order. */
+        size_t stride;
+        /* a_idx is requested with a row width of a->stride bytes */
+        cudaMallocPitch(&a_idx, &stride, a->stride, a->nrow);
+        cudak_(gen_col_idx)<<<grid, block>>>(a_idx, a->nrow, ncol, stride / sizeof(MATRIX_ELEM));
+        cudaMallocPitch(&res, &stride, blocks_per_row * sizeof(MATRIX_ELEM), a->nrow);
+        cudaMallocPitch(&res_idx, &stride, blocks_per_row * sizeof(MATRIX_ELEM), a->nrow);
+        /* from here on `stride` is the pitch returned for res_idx; it is
+         * passed as the single output stride of the first reduction.
+         * NOTE(review): this presumes res and res_idx received the same
+         * pitch (same width/height requested), and that a_idx can be
+         * addressed with a->stride -- confirm against the kernel. */
+        cudaStreamSynchronize(0);
+        /* third launch parameter: dynamic shared memory, twice block.x
+         * elements */
+        cudak_(block_reduce_rowmax_idx)<<<grid, block,
+                                        2 * block.x * sizeof(MATRIX_ELEM)>>> \
+            (MATRIX_ELEM_PTR(a), a_idx, res, res_idx,
+             a->stride / sizeof(MATRIX_ELEM), stride / sizeof(MATRIX_ELEM),
+             ncol);
+        /* second pass: one block per row over the partial results, which
+         * therefore have to fit into a single block */
+        ncol = blocks_per_row;
+        assert((unsigned long)ncol <= block.x);
+        grid.x = 1;
+        cudaStreamSynchronize(0);
+        cudak_(block_reduce_rowmax_idx)<<<grid, block,
+                                        2 * block.x * sizeof(MATRIX_ELEM)>>> \
+            (res, res_idx, MATRIX_ELEM_PTR(b), MATRIX_ELEM_PTR(b_idx),
+             stride / sizeof(MATRIX_ELEM), b->stride / sizeof(MATRIX_ELEM),
+             ncol);
+        cudaStreamSynchronize(0);
+        cudaFree(a_idx);
+        cudaFree(res);
+        cudaFree(res_idx);
+    }
+
+    /* In-place on `b`: launches the add_row kernel with `a`, `b`, b's
+     * shape and element stride, and the factor `beta`; then waits for
+     * stream 0. */
+    void cudak_(cuda_add_row)(const Matrix *a, Matrix *b, double beta) {
+        dim3 block(CUDA_THREADS_N, CUDA_THREADS_N);
+        dim3 grid(CEIL_DIV(b->ncol, block.x),
+                  CEIL_DIV(b->nrow, block.y));
+        cudak_(add_row)<<<grid, block>>>
+            (MATRIX_ELEM_PTR(a), MATRIX_ELEM_PTR(b),
+             b->nrow, b->ncol,
+             b->stride / sizeof(MATRIX_ELEM),
+             beta);
+        cudaStreamSynchronize(0);
+    }
+
+    /* Launches the fill kernel over `a` with the value `val`, on a 2-D
+     * grid sized from a's dimensions; then waits for stream 0. */
+    void cudak_(cuda_fill)(Matrix *a, double val) {
+        dim3 block(CUDA_THREADS_N, CUDA_THREADS_N);
+        dim3 grid(CEIL_DIV(a->ncol, block.x),
+                  CEIL_DIV(a->nrow, block.y));
+        cudak_(fill)<<<grid, block>>>
+            (MATRIX_ELEM_PTR(a),
+             a->nrow, a->ncol,
+             a->stride / sizeof(MATRIX_ELEM),
+             val);
+        cudaStreamSynchronize(0);
+    }
+
+    /* Launches the expand_frm kernel reading `a` and writing `b`, on a
+     * 2-D grid sized from b's dimensions. The kernel receives both shapes,
+     * both element strides and `context`; then stream 0 is awaited. */
+    void cudak_(cuda_expand_frm)(const Matrix *a, Matrix *b, int context) {
+        dim3 block(CUDA_THREADS_N, CUDA_THREADS_N);
+        dim3 grid(CEIL_DIV(b->ncol, block.x),
+                  CEIL_DIV(b->nrow, block.y));
+        cudak_(expand_frm)<<<grid, block>>>
+            (MATRIX_ELEM_PTR(a), MATRIX_ELEM_PTR(b),
+             a->nrow, a->ncol,
+             b->nrow, b->ncol,
+             a->stride / sizeof(MATRIX_ELEM),
+             b->stride / sizeof(MATRIX_ELEM),
+             context);
+        cudaStreamSynchronize(0);
+    }
+
+    /* Launches the rearrange_frm kernel reading `a` and writing `b`, on a
+     * 2-D grid sized from b's dimensions. Besides b's shape and element
+     * stride the kernel gets `step` and b->ncol / step; `step` must be
+     * non-zero. Waits for stream 0 afterwards. */
+    void cudak_(cuda_rearrange_frm)(const Matrix *a, Matrix *b, int step) {
+        dim3 block(CUDA_THREADS_N, CUDA_THREADS_N);
+        dim3 grid(CEIL_DIV(b->ncol, block.x),
+                  CEIL_DIV(b->nrow, block.y));
+        cudak_(rearrange_frm)<<<grid, block>>>
+            (MATRIX_ELEM_PTR(a), MATRIX_ELEM_PTR(b),
+             b->nrow, b->ncol,
+             b->stride / sizeof(MATRIX_ELEM),
+             step, b->ncol / step);
+        cudaStreamSynchronize(0);
+    }
+
+    /* Launches the scale_rows_by_col kernel with `a` as the first operand
+     * and `b` as the second, on a 2-D grid sized from b's dimensions; the
+     * kernel gets b's shape and both element strides. Waits for stream 0
+     * afterwards. */
+    void cudak_(cuda_scale_rows_by_col)(const Matrix *a, Matrix *b) {
+        dim3 block(CUDA_THREADS_N, CUDA_THREADS_N);
+        dim3 grid(CEIL_DIV(b->ncol, block.x),
+                  CEIL_DIV(b->nrow, block.y));
+        cudak_(scale_rows_by_col)<<<grid, block>>>
+            (MATRIX_ELEM_PTR(a), MATRIX_ELEM_PTR(b),
+             b->nrow, b->ncol,
+             a->stride / sizeof(MATRIX_ELEM),
+             b->stride / sizeof(MATRIX_ELEM));
+        cudaStreamSynchronize(0);
+    }
+
+    /* Launches the scale_rows_by_row kernel with `a` as the first operand
+     * and `b` as the second, on a 2-D grid sized from b's dimensions; the
+     * kernel gets b's shape and element stride. Waits for stream 0
+     * afterwards. */
+    void cudak_(cuda_scale_rows_by_row)(const Matrix *a, Matrix *b) {
+        dim3 block(CUDA_THREADS_N, CUDA_THREADS_N);
+        dim3 grid(CEIL_DIV(b->ncol, block.x),
+                  CEIL_DIV(b->nrow, block.y));
+        cudak_(scale_rows_by_row)<<<grid, block>>>
+            (MATRIX_ELEM_PTR(a), MATRIX_ELEM_PTR(b),
+             b->nrow, b->ncol,
+             b->stride / sizeof(MATRIX_ELEM));
+        cudaStreamSynchronize(0);
+    }
+
+    /* Launches the decompress kernel reading `a` and writing `b`, on a
+     * 1-D arrangement along y sized from a->nrow. The kernel receives a's
+     * shape and the element strides of `a` and `b`; then stream 0 is
+     * awaited. */
+    void cudak_(cuda_decompress)(const Matrix *a, Matrix *b) {
+        dim3 block(1, CUDA_THREADS_NN);
+        dim3 grid(1, CEIL_DIV(a->nrow, block.y));
+        cudak_(decompress)<<<grid, block>>>
+            (MATRIX_ELEM_PTR(a), MATRIX_ELEM_PTR(b),
+             a->nrow, a->ncol,
+             a->stride / sizeof(MATRIX_ELEM),
+             b->stride / sizeof(MATRIX_ELEM));
+        cudaStreamSynchronize(0);
+    }
+}
+#endif
diff --git a/nerv/lib/matrix/generic/cumatrix.c b/nerv/lib/matrix/generic/cumatrix.c
new file mode 100644
index 0000000..11aacec
--- /dev/null
+++ b/nerv/lib/matrix/generic/cumatrix.c
@@ -0,0 +1,403 @@
+#ifdef NERV_GENERIC_CUMATRIX
+#include "matrix.h"
+#include "elem_type.h"
+#define MATRIX_DATA_FREE(ptr, status) cuda_matrix_(free)(ptr, status)
+#define MATRIX_DATA_ALLOC(dptr, stride, width, height, status) \
+                            cuda_matrix_(alloc)(dptr, stride, width, height, status)
+
+#define NERV_GENERIC_MATRIX
+#define NERV_GENERIC_CUKERNEL
+#include "../../../common.h"
+#include "../cukernel.h"
+#include "../cuda_helper.h"
+
+/* Weighted element-wise sum via cuBLAS geam (no transposition):
+ * c = alpha * a + beta * b.
+ * a, b and c must all have the same dimensions (CHECK_SAME_DIMENSION).
+ * On success `status` is set to MAT_NORMAL. */
+void nerv_matrix_(add)(Matrix *c, const Matrix *a, const Matrix *b,
+                            MATRIX_ELEM alpha, MATRIX_ELEM beta,
+                            Status *status) {
+    CHECK_SAME_DIMENSION(a, b, status);
+    CHECK_SAME_DIMENSION(a, c, status);
+    PROFILE_START
+    /* dimensions are passed as (ncol, nrow) and strides as leading
+     * dimensions in elements */
+    CUBLAS_SAFE_SYNC_CALL(
+            NERV_CUBLAS_(geam)(cublas_handle, CUBLAS_OP_N, CUBLAS_OP_N,
+                a->ncol, a->nrow,
+                &alpha,
+                MATRIX_ELEM_PTR(a), a->stride / sizeof(MATRIX_ELEM),
+                &beta,
+                MATRIX_ELEM_PTR(b), b->stride / sizeof(MATRIX_ELEM),
+                MATRIX_ELEM_PTR(c), c->stride / sizeof(MATRIX_ELEM)),
+            status);
+    PROFILE_STOP
+    NERV_SET_STATUS(status, MAT_NORMAL, 0);
+}
+
+/* Matrix product via cuBLAS gemm: c = alpha * op(a) * op(b) + beta * c,
+ * where op() transposes its operand when ta / tb equals CUBLAS_OP_T.
+ * Fails with MAT_WRONG_MULT_DIM when the inner dimensions (after applying
+ * the transpositions) disagree. The dimensions of `c` are not checked
+ * here. On success `status` is set to MAT_NORMAL. */
+void nerv_matrix_(mul)(Matrix *c, const Matrix *a, const Matrix *b,
+                            MATRIX_ELEM alpha, MATRIX_ELEM beta,
+                            int ta, int tb, Status *status) {
+/* NOTE: SWAP stays defined for the rest of the translation unit */
+#define SWAP(a, b) \
+    do { int t = (a); (a) = (b); (b) = t; } while (0)
+
+    /* effective shapes of op(a) and op(b) */
+    int am = a->nrow, an = a->ncol;
+    int bm = b->nrow, bn = b->ncol;
+    if (ta == CUBLAS_OP_T) SWAP(am, an);
+    if (tb == CUBLAS_OP_T) SWAP(bm, bn);
+    if (an != bm)
+        NERV_EXIT_STATUS(status, MAT_WRONG_MULT_DIM, 0);
+    /* Because matrix in Nerv is row-major, here b comes first */
+    PROFILE_START
+    CUBLAS_SAFE_SYNC_CALL(
+            NERV_CUBLAS_(gemm)(cublas_handle, tb, ta,
+                bn, am, bm,
+                &alpha,
+                MATRIX_ELEM_PTR(b), b->stride / sizeof(MATRIX_ELEM),
+                MATRIX_ELEM_PTR(a), a->stride / sizeof(MATRIX_ELEM),
+                &beta,
+                MATRIX_ELEM_PTR(c), c->stride / sizeof(MATRIX_ELEM)),
+            status);
+    PROFILE_STOP
+    NERV_SET_STATUS(status, MAT_NORMAL, 0);
+}
+
+/* Runs the cuda_sigmoid kernel wrapper with `b` as input and `a` as
+ * output. a and b must have the same dimensions. On success `status` is
+ * set to MAT_NORMAL. */
+void nerv_matrix_(sigmoid)(Matrix *a, const Matrix *b, Status *status) {
+    CHECK_SAME_DIMENSION(a, b, status);
+    PROFILE_START
+    /* note the argument order: source first, destination second */
+    cudak_(cuda_sigmoid)(b, a);
+    PROFILE_STOP
+    NERV_SET_STATUS(status, MAT_NORMAL, 0);
+}
+
+/* Runs the cuda_sigmoid_grad kernel wrapper on (output, err), writing the
+ * result to `nerr`. All three matrices must have the same dimensions. On
+ * success `status` is set to MAT_NORMAL. */
+void nerv_matrix_(sigmoid_grad)(Matrix *nerr, const Matrix *err,
+                                const Matrix *output, Status *status) {
+    CHECK_SAME_DIMENSION(nerr, err, status);
+    CHECK_SAME_DIMENSION(nerr, output, status);
+    PROFILE_START
+    cudak_(cuda_sigmoid_grad)(output, err, nerr);
+    PROFILE_STOP
+    NERV_SET_STATUS(status, MAT_NORMAL, 0);
+}
+
+/* Row-wise softmax of `a`, written into `b` (same dimensions required).
+ *
+ * Three kernel wrappers are run in sequence: cuda_rowmax_idx (per-row
+ * value and index), cuda_softmax_denominator and cuda_softmax_final.
+ *
+ * Returns the newly created (a->nrow x 1) index matrix filled by
+ * cuda_rowmax_idx; the caller owns it and must destroy it. On failure
+ * NULL is returned and `status` carries the error of the step that
+ * failed. */
+Matrix *nerv_matrix_(softmax)(Matrix *b, const Matrix *a, Status *status) {
+    Matrix *max, *max_idx;
+    Matrix *dno;
+    /* Scratch status for clean-up on error paths: a successful destroy
+     * sets its status to MAT_NORMAL, which would otherwise erase the
+     * allocation error being reported through `status`. */
+    Status cleanup;
+    CHECK_SAME_DIMENSION_RET(a, b, status);
+    max = nerv_matrix_(create)(a->nrow, 1, status);
+    if (status->err_code != MAT_NORMAL)
+        return NULL;
+    max_idx = nerv_matrix_(create)(a->nrow, 1, status);
+    if (status->err_code != MAT_NORMAL)
+    {
+        nerv_matrix_(destroy)(max, &cleanup);
+        return NULL;
+    }
+    dno = nerv_matrix_(create)(a->nrow, 1, status);
+    if (status->err_code != MAT_NORMAL)
+    {   /* FIXME: destroy may also fail? */
+        nerv_matrix_(destroy)(max, &cleanup);
+        nerv_matrix_(destroy)(max_idx, &cleanup);
+        return NULL;
+    }
+    PROFILE_START
+    cudak_(cuda_rowmax_idx)(a, max, max_idx);
+    cudak_(cuda_softmax_denominator)(a, max, dno);
+    cudak_(cuda_softmax_final)(a, max, dno, b);
+    PROFILE_STOP
+    /* temporaries are no longer needed; only max_idx is handed back */
+    nerv_matrix_(destroy)(max, status);
+    nerv_matrix_(destroy)(dno, status);
+    NERV_SET_STATUS(status, MAT_NORMAL, 0);
+    return max_idx;
+}
+
+/* Creates an (a->nrow x 1) matrix and fills it by running the cuda_rowsum
+ * kernel wrapper on `a`. Returns the new matrix (owned by the caller), or
+ * NULL when the allocation fails (`status` then holds the error). */
+Matrix *nerv_matrix_(rowsum)(Matrix *a, Status *status) {
+    Matrix *b = nerv_matrix_(create)(a->nrow, 1, status);
+    if (status->err_code != MAT_NORMAL)
+        return NULL;
+    PROFILE_START
+    cudak_(cuda_rowsum)(a, b);
+    PROFILE_STOP
+    NERV_SET_STATUS(status, MAT_NORMAL, 0);
+    return b;
+}
+
+/* Creates a (1 x a->ncol) matrix and fills it by running the cuda_colsum
+ * kernel wrapper on `a`. Returns the new matrix (owned by the caller), or
+ * NULL when the allocation fails (`status` then holds the error). */
+Matrix *nerv_matrix_(colsum)(Matrix *a, Status *status) {
+    Matrix *b = nerv_matrix_(create)(1, a->ncol, status);
+    if (status->err_code != MAT_NORMAL)
+        return NULL;
+    PROFILE_START
+    cudak_(cuda_colsum)(a, b);
+    PROFILE_STOP
+    NERV_SET_STATUS(status, MAT_NORMAL, 0);
+    return b;
+}
+
+/* Creates a (1 x a->ncol) matrix and fills it by running the cuda_colsame
+ * kernel wrapper on `a` and `ref`, which must have the same dimensions.
+ * Returns the new matrix (owned by the caller); on a dimension mismatch
+ * or allocation failure nothing is allocated/kept and `status` holds the
+ * error. */
+Matrix *nerv_matrix_(colsame)(Matrix *a, const Matrix *ref,
+                                Status *status) {
+    Matrix *b;
+    /* validate before allocating, so that a mismatch cannot leak `b` */
+    CHECK_SAME_DIMENSION_RET(a, ref, status);
+    b = nerv_matrix_(create)(1, a->ncol, status);
+    if (status->err_code != MAT_NORMAL)
+        return NULL;
+    PROFILE_START
+    cudak_(cuda_colsame)(a, ref, b);
+    PROFILE_STOP
+    NERV_SET_STATUS(status, MAT_NORMAL, 0);
+    return b;
+}
+
+/* Creates an (a->nrow x 1) matrix and fills it by running the cuda_rowmax
+ * kernel wrapper on `a`. Returns the new matrix (owned by the caller), or
+ * NULL when the allocation fails (`status` then holds the error). */
+Matrix *nerv_matrix_(rowmax)(Matrix *a, Status *status) {
+    Matrix *b = nerv_matrix_(create)(a->nrow, 1, status);
+    if (status->err_code != MAT_NORMAL)
+        return NULL;
+    PROFILE_START
+    cudak_(cuda_rowmax)(a, b);
+    PROFILE_STOP
+    NERV_SET_STATUS(status, MAT_NORMAL, 0);
+    return b;
+}
+
+/* Creates two (a->nrow x 1) matrices, stores them in *b and *idx, and
+ * fills them by running the cuda_rowmax_idx kernel wrapper on `a`.
+ * The caller owns both results.
+ *
+ * On allocation failure `status` holds the allocation error, nothing is
+ * left allocated, and *b is NULL (so the caller cannot touch a freed
+ * matrix). */
+void nerv_matrix_(rowmax_idx)(Matrix *a, Matrix **b, Matrix **idx,
+                                Status *status) {
+    *b = nerv_matrix_(create)(a->nrow, 1, status);
+    if (status->err_code != MAT_NORMAL)
+        return;
+    *idx = nerv_matrix_(create)(a->nrow, 1, status);
+    if (status->err_code != MAT_NORMAL)
+    {
+        /* Use a scratch status: a successful destroy sets its status to
+         * MAT_NORMAL, which would hide the allocation error. */
+        Status cleanup;
+        /* FIXME: destroy may also fail? */
+        nerv_matrix_(destroy)(*b, &cleanup);
+        *b = NULL;
+        return;
+    }
+    PROFILE_START
+    cudak_(cuda_rowmax_idx)(a, *b, *idx);
+    PROFILE_STOP
+    NERV_SET_STATUS(status, MAT_NORMAL, 0);
+}
+
+/* In-place update of `b` by the cuda_add_row kernel wrapper, using the
+ * single-row matrix `a` and the factor `beta`.
+ * Errors: MAT_MISMATCH_DIM when the column counts differ,
+ * MAT_ROW_VECTOR_EXP when `a` is not a single row. */
+void nerv_matrix_(add_row)(Matrix *b, const Matrix *a, double beta,
+                            Status *status) {
+    if (a->ncol != b->ncol)
+        NERV_EXIT_STATUS(status, MAT_MISMATCH_DIM, 0);
+    if (a->nrow != 1)
+        NERV_EXIT_STATUS(status, MAT_ROW_VECTOR_EXP, 0);
+    PROFILE_START
+    cudak_(cuda_add_row)(a, b, beta);
+    PROFILE_STOP
+    NERV_SET_STATUS(status, MAT_NORMAL, 0);
+}
+
+/* Runs the cuda_fill kernel wrapper on `self` with the value `val` and
+ * sets `status` to MAT_NORMAL. */
+void nerv_matrix_(fill)(Matrix *self, double val, Status *status) {
+    PROFILE_START
+    cudak_(cuda_fill)(self, val);
+    PROFILE_STOP
+    NERV_SET_STATUS(status, MAT_NORMAL, 0);
+}
+
+/* Device-to-device copy of rows [b_begin, b_end) of `b` into `a`,
+ * starting at row a_begin of `a` (cudaMemcpy2D, so the two matrices may
+ * have different strides).
+ * Errors: MAT_INVALID_COPY_INTERVAL when the source interval is empty or
+ * either interval falls outside its matrix (including a negative
+ * a_begin); MAT_MISMATCH_DIM when the column counts differ. */
+void nerv_matrix_(copy_fromd)(Matrix *a, const Matrix *b,
+                            int a_begin, int b_begin, int b_end,
+                            Status *status) {
+    /* a_begin needs its own lower bound: the upper-bound test alone is
+     * satisfied by negative values, which would write before `a` */
+    if (!(0 <= a_begin &&
+            0 <= b_begin && b_begin < b_end && b_end <= b->nrow &&
+            a_begin + b_end - b_begin <= a->nrow))
+        NERV_EXIT_STATUS(status, MAT_INVALID_COPY_INTERVAL, 0);
+    if (a->ncol != b->ncol)
+        NERV_EXIT_STATUS(status, MAT_MISMATCH_DIM, 0);
+    PROFILE_START
+    CUDA_SAFE_SYNC_CALL(
+            cudaMemcpy2D(MATRIX_ROW_PTR(a, a_begin), a->stride,
+                MATRIX_ROW_PTR(b, b_begin), b->stride,
+                sizeof(MATRIX_ELEM) * b->ncol, b_end - b_begin,
+                cudaMemcpyDeviceToDevice),
+            status);
+    PROFILE_STOP
+    NERV_SET_STATUS(status, MAT_NORMAL, 0);
+}
+
+/* Host-to-device copy of rows [b_begin, b_end) of the host matrix `b`
+ * into the device matrix `a`, starting at row a_begin of `a`
+ * (cudaMemcpy2D, so the two matrices may have different strides).
+ * Errors: MAT_INVALID_COPY_INTERVAL when the source interval is empty or
+ * either interval falls outside its matrix (including a negative
+ * a_begin); MAT_MISMATCH_DIM when the column counts differ. */
+void nerv_matrix_(copy_fromh)(Matrix *a, const Matrix *b,
+                            int a_begin, int b_begin, int b_end,
+                            Status *status) {
+    /* a_begin needs its own lower bound: the upper-bound test alone is
+     * satisfied by negative values, which would write before `a` */
+    if (!(0 <= a_begin &&
+            0 <= b_begin && b_begin < b_end && b_end <= b->nrow &&
+            a_begin + b_end - b_begin <= a->nrow))
+        NERV_EXIT_STATUS(status, MAT_INVALID_COPY_INTERVAL, 0);
+    if (a->ncol != b->ncol)
+        NERV_EXIT_STATUS(status, MAT_MISMATCH_DIM, 0);
+    PROFILE_START
+    CUDA_SAFE_SYNC_CALL(
+            cudaMemcpy2D(MATRIX_ROW_PTR(a, a_begin), a->stride,
+                MATRIX_ROW_PTR(b, b_begin), b->stride,
+                sizeof(MATRIX_ELEM) * b->ncol, b_end - b_begin,
+                cudaMemcpyHostToDevice),
+            status);
+    PROFILE_STOP
+    NERV_SET_STATUS(status, MAT_NORMAL, 0);
+}
+
+/* Device-to-host copy of rows [a_begin, a_end) of the device matrix `a`
+ * into the host matrix `b`, starting at row b_begin of `b`
+ * (cudaMemcpy2D, so the two matrices may have different strides).
+ * Errors: MAT_INVALID_COPY_INTERVAL when the source interval is empty or
+ * either interval falls outside its matrix (including a negative
+ * b_begin); MAT_MISMATCH_DIM when the column counts differ. */
+void nerv_matrix_(copy_toh)(Matrix *a, const Matrix *b,
+                            int a_begin, int a_end, int b_begin,
+                            Status *status) {
+    /* b_begin needs its own lower bound: the upper-bound test alone is
+     * satisfied by negative values, which would write before `b` */
+    if (!(0 <= b_begin &&
+            0 <= a_begin && a_begin < a_end && a_end <= a->nrow &&
+            b_begin + a_end - a_begin <= b->nrow))
+        NERV_EXIT_STATUS(status, MAT_INVALID_COPY_INTERVAL, 0);
+    if (b->ncol != a->ncol)
+        NERV_EXIT_STATUS(status, MAT_MISMATCH_DIM, 0);
+    PROFILE_START
+    CUDA_SAFE_SYNC_CALL(
+            cudaMemcpy2D(MATRIX_ROW_PTR(b, b_begin), b->stride,
+                MATRIX_ROW_PTR(a, a_begin), a->stride,
+                sizeof(MATRIX_ELEM) * a->ncol, a_end - a_begin,
+                cudaMemcpyDeviceToHost),
+            status);
+    PROFILE_STOP
+    NERV_SET_STATUS(status, MAT_NORMAL, 0);
+}
+
+/* Returns a newly created (a->ncol x a->nrow) matrix holding the
+ * transpose of `a`, computed with cuBLAS geam (alpha = 1, beta = 0, both
+ * operands being `a` transposed). The caller owns the result. Returns
+ * NULL when the allocation fails.
+ * NOTE(review): if the cuBLAS call fails, the *_RET macro presumably
+ * returns early without destroying `b` -- confirm against its
+ * definition. */
+Matrix *nerv_matrix_(trans)(Matrix *a, Status *status) {
+    MATRIX_ELEM alpha = 1, beta = 0;
+    Matrix *b = nerv_matrix_(create)(a->ncol, a->nrow, status);
+    if (status->err_code != MAT_NORMAL)
+        return NULL;
+    /* FIXME: possible memory leak when lua error is raised */
+    PROFILE_START
+    CUBLAS_SAFE_SYNC_CALL_RET(
+            NERV_CUBLAS_(geam)(cublas_handle, CUBLAS_OP_T, CUBLAS_OP_T,
+                a->nrow, a->ncol,
+                &alpha,
+                MATRIX_ELEM_PTR(a), a->stride / sizeof(MATRIX_ELEM),
+                &beta,
+                MATRIX_ELEM_PTR(a), a->stride / sizeof(MATRIX_ELEM),
+                MATRIX_ELEM_PTR(b), b->stride / sizeof(MATRIX_ELEM)),
+            status);
+    PROFILE_STOP
+    NERV_SET_STATUS(status, MAT_NORMAL, 0);
+    return b;
+}
+
+/* Runs the cuda_mul_elem kernel wrapper on (a, b), writing to `c`. All
+ * three matrices must have the same dimensions. On success `status` is
+ * set to MAT_NORMAL. */
+void nerv_matrix_(mul_elem)(Matrix *c, const Matrix *a, const Matrix *b,
+                            Status *status) {
+    CHECK_SAME_DIMENSION(a, b, status);
+    CHECK_SAME_DIMENSION(a, c, status);
+    PROFILE_START
+    cudak_(cuda_mul_elem)(a, b, c);
+    PROFILE_STOP
+    NERV_SET_STATUS(status, MAT_NORMAL, 0);
+}
+
+/* Runs the cuda_log_elem kernel wrapper with `a` as input and `b` as
+ * output. a and b must have the same dimensions. On success `status` is
+ * set to MAT_NORMAL. */
+void nerv_matrix_(log_elem)(Matrix *b, const Matrix *a, Status *status) {
+    CHECK_SAME_DIMENSION(a, b, status);
+    PROFILE_START
+    cudak_(cuda_log_elem)(a, b);
+    PROFILE_STOP
+    NERV_SET_STATUS(status, MAT_NORMAL, 0);
+}
+
+/* Builds a new (a->nrow x orig_col) matrix from the single-column matrix
+ * `a`: the result is first filled with 0.0 and then passed, together with
+ * `a`, to the cuda_decompress kernel wrapper.
+ * Returns the new matrix (owned by the caller). Returns NULL with
+ * MAT_COL_VECTOR_EXP when `a` has more than one column, or NULL when the
+ * allocation fails. */
+Matrix *nerv_matrix_(decompress)(const Matrix *a, int orig_col, Status *status) {
+    Matrix *b;
+    if (a->ncol != 1)
+    {
+        NERV_SET_STATUS(status, MAT_COL_VECTOR_EXP, 0);
+        return NULL;
+    }
+    b = nerv_matrix_(create)(a->nrow, orig_col, status);
+    if (status->err_code != MAT_NORMAL)
+        return NULL;
+    PROFILE_START
+    cudak_(cuda_fill)(b, 0.0);
+    cudak_(cuda_decompress)(a, b);
+    PROFILE_STOP
+    NERV_SET_STATUS(status, MAT_NORMAL, 0);
+    return b;
+}
+
+/* Gathers rows of the host matrix `b` into the device matrix `a`:
+ * row i of `a` receives row idx[b_begin + i] of `b`, for every row of
+ * `a`. `idx` is a single-row index matrix of longs. Each row is copied
+ * asynchronously on its own CUDA stream; all streams are synchronized
+ * and destroyed before returning.
+ *
+ * Errors: MAT_INVALID_COPY_INTERVAL when [b_begin, b_begin + a->nrow)
+ * does not fit into `idx`; MAT_IDX_VECTOR_EXP when `idx` is not a single
+ * row; MAT_MISMATCH_DIM when the column counts differ; MAT_INVALID_IDX
+ * when an index is outside `b`; MAT_INSUF_MEM when the stream array
+ * cannot be allocated. All indices are validated before anything is
+ * allocated or copied, so an invalid index leaves `a` untouched.
+ *
+ * NOTE(review): a failing CUDA call presumably makes CUDA_SAFE_CALL
+ * return early, which would still leak `streams` -- confirm against the
+ * macro definition. */
+void nerv_matrix_(copy_rows_fromh_by_idx)(Matrix *a, const Matrix *b,
+                            const Matrix *idx, int b_begin, Status *status) {
+    long nrow = a->nrow;
+    long *idx_ptr;
+    long i;
+    cudaStream_t *streams;
+    if (!(0 <= b_begin && b_begin + nrow <= idx->ncol))
+        NERV_EXIT_STATUS(status, MAT_INVALID_COPY_INTERVAL, 0);
+    idx_ptr = idx->data.i;
+    if (idx->nrow != 1)
+        NERV_EXIT_STATUS(status, MAT_IDX_VECTOR_EXP, 0);
+    if (a->ncol != b->ncol)
+        NERV_EXIT_STATUS(status, MAT_MISMATCH_DIM, 0);
+    /* validate every index up front, before any resource is acquired */
+    for (i = 0; i < nrow; i++)
+    {
+        long src_row = idx_ptr[b_begin + i];
+        if (!(0 <= src_row && src_row < b->nrow))
+            NERV_EXIT_STATUS(status, MAT_INVALID_IDX, 0);
+    }
+    streams = (cudaStream_t*)malloc(sizeof(cudaStream_t) * nrow);
+    /* malloc(0) may legitimately return NULL, hence the nrow test */
+    if (streams == NULL && nrow > 0)
+        NERV_EXIT_STATUS(status, MAT_INSUF_MEM, 0);
+    for (i = 0; i < nrow; i++)
+    {
+        long src_row = idx_ptr[b_begin + i];
+        CUDA_SAFE_CALL(cudaStreamCreate(streams + i), status);
+        /* copy exactly one row of elements (not the source stride), so the
+         * transfer can never exceed a row of `a` */
+        CUDA_SAFE_CALL(cudaMemcpyAsync(MATRIX_ROW_PTR(a, i),
+                    MATRIX_ROW_PTR(b, src_row),
+                    sizeof(MATRIX_ELEM) * b->ncol,
+                    cudaMemcpyHostToDevice, streams[i]), status);
+    }
+    for (i = 0; i < nrow; i++)
+    {
+        CUDA_SAFE_CALL(cudaStreamSynchronize(streams[i]), status);
+        CUDA_SAFE_CALL(cudaStreamDestroy(streams[i]), status);
+    }
+    free(streams);
+    NERV_SET_STATUS(status, MAT_NORMAL, 0);
+}
+
+/* Runs the cuda_expand_frm kernel wrapper with `b` as input and `a` as
+ * output. `a` must have the same number of rows as `b` and exactly
+ * (2 * context + 1) times as many columns.
+ * Errors: MAT_MISMATCH_DIM for a row-count mismatch, MAT_GENERAL_ERR for
+ * a wrong column count. */
+void nerv_matrix_(expand_frm)(Matrix *a, const Matrix *b,
+                            int context, Status *status) {
+    if (a->nrow != b->nrow)
+        NERV_EXIT_STATUS(status, MAT_MISMATCH_DIM, 0);
+    if (a->ncol != b->ncol * (context * 2 + 1))
+        NERV_EXIT_STATUS(status, MAT_GENERAL_ERR,
+                        "the width should be 2 * context + 1");
+    PROFILE_START
+    cudak_(cuda_expand_frm)(b, a, context);
+    PROFILE_STOP
+    NERV_SET_STATUS(status, MAT_NORMAL, 0);
+}
+
+/* Runs the cuda_rearrange_frm kernel wrapper with `b` as input and `a`
+ * as output. a and b must have the same dimensions and `step` must be a
+ * positive divisor of the column count.
+ * Errors: MAT_GENERAL_ERR when `step` is not positive or does not divide
+ * b->ncol. */
+void nerv_matrix_(rearrange_frm)(Matrix *a, const Matrix *b,
+                                int step, Status *status) {
+    CHECK_SAME_DIMENSION(a, b, status);
+    /* reject step <= 0 first: `% 0` below would be a division by zero */
+    if (step <= 0)
+        NERV_EXIT_STATUS(status, MAT_GENERAL_ERR,
+                        "step should be positive");
+    if (b->ncol % step)
+        NERV_EXIT_STATUS(status, MAT_GENERAL_ERR,
+                        "the dimension of columns is not divisible by step");
+    PROFILE_START
+    cudak_(cuda_rearrange_frm)(b, a, step);
+    PROFILE_STOP
+    NERV_SET_STATUS(status, MAT_NORMAL, 0);
+}
+
+/* In-place update of `a` by the cuda_scale_rows_by_col kernel wrapper,
+ * using the single-column matrix `b` (one entry per row of `a`).
+ * Errors: MAT_MISMATCH_DIM when the row counts differ,
+ * MAT_COL_VECTOR_EXP when `b` is not a single column. */
+void nerv_matrix_(scale_rows_by_col)(Matrix *a, const Matrix *b,
+                                    Status *status) {
+    if (a->nrow != b->nrow)
+        NERV_EXIT_STATUS(status, MAT_MISMATCH_DIM, 0);
+    if (b->ncol != 1)
+        NERV_EXIT_STATUS(status, MAT_COL_VECTOR_EXP, 0);
+    PROFILE_START
+    cudak_(cuda_scale_rows_by_col)(b, a);
+    PROFILE_STOP
+    NERV_SET_STATUS(status, MAT_NORMAL, 0);
+}
+
+/* In-place update of `a` by the cuda_scale_rows_by_row kernel wrapper,
+ * using the single-row matrix `b` (one entry per column of `a`).
+ * Errors: MAT_MISMATCH_DIM when the column counts differ,
+ * MAT_ROW_VECTOR_EXP when `b` is not a single row. */
+void nerv_matrix_(scale_rows_by_row)(Matrix *a, const Matrix *b,
+                                    Status *status) {
+    if (a->ncol != b->ncol)
+        NERV_EXIT_STATUS(status, MAT_MISMATCH_DIM, 0);
+    if (b->nrow != 1)
+        NERV_EXIT_STATUS(status, MAT_ROW_VECTOR_EXP, 0);
+    PROFILE_START
+    cudak_(cuda_scale_rows_by_row)(b, a);
+    PROFILE_STOP
+    NERV_SET_STATUS(status, MAT_NORMAL, 0);
+}
+
+/* Backend for MATRIX_DATA_FREE: releases device memory with cudaFree and
+ * sets `status` to MAT_NORMAL afterwards. */
+static void cuda_matrix_(free)(MATRIX_ELEM *ptr, Status *status) {
+    CUDA_SAFE_SYNC_CALL(cudaFree(ptr), status);
+    NERV_SET_STATUS(status, MAT_NORMAL, 0);
+}
+
+/* Backend for MATRIX_DATA_ALLOC: allocates pitched device memory with
+ * cudaMallocPitch. `width` is forwarded as the row width and `height` as
+ * the number of rows; the resulting pointer is stored in *dptr and the
+ * pitch in *stride. Sets `status` to MAT_NORMAL afterwards. */
+static void cuda_matrix_(alloc)(MATRIX_ELEM **dptr,
+                                size_t *stride, long width, long height,
+                                Status *status) {
+    PROFILE_START
+    CUDA_SAFE_SYNC_CALL(cudaMallocPitch((void **)dptr, stride, width, height),
+                        status);
+    PROFILE_STOP
+    NERV_SET_STATUS(status, MAT_NORMAL, 0);
+}
+
+#include "matrix.c"
+#endif
diff --git a/nerv/lib/matrix/generic/cumatrix.h b/nerv/lib/matrix/generic/cumatrix.h
new file mode 100644
index 0000000..9a4f87e
--- /dev/null
+++ b/nerv/lib/matrix/generic/cumatrix.h
@@ -0,0 +1,50 @@
+#include "../../../common.h"
+
+void nerv_matrix_(add)(Matrix *c, const Matrix *a, const Matrix *b,
+                            MATRIX_ELEM alpha, MATRIX_ELEM beta,
+                            Status *status);
+void nerv_matrix_(mul)(Matrix *c, const Matrix *a, const Matrix *b,
+                            MATRIX_ELEM alpha, MATRIX_ELEM beta,
+                            int ta, int tb, Status *status);
+void nerv_matrix_(sigmoid)(Matrix *a, const Matrix *b, Status *status);
+void nerv_matrix_(sigmoid_grad)(Matrix *nerr, const Matrix *err,
+                                const Matrix *output, Status *status);
+
+Matrix *nerv_matrix_(softmax)(Matrix *b, const Matrix *a, Status *status);
+Matrix *nerv_matrix_(rowsum)(Matrix *a, Status *status);
+Matrix *nerv_matrix_(colsum)(Matrix *a, Status *status);
+Matrix *nerv_matrix_(colsame)(Matrix *a, const Matrix *ref,
+                                Status *status);
+Matrix *nerv_matrix_(rowmax)(Matrix *a, Status *status);
+void nerv_matrix_(rowmax_idx)(Matrix *a, Matrix **b, Matrix **idx,
+                                Status *status);
+void nerv_matrix_(add_row)(Matrix *b, const Matrix *a, double beta,
+                            Status *status);
+void nerv_matrix_(fill)(Matrix *self, double val, Status *status);
+void nerv_matrix_(copy_fromd)(Matrix *a, const Matrix *b,
+                            int a_begin, int b_begin, int b_end,
+                            Status *status);
+void nerv_matrix_(copy_fromh)(Matrix *a, const Matrix *b,
+                            int a_begin, int b_begin, int b_end,
+                            Status *status);
+void nerv_matrix_(copy_toh)(Matrix *a, const Matrix *b,
+                            int a_begin, int a_end, int b_begin,
+                            Status *status);
+Matrix *nerv_matrix_(trans)(Matrix *a, Status *status);
+void nerv_matrix_(mul_elem)(Matrix *c, const Matrix *a, const Matrix *b,
+                            Status *status);
+
+void nerv_matrix_(log_elem)(Matrix *b, const Matrix *a, Status *status);
+
+Matrix *nerv_matrix_(decompress)(const Matrix *a, int orig_col, Status *status);
+void nerv_matrix_(copy_rows_fromh_by_idx)(Matrix *a, const Matrix *b,
+                            const Matrix *idx, int b_begin, Status *status);
+
+void nerv_matrix_(expand_frm)(Matrix *a, const Matrix *b,
+                            int context, Status *status);
+void nerv_matrix_(rearrange_frm)(Matrix *a, const Matrix *b,
+                                int step, Status *status);
+void nerv_matrix_(scale_rows_by_col)(Matrix *a, const Matrix *b,
+                                    Status *status);
+void nerv_matrix_(scale_rows_by_row)(Matrix *a, const Matrix *b,
+                                    Status *status);
diff --git a/nerv/lib/matrix/generic/elem_type.h b/nerv/lib/matrix/generic/elem_type.h
new file mode 100644
index 0000000..bffe940
--- /dev/null
+++ b/nerv/lib/matrix/generic/elem_type.h
@@ -0,0 +1,22 @@
+/* Element-type selection for the generic matrix code. Depending on which
+ * of MATRIX_USE_FLOAT / MATRIX_USE_DOUBLE / MATRIX_USE_INT is defined,
+ * this provides:
+ *   MATRIX_ELEM            the C element type
+ *   MATRIX_ELEM_FMT        format string for one element
+ *   MATRIX_ELEM_WRITE_FMT  format string for writing one element
+ *   MATRIX_ELEM_PTR(self)  the matching member of self->data
+ * If none of the three selectors is defined, nothing is defined here. */
+#ifdef MATRIX_USE_FLOAT
+
+#define MATRIX_ELEM float
+#define MATRIX_ELEM_FMT "%f"
+#define MATRIX_ELEM_WRITE_FMT "%.8f"
+#define MATRIX_ELEM_PTR(self) ((self)->data.f)
+
+#elif defined(MATRIX_USE_DOUBLE)
+
+#define MATRIX_ELEM double
+#define MATRIX_ELEM_FMT "%lf"
+#define MATRIX_ELEM_WRITE_FMT "%.8lf"
+#define MATRIX_ELEM_PTR(self) ((self)->data.d)
+
+#elif defined(MATRIX_USE_INT)
+
+/* the "int" variant stores long elements */
+#define MATRIX_ELEM long
+#define MATRIX_ELEM_FMT "%ld"
+#define MATRIX_ELEM_WRITE_FMT "%ld"
+#define MATRIX_ELEM_PTR(self) ((self)->data.i)
+
+#endif
diff --git a/nerv/lib/matrix/generic/matrix.c b/nerv/lib/matrix/generic/matrix.c
new file mode 100644
index 0000000..91577e1
--- /dev/null
+++ b/nerv/lib/matrix/generic/matrix.c
@@ -0,0 +1,57 @@
+#ifdef NERV_GENERIC_MATRIX
+#include "../../../common.h"
+#include "matrix.h"
+/* FIXME: malloc failure detection */
+
+/* Drops one reference to the data shared through self->data_ref and
+ * releases the `self` handle itself.
+ *
+ * When the last reference goes away, the element buffer and the shared
+ * counter are freed as well; otherwise only the handle is freed and
+ * `status` is set to MAT_NORMAL. The handle must always be freed: handles
+ * made by getrow share the counter, so leaving the struct alive while
+ * other references exist would leak one Matrix per destroyed handle.
+ *
+ * NOTE(review): the buffer is freed through the pointer stored in the
+ * handle that happens to drop the last reference. For a handle made by
+ * getrow that is a row pointer rather than the start of the allocation --
+ * this needs a separate fix (e.g. keeping the base pointer). */
+static void nerv_matrix_(data_free)(Matrix *self, Status *status) {
+    assert(*self->data_ref > 0);
+    if (--(*self->data_ref) == 0)
+    {
+        /* free matrix data */
+        MATRIX_DATA_FREE(MATRIX_ELEM_PTR(self), status);
+        free(self->data_ref);
+    }
+    else
+    {
+        NERV_SET_STATUS(status, MAT_NORMAL, 0);
+    }
+    free(self);
+}
+
+/* Adds one reference to the data shared through self->data_ref. */
+static void nerv_matrix_(data_retain)(Matrix *self) {
+    (*self->data_ref)++;
+}
+
+/* Allocates a new nrow x ncol matrix whose element buffer comes from
+ * MATRIX_DATA_ALLOC (row width given in bytes) and whose reference count
+ * starts at 1.
+ * Returns the matrix with `status` set to MAT_NORMAL. On failure returns
+ * NULL with nothing left allocated: `status` holds MAT_INSUF_MEM when a
+ * host allocation fails, or the error reported by MATRIX_DATA_ALLOC. */
+Matrix *nerv_matrix_(create)(long nrow, long ncol, Status *status) {
+    Matrix *self = (Matrix *)malloc(sizeof(Matrix));
+    if (self == NULL)
+    {
+        NERV_SET_STATUS(status, MAT_INSUF_MEM, 0);
+        return NULL;
+    }
+    self->nrow = nrow;
+    self->ncol = ncol;
+    self->nmax = self->nrow * self->ncol;
+    MATRIX_DATA_ALLOC(&MATRIX_ELEM_PTR(self), &self->stride,
+                     sizeof(MATRIX_ELEM) * self->ncol, self->nrow,
+                     status);
+    if (status->err_code != MAT_NORMAL)
+    {
+        free(self);
+        return NULL;
+    }
+    self->data_ref = (long *)malloc(sizeof(long));
+    if (self->data_ref == NULL)
+    {
+        /* give the element buffer back before reporting the failure; the
+         * status written by the free is replaced by the real cause */
+        MATRIX_DATA_FREE(MATRIX_ELEM_PTR(self), status);
+        free(self);
+        NERV_SET_STATUS(status, MAT_INSUF_MEM, 0);
+        return NULL;
+    }
+    *self->data_ref = 0;
+    nerv_matrix_(data_retain)(self);
+    NERV_SET_STATUS(status, MAT_NORMAL, 0);
+    return self;
+}
+
+/* Public release entry point: forwards to data_free, which drops one
+ * reference to the shared data and reports the outcome in `status`. */
+void nerv_matrix_(destroy)(Matrix *self, Status *status) {
+    nerv_matrix_(data_free)(self, status);
+}
+
+/* Returns a new 1 x ncol matrix handle that aliases row `row` of `self`:
+ * no element data is copied, the handle points into self's buffer and
+ * shares (and increments) its reference counter. `row` is not
+ * range-checked and the handle allocation is not checked. */
+Matrix *nerv_matrix_(getrow)(Matrix *self, int row) {
+    Matrix *view = (Matrix *)malloc(sizeof(Matrix));
+    /* shape of a single row */
+    view->nrow = 1;
+    view->ncol = self->ncol;
+    view->nmax = view->ncol;
+    view->stride = self->stride;
+    /* alias the parent's storage and join its reference count */
+    view->data_ref = self->data_ref;
+    MATRIX_ELEM_PTR(view) = MATRIX_ROW_PTR(self, row);
+    nerv_matrix_(data_retain)(view);
+    return view;
+}
+#endif
diff --git a/nerv/lib/matrix/generic/matrix.h b/nerv/lib/matrix/generic/matrix.h
new file mode 100644
index 0000000..9d44e6d
--- /dev/null
+++ b/nerv/lib/matrix/generic/matrix.h
@@ -0,0 +1,4 @@
+#include "../matrix.h"
+Matrix *nerv_matrix_(create)(long nrow, long ncol, Status *status);
+void nerv_matrix_(destroy)(Matrix *self, Status *status);
+Matrix *nerv_matrix_(getrow)(Matrix *self, int row);
diff --git a/nerv/lib/matrix/generic/mmatrix.c b/nerv/lib/matrix/generic/mmatrix.c
new file mode 100644
index 0000000..e3d1f93
--- /dev/null
+++ b/nerv/lib/matrix/generic/mmatrix.c
@@ -0,0 +1,82 @@
+#ifdef NERV_GENERIC_MMATRIX
+#include "matrix.h"
+#include "elem_type.h"
+#define MATRIX_DATA_FREE(ptr, status) host_matrix_(free)(ptr, status)
+#define MATRIX_DATA_ALLOC(dptr, stride, width, height, status) \
+                            host_matrix_(alloc)(dptr, stride, width, height, status)
+#define NERV_GENERIC_MATRIX
+#include "../../../common.h"
+#include "../../io/chunk_file.h"
+#include "string.h"
+
+/* Backend for MATRIX_DATA_FREE on host matrices: releases the buffer with
+ * free() and sets `status` to MAT_NORMAL. */
+static void host_matrix_(free)(MATRIX_ELEM *ptr, Status *status) {
+    free(ptr);
+    NERV_SET_STATUS(status, MAT_NORMAL, 0);
+}
+
+/* Backend for MATRIX_DATA_ALLOC on host matrices: allocates
+ * width * height bytes with malloc (`width` is the row size in bytes) and
+ * reports `width` as the stride, i.e. rows are stored back to back.
+ * Sets MAT_INSUF_MEM when malloc returns NULL, MAT_NORMAL otherwise. */
+static void host_matrix_(alloc)(MATRIX_ELEM **dptr, size_t *stride,
+                                long width, long height, Status *status) {
+    if ((*dptr = (MATRIX_ELEM *)malloc(width * height)) == NULL)
+        NERV_EXIT_STATUS(status, MAT_INSUF_MEM, 0);
+    *stride = width;
+    NERV_SET_STATUS(status, MAT_NORMAL, 0);
+}
+
+#include "matrix.c"
+/* Reads a matrix in text form from cdp->fp: first "nrow ncol", then
+ * nrow * ncol elements in row order, each parsed with MATRIX_ELEM_FMT.
+ * Returns the newly created matrix (owned by the caller) with `status`
+ * set to MAT_NORMAL. On failure returns NULL with nothing left
+ * allocated: MAT_INVALID_FORMAT when the header or an element cannot be
+ * parsed or a dimension is negative, or the error reported by create. */
+Matrix *nerv_matrix_(load)(ChunkData *cdp, Status *status) {
+    long i, j;
+    long nrow, ncol;
+    FILE *fp = cdp->fp;
+    Matrix *self;
+    if (fscanf(fp, "%ld %ld", &nrow, &ncol) != 2 || nrow < 0 || ncol < 0)
+    {
+        /* explicit return value: this function returns a pointer */
+        NERV_SET_STATUS(status, MAT_INVALID_FORMAT, 0);
+        return NULL;
+    }
+    self = nerv_matrix_(create)(nrow, ncol, status);
+    if (status->err_code != MAT_NORMAL)
+        return NULL;
+    for (i = 0; i < nrow; i++)
+    {
+        MATRIX_ELEM *row = MATRIX_ROW_PTR(self, i);
+        for (j = 0; j < ncol; j++)
+            if (fscanf(fp, MATRIX_ELEM_FMT, row + j) != 1)
+            {
+                /* destroy releases the element buffer and the reference
+                 * counter too, not just the struct; its status is then
+                 * replaced by the parse error */
+                nerv_matrix_(destroy)(self, status);
+                NERV_SET_STATUS(status, MAT_INVALID_FORMAT, 0);
+                return NULL;
+            }
+    }
+    NERV_SET_STATUS(status, MAT_NORMAL, 0);
+    return self;
+}
+
+/* Writes `self` in text form to cfp->fp: a "nrow ncol" header line, then
+ * one line per row with every element printed using
+ * MATRIX_ELEM_WRITE_FMT followed by a space. Sets MAT_WRITE_ERROR as soon
+ * as a write fails, MAT_NORMAL otherwise. */
+void nerv_matrix_(save)(Matrix *self, ChunkFile *cfp, Status *status) {
+    int r, c;
+    FILE *out = cfp->fp;
+    long nrow = self->nrow;
+    long ncol = self->ncol;
+    /* header */
+    if (fprintf(out, "%ld %ld\n", nrow, ncol) < 0)
+        NERV_EXIT_STATUS(status, MAT_WRITE_ERROR, 0);
+    /* body: one text line per matrix row */
+    for (r = 0; r < nrow; r++)
+    {
+        MATRIX_ELEM *elems = MATRIX_ROW_PTR(self, r);
+        for (c = 0; c < ncol; c++)
+        {
+            if (fprintf(out, MATRIX_ELEM_WRITE_FMT " ", elems[c]) < 0)
+                NERV_EXIT_STATUS(status, MAT_WRITE_ERROR, 0);
+        }
+        if (fprintf(out, "\n") < 0)
+            NERV_EXIT_STATUS(status, MAT_WRITE_ERROR, 0);
+    }
+    NERV_SET_STATUS(status, MAT_NORMAL, 0);
+}
+
+/* Host-to-host copy of rows [b_begin, b_end) of `b` into `a`, starting
+ * at row a_begin of `a`, done as one memmove of
+ * ncol * (b_end - b_begin) elements (rows are taken to be stored back to
+ * back, as host_matrix_(alloc) lays them out).
+ * Errors: MAT_INVALID_COPY_INTERVAL when the source interval is empty or
+ * either interval falls outside its matrix (including a negative
+ * a_begin); MAT_MISMATCH_DIM when the column counts differ. */
+void nerv_matrix_(copy_from)(Matrix *a, const Matrix *b,
+                            int a_begin, int b_begin, int b_end,
+                            Status *status) {
+    /* a_begin needs its own lower bound: the upper-bound test alone is
+     * satisfied by negative values, which would write before `a` */
+    if (!(0 <= a_begin &&
+            0 <= b_begin && b_begin < b_end && b_end <= b->nrow &&
+            a_begin + b_end - b_begin <= a->nrow))
+        NERV_EXIT_STATUS(status, MAT_INVALID_COPY_INTERVAL, 0);
+    if (a->ncol != b->ncol)
+        NERV_EXIT_STATUS(status, MAT_MISMATCH_DIM, 0);
+    memmove(MATRIX_ROW_PTR(a, a_begin),
+            MATRIX_ROW_PTR(b, b_begin),
+            sizeof(MATRIX_ELEM) * b->ncol * (b_end - b_begin));
+    NERV_SET_STATUS(status, MAT_NORMAL, 0);
+}
+
+#endif
diff --git a/nerv/lib/matrix/generic/mmatrix.h b/nerv/lib/matrix/generic/mmatrix.h
new file mode 100644
index 0000000..5336e7a
--- /dev/null
+++ b/nerv/lib/matrix/generic/mmatrix.h
@@ -0,0 +1,7 @@
+#include "../../../common.h"
+
+Matrix *nerv_matrix_(load)(ChunkData *cdp, Status *status);
+void nerv_matrix_(save)(Matrix *self, ChunkFile *cfp, Status *status);
+void nerv_matrix_(copy_from)(Matrix *a, const Matrix *b,
+                            int a_begin, int b_begin, int b_end,
+                            Status *status);
diff --git a/nerv/lib/matrix/init.lua b/nerv/lib/matrix/init.lua
new file mode 100644
index 0000000..1a8925f
--- /dev/null
+++ b/nerv/lib/matrix/init.lua
@@ -0,0 +1,77 @@
+function nerv.Matrix:__tostring__() -- dump every entry as text, ending with a "[typename nrow x ncol]" footer
+    local ncol = self:ncol()
+    local nrow = self:nrow()
+    local strt = {} -- pieces are collected here and joined once with table.concat
+    local fmt
+    if self.fmt then -- a format string set on the matrix type overrides the default
+        fmt = self.fmt
+    else
+        fmt = "%.8f "
+    end
+    if nrow == 1 then -- a single-row matrix is indexed directly by column
+        for col = 0, ncol - 1 do
+            table.insert(strt, string.format(fmt, self[col]))
+        end
+        table.insert(strt, "\n")
+    else
+        for row = 0, nrow - 1 do -- indices are 0-based
+            local rp = self[row] -- self[row] is then indexed by column
+            for col = 0, ncol - 1 do
+                table.insert(strt, string.format(fmt, rp[col]))
+            end
+            table.insert(strt, "\n")
+        end
+    end
+    table.insert(strt, string.format(
+        "[%s %d x %d]", self.__typename, nrow, ncol))
+    return table.concat(strt)
+end
+
+-- gen: a function that takes the indices of an entry and returns its value;
+-- every entry of the matrix is assigned gen(i, j), or gen(j) when it has one row
+function nerv.Matrix:generate(gen)
+    if (self:nrow() == 1) then
+        for j = 0, self:ncol() - 1 do
+            self[j] = gen(j)
+        end
+    else
+        for i = 0, self:nrow() - 1 do
+            local row = self[i]
+            for j = 0, self:ncol() - 1 do
+                row[j] = gen(i, j)
+            end
+        end
+    end
+end
+
+nerv.MMatrixInt.fmt = "%d "
+
+function nerv.CuMatrix:__add__(b) -- returns a new matrix from self:create(), filled by add(self, b, 1.0, 1.0)
+    local c = self:create() -- local: the result must not leak into (or clobber) a global named c
+    c:add(self, b, 1.0, 1.0)
+    return c
+end
+
+function nerv.CuMatrix:__sub__(b) -- returns a new matrix from self:create(), filled by add(self, b, 1.0, -1.0)
+    local c = self:create() -- local: the result must not leak into (or clobber) a global named c
+    c:add(self, b, 1.0, -1.0)
+    return c
+end
+
+function nerv.CuMatrix:__mul__(b) -- returns a new self:nrow() x b:ncol() matrix of self's type, filled by mul(self, b, ...)
+    local c = nerv.get_type(self.__typename)(self:nrow(), b:ncol()) -- local: must not leak into a global named c
+    c:mul(self, b, 1.0, 0.0, 'N', 'N')
+    return c
+end
+
+function nerv.CuMatrixFloat.new_from_host(mat) -- plain function (no self): builds a CuMatrixFloat sized like mat
+    local res = nerv.CuMatrixFloat(mat:nrow(), mat:ncol())
+    res:copy_fromh(mat) -- fill the new matrix from mat
+    return res
+end
+
+function nerv.CuMatrixFloat:new_to_host() -- builds an MMatrixFloat sized like self and copies self into it
+    local res = nerv.MMatrixFloat(self:nrow(), self:ncol())
+    self:copy_toh(res) -- res is the destination of the copy
+    return res
+end
diff --git a/nerv/matrix/generic/matrix.h b/nerv/lib/matrix/matrix.h
index 833724b..cbf32c2 100644
--- a/nerv/matrix/generic/matrix.h
+++ b/nerv/lib/matrix/matrix.h
@@ -2,6 +2,7 @@
 #define NERV_GENERIC_MATRIX_H
 
 #include <stddef.h>
+
 typedef struct Matrix {
     size_t stride;              /* size of a row */
     long ncol, nrow, nmax;    /* dimension of the matrix */
@@ -15,5 +16,4 @@ typedef struct Matrix {
 
 #define MATRIX_ROW_PTR(self, row) \
     (MATRIX_ELEM *)((char *)MATRIX_ELEM_PTR(self) + (row) * (self)->stride)
-
 #endif
diff --git a/nerv/lib/matrix/mmatrix.c b/nerv/lib/matrix/mmatrix.c
new file mode 100644
index 0000000..2f58e7f
--- /dev/null
+++ b/nerv/lib/matrix/mmatrix.c
@@ -0,0 +1,53 @@
+#define NERV_GENERIC_MMATRIX
+#include <stdlib.h>
+#include "../../common.h"
+
+#define MATRIX_USE_FLOAT
+#define host_matrix_(NAME) host_matrix_float_##NAME
+#define nerv_matrix_(NAME) nerv_matrix_host_float_##NAME
+#include "generic/matrix.h"
+#include "generic/mmatrix.c"
+#undef nerv_matrix_
+#undef host_matrix_
+#undef MATRIX_USE_FLOAT
+#undef MATRIX_ELEM
+#undef MATRIX_ELEM_PTR
+#undef MATRIX_ELEM_FMT
+#undef MATRIX_ELEM_WRITE_FMT
+
+#define NERV_GENERIC_MMATRIX
+#define MATRIX_USE_DOUBLE
+#define host_matrix_(NAME) host_matrix_double_##NAME
+#define nerv_matrix_(NAME) nerv_matrix_host_double_##NAME
+#include "generic/mmatrix.c"
+#undef nerv_matrix_
+#undef host_matrix_
+#undef MATRIX_USE_DOUBLE
+#undef MATRIX_ELEM
+#undef MATRIX_ELEM_PTR
+#undef MATRIX_ELEM_FMT
+#undef MATRIX_ELEM_WRITE_FMT
+
+#define NERV_GENERIC_MMATRIX
+#define MATRIX_USE_INT
+#define host_matrix_(NAME) host_matrix_int_##NAME
+#define nerv_matrix_(NAME) nerv_matrix_host_int_##NAME
+#include "generic/mmatrix.c"
+
+Matrix *nerv_matrix_(perm_gen)(int ncol, Status *status) { /* returns a 1 x ncol matrix holding a shuffled 0..ncol-1, or NULL on failure */
+    int i;
+    Matrix *self = nerv_matrix_(create)(1, ncol, status);
+    if (status->err_code != MAT_NORMAL) /* creation failed: status already holds the error code */
+        return NULL;
+    long *prow = self->data.i;
+    for (i = 0; i < ncol; i++) /* start from the identity 0, 1, ..., ncol - 1 */
+        prow[i] = i;
+    for (i = ncol - 1; i >= 0; i--) /* shuffle: swap slot i with a rand()-chosen slot j in [0, i] */
+    {
+        size_t j = rand() % (i + 1);
+        long tmp = prow[i];
+        prow[i] = prow[j];
+        prow[j] = tmp;
+    }
+    return self;
+}
diff --git a/nerv/lib/matrix/mmatrix.h b/nerv/lib/matrix/mmatrix.h
new file mode 100644
index 0000000..df91e4c
--- /dev/null
+++ b/nerv/lib/matrix/mmatrix.h
@@ -0,0 +1,4 @@
+#ifndef NERV_MMATRIX_H
+#define NERV_MMATRIX_H
+Matrix *nerv_matrix_(perm_gen)(int ncol, Status *status);
+#endif
diff --git a/nerv/matrix/cumatrix.c b/nerv/matrix/cumatrix.c
index af34fb4..1bcb0f1 100644
--- a/nerv/matrix/cumatrix.c
+++ b/nerv/matrix/cumatrix.c
@@ -1,6 +1,7 @@
 #define NERV_GENERIC_CUMATRIX
 #include "../common.h"
-#include "cuda_helper.h"
+#include "../lib/matrix/cumatrix.h"
+#include "../lib/matrix/cuda_helper.h"
 #include <string.h>
 #define PROFILE_HASHMAP_SIZE 123457
 static cublasHandle_t cublas_handle;
@@ -8,54 +9,29 @@ static cudaEvent_t profile_start, profile_stop;
 static HashMap *profile;
 
 static int print_profile(lua_State *L) {
-    (void)L;
-    size_t i;
-    fprintf(stderr, "*** [nerv cumatrix profile] **\n");
-    for (i = 0; i < profile->size; i++)
-    {
-        HashNode *ptr;
-        for (ptr = profile->bucket[i]; ptr; ptr = ptr->next)
-        {
-            fprintf(stderr, "%s:\t%.6f\n", ptr->key, *(float *)ptr->val);
-        }
-    }
+    nerv_cumatrix_print_profile();
     return 0;
 }
 
 static int clear_profile(lua_State *L) {
-    (void)L;
-    hashmap_clear(profile);
+    nerv_cumatrix_clear_profile();
     return 0;
 }
 
-void accu_profile(const char *name, float delta) {
-    float *val = hashmap_getval(profile, name);
-    if (!val)
-    {
-        val = malloc(sizeof(float));
-        *val = 0;
-        hashmap_setval(profile, name, val);
-    }
-    *val += delta;
-}
-
 static const luaL_Reg cumatrix_methods[] = {
     {"print_profile", print_profile},
     {"clear_profile", clear_profile},
     {NULL, NULL}
 };
 
-extern void nerv_matrix_cuda_float_init(lua_State *L);
-extern void nerv_matrix_cuda_double_init(lua_State *L);
+extern void nerv_matrix_cuda_float_lua_init(lua_State *L);
+extern void nerv_matrix_cuda_double_lua_init(lua_State *L);
 
-void nerv_cumatrix_init(lua_State *L) {
+void nerv_lua_cumatrix_init(lua_State *L) {
     luaL_register(L, NULL, cumatrix_methods);
-    cublasCreate(&cublas_handle);
-    cudaEventCreate(&profile_start);
-    cudaEventCreate(&profile_stop);
-    profile = hashmap_create(PROFILE_HASHMAP_SIZE, bkdr_hash, strcmp);
-    nerv_matrix_cuda_float_init(L);
-    nerv_matrix_cuda_double_init(L);
+    nerv_cumatrix_init();
+    nerv_matrix_cuda_float_lua_init(L);
+    nerv_matrix_cuda_double_lua_init(L);
 }
 
 #define MATRIX_USE_FLOAT
diff --git a/nerv/matrix/generic/cumatrix.c b/nerv/matrix/generic/cumatrix.c
index b5d1a35..859718e 100644
--- a/nerv/matrix/generic/cumatrix.c
+++ b/nerv/matrix/generic/cumatrix.c
@@ -1,10 +1,6 @@
 #ifdef NERV_GENERIC_CUMATRIX
-#include "matrix.h"
+#include "../../lib/matrix/generic/matrix.h"
 #include "elem_type.h"
-
-#define MATRIX_DATA_FREE(L, ptr) cuda_matrix_(free)(L, ptr)
-#define MATRIX_DATA_ALLOC(L, dptr, stride, width, height) \
-                            cuda_matrix_(alloc)(L, dptr, stride, width, height)
 #define MATRIX_DATA_WRITE(L, data, idx, val) cuda_matrix_(write)(L, data, idx, val)
 #define MATRIX_DATA_READ(L, data, idx) cuda_matrix_(read)(L, data, idx)
 #define MATRIX_INIT(L) cuda_matrix_(init)(L)
@@ -12,443 +8,303 @@
 #define NERV_GENERIC_MATRIX
 #define NERV_GENERIC_CUKERNEL
 #include "../../common.h"
-#include "../cukernel.h"
-#include "../cuda_helper.h"
-
-Matrix *nerv_matrix_(new_)(lua_State *L, long nrow, long ncol);
-void nerv_matrix_(data_free)(lua_State *L, Matrix *self);
-
-static void nerv_matrix_(add_)(lua_State *L, const Matrix *a, const Matrix *b,
-                                const Matrix *c,
-                                MATRIX_ELEM alpha, MATRIX_ELEM beta) {
-    PROFILE_START
-    CUBLAS_SAFE_SYNC_CALL(
-            NERV_CUBLAS_(geam)(cublas_handle, CUBLAS_OP_N, CUBLAS_OP_N,
-                a->ncol, a->nrow,
-                &alpha,
-                MATRIX_ELEM_PTR(a), a->stride / sizeof(MATRIX_ELEM),
-                &beta,
-                MATRIX_ELEM_PTR(b), b->stride / sizeof(MATRIX_ELEM),
-                MATRIX_ELEM_PTR(c), c->stride / sizeof(MATRIX_ELEM)));
-    PROFILE_STOP
-}
+#include "../../lib/matrix/generic/cumatrix.h"
 
-static int nerv_matrix_(add)(lua_State *L) {
+static int nerv_matrix_(lua_add)(lua_State *L) {
+    Status status;
     Matrix *c = luaT_checkudata(L, 1, nerv_matrix_(tname));
-    Matrix *a = luaT_checkudata(L, 2, nerv_matrix_(tname));
-    Matrix *b = luaT_checkudata(L, 3, nerv_matrix_(tname));
+    const Matrix *a = luaT_checkudata(L, 2, nerv_matrix_(tname));
+    const Matrix *b = luaT_checkudata(L, 3, nerv_matrix_(tname));
     MATRIX_ELEM alpha = luaL_checknumber(L, 4);
     MATRIX_ELEM beta = luaL_checknumber(L, 5);
-    CHECK_SAME_DIMENSION(a, b);
-    CHECK_SAME_DIMENSION(a, c);
-    nerv_matrix_(add_)(L, a, b, c, alpha, beta);
+    nerv_matrix_(add)(c, a, b, alpha, beta, &status);
+    NERV_LUA_CHECK_STATUS(L, status);
     return 0;
 }
 
-static int nerv_matrix_(get_cublas_op)(char ch) {
+static int nerv_matrix_(lua_get_cublas_op)(char ch) {
     return (ch == 'T' || ch == 't') ? CUBLAS_OP_T : CUBLAS_OP_N;
 }
 
-static int nerv_matrix_(mul)(lua_State *L) {
-#define SWAP(a, b) \
-    do { int t = (a); (a) = (b); (b) = t; } while (0)
-
+static int nerv_matrix_(lua_mul)(lua_State *L) {
+    Status status;
     Matrix *c = luaT_checkudata(L, 1, nerv_matrix_(tname));
     Matrix *a = luaT_checkudata(L, 2, nerv_matrix_(tname));
     Matrix *b = luaT_checkudata(L, 3, nerv_matrix_(tname));
     MATRIX_ELEM alpha = luaL_checknumber(L, 4);
     MATRIX_ELEM beta = luaL_checknumber(L, 5);
     int nargs = lua_gettop(L);
-    int ta = nargs > 5 ? nerv_matrix_(get_cublas_op)(*luaL_checkstring(L, 6)) \
+    int ta = nargs > 5 ? nerv_matrix_(lua_get_cublas_op)(*luaL_checkstring(L, 6)) \
                             : CUBLAS_OP_N;
-    int tb = nargs > 6 ? nerv_matrix_(get_cublas_op)(*luaL_checkstring(L, 7)) \
+    int tb = nargs > 6 ? nerv_matrix_(lua_get_cublas_op)(*luaL_checkstring(L, 7)) \
                             : CUBLAS_OP_N;
-    int am = a->nrow, an = a->ncol;
-    int bm = b->nrow, bn = b->ncol;
-    if (ta == CUBLAS_OP_T) SWAP(am, an);
-    if (tb == CUBLAS_OP_T) SWAP(bm, bn);
-    if (an != bm)
-        nerv_error(L, "Wrong dimension of multipliers");
-/*    MATRIX_ELEM alpha = 1.0f, beta = 0.0f; */
-    /* Because matrix in Nerv is row-major, here b comes first */
-    PROFILE_START
-    CUBLAS_SAFE_SYNC_CALL(
-            NERV_CUBLAS_(gemm)(cublas_handle, tb, ta,
-                bn, am, bm,
-                &alpha,
-                MATRIX_ELEM_PTR(b), b->stride / sizeof(MATRIX_ELEM),
-                MATRIX_ELEM_PTR(a), a->stride / sizeof(MATRIX_ELEM),
-                &beta,
-                MATRIX_ELEM_PTR(c), c->stride / sizeof(MATRIX_ELEM)));
-    PROFILE_STOP
+    nerv_matrix_(mul)(c, a, b, alpha, beta, ta, tb, &status);
+    NERV_LUA_CHECK_STATUS(L, status);
     return 0;
 }
 
-static int nerv_matrix_(create)(lua_State *L) {
+static int nerv_matrix_(lua_create)(lua_State *L) {
+    Status status;
     Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname));
-    Matrix *b = nerv_matrix_(new_)(L, a->nrow, a->ncol);
+    Matrix *b = nerv_matrix_(create)(a->nrow, a->ncol, &status);
+    NERV_LUA_CHECK_STATUS(L, status);
     luaT_pushudata(L, b, nerv_matrix_(tname));
     return 1;
 }
 
-static int nerv_matrix_(sigmoid)(lua_State *L) {
+static int nerv_matrix_(lua_sigmoid)(lua_State *L) {
+    Status status;
     Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname));
     Matrix *b = luaT_checkudata(L, 2, nerv_matrix_(tname));
-    CHECK_SAME_DIMENSION(a, b);
-    PROFILE_START
-    cudak_(cuda_sigmoid)(b, a);
-    PROFILE_STOP
+    nerv_matrix_(sigmoid)(a, b, &status);
+    NERV_LUA_CHECK_STATUS(L, status);
     return 0;
 }
 
-static int nerv_matrix_(sigmoid_grad)(lua_State *L) {
+static int nerv_matrix_(lua_sigmoid_grad)(lua_State *L) {
+    Status status;
     Matrix *nerr = luaT_checkudata(L, 1, nerv_matrix_(tname));
     Matrix *err = luaT_checkudata(L, 2, nerv_matrix_(tname));
     Matrix *output = luaT_checkudata(L, 3, nerv_matrix_(tname));
-    CHECK_SAME_DIMENSION(nerr, err);
-    CHECK_SAME_DIMENSION(nerr, output);
-    PROFILE_START
-    cudak_(cuda_sigmoid_grad)(output, err, nerr);
-    PROFILE_STOP
+    nerv_matrix_(sigmoid_grad)(nerr, err, output, &status);
+    NERV_LUA_CHECK_STATUS(L, status);
     return 0;
 }
 
-static int nerv_matrix_(softmax)(lua_State *L) {
+static int nerv_matrix_(lua_softmax)(lua_State *L) {
+    Status status;
     Matrix *a = luaT_checkudata(L, 2, nerv_matrix_(tname));
     Matrix *b = luaT_checkudata(L, 1, nerv_matrix_(tname));
-    Matrix *max, *max_idx;
-    Matrix *dno;
-    CHECK_SAME_DIMENSION(a, b);
-    max = nerv_matrix_(new_)(L, a->nrow, 1);
-    max_idx = nerv_matrix_(new_)(L, a->nrow, 1);
-    dno = nerv_matrix_(new_)(L, a->nrow, 1);
-    PROFILE_START
-    cudak_(cuda_rowmax_idx)(a, max, max_idx);
-    cudak_(cuda_softmax_denominator)(a, max, dno);
-    cudak_(cuda_softmax_final)(a, max, dno, b);
-    PROFILE_STOP
-    nerv_matrix_(data_free)(L, max);
-    nerv_matrix_(data_free)(L, dno);
+    Matrix *max_idx = nerv_matrix_(softmax)(b, a, &status);
+    NERV_LUA_CHECK_STATUS(L, status);
     luaT_pushudata(L, max_idx, nerv_matrix_(tname));
     return 1;
 }
 
-static int nerv_matrix_(rowsum)(lua_State *L) {
+static int nerv_matrix_(lua_rowsum)(lua_State *L) {
+    Status status;
     Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname));
-    Matrix *b = nerv_matrix_(new_)(L, a->nrow, 1);
-    PROFILE_START
-    cudak_(cuda_rowsum)(a, b);
-    PROFILE_STOP
+    Matrix *b = nerv_matrix_(rowsum)(a, &status);
+    NERV_LUA_CHECK_STATUS(L, status);
     luaT_pushudata(L, b, nerv_matrix_(tname));
     return 1;
 }
 
-static int nerv_matrix_(colsum)(lua_State *L) {
+static int nerv_matrix_(lua_colsum)(lua_State *L) {
+    Status status;
     Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname));
-    Matrix *b = nerv_matrix_(new_)(L, 1, a->ncol);
-    PROFILE_START
-    cudak_(cuda_colsum)(a, b);
-    PROFILE_STOP
+    Matrix *b = nerv_matrix_(colsum)(a, &status);
+    NERV_LUA_CHECK_STATUS(L, status);
     luaT_pushudata(L, b, nerv_matrix_(tname));
     return 1;
 }
 
-static int nerv_matrix_(colsame)(lua_State *L) {
+static int nerv_matrix_(lua_colsame)(lua_State *L) {
+    Status status;
     Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname));
-    Matrix *ref = luaT_checkudata(L, 2, nerv_matrix_(tname));
-    Matrix *b = nerv_matrix_(new_)(L, 1, a->ncol);
-    CHECK_SAME_DIMENSION(a, ref);
-    PROFILE_START
-    cudak_(cuda_colsame)(a, ref, b);
-    PROFILE_STOP
+    const Matrix *ref = luaT_checkudata(L, 2, nerv_matrix_(tname));
+    Matrix *b = nerv_matrix_(colsame)(a, ref, &status);
+    NERV_LUA_CHECK_STATUS(L, status);
     luaT_pushudata(L, b, nerv_matrix_(tname));
     return 1;
 }
 
-static int nerv_matrix_(rowmax)(lua_State *L) {
+static int nerv_matrix_(lua_rowmax)(lua_State *L) {
+    Status status;
     Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname));
-    Matrix *b = nerv_matrix_(new_)(L, a->nrow, 1);
-    PROFILE_START
-    cudak_(cuda_rowmax)(a, b);
-    PROFILE_STOP
+    Matrix *b = nerv_matrix_(rowmax)(a, &status);
+    NERV_LUA_CHECK_STATUS(L, status);
     luaT_pushudata(L, b, nerv_matrix_(tname));
     return 1;
 }
 
-static int nerv_matrix_(rowmax_idx)(lua_State *L) {
+static int nerv_matrix_(lua_rowmax_idx)(lua_State *L) {
+    Status status;
     Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname));
-    Matrix *b = nerv_matrix_(new_)(L, a->nrow, 1);
-    Matrix *idx = nerv_matrix_(new_)(L, a->nrow, 1);
-    PROFILE_START
-    cudak_(cuda_rowmax_idx)(a, b, idx);
-    PROFILE_STOP
+    Matrix *b;
+    Matrix *idx;
+    nerv_matrix_(rowmax_idx)(a, &b, &idx, &status);
+    NERV_LUA_CHECK_STATUS(L, status);
     luaT_pushudata(L, b, nerv_matrix_(tname));
     luaT_pushudata(L, idx, nerv_matrix_(tname));
     return 2;
 }
 
-static int nerv_matrix_(add_row)(lua_State *L) {
-    Matrix *a = luaT_checkudata(L, 2, nerv_matrix_(tname));
+static int nerv_matrix_(lua_add_row)(lua_State *L) {
+    Status status;
+    const Matrix *a = luaT_checkudata(L, 2, nerv_matrix_(tname));
     Matrix *b = luaT_checkudata(L, 1, nerv_matrix_(tname));
     double beta = luaL_checknumber(L, 3);
-    if (a->ncol != b->ncol)
-        nerv_error(L, "the number of columns is not the same");
-    if (a->nrow != 1)
-        nerv_error(L, "a row vector is expected");
-    PROFILE_START
-    cudak_(cuda_add_row)(a, b, beta);
-    PROFILE_STOP
+    nerv_matrix_(add_row)(b, a, beta, &status);
+    NERV_LUA_CHECK_STATUS(L, status);
     return 0;
 }
 
-static int nerv_matrix_(fill)(lua_State *L) {
+static int nerv_matrix_(lua_fill)(lua_State *L) {
+    Status status;
     Matrix *self = luaT_checkudata(L, 1, nerv_matrix_(tname));
     double val = luaL_checknumber(L, 2);
-    PROFILE_START
-    cudak_(cuda_fill)(self, val);
-    PROFILE_STOP
+    nerv_matrix_(fill)(self, val, &status);
+    NERV_LUA_CHECK_STATUS(L, status);
     return 0;
 }
 
-static int nerv_matrix_(copy_fromd)(lua_State *L) {
+static int nerv_matrix_(lua_copy_fromd)(lua_State *L) {
+    Status status;
     Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname));
-    Matrix *b = luaT_checkudata(L, 2, nerv_matrix_(tname));
+    const Matrix *b = luaT_checkudata(L, 2, nerv_matrix_(tname));
     int nargs = lua_gettop(L);
     int b_begin = nargs > 2 ? luaL_checkinteger(L, 3) : 0;
     int b_end = nargs > 3 ? luaL_checkinteger(L, 4) : b->nrow;
     int a_begin = nargs > 4 ? luaL_checkinteger(L, 5) : 0;
-    if (!(0 <= b_begin && b_begin < b_end && b_end <= b->nrow &&
-            a_begin + b_end - b_begin <= a->nrow))
-        nerv_error(L, "invalid copy interval");
-    if (a->ncol != b->ncol)
-        nerv_error(L, "matrices should be of the same dimension");
-    PROFILE_START
-    CUDA_SAFE_SYNC_CALL(
-            cudaMemcpy2D(MATRIX_ROW_PTR(a, a_begin), a->stride,
-                MATRIX_ROW_PTR(b, b_begin), b->stride,
-                sizeof(MATRIX_ELEM) * b->ncol, b_end - b_begin,
-                cudaMemcpyDeviceToDevice));
-    PROFILE_STOP
+    nerv_matrix_(copy_fromd)(a, b, a_begin, b_begin, b_end, &status);
+    NERV_LUA_CHECK_STATUS(L, status);
     return 0;
 }
 
 extern const char *MATRIX_CUMATRIX_HOST_TNAME;
-static int nerv_matrix_(copy_fromh)(lua_State *L) { 
+static int nerv_matrix_(lua_copy_fromh)(lua_State *L) { 
+    Status status;
     Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname));
-    Matrix *b = luaT_checkudata(L, 2, MATRIX_CUMATRIX_HOST_TNAME);
+    const Matrix *b = luaT_checkudata(L, 2, MATRIX_CUMATRIX_HOST_TNAME);
     int nargs = lua_gettop(L);
     int b_begin = nargs > 2 ? luaL_checkinteger(L, 3) : 0;
     int b_end = nargs > 3 ? luaL_checkinteger(L, 4) : b->nrow;
     int a_begin = nargs > 4 ? luaL_checkinteger(L, 5) : 0;
-    if (!(0 <= b_begin && b_begin < b_end && b_end <= b->nrow &&
-            a_begin + b_end - b_begin <= a->nrow))
-        nerv_error(L, "invalid copy interval");
-    if (a->ncol != b->ncol)
-        nerv_error(L, "matrices should be of the same dimension");
-    PROFILE_START
-    CUDA_SAFE_SYNC_CALL(
-            cudaMemcpy2D(MATRIX_ROW_PTR(a, a_begin), a->stride,
-                MATRIX_ROW_PTR(b, b_begin), b->stride,
-                sizeof(MATRIX_ELEM) * b->ncol, b_end - b_begin,
-                cudaMemcpyHostToDevice));
-    PROFILE_STOP
+    nerv_matrix_(copy_fromh)(a, b, a_begin, b_begin, b_end, &status);
+    NERV_LUA_CHECK_STATUS(L, status);
     return 0;
 }
 
-static int nerv_matrix_(copy_toh)(lua_State *L) {
+static int nerv_matrix_(lua_copy_toh)(lua_State *L) {
+    Status status;
     Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname));
-    Matrix *b = luaT_checkudata(L, 2, MATRIX_CUMATRIX_HOST_TNAME);
+    const Matrix *b = luaT_checkudata(L, 2, MATRIX_CUMATRIX_HOST_TNAME);
     int nargs = lua_gettop(L);
     int a_begin = nargs > 2 ? luaL_checkinteger(L, 3) : 0;
     int a_end = nargs > 3 ? luaL_checkinteger(L, 4) : a->nrow;
     int b_begin = nargs > 4 ? luaL_checkinteger(L, 5) : 0;
-    if (!(0 <= a_begin && a_begin < a_end && a_end <= a->nrow &&
-            b_begin + a_end - a_begin <= b->nrow))
-        nerv_error(L, "invalid copy interval");
-    if (b->ncol != a->ncol)
-        nerv_error(L, "matrices should be of the same dimension");
-    PROFILE_START
-    CUDA_SAFE_SYNC_CALL(
-            cudaMemcpy2D(MATRIX_ROW_PTR(b, b_begin), b->stride,
-                MATRIX_ROW_PTR(a, a_begin), a->stride,
-                sizeof(MATRIX_ELEM) * a->ncol, a_end - a_begin,
-                cudaMemcpyDeviceToHost));
-    PROFILE_STOP
+    nerv_matrix_(copy_toh)(a, b, a_begin, a_end, b_begin, &status);
+    NERV_LUA_CHECK_STATUS(L, status);
     return 0;
 }
 
-static int nerv_matrix_(trans)(lua_State *L) {
+static int nerv_matrix_(lua_trans)(lua_State *L) {
+    Status status;
     Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname));
-    Matrix *b = nerv_matrix_(new_)(L, a->ncol, a->nrow);
-    MATRIX_ELEM alpha = 1, beta = 0;
-    /* FIXME: possible memory leak when lua error is raised */
-    PROFILE_START
-    CUBLAS_SAFE_SYNC_CALL(
-            NERV_CUBLAS_(geam)(cublas_handle, CUBLAS_OP_T, CUBLAS_OP_T,
-                a->nrow, a->ncol,
-                &alpha,
-                MATRIX_ELEM_PTR(a), a->stride / sizeof(MATRIX_ELEM),
-                &beta,
-                MATRIX_ELEM_PTR(a), a->stride / sizeof(MATRIX_ELEM),
-                MATRIX_ELEM_PTR(b), b->stride / sizeof(MATRIX_ELEM)));
-    PROFILE_STOP
+    Matrix *b = nerv_matrix_(trans)(a, &status);
+    NERV_LUA_CHECK_STATUS(L, status);
     luaT_pushudata(L, b, nerv_matrix_(tname));
     return 1;
 }
 
-static int nerv_matrix_(mul_elem)(lua_State *L) {
-    Matrix *a = luaT_checkudata(L, 2, nerv_matrix_(tname));
-    Matrix *b = luaT_checkudata(L, 3, nerv_matrix_(tname));
+static int nerv_matrix_(lua_mul_elem)(lua_State *L) {
+    Status status;
+    const Matrix *a = luaT_checkudata(L, 2, nerv_matrix_(tname));
+    const Matrix *b = luaT_checkudata(L, 3, nerv_matrix_(tname));
     Matrix *c = luaT_checkudata(L, 1, nerv_matrix_(tname));
-    CHECK_SAME_DIMENSION(a, b);
-    CHECK_SAME_DIMENSION(a, c);
-    PROFILE_START
-    cudak_(cuda_mul_elem)(a, b, c);
-    PROFILE_STOP
+    nerv_matrix_(mul_elem)(c, a, b, &status);
+    NERV_LUA_CHECK_STATUS(L, status);
     return 0;
 }
 
-static int nerv_matrix_(log_elem)(lua_State *L) {
-    Matrix *a = luaT_checkudata(L, 2, nerv_matrix_(tname));
+static int nerv_matrix_(lua_log_elem)(lua_State *L) {
+    Status status;
+    const Matrix *a = luaT_checkudata(L, 2, nerv_matrix_(tname));
     Matrix *b = luaT_checkudata(L, 1, nerv_matrix_(tname));
-    CHECK_SAME_DIMENSION(a, b);
-    PROFILE_START
-    cudak_(cuda_log_elem)(a, b);
-    PROFILE_STOP
+    nerv_matrix_(log_elem)(b, a, &status);
+    NERV_LUA_CHECK_STATUS(L, status);
     return 0;
 }
 
-static int nerv_matrix_(decompress)(lua_State *L) {
-    Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname));
-    Matrix *b;
+static int nerv_matrix_(lua_decompress)(lua_State *L) {
+    Status status;
+    const Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname));
     int orig_col = luaL_checkinteger(L, 2);
-    if (a->ncol != 1)
-        nerv_error(L, "the compressed matrix must be a column vector");
-    b = nerv_matrix_(new_)(L, a->nrow, orig_col);
-    PROFILE_START
-    cudak_(cuda_fill)(b, 0.0);
-    cudak_(cuda_decompress)(a, b);
-    PROFILE_STOP
+    Matrix *b = nerv_matrix_(decompress)(a, orig_col, &status);
+    NERV_LUA_CHECK_STATUS(L, status);
     luaT_pushudata(L, b, nerv_matrix_(tname));
     return 1;
 }
 
 extern const char *nerv_matrix_host_int_tname;
-static int nerv_matrix_(copy_rows_fromh_by_idx)(lua_State *L) {
+static int nerv_matrix_(lua_copy_rows_fromh_by_idx)(lua_State *L) {
+    Status status;
     Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname));
-    Matrix *b = luaT_checkudata(L, 2, MATRIX_CUMATRIX_HOST_TNAME);
-    Matrix *idx = luaT_checkudata(L, 3, nerv_matrix_host_int_tname);
+    const Matrix *b = luaT_checkudata(L, 2, MATRIX_CUMATRIX_HOST_TNAME);
+    const Matrix *idx = luaT_checkudata(L, 3, nerv_matrix_host_int_tname);
     long nrow = a->nrow;
     int b_begin = lua_gettop(L) > 3 ? luaL_checkinteger(L, 4) : 0;
-    if (!(0 <= b_begin && b_begin + nrow <= idx->ncol))
-        nerv_error(L, "invalid copy interval");
-    long *idx_ptr = idx->data.i;
-    int i;
-    if (idx->nrow != 1)
-        nerv_error(L, "index should be a vector");
-    if (a->ncol != b->ncol)
-        nerv_error(L, "source/destination dimension mismatch");
-    cudaStream_t *streams = (cudaStream_t*)malloc(sizeof(cudaStream_t) * nrow);
-    for (i = 0; i < nrow; i++)
-    {
-        int src_row = idx_ptr[b_begin + i];
-        if (!(0 <= src_row && src_row < b->nrow))
-            nerv_error(L, "invalid index");
-        CUDA_SAFE_CALL(cudaStreamCreate(streams + i));
-        CUDA_SAFE_CALL(cudaMemcpyAsync(MATRIX_ROW_PTR(a, i),
-                    MATRIX_ROW_PTR(b, src_row),
-                    b->stride,
-                    cudaMemcpyHostToDevice, streams[i]));
-    }
-    for (i = 0; i < nrow; i++)
-    {
-        CUDA_SAFE_CALL(cudaStreamSynchronize(streams[i]));
-        CUDA_SAFE_CALL(cudaStreamDestroy(streams[i]));
-    }
-    free(streams);
+    nerv_matrix_(copy_rows_fromh_by_idx)(a, b, idx, b_begin, &status);
+    NERV_LUA_CHECK_STATUS(L, status);
     return 0;
 }
 
-static int nerv_matrix_(expand_frm)(lua_State *L) {
+static int nerv_matrix_(lua_expand_frm)(lua_State *L) {
+    Status status;
     Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname));
-    Matrix *b = luaT_checkudata(L, 2, nerv_matrix_(tname));
+    const Matrix *b = luaT_checkudata(L, 2, nerv_matrix_(tname));
     int context = luaL_checkinteger(L, 3);
-    if (a->nrow != b->nrow)
-        nerv_error(L, "mismatching number of frames");
-    if (a->ncol != b->ncol * (context * 2 + 1))
-        nerv_error(L, "the width should be 2 * context + 1");
-    PROFILE_START
-    cudak_(cuda_expand_frm)(b, a, context);
-    PROFILE_STOP
+    nerv_matrix_(expand_frm)(a, b, context, &status);
+    NERV_LUA_CHECK_STATUS(L, status);
     return 0;
 }
 
-static int nerv_matrix_(rearrange_frm)(lua_State *L) {
+static int nerv_matrix_(lua_rearrange_frm)(lua_State *L) {
+    Status status;
     Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname));
-    Matrix *b = luaT_checkudata(L, 2, nerv_matrix_(tname));
+    const Matrix *b = luaT_checkudata(L, 2, nerv_matrix_(tname));
     int step = luaL_checkinteger(L, 3);
-    CHECK_SAME_DIMENSION(a, b);
-    if (b->ncol % step)
-        nerv_error(L, "the dimension of columns is not divisible by step");
-    PROFILE_START
-    cudak_(cuda_rearrange_frm)(b, a, step);
-    PROFILE_STOP
+    nerv_matrix_(rearrange_frm)(a, b, step, &status);
+    NERV_LUA_CHECK_STATUS(L, status);
     return 0;
 }
 
-static int nerv_matrix_(scale_rows_by_col)(lua_State *L) {
+static int nerv_matrix_(lua_scale_rows_by_col)(lua_State *L) {
+    Status status;
     Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname));
-    Matrix *b = luaT_checkudata(L, 2, nerv_matrix_(tname));
-    if (a->nrow != b->nrow)
-        nerv_error(L, "the number of rows is not the same");
-    if (b->ncol != 1)
-        nerv_error(L, "a column vector is expected");
-    PROFILE_START
-    cudak_(cuda_scale_rows_by_col)(b, a);
-    PROFILE_STOP
+    const Matrix *b = luaT_checkudata(L, 2, nerv_matrix_(tname));
+    nerv_matrix_(scale_rows_by_col)(a, b, &status);
+    NERV_LUA_CHECK_STATUS(L, status);
     return 0;
 }
 
-static int nerv_matrix_(scale_rows_by_row)(lua_State *L) {
+static int nerv_matrix_(lua_scale_rows_by_row)(lua_State *L) {
+    Status status;
     Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname));
-    Matrix *b = luaT_checkudata(L, 2, nerv_matrix_(tname));
-    if (a->ncol != b->ncol)
-        nerv_error(L, "the number of columns is not the same");
-    if (b->nrow != 1)
-        nerv_error(L, "a row vector is expected");
-    PROFILE_START
-    cudak_(cuda_scale_rows_by_row)(b, a);
-    PROFILE_STOP
+    const Matrix *b = luaT_checkudata(L, 2, nerv_matrix_(tname));
+    nerv_matrix_(scale_rows_by_row)(a, b, &status);
+    NERV_LUA_CHECK_STATUS(L, status);
     return 0;
 }
 
 static const luaL_Reg nerv_matrix_(extra_methods)[] = {
-    {"create", nerv_matrix_(create)},
-    {"colsum", nerv_matrix_(colsum)},
-    {"colsame", nerv_matrix_(colsame)},
-    {"rowsum", nerv_matrix_(rowsum)},
-    {"rowmax", nerv_matrix_(rowmax)},
-    {"rowmax_idx", nerv_matrix_(rowmax_idx)},
-    {"trans", nerv_matrix_(trans)},
-    {"decompress", nerv_matrix_(decompress)},
+    {"create", nerv_matrix_(lua_create)},
+    {"colsum", nerv_matrix_(lua_colsum)},
+    {"colsame", nerv_matrix_(lua_colsame)},
+    {"rowsum", nerv_matrix_(lua_rowsum)},
+    {"rowmax", nerv_matrix_(lua_rowmax)},
+    {"rowmax_idx", nerv_matrix_(lua_rowmax_idx)},
+    {"trans", nerv_matrix_(lua_trans)},
+    {"decompress", nerv_matrix_(lua_decompress)},
     /* in-place calc */
-    {"copy_fromh", nerv_matrix_(copy_fromh)},
-    {"copy_fromd", nerv_matrix_(copy_fromd)},
-    {"copy_toh", nerv_matrix_(copy_toh)},
-    {"add", nerv_matrix_(add)},
-    {"mul", nerv_matrix_(mul)},
-    {"add_row", nerv_matrix_(add_row)},
-    {"fill", nerv_matrix_(fill)},
-    {"sigmoid", nerv_matrix_(sigmoid)},
-    {"sigmoid_grad", nerv_matrix_(sigmoid_grad)},
-    {"softmax", nerv_matrix_(softmax)},
-    {"mul_elem", nerv_matrix_(mul_elem)},
-    {"log_elem", nerv_matrix_(log_elem)},
-    {"copy_rows_fromh_by_idx", nerv_matrix_(copy_rows_fromh_by_idx)},
-    {"expand_frm", nerv_matrix_(expand_frm)},
-    {"rearrange_frm", nerv_matrix_(rearrange_frm)},
-    {"scale_rows_by_row", nerv_matrix_(scale_rows_by_row)},
-    {"scale_rows_by_col", nerv_matrix_(scale_rows_by_col)},
+    {"copy_fromh", nerv_matrix_(lua_copy_fromh)},
+    {"copy_fromd", nerv_matrix_(lua_copy_fromd)},
+    {"copy_toh", nerv_matrix_(lua_copy_toh)},
+    {"add", nerv_matrix_(lua_add)},
+    {"mul", nerv_matrix_(lua_mul)},
+    {"add_row", nerv_matrix_(lua_add_row)},
+    {"fill", nerv_matrix_(lua_fill)},
+    {"sigmoid", nerv_matrix_(lua_sigmoid)},
+    {"sigmoid_grad", nerv_matrix_(lua_sigmoid_grad)},
+    {"softmax", nerv_matrix_(lua_softmax)},
+    {"mul_elem", nerv_matrix_(lua_mul_elem)},
+    {"log_elem", nerv_matrix_(lua_log_elem)},
+    {"copy_rows_fromh_by_idx", nerv_matrix_(lua_copy_rows_fromh_by_idx)},
+    {"expand_frm", nerv_matrix_(lua_expand_frm)},
+    {"rearrange_frm", nerv_matrix_(lua_rearrange_frm)},
+    {"scale_rows_by_row", nerv_matrix_(lua_scale_rows_by_row)},
+    {"scale_rows_by_col", nerv_matrix_(lua_scale_rows_by_col)},
     {NULL, NULL}
 };
 
@@ -456,37 +312,34 @@ static void cuda_matrix_(init)(lua_State *L) {
     luaN_append_methods(L, nerv_matrix_(extra_methods));
 }
 
-static void cuda_matrix_(free)(lua_State *L, MATRIX_ELEM *ptr) {
-    CUDA_SAFE_SYNC_CALL(cudaFree(ptr));
+int nerv_matrix_(lua_get_elem)(lua_State *L) {
+    return nerv_error_method_not_implemented(L);
 }
 
-static void cuda_matrix_(alloc)(lua_State *L, MATRIX_ELEM **dptr,
-                                size_t *stride, long width, long height) {
-    PROFILE_START
-    CUDA_SAFE_SYNC_CALL(cudaMallocPitch((void **)dptr, stride, width, height));
-    PROFILE_STOP
+int nerv_matrix_(lua_set_elem)(lua_State *L) {
+    return nerv_error_method_not_implemented(L);
 }
 
 static MATRIX_ELEM cuda_matrix_(read)(lua_State *L, MATRIX_ELEM *data,
-                                        int idx) {
+                                    int idx) {
+    cudaError_t err;
     MATRIX_ELEM res;
-    CUDA_SAFE_SYNC_CALL(cudaMemcpy(&res, data + idx,
-                sizeof(MATRIX_ELEM), cudaMemcpyDeviceToHost));
+    err = cudaMemcpy(&res, data + idx,
+                sizeof(MATRIX_ELEM), cudaMemcpyDeviceToHost);
+    if (err != cudaSuccess)
+        nerv_error(L, "cuda error: error while reading element");
+    cudaDeviceSynchronize();
     return res;
 }
 
 static void cuda_matrix_(write)(lua_State *L, MATRIX_ELEM *data,
                                 int idx, MATRIX_ELEM val) {
-    CUDA_SAFE_SYNC_CALL(cudaMemcpy(data + idx, &val,
-                sizeof(MATRIX_ELEM), cudaMemcpyHostToDevice));
-}
-
-int nerv_matrix_(get_elem)(lua_State *L) {
-    return nerv_error_method_not_implemented(L);
-}
-
-int nerv_matrix_(set_elem)(lua_State *L) {
-    return nerv_error_method_not_implemented(L);
+    cudaError_t err;
+    err = cudaMemcpy(data + idx, &val,
+                sizeof(MATRIX_ELEM), cudaMemcpyHostToDevice);
+    if (err != cudaSuccess)
+        nerv_error(L, "cuda error: error while writing element");
+    cudaDeviceSynchronize();
 }
 
 #include "matrix.c"
diff --git a/nerv/matrix/generic/matrix.c b/nerv/matrix/generic/matrix.c
index e17fb42..9d2521b 100644
--- a/nerv/matrix/generic/matrix.c
+++ b/nerv/matrix/generic/matrix.c
@@ -1,68 +1,32 @@
 #ifdef NERV_GENERIC_MATRIX
 #include "../../common.h"
-#include "matrix.h"
+#include "../../lib/matrix/generic/matrix.h"
 
 extern const char *nerv_matrix_(tname);
 extern const char *MATRIX_BASE_TNAME;
 
-void nerv_matrix_(data_free)(lua_State *L, Matrix *self) {
-    (void)L;
-    assert(*self->data_ref > 0);
-    if (--(*self->data_ref) == 0)
-    {
-        /* free matrix data */
-        MATRIX_DATA_FREE(L, MATRIX_ELEM_PTR(self));
-        free(self->data_ref);
-        free(self);
-    }
-}
-
-void nerv_matrix_(data_retain)(Matrix *self) {
-    (*self->data_ref)++;
-}
-
-Matrix *nerv_matrix_(new_)(lua_State *L, long nrow, long ncol) {
-    Matrix *self = (Matrix *)malloc(sizeof(Matrix));
-    self->nrow = nrow;
-    self->ncol = ncol;
-    self->nmax = self->nrow * self->ncol;
-    MATRIX_DATA_ALLOC(L, &MATRIX_ELEM_PTR(self), &self->stride,
-                        sizeof(MATRIX_ELEM) * self->ncol, self->nrow);
-    self->data_ref = (long *)malloc(sizeof(long));
-    *self->data_ref = 0;
-    nerv_matrix_(data_retain)(self);
-    return self;
-}
 
-int nerv_matrix_(new)(lua_State *L) {
-    luaT_pushudata(L, nerv_matrix_(new_)(L, luaL_checkinteger(L, 1),
-                                        luaL_checkinteger(L, 2)),
-                    nerv_matrix_(tname));
+int nerv_matrix_(lua_new)(lua_State *L) {
+    Status status;
+    Matrix *self = nerv_matrix_(create)(luaL_checkinteger(L, 1),
+                                        luaL_checkinteger(L, 2), &status);
+    NERV_LUA_CHECK_STATUS(L, status);
+    luaT_pushudata(L, self, nerv_matrix_(tname));
     return 1;
 }
 
-int nerv_matrix_(destroy)(lua_State *L) {
+int nerv_matrix_(lua_destroy)(lua_State *L) {
+    Status status;
     Matrix *self = luaT_checkudata(L, 1, nerv_matrix_(tname));
-    nerv_matrix_(data_free)(L, self);
+    nerv_matrix_(destroy)(self, &status);
+    NERV_LUA_CHECK_STATUS(L, status);
     return 1;
 }
 
-int nerv_matrix_(get_elem)(lua_State *L); 
-int nerv_matrix_(set_elem)(lua_State *L);
-
-static Matrix *nerv_matrix_(getrow)(Matrix *self, int row) {
-    Matrix *prow = (Matrix *)malloc(sizeof(Matrix));
-    prow->ncol = self->ncol;
-    prow->nrow = 1;
-    prow->stride = self->stride;
-    prow->nmax = prow->ncol;
-    MATRIX_ELEM_PTR(prow) = MATRIX_ROW_PTR(self, row);
-    prow->data_ref = self->data_ref;
-    nerv_matrix_(data_retain)(prow);
-    return prow;
-}
+int nerv_matrix_(lua_get_elem)(lua_State *L);
+int nerv_matrix_(lua_set_elem)(lua_State *L);
 
-static int nerv_matrix_(newindex)(lua_State *L) {
+static int nerv_matrix_(lua_newindex)(lua_State *L) {
     Matrix *self = luaT_checkudata(L, 1, nerv_matrix_(tname));
     if (lua_isnumber(L, 2))
     {
@@ -87,7 +51,8 @@ static int nerv_matrix_(newindex)(lua_State *L) {
 }
 
 
-static int nerv_matrix_(index)(lua_State *L) {
+static int nerv_matrix_(lua_index)(lua_State *L) {
+    Status status;
     Matrix *self = luaT_checkudata(L, 1, nerv_matrix_(tname));
     if (lua_isnumber(L, 2))
     {
@@ -102,7 +67,8 @@ static int nerv_matrix_(index)(lua_State *L) {
         {
             if (idx < 0 || idx >= self->nrow)
                 nerv_error(L, "index must be within range [0, %d)", self->nrow);
-            luaT_pushudata(L, nerv_matrix_(getrow)(self, idx), nerv_matrix_(tname));
+            luaT_pushudata(L, nerv_matrix_(getrow)(self, idx),
+                                nerv_matrix_(tname));
         }
         lua_pushboolean(L, 1);
         return 2;
@@ -114,38 +80,38 @@ static int nerv_matrix_(index)(lua_State *L) {
     }
 }
 
-static int nerv_matrix_(ncol)(lua_State *L) {
+static int nerv_matrix_(lua_ncol)(lua_State *L) {
     Matrix *self = luaT_checkudata(L, 1, nerv_matrix_(tname));
     lua_pushinteger(L, self->ncol);
     return 1;
 }
 
-static int nerv_matrix_(nrow)(lua_State *L) {
+static int nerv_matrix_(lua_nrow)(lua_State *L) {
     Matrix *self = luaT_checkudata(L, 1, nerv_matrix_(tname));
     lua_pushinteger(L, self->nrow);
     return 1;
 }
 
-static int nerv_matrix_(get_dataref_value)(lua_State *L) {                                                                                                                                               
-    Matrix *self = luaT_checkudata(L, 1, nerv_matrix_(tname));                                                                                                                                           
-    lua_pushinteger(L, *(self->data_ref));                                                                                                                                                               
-    return 1;                                                                                                                                                                                            
-}      
+static int nerv_matrix_(lua_get_dataref_value)(lua_State *L) {
+    Matrix *self = luaT_checkudata(L, 1, nerv_matrix_(tname));
+    lua_pushinteger(L, *(self->data_ref));
+    return 1;
+}
 
 static const luaL_Reg nerv_matrix_(methods)[] = {
-    {"get_elem", nerv_matrix_(get_elem)},
-    {"set_elem", nerv_matrix_(set_elem)},
-    {"ncol", nerv_matrix_(ncol)},
-    {"nrow", nerv_matrix_(nrow)},
-    {"get_dataref_value", nerv_matrix_(get_dataref_value)},
-    {"__index__", nerv_matrix_(index)},
-    {"__newindex__", nerv_matrix_(newindex)},
+    {"get_elem", nerv_matrix_(lua_get_elem)},
+    {"set_elem", nerv_matrix_(lua_set_elem)},
+    {"ncol", nerv_matrix_(lua_ncol)},
+    {"nrow", nerv_matrix_(lua_nrow)},
+    {"get_dataref_value", nerv_matrix_(lua_get_dataref_value)},
+    {"__index__", nerv_matrix_(lua_index)},
+    {"__newindex__", nerv_matrix_(lua_newindex)},
     {NULL, NULL}
 };
 
-void nerv_matrix_(init)(lua_State *L) {
+void nerv_matrix_(lua_init)(lua_State *L) {
     luaT_newmetatable(L, nerv_matrix_(tname), MATRIX_BASE_TNAME,
-                        nerv_matrix_(new), nerv_matrix_(destroy), NULL);
+                        nerv_matrix_(lua_new), nerv_matrix_(lua_destroy), NULL);
     luaL_register(L, NULL, nerv_matrix_(methods));
 #ifdef MATRIX_INIT
     MATRIX_INIT(L);
diff --git a/nerv/matrix/generic/mmatrix.c b/nerv/matrix/generic/mmatrix.c
index 697c9fc..233102a 100644
--- a/nerv/matrix/generic/mmatrix.c
+++ b/nerv/matrix/generic/mmatrix.c
@@ -1,9 +1,6 @@
 #ifdef NERV_GENERIC_MMATRIX
-#include "matrix.h"
+#include "../../lib/matrix/generic/matrix.h"
 #include "elem_type.h"
-#define MATRIX_DATA_FREE(L, ptr) free(ptr)
-#define MATRIX_DATA_ALLOC(L, dptr, stride, width, height) \
-                            host_matrix_(alloc)(L, dptr, stride, width, height)
 #define MATRIX_DATA_WRITE(L, data, idx, val) (data[idx] = val)
 #define MATRIX_DATA_READ(L, data, idx) (data[idx])
 #define MATRIX_INIT(L) host_matrix_(init)(L)
@@ -11,17 +8,10 @@
 #define NERV_GENERIC_MATRIX
 #include "../../common.h"
 #include "../../io/chunk_file.h"
+#include "../../lib/matrix/generic/mmatrix.h"
 #include "string.h"
 
-static void host_matrix_(alloc)(lua_State *L,
-                                MATRIX_ELEM **dptr, size_t *stride,
-                                long width, long height) {
-    if ((*dptr = (MATRIX_ELEM *)malloc(width * height)) == NULL)
-        nerv_error(L, "mmatrix insufficient memory");
-    *stride = width;
-}
-
-int nerv_matrix_(get_elem)(lua_State *L) {
+int nerv_matrix_(lua_get_elem)(lua_State *L) {
     Matrix *self = luaT_checkudata(L, 1, nerv_matrix_(tname));
     int idx = luaL_checkinteger(L, 2);
     if (idx < 0 || idx >= self->nmax)
@@ -30,7 +20,7 @@ int nerv_matrix_(get_elem)(lua_State *L) {
     return 1;
 }
 
-int nerv_matrix_(set_elem)(lua_State *L) {
+int nerv_matrix_(lua_set_elem)(lua_State *L) {
     Matrix *self = luaT_checkudata(L, 1, nerv_matrix_(tname));
     int idx = luaL_checkinteger(L, 2);
     MATRIX_ELEM v = luaL_checknumber(L, 3);
@@ -50,72 +40,42 @@ static void host_matrix_(init)(lua_State *L) {
 
 #include "matrix.c"
 
-int nerv_matrix_(load)(lua_State *L) {
+int nerv_matrix_(lua_load)(lua_State *L) {
+    Status status;
     ChunkData *cdp = luaT_checkudata(L, 1, nerv_chunk_data_tname);
-    Matrix *self;
-    int i, j;
-    long nrow, ncol;
-    FILE *fp = cdp->fp;
-    if (fscanf(fp, "%ld %ld", &nrow, &ncol) != 2)
-        return 0;
-    self = nerv_matrix_(new_)(L, nrow, ncol);
-    for (i = 0; i < nrow; i++)
-    {
-        MATRIX_ELEM *row = MATRIX_ROW_PTR(self, i);
-        for (j = 0; j < ncol; j++)
-            if (fscanf(fp, MATRIX_ELEM_FMT, row + j) != 1)
-            {
-                free(self);
-                return 0;
-            }
-    }
+    Matrix *self = nerv_matrix_(load)(cdp, &status);
+    NERV_LUA_CHECK_STATUS(L, status);
     luaT_pushudata(L, self, nerv_matrix_(tname));
     return 1;
 }
 
-int nerv_matrix_(save)(lua_State *L) {
+int nerv_matrix_(lua_save)(lua_State *L) {
+    Status status;
     ChunkFile *cfp = luaT_checkudata(L, 2,
                             nerv_chunk_file_handle_tname);
     Matrix *self = luaT_checkudata(L, 1, nerv_matrix_(tname));
-    int i, j;
-    long nrow = self->nrow, ncol = self->ncol;
-    FILE *fp = cfp->fp;
-    if (fprintf(fp, "%ld %ld\n", nrow, ncol) < 0)
-        return 0;
-    for (i = 0; i < nrow; i++)
-    {
-        MATRIX_ELEM *row = MATRIX_ROW_PTR(self, i);
-        for (j = 0; j < ncol; j++)
-            if (fprintf(fp, MATRIX_ELEM_WRITE_FMT " ", row[j]) < 0)
-                return 0;
-        if (fprintf(fp, "\n") < 0)
-             return 0;
-    }
+    nerv_matrix_(save)(self, cfp, &status);
+    NERV_LUA_CHECK_STATUS(L, status);
     return 0;
 }
 
-static int nerv_matrix_(copy_from)(lua_State *L) {
+int nerv_matrix_(lua_copy_from)(lua_State *L) {
+    Status status;
     Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname));
-    Matrix *b = luaT_checkudata(L, 2, nerv_matrix_(tname));
+    const Matrix *b = luaT_checkudata(L, 2, nerv_matrix_(tname));
     int nargs = lua_gettop(L);
     int b_begin = nargs > 2 ? luaL_checkinteger(L, 3) : 0;
     int b_end = nargs > 3 ? luaL_checkinteger(L, 4) : b->nrow;
     int a_begin = nargs > 4 ? luaL_checkinteger(L, 5) : 0;
-    if (!(0 <= b_begin && b_begin < b_end && b_end <= b->nrow &&
-            a_begin + b_end - b_begin <= a->nrow))
-        nerv_error(L, "invalid copy interval");
-    if (a->ncol != b->ncol)
-        nerv_error(L, "matrices should be of the same dimension");
-    memmove(MATRIX_ROW_PTR(a, a_begin),
-            MATRIX_ROW_PTR(b, b_begin),
-            sizeof(MATRIX_ELEM) * b->ncol * (b_end - b_begin));
+    nerv_matrix_(copy_from)(a, b, a_begin, b_begin, b_end, &status);
+    NERV_LUA_CHECK_STATUS(L, status);
     return 0;
 }
 
 static const luaL_Reg nerv_matrix_(extra_methods)[] = {
-    {"load", nerv_matrix_(load)},
-    {"save", nerv_matrix_(save)},
-    {"copy_from", nerv_matrix_(copy_from)},
+    {"load", nerv_matrix_(lua_load)},
+    {"save", nerv_matrix_(lua_save)},
+    {"copy_from", nerv_matrix_(lua_copy_from)},
     {NULL, NULL}
 };
 
diff --git a/nerv/matrix/init.c b/nerv/matrix/init.c
index c29d7e9..27472b9 100644
--- a/nerv/matrix/init.c
+++ b/nerv/matrix/init.c
@@ -1,12 +1,11 @@
 #include "../common.h"
-#include "generic/matrix.h"
 
 const char *nerv_matrix_tname = "nerv.Matrix";
 const char *nerv_matrix_cuda_tname = "nerv.CuMatrix";
 const char *nerv_matrix_host_tname = "nerv.MMatrix";
 
-void nerv_cumatrix_init(lua_State *L);
-void nerv_mmatrix_init(lua_State *L);
+void nerv_lua_cumatrix_init(lua_State *L);
+void nerv_lua_mmatrix_init(lua_State *L);
 
 static const luaL_Reg matrix_methods[] = {
     {"__tostring__", nerv_error_method_not_implemented },
@@ -25,11 +24,11 @@ void nerv_matrix_init(lua_State *L) {
     /* CuMatrix inherits from Matrix */
     luaT_newmetatable(L, nerv_matrix_cuda_tname, nerv_matrix_tname,
                             NULL, NULL, NULL);
-    nerv_cumatrix_init(L);
+    nerv_lua_cumatrix_init(L);
     lua_pop(L, 1);
     /* MMatrix inherits from Matrix */
     luaT_newmetatable(L, nerv_matrix_host_tname, nerv_matrix_tname,
                             NULL, NULL, NULL);
-    nerv_mmatrix_init(L);
+    nerv_lua_mmatrix_init(L);
     lua_pop(L, 1);
 }
diff --git a/nerv/matrix/mmatrix.c b/nerv/matrix/mmatrix.c
index d1d68b9..b25af03 100644
--- a/nerv/matrix/mmatrix.c
+++ b/nerv/matrix/mmatrix.c
@@ -1,15 +1,15 @@
 #define NERV_GENERIC_MMATRIX
 #include <stdlib.h>
 #include "../common.h"
-void nerv_matrix_host_float_init(lua_State *L);
-void nerv_matrix_host_double_init(lua_State *L);
-void nerv_matrix_host_int_init(lua_State *L);
+void nerv_matrix_host_float_lua_init(lua_State *L);
+void nerv_matrix_host_double_lua_init(lua_State *L);
+void nerv_matrix_host_int_lua_init(lua_State *L);
 
-void nerv_mmatrix_init(lua_State *L) {
+void nerv_lua_mmatrix_init(lua_State *L) {
     srand(1);
-    nerv_matrix_host_float_init(L);
-    nerv_matrix_host_double_init(L);
-    nerv_matrix_host_int_init(L);
+    nerv_matrix_host_float_lua_init(L);
+    nerv_matrix_host_double_lua_init(L);
+    nerv_matrix_host_int_lua_init(L);
 }
 
 #define MATRIX_USE_FLOAT
@@ -52,26 +52,18 @@ static void host_matrix_(init_extra)(lua_State *L) {
 }
 
 #include "generic/mmatrix.c"
+#include "../lib/matrix/mmatrix.h"
 
-static int nerv_matrix_(perm_gen)(lua_State *L) {
+static int nerv_matrix_(lua_perm_gen)(lua_State *L) {
+    Status status;
     int i, ncol = luaL_checkinteger(L, 1);
-    Matrix *self = nerv_matrix_(new_)(L, 1, ncol);
-    long *prow = self->data.i;
-    for (i = 0; i < ncol; i++)
-        prow[i] = i;
-    for (i = ncol - 1; i >= 0; i--)
-    {
-        size_t j = rand() % (i + 1);
-        long tmp = prow[i];
-        prow[i] = prow[j];
-        prow[j] = tmp;
-    }
+    Matrix *self = nerv_matrix_(perm_gen)(ncol, &status);
+    NERV_LUA_CHECK_STATUS(L, status);
     luaT_pushudata(L, self, nerv_matrix_(tname));
     return 1;
 }
 
 static const luaL_Reg nerv_matrix_(extra_methods_int)[] = {
-    {"perm_gen", nerv_matrix_(perm_gen)},
+    {"perm_gen", nerv_matrix_(lua_perm_gen)},
     {NULL, NULL}
 };
-
diff --git a/nerv/nerv-scm-1.rockspec b/nerv/nerv-scm-1.rockspec
index 0b7e4cb..786b2df 100644
--- a/nerv/nerv-scm-1.rockspec
+++ b/nerv/nerv-scm-1.rockspec
@@ -24,6 +24,7 @@ build = {
         LUA="$(LUA)",
     },
     install_variables = {
+        LUA_BINDIR="$(LUA_BINDIR)",
         INST_PREFIX="$(PREFIX)",
         INST_BINDIR="$(BINDIR)",
         INST_LIBDIR="$(LIBDIR)",
diff --git a/speech b/speech
-Subproject 32eac093cc431849a92e5a2297c5fe646fd6055
+Subproject b6301089cde20f4c825c7f5deaf179082aad63d