Diffstat (limited to 'nerv')
28 files changed, 303 insertions, 58 deletions
diff --git a/nerv/init.lua b/nerv/init.lua
index 9c1a5c8..406aea6 100644
--- a/nerv/init.lua
+++ b/nerv/init.lua
@@ -22,6 +22,12 @@ function nerv.mesg_with_timestamp(fmt, ...)
             os.date("%H:%M:%S %F"), fmt), ...)
 end
 
+function nerv.info_stderr(fmt, ...)
+    io.stderr:write(string.format(
+        string.format("(%s)[nerv] info: %s\n", os.date("%H:%M:%S %F"), fmt),
+        ...))
+end
+
 function nerv.info(fmt, ...)
     nerv.printf(
         string.format("(%s)[nerv] info: %s\n",
diff --git a/nerv/io/sgd_buffer.lua b/nerv/io/sgd_buffer.lua
index 74c4934..dd5d285 100644
--- a/nerv/io/sgd_buffer.lua
+++ b/nerv/io/sgd_buffer.lua
@@ -55,7 +55,7 @@ function SGDBuffer:saturate()
                 buff.data:copy_from(buff.leftover, 0, lrow)
                 buff.leftover = nil
             end
-            nerv.info("buffer leftover: %d\n", lrow)
+            nerv.info("buffer leftover: %d", lrow)
             reader.tail = lrow
             reader.has_leftover = false
         end
diff --git a/nerv/layer/affine.lua b/nerv/layer/affine.lua
index 00cbcfb..56a32f9 100644
--- a/nerv/layer/affine.lua
+++ b/nerv/layer/affine.lua
@@ -19,19 +19,19 @@ end
 
 function MatrixParam:update(gradient)
     local gconf = self.gconf
-    self.correction:add(self.correction, gradient, gconf.momentum, 1.0)
+    self.correction:add(self.correction, gradient, gconf.momentum, 1.0, nerv.context)
     -- momentum gain
     local mmt_gain = 1.0 / (1.0 - gconf.momentum);
     local n = self.gconf.batch_size * mmt_gain
     -- perform update
-    self.trans:add(self.trans, self.correction, 1.0, -gconf.lrate / n)
+    self.trans:add(self.trans, self.correction, 1.0, -gconf.lrate / n, nerv.context)
 end
 
 function LinearTransParam:update(gradient)
     MatrixParam.update(self, gradient)
     local gconf = self.gconf
     -- weight decay
-    self.trans:add(self.trans, self.trans, 1.0, -gconf.lrate * gconf.wcost)
+    self.trans:add(self.trans, self.trans, 1.0, -gconf.lrate * gconf.wcost, nerv.context)
 end
 
 function AffineLayer:__init(id, global_conf, layer_conf)
@@ -61,29 +61,50 @@ function AffineLayer:init(batch_size)
 end
 
 function AffineLayer:update(bp_err, input, output)
+    --print(nerv.context)
     if self.direct_update then
-        self.ltp.correction:mul(input[1], bp_err[1], 1.0, gconf.momentum, 'T', 'N')
+        self.ltp.correction:mul(input[1], bp_err[1], 1.0, gconf.momentum, 'T', 'N', nerv.context)
         -- momentum gain
         local mmt_gain = 1.0 / (1.0 - gconf.momentum);
         local n = self.gconf.batch_size * mmt_gain
         -- perform update
-        self.ltp.trans:add(self.ltp.trans, self.ltp.correction, 1.0, -gconf.lrate / n)
+        self.ltp.trans:add(self.ltp.trans, self.ltp.correction, 1.0, -gconf.lrate / n, nerv.context)
     else
-        self.ltp_grad:mul(input[1], bp_err[1], 1.0, 0.0, 'T', 'N')
+        self.ltp_grad:mul(input[1], bp_err[1], 1.0, 0.0, 'T', 'N', nerv.context)
         self.ltp:update(self.ltp_grad)
     end
     self.bp:update(bp_err[1]:colsum())
 end
 
+function AffineLayer:gradient(bp_err, input, output)
+
+    self.ltp.correction:mul(input[1], bp_err[1], 1.0, gconf.momentum, 'T', 'N', nerv.context)
+    self.bp_grad = bp_err[1]:colsum()
+    self.bp.correction:add(self.bp.correction, self.bp_grad, gconf.momentum, 1.0, nerv.context)
+end
+
+function AffineLayer:update_gradient()
+    -- momentum gain
+    local mmt_gain = 1.0 / (1.0 - gconf.momentum);
+    local n = self.gconf.batch_size * mmt_gain
+    -- perform update
+    self.ltp.trans:add(self.ltp.trans, self.ltp.correction, 1.0, -gconf.lrate / n, nerv.context)
+    self.bp.trans:add(self.bp.trans, self.bp.correction, 1.0, -gconf.lrate / n, nerv.context)
+
+    self.ltp.trans:add(self.ltp.trans, self.ltp.trans, 1.0, -gconf.lrate * gconf.wcost, nerv.context)
+    self.bp.trans:add(self.bp.trans, self.bp.trans, 1.0, -gconf.lrate * gconf.wcost, nerv.context)
+end
+
 function AffineLayer:propagate(input, output)
     -- apply linear transform
-    output[1]:mul(input[1], self.ltp.trans, 1.0, 0.0, 'N', 'N')
+    --print(nerv.context)
+    output[1]:mul(input[1], self.ltp.trans, 1.0, 0.0, 'N', 'N', nerv.context)
     -- add bias
     output[1]:add_row(self.bp.trans, 1.0)
 end
 
 function AffineLayer:back_propagate(bp_err, next_bp_err, input, output)
-    next_bp_err[1]:mul(bp_err[1], self.ltp.trans, 1.0, 0.0, 'N', 'T')
+    next_bp_err[1]:mul(bp_err[1], self.ltp.trans, 1.0, 0.0, 'N', 'T', nerv.context)
 end
 
 function AffineLayer:get_params()
diff --git a/nerv/layer/combiner.lua b/nerv/layer/combiner.lua
index 7bd7617..23cf1db 100644
--- a/nerv/layer/combiner.lua
+++ b/nerv/layer/combiner.lua
@@ -36,7 +36,7 @@ end
 function CombinerLayer:propagate(input, output)
     output[1]:fill(0)
     for i = 1, #self.dim_in do
-        output[1]:add(output[1], input[i], 1.0, self.lambda[i])
+        output[1]:add(output[1], input[i], 1.0, self.lambda[i], nerv.context)
     end
     for i = 2, #self.dim_out do
         output[i]:copy_fromd(output[1])
@@ -47,10 +47,10 @@ function CombinerLayer:back_propagate(bp_err, next_bp_err, input, output)
     local sum = self.sum
     sum:copy_fromd(bp_err[1])
     for i = 2, #self.dim_out do
-        sum:add(sum, bp_err[i], 1.0, 1.0)
+        sum:add(sum, bp_err[i], 1.0, 1.0, nerv.context)
     end
     for i = 1, #self.dim_in do
-        next_bp_err[i]:add(next_bp_err[i], sum, 0.0, self.lambda[i])
+        next_bp_err[i]:add(next_bp_err[i], sum, 0.0, self.lambda[i], nerv.context)
     end
 end
diff --git a/nerv/layer/sigmoid.lua b/nerv/layer/sigmoid.lua
index dfd09eb..f6f1417 100644
--- a/nerv/layer/sigmoid.lua
+++ b/nerv/layer/sigmoid.lua
@@ -18,6 +18,14 @@ function SigmoidLayer:update(bp_err, input, output)
     -- no params, therefore do nothing
 end
 
+function SigmoidLayer:gradient(bp_err, input, output)
+    -- no params, therefore do nothing
+end
+
+function SigmoidLayer:update_gradient()
+    -- no params, therefore do nothing
+end
+
 function SigmoidLayer:propagate(input, output)
     output[1]:sigmoid(input[1])
 end
diff --git a/nerv/layer/softmax.lua b/nerv/layer/softmax.lua
index e979ebf..7e9c6f0 100644
--- a/nerv/layer/softmax.lua
+++ b/nerv/layer/softmax.lua
@@ -18,6 +18,15 @@ function SoftmaxLayer:update(bp_err, input, output)
     -- no params, therefore do nothing
 end
 
+function SoftmaxLayer:gradient(bp_err, input, output)
+    -- no params, therefore do nothing
+end
+
+function SoftmaxLayer:update_gradient()
+    -- no params, therefore do nothing
+end
+
+
 function SoftmaxLayer:propagate(input, output)
     output[1]:softmax(input[1])
 end
diff --git a/nerv/layer/softmax_ce.lua b/nerv/layer/softmax_ce.lua
index f878a2f..42adbc6 100644
--- a/nerv/layer/softmax_ce.lua
+++ b/nerv/layer/softmax_ce.lua
@@ -27,6 +27,14 @@ function SoftmaxCELayer:update(bp_err, input, output)
     -- no params, therefore do nothing
 end
 
+function SoftmaxCELayer:gradient(bp_err, input, output)
+    -- no params, therefore do nothing
+end
+
+function SoftmaxCELayer:update_gradient(bp_err, input, output)
+    -- no params, therefore do nothing
+end
+
 function SoftmaxCELayer:propagate(input, output)
     local softmax = self.softmax
     local ce = self.ce
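Taken together, the gradient()/update_gradient() pair added to the layer classes above splits the old update() into two phases: gradient() only accumulates into the correction buffers, while update_gradient() applies the accumulated correction (with momentum gain and weight decay) to the parameters once. A minimal plain-Lua sketch of the intended calling sequence; the trainer loop and the layers/bp_errs/inputs/outputs names are hypothetical, only the two methods come from this patch:

    -- assumed: layers is an array of nerv layer objects; bp_errs, inputs and
    -- outputs hold the per-layer tensors of one forward/backward pass
    for i = 1, #layers do
        layers[i]:gradient(bp_errs[i], inputs[i], outputs[i])  -- accumulate only
    end
    -- ... possibly more accumulation passes, e.g. from other worker threads ...
    for i = 1, #layers do
        layers[i]:update_gradient()                            -- apply once
    end

Keeping accumulation side-effect-free on the parameters is what makes the per-thread nerv.context plumbing below useful.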
diff --git a/nerv/lib/common.c b/nerv/lib/common.c
index db667b2..1fa1d9f 100644
--- a/nerv/lib/common.c
+++ b/nerv/lib/common.c
@@ -1,4 +1,5 @@
 #include "common.h"
+#include "matrix/cuda_helper.h"
 #include <stdarg.h>
 
 int nerv_error(lua_State *L, const char *err_mesg_fmt, ...)
 {
     va_list ap;
diff --git a/nerv/lib/common.h b/nerv/lib/common.h
index 1c588d1..a4e3582 100644
--- a/nerv/lib/common.h
+++ b/nerv/lib/common.h
@@ -59,6 +59,8 @@ typedef struct Status {
         nerv_error_status(L, &status); \
     } while (0)
 
+#define PROFILE_HASHMAP_SIZE 123457
+
 typedef struct HashNode {
     const char *key;
     void *val;
@@ -82,6 +84,8 @@ void hashmap_clear(HashMap *h);
 
 size_t bkdr_hash(const char *key);
 
+extern const char *nerv_context_tname;
+
 int nerv_error(lua_State *L, const char *err_mesg_fmt, ...);
 int nerv_error_status(lua_State *L, Status *status);
 int nerv_error_method_not_implemented(lua_State *L);
diff --git a/nerv/lib/io/chunk_file.c b/nerv/lib/io/chunk_file.c
index 4e00b0b..79dbee3 100644
--- a/nerv/lib/io/chunk_file.c
+++ b/nerv/lib/io/chunk_file.c
@@ -61,7 +61,7 @@ static const char *read_chunk_metadata(FILE *fp, const char *fn,
     NERV_SET_STATUS(status,
             (fgets(buff + LUA_RETURN_LEN,
                    LINEBUFF_SIZE, fp) == (buff + LUA_RETURN_LEN) ? \
              NERV_NORMAL : CF_INVALID_FORMAT), 0);
-    fprintf(stderr, "metadata: %s\n", buff);
+    //fprintf(stderr, "metadata: %s\n", buff);
     return buff;
 }
 
@@ -112,7 +112,7 @@ static ChunkFile *open_read(const char *fn, Status *status) {
     for (i = 0;; offset += chunk_len, i++)
     {
         ChunkInfo *cip;
-        fprintf(stderr, "reading chunk %d from %d\n", i, (int)offset);
+        //fprintf(stderr, "reading chunk %d from %d\n", i, (int)offset);
         /* skip to the begining of chunk i */
         if (fseeko(fp, offset, SEEK_SET) != 0)
         {
diff --git a/nerv/lib/matrix/cukernel.h b/nerv/lib/matrix/cukernel.h
index 2126c6f..31e199b 100644
--- a/nerv/lib/matrix/cukernel.h
+++ b/nerv/lib/matrix/cukernel.h
@@ -13,7 +13,7 @@ void cudak_(cuda_softmax_final)(const Matrix *a, const Matrix *max, const Matrix
 void cudak_(cuda_add_row)(const Matrix *a, Matrix *b, double beta);
 void cudak_(cuda_fill)(Matrix *a, double val);
 void cudak_(cuda_clip)(Matrix *a, double val_1, double val_2);
-void cudak_(cuda_expand_frm)(const Matrix *a, Matrix *b, int context);
+void cudak_(cuda_expand_frm)(const Matrix *a, Matrix *b, int context, int a_begin, int a_end);
 void cudak_(cuda_rearrange_frm)(const Matrix *a, Matrix *b, int step);
 void cudak_(cuda_scale_rows_by_row)(const Matrix *a, Matrix *b);
 void cudak_(cuda_scale_rows_by_col)(const Matrix *a, Matrix *b);
diff --git a/nerv/lib/matrix/cumatrix.c b/nerv/lib/matrix/cumatrix.c
index ff1168d..c913db2 100644
--- a/nerv/lib/matrix/cumatrix.c
+++ b/nerv/lib/matrix/cumatrix.c
@@ -2,11 +2,13 @@
 #include "../common.h"
 #include "cuda_helper.h"
 #include <string.h>
-#define PROFILE_HASHMAP_SIZE 123457
+
 static cublasHandle_t cublas_handle;
 static cudaEvent_t profile_start, profile_stop;
 static HashMap *profile;
 
+const char *nerv_context_tname = "nerv.CCuContext";
+
 void nerv_cumatrix_print_profile() {
     size_t i;
     fprintf(stderr, "*** [nerv cumatrix profile] **\n");
@@ -35,6 +37,12 @@ void accu_profile(const char *name, float delta) {
     *val += delta;
 }
 
+cublasHandle_t* nerv_get_cublas_handle()
+{
+    return &cublas_handle;
+}
+
+
 void nerv_cumatrix_init() {
     cublasCreate(&cublas_handle);
     cudaEventCreate(&profile_start);
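nerv_get_cublas_handle() exposes the formerly file-static handle, and nerv.CCuContext names the Lua userdata type for a per-thread context. The convention used throughout the patch is an optional trailing context argument that falls back to the shared global when absent, mirroring the handle == NULL ? &cublas_handle : handle test in the C code further down. An illustrative plain-Lua model of that convention (all names here are invented for the sketch):

    local global_ctx = { tag = "shared" }      -- stands in for the static cublas_handle
    local function add(c, a, b, alpha, beta, ctx)
        ctx = ctx or global_ctx                -- nil falls back to the shared handle
        for i = 1, #a do c[i] = alpha * a[i] + beta * b[i] end
        return ctx.tag
    end
    local a, b, c = {1, 2}, {3, 4}, {}
    print(add(c, a, b, 1.0, 0.5))                      -- "shared": single-threaded path
    print(add(c, a, b, 1.0, 0.5, { tag = "worker" }))  -- "worker": per-thread context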
diff --git a/nerv/lib/matrix/cumatrix.h b/nerv/lib/matrix/cumatrix.h
index e6def66..e53c702 100644
--- a/nerv/lib/matrix/cumatrix.h
+++ b/nerv/lib/matrix/cumatrix.h
@@ -4,4 +4,18 @@
 void nerv_cumatrix_print_profile();
 void nerv_cumatrix_clear_profile();
 void nerv_cumatrix_init();
+
+void nerv_set_cublas_handle();
+
+typedef struct CuContext
+{
+    cublasHandle_t cublas_handle;
+    cudaEvent_t profile_start, profile_stop;
+    HashMap *profile;
+    pthread_t pid;
+    int refcount;
+}CuContext;
+
+extern const char *nerv_context_tname;
+
 #endif
diff --git a/nerv/lib/matrix/generic/cukernel.cu b/nerv/lib/matrix/generic/cukernel.cu
index 08feb59..707f8fd 100644
--- a/nerv/lib/matrix/generic/cukernel.cu
+++ b/nerv/lib/matrix/generic/cukernel.cu
@@ -229,14 +229,15 @@ __global__ void cudak_(expand_frm)(const MATRIX_ELEM *a, MATRIX_ELEM *b,
                                    int nrow, int ncol,
                                    int enrow, int encol,
                                    int stride, int estride,
-                                   int context) {
+                                   int context,
+                                   int a_begin, int a_end) {
     int j = blockIdx.x * blockDim.x + threadIdx.x;
     int i = blockIdx.y * blockDim.y + threadIdx.y;
     int ridx;
     if (i >= enrow || j >= encol) return;
     ridx = i + j / ncol - context;
-    if (ridx < 0) ridx = 0;
-    else if (ridx >= nrow) ridx = nrow - 1;
+    if (ridx < a_begin) ridx = a_begin;
+    else if (ridx >= a_end) ridx = a_end - 1;
     b[j + i * estride] = a[j % ncol + ridx * stride];
 }
 
@@ -541,7 +542,7 @@ extern "C" {
         cudaStreamSynchronize(0);
     }
 
-    void cudak_(cuda_expand_frm)(const Matrix *a, Matrix *b, int context) {
+    void cudak_(cuda_expand_frm)(const Matrix *a, Matrix *b, int context, int a_begin, int a_end) {
         dim3 threadsPerBlock(CUDA_THREADS_N, CUDA_THREADS_N);
         dim3 numBlocks(CEIL_DIV(b->ncol, threadsPerBlock.x),
                        CEIL_DIV(b->nrow, threadsPerBlock.y));
@@ -551,7 +552,8 @@ extern "C" {
                 b->nrow, b->ncol,
                 a->stride / sizeof(MATRIX_ELEM),
                 b->stride / sizeof(MATRIX_ELEM),
-                context);
+                context,
+                a_begin, a_end);
         cudaStreamSynchronize(0);
     }
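The kernel change above replaces the fixed clamp to [0, nrow) with a clamp to the caller-supplied window [a_begin, a_end), so context frames that spill past the edge of a buffer slice now replicate that slice's own boundary rows rather than the first and last rows of the whole matrix. A runnable plain-Lua model of just the clamp:

    -- model of the new boundary clamp in cudak_(expand_frm)
    local function clamp_row(ridx, a_begin, a_end)
        if ridx < a_begin then return a_begin end
        if ridx >= a_end then return a_end - 1 end
        return ridx
    end
    print(clamp_row(8, 10, 20))   -- 10: below the window, clamped up
    print(clamp_row(15, 10, 20))  -- 15: inside, unchanged
    print(clamp_row(23, 10, 20))  -- 19: past the window, clamped down

Note that the host-side implementation added in generic/mmatrix.c further down computes r_off = b_begin + i + j - context before applying the same clamp.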
diff --git a/nerv/lib/matrix/generic/cumatrix.c b/nerv/lib/matrix/generic/cumatrix.c
index 770e503..5b11496 100644
--- a/nerv/lib/matrix/generic/cumatrix.c
+++ b/nerv/lib/matrix/generic/cumatrix.c
@@ -13,12 +13,13 @@
 void nerv_matrix_(add)(Matrix *c, const Matrix *a, const Matrix *b,
                        MATRIX_ELEM alpha, MATRIX_ELEM beta,
-                       Status *status) {
+                       cublasHandle_t *handle, Status *status) {
     CHECK_SAME_DIMENSION(a, b, status);
     CHECK_SAME_DIMENSION(a, c, status);
-    PROFILE_START
+    cublasHandle_t *cuhandle = (handle == NULL ? &cublas_handle : handle);
+    PROFILE_START //cublas_handle
     CUBLAS_SAFE_SYNC_CALL(
-        NERV_CUBLAS_(geam)(cublas_handle, CUBLAS_OP_N, CUBLAS_OP_N,
+        NERV_CUBLAS_(geam)(*cuhandle, CUBLAS_OP_N, CUBLAS_OP_N,
                            a->ncol, a->nrow,
                            &alpha,
                            MATRIX_ELEM_PTR(a), a->stride / sizeof(MATRIX_ELEM),
@@ -32,7 +33,7 @@ void nerv_matrix_(add)(Matrix *c, const Matrix *a, const Matrix *b,
 
 void nerv_matrix_(mul)(Matrix *c, const Matrix *a, const Matrix *b,
                        MATRIX_ELEM alpha, MATRIX_ELEM beta,
-                       int ta, int tb, Status *status) {
+                       int ta, int tb, cublasHandle_t *handle, Status *status) {
 #define SWAP(a, b) \
     do { int t = (a); (a) = (b); (b) = t; } while (0)
 
@@ -42,10 +43,11 @@ void nerv_matrix_(mul)(Matrix *c, const Matrix *a, const Matrix *b,
     if (tb == CUBLAS_OP_T) SWAP(bm, bn);
     if (an != bm)
         NERV_EXIT_STATUS(status, MAT_WRONG_MULT_DIM, 0);
+    cublasHandle_t *cuhandle = (handle == NULL ? &cublas_handle : handle);
     /* Because matrix in Nerv is row-major, here b comes first */
-    PROFILE_START
+    PROFILE_START //cublas_handle
     CUBLAS_SAFE_SYNC_CALL(
-        NERV_CUBLAS_(gemm)(cublas_handle, tb, ta,
+        NERV_CUBLAS_(gemm)(*cuhandle, tb, ta,
                            bn, am, bm,
                            &alpha,
                            MATRIX_ELEM_PTR(b), b->stride / sizeof(MATRIX_ELEM),
@@ -253,15 +255,16 @@ void nerv_matrix_(copy_toh)(Matrix *a, const Matrix *b,
     NERV_SET_STATUS(status, NERV_NORMAL, 0);
 }
 
-Matrix *nerv_matrix_(trans)(Matrix *a, Status *status) {
+Matrix *nerv_matrix_(trans)(Matrix *a, cublasHandle_t *handle, Status *status) {
     MATRIX_ELEM alpha = 1, beta = 0;
     Matrix *b = nerv_matrix_(create)(a->ncol, a->nrow, status);
     if (status->err_code != NERV_NORMAL)
         return NULL;
+    cublasHandle_t *cuhandle = (handle == NULL ? &cublas_handle : handle);
     /* FIXME: possible memory leak when lua error is raised */
-    PROFILE_START
+    PROFILE_START //cublas_handle
     CUBLAS_SAFE_SYNC_CALL_RET(
-        NERV_CUBLAS_(geam)(cublas_handle, CUBLAS_OP_T, CUBLAS_OP_T,
+        NERV_CUBLAS_(geam)(*cuhandle, CUBLAS_OP_T, CUBLAS_OP_T,
                            a->nrow, a->ncol,
                            &alpha,
                            MATRIX_ELEM_PTR(a), a->stride / sizeof(MATRIX_ELEM),
@@ -360,14 +363,16 @@ void nerv_matrix_(copy_rows_fromd_by_idx)(Matrix *a, const Matrix *b,
 }
 
 void nerv_matrix_(expand_frm)(Matrix *a, const Matrix *b,
-                              int context, Status *status) {
-    if (a->nrow != b->nrow)
-        NERV_EXIT_STATUS(status, MAT_MISMATCH_DIM, 0);
+                              int context, int b_begin, int b_end,
+                              Status *status) {
+    if (!(0 <= b_begin && b_begin < b_end && b_end <= b->nrow &&
+          b_end - b_begin == a->nrow))
+        NERV_EXIT_STATUS(status, MAT_MISMATCH_DIM, 0);
     if (a->ncol != b->ncol * (context * 2 + 1))
         NERV_EXIT_STATUS(status, MAT_GENERAL_ERR,
                          "the width should be 2 * context + 1");
     PROFILE_START
-    cudak_(cuda_expand_frm)(b, a, context);
+    cudak_(cuda_expand_frm)(b, a, context, b_begin, b_end);
     PROFILE_STOP
     NERV_SET_STATUS(status, NERV_NORMAL, 0);
 }
diff --git a/nerv/lib/matrix/generic/cumatrix.h b/nerv/lib/matrix/generic/cumatrix.h
index 04e8c5a..f476414 100644
--- a/nerv/lib/matrix/generic/cumatrix.h
+++ b/nerv/lib/matrix/generic/cumatrix.h
@@ -1,11 +1,11 @@
 #include "../../common.h"
 
 void nerv_matrix_(add)(Matrix *c, const Matrix *a, const Matrix *b,
-                       MATRIX_ELEM alpha, MATRIX_ELEM beta,
+                       MATRIX_ELEM alpha, MATRIX_ELEM beta, cublasHandle_t *handle,
                        Status *status);
 void nerv_matrix_(mul)(Matrix *c, const Matrix *a, const Matrix *b,
                        MATRIX_ELEM alpha, MATRIX_ELEM beta,
-                       int ta, int tb, Status *status);
+                       int ta, int tb, cublasHandle_t *handle, Status *status);
 void nerv_matrix_(sigmoid)(Matrix *a, const Matrix *b, Status *status);
 void nerv_matrix_(sigmoid_grad)(Matrix *nerr, const Matrix *err,
                                 const Matrix *output, Status *status);
@@ -31,7 +31,7 @@ void nerv_matrix_(copy_fromh)(Matrix *a, const Matrix *b,
 void nerv_matrix_(copy_toh)(Matrix *a, const Matrix *b,
                             int a_begin, int a_end, int b_begin,
                             Status *status);
-Matrix *nerv_matrix_(trans)(Matrix *a, Status *status);
+Matrix *nerv_matrix_(trans)(Matrix *a, cublasHandle_t *handle, Status *status);
 void nerv_matrix_(mul_elem)(Matrix *c, const Matrix *a, const Matrix *b,
                             Status *status);
@@ -44,7 +44,7 @@ void nerv_matrix_(copy_rows_fromd_by_idx)(Matrix *a, const Matrix *b,
                                           const Matrix *idx, int b_begin,
                                           Status *status);
 void nerv_matrix_(expand_frm)(Matrix *a, const Matrix *b,
-                              int context, Status *status);
+                              int context, int b_begin, int b_end, Status *status);
 void nerv_matrix_(rearrange_frm)(Matrix *a, const Matrix *b, int step,
                                  Status *status);
 void nerv_matrix_(scale_rows_by_col)(Matrix *a, const Matrix *b,
diff --git a/nerv/lib/matrix/generic/matrix.c b/nerv/lib/matrix/generic/matrix.c
index 6cb3dc0..cac1ee7 100644
--- a/nerv/lib/matrix/generic/matrix.c
+++ b/nerv/lib/matrix/generic/matrix.c
@@ -5,21 +5,29 @@
 void nerv_matrix_(data_free)(Matrix *self, Status *status) {
     assert(*self->data_ref > 0);
-    if (--(*self->data_ref) == 0)
+    //if (--(*self->data_ref) == 0)
+    if (NULL != self && __sync_fetch_and_add(&self->refcount, -1) == 1)
     {
-        /* free matrix data */
-        MATRIX_DATA_FREE(MATRIX_ELEM_PTR(self), status);
-        free(self->data_ref);
-        free(self);
-    }
-    else {
-        free(self);
-        NERV_SET_STATUS(status, NERV_NORMAL, 0);
+        if(__sync_fetch_and_add(self->data_ref, -1) == 1)
+        {
+            /* free matrix data */
+            MATRIX_DATA_FREE(MATRIX_ELEM_PTR(self), status);
+            free(self->data_ref);
+            free(self);
+            self = NULL;
+        }
+        else
+        {
+            free(self);
+            self = NULL;
+            NERV_SET_STATUS(status, NERV_NORMAL, 0);
+        }
     }
 }
 
 void nerv_matrix_(data_retain)(Matrix *self) {
-    (*self->data_ref)++;
+    __sync_fetch_and_add(self->data_ref, 1);
+    //(*self->data_ref)++;
 }
 
 Matrix *nerv_matrix_(create)(long nrow, long ncol, Status *status) {
@@ -36,6 +44,7 @@ Matrix *nerv_matrix_(create)(long nrow, long ncol, Status *status) {
         free(self);
         return NULL;
     }
+    self->refcount = 1;
     self->data_ref = (long *)malloc(sizeof(long));
     *self->data_ref = 0;
     nerv_matrix_(data_retain)(self);
@@ -56,6 +65,7 @@ Matrix *nerv_matrix_(getrow)(Matrix *self, int row) {
     prow->nmax = prow->ncol;
     MATRIX_ELEM_PTR(prow) = MATRIX_ROW_PTR(self, row);
     prow->data_ref = self->data_ref;
+    prow->refcount = 1;
     nerv_matrix_(data_retain)(prow);
     return prow;
 }
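data_free() now consults two counters: the new per-struct refcount guards the Matrix struct itself against a double free, while the shared data_ref still counts how many views (e.g. getrow() results) reference the underlying storage; both are manipulated with GCC's __sync_fetch_and_add so concurrent threads can release views safely. A single-threaded plain-Lua model of the two-level release (the atomicity is exactly what this model cannot show; all names are illustrative):

    local function release(m)
        m.refcount = m.refcount - 1            -- guards the struct itself
        if m.refcount > 0 then return end      -- another holder remains
        m.data.ref = m.data.ref - 1            -- shared count on the storage
        if m.data.ref == 0 then
            m.data.buf = nil                   -- last view: drop the storage too
        end
    end
    local storage = { ref = 2, buf = "payload" }
    local full = { refcount = 1, data = storage }
    local row  = { refcount = 1, data = storage }  -- a getrow()-style view
    release(row)   -- storage survives; full still references it
    release(full)  -- storage goes away with the last view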
diff --git a/nerv/lib/matrix/generic/mmatrix.c b/nerv/lib/matrix/generic/mmatrix.c
index 225079e..0850c6e 100644
--- a/nerv/lib/matrix/generic/mmatrix.c
+++ b/nerv/lib/matrix/generic/mmatrix.c
@@ -7,6 +7,7 @@
 #define NERV_GENERIC_MATRIX
 #include "../../common.h"
 #include "../../io/chunk_file.h"
+#include "../cuda_helper.h"
 #include "string.h"
 
 static void host_matrix_(free)(MATRIX_ELEM *ptr, Status *status) {
@@ -79,4 +80,47 @@ void nerv_matrix_(copy_from)(Matrix *a, const Matrix *b,
     NERV_SET_STATUS(status, NERV_NORMAL, 0);
 }
 
+void nerv_matrix_(expand_frm)(Matrix *a, const Matrix *b,
+                              int context, int b_begin, int b_end, Status *status) {
+    if (!(0 <= b_begin && b_begin < b_end && b_end <= b->nrow &&
+          b_end - b_begin == a->nrow))
+        NERV_EXIT_STATUS(status, MAT_MISMATCH_DIM, 0);
+    if (a->ncol != b->ncol * (context * 2 + 1))
+        NERV_EXIT_STATUS(status, MAT_GENERAL_ERR,
+                         "the width should be 2 * context + 1");
+    int i, j, r_off;
+    for (i = 0; i < a->nrow; i++)
+    {
+        for (j = 0; j < context * 2 + 1; j++)
+        {
+            r_off = b_begin + i + j - context;
+            if (r_off < b_begin) r_off = b_begin;
+            if (r_off >= b_end) r_off = b_end - 1;
+            memcpy(MATRIX_ROW_PTR(a, i) + j * b->ncol, MATRIX_ROW_PTR(b, r_off),
+                   sizeof(MATRIX_ELEM) * b->ncol);
+        }
+    }
+    NERV_SET_STATUS(status, NERV_NORMAL, 0);
+}
+
+void nerv_matrix_(rearrange_frm)(Matrix *a, const Matrix *b,
+                                 int step, int b_begin, int b_end, Status *status) {
+    //CHECK_SAME_DIMENSION(a, b, status);
+    if (!(0 <= b_begin && b_begin < b_end && b_end <= b->nrow &&
+          b_end - b_begin == a->nrow))
+        NERV_EXIT_STATUS(status, MAT_MISMATCH_DIM, 0);
+    if (b->ncol % step)
+        NERV_EXIT_STATUS(status, MAT_GENERAL_ERR,
+                         "the dimension of columns is not divisible by step");
+    int i, j;
+    int stride = a->stride / sizeof(MATRIX_ELEM);
+    for (i = 0; i < a->nrow; i++)
+    {
+        for (j = 0; j < a->ncol; j++)
+            MATRIX_ELEM_PTR(a)[j + i * stride] =
+                MATRIX_ELEM_PTR(b)[j / step + (j % step) * (b->ncol / step) + (i + b_begin) * stride];
+    }
+    NERV_SET_STATUS(status, NERV_NORMAL, 0);
+}
+
 #endif
diff --git a/nerv/lib/matrix/generic/mmatrix.h b/nerv/lib/matrix/generic/mmatrix.h
index f00a04d..eb6c4c7 100644
--- a/nerv/lib/matrix/generic/mmatrix.h
+++ b/nerv/lib/matrix/generic/mmatrix.h
@@ -6,3 +6,9 @@ void nerv_matrix_(save)(Matrix *self, ChunkFile *cfp, Status *status);
 void nerv_matrix_(copy_from)(Matrix *a, const Matrix *b,
                              int a_begin, int b_begin, int b_end,
                              Status *status);
+
+void nerv_matrix_(expand_frm)(Matrix *a, const Matrix *b,
+                              int context, int b_begin, int b_end, Status *status);
+
+void nerv_matrix_(rearrange_frm)(Matrix *a, const Matrix *b,
+                                 int step, int b_begin, int b_end, Status *status);
diff --git a/nerv/lib/matrix/matrix.h b/nerv/lib/matrix/matrix.h
index 67a6e30..51ca736 100644
--- a/nerv/lib/matrix/matrix.h
+++ b/nerv/lib/matrix/matrix.h
@@ -13,6 +13,7 @@ typedef struct Matrix {
         long *i;
     } data;     /* pointer to actual storage */
     long *data_ref;
+    int refcount;   /* prevent matrix struct double free */
 } Matrix;
 
 #define MATRIX_ROW_PTR(self, row) \
diff --git a/nerv/lib/matrix/mmatrix.c b/nerv/lib/matrix/mmatrix.c
index b8157eb..e54b336 100644
--- a/nerv/lib/matrix/mmatrix.c
+++ b/nerv/lib/matrix/mmatrix.c
@@ -52,3 +52,21 @@ Matrix *nerv_matrix_(perm_gen)(int ncol, Status *status) {
 #define host_matrix_(NAME) host_matrix_int_##NAME
 #define nerv_matrix_(NAME) nerv_matrix_host_int_##NAME
 #include "generic/mmatrix.c"
+
+Matrix *nerv_matrix_(perm_gen)(int ncol, Status *status) {
+    int i;
+    Matrix *self = nerv_matrix_(create)(1, ncol, status);
+    if (status->err_code != NERV_NORMAL)
+        return NULL;
+    long *prow = self->data.i;
+    for (i = 0; i < ncol; i++)
+        prow[i] = i;
+    for (i = ncol - 1; i >= 0; i--)
+    {
+        size_t j = rand() % (i + 1);
+        long tmp = prow[i];
+        prow[i] = prow[j];
+        prow[j] = tmp;
+    }
+    return self;
+}
diff --git a/nerv/matrix/cumatrix.c b/nerv/matrix/cumatrix.c
index fef03fc..08b7efa 100644
--- a/nerv/matrix/cumatrix.c
+++ b/nerv/matrix/cumatrix.c
@@ -1,4 +1,5 @@
 #define NERV_GENERIC_CUMATRIX
+#include "../lib/matrix/cuda_helper.h"
 #include "../lib/common.h"
 #include "../lib/matrix/cumatrix.h"
 #include "../lib/matrix/cuda_helper.h"
diff --git a/nerv/matrix/generic/cumatrix.c b/nerv/matrix/generic/cumatrix.c
index 08cb4c2..ce7e68a 100644
--- a/nerv/matrix/generic/cumatrix.c
+++ b/nerv/matrix/generic/cumatrix.c
@@ -6,6 +6,7 @@
 #define MATRIX_BASE_TNAME nerv_matrix_cuda_tname
 #define NERV_GENERIC_MATRIX
 #define NERV_GENERIC_CUKERNEL
+#include "../../lib/matrix/cuda_helper.h"
 #include "../../lib/common.h"
 #include "../../lib/matrix/generic/matrix.h"
 #include "../../lib/matrix/generic/cumatrix.h"
@@ -17,7 +18,9 @@
 static int nerv_matrix_(lua_add)(lua_State *L) {
     const Matrix *b = lu
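The int-matrix perm_gen added above initializes an identity permutation 0 .. ncol-1 and then applies a Fisher-Yates shuffle driven by rand(). The same algorithm in standalone Lua, with math.random standing in for rand():

    local function perm_gen(ncol)
        local p = {}
        for i = 1, ncol do p[i] = i - 1 end   -- 0-based values, as in the C version
        for i = ncol, 2, -1 do
            local j = math.random(i)          -- uniform in [1, i]
            p[i], p[j] = p[j], p[i]
        end
        return p
    end
    print(table.concat(perm_gen(8), " "))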