authorDeterminant <[email protected]>2015-06-22 19:01:29 +0800
committerDeterminant <[email protected]>2015-06-22 19:01:29 +0800
commit2497fd9e7a0fae5ee4887890d7a312e0e08a93b8 (patch)
tree382f97575bd2df9ee6abb1662b11b279fc22d72b /nerv
parent196e9b48a3541caccdffc5743001cced70667091 (diff)
major change: use luarocks to manage project
Diffstat (limited to 'nerv')
-rwxr-xr-x  nerv  2
-rw-r--r--  nerv/.gitignore  1
-rw-r--r--  nerv/Makefile  60
-rw-r--r--  nerv/common.c  76
-rw-r--r--  nerv/common.h  36
-rw-r--r--  nerv/doc/nerv.md  17
-rw-r--r--  nerv/doc/nerv_class.md  36
-rw-r--r--  nerv/doc/nerv_io.md  113
-rw-r--r--  nerv/doc/nerv_layer.md  180
-rw-r--r--  nerv/doc/nerv_matrix.md  165
-rw-r--r--  nerv/doc/nerv_nn.md  256
-rw-r--r--  nerv/doc/nerv_param.md  27
-rw-r--r--  nerv/examples/asr_trainer.lua  106
-rw-r--r--  nerv/examples/chunk_file_example.lua  53
-rw-r--r--  nerv/examples/cumatrix_example.lua  31
-rw-r--r--  nerv/examples/cumatrix_from_mmatrix.lua  32
-rw-r--r--  nerv/examples/mmatrix_example.lua  20
-rw-r--r--  nerv/examples/oop_example.c  101
-rw-r--r--  nerv/examples/oop_example.lua  16
-rw-r--r--  nerv/examples/swb_baseline.lua  166
-rw-r--r--  nerv/examples/test_dnn_layers.lua  78
-rw-r--r--  nerv/examples/test_nn_lib.lua  164
-rw-r--r--  nerv/init.lua  128
-rw-r--r--  nerv/io/chunk_file.c  325
-rw-r--r--  nerv/io/chunk_file.h  23
-rw-r--r--  nerv/io/init.c  6
-rw-r--r--  nerv/io/init.lua  55
-rw-r--r--  nerv/io/sgd_buffer.lua  111
-rw-r--r--  nerv/layer/affine.lua  91
-rw-r--r--  nerv/layer/bias.lua  28
-rw-r--r--  nerv/layer/combiner.lua  59
-rw-r--r--  nerv/layer/init.lua  79
-rw-r--r--  nerv/layer/mse.lua  52
-rw-r--r--  nerv/layer/sigmoid.lua  31
-rw-r--r--  nerv/layer/softmax_ce.lua  68
-rw-r--r--  nerv/layer/window.lua  28
-rw-r--r--  nerv/luaT/README.md  239
-rw-r--r--  nerv/luaT/luaT.c  1079
-rw-r--r--  nerv/luaT/luaT.h  111
-rw-r--r--  nerv/matrix/cuda_helper.h  75
-rw-r--r--  nerv/matrix/cukernel.cu  17
-rw-r--r--  nerv/matrix/cukernel.h  20
-rw-r--r--  nerv/matrix/cumatrix.c  87
-rw-r--r--  nerv/matrix/generic/cukernel.cu  571
-rw-r--r--  nerv/matrix/generic/cumatrix.c  493
-rw-r--r--  nerv/matrix/generic/elem_type.h  22
-rw-r--r--  nerv/matrix/generic/matrix.c  155
-rw-r--r--  nerv/matrix/generic/matrix.h  19
-rw-r--r--  nerv/matrix/generic/mmatrix.c  122
-rw-r--r--  nerv/matrix/init.c  35
-rw-r--r--  nerv/matrix/init.lua  77
-rw-r--r--  nerv/matrix/mmatrix.c  77
-rw-r--r--  nerv/nerv  13
-rw-r--r--  nerv/nerv-scm-1.rockspec  38
-rw-r--r--  nerv/nerv.c  38
-rw-r--r--  nerv/nn/init.lua  3
-rw-r--r--  nerv/nn/layer_dag.lua  249
-rw-r--r--  nerv/nn/layer_repo.lua  34
-rw-r--r--  nerv/nn/param_repo.lua  76
59 files changed, 6468 insertions, 2 deletions
diff --git a/nerv b/nerv
deleted file mode 100755
index 3c16418..0000000
--- a/nerv
+++ /dev/null
@@ -1,2 +0,0 @@
-#!/bin/bash
-exec 'build/luajit-2.0/bin/luajit' -e "package.cpath=\"${PWD}/build/lib/?.so\"" -e "package.path=\"${PWD}/build/lua/?/init.lua;${PWD}/build/lua/?.lua;${PWD}/?.lua\"" -e "require 'nerv'" "$@"
diff --git a/nerv/.gitignore b/nerv/.gitignore
new file mode 100644
index 0000000..567609b
--- /dev/null
+++ b/nerv/.gitignore
@@ -0,0 +1 @@
+build/
diff --git a/nerv/Makefile b/nerv/Makefile
new file mode 100644
index 0000000..b69a63e
--- /dev/null
+++ b/nerv/Makefile
@@ -0,0 +1,60 @@
+.PHONY: build install clean
+SHELL := /bin/bash
+BUILD_DIR := $(CURDIR)/build
+OBJS := nerv.o luaT.o common.o \
+ matrix/mmatrix.o matrix/cumatrix.o matrix/init.o matrix/cukernel.o \
+ io/init.o io/chunk_file.o \
+ examples/oop_example.o
+LIBS := libnerv.so
+LUA_LIBS := matrix/init.lua io/init.lua init.lua \
+ layer/init.lua layer/affine.lua layer/sigmoid.lua layer/softmax_ce.lua \
+ layer/window.lua layer/bias.lua layer/combiner.lua layer/mse.lua \
+ nn/init.lua nn/layer_repo.lua nn/param_repo.lua nn/layer_dag.lua \
+ io/sgd_buffer.lua
+INCLUDE := -I $(LUA_INCDIR) -DLUA_USE_APICHECK
+CUDA_BASE := /usr/local/cuda-6.5
+#CUDA_BASE := /usr/local/cuda-5.0
+CUDA_INCLUDE := -I $(CUDA_BASE)/include/
+INCLUDE += $(CUDA_INCLUDE)
+LDFLAGS := -L$(CUDA_BASE)/lib64/ -Wl,-rpath=$(CUDA_BASE)/lib64/ -lcudart -lcublas
+CFLAGS := -Wall -Wextra -O2
+OBJ_DIR := $(BUILD_DIR)/objs
+SUBDIR := matrix io layer examples nn
+NVCC := $(CUDA_BASE)/bin/nvcc
+NVCC_FLAGS := -Xcompiler -fPIC,-Wall,-Wextra
+
+LUA_DIR = $(INST_LUADIR)/nerv
+OBJS := $(addprefix $(OBJ_DIR)/,$(OBJS))
+OBJ_SUBDIR := $(addprefix $(OBJ_DIR)/,$(SUBDIR))
+LUA_SUBDIR := $(addprefix $(LUA_DIR)/,$(SUBDIR))
+LIBS := $(addprefix $(INST_LIBDIR)/,$(LIBS))
+LUA_LIBS := $(addprefix $(LUA_DIR)/,$(LUA_LIBS))
+
+build: $(OBJ_DIR) $(OBJ_SUBDIR) $(OBJS)
+$(OBJ_DIR) $(LUA_DIR) $(OBJ_SUBDIR) $(LUA_SUBDIR):
+ -mkdir -p $@
+$(OBJ_DIR)/%.o: %.c $(patsubst /%.o,/%.c,$@)
+ gcc -c -o $@ $< $(INCLUDE) -fPIC $(CFLAGS)
+$(OBJ_DIR)/matrix/cukernel.o: matrix/cukernel.cu
+ $(NVCC) -c -o $@ $< $(INCLUDE) $(NVCC_FLAGS)
+$(LUA_DIR)/%.lua: %.lua
+ cp $< $@
+$(OBJ_DIR)/luaT.o:
+ gcc -c -o $@ luaT/luaT.c $(INCLUDE) -fPIC
+$(LIBS): $(OBJS)
+ gcc -shared -o $@ $(OBJS) $(LDFLAGS)
+
+$(OBJ_DIR)/matrix/cumatrix.o: matrix/generic/cumatrix.c matrix/generic/matrix.c matrix/generic/cukernel.cu
+$(OBJ_DIR)/matrix/mmatrix.o: matrix/generic/mmatrix.c matrix/generic/matrix.c
+$(OBJ_DIR)/matrix/cukernel.o: matrix/generic/cukernel.cu
+
+.PHONY: speech
+
+speech:
+ -mkdir -p build/objs/speech/tnet_io
+ $(MAKE) -C speech/ BUILD_DIR=$(BUILD_DIR) LIB_DIR=$(LIB_DIR) OBJ_DIR=$(CURDIR)/build/objs/speech/ LUA_DIR=$(LUA_DIR)
+
+clean:
+ -rm -rf $(OBJ_DIR)
+
+install: $(LIBS) $(LUA_DIR) $(LUA_SUBDIR) $(LUA_LIBS)
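The `LUA_INCDIR`, `INST_LIBDIR` and `INST_LUADIR` variables referenced above are the ones a luarocks "make"-type build passes down to `make`. As a rough sketch (this is not the `nerv-scm-1.rockspec` added by this commit, whose contents are not shown here), the corresponding rockspec build section typically looks like:

```
-- hypothetical rockspec sketch; names and URL are placeholders, not taken from this commit
package = "nerv"
version = "scm-1"
source = { url = "..." }  -- real source location intentionally left out
build = {
    type = "make",
    build_variables = {
        LUA_INCDIR = "$(LUA_INCDIR)",  -- where lua.h lives; consumed by INCLUDE above
    },
    install_variables = {
        INST_LIBDIR = "$(LIBDIR)",     -- where libnerv.so gets installed
        INST_LUADIR = "$(LUADIR)",     -- where the Lua sources get copied
    },
}
```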
diff --git a/nerv/common.c b/nerv/common.c
new file mode 100644
index 0000000..b4e39e6
--- /dev/null
+++ b/nerv/common.c
@@ -0,0 +1,76 @@
+#include "common.h"
+#include <stdarg.h>
+int nerv_error(lua_State *L, const char *err_mesg_fmt, ...) {
+ va_list ap;
+ va_start(ap, err_mesg_fmt);
+ lua_pushstring(L, "[nerv] internal error: ");
+ lua_pushvfstring(L, err_mesg_fmt, ap);
+ lua_concat(L, 2);
+ lua_error(L);
+ va_end(ap);
+ return 0;
+}
+
+int nerv_error_method_not_implemented(lua_State *L) {
+ return nerv_error(L, "method not implemented");
+}
+
+void luaN_append_methods(lua_State *L, const luaL_Reg *mlist) {
+ for (; mlist->func; mlist++)
+ {
+ lua_pushcfunction(L, mlist->func);
+ lua_setfield(L, -2, mlist->name);
+ }
+}
+
+HashMap *hashmap_create(size_t size, HashKey_t hfunc, HashMapCmp_t cmp) {
+ HashMap *res = (HashMap *)malloc(sizeof(HashMap));
+ res->bucket = calloc(size, sizeof(HashNode));
+ res->cmp = cmp;
+ res->hfunc = hfunc;
+ res->size = size;
+ return res;
+}
+
+void *hashmap_getval(HashMap *h, const char *key) {
+ size_t idx = h->hfunc(key) % h->size;
+ HashNode *ptr;
+ for (ptr = h->bucket[idx]; ptr; ptr = ptr->next)
+ {
+ if (!h->cmp(ptr->key, key))
+ return ptr->val;
+ }
+ return NULL;
+}
+
+void hashmap_setval(HashMap *h, const char *key, void *val) {
+ size_t idx = h->hfunc(key) % h->size;
+ HashNode *ptr = malloc(sizeof(HashNode));
+ ptr->next = h->bucket[idx];
+ h->bucket[idx] = ptr;
+ ptr->key = key;
+ ptr->val = val;
+}
+
+void hashmap_clear(HashMap *h) {
+ size_t i;
+ for (i = 0; i < h->size; i++)
+ {
+ HashNode *ptr, *nptr;
+ for (ptr = h->bucket[i]; ptr; ptr = nptr)
+ {
+ nptr = ptr->next;
+ free(ptr->val);
+ free(ptr);
+ }
+ h->bucket[i] = NULL;
+ }
+}
+
+size_t bkdr_hash(const char *key) {
+ unsigned int seed = 131;
+ unsigned int res = 0;
+ while (*key)
+ res = res * seed + *key++;
+ return res;
+}
diff --git a/nerv/common.h b/nerv/common.h
new file mode 100644
index 0000000..e21c7a5
--- /dev/null
+++ b/nerv/common.h
@@ -0,0 +1,36 @@
+#ifndef NERV_COMMON_H
+#define NERV_COMMON_H
+#include "lua.h"
+#include "lauxlib.h"
+#include "lualib.h"
+#include "luaT/luaT.h"
+#include <stdio.h>
+#include <stdlib.h>
+
+typedef struct HashNode {
+ const char *key;
+ void *val;
+ struct HashNode *next;
+} HashNode;
+
+typedef int (*HashMapCmp_t)(const char *a, const char *b);
+typedef size_t (*HashKey_t)(const char *key);
+
+typedef struct HashMap {
+ HashNode **bucket;
+ HashMapCmp_t cmp;
+ HashKey_t hfunc;
+ size_t size;
+} HashMap;
+
+HashMap *hashmap_create(size_t size, HashKey_t hfunc, HashMapCmp_t cmp);
+void *hashmap_getval(HashMap *h, const char *key);
+void hashmap_setval(HashMap *h, const char *key, void *val);
+void hashmap_clear(HashMap *h);
+
+size_t bkdr_hash(const char *key);
+
+int nerv_error(lua_State *L, const char *err_mesg_fmt, ...);
+int nerv_error_method_not_implemented(lua_State *L);
+void luaN_append_methods(lua_State *L, const luaL_Reg *mlist);
+#endif
diff --git a/nerv/doc/nerv.md b/nerv/doc/nerv.md
new file mode 100644
index 0000000..28411f5
--- /dev/null
+++ b/nerv/doc/nerv.md
@@ -0,0 +1,17 @@
+#The Nerv utility functions#
+Part of the [Nerv](../README.md) toolkit.
+##Methods##
+* __string = nerv.typename(obj a)__
+A registered function, the original function is `luaT_lua_typename`. In some cases, calling `type(a)` on an object of a __Nerv__ class (like __Nerv.CuMatrix__) will only return "userdata" (because it is created in C); in such cases you can use this method to get its actual class name.
+
+---
+
+* __metatable = nerv.getmetatable(string tname)__
+A registered function, the original function is `luaT_lua_getmetatable`. `tname` should be a class name that has been registered in __luaT__.
+
+* __metatable = nerv.newmetatable(string tname, string parenttname, function constructor, function destructor, function factory)__
+A registered function, the original function is `luaT_newmetatable`, it returns the metatable of the created class by the name `tname`.
+* __string = nerv.setmetatable(table self, string tname)__
+A registered function, the original function is `luaT_lua_setmetatable`. It assigns the metatable registered in __luaT__ by the name *tname* to the table *self*, and returns *tname* to the user.
+* __table = nerv.get_type(string typename)__
+Returns the type(`loadstring("return " .. typename)`). \ No newline at end of file
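A minimal usage sketch of the utility functions documented above (the values in the comments follow the descriptions rather than a captured run):

```
m = nerv.CuMatrixFloat(2, 2)
print(type(m))             -- "userdata", since the object is created in C
print(nerv.typename(m))    -- "nerv.CuMatrixFloat", the class name registered in luaT
local mt = nerv.getmetatable("nerv.CuMatrixFloat")  -- metatable registered in luaT
local t = nerv.get_type("nerv.CuMatrixFloat")       -- the class table itself, per the description above
print(t == nerv.CuMatrixFloat)
```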
diff --git a/nerv/doc/nerv_class.md b/nerv/doc/nerv_class.md
new file mode 100644
index 0000000..99f63e7
--- /dev/null
+++ b/nerv/doc/nerv_class.md
@@ -0,0 +1,36 @@
+#The Nerv OOP#
+Part of the [Nerv](../README.md) toolkit.
+##Methods##
+* __metatable mt, metatable mpt = nerv.class(string tname, string parenttname)__
+This method is used to create a class by the name `tname`, which inherits `parenttname` in __Nerv__; you can then create a new instance of this class by calling `obj=tname(...)`. The `tname.__init(...)` method (if defined) will be called during construction. The metatables of the class and of its parent class are returned.
+
+##Examples##
+* This example implements a simple `nerv.Counter` class which is inherited by `nerv.BetterCounter`.
+
+```
+do
+ nerv.class("nerv.Counter")
+ function nerv.Counter:__init(c)
+ if (c) then
+ self.c = c
+ else
+ self.c = 0
+ end
+ end
+end
+do
+ local mt, mpt = nerv.class("nerv.BetterCounter", "nerv.Counter")
+ function nerv.BetterCounter:__init(c, bc)
+ mpt.__init(self, c)
+ if (bc) then
+ self.bc = bc
+ else
+ self.bc = 0
+ end
+ end
+end
+c1 = nerv.Counter(1)
+print(c1.c)
+bc1 = nerv.BetterCounter(1, 1)
+print(bc1.c, bc1.bc)
+``` \ No newline at end of file
diff --git a/nerv/doc/nerv_io.md b/nerv/doc/nerv_io.md
new file mode 100644
index 0000000..07589df
--- /dev/null
+++ b/nerv/doc/nerv_io.md
@@ -0,0 +1,113 @@
+#The Nerv IO Package#
+Part of the [Nerv](../README.md) toolkit.
+
+##Description##
+The main class that the user uses to store and read parameter objects to and from files is __nerv.ChunkFile__.
+In the file, a parameter object is saved in a standard format: first the length (in bytes) of this object, then a table which includes some meta information of the object, and finally a data area. Below is an example text file.
+```
+[0000000000202]
+{type="nerv.ExampleP",info={message="just-a-try"},id="exampleP1"}
+3 3
+5.000000 5.000000 5.000000
+5.000000 5.000000 5.000000
+5.000000 5.000000 5.000000
+1 3
+4.000000 4.000000 4.000000
+[0000000000202]
+{type="nerv.ExampleP",info={message="just-a-try"},id="exampleP2"}
+3 3
+4.000000 4.000000 4.000000
+4.000000 4.000000 4.000000
+4.000000 4.000000 4.000000
+1 3
+3.000000 3.000000 3.000000
+```
+
+##Methods##
+* __ChunkFile ChunkFile(string fn, string mode)__
+`mode` can be `r` or `w`, for reading or writing a file. The returned __ChunkFile__ will be ready to write or read objects which follow the __nerv.Param__ interface (using `write_chunk` and `read_chunk`).
+* __void ChunkFile.write_chunk(ChunkFile self, Param p)__
+Write `p` into the file. `p:write` will be called.
+* __Param ChunkFile.read_chunk(ChunkFile self, string id, table global_conf)__
+Read the __Param__ object by id `id` from the file `self`. It will be constructed using `__init(id, global_conf)`. `p:read` will be called.
+* __void ChunkFile.close(ChunkFile self)__
+Close the opened file.
+
+##Examples##
+* An example showing how to use __ChunkFile__ to store and read parameter objects.
+```
+require 'io'
+do
+ local mt, mpt = nerv.class('nerv.ExampleP', 'nerv.Param')
+ function nerv.ExampleP:__init(id, global_conf)
+ self.id = id
+ self.global_conf = global_conf
+ self.matrix = nerv.MMatrixFloat(3, 3)
+ for i = 0, 2, 1 do
+ for j = 0, 2, 1 do
+ self.matrix[i][j] = 3
+ end
+ end
+ self.bias = nerv.MMatrixFloat(1, 3)
+ for i = 0, 2, 1 do
+ self.bias[i] = 2;
+ end
+ self:set_info({message = 'just-a-try'})
+ end
+ function nerv.ExampleP:addOne()
+ for i = 0, 2, 1 do
+ for j = 0, 2, 1 do
+ self.matrix[i][j] = self.matrix[i][j] + 1
+ end
+ end
+ for i = 0, 2, 1 do
+ self.bias[i] = self.bias[i] + 1
+ end
+ end
+ function nerv.ExampleP:read(pcdata)
+ self.matrix = nerv.MMatrixFloat.load(pcdata)
+ self.bias = nerv.MMatrixFloat.load(pcdata)
+ end
+ function nerv.ExampleP:write(pfhandle)
+ self.matrix:save(pfhandle)
+ self.bias:save(pfhandle)
+ end
+end
+global_conf = {}
+do
+ local f = nerv.ChunkFile('../tmp', 'w')
+ local exampleP1 = nerv.ExampleP('exampleP1', global_conf)
+ local exampleP2 = nerv.ExampleP('exampleP2', global_conf)
+ exampleP1:addOne()
+ exampleP1:addOne()
+ exampleP2:addOne()
+
+ f:write_chunk(exampleP1)
+ f:write_chunk(exampleP2)
+ f:close()
+end
+do
+ local f = nerv.ChunkFile('../tmp', 'r')
+ local exampleP1 = f:read_chunk('exampleP1', global_conf)
+ local exampleP2 = f:read_chunk('exampleP2', global_conf)
+ f:close()
+ print(exampleP1.matrix)
+ print(exampleP2.matrix)
+end
+```
+
+##Developer Notes##
+* There are four classes that deal with chunk data: __nerv.ChunkFile__, __nerv.ChunkFileHandle__, __nerv.ChunkInfo__ and __nerv.ChunkData__. Below are the underlying C structs.
+```
+typedef struct ChunkFileHandle {
+ FILE *fp;
+} ChunkFileHandle;
+typedef struct ChunkInfo {
+ off_t offset, length;
+} ChunkInfo;
+typedef struct ChunkData {
+ FILE *fp;
+ char *data;
+} ChunkData;
+```
+* In __Nerv.io__, a __nerv.ChunkFile__ returned by `ChunkFile.__init` will have a member `handle`, which is a __nerv.ChunkFileHandle__. \ No newline at end of file
diff --git a/nerv/doc/nerv_layer.md b/nerv/doc/nerv_layer.md
new file mode 100644
index 0000000..de2fb12
--- /dev/null
+++ b/nerv/doc/nerv_layer.md
@@ -0,0 +1,180 @@
+#The Nerv Layer Package#
+Part of the [Nerv](../README.md) toolkit.
+
+##Description##
+__nerv.Layer__ is the base class and most of its methods are abstract.
+###Class hierarchy and their members###
+* __nerv.Layer__.
+ * `table dim_in` It specifies the dimensions of the inputs.
+ * `table dim_out` It specifies the dimensions of the outputs.
+ * `string id` ID of this layer.
+ * `table gconf` Stores the `global_conf`.
+* __nerv.AffineLayer__ inherits __nerv.Layer__, both `#dim_in` and `#dim_out` are 1.
+ * `MatrixParam ltp` The linear transform parameter.
+ * `BiasParam bp` The bias parameter.
+* __nerv.BiasLayer__ inherits __nerv.Layer__, both `#dim_in` and `#dim_out` are 1.
+ * `BiasParam bias` The bias parameter.
+* __nerv.SigmoidLayer__ inherits __nerv.Layer__, both `#dim_in` and `#dim_out` are 1.
+* __nerv.SoftmaxCELayer__ inherits __nerv.Layer__, `#dim_in` is 2 and `#dim_out` is -1 (optional). `input[1]` is the input to the softmax layer, `input[2]` is the reference distribution. In its `propagate(input, output)` method, if `output[1] ~= nil`, the cross\_entropy value will be outputted.
+ * `float total_ce` Records the accumulated cross entropy value.
+ * `int total_frames` Records how many frames have been processed.
+ * `bool compressed` The reference distribution can be a one-hot format. This feature is enabled by `layer_conf.compressed`.
+
+##Methods##
+* __void Layer.\_\_init(Layer self, string id, table global_conf, table layer_conf)__
+Abstract method.
+The constructor should assign `id` to `self.id`, `global_conf` to `self.gconf`, `layer_conf.dim_in` to `self.dim_in`, and `layer_conf.dim_out` to `self.dim_out`. `dim_in` and `dim_out` are lists specifying the dimensions of the inputs and outputs. Also, `layer_conf` will include the parameters, which should also be properly saved.
+* __void Layer.init(Layer self)__
+Abstract method.
+Initialization method, in this method the layer should do some self-checking and allocate space for intermediate results.
+* __void Layer.update(Layer self, table bp_err, table input, table output)__
+Abstract method.
+`bp_err[i]` should be the error on `output[i]`. In this method the parameters of `self` are updated.
+* __void Layer.propagate(Layer self, table input, table output)__
+Abstract method.
+Given `input` and the current parameters, propagate and store the result in `output`.
+* __void Layer.back_propagate(Layer self, Matrix next_bp_err, Matrix bp_err, Matrix input, Matrix output)__
+Abstract method.
+Calculate the error on the inputs and store them in `next_bp_err`.
+
+* __void Layer.check_dim_len(int len_in, int len_out)__
+Check whether `#self.dim_in == len_in` and `#self.dim_out == len_out`; if violated, an error will be raised.
+* __void Layer.get_params(Layer self)__
+Abstract method.
+The layer should return a list containing its parameters.
+
+####nerv.Layer.get\_dim(self)####
+* Returns:
+ `dim_in`: __table__.
+ `dim_out`: __table__.
+* Parameters:
+ `self`: __nerv.Layer__.
+* Description:
+ Returns `self.dim_in, self.dim_out`.
+
+##Examples##
+* A basic example using __Nerv__ layers for a linear classification task.
+
+```
+require 'math'
+
+require 'layer.affine'
+require 'layer.softmax_ce'
+
+--[[Example using layers, a simple two-classification problem]]--
+
+function calculate_accurate(networkO, labelM)
+ sum = 0
+ for i = 0, networkO:nrow() - 1, 1 do
+ if (labelM[i][0] == 1 and networkO[i][0] >= 0.5) then
+ sum = sum + 1
+ end
+ if (labelM[i][1] == 1 and networkO[i][1] >= 0.5) then
+ sum = sum + 1
+ end
+ end
+ return sum
+end
+
+--[[begin global setting and data generation]]--
+global_conf = {lrate = 10,
+ wcost = 1e-6,
+ momentum = 0.9,
+ cumat_type = nerv.CuMatrixFloat}
+
+input_dim = 5
+data_num = 100
+ansV = nerv.CuMatrixFloat(input_dim, 1)
+for i = 0, input_dim - 1, 1 do
+ ansV[i][0] = math.random() - 0.5
+end
+ansB = math.random() - 0.5
+print('displaying ansV')
+print(ansV)
+print('displaying ansB(bias)')
+print(ansB)
+
+dataM = nerv.CuMatrixFloat(data_num, input_dim)
+for i = 0, data_num - 1, 1 do
+ for j = 0, input_dim - 1, 1 do
+ dataM[i][j] = math.random() * 2 - 1
+ end
+end
+refM = nerv.CuMatrixFloat(data_num, 1)
+refM:fill(ansB)
+refM:mul(dataM, ansV, 1, 1) --refM = dataM * ansV + ansB
+
+labelM = nerv.CuMatrixFloat(data_num, 2)
+for i = 0, data_num - 1, 1 do
+ if (refM[i][0] > 0) then
+ labelM[i][0] = 1
+ labelM[i][1] = 0
+ else
+ labelM[i][0] = 0
+ labelM[i][1] = 1
+ end
+end
+--[[global setting and data generation end]]--
+
+
+--[[begin network building]]--
+--parameters
+affineL_ltp = nerv.LinearTransParam('AffineL_ltp', global_conf)
+affineL_ltp.trans = nerv.CuMatrixFloat(input_dim, 2)
+for i = 0, input_dim - 1, 1 do
+ for j = 0, 1, 1 do
+ affineL_ltp.trans[i][j] = math.random() - 0.5
+ end
+end
+affineL_bp = nerv.BiasParam('AffineL_bp', global_conf)
+affineL_bp.trans = nerv.CuMatrixFloat(1, 2)
+for j = 0, 1, 1 do
+ affineL_bp.trans[j] = math.random() - 0.5
+end
+
+--layers
+affineL = nerv.AffineLayer('AffineL', global_conf, {['ltp'] = affineL_ltp,
+ ['bp'] = affineL_bp,
+ dim_in = {input_dim},
+ dim_out = {2}})
+softmaxL = nerv.SoftmaxCELayer('softmaxL', global_conf, {dim_in = {2, 2},
+ dim_out = {}})
+print('layers initializing...')
+affineL:init()
+softmaxL:init()
+--[[network building end]]--
+
+
+--[[begin space allocation]]--
+print('network input&output&error space allocation...')
+affineI = {dataM} --input to the network is data
+affineO = {nerv.CuMatrixFloat(data_num, 2)}
+softmaxI = {affineO[1], labelM}
+softmaxO = {}
+output = nerv.CuMatrixFloat(data_num, 2)
+
+affineE = {nerv.CuMatrixFloat(data_num, 2)}
+--[[space allocation end]]--
+
+
+--[[begin training]]--
+ce_last = 0
+for l = 0, 10, 1 do
+ affineL:propagate(affineI, affineO)
+ softmaxL:propagate(softmaxI, softmaxO)
+ output:softmax(softmaxI[1])
+
+ softmaxL:back_propagate(affineE, {}, softmaxI, softmaxO)
+
+ affineL:update(affineE, affineI, affineO)
+
+ if (l % 5 == 0) then
+ nerv.utils.printf("training iteration %d finished\n", l)
+ nerv.utils.printf("cross entropy: %.8f\n", softmaxL.total_ce - ce_last)
+ ce_last = softmaxL.total_ce
+ nerv.utils.printf("accurate labels: %d\n", calculate_accurate(output, labelM))
+ nerv.utils.printf("total frames processed: %.8f\n", softmaxL.total_frames)
+ end
+end
+--[[end training]]--
+```
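To make the abstract interface above concrete, here is a minimal sketch of a user-defined layer that follows it. The class name `nerv.ScaleLayer` and its `scale` option are purely illustrative and are not part of this commit:

```
nerv.class("nerv.ScaleLayer", "nerv.Layer")

function nerv.ScaleLayer:__init(id, global_conf, layer_conf)
    self.id = id
    self.gconf = global_conf
    self.dim_in = layer_conf.dim_in
    self.dim_out = layer_conf.dim_out
    self.scale = layer_conf.scale or 1.0
    self:check_dim_len(1, 1)          -- exactly one input and one output
end

function nerv.ScaleLayer:init()
    if self.dim_in[1] ~= self.dim_out[1] then
        nerv.error("input and output dimensions mismatch")
    end
end

function nerv.ScaleLayer:update(bp_err, input, output)
    -- no trainable parameters, so nothing to update
end

function nerv.ScaleLayer:propagate(input, output)
    -- output = scale * input + 0 * input
    output[1]:add(input[1], input[1], self.scale, 0.0)
end

function nerv.ScaleLayer:back_propagate(next_bp_err, bp_err, input, output)
    -- the gradient of a scaling is the same scaling
    next_bp_err[1]:add(bp_err[1], bp_err[1], self.scale, 0.0)
end

function nerv.ScaleLayer:get_params()
    return {}                         -- no parameters to expose
end
```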
diff --git a/nerv/doc/nerv_matrix.md b/nerv/doc/nerv_matrix.md
new file mode 100644
index 0000000..22971d2
--- /dev/null
+++ b/nerv/doc/nerv_matrix.md
@@ -0,0 +1,165 @@
+#The Nerv Matrix Package#
+Part of the [Nerv](../README.md) toolkit.
+
+##Description##
+###Underlying structure###
+To begin with, it could be useful to know something about the underlying structure of a __Nerv__ matrix. Please keep in mind that matrices in __Nerv__ are row-major.
+Every matrix object is an encapsulation of a C struct that describes the attributes of this matrix.
+```
+typedef struct Matrix {
+ size_t stride; /* size of a row */
+ long ncol, nrow, nmax; /* dimension of the matrix, nmax is simply nrow * ncol */
+ union {
+ float *f;
+ double *d;
+ long *i;
+ } data; /* pointer to actual storage */
+ long *data_ref;
+} Matrix;
+```
+It is worth mentioning that `data_ref` is a counter which counts the number of references to its memory space; mind that it will also be increased when a row of the matrix is referenced (`col = m[2]`). A __Nerv__ matrix will deallocate its space when this counter drops to zero.
+Also note that all assignment operations in __Nerv__ are reference copies; you can use the `copy_tod` or `copy_toh` methods to copy values. Row assignments like `m1[2]=m2[3]` are forbidden in __Nerv__.
+
+###Class hierarchy###
+The class hierarchy of the matrix classes can be clearly observed in `matrix/init.c`.
+First there is an abstract base class __Nerv.Matrix__, which is inherited by __Nerv.CuMatrix__ and __Nerv.MMatrix__ (also abstract).
+Finally, there are __Nerv.CuMatrixFloat__ and __Nerv.CuMatrixDouble__, inheriting __Nerv.CuMatrix__, and __Nerv.MMatrixFloat__, __Nerv.MMatrixDouble__ and __Nerv.MMatrixInt__, inheriting __Nerv.MMatrix__.
+
+##Methods##
+Mind that usually a matrix object can only do calculations with matrices of its own type (a __Nerv.CuMatrixFloat__ matrix can only be added to another __Nerv.CuMatrixFloat__).
+In the methods description below, __Matrix__ could be __Nerv.CuMatrixFloat__, __Nerv.CuMatrixDouble__, __Nerv.MMatrixFloat__ or __Nerv.MMatrixDouble__. __Element_type__ could be `float` or `double`, respectively.
+* __Matrix = Matrix(int nrow, int ncol)__
+Returns a __Matrix__ object of `nrow` rows and `ncol` columns.
+* __Element_type = Matrix.get_elem(Matrix self, int index)__
+Returns the element value at the specific index(treating the matrix as a vector). The index should be less than `nmax` of the matrix.
+* __void Matrix.set_elem(Matrix self, int index, Element_type value)__
+Set the value at `index` to be `value`.
+* __int Matrix.ncol(Matrix self)__
+Get `ncol`, the number of columns.
+* __int Matrix.nrow(Matrix self)__
+Get `nrow`, the number of rows.
+* __int Matrix.get_dataref_value(Matrix self)__
+Returns the value (not a pointer) of the space the `data_ref` pointer points to. This function is mainly for debugging.
+* __Matrix/Element\_type, boolean Matrix.\_\_index\_\_(Matrix self, int index)__
+If the matrix has more than one row, this will return the row at `index` as a __Matrix__. Otherwise it will return the value at `index`.
+* __void Matrix.\_\_newindex\_\_(Matrix self, int index, Element_type value)__
+Set the element at `index` to be `value`.
+---
+* __Matrix Matrix.create(Matrix a)__
+Return a new __Matrix__ of `a`'s size(of the same number of rows and columns).
+* __Matrix Matrix.colsum(Matrix self)__
+Return a new __Matrix__ of size (1,`self.ncol`), which stores the sum of all columns of __Matrix__ `self`.
+* __Matrix Matrix.rowsum(Matrix self)__
+Return a new __Matrix__ of size (`self.nrow`,1), which stores the sum of all rows of __Matrix__ `self`.
+* __Matrix Matrix.rowmax(Matrix self)__
+Return a new __Matrix__ of size (`self.nrow`,1), which stores the max value of all rows of __Matrix__ `self`.
+* __Matrix Matrix.trans(Matrix self)__
+Return a new __Matrix__ of size (`self.ncol`,`self.nrow`), which stores the transpose of __Matrix__ `self`.
+* __void Matrix.copy_fromh(Matrix self, MMatrix a)__
+Copy the content of a __MMatrix__ `a` to __Matrix__ `self`, they should be of the same size.
+* __void Matrix.copy_fromd(Matrix self, CuMatrix a)__
+Copy the content of a __CuMatrix__ `a` to __Matrix__ `self`, they should be of the same size.
+* __void Matrix.copy_toh(Matrix self, MMatrix a)__
+Copy the content of the __Matrix__ `self` to a __MMatrix__ `a`.
+* __void Matrix.copy_tod(Matrix self, CuMatrix a)__
+Copy the content of the __Matrix__ `self` to a __CuMatrix__ `a`.
+* __void Matrix.add(Matrix self, Matrix ma, Matrix mb, Element_type alpha, Element_type beta)__
+It sets the content of __Matrix__ `self` to be `alpha * ma + beta * mb`. __Matrix__ `ma`, `mb` and `self` should be of the same size.
+* __void Matrix.mul(Matrix self, Matrix ma, Matrix mb, Element_type alpha, Element_type beta, [string ta, string tb])__
+It sets the content of __Matrix__ `self` to be `beta * self + alpha * ma * mb`. `ta` and `tb` are optional; if `ta` is 'T', then `ma` will be transposed, and if `tb` is 'T', `mb` will be transposed.
+* __void Matrix.add_row(Matrix self, Matrix va, Element_type beta)__
+Add `beta * va` to every row of __Matrix__ `self`.
+* __void Matrix.fill(Matrix self, Element_type value)__
+Fill the content of __Matrix__ `self` to be `value`.
+* __void Matrix.sigmoid(Matrix self, Matrix ma)__
+Set the element of __Matrix__ `self` to be elementwise-sigmoid of `ma`.
+* __void Matrix.sigmoid_grad(Matrix self, Matrix err, Matrix output)__
+Set the element of __Matrix__ `self`, to be `self[i][j]=err[i][j]*output[i][j]*(1-output[i][j])`. This function is used to propagate sigmoid layer error.
+* __void Matrix.softmax(Matrix self, Matrix a)__
+Calculate a row-by-row softmax of __Matrix__ `a` and save the result in `self`.
+* __void Matrix.mul_elem(Matrix self, Matrix ma, Matrix mb)__
+Calculate element-wise multiplication of __Matrix__ `ma` and `mb`, store the result in `self`.
+* __void Matrix.log_elem(Matrix self, Matrix ma)__
+Calculate element-wise log of __Matrix__ `ma`, store the result in `self`.
+* __void Matrix.copy_rows_fromh_by_idx(Matrix self, MMatrix ma, MMatrixInt idx)__
+`idx` should be a row vector. This function copies the rows of `ma` to `self` according to `idx`; in other words, it assigns `ma[idx[i]]` to `self[i]`.
+* __void Matrix.expand_frm(Matrix self, Matrix a, int context)__
+Treat each row of `a` as a speech feature and do a feature expansion. `self` should be of size `(a.nrow, a.ncol * (context * 2 + 1))`. `self[i]` will be `(a[i-context] a[i-context+1] ... a[i] a[i+1] ... a[i+context])`. `a[0]` and the last row of `a` will be copied to extend the index range.
+* __void Matrix.rearrange_frm(Matrix self, Matrix a, int step)__
+Rearrange `a` according to its feature dimension. The `step` is the length of context. So, `self[i][j]` will be assigned `a[i][j / step + (j % step) * (a.ncol / step)]`. `a` and `self` should be of the same size, and `a.ncol` should be divisible by `step`.
+* __void Matrix.scale_row(Matrix self, Matrix scale)__
+Scale each column of `self` according to a vector `scale`. `scale` should be of size `1 * self.ncol`.
+* __Matrix Matrix.\_\_add\_\_(Matrix ma, Matrix mb)__
+Returns a new __Matrix__ which stores the result of `ma+mb`.
+* __Matrix Matrix.\_\_sub\_\_(Matrix ma, Matrix mb)__
+Returns a new __Matrix__ which stores the result of `ma-mb`.
+* __Matrix Matrix.\_\_mul\_\_(Matrix ma, Matrix mb)__
+Returns a new __Matrix__ which stores the result of `ma*mb`.
+* __CuMatrix CuMatrix.new_from_host(MMatrix m)__
+Return a new __CuMatrix__ which is a copy of `m`.
+* __MMatrix CuMatrix.new_to_host(CuMatrix self)__
+Return a new __MMatrix__ which is a copy of `self`.
+* __string Matrix.\_\_tostring\_\_(Matrix self)__
+Returns a string containing values of __Matrix__ `self`.
+---
+* __MMatrix MMatrix.load(ChunkData chunk)__
+Return a new __MMatrix__ loaded from the file position in `chunk`.
+* __void MMatrix.save(MMatrix self, ChunkFileHandle chunk)__
+Write `self` to the file position in `chunk`.
+* __void MMatrix.copy_from(MMatrix ma, MMatrix mb, [int b_begin, int b_end, int a_begin])__
+Copy a part of `mb` (rows with index in `[b_begin, b_end)`) to `ma`, beginning at row index `a_begin`. If not specified, `b_begin` defaults to `0`, `b_end` to `mb.nrow`, and `a_begin` to `0`.
+
+##Examples##
+* Use `get_dataref_value` to test __Nerv__'s matrix space allocation.
+```
+m = 10
+n = 10
+fm = nerv.MMatrixFloat(m, n)
+dm = nerv.MMatrixDouble(m, n)
+for i = 0, m - 1 do
+ for j = 0, n - 1 do
+ t = i / (j + 1)
+ fm[i][j] = t
+ dm[i][j] = t
+ end
+end
+print("test fm:get_dataref_value:", fm:get_dataref_value())
+print("forced a garbade collect")
+collectgarbage("collect")
+print("test fm:get_dataref_value:", fm:get_dataref_value())
+print(fm)
+print(dm)
+```
+* Test some __Matrix__ calculations.
+```
+m = 4
+n = 4
+fm = nerv.CuMatrixFloat(m, n)
+dm = nerv.CuMatrixDouble(m, n)
+for i = 0, m - 1 do
+ for j = 0, n - 1 do
+ -- local t = math.random(10)
+ t = i / (j + 1)
+ fm[i][j] = t
+ dm[i][j] = t
+ end
+end
+print(fm)
+fs = fm:create()
+fs:softmax(fm)
+-- print(fs)
+print(dm)
+ds = dm:create()
+ds:softmax(dm)
+-- print(ds)
+print(fs)
+print(fs + fs)
+print(ds + ds)
+print(fs - fs)
+print(ds - ds)
+a = fs:create()
+a:mul_elem(fs, fs)
+print(a)
+a:log_elem(fs)
+print(a)
+``` \ No newline at end of file
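A small sketch exercising `mul` with the optional transposition flags and `add_row`, assuming the signatures documented above (the dimensions are chosen only for illustration):

```
a = nerv.CuMatrixFloat(2, 3)
b = nerv.CuMatrixFloat(2, 3)
c = nerv.CuMatrixFloat(2, 2)
a:fill(1.0)
b:fill(2.0)
-- c = 1.0 * a * b^T + 0.0 * c; the 'T' flag transposes the second operand
c:mul(a, b, 1.0, 0.0, 'N', 'T')
print(c)                 -- every entry should be 6 (1 * 2 summed over the 3 columns)

bias = nerv.CuMatrixFloat(1, 2)
bias:fill(0.5)
c:add_row(bias, 1.0)     -- add 1.0 * bias to every row of c
print(c)                 -- every entry should now be 6.5
```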
diff --git a/nerv/doc/nerv_nn.md b/nerv/doc/nerv_nn.md
new file mode 100644
index 0000000..c57447d
--- /dev/null
+++ b/nerv/doc/nerv_nn.md
@@ -0,0 +1,256 @@
+#The Nerv NN Package#
+Part of the [Nerv](../README.md) toolkit.
+
+##Description##
+###Class hierarchy###
+It contains __nerv.LayerRepo__, __nerv.ParamRepo__, and __nerv.DAGLayer__ (which inherits __nerv.Layer__).
+
+###Class hierarchy and their members###
+####nerv.ParamRepo####
+Get parameter object by ID.
+* `table param_table` Contains the mapping from parameter ID to parameter file (__nerv.ChunkFile__).
+####nerv.LayerRepo####
+Get layer object by ID.
+* `table layers` Contains the mapping from layer ID to layer objects.
+
+####__nerv.DAGLayer__####
+Inherits __nerv.Layer__.
+* `layers`: __table__, a mapping from a layer ID to its "ref". A ref is a structure that contains references to space allocations and other info of the layer.
+* `inputs`: __table__, a mapping from the input ports of the DAG layer to the input ports of its sub-layers; the key is the port number, the value is `{ref, port}`.
+* `outputs`: __table__, the counterpart of `inputs`.
+* `parsed_conn`: __table__, a list of parsed connections, each entry is of format `{{ref_from, port_from}, {ref_to, port_to}}`.
+* `queue`: __table__, a list of "ref"s, the propagation of the DAGLayer will follow this order, and back-propagation will follow a reverse order.
+
+##Methods##
+
+###__nerv.ParamRepo__###
+
+####nerv.ParamRepo:\_\_init(param\_files)####
+* Parameters:
+ `param_files`: __table__
+* Description:
+  `param_files` is a list of names of files that store parameters; the newly created __ParamRepo__ will read them from file and store the mapping for future fetching.
+
+####nerv.Param ParamRepo.get_param(ParamRepo self, string pid, table global_conf)####
+* Returns:
+  __nerv.Param__
+* Parameters:
+ `self`: __nerv.ParamRepo__.
+ `pid`: __string__.
+ `global_conf`: __table__.
+* Description:
+ __ParamRepo__ will find the __nerv.ChunkFile__ `pf` that contains parameter of ID `pid` and return `pf:read_chunk(pid, global_conf)`.
+
+###__nerv.LayerRepo__###
+####nerv.LayerRepo:\_\_init(layer\_spec, param\_repo, global\_conf)####
+* Returns:
+ __nerv.LayerRepo__.
+* Parameters:
+  `self`: __nerv.LayerRepo__.
+ `layer_spec`: __table__.
+ `param_repo`: __nerv.ParamRepo__.
+ `global_conf`: __table__.
+* Description:
+ __LayerRepo__ will construct the layers specified in `layer_spec`. Every entry in the `layer_spec` table should follow the format below:
+
+ > layer_spec : {[layer_type1] = llist1, [layer_type2] = llist2, ...}
+ > llist : {layer1, layer2, ...}
+ > layer : layerid = {param_config, layer_config}
+ > param_config : {param1 = paramID1, param2 = paramID2}
+
+ __LayerRepo__ will merge `param_config` into `layer_config` and construct a layer by calling `layer_type(layerid, global_conf, layer_config)`.
+
+####nerv.LayerRepo.get\_layer(self, lid)####
+* Returns:
+  __nerv.Layer__, the layer with ID `lid`.
+* Parameters:
+ `self`:__nerv.LayerRepo__.
+ `lid`:__string__.
+* Description:
+ Returns the layer with ID `lid`.
+
+###nerv.DAGLayer###
+####nerv.DAGLayer:\_\_init(id, global\_conf, layer\_conf)####
+* Returns:
+ __nerv.DAGLayer__
+* Parameters:
+ `id`: __string__
+ `global_conf`: __table__
+ `layer_conf`: __table__
+* Description:
+ The `layer_conf` should contain `layer_conf.sub_layers` which is a __nerv.LayerRepo__ storing the sub layers of the DAGLayer. It should also contain `layer_conf.connections`, which is a string-to-string mapping table describing the DAG connections. See an example below:
+
+ ```
+ dagL = nerv.DAGLayer("DAGL", global_conf, {["dim_in"] = {input_dim, 2}, ["dim_out"] = {}, ["sub_layers"] = layerRepo,
+ ["connections"] = {
+ ["<input>[1]"] = "AffineL[1]",
+ ["AffineL[1]"] = "SoftmaxL[1]",
+ ["<input>[2]"] = "SoftmaxL[2]",
+ }})
+ ```
+
+####nerv.DAGLayer.init(self, batch\_size)####
+* Parameters:
+ `self`: __nerv.DAGLayer__
+ `batch_size`: __int__
+* Description:
+  This initialization method will allocate space for output and input matrices, and will call `init()` for each of its sub-layers.
+
+
+####nerv.DAGLayer.propagate(self, input, output)####
+* Parameters:
+ `self`: __nerv.DAGLayer__
+ `input`: __table__
+ `output`: __table__
+* Description:
+  The same function as __nerv.Layer.propagate__; does propagation for each sub-layer in the order of `self.queue`.
+
+####nerv.DAGLayer.back\_propagate(self, next\_bp\_err, bp\_err, input, output)####
+* Parameters:
+ `self`: __nerv.DAGLayer__
+ `next_bp_err`: __table__
+ `bp_err`: __table__
+ `input`: __table__
+ `output`: __table__
+* Description:
+  The same function as __nerv.Layer.back_propagate__; does back-propagation for each sub-layer in the reverse order of `self.queue`.
+
+####nerv.DAGLayer.update(self, bp\_err, input, output)####
+* Parameters:
+ `self`: __nerv.DAGLayer__
+ `bp_err`: __table__
+ `input`: __table__
+ `output`: __table__
+* Description:
+  The same function as __nerv.Layer.update__; does the update for each sub-layer in the order of `self.queue`.
+
+##Examples##
+* An example using __nerv.DAGLayer__ on a simple two-class classification problem.
+
+```
+require 'math'
+
+require 'layer.affine'
+require 'layer.softmax_ce'
+
+--[[Example using DAGLayer, a simple two-classification problem]]--
+
+--[[begin global setting and data generation]]--
+global_conf = {lrate = 10,
+ wcost = 1e-6,
+ momentum = 0.9,
+ cumat_type = nerv.CuMatrixFloat,
+ }
+
+input_dim = 5
+data_num = 100
+param_fn = "../tmp"
+ansV = nerv.CuMatrixFloat(input_dim, 1)
+for i = 0, input_dim - 1, 1 do
+ ansV[i][0] = math.random() - 0.5
+end
+ansB = math.random() - 0.5
+print('displaying ansV')
+print(ansV)
+print('displaying ansB(bias)')
+print(ansB)
+
+dataM = nerv.CuMatrixFloat(data_num, input_dim)
+for i = 0, data_num - 1, 1 do
+ for j = 0, input_dim - 1, 1 do
+ dataM[i][j] = math.random() * 2 - 1
+ end
+end
+refM = nerv.CuMatrixFloat(data_num, 1)
+refM:fill(ansB)
+refM:mul(dataM, ansV, 1, 1) --refM = dataM * ansV + ansB
+
+labelM = nerv.CuMatrixFloat(data_num, 2)
+for i = 0, data_num - 1, 1 do
+ if (refM[i][0] > 0) then
+ labelM[i][0] = 1
+ labelM[i][1] = 0
+ else
+ labelM[i][0] = 0
+ labelM[i][1] = 1
+ end
+end
+--[[global setting and data generation end]]--
+
+
+--[[begin network building]]--
+--parameters
+do
+ local affineL_ltp = nerv.LinearTransParam('AffineL_ltp', global_conf)
+ affineL_ltp.trans = nerv.CuMatrixFloat(input_dim, 2)
+ for i = 0, input_dim - 1, 1 do
+ for j = 0, 1, 1 do
+ affineL_ltp.trans[i][j] = math.random() - 0.5
+ end
+ end
+ local affineL_bp = nerv.BiasParam('AffineL_bp', global_conf)
+ affineL_bp.trans = nerv.CuMatrixFloat(1, 2)
+ for j = 0, 1, 1 do
+ affineL_bp.trans[j] = math.random() - 0.5
+ end
+
+ local chunk = nerv.ChunkFile(param_fn, 'w')
+ chunk:write_chunk(affineL_ltp)
+ chunk:write_chunk(affineL_bp)
+ chunk:close()
+
+ paramRepo = nerv.ParamRepo({param_fn})
+end
+
+--layers
+layerRepo = nerv.LayerRepo({
+ ["nerv.AffineLayer"] =
+ {
+ ["AffineL"] = {{["ltp"] = "AffineL_ltp", ["bp"] = "AffineL_bp"}, {["dim_in"] = {input_dim}, ["dim_out"] = {2}}},
+ },
+ ["nerv.SoftmaxCELayer"] =
+ {
+ ["SoftmaxL"] = {{}, {["dim_in"] = {2, 2}, ["dim_out"] = {}}}
+ },
+ }, paramRepo, global_conf)
+affineL = layerRepo:get_layer("AffineL")
+softmaxL = layerRepo:get_layer("SoftmaxL")
+print('layers initializing...')
+dagL = nerv.DAGLayer("DAGL", global_conf, {["dim_in"] = {input_dim, 2}, ["dim_out"] = {}, ["sub_layers"] = layerRepo,
+ ["connections"] = {
+ ["<input>[1]"] = "AffineL[1]",
+ ["AffineL[1]"] = "SoftmaxL[1]",
+ ["<input>[2]"] = "SoftmaxL[2]",
+ }})
+dagL:init(data_num)
+--affineL:init()
+--softmaxL:init()
+--[[network building end]]--
+
+
+--[[begin space allocation]]--
+print('network input&output&error space allocation...')
+dagL_input = {dataM, labelM}
+dagL_output = {}
+dagL_err = {}
+dagL_ierr = {nerv.CuMatrixFloat(data_num, input_dim), nerv.CuMatrixFloat(data_num, 2)}
+--[[space allocation end]]--
+
+
+--[[begin training]]--
+ce_last = 0
+for l = 0, 10, 1 do
+ dagL:propagate(dagL_input, dagL_output)
+ dagL:back_propagate(dagL_ierr, dagL_err, dagL_input, dagL_output)
+ dagL:update(dagL_err, dagL_input, dagL_output)
+
+ if (l % 2 == 0) then
+ nerv.utils.printf("training iteration %d finished\n", l)
+ nerv.utils.printf("cross entropy: %.8f\n", softmaxL.total_ce - ce_last)
+ --nerv.utils.printf("accurate labels: %d\n", calculate_accurate(output, labelM))
+ nerv.utils.printf("total frames processed: %.8f\n", softmaxL.total_frames)
+ end
+ ce_last = softmaxL.total_ce
+end
+--[[end training]]--
+``` \ No newline at end of file
diff --git a/nerv/doc/nerv_param.md b/nerv/doc/nerv_param.md
new file mode 100644
index 0000000..167cb11
--- /dev/null
+++ b/nerv/doc/nerv_param.md
@@ -0,0 +1,27 @@
+#The Nerv Parameter Package#
+Part of the [Nerv](../README.md) toolkit.
+
+##Description##
+###Class hierarchy###
+There is a base class __Nerv.Param__ defined in `layer/init.lua`.
+
+###Class hierarchy and their members###
+* __nerv.MatrixParam__ inherits __nerv.Param__
+ * `Matrix trans` stores the parameter matrix.
+* __nerv.LinearTransParam__ inherits __Nerv.MatrixParam__.
+* __Nerv.BiasParam__ inherits __Nerv.MatrixParam__.
+
+##Methods##
+* __void Param.\_\_init(Param self, string id, table global_conf)__
+Constructor of a __Param__, it will set `self.id` to be `id` and `self.gconf` to be `global_conf`.
+* __void Param.set_info(Param self, table info)__
+Set `self.info` to be `info`.
+* __table Param.get_info(Param self)__
+Returns `self.info`.
+* __void Param.read(Param self, ChunkData pcdata)__
+Abstract method.
+In this method, `self` should in turn call its members to load from `pcdata`.
+* __void Param.write(Param self, ChunkFileHandle pfhandle)__
+Abstract method.
+Save parameters to file. In this method, `self` should in turn call its members to save to `pfhandle`.
+
diff --git a/nerv/examples/asr_trainer.lua b/nerv/examples/asr_trainer.lua
new file mode 100644
index 0000000..a5727be
--- /dev/null
+++ b/nerv/examples/asr_trainer.lua
@@ -0,0 +1,106 @@
+function build_trainer(ifname)
+ local param_repo = nerv.ParamRepo()
+ param_repo:import(ifname, nil, gconf)
+ local sublayer_repo = make_sublayer_repo(param_repo)
+ local layer_repo = make_layer_repo(sublayer_repo, param_repo)
+ local crit = get_criterion_layer(sublayer_repo)
+ local network = get_network(layer_repo)
+ local input_order = get_input_order()
+ local iterative_trainer = function (prefix, scp_file, bp)
+ gconf.randomize = bp
+ -- build buffer
+ local buffer = make_buffer(make_readers(scp_file, layer_repo))
+ -- initialize the network
+ network:init(gconf.batch_size)
+ gconf.cnt = 0
+ err_input = {nerv.CuMatrixFloat(256, 1)}
+ err_input[1]:fill(1)
+ for data in buffer.get_data, buffer do
+ -- prine stat periodically
+ gconf.cnt = gconf.cnt + 1
+ if gconf.cnt == 1000 then
+ print_stat(sublayer_repo)
+ nerv.CuMatrix.print_profile()
+ nerv.CuMatrix.clear_profile()
+ gconf.cnt = 0
+ -- break
+ end
+ local input = {}
+-- if gconf.cnt == 100 then break end
+ for i, id in ipairs(input_order) do
+ if data[id] == nil then
+ nerv.error("input data %s not found", id)
+ end
+ table.insert(input, data[id])
+ end
+ local output = {nerv.CuMatrixFloat(256, 1)}
+ err_output = {input[1]:create()}
+ network:propagate(input, output)
+ if bp then
+ network:back_propagate(err_input, err_output, input, output)
+ network:update(err_input, input, output)
+ end
+ -- collect garbage in-time to save GPU memory
+ collectgarbage("collect")
+ end
+ print_stat(sublayer_repo)
+ nerv.CuMatrix.print_profile()
+ nerv.CuMatrix.clear_profile()
+ if (not bp) and prefix ~= nil then
+ nerv.info("writing back...")
+ local fname = string.format("%s_cv%.3f.nerv",
+ prefix, get_accuracy(sublayer_repo))
+ network:get_params():export(fname, nil)
+ end
+ return get_accuracy(sublayer_repo)
+ end
+ return iterative_trainer
+end
+
+dofile(arg[1])
+start_halving_inc = 0.5
+halving_factor = 0.6
+end_halving_inc = 0.1
+min_iter = 1
+max_iter = 20
+min_halving = 5
+gconf.batch_size = 256
+gconf.buffer_size = 81920
+
+local pf0 = gconf.initialized_param
+local trainer = build_trainer(pf0)
+--local trainer = build_trainer("c3.nerv")
+local accu_best = trainer(nil, gconf.cv_scp, false)
+local do_halving = false
+
+nerv.info("initial cross validation: %.3f", accu_best)
+for i = 1, max_iter do
+ nerv.info("[NN] begin iteration %d with lrate = %.6f", i, gconf.lrate)
+ local accu_tr = trainer(nil, gconf.tr_scp, true)
+ nerv.info("[TR] training set %d: %.3f", i, accu_tr)
+ local accu_new = trainer(
+ string.format("%s_%s_iter_%d_lr%f_tr%.3f",
+ string.gsub(
+ (string.gsub(pf0[1], "(.*/)(.*)", "%2")),
+ "(.*)%..*", "%1"),
+ os.date("%Y%m%d%H%M%S"),
+ i, gconf.lrate,
+ accu_tr),
+ gconf.cv_scp, false)
+ nerv.info("[CV] cross validation %d: %.3f", i, accu_new)
+ -- TODO: revert the weights
+ local accu_diff = accu_new - accu_best
+ if do_halving and accu_diff < end_halving_inc and i > min_iter then
+ break
+ end
+ if accu_diff < start_halving_inc and i >= min_halving then
+ do_halving = true
+ end
+ if do_halving then
+ gconf.lrate = gconf.lrate * halving_factor
+ end
+ if accu_new > accu_best then
+ accu_best = accu_new
+ end
+-- nerv.Matrix.print_profile()
+end
diff --git a/nerv/examples/chunk_file_example.lua b/nerv/examples/chunk_file_example.lua
new file mode 100644
index 0000000..5961c98
--- /dev/null
+++ b/nerv/examples/chunk_file_example.lua
@@ -0,0 +1,53 @@
+-- To define a readable and writable chunk, one must define a class with the
+-- following methods: __init(id, global_conf), read(handle), write(handle),
+-- get_info(), set_info(info) and an id attribute. This file demonstrates a
+-- basic chunk implementation which manages the I/O of a matrix
+
+local MatrixChunk = nerv.class("nerv.MatrixChunk")
+
+function MatrixChunk:__init(id, global_conf)
+ self.id = id
+ self.info = {}
+ self.gconf = global_conf
+end
+
+function MatrixChunk:read(handle)
+ -- pass the read handle to the matrix method
+ self.data = nerv.MMatrixFloat.load(handle)
+end
+
+function MatrixChunk:write(handle)
+ -- pass the write handle to the matrix method
+ self.data:save(handle)
+end
+
+function MatrixChunk:get_info()
+ return self.info
+end
+
+function MatrixChunk:set_info(info)
+ self.info = info
+end
+
+function MatrixChunk.create_from_matrix(id, mat)
+ local ins = nerv.MatrixChunk(id)
+ ins.data = mat
+ return ins
+end
+
+mat = nerv.MMatrixFloat(3, 4)
+for i = 0, 2 do
+ for j = 0, 3 do
+ mat[i][j] = i + j
+ end
+end
+
+cd = nerv.MatrixChunk.create_from_matrix("matrix1", mat)
+
+cf = nerv.ChunkFile("test.nerv", "w")
+cf:write_chunk(cd)
+cf:close()
+
+cf2 = nerv.ChunkFile("test.nerv", "r")
+cd2 = cf2:read_chunk("matrix1")
+print(cd2.data)
diff --git a/nerv/examples/cumatrix_example.lua b/nerv/examples/cumatrix_example.lua
new file mode 100644
index 0000000..544fc7f
--- /dev/null
+++ b/nerv/examples/cumatrix_example.lua
@@ -0,0 +1,31 @@
+m = 4
+n = 4
+fm = nerv.CuMatrixFloat(m, n)
+dm = nerv.CuMatrixDouble(m, n)
+for i = 0, m - 1 do
+ for j = 0, n - 1 do
+ -- local t = math.random(10)
+ t = i / (j + 1)
+ fm[i][j] = t
+ dm[i][j] = t
+ end
+end
+print(fm)
+fs = fm:create()
+fs:softmax(fm)
+-- print(fs)
+print(dm)
+ds = dm:create()
+ds:softmax(dm)
+-- print(ds)
+print(fs)
+print(fs + fs)
+print(ds + ds)
+print(fs - fs)
+print(ds - ds)
+
+a = fs:create()
+a:mul_elem(fs, fs)
+print(a)
+a:log_elem(fs)
+print(a)
diff --git a/nerv/examples/cumatrix_from_mmatrix.lua b/nerv/examples/cumatrix_from_mmatrix.lua
new file mode 100644
index 0000000..2309e14
--- /dev/null
+++ b/nerv/examples/cumatrix_from_mmatrix.lua
@@ -0,0 +1,32 @@
+m = 3
+n = 4
+fm = nerv.MMatrixFloat(m, n)
+dm = nerv.MMatrixDouble(m, n)
+for i = 0, m - 1 do
+ for j = 0, n - 1 do
+ -- local t = math.random(10)
+ t = i / (j + 1)
+ fm[i][j] = t
+ dm[i][j] = t
+ end
+end
+print(fm)
+print(dm)
+
+fc = nerv.CuMatrixFloat(m, n)
+dc = nerv.CuMatrixDouble(m, n)
+fc:copy_fromh(fm)
+dc:copy_fromh(dm)
+print("fc and dc")
+print(fc)
+print(dc)
+dc[1]:copy_tod(dc[0])
+print("dc[1] copied to dc[0]")
+print(dc)
+print("softmax of fc and dc")
+sfc = fc:create()
+sdc = dc:create()
+sfc:softmax(fc)
+print(sfc)
+sdc:softmax(dc)
+print(sdc)
diff --git a/nerv/examples/mmatrix_example.lua b/nerv/examples/mmatrix_example.lua
new file mode 100644
index 0000000..8ddfe84
--- /dev/null
+++ b/nerv/examples/mmatrix_example.lua
@@ -0,0 +1,20 @@
+m = 10
+n = 10
+fm = nerv.MMatrixFloat(m, n)
+dm = nerv.MMatrixDouble(m, n)
+for i = 0, m - 1 do
+ for j = 0, n - 1 do
+ -- local t = math.random(10)
+ t = i / (j + 1)
+ fm[i][j] = t
+ dm[i][j] = t
+ end
+end
+print("test fm:get_dataref_value:", fm:get_dataref_value())
+print("forced a garbade collect")
+collectgarbage("collect")
+print("test fm:get_dataref_value:", fm:get_dataref_value())
+print(fm)
+-- print(fm:softmax())
+print(dm)
+-- print(dm:softmax())
diff --git a/nerv/examples/oop_example.c b/nerv/examples/oop_example.c
new file mode 100644
index 0000000..59dfc5a
--- /dev/null
+++ b/nerv/examples/oop_example.c
@@ -0,0 +1,101 @@
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include "../common.h"
+
+#define SQR(x) ((x) * (x))
+
+const char *point_tname = "nerv.Point";
+const char *better_point_tname = "nerv.BetterPoint";
+
+typedef struct {
+ double x, y;
+} Point;
+
+static int point_norm (lua_State *L) {
+ Point *p = luaT_checkudata(L, 1, point_tname);
+ lua_pushnumber(L, sqrt(SQR(p->x) + SQR(p->y)));
+ return 1;
+}
+
+static int point_set_x (lua_State *L) {
+ Point *p = luaT_checkudata(L, 1, point_tname);
+ p->x = luaL_checknumber(L, 2);
+ return 0;
+}
+
+static int point_set_y (lua_State *L) {
+ Point *p = luaT_checkudata(L, 1, point_tname);
+ p->y = luaL_checknumber(L, 2);
+ return 0;
+}
+
+/* generic constructor */
+void point_new_(Point *self, double x, double y) {
+ self->x = x;
+ self->y = y;
+}
+
+int point_new(lua_State *L) {
+ /* `_new` function should create the object itself */
+ Point *self = (Point *)malloc(sizeof(Point));
+ point_new_(self, luaL_checknumber(L, 1), luaL_checknumber(L, 2));
+ luaT_pushudata(L, self, point_tname);
+ fprintf(stderr, "[example] %s constructor is invoked\n",
+ point_tname);
+ return 1;
+}
+
+static const luaL_Reg point_methods[] = {
+ {"set_x", point_set_x},
+ {"set_y", point_set_y},
+ {"norm", point_norm},
+ {NULL, NULL}
+};
+
+
+/* the subclass method overrides the one from baseclass */
+static int better_point_norm (lua_State *L) {
+ Point *p = luaT_checkudata(L, 1, point_tname);
+ lua_pushnumber(L, fabs(p->x) + fabs(p->y));
+ return 1;
+}
+
+int better_point_new(lua_State *L) {
+ /* `_new` function should create the object itself */
+ Point *self = (Point *)malloc(sizeof(Point));
+ point_new_(self, luaL_checknumber(L, 1), luaL_checknumber(L, 2));
+ luaT_pushudata(L, self, better_point_tname);
+ fprintf(stderr, "[example] %s constructor is invoked\n",
+ better_point_tname);
+ return 1;
+}
+
+static const luaL_Reg better_point_methods[] = {
+ {"norm", better_point_norm},
+ {NULL, NULL}
+};
+
+void nerv_point_init(lua_State *L) {
+ /* create a class and let luaT know */
+ luaT_newmetatable(L, point_tname, NULL, point_new, NULL, NULL);
+ /* register member functions */
+ luaL_register(L, NULL, point_methods);
+ /* keep the stack balanced, see `nerv.c` */
+ lua_pop(L, 1);
+}
+
+void nerv_better_point_init(lua_State *L) {
+ /* create a class and let luaT know */
+ luaT_newmetatable(L, better_point_tname, point_tname,
+ better_point_new, NULL, NULL);
+ /* register member functions */
+ luaL_register(L, NULL, better_point_methods);
+ /* keep the stack balanced, see `nerv.c` */
+ lua_pop(L, 1);
+}
+
+void nerv_example_init(lua_State *L) {
+ nerv_point_init(L);
+ nerv_better_point_init(L);
+}
diff --git a/nerv/examples/oop_example.lua b/nerv/examples/oop_example.lua
new file mode 100644
index 0000000..b753288
--- /dev/null
+++ b/nerv/examples/oop_example.lua
@@ -0,0 +1,16 @@
+p = nerv.Point(0, 0) -- create a Point instance
+print(p)
+print(p:norm()) -- get 2-norm of the Point
+p:set_x(1.0)
+p:set_y(2.0)
+print(p:norm()) -- get 2-norm of the Point
+
+bp = nerv.BetterPoint(1, 2)
+-- use methods from base class
+bp:set_x(1.0)
+bp:set_y(2.0)
+print(bp)
+print(bp:norm()) --get 1-norm of the Point
+
+print(p.__typename)
+print(bp.__typename)
diff --git a/nerv/examples/swb_baseline.lua b/nerv/examples/swb_baseline.lua
new file mode 100644
index 0000000..8b7e01a
--- /dev/null
+++ b/nerv/examples/swb_baseline.lua
@@ -0,0 +1,166 @@
+require 'speech.init'
+gconf = {lrate = 0.8, wcost = 1e-6, momentum = 0.9,
+ cumat_type = nerv.CuMatrixFloat,
+ mmat_type = nerv.MMatrixFloat,
+ frm_ext = 5,
+ tr_scp = "/slfs1/users/mfy43/swb_ivec/train_bp.scp",
+ cv_scp = "/slfs1/users/mfy43/swb_ivec/train_cv.scp",
+ htk_conf = "/slfs1/users/mfy43/swb_ivec/plp_0_d_a.conf",
+ initialized_param = {"/slfs1/users/mfy43/swb_init.nerv",
+ "/slfs1/users/mfy43/swb_global_transf.nerv"},
+ debug = false}
+
+function make_sublayer_repo(param_repo)
+ return nerv.LayerRepo(
+ {
+ -- global transf
+ ["nerv.BiasLayer"] =
+ {
+ blayer1 = {{bias = "bias1"}, {dim_in = {429}, dim_out = {429}}},
+ blayer2 = {{bias = "bias2"}, {dim_in = {429}, dim_out = {429}}}
+ },
+ ["nerv.WindowLayer"] =
+ {
+ wlayer1 = {{window = "window1"}, {dim_in = {429}, dim_out = {429}}},
+ wlayer2 = {{window = "window2"}, {dim_in = {429}, dim_out = {429}}}
+ },
+ -- biased linearity
+ ["nerv.AffineLayer"] =
+ {
+ affine0 = {{ltp = "affine0_ltp", bp = "affine0_bp"},
+ {dim_in = {429}, dim_out = {2048}}},
+ affine1 = {{ltp = "affine1_ltp", bp = "affine1_bp"},
+ {dim_in = {2048}, dim_out = {2048}}},
+ affine2 = {{ltp = "affine2_ltp", bp = "affine2_bp"},
+ {dim_in = {2048}, dim_out = {2048}}},
+ affine3 = {{ltp = "affine3_ltp", bp = "affine3_bp"},
+ {dim_in = {2048}, dim_out = {2048}}},
+ affine4 = {{ltp = "affine4_ltp", bp = "affine4_bp"},
+ {dim_in = {2048}, dim_out = {2048}}},
+ affine5 = {{ltp = "affine5_ltp", bp = "affine5_bp"},
+ {dim_in = {2048}, dim_out = {2048}}},
+ affine6 = {{ltp = "affine6_ltp", bp = "affine6_bp"},
+ {dim_in = {2048}, dim_out = {2048}}},
+ affine7 = {{ltp = "affine7_ltp", bp = "affine7_bp"},
+ {dim_in = {2048}, dim_out = {3001}}}
+ },
+ ["nerv.SigmoidLayer"] =
+ {
+ sigmoid0 = {{}, {dim_in = {2048}, dim_out = {2048}}},
+ sigmoid1 = {{}, {dim_in = {2048}, dim_out = {2048}}},
+ sigmoid2 = {{}, {dim_in = {2048}, dim_out = {2048}}},
+ sigmoid3 = {{}, {dim_in = {2048}, dim_out = {2048}}},
+ sigmoid4 = {{}, {dim_in = {2048}, dim_out = {2048}}},
+ sigmoid5 = {{}, {dim_in = {2048}, dim_out = {2048}}},
+ sigmoid6 = {{}, {dim_in = {2048}, dim_out = {2048}}}
+ },
+ ["nerv.SoftmaxCELayer"] =
+ {
+ ce_crit = {{}, {dim_in = {3001, 1}, dim_out = {1}, compressed = true}}
+ }
+ }, param_repo, gconf)
+end
+
+function make_layer_repo(sublayer_repo, param_repo)
+ return nerv.LayerRepo(
+ {
+ ["nerv.DAGLayer"] =
+ {
+ global_transf = {{}, {
+ dim_in = {429}, dim_out = {429},
+ sub_layers = sublayer_repo,
+ connections = {
+ ["<input>[1]"] = "blayer1[1]",
+ ["blayer1[1]"] = "wlayer1[1]",
+ ["wlayer1[1]"] = "blayer2[1]",
+ ["blayer2[1]"] = "wlayer2[1]",
+ ["wlayer2[1]"] = "<output>[1]"
+ }
+ }},
+ main = {{}, {
+ dim_in = {429, 1}, dim_out = {1},
+ sub_layers = sublayer_repo,
+ connections = {
+ ["<input>[1]"] = "affine0[1]",
+ ["affine0[1]"] = "sigmoid0[1]",
+ ["sigmoid0[1]"] = "affine1[1]",
+ ["affine1[1]"] = "sigmoid1[1]",
+ ["sigmoid1[1]"] = "affine2[1]",
+ ["affine2[1]"] = "sigmoid2[1]",
+ ["sigmoid2[1]"] = "affine3[1]",
+ ["affine3[1]"] = "sigmoid3[1]",
+ ["sigmoid3[1]"] = "affine4[1]",
+ ["affine4[1]"] = "sigmoid4[1]",
+ ["sigmoid4[1]"] = "affine5[1]",
+ ["affine5[1]"] = "sigmoid5[1]",
+ ["sigmoid5[1]"] = "affine6[1]",
+ ["affine6[1]"] = "sigmoid6[1]",
+ ["sigmoid6[1]"] = "affine7[1]",
+ ["affine7[1]"] = "ce_crit[1]",
+ ["<input>[2]"] = "ce_crit[2]",
+ ["ce_crit[1]"] = "<output>[1]"
+ }
+ }}
+ }
+ }, param_repo, gconf)
+end
+
+function get_criterion_layer(sublayer_repo)
+ return sublayer_repo:get_layer("ce_crit")
+end
+
+function get_network(layer_repo)
+ return layer_repo:get_layer("main")
+end
+
+function make_readers(scp_file, layer_repo)
+ return {
+ {reader = nerv.TNetReader(gconf,
+ {
+ id = "main_scp",
+ scp_file = scp_file,
+ conf_file = gconf.htk_conf,
+ frm_ext = gconf.frm_ext,
+ mlfs = {
+ phone_state = {
+ file = "/slfs1/users/mfy43/swb_ivec/ref.mlf",
+ format = "map",
+ format_arg = "/slfs1/users/mfy43/swb_ivec/dict",
+ dir = "*/",
+ ext = "lab"
+ }
+ },
+ global_transf = layer_repo:get_layer("global_transf")
+ }),
+ data = {main_scp = 429, phone_state = 1}}
+ }
+end
+
+function make_buffer(readers)
+ return nerv.SGDBuffer(gconf,
+ {
+ buffer_size = gconf.buffer_size,
+ randomize = gconf.randomize,
+ readers = readers
+ })
+end
+
+function get_input_order()
+ return {"main_scp", "phone_state"}
+end
+
+function get_accuracy(sublayer_repo)
+ local ce_crit = sublayer_repo:get_layer("ce_crit")
+ return ce_crit.total_correct / ce_crit.total_frames * 100
+end
+
+function print_stat(sublayer_repo)
+ local ce_crit = sublayer_repo:get_layer("ce_crit")
+ nerv.info("*** training stat begin ***")
+ nerv.printf("cross entropy:\t\t%.8f\n", ce_crit.total_ce)
+ nerv.printf("correct:\t\t%d\n", ce_crit.total_correct)
+ nerv.printf("frames:\t\t\t%d\n", ce_crit.total_frames)
+ nerv.printf("err/frm:\t\t%.8f\n", ce_crit.total_ce / ce_crit.total_frames)
+ nerv.printf("accuracy:\t\t%.3f%%\n", get_accuracy(sublayer_repo))
+ nerv.info("*** training stat end ***")
+end
diff --git a/nerv/examples/test_dnn_layers.lua b/nerv/examples/test_dnn_layers.lua
new file mode 100644
index 0000000..64c0dec
--- /dev/null
+++ b/nerv/examples/test_dnn_layers.lua
@@ -0,0 +1,78 @@
+require 'layer.affine'
+require 'layer.sigmoid'
+require 'layer.softmax_ce'
+
+global_conf = {lrate = 0.8, wcost = 1e-6,
+ momentum = 0.9, cumat_type = nerv.CuMatrixFloat}
+
+pf = nerv.ChunkFile("affine.param", "r")
+ltp = pf:read_chunk("a", global_conf)
+bp = pf:read_chunk("b", global_conf)
+
+-- print(bp.trans)
+
+af = nerv.AffineLayer("test", global_conf, {["ltp"] = ltp,
+ ["bp"] = bp,
+ dim_in = {429},
+ dim_out = {2048}})
+sg = nerv.SigmoidLayer("test2", global_conf, {dim_in = {2048},
+ dim_out = {2048}})
+sm = nerv.SoftmaxCELayer("test3", global_conf, {dim_in = {2048, 2048},
+ dim_out = {}})
+af:init()
+sg:init()
+sm:init()
+
+df = nerv.ChunkFile("input.param", "r")
+
+label = nerv.CuMatrixFloat(10, 2048)
+label:fill(0)
+for i = 0, 9 do
+ label[i][i] = 1.0
+end
+
+input1 = {df:read_chunk("input", global_conf).trans}
+output1 = {nerv.CuMatrixFloat(10, 2048)}
+input2 = output1
+output2 = {nerv.CuMatrixFloat(10, 2048)}
+input3 = {output2[1], label}
+output3 = {}
+err_input1 = {}
+err_output1 = {nerv.CuMatrixFloat(10, 2048)}
+err_input2 = err_output1
+err_output2 = {nerv.CuMatrixFloat(10, 2048)}
+err_input3 = err_output2
+err_output3 = {input1[1]:create()}
+
+for i = 0, 3 do
+ -- propagate
+ af:propagate(input1, output1)
+ sg:propagate(input2, output2)
+ sm:propagate(input3, output3)
+
+    -- back_propagate (layer signature: back_propagate(bp_err, next_bp_err, input, output),
+    -- so the incoming error goes first and the error passed downwards second)
+    sm:back_propagate(err_input1, err_output1, input3, output3)
+    sg:back_propagate(err_input2, err_output2, input2, output2)
+    af:back_propagate(err_input3, err_output3, input1, output1)
+
+ -- update
+ sm:update(err_input1, input3, output3)
+ sg:update(err_input2, input2, output2)
+ af:update(err_input3, input1, output1)
+
+
+ print("output1")
+ print(output1[1])
+ print("output2")
+ print(output2[1])
+ print("err_output1")
+ print(err_output1[1])
+ print("err_output2")
+ print(err_output2[1])
+ nerv.printf("cross entropy: %.8f\n", sm.total_ce)
+ nerv.printf("frames: %.8f\n", sm.total_frames)
+end
+print("linear")
+print(af.ltp.trans)
+print("linear2")
+print(af.bp.trans)
diff --git a/nerv/examples/test_nn_lib.lua b/nerv/examples/test_nn_lib.lua
new file mode 100644
index 0000000..5444810
--- /dev/null
+++ b/nerv/examples/test_nn_lib.lua
@@ -0,0 +1,164 @@
+require 'speech.init'
+gconf = {lrate = 0.8, wcost = 1e-6, momentum = 0.9,
+ cumat_type = nerv.CuMatrixFloat,
+ mmat_type = nerv.MMatrixFloat,
+ batch_size = 256}
+
+param_repo = nerv.ParamRepo({"converted.nerv", "global_transf.nerv"})
+sublayer_repo = nerv.LayerRepo(
+ {
+ -- global transf
+ ["nerv.BiasLayer"] =
+ {
+ blayer1 = {{bias = "bias1"}, {dim_in = {429}, dim_out = {429}}},
+ blayer2 = {{bias = "bias2"}, {dim_in = {429}, dim_out = {429}}}
+ },
+ ["nerv.WindowLayer"] =
+ {
+ wlayer1 = {{window = "window1"}, {dim_in = {429}, dim_out = {429}}},
+ wlayer2 = {{window = "window2"}, {dim_in = {429}, dim_out = {429}}}
+ },
+ -- biased linearity
+ ["nerv.AffineLayer"] =
+ {
+ affine0 = {{ltp = "affine0_ltp", bp = "affine0_bp"},
+ {dim_in = {429}, dim_out = {2048}}},
+ affine1 = {{ltp = "affine1_ltp", bp = "affine1_bp"},
+ {dim_in = {2048}, dim_out = {2048}}},
+ affine2 = {{ltp = "affine2_ltp", bp = "affine2_bp"},
+ {dim_in = {2048}, dim_out = {2048}}},
+ affine3 = {{ltp = "affine3_ltp", bp = "affine3_bp"},
+ {dim_in = {2048}, dim_out = {2048}}},
+ affine4 = {{ltp = "affine4_ltp", bp = "affine4_bp"},
+ {dim_in = {2048}, dim_out = {2048}}},
+ affine5 = {{ltp = "affine5_ltp", bp = "affine5_bp"},
+ {dim_in = {2048}, dim_out = {2048}}},
+ affine6 = {{ltp = "affine6_ltp", bp = "affine6_bp"},
+ {dim_in = {2048}, dim_out = {2048}}},
+ affine7 = {{ltp = "affine7_ltp", bp = "affine7_bp"},
+ {dim_in = {2048}, dim_out = {3001}}}
+ },
+ ["nerv.SigmoidLayer"] =
+ {
+ sigmoid0 = {{}, {dim_in = {2048}, dim_out = {2048}}},
+ sigmoid1 = {{}, {dim_in = {2048}, dim_out = {2048}}},
+ sigmoid2 = {{}, {dim_in = {2048}, dim_out = {2048}}},
+ sigmoid3 = {{}, {dim_in = {2048}, dim_out = {2048}}},
+ sigmoid4 = {{}, {dim_in = {2048}, dim_out = {2048}}},
+ sigmoid5 = {{}, {dim_in = {2048}, dim_out = {2048}}},
+ sigmoid6 = {{}, {dim_in = {2048}, dim_out = {2048}}}
+ },
+ ["nerv.SoftmaxCELayer"] =
+ {
+ softmax_ce0 = {{}, {dim_in = {3001, 1}, dim_out = {}, compressed = true}}
+ }
+ }, param_repo, gconf)
+
+layer_repo = nerv.LayerRepo(
+ {
+ ["nerv.DAGLayer"] =
+ {
+ global_transf = {{}, {
+ dim_in = {429}, dim_out = {429},
+ sub_layers = sublayer_repo,
+ connections = {
+ ["<input>[1]"] = "blayer1[1]",
+ ["blayer1[1]"] = "wlayer1[1]",
+ ["wlayer1[1]"] = "blayer2[1]",
+ ["blayer2[1]"] = "wlayer2[1]",
+ ["wlayer2[1]"] = "<output>[1]"
+ }
+ }},
+ main = {{}, {
+ dim_in = {429, 1}, dim_out = {},
+ sub_layers = sublayer_repo,
+ connections = {
+ ["<input>[1]"] = "affine0[1]",
+ ["affine0[1]"] = "sigmoid0[1]",
+ ["sigmoid0[1]"] = "affine1[1]",
+ ["affine1[1]"] = "sigmoid1[1]",
+ ["sigmoid1[1]"] = "affine2[1]",
+ ["affine2[1]"] = "sigmoid2[1]",
+ ["sigmoid2[1]"] = "affine3[1]",
+ ["affine3[1]"] = "sigmoid3[1]",
+ ["sigmoid3[1]"] = "affine4[1]",
+ ["affine4[1]"] = "sigmoid4[1]",
+ ["sigmoid4[1]"] = "affine5[1]",
+ ["affine5[1]"] = "sigmoid5[1]",
+ ["sigmoid5[1]"] = "affine6[1]",
+ ["affine6[1]"] = "sigmoid6[1]",
+ ["sigmoid6[1]"] = "affine7[1]",
+ ["affine7[1]"] = "softmax_ce0[1]",
+ ["<input>[2]"] = "softmax_ce0[2]"
+ }
+ }}
+ }
+ }, param_repo, gconf)
+
+tnet_reader = nerv.TNetReader(gconf,
+ {
+ id = "main_scp",
+ scp_file = "/slfs1/users/mfy43/swb_ivec/train_bp.scp",
+-- scp_file = "t.scp",
+ conf_file = "/slfs1/users/mfy43/swb_ivec/plp_0_d_a.conf",
+ frm_ext = 5,
+ mlfs = {
+ ref = {
+ file = "/slfs1/users/mfy43/swb_ivec/ref.mlf",
+ format = "map",
+ format_arg = "/slfs1/users/mfy43/swb_ivec/dict",
+ dir = "*/",
+ ext = "lab"
+ }
+ },
+ global_transf = layer_repo:get_layer("global_transf")
+ })
+
+buffer = nerv.SGDBuffer(gconf,
+ {
+ buffer_size = 81920,
+ randomize = true,
+ readers = {
+ { reader = tnet_reader,
+ data = {main_scp = 429, ref = 1}}
+ }
+ })
+
+sm = sublayer_repo:get_layer("softmax_ce0")
+main = layer_repo:get_layer("main")
+main:init(gconf.batch_size)
+gconf.cnt = 0
+-- data = buffer:get_data()
+-- input = {data.main_scp, data.ref}
+-- while true do
+for data in buffer.get_data, buffer do
+-- if gconf.cnt == 100 then break end
+-- gconf.cnt = gconf.cnt + 1
+
+ input = {data.main_scp, data.ref}
+ output = {}
+ err_input = {}
+ err_output = {input[1]:create()}
+
+ main:propagate(input, output)
+ main:back_propagate(err_output, err_input, input, output)
+ main:update(err_input, input, output)
+
+-- nerv.printf("cross entropy: %.8f\n", sm.total_ce)
+-- nerv.printf("correct: %d\n", sm.total_correct)
+-- nerv.printf("frames: %d\n", sm.total_frames)
+-- nerv.printf("err/frm: %.8f\n", sm.total_ce / sm.total_frames)
+-- nerv.printf("accuracy: %.8f\n", sm.total_correct / sm.total_frames)
+ collectgarbage("collect")
+end
+nerv.printf("cross entropy: %.8f\n", sm.total_ce)
+nerv.printf("correct: %d\n", sm.total_correct)
+nerv.printf("accuracy: %.3f%%\n", sm.total_correct / sm.total_frames * 100)
+nerv.printf("writing back...\n")
+cf = nerv.ChunkFile("output.nerv", "w")
+for i, p in ipairs(main:get_params()) do
+ print(p)
+ cf:write_chunk(p)
+end
+cf:close()
+nerv.Matrix.print_profile()
diff --git a/nerv/init.lua b/nerv/init.lua
new file mode 100644
index 0000000..89010a7
--- /dev/null
+++ b/nerv/init.lua
@@ -0,0 +1,128 @@
+require 'libnerv'
+
+function nerv.error(fmt, ...)
+ error(nerv.printf("[nerv] internal error: " .. fmt .. "\n", ...))
+end
+
+function nerv.error_method_not_implemented()
+ nerv.error("method not implemented");
+end
+
+function nerv.printf(fmt, ...)
+ io.write(string.format(fmt, ...))
+end
+
+function nerv.mesg_with_timestamp(fmt, ...)
+ nerv.printf(
+ string.format("(%s)[nerv] info: %s\n",
+ os.date("%H:%M:%S %F"), fmt), ...)
+end
+
+function nerv.info(fmt, ...)
+ nerv.printf(
+ string.format("(%s)[nerv] info: %s\n",
+ os.date("%H:%M:%S %F"), fmt), ...)
+end
+
+function nerv.warning(fmt, ...)
+ nerv.printf(
+ string.format("(%s)[nerv] warning: %s\n",
+ os.date("%H:%M:%S %F"), fmt), ...)
+end
+
+-- Torch C API wrapper
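+-- nerv.class(tname, parenttname) registers a new class named tname (optionally
+-- inheriting from parenttname) through the luaT bindings and returns its
+-- metatable; calling e.g. nerv.MyClass(...) afterwards creates an instance and
+-- invokes its __init method (the class name here is only illustrative).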
+function nerv.class(tname, parenttname)
+
+ local function constructor(...)
+ local self = {}
+ nerv.setmetatable(self, tname)
+ if self.__init then
+ self:__init(...)
+ end
+ return self
+ end
+
+ local function factory()
+ local self = {}
+ nerv.setmetatable(self, tname)
+ return self
+ end
+
+ local mt = nerv.newmetatable(tname, parenttname, constructor, nil, factory)
+ local mpt
+ if parenttname then
+ mpt = nerv.getmetatable(parenttname)
+ end
+ return mt, mpt
+end
+
+function table.val_to_str(v)
+ if "string" == type(v) then
+ v = string.gsub(v, "\n", "\\n")
+ if string.match(string.gsub(v,"[^'\"]",""), '^"+$') then
+ return "'" .. v .. "'"
+ end
+ return '"' .. string.gsub(v,'"', '\\"') .. '"'
+ else
+ return "table" == type(v) and table.tostring(v) or
+ tostring(v)
+ end
+end
+
+function table.key_to_str (k)
+ if "string" == type(k) and string.match(k, "^[_%a][_%a%d]*$") then
+ return k
+ else
+ return "[" .. table.val_to_str(k) .. "]"
+ end
+end
+
+function table.tostring(tbl)
+ local result, done = {}, {}
+ for k, v in ipairs(tbl) do
+ table.insert(result, table.val_to_str(v))
+ done[k] = true
+ end
+ for k, v in pairs(tbl) do
+ if not done[k] then
+ table.insert(result,
+ table.key_to_str(k) .. "=" .. table.val_to_str(v))
+ end
+ end
+ return "{" .. table.concat(result, ",") .. "}"
+end
+
+function nerv.get_type(tname)
+ return assert(loadstring("return " .. tname))()
+end
+
+function nerv.is_type(obj, tname)
+ local mt0 = nerv.getmetatable(tname)
+ local mt = getmetatable(obj)
+ while mt do
+ if mt == mt0 then
+ return true
+ end
+ mt = getmetatable(mt)
+ end
+ return false
+end
+
+function nerv.dirname(filename)
+ if filename:match(".-/.-") then
+ local name = string.gsub(filename, "(.*/)(.*)", "%1")
+ return name
+ else
+ return ''
+ end
+end
+
+function nerv.include(filename)
+ local caller = debug.getinfo(2, "S").source:sub(2)
+ dofile(nerv.dirname(caller) .. filename)
+end
+
+nerv.include('matrix/init.lua')
+nerv.include('io/init.lua')
+nerv.include('layer/init.lua')
+nerv.include('nn/init.lua')
diff --git a/nerv/io/chunk_file.c b/nerv/io/chunk_file.c
new file mode 100644
index 0000000..c0b6b9f
--- /dev/null
+++ b/nerv/io/chunk_file.c
@@ -0,0 +1,325 @@
+#include <stdio.h>
+#include <ctype.h>
+#include <string.h>
+#include "../common.h"
+#include "chunk_file.h"
+
+#define INVALID_FORMAT_ERROR(fn) \
+ nerv_error(L, "Invalid chunk file: %s", fn)
+#define CHECK_FORMAT(exp, ret, fname) \
+ do { \
+        if ((exp) != (ret)) INVALID_FORMAT_ERROR(fname); \
+ } while (0)
+#define CHECK_FILE_OPEN(pfh) \
+ do { \
+ if ((pfh)->closed) \
+ nerv_error(L, "operations on a closed file"); \
+ } while (0)
+
+const char *nerv_chunk_file_tname = "nerv.ChunkFile";
+const char *nerv_chunk_file_handle_tname = "nerv.ChunkFileHandle";
+const char *nerv_chunk_info_tname = "nerv.ChunkInfo";
+const char *nerv_chunk_data_tname = "nerv.ChunkData";
+
+#define PARAM_HEADER_SIZE 16
+
+enum {
+ NORMAL,
+ INVALID_FORMAT,
+ END_OF_FILE,
+ SECTION_OVERFLOW,
+ WRITE_ERROR
+};
+
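+/* On-disk chunk layout: a fixed 16-byte plain-text header of the form
+ * "[<size>]\n" (a zero-padded decimal giving the total chunk length in bytes,
+ * header and metadata line included), followed by a single-line Lua table
+ * holding the chunk metadata, followed by the raw chunk payload. */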
+size_t read_chunk_header_plain(FILE *fp, int *status) {
+ static char buff[PARAM_HEADER_SIZE];
+ int i;
+ size_t size = 0;
+ *status = NORMAL;
+ if (fread(buff, 1, PARAM_HEADER_SIZE, fp) != PARAM_HEADER_SIZE)
+ {
+ if (feof(fp)) *status = END_OF_FILE;
+ else *status = INVALID_FORMAT;
+ }
+ for (i = 0; i < PARAM_HEADER_SIZE; i++)
+ if (isdigit(buff[i]))
+ size = size * 10 + buff[i] - '0';
+/* fprintf(stderr, "header: %lu\n", size); */
+ return size;
+}
+
+#define CHECK_WRITE(status) \
+ do { \
+ if (status == SECTION_OVERFLOW) \
+ nerv_error(L, "section overflowed"); \
+ else if (status == WRITE_ERROR) \
+ nerv_error(L, "error while writing"); \
+ } while (0)
+
+void write_chunk_header_plain(FILE *fp, size_t size, int *status) {
+ static char buff[PARAM_HEADER_SIZE];
+ int i;
+ *status = NORMAL;
+ for (i = PARAM_HEADER_SIZE - 3; i > 0; i--, size /= 10)
+ buff[i] = size % 10 + '0';
+ if (size)
+ {
+ *status = SECTION_OVERFLOW;
+ return;
+ }
+ buff[0] = '[';
+ buff[PARAM_HEADER_SIZE - 2] = ']';
+ buff[PARAM_HEADER_SIZE - 1] = '\n';
+ if (fwrite(buff, 1, PARAM_HEADER_SIZE, fp) != PARAM_HEADER_SIZE)
+ {
+ *status = WRITE_ERROR;
+ return;
+ }
+}
+
+ChunkData *get_chunk_data(FILE *fp, ChunkInfo *info) {
+ ChunkData *pcd = (ChunkData *)malloc(sizeof(ChunkData));
+ pcd->data = (char *)malloc(info->length);
+ pcd->fp = fmemopen(pcd->data, info->length, "r");
+ assert(fseeko(fp, info->offset, SEEK_SET) == 0);
+ if (fread(pcd->data, 1, info->length, fp) != (size_t)info->length)
+ return NULL;
+ return pcd;
+}
+
+const char *read_chunk_metadata(lua_State *L, FILE *fp, const char *fn) {
+#define LINEBUFF_SIZE 1024
+ static char buff[7 + LINEBUFF_SIZE] = "return ";
+ CHECK_FORMAT(fgets(buff + 7, LINEBUFF_SIZE, fp), buff + 7, fn);
+ /* fprintf(stderr, "metadata: %s\n", buff); */
+ return buff;
+}
+
+void write_chunk_metadata(FILE *fp, const char *metadata_str, int *status) {
+ size_t size = strlen(metadata_str);
+ *status = NORMAL;
+ if (fwrite(metadata_str, 1, size, fp) != size ||
+ fprintf(fp, "\n") < 0)
+ {
+ *status = WRITE_ERROR;
+ return;
+ }
+ /* fprintf(stderr, "metadata: %s\n", metadata_str); */
+}
+
+
+int nerv_chunk_file_open_write(lua_State *L, const char *fn) {
+ FILE *fp = fopen(fn, "w");
+ ChunkFileHandle *lfp;
+ if (!fp) nerv_error(L, "Error while opening chunk file: %s", fn);
+ lfp = (ChunkFileHandle *)malloc(sizeof(ChunkFileHandle));
+ lfp->fp = fp;
+ lfp->closed = 0;
+ luaT_pushudata(L, lfp, nerv_chunk_file_handle_tname);
+ lua_setfield(L, -2, "handle");
+ luaT_pushmetatable(L, nerv_chunk_file_tname);
+ lua_setmetatable(L, -2);
+ return 1;
+}
+
+int nerv_chunk_file_open_read(lua_State *L, const char *fn) {
+ FILE *fp = fopen(fn, "r");
+ int i, status;
+ size_t chunk_len;
+ off_t offset;
+ ChunkFileHandle *lfp;
+
+ if (!fp) nerv_error(L, "Error while opening chunk file: %s", fn);
+ offset = ftello(fp);
+ lua_newtable(L);
+ /* fprintf(stderr, "%d\n", (int)offset); */
+ for (i = 0;; offset += chunk_len, i++)
+ {
+ ChunkInfo *pci;
+ /* fprintf(stderr, "reading chunk %d from %d\n", i, (int)offset); */
+        /* skip to the beginning of chunk i */
+ CHECK_FORMAT(fseeko(fp, offset, SEEK_SET), 0, fn);
+ /* read header */
+ chunk_len = read_chunk_header_plain(fp, &status);
+ if (status == END_OF_FILE) break;
+ else if (status == INVALID_FORMAT)
+ INVALID_FORMAT_ERROR(fn);
+ /* read metadata */
+ luaL_loadstring(L, read_chunk_metadata(L, fp, fn));
+ CHECK_FORMAT(lua_pcall(L, 0, 1, 0), 0, fn);
+ CHECK_FORMAT(lua_istable(L, -1), 1, fn);
+ /* stack: obj_table, metadata */
+ /* chunk info */
+ pci = (ChunkInfo *)malloc(sizeof(ChunkInfo));
+ pci->offset = ftello(fp);
+ pci->length = chunk_len - (pci->offset - offset);
+ /* fprintf(stderr, "%d + %d (skip %lu)\n", (int)pci->offset,
+ (int)pci->length, chunk_len); */
+ luaT_pushudata(L, pci, nerv_chunk_info_tname);
+ lua_setfield(L, -2, "chunk");
+ /* stack: obj_table, metadata */
+ /* get id */
+ lua_getfield(L, -1, "id");
+ /* stack: obj_table, metadata, id */
+ if (!lua_isstring(L, -1))
+ nerv_error(L, "id field in metadata must be a string");
+ lua_pushvalue(L, -1);
+ /* stack: obj_table, metadata, id, id */
+ lua_gettable(L, -4);
+ /* stack: obj_table, metadata, id, obj[id] */
+ if (!lua_isnil(L, -1))
+ nerv_error(L, "conflicting id");
+ lua_pop(L, 1);
+ /* stack: obj_table, metadata, id */
+ lua_pushvalue(L, -2);
+ /* stack: obj_table, metadata, id, metadata */
+ lua_settable(L, -4);
+ /* stack: obj_table, metadata */
+ lua_pop(L, 1);
+ }
+ lua_setfield(L, -2, "metadata");
+ lfp = (ChunkFileHandle *)malloc(sizeof(ChunkFileHandle));
+ lfp->fp = fp;
+ lfp->closed = 0;
+ luaT_pushudata(L, lfp, nerv_chunk_file_handle_tname);
+ lua_setfield(L, -2, "handle");
+ luaT_pushmetatable(L, nerv_chunk_file_tname);
+ lua_setmetatable(L, -2);
+ return 1;
+}
+
+int nerv_chunk_file_new_(lua_State *L, const char *fn, const char *mode) {
+ int rd = 1, bin = 0;
+ size_t i, len = strlen(mode);
+ for (i = 0; i < len; i++)
+ switch (mode[i])
+ {
+ case 'r': rd = 1; break;
+ case 'w': rd = 0; break;
+ case 'b': bin = 1; break;
+ }
+ return rd ? nerv_chunk_file_open_read(L, fn) : \
+ nerv_chunk_file_open_write(L, fn);
+}
+
+int nerv_chunk_file___init(lua_State *L) {
+ lua_pushvalue(L, 1);
+ return nerv_chunk_file_new_(L, luaL_checkstring(L, 2),
+ luaL_checkstring(L, 3));
+}
+
+int nerv_chunk_file_new(lua_State *L) {
+ lua_newtable(L);
+ return nerv_chunk_file_new_(L, luaL_checkstring(L, 1),
+ luaL_checkstring(L, 2));
+}
+
+int nerv_chunk_file_write_chunkdata(lua_State *L) {
+ ChunkFileHandle *pfh;
+ int status;
+ off_t start;
+ size_t size;
+ const char *metadata_str = lua_tolstring(L, 2, NULL);
+ lua_getfield(L, 1, "handle");
+ pfh = luaT_checkudata(L, -1, nerv_chunk_file_handle_tname);
+ CHECK_FILE_OPEN(pfh);
+ start = ftello(pfh->fp);
+ write_chunk_header_plain(pfh->fp, 0, &status); /* fill zeros */
+ CHECK_WRITE(status);
+ write_chunk_metadata(pfh->fp, metadata_str, &status);
+ CHECK_WRITE(status);
+ lua_pushvalue(L, 3);
+ lua_getfield(L, -1, "write");
+ if (!lua_isfunction(L, -1))
+ nerv_error(L, "\"write\" method must be implemented");
+ lua_pushvalue(L, -2);
+ lua_pushvalue(L, 4); /* pass handle as parameter to write() */
+ lua_call(L, 2, 0); /* let the write() to write */
+ lua_pop(L, 1);
+ size = ftello(pfh->fp) - start;
+ fseeko(pfh->fp, start, SEEK_SET);
+    /* write the calculated size */
+ write_chunk_header_plain(pfh->fp, size, &status);
+ CHECK_WRITE(status);
+ fseeko(pfh->fp, 0, SEEK_END);
+ return 0;
+}
+
+int nerv_chunk_file_get_chunkdata(lua_State *L) {
+ ChunkFileHandle *pfh;
+ ChunkInfo *pci;
+ ChunkData *pcd;
+ const char *id = luaL_checkstring(L, 2);
+
+ lua_getfield(L, 1, "handle");
+ pfh = luaT_checkudata(L, -1, nerv_chunk_file_handle_tname);
+ CHECK_FILE_OPEN(pfh);
+ lua_pop(L, 1); /* pop handle */
+ lua_getfield(L, 1, "metadata");
+ /* now stack: self, k, metadata */
+ lua_getfield(L, -1, id);
+ /* now stack: self, k, metadata, kth{} */
+    if (lua_isnil(L, -1)) /* no chunk with the id */
+ return 0;
+ lua_getfield(L, -1, "chunk");
+ pci = luaT_checkudata(L, -1, nerv_chunk_info_tname);
+ if (!(pcd = get_chunk_data(pfh->fp, pci)))
+ nerv_error(L, "unexpected end of file");
+ luaT_pushudata(L, pcd, nerv_chunk_data_tname);
+ return 1;
+}
+
+int nerv_chunk_file_close(lua_State *L) {
+ ChunkFileHandle *pfh;
+ lua_getfield(L, 1, "handle");
+ pfh = luaT_checkudata(L, -1, nerv_chunk_file_handle_tname);
+ CHECK_FILE_OPEN(pfh);
+ fclose(pfh->fp);
+ pfh->closed = 1;
+ return 0;
+}
+
+int nerv_chunk_file_handle_destroy(lua_State *L) {
+ ChunkFileHandle *pfh = luaT_checkudata(L, 1,
+ nerv_chunk_file_handle_tname);
+ if (!pfh->closed) fclose(pfh->fp);
+ free(pfh);
+ return 0;
+}
+
+static int nerv_chunk_info_destroy(lua_State *L) {
+ ChunkInfo *pci = luaT_checkudata(L, 1, nerv_chunk_info_tname);
+ free(pci);
+ return 0;
+}
+
+static int nerv_chunk_data_destroy(lua_State *L) {
+ ChunkData *pcd = luaT_checkudata(L, 1, nerv_chunk_data_tname);
+ fclose(pcd->fp);
+ free(pcd->data);
+ free(pcd);
+ return 0;
+}
+
+static const luaL_Reg nerv_chunk_file_methods[] = {
+ {"get_chunkdata", nerv_chunk_file_get_chunkdata},
+ {"_write_chunkdata", nerv_chunk_file_write_chunkdata},
+ {"close", nerv_chunk_file_close},
+ {"__init", nerv_chunk_file___init},
+ {NULL, NULL}
+};
+
+void nerv_chunk_file_init(lua_State *L) {
+ luaT_newmetatable(L, nerv_chunk_file_tname, NULL,
+ nerv_chunk_file_new,
+ NULL, NULL);
+ luaL_register(L, NULL, nerv_chunk_file_methods);
+ lua_pop(L, 1);
+ luaT_newmetatable(L, nerv_chunk_file_handle_tname, NULL,
+ NULL, nerv_chunk_file_handle_destroy, NULL);
+ luaT_newmetatable(L, nerv_chunk_info_tname, NULL,
+ NULL, nerv_chunk_info_destroy, NULL);
+ luaT_newmetatable(L, nerv_chunk_data_tname, NULL,
+ NULL, nerv_chunk_data_destroy, NULL);
+}
+
diff --git a/nerv/io/chunk_file.h b/nerv/io/chunk_file.h
new file mode 100644
index 0000000..9bae59d
--- /dev/null
+++ b/nerv/io/chunk_file.h
@@ -0,0 +1,23 @@
+#ifndef NERV_CHUNK_FILE_H
+#define NERV_CHUNK_FILE_H
+
+extern const char *nerv_chunk_file_tname;
+extern const char *nerv_chunk_file_handle_tname;
+extern const char *nerv_chunk_info_tname;
+extern const char *nerv_chunk_data_tname;
+
+typedef struct ChunkFileHandle {
+ FILE *fp;
+ int closed;
+} ChunkFileHandle;
+
+typedef struct ChunkInfo {
+ off_t offset, length;
+} ChunkInfo;
+
+typedef struct ChunkData {
+ FILE *fp;
+ char *data;
+} ChunkData;
+
+#endif
diff --git a/nerv/io/init.c b/nerv/io/init.c
new file mode 100644
index 0000000..70585f7
--- /dev/null
+++ b/nerv/io/init.c
@@ -0,0 +1,6 @@
+#include "../common.h"
+
+extern void nerv_chunk_file_init(lua_State *L);
+void nerv_io_init(lua_State *L) {
+ nerv_chunk_file_init(L);
+}
diff --git a/nerv/io/init.lua b/nerv/io/init.lua
new file mode 100644
index 0000000..647ff93
--- /dev/null
+++ b/nerv/io/init.lua
@@ -0,0 +1,55 @@
+function nerv.ChunkFile:write_chunkdata(metadata, writer)
+ if type(metadata) ~= "table" then
+ nerv.error("metadata should be a Lua table")
+ return
+ end
+ return self:_write_chunkdata(table.tostring(metadata), writer)
+end
+
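+-- Typical round trip: open a ChunkFile in "w" mode and write_chunk() each
+-- parameter to serialize it under its id; later open the file in "r" mode and
+-- read_chunk(id, gconf) reconstructs the chunk from its recorded type name.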
+function nerv.ChunkFile:write_chunk(chunk)
+ local id = chunk.id
+ local type = chunk.__typename
+ if id == nil then
+ nerv.error("id of chunk %s must be specified", type)
+ end
+ self:write_chunkdata({id = id,
+ type = type,
+ info = chunk:get_info()}, chunk)
+end
+
+function nerv.ChunkFile:read_chunk(id, global_conf)
+ if self.metadata == nil then
+ nerv.error("wrong file opening mode")
+ end
+ local metadata = self.metadata[id]
+ if metadata == nil then
+ nerv.error("chunk with id %s does not exist", id)
+ end
+ local chunk_type = nerv.get_type(metadata.type)
+ local chunk = chunk_type(id, global_conf)
+ chunk:set_info(metadata.info)
+ chunk:read(self:get_chunkdata(id))
+ return chunk
+end
+
+local DataReader = nerv.class("nerv.DataReader")
+
+function DataReader:__init(global_conf, reader_conf)
+ nerv.error_method_not_implemented()
+end
+
+function DataReader:get_data()
+ nerv.error_method_not_implemented()
+end
+
+local DataBuffer = nerv.class("nerv.DataBuffer")
+
+function DataBuffer:__init(global_conf, buffer_conf)
+ nerv.error_method_not_implemented()
+end
+
+function DataBuffer:get_batch()
+ nerv.error_method_not_implemented()
+end
+
+nerv.include('sgd_buffer.lua')
diff --git a/nerv/io/sgd_buffer.lua b/nerv/io/sgd_buffer.lua
new file mode 100644
index 0000000..f4f7dfe
--- /dev/null
+++ b/nerv/io/sgd_buffer.lua
@@ -0,0 +1,111 @@
+local SGDBuffer = nerv.class("nerv.SGDBuffer", "nerv.DataBuffer")
+
+function SGDBuffer:__init(global_conf, buffer_conf)
+ self.gconf = global_conf
+ self.buffer_size = math.floor(buffer_conf.buffer_size /
+ global_conf.batch_size) * global_conf.batch_size
+ self.randomize = buffer_conf.randomize
+ if self.randomize == nil then
+ self.randomize = false
+ end
+ self.head = 0
+ self.tail = 0
+ self.readers = {}
+ for i, reader_spec in ipairs(buffer_conf.readers) do
+ local buffs = {}
+ for id, width in pairs(reader_spec.data) do
+ buffs[id] = {data = global_conf.mmat_type(self.buffer_size, width),
+ leftover = nil,
+ width = width}
+ end
+ table.insert(self.readers, {buffs = buffs,
+ reader = reader_spec.reader,
+ tail = 0,
+ has_leftover = false})
+ end
+end
+
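+-- refill the per-reader host buffers until they hold buffer_size rows (or the
+-- readers run dry); rows that do not fit are kept as leftovers for the next
+-- call, and self.tail is set to the number of rows actually available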
+function SGDBuffer:saturate()
+ local buffer_size = self.buffer_size
+ self.head = 0
+ self.tail = buffer_size
+ for i, reader in ipairs(self.readers) do
+ reader.tail = 0
+ if reader.has_leftover then
+ local lrow
+ for id, buff in pairs(reader.buffs) do
+ lrow = buff.leftover:nrow()
+ if lrow > buffer_size then
+ nerv.error("buffer size is too small to contain leftovers")
+ end
+ buff.data:copy_from(buff.leftover, 0, lrow)
+ buff.leftover = nil
+ end
+ nerv.printf("leftover: %d\n", lrow)
+ reader.tail = lrow
+ reader.has_leftover = false
+ end
+ while reader.tail < buffer_size do
+ local data = reader.reader:get_data()
+ if data == nil then
+ break
+ end
+ local drow = nil
+ for id, d in pairs(data) do
+ if drow == nil then
+ drow = d:nrow()
+ elseif d:nrow() ~= drow then
+ nerv.error("reader provides with inconsistent rows of data")
+ end
+ end
+ local remain = buffer_size - reader.tail
+ if drow > remain then
+ for id, buff in pairs(reader.buffs) do
+ local d = data[id]
+ if d == nil then
+ nerv.error("reader does not provide data for %s", id)
+ end
+ buff.leftover = self.gconf.mmat_type(drow - remain,
+ buff.width)
+ buff.leftover:copy_from(d, remain, drow)
+ end
+ drow = remain
+ reader.has_leftover = true
+ end
+ for id, buff in pairs(reader.buffs) do
+ buff.data:copy_from(data[id], 0, drow, reader.tail)
+ end
+ reader.tail = reader.tail + drow
+ end
+ self.tail = math.min(self.tail, reader.tail)
+ end
+ self.rand_map = nerv.MMatrixInt.perm_gen(self.tail) -- generate shuffled index
+ collectgarbage("collect")
+ return self.tail >= self.gconf.batch_size
+end
+
+function SGDBuffer:get_data()
+ local batch_size = self.gconf.batch_size
+ if self.head >= self.tail then -- buffer is empty
+ if not self:saturate() then
+ return nil -- the remaining data cannot build a batch
+ end
+ end
+ if self.head + batch_size > self.tail then
+ return nil -- the remaining data cannot build a batch
+ end
+ local res = {}
+ for i, reader in ipairs(self.readers) do
+ for id, buff in pairs(reader.buffs) do
+ local batch = self.gconf.cumat_type(batch_size, buff.width)
+ if self.randomize then
+ batch:copy_rows_fromh_by_idx(buff.data, self.rand_map, self.head)
+ else
+ batch:copy_fromh(buff.data, self.head, self.head + batch_size)
+ end
+ res[id] = batch
+ end
+ end
+ self.head = self.head + batch_size
+ return res
+end
diff --git a/nerv/layer/affine.lua b/nerv/layer/affine.lua
new file mode 100644
index 0000000..00cbcfb
--- /dev/null
+++ b/nerv/layer/affine.lua
@@ -0,0 +1,91 @@
+local MatrixParam = nerv.class('nerv.MatrixParam', 'nerv.Param')
+local LinearTransParam = nerv.class('nerv.LinearTransParam', 'nerv.MatrixParam')
+local BiasParam = nerv.class('nerv.BiasParam', 'nerv.MatrixParam')
+local AffineLayer = nerv.class('nerv.AffineLayer', 'nerv.Layer')
+
+function MatrixParam:read(handle)
+ self.trans = self.gconf.cumat_type.new_from_host(
+ nerv.MMatrixFloat.load(handle))
+end
+
+function MatrixParam:write(handle)
+ self.trans:new_to_host():save(handle)
+end
+
+function MatrixParam:train_init()
+ self.correction = self.trans:create()
+ self.correction:fill(0)
+end
+
+function MatrixParam:update(gradient)
+ local gconf = self.gconf
+ self.correction:add(self.correction, gradient, gconf.momentum, 1.0)
+ -- momentum gain
+ local mmt_gain = 1.0 / (1.0 - gconf.momentum);
+ local n = self.gconf.batch_size * mmt_gain
+ -- perform update
+ self.trans:add(self.trans, self.correction, 1.0, -gconf.lrate / n)
+end
+
+function LinearTransParam:update(gradient)
+ MatrixParam.update(self, gradient)
+ local gconf = self.gconf
+ -- weight decay
+ self.trans:add(self.trans, self.trans, 1.0, -gconf.lrate * gconf.wcost)
+end
+
+function AffineLayer:__init(id, global_conf, layer_conf)
+ self.id = id
+ self.ltp = layer_conf.ltp
+ self.bp = layer_conf.bp
+ self.dim_in = layer_conf.dim_in
+ self.dim_out = layer_conf.dim_out
+ self.gconf = global_conf
+ self:check_dim_len(1, 1) -- exactly one input and one output
+ self.direct_update = layer_conf.direct_update
+end
+
+function AffineLayer:init(batch_size)
+ if self.ltp.trans:ncol() ~= self.bp.trans:ncol() then
+ nerv.error("mismatching dimensions of linear transform and bias paramter")
+ end
+ if self.dim_in[1] ~= self.ltp.trans:nrow() then
+ nerv.error("mismatching dimensions of linear transform parameter and input")
+ end
+ if self.dim_out[1] ~= self.ltp.trans:ncol() then
+ nerv.error("mismatching dimensions of linear transform parameter and output")
+ end
+ self.ltp_grad = self.ltp.trans:create()
+ self.ltp:train_init()
+ self.bp:train_init()
+end
+
+function AffineLayer:update(bp_err, input, output)
+    if self.direct_update then
+        local gconf = self.gconf -- do not rely on a global gconf
+        self.ltp.correction:mul(input[1], bp_err[1], 1.0, gconf.momentum, 'T', 'N')
+        -- momentum gain
+        local mmt_gain = 1.0 / (1.0 - gconf.momentum)
+        local n = gconf.batch_size * mmt_gain
+        -- perform update
+        self.ltp.trans:add(self.ltp.trans, self.ltp.correction, 1.0, -gconf.lrate / n)
+ else
+ self.ltp_grad:mul(input[1], bp_err[1], 1.0, 0.0, 'T', 'N')
+ self.ltp:update(self.ltp_grad)
+ end
+ self.bp:update(bp_err[1]:colsum())
+end
+
+function AffineLayer:propagate(input, output)
+ -- apply linear transform
+ output[1]:mul(input[1], self.ltp.trans, 1.0, 0.0, 'N', 'N')
+ -- add bias
+ output[1]:add_row(self.bp.trans, 1.0)
+end
+
+function AffineLayer:back_propagate(bp_err, next_bp_err, input, output)
+ next_bp_err[1]:mul(bp_err[1], self.ltp.trans, 1.0, 0.0, 'N', 'T')
+end
+
+function AffineLayer:get_params()
+ return nerv.ParamRepo({self.ltp, self.bp})
+end
diff --git a/nerv/layer/bias.lua b/nerv/layer/bias.lua
new file mode 100644
index 0000000..c99274d
--- /dev/null
+++ b/nerv/layer/bias.lua
@@ -0,0 +1,28 @@
+local BiasLayer = nerv.class("nerv.BiasLayer", "nerv.Layer")
+
+function BiasLayer:__init(id, global_conf, layer_conf)
+ self.id = id
+ self.gconf = global_conf
+ self.bias = layer_conf.bias
+ self.dim_in = layer_conf.dim_in
+ self.dim_out = layer_conf.dim_out
+ self:check_dim_len(1, 1)
+end
+
+function BiasLayer:init()
+ if self.dim_in[1] ~= self.bias.trans:ncol() then
+ nerv.error("mismatching dimensions of input and bias parameter")
+ end
+ if self.dim_out[1] ~= self.bias.trans:ncol() then
+ nerv.error("mismatching dimensions of output and bias parameter")
+ end
+end
+
+function BiasLayer:propagate(input, output)
+ output[1]:copy_fromd(input[1])
+ output[1]:add_row(self.bias.trans, 1.0)
+end
+
+function BiasLayer:get_params()
+ return nerv.ParamRepo({self.bias})
+end
diff --git a/nerv/layer/combiner.lua b/nerv/layer/combiner.lua
new file mode 100644
index 0000000..7bd7617
--- /dev/null
+++ b/nerv/layer/combiner.lua
@@ -0,0 +1,59 @@
+local CombinerLayer = nerv.class('nerv.CombinerLayer', 'nerv.Layer')
+
+function CombinerLayer:__init(id, global_conf, layer_conf)
+ self.id = id
+ self.lambda = layer_conf.lambda
+ self.dim_in = layer_conf.dim_in
+ self.dim_out = layer_conf.dim_out
+ self.gconf = global_conf
+ self:check_dim_len(#self.lambda, -1)
+ if #self.dim_in < 1 then
+ nerv.error("no input specified")
+ end
+ if #self.dim_out < 1 then
+ nerv.error("no output specified")
+ end
+end
+
+function CombinerLayer:init(batch_size)
+ local dim = self.dim_in[1]
+ for i = 2, #self.dim_in do
+ if self.dim_in[i] ~= dim then
+ nerv.error("mismatching dimensions of inputs")
+ end
+ end
+ for i = 1, #self.dim_out do
+ if self.dim_out[i] ~= dim then
+ nerv.error("mismatching dimensions of inputs/outputs")
+ end
+ end
+ self.sum = self.gconf.cumat_type(batch_size, dim)
+end
+
+function CombinerLayer:update(bp_err, input, output)
+end
+
+function CombinerLayer:propagate(input, output)
+ output[1]:fill(0)
+ for i = 1, #self.dim_in do
+ output[1]:add(output[1], input[i], 1.0, self.lambda[i])
+ end
+ for i = 2, #self.dim_out do
+ output[i]:copy_fromd(output[1])
+ end
+end
+
+function CombinerLayer:back_propagate(bp_err, next_bp_err, input, output)
+ local sum = self.sum
+ sum:copy_fromd(bp_err[1])
+ for i = 2, #self.dim_out do
+ sum:add(sum, bp_err[i], 1.0, 1.0)
+ end
+ for i = 1, #self.dim_in do
+ next_bp_err[i]:add(next_bp_err[i], sum, 0.0, self.lambda[i])
+ end
+end
+
+function CombinerLayer:get_params()
+ return nerv.ParamRepo({})
+end
diff --git a/nerv/layer/init.lua b/nerv/layer/init.lua
new file mode 100644
index 0000000..3c55a94
--- /dev/null
+++ b/nerv/layer/init.lua
@@ -0,0 +1,79 @@
+-- The following methods must be implemented to let a layer work properly
+
+local Param = nerv.class('nerv.Param')
+
+function Param:__init(id, global_conf)
+ self.id = id
+ self.gconf = global_conf
+end
+
+function Param:get_info()
+ return self.info
+end
+
+function Param:set_info(info)
+ self.info = info
+end
+
+function Param:read(handle)
+ nerv.error_method_not_implemented()
+end
+
+function Param:write(handle)
+ nerv.error_method_not_implemented()
+end
+
+function Param:update(gradient)
+ nerv.error_method_not_implemented()
+end
+
+local Layer = nerv.class('nerv.Layer')
+
+function Layer:__init(id, global_conf, layer_conf)
+ nerv.error_method_not_implemented()
+end
+
+function Layer:init(batch_size)
+ nerv.error_method_not_implemented()
+end
+
+function Layer:update(bp_err, input, output)
+ nerv.error_method_not_implemented()
+end
+
+function Layer:propagate(input, output)
+ nerv.error_method_not_implemented()
+end
+
+function Layer:back_propagate(bp_err, next_bp_err, input, output)
+ nerv.error_method_not_implemented()
+end
+
+function Layer:check_dim_len(len_in, len_out)
+ local expected_in = #self.dim_in
+ local expected_out = #self.dim_out
+ if len_in > 0 and expected_in ~= len_in then
+ nerv.error("layer %s expects %d inputs, %d given",
+ self.id, len_in, expected_in)
+ end
+ if len_out > 0 and expected_out ~= len_out then
+ nerv.error("layer %s expects %d outputs, %d given",
+ self.id, len_out, expected_out)
+ end
+end
+
+function Layer:get_params()
+ nerv.error_method_not_implemented()
+end
+
+function Layer:get_dim()
+ return self.dim_in, self.dim_out
+end
+
+nerv.include('affine.lua')
+nerv.include('sigmoid.lua')
+nerv.include('softmax_ce.lua')
+nerv.include('bias.lua')
+nerv.include('window.lua')
+nerv.include('mse.lua')
+nerv.include('combiner.lua')
diff --git a/nerv/layer/mse.lua b/nerv/layer/mse.lua
new file mode 100644
index 0000000..9a97add
--- /dev/null
+++ b/nerv/layer/mse.lua
@@ -0,0 +1,52 @@
+local MSELayer = nerv.class("nerv.MSELayer", "nerv.Layer")
+
+function MSELayer:__init(id, global_conf, layer_conf)
+ self.id = id
+ self.dim_in = layer_conf.dim_in
+ self.dim_out = layer_conf.dim_out
+ self.gconf = global_conf
+ self:check_dim_len(2, -1)
+end
+
+function MSELayer:init(batch_size)
+ if self.dim_in[1] ~= self.dim_in[2] then
+ nerv.error("mismatching dimensions of previous network output and labels")
+ end
+ self.scale = 1 / self.dim_in[1]
+ self.total_mse = 0.0
+ self.total_frames = 0
+ self.mse = self.gconf.cumat_type(batch_size, self.dim_in[1])
+ self.mse_sum = self.gconf.cumat_type(batch_size, 1)
+ self.diff = self.mse:create()
+end
+
+function MSELayer:update(bp_err, input, output)
+ -- no params, therefore do nothing
+end
+
+function MSELayer:propagate(input, output)
+ local mse = self.mse
+ local mse_sum = self.mse_sum
+ mse:add(input[1], input[2], 1.0, -1.0)
+ self.diff:copy_fromd(mse)
+ mse:mul_elem(mse, mse)
+ mse_sum:add(mse_sum, mse:rowsum(mse), 0.0, self.scale)
+ if output[1] ~= nil then
+ output[1]:copy_fromd(mse_sum)
+ end
+ self.total_mse = self.total_mse + mse_sum:colsum()[0]
+ self.total_frames = self.total_frames + mse_sum:nrow()
+end
+
+-- NOTE: must call propagate before back_propagate
+function MSELayer:back_propagate(bp_err, next_bp_err, input, output)
+ local nbe = next_bp_err[1]
+ nbe:add(nbe, self.diff, 0.0, 2 * self.scale)
+ if bp_err[1] ~= nil then
+ nbe:scale_rows_by_col(bp_err[1])
+ end
+end
+
+function MSELayer:get_params()
+ return nerv.ParamRepo({})
+end
diff --git a/nerv/layer/sigmoid.lua b/nerv/layer/sigmoid.lua
new file mode 100644
index 0000000..dfd09eb
--- /dev/null
+++ b/nerv/layer/sigmoid.lua
@@ -0,0 +1,31 @@
+local SigmoidLayer = nerv.class("nerv.SigmoidLayer", "nerv.Layer")
+
+function SigmoidLayer:__init(id, global_conf, layer_conf)
+ self.id = id
+ self.gconf = global_conf
+ self.dim_in = layer_conf.dim_in
+ self.dim_out = layer_conf.dim_out
+ self:check_dim_len(1, 1)
+end
+
+function SigmoidLayer:init()
+ if self.dim_in[1] ~= self.dim_out[1] then
+ nerv.error("mismatching dimensions of input and output")
+ end
+end
+
+function SigmoidLayer:update(bp_err, input, output)
+ -- no params, therefore do nothing
+end
+
+function SigmoidLayer:propagate(input, output)
+ output[1]:sigmoid(input[1])
+end
+
+function SigmoidLayer:back_propagate(bp_err, next_bp_err, input, output)
+ next_bp_err[1]:sigmoid_grad(bp_err[1], output[1])
+end
+
+function SigmoidLayer:get_params()
+ return nerv.ParamRepo({})
+end
diff --git a/nerv/layer/softmax_ce.lua b/nerv/layer/softmax_ce.lua
new file mode 100644
index 0000000..daf891e
--- /dev/null
+++ b/nerv/layer/softmax_ce.lua
@@ -0,0 +1,68 @@
+local SoftmaxCELayer = nerv.class("nerv.SoftmaxCELayer", "nerv.Layer")
+
+function SoftmaxCELayer:__init(id, global_conf, layer_conf)
+ self.id = id
+ self.gconf = global_conf
+ self.dim_in = layer_conf.dim_in
+ self.dim_out = layer_conf.dim_out
+ self.compressed = layer_conf.compressed
+ if self.compressed == nil then
+ self.compressed = false
+ end
+ self:check_dim_len(2, -1) -- two inputs: nn output and label
+end
+
+function SoftmaxCELayer:init(batch_size)
+ if not self.compressed and (self.dim_in[1] ~= self.dim_in[2]) then
+ nerv.error("mismatching dimensions of previous network output and labels")
+ end
+ self.total_ce = 0.0
+ self.total_correct = 0
+ self.total_frames = 0
+ self.softmax = self.gconf.cumat_type(batch_size, self.dim_in[1])
+ self.ce = self.softmax:create()
+end
+
+function SoftmaxCELayer:update(bp_err, input, output)
+ -- no params, therefore do nothing
+end
+
+function SoftmaxCELayer:propagate(input, output)
+ local softmax = self.softmax
+ local ce = self.ce
+ local classified = softmax:softmax(input[1])
+ local label = input[2]
+ ce:log_elem(softmax)
+ if self.compressed then
+ label = label:decompress(input[1]:ncol())
+ end
+ ce:mul_elem(ce, label)
+ ce = ce:rowsum()
+ if output[1] ~= nil then
+ output[1]:copy_fromd(ce)
+ end
+ -- add total ce
+ self.total_ce = self.total_ce - ce:colsum()[0]
+ self.total_frames = self.total_frames + softmax:nrow()
+ -- TODO: add colsame for uncompressed label
+ if self.compressed then
+ self.total_correct = self.total_correct + classified:colsame(input[2])[0]
+ end
+end
+
+function SoftmaxCELayer:back_propagate(bp_err, next_bp_err, input, output)
+ -- softmax output - label
+ local label = input[2]
+ if self.compressed then
+ label = label:decompress(input[1]:ncol())
+ end
+ local nbe = next_bp_err[1]
+ nbe:add(self.softmax, label, 1.0, -1.0)
+ if bp_err[1] ~= nil then
+ nbe:scale_rows_by_col(bp_err[1])
+ end
+end
+
+function SoftmaxCELayer:get_params()
+ return nerv.ParamRepo({})
+end
diff --git a/nerv/layer/window.lua b/nerv/layer/window.lua
new file mode 100644
index 0000000..4e9a3b1
--- /dev/null
+++ b/nerv/layer/window.lua
@@ -0,0 +1,28 @@
+local WindowLayer = nerv.class("nerv.WindowLayer", "nerv.Layer")
+
+function WindowLayer:__init(id, global_conf, layer_conf)
+ self.id = id
+ self.gconf = global_conf
+ self.window = layer_conf.window
+ self.dim_in = layer_conf.dim_in
+ self.dim_out = layer_conf.dim_out
+ self:check_dim_len(1, 1)
+end
+
+function WindowLayer:init()
+ if self.dim_in[1] ~= self.window.trans:ncol() then
+ nerv.error("mismatching dimensions of input and window parameter")
+ end
+ if self.dim_out[1] ~= self.window.trans:ncol() then
+ nerv.error("mismatching dimensions of output and window parameter")
+ end
+end
+
+function WindowLayer:propagate(input, output)
+ output[1]:copy_fromd(input[1])
+ output[1]:scale_rows_by_row(self.window.trans)
+end
+
+function WindowLayer:get_params()
+ return nerv.ParamRepo({self.window})
+end
diff --git a/nerv/luaT/README.md b/nerv/luaT/README.md
new file mode 100644
index 0000000..6e9cf0d
--- /dev/null
+++ b/nerv/luaT/README.md
@@ -0,0 +1,239 @@
+<a name="luat.dok"/>
+# Lua Torch C API #
+
+luaT provides an API to interface Lua and C in Torch packages. It defines a
+concept of _classes_ to Lua for Torch, and provides a mechanism to easily
+handle these Lua classes from C.
+
+It additionally provides a few functions that `luaL` should have defined, and
+defines several functions similar to the `luaL` ones for better type-error
+reporting when using `luaT` classes.
+
+<a name="luat.memory.dok"/>
+## Memory functions ##
+
+Classical memory allocation functions which generate a Lua error in case of
+problem.
+
+<a name="luaT_alloc"/>
+### void* luaT_alloc(lua_State *L, long size) ###
+
+Allocates `size` bytes and returns a pointer to the allocated
+memory. A Lua error is generated if the allocation fails.
+
+<a name="luaT_realloc"/>
+### void* luaT_realloc(lua_State *L, void *ptr, long size) ###
+
+Reallocates `ptr` to `size` bytes. `ptr` must have been previously
+allocated with [luaT_alloc](#luaT_alloc) or
+[luaT_realloc](#luaT_realloc), or with the C `malloc` or `realloc`
+functions. A Lua error is generated if the reallocation fails.
+
+<a name="luaT_free"/>
+### void luaT_free(lua_State *L, void *ptr) ###
+
+Free memory allocated at address `ptr`. The memory must have been
+previously allocated with [luaT_alloc](#luaT_alloc) or
+[luaT_realloc](#luaT_realloc), or the C `malloc` or `realloc`
+functions.
+
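+A minimal usage sketch (the buffer size and variable names are only
+illustrative):
+
+    static void example(lua_State *L)
+    {
+      float *buf = luaT_alloc(L, 128 * sizeof(float));    /* raises a Lua error on failure */
+      buf = luaT_realloc(L, buf, 256 * sizeof(float));    /* grow the buffer */
+      luaT_free(L, buf);
+    }
+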
+<a name="luat.classcreate"/>
+## Class creation and basic handling ##
+
+A `luaT` class is basically either a Lua _table_ or _userdata_ with
+an appropriate _metatable_. This appropriate metatable is created with
+[luaT_newmetatable](#luaT_newmetatable). Contrary to the luaL userdata
+functions, the luaT mechanism handles inheritance. If the class inherits from
+another class, then the metatable will itself have a metatable
+corresponding to the _parent metatable_: the metatables are cascaded
+according to the class inheritance. Multiple inheritance is not supported.
+
+<a name="luat.operatoroverloading"/>
+### Operator overloading ###
+
+The metatable of a `luaT` object contains `Lua` operators like
+`__index`, `__newindex`, `__tostring`, `__add`
+(etc...). These operators will respectively look for `__index__`,
+`__newindex__`, `__tostring__`, `__add__` (etc...) in the
+metatable. If found, the corresponding function or value will be returned,
+else a Lua error will be raised.
+
+If one wants to provide `__index__` or `__newindex__` in the
+metaclass, these operators must follow a particular scheme:
+
+ * `__index__` must either return a value _and_ `true` or return `false` only. In the first case, it means `__index__` was able to handle the given argument (e.g., the type was correct). The second case means it was not able to do anything, so `__index` in the root metatable can then try to see if the metaclass contains the required value.
+
+ * `__newindex__` must either return `true` or `false`. As for `__index__`, `true` means it could handle the argument and `false` not. If not, the root metatable `__newindex` will then raise an error if the object was a userdata, or apply a rawset if the object was a Lua table.
+
+Other metaclass operators like `__tostring__`, `__add__`, etc... do not have any particular constraint.
+
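+As an illustration, here is a sketch of an `__index__` handler written in C
+that follows this scheme (the class and field names are made up):
+
+    static int foo_Bar___index__(lua_State *L)
+    {
+      if(lua_isstring(L, 2) && !strcmp(lua_tostring(L, 2), "size"))
+      {
+        lua_pushnumber(L, 42);      /* the value for obj.size */
+        lua_pushboolean(L, 1);      /* handled: return the value _and_ true */
+        return 2;
+      }
+      lua_pushboolean(L, 0);        /* not handled: let __index consult the metaclass */
+      return 1;
+    }
+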
+<a name="luat_newmetatable"/>
+### const char* luaT_newmetatable(lua_State *L, const char *tname, const char *parenttname, lua_CFunction constructor, lua_CFunction destructor, lua_CFunction factory) ###
+
+This function creates a new metatable, which is the Lua way to define a new
+object class. As for `luaL_newmetatable`, the metatable is registered in
+the Lua registry table, with the key `tname`. In addition, `tname` is
+also registered in the Lua registry, with the metatable as key (the
+typename of a given object can be thus easily retrieved).
+
+The class name `tname` must be of the form `modulename.classname`.
+
+If not NULL, `parenttname` must be a valid typename corresponding to the
+parent class of the new class.
+
+If `constructor` is not NULL, a function `new` will be added to the metatable,
+pointing to this given function. The constructor can also be called through
+`modulename.classname()`, which is an alias set up by `luaT_newmetatable`.
+
+If not NULL, `destructor` will be called when garbage collecting the object.
+
+If not NULL, `factory` must be a Lua C function creating an empty object
+instance of the class. These functions are used in Torch for serialization.
+
+Note that classes can be partly defined in C and partly defined in Lua:
+once the metatable is created in C, it can be filled up with additional
+methods in Lua.
+
+The return value is the value returned by [luaT_typenameid](#luat_typenameid).
+
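+For example, registering a class `foo.Bar` (all names here are illustrative)
+and filling its metatable with methods follows the usual pattern:
+
+    static const struct luaL_Reg foo_Bar_methods[] = {
+      {"size", foo_Bar_size},
+      {NULL, NULL}
+    };
+
+    luaT_newmetatable(L, "foo.Bar", NULL,
+                      foo_Bar_new,       /* constructor, also reachable as foo.Bar(...) */
+                      foo_Bar_destroy,   /* destructor, called on garbage collection */
+                      NULL);             /* no factory */
+    luaL_register(L, NULL, foo_Bar_methods);  /* the metatable is left on the stack */
+    lua_pop(L, 1);
+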
+<a name="luat_pushmetatable"/>
+### int luaT_pushmetatable(lua_State *L, const char *tname) ###
+
+Pushes the metatable with type name `tname` on the stack, if `tname` is a
+valid Torch class name (previously registered with luaT_newmetatable).
+
+On success, returns 1. If `tname` is invalid, nothing is pushed and it
+returns 0.
+
+<a name="luat_typenameid"/>
+### const char* luaT_typenameid(lua_State *L, const char *tname) ###
+
+If `tname` is a valid Torch class name, then returns a unique string (the
+contents will be the same as `tname`) pointing to the string registered
+in the Lua registry. This string is thus valid as long as Lua is
+running. The returned string shall not be freed.
+
+If `tname` is an invalid class name, returns NULL.
+
+<a name="luat_typename"/>
+### const char* luaT_typename(lua_State *L, int ud) ###
+
+Returns the typename of the object at index `ud` on the stack. If it is
+not a valid Torch object, returns NULL.
+
+<a name="luat_pushudata"/>
+### void luaT_pushudata(lua_State *L, void *udata, const char *tname) ###
+
+Given a C structure `udata`, push a userdata object on the stack with
+metatable corresponding to `tname`. Obviously, `tname` must be a valid
+Torch name registered with [luaT_newmetatable](#luat_newmetatable).
+
+<a name="luat_toudata"/>
+### void *luaT_toudata(lua_State *L, int ud, const char *tname) ###
+
+Returns a pointer to the original C structure previously pushed on the
+stack with [luaT_pushudata](#luat_pushudata), if the object at index
+`ud` is a userdata of the Torch class `tname`. Returns NULL otherwise.
+
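+A short sketch (the `Bar` structure and the class name are illustrative):
+
+    Bar *b = malloc(sizeof(Bar));
+    luaT_pushudata(L, b, "foo.Bar");             /* wrap b as a foo.Bar object */
+
+    Bar *again = luaT_toudata(L, -1, "foo.Bar"); /* unwrap it again */
+    if(again == NULL)
+      luaT_typerror(L, -1, "foo.Bar");           /* not a foo.Bar: raise an error */
+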
+<a name="luat_isudata"/>
+### int luaT_isudata(lua_State *L, int ud, const char *tname) ###
+
+Returns 1 if the object at index `ud` on the stack is a userdata of the Torch class `tname`.
+Returns 0 otherwise.
+
+<a name="luat_getfield"/>
+### Checking fields of a table ###
+
+These functions check that the table at the given index `ud` on the Lua
+stack has a field named `field`, and that it is of the specified type.
+They raise a Lua error on failure.
+
+<a name="luat_getfieldcheckudata"/>
+### void *luaT_getfieldcheckudata(lua_State *L, int ud, const char *field, const char *tname) ###
+
+Checks that the field named `field` of the table at index `ud` is a
+userdata of the Torch class `tname`. Returns the pointer to the C structure
+previously pushed on the stack with [luaT_pushudata](#luat_pushudata) on
+success. The function raises a Lua error on failure.
+
+<a name="luat_getfieldchecklightudata"/>
+### void *luaT_getfieldchecklightudata(lua_State *L, int ud, const char *field) ###
+
+Checks that the field named `field` of the table at index `ud` is a
+lightuserdata. Returns the lightuserdata pointer on success. The function
+raises a Lua error on failure.
+
+<a name="luat_getfieldcheckint"/>
+### int luaT_getfieldcheckint(lua_State *L, int ud, const char *field) ###
+
+Checks that the field named `field` of the table at index `ud` is an
+int. Returns the int value on success. The function raises a Lua
+error on failure.
+
+<a name="luat_getfieldcheckstring"/>
+### const char* luaT_getfieldcheckstring(lua_State *L, int ud, const char *field) ###
+
+Checks that the field named `field` of the table at index `ud` is a
+string. Returns a pointer to the string on success. The function raises a
+Lua error on failure.
+
+<a name="luat_getfieldcheckboolean"/>
+### int luaT_getfieldcheckboolean(lua_State *L, int ud, const char *field) ###
+
+Checks that the field named `field` of the table at index `ud` is a
+boolean. On success, returns 1 if the boolean is `true`, 0 if it is
+`false`. The function raises a Lua error on failure.
+
+<a name="luat_getfieldchecktable"/>
+### void luaT_getfieldchecktable(lua_State *L, int ud, const char *field) ###
+
+Checks that the field named `field` of the table at index `ud` is a
+table. On success, push the table on the stack. The function raises a Lua
+error on failure.
+
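+For instance, reading the fields of a table argument at stack index 1 (the
+field and class names are illustrative):
+
+    int n          = luaT_getfieldcheckint(L, 1, "n");
+    const char *id = luaT_getfieldcheckstring(L, 1, "id");
+    Bar *data      = luaT_getfieldcheckudata(L, 1, "data", "foo.Bar");
+
+Note that each of these calls leaves the retrieved field on the stack.
+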
+<a name="luat_typerror"/>
+### int luaT_typerror(lua_State *L, int ud, const char *tname) ###
+
+Raises a `luaL_argerror` (and returns its value), claiming that the
+object at index `ud` on the stack is not of type `tname`. Note that
+this function does not check the type, it only raises an error.
+
+<a name="luat_checkboolean"/>
+### int luaT_checkboolean(lua_State *L, int ud) ###
+
+Checks that the value at index `ud` is a boolean. On success, returns 1
+if the boolean is `true`, 0 if it is `false`. The function raises a Lua
+error on failure.
+
+<a name="luat_optboolean"/>
+### int luaT_optboolean(lua_State *L, int ud, int def) ###
+
+Checks that the value at index `ud` is a boolean. On success, returns 1
+if the boolean is `true`, 0 if it is `false`. If there is no value at
+index `ud`, returns `def`. In any other cases, raises an error.
+
+<a name="luat_registeratname"/>
+### void luaT_registeratname(lua_State *L, const struct luaL_Reg *methods, const char *name) ###
+
+This function assumes a table is on the stack. It creates a table field
+`name` in the table (if this field does not exist yet), and fills up
+`methods` in this table field.
+
+<a name="luat_classrootname"/>
+### const char *luaT_classrootname(const char *tname) ###
+
+Assuming `tname` is of the form `modulename.classname`, returns
+`classname`. The returned value shall not be freed. It is a pointer
+inside `tname` string.
+
+<a name="luat_classmodulename"/>
+### int luaT_classmodulename(const char *tname, char *module_name) ###
+
+Assuming `tname` is of the form `modulename.classname`, copies `modulename`
+into the caller-provided buffer `module_name`, which must be at least as large
+as `tname`. Returns 1 if `tname` actually contains a module part, 0 otherwise.
+
+<a name="luat_stackdump"/>
+### void luaT_stackdump(lua_State *L) ###
+
+This function prints out the state of the Lua stack. It is useful for
+debugging purposes.
+
diff --git a/nerv/luaT/luaT.c b/nerv/luaT/luaT.c
new file mode 100644
index 0000000..7b85ce3
--- /dev/null
+++ b/nerv/luaT/luaT.c
@@ -0,0 +1,1079 @@
+#include <stdlib.h>
+#include <string.h>
+
+#include "luaT.h"
+
+void* luaT_alloc(lua_State *L, long size)
+{
+ void *ptr;
+
+ if(size == 0)
+ return NULL;
+
+ if(size < 0)
+ luaL_error(L, "$ Torch: invalid memory size -- maybe an overflow?");
+
+ ptr = malloc(size);
+ if(!ptr)
+ luaL_error(L, "$ Torch: not enough memory: you tried to allocate %dGB. Buy new RAM!", size/1073741824);
+
+ return ptr;
+}
+
+void* luaT_realloc(lua_State *L, void *ptr, long size)
+{
+ if(!ptr)
+ return(luaT_alloc(L, size));
+
+ if(size == 0)
+ {
+ luaT_free(L, ptr);
+ return NULL;
+ }
+
+ if(size < 0)
+ luaL_error(L, "$ Torch: invalid memory size -- maybe an overflow?");
+
+ ptr = realloc(ptr, size);
+ if(!ptr)
+ luaL_error(L, "$ Torch: not enough memory: you tried to reallocate %dGB. Buy new RAM!", size/1073741824);
+ return ptr;
+}
+
+void luaT_free(lua_State *L, void *ptr)
+{
+ free(ptr);
+}
+
+void luaT_stackdump(lua_State *L)
+{
+ int i;
+ const char *tname = NULL;
+ int top = lua_gettop(L);
+ for(i = 1; i <= top; i++)
+ {
+ int t = lua_type(L, i);
+ printf("%3d. ", i);
+ switch(t)
+ {
+ case LUA_TSTRING:
+ printf("'%s'", lua_tostring(L,i));
+ break;
+ case LUA_TBOOLEAN:
+ printf(lua_toboolean(L, i) ? "true" : "false");
+ break;
+ case LUA_TNUMBER:
+ printf("%g", lua_tonumber(L,i));
+ break;
+ case LUA_TUSERDATA:
+ tname = luaT_typename(L, i);
+ printf("userdata %lx [%s]", (long)lua_topointer(L, i), (tname ? tname : "not a Torch object"));
+ break;
+      case 10: /* LuaJIT cdata (LUA_TCDATA) */
+ tname = luaT_typename(L, i);
+ printf("cdata %lx [%s]", (long)lua_topointer(L, i), (tname ? tname : "not a Torch object"));
+ break;
+ case LUA_TTABLE:
+ lua_pushvalue(L, i);
+ lua_rawget(L, LUA_REGISTRYINDEX);
+ if(lua_isstring(L, -1))
+ tname = lua_tostring(L, -1); /*luaT_typenameid(L, lua_tostring(L, -1)); */
+ else
+ tname = NULL;
+ lua_pop(L, 1);
+ if(tname)
+ printf("metatable [%s]", tname);
+ else
+ {
+ tname = luaT_typename(L, i);
+ printf("table %lx [%s]", (long)lua_topointer(L, i), (tname ? tname : "not a Torch object"));
+ }
+ break;
+ default:
+ printf("Lua object type: %s", lua_typename(L,t));
+ break;
+ }
+ printf("\n");
+ }
+ printf("---------------------------------------------\n");
+}
+
+/* metatable operator methods */
+static int luaT_mt__index(lua_State *L);
+static int luaT_mt__newindex(lua_State *L);
+static int luaT_mt__tostring(lua_State *L);
+static int luaT_mt__add(lua_State *L);
+static int luaT_mt__sub(lua_State *L);
+static int luaT_mt__mul(lua_State *L);
+static int luaT_mt__div(lua_State *L);
+static int luaT_mt__mod(lua_State *L);
+static int luaT_mt__pow(lua_State *L);
+static int luaT_mt__unm(lua_State *L);
+static int luaT_mt__concat(lua_State *L);
+static int luaT_mt__len(lua_State *L);
+static int luaT_mt__eq(lua_State *L);
+static int luaT_mt__lt(lua_State *L);
+static int luaT_mt__le(lua_State *L);
+static int luaT_mt__call(lua_State *L);
+
+/* Constructor-metatable methods */
+static int luaT_cmt__call(lua_State *L);
+static int luaT_cmt__newindex(lua_State *L);
+
+const char* luaT_newmetatable(lua_State *L, const char *tname, const char *parenttname,
+ lua_CFunction constructor, lua_CFunction destructor, lua_CFunction factory)
+{
+ lua_pushcfunction(L, luaT_lua_newmetatable);
+ lua_pushstring(L, tname);
+ (parenttname ? lua_pushstring(L, parenttname) : lua_pushnil(L));
+ (constructor ? lua_pushcfunction(L, constructor) : lua_pushnil(L));
+ (destructor ? lua_pushcfunction(L, destructor) : lua_pushnil(L));
+ (factory ? lua_pushcfunction(L, factory) : lua_pushnil(L));
+ lua_call(L, 5, 1);
+ return luaT_typenameid(L, tname);
+}
+
+int luaT_pushmetatable(lua_State *L, const char *tname)
+{
+ lua_getfield(L, LUA_REGISTRYINDEX, tname);
+ if(lua_isnil(L, -1))
+ {
+ lua_pop(L, 1);
+ return 0;
+ }
+ return 1;
+}
+
+const char *luaT_typenameid(lua_State *L, const char *tname)
+{
+ if(luaT_pushmetatable(L, tname))
+ {
+ const char *tnameid = NULL;
+ lua_rawget(L, LUA_REGISTRYINDEX);
+ if(lua_isstring(L, -1))
+ tnameid = lua_tostring(L, -1);
+ lua_pop(L, 1); /* the string/nil */
+ return tnameid;
+ }
+ return NULL;
+}
+
+static const char cdataname[] = ""
+ "local _, ffi = pcall(require, 'ffi')\n"
+ "if ffi then\n"
+ " local id2name = {}\n"
+ " return function(cdata, name)\n"
+ " local id = tonumber(ffi.typeof(cdata))\n"
+ " if id then\n"
+ " if name then\n"
+ " id2name[id] = name\n"
+ " return name\n"
+ " else\n"
+ " return rawget(id2name, id)\n"
+ " end\n"
+ " end\n"
+ " return nil\n"
+ " end\n"
+ "else\n"
+ " return function() end\n"
+ "end\n";
+
+static const char* luaT_cdataname(lua_State *L, int ud, const char *tname)
+{
+ lua_pushstring(L, "__cdataname");
+ lua_rawget(L, LUA_REGISTRYINDEX);
+ if(lua_isnil(L,-1))
+ {
+ lua_pop(L, 1);
+
+ if(luaL_dostring(L, cdataname)) /* did something go wrong? */
+ luaL_error(L, "internal error (could not load cdataname): %s", lua_tostring(L, -1));
+
+ lua_pushstring(L, "__cdataname");
+ lua_pushvalue(L, -2);
+ lua_rawset(L, LUA_REGISTRYINDEX);
+ }
+ if(!lua_isfunction(L, -1)) /* should not happen */
+ luaL_error(L, "internal error (cdataname is not a function)");
+
+ lua_pushvalue(L, ud);
+ if(tname)
+ lua_pushstring(L, tname);
+ if(lua_pcall(L, (tname ? 2 : 1), 1, 0))
+ luaL_error(L, "internal error (cdataname): %s", lua_tostring(L, -1));
+
+ tname = lua_tostring(L, -1);
+ lua_pop(L, 1);
+
+ return tname;
+}
+
+const char* luaT_typename(lua_State *L, int ud)
+{
+  if(lua_type(L, ud) == 10) /* LuaJIT cdata (LUA_TCDATA) */
+ return luaT_cdataname(L, ud, NULL);
+ else if(lua_getmetatable(L, ud))
+ {
+ const char *tname = NULL;
+ lua_rawget(L, LUA_REGISTRYINDEX);
+ if(lua_isstring(L, -1))
+ tname = lua_tostring(L, -1);
+ lua_pop(L, 1); /* the string/nil */
+ return tname;
+ }
+ return NULL;
+}
+
+void luaT_pushudata(lua_State *L, void *udata, const char *tname)
+{
+ if(udata)
+ {
+ void **udata_p = lua_newuserdata(L, sizeof(void*));
+ *udata_p = udata;
+ if(!luaT_pushmetatable(L, tname))
+ luaL_error(L, "Torch internal problem: cannot find metatable for type <%s>", tname);
+ lua_setmetatable(L, -2);
+ }
+ else
+ lua_pushnil(L);
+}
+
+void *luaT_toudata(lua_State *L, int ud, const char *tname)
+{
+ void **p = lua_touserdata(L, ud);
+ if(p != NULL) /* value is a userdata? */
+ {
+ if(!luaT_pushmetatable(L, tname))
+ luaL_error(L, "Torch internal problem: cannot find metatable for type <%s>", tname);
+
+ /* initialize the table we want to get the metatable on */
+ /* note that we have to be careful with indices, as we just inserted stuff */
+ lua_pushvalue(L, (ud < 0 ? ud - 1 : ud));
+ while(lua_getmetatable(L, -1)) /* get the next metatable */
+ {
+ lua_remove(L, -2); /* remove the previous metatable [or object, if first time] */
+ if(lua_rawequal(L, -1, -2))
+ {
+ lua_pop(L, 2); /* remove the two metatables */
+ return *p;
+ }
+ }
+ lua_pop(L, 2); /* remove the two metatables */
+ }
+ return NULL;
+}
+
+int luaT_isudata(lua_State *L, int ud, const char *tname)
+{
+ if(luaT_toudata(L, ud, tname))
+ return 1;
+ else
+ return 0;
+}
+
+void *luaT_checkudata(lua_State *L, int ud, const char *tname)
+{
+ void *p = luaT_toudata(L, ud, tname);
+ if(!p)
+ luaT_typerror(L, ud, tname);
+ return p;
+}
+
+void *luaT_getfieldcheckudata(lua_State *L, int ud, const char *field, const char *tname)
+{
+ void *p;
+ lua_getfield(L, ud, field);
+ if(lua_isnil(L, -1))
+ luaL_error(L, "bad argument #%d (field %s does not exist)", ud, field);
+ p = luaT_toudata(L, -1, tname);
+ if(!p)
+ luaL_error(L, "bad argument #%d (field %s is not a %s)", ud, field, tname);
+ return p;
+}
+
+void *luaT_getfieldchecklightudata(lua_State *L, int ud, const char *field)
+{
+ void *p;
+ lua_getfield(L, ud, field);
+ if(lua_isnil(L, -1))
+ luaL_error(L, "bad argument #%d (field %s does not exist)", ud, field);
+
+ if(!lua_islightuserdata(L, -1))
+ luaL_error(L, "bad argument #%d (field %s is not a light userdata)", ud, field);
+
+ p = lua_touserdata(L, -1);
+
+ return p;
+}
+
+double luaT_getfieldchecknumber(lua_State *L, int ud, const char *field)
+{
+ lua_getfield(L, ud, field);
+ if(lua_isnil(L, -1))
+ luaL_error(L, "bad argument #%d (field %s does not exist)", ud, field);
+ if(!lua_isnumber(L, -1))
+ luaL_error(L, "bad argument #%d (field %s is not a number)", ud, field);
+ return lua_tonumber(L, -1);
+}
+
+int luaT_getfieldcheckint(lua_State *L, int ud, const char *field)
+{
+ lua_getfield(L, ud, field);
+ if(lua_isnil(L, -1))
+ luaL_error(L, "bad argument #%d (field %s does not exist)", ud, field);
+ if(!lua_isnumber(L, -1))
+ luaL_error(L, "bad argument #%d (field %s is not a number)", ud, field);
+ return (int)lua_tonumber(L, -1);
+}
+
+const char* luaT_getfieldcheckstring(lua_State *L, int ud, const char *field)
+{
+ lua_getfield(L, ud, field);
+ if(lua_isnil(L, -1))
+ luaL_error(L, "bad argument #%d (field %s does not exist)", ud, field);
+ if(!lua_isstring(L, -1))
+ luaL_error(L, "bad argument #%d (field %s is not a string)", ud, field);
+ return lua_tostring(L, -1);
+}
+
+int luaT_getfieldcheckboolean(lua_State *L, int ud, const char *field)
+{
+ lua_getfield(L, ud, field);
+ if(lua_isnil(L, -1))
+ luaL_error(L, "bad argument #%d (field %s does not exist)", ud, field);
+ if(!lua_isboolean(L, -1))
+ luaL_error(L, "bad argument #%d (field %s is not a boolean)", ud, field);
+ return lua_toboolean(L, -1);
+}
+
+void luaT_getfieldchecktable(lua_State *L, int ud, const char *field)
+{
+ lua_getfield(L, ud, field);
+ if(lua_isnil(L, -1))
+ luaL_error(L, "bad argument #%d (field %s does not exist)", ud, field);
+ if(!lua_istable(L, -1))
+ luaL_error(L, "bad argument #%d (field %s is not a table)", ud, field);
+}
+
+/**** type checks as in luaL ****/
+int luaT_typerror(lua_State *L, int ud, const char *tname)
+{
+ const char *msg;
+ const char *tnameud = luaT_typename(L, ud);
+
+ if(!tnameud)
+ tnameud = lua_typename(L, lua_type(L, ud)); /* lua_typename expects a type tag, not a stack index */
+
+ msg = lua_pushfstring(L, "%s expected, got %s",
+ tname,
+ (tnameud ? tnameud : "unknown object"));
+
+ return luaL_argerror(L, ud, msg);
+}
+
+int luaT_checkboolean(lua_State *L, int ud)
+{
+ if(!lua_isboolean(L, ud))
+ luaT_typerror(L, ud, lua_typename(L, LUA_TBOOLEAN));
+ return lua_toboolean(L, ud);
+}
+
+int luaT_optboolean(lua_State *L, int ud, int def)
+{
+ if(lua_isnoneornil(L,ud))
+ return def;
+
+ return luaT_checkboolean(L, ud);
+}
+
+void luaT_registeratname(lua_State *L, const struct luaL_Reg *methods, const char *name)
+{
+ int idx = lua_gettop(L);
+
+ luaL_checktype(L, idx, LUA_TTABLE);
+ lua_pushstring(L, name);
+ lua_rawget(L, idx);
+
+ if(lua_isnil(L, -1))
+ {
+ lua_pop(L, 1);
+ lua_pushstring(L, name);
+ lua_newtable(L);
+ lua_rawset(L, idx);
+
+ lua_pushstring(L, name);
+ lua_rawget(L, idx);
+ }
+
+ luaL_register(L, NULL, methods);
+ lua_pop(L, 1);
+}
+
+
+/* utility functions */
+const char *luaT_classrootname(const char *tname)
+{
+ int i;
+ int sz = strlen(tname);
+
+ for(i = 0; i < sz; i++)
+ {
+ if(tname[i] == '.')
+ return tname+i+1;
+ }
+ return tname;
+}
+
+/* module_name must be a buffer at least as big as tname
+ * return true if the class is part of a module */
+int luaT_classmodulename(const char *tname, char *module_name)
+{
+ char chars[] = {'.', '\0'};
+ size_t n;
+ n = strcspn(tname, chars);
+ strncpy(module_name, tname, n);
+ module_name[n] = '\0';
+ return tname[n] == '.';
+}
+
+/* Lua only functions */
+int luaT_lua_newmetatable(lua_State *L)
+{
+ const char* tname = luaL_checkstring(L, 1);
+ char module_name[256];
+ int is_in_module = 0;
+ is_in_module = luaT_classmodulename(tname, module_name);
+
+ lua_settop(L, 5);
+ luaL_argcheck(L, lua_isnoneornil(L, 2) || lua_isstring(L, 2), 2, "parent class name or nil expected");
+ luaL_argcheck(L, lua_isnoneornil(L, 3) || lua_isfunction(L, 3), 3, "constructor function or nil expected");
+ luaL_argcheck(L, lua_isnoneornil(L, 4) || lua_isfunction(L, 4), 4, "destructor function or nil expected");
+ luaL_argcheck(L, lua_isnoneornil(L, 5) || lua_isfunction(L, 5), 5, "factory function or nil expected");
+
+ if(is_in_module)
+ lua_getfield(L, LUA_GLOBALSINDEX, module_name);
+ else
+ lua_pushvalue(L, LUA_GLOBALSINDEX);
+ if(!lua_istable(L, 6))
+ luaL_error(L, "while creating metatable %s: bad argument #1 (%s is an invalid module name)", tname, module_name);
+
+ /* we first create the new metaclass if we have to */
+ if(!luaT_pushmetatable(L, tname))
+ {
+ /* create the metatable */
+ lua_newtable(L);
+
+ /* registry[name] = metatable */
+ lua_pushvalue(L, -1);
+ lua_setfield(L, LUA_REGISTRYINDEX, tname);
+
+ /* registry[metatable] = tname */
+ lua_pushvalue(L, -1);
+ lua_pushstring(L, tname);
+ lua_rawset(L, LUA_REGISTRYINDEX);
+
+ /* __index handling */
+ lua_pushcfunction(L, luaT_mt__index);
+ lua_setfield(L, -2, "__index");
+
+ /* __newindex handling */
+ lua_pushcfunction(L, luaT_mt__newindex);
+ lua_setfield(L, -2, "__newindex");
+
+ /* __typename contains the typename */
+ lua_pushstring(L, tname);
+ lua_setfield(L, -2, "__typename");
+
+ /* __metatable is self */
+ lua_pushvalue(L, -1);
+ lua_setfield(L, -2, "__metatable");
+
+ /* by default, __version equals 1 */
+ lua_pushnumber(L, 1);
+ lua_setfield(L, -2, "__version");
+
+ /* assign default operator functions */
+ lua_pushcfunction(L, luaT_mt__tostring);
+ lua_setfield(L, -2, "__tostring");
+
+ lua_pushcfunction(L, luaT_mt__add);
+ lua_setfield(L, -2, "__add");
+
+ lua_pushcfunction(L, luaT_mt__sub);
+ lua_setfield(L, -2, "__sub");
+
+ lua_pushcfunction(L, luaT_mt__mul);
+ lua_setfield(L, -2, "__mul");
+
+ lua_pushcfunction(L, luaT_mt__div);
+ lua_setfield(L, -2, "__div");
+
+ lua_pushcfunction(L, luaT_mt__mod);
+ lua_setfield(L, -2, "__mod");
+
+ lua_pushcfunction(L, luaT_mt__pow);
+ lua_setfield(L, -2, "__pow");
+
+ lua_pushcfunction(L, luaT_mt__unm);
+ lua_setfield(L, -2, "__unm");
+
+ lua_pushcfunction(L, luaT_mt__concat);
+ lua_setfield(L, -2, "__concat");
+
+ lua_pushcfunction(L, luaT_mt__len);
+ lua_setfield(L, -2, "__len");
+
+ lua_pushcfunction(L, luaT_mt__eq);
+ lua_setfield(L, -2, "__eq");
+
+ lua_pushcfunction(L, luaT_mt__lt);
+ lua_setfield(L, -2, "__lt");
+
+ lua_pushcfunction(L, luaT_mt__le);
+ lua_setfield(L, -2, "__le");
+
+ lua_pushcfunction(L, luaT_mt__call);
+ lua_setfield(L, -2, "__call");
+ }
+
+ /* we assign the parent class if necessary */
+ if(!lua_isnoneornil(L, 2))
+ {
+ if(lua_getmetatable(L, -1))
+ luaL_error(L, "class %s has already been assigned a parent class\n", tname);
+ else
+ {
+ const char* parenttname = luaL_checkstring(L, 2);
+ if(!luaT_pushmetatable(L, parenttname))
+ luaL_error(L, "bad argument #2 (invalid parent class name %s)", parenttname);
+ lua_setmetatable(L, -2);
+ }
+ }
+
+ /* register the destructor function */
+ if(!lua_isnoneornil(L, 4))
+ {
+ /* does it exist already? */
+ lua_pushstring(L, "__gc");
+ lua_rawget(L, -2);
+
+ if(lua_isnil(L, -1))
+ {
+ lua_pop(L, 1); /* pop nil */
+ lua_pushstring(L, "__gc");
+ lua_pushvalue(L, 4);
+ lua_rawset(L, -3);
+ }
+ else
+ luaL_error(L, "%s has already been assigned a destructor", tname);
+ }
+
+ /* register the factory function */
+ if(!lua_isnoneornil(L, 5))
+ {
+ /* does it exist already? */
+ lua_pushstring(L, "__factory");
+ lua_rawget(L, -2);
+
+ if(lua_isnil(L, -1))
+ {
+ lua_pop(L, 1); /* pop nil */
+ lua_pushstring(L, "__factory");
+ lua_pushvalue(L, 5);
+ lua_rawset(L, -3);
+ }
+ else
+ luaL_error(L, "%s has already been assigned a factory", tname);
+ }
+
+ /******** Constructor table and metatable ********/
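+ /* The constructor table is what gets exposed as module.Name: indexing it
+ * falls through to the class metatable (so class methods are reachable),
+ * assigning to it adds methods via luaT_cmt__newindex, and calling it
+ * invokes the registered constructor via luaT_cmt__call. */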
+ lua_pushstring(L, "__constructor");
+ lua_rawget(L, -2);
+ if(lua_isnil(L, -1))
+ {
+ lua_pop(L, 1); /* pop nil */
+ lua_newtable(L); /* fancy table */
+ lua_newtable(L); /* fancy metatable */
+
+ lua_pushvalue(L, -3); /* metatable */
+ lua_setfield(L, -2, "__index"); /* so we can get the methods */
+
+ lua_pushcfunction(L, luaT_cmt__newindex);
+ lua_setfield(L, -2, "__newindex"); /* so we add new methods */
+
+ lua_pushcfunction(L, luaT_cmt__call);
+ lua_setfield(L, -2, "__call"); /* so we can construct objects; that is what this table is for */
+
+ lua_pushvalue(L, -3);
+ lua_setfield(L, -2, "__metatable"); /* redirect to metatable with methods */
+
+ lua_setmetatable(L, -2); /* constructor metatable is ... this fancy metatable */
+
+ /* set metatable[__constructor] = constructor-metatable */
+ lua_pushstring(L, "__constructor");
+ lua_pushvalue(L, -2);
+ lua_rawset(L, -4);
+ }
+
+ /* register the constructor function */
+ if(!lua_isnoneornil(L, 3))
+ {
+ /* get constructor metatable */
+ lua_getmetatable(L, -1);
+
+ /* does it exist already? */
+ lua_pushstring(L, "__new");
+ lua_rawget(L, -2);
+
+ if(lua_isnil(L, -1))
+ {
+ lua_pop(L, 1); /* pop nil */
+ lua_pushstring(L, "__new");
+ lua_pushvalue(L, 3);
+ lua_rawset(L, -3);
+
+ /* set "new" in the metatable too */
+ lua_pushstring(L, "new");
+ lua_pushvalue(L, 3);
+ lua_rawset(L, -5);
+ }
+ else
+ luaL_error(L, "%s has already been assigned a constructor", tname);
+
+ /* pop constructor metatable */
+ lua_pop(L, 1);
+ }
+
+ /* module.name = constructor table */
+ lua_setfield(L, 6, luaT_classrootname(tname));
+
+ return 1; /* returns the metatable */
+}
+
+/* Lua only utility functions */
+
+/* add any custom type, provided the object has a metatable */
+int luaT_lua_metatype(lua_State *L)
+{
+ if( (lua_gettop(L) != 2) && (lua_gettop(L) != 3) )
+ luaL_error(L, "expecting: string table [ctype]");
+
+ luaL_checkstring(L, 1);
+ luaL_checktype(L, 2, LUA_TTABLE);
+
+ if(lua_gettop(L) == 3)
+ {
+ if(!luaT_cdataname(L, 3, lua_tostring(L, 1)))
+ luaL_error(L, "could not register cdata type -- missing ffi library?");
+ }
+
+ /* registry[name] = metatable */
+ lua_pushvalue(L, 1);
+ lua_pushvalue(L, 2);
+ lua_rawset(L, LUA_REGISTRYINDEX);
+
+ /* registry[metatable] = tname */
+ lua_pushvalue(L, 2);
+ lua_pushvalue(L, 1);
+ lua_rawset(L, LUA_REGISTRYINDEX);
+
+ return 0;
+}
+
+/* return a userdata from a C pointer */
+/* be sure you know what you are doing */
+int luaT_lua_pushudata(lua_State *L)
+{
+ void *udata = NULL;
+ const char *tname = luaL_checkstring(L, 2);
+
+ if(lua_type(L, 1) == 10)
+ udata = *((void**)lua_topointer(L, 1));
+ else if(lua_isnumber(L, 1))
+ udata = (void*)(long)lua_tonumber(L, 1);
+ else
+ luaL_argerror(L, 1, "expecting number or cdata");
+
+ luaT_pushudata(L, udata, tname);
+
+ return 1;
+}
+
+int luaT_lua_factory(lua_State *L)
+{
+ const char* tname = luaL_checkstring(L, 1);
+ if(luaT_pushmetatable(L, tname) && !lua_isnil(L, -1))
+ {
+ lua_pushstring(L, "__factory");
+ lua_rawget(L, -2);
+ }
+ else
+ {
+ lua_pushnil(L);
+ }
+ return 1;
+}
+
+int luaT_lua_getconstructortable(lua_State *L)
+{
+ const char* tname = luaL_checkstring(L, 1);
+ if(luaT_pushmetatable(L, tname))
+ {
+ lua_pushstring(L, "__constructor");
+ lua_rawget(L, -2);
+ return 1;
+ }
+ return 0;
+}
+
+
+int luaT_lua_typename(lua_State *L)
+{
+ const char* tname = NULL;
+ luaL_checkany(L, 1);
+ if((tname = luaT_typename(L, 1)))
+ {
+ lua_pushstring(L, tname);
+ return 1;
+ }
+ return 0;
+}
+
+int luaT_lua_isequal(lua_State *L)
+{
+ if(lua_isuserdata(L, 1) && lua_isuserdata(L, 2))
+ {
+ void **u1, **u2;
+ luaL_argcheck(L, luaT_typename(L, 1), 1, "Torch object expected");
+ luaL_argcheck(L, luaT_typename(L, 2), 2, "Torch object expected");
+
+ u1 = lua_touserdata(L, 1);
+ u2 = lua_touserdata(L, 2);
+ if(*u1 == *u2)
+ lua_pushboolean(L, 1);
+ else
+ lua_pushboolean(L, 0);
+ }
+ else if(lua_istable(L, 1) && lua_istable(L, 2))
+ lua_pushboolean(L, lua_rawequal(L, 1, 2));
+ else
+ lua_pushboolean(L, 0);
+ return 1;
+}
+
+int luaT_lua_pointer(lua_State *L)
+{
+ if(lua_isuserdata(L, 1))
+ {
+ void **ptr;
+ luaL_argcheck(L, luaT_typename(L, 1), 1, "Torch object expected");
+ ptr = lua_touserdata(L, 1);
+ lua_pushnumber(L, (long)(*ptr));
+ return 1;
+ }
+ else if(lua_istable(L, 1) || lua_isthread(L, 1) || lua_isfunction(L, 1))
+ {
+ const void* ptr = lua_topointer(L, 1);
+ lua_pushnumber(L, (long)(ptr));
+ return 1;
+ }
+ else if(lua_type(L, 1) == 10) /* cdata */
+ {
+ /* we want the pointer held by the cdata, */
+ /* not the pointer to the cdata object itself */
+ const void* ptr = *((void**)lua_topointer(L, 1));
+ lua_pushnumber(L, (long)(ptr));
+ return 1;
+ }
+ else if(lua_isstring(L, 1))
+ {
+ const char* ptr = lua_tostring(L, 1);
+ lua_pushnumber(L, (long)(ptr));
+ return 1;
+ }
+ else
+ luaL_error(L, "Torch object, table, thread, cdata or function expected");
+
+ return 0;
+}
+
+int luaT_lua_setenv(lua_State *L)
+{
+ if(!lua_isfunction(L, 1) && !lua_isuserdata(L, 1))
+ luaL_typerror(L, 1, "function or userdata");
+ luaL_checktype(L, 2, LUA_TTABLE);
+ lua_setfenv(L, 1);
+ return 0;
+}
+
+int luaT_lua_getenv(lua_State *L)
+{
+ if(!lua_isfunction(L, 1) && !lua_isuserdata(L, 1))
+ luaL_typerror(L, 1, "function or userdata");
+ lua_getfenv(L, 1);
+ return 1;
+}
+
+int luaT_lua_getmetatable(lua_State *L)
+{
+ const char *tname = luaL_checkstring(L, 1);
+ if(luaT_pushmetatable(L, tname))
+ return 1;
+ return 0;
+}
+
+int luaT_lua_version(lua_State *L)
+{
+ luaL_checkany(L, 1);
+
+ if(lua_type(L, 1) == 10)
+ {
+ const char *tname = luaT_cdataname(L, 1, NULL);
+ if(tname)
+ {
+ luaT_pushmetatable(L, tname);
+ lua_pushstring(L, "__version");
+ lua_rawget(L, -2);
+ return 1;
+ }
+ return 0;
+ }
+ else if(lua_getmetatable(L, 1))
+ {
+ lua_pushstring(L, "__version");
+ lua_rawget(L, -2);
+ return 1;
+ }
+ return 0;
+}
+
+int luaT_lua_setmetatable(lua_State *L)
+{
+ const char *tname = luaL_checkstring(L, 2);
+ luaL_checktype(L, 1, LUA_TTABLE);
+
+ if(!luaT_pushmetatable(L, tname))
+ luaL_error(L, "unknown typename %s\n", tname);
+ lua_setmetatable(L, 1);
+
+ return 1;
+}
+
+/* metatable operator methods */
+static int luaT_mt__index(lua_State *L)
+{
+ if(!lua_getmetatable(L, 1))
+ luaL_error(L, "critical internal indexing error: no metatable found");
+
+ if(!lua_istable(L, -1))
+ luaL_error(L, "critical internal indexing error: not a metatable");
+
+ /* test for __index__ method first */
+ lua_getfield(L, -1, "__index__");
+ if(!lua_isnil(L, -1))
+ {
+ int result;
+
+ if(!lua_isfunction(L, -1))
+ luaL_error(L, "critical internal indexing error: __index__ is not a function");
+
+ lua_pushvalue(L, 1);
+ lua_pushvalue(L, 2);
+
+ lua_call(L, 2, LUA_MULTRET); /* DEBUG: risky: it really should return exactly 1 or 2 values... */
+
+ result = lua_toboolean(L, -1);
+ lua_pop(L, 1);
+
+ if(result)
+ return 1;
+
+ /* on the stack: 1. the object 2. the value 3. the metatable */
+ /* apparently, __index wants only one element returned */
+ /* return lua_gettop(L)-3; */
+
+ }
+ else
+ lua_pop(L, 1); /* remove nil __index__ on the stack */
+
+ lua_pushvalue(L, 2);
+ lua_gettable(L, -2);
+
+ return 1;
+}
+
+static int luaT_mt__newindex(lua_State *L)
+{
+ if(!lua_getmetatable(L, 1))
+ luaL_error(L, "critical internal indexing error: no metatable found");
+
+ if(!lua_istable(L, -1))
+ luaL_error(L, "critical internal indexing error: not a metatable");
+
+ /* test for __newindex__ method first */
+ lua_getfield(L, -1, "__newindex__");
+ if(!lua_isnil(L, -1))
+ {
+ int result;
+
+ if(!lua_isfunction(L, -1))
+ luaL_error(L, "critical internal indexing error: __newindex__ is not a function");
+
+ lua_pushvalue(L, 1);
+ lua_pushvalue(L, 2);
+ lua_pushvalue(L, 3);
+
+ lua_call(L, 3, 1); /* DEBUG: risky: it really must return something */
+
+ result = lua_toboolean(L, -1);
+ lua_pop(L, 1);
+
+ if(result)
+ return 0;
+ }
+ else
+ lua_pop(L, 1); /* remove nil __newindex__ on the stack */
+
+ lua_pop(L, 1); /* pop the metatable */
+ if(lua_istable(L, 1))
+ lua_rawset(L, 1);
+ else
+ luaL_error(L, "the class %s cannot be indexed", luaT_typename(L, 1));
+
+ return 0;
+}
+
+/* note: the __NAME__ handler is looked up in the metatable, hence the getmetatable call is necessary */
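+/* MT_DECLARE_OPERATOR(NAME, NIL_BEHAVIOR) generates luaT_mt__NAME: it looks
+ * up a user-supplied __NAME__ handler in the object's metatable and, if it
+ * is a function, calls it with the original arguments and returns all of
+ * its results; if no handler is set it falls back to NIL_BEHAVIOR (an error
+ * for most operators, a sensible default for tostring and eq). */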
+#define MT_DECLARE_OPERATOR(NAME, NIL_BEHAVIOR) \
+ int luaT_mt__##NAME(lua_State *L) \
+ { \
+ if(!lua_getmetatable(L, 1)) \
+ luaL_error(L, "internal error in __" #NAME ": no metatable"); \
+ \
+ lua_getfield(L, -1, "__" #NAME "__"); \
+ if(lua_isnil(L, -1)) \
+ { \
+ NIL_BEHAVIOR; \
+ } \
+ else \
+ { \
+ if(lua_isfunction(L, -1)) \
+ { \
+ lua_insert(L, 1); /* insert function */ \
+ lua_pop(L, 1); /* remove metatable */ \
+ lua_call(L, lua_gettop(L)-1, LUA_MULTRET); /* we return the result of the call */ \
+ return lua_gettop(L); \
+ } \
+ /* __NAME__ exists but is not a function: nothing is returned */ \
+ } \
+ return 0; \
+ }
+
+MT_DECLARE_OPERATOR(tostring,
+ lua_pushstring(L, luaT_typename(L, 1));
+ return 1;)
+MT_DECLARE_OPERATOR(add, luaL_error(L, "%s has no addition operator", luaT_typename(L, 1)))
+MT_DECLARE_OPERATOR(sub, luaL_error(L, "%s has no subtraction operator", luaT_typename(L, 1)))
+MT_DECLARE_OPERATOR(mul, luaL_error(L, "%s has no multiplication operator", luaT_typename(L, 1)))
+MT_DECLARE_OPERATOR(div, luaL_error(L, "%s has no division operator", luaT_typename(L, 1)))
+MT_DECLARE_OPERATOR(mod, luaL_error(L, "%s has no modulo operator", luaT_typename(L, 1)))
+MT_DECLARE_OPERATOR(pow, luaL_error(L, "%s has no power operator", luaT_typename(L, 1)))
+MT_DECLARE_OPERATOR(unm, luaL_error(L, "%s has no negation operator", luaT_typename(L, 1)))
+MT_DECLARE_OPERATOR(concat, luaL_error(L, "%s has no concat operator", luaT_typename(L, 1)))
+MT_DECLARE_OPERATOR(len, luaL_error(L, "%s has no length operator", luaT_typename(L, 1)))
+MT_DECLARE_OPERATOR(eq,
+ lua_settop(L, 2);
+ lua_pushcfunction(L, luaT_lua_isequal);
+ lua_insert(L, 1);
+ lua_call(L, 2, 1);
+ return 1;)
+MT_DECLARE_OPERATOR(lt, luaL_error(L, "%s has no less-than operator", luaT_typename(L, 1)))
+MT_DECLARE_OPERATOR(le, luaL_error(L, "%s has no less-than-or-equal operator", luaT_typename(L, 1)))
+MT_DECLARE_OPERATOR(call, luaL_error(L, "%s has no call operator", luaT_typename(L, 1)))
+
+
+/* constructor metatable methods */
+int luaT_cmt__call(lua_State *L)
+{
+ if(!lua_istable(L, 1))
+ luaL_error(L, "internal error in __call: not a constructor table");
+
+ if(!lua_getmetatable(L, 1))
+ luaL_error(L, "internal error in __call: no metatable available");
+
+ lua_pushstring(L, "__new");
+ lua_rawget(L, -2);
+
+ if(lua_isnil(L, -1))
+ luaL_error(L, "no constructor available");
+
+ lua_remove(L, 1); /* remove constructor table */
+ lua_insert(L, 1); /* insert constructor */
+ lua_pop(L, 1); /* remove fancy metatable */
+
+ lua_call(L, lua_gettop(L)-1, LUA_MULTRET);
+ return lua_gettop(L);
+}
+
+int luaT_cmt__newindex(lua_State *L)
+{
+ if(!lua_istable(L, 1))
+ luaL_error(L, "internal error in __newindex: not a constructor table");
+
+ if(!lua_getmetatable(L, 1))
+ luaL_error(L, "internal error in __newindex: no metatable available");
+
+ lua_pushstring(L, "__metatable");
+ lua_rawget(L, -2);
+
+ if(!lua_istable(L, -1))
+ luaL_error(L, "internal error in __newindex: no metaclass available");
+
+ lua_insert(L, 2);
+ lua_pop(L, 1); /* remove the metatable over the constructor table */
+
+ lua_rawset(L, -3);
+
+ return 0;
+}
+
+/******************** deprecated functions ********************/
+int luaT_pushmetaclass(lua_State *L, const char *tname)
+{
+ return luaT_pushmetatable(L, tname);
+}
+
+const char* luaT_id(lua_State *L, int ud)
+{
+ return luaT_typename(L, ud);
+}
+
+const char* luaT_id2typename(lua_State *L, const char *id)
+{
+ return id;
+}
+
+const char* luaT_typename2id(lua_State *L, const char *tname)
+{
+ return luaT_typenameid(L, tname);
+}
+
+int luaT_getmetaclass(lua_State *L, int index)
+{
+ return lua_getmetatable(L, index);
+}
+
+const char* luaT_checktypename2id(lua_State *L, const char *tname)
+{
+ const char* id = luaT_typenameid(L, tname);
+ if(!id)
+ luaL_error(L, "unknown class <%s>", tname);
+ return id;
+}
+
+void luaT_registeratid(lua_State *L, const struct luaL_Reg *methods, const char *id)
+{
+ luaT_registeratname(L, methods, id);
+}
+
+/**************************************************************/
diff --git a/nerv/luaT/luaT.h b/nerv/luaT/luaT.h
new file mode 100644
index 0000000..5e8dd2f
--- /dev/null
+++ b/nerv/luaT/luaT.h
@@ -0,0 +1,111 @@
+#ifndef LUAT_UTILS_INC
+#define LUAT_UTILS_INC
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+#include <lua.h>
+#include <lauxlib.h>
+#ifdef __cplusplus
+}
+#endif
+
+#ifndef LUA_EXTERNC
+# ifdef __cplusplus
+# define LUA_EXTERNC extern "C"
+# else
+# define LUA_EXTERNC extern
+# endif
+#endif
+
+#ifdef _MSC_VER
+# define DLL_EXPORT __declspec(dllexport)
+# define DLL_IMPORT __declspec(dllimport)
+# ifdef luaT_EXPORTS
+# define LUAT_API LUA_EXTERNC DLL_EXPORT
+# else
+# define LUAT_API LUA_EXTERNC DLL_IMPORT
+# endif
+#else
+# define DLL_EXPORT
+# define DLL_IMPORT
+# define LUAT_API LUA_EXTERNC
+#endif
+
+
+/* C functions */
+
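+/* Typical usage from a C extension: register a class metatable once with
+ * luaT_newmetatable() (optionally giving a parent, constructor, destructor
+ * and factory), then wrap C pointers with luaT_pushudata() and unwrap them
+ * with luaT_toudata()/luaT_checkudata(). */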
+LUAT_API void* luaT_alloc(lua_State *L, long size);
+LUAT_API void* luaT_realloc(lua_State *L, void *ptr, long size);
+LUAT_API void luaT_free(lua_State *L, void *ptr);
+
+LUAT_API const char* luaT_newmetatable(lua_State *L, const char *tname, const char *parenttname,
+ lua_CFunction constructor, lua_CFunction destructor, lua_CFunction factory);
+
+LUAT_API int luaT_pushmetatable(lua_State *L, const char *tname);
+
+LUAT_API const char* luaT_typenameid(lua_State *L, const char *tname);
+LUAT_API const char* luaT_typename(lua_State *L, int ud);
+
+LUAT_API void luaT_pushudata(lua_State *L, void *udata, const char *tname);
+LUAT_API void *luaT_toudata(lua_State *L, int ud, const char *tname);
+LUAT_API int luaT_isudata(lua_State *L, int ud, const char *tname);
+LUAT_API void *luaT_checkudata(lua_State *L, int ud, const char *tname);
+
+LUAT_API void *luaT_getfieldcheckudata(lua_State *L, int ud, const char *field, const char *tname);
+LUAT_API void *luaT_getfieldchecklightudata(lua_State *L, int ud, const char *field);
+LUAT_API double luaT_getfieldchecknumber(lua_State *L, int ud, const char *field);
+LUAT_API int luaT_getfieldcheckint(lua_State *L, int ud, const char *field);
+LUAT_API const char* luaT_getfieldcheckstring(lua_State *L, int ud, const char *field);
+LUAT_API int luaT_getfieldcheckboolean(lua_State *L, int ud, const char *field);
+LUAT_API void luaT_getfieldchecktable(lua_State *L, int ud, const char *field);
+
+LUAT_API int luaT_typerror(lua_State *L, int ud, const char *tname);
+
+LUAT_API int luaT_checkboolean(lua_State *L, int ud);
+LUAT_API int luaT_optboolean(lua_State *L, int ud, int def);
+
+LUAT_API void luaT_registeratname(lua_State *L, const struct luaL_Reg *methods, const char *name);
+
+/* utility functions */
+LUAT_API const char *luaT_classrootname(const char *tname);
+LUAT_API int luaT_classmodulename(const char *tname, char *module_name);
+
+/* debug */
+LUAT_API void luaT_stackdump(lua_State *L);
+
+/* Lua functions */
+LUAT_API int luaT_lua_newmetatable(lua_State *L);
+LUAT_API int luaT_lua_factory(lua_State *L);
+LUAT_API int luaT_lua_getconstructortable(lua_State *L);
+LUAT_API int luaT_lua_typename(lua_State *L);
+LUAT_API int luaT_lua_isequal(lua_State *L);
+LUAT_API int luaT_lua_pointer(lua_State *L);
+LUAT_API int luaT_lua_setenv(lua_State *L);
+LUAT_API int luaT_lua_getenv(lua_State *L);
+LUAT_API int luaT_lua_getmetatable(lua_State *L);
+LUAT_API int luaT_lua_version(lua_State *L);
+LUAT_API int luaT_lua_setmetatable(lua_State *L);
+LUAT_API int luaT_lua_metatype(lua_State *L);
+LUAT_API int luaT_lua_pushudata(lua_State *L);
+
+/* deprecated functions */
+/* ids have been replaced by string names to identify classes */
+/* the comments indicate the replacement function to use instead */
+#if (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))
+#define LUAT_DEPRECATED __attribute__((__deprecated__))
+#elif defined(_MSC_VER)
+#define LUAT_DEPRECATED __declspec(deprecated)
+#else
+#define LUAT_DEPRECATED
+#endif
+
+LUAT_API LUAT_DEPRECATED int luaT_pushmetaclass(lua_State *L, const char *tname); /* same as luaT_pushmetatable */
+LUAT_API LUAT_DEPRECATED const char* luaT_id(lua_State *L, int ud); /* same as luaT_typename */
+LUAT_API LUAT_DEPRECATED const char* luaT_id2typename(lua_State *L, const char *id); /* same as luaT_typenameid */
+LUAT_API LUAT_DEPRECATED const char* luaT_typename2id(lua_State *L, const char*); /* same as luaT_typenameid */
+LUAT_API LUAT_DEPRECATED int luaT_getmetaclass(lua_State *L, int index); /* same as luaT_getmetatable */
+LUAT_API LUAT_DEPRECATED const char* luaT_checktypename2id(lua_State *L, const char *tname); /* same as luaT_typenameid */
+LUAT_API LUAT_DEPRECATED void luaT_registeratid(lua_State *L, const struct luaL_Reg *methods, const char *id); /* same as luaT_registeratname */
+
+#endif
diff --git a/nerv/matrix/cuda_helper.h b/nerv/matrix/cuda_helper.h
new file mode 100644
index 0000000..fde6f18
--- /dev/null
+++ b/nerv/matrix/cuda_helper.h
@@ -0,0 +1,75 @@
+#ifndef NERV_CUDA_HELPER_H
+#define NERV_CUDA_HELPER_H
+#include "cuda.h"
+#include "cuda_runtime.h"
+#include "driver_types.h"
+#include "cublas_v2.h"
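+/* The *_SAFE_*_CALL macros wrap a CUDA or cuBLAS call, raise a Lua error
+ * via nerv_error() on failure, and (for the *_SYNC_CALL variants) also
+ * synchronize the device so that failures surface at the call site. */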
+#define CUBLAS_SAFE_SYNC_CALL(call) \
+ do { \
+ cublasStatus_t err = (call); \
+ if (err != CUBLAS_STATUS_SUCCESS) \
+ nerv_error(L, "cumatrix cublas error: %s at %s:%d", \
+ cublasGetErrorString(err), __FILE__, __LINE__); \
+ cudaDeviceSynchronize(); \
+ } while (0)
+
+#define CUDA_SAFE_CALL(call) \
+ do { \
+ cudaError_t err = (call); \
+ if (err != cudaSuccess) \
+ nerv_error(L, "cumatrix CUDA error: %s at %s:%d", \
+ cudaGetErrorString(err), __FILE__, __LINE__); \
+ } while (0)
+
+#define CUDA_SAFE_SYNC_CALL(call) \
+ do { \
+ CUDA_SAFE_CALL(call); \
+ cudaDeviceSynchronize(); \
+ } while (0)
+
+#define CHECK_SAME_DIMENSION(a, b) \
+ do { \
+ if (!(a->nrow == b->nrow && a->ncol == b->ncol)) \
+ nerv_error(L, "matrices should be of the same dimension"); \
+ } while (0)
+
+static const char *cublasGetErrorString(cublasStatus_t err) {
+ switch (err)
+ {
+ case CUBLAS_STATUS_SUCCESS:
+ return "CUBLAS_STATUS_SUCCESS";
+ case CUBLAS_STATUS_NOT_INITIALIZED:
+ return "CUBLAS_STATUS_NOT_INITIALIZED";
+ case CUBLAS_STATUS_ALLOC_FAILED:
+ return "CUBLAS_STATUS_ALLOC_FAILED";
+ case CUBLAS_STATUS_INVALID_VALUE:
+ return "CUBLAS_STATUS_INVALID_VALUE";
+ case CUBLAS_STATUS_ARCH_MISMATCH:
+ return "CUBLAS_STATUS_ARCH_MISMATCH";
+ case CUBLAS_STATUS_MAPPING_ERROR:
+ return "CUBLAS_STATUS_MAPPING_ERROR";
+ case CUBLAS_STATUS_EXECUTION_FAILED:
+ return "CUBLAS_STATUS_EXECUTION_FAILED";
+ case CUBLAS_STATUS_INTERNAL_ERROR:
+ return "CUBLAS_STATUS_INTERNAL_ERROR";
+/* case CUBLAS_STATUS_NOT_SUPPORTED:
+ return "CUBLAS_STATUS_NOT_SUPPORTED";
+ case CUBLAS_STATUS_LICENSE_ERROR:
+ return "CUBLAS_STATUS_LICENSE_ERROR"; */
+ }
+ return "<unknown>";
+}
+
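+/* PROFILE_START/PROFILE_STOP bracket a timed region: START records the
+ * profile_start event and opens a do-block, STOP records profile_stop,
+ * waits for it and accumulates the elapsed seconds under the enclosing
+ * function's name via accu_profile(). They must be used as a pair. */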
+#define PROFILE_START \
+ do { \
+ cudaEventRecord(profile_start, 0);
+#define PROFILE_STOP \
+ cudaEventRecord(profile_stop, 0); \
+ cudaEventSynchronize(profile_stop); \
+ float milliseconds = 0; \
+ cudaEventElapsedTime(&milliseconds, profile_start, profile_stop); \
+ accu_profile(__func__, milliseconds / 1000); \
+ } while (0);
+
+#define PROFILE_END
+#endif
diff --git a/nerv/matrix/cukernel.cu b/nerv/matrix/cukernel.cu
new file mode 100644
index 0000000..a19030a
--- /dev/null
+++ b/nerv/matrix/cukernel.cu
@@ -0,0 +1,17 @@
+#define NERV_GENERIC_CUKERNEL
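+/* the generic kernel source is compiled twice, once per element type: each
+ * pass defines the name-mangling macro cudak_() and includes
+ * generic/elem_type.h before pulling in generic/cukernel.cu */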
+
+#define cudak_(NAME) cudak_float_ ## NAME
+#define MATRIX_USE_FLOAT
+#include "generic/elem_type.h"
+#include "generic/cukernel.cu"
+#undef cudak_
+#undef MATRIX_USE_FLOAT
+#undef MATRIX_ELEM
+#undef MATRIX_ELEM_PTR
+#undef MATRIX_ELEM_FMT
+#undef MATRIX_ELEM_WRITE_FMT
+
+#define cudak_(NAME) cudak_double_ ## NAME
+#define MATRIX_USE_DOUBLE
+#include "generic/elem_type.h"
+#include "generic/cukernel.cu"
diff --git a/nerv/matrix/cukernel.h b/nerv/matrix/cukernel.h
new file mode 100644
index 0000000..8a1494f
--- /dev/null
+++ b/nerv/matrix/cukernel.h
@@ -0,0 +1,20 @@
+#ifdef NERV_GENERIC_CUKERNEL
+void cudak_(cuda_mul_elem)(const Matrix *a, const Matrix *b, Matrix *c);
+void cudak_(cuda_log_elem)(const Matrix *a, Matrix *b);
+void cudak_(cuda_sigmoid)(const Matrix *a, Matrix *b);
+void cudak_(cuda_sigmoid_grad)(const Matrix *output, const Matrix *err, Matrix *nerr);
+void cudak_(cuda_rowsum)(const Matrix *a, Matrix *b);
+void cudak_(cuda_rowmax)(const Matrix *a, Matrix *b);
+void cudak_(cuda_rowmax_idx)(const Matrix *a, Matrix *b, Matrix *idx);
+void cudak_(cuda_colsum)(const Matrix *a, Matrix *b);
+void cudak_(cuda_colsame)(const Matrix *a, const Matrix *ref, Matrix *b);
+void cudak_(cuda_softmax_denominator)(const Matrix *a, const Matrix *max, Matrix *b);
+void cudak_(cuda_softmax_final)(const Matrix *a, const Matrix *max, const Matrix *deno, Matrix *b);
+void cudak_(cuda_add_row)(const Matrix *a, Matrix *b, double beta);
+void cudak_(cuda_fill)(Matrix *a, double val);
+void cudak_(cuda_expand_frm)(const Matrix *a, Matrix *b, int context);
+void cudak_(cuda_rearrange_frm)(const Matrix *a, Matrix *b, int step);
+void cudak_(cuda_scale_rows_by_row)(const Matrix *a, Matrix *b);
+void cudak_(cuda_scale_rows_by_col)(const Matrix *a, Matrix *b);
+void cudak_(cuda_decompress)(const Matrix *a, Matrix *b);
+#endif
diff --git a/nerv/matrix/cumatrix.c b/nerv/matrix/cumatrix.c
new file mode 100644
index 0000000..af34fb4
--- /dev/null
+++ b/nerv/matrix/cumatrix.c
@@ -0,0 +1,87 @@
+#define NERV_GENERIC_CUMATRIX
+#include "../common.h"
+#include "cuda_helper.h"
+#include <string.h>
+#define PROFILE_HASHMAP_SIZE 123457
+static cublasHandle_t cublas_handle;
+static cudaEvent_t profile_start, profile_stop;
+static HashMap *profile;
+
+static int print_profile(lua_State *L) {
+ (void)L;
+ size_t i;
+ fprintf(stderr, "*** [nerv cumatrix profile] ***\n");
+ for (i = 0; i < profile->size; i++)
+ {
+ HashNode *ptr;
+ for (ptr = profile->bucket[i]; ptr; ptr = ptr->next)
+ {
+ fprintf(stderr, "%s:\t%.6f\n", ptr->key, *(float *)ptr->val);
+ }
+ }
+ return 0;
+}
+
+static int clear_profile(lua_State *L) {
+ (void)L;
+ hashmap_clear(profile);
+ return 0;
+}
+
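+/* accumulate `delta` seconds of measured time under `name` (the calling
+ * function's __func__), creating the hashmap entry on first use */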
+void accu_profile(const char *name, float delta) {
+ float *val = hashmap_getval(profile, name);
+ if (!val)
+ {
+ val = malloc(sizeof(float));
+ *val = 0;
+ hashmap_setval(profile, name, val);
+ }
+ *val += delta;
+}
+
+static const luaL_Reg cumatrix_methods[] = {
+ {"print_profile", print_profile},
+ {"clear_profile", clear_profile},
+ {NULL, NULL}
+};
+
+extern void nerv_matrix_cuda_float_init(lua_State *L);
+extern void nerv_matrix_cuda_double_init(lua_State *L);
+
+void nerv_cumatrix_init(lua_State *L) {
+ luaL_register(L, NULL, cumatrix_methods);
+ cublasCreate(&cublas_handle);
+ cudaEventCreate(&profile_start);
+ cudaEventCreate(&profile_stop);
+ profile = hashmap_create(PROFILE_HASHMAP_SIZE, bkdr_hash, strcmp);
+ nerv_matrix_cuda_float_init(L);
+ nerv_matrix_cuda_double_init(L);
+}
+
+#define MATRIX_USE_FLOAT
+#define cuda_matrix_(NAME) cuda_matrix_float_##NAME
+#define nerv_matrix_(NAME) nerv_matrix_cuda_float_##NAME
+#define cudak_(NAME) cudak_float_ ## NAME
+#define NERV_CUBLAS_(NAME) cublasS##NAME
+#define MATRIX_CUMATRIX_HOST_TNAME nerv_matrix_host_float_tname
+const char *nerv_matrix_(tname) = "nerv.CuMatrixFloat";
+#include "generic/cumatrix.c"
+#undef NERV_CUBLAS_
+#undef cudak_
+#undef nerv_matrix_
+#undef cuda_matrix_
+#undef MATRIX_USE_FLOAT
+#undef MATRIX_ELEM
+#undef MATRIX_ELEM_PTR
+#undef MATRIX_ELEM_FMT
+#undef MATRIX_ELEM_WRITE_FMT
+#undef MATRIX_CUMATRIX_HOST_TNAME
+
+#define MATRIX_USE_DOUBLE
+#define cuda_matrix_(NAME) cuda_matrix_double_##NAME
+#define nerv_matrix_(NAME) nerv_matrix_cuda_double_##NAME
+#define cudak_(NAME) cudak_double_ ## NAME
+#define NERV_CUBLAS_(NAME) cublasD##NAME
+#define MATRIX_CUMATRIX_HOST_TNAME nerv_matrix_host_double_tname
+const char *nerv_matrix_(tname) = "nerv.CuMatrixDouble";
+#include "generic/cumatrix.c"
diff --git a/nerv/matrix/generic/cukernel.cu b/nerv/matrix/generic/cukernel.cu
new file mode 100644
index 0000000..d6c8adc
--- /dev/null
+++ b/nerv/matrix/generic/cukernel.cu
@@ -0,0 +1,571 @@
+#ifdef NERV_GENERIC_CUKERNEL
+#include <assert.h>
+#include <stdio.h>
+#include "matrix.h"
+#include "cuda.h"
+#include "float.h"
+#define CUDA_THREADS_N 16
+#define CUDA_THREADS_NN ((CUDA_THREADS_N) * (CUDA_THREADS_N))
+#define CEIL_DIV(a, b) (((a) + (b) - 1) / (b))
+__global__ void cudak_(log_elem)(const MATRIX_ELEM *a, MATRIX_ELEM *b,
+ int nrow, int ncol, int stride) {
+ int j = blockIdx.x * blockDim.x + threadIdx.x;
+ int i = blockIdx.y * blockDim.y + threadIdx.y;
+ long idx;
+ MATRIX_ELEM tmp;
+ if (i >= nrow || j >= ncol) return;
+ idx = j + i * stride;
+ tmp = a[idx];
+ if(tmp < FLT_MIN) tmp = FLT_MIN;
+ b[idx] = log(tmp);
+}
+
+__global__ void cudak_(mul_elem)(const MATRIX_ELEM *a, const MATRIX_ELEM *b,
+ MATRIX_ELEM *c,
+ int nrow, int ncol, int stride) {
+ int j = blockIdx.x * blockDim.x + threadIdx.x;
+ int i = blockIdx.y * blockDim.y + threadIdx.y;
+ long idx;
+ if (i >= nrow || j >= ncol) return;
+ idx = j + i * stride;
+ c[idx] = a[idx] * b[idx];
+}
+
+__global__ void cudak_(sigmoid)(const MATRIX_ELEM *a, MATRIX_ELEM *b,
+ int nrow, int ncol, int stride) {
+ int j = blockIdx.x * blockDim.x + threadIdx.x;
+ int i = blockIdx.y * blockDim.y + threadIdx.y;
+ long idx;
+ if (i >= nrow || j >= ncol) return;
+ idx = j + i * stride;
+ b[idx] = 1.0 / (1.0 + exp(-a[idx]));
+}
+
+__global__ void cudak_(sigmoid_grad)(const MATRIX_ELEM *output,
+ const MATRIX_ELEM *err,
+ MATRIX_ELEM *nerr,
+ int nrow, int ncol, int stride) {
+ int j = blockIdx.x * blockDim.x + threadIdx.x;
+ int i = blockIdx.y * blockDim.y + threadIdx.y;
+ long idx;
+ if (i >= nrow || j >= ncol) return;
+ idx = j + i * stride;
+ nerr[idx] = output[idx] * (1.0 - output[idx]) * err[idx];
+}
+
+__global__ void cudak_(softmax_final)(const MATRIX_ELEM *a, MATRIX_ELEM *b,
+ const MATRIX_ELEM *max, const MATRIX_ELEM *deno,
+ int nrow, int ncol, int stride, int mstride) {
+ int j = blockIdx.x * blockDim.x + threadIdx.x;
+ int i = blockIdx.y * blockDim.y + threadIdx.y;
+ long idx;
+ if (i >= nrow || j >= ncol) return;
+ idx = j + i * stride;
+ b[idx] = exp(a[idx] - max[0 + i * mstride]) / deno[0 + i * mstride];
+}
+
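+/* Shared-memory tree reduction: each block loads up to blockDim.x elements
+ * of one row into shared memory, then halves the number of active threads
+ * at every step; the per-block partial sums written to `output` are reduced
+ * again by a second kernel launch on the host side. */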
+__global__ void cudak_(block_reduce_rowsum)(const MATRIX_ELEM *input,
+ MATRIX_ELEM *output,
+ const int istride, const int ostride,
+ const int n) {
+ extern __shared__ MATRIX_ELEM cudak_(arr)[];
+ int j = blockIdx.x * blockDim.x + threadIdx.x;
+ cudak_(arr)[threadIdx.x] = j < n ? input[j + istride * blockIdx.y] : 0;
+ __syncthreads();
+ for (int offset = blockDim.x >> 1; offset; offset >>= 1)
+ {
+ if (threadIdx.x < offset)
+ cudak_(arr)[threadIdx.x] += cudak_(arr)[threadIdx.x + offset];
+ __syncthreads();
+ }
+ if (threadIdx.x == 0)
+ output[blockIdx.x + ostride * blockIdx.y] = cudak_(arr)[0];
+}
+
+__global__ void cudak_(block_reduce_colsum)(const MATRIX_ELEM *input,
+ MATRIX_ELEM *output,
+ const int istride, const int ostride,
+ const int n) {
+ extern __shared__ MATRIX_ELEM cudak_(arr)[];
+ int i = blockIdx.y * blockDim.y + threadIdx.y;
+ cudak_(arr)[threadIdx.y] = i < n ? input[blockIdx.x + istride * i] : 0;
+ __syncthreads();
+ for (int offset = blockDim.y >> 1; offset; offset >>= 1)
+ {
+ if (threadIdx.y < offset)
+ cudak_(arr)[threadIdx.y] += cudak_(arr)[threadIdx.y + offset];
+ __syncthreads();
+ }
+ if (threadIdx.y == 0)
+ output[blockIdx.x + ostride * blockIdx.y] = cudak_(arr)[0];
+}
+
+__global__ void cudak_(block_reduce_colsame)(const MATRIX_ELEM *input,
+ const MATRIX_ELEM *ref_input,
+ MATRIX_ELEM *output,
+ const int istride, const int ostride,
+ const int n) {
+ extern __shared__ MATRIX_ELEM cudak_(arr)[];
+ int i = blockIdx.y * blockDim.y + threadIdx.y;
+ cudak_(arr)[threadIdx.y] = (i < n && input[blockIdx.x + istride * i] == \
+ ref_input[blockIdx.x + istride * i]) ? 1.0 : 0;
+ __syncthreads();
+ for (int offset = blockDim.y >> 1; offset; offset >>= 1)
+ {
+ if (threadIdx.y < offset)
+ cudak_(arr)[threadIdx.y] += cudak_(arr)[threadIdx.y + offset];
+ __syncthreads();
+ }
+ if (threadIdx.y == 0)
+ output[blockIdx.x + ostride * blockIdx.y] = cudak_(arr)[0];
+}
+
+__global__ void cudak_(block_reduce_softmax_rowsum)(const MATRIX_ELEM *input,
+ MATRIX_ELEM *output,
+ const MATRIX_ELEM *max,
+ const int istride, const int ostride,
+ const int mstride, const int n) {
+ extern __shared__ MATRIX_ELEM cudak_(arr)[];
+ int j = blockIdx.x * blockDim.x + threadIdx.x;
+ cudak_(arr)[threadIdx.x] = j < n ? exp(input[j + istride * blockIdx.y] - \
+ max[0 + mstride * blockIdx.y]) : 0;
+ __syncthreads();
+ for (int offset = blockDim.x >> 1; offset; offset >>= 1)
+ {
+ if (threadIdx.x < offset)
+ cudak_(arr)[threadIdx.x] += cudak_(arr)[threadIdx.x + offset];
+ __syncthreads();
+ }
+ if (threadIdx.x == 0)
+ output[blockIdx.x + ostride * blockIdx.y] = cudak_(arr)[0];
+}
+
+__global__ void cudak_(block_reduce_rowmax)(const MATRIX_ELEM *input,
+ MATRIX_ELEM *output,
+ const int istride, const int ostride,
+ const int n) {
+ extern __shared__ MATRIX_ELEM cudak_(arr)[];
+ int j = blockIdx.x * blockDim.x + threadIdx.x;
+ cudak_(arr)[threadIdx.x] = j < n ? input[j + istride * blockIdx.y] : -FLT_MAX;
+ __syncthreads();
+ for (int offset = blockDim.x >> 1; offset; offset >>= 1)
+ {
+ if (threadIdx.x < offset)
+ {
+ MATRIX_ELEM l = cudak_(arr)[threadIdx.x],
+ r = cudak_(arr)[threadIdx.x + offset];
+ if (r > l)
+ cudak_(arr)[threadIdx.x] = r;
+ }
+ __syncthreads();
+ }
+ if (threadIdx.x == 0)
+ output[blockIdx.x + ostride * blockIdx.y] = cudak_(arr)[0];
+}
+
+__global__ void cudak_(block_reduce_rowmax_idx)(const MATRIX_ELEM *input,
+ const MATRIX_ELEM *idx_input,
+ MATRIX_ELEM *output,
+ MATRIX_ELEM *idx_output,
+ const int istride, const int ostride,
+ const int n) {
+ extern __shared__ MATRIX_ELEM cudak_(arr)[];
+ MATRIX_ELEM *arr_val = cudak_(arr);
+ MATRIX_ELEM *arr_idx = arr_val + blockDim.x;
+ int j = blockIdx.x * blockDim.x + threadIdx.x;
+ arr_val[threadIdx.x] = j < n ? input[j + istride * blockIdx.y] : -FLT_MAX;
+ arr_idx[threadIdx.x] = j < n ? idx_input[j + istride * blockIdx.y] : 0;
+ __syncthreads();
+ for (int offset = blockDim.x >> 1; offset; offset >>= 1)
+ {
+ if (threadIdx.x < offset)
+ {
+ MATRIX_ELEM l = arr_val[threadIdx.x],
+ r = arr_val[threadIdx.x + offset];
+ if (r > l)
+ {
+ arr_val[threadIdx.x] = r;
+ arr_idx[threadIdx.x] = arr_idx[threadIdx.x + offset];
+ }
+ }
+ __syncthreads();
+ }
+ if (threadIdx.x == 0)
+ {
+ output[blockIdx.x + ostride * blockIdx.y] = arr_val[0];
+ idx_output[blockIdx.x + ostride * blockIdx.y] = arr_idx[0];
+ }
+}
+
+__global__ void cudak_(add_row)(const MATRIX_ELEM *a, MATRIX_ELEM *b,
+ int nrow, int ncol, int stride, double beta) {
+ int j = blockIdx.x * blockDim.x + threadIdx.x;
+ int i = blockIdx.y * blockDim.y + threadIdx.y;
+ if (i >= nrow || j >= ncol) return;
+ b[j + i * stride] += beta * a[j];
+}
+
+__global__ void cudak_(fill)(MATRIX_ELEM *a,
+ int nrow, int ncol, int stride, double val) {
+ int j = blockIdx.x * blockDim.x + threadIdx.x;
+ int i = blockIdx.y * blockDim.y + threadIdx.y;
+ if (i >= nrow || j >= ncol) return;
+ a[j + i * stride] = val;
+}
+
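+/* frame expansion (context splicing): output row i is the concatenation of
+ * input rows i-context .. i+context, with row indices clamped at the matrix
+ * boundaries; typically used to splice neighbouring feature frames */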
+__global__ void cudak_(expand_frm)(const MATRIX_ELEM *a, MATRIX_ELEM *b,
+ int nrow, int ncol,
+ int enrow, int encol,
+ int stride, int estride,
+ int context) {
+ int j = blockIdx.x * blockDim.x + threadIdx.x;
+ int i = blockIdx.y * blockDim.y + threadIdx.y;
+ int ridx;
+ if (i >= enrow || j >= encol) return;
+ ridx = i + j / ncol - context;
+ if (ridx < 0) ridx = 0;
+ else if (ridx >= nrow) ridx = nrow - 1;
+ b[j + i * estride] = a[j % ncol + ridx * stride];
+}
+
+__global__ void cudak_(rearrange_frm)(const MATRIX_ELEM *a, MATRIX_ELEM *b,
+ int nrow, int ncol,
+ int stride, int step, int orig_dim) {
+ int j = blockIdx.x * blockDim.x + threadIdx.x;
+ int i = blockIdx.y * blockDim.y + threadIdx.y;
+ if (i >= nrow || j >= ncol) return;
+ b[j + i * stride] = a[j / step + (j % step) * orig_dim + i * stride];
+}
+
+__global__ void cudak_(scale_rows_by_col)(const MATRIX_ELEM *a, MATRIX_ELEM *b,
+ int nrow, int ncol,
+ int astride, int bstride) {
+ int j = blockIdx.x * blockDim.x + threadIdx.x;
+ int i = blockIdx.y * blockDim.y + threadIdx.y;
+ if (i >= nrow || j >= ncol) return;
+ b[j + i * bstride] *= a[i * astride];
+}
+
+__global__ void cudak_(scale_rows_by_row)(const MATRIX_ELEM *a, MATRIX_ELEM *b,
+ int nrow, int ncol,
+ int stride) {
+ int j = blockIdx.x * blockDim.x + threadIdx.x;
+ int i = blockIdx.y * blockDim.y + threadIdx.y;
+ if (i >= nrow || j >= ncol) return;
+ b[j + i * stride] *= a[j];
+}
+
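+/* one-hot expansion: for each row i, write 1.0 at column round(a[i][j]) of
+ * b, turning a column of class indices into one-hot rows (b is presumably
+ * zero-filled by the caller beforehand) */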
+__global__ void cudak_(decompress)(const MATRIX_ELEM *a, MATRIX_ELEM *b,
+ int nrow, int ncol,
+ int stride_a, int stride_b) {
+ int j = blockIdx.x * blockDim.x + threadIdx.x;
+ int i = blockIdx.y * blockDim.y + threadIdx.y;
+ if (i >= nrow || j >= ncol) return;
+ b[lrintf(a[j + i * stride_a]) + i * stride_b] = 1.0;
+}
+
+__global__ void cudak_(gen_col_idx)(MATRIX_ELEM *b,
+ int nrow, int ncol, int stride) {
+ int j = blockIdx.x * blockDim.x + threadIdx.x;
+ int i = blockIdx.y * blockDim.y + threadIdx.y;
+ if (i >= nrow || j >= ncol) return;
+ b[j + i * stride] = j;
+}
+
+extern "C" {
+#include "../cukernel.h"
+ void cudak_(cuda_log_elem)(const Matrix *a, Matrix *b) {
+ dim3 threadsPerBlock(CUDA_THREADS_N, CUDA_THREADS_N);
+ dim3 numBlocks(CEIL_DIV(b->ncol, threadsPerBlock.x),
+ CEIL_DIV(b->nrow, threadsPerBlock.y));
+ cudak_(log_elem)<<<numBlocks, threadsPerBlock>>> \
+ (MATRIX_ELEM_PTR(a), MATRIX_ELEM_PTR(b),
+ b->nrow, b->ncol, b->stride / sizeof(MATRIX_ELEM));
+ cudaStreamSynchronize(0);
+ }
+
+ void cudak_(cuda_mul_elem)(const Matrix *a, const Matrix *b,
+ Matrix *c) {
+ dim3 threadsPerBlock(CUDA_THREADS_N, CUDA_THREADS_N);
+ dim3 numBlocks(CEIL_DIV(b->ncol, threadsPerBlock.x),
+ CEIL_DIV(b->nrow, threadsPerBlock.y));
+ cudak_(mul_elem)<<<numBlocks, threadsPerBlock>>> \
+ (MATRIX_ELEM_PTR(a), MATRIX_ELEM_PTR(b),
+ MATRIX_ELEM_PTR(c),
+ b->nrow, b->ncol, b->stride / sizeof(MATRIX_ELEM));
+ cudaStreamSynchronize(0);
+ }
+
+ void cudak_(cuda_sigmoid)(const Matrix *a, Matrix *b) {
+ dim3 threadsPerBlock(CUDA_THREADS_N, CUDA_THREADS_N);
+ dim3 numBlocks(CEIL_DIV(b->ncol, threadsPerBlock.x),
+ CEIL_DIV(b->nrow, threadsPerBlock.y));
+ cudak_(sigmoid)<<<numBlocks, threadsPerBlock>>> \
+ (MATRIX_ELEM_PTR(a), MATRIX_ELEM_PTR(b), b->nrow, b->ncol,
+ b->stride / sizeof(MATRIX_ELEM));
+ cudaStreamSynchronize(0);
+ }
+
+ void cudak_(cuda_sigmoid_grad)(const Matrix *output,
+ const Matrix *err, Matrix *nerr) {
+ dim3 threadsPerBlock(CUDA_THREADS_N, CUDA_THREADS_N);
+ dim3 numBlocks(CEIL_DIV(nerr->ncol, threadsPerBlock.x),
+ CEIL_DIV(nerr->nrow, threadsPerBlock.y));
+ cudak_(sigmoid_grad)<<<numBlocks, threadsPerBlock>>> \
+ (MATRIX_ELEM_PTR(output), MATRIX_ELEM_PTR(err),
+ MATRIX_ELEM_PTR(nerr),
+ nerr->nrow, nerr->ncol,
+ nerr->stride / sizeof(MATRIX_ELEM));
+ cudaStreamSynchronize(0);
+ }
+
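+ /* two-pass reduction: the first launch reduces every row to
+ * blocks_per_row partial sums, the second reduces those partials with a
+ * single block; the assert below implies ncol may be at most
+ * CUDA_THREADS_NN * CUDA_THREADS_NN */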
+ void cudak_(cuda_rowsum)(const Matrix *a, Matrix *b) {
+ dim3 block(CUDA_THREADS_NN, 1);
+ int ncol = a->ncol;
+ int blocks_per_row = CEIL_DIV(ncol, block.x);
+ dim3 grid(blocks_per_row, a->nrow);
+ MATRIX_ELEM *res;
+ size_t stride;
+ cudaMallocPitch(&res, &stride, blocks_per_row * sizeof(MATRIX_ELEM), a->nrow);
+ cudak_(block_reduce_rowsum)<<<grid, block, block.x * sizeof(MATRIX_ELEM)>>> \
+ (MATRIX_ELEM_PTR(a), res,
+ a->stride / sizeof(MATRIX_ELEM), stride / sizeof(MATRIX_ELEM),
+ ncol);
+ ncol = blocks_per_row;
+ assert((unsigned long)ncol <= block.x);
+ grid.x = 1;
+ cudaStreamSynchronize(0);
+ cudak_(block_reduce_rowsum)<<<grid, block, block.x * sizeof(MATRIX_ELEM)>>> \
+ (res, MATRIX_ELEM_PTR(b),
+ stride / sizeof(MATRIX_ELEM), b->stride / sizeof(MATRIX_ELEM),
+ ncol);
+ cudaStreamSynchronize(0);
+ cudaFree(res);
+ }
+
+ void cudak_(cuda_colsame)(const Matrix *a, const Matrix *ref, Matrix *b) {
+ dim3 block(1, CUDA_THREADS_NN);
+ int nrow = a->nrow;
+ int blocks_per_col = CEIL_DIV(nrow, block.y);
+ dim3 grid(a->ncol, blocks_per_col);
+ MATRIX_ELEM *res;
+ size_t stride;
+ cudaMallocPitch(&res, &stride, a->ncol * sizeof(MATRIX_ELEM), blocks_per_col);
+ cudak_(block_reduce_colsame)<<<grid, block, block.y * sizeof(MATRIX_ELEM)>>> \
+ (MATRIX_ELEM_PTR(a), MATRIX_ELEM_PTR(ref), res,
+ a->stride / sizeof(MATRIX_ELEM), stride / sizeof(MATRIX_ELEM),
+ nrow);
+ nrow = blocks_per_col;
+ assert((unsigned long)nrow <= block.y);
+ grid.y = 1;
+ cudaStreamSynchronize(0);
+ cudak_(block_reduce_colsum)<<<grid, block, block.y * sizeof(MATRIX_ELEM)>>> \
+ (res, MATRIX_ELEM_PTR(b),
+ stride / sizeof(MATRIX_ELEM), b->stride / sizeof(MATRIX_ELEM),
+ nrow);
+ cudaStreamSynchronize(0);
+ cudaFree(res);
+ }
+
+ void cudak_(cuda_colsum)(const Matrix *a, Matrix *b) {
+ dim3 block(1, CUDA_THREADS_NN);
+ int nrow = a->nrow;
+ int blocks_per_col = CEIL_DIV(nrow, block.y);
+ dim3 grid(a->ncol, blocks_per_col);
+ MATRIX_ELEM *res;
+ size_t stride;
+ cudaMallocPitch(&res, &stride, a->ncol * sizeof(MATRIX_ELEM), blocks_per_col);
+ cudak_(block_reduce_colsum)<<<grid, block, block.y * sizeof(MATRIX_ELEM)>>> \
+ (MATRIX_ELEM_PTR(a), res,
+ a->stride / sizeof(MATRIX_ELEM), stride / sizeof(MATRIX_ELEM),
+ nrow);
+ nrow = blocks_per_col;
+ assert((unsigned long)nrow <= block.y);
+ grid.y = 1;
+ cudaStreamSynchronize(0);
+ cudak_(block_reduce_colsum)<<<grid, block, block.y * sizeof(MATRIX_ELEM)>>> \
+ (res, MATRIX_ELEM_PTR(b),
+ stride / sizeof(MATRIX_ELEM), b->stride / sizeof(MATRIX_ELEM),
+ nrow);
+ cudaStreamSynchronize(0);
+ cudaFree(res);
+ }
+
+ void cudak_(cuda_softmax_final)(const Matrix *a, const Matrix *max,
+ const Matrix *deno, Matrix *b) {
+ dim3 threadsPerBlock(CUDA_THREADS_N, CUDA_THREADS_N);
+ dim3 numBlocks(CEIL_DIV(b->ncol, threadsPerBlock.x),
+ CEIL_DIV(b->nrow, threadsPerBlock.y));
+ cudak_(softmax_final)<<<numBlocks, threadsPerBlock>>> \
+ (MATRIX_ELEM_PTR(a), MATRIX_ELEM_PTR(b),
+ MATRIX_ELEM_PTR(max), MATRIX_ELEM_PTR(deno),
+ b->nrow, b->ncol,
+ b->stride / sizeof(MATRIX_ELEM),
+ max->stride / sizeof(MATRIX_ELEM));
+ cudaStreamSynchronize(0);
+ }
+
+ void cudak_(cuda_softmax_denominator)(const Matrix *a, const Matrix *max, Matrix *b) {
+ dim3 block(CUDA_THREADS_NN, 1);
+ int ncol = a->ncol;
+ int blocks_per_row = CEIL_DIV(ncol, block.x);
+ dim3 grid(blocks_per_row, a->nrow);
+ MATRIX_ELEM *res;
+ size_t stride;
+ assert(max->ncol == 1);
+ cudaMallocPitch(&res, &stride, blocks_per_row * sizeof(MATRIX_ELEM), a->nrow);
+ cudak_(block_reduce_softmax_rowsum) \
+ <<<grid, block, block.x * sizeof(MATRIX_ELEM)>>> \
+ (MATRIX_ELEM_PTR(a), res, MATRIX_ELEM_PTR(max),
+ a->stride / sizeof(MATRIX_ELEM), stride / sizeof(MATRIX_ELEM),
+ max->stride / sizeof(MATRIX_ELEM),
+ ncol);
+ ncol = blocks_per_row;
+ assert((unsigned long)ncol <= block.x);
+ grid.x = 1;
+ cudaStreamSynchronize(0);
+ cudak_(block_reduce_rowsum) \
+ <<<grid, block, block.x * sizeof(MATRIX_ELEM)>>> \
+ (res, MATRIX_ELEM_PTR(b),
+ stride / sizeof(MATRIX_ELEM), b->stride / sizeof(MATRIX_ELEM),
+ ncol);
+ cudaStreamSynchronize(0);
+ cudaFree(res);
+ }
+
+ void cudak_(cuda_rowmax)(const Matrix *a, Matrix *b) {
+ dim3 block(CUDA_THREADS_NN, 1);
+ int ncol = a->ncol;
+ int blocks_per_row = CEIL_DIV(ncol, block.x);
+ dim3 grid(blocks_per_row, a->nrow);
+ MATRIX_ELEM *res;
+ size_t stride;
+ cudaMallocPitch(&res, &stride, blocks_per_row * sizeof(MATRIX_ELEM), a->nrow);
+ cudak_(block_reduce_rowmax)<<<grid, block, block.x * sizeof(MATRIX_ELEM)>>> \
+ (MATRIX_ELEM_PTR(a), res,
+ a->stride / sizeof(MATRIX_ELEM), stride / sizeof(MATRIX_ELEM),
+ ncol);
+ ncol = blocks_per_row;
+ assert((unsigned long)ncol <= block.x);
+ grid.x = 1;
+ cudaStreamSynchronize(0);
+ cudak_(block_reduce_rowmax)<<<grid, block, block.x * sizeof(MATRIX_ELEM)>>> \
+ (res, MATRIX_ELEM_PTR(b),
+ stride / sizeof(MATRIX_ELEM), b->stride / sizeof(MATRIX_ELEM),
+ ncol);
+ cudaStreamSynchronize(0);
+ cudaFree(res);
+ }
+
+ void cudak_(cuda_rowmax_idx)(const Matrix *a, Matrix *b, Matrix *b_idx) {
+ dim3 block(CUDA_THREADS_NN, 1);
+ int ncol = a->ncol;
+ int blocks_per_row = CEIL_DIV(ncol, block.x);
+ dim3 grid(blocks_per_row, a->nrow);
+ MATRIX_ELEM *a_idx, *res, *res_idx;
+ size_t stride;
+ cudaMallocPitch(&a_idx, &stride, a->stride, a->nrow);
+ cudak_(gen_col_idx)<<<grid, block>>>(a_idx, a->nrow, ncol, stride / sizeof(MATRIX_ELEM));
+ cudaMallocPitch(&res, &stride, blocks_per_row * sizeof(MATRIX_ELEM), a->nrow);
+ cudaMallocPitch(&res_idx, &stride, blocks_per_row * sizeof(MATRIX_ELEM), a->nrow);
+ cudaStreamSynchronize(0);
+ cudak_(block_reduce_rowmax_idx)<<<grid, block,
+ 2 * block.x * sizeof(MATRIX_ELEM)>>> \
+ (MATRIX_ELEM_PTR(a), a_idx, res, res_idx,
+ a->stride / sizeof(MATRIX_ELEM), stride / sizeof(MATRIX_ELEM),
+ ncol);
+ ncol = blocks_per_row;
+ assert((unsigned long)ncol <= block.x);
+ grid.x = 1;
+ cudaStreamSynchronize(0);
+ cudak_(block_reduce_rowmax_idx)<<<grid, block,
+ 2 * block.x * sizeof(MATRIX_ELEM)>>> \
+ (res, res_idx, MATRIX_ELEM_PTR(b), MATRIX_ELEM_PTR(b_idx),
+ stride / sizeof(MATRIX_ELEM), b->stride / sizeof(MATRIX_ELEM),
+ ncol);
+ cudaStreamSynchronize(0);
+ cudaFree(a_idx);
+ cudaFree(res);
+ cudaFree(res_idx);
+ }
+
+ /* in-place calc */
+ void cudak_(cuda_add_row)(const Matrix *a, Matrix *b, double beta) {
+ dim3 threadsPerBlock(CUDA_THREADS_N, CUDA_THREADS_N);
+ dim3 numBlocks(CEIL_DIV(b->ncol, threadsPerBlock.x),
+ CEIL_DIV(b->nrow, threadsPerBlock.y));
+ cudak_(add_row)<<<numBlocks, threadsPerBlock>>> \
+ (MATRIX_ELEM_PTR(a), MATRIX_ELEM_PTR(b), b->nrow, b->ncol,
+ b->stride / sizeof(MATRIX_ELEM), beta);
+ cudaStreamSynchronize(0);
+ }
+
+ void cudak_(cuda_fill)(Matrix *a, double val) {
+ dim3 threadsPerBlock(CUDA_THREADS_N, CUDA_THREADS_N);
+ dim3 numBlocks(CEIL_DIV(a->ncol, threadsPerBlock.x),
+ CEIL_DIV(a->nrow, threadsPerBlock.y));
+ cudak_(fill)<<<numBlocks, threadsPerBlock>>> \
+ (MATRIX_ELEM_PTR(a), a->nrow, a->ncol,
+ a->stride / sizeof(MATRIX_ELEM), val);
+ cudaStreamSynchronize(0);
+ }
+
+ void cudak_(cuda_expand_frm)(const Matrix *a, Matrix *b, int context) {
+ dim3 threadsPerBlock(CUDA_THREADS_N, CUDA_THREADS_N);
+ dim3 numBlocks(CEIL_DIV(b->ncol, threadsPerBlock.x),
+ CEIL_DIV(b->nrow, threadsPerBlock.y));
+ cudak_(expand_frm)<<<numBlocks, threadsPerBlock>>> \
+ (MATRIX_ELEM_PTR(a), MATRIX_ELEM_PTR(b),
+ a->nrow, a->ncol,
+ b->nrow, b->ncol,
+ a->stride / sizeof(MATRIX_ELEM),
+ b->stride / sizeof(MATRIX_ELEM),
+ context);
+ cudaStreamSynchronize(0);
+ }
+
+ void cudak_(cuda_rearrange_frm)(const Matrix *a, Matrix *b, int step) {
+ dim3 threadsPerBlock(CUDA_THREADS_N, CUDA_THREADS_N);
+ dim3 numBlocks(CEIL_DIV(b->ncol, threadsPerBlock.x),
+ CEIL_DIV(b->nrow, threadsPerBlock.y));
+ cudak_(rearrange_frm)<<<numBlocks, threadsPerBlock>>> \
+ (MATRIX_ELEM_PTR(a), MATRIX_ELEM_PTR(b),
+ b->nrow, b->ncol, b->stride / sizeof(MATRIX_ELEM),
+ step, b->ncol / step);
+ cudaStreamSynchronize(0);
+ }
+
+ void cudak_(cuda_scale_rows_by_col)(const Matrix *a, Matrix *b) {
+ dim3 threadsPerBlock(CUDA_THREADS_N, CUDA_THREADS_N);
+ dim3 numBlocks(CEIL_DIV(b->ncol, threadsPerBlock.x),
+ CEIL_DIV(b->nrow, threadsPerBlock.y));
+ cudak_(scale_rows_by_col)<<<numBlocks, threadsPerBlock>>> \
+ (MATRIX_ELEM_PTR(a), MATRIX_ELEM_PTR(b),
+ b->nrow, b->ncol,
+ a->stride / sizeof(MATRIX_ELEM),
+ b->stride / sizeof(MATRIX_ELEM));
+ cudaStreamSynchronize(0);
+ }
+
+ void cudak_(cuda_scale_rows_by_row)(const Matrix *a, Matrix *b) {
+ dim3 threadsPerBlock(CUDA_THREADS_N, CUDA_THREADS_N);
+ dim3 numBlocks(CEIL_DIV(b->ncol, threadsPerBlock.x),
+ CEIL_DIV(b->nrow, threadsPerBlock.y));
+ cudak_(scale_rows_by_row)<<<numBlocks, threadsPerBlock>>> \
+ (MATRIX_ELEM_PTR(a), MATRIX_ELEM_PTR(b),
+ b->nrow, b->ncol, b->stride / sizeof(MATRIX_ELEM));
+ cudaStreamSynchronize(0);
+ }
+
+ void cudak_(cuda_decompress)(const Matrix *a, Matrix *b) {
+ dim3 threadsPerBlock(1, CUDA_THREADS_NN);
+ dim3 numBlocks(1, CEIL_DIV(a->nrow, threadsPerBlock.y));
+ cudak_(decompress)<<<numBlocks, threadsPerBlock>>> \
+ (MATRIX_ELEM_PTR(a), MATRIX_ELEM_PTR(b),
+ a->nrow, a->ncol,
+ a->stride / sizeof(MATRIX_ELEM),
+ b->stride / sizeof(MATRIX_ELEM));
+ cudaStreamSynchronize(0);
+ }
+}
+#endif
diff --git a/nerv/matrix/generic/cumatrix.c b/nerv/matrix/generic/cumatrix.c
new file mode 100644
index 0000000..b5d1a35
--- /dev/null
+++ b/nerv/matrix/generic/cumatrix.c
@@ -0,0 +1,493 @@
+#ifdef NERV_GENERIC_CUMATRIX
+#include "matrix.h"
+#include "elem_type.h"
+
+#define MATRIX_DATA_FREE(L, ptr) cuda_matrix_(free)(L, ptr)
+#define MATRIX_DATA_ALLOC(L, dptr, stride, width, height) \
+ cuda_matrix_(alloc)(L, dptr, stride, width, height)
+#define MATRIX_DATA_WRITE(L, data, idx, val) cuda_matrix_(write)(L, data, idx, val)
+#define MATRIX_DATA_READ(L, data, idx) cuda_matrix_(read)(L, data, idx)
+#define MATRIX_INIT(L) cuda_matrix_(init)(L)
+#define MATRIX_BASE_TNAME nerv_matrix_cuda_tname
+#define NERV_GENERIC_MATRIX
+#define NERV_GENERIC_CUKERNEL
+#include "../../common.h"
+#include "../cukernel.h"
+#include "../cuda_helper.h"
+
+Matrix *nerv_matrix_(new_)(lua_State *L, long nrow, long ncol);
+void nerv_matrix_(data_free)(lua_State *L, Matrix *self);
+
+static void nerv_matrix_(add_)(lua_State *L, const Matrix *a, const Matrix *b,
+ const Matrix *c,
+ MATRIX_ELEM alpha, MATRIX_ELEM beta) {
+ PROFILE_START
+ CUBLAS_SAFE_SYNC_CALL(
+ NERV_CUBLAS_(geam)(cublas_handle, CUBLAS_OP_N, CUBLAS_OP_N,
+ a->ncol, a->nrow,
+ &alpha,
+ MATRIX_ELEM_PTR(a), a->stride / sizeof(MATRIX_ELEM),
+ &beta,
+ MATRIX_ELEM_PTR(b), b->stride / sizeof(MATRIX_ELEM),
+ MATRIX_ELEM_PTR(c), c->stride / sizeof(MATRIX_ELEM)));
+ PROFILE_STOP
+}
+
+static int nerv_matrix_(add)(lua_State *L) {
+ Matrix *c = luaT_checkudata(L, 1, nerv_matrix_(tname));
+ Matrix *a = luaT_checkudata(L, 2, nerv_matrix_(tname));
+ Matrix *b = luaT_checkudata(L, 3, nerv_matrix_(tname));
+ MATRIX_ELEM alpha = luaL_checknumber(L, 4);
+ MATRIX_ELEM beta = luaL_checknumber(L, 5);
+ CHECK_SAME_DIMENSION(a, b);
+ CHECK_SAME_DIMENSION(a, c);
+ nerv_matrix_(add_)(L, a, b, c, alpha, beta);
+ return 0;
+}
+
+static int nerv_matrix_(get_cublas_op)(char ch) {
+ return (ch == 'T' || ch == 't') ? CUBLAS_OP_T : CUBLAS_OP_N;
+}
+
+static int nerv_matrix_(mul)(lua_State *L) {
+#define SWAP(a, b) \
+ do { int t = (a); (a) = (b); (b) = t; } while (0)
+
+ Matrix *c = luaT_checkudata(L, 1, nerv_matrix_(tname));
+ Matrix *a = luaT_checkudata(L, 2, nerv_matrix_(tname));
+ Matrix *b = luaT_checkudata(L, 3, nerv_matrix_(tname));
+ MATRIX_ELEM alpha = luaL_checknumber(L, 4);
+ MATRIX_ELEM beta = luaL_checknumber(L, 5);
+ int nargs = lua_gettop(L);
+ int ta = nargs > 5 ? nerv_matrix_(get_cublas_op)(*luaL_checkstring(L, 6)) \
+ : CUBLAS_OP_N;
+ int tb = nargs > 6 ? nerv_matrix_(get_cublas_op)(*luaL_checkstring(L, 7)) \
+ : CUBLAS_OP_N;
+ int am = a->nrow, an = a->ncol;
+ int bm = b->nrow, bn = b->ncol;
+ if (ta == CUBLAS_OP_T) SWAP(am, an);
+ if (tb == CUBLAS_OP_T) SWAP(bm, bn);
+ if (an != bm)
+ nerv_error(L, "Wrong dimension of multipliers");
+/* MATRIX_ELEM alpha = 1.0f, beta = 0.0f; */
+ /* Because matrix in Nerv is row-major, here b comes first */
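+ /* i.e. compute C^T = op(B)^T * op(A)^T with column-major cuBLAS, whose
+ * result laid out column-major is exactly the row-major
+ * C = alpha * op(A) * op(B) + beta * C */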
+ PROFILE_START
+ CUBLAS_SAFE_SYNC_CALL(
+ NERV_CUBLAS_(gemm)(cublas_handle, tb, ta,
+ bn, am, bm,
+ &alpha,
+ MATRIX_ELEM_PTR(b), b->stride / sizeof(MATRIX_ELEM),
+ MATRIX_ELEM_PTR(a), a->stride / sizeof(MATRIX_ELEM),
+ &beta,
+ MATRIX_ELEM_PTR(c), c->stride / sizeof(MATRIX_ELEM)));
+ PROFILE_STOP
+ return 0;
+}
+
+static int nerv_matrix_(create)(lua_State *L) {
+ Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname));
+ Matrix *b = nerv_matrix_(new_)(L, a->nrow, a->ncol);
+ luaT_pushudata(L, b, nerv_matrix_(tname));
+ return 1;
+}
+
+static int nerv_matrix_(sigmoid)(lua_State *L) {
+ Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname));
+ Matrix *b = luaT_checkudata(L, 2, nerv_matrix_(tname));
+ CHECK_SAME_DIMENSION(a, b);
+ PROFILE_START
+ cudak_(cuda_sigmoid)(b, a);
+ PROFILE_STOP
+ return 0;
+}
+
+static int nerv_matrix_(sigmoid_grad)(lua_State *L) {
+ Matrix *nerr = luaT_checkudata(L, 1, nerv_matrix_(tname));
+ Matrix *err = luaT_checkudata(L, 2, nerv_matrix_(tname));
+ Matrix *output = luaT_checkudata(L, 3, nerv_matrix_(tname));
+ CHECK_SAME_DIMENSION(nerr, err);
+ CHECK_SAME_DIMENSION(nerr, output);
+ PROFILE_START
+ cudak_(cuda_sigmoid_grad)(output, err, nerr);
+ PROFILE_STOP
+ return 0;
+}
+
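+/* numerically stable softmax: b[i][j] = exp(a[i][j] - max_i) / sum_k exp(a[i][k] - max_i),
+ * where max_i is the maximum of row i; the per-row argmax is returned as a
+ * one-column matrix */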
+static int nerv_matrix_(softmax)(lua_State *L) {
+ Matrix *a = luaT_checkudata(L, 2, nerv_matrix_(tname));
+ Matrix *b = luaT_checkudata(L, 1, nerv_matrix_(tname));
+ Matrix *max, *max_idx;
+ Matrix *dno;
+ CHECK_SAME_DIMENSION(a, b);
+ max = nerv_matrix_(new_)(L, a->nrow, 1);
+ max_idx = nerv_matrix_(new_)(L, a->nrow, 1);
+ dno = nerv_matrix_(new_)(L, a->nrow, 1);
+ PROFILE_START
+ cudak_(cuda_rowmax_idx)(a, max, max_idx);
+ cudak_(cuda_softmax_denominator)(a, max, dno);
+ cudak_(cuda_softmax_final)(a, max, dno, b);
+ PROFILE_STOP
+ nerv_matrix_(data_free)(L, max);
+ nerv_matrix_(data_free)(L, dno);
+ luaT_pushudata(L, max_idx, nerv_matrix_(tname));
+ return 1;
+}
+
+static int nerv_matrix_(rowsum)(lua_State *L) {
+ Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname));
+ Matrix *b = nerv_matrix_(new_)(L, a->nrow, 1);
+ PROFILE_START
+ cudak_(cuda_rowsum)(a, b);
+ PROFILE_STOP
+ luaT_pushudata(L, b, nerv_matrix_(tname));
+ return 1;
+}
+
+static int nerv_matrix_(colsum)(lua_State *L) {
+ Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname));
+ Matrix *b = nerv_matrix_(new_)(L, 1, a->ncol);
+ PROFILE_START
+ cudak_(cuda_colsum)(a, b);
+ PROFILE_STOP
+ luaT_pushudata(L, b, nerv_matrix_(tname));
+ return 1;
+}
+
+static int nerv_matrix_(colsame)(lua_State *L) {
+ Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname));
+ Matrix *ref = luaT_checkudata(L, 2, nerv_matrix_(tname));
+ Matrix *b = nerv_matrix_(new_)(L, 1, a->ncol);
+ CHECK_SAME_DIMENSION(a, ref);
+ PROFILE_START
+ cudak_(cuda_colsame)(a, ref, b);
+ PROFILE_STOP
+ luaT_pushudata(L, b, nerv_matrix_(tname));
+ return 1;
+}
+
+static int nerv_matrix_(rowmax)(lua_State *L) {
+ Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname));
+ Matrix *b = nerv_matrix_(new_)(L, a->nrow, 1);
+ PROFILE_START
+ cudak_(cuda_rowmax)(a, b);
+ PROFILE_STOP
+ luaT_pushudata(L, b, nerv_matrix_(tname));
+ return 1;
+}
+
+static int nerv_matrix_(rowmax_idx)(lua_State *L) {
+ Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname));
+ Matrix *b = nerv_matrix_(new_)(L, a->nrow, 1);
+ Matrix *idx = nerv_matrix_(new_)(L, a->nrow, 1);
+ PROFILE_START
+ cudak_(cuda_rowmax_idx)(a, b, idx);
+ PROFILE_STOP
+ luaT_pushudata(L, b, nerv_matrix_(tname));
+ luaT_pushudata(L, idx, nerv_matrix_(tname));
+ return 2;
+}
+
+static int nerv_matrix_(add_row)(lua_State *L) {
+ Matrix *a = luaT_checkudata(L, 2, nerv_matrix_(tname));
+ Matrix *b = luaT_checkudata(L, 1, nerv_matrix_(tname));
+ double beta = luaL_checknumber(L, 3);
+ if (a->ncol != b->ncol)
+ nerv_error(L, "the number of columns is not the same");
+ if (a->nrow != 1)
+ nerv_error(L, "a row vector is expected");
+ PROFILE_START
+ cudak_(cuda_add_row)(a, b, beta);
+ PROFILE_STOP
+ return 0;
+}
+
+static int nerv_matrix_(fill)(lua_State *L) {
+ Matrix *self = luaT_checkudata(L, 1, nerv_matrix_(tname));
+ double val = luaL_checknumber(L, 2);
+ PROFILE_START
+ cudak_(cuda_fill)(self, val);
+ PROFILE_STOP
+ return 0;
+}
+
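+/* The copy helpers below take optional half-open row ranges:
+ * copy_fromd(a, b[, b_begin[, b_end[, a_begin]]]) copies rows [b_begin, b_end)
+ * of the source b into the destination a starting at row a_begin; the defaults
+ * cover the whole source and start at destination row 0. copy_fromh and
+ * copy_toh follow the same convention for host-to-device and device-to-host
+ * transfers respectively. */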
+static int nerv_matrix_(copy_fromd)(lua_State *L) {
+ Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname));
+ Matrix *b = luaT_checkudata(L, 2, nerv_matrix_(tname));
+ int nargs = lua_gettop(L);
+ int b_begin = nargs > 2 ? luaL_checkinteger(L, 3) : 0;
+ int b_end = nargs > 3 ? luaL_checkinteger(L, 4) : b->nrow;
+ int a_begin = nargs > 4 ? luaL_checkinteger(L, 5) : 0;
+ if (!(0 <= b_begin && b_begin < b_end && b_end <= b->nrow &&
+ a_begin + b_end - b_begin <= a->nrow))
+ nerv_error(L, "invalid copy interval");
+ if (a->ncol != b->ncol)
+ nerv_error(L, "matrices should be of the same dimension");
+ PROFILE_START
+ CUDA_SAFE_SYNC_CALL(
+ cudaMemcpy2D(MATRIX_ROW_PTR(a, a_begin), a->stride,
+ MATRIX_ROW_PTR(b, b_begin), b->stride,
+ sizeof(MATRIX_ELEM) * b->ncol, b_end - b_begin,
+ cudaMemcpyDeviceToDevice));
+ PROFILE_STOP
+ return 0;
+}
+
+extern const char *MATRIX_CUMATRIX_HOST_TNAME;
+static int nerv_matrix_(copy_fromh)(lua_State *L) {
+ Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname));
+ Matrix *b = luaT_checkudata(L, 2, MATRIX_CUMATRIX_HOST_TNAME);
+ int nargs = lua_gettop(L);
+ int b_begin = nargs > 2 ? luaL_checkinteger(L, 3) : 0;
+ int b_end = nargs > 3 ? luaL_checkinteger(L, 4) : b->nrow;
+ int a_begin = nargs > 4 ? luaL_checkinteger(L, 5) : 0;
+ if (!(0 <= b_begin && b_begin < b_end && b_end <= b->nrow &&
+ a_begin + b_end - b_begin <= a->nrow))
+ nerv_error(L, "invalid copy interval");
+ if (a->ncol != b->ncol)
+ nerv_error(L, "matrices should be of the same dimension");
+ PROFILE_START
+ CUDA_SAFE_SYNC_CALL(
+ cudaMemcpy2D(MATRIX_ROW_PTR(a, a_begin), a->stride,
+ MATRIX_ROW_PTR(b, b_begin), b->stride,
+ sizeof(MATRIX_ELEM) * b->ncol, b_end - b_begin,
+ cudaMemcpyHostToDevice));
+ PROFILE_STOP
+ return 0;
+}
+
+static int nerv_matrix_(copy_toh)(lua_State *L) {
+ Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname));
+ Matrix *b = luaT_checkudata(L, 2, MATRIX_CUMATRIX_HOST_TNAME);
+ int nargs = lua_gettop(L);
+ int a_begin = nargs > 2 ? luaL_checkinteger(L, 3) : 0;
+ int a_end = nargs > 3 ? luaL_checkinteger(L, 4) : a->nrow;
+ int b_begin = nargs > 4 ? luaL_checkinteger(L, 5) : 0;
+ if (!(0 <= a_begin && a_begin < a_end && a_end <= a->nrow &&
+ b_begin + a_end - a_begin <= b->nrow))
+ nerv_error(L, "invalid copy interval");
+ if (b->ncol != a->ncol)
+ nerv_error(L, "matrices should be of the same dimension");
+ PROFILE_START
+ CUDA_SAFE_SYNC_CALL(
+ cudaMemcpy2D(MATRIX_ROW_PTR(b, b_begin), b->stride,
+ MATRIX_ROW_PTR(a, a_begin), a->stride,
+ sizeof(MATRIX_ELEM) * a->ncol, a_end - a_begin,
+ cudaMemcpyDeviceToHost));
+ PROFILE_STOP
+ return 0;
+}
+
+static int nerv_matrix_(trans)(lua_State *L) {
+ Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname));
+ Matrix *b = nerv_matrix_(new_)(L, a->ncol, a->nrow);
+ MATRIX_ELEM alpha = 1, beta = 0;
+ /* FIXME: possible memory leak when lua error is raised */
+ PROFILE_START
+ CUBLAS_SAFE_SYNC_CALL(
+ NERV_CUBLAS_(geam)(cublas_handle, CUBLAS_OP_T, CUBLAS_OP_T,
+ a->nrow, a->ncol,
+ &alpha,
+ MATRIX_ELEM_PTR(a), a->stride / sizeof(MATRIX_ELEM),
+ &beta,
+ MATRIX_ELEM_PTR(a), a->stride / sizeof(MATRIX_ELEM),
+ MATRIX_ELEM_PTR(b), b->stride / sizeof(MATRIX_ELEM)));
+ PROFILE_STOP
+ luaT_pushudata(L, b, nerv_matrix_(tname));
+ return 1;
+}
+
+static int nerv_matrix_(mul_elem)(lua_State *L) {
+ Matrix *a = luaT_checkudata(L, 2, nerv_matrix_(tname));
+ Matrix *b = luaT_checkudata(L, 3, nerv_matrix_(tname));
+ Matrix *c = luaT_checkudata(L, 1, nerv_matrix_(tname));
+ CHECK_SAME_DIMENSION(a, b);
+ CHECK_SAME_DIMENSION(a, c);
+ PROFILE_START
+ cudak_(cuda_mul_elem)(a, b, c);
+ PROFILE_STOP
+ return 0;
+}
+
+static int nerv_matrix_(log_elem)(lua_State *L) {
+ Matrix *a = luaT_checkudata(L, 2, nerv_matrix_(tname));
+ Matrix *b = luaT_checkudata(L, 1, nerv_matrix_(tname));
+ CHECK_SAME_DIMENSION(a, b);
+ PROFILE_START
+ cudak_(cuda_log_elem)(a, b);
+ PROFILE_STOP
+ return 0;
+}
+
+static int nerv_matrix_(decompress)(lua_State *L) {
+ Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname));
+ Matrix *b;
+ int orig_col = luaL_checkinteger(L, 2);
+ if (a->ncol != 1)
+ nerv_error(L, "the compressed matrix must be a column vector");
+ b = nerv_matrix_(new_)(L, a->nrow, orig_col);
+ PROFILE_START
+ cudak_(cuda_fill)(b, 0.0);
+ cudak_(cuda_decompress)(a, b);
+ PROFILE_STOP
+ luaT_pushudata(L, b, nerv_matrix_(tname));
+ return 1;
+}
+
+extern const char *nerv_matrix_host_int_tname;
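+/* copy a->nrow rows from the host matrix b into the device matrix a: row i of
+ * a is taken from row idx[b_begin + i] of b; the transfers are issued on one
+ * CUDA stream per row so they can proceed asynchronously */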
+static int nerv_matrix_(copy_rows_fromh_by_idx)(lua_State *L) {
+ Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname));
+ Matrix *b = luaT_checkudata(L, 2, MATRIX_CUMATRIX_HOST_TNAME);
+ Matrix *idx = luaT_checkudata(L, 3, nerv_matrix_host_int_tname);
+ long nrow = a->nrow;
+ int b_begin = lua_gettop(L) > 3 ? luaL_checkinteger(L, 4) : 0;
+ if (!(0 <= b_begin && b_begin + nrow <= idx->ncol))
+ nerv_error(L, "invalid copy interval");
+ long *idx_ptr = idx->data.i;
+ int i;
+ if (idx->nrow != 1)
+ nerv_error(L, "index should be a vector");
+ if (a->ncol != b->ncol)
+ nerv_error(L, "source/destination dimension mismatch");
+ cudaStream_t *streams = (cudaStream_t*)malloc(sizeof(cudaStream_t) * nrow);
+ for (i = 0; i < nrow; i++)
+ {
+ int src_row = idx_ptr[b_begin + i];
+ if (!(0 <= src_row && src_row < b->nrow))
+ nerv_error(L, "invalid index");
+ CUDA_SAFE_CALL(cudaStreamCreate(streams + i));
+ CUDA_SAFE_CALL(cudaMemcpyAsync(MATRIX_ROW_PTR(a, i),
+ MATRIX_ROW_PTR(b, src_row),
+ b->stride,
+ cudaMemcpyHostToDevice, streams[i]));
+ }
+ for (i = 0; i < nrow; i++)
+ {
+ CUDA_SAFE_CALL(cudaStreamSynchronize(streams[i]));
+ CUDA_SAFE_CALL(cudaStreamDestroy(streams[i]));
+ }
+ free(streams);
+ return 0;
+}
+
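+/* splice each frame (row) of b with its `context` neighbouring frames on both
+ * sides, writing the resulting (2 * context + 1)-frame windows into a */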
+static int nerv_matrix_(expand_frm)(lua_State *L) {
+ Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname));
+ Matrix *b = luaT_checkudata(L, 2, nerv_matrix_(tname));
+ int context = luaL_checkinteger(L, 3);
+ if (a->nrow != b->nrow)
+ nerv_error(L, "mismatching number of frames");
+ if (a->ncol != b->ncol * (context * 2 + 1))
+ nerv_error(L, "the width should be 2 * context + 1");
+ PROFILE_START
+ cudak_(cuda_expand_frm)(b, a, context);
+ PROFILE_STOP
+ return 0;
+}
+
+static int nerv_matrix_(rearrange_frm)(lua_State *L) {
+ Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname));
+ Matrix *b = luaT_checkudata(L, 2, nerv_matrix_(tname));
+ int step = luaL_checkinteger(L, 3);
+ CHECK_SAME_DIMENSION(a, b);
+ if (b->ncol % step)
+ nerv_error(L, "the dimension of columns is not divisible by step");
+ PROFILE_START
+ cudak_(cuda_rearrange_frm)(b, a, step);
+ PROFILE_STOP
+ return 0;
+}
+
+static int nerv_matrix_(scale_rows_by_col)(lua_State *L) {
+ Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname));
+ Matrix *b = luaT_checkudata(L, 2, nerv_matrix_(tname));
+ if (a->nrow != b->nrow)
+ nerv_error(L, "the number of rows is not the same");
+ if (b->ncol != 1)
+ nerv_error(L, "a column vector is expected");
+ PROFILE_START
+ cudak_(cuda_scale_rows_by_col)(b, a);
+ PROFILE_STOP
+ return 0;
+}
+
+static int nerv_matrix_(scale_rows_by_row)(lua_State *L) {
+ Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname));
+ Matrix *b = luaT_checkudata(L, 2, nerv_matrix_(tname));
+ if (a->ncol != b->ncol)
+ nerv_error(L, "the number of columns is not the same");
+ if (b->nrow != 1)
+ nerv_error(L, "a row vector is expected");
+ PROFILE_START
+ cudak_(cuda_scale_rows_by_row)(b, a);
+ PROFILE_STOP
+ return 0;
+}
+
+static const luaL_Reg nerv_matrix_(extra_methods)[] = {
+ {"create", nerv_matrix_(create)},
+ {"colsum", nerv_matrix_(colsum)},
+ {"colsame", nerv_matrix_(colsame)},
+ {"rowsum", nerv_matrix_(rowsum)},
+ {"rowmax", nerv_matrix_(rowmax)},
+ {"rowmax_idx", nerv_matrix_(rowmax_idx)},
+ {"trans", nerv_matrix_(trans)},
+ {"decompress", nerv_matrix_(decompress)},
+ /* in-place calc */
+ {"copy_fromh", nerv_matrix_(copy_fromh)},
+ {"copy_fromd", nerv_matrix_(copy_fromd)},
+ {"copy_toh", nerv_matrix_(copy_toh)},
+ {"add", nerv_matrix_(add)},
+ {"mul", nerv_matrix_(mul)},
+ {"add_row", nerv_matrix_(add_row)},
+ {"fill", nerv_matrix_(fill)},
+ {"sigmoid", nerv_matrix_(sigmoid)},
+ {"sigmoid_grad", nerv_matrix_(sigmoid_grad)},
+ {"softmax", nerv_matrix_(softmax)},
+ {"mul_elem", nerv_matrix_(mul_elem)},
+ {"log_elem", nerv_matrix_(log_elem)},
+ {"copy_rows_fromh_by_idx", nerv_matrix_(copy_rows_fromh_by_idx)},
+ {"expand_frm", nerv_matrix_(expand_frm)},
+ {"rearrange_frm", nerv_matrix_(rearrange_frm)},
+ {"scale_rows_by_row", nerv_matrix_(scale_rows_by_row)},
+ {"scale_rows_by_col", nerv_matrix_(scale_rows_by_col)},
+ {NULL, NULL}
+};
+
+static void cuda_matrix_(init)(lua_State *L) {
+ luaN_append_methods(L, nerv_matrix_(extra_methods));
+}
+
+static void cuda_matrix_(free)(lua_State *L, MATRIX_ELEM *ptr) {
+ CUDA_SAFE_SYNC_CALL(cudaFree(ptr));
+}
+
+static void cuda_matrix_(alloc)(lua_State *L, MATRIX_ELEM **dptr,
+ size_t *stride, long width, long height) {
+ PROFILE_START
+ CUDA_SAFE_SYNC_CALL(cudaMallocPitch((void **)dptr, stride, width, height));
+ PROFILE_STOP
+}
+
+static MATRIX_ELEM cuda_matrix_(read)(lua_State *L, MATRIX_ELEM *data,
+ int idx) {
+ MATRIX_ELEM res;
+ CUDA_SAFE_SYNC_CALL(cudaMemcpy(&res, data + idx,
+ sizeof(MATRIX_ELEM), cudaMemcpyDeviceToHost));
+ return res;
+}
+
+static void cuda_matrix_(write)(lua_State *L, MATRIX_ELEM *data,
+ int idx, MATRIX_ELEM val) {
+ CUDA_SAFE_SYNC_CALL(cudaMemcpy(data + idx, &val,
+ sizeof(MATRIX_ELEM), cudaMemcpyHostToDevice));
+}
+
+int nerv_matrix_(get_elem)(lua_State *L) {
+ return nerv_error_method_not_implemented(L);
+}
+
+int nerv_matrix_(set_elem)(lua_State *L) {
+ return nerv_error_method_not_implemented(L);
+}
+
+#include "matrix.c"
+#endif
diff --git a/nerv/matrix/generic/elem_type.h b/nerv/matrix/generic/elem_type.h
new file mode 100644
index 0000000..bffe940
--- /dev/null
+++ b/nerv/matrix/generic/elem_type.h
@@ -0,0 +1,22 @@
+#ifdef MATRIX_USE_FLOAT
+
+#define MATRIX_ELEM float
+#define MATRIX_ELEM_FMT "%f"
+#define MATRIX_ELEM_WRITE_FMT "%.8f"
+#define MATRIX_ELEM_PTR(self) ((self)->data.f)
+
+#elif defined(MATRIX_USE_DOUBLE)
+
+#define MATRIX_ELEM double
+#define MATRIX_ELEM_FMT "%lf"
+#define MATRIX_ELEM_WRITE_FMT "%.8lf"
+#define MATRIX_ELEM_PTR(self) ((self)->data.d)
+
+#elif defined(MATRIX_USE_INT)
+
+#define MATRIX_ELEM long
+#define MATRIX_ELEM_FMT "%ld"
+#define MATRIX_ELEM_WRITE_FMT "%ld"
+#define MATRIX_ELEM_PTR(self) ((self)->data.i)
+
+#endif
diff --git a/nerv/matrix/generic/matrix.c b/nerv/matrix/generic/matrix.c
new file mode 100644
index 0000000..e17fb42
--- /dev/null
+++ b/nerv/matrix/generic/matrix.c
@@ -0,0 +1,155 @@
+#ifdef NERV_GENERIC_MATRIX
+#include "../../common.h"
+#include "matrix.h"
+
+extern const char *nerv_matrix_(tname);
+extern const char *MATRIX_BASE_TNAME;
+
+void nerv_matrix_(data_free)(lua_State *L, Matrix *self) {
+ (void)L;
+ assert(*self->data_ref > 0);
+ if (--(*self->data_ref) == 0)
+ {
+ /* free matrix data */
+ MATRIX_DATA_FREE(L, MATRIX_ELEM_PTR(self));
+ free(self->data_ref);
+ free(self);
+ }
+}
+
+void nerv_matrix_(data_retain)(Matrix *self) {
+ (*self->data_ref)++;
+}
+
+Matrix *nerv_matrix_(new_)(lua_State *L, long nrow, long ncol) {
+ Matrix *self = (Matrix *)malloc(sizeof(Matrix));
+ self->nrow = nrow;
+ self->ncol = ncol;
+ self->nmax = self->nrow * self->ncol;
+ MATRIX_DATA_ALLOC(L, &MATRIX_ELEM_PTR(self), &self->stride,
+ sizeof(MATRIX_ELEM) * self->ncol, self->nrow);
+ self->data_ref = (long *)malloc(sizeof(long));
+ *self->data_ref = 0;
+ nerv_matrix_(data_retain)(self);
+ return self;
+}
+
+int nerv_matrix_(new)(lua_State *L) {
+ luaT_pushudata(L, nerv_matrix_(new_)(L, luaL_checkinteger(L, 1),
+ luaL_checkinteger(L, 2)),
+ nerv_matrix_(tname));
+ return 1;
+}
+
+int nerv_matrix_(destroy)(lua_State *L) {
+ Matrix *self = luaT_checkudata(L, 1, nerv_matrix_(tname));
+ nerv_matrix_(data_free)(L, self);
+ return 1;
+}
+
+int nerv_matrix_(get_elem)(lua_State *L);
+int nerv_matrix_(set_elem)(lua_State *L);
+
+static Matrix *nerv_matrix_(getrow)(Matrix *self, int row) {
+ Matrix *prow = (Matrix *)malloc(sizeof(Matrix));
+ prow->ncol = self->ncol;
+ prow->nrow = 1;
+ prow->stride = self->stride;
+ prow->nmax = prow->ncol;
+ MATRIX_ELEM_PTR(prow) = MATRIX_ROW_PTR(self, row);
+ prow->data_ref = self->data_ref;
+ nerv_matrix_(data_retain)(prow);
+ return prow;
+}
+
+static int nerv_matrix_(newindex)(lua_State *L) {
+ Matrix *self = luaT_checkudata(L, 1, nerv_matrix_(tname));
+ if (lua_isnumber(L, 2))
+ {
+ int idx = luaL_checkinteger(L, 2);
+ if (self->nrow == 1)
+ {
+ if (idx < 0 || idx >= self->ncol)
+ nerv_error(L, "index must be within range [0, %d)", self->ncol);
+ MATRIX_DATA_WRITE(L, MATRIX_ELEM_PTR(self), idx,
+ luaL_checknumber(L, 3));
+ }
+ else
+ nerv_error(L, "cannot assign to row vector");
+ lua_pushboolean(L, 1);
+ return 1;
+ }
+ else
+ {
+ lua_pushboolean(L, 0);
+ return 1;
+ }
+}
+
+
+static int nerv_matrix_(index)(lua_State *L) {
+ Matrix *self = luaT_checkudata(L, 1, nerv_matrix_(tname));
+ if (lua_isnumber(L, 2))
+ {
+ int idx = luaL_checkinteger(L, 2);
+ if (self->nrow == 1)
+ {
+ if (idx < 0 || idx >= self->ncol)
+ nerv_error(L, "index must be within range [0, %d)", self->ncol);
+ lua_pushnumber(L, MATRIX_DATA_READ(L, MATRIX_ELEM_PTR(self), idx));
+ }
+ else
+ {
+ if (idx < 0 || idx >= self->nrow)
+ nerv_error(L, "index must be within range [0, %d)", self->nrow);
+ luaT_pushudata(L, nerv_matrix_(getrow)(self, idx), nerv_matrix_(tname));
+ }
+ lua_pushboolean(L, 1);
+ return 2;
+ }
+ else
+ {
+ lua_pushboolean(L, 0);
+ return 1;
+ }
+}
+
+static int nerv_matrix_(ncol)(lua_State *L) {
+ Matrix *self = luaT_checkudata(L, 1, nerv_matrix_(tname));
+ lua_pushinteger(L, self->ncol);
+ return 1;
+}
+
+static int nerv_matrix_(nrow)(lua_State *L) {
+ Matrix *self = luaT_checkudata(L, 1, nerv_matrix_(tname));
+ lua_pushinteger(L, self->nrow);
+ return 1;
+}
+
+static int nerv_matrix_(get_dataref_value)(lua_State *L) {
+ Matrix *self = luaT_checkudata(L, 1, nerv_matrix_(tname));
+ lua_pushinteger(L, *(self->data_ref));
+ return 1;
+}
+
+static const luaL_Reg nerv_matrix_(methods)[] = {
+ {"get_elem", nerv_matrix_(get_elem)},
+ {"set_elem", nerv_matrix_(set_elem)},
+ {"ncol", nerv_matrix_(ncol)},
+ {"nrow", nerv_matrix_(nrow)},
+ {"get_dataref_value", nerv_matrix_(get_dataref_value)},
+ {"__index__", nerv_matrix_(index)},
+ {"__newindex__", nerv_matrix_(newindex)},
+ {NULL, NULL}
+};
+
+void nerv_matrix_(init)(lua_State *L) {
+ luaT_newmetatable(L, nerv_matrix_(tname), MATRIX_BASE_TNAME,
+ nerv_matrix_(new), nerv_matrix_(destroy), NULL);
+ luaL_register(L, NULL, nerv_matrix_(methods));
+#ifdef MATRIX_INIT
+ MATRIX_INIT(L);
+#endif
+ lua_pop(L, 1);
+}
+#endif
diff --git a/nerv/matrix/generic/matrix.h b/nerv/matrix/generic/matrix.h
new file mode 100644
index 0000000..833724b
--- /dev/null
+++ b/nerv/matrix/generic/matrix.h
@@ -0,0 +1,19 @@
+#ifndef NERV_GENERIC_MATRIX_H
+#define NERV_GENERIC_MATRIX_H
+
+#include <stddef.h>
+typedef struct Matrix {
+ size_t stride; /* size of a row */
+ long ncol, nrow, nmax; /* dimension of the matrix */
+ union {
+ float *f;
+ double *d;
+ long *i;
+ } data; /* pointer to actual storage */
+ long *data_ref;
+} Matrix;
+
+#define MATRIX_ROW_PTR(self, row) \
+ (MATRIX_ELEM *)((char *)MATRIX_ELEM_PTR(self) + (row) * (self)->stride)
+
+#endif
diff --git a/nerv/matrix/generic/mmatrix.c b/nerv/matrix/generic/mmatrix.c
new file mode 100644
index 0000000..b0f0791
--- /dev/null
+++ b/nerv/matrix/generic/mmatrix.c
@@ -0,0 +1,122 @@
+#ifdef NERV_GENERIC_MMATRIX
+#include "matrix.h"
+#include "elem_type.h"
+#define MATRIX_DATA_FREE(L, ptr) free(ptr)
+#define MATRIX_DATA_ALLOC(L, dptr, stride, width, height) \
+ host_matrix_(alloc)(L, dptr, stride, width, height)
+#define MATRIX_DATA_WRITE(L, data, idx, val) (data[idx] = val)
+#define MATRIX_DATA_READ(L, data, idx) (data[idx])
+#define MATRIX_INIT(L) host_matrix_(init)(L)
+#define MATRIX_BASE_TNAME nerv_matrix_host_tname
+#define NERV_GENERIC_MATRIX
+#include "../../common.h"
+#include "../../io/chunk_file.h"
+#include "string.h"
+
+static void host_matrix_(alloc)(lua_State *L,
+ MATRIX_ELEM **dptr, size_t *stride,
+ long width, long height) {
+ if ((*dptr = (MATRIX_ELEM *)malloc(width * height)) == NULL)
+ nerv_error(L, "mmatrix insufficient memory");
+ *stride = width;
+}
+
+int nerv_matrix_(get_elem)(lua_State *L) {
+ Matrix *self = luaT_checkudata(L, 1, nerv_matrix_(tname));
+ int idx = luaL_checkinteger(L, 2);
+ if (idx < 0 || idx >= self->nmax)
+ nerv_error(L, "index must be within range [0, %d)", self->nmax);
+ lua_pushnumber(L, MATRIX_ELEM_PTR(self)[idx]);
+ return 1;
+}
+
+int nerv_matrix_(set_elem)(lua_State *L) {
+ Matrix *self = luaT_checkudata(L, 1, nerv_matrix_(tname));
+ int idx = luaL_checkinteger(L, 2);
+ MATRIX_ELEM v = luaL_checknumber(L, 3);
+ if (idx < 0 || idx >= self->nmax)
+ nerv_error(L, "index must be within range [0, %d)", self->nmax);
+ MATRIX_ELEM_PTR(self)[idx] = v;
+ return 0;
+}
+
+static const luaL_Reg nerv_matrix_(extra_methods)[];
+static void host_matrix_(init)(lua_State *L) {
+ luaN_append_methods(L, nerv_matrix_(extra_methods));
+#ifdef MMATRIX_INIT
+ MMATRIX_INIT(L);
+#endif
+}
+
+#include "matrix.c"
+
+int nerv_matrix_(load)(lua_State *L) {
+ ChunkData *chunk = luaT_checkudata(L, 1, nerv_chunk_data_tname);
+ Matrix *self;
+ int i, j;
+ long nrow, ncol;
+ FILE *fp = chunk->fp;
+ if (fscanf(fp, "%ld %ld", &nrow, &ncol) != 2)
+ return 0;
+ self = nerv_matrix_(new_)(L, nrow, ncol);
+ for (i = 0; i < nrow; i++)
+ {
+ MATRIX_ELEM *row = MATRIX_ROW_PTR(self, i);
+ for (j = 0; j < ncol; j++)
+ if (fscanf(fp, MATRIX_ELEM_FMT, row + j) != 1)
+ {
+ free(self);
+ return 0;
+ }
+ }
+ luaT_pushudata(L, self, nerv_matrix_(tname));
+ return 1;
+}
+
+int nerv_matrix_(save)(lua_State *L) {
+ ChunkFileHandle *chunk = luaT_checkudata(L, 2,
+ nerv_chunk_file_handle_tname);
+ Matrix *self = luaT_checkudata(L, 1, nerv_matrix_(tname));
+ int i, j;
+ long nrow = self->nrow, ncol = self->ncol;
+ FILE *fp = chunk->fp;
+ if (fprintf(fp, "%ld %ld\n", nrow, ncol) < 0)
+ return 0;
+ for (i = 0; i < nrow; i++)
+ {
+ MATRIX_ELEM *row = MATRIX_ROW_PTR(self, i);
+ for (j = 0; j < ncol; j++)
+ if (fprintf(fp, MATRIX_ELEM_WRITE_FMT " ", row[j]) < 0)
+ return 0;
+ if (fprintf(fp, "\n") < 0)
+ return 0;
+ }
+ return 0;
+}
+
+static int nerv_matrix_(copy_from)(lua_State *L) {
+ Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname));
+ Matrix *b = luaT_checkudata(L, 2, nerv_matrix_(tname));
+ int nargs = lua_gettop(L);
+ int b_begin = nargs > 2 ? luaL_checkinteger(L, 3) : 0;
+ int b_end = nargs > 3 ? luaL_checkinteger(L, 4) : b->nrow;
+ int a_begin = nargs > 4 ? luaL_checkinteger(L, 5) : 0;
+ if (!(0 <= b_begin && b_begin < b_end && b_end <= b->nrow &&
+ a_begin + b_end - b_begin <= a->nrow))
+ nerv_error(L, "invalid copy interval");
+ if (a->ncol != b->ncol)
+ nerv_error(L, "matrices should be of the same dimension");
+ memmove(MATRIX_ROW_PTR(a, a_begin),
+ MATRIX_ROW_PTR(b, b_begin),
+ sizeof(MATRIX_ELEM) * b->ncol * (b_end - b_begin));
+ return 0;
+}
+
+static const luaL_Reg nerv_matrix_(extra_methods)[] = {
+ {"load", nerv_matrix_(load)},
+ {"save", nerv_matrix_(save)},
+ {"copy_from", nerv_matrix_(copy_from)},
+ {NULL, NULL}
+};
+
+#endif
diff --git a/nerv/matrix/init.c b/nerv/matrix/init.c
new file mode 100644
index 0000000..c29d7e9
--- /dev/null
+++ b/nerv/matrix/init.c
@@ -0,0 +1,35 @@
+#include "../common.h"
+#include "generic/matrix.h"
+
+const char *nerv_matrix_tname = "nerv.Matrix";
+const char *nerv_matrix_cuda_tname = "nerv.CuMatrix";
+const char *nerv_matrix_host_tname = "nerv.MMatrix";
+
+void nerv_cumatrix_init(lua_State *L);
+void nerv_mmatrix_init(lua_State *L);
+
+static const luaL_Reg matrix_methods[] = {
+ {"__tostring__", nerv_error_method_not_implemented },
+ {"__add__", nerv_error_method_not_implemented },
+ {"__sub__", nerv_error_method_not_implemented },
+ {"__mul__", nerv_error_method_not_implemented },
+ {NULL, NULL}
+};
+
+void nerv_matrix_init(lua_State *L) {
+ /* abstract base class: Matrix */
+ luaT_newmetatable(L, nerv_matrix_tname, NULL, NULL, NULL, NULL);
+ luaL_register(L, NULL, matrix_methods);
+ lua_pop(L, 1);
+
+ /* CuMatrix inherits from Matrix */
+ luaT_newmetatable(L, nerv_matrix_cuda_tname, nerv_matrix_tname,
+ NULL, NULL, NULL);
+ nerv_cumatrix_init(L);
+ lua_pop(L, 1);
+ /* MMatrix inherits from Matrix */
+ luaT_newmetatable(L, nerv_matrix_host_tname, nerv_matrix_tname,
+ NULL, NULL, NULL);
+ nerv_mmatrix_init(L);
+ lua_pop(L, 1);
+}
diff --git a/nerv/matrix/init.lua b/nerv/matrix/init.lua
new file mode 100644
index 0000000..1a8925f
--- /dev/null
+++ b/nerv/matrix/init.lua
@@ -0,0 +1,77 @@
+function nerv.Matrix:__tostring__()
+ local ncol = self:ncol()
+ local nrow = self:nrow()
+ local strt = {}
+ local fmt
+ if self.fmt then
+ fmt = self.fmt
+ else
+ fmt = "%.8f "
+ end
+ if nrow == 1 then
+ for col = 0, ncol - 1 do
+ table.insert(strt, string.format(fmt, self[col]))
+ end
+ table.insert(strt, "\n")
+ else
+ for row = 0, nrow - 1 do
+ local rp = self[row]
+ for col = 0, ncol - 1 do
+ table.insert(strt, string.format(fmt, rp[col]))
+ end
+ table.insert(strt, "\n")
+ end
+ end
+ table.insert(strt, string.format(
+ "[%s %d x %d]", self.__typename, nrow, ncol))
+ return table.concat(strt)
+end
+
+-- gen: a function that takes the indices of an entry and returns the value
+-- to store there; every entry in the matrix will be assigned by calling gen(i, j)
+-- (or gen(j) for a row vector)
+function nerv.Matrix:generate(gen)
+ if (self:nrow() == 1) then
+ for j = 0, self:ncol() - 1 do
+ self[j] = gen(j)
+ end
+ else
+ for i = 0, self:nrow() - 1 do
+ local row = self[i]
+ for j = 0, self:ncol() - 1 do
+ row[j] = gen(i, j)
+ end
+ end
+ end
+end
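+-- usage sketch (matrix size is arbitrary): fill a 2 x 3 host matrix with i + j
+--     local m = nerv.MMatrixFloat(2, 3)
+--     m:generate(function (i, j) return i + j end)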
+
+nerv.MMatrixInt.fmt = "%d "
+
+function nerv.CuMatrix:__add__(b)
+    local c = self:create()
+ c:add(self, b, 1.0, 1.0)
+ return c
+end
+
+function nerv.CuMatrix:__sub__(b)
+    local c = self:create()
+ c:add(self, b, 1.0, -1.0)
+ return c
+end
+
+function nerv.CuMatrix:__mul__(b)
+    local c = nerv.get_type(self.__typename)(self:nrow(), b:ncol())
+ c:mul(self, b, 1.0, 0.0, 'N', 'N')
+ return c
+end
+
+function nerv.CuMatrixFloat.new_from_host(mat)
+ local res = nerv.CuMatrixFloat(mat:nrow(), mat:ncol())
+ res:copy_fromh(mat)
+ return res
+end
+
+function nerv.CuMatrixFloat:new_to_host()
+ local res = nerv.MMatrixFloat(self:nrow(), self:ncol())
+ self:copy_toh(res)
+ return res
+end
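+-- usage sketch: round-trip a host matrix through the GPU
+--     local h = nerv.MMatrixFloat(4, 4)
+--     h:generate(function (i, j) return i * 4 + j end)
+--     local d = nerv.CuMatrixFloat.new_from_host(h)
+--     local h2 = d:new_to_host()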
diff --git a/nerv/matrix/mmatrix.c b/nerv/matrix/mmatrix.c
new file mode 100644
index 0000000..d1d68b9
--- /dev/null
+++ b/nerv/matrix/mmatrix.c
@@ -0,0 +1,77 @@
+#define NERV_GENERIC_MMATRIX
+#include <stdlib.h>
+#include "../common.h"
+void nerv_matrix_host_float_init(lua_State *L);
+void nerv_matrix_host_double_init(lua_State *L);
+void nerv_matrix_host_int_init(lua_State *L);
+
+void nerv_mmatrix_init(lua_State *L) {
+ srand(1);
+ nerv_matrix_host_float_init(L);
+ nerv_matrix_host_double_init(L);
+ nerv_matrix_host_int_init(L);
+}
+
+#define MATRIX_USE_FLOAT
+#define host_matrix_(NAME) host_matrix_float_##NAME
+#define nerv_matrix_(NAME) nerv_matrix_host_float_##NAME
+const char *nerv_matrix_(tname) = "nerv.MMatrixFloat";
+#include "generic/mmatrix.c"
+#undef nerv_matrix_
+#undef host_matrix_
+#undef MATRIX_USE_FLOAT
+#undef MATRIX_ELEM
+#undef MATRIX_ELEM_PTR
+#undef MATRIX_ELEM_FMT
+#undef MATRIX_ELEM_WRITE_FMT
+
+#define NERV_GENERIC_MMATRIX
+#define MATRIX_USE_DOUBLE
+#define host_matrix_(NAME) host_matrix_double_##NAME
+#define nerv_matrix_(NAME) nerv_matrix_host_double_##NAME
+const char *nerv_matrix_(tname) = "nerv.MMatrixDouble";
+#include "generic/mmatrix.c"
+#undef nerv_matrix_
+#undef host_matrix_
+#undef MATRIX_USE_DOUBLE
+#undef MATRIX_ELEM
+#undef MATRIX_ELEM_PTR
+#undef MATRIX_ELEM_FMT
+#undef MATRIX_ELEM_WRITE_FMT
+
+#define NERV_GENERIC_MMATRIX
+#define MATRIX_USE_INT
+#define host_matrix_(NAME) host_matrix_int_##NAME
+#define nerv_matrix_(NAME) nerv_matrix_host_int_##NAME
+const char *nerv_matrix_(tname) = "nerv.MMatrixInt";
+#define MMATRIX_INIT(L) host_matrix_(init_extra)(L)
+
+static const luaL_Reg nerv_matrix_(extra_methods_int)[];
+static void host_matrix_(init_extra)(lua_State *L) {
+ luaN_append_methods(L, nerv_matrix_(extra_methods_int));
+}
+
+#include "generic/mmatrix.c"
+
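+/* generate a 1 x ncol row vector holding a random permutation of
+ * {0, ..., ncol - 1} via an in-place Fisher-Yates shuffle (seeded by the
+ * srand(1) call in nerv_mmatrix_init) */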
+static int nerv_matrix_(perm_gen)(lua_State *L) {
+ int i, ncol = luaL_checkinteger(L, 1);
+ Matrix *self = nerv_matrix_(new_)(L, 1, ncol);
+ long *prow = self->data.i;
+ for (i = 0; i < ncol; i++)
+ prow[i] = i;
+ for (i = ncol - 1; i >= 0; i--)
+ {
+ size_t j = rand() % (i + 1);
+ long tmp = prow[i];
+ prow[i] = prow[j];
+ prow[j] = tmp;
+ }
+ luaT_pushudata(L, self, nerv_matrix_(tname));
+ return 1;
+}
+
+static const luaL_Reg nerv_matrix_(extra_methods_int)[] = {
+ {"perm_gen", nerv_matrix_(perm_gen)},
+ {NULL, NULL}
+};
+
diff --git a/nerv/nerv b/nerv/nerv
new file mode 100644
index 0000000..7571659
--- /dev/null
+++ b/nerv/nerv
@@ -0,0 +1,13 @@
+#! /usr/bin/env luajit
+require 'nerv'
+print("Greetings")
+if #arg < 1 then
+ return
+end
+local script = arg[1]
+local script_arg = {}
+for i = 2, #arg do
+ table.insert(script_arg, arg[i])
+end
+arg = script_arg
+dofile(script)
diff --git a/nerv/nerv-scm-1.rockspec b/nerv/nerv-scm-1.rockspec
new file mode 100644
index 0000000..d14140a
--- /dev/null
+++ b/nerv/nerv-scm-1.rockspec
@@ -0,0 +1,38 @@
+package = "nerv"
+version = "scm-1"
+source = {
+ url = "..." -- We don't have one yet
+}
+description = {
+ summary = "An example for the LuaRocks tutorial.",
+ detailed = [[
+ ]],
+ homepage = "https://github.com/Determinant/nerv", -- We don't have one yet
+ license = "BSD" -- or whatever you like
+}
+dependencies = {
+ "lua >= 5.1"
+ -- If you depend on other rocks, add them here
+}
+build = {
+ -- We'll start here.
+ type = "make",
+ build_variables = {
+ CFLAGS="$(CFLAGS)",
+ LIBFLAG="$(LIBFLAG)",
+ LUA_LIBDIR="$(LUA_LIBDIR)",
+ LUA_BINDIR="$(LUA_BINDIR)",
+ LUA_INCDIR="$(LUA_INCDIR)",
+ LUA="$(LUA)",
+ },
+ install_variables = {
+ INST_PREFIX="$(PREFIX)",
+ INST_BINDIR="$(BINDIR)",
+ INST_LIBDIR="$(LIBDIR)",
+ INST_LUADIR="$(LUADIR)",
+ INST_CONFDIR="$(CONFDIR)",
+ },
+ install = {
+ bin = {"nerv"}
+ }
+}
diff --git a/nerv/nerv.c b/nerv/nerv.c
new file mode 100644
index 0000000..a59eadc
--- /dev/null
+++ b/nerv/nerv.c
@@ -0,0 +1,38 @@
+#include "common.h"
+
+extern void nerv_example_init(lua_State *L);
+extern void nerv_matrix_init(lua_State *L);
+extern void nerv_io_init(lua_State *L);
+
+static const luaL_Reg nerv_utils_methods[] = {
+ {"setmetatable", luaT_lua_setmetatable},
+ {"getmetatable", luaT_lua_getmetatable},
+ {"newmetatable", luaT_lua_newmetatable},
+ {"typename", luaT_lua_typename},
+ {NULL, NULL}
+};
+
+void nerv_utils_init(lua_State *L) {
+ luaL_register(L, NULL, nerv_utils_methods);
+}
+
+int luaopen_libnerv(lua_State *L) {
+ lua_newtable(L);
+ /* duplicate table */
+ lua_pushvalue(L, -1);
+ /* set table to global index */
+ lua_setfield(L, LUA_GLOBALSINDEX, "nerv");
+    /* A reference to the table still remains on the stack.
+     *
+     * The following initialization functions should obey the rule that they
+     * keep the stack balanced: the stack must look the same before and after
+     * each call.
+     *
+     * Also note that they can make use of the value at the top of the stack,
+     * which refers to the `nerv` global table. */
+ nerv_utils_init(L);
+ nerv_example_init(L);
+ nerv_matrix_init(L);
+ nerv_io_init(L);
+ return 1;
+}
diff --git a/nerv/nn/init.lua b/nerv/nn/init.lua
new file mode 100644
index 0000000..cbaf52b
--- /dev/null
+++ b/nerv/nn/init.lua
@@ -0,0 +1,3 @@
+nerv.include('layer_repo.lua')
+nerv.include('param_repo.lua')
+nerv.include('layer_dag.lua')
diff --git a/nerv/nn/layer_dag.lua b/nerv/nn/layer_dag.lua
new file mode 100644
index 0000000..8e30216
--- /dev/null
+++ b/nerv/nn/layer_dag.lua
@@ -0,0 +1,249 @@
+local DAGLayer = nerv.class("nerv.DAGLayer", "nerv.Layer")
+
+local function parse_id(str)
+ local id, port, _
+ _, _, id, port = string.find(str, "([a-zA-Z0-9_]+)%[([0-9]+)%]")
+ if id == nil or port == nil then
+ _, _, id, port = string.find(str, "(.+)%[([0-9]+)%]")
+ if not (id == "<input>" or id == "<output>") then
+ nerv.error("wrong format of connection id")
+ end
+ end
+ port = tonumber(port)
+ return id, port
+end
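+-- e.g. parse_id("affine0[1]") returns "affine0", 1 and parse_id("<input>[2]")
+-- returns "<input>", 2 ("affine0" here is just a hypothetical layer id)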
+
+local function discover(id, layers, layer_repo)
+ local ref = layers[id]
+ if id == "<input>" or id == "<output>" then
+ return nil
+ end
+ if ref == nil then
+ local layer = layer_repo:get_layer(id)
+ local dim_in, dim_out = layer:get_dim()
+ ref = {
+ layer = layer,
+ inputs = {},
+ outputs = {},
+ err_inputs = {},
+ err_outputs = {},
+ next_layers = {},
+ input_len = #dim_in,
+ output_len = #dim_out,
+ in_deg = 0,
+ visited = false
+ }
+ layers[id] = ref
+ end
+ return ref
+end
+
+function DAGLayer:__init(id, global_conf, layer_conf)
+ local layers = {}
+ local inputs = {}
+ local outputs = {}
+ local dim_in = layer_conf.dim_in
+ local dim_out = layer_conf.dim_out
+ local parsed_conn = {}
+ for from, to in pairs(layer_conf.connections) do
+ local id_from, port_from = parse_id(from)
+ local id_to, port_to = parse_id(to)
+ local ref_from = discover(id_from, layers, layer_conf.sub_layers)
+ local ref_to = discover(id_to, layers, layer_conf.sub_layers)
+ local input_dim, output_dim, _
+ if ref_from and ref_from.outputs[port_from] ~= nil then
+ nerv.error("%s has already been attached", from)
+ end
+ if ref_to and ref_to.inputs[port_to] ~= nil then
+ nerv.error("%s has already been attached", to)
+ end
+ if id_from == "<input>" then
+ input_dim, _ = ref_to.layer:get_dim()
+ if dim_in[port_from] ~= input_dim[port_to] then
+ nerv.error("mismatching data dimension between %s and %s", from, to)
+ end
+ inputs[port_from] = {ref_to, port_to}
+ ref_to.inputs[port_to] = inputs -- just a place holder
+ elseif id_to == "<output>" then
+ _, output_dim = ref_from.layer:get_dim()
+ if output_dim[port_from] ~= dim_out[port_to] then
+ nerv.error("mismatching data dimension between %s and %s", from, to)
+ end
+ outputs[port_to] = {ref_from, port_from}
+ ref_from.outputs[port_from] = outputs -- just a place holder
+ else
+ _, output_dim = ref_from.layer:get_dim()
+ input_dim, _ = ref_to.layer:get_dim()
+ if output_dim[port_from] ~= input_dim[port_to] then
+ nerv.error("mismatching data dimension between %s and %s", from, to)
+ end
+
+ table.insert(parsed_conn,
+ {{ref_from, port_from}, {ref_to, port_to}})
+ table.insert(ref_from.next_layers, ref_to) -- add edge
+ ref_to.in_deg = ref_to.in_deg + 1 -- increase the in-degree of the target layer
+ end
+ end
+
+    -- topological sort
+ local queue = {}
+ local l = 1
+ local r = 1
+ for id, ref in pairs(layers) do
+ if ref.in_deg == 0 then
+ table.insert(queue, ref)
+ nerv.info("adding source layer: %s", id)
+ r = r + 1
+ end
+ end
+ if l == r then
+ nerv.error("loop detected")
+ end
+ while l < r do
+ local cur = queue[l]
+ cur.visited = true
+ l = l + 1
+ for _, nl in pairs(cur.next_layers) do
+ nl.in_deg = nl.in_deg - 1
+ if nl.in_deg == 0 then
+ table.insert(queue, nl)
+ r = r + 1
+ end
+ end
+ end
+ for i = 1, #queue do
+ nerv.info("enqueued layer: %s", queue[i].layer.id)
+ end
+
+ for id, ref in pairs(layers) do
+        -- check whether the graph is connected
+ if ref.visited == false then
+ nerv.warning("layer %s is ignored", id)
+ end
+ end
+
+ self.layers = layers
+ self.inputs = inputs
+ self.outputs = outputs
+ self.dim_in = dim_in
+ self.dim_out = dim_out
+ self.parsed_conn = parsed_conn
+ self.queue = queue
+ self.gconf = global_conf
+end
+
+function DAGLayer:init(batch_size)
+ for i, conn in ipairs(self.parsed_conn) do
+ local _, output_dim
+ local ref_from, port_from, ref_to, port_to
+ ref_from, port_from = unpack(conn[1])
+ ref_to, port_to = unpack(conn[2])
+ _, output_dim = ref_from.layer:get_dim()
+ local mid = self.gconf.cumat_type(batch_size,
+ output_dim[port_from])
+ local err_mid = mid:create()
+
+ ref_from.outputs[port_from] = mid
+ ref_to.inputs[port_to] = mid
+
+ ref_from.err_inputs[port_from] = err_mid
+ ref_to.err_outputs[port_to] = err_mid
+ end
+ for id, ref in pairs(self.layers) do
+ for i = 1, ref.input_len do
+ if ref.inputs[i] == nil then
+ nerv.error("dangling input port %d of layer %s", i, id)
+ end
+ end
+ for i = 1, ref.output_len do
+ if ref.outputs[i] == nil then
+ nerv.error("dangling output port %d of layer %s", i, id)
+ end
+ end
+ -- initialize sub layers
+ ref.layer:init(batch_size)
+ end
+ for i = 1, #self.dim_in do
+ if self.inputs[i] == nil then
+ nerv.error("dangling port %d of layer <input>", i)
+ end
+ end
+ for i = 1, #self.dim_out do
+ if self.outputs[i] == nil then
+ nerv.error("dangling port %d of layer <output>", i)
+ end
+ end
+end
+
+function DAGLayer:set_inputs(input)
+ for i = 1, #self.dim_in do
+ local layer = self.inputs[i][1]
+ local port = self.inputs[i][2]
+ layer.inputs[port] = input[i]
+ end
+end
+
+function DAGLayer:set_outputs(output)
+ for i = 1, #self.dim_out do
+ local layer = self.outputs[i][1]
+ local port = self.outputs[i][2]
+ layer.outputs[port] = output[i]
+ end
+end
+
+function DAGLayer:set_err_inputs(bp_err)
+ for i = 1, #self.dim_out do
+ local layer = self.outputs[i][1]
+ local port = self.outputs[i][2]
+ layer.err_inputs[port] = bp_err[i]
+ end
+end
+
+function DAGLayer:set_err_outputs(next_bp_err)
+ for i = 1, #self.dim_in do
+ local layer = self.inputs[i][1]
+ local port = self.inputs[i][2]
+ layer.err_outputs[port] = next_bp_err[i]
+ end
+end
+
+function DAGLayer:update(bp_err, input, output)
+ self:set_err_inputs(bp_err)
+ self:set_inputs(input)
+ self:set_outputs(output)
+ -- print("update")
+    for _, ref in ipairs(self.queue) do
+ -- print(ref.layer.id)
+ ref.layer:update(ref.err_inputs, ref.inputs, ref.outputs)
+ end
+end
+
+function DAGLayer:propagate(input, output)
+ self:set_inputs(input)
+ self:set_outputs(output)
+ for i = 1, #self.queue do
+ local ref = self.queue[i]
+ -- print(ref.layer.id)
+ ref.layer:propagate(ref.inputs, ref.outputs)
+ end
+end
+
+function DAGLayer:back_propagate(bp_err, next_bp_err, input, output)
+ self:set_err_outputs(next_bp_err)
+ self:set_err_inputs(bp_err)
+ self:set_inputs(input)
+ self:set_outputs(output)
+ for i = #self.queue, 1, -1 do
+ local ref = self.queue[i]
+ -- print(ref.layer.id)
+ ref.layer:back_propagate(ref.err_inputs, ref.err_outputs, ref.inputs, ref.outputs)
+ end
+end
+
+function DAGLayer:get_params()
+ local param_repos = {}
+    for _, ref in ipairs(self.queue) do
+ table.insert(param_repos, ref.layer:get_params())
+ end
+ return nerv.ParamRepo.merge(param_repos)
+end
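+-- connection spec sketch (the layer ids are hypothetical): layer_conf.connections
+-- maps "from[port]" to "to[port]", with "<input>"/"<output>" denoting the ports
+-- of the DAG itself:
+--     connections = {
+--         ["<input>[1]"]  = "affine0[1]",
+--         ["affine0[1]"]  = "sigmoid0[1]",
+--         ["sigmoid0[1]"] = "<output>[1]",
+--     }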
diff --git a/nerv/nn/layer_repo.lua b/nerv/nn/layer_repo.lua
new file mode 100644
index 0000000..602c37c
--- /dev/null
+++ b/nerv/nn/layer_repo.lua
@@ -0,0 +1,34 @@
+local LayerRepo = nerv.class("nerv.LayerRepo")
+
+function LayerRepo:__init(layer_spec, param_repo, global_conf)
+ local layers = {}
+ for ltype, llist in pairs(layer_spec) do
+ local layer_type = nerv.get_type(ltype)
+ for id, spec in pairs(llist) do
+ if layers[id] ~= nil then
+ nerv.error("a layer with id %s already exists", id)
+ end
+ nerv.info("create layer: %s", id)
+            if type(spec[2]) ~= "table" then
+                nerv.error("layer config table is needed")
+            end
+            local layer_config = spec[2]
+ if type(spec[1]) ~= "table" then
+ nerv.error("parameter description table is needed")
+ end
+ for pname, pid in pairs(spec[1]) do
+ layer_config[pname] = param_repo:get_param(pid)
+ end
+ layers[id] = layer_type(id, global_conf, layer_config)
+ end
+ end
+ self.layers = layers
+end
+
+function LayerRepo:get_layer(lid)
+ local layer = self.layers[lid]
+ if layer == nil then
+ nerv.error("layer with id %s not found", lid)
+ end
+ return layer
+end
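+-- layer_spec sketch (type name, ids, parameter names and dimensions are all
+-- hypothetical): each entry maps a layer type to {id = {param_id_map, layer_config}}:
+--     {
+--         ["nerv.AffineLayer"] = {
+--             affine0 = {{ltp = "affine0_ltp", bp = "affine0_bp"},
+--                        {dim_in = {429}, dim_out = {2048}}}
+--         }
+--     }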
diff --git a/nerv/nn/param_repo.lua b/nerv/nn/param_repo.lua
new file mode 100644
index 0000000..ab971ba
--- /dev/null
+++ b/nerv/nn/param_repo.lua
@@ -0,0 +1,76 @@
+local ParamRepo = nerv.class("nerv.ParamRepo")
+function ParamRepo:__init(plist)
+ self.params = {}
+ if plist ~= nil then
+ for i, p in ipairs(plist) do
+ self.params[p.id] = p
+ end
+ end
+end
+
+function ParamRepo:add(pid, p)
+ if self.params[pid] ~= nil then
+ nerv.error("duplicate params with the same id: %s", pid)
+ end
+ self.params[pid] = p
+end
+
+function ParamRepo:remove(pid, p)
+    if self.params[pid] == nil then
+        nerv.error("param %s does not exist", pid)
+    end
+    self.params[pid] = nil
+end
+
+function ParamRepo.merge(repos)
+ local self = nerv.ParamRepo()
+ for i, repo in ipairs(repos) do
+ if not nerv.is_type(repo, "nerv.ParamRepo") then
+ nerv.error("nerv.ParamRepo objects expected, got %s", repo)
+ end
+ for pid, p in pairs(repo.params) do
+ self:add(pid, p)
+ end
+ end
+ return self
+end
+
+function ParamRepo:import(param_files, pids, gconf)
+ if type(param_files) ~= "table" then
+ nerv.error("param file table is need")
+ end
+ for i = 1, #param_files do
+ local pf = nerv.ChunkFile(param_files[i], "r")
+ for cid, cspec in pairs(pf.metadata) do
+ if pids == nil or pids[cid] ~= nil then
+ local p = pf:read_chunk(cid, gconf)
+ if not nerv.is_type(p, "nerv.Param") then
+ nerv.error("param chunk is expected")
+ end
+ self:add(cid, p)
+ end
+ end
+ end
+end
+
+function ParamRepo:export(param_file, pids)
+    local cf = nerv.ChunkFile(param_file, "w")
+ if pids == nil then
+ for id, p in pairs(self.params) do
+ cf:write_chunk(p)
+ end
+ else
+ for i, pid in ipairs(pids) do
+ cf:write_chunk(self:get_param(pid))
+ end
+ end
+ cf:close()
+end
+
+function ParamRepo:get_param(pid)
+ local p = self.params[pid]
+ if p == nil then
+ nerv.error("param with id %s not found", pid)
+ end
+ return p
+end
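+
+-- usage sketch (file and param names are hypothetical, gconf is the caller's
+-- global config table):
+--     local repo = nerv.ParamRepo()
+--     repo:import({"params.nerv"}, nil, gconf)
+--     local p = repo:get_param("affine0_ltp")
+--     repo:export("params_out.nerv", {"affine0_ltp"})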