From 2497fd9e7a0fae5ee4887890d7a312e0e08a93b8 Mon Sep 17 00:00:00 2001 From: Determinant Date: Mon, 22 Jun 2015 19:01:29 +0800 Subject: major change: use luarocks to manage project --- .gitignore | 2 +- .gitmodules | 3 + Makefile | 71 +- build_luajit.sh | 5 - common.c | 76 --- common.h | 36 -- doc/nerv.md | 17 - doc/nerv_class.md | 36 -- doc/nerv_io.md | 113 ---- doc/nerv_layer.md | 180 ------ doc/nerv_matrix.md | 165 ----- doc/nerv_nn.md | 256 -------- doc/nerv_param.md | 27 - examples/asr_trainer.lua | 106 --- examples/chunk_file_example.lua | 53 -- examples/cumatrix_example.lua | 31 - examples/cumatrix_from_mmatrix.lua | 32 - examples/mmatrix_example.lua | 20 - examples/oop_example.c | 101 --- examples/oop_example.lua | 16 - examples/swb_baseline.lua | 166 ----- examples/test_dnn_layers.lua | 78 --- examples/test_nn_lib.lua | 164 ----- io/chunk_file.c | 325 ---------- io/chunk_file.h | 23 - io/init.c | 6 - io/init.lua | 55 -- io/sgd_buffer.lua | 111 ---- layer/affine.lua | 91 --- layer/bias.lua | 28 - layer/combiner.lua | 59 -- layer/init.lua | 79 --- layer/mse.lua | 52 -- layer/sigmoid.lua | 31 - layer/softmax_ce.lua | 68 -- layer/window.lua | 28 - luaT/README.md | 239 ------- luaT/luaT.c | 1079 ------------------------------- luaT/luaT.h | 111 ---- luarocks | 1 + matrix/cuda_helper.h | 75 --- matrix/cukernel.cu | 17 - matrix/cukernel.h | 20 - matrix/cumatrix.c | 87 --- matrix/generic/cukernel.cu | 571 ---------------- matrix/generic/cumatrix.c | 493 -------------- matrix/generic/elem_type.h | 22 - matrix/generic/matrix.c | 155 ----- matrix/generic/matrix.h | 19 - matrix/generic/mmatrix.c | 122 ---- matrix/init.c | 35 - matrix/init.lua | 77 --- matrix/mmatrix.c | 77 --- nerv | 2 - nerv.c | 38 -- nerv.lua | 114 ---- nerv/.gitignore | 1 + nerv/Makefile | 60 ++ nerv/common.c | 76 +++ nerv/common.h | 36 ++ nerv/doc/nerv.md | 17 + nerv/doc/nerv_class.md | 36 ++ nerv/doc/nerv_io.md | 113 ++++ nerv/doc/nerv_layer.md | 180 ++++++ nerv/doc/nerv_matrix.md | 165 +++++ nerv/doc/nerv_nn.md | 256 ++++++++ nerv/doc/nerv_param.md | 27 + nerv/examples/asr_trainer.lua | 106 +++ nerv/examples/chunk_file_example.lua | 53 ++ nerv/examples/cumatrix_example.lua | 31 + nerv/examples/cumatrix_from_mmatrix.lua | 32 + nerv/examples/mmatrix_example.lua | 20 + nerv/examples/oop_example.c | 101 +++ nerv/examples/oop_example.lua | 16 + nerv/examples/swb_baseline.lua | 166 +++++ nerv/examples/test_dnn_layers.lua | 78 +++ nerv/examples/test_nn_lib.lua | 164 +++++ nerv/init.lua | 128 ++++ nerv/io/chunk_file.c | 325 ++++++++++ nerv/io/chunk_file.h | 23 + nerv/io/init.c | 6 + nerv/io/init.lua | 55 ++ nerv/io/sgd_buffer.lua | 111 ++++ nerv/layer/affine.lua | 91 +++ nerv/layer/bias.lua | 28 + nerv/layer/combiner.lua | 59 ++ nerv/layer/init.lua | 79 +++ nerv/layer/mse.lua | 52 ++ nerv/layer/sigmoid.lua | 31 + nerv/layer/softmax_ce.lua | 68 ++ nerv/layer/window.lua | 28 + nerv/luaT/README.md | 239 +++++++ nerv/luaT/luaT.c | 1079 +++++++++++++++++++++++++++++++ nerv/luaT/luaT.h | 111 ++++ nerv/matrix/cuda_helper.h | 75 +++ nerv/matrix/cukernel.cu | 17 + nerv/matrix/cukernel.h | 20 + nerv/matrix/cumatrix.c | 87 +++ nerv/matrix/generic/cukernel.cu | 571 ++++++++++++++++ nerv/matrix/generic/cumatrix.c | 493 ++++++++++++++ nerv/matrix/generic/elem_type.h | 22 + nerv/matrix/generic/matrix.c | 155 +++++ nerv/matrix/generic/matrix.h | 19 + nerv/matrix/generic/mmatrix.c | 122 ++++ nerv/matrix/init.c | 35 + nerv/matrix/init.lua | 77 +++ nerv/matrix/mmatrix.c | 77 +++ nerv/nerv | 13 + nerv/nerv-scm-1.rockspec | 38 ++ nerv/nerv.c | 38 
++ nerv/nn/init.lua | 3 + nerv/nn/layer_dag.lua | 249 +++++++ nerv/nn/layer_repo.lua | 34 + nerv/nn/param_repo.lua | 76 +++ nn/init.lua | 3 - nn/layer_dag.lua | 249 ------- nn/layer_repo.lua | 34 - nn/param_repo.lua | 76 --- pl/compat.lua | 137 ---- pl/utils.lua | 476 -------------- speech | 2 +- tools/build_luajit.sh | 4 + tools/build_luarocks.sh | 4 + 123 files changed, 6491 insertions(+), 7026 deletions(-) delete mode 100755 build_luajit.sh delete mode 100644 common.c delete mode 100644 common.h delete mode 100644 doc/nerv.md delete mode 100644 doc/nerv_class.md delete mode 100644 doc/nerv_io.md delete mode 100644 doc/nerv_layer.md delete mode 100644 doc/nerv_matrix.md delete mode 100644 doc/nerv_nn.md delete mode 100644 doc/nerv_param.md delete mode 100644 examples/asr_trainer.lua delete mode 100644 examples/chunk_file_example.lua delete mode 100644 examples/cumatrix_example.lua delete mode 100644 examples/cumatrix_from_mmatrix.lua delete mode 100644 examples/mmatrix_example.lua delete mode 100644 examples/oop_example.c delete mode 100644 examples/oop_example.lua delete mode 100644 examples/swb_baseline.lua delete mode 100644 examples/test_dnn_layers.lua delete mode 100644 examples/test_nn_lib.lua delete mode 100644 io/chunk_file.c delete mode 100644 io/chunk_file.h delete mode 100644 io/init.c delete mode 100644 io/init.lua delete mode 100644 io/sgd_buffer.lua delete mode 100644 layer/affine.lua delete mode 100644 layer/bias.lua delete mode 100644 layer/combiner.lua delete mode 100644 layer/init.lua delete mode 100644 layer/mse.lua delete mode 100644 layer/sigmoid.lua delete mode 100644 layer/softmax_ce.lua delete mode 100644 layer/window.lua delete mode 100644 luaT/README.md delete mode 100644 luaT/luaT.c delete mode 100644 luaT/luaT.h create mode 160000 luarocks delete mode 100644 matrix/cuda_helper.h delete mode 100644 matrix/cukernel.cu delete mode 100644 matrix/cukernel.h delete mode 100644 matrix/cumatrix.c delete mode 100644 matrix/generic/cukernel.cu delete mode 100644 matrix/generic/cumatrix.c delete mode 100644 matrix/generic/elem_type.h delete mode 100644 matrix/generic/matrix.c delete mode 100644 matrix/generic/matrix.h delete mode 100644 matrix/generic/mmatrix.c delete mode 100644 matrix/init.c delete mode 100644 matrix/init.lua delete mode 100644 matrix/mmatrix.c delete mode 100755 nerv delete mode 100644 nerv.c delete mode 100644 nerv.lua create mode 100644 nerv/.gitignore create mode 100644 nerv/Makefile create mode 100644 nerv/common.c create mode 100644 nerv/common.h create mode 100644 nerv/doc/nerv.md create mode 100644 nerv/doc/nerv_class.md create mode 100644 nerv/doc/nerv_io.md create mode 100644 nerv/doc/nerv_layer.md create mode 100644 nerv/doc/nerv_matrix.md create mode 100644 nerv/doc/nerv_nn.md create mode 100644 nerv/doc/nerv_param.md create mode 100644 nerv/examples/asr_trainer.lua create mode 100644 nerv/examples/chunk_file_example.lua create mode 100644 nerv/examples/cumatrix_example.lua create mode 100644 nerv/examples/cumatrix_from_mmatrix.lua create mode 100644 nerv/examples/mmatrix_example.lua create mode 100644 nerv/examples/oop_example.c create mode 100644 nerv/examples/oop_example.lua create mode 100644 nerv/examples/swb_baseline.lua create mode 100644 nerv/examples/test_dnn_layers.lua create mode 100644 nerv/examples/test_nn_lib.lua create mode 100644 nerv/init.lua create mode 100644 nerv/io/chunk_file.c create mode 100644 nerv/io/chunk_file.h create mode 100644 nerv/io/init.c create mode 100644 nerv/io/init.lua create mode 100644 
nerv/io/sgd_buffer.lua create mode 100644 nerv/layer/affine.lua create mode 100644 nerv/layer/bias.lua create mode 100644 nerv/layer/combiner.lua create mode 100644 nerv/layer/init.lua create mode 100644 nerv/layer/mse.lua create mode 100644 nerv/layer/sigmoid.lua create mode 100644 nerv/layer/softmax_ce.lua create mode 100644 nerv/layer/window.lua create mode 100644 nerv/luaT/README.md create mode 100644 nerv/luaT/luaT.c create mode 100644 nerv/luaT/luaT.h create mode 100644 nerv/matrix/cuda_helper.h create mode 100644 nerv/matrix/cukernel.cu create mode 100644 nerv/matrix/cukernel.h create mode 100644 nerv/matrix/cumatrix.c create mode 100644 nerv/matrix/generic/cukernel.cu create mode 100644 nerv/matrix/generic/cumatrix.c create mode 100644 nerv/matrix/generic/elem_type.h create mode 100644 nerv/matrix/generic/matrix.c create mode 100644 nerv/matrix/generic/matrix.h create mode 100644 nerv/matrix/generic/mmatrix.c create mode 100644 nerv/matrix/init.c create mode 100644 nerv/matrix/init.lua create mode 100644 nerv/matrix/mmatrix.c create mode 100644 nerv/nerv create mode 100644 nerv/nerv-scm-1.rockspec create mode 100644 nerv/nerv.c create mode 100644 nerv/nn/init.lua create mode 100644 nerv/nn/layer_dag.lua create mode 100644 nerv/nn/layer_repo.lua create mode 100644 nerv/nn/param_repo.lua delete mode 100644 nn/init.lua delete mode 100644 nn/layer_dag.lua delete mode 100644 nn/layer_repo.lua delete mode 100644 nn/param_repo.lua delete mode 100644 pl/compat.lua delete mode 100644 pl/utils.lua create mode 100755 tools/build_luajit.sh create mode 100755 tools/build_luarocks.sh diff --git a/.gitignore b/.gitignore index d8e6051..24f2f11 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,4 @@ *.o -build/ +install/ *.swp *.swo diff --git a/.gitmodules b/.gitmodules index 2bb1788..1432de9 100644 --- a/.gitmodules +++ b/.gitmodules @@ -7,3 +7,6 @@ [submodule "speech"] path = speech url = https://github.com/Determinant/nerv-speech.git +[submodule "luarocks"] + path = luarocks + url = https://github.com/keplerproject/luarocks.git diff --git a/Makefile b/Makefile index 8f1d491..4282d6d 100644 --- a/Makefile +++ b/Makefile @@ -1,63 +1,10 @@ -.PHONY: all clean luajit -BUILD_DIR := $(CURDIR)/build -OBJS := nerv.o luaT.o common.o \ - matrix/mmatrix.o matrix/cumatrix.o matrix/init.o matrix/cukernel.o \ - io/init.o io/chunk_file.o \ - examples/oop_example.o -LIBS := libnerv.so -LUA_LIBS := matrix/init.lua io/init.lua nerv.lua \ - pl/utils.lua pl/compat.lua \ - layer/init.lua layer/affine.lua layer/sigmoid.lua layer/softmax_ce.lua \ - layer/window.lua layer/bias.lua layer/combiner.lua layer/mse.lua \ - nn/init.lua nn/layer_repo.lua nn/param_repo.lua nn/layer_dag.lua \ - io/sgd_buffer.lua -INCLUDE := -I build/luajit-2.0/include/luajit-2.0/ -DLUA_USE_APICHECK -CUDA_BASE := /usr/local/cuda-6.5 -#CUDA_BASE := /usr/local/cuda-5.0 -CUDA_INCLUDE := -I $(CUDA_BASE)/include/ -INCLUDE += $(CUDA_INCLUDE) -LDFLAGS := -L$(CUDA_BASE)/lib64/ -Wl,-rpath=$(CUDA_BASE)/lib64/ -lcudart -lcublas -CFLAGS := -Wall -Wextra -O2 -OBJ_DIR := $(BUILD_DIR)/objs -LUA_DIR := $(BUILD_DIR)/lua -LIB_DIR := $(BUILD_DIR)/lib -SUBDIR := matrix io layer examples pl nn -NVCC := $(CUDA_BASE)/bin/nvcc -NVCC_FLAGS := -Xcompiler -fPIC,-Wall,-Wextra - -OBJS := $(addprefix $(OBJ_DIR)/,$(OBJS)) -OBJ_SUBDIR := $(addprefix $(OBJ_DIR)/,$(SUBDIR)) -LUA_SUBDIR := $(addprefix $(LUA_DIR)/,$(SUBDIR)) -LIBS := $(addprefix $(BUILD_DIR)/lib/,$(LIBS)) -LUA_LIBS := $(addprefix $(LUA_DIR)/,$(LUA_LIBS)) - -all: luajit $(OBJ_DIR) $(LIB_DIR) $(OBJ_SUBDIR) $(LIBS) 
$(LUA_DIR) $(LUA_SUBDIR) $(LUA_LIBS) +.PHONY: all clean install luajit luarocks nerv +SHELL := /bin/bash +PREFIX := $(CURDIR)/install/ +all: luajit luarocks luajit: - ./build_luajit.sh -$(OBJ_DIR) $(LIB_DIR) $(LUA_DIR) $(OBJ_SUBDIR) $(LUA_SUBDIR): - -mkdir -p $@ -$(OBJ_DIR)/%.o: %.c $(patsubst /%.o,/%.c,$@) - gcc -c -o $@ $< $(INCLUDE) -fPIC $(CFLAGS) -$(OBJ_DIR)/matrix/cukernel.o: matrix/cukernel.cu - $(NVCC) -c -o $@ $< $(INCLUDE) $(NVCC_FLAGS) -$(LUA_DIR)/%.lua: %.lua - cp $< $@ -$(OBJ_DIR)/luaT.o: - gcc -c -o $@ luaT/luaT.c $(INCLUDE) -fPIC -$(LIBS): $(OBJS) - gcc -shared -o $@ $(OBJS) $(LDFLAGS) - -$(OBJ_DIR)/matrix/cumatrix.o: matrix/generic/cumatrix.c matrix/generic/matrix.c matrix/generic/cukernel.cu -$(OBJ_DIR)/matrix/mmatrix.o: matrix/generic/mmatrix.c matrix/generic/matrix.c -$(OBJ_DIR)/matrix/cukernel.o: matrix/generic/cukernel.cu - -.PHONY: speech - -speech: - -mkdir -p build/objs/speech/tnet_io - $(MAKE) -C speech/ BUILD_DIR=$(BUILD_DIR) LIB_DIR=$(LIB_DIR) OBJ_DIR=$(CURDIR)/build/objs/speech/ LUA_DIR=$(LUA_DIR) - -clean: - -rm -rf $(OBJ_DIR) - -rm -rf $(LUA_DIR) - -rm -rf $(LIB_DIR) + PREFIX=$(PREFIX) ./tools/build_luajit.sh +luarocks: + PREFIX=$(PREFIX) ./tools/build_luarocks.sh +install: + cd nerv; $(PREFIX)/bin/luarocks make diff --git a/build_luajit.sh b/build_luajit.sh deleted file mode 100755 index bb23b5f..0000000 --- a/build_luajit.sh +++ /dev/null @@ -1,5 +0,0 @@ -#!/bin/bash -PREFIX="${PWD}/build" -LUAJIT_SRC='luajit-2.0/' -LUAJIT_PREFIX="${PREFIX}/luajit-2.0" -[[ -d $LUAJIT_PREFIX ]] || (cd "$LUAJIT_SRC"; make && make PREFIX="$LUAJIT_PREFIX" install) diff --git a/common.c b/common.c deleted file mode 100644 index b4e39e6..0000000 --- a/common.c +++ /dev/null @@ -1,76 +0,0 @@ -#include "common.h" -#include -int nerv_error(lua_State *L, const char *err_mesg_fmt, ...) 
{ - va_list ap; - va_start(ap, err_mesg_fmt); - lua_pushstring(L, "[nerv] internal error: "); - lua_pushvfstring(L, err_mesg_fmt, ap); - lua_concat(L, 2); - lua_error(L); - va_end(ap); - return 0; -} - -int nerv_error_method_not_implemented(lua_State *L) { - return nerv_error(L, "method not implemented"); -} - -void luaN_append_methods(lua_State *L, const luaL_Reg *mlist) { - for (; mlist->func; mlist++) - { - lua_pushcfunction(L, mlist->func); - lua_setfield(L, -2, mlist->name); - } -} - -HashMap *hashmap_create(size_t size, HashKey_t hfunc, HashMapCmp_t cmp) { - HashMap *res = (HashMap *)malloc(sizeof(HashMap)); - res->bucket = calloc(size, sizeof(HashNode)); - res->cmp = cmp; - res->hfunc = hfunc; - res->size = size; - return res; -} - -void *hashmap_getval(HashMap *h, const char *key) { - size_t idx = h->hfunc(key) % h->size; - HashNode *ptr; - for (ptr = h->bucket[idx]; ptr; ptr = ptr->next) - { - if (!h->cmp(ptr->key, key)) - return ptr->val; - } - return NULL; -} - -void hashmap_setval(HashMap *h, const char *key, void *val) { - size_t idx = h->hfunc(key) % h->size; - HashNode *ptr = malloc(sizeof(HashNode)); - ptr->next = h->bucket[idx]; - h->bucket[idx] = ptr; - ptr->key = key; - ptr->val = val; -} - -void hashmap_clear(HashMap *h) { - size_t i; - for (i = 0; i < h->size; i++) - { - HashNode *ptr, *nptr; - for (ptr = h->bucket[i]; ptr; ptr = nptr) - { - nptr = ptr->next; - free(ptr->val); - free(ptr); - } - h->bucket[i] = NULL; - } -} - -size_t bkdr_hash(const char *key) { - unsigned int seed = 131; - unsigned int res = 0; - while (*key) - res = res * seed + *key++; - return res; -} diff --git a/common.h b/common.h deleted file mode 100644 index e21c7a5..0000000 --- a/common.h +++ /dev/null @@ -1,36 +0,0 @@ -#ifndef NERV_COMMON_H -#define NERV_COMMON_H -#include "lua.h" -#include "lauxlib.h" -#include "lualib.h" -#include "luaT/luaT.h" -#include -#include - -typedef struct HashNode { - const char *key; - void *val; - struct HashNode *next; -} HashNode; - -typedef int (*HashMapCmp_t)(const char *a, const char *b); -typedef size_t (*HashKey_t)(const char *key); - -typedef struct HashMap { - HashNode **bucket; - HashMapCmp_t cmp; - HashKey_t hfunc; - size_t size; -} HashMap; - -HashMap *hashmap_create(size_t size, HashKey_t hfunc, HashMapCmp_t cmp); -void *hashmap_getval(HashMap *h, const char *key); -void hashmap_setval(HashMap *h, const char *key, void *val); -void hashmap_clear(HashMap *h); - -size_t bkdr_hash(const char *key); - -int nerv_error(lua_State *L, const char *err_mesg_fmt, ...); -int nerv_error_method_not_implemented(lua_State *L); -void luaN_append_methods(lua_State *L, const luaL_Reg *mlist); -#endif diff --git a/doc/nerv.md b/doc/nerv.md deleted file mode 100644 index 28411f5..0000000 --- a/doc/nerv.md +++ /dev/null @@ -1,17 +0,0 @@ -#The Nerv utility functions# -Part of the [Nerv](../README.md) toolkit. -##Methods## -* __string = nerv.typename(obj a)__ -A registered function, the original function is `luaT_lua_typename`. In some cases if you call `type(a)` for object of some class in __Nerv__(like __Nerv.CuMatrix__) it will only return "userdata"(because it is created in C), in this case you can use this method to get its type. - ---- - -* __metatable = nerv.getmetatable(string tname)__ -A registered function, the original function is `luaT_lua_getmetatable`. `tname` should be a class name that has been registered in __luaT__. 
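For instance (an illustrative sketch, assuming a CUDA-enabled build so that `nerv.CuMatrixFloat` is available), the difference between plain `type` and `nerv.typename` described above:
```
m = nerv.CuMatrixFloat(2, 2)
print(type(m))          -- "userdata": the object is created on the C side
print(nerv.typename(m)) -- "nerv.CuMatrixFloat": the name registered in luaT
```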
- -* __metatable = nerv.newmetatable(string tname, string parenttname, function constructor, function destructor, function factory)__ -A registered function, the original function is `luaT_newmetatable`, it returns the metatable of the created class by the name `tname`. -* __string = nerv.setmetatable(table self, string tname)__ -A registered function, the original function is `luaT_lua_setmetatable`. It assigns the metatable registered in __luaT__ by the name *tname* to the table *self*. And return *tname* to user. -* __table = nerv.get_type(string typename)__ -Returns the type(`loadstring("return " .. typename)`). \ No newline at end of file diff --git a/doc/nerv_class.md b/doc/nerv_class.md deleted file mode 100644 index 99f63e7..0000000 --- a/doc/nerv_class.md +++ /dev/null @@ -1,36 +0,0 @@ -#The Nerv OOP# -Part of the [Nerv](../README.md) toolkit. -##Methods## -* __metatable mt, metatable mpt = nerv.class(string tname, string parenttname)__ -This method is used to create a class by the name `tname`, which inherits `parenttname` in __Nerv__, then you create a new instance of this class by calling `obj=tname(...)`. The `tname.__init(...)` method(if defined) will be called in the constructing. The metatable of the class and its parent class will be returned. - -##Examples## -* This example implements a simple `nerv.Counter` class which is inherited by `nerv.BetterCounter`. - -``` -do - nerv.class("nerv.Counter") - function nerv.Counter:__init(c) - if (c) then - self.c = c - else - self.c = 0 - end - end -end -do - local mt, mpt = nerv.class("nerv.BetterCounter", "nerv.Counter") - function nerv.BetterCounter:__init(c, bc) - mpt.__init(self, c) - if (bc) then - self.bc = bc - else - self.bc = 0 - end - end -end -c1 = nerv.Counter(1) -print(c1.c) -bc1 = nerv.BetterCounter(1, 1) -print(bc1.c, bc1.bc) -``` \ No newline at end of file diff --git a/doc/nerv_io.md b/doc/nerv_io.md deleted file mode 100644 index 07589df..0000000 --- a/doc/nerv_io.md +++ /dev/null @@ -1,113 +0,0 @@ -#The Nerv IO Package# -Part of the [Nerv](../README.md) toolkit. - -##Description## -The main class that the user uses to store and read parameter object to and from files is __nerv.ChunkFile__. -In the file, a parameter object will be saved using a standard format. First is the length(in byte) of this object, then a table which includes some meta information of the object, and a data area. Below is an example text file. -``` -[0000000000202] -{type="nerv.ExampleP",info={message="just-a-try"},id="exampleP1"} -3 3 -5.000000 5.000000 5.000000 -5.000000 5.000000 5.000000 -5.000000 5.000000 5.000000 -1 3 -4.000000 4.000000 4.000000 -[0000000000202] -{type="nerv.ExampleP",info={message="just-a-try"},id="exampleP2"} -3 3 -4.000000 4.000000 4.000000 -4.000000 4.000000 4.000000 -4.000000 4.000000 4.000000 -1 3 -3.000000 3.000000 3.000000 -``` - -##Methods## -* __ChunkFile ChunkFile(string fn, string mode)__ -`mode` can be `r` or `w`, for reading or writing a file. The returned __ChunkFile__ will be ready to write or read objects which follows the __nerv.Param__ interface(using `write_chunk` and `read_chunk`). -* __void ChunkFile.write_chunk(ChunkFile self, Param p)__ -Write `p` into the file. `p:write` will be called. -* __Param ChunkFile.read_chunk(ChunkFile self, string id, table global_conf)__ -Read the __Param__ object by id `id` from the file `self`. It will be constructed using `__init(id, global_conf)`. `p:read` will be called. -* __void ChunkFile.close(ChunkFile self)__ -Close the opened file. 
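Stripped down to the calls listed above, a parameter round trip looks like the following sketch; the file name and the `p`/`gconf` placeholders are illustrative, and a complete worked example follows in the next section.
```
local gconf = {}
local f = nerv.ChunkFile('params.nerv', 'w')  -- illustrative file name
f:write_chunk(p)           -- p: any object following the nerv.Param interface
f:close()

f = nerv.ChunkFile('params.nerv', 'r')
local p2 = f:read_chunk(p.id, gconf)  -- constructed via __init(id, gconf), then p2:read()
f:close()
```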
- -##Examples## -* An example showing how to use __ChunkFile__ to store and read parameter objects. -``` -require 'io' -do - local mt, mpt = nerv.class('nerv.ExampleP', 'nerv.Param') - function nerv.ExampleP:__init(id, global_conf) - self.id = id - self.global_conf = global_conf - self.matrix = nerv.MMatrixFloat(3, 3) - for i = 0, 2, 1 do - for j = 0, 2, 1 do - self.matrix[i][j] = 3 - end - end - self.bias = nerv.MMatrixFloat(1, 3) - for i = 0, 2, 1 do - self.bias[i] = 2; - end - self:set_info({message = 'just-a-try'}) - end - function nerv.ExampleP:addOne() - for i = 0, 2, 1 do - for j = 0, 2, 1 do - self.matrix[i][j] = self.matrix[i][j] + 1 - end - end - for i = 0, 2, 1 do - self.bias[i] = self.bias[i] + 1 - end - end - function nerv.ExampleP:read(pcdata) - self.matrix = nerv.MMatrixFloat.load(pcdata) - self.bias = nerv.MMatrixFloat.load(pcdata) - end - function nerv.ExampleP:write(pfhandle) - self.matrix:save(pfhandle) - self.bias:save(pfhandle) - end -end -global_conf = {} -do - local f = nerv.ChunkFile('../tmp', 'w') - local exampleP1 = nerv.ExampleP('exampleP1', global_conf) - local exampleP2 = nerv.ExampleP('exampleP2', global_conf) - exampleP1:addOne() - exampleP1:addOne() - exampleP2:addOne() - - f:write_chunk(exampleP1) - f:write_chunk(exampleP2) - f:close() -end -do - local f = nerv.ChunkFile('../tmp', 'r') - local exampleP1 = f:read_chunk('exampleP1', global_conf) - local exampleP2 = f:read_chunk('exampleP2', global_conf) - f:close() - print(exampleP1.matrix) - print(exampleP2.matrix) -end -``` - -##Developer Notes## -* There are four classes in to deal with chunk data, which are __nerv.ChunkFile__, __nerv.ChunkFileHandle__, __nerv.ChunkInfo__, __nerv.ChunkData__. Below is the underlying C structs. -``` -typedef struct ChunkFileHandle { - FILE *fp; -} ChunkFileHandle; -typedef struct ChunkInfo { - off_t offset, length; -} ChunkInfo; -typedef struct ChunkData { - FILE *fp; - char *data; -} ChunkData; -``` -* In __Nerv.io__, a returned(by `ChunkFile.__init`) __nerv.ChunkFile__ will have a member `handle`, which is a __nerv.ChunkFileHandle__. \ No newline at end of file diff --git a/doc/nerv_layer.md b/doc/nerv_layer.md deleted file mode 100644 index de2fb12..0000000 --- a/doc/nerv_layer.md +++ /dev/null @@ -1,180 +0,0 @@ -#The Nerv Layer Package# -Part of the [Nerv](../README.md) toolkit. - -##Description## -__nerv.Layer__ is the base class and most of its methods are abstract. -###Class hierarchy and their members### -* __nerv.Layer__. - * `table dim_in` It specifies the dimensions of the inputs. - * `table dim_out` It specifies the dimensions of the outputs. - * `string id` ID of this layer. - * `table gconf` Stores the `global_conf`. -* __nerv.AffineLayer__ inherits __nerv.Layer__, both `#dim_in` and `#dim_out` are 1. - * `MatrixParam ltp` The liner transform parameter. - * `BiasParam bp` The bias parameter. -* __nerv.BiasLayer__ inherits __nerv.Layer__, both `#dim_in` nad `#dim_out` are 1. - * `BiasParam bias` The bias parameter. -* __nerv.SigmoidLayer__ inherits __nerv.Layer__, both `#dim_in` and `#dim_out` are 1. -* __nerv.SoftmaxCELayer__ inherits __nerv.Layer__, `#dim_in` is 2 and `#dim_out` is -1(optional). `input[1]` is the input to the softmax layer, `input[2]` is the reference distribution. In its `propagate(input, output)` method, if `output[1] ~= nil`, cross\_entropy value will outputed. - * `float total_ce` Records the accumlated cross entropy value. - * `int total_frams` Records how many frames have passed. 
- * `bool compressed` The reference distribution can be a one-hot format. This feature is enabled by `layer_conf.compressed`. - -##Methods## -* __void Layer.\_\_init(Layer self, string id, table global_conf, table layer_conf)__ -Abstract method. -The constructing method should assign `id` to `self.id` and `global_conf` to `self.gconf`, `layer_conf.dim_in` to `self.dim_in`, `layer_conf.dim_out` to `self.dim_out`. `dim_in` and `dim_out` are a list specifies the dimensions of the inputs and outputs. Also, `layer_conf` will include the parameters, which should also be properly saved. -* __void Layer.init(Layer self)__ -Abstract method. -Initialization method, in this method the layer should do some self-checking and allocate space for intermediate results. -* __void Layer.update(Layer self, table bp_err, table input, table output)__ -Abstract method. -`bp_err[i]` should be the error on `output[i]`. In this method the parameters of `self` is updated. -* __void Layer.propagate(Layer self, table input, table output)__ -Abstract method. -Given `input` and the current parameters, propagate and store the result in `output`. -* __void Layer.back_propagate(Layer self, Matrix next_bp_err, Matrix bp_err, Matrix input, Matrix output)__ -Abstract method. -Calculate the error on the inputs and store them in `next_bp_err`. - -* __void Layer.check_dim_len(int len_in, int len_out)__ -Check whether `#self.dim_in == len_in` and `#self.dim_out == len_out`, if violated, an error will be posted. -* __void Layer.get_params(Layer self)__ -Abstract method. -The layer should return a list containing its parameters. - -####nerv.Layer.get\_dim(self)#### -* Returns: - `dim_in`: __table__. - `dim_out`: __table__. -* Parameters: - `self`: __nerv.Layer__. -* Description: - Returns `self.dim_in, self.dim_out`. - -##Examples## -* a basic example using __Nerv__ layers to a linear classification. 
- -``` -require 'math' - -require 'layer.affine' -require 'layer.softmax_ce' - ---[[Example using layers, a simple two-classification problem]]-- - -function calculate_accurate(networkO, labelM) - sum = 0 - for i = 0, networkO:nrow() - 1, 1 do - if (labelM[i][0] == 1 and networkO[i][0] >= 0.5) then - sum = sum + 1 - end - if (labelM[i][1] == 1 and networkO[i][1] >= 0.5) then - sum = sum + 1 - end - end - return sum -end - ---[[begin global setting and data generation]]-- -global_conf = {lrate = 10, - wcost = 1e-6, - momentum = 0.9, - cumat_type = nerv.CuMatrixFloat} - -input_dim = 5 -data_num = 100 -ansV = nerv.CuMatrixFloat(input_dim, 1) -for i = 0, input_dim - 1, 1 do - ansV[i][0] = math.random() - 0.5 -end -ansB = math.random() - 0.5 -print('displaying ansV') -print(ansV) -print('displaying ansB(bias)') -print(ansB) - -dataM = nerv.CuMatrixFloat(data_num, input_dim) -for i = 0, data_num - 1, 1 do - for j = 0, input_dim - 1, 1 do - dataM[i][j] = math.random() * 2 - 1 - end -end -refM = nerv.CuMatrixFloat(data_num, 1) -refM:fill(ansB) -refM:mul(dataM, ansV, 1, 1) --refM = dataM * ansV + ansB - -labelM = nerv.CuMatrixFloat(data_num, 2) -for i = 0, data_num - 1, 1 do - if (refM[i][0] > 0) then - labelM[i][0] = 1 - labelM[i][1] = 0 - else - labelM[i][0] = 0 - labelM[i][1] = 1 - end -end ---[[global setting and data generation end]]-- - - ---[[begin network building]]-- ---parameters -affineL_ltp = nerv.LinearTransParam('AffineL_ltp', global_conf) -affineL_ltp.trans = nerv.CuMatrixFloat(input_dim, 2) -for i = 0, input_dim - 1, 1 do - for j = 0, 1, 1 do - affineL_ltp.trans[i][j] = math.random() - 0.5 - end -end -affineL_bp = nerv.BiasParam('AffineL_bp', global_conf) -affineL_bp.trans = nerv.CuMatrixFloat(1, 2) -for j = 0, 1, 1 do - affineL_bp.trans[j] = math.random() - 0.5 -end - ---layers -affineL = nerv.AffineLayer('AffineL', global_conf, {['ltp'] = affineL_ltp, - ['bp'] = affineL_bp, - dim_in = {input_dim}, - dim_out = {2}}) -softmaxL = nerv.SoftmaxCELayer('softmaxL', global_conf, {dim_in = {2, 2}, - dim_out = {}}) -print('layers initializing...') -affineL:init() -softmaxL:init() ---[[network building end]]-- - - ---[[begin space allocation]]-- -print('network input&output&error space allocation...') -affineI = {dataM} --input to the network is data -affineO = {nerv.CuMatrixFloat(data_num, 2)} -softmaxI = {affineO[1], labelM} -softmaxO = {} -output = nerv.CuMatrixFloat(data_num, 2) - -affineE = {nerv.CuMatrixFloat(data_num, 2)} ---[[space allocation end]]-- - - ---[[begin training]]-- -ce_last = 0 -for l = 0, 10, 1 do - affineL:propagate(affineI, affineO) - softmaxL:propagate(softmaxI, softmaxO) - output:softmax(softmaxI[1]) - - softmaxL:back_propagate(affineE, {}, softmaxI, softmaxO) - - affineL:update(affineE, affineI, affineO) - - if (l % 5 == 0) then - nerv.utils.printf("training iteration %d finished\n", l) - nerv.utils.printf("cross entropy: %.8f\n", softmaxL.total_ce - ce_last) - ce_last = softmaxL.total_ce - nerv.utils.printf("accurate labels: %d\n", calculate_accurate(output, labelM)) - nerv.utils.printf("total frames processed: %.8f\n", softmaxL.total_frames) - end -end ---[[end training]]-- -``` diff --git a/doc/nerv_matrix.md b/doc/nerv_matrix.md deleted file mode 100644 index 22971d2..0000000 --- a/doc/nerv_matrix.md +++ /dev/null @@ -1,165 +0,0 @@ -#The Nerv Matrix Package# -Part of the [Nerv](../README.md) toolkit. - -##Description## -###Underlying structure### -In the begining is could be useful to know something about the underlying structure of a __Nerv__ matrix. 
Please keep in mind that matrice in __Nerv__ is row-major. -Every matrix object is a encapsulation of a C struct that describes the attributes of this matrix. -``` -typedef struct Matrix { - size_t stride; /* size of a row */ - long ncol, nrow, nmax; /* dimension of the matrix, nmax is simply nrow * ncol */ - union { - float *f; - double *d; - long *i; - } data; /* pointer to actual storage */ - long *data_ref; -} Matrix; -``` -It is worth mentioning that that `data_ref` is a counter which counts the number of references to its memory space, mind that it will also be increased when a row of the matrix is referenced(`col = m[2]`). A __Nerv__ matrix will deallocate its space when this counter is decreased to zero. -Also note that all assigning operation in __Nerv__ is reference copy, you can use `copy_tod` or `copy_toh` method to copy value. Also, row assigning operations like `m1[2]=m2[3]` is forbidden in __Nerv__. - -###Class hierarchy### -The class hierarchy of the matrix classes can be clearly observed in `matrix/init.c`. -First there is a abstract base class __Nerv.Matrix__, which is inherited by __Nerv.CuMatrix__ and __Nerv.MMatrix__(also abstract). -Finally, there is __Nerv.CuMatrixFloat__, __Nerv.CuMatrixDouble__, inheriting __Nerv.CuMatrix__, and __Nerv.MMatrixFloat__, __Nerv.MMatrixDouble__, __Nerv.MMatrixInt__ , inheriting __Nerv.MMatrix__. - -##Methods## -Mind that usually a matrix object can only do calculation with matrix of its own type(a __Nerv.CuMatrixFloat__ matrix can only do add operation with a __Nerv.CuMatrixFloat__). -In the methods description below, __Matrix__ could be __Nerv.CuMatrixFloat__, __Nerv.CuMatrixDouble__, __Nerv.MMatrixFloat__ or __Nerv.MMatrixDouble__. __Element_type__ could be `float` or `double`, respectively. -* __Matrix = Matrix(int nrow, int ncol)__ -Returns a __Matrix__ object of `nrow` rows and `ncol` columns. -* __Element_type = Matrix.get_elem(Matrix self, int index)__ -Returns the element value at the specific index(treating the matrix as a vector). The index should be less than `nmax` of the matrix. -* __void Matrix.set_elem(Matrix self, int index, Element_type value)__ -Set the value at `index` to be `value`. -* __int Matrix.ncol(Matrix self)__ -Get `ncol`, the number of columns. -* __int Matrix.nrow(Matrix self)__ -Get `nrow`, the number of rows. -* __int Matrix.get_dataref_value(Matrix self)__ -Returns the value(not a pointer) of space the `data_ref` pointer pointed to. This function is mainly for debugging. -* __Matrix/Element\_type, boolean Matrix.\_\_index\_\_(Matrix self, int index)__ -If the matrix has more than one row, will return the row at `index` as a __Matrix__ . Otherwise it will return the value at `index`. -* __void Matrix.\_\_newindex\_\_(Matrix self, int index, Element_type value)__ -Set the element at `index` to be `value`. ---- -* __Matrix Matrix.create(Matrix a)__ -Return a new __Matrix__ of `a`'s size(of the same number of rows and columns). -* __Matrix Matrix.colsum(Matrix self)__ -Return a new __Matrix__ of size (1,`self.ncol`), which stores the sum of all columns of __Matrix__ `self`. -* __Matrix Matrix.rowsum(Matrix self)__ -Return a new __Matrix__ of size (`self.nrow`,1), which stores the sum of all rows of __Matrix__ `self`. -* __Matrix Matrix.rowmax(Matrix self)__ -Return a new __Matrix__ of size (`self.nrow`,1), which stores the max value of all rows of __Matrix__ `self`. 
-* __Matrix Matrix.trans(Matrix self)__ -Return a new __Matrix__ of size (`self.ncol`,`self.nrow`), which stores the transpose of __Matrix__ `self`. -* __void Matrix.copy_fromh(Matrix self, MMatrix a)__ -Copy the content of a __MMatrix__ `a` to __Matrix__ `self`, they should be of the same size. -* __void Matrix.copy_fromd(Matrix self, CuMatrix a)__ -Copy the content of a __CuMatrix__ `a` to __Matrix__ `self`, they should be of the same size. -* __void Matrix.copy_toh(Matrix self, MMatrix a)__ -Copy the content of the __Matrix__ `self` to a __MMatrix__ `a`. -* __void Matrix.copy_tod(Matrix self, CuMatrix a)__ -Copy the content of the __Matrix__ `self` to a __CuMatrix__ `a`. -* __void Matrix.add(Matrix self, Matrix ma, Matrix mb, Element_type alpha, Element_type beta)__ -It sets the content of __Matrix__ `self` to be `alpha * ma + beta * mb`.__Matrix__ `ma,mb,self` should be of the same size. -* __void Matrix.mul(Matrix self, Matrix ma, Matrix mb, Element_type alpha, Element_type beta, [string ta, string tb])__ -It sets the content of __Matrix__ `self` to be `beta * self + alpha * ma * mb`. `ta` and `tb` is optional, if `ta` is 'T', then `ma` will be transposed, also if `tb` is 'T', `mb` will be transposed. -* __void Matrix.add_row(Matrix self, Matrix va, Element_type beta)__ -Add `beta * va` to every row of __Matrix__ `self`. -* __void Matrix.fill(Matrix self, Element_type value)__ -Fill the content of __Matrix__ `self` to be `value`. -* __void Matrix.sigmoid(Matrix self, Matrix ma)__ -Set the element of __Matrix__ `self` to be elementwise-sigmoid of `ma`. -* __void Matrix.sigmoid_grad(Matrix self, Matrix err, Matrix output)__ -Set the element of __Matrix__ `self`, to be `self[i][j]=err[i][j]*output[i][j]*(1-output[i][j])`. This function is used to propagate sigmoid layer error. -* __void Matrix.softmax(Matrix self, Matrix a)__ -Calculate a row-by-row softmax of __Matrix__ `a` and save the result in `self`. -* __void Matrix.mul_elem(Matrix self, Matrix ma, Matrix mb)__ -Calculate element-wise multiplication of __Matrix__ `ma` and `mb`, store the result in `self`. -* __void Matrix.log_elem(Matrix self, Matrix ma)__ -Calculate element-wise log of __Matrix__ `ma`, store the result in `self`. -* __void Matrix.copy_rows_fromh_by_idx(Matrix self, MMatrix ma, MMatrixInt idx)__ -`idx` should be a row vector. This function copy the rows of `ma` to `self` according to `idx`, in other words, it assigns `ma[idx[i]]` to `self[i]`. -* __void Matrix.expand_frm(Matrix self, Matrix a, int context)__ -Treating each row of `a` as speech feature, and do a feature expansion. The `self` should of size `(a.nrow, a.ncol * (context * 2 + 1))`. `self[i]` will be `(a[i-context] a[i-context+1] ... a[i] a[i+1] a[i+context])`. `a[0]` and `a[nrow]` will be copied to extend the index range. -* __void Matrix.rearrange_frm(Matrix self, Matrix a, int step)__ -Rearrange `a` according to its feature dimension. The `step` is the length of context. So, `self[i][j]` will be assigned `a[i][j / step + (j % step) * (a.ncol / step)]`. `a` and `self` should be of the same size and `step` should be divisible by `a.ncol`. -* __void Matrix.scale_row(Matrix self, Matrix scale)__ -Scale each column of `self` according to a vector `scale`. `scale` should be of size `1 * self.ncol`. -* __Matrix Matrix.\_\_add\_\_(Matrix ma, Matrix mb)__ -Returns a new __Matrix__ which stores the result of `ma+mb`. -* __Matrix Matrix.\_\_sub\_\_(Matrix ma, Matrix mb)__ -Returns a new __Matrix__ which stores the result of `ma-mb`. 
-* __Matrix Matrix.\_\_mul\_\_(Matrix ma, Matrix mb)__ -Returns a new __Matrix__ which stores the result of `ma*mb`. -* __CuMatrix CuMatrix.new_from_host(MMatrix m)__ -Return a new __CuMatrix__ which is a copy of `m`. -* __MMatrix CuMatrix.new_to_host(CuMatrix self)__ -Return a new __MMatrix__ which is a copy of `self`. -* __string Matrix.\_\_tostring\_\_(Matrix self)__ -Returns a string containing values of __Matrix__ `self`. ---- -* __MMatrix MMatrix.load(ChunkData chunk)__ -Return a new __MMatrix__ loaded from the file position in `chunk`. -* __void MMatrix.save(MMatrix self, ChunkFileHandle chunk)__ -Write `self` to the file position in `chunk`. -* __void MMatrix.copy_from(MMatrix ma, MMatrix mb,[int b_bgein, int b_end, int a_begin])__ -Copy a part of `mb`(rows of index `[b_begin..b_end)`) to `ma` beginning at row index `a_begin`. If not specified, `b_begin` will be `0`, `b_end` will be `b.nrow`, `a_begin` will be `0`. - -##Examples## -* Use `get_dataref_value` to test __Nerv__'s matrix space allocation. -``` -m = 10 -n = 10 -fm = nerv.MMatrixFloat(m, n) -dm = nerv.MMatrixDouble(m, n) -for i = 0, m - 1 do - for j = 0, n - 1 do - t = i / (j + 1) - fm[i][j] = t - dm[i][j] = t - end -end -print("test fm:get_dataref_value:", fm:get_dataref_value()) -print("forced a garbade collect") -collectgarbage("collect") -print("test fm:get_dataref_value:", fm:get_dataref_value()) -print(fm) -print(dm) -``` -* Test some __Matrix__ calculations. -``` -m = 4 -n = 4 -fm = nerv.CuMatrixFloat(m, n) -dm = nerv.CuMatrixDouble(m, n) -for i = 0, m - 1 do - for j = 0, n - 1 do - -- local t = math.random(10) - t = i / (j + 1) - fm[i][j] = t - dm[i][j] = t - end -end -print(fm) -fs = fm:create() -fs:softmax(fm) --- print(fs) -print(dm) -ds = dm:create() -ds:softmax(dm) --- print(ds) -print(fs) -print(fs + fs) -print(ds + ds) -print(fs - fs) -print(ds - ds) -a = fs:create() -a:mul_elem(fs, fs) -print(a) -a:log_elem(fs) -print(a) -``` \ No newline at end of file diff --git a/doc/nerv_nn.md b/doc/nerv_nn.md deleted file mode 100644 index c57447d..0000000 --- a/doc/nerv_nn.md +++ /dev/null @@ -1,256 +0,0 @@ -#The Nerv NN Package# -Part of the [Nerv](../README.md) toolkit. - -##Description## -###Class hierarchy### -it contains __nerv.LayerRepo__, __nerv.ParamRepo__, and __nerv.DAGLayer__(inherits __nerv.Layer__). - -###Class hierarchy and their members### -####nerv.ParamRepo#### -Get parameter object by ID. -* `table param_table` Contains the mapping of parameter ID to parameter file(__nerv.ChunkFile__) -* __nerv.LayerRepo__ Get layer object by ID. -* `table layers` Contains the mapping of layer ID to layer object. -objects. - -####__nerv.DAGLayer__#### -Inherits __nerv.Layer__. -* `layers`: __table__, a mapping from a layer ID to its "ref". A ref is a structure that contains reference to space allocations and other info of the layer. -* `inputs`: __table__, a mapping from the inputs ports of the DAG layer to the input ports of the sublayer, the key is the port number, the value is `{ref, port}`. -* `outputs`:__table__, the counterpart of `inputs`. -* `parsed_conn`: __table__, a list of parsed connections, each entry is of format `{{ref_from, port_from}, {ref_to, port_to}}`. -* `queue`: __table__, a list of "ref"s, the propagation of the DAGLayer will follow this order, and back-propagation will follow a reverse order. 
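As a reading aid for the members above (not actual construction code), here is the connection table from the example at the end of this page, annotated with where each entry conceptually ends up:
```
local connections = {
    ["[1]"]        = "AffineL[1]",  -- DAG input port 1 feeds sublayer AffineL port 1 (an `inputs` entry)
    ["AffineL[1]"] = "SoftmaxL[1]", -- sublayer-to-sublayer link (a `parsed_conn` entry)
    ["[2]"]        = "SoftmaxL[2]", -- DAG input port 2 feeds SoftmaxL port 2 (an `inputs` entry)
}
```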
- -##Methods## - -###__nerv.ParamRepo__### - -####nerv.ParamRepo:\_\_init(param\_files)#### -* Parameters: - `param_files`: __table__ -* Description: - `param_files` is a list of file names that stores parameters, the newed __ParamRepo__ will read them from file and store the mapping for future fetching. - -####nerv.Param ParamRepo.get_param(ParamRepo self, string pid, table global_conf)#### -* Returns: - __nerv.Layer__ -* Parameters: - `self`: __nerv.ParamRepo__. - `pid`: __string__. - `global_conf`: __table__. -* Description: - __ParamRepo__ will find the __nerv.ChunkFile__ `pf` that contains parameter of ID `pid` and return `pf:read_chunk(pid, global_conf)`. - -###__nerv.LayerRepo__### -####nerv.LayerRepo:\_\_init(layer\_spec, param\_repo, global\_conf)#### -* Returns: - __nerv.LayerRepo__. -* Parameters: - `self`: __nerv.ParamRepo__. - `layer_spec`: __table__. - `param_repo`: __nerv.ParamRepo__. - `global_conf`: __table__. -* Description: - __LayerRepo__ will construct the layers specified in `layer_spec`. Every entry in the `layer_spec` table should follow the format below: - - > layer_spec : {[layer_type1] = llist1, [layer_type2] = llist2, ...} - > llist : {layer1, layer2, ...} - > layer : layerid = {param_config, layer_config} - > param_config : {param1 = paramID1, param2 = paramID2} - - __LayerRepo__ will merge `param_config` into `layer_config` and construct a layer by calling `layer_type(layerid, global_conf, layer_config)`. - -####nerv.LayerRepo.get\_layer(self, lid)#### -* Returns: - __nerv.LayerRepo__, the layer with ID `lid`. -* Parameters: - `self`:__nerv.LayerRepo__. - `lid`:__string__. -* Description: - Returns the layer with ID `lid`. - -###nerv.DAGLayer### -####nerv.DAGLayer:\_\_init(id, global\_conf, layer\_conf)#### -* Returns: - __nerv.DAGLayer__ -* Parameters: - `id`: __string__ - `global_conf`: __table__ - `layer_conf`: __table__ -* Description: - The `layer_conf` should contain `layer_conf.sub_layers` which is a __nerv.LayerRepo__ storing the sub layers of the DAGLayer. It should also contain `layer_conf.connections`, which is a string-to-string mapping table describing the DAG connections. See an example below: - - ``` - dagL = nerv.DAGLayer("DAGL", global_conf, {["dim_in"] = {input_dim, 2}, ["dim_out"] = {}, ["sub_layers"] = layerRepo, - ["connections"] = { - ["[1]"] = "AffineL[1]", - ["AffineL[1]"] = "SoftmaxL[1]", - ["[2]"] = "SoftmaxL[2]", - }}) - ``` - -####nerv.DAGLayer.init(self, batch\_size)#### -* Parameters: - `self`: __nerv.DAGLayer__ - `batch_size`: __int__ -* Description: - This initialization method will allocate space for output and input matrice, and will call `init()` for each of its sub layers. - - -####nerv.DAGLayer.propagate(self, input, output)#### -* Parameters: - `self`: __nerv.DAGLayer__ - `input`: __table__ - `output`: __table__ -* Description: - The same function as __nerv.Layer.propagate__, do propagation for each layer in the order of `self.queue`. - -####nerv.DAGLayer.back\_propagate(self, next\_bp\_err, bp\_err, input, output)#### -* Parameters: - `self`: __nerv.DAGLayer__ - `next_bp_err`: __table__ - `bp_err`: __table__ - `input`: __table__ - `output`: __table__ -* Description: - The same function as __nerv.Layer.back_propagate__, do back-propagation for each layer in the reverse order of `self.queue`. 
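Together with `update` (described next), these calls make up one training iteration over a constructed DAG layer; the sketch below is just the loop body of the complete example at the end of this page:
```
-- dagL, dagL_input, dagL_output, dagL_err, dagL_ierr set up as in the full example
dagL:propagate(dagL_input, dagL_output)                            -- forward, in `queue` order
dagL:back_propagate(dagL_ierr, dagL_err, dagL_input, dagL_output)  -- backward, reverse order
dagL:update(dagL_err, dagL_input, dagL_output)                     -- parameter update
```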
- -####nerv.DAGLayer.update(self, bp\_err, input, output)#### -* Parameters: - `self`: __nerv.DAGLayer__ - `bp_err`: __table__ - `input`: __table__ - `output`: __table__ -* Description: - The same function as __nerv.Layer.update__, do update for each layer in the order of `self.queue`. - -##Examples## -* aaa - -``` -require 'math' - -require 'layer.affine' -require 'layer.softmax_ce' - ---[[Example using DAGLayer, a simple two-classification problem]]-- - ---[[begin global setting and data generation]]-- -global_conf = {lrate = 10, - wcost = 1e-6, - momentum = 0.9, - cumat_type = nerv.CuMatrixFloat, - } - -input_dim = 5 -data_num = 100 -param_fn = "../tmp" -ansV = nerv.CuMatrixFloat(input_dim, 1) -for i = 0, input_dim - 1, 1 do - ansV[i][0] = math.random() - 0.5 -end -ansB = math.random() - 0.5 -print('displaying ansV') -print(ansV) -print('displaying ansB(bias)') -print(ansB) - -dataM = nerv.CuMatrixFloat(data_num, input_dim) -for i = 0, data_num - 1, 1 do - for j = 0, input_dim - 1, 1 do - dataM[i][j] = math.random() * 2 - 1 - end -end -refM = nerv.CuMatrixFloat(data_num, 1) -refM:fill(ansB) -refM:mul(dataM, ansV, 1, 1) --refM = dataM * ansV + ansB - -labelM = nerv.CuMatrixFloat(data_num, 2) -for i = 0, data_num - 1, 1 do - if (refM[i][0] > 0) then - labelM[i][0] = 1 - labelM[i][1] = 0 - else - labelM[i][0] = 0 - labelM[i][1] = 1 - end -end ---[[global setting and data generation end]]-- - - ---[[begin network building]]-- ---parameters -do - local affineL_ltp = nerv.LinearTransParam('AffineL_ltp', global_conf) - affineL_ltp.trans = nerv.CuMatrixFloat(input_dim, 2) - for i = 0, input_dim - 1, 1 do - for j = 0, 1, 1 do - affineL_ltp.trans[i][j] = math.random() - 0.5 - end - end - local affineL_bp = nerv.BiasParam('AffineL_bp', global_conf) - affineL_bp.trans = nerv.CuMatrixFloat(1, 2) - for j = 0, 1, 1 do - affineL_bp.trans[j] = math.random() - 0.5 - end - - local chunk = nerv.ChunkFile(param_fn, 'w') - chunk:write_chunk(affineL_ltp) - chunk:write_chunk(affineL_bp) - chunk:close() - - paramRepo = nerv.ParamRepo({param_fn}) -end - ---layers -layerRepo = nerv.LayerRepo({ - ["nerv.AffineLayer"] = - { - ["AffineL"] = {{["ltp"] = "AffineL_ltp", ["bp"] = "AffineL_bp"}, {["dim_in"] = {input_dim}, ["dim_out"] = {2}}}, - }, - ["nerv.SoftmaxCELayer"] = - { - ["SoftmaxL"] = {{}, {["dim_in"] = {2, 2}, ["dim_out"] = {}}} - }, - }, paramRepo, global_conf) -affineL = layerRepo:get_layer("AffineL") -softmaxL = layerRepo:get_layer("SoftmaxL") -print('layers initializing...') -dagL = nerv.DAGLayer("DAGL", global_conf, {["dim_in"] = {input_dim, 2}, ["dim_out"] = {}, ["sub_layers"] = layerRepo, - ["connections"] = { - ["[1]"] = "AffineL[1]", - ["AffineL[1]"] = "SoftmaxL[1]", - ["[2]"] = "SoftmaxL[2]", - }}) -dagL:init(data_num) ---affineL:init() ---softmaxL:init() ---[[network building end]]-- - - ---[[begin space allocation]]-- -print('network input&output&error space allocation...') -dagL_input = {dataM, labelM} -dagL_output = {} -dagL_err = {} -dagL_ierr = {nerv.CuMatrixFloat(data_num, input_dim), nerv.CuMatrixFloat(data_num, 2)} ---[[space allocation end]]-- - - ---[[begin training]]-- -ce_last = 0 -for l = 0, 10, 1 do - dagL:propagate(dagL_input, dagL_output) - dagL:back_propagate(dagL_ierr, dagL_err, dagL_input, dagL_output) - dagL:update(dagL_err, dagL_input, dagL_output) - - if (l % 2 == 0) then - nerv.utils.printf("training iteration %d finished\n", l) - nerv.utils.printf("cross entropy: %.8f\n", softmaxL.total_ce - ce_last) - --nerv.utils.printf("accurate labels: %d\n", calculate_accurate(output, 
labelM)) - nerv.utils.printf("total frames processed: %.8f\n", softmaxL.total_frames) - end - ce_last = softmaxL.total_ce -end ---[[end training]]-- -``` \ No newline at end of file diff --git a/doc/nerv_param.md b/doc/nerv_param.md deleted file mode 100644 index 167cb11..0000000 --- a/doc/nerv_param.md +++ /dev/null @@ -1,27 +0,0 @@ -#The Nerv Parameter Package# -Part of the [Nerv](../README.md) toolkit. - -##Description## -###Class hierarchy### -There is a base class __Nerv.Param__ defined in `layer/init.lua`. - -###Class hierarchy and their members### -* __nerv.MatrixParam__ inherits __nerv.Param__ - * `Matrix trans` stores the parameter matrix. -* __nerv.LinearTransParam__ inherits __Nerv.MatrixParam__. -* __Nerv.BiasParam__ inherits __Nerv.MatrixParam__. - -##Methods## -* __void Param.\_\_init(Param self, string id, table global_conf)__ -Constructor of a __Param__, it will set `self.id` to be `id` and `self.gconf` to be `global_conf`. -* __void Param.set_info(Param self, table info)__ -Set `self.info` to be `info`. -* __table Param.get_info(Param self)__ -Returns `self.info`. -* __void Param.read(Param self, ChunkData pcdata)__ -Abstract method. -In this method, `self` should in turn calls its members to load from `pcdata`. -* __void Param.write(Param self, ChunkFileHandle pfhandle)__ -Abstract method. -Save parameters to file. In this method, `self` should in turn calls its members to save to `pfhandle`. - diff --git a/examples/asr_trainer.lua b/examples/asr_trainer.lua deleted file mode 100644 index a5727be..0000000 --- a/examples/asr_trainer.lua +++ /dev/null @@ -1,106 +0,0 @@ -function build_trainer(ifname) - local param_repo = nerv.ParamRepo() - param_repo:import(ifname, nil, gconf) - local sublayer_repo = make_sublayer_repo(param_repo) - local layer_repo = make_layer_repo(sublayer_repo, param_repo) - local crit = get_criterion_layer(sublayer_repo) - local network = get_network(layer_repo) - local input_order = get_input_order() - local iterative_trainer = function (prefix, scp_file, bp) - gconf.randomize = bp - -- build buffer - local buffer = make_buffer(make_readers(scp_file, layer_repo)) - -- initialize the network - network:init(gconf.batch_size) - gconf.cnt = 0 - err_input = {nerv.CuMatrixFloat(256, 1)} - err_input[1]:fill(1) - for data in buffer.get_data, buffer do - -- prine stat periodically - gconf.cnt = gconf.cnt + 1 - if gconf.cnt == 1000 then - print_stat(sublayer_repo) - nerv.CuMatrix.print_profile() - nerv.CuMatrix.clear_profile() - gconf.cnt = 0 - -- break - end - local input = {} --- if gconf.cnt == 100 then break end - for i, id in ipairs(input_order) do - if data[id] == nil then - nerv.error("input data %s not found", id) - end - table.insert(input, data[id]) - end - local output = {nerv.CuMatrixFloat(256, 1)} - err_output = {input[1]:create()} - network:propagate(input, output) - if bp then - network:back_propagate(err_input, err_output, input, output) - network:update(err_input, input, output) - end - -- collect garbage in-time to save GPU memory - collectgarbage("collect") - end - print_stat(sublayer_repo) - nerv.CuMatrix.print_profile() - nerv.CuMatrix.clear_profile() - if (not bp) and prefix ~= nil then - nerv.info("writing back...") - local fname = string.format("%s_cv%.3f.nerv", - prefix, get_accuracy(sublayer_repo)) - network:get_params():export(fname, nil) - end - return get_accuracy(sublayer_repo) - end - return iterative_trainer -end - -dofile(arg[1]) -start_halving_inc = 0.5 -halving_factor = 0.6 -end_halving_inc = 0.1 -min_iter = 1 -max_iter = 20 
-min_halving = 5 -gconf.batch_size = 256 -gconf.buffer_size = 81920 - -local pf0 = gconf.initialized_param -local trainer = build_trainer(pf0) ---local trainer = build_trainer("c3.nerv") -local accu_best = trainer(nil, gconf.cv_scp, false) -local do_halving = false - -nerv.info("initial cross validation: %.3f", accu_best) -for i = 1, max_iter do - nerv.info("[NN] begin iteration %d with lrate = %.6f", i, gconf.lrate) - local accu_tr = trainer(nil, gconf.tr_scp, true) - nerv.info("[TR] training set %d: %.3f", i, accu_tr) - local accu_new = trainer( - string.format("%s_%s_iter_%d_lr%f_tr%.3f", - string.gsub( - (string.gsub(pf0[1], "(.*/)(.*)", "%2")), - "(.*)%..*", "%1"), - os.date("%Y%m%d%H%M%S"), - i, gconf.lrate, - accu_tr), - gconf.cv_scp, false) - nerv.info("[CV] cross validation %d: %.3f", i, accu_new) - -- TODO: revert the weights - local accu_diff = accu_new - accu_best - if do_halving and accu_diff < end_halving_inc and i > min_iter then - break - end - if accu_diff < start_halving_inc and i >= min_halving then - do_halving = true - end - if do_halving then - gconf.lrate = gconf.lrate * halving_factor - end - if accu_new > accu_best then - accu_best = accu_new - end --- nerv.Matrix.print_profile() -end diff --git a/examples/chunk_file_example.lua b/examples/chunk_file_example.lua deleted file mode 100644 index 5961c98..0000000 --- a/examples/chunk_file_example.lua +++ /dev/null @@ -1,53 +0,0 @@ --- To define a readable and writable chunk, one must define a class with the --- following methods: __init(id, global_conf), read(handle), write(handle), --- get_info(), set_info(info) and an id attribute. This file demonstrates a --- basic chunk implementation which manages the I/O of a matrix - -local MatrixChunk = nerv.class("nerv.MatrixChunk") - -function MatrixChunk:__init(id, global_conf) - self.id = id - self.info = {} - self.gconf = global_conf -end - -function MatrixChunk:read(handle) - -- pass the read handle to the matrix method - self.data = nerv.MMatrixFloat.load(handle) -end - -function MatrixChunk:write(handle) - -- pass the write handle to the matrix method - self.data:save(handle) -end - -function MatrixChunk:get_info() - return self.info -end - -function MatrixChunk:set_info(info) - self.info = info -end - -function MatrixChunk.create_from_matrix(id, mat) - local ins = nerv.MatrixChunk(id) - ins.data = mat - return ins -end - -mat = nerv.MMatrixFloat(3, 4) -for i = 0, 2 do - for j = 0, 3 do - mat[i][j] = i + j - end -end - -cd = nerv.MatrixChunk.create_from_matrix("matrix1", mat) - -cf = nerv.ChunkFile("test.nerv", "w") -cf:write_chunk(cd) -cf:close() - -cf2 = nerv.ChunkFile("test.nerv", "r") -cd2 = cf2:read_chunk("matrix1") -print(cd2.data) diff --git a/examples/cumatrix_example.lua b/examples/cumatrix_example.lua deleted file mode 100644 index 544fc7f..0000000 --- a/examples/cumatrix_example.lua +++ /dev/null @@ -1,31 +0,0 @@ -m = 4 -n = 4 -fm = nerv.CuMatrixFloat(m, n) -dm = nerv.CuMatrixDouble(m, n) -for i = 0, m - 1 do - for j = 0, n - 1 do - -- local t = math.random(10) - t = i / (j + 1) - fm[i][j] = t - dm[i][j] = t - end -end -print(fm) -fs = fm:create() -fs:softmax(fm) --- print(fs) -print(dm) -ds = dm:create() -ds:softmax(dm) --- print(ds) -print(fs) -print(fs + fs) -print(ds + ds) -print(fs - fs) -print(ds - ds) - -a = fs:create() -a:mul_elem(fs, fs) -print(a) -a:log_elem(fs) -print(a) diff --git a/examples/cumatrix_from_mmatrix.lua b/examples/cumatrix_from_mmatrix.lua deleted file mode 100644 index 2309e14..0000000 --- a/examples/cumatrix_from_mmatrix.lua +++ 
/dev/null @@ -1,32 +0,0 @@ -m = 3 -n = 4 -fm = nerv.MMatrixFloat(m, n) -dm = nerv.MMatrixDouble(m, n) -for i = 0, m - 1 do - for j = 0, n - 1 do - -- local t = math.random(10) - t = i / (j + 1) - fm[i][j] = t - dm[i][j] = t - end -end -print(fm) -print(dm) - -fc = nerv.CuMatrixFloat(m, n) -dc = nerv.CuMatrixDouble(m, n) -fc:copy_fromh(fm) -dc:copy_fromh(dm) -print("fc and dc") -print(fc) -print(dc) -dc[1]:copy_tod(dc[0]) -print("dc[1] copied to dc[0]") -print(dc) -print("softmax of fc and dc") -sfc = fc:create() -sdc = dc:create() -sfc:softmax(fc) -print(sfc) -sdc:softmax(dc) -print(sdc) diff --git a/examples/mmatrix_example.lua b/examples/mmatrix_example.lua deleted file mode 100644 index 8ddfe84..0000000 --- a/examples/mmatrix_example.lua +++ /dev/null @@ -1,20 +0,0 @@ -m = 10 -n = 10 -fm = nerv.MMatrixFloat(m, n) -dm = nerv.MMatrixDouble(m, n) -for i = 0, m - 1 do - for j = 0, n - 1 do - -- local t = math.random(10) - t = i / (j + 1) - fm[i][j] = t - dm[i][j] = t - end -end -print("test fm:get_dataref_value:", fm:get_dataref_value()) -print("forced a garbade collect") -collectgarbage("collect") -print("test fm:get_dataref_value:", fm:get_dataref_value()) -print(fm) --- print(fm:softmax()) -print(dm) --- print(dm:softmax()) diff --git a/examples/oop_example.c b/examples/oop_example.c deleted file mode 100644 index 59dfc5a..0000000 --- a/examples/oop_example.c +++ /dev/null @@ -1,101 +0,0 @@ -#include -#include -#include -#include "../common.h" - -#define SQR(x) ((x) * (x)) - -const char *point_tname = "nerv.Point"; -const char *better_point_tname = "nerv.BetterPoint"; - -typedef struct { - double x, y; -} Point; - -static int point_norm (lua_State *L) { - Point *p = luaT_checkudata(L, 1, point_tname); - lua_pushnumber(L, sqrt(SQR(p->x) + SQR(p->y))); - return 1; -} - -static int point_set_x (lua_State *L) { - Point *p = luaT_checkudata(L, 1, point_tname); - p->x = luaL_checknumber(L, 2); - return 0; -} - -static int point_set_y (lua_State *L) { - Point *p = luaT_checkudata(L, 1, point_tname); - p->y = luaL_checknumber(L, 2); - return 0; -} - -/* generic constructor */ -void point_new_(Point *self, double x, double y) { - self->x = x; - self->y = y; -} - -int point_new(lua_State *L) { - /* `_new` function should create the object itself */ - Point *self = (Point *)malloc(sizeof(Point)); - point_new_(self, luaL_checknumber(L, 1), luaL_checknumber(L, 2)); - luaT_pushudata(L, self, point_tname); - fprintf(stderr, "[example] %s constructor is invoked\n", - point_tname); - return 1; -} - -static const luaL_Reg point_methods[] = { - {"set_x", point_set_x}, - {"set_y", point_set_y}, - {"norm", point_norm}, - {NULL, NULL} -}; - - -/* the subclass method overrides the one from baseclass */ -static int better_point_norm (lua_State *L) { - Point *p = luaT_checkudata(L, 1, point_tname); - lua_pushnumber(L, fabs(p->x) + fabs(p->y)); - return 1; -} - -int better_point_new(lua_State *L) { - /* `_new` function should create the object itself */ - Point *self = (Point *)malloc(sizeof(Point)); - point_new_(self, luaL_checknumber(L, 1), luaL_checknumber(L, 2)); - luaT_pushudata(L, self, better_point_tname); - fprintf(stderr, "[example] %s constructor is invoked\n", - better_point_tname); - return 1; -} - -static const luaL_Reg better_point_methods[] = { - {"norm", better_point_norm}, - {NULL, NULL} -}; - -void nerv_point_init(lua_State *L) { - /* create a class and let luaT know */ - luaT_newmetatable(L, point_tname, NULL, point_new, NULL, NULL); - /* register member functions */ - luaL_register(L, NULL, 
point_methods); - /* keep the stack balanced, see `nerv.c` */ - lua_pop(L, 1); -} - -void nerv_better_point_init(lua_State *L) { - /* create a class and let luaT know */ - luaT_newmetatable(L, better_point_tname, point_tname, - better_point_new, NULL, NULL); - /* register member functions */ - luaL_register(L, NULL, better_point_methods); - /* keep the stack balanced, see `nerv.c` */ - lua_pop(L, 1); -} - -void nerv_example_init(lua_State *L) { - nerv_point_init(L); - nerv_better_point_init(L); -} diff --git a/examples/oop_example.lua b/examples/oop_example.lua deleted file mode 100644 index b753288..0000000 --- a/examples/oop_example.lua +++ /dev/null @@ -1,16 +0,0 @@ -p = nerv.Point(0, 0) -- create a Point instance -print(p) -print(p:norm()) -- get 2-norm of the Point -p:set_x(1.0) -p:set_y(2.0) -print(p:norm()) -- get 2-norm of the Point - -bp = nerv.BetterPoint(1, 2) --- use methods from base class -bp:set_x(1.0) -bp:set_y(2.0) -print(bp) -print(bp:norm()) --get 1-norm of the Point - -print(p.__typename) -print(bp.__typename) diff --git a/examples/swb_baseline.lua b/examples/swb_baseline.lua deleted file mode 100644 index 8b7e01a..0000000 --- a/examples/swb_baseline.lua +++ /dev/null @@ -1,166 +0,0 @@ -require 'speech.init' -gconf = {lrate = 0.8, wcost = 1e-6, momentum = 0.9, - cumat_type = nerv.CuMatrixFloat, - mmat_type = nerv.MMatrixFloat, - frm_ext = 5, - tr_scp = "/slfs1/users/mfy43/swb_ivec/train_bp.scp", - cv_scp = "/slfs1/users/mfy43/swb_ivec/train_cv.scp", - htk_conf = "/slfs1/users/mfy43/swb_ivec/plp_0_d_a.conf", - initialized_param = {"/slfs1/users/mfy43/swb_init.nerv", - "/slfs1/users/mfy43/swb_global_transf.nerv"}, - debug = false} - -function make_sublayer_repo(param_repo) - return nerv.LayerRepo( - { - -- global transf - ["nerv.BiasLayer"] = - { - blayer1 = {{bias = "bias1"}, {dim_in = {429}, dim_out = {429}}}, - blayer2 = {{bias = "bias2"}, {dim_in = {429}, dim_out = {429}}} - }, - ["nerv.WindowLayer"] = - { - wlayer1 = {{window = "window1"}, {dim_in = {429}, dim_out = {429}}}, - wlayer2 = {{window = "window2"}, {dim_in = {429}, dim_out = {429}}} - }, - -- biased linearity - ["nerv.AffineLayer"] = - { - affine0 = {{ltp = "affine0_ltp", bp = "affine0_bp"}, - {dim_in = {429}, dim_out = {2048}}}, - affine1 = {{ltp = "affine1_ltp", bp = "affine1_bp"}, - {dim_in = {2048}, dim_out = {2048}}}, - affine2 = {{ltp = "affine2_ltp", bp = "affine2_bp"}, - {dim_in = {2048}, dim_out = {2048}}}, - affine3 = {{ltp = "affine3_ltp", bp = "affine3_bp"}, - {dim_in = {2048}, dim_out = {2048}}}, -