78 files changed, 4065 insertions, 2055 deletions
diff --git a/nerv/Makefile b/nerv/Makefile
index c0db53a..68465a1 100644
--- a/nerv/Makefile
+++ b/nerv/Makefile
@@ -1,3 +1,11 @@
+ifndef LUA_BINDIR
+$(error Please build the package via luarocks: `luarocks make`)
+endif
+
+ifndef CUDA_BASE
+$(error CUDA_BASE is not set)
+endif
+
 .PHONY: build install clean
 SHELL := /bin/bash
 
@@ -6,14 +14,15 @@ LIB_PATH := $(LUA_BINDIR)/../lib
 INC_PATH := $(LUA_BINDIR)/../include/nerv
 LUA_DIR = $(INST_LUADIR)/nerv
 OBJ_DIR := $(BUILD_DIR)/objs
-ISUBDIR := io matrix luaT
+ISUBDIR := lib matrix lib/io lib/matrix lib/luaT
 SUBDIR := matrix io layer examples nn tnn lib/io lib/luaT lib/matrix
 
 INC_SUBDIR := $(addprefix $(INC_PATH)/,$(ISUBDIR))
 OBJ_SUBDIR := $(addprefix $(OBJ_DIR)/,$(SUBDIR))
 LUA_SUBDIR := $(addprefix $(LUA_DIR)/,$(SUBDIR))
 
-INCS := common.h matrix/matrix.h io/chunk_file.h luaT/luaT.h
+INCS := lib/common.h lib/matrix/matrix.h lib/matrix/mmatrix.h lib/io/chunk_file.h lib/luaT/luaT.h \
+		matrix/matrix.h
 CORE_OBJS := lib/common.o lib/io/chunk_file.o \
 			lib/matrix/mmatrix.o lib/matrix/cumatrix.o lib/matrix/cukernel.o
 NERV_OBJS := nerv.o \
@@ -33,17 +42,17 @@ LUA_LIBS := matrix/init.lua io/init.lua init.lua \
 			layer/init.lua layer/affine.lua layer/sigmoid.lua layer/tanh.lua layer/softmax_ce.lua layer/softmax.lua \
 			layer/window.lua layer/bias.lua layer/combiner.lua layer/mse.lua \
 			layer/elem_mul.lua layer/lstm.lua layer/lstm_gate.lua layer/dropout.lua layer/gru.lua \
-			nn/init.lua nn/layer_repo.lua nn/param_repo.lua nn/layer_dag.lua \
-			io/sgd_buffer.lua \
-			tnn/init.lua tnn/sutil.lua tnn/tnn.lua
+			layer/graph.lua layer/rnn.lua layer/duplicate.lua layer/identity.lua \
+			nn/init.lua nn/layer_repo.lua nn/param_repo.lua nn/network.lua \
+			io/sgd_buffer.lua io/seq_buffer.lua
 
 INCLUDE := -I $(LUA_INCDIR) -DLUA_USE_APICHECK
-#CUDA_BASE := /usr/local/cuda-7.0
-CUDA_BASE := /usr/local/cuda
 CUDA_INCLUDE := -I $(CUDA_BASE)/include/
 INCLUDE += $(CUDA_INCLUDE)
 
-LDFLAGS := -L$(CUDA_BASE)/lib64/ -Wl,-rpath=$(CUDA_BASE)/lib64/ -lcudart -lcublas -lcurand
+CUDA_LDFLAGS := -L$(CUDA_BASE)/lib64/ -Wl,-rpath=$(CUDA_BASE)/lib64/ -lcudart -lcuda -lcublas -lcurand
+override CFLAGS += $(NERV_FEAT)
+
 NVCC := $(CUDA_BASE)/bin/nvcc
 EMPTY :=
 SPACE := $(EMPTY) $(EMPTY)
@@ -64,11 +73,11 @@ $(LUA_DIR)/%.lua: %.lua
 	cp $< $@
 
 $(LIB_PATH)/libnervcore.so: $(CORE_OBJS)
-	gcc -shared -o $@ $^ $(LDFLAGS) -lcblas
+	gcc -shared -o $@ $^ $(LDFLAGS) $(CUDA_LDFLAGS) $(BLAS_LDFLAGS)
 $(LIB_PATH)/libluaT.so: $(LUAT_OBJS)
-	gcc -shared -o $@ $^ $(LDFLAGS)
+	gcc -shared -o $@ $^
 $(INST_LIBDIR)/libnerv.so: $(NERV_OBJS) $(LIB_PATH)/libnervcore.so $(LIB_PATH)/libluaT.so
-	gcc -shared -o $@ $(NERV_OBJS) $(LDFLAGS) -Wl,-rpath=$(LIB_PATH) -L$(LIB_PATH) -lnervcore -lluaT
+	gcc -shared -o $@ $(NERV_OBJS) -Wl,-rpath=$(LIB_PATH) -L$(LIB_PATH) -lnervcore -lluaT
 
 $(OBJ_DIR)/matrix/cumatrix.o: matrix/generic/cumatrix.c matrix/generic/matrix.c
 $(OBJ_DIR)/matrix/mmatrix.o: matrix/generic/mmatrix.c matrix/generic/matrix.c
@@ -82,5 +91,5 @@ clean:
 
 install: $(LIBS) $(LUA_DIR) $(LUA_SUBDIR) $(LUA_LIBS) $(INC_SUBDIR) $(INCS)
 
-$(INC_PATH)/%.h: lib/%.h
+$(INC_PATH)/%.h: %.h
 	cp $< $@
diff --git a/nerv/doc/nerv.md b/nerv/doc/nerv.md
index 28411f5..125928d 100644
--- a/nerv/doc/nerv.md
+++ b/nerv/doc/nerv.md
@@ -1,6 +1,6 @@
-#The Nerv utility functions#
+# The Nerv utility functions
 Part of the [Nerv](../README.md) toolkit.
-##Methods##
+## Methods
 * __string = nerv.typename(obj a)__  
 A registered function, the original function is `luaT_lua_typename`. In some cases if you call `type(a)`  for object of some class in __Nerv__(like __Nerv.CuMatrix__) it will only return "userdata"(because it is created in C), in this case you can use this method to get its type.
 
@@ -14,4 +14,4 @@ A registered function, the original function is `luaT_newmetatable`, it returns
 * __string = nerv.setmetatable(table self, string tname)__  
 A registered function, the original function is `luaT_lua_setmetatable`. It assigns the metatable registered in __luaT__ by the name *tname* to the table *self*. And return *tname* to user.
 * __table = nerv.get_type(string typename)__  
-Returns the type(`loadstring("return " .. typename)`).
-\ No newline at end of file
+Returns the type(`loadstring("return " .. typename)`).
diff --git a/nerv/doc/nerv_class.md b/nerv/doc/nerv_class.md
index 99f63e7..8314b12 100644
--- a/nerv/doc/nerv_class.md
+++ b/nerv/doc/nerv_class.md
@@ -1,10 +1,10 @@
-#The Nerv OOP#
+# The Nerv OOP
 Part of the [Nerv](../README.md) toolkit.
-##Methods##
+## Methods
 * __metatable mt, metatable mpt = nerv.class(string tname, string parenttname)__  
 This method is used to create a class by the name `tname`, which inherits `parenttname` in __Nerv__, then you create a new instance of this class by calling `obj=tname(...)`. The  `tname.__init(...)` method(if defined) will be called in the constructing. The metatable of the class and its parent class will be returned.
 
-##Examples##
+## Examples
 * This example implements a simple `nerv.Counter` class which is inherited by `nerv.BetterCounter`.  
 
 ```
@@ -33,4 +33,4 @@ c1 = nerv.Counter(1)
 print(c1.c)
 bc1 = nerv.BetterCounter(1, 1)
 print(bc1.c, bc1.bc)
-```
-\ No newline at end of file
+```
diff --git a/nerv/doc/nerv_io.md b/nerv/doc/nerv_io.md
index 07589df..299362f 100644
--- a/nerv/doc/nerv_io.md
+++ b/nerv/doc/nerv_io.md
@@ -1,7 +1,7 @@
-#The Nerv IO Package#
+# The Nerv IO Package
 Part of the [Nerv](../README.md) toolkit.
 
-##Description##
+## Description
 The main class that the user uses to store and read parameter object to and from files is __nerv.ChunkFile__.  
 In the file, a parameter object will be saved using a standard format. First is the length(in byte) of this object, then a table which includes some meta information of the object, and a data area. Below is an example text file.  
 ```
@@ -23,7 +23,7 @@ In the file, a parameter object will be saved using a standard format. First is
 3.000000 3.000000 3.000000 
 ```
 
-##Methods##
+## Methods
 * __ChunkFile ChunkFile(string fn, string mode)__  
 `mode` can be `r` or `w`, for reading or writing a file. The returned __ChunkFile__ will be ready to write or read objects which follows the __nerv.Param__ interface(using `write_chunk` and `read_chunk`). 
 * __void ChunkFile.write_chunk(ChunkFile self, Param p)__  
@@ -33,7 +33,7 @@ Read the __Param__ object by id `id` from the file `self`. It will be constructe
 * __void ChunkFile.close(ChunkFile self)__  
 Close the opened file.
 
-##Examples##
+## Examples
 * An example showing how to use __ChunkFile__ to store and read parameter objects.
 ```
 require 'io'
@@ -96,7 +96,7 @@ do
 end
 ```
 
-##Developer Notes##
+## Developer Notes
 * There are four classes in to deal with chunk data, which are __nerv.ChunkFile__, __nerv.ChunkFileHandle__, __nerv.ChunkInfo__, __nerv.ChunkData__. Below is the underlying C structs.
 ```
 typedef struct ChunkFileHandle {
@@ -110,4 +110,5 @@ typedef struct ChunkData {
     char *data;
 } ChunkData;
 ```
-* In __Nerv.io__, a returned(by `ChunkFile.__init`) __nerv.ChunkFile__ will have a member `handle`, which is a __nerv.ChunkFileHandle__.  
-\ No newline at end of file
+
+* In __Nerv.io__, a returned(by `ChunkFile.__init`) __nerv.ChunkFile__ will have a member `handle`, which is a __nerv.ChunkFileHandle__.  
diff --git a/nerv/doc/nerv_layer.md b/nerv/doc/nerv_layer.md
index de2fb12..dd7c9bb 100644
--- a/nerv/doc/nerv_layer.md
+++ b/nerv/doc/nerv_layer.md
@@ -1,9 +1,9 @@
-#The Nerv Layer Package#
+# The Nerv Layer Package
 Part of the [Nerv](../README.md) toolkit.
 
-##Description##
+## Description
 __nerv.Layer__ is the base class and most of its methods are abstract.  
-###Class hierarchy and their members###
+### Class hierarchy and their members
 * __nerv.Layer__.  
 	* `table dim_in` It specifies the dimensions of the inputs.  
 	* `table dim_out` It specifies the dimensions of the outputs.  
@@ -20,7 +20,7 @@ __nerv.Layer__ is the base class and most of its methods are abstract.
 	* `int total_frams` Records how many frames have passed.  
 	* `bool compressed` The reference distribution can be a one-hot format. This feature is enabled by `layer_conf.compressed`.
 
-##Methods##
+## Methods
 * __void Layer.\_\_init(Layer self, string id, table global_conf, table layer_conf)__  
 Abstract method.  
 The constructing method should assign `id` to `self.id` and `global_conf` to `self.gconf`, `layer_conf.dim_in` to `self.dim_in`, `layer_conf.dim_out` to `self.dim_out`. `dim_in` and `dim_out` are a list specifies the dimensions of the inputs and outputs. Also, `layer_conf` will include the parameters, which should also be properly saved.
@@ -43,7 +43,7 @@ Check whether `#self.dim_in == len_in` and `#self.dim_out == len_out`, if violat
 Abstract method.  
 The layer should return a list containing its parameters.
 
-####nerv.Layer.get\_dim(self)####
+#### nerv.Layer.get\_dim(self)
 *	Returns:
 	`dim_in`: __table__.  
     `dim_out`: __table__.  
@@ -52,7 +52,7 @@ The layer should return a list containing its parameters.
 *	Description:  
 	Returns `self.dim_in, self.dim_out`.
 
-##Examples##
+## Examples
 * a basic example using __Nerv__ layers to a linear classification.
 
 ```
@@ -178,3 +178,4 @@ for l = 0, 10, 1 do
 end
 --[[end training]]--
 ```
+
diff --git a/nerv/doc/nerv_matrix.md b/nerv/doc/nerv_matrix.md
index dfd843d..3782eb3 100644
--- a/nerv/doc/nerv_matrix.md
+++ b/nerv/doc/nerv_matrix.md
@@ -1,8 +1,8 @@
-#The Nerv Matrix Package#
+# The Nerv Matrix Package
 Part of the [Nerv](../README.md) toolkit.
 
-##Description##
-###Underlying structure###
+## Description
+### Underlying structure
 In the begining is could be useful to know something about the underlying structure of a __Nerv__ matrix. Please keep in mind that matrice in __Nerv__ is row-major.  
 Every matrix object is a encapsulation of a C struct that describes the attributes of this matrix.  
 ```
@@ -20,12 +20,12 @@ typedef struct Matrix {
 It is worth mentioning that that `data_ref` is a counter which counts the number of references to its memory space, mind that it will also be increased when a row of the matrix is referenced(`col = m[2]`). A __Nerv__ matrix will deallocate its space when this counter is decreased to zero.
 Also note that all assigning operation in __Nerv__ is reference copy, you can use `copy_tod` or `copy_toh` method to copy value. Also, row assigning operations like `m1[2]=m2[3]` is forbidden in __Nerv__.
 
-###Class hierarchy###
+### Class hierarchy
 The class hierarchy of the matrix classes can be clearly observed in `matrix/init.c`.
 First there is a abstract base class __Nerv.Matrix__, which is inherited by __Nerv.CuMatrix__ and __Nerv.MMatrix__(also abstract).  
 Finally, there is __Nerv.CuMatrixFloat__, __Nerv.CuMatrixDouble__, inheriting __Nerv.CuMatrix__, and __Nerv.MMatrixFloat__, __Nerv.MMatrixDouble__, __Nerv.MMatrixInt__ , inheriting __Nerv.MMatrix__.
 
-##Methods##
+## Methods
 Mind that usually a matrix object can only do calculation with matrix of its own type(a __Nerv.CuMatrixFloat__ matrix can only do add operation with a __Nerv.CuMatrixFloat__).  
 In the methods description below, __Matrix__ could be __Nerv.CuMatrixFloat__, __Nerv.CuMatrixDouble__, __Nerv.MMatrixFloat__ or __Nerv.MMatrixDouble__. __Element_type__ could be `float` or `double`, respectively.
 * __Matrix = Matrix(int nrow, int ncol)__  
@@ -53,6 +53,8 @@ Return a new __Matrix__ of size (1,`self.ncol`), which stores the sum of all col
 Return a new __Matrix__ of size (`self.nrow`,1), which stores the sum of all rows of __Matrix__ `self`.
 * __Matrix Matrix.rowmax(Matrix self)__  
 Return a new __Matrix__ of size (`self.nrow`,1), which stores the max value of all rows of __Matrix__ `self`.
+* __Matrix Matrix.rowmax_idx(Matrix self)__  
+Return two new __Matrix__ objects of size (`self.nrow`,1), which store the max value of each row of __Matrix__ `self` and the corresponding column indices (starting from zero).
 * __Matrix Matrix.trans(Matrix self)__  
 Return a new __Matrix__ of size (`self.ncol`,`self.nrow`), which stores the transpose of __Matrix__ `self`.
 * __void Matrix.copy_fromh(Matrix self, MMatrix a)__  
@@ -81,8 +83,8 @@ Fill the content of __Matrix__ `self` to be `value`.
 Set the element of __Matrix__ `self` to be elementwise-sigmoid of `ma`.
 * __void Matrix.sigmoid_grad(Matrix self, Matrix err, Matrix output)__  
 Set the element of __Matrix__ `self`, to be `self[i][j]=err[i][j]*output[i][j]*(1-output[i][j])`. This function is used to propagate sigmoid layer error.
-* __void Matrix.softmax(Matrix self, Matrix a)__  
-Calculate a row-by-row softmax of __Matrix__ `a` and save the result in `self`.
+* __Matrix Matrix.softmax(Matrix self, Matrix a)__  
+Calculate a row-by-row softmax of __Matrix__ `a` and save the result in `self`. Returns a new `self.nrow*1` index matrix that stores the index of the maximum value of each row.
 * __void Matrix.mul_elem(Matrix self, Matrix ma, Matrix mb)__  
 Calculate element-wise multiplication of __Matrix__ `ma` and `mb`, store the result in `self`.
 * __void Matrix.log_elem(Matrix self, Matrix ma)__  
@@ -113,7 +115,7 @@ Write `self` to the file position in `chunk`.
 * __void MMatrix.copy_from(MMatrix ma, MMatrix mb,[int b_bgein, int b_end, int a_begin])__  
 Copy a part of `mb`(rows of index `[b_begin..b_end)`) to `ma` beginning at row index `a_begin`. If not specified, `b_begin` will be `0`, `b_end` will be `b.nrow`, `a_begin` will be `0`.
 
-##Examples##
+## Examples
 * Use `get_dataref_value` to test __Nerv__'s matrix space allocation.  
 ```
 m = 10
@@ -134,6 +136,7 @@ print("test fm:get_dataref_value:", fm:get_dataref_value())
 print(fm)
 print(dm)
 ```
+
 * Test some __Matrix__ calculations.
 ```
 m = 4
@@ -167,3 +170,4 @@ print(a)
 a:log_elem(fs)
 print(a)
 ```
+
diff --git a/nerv/doc/nerv_nn.md b/nerv/doc/nerv_nn.md
index c57447d..63537fb 100644
--- a/nerv/doc/nerv_nn.md
+++ b/