authorDeterminant <[email protected]>2015-06-22 19:01:29 +0800
committerDeterminant <[email protected]>2015-06-22 19:01:29 +0800
commit2497fd9e7a0fae5ee4887890d7a312e0e08a93b8 (patch)
tree382f97575bd2df9ee6abb1662b11b279fc22d72b /nerv
parent196e9b48a3541caccdffc5743001cced70667091 (diff)
major change: use luarocks to manage project
Diffstat (limited to 'nerv')
-rwxr-xr-x  nerv  2
-rw-r--r--  nerv/.gitignore  1
-rw-r--r--  nerv/Makefile  60
-rw-r--r--  nerv/common.c  76
-rw-r--r--  nerv/common.h  36
-rw-r--r--  nerv/doc/nerv.md  17
-rw-r--r--  nerv/doc/nerv_class.md  36
-rw-r--r--  nerv/doc/nerv_io.md  113
-rw-r--r--  nerv/doc/nerv_layer.md  180
-rw-r--r--  nerv/doc/nerv_matrix.md  165
-rw-r--r--  nerv/doc/nerv_nn.md  256
-rw-r--r--  nerv/doc/nerv_param.md  27
-rw-r--r--  nerv/examples/asr_trainer.lua  106
-rw-r--r--  nerv/examples/chunk_file_example.lua  53
-rw-r--r--  nerv/examples/cumatrix_example.lua  31
-rw-r--r--  nerv/examples/cumatrix_from_mmatrix.lua  32
-rw-r--r--  nerv/examples/mmatrix_example.lua  20
-rw-r--r--  nerv/examples/oop_example.c  101
-rw-r--r--  nerv/examples/oop_example.lua  16
-rw-r--r--  nerv/examples/swb_baseline.lua  166
-rw-r--r--  nerv/examples/test_dnn_layers.lua  78
-rw-r--r--  nerv/examples/test_nn_lib.lua  164
-rw-r--r--  nerv/init.lua  128
-rw-r--r--  nerv/io/chunk_file.c  325
-rw-r--r--  nerv/io/chunk_file.h  23
-rw-r--r--  nerv/io/init.c  6
-rw-r--r--  nerv/io/init.lua  55
-rw-r--r--  nerv/io/sgd_buffer.lua  111
-rw-r--r--  nerv/layer/affine.lua  91
-rw-r--r--  nerv/layer/bias.lua  28
-rw-r--r--  nerv/layer/combiner.lua  59
-rw-r--r--  nerv/layer/init.lua  79
-rw-r--r--  nerv/layer/mse.lua  52
-rw-r--r--  nerv/layer/sigmoid.lua  31
-rw-r--r--  nerv/layer/softmax_ce.lua  68
-rw-r--r--  nerv/layer/window.lua  28
-rw-r--r--  nerv/luaT/README.md  239
-rw-r--r--  nerv/luaT/luaT.c  1079
-rw-r--r--  nerv/luaT/luaT.h  111
-rw-r--r--  nerv/matrix/cuda_helper.h  75
-rw-r--r--  nerv/matrix/cukernel.cu  17
-rw-r--r--  nerv/matrix/cukernel.h  20
-rw-r--r--  nerv/matrix/cumatrix.c  87
-rw-r--r--  nerv/matrix/generic/cukernel.cu  571
-rw-r--r--  nerv/matrix/generic/cumatrix.c  493
-rw-r--r--  nerv/matrix/generic/elem_type.h  22
-rw-r--r--  nerv/matrix/generic/matrix.c  155
-rw-r--r--  nerv/matrix/generic/matrix.h  19
-rw-r--r--  nerv/matrix/generic/mmatrix.c  122
-rw-r--r--  nerv/matrix/init.c  35
-rw-r--r--  nerv/matrix/init.lua  77
-rw-r--r--  nerv/matrix/mmatrix.c  77
-rw-r--r--  nerv/nerv  13
-rw-r--r--  nerv/nerv-scm-1.rockspec  38
-rw-r--r--  nerv/nerv.c  38
-rw-r--r--  nerv/nn/init.lua  3
-rw-r--r--  nerv/nn/layer_dag.lua  249
-rw-r--r--  nerv/nn/layer_repo.lua  34
-rw-r--r--  nerv/nn/param_repo.lua  76
59 files changed, 6468 insertions, 2 deletions
diff --git a/nerv b/nerv
deleted file mode 100755
index 3c16418..0000000
--- a/nerv
+++ /dev/null
@@ -1,2 +0,0 @@
-#!/bin/bash
-exec 'build/luajit-2.0/bin/luajit' -e "package.cpath=\"${PWD}/build/lib/?.so\"" -e "package.path=\"${PWD}/build/lua/?/init.lua;${PWD}/build/lua/?.lua;${PWD}/?.lua\"" -e "require 'nerv'" "$@"
diff --git a/nerv/.gitignore b/nerv/.gitignore
new file mode 100644
index 0000000..567609b
--- /dev/null
+++ b/nerv/.gitignore
@@ -0,0 +1 @@
+build/
diff --git a/nerv/Makefile b/nerv/Makefile
new file mode 100644
index 0000000..b69a63e
--- /dev/null
+++ b/nerv/Makefile
@@ -0,0 +1,60 @@
+.PHONY: build install clean
+SHELL := /bin/bash
+BUILD_DIR := $(CURDIR)/build
+OBJS := nerv.o luaT.o common.o \
+ matrix/mmatrix.o matrix/cumatrix.o matrix/init.o matrix/cukernel.o \
+ io/init.o io/chunk_file.o \
+ examples/oop_example.o
+LIBS := libnerv.so
+LUA_LIBS := matrix/init.lua io/init.lua init.lua \
+ layer/init.lua layer/affine.lua layer/sigmoid.lua layer/softmax_ce.lua \
+ layer/window.lua layer/bias.lua layer/combiner.lua layer/mse.lua \
+ nn/init.lua nn/layer_repo.lua nn/param_repo.lua nn/layer_dag.lua \
+ io/sgd_buffer.lua
+INCLUDE := -I $(LUA_INCDIR) -DLUA_USE_APICHECK
+CUDA_BASE := /usr/local/cuda-6.5
+#CUDA_BASE := /usr/local/cuda-5.0
+CUDA_INCLUDE := -I $(CUDA_BASE)/include/
+INCLUDE += $(CUDA_INCLUDE)
+LDFLAGS := -L$(CUDA_BASE)/lib64/ -Wl,-rpath=$(CUDA_BASE)/lib64/ -lcudart -lcublas
+CFLAGS := -Wall -Wextra -O2
+OBJ_DIR := $(BUILD_DIR)/objs
+SUBDIR := matrix io layer examples nn
+NVCC := $(CUDA_BASE)/bin/nvcc
+NVCC_FLAGS := -Xcompiler -fPIC,-Wall,-Wextra
+
+LUA_DIR = $(INST_LUADIR)/nerv
+OBJS := $(addprefix $(OBJ_DIR)/,$(OBJS))
+OBJ_SUBDIR := $(addprefix $(OBJ_DIR)/,$(SUBDIR))
+LUA_SUBDIR := $(addprefix $(LUA_DIR)/,$(SUBDIR))
+LIBS := $(addprefix $(INST_LIBDIR)/,$(LIBS))
+LUA_LIBS := $(addprefix $(LUA_DIR)/,$(LUA_LIBS))
+
+build: $(OBJ_DIR) $(OBJ_SUBDIR) $(OBJS)
+$(OBJ_DIR) $(LUA_DIR) $(OBJ_SUBDIR) $(LUA_SUBDIR):
+ -mkdir -p $@
+$(OBJ_DIR)/%.o: %.c $(patsubst /%.o,/%.c,$@)
+ gcc -c -o $@ $< $(INCLUDE) -fPIC $(CFLAGS)
+$(OBJ_DIR)/matrix/cukernel.o: matrix/cukernel.cu
+ $(NVCC) -c -o $@ $< $(INCLUDE) $(NVCC_FLAGS)
+$(LUA_DIR)/%.lua: %.lua
+ cp $< $@
+$(OBJ_DIR)/luaT.o:
+ gcc -c -o $@ luaT/luaT.c $(INCLUDE) -fPIC
+$(LIBS): $(OBJS)
+ gcc -shared -o $@ $(OBJS) $(LDFLAGS)
+
+$(OBJ_DIR)/matrix/cumatrix.o: matrix/generic/cumatrix.c matrix/generic/matrix.c matrix/generic/cukernel.cu
+$(OBJ_DIR)/matrix/mmatrix.o: matrix/generic/mmatrix.c matrix/generic/matrix.c
+$(OBJ_DIR)/matrix/cukernel.o: matrix/generic/cukernel.cu
+
+.PHONY: speech
+
+speech:
+ -mkdir -p build/objs/speech/tnet_io
+ $(MAKE) -C speech/ BUILD_DIR=$(BUILD_DIR) LIB_DIR=$(LIB_DIR) OBJ_DIR=$(CURDIR)/build/objs/speech/ LUA_DIR=$(LUA_DIR)
+
+clean:
+ -rm -rf $(OBJ_DIR)
+
+install: $(LIBS) $(LUA_DIR) $(LUA_SUBDIR) $(LUA_LIBS)
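The `LUA_INCDIR`, `INST_LIBDIR` and `INST_LUADIR` variables referenced above are the ones a luarocks "make"-type build passes down to `make`. As a rough sketch (this is not the `nerv-scm-1.rockspec` added by this commit, whose contents are not shown here), the corresponding rockspec build section typically looks like:

```
-- hypothetical rockspec sketch; names and URL are placeholders, not taken from this commit
package = "nerv"
version = "scm-1"
source = { url = "..." }  -- real source location intentionally left out
build = {
    type = "make",
    build_variables = {
        LUA_INCDIR = "$(LUA_INCDIR)",  -- where lua.h lives; consumed by INCLUDE above
    },
    install_variables = {
        INST_LIBDIR = "$(LIBDIR)",     -- where libnerv.so gets installed
        INST_LUADIR = "$(LUADIR)",     -- where the Lua sources get copied
    },
}
```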
diff --git a/nerv/common.c b/nerv/common.c
new file mode 100644
index 0000000..b4e39e6
--- /dev/null
+++ b/nerv/common.c
@@ -0,0 +1,76 @@
+#include "common.h"
+#include <stdarg.h>
+int nerv_error(lua_State *L, const char *err_mesg_fmt, ...) {
+ va_list ap;
+ va_start(ap, err_mesg_fmt);
+ lua_pushstring(L, "[nerv] internal error: ");
+ lua_pushvfstring(L, err_mesg_fmt, ap);
+ lua_concat(L, 2);
+ lua_error(L);
+ va_end(ap);
+ return 0;
+}
+
+int nerv_error_method_not_implemented(lua_State *L) {
+ return nerv_error(L, "method not implemented");
+}
+
+void luaN_append_methods(lua_State *L, const luaL_Reg *mlist) {
+ for (; mlist->func; mlist++)
+ {
+ lua_pushcfunction(L, mlist->func);
+ lua_setfield(L, -2, mlist->name);
+ }
+}
+
+HashMap *hashmap_create(size_t size, HashKey_t hfunc, HashMapCmp_t cmp) {
+ HashMap *res = (HashMap *)malloc(sizeof(HashMap));
+ res->bucket = calloc(size, sizeof(HashNode));
+ res->cmp = cmp;
+ res->hfunc = hfunc;
+ res->size = size;
+ return res;
+}
+
+void *hashmap_getval(HashMap *h, const char *key) {
+ size_t idx = h->hfunc(key) % h->size;
+ HashNode *ptr;
+ for (ptr = h->bucket[idx]; ptr; ptr = ptr->next)
+ {
+ if (!h->cmp(ptr->key, key))
+ return ptr->val;
+ }
+ return NULL;
+}
+
+void hashmap_setval(HashMap *h, const char *key, void *val) {
+ size_t idx = h->hfunc(key) % h->size;
+ HashNode *ptr = malloc(sizeof(HashNode));
+ ptr->next = h->bucket[idx];
+ h->bucket[idx] = ptr;
+ ptr->key = key;
+ ptr->val = val;
+}
+
+void hashmap_clear(HashMap *h) {
+ size_t i;
+ for (i = 0; i < h->size; i++)
+ {
+ HashNode *ptr, *nptr;
+ for (ptr = h->bucket[i]; ptr; ptr = nptr)
+ {
+ nptr = ptr->next;
+ free(ptr->val);
+ free(ptr);
+ }
+ h->bucket[i] = NULL;
+ }
+}
+
+size_t bkdr_hash(const char *key) {
+ unsigned int seed = 131;
+ unsigned int res = 0;
+ while (*key)
+ res = res * seed + *key++;
+ return res;
+}
diff --git a/nerv/common.h b/nerv/common.h
new file mode 100644
index 0000000..e21c7a5
--- /dev/null
+++ b/nerv/common.h
@@ -0,0 +1,36 @@
+#ifndef NERV_COMMON_H
+#define NERV_COMMON_H
+#include "lua.h"
+#include "lauxlib.h"
+#include "lualib.h"
+#include "luaT/luaT.h"
+#include <stdio.h>
+#include <stdlib.h>
+
+typedef struct HashNode {
+ const char *key;
+ void *val;
+ struct HashNode *next;
+} HashNode;
+
+typedef int (*HashMapCmp_t)(const char *a, const char *b);
+typedef size_t (*HashKey_t)(const char *key);
+
+typedef struct HashMap {
+ HashNode **bucket;
+ HashMapCmp_t cmp;
+ HashKey_t hfunc;
+ size_t size;
+} HashMap;
+
+HashMap *hashmap_create(size_t size, HashKey_t hfunc, HashMapCmp_t cmp);
+void *hashmap_getval(HashMap *h, const char *key);
+void hashmap_setval(HashMap *h, const char *key, void *val);
+void hashmap_clear(HashMap *h);
+
+size_t bkdr_hash(const char *key);
+
+int nerv_error(lua_State *L, const char *err_mesg_fmt, ...);
+int nerv_error_method_not_implemented(lua_State *L);
+void luaN_append_methods(lua_State *L, const luaL_Reg *mlist);
+#endif
diff --git a/nerv/doc/nerv.md b/nerv/doc/nerv.md
new file mode 100644
index 0000000..28411f5
--- /dev/null
+++ b/nerv/doc/nerv.md
@@ -0,0 +1,17 @@
+#The Nerv utility functions#
+Part of the [Nerv](../README.md) toolkit.
+##Methods##
+* __string = nerv.typename(obj a)__
+A registered function, the original function is `luaT_lua_typename`. In some cases, calling `type(a)` on an object of a __Nerv__ class (like __Nerv.CuMatrix__) will only return "userdata" (because it is created in C); in such cases you can use this method to get its actual class name.
+
+---
+
+* __metatable = nerv.getmetatable(string tname)__
+A registered function, the original function is `luaT_lua_getmetatable`. `tname` should be a class name that has been registered in __luaT__.
+
+* __metatable = nerv.newmetatable(string tname, string parenttname, function constructor, function destructor, function factory)__
+A registered function, the original function is `luaT_newmetatable`, it returns the metatable of the created class by the name `tname`.
+* __string = nerv.setmetatable(table self, string tname)__
+A registered function, the original function is `luaT_lua_setmetatable`. It assigns the metatable registered in __luaT__ by the name *tname* to the table *self*, and returns *tname* to the user.
+* __table = nerv.get_type(string typename)__
+Returns the type(`loadstring("return " .. typename)`). \ No newline at end of file
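A minimal usage sketch of the utility functions documented above (the values in the comments follow the descriptions rather than a captured run):

```
m = nerv.CuMatrixFloat(2, 2)
print(type(m))             -- "userdata", since the object is created in C
print(nerv.typename(m))    -- "nerv.CuMatrixFloat", the class name registered in luaT
local mt = nerv.getmetatable("nerv.CuMatrixFloat")  -- metatable registered in luaT
local t = nerv.get_type("nerv.CuMatrixFloat")       -- the class table itself, per the description above
print(t == nerv.CuMatrixFloat)
```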
diff --git a/nerv/doc/nerv_class.md b/nerv/doc/nerv_class.md
new file mode 100644
index 0000000..99f63e7
--- /dev/null
+++ b/nerv/doc/nerv_class.md
@@ -0,0 +1,36 @@
+#The Nerv OOP#
+Part of the [Nerv](../README.md) toolkit.
+##Methods##
+* __metatable mt, metatable mpt = nerv.class(string tname, string parenttname)__
+This method is used to create a class by the name `tname`, which inherits `parenttname` in __Nerv__; you can then create a new instance of this class by calling `obj=tname(...)`. The `tname.__init(...)` method (if defined) will be called during construction. The metatables of the class and of its parent class are returned.
+
+##Examples##
+* This example implements a simple `nerv.Counter` class which is inherited by `nerv.BetterCounter`.
+
+```
+do
+ nerv.class("nerv.Counter")
+ function nerv.Counter:__init(c)
+ if (c) then
+ self.c = c
+ else
+ self.c = 0
+ end
+ end
+end
+do
+ local mt, mpt = nerv.class("nerv.BetterCounter", "nerv.Counter")
+ function nerv.BetterCounter:__init(c, bc)
+ mpt.__init(self, c)
+ if (bc) then
+ self.bc = bc
+ else
+ self.bc = 0
+ end
+ end
+end
+c1 = nerv.Counter(1)
+print(c1.c)
+bc1 = nerv.BetterCounter(1, 1)
+print(bc1.c, bc1.bc)
+``` \ No newline at end of file
diff --git a/nerv/doc/nerv_io.md b/nerv/doc/nerv_io.md
new file mode 100644
index 0000000..07589df
--- /dev/null
+++ b/nerv/doc/nerv_io.md
@@ -0,0 +1,113 @@
+#The Nerv IO Package#
+Part of the [Nerv](../README.md) toolkit.
+
+##Description##
+The main class that the user uses to store and read parameter objects to and from files is __nerv.ChunkFile__.
+In the file, a parameter object is saved in a standard format: first the length (in bytes) of this object, then a table which includes some meta information of the object, and finally a data area. Below is an example text file.
+```
+[0000000000202]
+{type="nerv.ExampleP",info={message="just-a-try"},id="exampleP1"}
+3 3
+5.000000 5.000000 5.000000
+5.000000 5.000000 5.000000
+5.000000 5.000000 5.000000
+1 3
+4.000000 4.000000 4.000000
+[0000000000202]
+{type="nerv.ExampleP",info={message="just-a-try"},id="exampleP2"}
+3 3
+4.000000 4.000000 4.000000
+4.000000 4.000000 4.000000
+4.000000 4.000000 4.000000
+1 3
+3.000000 3.000000 3.000000
+```
+
+##Methods##
+* __ChunkFile ChunkFile(string fn, string mode)__
+`mode` can be `r` or `w`, for reading or writing a file. The returned __ChunkFile__ will be ready to write or read objects which follow the __nerv.Param__ interface (using `write_chunk` and `read_chunk`).
+* __void ChunkFile.write_chunk(ChunkFile self, Param p)__
+Write `p` into the file. `p:write` will be called.
+* __Param ChunkFile.read_chunk(ChunkFile self, string id, table global_conf)__
+Read the __Param__ object by id `id` from the file `self`. It will be constructed using `__init(id, global_conf)`. `p:read` will be called.
+* __void ChunkFile.close(ChunkFile self)__
+Close the opened file.
+
+##Examples##
+* An example showing how to use __ChunkFile__ to store and read parameter objects.
+```
+require 'io'
+do
+ local mt, mpt = nerv.class('nerv.ExampleP', 'nerv.Param')
+ function nerv.ExampleP:__init(id, global_conf)
+ self.id = id
+ self.global_conf = global_conf
+ self.matrix = nerv.MMatrixFloat(3, 3)
+ for i = 0, 2, 1 do
+ for j = 0, 2, 1 do
+ self.matrix[i][j] = 3
+ end
+ end
+ self.bias = nerv.MMatrixFloat(1, 3)
+ for i = 0, 2, 1 do
+ self.bias[i] = 2;
+ end
+ self:set_info({message = 'just-a-try'})
+ end
+ function nerv.ExampleP:addOne()
+ for i = 0, 2, 1 do
+ for j = 0, 2, 1 do
+ self.matrix[i][j] = self.matrix[i][j] + 1
+ end
+ end
+ for i = 0, 2, 1 do
+ self.bias[i] = self.bias[i] + 1
+ end
+ end
+ function nerv.ExampleP:read(pcdata)
+ self.matrix = nerv.MMatrixFloat.load(pcdata)
+ self.bias = nerv.MMatrixFloat.load(pcdata)
+ end
+ function nerv.ExampleP:write(pfhandle)
+ self.matrix:save(pfhandle)
+ self.bias:save(pfhandle)
+ end
+end
+global_conf = {}
+do
+ local f = nerv.ChunkFile('../tmp', 'w')
+ local exampleP1 = nerv.ExampleP('exampleP1', global_conf)
+ local exampleP2 = nerv.ExampleP('exampleP2', global_conf)
+ exampleP1:addOne()
+ exampleP1:addOne()
+ exampleP2:addOne()
+
+ f:write_chunk(exampleP1)
+ f:write_chunk(exampleP2)
+ f:close()
+end
+do
+ local f = nerv.ChunkFile('../tmp', 'r')
+ local exampleP1 = f:read_chunk('exampleP1', global_conf)
+ local exampleP2 = f:read_chunk('exampleP2', global_conf)
+ f:close()
+ print(exampleP1.matrix)
+ print(exampleP2.matrix)
+end
+```
+
+##Developer Notes##
+* There are four classes that deal with chunk data: __nerv.ChunkFile__, __nerv.ChunkFileHandle__, __nerv.ChunkInfo__ and __nerv.ChunkData__. Below are the underlying C structs.
+```
+typedef struct ChunkFileHandle {
+ FILE *fp;
+} ChunkFileHandle;
+typedef struct ChunkInfo {
+ off_t offset, length;
+} ChunkInfo;
+typedef struct ChunkData {
+ FILE *fp;
+ char *data;
+} ChunkData;
+```
+* In __Nerv.io__, a __nerv.ChunkFile__ returned by `ChunkFile.__init` will have a member `handle`, which is a __nerv.ChunkFileHandle__. \ No newline at end of file
diff --git a/nerv/doc/nerv_layer.md b/nerv/doc/nerv_layer.md
new file mode 100644
index 0000000..de2fb12
--- /dev/null
+++ b/nerv/doc/nerv_layer.md
@@ -0,0 +1,180 @@
+#The Nerv Layer Package#
+Part of the [Nerv](../README.md) toolkit.
+
+##Description##
+__nerv.Layer__ is the base class and most of its methods are abstract.
+###Class hierarchy and their members###
+* __nerv.Layer__.
+ * `table dim_in` It specifies the dimensions of the inputs.
+ * `table dim_out` It specifies the dimensions of the outputs.
+ * `string id` ID of this layer.
+ * `table gconf` Stores the `global_conf`.
+* __nerv.AffineLayer__ inherits __nerv.Layer__, both `#dim_in` and `#dim_out` are 1.
+ * `MatrixParam ltp` The linear transform parameter.
+ * `BiasParam bp` The bias parameter.
+* __nerv.BiasLayer__ inherits __nerv.Layer__, both `#dim_in` and `#dim_out` are 1.
+ * `BiasParam bias` The bias parameter.
+* __nerv.SigmoidLayer__ inherits __nerv.Layer__, both `#dim_in` and `#dim_out` are 1.
+* __nerv.SoftmaxCELayer__ inherits __nerv.Layer__, `#dim_in` is 2 and `#dim_out` is -1 (optional). `input[1]` is the input to the softmax layer, `input[2]` is the reference distribution. In its `propagate(input, output)` method, if `output[1] ~= nil`, the cross\_entropy value will be outputted.
+ * `float total_ce` Records the accumulated cross entropy value.
+ * `int total_frames` Records how many frames have been processed.
+ * `bool compressed` The reference distribution can be a one-hot format. This feature is enabled by `layer_conf.compressed`.
+
+##Methods##
+* __void Layer.\_\_init(Layer self, string id, table global_conf, table layer_conf)__
+Abstract method.
+The constructor should assign `id` to `self.id`, `global_conf` to `self.gconf`, `layer_conf.dim_in` to `self.dim_in`, and `layer_conf.dim_out` to `self.dim_out`. `dim_in` and `dim_out` are lists specifying the dimensions of the inputs and outputs. Also, `layer_conf` will include the parameters, which should also be properly saved.
+* __void Layer.init(Layer self)__
+Abstract method.
+Initialization method, in this method the layer should do some self-checking and allocate space for intermediate results.
+* __void Layer.update(Layer self, table bp_err, table input, table output)__
+Abstract method.
+`bp_err[i]` should be the error on `output[i]`. In this method the parameters of `self` are updated.
+* __void Layer.propagate(Layer self, table input, table output)__
+Abstract method.
+Given `input` and the current parameters, propagate and store the result in `output`.
+* __void Layer.back_propagate(Layer self, Matrix next_bp_err, Matrix bp_err, Matrix input, Matrix output)__
+Abstract method.
+Calculate the error on the inputs and store them in `next_bp_err`.
+
+* __void Layer.check_dim_len(int len_in, int len_out)__
+Check whether `#self.dim_in == len_in` and `#self.dim_out == len_out`; if violated, an error will be raised.
+* __void Layer.get_params(Layer self)__
+Abstract method.
+The layer should return a list containing its parameters.
+
+####nerv.Layer.get\_dim(self)####
+* Returns:
+ `dim_in`: __table__.
+ `dim_out`: __table__.
+* Parameters:
+ `self`: __nerv.Layer__.
+* Description:
+ Returns `self.dim_in, self.dim_out`.
+
+##Examples##
+* A basic example using __Nerv__ layers for a linear classification task.
+
+```
+require 'math'
+
+require 'layer.affine'
+require 'layer.softmax_ce'
+
+--[[Example using layers, a simple two-classification problem]]--
+
+function calculate_accurate(networkO, labelM)
+ sum = 0
+ for i = 0, networkO:nrow() - 1, 1 do
+ if (labelM[i][0] == 1 and networkO[i][0] >= 0.5) then
+ sum = sum + 1
+ end
+ if (labelM[i][1] == 1 and networkO[i][1] >= 0.5) then
+ sum = sum + 1
+ end
+ end
+ return sum
+end
+
+--[[begin global setting and data generation]]--
+global_conf = {lrate = 10,
+ wcost = 1e-6,
+ momentum = 0.9,
+ cumat_type = nerv.CuMatrixFloat}
+
+input_dim = 5
+data_num = 100
+ansV = nerv.CuMatrixFloat(input_dim, 1)
+for i = 0, input_dim - 1, 1 do
+ ansV[i][0] = math.random() - 0.5
+end
+ansB = math.random() - 0.5
+print('displaying ansV')
+print(ansV)
+print('displaying ansB(bias)')
+print(ansB)
+
+dataM = nerv.CuMatrixFloat(data_num, input_dim)
+for i = 0, data_num - 1, 1 do
+ for j = 0, input_dim - 1, 1 do
+ dataM[i][j] = math.random() * 2 - 1
+ end
+end
+refM = nerv.CuMatrixFloat(data_num, 1)
+refM:fill(ansB)
+refM:mul(dataM, ansV, 1, 1) --refM = dataM * ansV + ansB
+
+labelM = nerv.CuMatrixFloat(data_num, 2)
+for i = 0, data_num - 1, 1 do
+ if (refM[i][0] > 0) then
+ labelM[i][0] = 1
+ labelM[i][1] = 0
+ else
+ labelM[i][0] = 0
+ labelM[i][1] = 1
+ end
+end
+--[[global setting and data generation end]]--
+
+
+--[[begin network building]]--
+--parameters
+affineL_ltp = nerv.LinearTransParam('AffineL_ltp', global_conf)
+affineL_ltp.trans = nerv.CuMatrixFloat(input_dim, 2)
+for i = 0, input_dim - 1, 1 do
+ for j = 0, 1, 1 do
+ affineL_ltp.trans[i][j] = math.random() - 0.5
+ end
+end
+affineL_bp = nerv.BiasParam('AffineL_bp', global_conf)
+affineL_bp.trans = nerv.CuMatrixFloat(1, 2)
+for j = 0, 1, 1 do
+ affineL_bp.trans[j] = math.random() - 0.5
+end
+
+--layers
+affineL = nerv.AffineLayer('AffineL', global_conf, {['ltp'] = affineL_ltp,
+ ['bp'] = affineL_bp,
+ dim_in = {input_dim},
+ dim_out = {2}})
+softmaxL = nerv.SoftmaxCELayer('softmaxL', global_conf, {dim_in = {2, 2},
+ dim_out = {}})
+print('layers initializing...')
+affineL:init()
+softmaxL:init()
+--[[network building end]]--
+
+
+--[[begin space allocation]]--
+print('network input&output&error space allocation...')
+affineI = {dataM} --input to the network is data
+affineO = {nerv.CuMatrixFloat(data_num, 2)}
+softmaxI = {affineO[1], labelM}
+softmaxO = {}
+output = nerv.CuMatrixFloat(data_num, 2)
+
+affineE = {nerv.CuMatrixFloat(data_num, 2)}
+--[[space allocation end]]--
+
+
+--[[begin training]]--
+ce_last = 0
+for l = 0, 10, 1 do
+ affineL:propagate(affineI, affineO)
+ softmaxL:propagate(softmaxI, softmaxO)
+ output:softmax(softmaxI[1])
+
+ softmaxL:back_propagate(affineE, {}, softmaxI, softmaxO)
+
+ affineL:update(affineE, affineI, affineO)
+
+ if (l % 5 == 0) then
+ nerv.utils.printf("training iteration %d finished\n", l)
+ nerv.utils.printf("cross entropy: %.8f\n", softmaxL.total_ce - ce_last)
+ ce_last = softmaxL.total_ce
+ nerv.utils.printf("accurate labels: %d\n", calculate_accurate(output, labelM))
+ nerv.utils.printf("total frames processed: %.8f\n", softmaxL.total_frames)
+ end
+end
+--[[end training]]--
+```
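To make the abstract interface above concrete, here is a minimal sketch of a user-defined layer that follows it. The class name `nerv.ScaleLayer` and its `scale` option are purely illustrative and are not part of this commit:

```
nerv.class("nerv.ScaleLayer", "nerv.Layer")

function nerv.ScaleLayer:__init(id, global_conf, layer_conf)
    self.id = id
    self.gconf = global_conf
    self.dim_in = layer_conf.dim_in
    self.dim_out = layer_conf.dim_out
    self.scale = layer_conf.scale or 1.0
    self:check_dim_len(1, 1)          -- exactly one input and one output
end

function nerv.ScaleLayer:init()
    if self.dim_in[1] ~= self.dim_out[1] then
        nerv.error("input and output dimensions mismatch")
    end
end

function nerv.ScaleLayer:update(bp_err, input, output)
    -- no trainable parameters, so nothing to update
end

function nerv.ScaleLayer:propagate(input, output)
    -- output = scale * input + 0 * input
    output[1]:add(input[1], input[1], self.scale, 0.0)
end

function nerv.ScaleLayer:back_propagate(next_bp_err, bp_err, input, output)
    -- the gradient of a scaling is the same scaling
    next_bp_err[1]:add(bp_err[1], bp_err[1], self.scale, 0.0)
end

function nerv.ScaleLayer:get_params()
    return {}                         -- no parameters to expose
end
```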
diff --git a/nerv/doc/nerv_matrix.md b/nerv/doc/nerv_matrix.md
new file mode 100644
index 0000000..22971d2
--- /dev/null
+++ b/nerv/doc/nerv_matrix.md
@@ -0,0 +1,165 @@
+#The Nerv Matrix Package#
+Part of the [Nerv](../README.md) toolkit.
+
+##Description##
+###Underlying structure###
+To begin with, it could be useful to know something about the underlying structure of a __Nerv__ matrix. Please keep in mind that matrices in __Nerv__ are row-major.
+Every matrix object is an encapsulation of a C struct that describes the attributes of this matrix.
+```
+typedef struct Matrix {
+ size_t stride; /* size of a row */
+ long ncol, nrow, nmax; /* dimension of the matrix, nmax is simply nrow * ncol */
+ union {
+ float *f;
+ double *d;
+ long *i;
+ } data; /* pointer to actual storage */
+ long *data_ref;
+} Matrix;
+```
+It is worth mentioning that `data_ref` is a counter which counts the number of references to its memory space; mind that it will also be increased when a row of the matrix is referenced (`col = m[2]`). A __Nerv__ matrix will deallocate its space when this counter drops to zero.
+Also note that all assignment operations in __Nerv__ are reference copies; you can use the `copy_tod` or `copy_toh` methods to copy values. Row assignments like `m1[2]=m2[3]` are forbidden in __Nerv__.
+
+###Class hierarchy###
+The class hierarchy of the matrix classes can be clearly observed in `matrix/init.c`.
+First there is an abstract base class __Nerv.Matrix__, which is inherited by __Nerv.CuMatrix__ and __Nerv.MMatrix__ (also abstract).
+Finally, there are __Nerv.CuMatrixFloat__ and __Nerv.CuMatrixDouble__, inheriting __Nerv.CuMatrix__, and __Nerv.MMatrixFloat__, __Nerv.MMatrixDouble__ and __Nerv.MMatrixInt__, inheriting __Nerv.MMatrix__.
+
+##Methods##
+Mind that usually a matrix object can only do calculations with matrices of its own type (a __Nerv.CuMatrixFloat__ matrix can only be added to another __Nerv.CuMatrixFloat__).
+In the methods description below, __Matrix__ could be __Nerv.CuMatrixFloat__, __Nerv.CuMatrixDouble__, __Nerv.MMatrixFloat__ or __Nerv.MMatrixDouble__. __Element_type__ could be `float` or `double`, respectively.
+* __Matrix = Matrix(int nrow, int ncol)__
+Returns a __Matrix__ object of `nrow` rows and `ncol` columns.
+* __Element_type = Matrix.get_elem(Matrix self, int index)__
+Returns the element value at the specific index(treating the matrix as a vector). The index should be less than `nmax` of the matrix.
+* __void Matrix.set_elem(Matrix self, int index, Element_type value)__
+Set the value at `index` to be `value`.
+* __int Matrix.ncol(Matrix self)__
+Get `ncol`, the number of columns.
+* __int Matrix.nrow(Matrix self)__
+Get `nrow`, the number of rows.
+* __int Matrix.get_dataref_value(Matrix self)__
+Returns the value (not a pointer) of the space the `data_ref` pointer points to. This function is mainly for debugging.
+* __Matrix/Element\_type, boolean Matrix.\_\_index\_\_(Matrix self, int index)__
+If the matrix has more than one row, this will return the row at `index` as a __Matrix__. Otherwise it will return the value at `index`.
+* __void Matrix.\_\_newindex\_\_(Matrix self, int index, Element_type value)__
+Set the element at `index` to be `value`.
+---
+* __Matrix Matrix.create(Matrix a)__
+Return a new __Matrix__ of `a`'s size(of the same number of rows and columns).
+* __Matrix Matrix.colsum(Matrix self)__
+Return a new __Matrix__ of size (1,`self.ncol`), which stores the sum of all columns of __Matrix__ `self`.
+* __Matrix Matrix.rowsum(Matrix self)__
+Return a new __Matrix__ of size (`self.nrow`,1), which stores the sum of all rows of __Matrix__ `self`.
+* __Matrix Matrix.rowmax(Matrix self)__
+Return a new __Matrix__ of size (`self.nrow`,1), which stores the max value of all rows of __Matrix__ `self`.
+* __Matrix Matrix.trans(Matrix self)__
+Return a new __Matrix__ of size (`self.ncol`,`self.nrow`), which stores the transpose of __Matrix__ `self`.
+* __void Matrix.copy_fromh(Matrix self, MMatrix a)__
+Copy the content of a __MMatrix__ `a` to __Matrix__ `self`, they should be of the same size.
+* __void Matrix.copy_fromd(Matrix self, CuMatrix a)__
+Copy the content of a __CuMatrix__ `a` to __Matrix__ `self`, they should be of the same size.
+* __void Matrix.copy_toh(Matrix self, MMatrix a)__
+Copy the content of the __Matrix__ `self` to a __MMatrix__ `a`.
+* __void Matrix.copy_tod(Matrix self, CuMatrix a)__
+Copy the content of the __Matrix__ `self` to a __CuMatrix__ `a`.
+* __void Matrix.add(Matrix self, Matrix ma, Matrix mb, Element_type alpha, Element_type beta)__
+It sets the content of __Matrix__ `self` to be `alpha * ma + beta * mb`. __Matrix__ `ma`, `mb` and `self` should be of the same size.
+* __void Matrix.mul(Matrix self, Matrix ma, Matrix mb, Element_type alpha, Element_type beta, [string ta, string tb])__
+It sets the content of __Matrix__ `self` to be `beta * self + alpha * ma * mb`. `ta` and `tb` are optional; if `ta` is 'T', then `ma` will be transposed, and if `tb` is 'T', `mb` will be transposed.
+* __void Matrix.add_row(Matrix self, Matrix va, Element_type beta)__
+Add `beta * va` to every row of __Matrix__ `self`.
+* __void Matrix.fill(Matrix self, Element_type value)__
+Fill the content of __Matrix__ `self` to be `value`.
+* __void Matrix.sigmoid(Matrix self, Matrix ma)__
+Set the element of __Matrix__ `self` to be elementwise-sigmoid of `ma`.
+* __void Matrix.sigmoid_grad(Matrix self, Matrix err, Matrix output)__
+Set the element of __Matrix__ `self`, to be `self[i][j]=err[i][j]*output[i][j]*(1-output[i][j])`. This function is used to propagate sigmoid layer error.
+* __void Matrix.softmax(Matrix self, Matrix a)__
+Calculate a row-by-row softmax of __Matrix__ `a` and save the result in `self`.
+* __void Matrix.mul_elem(Matrix self, Matrix ma, Matrix mb)__
+Calculate element-wise multiplication of __Matrix__ `ma` and `mb`, store the result in `self`.
+* __void Matrix.log_elem(Matrix self, Matrix ma)__
+Calculate element-wise log of __Matrix__ `ma`, store the result in `self`.
+* __void Matrix.copy_rows_fromh_by_idx(Matrix self, MMatrix ma, MMatrixInt idx)__
+`idx` should be a row vector. This function copies the rows of `ma` to `self` according to `idx`; in other words, it assigns `ma[idx[i]]` to `self[i]`.
+* __void Matrix.expand_frm(Matrix self, Matrix a, int context)__
+Treat each row of `a` as a speech feature and do a feature expansion. `self` should be of size `(a.nrow, a.ncol * (context * 2 + 1))`. `self[i]` will be `(a[i-context] a[i-context+1] ... a[i] a[i+1] ... a[i+context])`. `a[0]` and the last row of `a` will be copied to extend the index range.
+* __void Matrix.rearrange_frm(Matrix self, Matrix a, int step)__
+Rearrange `a` according to its feature dimension. The `step` is the length of context. So, `self[i][j]` will be assigned `a[i][j / step + (j % step) * (a.ncol / step)]`. `a` and `self` should be of the same size, and `a.ncol` should be divisible by `step`.
+* __void Matrix.scale_row(Matrix self, Matrix scale)__
+Scale each column of `self` according to a vector `scale`. `scale` should be of size `1 * self.ncol`.
+* __Matrix Matrix.\_\_add\_\_(Matrix ma, Matrix mb)__
+Returns a new __Matrix__ which stores the result of `ma+mb`.
+* __Matrix Matrix.\_\_sub\_\_(Matrix ma, Matrix mb)__
+Returns a new __Matrix__ which stores the result of `ma-mb`.
+* __Matrix Matrix.\_\_mul\_\_(Matrix ma, Matrix mb)__
+Returns a new __Matrix__ which stores the result of `ma*mb`.
+* __CuMatrix CuMatrix.new_from_host(MMatrix m)__
+Return a new __CuMatrix__ which is a copy of `m`.
+* __MMatrix CuMatrix.new_to_host(CuMatrix self)__
+Return a new __MMatrix__ which is a copy of `self`.
+* __string Matrix.\_\_tostring\_\_(Matrix self)__
+Returns a string containing values of __Matrix__ `self`.
+---
+* __MMatrix MMatrix.load(ChunkData chunk)__
+Return a new __MMatrix__ loaded from the file position in `chunk`.
+* __void MMatrix.save(MMatrix self, ChunkFileHandle chunk)__
+Write `self` to the file position in `chunk`.
+* __void MMatrix.copy_from(MMatrix ma, MMatrix mb, [int b_begin, int b_end, int a_begin])__
+Copy a part of `mb` (rows with index in `[b_begin, b_end)`) to `ma`, beginning at row index `a_begin`. If not specified, `b_begin` defaults to `0`, `b_end` to `mb.nrow`, and `a_begin` to `0`.
+
+##Examples##
+* Use `get_dataref_value` to test __Nerv__'s matrix space allocation.
+```
+m = 10
+n = 10
+fm = nerv.MMatrixFloat(m, n)
+dm = nerv.MMatrixDouble(m, n)
+for i = 0, m - 1 do
+ for j = 0, n - 1 do
+ t = i / (j + 1)
+ fm[i][j] = t
+ dm[i][j] = t
+ end
+end
+print("test fm:get_dataref_value:", fm:get_dataref_value())
+print("forced a garbade collect")
+collectgarbage("collect")
+print("test fm:get_dataref_value:", fm:get_dataref_value())
+print(fm)
+print(dm)
+```
+* Test some __Matrix__ calculations.
+```
+m = 4
+n = 4
+fm = nerv.CuMatrixFloat(m, n)
+dm = nerv.CuMatrixDouble(m, n)
+for i = 0, m - 1 do
+ for j = 0, n - 1 do
+ -- local t = math.random(10)
+ t = i / (j + 1)
+ fm[i][j] = t
+ dm[i][j] = t
+ end
+end
+print(fm)
+fs = fm:create()
+fs:softmax(fm)
+-- print(fs)
+print(dm)
+ds = dm:create()
+ds:softmax(dm)
+-- print(ds)
+print(fs)
+print(fs + fs)
+print(ds + ds)
+print(fs - fs)
+print(ds - ds)
+a = fs:create()
+a:mul_elem(fs, fs)
+print(a)
+a:log_elem(fs)
+print(a)
+``` \ No newline at end of file
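A small sketch exercising `mul` with the optional transposition flags and `add_row`, assuming the signatures documented above (the dimensions are chosen only for illustration):

```
a = nerv.CuMatrixFloat(2, 3)
b = nerv.CuMatrixFloat(2, 3)
c = nerv.CuMatrixFloat(2, 2)
a:fill(1.0)
b:fill(2.0)
-- c = 1.0 * a * b^T + 0.0 * c; the 'T' flag transposes the second operand
c:mul(a, b, 1.0, 0.0, 'N', 'T')
print(c)                 -- every entry should be 6 (1 * 2 summed over the 3 columns)

bias = nerv.CuMatrixFloat(1, 2)
bias:fill(0.5)
c:add_row(bias, 1.0)     -- add 1.0 * bias to every row of c
print(c)                 -- every entry should now be 6.5
```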
diff --git a/nerv/doc/nerv_nn.md b/nerv/doc/nerv_nn.md
new file mode 100644
index 0000000..c57447d
--- /dev/null
+++ b/nerv/doc/nerv_nn.md
@@ -0,0 +1,256 @@
+#The Nerv NN Package#
+Part of the [Nerv](../README.md) toolkit.
+
+##Description##
+###Class hierarchy###
+It contains __nerv.LayerRepo__, __nerv.ParamRepo__, and __nerv.DAGLayer__ (which inherits __nerv.Layer__).
+
+###Class hierarchy and their members###
+####nerv.ParamRepo####
+Get parameter object by ID.
+* `table param_table` Contains the mapping from parameter ID to parameter file (__nerv.ChunkFile__).
+####nerv.LayerRepo####
+Get layer object by ID.
+* `table layers` Contains the mapping from layer ID to layer objects.
+
+####__nerv.DAGLayer__####
+Inherits __nerv.Layer__.
+* `layers`: __table__, a mapping from a layer ID to its "ref". A ref is a structure that contains references to space allocations and other info of the layer.
+* `inputs`: __table__, a mapping from the input ports of the DAG layer to the input ports of its sub-layers; the key is the port number, the value is `{ref, port}`.
+* `outputs`: __table__, the counterpart of `inputs`.
+* `parsed_conn`: __table__, a list of parsed connections, each entry is of format `{{ref_from, port_from}, {ref_to, port_to}}`.
+* `queue`: __table__, a list of "ref"s, the propagation of the DAGLayer will follow this order, and back-propagation will follow a reverse order.
+
+##Methods##
+
+###__nerv.ParamRepo__###
+
+####nerv.ParamRepo:\_\_init(param\_files)####
+* Parameters:
+ `param_files`: __table__
+* Description:
+  `param_files` is a list of names of files that store parameters; the newly created __ParamRepo__ will read them from file and store the mapping for future fetching.
+
+####nerv.Param ParamRepo.get_param(ParamRepo self, string pid, table global_conf)####
+* Returns:
+  __nerv.Param__
+* Parameters:
+ `self`: __nerv.ParamRepo__.
+ `pid`: __string__.
+ `global_conf`: __table__.
+* Description:
+ __ParamRepo__ will find the __nerv.ChunkFile__ `pf` that contains parameter of ID `pid` and return `pf:read_chunk(pid, global_conf)`.
+
+###__nerv.LayerRepo__###
+####nerv.LayerRepo:\_\_init(layer\_spec, param\_repo, global\_conf)####
+* Returns:
+ __nerv.LayerRepo__.
+* Parameters:
+  `self`: __nerv.LayerRepo__.
+ `layer_spec`: __table__.
+ `param_repo`: __nerv.ParamRepo__.
+ `global_conf`: __table__.
+* Description:
+ __LayerRepo__ will construct the layers specified in `layer_spec`. Every entry in the `layer_spec` table should follow the format below:
+
+ > layer_spec : {[layer_type1] = llist1, [layer_type2] = llist2, ...}
+ > llist : {layer1, layer2, ...}
+ > layer : layerid = {param_config, layer_config}
+ > param_config : {param1 = paramID1, param2 = paramID2}
+
+ __LayerRepo__ will merge `param_config` into `layer_config` and construct a layer by calling `layer_type(layerid, global_conf, layer_config)`.
+
+####nerv.LayerRepo.get\_layer(self, lid)####
+* Returns:
+  __nerv.Layer__, the layer with ID `lid`.
+* Parameters:
+ `self`:__nerv.LayerRepo__.
+ `lid`:__string__.
+* Description:
+ Returns the layer with ID `lid`.
+
+###nerv.DAGLayer###
+####nerv.DAGLayer:\_\_init(id, global\_conf, layer\_conf)####
+* Returns:
+ __nerv.DAGLayer__
+* Parameters:
+ `id`: __string__
+ `global_conf`: __table__
+ `layer_conf`: __table__
+* Description:
+ The `layer_conf` should contain `layer_conf.sub_layers` which is a __nerv.LayerRepo__ storing the sub layers of the DAGLayer. It should also contain `layer_conf.connections`, which is a string-to-string mapping table describing the DAG connections. See an example below:
+
+ ```
+ dagL = nerv.DAGLayer("DAGL", global_conf, {["dim_in"] = {input_dim, 2}, ["dim_out"] = {}, ["sub_layers"] = layerRepo,
+ ["connections"] = {
+ ["<input>[1]"] = "AffineL[1]",
+ ["AffineL[1]"] = "SoftmaxL[1]",
+ ["<input>[2]"] = "SoftmaxL[2]",
+ }})
+ ```
+
+####nerv.DAGLayer.init(self, batch\_size)####
+* Parameters:
+ `self`: __nerv.DAGLayer__
+ `batch_size`: __int__
+* Description:
+  This initialization method will allocate space for output and input matrices, and will call `init()` for each of its sub-layers.
+
+
+####nerv.DAGLayer.propagate(self, input, output)####
+* Parameters:
+ `self`: __nerv.DAGLayer__
+ `input`: __table__
+ `output`: __table__
+* Description:
+  The same function as __nerv.Layer.propagate__; does propagation for each sub-layer in the order of `self.queue`.
+
+####nerv.DAGLayer.back\_propagate(self, next\_bp\_err, bp\_err, input, output)####
+* Parameters:
+ `self`: __nerv.DAGLayer__
+ `next_bp_err`: __table__
+ `bp_err`: __table__
+ `input`: __table__
+ `output`: __table__
+* Description:
+  The same function as __nerv.Layer.back_propagate__; does back-propagation for each sub-layer in the reverse order of `self.queue`.
+
+####nerv.DAGLayer.update(self, bp\_err, input, output)####
+* Parameters:
+ `self`: __nerv.DAGLayer__
+ `bp_err`: __table__
+ `input`: __table__
+ `output`: __table__
+* Description:
+  The same function as __nerv.Layer.update__; does the update for each sub-layer in the order of `self.queue`.
+
+##Examples##
+* An example using __nerv.DAGLayer__ on a simple two-class classification problem.
+
+```
+require 'math'
+
+require 'layer.affine'
+require 'layer.softmax_ce'
+
+--[[Example using DAGLayer, a simple two-classification problem]]--
+
+--[[begin global setting and data generation]]--
+global_conf = {lrate = 10,
+ wcost = 1e-6,
+ momentum = 0.9,
+ cumat_type = nerv.CuMatrixFloat,
+ }
+
+input_dim = 5
+data_num = 100
+param_fn = "../tmp"
+ansV = nerv.CuMatrixFloat(input_dim, 1)
+for i = 0, input_dim - 1, 1 do
+ ansV[i][0] = math.random() - 0.5
+end
+ansB = math.random() - 0.5
+print('displaying ansV')
+print(ansV)
+print('displaying ansB(bias)')
+print(ansB)
+
+dataM = nerv.CuMatrixFloat(data_num, input_dim)
+for i = 0, data_num - 1, 1 do
+ for j = 0, input_dim - 1, 1 do
+ dataM[i][j] = math.random() * 2 - 1
+ end
+end
+refM = nerv.CuMatrixFloat(data_num, 1)
+refM:fill(ansB)
+refM:mul(dataM, ansV, 1, 1) --refM = dataM * ansV + ansB
+
+labelM = nerv.CuMatrixFloat(data_num, 2)
+for i = 0, data_num - 1, 1 do
+ if (refM[i][0] > 0) then
+ labelM[i][0] = 1
+ labelM[i][1] = 0
+ else
+ labelM[i][0] = 0
+ labelM[i][1] = 1
+ end
+end
+--[[global setting and data generation end]]--
+
+
+--[[begin network building]]--
+--parameters
+do
+ local affineL_ltp = nerv.LinearTransParam('AffineL_ltp', global_conf)
+ affineL_ltp.trans = nerv.CuMatrixFloat(input_dim, 2)
+ for i = 0, input_dim - 1, 1 do
+ for j = 0, 1, 1 do
+ affineL_ltp.trans[i][j] = math.random() - 0.5
+ end
+ end
+ local affineL_bp = nerv.BiasParam('AffineL_bp', global_conf)
+ affineL_bp.trans = nerv.CuMatrixFloat(1, 2)
+ for j = 0, 1, 1 do
+ affineL_bp.trans[j] = math.random() - 0.5
+ end
+
+ local chunk = nerv.ChunkFile(param_fn, 'w')
+ chunk:write_chunk(affineL_ltp)
+ chunk:write_chunk(affineL_bp)
+ chunk:close()
+
+ paramRepo = nerv.ParamRepo({param_fn})
+end
+
+--layers
+layerRepo = nerv.LayerRepo({
+ ["nerv.AffineLayer"] =
+ {
+ ["AffineL"] = {{["ltp"] = "AffineL_ltp", ["bp"] = "AffineL_bp"}, {["dim_in"] = {input_dim}, ["dim_out"] = {2}}},
+ },
+ ["nerv.SoftmaxCELayer"] =
+ {
+ ["SoftmaxL"] = {{}, {["dim_in"] = {2, 2}, ["dim_out"] = {}}}
+ },
+ }, paramRepo, global_conf)
+affineL = layerRepo:get_layer("AffineL")
+softmaxL = layerRepo:get_layer("SoftmaxL")
+print('layers initializing...')
+dagL = nerv.DAGLayer("DAGL", global_conf, {["dim_in"] = {input_dim, 2}, ["dim_out"] = {}, ["sub_layers"] = layerRepo,
+ ["connections"] = {
+ ["<input>[1]"] = "AffineL[1]",
+ ["AffineL[1]"] = "SoftmaxL[1]",
+ ["<input>[2]"] = "SoftmaxL[2]",
+ }})
+dagL:init(data_num)
+--affineL:init()
+--softmaxL:init()
+--[[network building end]]--
+
+
+--[[begin space allocation]]--
+print('network input&output&error space allocation...')
+dagL_input = {dataM, labelM}
+dagL_output = {}
+dagL_err = {}
+dagL_ierr = {nerv.CuMatrixFloat(data_num, input_dim), nerv.CuMatrixFloat(data_num, 2)}
+--[[space allocation end]]--
+
+
+--[[begin training]]--
+ce_last = 0
+for l = 0, 10, 1 do
+ dagL:propagate(dagL_input, dagL_output)
+ dagL:back_propagate(dagL_ierr, dagL_err, dagL_input, dagL_output)
+ dagL:update(dagL_err, dagL_input, dagL_output)
+
+ if (l % 2 == 0) then
+ nerv.utils.printf("training iteration %d finished\n", l)
+ nerv.utils.printf("cross entropy: %.8f\n", softmaxL.total_ce - ce_last)
+ --nerv.utils.printf("accurate labels: %d\n", calculate_accurate(output, labelM))
+ nerv.utils.printf("total frames processed: %.8f\n", softmaxL.total_frames)
+ end
+ ce_last = softmaxL.total_ce
+end
+--[[end training]]--
+``` \ No newline at end of file
diff --git a/nerv/doc/nerv_param.md b/nerv/doc/nerv_param.md
new file mode 100644
index 0000000..167cb11
--- /dev/null
+++ b/nerv/doc/nerv_param.md
@@ -0,0 +1,27 @@
+#The Nerv Parameter Package#
+Part of the [Nerv](../README.md) toolkit.
+
+##Description##
+###Class hierarchy###
+There is a base class __Nerv.Param__ defined in `layer/init.lua`.
+
+###Class hierarchy and their members###
+* __nerv.MatrixParam__ inherits __nerv.Param__
+ * `Matrix trans` stores the parameter matrix.
+* __nerv.LinearTransParam__ inherits __Nerv.MatrixParam__.
+* __Nerv.BiasParam__ inherits __Nerv.MatrixParam__.
+
+##Methods##
+* __void Param.\_\_init(Param self, string id, table global_conf)__
+Constructor of a __Param__, it will set `self.id` to be `id` and `self.gconf` to be `global_conf`.
+* __void Param.set_info(Param self, table info)__
+Set `self.info` to be `info`.
+* __table Param.get_info(Param self)__
+Returns `self.info`.
+* __void Param.read(Param self, ChunkData pcdata)__
+Abstract method.
+In this method, `self` should in turn call its members to load from `pcdata`.
+* __void Param.write(Param self, ChunkFileHandle pfhandle)__
+Abstract method.
+Save parameters to file. In this method, `self` should in turn call its members to save to `pfhandle`.
+
diff --git a/nerv/examples/asr_trainer.lua b/nerv/examples/asr_trainer.lua
new file mode 100644
index 0000000..a5727be
--- /dev/null
+++ b/nerv/examples/asr_trainer.lua
@@ -0,0 +1,106 @@
+function build_trainer(ifname)
+ local param_repo = nerv.ParamRepo()
+ param_repo:import(ifname, nil, gconf)
+ local sublayer_repo = make_sublayer_repo(param_repo)
+ local layer_repo = make_layer_repo(sublayer_repo, param_repo)
+ local crit = get_criterion_layer(sublayer_repo)
+ local network = get_network(layer_repo)
+ local input_order = get_input_order()
+ local iterative_trainer = function (prefix, scp_file, bp)
+ gconf.randomize = bp
+ -- build buffer
+ local buffer = make_buffer(make_readers(scp_file, layer_repo))
+ -- initialize the network
+ network:init(gconf.batch_size)
+ gconf.cnt = 0
+ err_input = {nerv.CuMatrixFloat(256, 1)}
+ err_input[1]:fill(1)
+ for data in buffer.get_data, buffer do
+ -- prine stat periodically
+ gconf.cnt = gconf.cnt + 1
+ if gconf.cnt == 1000 then
+ print_stat(sublayer_repo)
+ nerv.CuMatrix.print_profile()
+ nerv.CuMatrix.clear_profile()
+ gconf.cnt = 0
+ -- break
+ end
+ local input = {}
+-- if gconf.cnt == 100 then break end
+ for i, id in ipairs(input_order) do
+ if data[id] == nil then
+ nerv.error("input data %s not found", id)
+ end
+ table.insert(input, data[id])
+ end
+ local output = {nerv.CuMatrixFloat(256, 1)}
+ err_output = {input[1]:create()}
+ network:propagate(input, output)
+ if bp then
+ network:back_propagate(err_input, err_output, input, output)
+ network:update(err_input, input, output)
+ end
+ -- collect garbage in-time to save GPU memory
+ collectgarbage("collect")
+ end
+ print_stat(sublayer_repo)
+ nerv.CuMatrix.print_profile()
+ nerv.CuMatrix.clear_profile()
+ if (not bp) and prefix ~= nil then
+ nerv.info("writing back...")
+ local fname = string.format("%s_cv%.3f.nerv",
+ prefix, get_accuracy(sublayer_repo))
+ network:get_params():export(fname, nil)
+ end
+ return get_accuracy(sublayer_repo)
+ end
+ return iterative_trainer
+end
+
+dofile(arg[1])
+start_halving_inc = 0.5
+halving_factor = 0.6
+end_halving_inc = 0.1
+min_iter = 1
+max_iter = 20
+min_halving = 5
+gconf.batch_size = 256
+gconf.buffer_size = 81920
+
+local pf0 = gconf.initialized_param
+local trainer = build_trainer(pf0)
+--local trainer = build_trainer("c3.nerv")
+local accu_best = trainer(nil, gconf.cv_scp, false)
+local do_halving = false
+
+nerv.info("initial cross validation: %.3f", accu_best)
+for i = 1, max_iter do
+ nerv.info("[NN] begin iteration %d with lrate = %.6f", i, gconf.lrate)
+ local accu_tr = trainer(nil, gconf.tr_scp, true)
+ nerv.info("[TR] training set %d: %.3f", i, accu_tr)
+ local accu_new = trainer(
+ string.format("%s_%s_iter_%d_lr%f_tr%.3f",
+ string.gsub(
+ (string.gsub(pf0[1], "(.*/)(.*)", "%2")),
+ "(.*)%..*", "%1"),
+ os.date("%Y%m%d%H%M%S"),
+ i, gconf.lrate,
+ accu_tr),
+ gconf.cv_scp, false)
+ nerv.info("[CV] cross validation %d: %.3f", i, accu_new)
+ -- TODO: revert the weights
+ local accu_diff = accu_new - accu_best
+ if do_halving and accu_diff < end_halving_inc and i > min_iter then
+ break
+ end
+ if accu_diff < start_halving_inc and i >= min_halving then
+ do_halving = true
+ end
+ if do_halving then
+ gconf.lrate = gconf.lrate * halving_factor
+ end
+ if accu_new > accu_best then
+ accu_best = accu_new
+ end
+-- nerv.Matrix.print_profile()
+end
diff --git a/nerv/examples/chunk_file_example.lua b/nerv/examples/chunk_file_example.lua
new file mode 100644
index 0000000..5961c98
--- /dev/null
+++ b/nerv/examples/chunk_file_example.lua
@@ -0,0 +1,53 @@
+-- To define a readable and writable chunk, one must define a class with the
+-- following methods: __init(id, global_conf), read(handle), write(handle),
+-- get_info(), set_info(info) and an id attribute. This file demonstrates a
+-- basic chunk implementation which manages the I/O of a matrix
+
+local MatrixChunk = nerv.class("nerv.MatrixChunk")
+
+function MatrixChunk:__init(id, global_conf)
+ self.id = id
+ self.info = {}
+ self.gconf = global_conf
+end
+
+function MatrixChunk:read(handle)
+ -- pass the read handle to the matrix method
+ self.data = nerv.MMatrixFloat.load(handle)
+end
+
+function MatrixChunk:write(handle)
+ -- pass the write handle to the matrix method
+ self.data:save(handle)
+end
+
+function MatrixChunk:get_info()
+ return self.info
+end
+
+function MatrixChunk:set_info(info)
+ self.info = info
+end
+
+function MatrixChunk.create_from_matrix(id, mat)
+ local ins = nerv.MatrixChunk(id)
+ ins.data = mat
+ return ins
+end
+
+mat = nerv.MMatrixFloat(3, 4)
+for i = 0, 2 do
+ for j = 0, 3 do
+ mat[i][j] = i + j
+ end
+end
+
+cd = nerv.MatrixChunk.create_from_matrix("matrix1", mat)
+
+cf = nerv.ChunkFile("test.nerv", "w")
+cf:write_chunk(cd)
+cf:close()
+
+cf2 = nerv.ChunkFile("test.nerv", "r")
+cd2 = cf2:read_chunk("matrix1")
+print(cd2.data)
diff --git a/nerv/examples/cumatrix_example.lua b/nerv/examples/cumatrix_example.lua
new file mode 100644
index 0000000..544fc7f
--- /dev/null
+++ b/nerv/examples/cumatrix_example.lua
@@ -0,0 +1,31 @@
+m = 4
+n = 4
+fm = nerv.CuMatrixFloat(m, n)
+dm = nerv.CuMatrixDouble(m, n)
+for i = 0, m - 1 do
+ for j = 0, n - 1 do
+ -- local t = math.random(10)
+ t = i / (j + 1)
+ fm[i][j] = t
+ dm[i][j] = t
+ end
+end
+print(fm)
+fs = fm:create()
+fs:softmax(fm)
+-- print(fs)
+print(dm)
+ds = dm:create()
+ds:softmax(dm)
+-- print(ds)
+print(fs)
+print(fs + fs)
+print(ds + ds)
+print(fs - fs)
+print(ds - ds)
+
+a = fs:create()
+a:mul_elem(fs, fs)
+print(a)
+a:log_elem(fs)
+print(a)
diff --git a/nerv/examples/cumatrix_from_mmatrix.lua b/nerv/examples/cumatrix_from_mmatrix.lua
new file mode 100644
index 0000000..2309e14
--- /dev/null
+++ b/nerv/examples/cumatrix_from_mmatrix.lua
@@ -0,0 +1,32 @@
+m = 3
+n = 4
+fm = nerv.MMatrixFloat(m, n)
+dm = nerv.MMatrixDouble(m, n)
+for i = 0, m - 1 do
+ for j = 0, n - 1 do
+ -- local t = math.random(10)
+ t = i / (j + 1)
+ fm[i][j] = t
+ dm[i][j] = t
+ end
+end
+print(fm)
+print(dm)
+
+fc = nerv.CuMatrixFloat(m, n)
+dc = nerv.CuMatrixDouble(m, n)
+fc:copy_fromh(fm)
+dc:copy_fromh(dm)
+print("fc and dc")
+print(fc)
+print(dc)
+dc[1]:copy_tod(dc[0])
+print("dc[1] copied to dc[0]")
+print(dc)
+print("softmax of fc and dc")
+sfc = fc:create()
+sdc = dc:create()
+sfc:softmax(fc)
+print(sfc)
+sdc:softmax(dc)
+print(sdc)
diff --git a/nerv/examples/mmatrix_example.lua b/nerv/examples/mmatrix_example.lua
new file mode 100644
index 0000000..8ddfe84
--- /dev/null
+++ b/nerv/examples/mmatrix_example.lua
@@ -0,0 +1,20 @@
+m = 10
+n = 10
+fm = nerv.MMatrixFloat(m, n)
+dm = nerv.MMatrixDouble(m, n)
+for i = 0, m - 1 do
+ for j = 0, n - 1 do
+ -- local t = math.random(10)
+ t = i / (j + 1)
+ fm[i][j] = t
+ dm[i][j] = t
+ end
+end
+print("test fm:get_dataref_value:", fm:get_dataref_value())
+print("forced a garbade collect")
+collectgarbage("collect")
+print("test fm:get_dataref_value:", fm:get_dataref_value())
+print(fm)
+-- print(fm:softmax())
+print(dm)
+-- print(dm:softmax())
diff --git a/nerv/examples/oop_example.c b/nerv/examples/oop_example.c
new file mode 100644
index 0000000..59dfc5a
--- /dev/null
+++ b/nerv/examples/oop_example.c
@@ -0,0 +1,101 @@
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include "../common.h"
+
+#define SQR(x) ((x) * (x))
+
+const char *point_tname = "nerv.Point";
+const char *better_point_tname = "nerv.BetterPoint";
+
+typedef struct {
+ double x, y;
+} Point;
+
+static int point_norm (lua_State *L) {
+ Point *p = luaT_checkudata(L, 1, point_tname);
+ lua_pushnumber(L, sqrt(SQR(p->x) + SQR(p->y)));
+ return 1;
+}
+
+static int point_set_x (lua_State *L) {
+ Point *p = luaT_checkudata(L, 1, point_tname);
+ p->x = luaL_checknumber(L, 2);
+ return 0;
+}
+
+static int point_set_y (lua_State *L) {
+ Point *p = luaT_checkudata(L, 1, point_tname);
+ p->y = luaL_checknumber(L, 2);
+ return 0;
+}
+
+/* generic constructor */
+void point_new_(Point *self, double x, double y) {
+ self->x = x;
+ self->y = y;
+}
+
+int point_new(lua_State *L) {
+ /* `_new` function should create the object itself */
+ Point *self = (Point *)malloc(sizeof(Point));
+ point_new_(self, luaL_checknumber(L, 1), luaL_checknumber(L, 2));
+ luaT_pushudata(L, self, point_tname);
+ fprintf(stderr, "[example] %s constructor is invoked\n",
+ point_tname);
+ return 1;
+}
+
+static const luaL_Reg point_methods[] = {
+ {"set_x", point_set_x},
+ {"set_y", point_set_y},
+ {"norm", point_norm},
+ {NULL, NULL}
+};
+
+
+/* the subclass method overrides the one from baseclass */
+static int better_point_norm (lua_State *L) {
+ Point *p = luaT_checkudata(L, 1, point_tname);
+ lua_pushnumber(L, fabs(p->x) + fabs(p->y));
+ return 1;
+}
+
+int better_point_new(lua_State *L) {
+ /* `_new` function should create the object itself */
+ Point *self = (Point *)malloc(sizeof(Point));
+ point_new_(self, luaL_checknumber(L, 1), luaL_checknumber(L, 2));
+ luaT_pushudata(L, self, better_point_tname);
+ fprintf(stderr, "[example] %s constructor is invoked\n",
+ better_point_tname);
+ return 1;
+}
+
+static const luaL_Reg better_point_methods[] = {
+ {"norm", better_point_norm},
+ {NULL, NULL}
+};
+
+void nerv_point_init(lua_State *L) {
+ /* create a class and let luaT know */
+ luaT_newmetatable(L, point_tname, NULL, point_new, NULL, NULL);
+ /* register member functions */
+ luaL_register(L, NULL, point_methods);
+ /* keep the stack balanced, see `nerv.c` */
+ lua_pop(L, 1);
+}
+
+void nerv_better_point_init(lua_State *L) {
+ /* create a class and let luaT know */
+ luaT_newmetatable(L, better_point_tname, point_tname,
+ better_point_new, NULL, NULL);
+ /* register member functions */
+ luaL_register(L, NULL, better_point_methods);
+ /* keep the stack balanced, see `nerv.c` */
+ lua_pop(L, 1);
+}
+
+void nerv_example_init(lua_State *L) {
+ nerv_point_init(L);
+ nerv_better_point_init(L);
+}
diff --git a/nerv/examples/oop_example.lua b/nerv/examples/oop_example.lua
new file mode 100644
index 0000000..b753288
--- /dev/null
+++ b/nerv/examples/oop_example.lua
@@ -0,0 +1,16 @@
+p = nerv.Point(0, 0) -- create a Point instance
+print(p)
+print(p:norm()) -- get 2-norm of the Point
+p:set_x(1.0)
+p:set_y(2.0)
+print(p:norm()) -- get 2-norm of the Point
+
+bp = nerv.BetterPoint(1, 2)
+-- use methods from base class
+bp:set_x(1.0)
+bp:set_y(2.0)
+print(bp)
+print(bp:norm()) --get 1-norm of the Point
+
+print(p.__typename)
+print(bp.__typename)
diff --git a/nerv/examples/swb_baseline.lua b/nerv/examples/swb_baseline.lua
new file mode 100644
index 0000000..8b7e01a
--- /dev/null
+++ b/nerv/examples/swb_baseline.lua
@@ -0,0 +1,166 @@
+require 'speech.init'
+gconf = {lrate = 0.8, wcost = 1e-6, momentum = 0.9,
+ cumat_type = nerv.CuMatrixFloat,
+ mmat_type = nerv.MMatrixFloat,
+ frm_ext = 5,
+ tr_scp = "/slfs1/users/mfy43/swb_ivec/train_bp.scp",
+ cv_scp = "/slfs1/users/mfy43/swb_ivec/train_cv.scp",
+ htk_conf = "/slfs1/users/mfy43/swb_ivec/plp_0_d_a.conf",
+ initialized_param = {"/slfs1/users/mfy43/swb_init.nerv",
+ "/slfs1/users/mfy43/swb_global_transf.nerv"},
+ debug = false}
+
+function make_sublayer_repo(param_repo)
+ return nerv.LayerRepo(
+ {
+ -- global transf
+ ["nerv.BiasLayer"] =
+ {
+ blayer1 = {{bias = "bias1"}, {dim_in = {429}, dim_out = {429}}},
+ blayer2 = {{bias = "bias2"}, {dim_in = {429}, dim_out = {429}}}
+ },
+ ["nerv.WindowLayer"] =
+ {
+ wlayer1 = {{window = "window1"}, {dim_in = {429}, dim_out = {429}}},
+ wlayer2 = {{window = "window2"}, {dim_in = {429}, dim_out = {429}}}
+ },
+ -- biased linearity
+ ["nerv.AffineLayer"] =
+ {
+ affine0 = {{ltp = "affine0_ltp", bp = "affine0_bp"},
+ {dim_in = {429}, dim_out = {2048}}},
+ affine1 = {{ltp = "affine1_ltp", bp = "affine1_bp"},
+ {dim_in = {2048}, dim_out = {2048}}},
+ affine2 = {{ltp = "affine2_ltp", bp = "affine2_bp"},
+ {dim_in = {2048}, dim_out = {2048}}},
+ affine3 = {{ltp = "affine3_ltp", bp = "affine3_bp"},
+ {dim_in = {2048}, dim_out = {2048}}},
+ affine4 = {{ltp = "affine4_ltp", bp = "affine4_bp"},
+ {dim_in = {2048}, dim_out = {2048}}},
+ affine5 = {{ltp = "affine5_ltp", bp = "affine5_bp"},
+ {dim_in = {2048}, dim_out = {2048}}},
+ affine6 = {{ltp = "affine6_ltp", bp = "affine6_bp"},
+ {dim_in = {2048}, dim_out = {2048}}},
+ affine7 = {{ltp = "affine7_ltp", bp = "affine7_bp"},
+ {dim_in = {2048}, dim_out = {3001}}}
+ },
+ ["nerv.SigmoidLayer"] =
+ {
+ sigmoid0 = {{}, {dim_in = {2048}, dim_out = {2048}}},
+ sigmoid1 = {{}, {dim_in = {2048}, dim_out = {2048}}},
+ sigmoid2 = {{}, {dim_in = {2048}, dim_out = {2048}}},
+ sigmoid3 = {{}, {dim_in = {2048}, dim_out = {2048}}},
+ sigmoid4 = {{}, {dim_in = {2048}, dim_out = {2048}}},
+ sigmoid5 = {{}, {dim_in = {2048}, dim_out = {2048}}},
+ sigmoid6 = {{}, {dim_in = {2048}, dim_out = {2048}}}
+ },
+ ["nerv.SoftmaxCELayer"] =
+ {
+ ce_crit = {{}, {dim_in = {3001, 1}, dim_out = {1}, compressed = true}}
+ }
+ }, param_repo, gconf)
+end
+
+function make_layer_repo(sublayer_repo, param_repo)
+ return nerv.LayerRepo(
+ {
+ ["nerv.DAGLayer"] =
+ {
+ global_transf = {{}, {
+ dim_in = {429}, dim_out = {429},
+ sub_layers = sublayer_repo,
+ connections = {
+ ["<input>[1]"] = "blayer1[1]",
+ ["blayer1[1]"] = "wlayer1[1]",
+ ["wlayer1[1]"] = "blayer2[1]",
+ ["blayer2[1]"] = "wlayer2[1]",
+ ["wlayer2[1]"] = "<output>[1]"
+ }
+ }},
+ main = {{}, {
+ dim_in = {429, 1}, dim_out = {1},
+ sub_layers = sublayer_repo,
+ connections = {
+ ["<input>[1]"] = "affine0[1]",
+ ["affine0[1]"] = "sigmoid0[1]",
+ ["sigmoid0[1]"] = "affine1[1]",
+ ["affine1[1]"] = "sigmoid1[1]",
+ ["sigmoid1[1]"] = "affine2[1]",
+ ["affine2[1]"] = "sigmoid2[1]",
+ ["sigmoid2[1]"] = "affine3[1]",
+ ["affine3[1]"] = "sigmoid3[1]",
+ ["sigmoid3[1]"] = "affine4[1]",
+ ["affine4[1]"] = "sigmoid4[1]",
+ ["sigmoid4[1]"] = "affine5[1]",
+ ["affine5[1]"] = "sigmoid5[1]",
+ ["sigmoid5[1]"] = "affine6[1]",
+ ["affine6[1]"] = "sigmoid6[1]",
+ ["sigmoid6[1]"] = "affine7[1]",
+ ["affine7[1]"] = "ce_crit[1]",
+ ["<input>[2]"] = "ce_crit[2]",
+ ["ce_crit[1]"] = "<output>[1]"
+ }
+ }}
+ }
+ }, param_repo, gconf)
+end
+
+function get_criterion_layer(sublayer_repo)
+ return sublayer_repo:get_layer("ce_crit")
+end
+
+function get_network(layer_repo)
+ return layer_repo:get_layer("main")
+end
+
+function make_readers(scp_file, layer_repo)
+ return {
+ {reader = nerv.TNetReader(gconf,
+ {
+ id = "main_scp",
+ scp_file = scp_file,
+ conf_file = gconf.htk_conf,
+ frm_ext = gconf.frm_ext,
+ mlfs = {
+ phone_state = {
+ file = "/slfs1/users/mfy43/swb_ivec/ref.mlf",
+ format = "map",
+ format_arg = "/slfs1/users/mfy43/swb_ivec/dict",
+ dir = "*/",
+ ext = "lab"
+ }
+ },
+ global_transf = layer_repo:get_layer("global_transf")
+ }),
+ data = {main_scp = 429, phone_state = 1}}
+ }
+end
+
+function make_buffer(readers)
+ return nerv.SGDBuffer(gconf,
+ {
+ buffer_size = gconf.buffer_size,
+ randomize = gconf.randomize,
+ readers = readers
+ })
+end
+
+function get_input_order()
+ return {"main_scp", "phone_state"}
+end
+
+function get_accuracy(sublayer_repo)
+ local ce_crit = sublayer_repo:get_layer("ce_crit")
+ return ce_crit.total_correct / ce_crit.total_frames * 100
+end
+
+function print_stat(sublayer_repo)
+ local ce_crit = sublayer_repo:get_layer("ce_crit")
+ nerv.info("*** training stat begin ***")
+ nerv.printf("cross entropy:\t\t%.8f\n", ce_crit.total_ce)
+ nerv.printf("correct:\t\t%d\n", ce_crit.total_correct)
+ nerv.printf("frames:\t\t\t%d\n", ce_crit.total_frames)
+ nerv.printf("err/frm:\t\t%.8f\n", ce_crit.total_ce / ce_crit.total_frames)
+ nerv.printf("accuracy:\t\t%.3f%%\n", get_accuracy(sublayer_repo))
+ nerv.info("*** training stat end ***")
+end
diff --git a/nerv/examples/test_dnn_layers.lua b/nerv/examples/test_dnn_layers.lua
new file mode 100644
index 0000000..64c0dec
--- /dev/null
+++ b/nerv/examples/test_dnn_layers.lua
@@ -0,0 +1,78 @@
+require 'layer.affine'
+require 'layer.sigmoid'
+require 'layer.softmax_ce'
+
+global_conf = {lrate = 0.8, wcost = 1e-6,
+ momentum = 0.9, cumat_type = nerv.CuMatrixFloat}
+
+pf = nerv.ChunkFile("affine.param", "r")
+ltp = pf:read_chunk("a", global_conf)
+bp = pf:read_chunk("b", global_conf)
+
+-- print(bp.trans)
+
+af = nerv.AffineLayer("test", global_conf, {["ltp"] = ltp,
+ ["bp"] = bp,
+ dim_in = {429},
+ dim_out = {2048}})
+sg = nerv.SigmoidLayer("test2", global_conf, {dim_in = {2048},
+ dim_out = {2048}})
+sm = nerv.SoftmaxCELayer("test3", global_conf, {dim_in = {2048, 2048},
+ dim_out = {}})
+af:init()
+sg:init()
+sm:init()
+
+df = nerv.ChunkFile("input.param", "r")
+
+label = nerv.CuMatrixFloat(10, 2048)
+label:fill(0)
+for i = 0, 9 do
+ label[i][i] = 1.0
+end
+
+input1 = {df:read_chunk("input", global_conf).trans}
+output1 = {nerv.CuMatrixFloat(10, 2048)}
+input2 = output1
+output2 = {nerv.CuMatrixFloat(10, 2048)}
+input3 = {output2[1], label}
+output3 = {}
+err_input1 = {}
+err_output1 = {nerv.CuMatrixFloat(10, 2048)}
+err_input2 = err_output1
+err_output2 = {nerv.CuMatrixFloat(10, 2048)}
+err_input3 = err_output2
+err_output3 = {input1[1]:create()}
+
+for i = 0, 3 do
+ -- propagate
+ af:propagate(input1, output1)
+ sg:propagate(input2, output2)
+ sm:propagate(input3, output3)
+
+    -- back_propagate (layer signature: back_propagate(bp_err, next_bp_err, input, output),
+    -- so the incoming error goes first and the error passed downwards second)
+    sm:back_propagate(err_input1, err_output1, input3, output3)
+    sg:back_propagate(err_input2, err_output2, input2, output2)
+    af:back_propagate(err_input3, err_output3, input1, output1)
+
+ -- update
+ sm:update(err_input1, input3, output3)
+ sg:update(err_input2, input2, output2)
+ af:update(err_input3, input1, output1)
+
+
+ print("output1")
+ print(output1[1])
+ print("output2")
+ print(output2[1])
+ print("err_output1")
+ print(err_output1[1])
+ print("err_output2")
+ print(err_output2[1])
+ nerv.printf("cross entropy: %.8f\n", sm.total_ce)
+ nerv.printf("frames: %.8f\n", sm.total_frames)
+end
+print("linear")
+print(af.ltp.trans)
+print("linear2")
+print(af.bp.trans)
diff --git a/nerv/examples/test_nn_lib.lua b/nerv/examples/test_nn_lib.lua
new file mode 100644
index 0000000..5444810
--- /dev/null
+++ b/nerv/examples/test_nn_lib.lua
@@ -0,0 +1,164 @@
+require 'speech.init'
+gconf = {lrate = 0.8, wcost = 1e-6, momentum = 0.9,
+ cumat_type = nerv.CuMatrixFloat,
+ mmat_type = nerv.MMatrixFloat,
+ batch_size = 256}
+
+param_repo = nerv.ParamRepo({"converted.nerv", "global_transf.nerv"})
+sublayer_repo = nerv.LayerRepo(
+ {
+ -- global transf
+ ["nerv.BiasLayer"] =
+ {
+ blayer1 = {{bias = "bias1"}, {dim_in = {429}, dim_out = {429}}},
+ blayer2 = {{bias = "bias2"}, {dim_in = {429}, dim_out = {429}}}
+ },
+ ["nerv.WindowLayer"] =
+ {
+ wlayer1 = {{window = "window1"}, {dim_in = {429}, dim_out = {429}}},
+ wlayer2 = {{window = "window2"}, {dim_in = {429}, dim_out = {429}}}
+ },
+ -- biased linearity
+ ["nerv.AffineLayer"] =
+ {
+ affine0 = {{ltp = "affine0_ltp", bp = "affine0_bp"},
+ {dim_in = {429}, dim_out = {2048}}},
+ affine1 = {{ltp = "affine1_ltp", bp = "affine1_bp"},
+ {dim_in = {2048}, dim_out = {2048}}},
+ affine2 = {{ltp = "affine2_ltp", bp = "affine2_bp"},
+ {dim_in = {2048}, dim_out = {2048}}},
+ affine3 = {{ltp = "affine3_ltp", bp = "affine3_bp"},
+ {dim_in = {2048}, dim_out = {2048}}},
+ affine4 = {{ltp = "affine4_ltp", bp = "affine4_bp"},
+ {dim_in = {2048}, dim_out = {2048}}},
+ affine5 = {{ltp = "affine5_ltp", bp = "affine5_bp"},
+ {dim_in = {2048}, dim_out = {2048}}},
+ affine6 = {{ltp = "affine6_ltp", bp = "affine6_bp"},
+ {dim_in = {2048}, dim_out = {2048}}},
+ affine7 = {{ltp = "affine7_ltp", bp = "affine7_bp"},
+ {dim_in = {2048}, dim_out = {3001}}}
+ },
+ ["nerv.SigmoidLayer"] =
+ {
+ sigmoid0 = {{}, {dim_in = {2048}, dim_out = {2048}}},
+ sigmoid1 = {{}, {dim_in = {2048}, dim_out = {2048}}},
+ sigmoid2 = {{}, {dim_in = {2048}, dim_out = {2048}}},
+ sigmoid3 = {{}, {dim_in = {2048}, dim_out = {2048}}},
+ sigmoid4 = {{}, {dim_in = {2048}, dim_out = {2048}}},
+ sigmoid5 = {{}, {dim_in = {2048}, dim_out = {2048}}},
+ sigmoid6 = {{}, {dim_in = {2048}, dim_out = {2048}}}
+ },
+ ["nerv.SoftmaxCELayer"] =
+ {
+ softmax_ce0 = {{}, {dim_in = {3001, 1}, dim_out = {}, compressed = true}}
+ }
+ }, param_repo, gconf)
+
+layer_repo = nerv.LayerRepo(
+ {
+ ["nerv.DAGLayer"] =
+ {
+ global_transf = {{}, {
+ dim_in = {429}, dim_out = {429},
+ sub_layers = sublayer_repo,
+ connections = {
+ ["<input>[1]"] = "blayer1[1]",
+ ["blayer1[1]"] = "wlayer1[1]",
+ ["wlayer1[1]"] = "blayer2[1]",
+ ["blayer2[1]"] = "wlayer2[1]",
+ ["wlayer2[1]"] = "<output>[1]"
+ }
+ }},
+ main = {{}, {
+ dim_in = {429, 1}, dim_out = {},
+ sub_layers = sublayer_repo,
+ connections = {
+ ["<input>[1]"] = "affine0[1]",
+ ["affine0[1]"] = "sigmoid0[1]",
+ ["sigmoid0[1]"] = "affine1[1]",
+ ["affine1[1]"] = "sigmoid1[1]",
+ ["sigmoid1[1]"] = "affine2[1]",
+ ["affine2[1]"] = "sigmoid2[1]",
+ ["sigmoid2[1]"] = "affine3[1]",
+ ["affine3[1]"] = "sigmoid3[1]",
+ ["sigmoid3[1]"] = "affine4[1]",
+ ["affine4[1]"] = "sigmoid4[1]",
+ ["sigmoid4[1]"] = "affine5[1]",
+ ["affine5[1]"] = "sigmoid5[1]",
+ ["sigmoid5[1]"] = "affine6[1]",
+ ["affine6[1]"] = "sigmoid6[1]",
+ ["sigmoid6[1]"] = "affine7[1]",
+ ["affine7[1]"] = "softmax_ce0[1]",
+ ["<input>[2]"] = "softmax_ce0[2]"
+ }
+ }}
+ }
+ }, param_repo, gconf)
+
+tnet_reader = nerv.TNetReader(gconf,
+ {
+ id = "main_scp",
+ scp_file = "/slfs1/users/mfy43/swb_ivec/train_bp.scp",
+-- scp_file = "t.scp",
+ conf_file = "/slfs1/users/mfy43/swb_ivec/plp_0_d_a.conf",
+ frm_ext = 5,
+ mlfs = {
+ ref = {
+ file = "/slfs1/users/mfy43/swb_ivec/ref.mlf",
+ format = "map",
+ format_arg = "/slfs1/users/mfy43/swb_ivec/dict",
+ dir = "*/",
+ ext = "lab"
+ }
+ },
+ global_transf = layer_repo:get_layer("global_transf")
+ })
+
+buffer = nerv.SGDBuffer(gconf,
+ {
+ buffer_size = 81920,
+ randomize = true,
+ readers = {
+ { reader = tnet_reader,
+ data = {main_scp = 429, ref = 1}}
+ }
+ })
+
+sm = sublayer_repo:get_layer("softmax_ce0")
+main = layer_repo:get_layer("main")
+main:init(gconf.batch_size)
+gconf.cnt = 0
+-- data = buffer:get_data()
+-- input = {data.main_scp, data.ref}
+-- while true do
+for data in buffer.get_data, buffer do
+-- if gconf.cnt == 100 then break end
+-- gconf.cnt = gconf.cnt + 1
+
+ input = {data.main_scp, data.ref}
+ output = {}
+ err_input = {}
+ err_output = {input[1]:create()}
+
+ main:propagate(input, output)
+ main:back_propagate(err_output, err_input, input, output)
+ main:update(err_input, input, output)
+
+-- nerv.printf("cross entropy: %.8f\n", sm.total_ce)
+-- nerv.printf("correct: %d\n", sm.total_correct)
+-- nerv.printf("frames: %d\n", sm.total_frames)
+-- nerv.printf("err/frm: %.8f\n", sm.total_ce / sm.total_frames)
+-- nerv.printf("accuracy: %.8f\n", sm.total_correct / sm.total_frames)
+ collectgarbage("collect")
+end
+nerv.printf("cross entropy: %.8f\n", sm.total_ce)
+nerv.printf("correct: %d\n", sm.total_correct)
+nerv.printf("accuracy: %.3f%%\n", sm.total_correct / sm.total_frames * 100)
+nerv.printf("writing back...\n")
+cf = nerv.ChunkFile("output.nerv", "w")
+for i, p in ipairs(main:get_params()) do
+ print(p)
+ cf:write_chunk(p)
+end
+cf:close()
+nerv.Matrix.print_profile()
diff --git a/nerv/init.lua b/nerv/init.lua
new file mode 100644
index 0000000..89010a7
--- /dev/null
+++ b/nerv/init.lua
@@ -0,0 +1,128 @@
+require 'libnerv'
+
+function nerv.error(fmt, ...)
+ error(nerv.printf("[nerv] internal error: " .. fmt .. "\n", ...))
+end
+
+function nerv.error_method_not_implemented()
+ nerv.error("method not implemented");
+end
+
+function nerv.printf(fmt, ...)
+ io.write(string.format(fmt, ...))
+end
+
+function nerv.mesg_with_timestamp(fmt, ...)
+ nerv.printf(
+ string.format("(%s)[nerv] info: %s\n",
+ os.date("%H:%M:%S %F"), fmt), ...)
+end
+
+function nerv.info(fmt, ...)
+ nerv.printf(
+ string.format("(%s)[nerv] info: %s\n",
+ os.date("%H:%M:%S %F"), fmt), ...)
+end
+
+function nerv.warning(fmt, ...)
+ nerv.printf(
+ string.format("(%s)[nerv] warning: %s\n",
+ os.date("%H:%M:%S %F"), fmt), ...)
+end
+
+-- Torch C API wrapper
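+-- nerv.class(tname, parenttname) registers a new class named tname (optionally
+-- inheriting from parenttname) through the luaT bindings and returns its
+-- metatable; calling e.g. nerv.MyClass(...) afterwards creates an instance and
+-- invokes its __init method (the class name here is only illustrative).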
+function nerv.class(tname, parenttname)
+
+ local function constructor(...)
+ local self = {}
+ nerv.setmetatable(self, tname)
+ if self.__init then
+ self:__init(...)
+ end
+ return self
+ end
+
+ local function factory()
+ local self = {}
+ nerv.setmetatable(self, tname)
+ return self
+ end
+
+ local mt = nerv.newmetatable(tname, parenttname, constructor, nil, factory)
+ local mpt
+ if parenttname then
+ mpt = nerv.getmetatable(parenttname)
+ end
+ return mt, mpt
+end
+
+function table.val_to_str(v)
+ if "string" == type(v) then
+ v = string.gsub(v, "\n", "\\n")
+ if string.match(string.gsub(v,"[^'\"]",""), '^"+$') then
+ return "'" .. v .. "'"
+ end
+ return '"' .. string.gsub(v,'"', '\\"') .. '"'
+ else
+ return "table" == type(v) and table.tostring(v) or
+ tostring(v)
+ end
+end
+
+function table.key_to_str (k)
+ if "string" == type(k) and string.match(k, "^[_%a][_%a%d]*$") then
+ return k
+ else
+ return "[" .. table.val_to_str(k) .. "]"
+ end
+end
+
+function table.tostring(tbl)
+ local result, done = {}, {}
+ for k, v in ipairs(tbl) do
+ table.insert(result, table.val_to_str(v))
+ done[k] = true
+ end
+ for k, v in pairs(tbl) do
+ if not done[k] then
+ table.insert(result,
+ table.key_to_str(k) .. "=" .. table.val_to_str(v))
+ end
+ end
+ return "{" .. table.concat(result, ",") .. "}"
+end
+
+function nerv.get_type(tname)
+ return assert(loadstring("return " .. tname))()
+end
+
+function nerv.is_type(obj, tname)
+ local mt0 = nerv.getmetatable(tname)
+ local mt = getmetatable(obj)
+ while mt do
+ if mt == mt0 then
+ return true
+ end
+ mt = getmetatable(mt)
+ end
+ return false
+end
+
+function nerv.dirname(filename)
+ if filename:match(".-/.-") then
+ local name = string.gsub(filename, "(.*/)(.*)", "%1")
+ return name
+ else
+ return ''
+ end
+end
+
+function nerv.include(filename)
+ local caller = debug.getinfo(2, "S").source:sub(2)
+ dofile(nerv.dirname(caller) .. filename)
+end
+
+nerv.include('matrix/init.lua')
+nerv.include('io/init.lua')
+nerv.include('layer/init.lua')
+nerv.include('nn/init.lua')
diff --git a/nerv/io/chunk_file.c b/nerv/io/chunk_file.c
new file mode 100644
index 0000000..c0b6b9f
--- /dev/null
+++ b/nerv/io/chunk_file.c
@@ -0,0 +1,325 @@
+#include <stdio.h>
+#include <ctype.h>
+#include <string.h>
+#include "../common.h"
+#include "chunk_file.h"
+
+#define INVALID_FORMAT_ERROR(fn) \
+ nerv_error(L, "Invalid chunk file: %s", fn)
+#define CHECK_FORMAT(exp, ret, fname) \
+ do { \
+        if ((exp) != (ret)) INVALID_FORMAT_ERROR(fname); \
+ } while (0)
+#define CHECK_FILE_OPEN(pfh) \
+ do { \
+ if ((pfh)->closed) \
+ nerv_error(L, "operations on a closed file"); \
+ } while (0)
+
+const char *nerv_chunk_file_tname = "nerv.ChunkFile";
+const char *nerv_chunk_file_handle_tname = "nerv.ChunkFileHandle";
+const char *nerv_chunk_info_tname = "nerv.ChunkInfo";
+const char *nerv_chunk_data_tname = "nerv.ChunkData";
+
+#define PARAM_HEADER_SIZE 16
+
+enum {
+ NORMAL,
+ INVALID_FORMAT,
+ END_OF_FILE,
+ SECTION_OVERFLOW,
+ WRITE_ERROR
+};
+
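+/* On-disk chunk layout: a fixed 16-byte plain-text header of the form
+ * "[<size>]\n" (a zero-padded decimal giving the total chunk length in bytes,
+ * header and metadata line included), followed by a single-line Lua table
+ * holding the chunk metadata, followed by the raw chunk payload. */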
+size_t read_chunk_header_plain(FILE *fp, int *status) {
+ static char buff[PARAM_HEADER_SIZE];
+ int i;
+ size_t size = 0;
+ *status = NORMAL;
+ if (fread(buff, 1, PARAM_HEADER_SIZE, fp) != PARAM_HEADER_SIZE)
+ {
+ if (feof(fp)) *status = END_OF_FILE;
+ else *status = INVALID_FORMAT;
+ }
+ for (i = 0; i < PARAM_HEADER_SIZE; i++)
+ if (isdigit(buff[i]))
+ size = size * 10 + buff[i] - '0';
+/* fprintf(stderr, "header: %lu\n", size); */
+ return size;
+}
+
+#define CHECK_WRITE(status) \
+ do { \
+ if (status == SECTION_OVERFLOW) \
+ nerv_error(L, "section overflowed"); \
+ else if (status == WRITE_ERROR) \
+ nerv_error(L, "error while writing"); \
+ } while (0)
+
+void write_chunk_header_plain(FILE *fp, size_t size, int *status) {
+ static char buff[PARAM_HEADER_SIZE];
+ int i;
+ *status = NORMAL;
+ for (i = PARAM_HEADER_SIZE - 3; i > 0; i--, size /= 10)
+ buff[i] = size % 10 + '0';
+ if (size)
+ {
+ *status = SECTION_OVERFLOW;
+ return;
+ }
+ buff[0] = '[';
+ buff[PARAM_HEADER_SIZE - 2] = ']';
+ buff[PARAM_HEADER_SIZE - 1] = '\n';
+ if (fwrite(buff, 1, PARAM_HEADER_SIZE, fp) != PARAM_HEADER_SIZE)
+ {
+ *status = WRITE_ERROR;
+ return;
+ }
+}
+
+ChunkData *get_chunk_data(FILE *fp, ChunkInfo *info) {
+ ChunkData *pcd = (ChunkData *)malloc(sizeof(ChunkData));
+ pcd->data = (char *)malloc(info->length);
+ pcd->fp = fmemopen(pcd->data, info->length, "r");
+ assert(fseeko(fp, info->offset, SEEK_SET) == 0);
+ if (fread(pcd->data, 1, info->length, fp) != (size_t)info->length)
+ return NULL;
+ return pcd;
+}
+
+const char *read_chunk_metadata(lua_State *L, FILE *fp, const char *fn) {
+#define LINEBUFF_SIZE 1024
+ static char buff[7 + LINEBUFF_SIZE] = "return ";
+ CHECK_FORMAT(fgets(buff + 7, LINEBUFF_SIZE, fp), buff + 7, fn);
+ /* fprintf(stderr, "metadata: %s\n", buff); */
+ return buff;
+}
+
+void write_chunk_metadata(FILE *fp, const char *metadata_str, int *status) {
+ size_t size = strlen(metadata_str);
+ *status = NORMAL;
+ if (fwrite(metadata_str, 1, size, fp) != size ||
+ fprintf(fp, "\n") < 0)
+ {
+ *status = WRITE_ERROR;
+ return;
+ }
+ /* fprintf(stderr, "metadata: %s\n", metadata_str); */
+}
+
+
+int nerv_chunk_file_open_write(lua_State *L, const char *fn) {
+ FILE *fp = fopen(fn, "w");
+ ChunkFileHandle *lfp;
+ if (!fp) nerv_error(L, "Error while opening chunk file: %s", fn);
+ lfp = (ChunkFileHandle *)malloc(sizeof(ChunkFileHandle));
+ lfp->fp = fp;
+ lfp->closed = 0;
+ luaT_pushudata(L, lfp, nerv_chunk_file_handle_tname);
+ lua_setfield(L, -2, "handle");
+ luaT_pushmetatable(L, nerv_chunk_file_tname);
+ lua_setmetatable(L, -2);
+ return 1;
+}
+
+int nerv_chunk_file_open_read(lua_State *L, const char *fn) {
+ FILE *fp = fopen(fn, "r");
+ int i, status;
+ size_t chunk_len;
+ off_t offset;
+ ChunkFileHandle *lfp;
+
+ if (!fp) nerv_error(L, "Error while opening chunk file: %s", fn);
+ offset = ftello(fp);
+ lua_newtable(L);
+ /* fprintf(stderr, "%d\n", (int)offset); */
+ for (i = 0;; offset += chunk_len, i++)
+ {
+ ChunkInfo *pci;
+ /* fprintf(stderr, "reading chunk %d from %d\n", i, (int)offset); */
+        /* skip to the beginning of chunk i */
+ CHECK_FORMAT(fseeko(fp, offset, SEEK_SET), 0, fn);
+ /* read header */
+ chunk_len = read_chunk_header_plain(fp, &status);
+ if (status == END_OF_FILE) break;
+ else if (status == INVALID_FORMAT)
+ INVALID_FORMAT_ERROR(fn);
+ /* read metadata */
+ luaL_loadstring(L, read_chunk_metadata(L, fp, fn));
+ CHECK_FORMAT(lua_pcall(L, 0, 1, 0), 0, fn);
+ CHECK_FORMAT(lua_istable(L, -1), 1, fn);
+ /* stack: obj_table, metadata */
+ /* chunk info */
+ pci = (ChunkInfo *)malloc(sizeof(ChunkInfo));
+ pci->offset = ftello(fp);
+ pci->length = chunk_len - (pci->offset - offset);
+ /* fprintf(stderr, "%d + %d (skip %lu)\n", (int)pci->offset,
+ (int)pci->length, chunk_len); */
+ luaT_pushudata(L, pci, nerv_chunk_info_tname);
+ lua_setfield(L, -2, "chunk");
+ /* stack: obj_table, metadata */
+ /* get id */
+ lua_getfield(L, -1, "id");
+ /* stack: obj_table, metadata, id */
+ if (!lua_isstring(L, -1))
+ nerv_error(L, "id field in metadata must be a string");
+ lua_pushvalue(L, -1);
+ /* stack: obj_table, metadata, id, id */
+ lua_gettable(L, -4);
+ /* stack: obj_table, metadata, id, obj[id] */
+ if (!lua_isnil(L, -1))
+ nerv_error(L, "conflicting id");
+ lua_pop(L, 1);
+ /* stack: obj_table, metadata, id */
+ lua_pushvalue(L, -2);
+ /* stack: obj_table, metadata, id, metadata */
+ lua_settable(L, -4);
+ /* stack: obj_table, metadata */
+ lua_pop(L, 1);
+ }
+ lua_setfield(L, -2, "metadata");
+ lfp = (ChunkFileHandle *)malloc(sizeof(ChunkFileHandle));
+ lfp->fp = fp;
+ lfp->closed = 0;
+ luaT_pushudata(L, lfp, nerv_chunk_file_handle_tname);
+ lua_setfield(L, -2, "handle");
+ luaT_pushmetatable(L, nerv_chunk_file_tname);
+ lua_setmetatable(L, -2);
+ return 1;
+}
+
+int nerv_chunk_file_new_(lua_State *L, const char *fn, const char *mode) {
+ int rd = 1, bin = 0;
+ size_t i, len = strlen(mode);
+ for (i = 0; i < len; i++)
+ switch (mode[i])
+ {
+ case 'r': rd = 1; break;
+ case 'w': rd = 0; break;
+ case 'b': bin = 1; break;
+ }
+ return rd ? nerv_chunk_file_open_read(L, fn) : \
+ nerv_chunk_file_open_write(L, fn);
+}
+
+int nerv_chunk_file___init(lua_State *L) {
+ lua_pushvalue(L, 1);
+ return nerv_chunk_file_new_(L, luaL_checkstring(L, 2),
+ luaL_checkstring(L, 3));
+}
+
+int nerv_chunk_file_new(lua_State *L) {
+ lua_newtable(L);
+ return nerv_chunk_file_new_(L, luaL_checkstring(L, 1),
+ luaL_checkstring(L, 2));
+}
+
+int nerv_chunk_file_write_chunkdata(lua_State *L) {
+ ChunkFileHandle *pfh;
+ int status;
+ off_t start;
+ size_t size;
+ const char *metadata_str = lua_tolstring(L, 2, NULL);
+ lua_getfield(L, 1, "handle");
+ pfh = luaT_checkudata(L, -1, nerv_chunk_file_handle_tname);
+ CHECK_FILE_OPEN(pfh);
+ start = ftello(pfh->fp);
+ write_chunk_header_plain(pfh->fp, 0, &status); /* fill zeros */
+ CHECK_WRITE(status);
+ write_chunk_metadata(pfh->fp, metadata_str, &status);
+ CHECK_WRITE(status);
+ lua_pushvalue(L, 3);
+ lua_getfield(L, -1, "write");
+ if (!lua_isfunction(L, -1))
+ nerv_error(L, "\"write\" method must be implemented");
+ lua_pushvalue(L, -2);
+ lua_pushvalue(L, 4); /* pass handle as parameter to write() */
+ lua_call(L, 2, 0); /* let the write() to write */
+ lua_pop(L, 1);
+ size = ftello(pfh->fp) - start;
+ fseeko(pfh->fp, start, SEEK_SET);
+    /* write the calculated size */
+ write_chunk_header_plain(pfh->fp, size, &status);
+ CHECK_WRITE(status);
+ fseeko(pfh->fp, 0, SEEK_END);
+ return 0;
+}
+
+int nerv_chunk_file_get_chunkdata(lua_State *L) {
+ ChunkFileHandle *pfh;
+ ChunkInfo *pci;
+ ChunkData *pcd;
+ const char *id = luaL_checkstring(L, 2);
+
+ lua_getfield(L, 1, "handle");
+ pfh = luaT_checkudata(L, -1, nerv_chunk_file_handle_tname);
+ CHECK_FILE_OPEN(pfh);
+ lua_pop(L, 1); /* pop handle */
+ lua_getfield(L, 1, "metadata");
+ /* now stack: self, k, metadata */
+ lua_getfield(L, -1, id);
+ /* now stack: self, k, metadata, kth{} */
+    if (lua_isnil(L, -1)) /* no chunk with the id */
+ return 0;
+ lua_getfield(L, -1, "chunk");
+ pci = luaT_checkudata(L, -1, nerv_chunk_info_tname);
+ if (!(pcd = get_chunk_data(pfh->fp, pci)))
+ nerv_error(L, "unexpected end of file");
+ luaT_pushudata(L, pcd, nerv_chunk_data_tname);
+ return 1;
+}
+
+int nerv_chunk_file_close(lua_State *L) {
+ ChunkFileHandle *pfh;
+ lua_getfield(L, 1, "handle");
+ pfh = luaT_checkudata(L, -1, nerv_chunk_file_handle_tname);
+ CHECK_FILE_OPEN(pfh);
+ fclose(pfh->fp);
+ pfh->closed = 1;
+ return 0;
+}
+
+int nerv_chunk_file_handle_destroy(lua_State *L) {
+ ChunkFileHandle *pfh = luaT_checkudata(L, 1,
+ nerv_chunk_file_handle_tname);
+ if (!pfh->closed) fclose(pfh->fp);
+ free(pfh);
+ return 0;
+}
+
+static int nerv_chunk_info_destroy(lua_State *L) {
+ ChunkInfo *pci = luaT_checkudata(L, 1, nerv_chunk_info_tname);
+ free(pci);
+ return 0;
+}
+
+static int nerv_chunk_data_destroy(lua_State *L) {
+ ChunkData *pcd = luaT_checkudata(L, 1, nerv_chunk_data_tname);
+ fclose(pcd->fp);
+ free(pcd->data);
+ free(pcd);
+ return 0;
+}
+
+static const luaL_Reg nerv_chunk_file_methods[] = {
+ {"get_chunkdata", nerv_chunk_file_get_chunkdata},
+ {"_write_chunkdata", nerv_chunk_file_write_chunkdata},
+ {"close", nerv_chunk_file_close},
+ {"__init", nerv_chunk_file___init},
+ {NULL, NULL}
+};
+
+void nerv_chunk_file_init(lua_State *L) {
+ luaT_newmetatable(L, nerv_chunk_file_tname, NULL,
+ nerv_chunk_file_new,
+ NULL, NULL);
+ luaL_register(L, NULL, nerv_chunk_file_methods);
+ lua_pop(L, 1);
+ luaT_newmetatable(L, nerv_chunk_file_handle_tname, NULL,
+ NULL, nerv_chunk_file_handle_destroy, NULL);
+ luaT_newmetatable(L, nerv_chunk_info_tname, NULL,
+ NULL, nerv_chunk_info_destroy, NULL);
+ luaT_newmetatable(L, nerv_chunk_data_tname, NULL,
+ NULL, nerv_chunk_data_destroy, NULL);
+}
+
diff --git a/nerv/io/chunk_file.h b/nerv/io/chunk_file.h
new file mode 100644
index 0000000..9bae59d
--- /dev/null
+++ b/nerv/io/chunk_file.h
@@ -0,0 +1,23 @@
+#ifndef NERV_CHUNK_FILE_H
+#define NERV_CHUNK_FILE_H
+
+extern const char *nerv_chunk_file_tname;
+extern const char *nerv_chunk_file_handle_tname;
+extern const char *nerv_chunk_info_tname;
+extern const char *nerv_chunk_data_tname;
+
+typedef struct ChunkFileHandle {
+ FILE *fp;
+ int closed;
+} ChunkFileHandle;
+
+typedef struct ChunkInfo {
+ off_t offset, length;
+} ChunkInfo;
+
+typedef struct ChunkData {
+ FILE *fp;
+ char *data;
+} ChunkData;
+
+#endif
diff --git a/nerv/io/init.c b/nerv/io/init.c
new file mode 100644
index 0000000..70585f7
--- /dev/null
+++ b/nerv/io/init.c
@@ -0,0 +1,6 @@
+#include "../common.h"
+
+extern void nerv_chunk_file_init(lua_State *L);
+void nerv_io_init(lua_State *L) {
+ nerv_chunk_file_init(L);
+}
diff --git a/nerv/io/init.lua b/nerv/io/init.lua
new file mode 100644
index 0000000..647ff93
--- /dev/null
+++ b/nerv/io/init.lua
@@ -0,0 +1,55 @@
+function nerv.ChunkFile:write_chunkdata(metadata, writer)
+ if type(metadata) ~= "table" then
+ nerv.error("metadata should be a Lua table")
+ return
+ end
+ return self:_write_chunkdata(table.tostring(metadata), writer)
+end
+
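+-- Typical round trip: open a ChunkFile in "w" mode and write_chunk() each
+-- parameter to serialize it under its id; later open the file in "r" mode and
+-- read_chunk(id, gconf) reconstructs the chunk from its recorded type name.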
+function nerv.ChunkFile:write_chunk(chunk)
+ local id = chunk.id
+ local type = chunk.__typename
+ if id == nil then
+ nerv.error("id of chunk %s must be specified", type)
+ end
+ self:write_chunkdata({id = id,
+ type = type,
+ info = chunk:get_info()}, chunk)
+end
+
+function nerv.ChunkFile:read_chunk(id, global_conf)
+ if self.metadata == nil then
+ nerv.error("wrong file opening mode")
+ end
+ local metadata = self.metadata[id]
+ if metadata == nil then
+ nerv.error("chunk with id %s does not exist", id)
+ end
+ local chunk_type = nerv.get_type(metadata.type)
+ local chunk = chunk_type(id, global_conf)
+ chunk:set_info(metadata.info)
+ chunk:read(self:get_chunkdata(id))
+ return chunk
+end
+
+local DataReader = nerv.class("nerv.DataReader")
+
+function DataReader:__init(global_conf, reader_conf)
+ nerv.error_method_not_implemented()
+end
+
+function DataReader:get_data()
+ nerv.error_method_not_implemented()
+end
+
+local DataBuffer = nerv.class("nerv.DataBuffer")
+
+function DataBuffer:__init(global_conf, buffer_conf)
+ nerv.error_method_not_implemented()
+end
+
+function DataBuffer:get_batch()
+ nerv.error_method_not_implemented()
+end
+
+nerv.include('sgd_buffer.lua')
diff --git a/nerv/io/sgd_buffer.lua b/nerv/io/sgd_buffer.lua
new file mode 100644
index 0000000..f4f7dfe
--- /dev/null
+++ b/nerv/io/sgd_buffer.lua
@@ -0,0 +1,111 @@
+local SGDBuffer = nerv.class("nerv.SGDBuffer", "nerv.DataBuffer")
+
+function SGDBuffer:__init(global_conf, buffer_conf)
+ self.gconf = global_conf
+ self.buffer_size = math.floor(buffer_conf.buffer_size /
+ global_conf.batch_size) * global_conf.batch_size
+ self.randomize = buffer_conf.randomize
+ if self.randomize == nil then
+ self.randomize = false
+ end
+ self.head = 0
+ self.tail = 0
+ self.readers = {}
+ for i, reader_spec in ipairs(buffer_conf.readers) do
+ local buffs = {}
+ for id, width in pairs(reader_spec.data) do
+ buffs[id] = {data = global_conf.mmat_type(self.buffer_size, width),
+ leftover = nil,
+ width = width}
+ end
+ table.insert(self.readers, {buffs = buffs,
+ reader = reader_spec.reader,
+ tail = 0,
+ has_leftover = false})
+ end
+end
+
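+-- refill the per-reader host buffers until they hold buffer_size rows (or the
+-- readers run dry); rows that do not fit are kept as leftovers for the next
+-- call, and self.tail is set to the number of rows actually available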
+function SGDBuffer:saturate()
+ local buffer_size = self.buffer_size
+ self.head = 0
+ self.tail = buffer_size
+ for i, reader in ipairs(self.readers) do
+ reader.tail = 0
+ if reader.has_leftover then
+ local lrow
+ for id, buff in pairs(reader.buffs) do
+ lrow = buff.leftover:nrow()
+ if lrow > buffer_size then
+ nerv.error("buffer size is too small to contain leftovers")
+ end
+ buff.data:copy_from(buff.leftover, 0, lrow)
+ buff.leftover = nil
+ end
+ nerv.printf("leftover: %d\n", lrow)
+ reader.tail = lrow
+ reader.has_leftover = false
+ end
+ while reader.tail < buffer_size do
+ local data = reader.reader:get_data()
+ if data == nil then
+ break
+ end
+ local drow = nil
+ for id, d in pairs(data) do
+ if drow == nil then
+ drow = d:nrow()
+ elseif d:nrow() ~= drow then
+ nerv.error("reader provides with inconsistent rows of data")
+ end
+ end
+ local remain = buffer_size - reader.tail
+ if drow > remain then
+ for id, buff in pairs(reader.buffs) do
+ local d = data[id]
+ if d == nil then
+ nerv.error("reader does not provide data for %s", id)
+ end
+ buff.leftover = self.gconf.mmat_type(drow - remain,
+ buff.width)
+ buff.leftover:copy_from(d, remain, drow)
+ end
+ drow = remain
+ reader.has_leftover = true
+ end
+ for id, buff in pairs(reader.buffs) do
+ buff.data:copy_from(data[id], 0, drow, reader.tail)
+ end
+ reader.tail = reader.tail + drow
+ end
+ self.tail = math.min(self.tail, reader.tail)
+ end
+ self.rand_map = nerv.MMatrixInt.perm_gen(self.tail) -- generate shuffled index
+ collectgarbage("collect")
+ return self.tail >= self.gconf.batch_size
+end
+
+function SGDBuffer:get_data()
+ local batch_size = self.gconf.batch_size
+ if self.head >= self.tail then -- buffer is empty
+ if not self:saturate() then
+ return nil -- the remaining data cannot build a batch
+ end
+ end
+ if self.head + batch_size > self.tail then
+ return nil -- the remaining data cannot build a batch
+ end
+ local res = {}
+ for i, reader in ipairs(self.readers) do
+ for id, buff in pairs(reader.buffs) do
+ local batch = self.gconf.cumat_type(batch_size, buff.width)
+ if self.randomize then
+ batch:copy_rows_fromh_by_idx(buff.data, self.rand_map, self.head)
+ else
+ batch:copy_fromh(buff.data, self.head, self.head + batch_size)
+ end
+ res[id] = batch
+ end
+ end
+ self.head = self.head + batch_size
+ return res
+end
diff --git a/nerv/layer/affine.lua b/nerv/layer/affine.lua
new file mode 100644
index 0000000..00cbcfb
--- /dev/null
+++ b/nerv/layer/affine.lua
@@ -0,0 +1,91 @@
+local MatrixParam = nerv.class('nerv.MatrixParam', 'nerv.Param')
+local LinearTransParam = nerv.class('nerv.LinearTransParam', 'nerv.MatrixParam')
+local BiasParam = nerv.class('nerv.BiasParam', 'nerv.MatrixParam')
+local AffineLayer = nerv.class('nerv.AffineLayer', 'nerv.Layer')
+
+function MatrixParam:read(handle)
+ self.trans = self.gconf.cumat_type.new_from_host(
+ nerv.MMatrixFloat.load(handle))
+end
+
+function MatrixParam:write(handle)
+ self.trans:new_to_host():save(handle)
+end
+
+function MatrixParam:train_init()
+ self.correction = self.trans:create()
+ self.correction:fill(0)
+end
+
+function MatrixParam:update(gradient)
+ local gconf = self.gconf
+ self.correction:add(self.correction, gradient, gconf.momentum, 1.0)
+ -- momentum gain
+ local mmt_gain = 1.0 / (1.0 - gconf.momentum);
+ local n = self.gconf.batch_size * mmt_gain
+ -- perform update
+ self.trans:add(self.trans, self.correction, 1.0, -gconf.lrate / n)
+end
+
+function LinearTransParam:update(gradient)
+ MatrixParam.update(self, gradient)
+ local gconf = self.gconf
+ -- weight decay
+ self.trans:add(self.trans, self.trans, 1.0, -gconf.lrate * gconf.wcost)
+end
+
+function AffineLayer:__init(id, global_conf, layer_conf)
+ self.id = id
+ self.ltp = layer_conf.ltp
+ self.bp = layer_conf.bp
+ self.dim_in = layer_conf.dim_in
+ self.dim_out = layer_conf.dim_out
+ self.gconf = global_conf
+ self:check_dim_len(1, 1) -- exactly one input and one output
+ self.direct_update = layer_conf.direct_update
+end
+
+function AffineLayer:init(batch_size)
+ if self.ltp.trans:ncol() ~= self.bp.trans:ncol() then
+ nerv.error("mismatching dimensions of linear transform and bias paramter")
+ end
+ if self.dim_in[1] ~= self.ltp.trans:nrow() then
+ nerv.error("mismatching dimensions of linear transform parameter and input")
+ end
+ if self.dim_out[1] ~= self.ltp.trans:ncol() then
+ nerv.error("mismatching dimensions of linear transform parameter and output")
+ end
+ self.ltp_grad = self.ltp.trans:create()
+ self.ltp:train_init()
+ self.bp:train_init()
+end
+
+function AffineLayer:update(bp_err, input, output)
+    if self.direct_update then
+        local gconf = self.gconf -- do not rely on a global gconf
+        self.ltp.correction:mul(input[1], bp_err[1], 1.0, gconf.momentum, 'T', 'N')
+        -- momentum gain
+        local mmt_gain = 1.0 / (1.0 - gconf.momentum)
+        local n = gconf.batch_size * mmt_gain
+        -- perform update
+        self.ltp.trans:add(self.ltp.trans, self.ltp.correction, 1.0, -gconf.lrate / n)
+ else
+ self.ltp_grad:mul(input[1], bp_err[1], 1.0, 0.0, 'T', 'N')
+ self.ltp:update(self.ltp_grad)
+ end
+ self.bp:update(bp_err[1]:colsum())
+end
+
+function AffineLayer:propagate(input, output)
+ -- apply linear transform
+ output[1]:mul(input[1], self.ltp.trans, 1.0, 0.0, 'N', 'N')
+ -- add bias
+ output[1]:add_row(self.bp.trans, 1.0)
+end
+
+function AffineLayer:back_propagate(bp_err, next_bp_err, input, output)
+ next_bp_err[1]:mul(bp_err[1], self.ltp.trans, 1.0, 0.0, 'N', 'T')
+end
+
+function AffineLayer:get_params()
+ return nerv.ParamRepo({self.ltp, self.bp})
+end
diff --git a/nerv/layer/bias.lua b/nerv/layer/bias.lua
new file mode 100644
index 0000000..c99274d
--- /dev/null
+++ b/nerv/layer/bias.lua
@@ -0,0 +1,28 @@
+local BiasLayer = nerv.class("nerv.BiasLayer", "nerv.Layer")
+
+function BiasLayer:__init(id, global_conf, layer_conf)
+ self.id = id
+ self.gconf = global_conf
+ self.bias = layer_conf.bias
+ self.dim_in = layer_conf.dim_in
+ self.dim_out = layer_conf.dim_out
+ self:check_dim_len(1, 1)
+end
+
+function BiasLayer:init()
+ if self.dim_in[1] ~= self.bias.trans:ncol() then
+ nerv.error("mismatching dimensions of input and bias parameter")
+ end
+ if self.dim_out[1] ~= self.bias.trans:ncol() then
+ nerv.error("mismatching dimensions of output and bias parameter")
+ end
+end
+
+function BiasLayer:propagate(input, output)
+ output[1]:copy_fromd(input[1])
+ output[1]:add_row(self.bias.trans, 1.0)
+end
+
+function BiasLayer:get_params()
+ return nerv.ParamRepo({self.bias})
+end
diff --git a/nerv/layer/combiner.lua b/nerv/layer/combiner.lua
new file mode 100644
index 0000000..7bd7617
--- /dev/null
+++ b/nerv/layer/combiner.lua
@@ -0,0 +1,59 @@
+local CombinerLayer = nerv.class('nerv.CombinerLayer', 'nerv.Layer')
+
+function CombinerLayer:__init(id, global_conf, layer_conf)
+ self.id = id
+ self.lambda = layer_conf.lambda
+ self.dim_in = layer_conf.dim_in
+ self.dim_out = layer_conf.dim_out
+ self.gconf = global_conf
+ self:check_dim_len(#self.lambda, -1)
+ if #self.dim_in < 1 then
+ nerv.error("no input specified")
+ end
+ if #self.dim_out < 1 then
+ nerv.error("no output specified")
+ end
+end
+
+function CombinerLayer:init(batch_size)
+ local dim = self.dim_in[1]
+ for i = 2, #self.dim_in do
+ if self.dim_in[i] ~= dim then
+ nerv.error("mismatching dimensions of inputs")
+ end
+ end
+ for i = 1, #self.dim_out do
+ if self.dim_out[i] ~= dim then
+ nerv.error("mismatching dimensions of inputs/outputs")
+ end
+ end
+ self.sum = self.gconf.cumat_type(batch_size, dim)
+end
+
+function CombinerLayer:update(bp_err, input, output)
+end
+
+function CombinerLayer:propagate(input, output)
+ output[1]:fill(0)
+ for i = 1, #self.dim_in do
+ output[1]:add(output[1], input[i], 1.0, self.lambda[i])
+ end
+ for i = 2, #self.dim_out do
+ output[i]:copy_fromd(output[1])
+ end
+end
+
+function CombinerLayer:back_propagate(bp_err, next_bp_err, input, output)
+ local sum = self.sum
+ sum:copy_fromd(bp_err[1])
+ for i = 2, #self.dim_out do
+ sum:add(sum, bp_err[i], 1.0, 1.0)
+ end
+ for i = 1, #self.dim_in do
+ next_bp_err[i]:add(next_bp_err[i], sum, 0.0, self.lambda[i])
+ end
+end
+
+function CombinerLayer:get_params()
+ return nerv.ParamRepo({})
+end
diff --git a/nerv/layer/init.lua b/nerv/layer/init.lua
new file mode 100644
index 0000000..3c55a94
--- /dev/null
+++ b/nerv/layer/init.lua
@@ -0,0 +1,79 @@
+-- The following methods must be implemented to let a layer work properly
+
+local Param = nerv.class('nerv.Param')
+
+function Param:__init(id, global_conf)
+ self.id = id
+ self.gconf = global_conf
+end
+
+function Param:get_info()
+ return self.info
+end
+
+function Param:set_info(info)
+ self.info = info
+end
+
+function Param:read(handle)
+ nerv.error_method_not_implemented()
+end
+
+function Param:write(handle)
+ nerv.error_method_not_implemented()
+end
+
+function Param:update(gradient)
+ nerv.error_method_not_implemented()
+end
+
+local Layer = nerv.class('nerv.Layer')
+
+function Layer:__init(id, global_conf, layer_conf)
+ nerv.error_method_not_implemented()
+end
+
+function Layer:init(batch_size)
+ nerv.error_method_not_implemented()
+end
+
+function Layer:update(bp_err, input, output)
+ nerv.error_method_not_implemented()
+end
+
+function Layer:propagate(input, output)
+ nerv.error_method_not_implemented()
+end
+
+function Layer:back_propagate(bp_err, next_bp_err, input, output)
+ nerv.error_method_not_implemented()
+end
+
+function Layer:check_dim_len(len_in, len_out)
+ local expected_in = #self.dim_in
+ local expected_out = #self.dim_out
+ if len_in > 0 and expected_in ~= len_in then
+ nerv.error("layer %s expects %d inputs, %d given",
+ self.id, len_in, expected_in)
+ end
+ if len_out > 0 and expected_out ~= len_out then
+ nerv.error("layer %s expects %d outputs, %d given",
+ self.id, len_out, expected_out)
+ end
+end
+
+function Layer:get_params()
+ nerv.error_method_not_implemented()
+end
+
+function Layer:get_dim()
+ return self.dim_in, self.dim_out
+end
+
+nerv.include('affine.lua')
+nerv.include('sigmoid.lua')
+nerv.include('softmax_ce.lua')
+nerv.include('bias.lua')
+nerv.include('window.lua')
+nerv.include('mse.lua')
+nerv.include('combiner.lua')
diff --git a/nerv/layer/mse.lua b/nerv/layer/mse.lua
new file mode 100644
index 0000000..9a97add
--- /dev/null
+++ b/nerv/layer/mse.lua
@@ -0,0 +1,52 @@
+local MSELayer = nerv.class("nerv.MSELayer", "nerv.Layer")
+
+function MSELayer:__init(id, global_conf, layer_conf)
+ self.id = id
+ self.dim_in = layer_conf.dim_in
+ self.dim_out = layer_conf.dim_out
+ self.gconf = global_conf
+ self:check_dim_len(2, -1)
+end
+
+function MSELayer:init(batch_size)
+ if self.dim_in[1] ~= self.dim_in[2] then
+ nerv.error("mismatching dimensions of previous network output and labels")
+ end
+ self.scale = 1 / self.dim_in[1]
+ self.total_mse = 0.0
+ self.total_frames = 0
+ self.mse = self.gconf.cumat_type(batch_size, self.dim_in[1])
+ self.mse_sum = self.gconf.cumat_type(batch_size, 1)
+ self.diff = self.mse:create()
+end
+
+function MSELayer:update(bp_err, input, output)
+ -- no params, therefore do nothing
+end
+
+function MSELayer:propagate(input, output)
+ local mse = self.mse
+ local mse_sum = self.mse_sum
+ mse:add(input[1], input[2], 1.0, -1.0)
+ self.diff:copy_fromd(mse)
+ mse:mul_elem(mse, mse)
+ mse_sum:add(mse_sum, mse:rowsum(mse), 0.0, self.scale)
+ if output[1] ~= nil then
+ output[1]:copy_fromd(mse_sum)
+ end
+ self.total_mse = self.total_mse + mse_sum:colsum()[0]
+ self.total_frames = self.total_frames + mse_sum:nrow()
+end
+
+-- NOTE: must call propagate before back_propagate
+function MSELayer:back_propagate(bp_err, next_bp_err, input, output)
+ local nbe = next_bp_err[1]
+ nbe:add(nbe, self.diff, 0.0, 2 * self.scale)
+ if bp_err[1] ~= nil then
+ nbe:scale_rows_by_col(bp_err[1])
+ end
+end
+
+function MSELayer:get_params()
+ return nerv.ParamRepo({})
+end
diff --git a/nerv/layer/sigmoid.lua b/nerv/layer/sigmoid.lua
new file mode 100644
index 0000000..dfd09eb
--- /dev/null
+++ b/nerv/layer/sigmoid.lua
@@ -0,0 +1,31 @@
+local SigmoidLayer = nerv.class("nerv.SigmoidLayer", "nerv.Layer")
+
+function SigmoidLayer:__init(id, global_conf, layer_conf)
+ self.id = id
+ self.gconf = global_conf
+ self.dim_in = layer_conf.dim_in
+ self.dim_out = layer_conf.dim_out
+ self:check_dim_len(1, 1)
+end
+
+function SigmoidLayer:init()
+ if self.dim_in[1] ~= self.dim_out[1] then
+ nerv.error("mismatching dimensions of input and output")
+ end
+end
+
+function SigmoidLayer:update(bp_err, input, output)
+ -- no params, therefore do nothing
+end
+
+function SigmoidLayer:propagate(input, output)
+ output[1]:sigmoid(input[1])
+end
+
+function SigmoidLayer:back_propagate(bp_err, next_bp_err, input, output)
+ next_bp_err[1]:sigmoid_grad(bp_err[1], output[1])
+end
+
+function SigmoidLayer:get_params()
+ return nerv.ParamRepo({})
+end
diff --git a/nerv/layer/softmax_ce.lua b/nerv/layer/softmax_ce.lua
new file mode 100644
index 0000000..daf891e
--- /dev/null
+++ b/nerv/layer/softmax_ce.lua
@@ -0,0 +1,68 @@
+local SoftmaxCELayer = nerv.class("nerv.SoftmaxCELayer", "nerv.Layer")
+
+function SoftmaxCELayer:__init(id, global_conf, layer_conf)
+ self.id = id
+ self.gconf = global_conf
+ self.dim_in = layer_conf.dim_in
+ self.dim_out = layer_conf.dim_out
+ self.compressed = layer_conf.compressed
+ if self.compressed == nil then
+ self.compressed = false
+ end
+ self:check_dim_len(2, -1) -- two inputs: nn output and label
+end
+
+function SoftmaxCELayer:init(batch_size)
+ if not self.compressed and (self.dim_in[1] ~= self.dim_in[2]) then
+ nerv.error("mismatching dimensions of previous network output and labels")
+ end
+ self.total_ce = 0.0
+ self.total_correct = 0
+ self.total_frames = 0
+ self.softmax = self.gconf.cumat_type(batch_size, self.dim_in[1])
+ self.ce = self.softmax:create()
+end
+
+function SoftmaxCELayer:update(bp_err, input, output)
+ -- no params, therefore do nothing
+end
+
+function SoftmaxCELayer:propagate(input, output)
+ local softmax = self.softmax
+ local ce = self.ce
+ local classified = softmax:softmax(input[1])
+ local label = input[2]
+ ce:log_elem(softmax)
+ if self.compressed then
+ label = label:decompress(input[1]:ncol())
+ end
+ ce:mul_elem(ce, label)
+ ce = ce:rowsum()
+ if output[1] ~= nil then
+ output[1]:copy_fromd(ce)
+ end
+ -- add total ce
+ self.total_ce = self.total_ce - ce:colsum()[0]
+ self.total_frames = self.total_frames + softmax:nrow()
+ -- TODO: add colsame for uncompressed label
+ if self.compressed then
+ self.total_correct = self.total_correct + classified:colsame(input[2])[0]
+ end
+end
+
+function SoftmaxCELayer:back_propagate(bp_err, next_bp_err, input, output)
+ -- softmax output - label
+ local label = input[2]
+ if self.compressed then
+ label = label:decompress(input[1]:ncol())
+ end
+ local nbe = next_bp_err[1]
+ nbe:add(self.softmax, label, 1.0, -1.0)
+ if bp_err[1] ~= nil then
+ nbe:scale_rows_by_col(bp_err[1])
+ end
+end
+
+function SoftmaxCELayer:get_params()
+ return nerv.ParamRepo({})
+end
diff --git a/nerv/layer/window.lua b/nerv/layer/window.lua
new file mode 100644
index 0000000..4e9a3b1
--- /dev/null
+++ b/nerv/layer/window.lua
@@ -0,0 +1,28 @@
+local WindowLayer = nerv.class("nerv.WindowLayer", "nerv.Layer")
+
+function WindowLayer:__init(id, global_conf, layer_conf)
+ self.id = id
+ self.gconf = global_conf
+ self.window = layer_conf.window
+ self.dim_in = layer_conf.dim_in
+ self.dim_out = layer_conf.dim_out
+ self:check_dim_len(1, 1)
+end
+
+function WindowLayer:init()
+ if self.dim_in[1] ~= self.window.trans:ncol() then
+ nerv.error("mismatching dimensions of input and window parameter")
+ end
+ if self.dim_out[1] ~= self.window.trans:ncol() then
+ nerv.error("mismatching dimensions of output and window parameter")
+ end
+end
+
+function WindowLayer:propagate(input, output)
+ output[1]:copy_fromd(input[1])
+ output[1]:scale_rows_by_row(self.window.trans)
+end
+
+function WindowLayer:get_params()
+ return nerv.ParamRepo({self.window})
+end
diff --git a/nerv/luaT/README.md b/nerv/luaT/README.md
new file mode 100644
index 0000000..6e9cf0d
--- /dev/null
+++ b/nerv/luaT/README.md
@@ -0,0 +1,239 @@
+<a name="luat.dok"/>
+# Lua Torch C API #
+
+luaT provides an API to interface Lua and C in Torch packages. It defines a
+concept of _classes_ to Lua for Torch, and provides a mechanism to easily
+handle these Lua classes from C.
+
+It additionally provides a few functions that `luaL` should have defined, and
+defines several functions similar to the `luaL` ones for better type-error
+reporting when using `luaT` classes.
+
+<a name="luat.memory.dok"/>
+## Memory functions ##
+
+Classical memory allocation functions which generate a Lua error in case of
+problem.
+
+<a name="luaT_alloc"/>
+### void* luaT_alloc(lua_State *L, long size) ###
+
+Allocates `size` bytes and returns a pointer to the allocated
+memory. A Lua error is generated if the allocation fails.
+
+<a name="luaT_realloc"/>
+### void* luaT_realloc(lua_State *L, void *ptr, long size) ###
+
+Reallocates `ptr` to `size` bytes. `ptr` must have been previously
+allocated with [luaT_alloc](#luaT_alloc) or
+[luaT_realloc](#luaT_realloc), or with the C `malloc` or `realloc`
+functions. A Lua error is generated if the reallocation fails.
+
+<a name="luaT_free"/>
+### void luaT_free(lua_State *L, void *ptr) ###
+
+Free memory allocated at address `ptr`. The memory must have been
+previously allocated with [luaT_alloc](#luaT_alloc) or
+[luaT_realloc](#luaT_realloc), or the C `malloc` or `realloc`
+functions.
+
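+A minimal usage sketch (the buffer size and variable names are only
+illustrative):
+
+    static void example(lua_State *L)
+    {
+      float *buf = luaT_alloc(L, 128 * sizeof(float));    /* raises a Lua error on failure */
+      buf = luaT_realloc(L, buf, 256 * sizeof(float));    /* grow the buffer */
+      luaT_free(L, buf);
+    }
+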
+<a name="luat.classcreate"/>
+## Class creation and basic handling ##
+
+A `luaT` class is basically either a Lua _table_ or _userdata_ with
+an appropriate _metatable_. This appropriate metatable is created with
+[luaT_newmetatable](#luaT_newmetatable). Contrary to the luaL userdata
+functions, the luaT mechanism handles inheritance. If the class inherits from
+another class, then the metatable will itself have a metatable
+corresponding to the _parent metatable_: the metatables are cascaded
+according to the class inheritance. Multiple inheritance is not supported.
+
+<a name="luat.operatoroverloading"/>
+### Operator overloading ###
+
+The metatable of a `luaT` object contains `Lua` operators like
+`__index`, `__newindex`, `__tostring`, `__add`
+(etc...). These operators will respectively look for `__index__`,
+`__newindex__`, `__tostring__`, `__add__` (etc...) in the
+metatable. If found, the corresponding function or value will be returned,
+else a Lua error will be raised.
+
+If one wants to provide `__index__` or `__newindex__` in the
+metaclass, these operators must follow a particular scheme:
+
+ * `__index__` must either return a value _and_ `true` or return `false` only. In the first case, it means `__index__` was able to handle the given argument (e.g., the type was correct). The second case means it was not able to do anything, so `__index` in the root metatable can then try to see if the metaclass contains the required value.
+
+ * `__newindex__` must either return `true` or `false`. As for `__index__`, `true` means it could handle the argument and `false` not. If not, the root metatable `__newindex` will then raise an error if the object was a userdata, or apply a rawset if the object was a Lua table.
+
+Other metaclass operators like `__tostring__`, `__add__`, etc... do not have any particular constraint.
+
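+As an illustration, here is a sketch of an `__index__` handler written in C
+that follows this scheme (the class and field names are made up):
+
+    static int foo_Bar___index__(lua_State *L)
+    {
+      if(lua_isstring(L, 2) && !strcmp(lua_tostring(L, 2), "size"))
+      {
+        lua_pushnumber(L, 42);      /* the value for obj.size */
+        lua_pushboolean(L, 1);      /* handled: return the value _and_ true */
+        return 2;
+      }
+      lua_pushboolean(L, 0);        /* not handled: let __index consult the metaclass */
+      return 1;
+    }
+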
+<a name="luat_newmetatable"/>
+### const char* luaT_newmetatable(lua_State *L, const char *tname, const char *parenttname, lua_CFunction constructor, lua_CFunction destructor, lua_CFunction factory) ###
+
+This function creates a new metatable, which is the Lua way to define a new
+object class. As for `luaL_newmetatable`, the metatable is registered in
+the Lua registry table, with the key `tname`. In addition, `tname` is
+also registered in the Lua registry, with the metatable as key (the
+typename of a given object can be thus easily retrieved).
+
+The class name `tname` must be of the form `modulename.classname`.
+
+If not NULL, `parenttname` must be a valid typename corresponding to the
+parent class of the new class.
+
+If `constructor` is not NULL, a function `new` will be added to the metatable,
+pointing to this given function. The constructor can also be called through
+`modulename.classname()`, which is an alias set up by `luaT_newmetatable`.
+
+If not NULL, `destructor` will be called when garbage collecting the object.
+
+If not NULL, `factory` must be a Lua C function creating an empty object
+instance of the class. These functions are used in Torch for serialization.
+
+Note that classes can be partly defined in C and partly defined in Lua:
+once the metatable is created in C, it can be filled up with additional
+methods in Lua.
+
+The return value is the value returned by [luaT_typenameid](#luat_typenameid).
+
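+For example, registering a class `foo.Bar` (all names here are illustrative)
+and filling its metatable with methods follows the usual pattern:
+
+    static const struct luaL_Reg foo_Bar_methods[] = {
+      {"size", foo_Bar_size},
+      {NULL, NULL}
+    };
+
+    luaT_newmetatable(L, "foo.Bar", NULL,
+                      foo_Bar_new,       /* constructor, also reachable as foo.Bar(...) */
+                      foo_Bar_destroy,   /* destructor, called on garbage collection */
+                      NULL);             /* no factory */
+    luaL_register(L, NULL, foo_Bar_methods);  /* the metatable is left on the stack */
+    lua_pop(L, 1);
+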
+<a name="luat_pushmetatable"/>
+### int luaT_pushmetatable(lua_State *L, const char *tname) ###
+
+Pushes the metatable with type name `tname` on the stack, if `tname` is a
+valid Torch class name (previously registered with luaT_newmetatable).
+
+On success, returns 1. If `tname` is invalid, nothing is pushed and it
+returns 0.
+
+<a name="luat_typenameid"/>
+### const char* luaT_typenameid(lua_State *L, const char *tname) ###
+
+If `tname` is a valid Torch class name, then returns a unique string (the
+contents will be the same as `tname`) pointing to the string registered
+in the Lua registry. This string is thus valid as long as Lua is
+running. The returned string shall not be freed.
+
+If `tname` is an invalid class name, returns NULL.
+
+<a name="luat_typename"/>
+### const char* luaT_typename(lua_State *L, int ud) ###
+
+Returns the typename of the object at index `ud` on the stack. If it is
+not a valid Torch object, returns NULL.
+
+<a name="luat_pushudata"/>
+### void luaT_pushudata(lua_State *L, void *udata, const char *tname) ###
+
+Given a C structure `udata`, push a userdata object on the stack with
+metatable corresponding to `tname`. Obviously, `tname` must be a valid
+Torch name registered with [luaT_newmetatable](#luat_newmetatable).
+
+<a name="luat_toudata"/>
+### void *luaT_toudata(lua_State *L, int ud, const char *tname) ###
+
+Returns a pointer to the original C structure previously pushed on the
+stack with [luaT_pushudata](#luat_pushudata), if the object at index
+`ud` is a userdata of the Torch class `tname`. Returns NULL otherwise.
+
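+A short sketch (the `Bar` structure and the class name are illustrative):
+
+    Bar *b = malloc(sizeof(Bar));
+    luaT_pushudata(L, b, "foo.Bar");             /* wrap b as a foo.Bar object */
+
+    Bar *again = luaT_toudata(L, -1, "foo.Bar"); /* unwrap it again */
+    if(again == NULL)
+      luaT_typerror(L, -1, "foo.Bar");           /* not a foo.Bar: raise an error */
+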
+<a name="luat_isudata"/>
+### int luaT_isudata(lua_State *L, int ud, const char *tname) ###
+
+Returns 1 if the object at index `ud` on the stack is a userdata of the Torch class `tname`.
+Returns 0 otherwise.
+
+<a name="luat_getfield"/>
+### Checking fields of a table ###
+
+These functions check that the table at the given index `ud` on the Lua
+stack has a field named `field`, and that it is of the specified type.
+They raise a Lua error on failure.
+
+<a name="luat_getfieldcheckudata"/>
+### void *luaT_getfieldcheckudata(lua_State *L, int ud, const char *field, const char *tname) ###
+
+Checks that the field named `field` of the table at index `ud` is a
+userdata of the Torch class `tname`. Returns the pointer to the C structure
+previously pushed on the stack with [luaT_pushudata](#luat_pushudata) on
+success. The function raises a Lua error on failure.
+
+<a name="luat_getfieldchecklightudata"/>
+### void *luaT_getfieldchecklightudata(lua_State *L, int ud, const char *field) ###
+
+Checks that the field named `field` of the table at index `ud` is a
+lightuserdata. Returns the lightuserdata pointer on success. The function
+raises a Lua error on failure.
+
+<a name="luat_getfieldcheckint"/>
+### int luaT_getfieldcheckint(lua_State *L, int ud, const char *field) ###
+
+Checks that the field named `field` of the table at index `ud` is an
+int. Returns the int value on success. The function raises a Lua
+error on failure.
+
+<a name="luat_getfieldcheckstring"/>
+### const char* luaT_getfieldcheckstring(lua_State *L, int ud, const char *field) ###
+
+Checks that the field named `field` of the table at index `ud` is a
+string. Returns a pointer to the string on success. The function raises a
+Lua error on failure.
+
+<a name="luat_getfieldcheckboolean"/>
+### int luaT_getfieldcheckboolean(lua_State *L, int ud, const char *field) ###
+
+Checks that the field named `field` of the table at index `ud` is a
+boolean. On success, returns 1 if the boolean is `true`, 0 if it is
+`false`. The function raises a Lua error on failure.
+
+<a name="luat_getfieldchecktable"/>
+### void luaT_getfieldchecktable(lua_State *L, int ud, const char *field) ###
+
+Checks that the field named `field` of the table at index `ud` is a
+table. On success, push the table on the stack. The function raises a Lua
+error on failure.
+
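+For instance, reading the fields of a table argument at stack index 1 (the
+field and class names are illustrative):
+
+    int n          = luaT_getfieldcheckint(L, 1, "n");
+    const char *id = luaT_getfieldcheckstring(L, 1, "id");
+    Bar *data      = luaT_getfieldcheckudata(L, 1, "data", "foo.Bar");
+
+Note that each of these calls leaves the retrieved field on the stack.
+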
+<a name="luat_typerror"/>
+### int luaT_typerror(lua_State *L, int ud, const char *tname) ###
+
+Raises a `luaL_argerror` (and returns its value), claiming that the
+object at index `ud` on the stack is not of type `tname`. Note that
+this function does not check the type, it only raises an error.
+
+<a name="luat_checkboolean"/>
+### int luaT_checkboolean(lua_State *L, int ud) ###
+
+Checks that the value at index `ud` is a boolean. On success, returns 1
+if the boolean is `true`, 0 if it is `false`. The function raises a Lua
+error on failure.
+
+<a name="luat_optboolean"/>
+### int luaT_optboolean(lua_State *L, int ud, int def) ###
+
+Checks that the value at index `ud` is a boolean. On success, returns 1
+if the boolean is `true`, 0 if it is `false`. If there is no value at
+index `ud`, returns `def`. In any other cases, raises an error.
+
+<a name="luat_registeratname"/>
+### void luaT_registeratname(lua_State *L, const struct luaL_Reg *methods, const char *name) ###
+
+This function assumes a table is on the stack. It creates a table field
+`name` in the table (if this field does not exist yet), and fills up
+`methods` in this table field.
+
+<a name="luat_classrootname"/>
+### const char *luaT_classrootname(const char *tname) ###
+
+Assuming `tname` is of the form `modulename.classname`, returns
+`classname`. The returned value shall not be freed. It is a pointer
+inside `tname` string.
+
+<a name="luat_classmodulename"/>
+### int luaT_classmodulename(const char *tname, char *module_name) ###
+
+Assuming `tname` is of the form `modulename.classname`, copies `modulename`
+into the caller-provided buffer `module_name`, which must be at least as large
+as `tname`. Returns 1 if `tname` actually contains a module part, 0 otherwise.
+
+<a name="luat_stackdump"/>
+### void luaT_stackdump(lua_State *L) ###
+
+This function prints out the state of the Lua stack. It is useful for
+debugging purposes.
+
diff --git a/nerv/luaT/luaT.c b/nerv/luaT/luaT.c
new file mode 100644
index 0000000..7b85ce3
--- /dev/null
+++ b/nerv/luaT/luaT.c
@@ -0,0 +1,1079 @@
+#include <stdlib.h>
+#include <string.h>
+
+#include "luaT.h"
+
+void* luaT_alloc(lua_State *L, long size)
+{
+ void *ptr;
+
+ if(size == 0)
+ return NULL;
+
+ if(size < 0)
+ luaL_error(L, "$ Torch: invalid memory size -- maybe an overflow?");
+
+ ptr = malloc(size);
+ if(!ptr)
+ luaL_error(L, "$ Torch: not enough memory: you tried to allocate %dGB. Buy new RAM!", size/1073741824);
+
+ return ptr;
+}
+
+void* luaT_realloc(lua_State *L, void *ptr, long size)
+{
+ if(!ptr)
+ return(luaT_alloc(L, size));
+
+ if(size == 0)
+ {
+ luaT_free(L, ptr);
+ return NULL;
+ }
+
+ if(size < 0)
+ luaL_error(L, "$ Torch: invalid memory size -- maybe an overflow?");
+
+ ptr = realloc(ptr, size);
+ if(!ptr)
+ luaL_error(L, "$ Torch: not enough memory: you tried to reallocate %dGB. Buy new RAM!", size/1073741824);
+ return ptr;
+}
+
+void luaT_free(lua_State *L, void *ptr)
+{
+ free(ptr);
+}
+
+void luaT_stackdump(lua_State *L)
+{
+ int i;
+ const char *tname = NULL;
+ int top = lua_gettop(L);
+ for(i = 1; i <= top; i++)
+ {
+ int t = lua_type(L, i);
+ printf("%3d. ", i);
+ switch(t)
+ {
+ case LUA_TSTRING:
+ printf("'%s'", lua_tostring(L,i));
+ break;
+ case LUA_TBOOLEAN:
+ printf(lua_toboolean(L, i) ? "true" : "false");
+ break;
+ case LUA_TNUMBER:
+ printf("%g", lua_tonumber(L,i));
+ break;
+ case LUA_TUSERDATA:
+ tname = luaT_typename(L, i);
+ printf("userdata %lx [%s]", (long)lua_topointer(L, i), (tname ? tname : "not a Torch object"));
+ break;
+      case 10: /* LuaJIT cdata (LUA_TCDATA) */
+ tname = luaT_typename(L, i);
+ printf("cdata %lx [%s]", (long)lua_topointer(L, i), (tname ? tname : "not a Torch object"));
+ break;
+ case LUA_TTABLE:
+ lua_pushvalue(L, i);
+ lua_rawget(L, LUA_REGISTRYINDEX);
+ if(lua_isstring(L, -1))
+ tname = lua_tostring(L, -1); /*luaT_typenameid(L, lua_tostring(L, -1)); */
+ else
+ tname = NULL;
+ lua_pop(L, 1);
+ if(tname)
+ printf("metatable [%s]", tname);
+ else
+ {
+ tname = luaT_typename(L, i);
+ printf("table %lx [%s]", (long)lua_topointer(L, i), (tname ? tname : "not a Torch object"));
+ }
+ break;
+ default:
+ printf("Lua object type: %s", lua_typename(L,t));
+ break;
+ }
+ printf("\n");
+ }
+ printf("---------------------------------------------\n");
+}
+
+/* metatable operator methods */
+static int luaT_mt__index(lua_State *L);
+static int luaT_mt__newindex(lua_State *L);
+static int luaT_mt__tostring(lua_State *L);
+static int luaT_mt__add(lua_State *L);
+static int luaT_mt__sub(lua_State *L);
+static int luaT_mt__mul(lua_State *L);
+static int luaT_mt__div(lua_State *L);
+static int luaT_mt__mod(lua_State *L);
+static int luaT_mt__pow(lua_State *L);
+static int luaT_mt__unm(lua_State *L);
+static int luaT_mt__concat(lua_State *L);
+static int luaT_mt__len(lua_State *L);
+static int luaT_mt__eq(lua_State *L);
+static int luaT_mt__lt(lua_State *L);
+static int luaT_mt__le(lua_State *L);
+static int luaT_mt__call(lua_State *L);
+
+/* Constructor-metatable methods */
+static int luaT_cmt__call(lua_State *L);
+static int luaT_cmt__newindex(lua_State *L);
+
+const char* luaT_newmetatable(lua_State *L, const char *tname, const char *parenttname,
+ lua_CFunction constructor, lua_CFunction destructor, lua_CFunction factory)
+{
+ lua_pushcfunction(L, luaT_lua_newmetatable);
+ lua_pushstring(L, tname);
+ (parenttname ? lua_pushstring(L, parenttname) : lua_pushnil(L));
+ (constructor ? lua_pushcfunction(L, constructor) : lua_pushnil(L));
+ (destructor ? lua_pushcfunction(L, destructor) : lua_pushnil(L));
+ (factory ? lua_pushcfunction(L, factory) : lua_pushnil(L));
+ lua_call(L, 5, 1);
+ return luaT_typenameid(L, tname);
+}
+
+int luaT_pushmetatable(lua_State *L, const char *tname)
+{
+ lua_getfield(L, LUA_REGISTRYINDEX, tname);
+ if(lua_isnil(L, -1))
+ {
+ lua_pop(L, 1);
+ return 0;
+ }
+ return 1;
+}
+
+const char *luaT_typenameid(lua_State *L, const char *tname)
+{
+ if(luaT_pushmetatable(L, tname))
+ {
+ const char *tnameid = NULL;
+ lua_rawget(L, LUA_REGISTRYINDEX);
+ if(lua_isstring(L, -1))
+ tnameid = lua_tostring(L, -1);
+ lua_pop(L, 1); /* the string/nil */
+ return tnameid;
+ }
+ return NULL;
+}
+
+static const char cdataname[] = ""
+ "local _, ffi = pcall(require, 'ffi')\n"
+ "if ffi then\n"
+ " local id2name = {}\n"
+ " return function(cdata, name)\n"
+ " local id = tonumber(ffi.typeof(cdata))\n"
+ " if id then\n"
+ " if name then\n"
+ " id2name[id] = name\n"
+ " return name\n"
+ " else\n"
+ " return rawget(id2name, id)\n"
+ " end\n"
+ " end\n"
+ " return nil\n"
+ " end\n"
+ "else\n"
+ " return function() end\n"
+ "end\n";
+
+static const char* luaT_cdataname(lua_State *L, int ud, const char *tname)
+{
+ lua_pushstring(L, "__cdataname");
+ lua_rawget(L, LUA_REGISTRYINDEX);
+ if(lua_isnil(L,-1))
+ {
+ lua_pop(L, 1);
+
+ if(luaL_dostring(L, cdataname)) /* did something go wrong? */
+ luaL_error(L, "internal error (could not load cdataname): %s", lua_tostring(L, -1));
+
+ lua_pushstring(L, "__cdataname");
+ lua_pushvalue(L, -2);
+ lua_rawset(L, LUA_REGISTRYINDEX);
+ }
+ if(!lua_isfunction(L, -1)) /* should not happen */
+ luaL_error(L, "internal error (cdataname is not a function)");
+
+ lua_pushvalue(L, ud);
+ if(tname)
+ lua_pushstring(L, tname);
+ if(lua_pcall(L, (tname ? 2 : 1), 1, 0))
+ luaL_error(L, "internal error (cdataname): %s", lua_tostring(L, -1));
+
+ tname = lua_tostring(L, -1);
+ lua_pop(L, 1);
+
+ return tname;
+}
+
+const char* luaT_typename(lua_State *L, int ud)
+{
+  if(lua_type(L, ud) == 10) /* LuaJIT cdata (LUA_TCDATA) */
+ return luaT_cdataname(L, ud, NULL);
+ else if(lua_getmetatable(L, ud))
+ {
+ const char *tname = NULL;
+ lua_rawget(L, LUA_REGISTRYINDEX);
+ if(lua_isstring(L, -1))
+ tname = lua_tostring(L, -1);
+ lua_pop(L, 1); /* the string/nil */
+ return tname;
+ }
+ return NULL;
+}
+
+void luaT_pushudata(lua_State *L, void *udata, const char *tname)
+{
+ if(udata)
+ {
+ void **udata_p = lua_newuserdata(L, sizeof(void*));
+ *udata_p = udata;
+ if(!luaT_pushmetatable(L, tname))
+ luaL_error(L, "Torch internal problem: cannot find metatable for type <%s>", tname);
+ lua_setmetatable(L, -2);
+ }
+ else
+ lua_pushnil(L);
+}
+
+void *luaT_toudata(lua_State *L, int ud, const char *tname)
+{
+ void **p = lua_touserdata(L, ud);
+ if(p != NULL) /* value is a userdata? */
+ {
+ if(!luaT_pushmetatable(L, tname))
+ luaL_error(L, "Torch internal problem: cannot find metatable for type <%s>", tname);
+
+ /* initialize the table we want to get the metatable on */
+ /* note that we have to be careful with indices, as we just inserted stuff */
+ lua_pushvalue(L, (ud < 0 ? ud - 1 : ud));
+ while(lua_getmetatable(L, -1)) /* get the next metatable */
+ {
+ lua_remove(L, -2); /* remove the previous metatable [or object, if first time] */
+ if(lua_rawequal(L, -1, -2))
+ {
+ lua_pop(L, 2); /* remove the two metatables */
+ return *p;
+ }
+ }
+ lua_pop(L, 2); /* remove the two metatables */
+ }
+ return NULL;
+}
+
+int luaT_isudata(lua_State *L, int ud, const char *tname)
+{
+ if(luaT_toudata(L, ud, tname))
+ return 1;
+ else
+ return 0;
+}
+
+void *luaT_checkudata(lua_State *L, int ud, const char *tname)
+{
+ void *p = luaT_toudata(L, ud, tname);
+ if(!p)
+ luaT_typerror(L, ud, tname);
+ return p;
+}
+
+void *luaT_getfieldcheckudata(lua_State *L, int ud, const char *field, const char *tname)
+{
+ void *p;
+ lua_getfield(L, ud, field);
+ if(lua_isnil(L, -1))
+ luaL_error(L, "bad argument #%d (field %s does not exist)", ud, field);
+ p = luaT_toudata(L, -1, tname);
+ if(!p)
+ luaL_error(L, "bad argument #%d (field %s is not a %s)", ud, field, tname);
+ return p;
+}
+
+void *luaT_getfieldchecklightudata(lua_State *L, int ud, const char *field)
+{
+ void *p;
+ lua_getfield(L, ud, field);
+ if(lua_isnil(L, -1))
+ luaL_error(L, "bad argument #%d (field %s does not exist)", ud, field);
+
+ if(!lua_islightuserdata(L, -1))
+ luaL_error(L, "bad argument #%d (field %s is not a light userdata)", ud, field);
+
+ p = lua_touserdata(L, -1);
+
+ return p;
+}
+
+double luaT_getfieldchecknumber(lua_State *L, int ud, const char *field)
+{
+ lua_getfield(L, ud, field);
+ if(lua_isnil(L, -1))
+ luaL_error(L, "bad argument #%d (field %s does not exist)", ud, field);
+ if(!lua_isnumber(L, -1))
+ luaL_error(L, "bad argument #%d (field %s is not a number)", ud, field);
+ return lua_tonumber(L, -1);
+}
+
+int luaT_getfieldcheckint(lua_State *L, int ud, const char *field)
+{
+ lua_getfield(L, ud, field);
+ if(lua_isnil(L, -1))
+ luaL_error(L, "bad argument #%d (field %s does not exist)", ud, field);
+ if(!lua_isnumber(L, -1))
+ luaL_error(L, "bad argument #%d (field %s is not a number)", ud, field);
+ return (int)lua_tonumber(L, -1);
+}
+
+const char* luaT_getfieldcheckstring(lua_State *L, int ud, const char *field)
+{
+ lua_getfield(L, ud, field);
+ if(lua_isnil(L, -1))
+ luaL_error(L, "bad argument #%d (field %s does not exist)", ud, field);
+ if(!lua_isstring(L, -1))
+ luaL_error(L, "bad argument #%d (field %s is not a string)", ud, field);
+ return lua_tostring(L, -1);
+}
+
+int luaT_getfieldcheckboolean(lua_State *L, int ud, const char *field)
+{
+ lua_getfield(L, ud, field);
+ if(lua_isnil(L, -1))
+ luaL_error(L, "bad argument #%d (field %s does not exist)", ud, field);
+ if(!lua_isboolean(L, -1))
+ luaL_error(L, "bad argument #%d (field %s is not a boolean)", ud, field);
+ return lua_toboolean(L, -1);
+}
+
+void luaT_getfieldchecktable(lua_State *L, int ud, const char *field)
+{
+ lua_getfield(L, ud, field);
+ if(lua_isnil(L, -1))
+ luaL_error(L, "bad argument #%d (field %s does not exist)", ud, field);
+ if(!lua_istable(L, -1))
+ luaL_error(L, "bad argument #%d (field %s is not a table)", ud, field);
+}
+
+/**** type checks as in luaL ****/
+int luaT_typerror(lua_State *L, int ud, const char *tname)
+{
+ const char *msg;
+ const char *tnameud = luaT_typename(L, ud);
+
+ if(!tnameud)
+ tnameud = lua_typename(L, lua_type(L, ud)); /* lua_typename expects a type tag, not a stack index */
+
+ msg = lua_pushfstring(L, "%s expected, got %s",
+ tname,
+ (tnameud ? tnameud : "unknown object"));
+
+ return luaL_argerror(L, ud, msg);
+}
+
+int luaT_checkboolean(lua_State *L, int ud)
+{
+ if(!lua_isboolean(L, ud))
+ luaT_typerror(L, ud, lua_typename(L, LUA_TBOOLEAN));
+ return lua_toboolean(L, ud);
+}
+
+int luaT_optboolean(lua_State *L, int ud, int def)
+{
+ if(lua_isnoneornil(L,ud))
+ return def;
+
+ return luaT_checkboolean(L, ud);
+}
+
+void luaT_registeratname(lua_State *L, const struct luaL_Reg *methods, const char *name)
+{
+ int idx = lua_gettop(L);
+
+ luaL_checktype(L, idx, LUA_TTABLE);
+ lua_pushstring(L, name);
+ lua_rawget(L, idx);
+
+ if(lua_isnil(L, -1))
+ {
+ lua_pop(L, 1);
+ lua_pushstring(L, name);
+ lua_newtable(L);
+ lua_rawset(L, idx);
+
+ lua_pushstring(L, name);
+ lua_rawget(L, idx);
+ }
+
+ luaL_register(L, NULL, methods);
+ lua_pop(L, 1);
+}
+
+
+/* utility functions */
+const char *luaT_classrootname(const char *tname)
+{
+ int i;
+ int sz = strlen(tname);
+
+ for(i = 0; i < sz; i++)
+ {
+ if(tname[i] == '.')
+ return tname+i+1;
+ }
+ return tname;
+}
+
+/* module_name must be a buffer at least as big as tname
+ * return true if the class is part of a module */
+int luaT_classmodulename(const char *tname, char *module_name)
+{
+ char chars[] = {'.', '\0'};
+ size_t n;
+ n = strcspn(tname, chars);
+ strncpy(module_name, tname, n);
+ module_name[n] = '\0';
+ return tname[n] == '.';
+}
+
+/* Lua only functions */
+int luaT_lua_newmetatable(lua_State *L)
+{
+ const char* tname = luaL_checkstring(L, 1);
+ char module_name[256];
+ int is_in_module = 0;
+ is_in_module = luaT_classmodulename(tname, module_name);
+
+ lua_settop(L, 5);
+ luaL_argcheck(L, lua_isnoneornil(L, 2) || lua_isstring(L, 2), 2, "parent class name or nil expected");
+ luaL_argcheck(L, lua_isnoneornil(L, 3) || lua_isfunction(L, 3), 3, "constructor function or nil expected");
+ luaL_argcheck(L, lua_isnoneornil(L, 4) || lua_isfunction(L, 4), 4, "destructor function or nil expected");
+ luaL_argcheck(L, lua_isnoneornil(L, 5) || lua_isfunction(L, 5), 5, "factory function or nil expected");
+
+ if(is_in_module)
+ lua_getfield(L, LUA_GLOBALSINDEX, module_name);
+ else
+ lua_pushvalue(L, LUA_GLOBALSINDEX);
+ if(!lua_istable(L, 6))
+ luaL_error(L, "while creating metatable %s: bad argument #1 (%s is an invalid module name)", tname, module_name);
+
+ /* we first create the new metaclass if we have to */
+ if(!luaT_pushmetatable(L, tname))
+ {
+ /* create the metatable */
+ lua_newtable(L);
+
+ /* registry[name] = metatable */
+ lua_pushvalue(L, -1);
+ lua_setfield(L, LUA_REGISTRYINDEX, tname);
+
+ /* registry[metatable] = tname */
+ lua_pushvalue(L, -1);
+ lua_pushstring(L, tname);
+ lua_rawset(L, LUA_REGISTRYINDEX);
+
+ /* __index handling */
+ lua_pushcfunction(L, luaT_mt__index);
+ lua_setfield(L, -2, "__index");
+
+ /* __newindex handling */
+ lua_pushcfunction(L, luaT_mt__newindex);
+ lua_setfield(L, -2, "__newindex");
+
+ /* __typename contains the typename */
+ lua_pushstring(L, tname);
+ lua_setfield(L, -2, "__typename");
+
+ /* __metatable is self */
+ lua_pushvalue(L, -1);
+ lua_setfield(L, -2, "__metatable");
+
+ /* by default, __version equals 1 */
+ lua_pushnumber(L, 1);
+ lua_setfield(L, -2, "__version");
+
+ /* assign default operator functions */
+ lua_pushcfunction(L, luaT_mt__tostring);
+ lua_setfield(L, -2, "__tostring");
+
+ lua_pushcfunction(L, luaT_mt__add);
+ lua_setfield(L, -2, "__add");
+
+ lua_pushcfunction(L, luaT_mt__sub);
+ lua_setfield(L, -2, "__sub");
+
+ lua_pushcfunction(L, luaT_mt__mul);
+ lua_setfield(L, -2, "__mul");
+
+ lua_pushcfunction(L, luaT_mt__div);
+ lua_setfield(L, -2, "__div");
+
+ lua_pushcfunction(L, luaT_mt__mod);
+ lua_setfield(L, -2, "__mod");
+
+ lua_pushcfunction(L, luaT_mt__pow);
+ lua_setfield(L, -2, "__pow");
+
+ lua_pushcfunction(L, luaT_mt__unm);
+ lua_setfield(L, -2, "__unm");
+
+ lua_pushcfunction(L, luaT_mt__concat);
+ lua_setfield(L, -2, "__concat");
+
+ lua_pushcfunction(L, luaT_mt__len);
+ lua_setfield(L, -2, "__len");
+
+ lua_pushcfunction(L, luaT_mt__eq);
+ lua_setfield(L, -2, "__eq");
+
+ lua_pushcfunction(L, luaT_mt__lt);
+ lua_setfield(L, -2, "__lt");
+
+ lua_pushcfunction(L, luaT_mt__le);
+ lua_setfield(L, -2, "__le");
+
+ lua_pushcfunction(L, luaT_mt__call);
+ lua_setfield(L, -2, "__call");
+ }
+
+ /* we assign the parent class if necessary */
+ if(!lua_isnoneornil(L, 2))
+ {
+ if(lua_getmetatable(L, -1))
+ luaL_error(L, "class %s has already been assigned a parent class\n", tname);
+ else
+ {
+ const char* parenttname = luaL_checkstring(L, 2);
+ if(!luaT_pushmetatable(L, parenttname))
+ luaL_error(L, "bad argument #2 (invalid parent class name %s)", parenttname);
+ lua_setmetatable(L, -2);
+ }
+ }
+
+ /* register the destructor function */
+ if(!lua_isnoneornil(L, 4))
+ {
+ /* does it exist already? */
+ lua_pushstring(L, "__gc");
+ lua_rawget(L, -2);
+
+ if(lua_isnil(L, -1))
+ {
+ lua_pop(L, 1); /* pop nil */
+ lua_pushstring(L, "__gc");
+ lua_pushvalue(L, 4);
+ lua_rawset(L, -3);
+ }
+ else
+ luaL_error(L, "%s has already been assigned a destructor", tname);
+ }
+
+ /* register the factory function */
+ if(!lua_isnoneornil(L, 5))
+ {
+ /* does it exist already? */
+ lua_pushstring(L, "__factory");
+ lua_rawget(L, -2);
+
+ if(lua_isnil(L, -1))
+ {
+ lua_pop(L, 1); /* pop nil */
+ lua_pushstring(L, "__factory");
+ lua_pushvalue(L, 5);
+ lua_rawset(L, -3);
+ }
+ else
+ luaL_error(L, "%s has already been assigned a factory", tname);
+ }
+
+ /******** Constructor table and metatable ********/
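+ /* The constructor table is what gets exposed as module.Name: indexing it
+ * falls through to the class metatable (so class methods are reachable),
+ * assigning to it adds methods via luaT_cmt__newindex, and calling it
+ * invokes the registered constructor via luaT_cmt__call. */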
+ lua_pushstring(L, "__constructor");
+ lua_rawget(L, -2);
+ if(lua_isnil(L, -1))
+ {
+ lua_pop(L, 1); /* pop nil */
+ lua_newtable(L); /* fancy table */
+ lua_newtable(L); /* fancy metatable */
+
+ lua_pushvalue(L, -3); /* metatable */
+ lua_setfield(L, -2, "__index"); /* so we can get the methods */
+
+ lua_pushcfunction(L, luaT_cmt__newindex);
+ lua_setfield(L, -2, "__newindex"); /* so we add new methods */
+
+ lua_pushcfunction(L, luaT_cmt__call);
+ lua_setfield(L, -2, "__call"); /* so we can construct objects; that is what this table is for */
+
+ lua_pushvalue(L, -3);
+ lua_setfield(L, -2, "__metatable"); /* redirect to metatable with methods */
+
+ lua_setmetatable(L, -2); /* constructor metatable is ... this fancy metatable */
+
+ /* set metatable[__constructor] = constructor-metatable */
+ lua_pushstring(L, "__constructor");
+ lua_pushvalue(L, -2);
+ lua_rawset(L, -4);
+ }
+
+ /* register the constructor function */
+ if(!lua_isnoneornil(L, 3))
+ {
+ /* get constructor metatable */
+ lua_getmetatable(L, -1);
+
+ /* does it exist already? */
+ lua_pushstring(L, "__new");
+ lua_rawget(L, -2);
+
+ if(lua_isnil(L, -1))
+ {
+ lua_pop(L, 1); /* pop nil */
+ lua_pushstring(L, "__new");
+ lua_pushvalue(L, 3);
+ lua_rawset(L, -3);
+
+ /* set "new" in the metatable too */
+ lua_pushstring(L, "new");
+ lua_pushvalue(L, 3);
+ lua_rawset(L, -5);
+ }
+ else
+ luaL_error(L, "%s has already been assigned a constructor", tname);
+
+ /* pop constructor metatable */
+ lua_pop(L, 1);
+ }
+
+ /* module.name = constructor table */
+ lua_setfield(L, 6, luaT_classrootname(tname));
+
+ return 1; /* returns the metatable */
+}
+
+/* Lua only utility functions */
+
+/* add any custom type, provided the object has a metatable */
+int luaT_lua_metatype(lua_State *L)
+{
+ if( (lua_gettop(L) != 2) && (lua_gettop(L) != 3) )
+ luaL_error(L, "expecting: string table [ctype]");
+
+ luaL_checkstring(L, 1);
+ luaL_checktype(L, 2, LUA_TTABLE);
+
+ if(lua_gettop(L) == 3)
+ {
+ if(!luaT_cdataname(L, 3, lua_tostring(L, 1)))
+ luaL_error(L, "could not register cdata type -- missing ffi library?");
+ }
+
+ /* registry[name] = metatable */
+ lua_pushvalue(L, 1);
+ lua_pushvalue(L, 2);
+ lua_rawset(L, LUA_REGISTRYINDEX);
+
+ /* registry[metatable] = tname */
+ lua_pushvalue(L, 2);
+ lua_pushvalue(L, 1);
+ lua_rawset(L, LUA_REGISTRYINDEX);
+
+ return 0;
+}
+
+/* return a userdata from a C pointer */
+/* be sure you know what you are doing */
+int luaT_lua_pushudata(lua_State *L)
+{
+ void *udata = NULL;
+ const char *tname = luaL_checkstring(L, 2);
+
+ if(lua_type(L, 1) == 10)
+ udata = *((void**)lua_topointer(L, 1));
+ else if(lua_isnumber(L, 1))
+ udata = (void*)(long)lua_tonumber(L, 1);
+ else
+ luaL_argerror(L, 1, "expecting number or cdata");
+
+ luaT_pushudata(L, udata, tname);
+
+ return 1;
+}
+
+int luaT_lua_factory(lua_State *L)
+{
+ const char* tname = luaL_checkstring(L, 1);
+ if(luaT_pushmetatable(L, tname) && !lua_isnil(L, -1))
+ {
+ lua_pushstring(L, "__factory");
+ lua_rawget(L, -2);
+ }
+ else
+ {
+ lua_pushnil(L);
+ }
+ return 1;
+}
+
+int luaT_lua_getconstructortable(lua_State *L)
+{
+ const char* tname = luaL_checkstring(L, 1);
+ if(luaT_pushmetatable(L, tname))
+ {
+ lua_pushstring(L, "__constructor");
+ lua_rawget(L, -2);
+ return 1;
+ }
+ return 0;
+}
+
+
+int luaT_lua_typename(lua_State *L)
+{
+ const char* tname = NULL;
+ luaL_checkany(L, 1);
+ if((tname = luaT_typename(L, 1)))
+ {
+ lua_pushstring(L, tname);
+ return 1;
+ }
+ return 0;
+}
+
+int luaT_lua_isequal(lua_State *L)
+{
+ if(lua_isuserdata(L, 1) && lua_isuserdata(L, 2))
+ {
+ void **u1, **u2;
+ luaL_argcheck(L, luaT_typename(L, 1), 1, "Torch object expected");
+ luaL_argcheck(L, luaT_typename(L, 2), 2, "Torch object expected");
+
+ u1 = lua_touserdata(L, 1);
+ u2 = lua_touserdata(L, 2);
+ if(*u1 == *u2)
+ lua_pushboolean(L, 1);
+ else
+ lua_pushboolean(L, 0);
+ }
+ else if(lua_istable(L, 1) && lua_istable(L, 2))
+ lua_pushboolean(L, lua_rawequal(L, 1, 2));
+ else
+ lua_pushboolean(L, 0);
+ return 1;
+}
+
+int luaT_lua_pointer(lua_State *L)
+{
+ if(lua_isuserdata(L, 1))
+ {
+ void **ptr;
+ luaL_argcheck(L, luaT_typename(L, 1), 1, "Torch object expected");
+ ptr = lua_touserdata(L, 1);
+ lua_pushnumber(L, (long)(*ptr));
+ return 1;
+ }
+ else if(lua_istable(L, 1) || lua_isthread(L, 1) || lua_isfunction(L, 1))
+ {
+ const void* ptr = lua_topointer(L, 1);
+ lua_pushnumber(L, (long)(ptr));
+ return 1;
+ }
+ else if(lua_type(L, 1) == 10) /* cdata */
+ {
+ /* we want the pointer held by the cdata, */
+ /* not the pointer to the cdata object itself */
+ const void* ptr = *((void**)lua_topointer(L, 1));
+ lua_pushnumber(L, (long)(ptr));
+ return 1;
+ }
+ else if(lua_isstring(L, 1))
+ {
+ const char* ptr = lua_tostring(L, 1);
+ lua_pushnumber(L, (long)(ptr));
+ return 1;
+ }
+ else
+ luaL_error(L, "Torch object, table, thread, cdata or function expected");
+
+ return 0;
+}
+
+int luaT_lua_setenv(lua_State *L)
+{
+ if(!lua_isfunction(L, 1) && !lua_isuserdata(L, 1))
+ luaL_typerror(L, 1, "function or userdata");
+ luaL_checktype(L, 2, LUA_TTABLE);
+ lua_setfenv(L, 1);
+ return 0;
+}
+
+int luaT_lua_getenv(lua_State *L)
+{
+ if(!lua_isfunction(L, 1) && !lua_isuserdata(L, 1))
+ luaL_typerror(L, 1, "function or userdata");
+ lua_getfenv(L, 1);
+ return 1;
+}
+
+int luaT_lua_getmetatable(lua_State *L)
+{
+ const char *tname = luaL_checkstring(L, 1);
+ if(luaT_pushmetatable(L, tname))
+ return 1;
+ return 0;
+}
+
+int luaT_lua_version(lua_State *L)
+{
+ luaL_checkany(L, 1);
+
+ if(lua_type(L, 1) == 10)
+ {
+ const char *tname = luaT_cdataname(L, 1, NULL);
+ if(tname)
+ {
+ luaT_pushmetatable(L, tname);
+ lua_pushstring(L, "__version");
+ lua_rawget(L, -2);
+ return 1;
+ }
+ return 0;
+ }
+ else if(lua_getmetatable(L, 1))
+ {
+ lua_pushstring(L, "__version");
+ lua_rawget(L, -2);
+ return 1;
+ }
+ return 0;
+}
+
+int luaT_lua_setmetatable(lua_State *L)
+{
+ const char *tname = luaL_checkstring(L, 2);
+ luaL_checktype(L, 1, LUA_TTABLE);
+
+ if(!luaT_pushmetatable(L, tname))
+ luaL_error(L, "unknown typename %s\n", tname);
+ lua_setmetatable(L, 1);
+
+ return 1;
+}
+
+/* metatable operator methods */
+static int luaT_mt__index(lua_State *L)
+{
+ if(!lua_getmetatable(L, 1))
+ luaL_error(L, "critical internal indexing error: no metatable found");
+
+ if(!lua_istable(L, -1))
+ luaL_error(L, "critical internal indexing error: not a metatable");
+
+ /* test for __index__ method first */
+ lua_getfield(L, -1, "__index__");
+ if(!lua_isnil(L, -1))
+ {
+ int result;
+
+ if(!lua_isfunction(L, -1))
+ luaL_error(L, "critical internal indexing error: __index__ is not a function");
+
+ lua_pushvalue(L, 1);
+ lua_pushvalue(L, 2);
+
+ lua_call(L, 2, LUA_MULTRET); /* DEBUG: risky: it really should return exactly 1 or 2 values... */
+
+ result = lua_toboolean(L, -1);
+ lua_pop(L, 1);
+
+ if(result)
+ return 1;
+
+ /* on the stack: 1. the object 2. the value 3. the metatable */
+ /* apparently, __index wants only one element returned */
+ /* return lua_gettop(L)-3; */
+
+ }
+ else
+ lua_pop(L, 1); /* remove nil __index__ on the stack */
+
+ lua_pushvalue(L, 2);
+ lua_gettable(L, -2);
+
+ return 1;
+}
+
+static int luaT_mt__newindex(lua_State *L)
+{
+ if(!lua_getmetatable(L, 1))
+ luaL_error(L, "critical internal indexing error: no metatable found");
+
+ if(!lua_istable(L, -1))
+ luaL_error(L, "critical internal indexing error: not a metatable");
+
+ /* test for __newindex__ method first */
+ lua_getfield(L, -1, "__newindex__");
+ if(!lua_isnil(L, -1))
+ {
+ int result;
+
+ if(!lua_isfunction(L, -1))
+ luaL_error(L, "critical internal indexing error: __newindex__ is not a function");
+
+ lua_pushvalue(L, 1);
+ lua_pushvalue(L, 2);
+ lua_pushvalue(L, 3);
+
+ lua_call(L, 3, 1); /* DEBUG: risky: it really must return something */
+
+ result = lua_toboolean(L, -1);
+ lua_pop(L, 1);
+
+ if(result)
+ return 0;
+ }
+ else
+ lua_pop(L, 1); /* remove nil __newindex__ on the stack */
+
+ lua_pop(L, 1); /* pop the metatable */
+ if(lua_istable(L, 1))
+ lua_rawset(L, 1);
+ else
+ luaL_error(L, "the class %s cannot be indexed", luaT_typename(L, 1));
+
+ return 0;
+}
+
+/* note: the __NAME__ handler is looked up in the metatable, hence the getmetatable call is necessary */
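+/* MT_DECLARE_OPERATOR(NAME, NIL_BEHAVIOR) generates luaT_mt__NAME: it looks
+ * up a user-supplied __NAME__ handler in the object's metatable and, if it
+ * is a function, calls it with the original arguments and returns all of
+ * its results; if no handler is set it falls back to NIL_BEHAVIOR (an error
+ * for most operators, a sensible default for tostring and eq). */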
+#define MT_DECLARE_OPERATOR(NAME, NIL_BEHAVIOR) \
+ int luaT_mt__##NAME(lua_State *L) \
+ { \
+ if(!lua_getmetatable(L, 1)) \
+ luaL_error(L, "internal error in __" #NAME ": no metatable"); \
+ \
+ lua_getfield(L, -1, "__" #NAME "__"); \
+ if(lua_isnil(L, -1)) \
+ { \
+ NIL_BEHAVIOR; \
+ } \
+ else \
+ { \
+ if(lua_isfunction(L, -1)) \
+ { \
+ lua_insert(L, 1); /* insert function */ \
+ lua_pop(L, 1); /* remove metatable */ \
+ lua_call(L, lua_gettop(L)-1, LUA_MULTRET); /* we return the result of the call */ \
+ return lua_gettop(L); \
+ } \
+ /* __NAME__ exists but is not a function: nothing is returned */ \
+ } \
+ return 0; \
+ }
+
+MT_DECLARE_OPERATOR(tostring,
+ lua_pushstring(L, luaT_typename(L, 1));
+ return 1;)
+MT_DECLARE_OPERATOR(add, luaL_error(L, "%s has no addition operator", luaT_typename(L, 1)))
+MT_DECLARE_OPERATOR(sub, luaL_error(L, "%s has no subtraction operator", luaT_typename(L, 1)))
+MT_DECLARE_OPERATOR(mul, luaL_error(L, "%s has no multiplication operator", luaT_typename(L, 1)))
+MT_DECLARE_OPERATOR(div, luaL_error(L, "%s has no division operator", luaT_typename(L, 1)))
+MT_DECLARE_OPERATOR(mod, luaL_error(L, "%s has no modulo operator", luaT_typename(L, 1)))
+MT_DECLARE_OPERATOR(pow, luaL_error(L, "%s has no power operator", luaT_typename(L, 1)))
+MT_DECLARE_OPERATOR(unm, luaL_error(L, "%s has no negation operator", luaT_typename(L, 1)))
+MT_DECLARE_OPERATOR(concat, luaL_error(L, "%s has no concat operator", luaT_typename(L, 1)))
+MT_DECLARE_OPERATOR(len, luaL_error(L, "%s has no length operator", luaT_typename(L, 1)))
+MT_DECLARE_OPERATOR(eq,
+ lua_settop(L, 2);
+ lua_pushcfunction(L, luaT_lua_isequal);
+ lua_insert(L, 1);
+ lua_call(L, 2, 1);
+ return 1;)
+MT_DECLARE_OPERATOR(lt, luaL_error(L, "%s has no less-than operator", luaT_typename(L, 1)))
+MT_DECLARE_OPERATOR(le, luaL_error(L, "%s has no less-than-or-equal operator", luaT_typename(L, 1)))
+MT_DECLARE_OPERATOR(call, luaL_error(L, "%s has no call operator", luaT_typename(L, 1)))
+
+
+/* constructor metatable methods */
+int luaT_cmt__call(lua_State *L)
+{
+ if(!lua_istable(L, 1))
+ luaL_error(L, "internal error in __call: not a constructor table");
+
+ if(!lua_getmetatable(L, 1))
+ luaL_error(L, "internal error in __call: no metatable available");
+
+ lua_pushstring(L, "__new");
+ lua_rawget(L, -2);
+
+ if(lua_isnil(L, -1))
+ luaL_error(L, "no constructor available");
+
+ lua_remove(L, 1); /* remove constructor table */
+ lua_insert(L, 1); /* insert constructor */
+ lua_pop(L, 1); /* remove fancy metatable */
+
+ lua_call(L, lua_gettop(L)-1, LUA_MULTRET);
+ return lua_gettop(L);
+}
+
+int luaT_cmt__newindex(lua_State *L)
+{
+ if(!lua_istable(L, 1))
+ luaL_error(L, "internal error in __newindex: not a constructor table");
+
+ if(!lua_getmetatable(L, 1))
+ luaL_error(L, "internal error in __newindex: no metatable available");
+
+ lua_pushstring(L, "__metatable");
+ lua_rawget(L, -2);
+
+ if(!lua_istable(L, -1))
+ luaL_error(L, "internal error in __newindex: no metaclass available");
+
+ lua_insert(L, 2);
+ lua_pop(L, 1); /* remove the metatable over the constructor table */
+
+ lua_rawset(L, -3);
+
+ return 0;
+}
+
+/******************** deprecated functions ********************/
+int luaT_pushmetaclass(lua_State *L, const char *tname)
+{
+ return luaT_pushmetatable(L, tname);
+}
+
+const char* luaT_id(lua_State *L, int ud)
+{
+ return luaT_typename(L, ud);
+}
+
+const char* luaT_id2typename(lua_State *L, const char *id)
+{
+ return id;
+}
+
+const char* luaT_typename2id(lua_State *L, const char *tname)
+{
+ return luaT_typenameid(L, tname);
+}
+
+int luaT_getmetaclass(lua_State *L, int index)
+{
+ return lua_getmetatable(L, index);
+}
+
+const char* luaT_checktypename2id(lua_State *L, const char *tname)
+{
+ const char* id = luaT_typenameid(L, tname);
+ if(!id)
+ luaL_error(L, "unknown class <%s>", tname);
+ return id;
+}
+
+void luaT_registeratid(lua_State *L, const struct luaL_Reg *methods, const char *id)
+{
+ luaT_registeratname(L, methods, id);
+}
+
+/**************************************************************/
diff --git a/nerv/luaT/luaT.h b/nerv/luaT/luaT.h
new file mode 100644
index 0000000..5e8dd2f
--- /dev/null
+++ b/nerv/luaT/luaT.h
@@ -0,0 +1,111 @@
+#ifndef LUAT_UTILS_INC
+#define LUAT_UTILS_INC
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+#include <lua.h>
+#include <lauxlib.h>
+#ifdef __cplusplus
+}
+#endif
+
+#ifndef LUA_EXTERNC
+# ifdef __cplusplus
+# define LUA_EXTERNC extern "C"
+# else
+# define LUA_EXTERNC extern
+# endif
+#endif
+
+#ifdef _MSC_VER
+# define DLL_EXPORT __declspec(dllexport)
+# define DLL_IMPORT __declspec(dllimport)
+# ifdef luaT_EXPORTS
+# define LUAT_API LUA_EXTERNC DLL_EXPORT
+# else
+# define LUAT_API LUA_EXTERNC DLL_IMPORT
+# endif
+#else
+# define DLL_EXPORT
+# define DLL_IMPORT
+# define LUAT_API LUA_EXTERNC
+#endif
+
+
+/* C functions */
+
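+/* Typical usage from a C extension: register a class metatable once with
+ * luaT_newmetatable() (optionally giving a parent, constructor, destructor
+ * and factory), then wrap C pointers with luaT_pushudata() and unwrap them
+ * with luaT_toudata()/luaT_checkudata(). */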
+LUAT_API void* luaT_alloc(lua_State *L, long size);
+LUAT_API void* luaT_realloc(lua_State *L, void *ptr, long size);
+LUAT_API void luaT_free(lua_State *L, void *ptr);
+
+LUAT_API const char* luaT_newmetatable(lua_State *L, const char *tname, const char *parenttname,
+ lua_CFunction constructor, lua_CFunction destructor, lua_CFunction factory);
+
+LUAT_API int luaT_pushmetatable(lua_State *L, const char *tname);
+
+LUAT_API const char* luaT_typenameid(lua_State *L, const char *tname);
+LUAT_API const char* luaT_typename(lua_State *L, int ud);
+
+LUAT_API void luaT_pushudata(lua_State *L, void *udata, const char *tname);
+LUAT_API void *luaT_toudata(lua_State *L, int ud, const char *tname);
+LUAT_API int luaT_isudata(lua_State *L, int ud, const char *tname);
+LUAT_API void *luaT_checkudata(lua_State *L, int ud, const char *tname);
+
+LUAT_API void *luaT_getfieldcheckudata(lua_State *L, int ud, const char *field, const char *tname);
+LUAT_API void *luaT_getfieldchecklightudata(lua_State *L, int ud, const char *field);
+LUAT_API double luaT_getfieldchecknumber(lua_State *L, int ud, const char *field);
+LUAT_API int luaT_getfieldcheckint(lua_State *L, int ud, const char *field);
+LUAT_API const char* luaT_getfieldcheckstring(lua_State *L, int ud, const char *field);
+LUAT_API int luaT_getfieldcheckboolean(lua_State *L, int ud, const char *field);
+LUAT_API void luaT_getfieldchecktable(lua_State *L, int ud, const char *field);
+
+LUAT_API int luaT_typerror(lua_State *L, int ud, const char *tname);
+
+LUAT_API int luaT_checkboolean(lua_State *L, int ud);
+LUAT_API int luaT_optboolean(lua_State *L, int ud, int def);
+
+LUAT_API void luaT_registeratname(lua_State *L, const struct luaL_Reg *methods, const char *name);
+
+/* utility functions */
+LUAT_API const char *luaT_classrootname(const char *tname);
+LUAT_API int luaT_classmodulename(const char *tname, char *module_name);
+
+/* debug */
+LUAT_API void luaT_stackdump(lua_State *L);
+
+/* Lua functions */
+LUAT_API int luaT_lua_newmetatable(lua_State *L);
+LUAT_API int luaT_lua_factory(lua_State *L);
+LUAT_API int luaT_lua_getconstructortable(lua_State *L);
+LUAT_API int luaT_lua_typename(lua_State *L);
+LUAT_API int luaT_lua_isequal(lua_State *L);
+LUAT_API int luaT_lua_pointer(lua_State *L);
+LUAT_API int luaT_lua_setenv(lua_State *L);
+LUAT_API int luaT_lua_getenv(lua_State *L);
+LUAT_API int luaT_lua_getmetatable(lua_State *L);
+LUAT_API int luaT_lua_version(lua_State *L);
+LUAT_API int luaT_lua_setmetatable(lua_State *L);
+LUAT_API int luaT_lua_metatype(lua_State *L);
+LUAT_API int luaT_lua_pushudata(lua_State *L);
+
+/* deprecated functions */
+/* ids have been replaced by string names to identify classes */
+/* the comments indicate the replacement function to use instead */
+#if (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))
+#define LUAT_DEPRECATED __attribute__((__deprecated__))
+#elif defined(_MSC_VER)
+#define LUAT_DEPRECATED __declspec(deprecated)
+#else
+#define LUAT_DEPRECATED
+#endif
+
+LUAT_API LUAT_DEPRECATED int luaT_pushmetaclass(lua_State *L, const char *tname); /* same as luaT_pushmetatable */
+LUAT_API LUAT_DEPRECATED const char* luaT_id(lua_State *L, int ud); /* same as luaT_typename */
+LUAT_API LUAT_DEPRECATED const char* luaT_id2typename(lua_State *L, const char *id); /* same as luaT_typenameid */
+LUAT_API LUAT_DEPRECATED const char* luaT_typename2id(lua_State *L, const char*); /* same as luaT_typenameid */
+LUAT_API LUAT_DEPRECATED int luaT_getmetaclass(lua_State *L, int index); /* same as luaT_getmetatable */
+LUAT_API LUAT_DEPRECATED const char* luaT_checktypename2id(lua_State *L, const char *tname); /* same as luaT_typenameid */
+LUAT_API LUAT_DEPRECATED void luaT_registeratid(lua_State *L, const struct luaL_Reg *methods, const char *id); /* same as luaT_registeratname */
+
+#endif
diff --git a/nerv/matrix/cuda_helper.h b/nerv/matrix/cuda_helper.h
new file mode 100644
index 0000000..fde6f18
--- /dev/null
+++ b/nerv/matrix/cuda_helper.h
@@ -0,0 +1,75 @@
+#ifndef NERV_CUDA_HELPER_H
+#define NERV_CUDA_HELPER_H
+#include "cuda.h"
+#include "cuda_runtime.h"
+#include "driver_types.h"
+#include "cublas_v2.h"
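+/* The *_SAFE_*_CALL macros wrap a CUDA or cuBLAS call, raise a Lua error
+ * via nerv_error() on failure, and (for the *_SYNC_CALL variants) also
+ * synchronize the device so that failures surface at the call site. */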
+#define CUBLAS_SAFE_SYNC_CALL(call) \
+ do { \
+ cublasStatus_t err = (call); \
+ if (err != CUBLAS_STATUS_SUCCESS) \
+ nerv_error(L, "cumatrix cublas error: %s at %s:%d", \
+ cublasGetErrorString(err), __FILE__, __LINE__); \
+ cudaDeviceSynchronize(); \
+ } while (0)
+
+#define CUDA_SAFE_CALL(call) \
+ do { \
+ cudaError_t err = (call); \
+ if (err != cudaSuccess) \
+ nerv_error(L, "cumatrix CUDA error: %s at %s:%d", \
+ cudaGetErrorString(err), __FILE__, __LINE__); \
+ } while (0)
+
+#define CUDA_SAFE_SYNC_CALL(call) \
+ do { \
+ CUDA_SAFE_CALL(call); \
+ cudaDeviceSynchronize(); \
+ } while (0)
+
+#define CHECK_SAME_DIMENSION(a, b) \
+ do { \
+ if (!(a->nrow == b->nrow && a->ncol == b->ncol)) \
+ nerv_error(L, "matrices should be of the same dimension"); \
+ } while (0)
+
+static const char *cublasGetErrorString(cublasStatus_t err) {
+ switch (err)
+ {
+ case CUBLAS_STATUS_SUCCESS:
+ return "CUBLAS_STATUS_SUCCESS";
+ case CUBLAS_STATUS_NOT_INITIALIZED:
+ return "CUBLAS_STATUS_NOT_INITIALIZED";
+ case CUBLAS_STATUS_ALLOC_FAILED:
+ return "CUBLAS_STATUS_ALLOC_FAILED";
+ case CUBLAS_STATUS_INVALID_VALUE:
+ return "CUBLAS_STATUS_INVALID_VALUE";
+ case CUBLAS_STATUS_ARCH_MISMATCH:
+ return "CUBLAS_STATUS_ARCH_MISMATCH";
+ case CUBLAS_STATUS_MAPPING_ERROR:
+ return "CUBLAS_STATUS_MAPPING_ERROR";
+ case CUBLAS_STATUS_EXECUTION_FAILED:
+ return "CUBLAS_STATUS_EXECUTION_FAILED";
+ case CUBLAS_STATUS_INTERNAL_ERROR:
+ return "CUBLAS_STATUS_INTERNAL_ERROR";
+/* case CUBLAS_STATUS_NOT_SUPPORTED:
+ return "CUBLAS_STATUS_NOT_SUPPORTED";
+ case CUBLAS_STATUS_LICENSE_ERROR:
+ return "CUBLAS_STATUS_LICENSE_ERROR"; */
+ }
+ return "<unknown>";
+}
+
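+/* PROFILE_START/PROFILE_STOP bracket a timed region: START records the
+ * profile_start event and opens a do-block, STOP records profile_stop,
+ * waits for it and accumulates the elapsed seconds under the enclosing
+ * function's name via accu_profile(). They must be used as a pair. */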
+#define PROFILE_START \
+ do { \
+ cudaEventRecord(profile_start, 0);
+#define PROFILE_STOP \
+ cudaEventRecord(profile_stop, 0); \
+ cudaEventSynchronize(profile_stop); \
+ float milliseconds = 0; \
+ cudaEventElapsedTime(&milliseconds, profile_start, profile_stop); \
+ accu_profile(__func__, milliseconds / 1000); \
+ } while (0);
+
+#define PROFILE_END
+#endif
diff --git a/nerv/matrix/cukernel.cu b/nerv/matrix/cukernel.cu
new file mode 100644
index 0000000..a19030a
--- /dev/null
+++ b/nerv/matrix/cukernel.cu
@@ -0,0 +1,17 @@
+#define NERV_GENERIC_CUKERNEL
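+/* the generic kernel source is compiled twice, once per element type: each
+ * pass defines the name-mangling macro cudak_() and includes
+ * generic/elem_type.h before pulling in generic/cukernel.cu */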
+
+#define cudak_(NAME) cudak_float_ ## NAME
+#define MATRIX_USE_FLOAT
+#include "generic/elem_type.h"
+#include "generic/cukernel.cu"
+#undef cudak_
+#undef MATRIX_USE_FLOAT
+#undef MATRIX_ELEM
+#undef MATRIX_ELEM_PTR
+#undef MATRIX_ELEM_FMT
+#undef MATRIX_ELEM_WRITE_FMT
+
+#define cudak_(NAME) cudak_double_ ## NAME
+#define MATRIX_USE_DOUBLE
+#include "generic/elem_type.h"
+#include "generic/cukernel.cu"
diff --git a/nerv/matrix/cukernel.h b/nerv/matrix/cukernel.h
new file mode 100644
index 0000000..8a1494f
--- /dev/null
+++ b/nerv/matrix/cukernel.h
@@ -0,0 +1,20 @@
+#ifdef NERV_GENERIC_CUKERNEL
+void cudak_(cuda_mul_elem)(const Matrix *a, const Matrix *b, Matrix *c);
+void cudak_(cuda_log_elem)(const Matrix *a, Matrix *b);
+void cudak_(cuda_sigmoid)(const Matrix *a, Matrix *b);
+void cudak_(cuda_sigmoid_grad)(const Matrix *output, const Matrix *err, Matrix *nerr);
+void cudak_(cuda_rowsum)(const Matrix *a, Matrix *b);
+void cudak_(cuda_rowmax)(const Matrix *a, Matrix *b);
+void cudak_(cuda_rowmax_idx)(const Matrix *a, Matrix *b, Matrix *idx);
+void cudak_(cuda_colsum)(const Matrix *a, Matrix *b);
+void cudak_(cuda_colsame)(const Matrix *a, const Matrix *ref, Matrix *b);
+void cudak_(cuda_softmax_denominator)(const Matrix *a, const Matrix *max, Matrix *b);
+void cudak_(cuda_softmax_final)(const Matrix *a, const Matrix *max, const Matrix *deno, Matrix *b);
+void cudak_(cuda_add_row)(const Matrix *a, Matrix *b, double beta);
+void cudak_(cuda_fill)(Matrix *a, double val);
+void cudak_(cuda_expand_frm)(const Matrix *a, Matrix *b, int context);
+void cudak_(cuda_rearrange_frm)(const Matrix *a, Matrix *b, int step);
+void cudak_(cuda_scale_rows_by_row)(const Matrix *a, Matrix *b);
+void cudak_(cuda_scale_rows_by_col)(const Matrix *a, Matrix *b);
+void cudak_(cuda_decompress)(const Matrix *a, Matrix *b);
+#endif
diff --git a/nerv/matrix/cumatrix.c b/nerv/matrix/cumatrix.c
new file mode 100644
index 0000000..af34fb4
--- /dev/null
+++ b/nerv/matrix/cumatrix.c
@@ -0,0 +1,87 @@
+#define NERV_GENERIC_CUMATRIX
+#include "../common.h"
+#include "cuda_helper.h"
+#include <string.h>
+#define PROFILE_HASHMAP_SIZE 123457
+static cublasHandle_t cublas_handle;
+static cudaEvent_t profile_start, profile_stop;
+static HashMap *profile;
+
+static int print_profile(lua_State *L) {
+ (void)L;
+ size_t i;
+ fprintf(stderr, "*** [nerv cumatrix profile] ***\n");
+ for (i = 0; i < profile->size; i++)
+ {
+ HashNode *ptr;
+ for (ptr = profile->bucket[i]; ptr; ptr = ptr->next)
+ {
+ fprintf(stderr, "%s:\t%.6f\n", ptr->key, *(float *)ptr->val);
+ }
+ }
+ return 0;
+}
+
+static int clear_profile(lua_State *L) {
+ (void)L;
+ hashmap_clear(profile);
+ return 0;
+}
+
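+/* accumulate `delta` seconds of measured time under `name` (the calling
+ * function's __func__), creating the hashmap entry on first use */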
+void accu_profile(const char *name, float delta) {
+ float *val = hashmap_getval(profile, name);
+ if (!val)
+ {
+ val = malloc(sizeof(float));
+ *val = 0;
+ hashmap_setval(profile, name, val);
+ }
+ *val += delta;
+}
+
+static const luaL_Reg cumatrix_methods[] = {
+ {"print_profile", print_profile},
+ {"clear_profile", clear_profile},
+ {NULL, NULL}
+};
+
+extern void nerv_matrix_cuda_float_init(lua_State *L);
+extern void nerv_matrix_cuda_double_init(lua_State *L);
+
+void nerv_cumatrix_init(lua_State *L) {
+ luaL_register(L, NULL, cumatrix_methods);
+ cublasCreate(&cublas_handle);
+ cudaEventCreate(&profile_start);
+ cudaEventCreate(&profile_stop);
+ profile = hashmap_create(PROFILE_HASHMAP_SIZE, bkdr_hash, strcmp);
+ nerv_matrix_cuda_float_init(L);
+ nerv_matrix_cuda_double_init(L);
+}
+
+#define MATRIX_USE_FLOAT
+#define cuda_matrix_(NAME) cuda_matrix_float_##NAME
+#define nerv_matrix_(NAME) nerv_matrix_cuda_float_##NAME
+#define cudak_(NAME) cudak_float_ ## NAME
+#define NERV_CUBLAS_(NAME) cublasS##NAME
+#define MATRIX_CUMATRIX_HOST_TNAME nerv_matrix_host_float_tname
+const char *nerv_matrix_(tname) = "nerv.CuMatrixFloat";
+#include "generic/cumatrix.c"
+#undef NERV_CUBLAS_
+#undef cudak_
+#undef nerv_matrix_
+#undef cuda_matrix_
+#undef MATRIX_USE_FLOAT
+#undef MATRIX_ELEM
+#undef MATRIX_ELEM_PTR
+#undef MATRIX_ELEM_FMT
+#undef MATRIX_ELEM_WRITE_FMT
+#undef MATRIX_CUMATRIX_HOST_TNAME
+
+#define MATRIX_USE_DOUBLE
+#define cuda_matrix_(NAME) cuda_matrix_double_##NAME
+#define nerv_matrix_(NAME) nerv_matrix_cuda_double_##NAME
+#define cudak_(NAME) cudak_double_ ## NAME
+#define NERV_CUBLAS_(NAME) cublasD##NAME
+#define MATRIX_CUMATRIX_HOST_TNAME nerv_matrix_host_double_tname
+const char *nerv_matrix_(tname) = "nerv.CuMatrixDouble";
+#include "generic/cumatrix.c"
diff --git a/nerv/matrix/generic/cukernel.cu b/nerv/matrix/generic/cukernel.cu
new file mode 100644
index 0000000..d6c8adc
--- /dev/null
+++ b/nerv/matrix/generic/cukernel.cu
@@ -0,0 +1,571 @@
+#ifdef NERV_GENERIC_CUKERNEL
+#include <assert.h>
+#include <stdio.h>
+#include "matrix.h"
+#include "cuda.h"
+#include "float.h"
+#define CUDA_THREADS_N 16
+#define CUDA_THREADS_NN ((CUDA_THREADS_N) * (CUDA_THREADS_N))
+#define CEIL_DIV(a, b) (((a) + (b) - 1) / (b))
+__global__ void cudak_(log_elem)(const MATRIX_ELEM *a, MATRIX_ELEM *b,
+ int nrow, int ncol, int stride) {
+ int j = blockIdx.x * blockDim.x + threadIdx.x;
+ int i = blockIdx.y * blockDim.y + threadIdx.y;
+ long idx;
+ MATRIX_ELEM tmp;
+ if (i >= nrow || j >= ncol) return;
+ idx = j + i * stride;
+ tmp = a[idx];
+ if(tmp < FLT_MIN) tmp = FLT_MIN;
+ b[idx] = log(tmp);
+}
+
+__global__ void cudak_(mul_elem)(const MATRIX_ELEM *a, const MATRIX_ELEM *b,
+ MATRIX_ELEM *c,
+ int nrow, int ncol, int stride) {
+ int j = blockIdx.x * blockDim.x + threadIdx.x;
+ int i = blockIdx.y * blockDim.y + threadIdx.y;
+ long idx;
+ if (i >= nrow || j >= ncol) return;
+ idx = j + i * stride;
+ c[idx] = a[idx] * b[idx];
+}
+
+__global__ void cudak_(sigmoid)(const MATRIX_ELEM *a, MATRIX_ELEM *b,
+ int nrow, int ncol, int stride) {
+ int j = blockIdx.x * blockDim.x + threadIdx.x;
+ int i = blockIdx.y * blockDim.y + threadIdx.y;
+ long idx;
+ if (i >= nrow || j >= ncol) return;
+ idx = j + i * stride;
+ b[idx] = 1.0 / (1.0 + exp(-a[idx]));
+}
+
+__global__ void cudak_(sigmoid_grad)(const MATRIX_ELEM *output,
+ const MATRIX_ELEM *err,
+ MATRIX_ELEM *nerr,
+ int nrow, int ncol, int stride) {
+ int j = blockIdx.x * blockDim.x + threadIdx.x;
+ int i = blockIdx.y * blockDim.y + threadIdx.y;
+ long idx;
+ if (i >= nrow || j >= ncol) return;
+ idx = j + i * stride;
+ nerr[idx] = output[idx] * (1.0 - output[idx]) * err[idx];
+}
+
+__global__ void cudak_(softmax_final)(const MATRIX_ELEM *a, MATRIX_ELEM *b,
+ const MATRIX_ELEM *max, const MATRIX_ELEM *deno,
+ int nrow, int ncol, int stride, int mstride) {
+ int j = blockIdx.x * blockDim.x + threadIdx.x;
+ int i = blockIdx.y * blockDim.y + threadIdx.y;
+ long idx;
+ if (i >= nrow || j >= ncol) return;
+ idx = j + i * stride;
+ b[idx] = exp(a[idx] - max[0 + i * mstride]) / deno[0 + i * mstride];
+}
+
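+/* Shared-memory tree reduction: each block loads up to blockDim.x elements
+ * of one row into shared memory, then halves the number of active threads
+ * at every step; the per-block partial sums written to `output` are reduced
+ * again by a second kernel launch on the host side. */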
+__global__ void cudak_(block_reduce_rowsum)(const MATRIX_ELEM *input,
+ MATRIX_ELEM *output,
+ const int istride, const int ostride,
+ const int n) {
+ extern __shared__ MATRIX_ELEM cudak_(arr)[];
+ int j = blockIdx.x * blockDim.x + threadIdx.x;
+ cudak_(arr)[threadIdx.x] = j < n ? input[j + istride * blockIdx.y] : 0;
+ __syncthreads();
+ for (int offset = blockDim.x >> 1; offset; offset >>= 1)
+ {
+ if (threadIdx.x < offset)
+ cudak_(arr)[threadIdx.x] += cudak_(arr)[threadIdx.x + offset];
+ __syncthreads();
+ }
+ if (threadIdx.x == 0)
+ output[blockIdx.x + ostride * blockIdx.y] = cudak_(arr)[0];
+}
+
+__global__ void cudak_(block_reduce_colsum)(const MATRIX_ELEM *input,
+ MATRIX_ELEM *output,
+ const int istride, const int ostride,
+ const int n) {
+ extern __shared__ MATRIX_ELEM cudak_(arr)[];
+ int i = blockIdx.y * blockDim.y + threadIdx.y;
+ cudak_(arr)[threadIdx.y] = i < n ? input[blockIdx.x + istride * i] : 0;
+ __syncthreads();
+ for (int offset = blockDim.y >> 1; offset; offset >>= 1)
+ {
+ if (threadIdx.y < offset)
+ cudak_(arr)[threadIdx.y] += cudak_(arr)[threadIdx.y + offset];
+ __syncthreads();
+ }
+ if (threadIdx.y == 0)
+ output[blockIdx.x + ostride * blockIdx.y] = cudak_(arr)[0];
+}
+
+__global__ void cudak_(block_reduce_colsame)(const MATRIX_ELEM *input,
+ const MATRIX_ELEM *ref_input,
+ MATRIX_ELEM *output,
+ const int istride, const int ostride,
+ const int n) {
+ extern __shared__ MATRIX_ELEM cudak_(arr)[];
+ int i = blockIdx.y * blockDim.y + threadIdx.y;
+ cudak_(arr)[threadIdx.y] = (i < n && input[blockIdx.x + istride * i] == \
+ ref_input[blockIdx.x + istride * i]) ? 1.0 : 0;
+ __syncthreads();
+ for (int offset = blockDim.y >> 1; offset; offset >>= 1)
+ {
+ if (threadIdx.y < offset)
+ cudak_(arr)[threadIdx.y] += cudak_(arr)[threadIdx.y + offset];
+ __syncthreads();
+ }
+ if (threadIdx.y == 0)
+ output[blockIdx.x + ostride * blockIdx.y] = cudak_(arr)[0];
+}
+
+__global__ void cudak_(block_reduce_softmax_rowsum)(const MATRIX_ELEM *input,
+ MATRIX_ELEM *output,
+ const MATRIX_ELEM *max,
+ const int istride, const int ostride,
+ const int mstride, const int n) {
+ extern __shared__ MATRIX_ELEM cudak_(arr)[];
+ int j = blockIdx.x * blockDim.x + threadIdx.x;
+ cudak_(arr)[threadIdx.x] = j < n ? exp(input[j + istride * blockIdx.y] - \
+ max[0 + mstride * blockIdx.y]) : 0;
+ __syncthreads();
+ for (int offset = blockDim.x >> 1; offset; offset >>= 1)
+ {
+ if (threadIdx.x < offset)
+ cudak_(arr)[threadIdx.x] += cudak_(arr)[threadIdx.x + offset];
+ __syncthreads();
+ }
+ if (threadIdx.x == 0)
+ output[blockIdx.x + ostride * blockIdx.y] = cudak_(arr)[0];
+}
+
+__global__ void cudak_(block_reduce_rowmax)(const MATRIX_ELEM *input,
+ MATRIX_ELEM *output,
+ const int istride, const int ostride,
+ const int n) {
+ extern __shared__ MATRIX_ELEM cudak_(arr)[];
+ int j = blockIdx.x * blockDim.x + threadIdx.x;
+ cudak_(arr)[threadIdx.x] = j < n ? input[j + istride * blockIdx.y] : -FLT_MAX;
+ __syncthreads();
+ for (int offset = blockDim.x >> 1; offset; offset >>= 1)
+ {
+ if (threadIdx.x < offset)
+ {
+ MATRIX_ELEM l = cudak_(arr)[threadIdx.x],
+ r = cudak_(arr)[threadIdx.x + offset];
+ if (r > l)
+ cudak_(arr)[threadIdx.x] = r;
+ }
+ __syncthreads();
+ }
+ if (threadIdx.x == 0)
+ output[blockIdx.x + ostride * blockIdx.y] = cudak_(arr)[0];
+}
+
+__global__ void cudak_(block_reduce_rowmax_idx)(const MATRIX_ELEM *input,
+ const MATRIX_ELEM *idx_input,
+ MATRIX_ELEM *output,
+ MATRIX_ELEM *idx_output,
+ const int istride, const int ostride,
+ const int n) {
+ extern __shared__ MATRIX_ELEM cudak_(arr)[];
+ MATRIX_ELEM *arr_val = cudak_(arr);
+ MATRIX_ELEM *arr_idx = arr_val + blockDim.x;
+ int j = blockIdx.x * blockDim.x + threadIdx.x;
+ arr_val[threadIdx.x] = j < n ? input[j + istride * blockIdx.y] : -FLT_MAX;
+ arr_idx[threadIdx.x] = j < n ? idx_input[j + istride * blockIdx.y] : 0;
+ __syncthreads();
+ for (int offset = blockDim.x >> 1; offset; offset >>= 1)
+ {
+ if (threadIdx.x < offset)
+ {
+ MATRIX_ELEM l = arr_val[threadIdx.x],
+ r = arr_val[threadIdx.x + offset];
+ if (r > l)
+ {
+ arr_val[threadIdx.x] = r;
+ arr_idx[threadIdx.x] = arr_idx[threadIdx.x + offset];
+ }
+ }
+ __syncthreads();
+ }
+ if (threadIdx.x == 0)
+ {
+ output[blockIdx.x + ostride * blockIdx.y] = arr_val[0];
+ idx_output[blockIdx.x + ostride * blockIdx.y] = arr_idx[0];
+ }
+}
+
+__global__ void cudak_(add_row)(const MATRIX_ELEM *a, MATRIX_ELEM *b,
+ int nrow, int ncol, int stride, double beta) {
+ int j = blockIdx.x * blockDim.x + threadIdx.x;
+ int i = blockIdx.y * blockDim.y + threadIdx.y;
+ if (i >= nrow || j >= ncol) return;
+ b[j + i * stride] += beta * a[j];
+}
+
+__global__ void cudak_(fill)(MATRIX_ELEM *a,
+ int nrow, int ncol, int stride, double val) {
+ int j = blockIdx.x * blockDim.x + threadIdx.x;
+ int i = blockIdx.y * blockDim.y + threadIdx.y;
+ if (i >= nrow || j >= ncol) return;
+ a[j + i * stride] = val;
+}
+
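+/* frame expansion (context splicing): output row i is the concatenation of
+ * input rows i-context .. i+context, with row indices clamped at the matrix
+ * boundaries; typically used to splice neighbouring feature frames */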
+__global__ void cudak_(expand_frm)(const MATRIX_ELEM *a, MATRIX_ELEM *b,
+ int nrow, int ncol,
+ int enrow, int encol,
+ int stride, int estride,
+ int context) {
+ int j = blockIdx.x * blockDim.x + threadIdx.x;
+ int i = blockIdx.y * blockDim.y + threadIdx.y;
+ int ridx;
+ if (i >= enrow || j >= encol) return;
+ ridx = i + j / ncol - context;
+ if (ridx < 0) ridx = 0;
+ else if (ridx >= nrow) ridx = nrow - 1;
+ b[j + i * estride] = a[j % ncol + ridx * stride];
+}
+
+__global__ void cudak_(rearrange_frm)(const MATRIX_ELEM *a, MATRIX_ELEM *b,
+ int nrow, int ncol,
+ int stride, int step, int orig_dim) {
+ int j = blockIdx.x * blockDim.x + threadIdx.x;
+ int i = blockIdx.y * blockDim.y + threadIdx.y;
+ if (i >= nrow || j >= ncol) return;
+ b[j + i * stride] = a[j / step + (j % step) * orig_dim + i * stride];
+}
+
+__global__ void cudak_(scale_rows_by_col)(const MATRIX_ELEM *a, MATRIX_ELEM *b,
+ int nrow, int ncol,
+ int astride, int bstride) {
+ int j = blockIdx.x * blockDim.x + threadIdx.x;
+ int i = blockIdx.y * blockDim.y + threadIdx.y;
+ if (i >= nrow || j >= ncol) return;
+ b[j + i * bstride] *= a[i * astride];
+}
+
+__global__ void cudak_(scale_rows_by_row)(const MATRIX_ELEM *a, MATRIX_ELEM *b,
+ int nrow, int ncol,
+ int stride) {
+ int j = blockIdx.x * blockDim.x + threadIdx.x;
+ int i = blockIdx.y * blockDim.y + threadIdx.y;
+ if (i >= nrow || j >= ncol) return;
+ b[j + i * stride] *= a[j];
+}
+
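+/* one-hot expansion: for each row i, write 1.0 at column round(a[i][j]) of
+ * b, turning a column of class indices into one-hot rows (b is presumably
+ * zero-filled by the caller beforehand) */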
+__global__ void cudak_(decompress)(const MATRIX_ELEM *a, MATRIX_ELEM *b,
+ int nrow, int ncol,
+ int stride_a, int stride_b) {
+ int j = blockIdx.x * blockDim.x + threadIdx.x;
+ int i = blockIdx.y * blockDim.y + threadIdx.y;
+ if (i >= nrow || j >= ncol) return;
+ b[lrintf(a[j + i * stride_a]) + i * stride_b] = 1.0;
+}
+
+__global__ void cudak_(gen_col_idx)(MATRIX_ELEM *b,
+ int nrow, int ncol, int stride) {
+ int j = blockIdx.x * blockDim.x + threadIdx.x;
+ int i = blockIdx.y * blockDim.y + threadIdx.y;
+ if (i >= nrow || j >= ncol) return;
+ b[j + i * stride] = j;
+}
+
+extern "C" {
+#include "../cukernel.h"
+ void cudak_(cuda_log_elem)(const Matrix *a, Matrix *b) {
+ dim3 threadsPerBlock(CUDA_THREADS_N, CUDA_THREADS_N);
+ dim3 numBlocks(CEIL_DIV(b->ncol, threadsPerBlock.x),
+ CEIL_DIV(b->nrow, threadsPerBlock.y));
+ cudak_(log_elem)<<<numBlocks, threadsPerBlock>>> \
+ (MATRIX_ELEM_PTR(a), MATRIX_ELEM_PTR(b),
+ b->nrow, b->ncol, b->stride / sizeof(MATRIX_ELEM));
+ cudaStreamSynchronize(0);
+ }
+
+ void cudak_(cuda_mul_elem)(const Matrix *a, const Matrix *b,
+ Matrix *c) {
+ dim3 threadsPerBlock(CUDA_THREADS_N, CUDA_THREADS_N);
+ dim3 numBlocks(CEIL_DIV(b->ncol, threadsPerBlock.x),
+ CEIL_DIV(b->nrow, threadsPerBlock.y));
+ cudak_(mul_elem)<<<numBlocks, threadsPerBlock>>> \
+ (MATRIX_ELEM_PTR(a), MATRIX_ELEM_PTR(b),
+ MATRIX_ELEM_PTR(c),
+ b->nrow, b->ncol, b->stride / sizeof(MATRIX_ELEM));
+ cudaStreamSynchronize(0);
+ }
+
+ void cudak_(cuda_sigmoid)(const Matrix *a, Matrix *b) {
+ dim3 threadsPerBlock(CUDA_THREADS_N, CUDA_THREADS_N);
+ dim3 numBlocks(CEIL_DIV(b->ncol, threadsPerBlock.x),
+ CEIL_DIV(b->nrow, threadsPerBlock.y));
+ cudak_(sigmoid)<<<numBlocks, threadsPerBlock>>> \
+ (MATRIX_ELEM_PTR(a), MATRIX_ELEM_PTR(b), b->nrow, b->ncol,
+ b->stride / sizeof(MATRIX_ELEM));
+ cudaStreamSynchronize(0);
+ }
+
+ void cudak_(cuda_sigmoid_grad)(const Matrix *output,
+ const Matrix *err, Matrix *nerr) {
+ dim3 threadsPerBlock(CUDA_THREADS_N, CUDA_THREADS_N);
+ dim3 numBlocks(CEIL_DIV(nerr->ncol, threadsPerBlock.x),
+ CEIL_DIV(nerr->nrow, threadsPerBlock.y));
+ cudak_(sigmoid_grad)<<<numBlocks, threadsPerBlock>>> \
+ (MATRIX_ELEM_PTR(output), MATRIX_ELEM_PTR(err),
+ MATRIX_ELEM_PTR(nerr),
+ nerr->nrow, nerr->ncol,
+ nerr->stride / sizeof(MATRIX_ELEM));
+ cudaStreamSynchronize(0);
+ }
+
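+ /* two-pass reduction: the first launch reduces every row to
+ * blocks_per_row partial sums, the second reduces those partials with a
+ * single block; the assert below implies ncol may be at most
+ * CUDA_THREADS_NN * CUDA_THREADS_NN */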
+ void cudak_(cuda_rowsum)(const Matrix *a, Matrix *b) {
+ dim3 block(CUDA_THREADS_NN, 1);
+ int ncol = a->ncol;
+ int blocks_per_row = CEIL_DIV(ncol, block.x);
+ dim3 grid(blocks_per_row, a->nrow);
+ MATRIX_ELEM *res;
+ size_t stride;
+ cudaMallocPitch(&res, &stride, blocks_per_row * sizeof(MATRIX_ELEM), a->nrow);
+ cudak_(block_reduce_rowsum)<<<grid, block, block.x * sizeof(MATRIX_ELEM)>>> \
+ (MATRIX_ELEM_PTR(a), res,
+ a->stride / sizeof(MATRIX_ELEM), stride / sizeof(MATRIX_ELEM),
+ ncol);
+ ncol = blocks_per_row;
+ assert((unsigned long)ncol <= block.x);
+ grid.x = 1;
+ cudaStreamSynchronize(0);
+ cudak_(block_reduce_rowsum)<<<grid, block, block.x * sizeof(MATRIX_ELEM)>>> \
+ (res, MATRIX_ELEM_PTR(b),
+ stride / sizeof(MATRIX_ELEM), b->stride / sizeof(MATRIX_ELEM),
+ ncol);
+ cudaStreamSynchronize(0);
+ cudaFree(res);
+ }
+
+ void cudak_(cuda_colsame)(const Matrix *a, const Matrix *ref, Matrix *b) {
+ dim3 block(1, CUDA_THREADS_NN);
+ int nrow = a->nrow;
+ int blocks_per_col = CEIL_DIV(nrow, block.y);
+ dim3 grid(a->ncol, blocks_per_col);
+ MATRIX_ELEM *res;
+ size_t stride;
+ cudaMallocPitch(&res, &stride, a->ncol * sizeof(MATRIX_ELEM), blocks_per_col);
+ cudak_(block_reduce_colsame)<<<grid, block, block.y * sizeof(MATRIX_ELEM)>>> \
+ (MATRIX_ELEM_PTR(a), MATRIX_ELEM_PTR(ref), res,
+ a->stride / sizeof(MATRIX_ELEM), stride / sizeof(MATRIX_ELEM),
+ nrow);
+ nrow = blocks_per_col;
+ assert((unsigned long)nrow <= block.y);
+ grid.y = 1;
+ cudaStreamSynchronize(0);
+ cudak_(block_reduce_colsum)<<<grid, block, block.y * sizeof(MATRIX_ELEM)>>> \
+ (res, MATRIX_ELEM_PTR(b),
+ stride / sizeof(MATRIX_ELEM), b->stride / sizeof(MATRIX_ELEM),
+ nrow);
+ cudaStreamSynchronize(0);
+ cudaFree(res);
+ }
+
+ void cudak_(cuda_colsum)(const Matrix *a, Matrix *b) {
+ dim3 block(1, CUDA_THREADS_NN);
+ int nrow = a->nrow;
+ int blocks_per_col = CEIL_DIV(nrow, block.y);
+ dim3 grid(a->ncol, blocks_per_col);
+ MATRIX_ELEM *res;
+ size_t stride;
+ cudaMallocPitch(&res, &stride, a->ncol * sizeof(MATRIX_ELEM), blocks_per_col);
+ cudak_(block_reduce_colsum)<<<grid, block, block.y * sizeof(MATRIX_ELEM)>>> \
+ (MATRIX_ELEM_PTR(a), res,
+ a->stride / sizeof(MATRIX_ELEM), stride / sizeof(MATRIX_ELEM),
+ nrow);
+ nrow = blocks_per_col;
+ assert((unsigned long)nrow <= block.y);
+ grid.y = 1;
+ cudaStreamSynchronize(0);
+ cudak_(block_reduce_colsum)<<<grid, block, block.y * sizeof(MATRIX_ELEM)>>> \
+ (res, MATRIX_ELEM_PTR(b),
+ stride / sizeof(MATRIX_ELEM), b->stride / sizeof(MATRIX_ELEM),
+ nrow);
+ cudaStreamSynchronize(0);
+ cudaFree(res);
+ }
+
+ void cudak_(cuda_softmax_final)(const Matrix *a, const Matrix *max,
+ const Matrix *deno, Matrix *b) {
+ dim3 threadsPerBlock(CUDA_THREADS_N, CUDA_THREADS_N);
+ dim3 numBlocks(CEIL_DIV(b->ncol, threadsPerBlock.x),
+ CEIL_DIV(b->nrow, threadsPerBlock.y));
+ cudak_(softmax_final)<<<numBlocks, threadsPerBlock>>> \
+ (MATRIX_ELEM_PTR(a), MATRIX_ELEM_PTR(b),
+ MATRIX_ELEM_PTR(max), MATRIX_ELEM_PTR(deno),
+ b->nrow, b->ncol,
+ b->stride / sizeof(MATRIX_ELEM),
+ max->stride / sizeof(MATRIX_ELEM));
+ cudaStreamSynchronize(0);
+ }
+
+ void cudak_(cuda_softmax_denominator)(const Matrix *a, const Matrix *max, Matrix *b) {
+ dim3 block(CUDA_THREADS_NN, 1);
+ int ncol = a->ncol;
+ int blocks_per_row = CEIL_DIV(ncol, block.x);
+ dim3 grid(blocks_per_row, a->nrow);
+ MATRIX_ELEM *res;
+ size_t stride;
+ assert(max->ncol == 1);
+ cudaMallocPitch(&res, &stride, blocks_per_row * sizeof(MATRIX_ELEM), a->nrow);
+ cudak_(block_reduce_softmax_rowsum) \
+ <<<grid, block, block.x * sizeof(MATRIX_ELEM)>>> \
+ (MATRIX_ELEM_PTR(a), res, MATRIX_ELEM_PTR(max),
+ a->stride / sizeof(MATRIX_ELEM), stride / sizeof(MATRIX_ELEM),
+ max->stride / sizeof(MATRIX_ELEM),
+ ncol);
+ ncol = blocks_per_row;
+ assert((unsigned long)ncol <= block.x);
+ grid.x = 1;
+ cudaStreamSynchronize(0);
+ cudak_(block_reduce_rowsum) \
+ <<<grid, block, block.x * sizeof(MATRIX_ELEM)>>> \
+ (res, MATRIX_ELEM_PTR(b),
+ stride / sizeof(MATRIX_ELEM), b->stride / sizeof(MATRIX_ELEM),
+ ncol);
+ cudaStreamSynchronize(0);
+ cudaFree(res);
+ }
+
+ void cudak_(cuda_rowmax)(const Matrix *a, Matrix *b) {
+ dim3 block(CUDA_THREADS_NN, 1);
+ int ncol = a->ncol;
+ int blocks_per_row = CEIL_DIV(ncol, block.x);
+ dim3 grid(blocks_per_row, a->nrow);
+ MATRIX_ELEM *res;
+ size_t stride;
+ cudaMallocPitch(&res, &stride, blocks_per_row * sizeof(MATRIX_ELEM), a->nrow);
+ cudak_(block_reduce_rowmax)<<<grid, block, block.x * sizeof(MATRIX_ELEM)>>> \
+ (MATRIX_ELEM_PTR(a), res,
+ a->stride / sizeof(MATRIX_ELEM), stride / sizeof(MATRIX_ELEM),
+ ncol);
+ ncol = blocks_per_row;
+ assert((unsigned long)ncol <= block.x);
+ grid.x = 1;
+ cudaStreamSynchronize(0);
+ cudak_(block_reduce_rowmax)<<<grid, block, block.x * sizeof(MATRIX_ELEM)>>> \
+ (res, MATRIX_ELEM_PTR(b),
+ stride / sizeof(MATRIX_ELEM), b->stride / sizeof(MATRIX_ELEM),
+ ncol);
+ cudaStreamSynchronize(0);
+ cudaFree(res);
+ }
+
+ void cudak_(cuda_rowmax_idx)(const Matrix *a, Matrix *b, Matrix *b_idx) {
+ dim3 block(CUDA_THREADS_NN, 1);
+ int ncol = a->ncol;
+ int blocks_per_row = CEIL_DIV(ncol, block.x);
+ dim3 grid(blocks_per_row, a->nrow);
+ MATRIX_ELEM *a_idx, *res, *res_idx;
+ size_t stride;
+ cudaMallocPitch(&a_idx, &stride, a->stride, a->nrow);
+ cudak_(gen_col_idx)<<<grid, block>>>(a_idx, a->nrow, ncol, stride / sizeof(MATRIX_ELEM));
+ cudaMallocPitch(&res, &stride, blocks_per_row * sizeof(MATRIX_ELEM), a->nrow);
+ cudaMallocPitch(&res_idx, &stride, blocks_per_row * sizeof(MATRIX_ELEM), a->nrow);
+ cudaStreamSynchronize(0);
+ cudak_(block_reduce_rowmax_idx)<<<grid, block,
+ 2 * block.x * sizeof(MATRIX_ELEM)>>> \
+ (MATRIX_ELEM_PTR(a), a_idx, res, res_idx,
+ a->stride / sizeof(MATRIX_ELEM), stride / sizeof(MATRIX_ELEM),
+ ncol);
+ ncol = blocks_per_row;
+ assert((unsigned long)ncol <= block.x);
+ grid.x = 1;
+ cudaStreamSynchronize(0);
+ cudak_(block_reduce_rowmax_idx)<<<grid, block,
+ 2 * block.x * sizeof(MATRIX_ELEM)>>> \
+ (res, res_idx, MATRIX_ELEM_PTR(b), MATRIX_ELEM_PTR(b_idx),
+ stride / sizeof(MATRIX_ELEM), b->stride / sizeof(MATRIX_ELEM),
+ ncol);
+ cudaStreamSynchronize(0);
+ cudaFree(a_idx);
+ cudaFree(res);
+ cudaFree(res_idx);
+ }
+
+ /* in-place calc */
+ void cudak_(cuda_add_row)(const Matrix *a, Matrix *b, double beta) {
+ dim3 threadsPerBlock(CUDA_THREADS_N, CUDA_THREADS_N);
+ dim3 numBlocks(CEIL_DIV(b->ncol, threadsPerBlock.x),
+ CEIL_DIV(b->nrow, threadsPerBlock.y));
+ cudak_(add_row)<<<numBlocks, threadsPerBlock>>> \
+ (MATRIX_ELEM_PTR(a), MATRIX_ELEM_PTR(b), b->nrow, b->ncol,
+ b->stride / sizeof(MATRIX_ELEM), beta);
+ cudaStreamSynchronize(0);
+ }
+
+ void cudak_(cuda_fill)(Matrix *a, double val) {
+ dim3 threadsPerBlock(CUDA_THREADS_N, CUDA_THREADS_N);
+ dim3 numBlocks(CEIL_DIV(a->ncol, threadsPerBlock.x),
+ CEIL_DIV(a->nrow, threadsPerBlock.y));
+ cudak_(fill)<<<numBlocks, threadsPerBlock>>> \
+ (MATRIX_ELEM_PTR(a), a->nrow, a->ncol,
+ a->stride / sizeof(MATRIX_ELEM), val);
+ cudaStreamSynchronize(0);
+ }
+
+ void cudak_(cuda_expand_frm)(const Matrix *a, Matrix *b, int context) {
+ dim3 threadsPerBlock(CUDA_THREADS_N, CUDA_THREADS_N);
+ dim3 numBlocks(CEIL_DIV(b->ncol, threadsPerBlock.x),
+ CEIL_DIV(b->nrow, threadsPerBlock.y));
+ cudak_(expand_frm)<<<numBlocks, threadsPerBlock>>> \
+ (MATRIX_ELEM_PTR(a), MATRIX_ELEM_PTR(b),
+ a->nrow, a->ncol,
+ b->nrow, b->ncol,
+ a->stride / sizeof(MATRIX_ELEM),
+ b->stride / sizeof(MATRIX_ELEM),
+ context);
+ cudaStreamSynchronize(0);
+ }
+
+ void cudak_(cuda_rearrange_frm)(const Matrix *a, Matrix *b, int step) {
+ dim3 threadsPerBlock(CUDA_THREADS_N, CUDA_THREADS_N);
+ dim3 numBlocks(CEIL_DIV(b->ncol, threadsPerBlock.x),
+ CEIL_DIV(b->nrow, threadsPerBlock.y));
+ cudak_(rearrange_frm)<<<numBlocks, threadsPerBlock>>> \
+ (MATRIX_ELEM_PTR(a), MATRIX_ELEM_PTR(b),
+ b->nrow, b->ncol, b->stride / sizeof(MATRIX_ELEM),
+ step, b->ncol / step);
+ cudaStreamSynchronize(0);
+ }
+
+ void cudak_(cuda_scale_rows_by_col)(const Matrix *a, Matrix *b) {
+ dim3 threadsPerBlock(CUDA_THREADS_N, CUDA_THREADS_N);
+ dim3 numBlocks(CEIL_DIV(b->ncol, threadsPerBlock.x),
+ CEIL_DIV(b->nrow, threadsPerBlock.y));
+ cudak_(scale_rows_by_col)<<<numBlocks, threadsPerBlock>>> \
+ (MATRIX_ELEM_PTR(a), MATRIX_ELEM_PTR(b),
+ b->nrow, b->ncol,
+ a->stride / sizeof(MATRIX_ELEM),
+ b->stride / sizeof(MATRIX_ELEM));
+ cudaStreamSynchronize(0);
+ }
+
+ void cudak_(cuda_scale_rows_by_row)(const Matrix *a, Matrix *b) {
+ dim3 threadsPerBlock(CUDA_THREADS_N, CUDA_THREADS_N);
+ dim3 numBlocks(CEIL_DIV(b->ncol, threadsPerBlock.x),
+ CEIL_DIV(b->nrow, threadsPerBlock.y));
+ cudak_(scale_rows_by_row)<<<numBlocks, threadsPerBlock>>> \
+ (MATRIX_ELEM_PTR(a), MATRIX_ELEM_PTR(b),
+ b->nrow, b->ncol, b->stride / sizeof(MATRIX_ELEM));
+ cudaStreamSynchronize(0);
+ }
+
+ void cudak_(cuda_decompress)(const Matrix *a, Matrix *b) {
+ dim3 threadsPerBlock(1, CUDA_THREADS_NN);
+ dim3 numBlocks(1, CEIL_DIV(a->nrow, threadsPerBlock.y));
+ cudak_(decompress)<<<numBlocks, threadsPerBlock>>> \
+ (MATRIX_ELEM_PTR(a), MATRIX_ELEM_PTR(b),
+ a->nrow, a->ncol,
+ a->stride / sizeof(MATRIX_ELEM),
+ b->stride / sizeof(MATRIX_ELEM));
+ cudaStreamSynchronize(0);
+ }
+}
+#endif
diff --git a/nerv/matrix/generic/cumatrix.c b/nerv/matrix/generic/cumatrix.c
new file mode 100644
index 0000000..b5d1a35
--- /dev/null
+++ b/nerv/matrix/generic/cumatrix.c
@@ -0,0 +1,493 @@
+#ifdef NERV_GENERIC_CUMATRIX
+#include "matrix.h"
+#include "elem_type.h"
+
+#define MATRIX_DATA_FREE(L, ptr) cuda_matrix_(free)(L, ptr)
+#define MATRIX_DATA_ALLOC(L, dptr, stride, width, height) \
+ cuda_matrix_(alloc)(L, dptr, stride, width, height)
+#define MATRIX_DATA_WRITE(L, data, idx, val) cuda_matrix_(write)(L, data, idx, val)
+#define MATRIX_DATA_READ(L, data, idx) cuda_matrix_(read)(L, data, idx)
+#define MATRIX_INIT(L) cuda_matrix_(init)(L)
+#define MATRIX_BASE_TNAME nerv_matrix_cuda_tname
+#define NERV_GENERIC_MATRIX
+#define NERV_GENERIC_CUKERNEL
+#include "../../common.h"
+#include "../cukernel.h"
+#include "../cuda_helper.h"
+
+Matrix *nerv_matrix_(new_)(lua_State *L, long nrow, long ncol);
+void nerv_matrix_(data_free)(lua_State *L, Matrix *self);
+
+static void nerv_matrix_(add_)(lua_State *L, const Matrix *a, const Matrix *b,
+ const Matrix *c,
+ MATRIX_ELEM alpha, MATRIX_ELEM beta) {
+ PROFILE_START
+ CUBLAS_SAFE_SYNC_CALL(
+ NERV_CUBLAS_(geam)(cublas_handle, CUBLAS_OP_N, CUBLAS_OP_N,
+ a->ncol, a->nrow,
+ &alpha,
+ MATRIX_ELEM_PTR(a), a->stride / sizeof(MATRIX_ELEM),
+ &beta,
+ MATRIX_ELEM_PTR(b), b->stride / sizeof(MATRIX_ELEM),
+ MATRIX_ELEM_PTR(c), c->stride / sizeof(MATRIX_ELEM)));
+ PROFILE_STOP
+}
+
+static int nerv_matrix_(add)(lua_State *L) {
+ Matrix *c = luaT_checkudata(L, 1, nerv_matrix_(tname));
+ Matrix *a = luaT_checkudata(L, 2, nerv_matrix_(tname));
+ Matrix *b = luaT_checkudata(L, 3, nerv_matrix_(tname));
+ MATRIX_ELEM alpha = luaL_checknumber(L, 4);
+ MATRIX_ELEM beta = luaL_checknumber(L, 5);
+ CHECK_SAME_DIMENSION(a, b);
+ CHECK_SAME_DIMENSION(a, c);
+ nerv_matrix_(add_)(L, a, b, c, alpha, beta);
+ return 0;
+}
+
+static int nerv_matrix_(get_cublas_op)(char ch) {
+ return (ch == 'T' || ch == 't') ? CUBLAS_OP_T : CUBLAS_OP_N;
+}
+
+static int nerv_matrix_(mul)(lua_State *L) {
+#define SWAP(a, b) \
+ do { int t = (a); (a) = (b); (b) = t; } while (0)
+
+ Matrix *c = luaT_checkudata(L, 1, nerv_matrix_(tname));
+ Matrix *a = luaT_checkudata(L, 2, nerv_matrix_(tname));
+ Matrix *b = luaT_checkudata(L, 3, nerv_matrix_(tname));
+ MATRIX_ELEM alpha = luaL_checknumber(L, 4);
+ MATRIX_ELEM beta = luaL_checknumber(L, 5);
+ int nargs = lua_gettop(L);
+ int ta = nargs > 5 ? nerv_matrix_(get_cublas_op)(*luaL_checkstring(L, 6)) \
+ : CUBLAS_OP_N;
+ int tb = nargs > 6 ? nerv_matrix_(get_cublas_op)(*luaL_checkstring(L, 7)) \
+ : CUBLAS_OP_N;
+ int am = a->nrow, an = a->ncol;
+ int bm = b->nrow, bn = b->ncol;
+ if (ta == CUBLAS_OP_T) SWAP(am, an);
+ if (tb == CUBLAS_OP_T) SWAP(bm, bn);
+ if (an != bm)
+ nerv_error(L, "Wrong dimension of multipliers");
+/* MATRIX_ELEM alpha = 1.0f, beta = 0.0f; */
+ /* Because matrix in Nerv is row-major, here b comes first */
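+ /* i.e. compute C^T = op(B)^T * op(A)^T with column-major cuBLAS, whose
+ * result laid out column-major is exactly the row-major
+ * C = alpha * op(A) * op(B) + beta * C */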
+ PROFILE_START
+ CUBLAS_SAFE_SYNC_CALL(
+ NERV_CUBLAS_(gemm)(cublas_handle, tb, ta,
+ bn, am, bm,
+ &alpha,
+ MATRIX_ELEM_PTR(b), b->stride / sizeof(MATRIX_ELEM),
+ MATRIX_ELEM_PTR(a), a->stride / sizeof(MATRIX_ELEM),
+ &beta,
+ MATRIX_ELEM_PTR(c), c->stride / sizeof(MATRIX_ELEM)));
+ PROFILE_STOP
+ return 0;
+}
+
+static int nerv_matrix_(create)(lua_State *L) {
+ Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname));
+ Matrix *b = nerv_matrix_(new_)(L, a->nrow, a->ncol);
+ luaT_pushudata(L, b, nerv_matrix_(tname));
+ return 1;
+}
+
+static int nerv_matrix_(sigmoid)(lua_State *L) {
+ Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname));
+ Matrix *b = luaT_checkudata(L, 2, nerv_matrix_(tname));
+ CHECK_SAME_DIMENSION(a, b);
+ PROFILE_START
+ cudak_(cuda_sigmoid)(b, a);
+ PROFILE_STOP
+ return 0;
+}
+
+static int nerv_matrix_(sigmoid_grad)(lua_State *L) {
+ Matrix *nerr = luaT_checkudata(L, 1, nerv_matrix_(tname));
+ Matrix *err = luaT_checkudata(L, 2, nerv_matrix_(tname));
+ Matrix *output = luaT_checkudata(L, 3, nerv_matrix_(tname));
+ CHECK_SAME_DIMENSION(nerr, err);
+ CHECK_SAME_DIMENSION(nerr, output);
+ PROFILE_START
+ cudak_(cuda_sigmoid_grad)(output, err, nerr);
+ PROFILE_STOP
+ return 0;
+}
+
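+/* numerically stable softmax: b[i][j] = exp(a[i][j] - max_i) / sum_k exp(a[i][k] - max_i),
+ * where max_i is the maximum of row i; the per-row argmax is returned as a
+ * one-column matrix */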
+static int nerv_matrix_(softmax)(lua_State *L) {
+ Matrix *a = luaT_checkudata(L, 2, nerv_matrix_(tname));
+ Matrix *b = luaT_checkudata(L, 1, nerv_matrix_(tname));
+ Matrix *max, *max_idx;
+ Matrix *dno;
+ CHECK_SAME_DIMENSION(a, b);
+ max = nerv_matrix_(new_)(L, a->nrow, 1);
+ max_idx = nerv_matrix_(new_)(L, a->nrow, 1);
+ dno = nerv_matrix_(new_)(L, a->nrow, 1);
+ PROFILE_START
+ cudak_(cuda_rowmax_idx)(a, max, max_idx);
+ cudak_(cuda_softmax_denominator)(a, max, dno);
+ cudak_(cuda_softmax_final)(a, max, dno, b);
+ PROFILE_STOP
+ nerv_matrix_(data_free)(L, max);
+ nerv_matrix_(data_free)(L, dno);
+ luaT_pushudata(L, max_idx, nerv_matrix_(tname));
+ return 1;
+}
+
+static int nerv_matrix_(rowsum)(lua_State *L) {
+ Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname));
+ Matrix *b = nerv_matrix_(new_)(L, a->nrow, 1);
+ PROFILE_START
+ cudak_(cuda_rowsum)(a, b);
+ PROFILE_STOP
+ luaT_pushudata(L, b, nerv_matrix_(tname));
+ return 1;
+}
+
+static int nerv_matrix_(colsum)(lua_State *L) {
+ Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname));
+ Matrix *b = nerv_matrix_(new_)(L, 1, a->ncol);
+ PROFILE_START
+ cudak_(cuda_colsum)(a, b);
+ PROFILE_STOP
+ luaT_pushudata(L, b, nerv_matrix_(tname));
+ return 1;
+}
+
+static int nerv_matrix_(colsame)(lua_State *L) {
+ Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname));
+ Matrix *ref = luaT_checkudata(L, 2, nerv_matrix_(tname));
+ Matrix *b = nerv_matrix_(new_)(L, 1, a->ncol);
+ CHECK_SAME_DIMENSION(a, ref);
+ PROFILE_START
+ cudak_(cuda_colsame)(a, ref, b);
+ PROFILE_STOP
+ luaT_pushudata(L, b, nerv_matrix_(tname));
+ return 1;
+}
+
+static int nerv_matrix_(rowmax)(lua_State *L) {
+ Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname));
+ Matrix *b = nerv_matrix_(new_)(L, a->nrow, 1);
+ PROFILE_START
+ cudak_(cuda_rowmax)(a, b);
+ PROFILE_STOP
+ luaT_pushudata(L, b, nerv_matrix_(tname));
+ return 1;
+}
+
+static int nerv_matrix_(rowmax_idx)(lua_State *L) {
+ Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname));
+ Matrix *b = nerv_matrix_(new_)(L, a->nrow, 1);
+ Matrix *idx = nerv_matrix_(new_)(L, a->nrow, 1);
+ PROFILE_START
+ cudak_(cuda_rowmax_idx)(a, b, idx);
+ PROFILE_STOP
+ luaT_pushudata(L, b, nerv_matrix_(tname));
+ luaT_pushudata(L, idx, nerv_matrix_(tname));
+ return 2;
+}
+
+static int nerv_matrix_(add_row)(lua_State *L) {
+ Matrix *a = luaT_checkudata(L, 2, nerv_matrix_(tname));
+ Matrix *b = luaT_checkudata(L, 1, nerv_matrix_(tname));
+ double beta = luaL_checknumber(L, 3);
+ if (a->ncol != b->ncol)
+ nerv_error(L, "the number of columns is not the same");
+ if (a->nrow != 1)
+ nerv_error(L, "a row vector is expected");
+ PROFILE_START
+ cudak_(cuda_add_row)(a, b, beta);
+ PROFILE_STOP
+ return 0;
+}
+
+static int nerv_matrix_(fill)(lua_State *L) {
+ Matrix *self = luaT_checkudata(L, 1, nerv_matrix_(tname));
+ double val = luaL_checknumber(L, 2);
+ PROFILE_START
+ cudak_(cuda_fill)(self, val);
+ PROFILE_STOP
+ return 0;
+}
+
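+/* The copy helpers below take optional half-open row ranges:
+ * copy_fromd(a, b[, b_begin[, b_end[, a_begin]]]) copies rows [b_begin, b_end)
+ * of the source b into the destination a starting at row a_begin; the defaults
+ * cover the whole source and start at destination row 0. copy_fromh and
+ * copy_toh follow the same convention for host-to-device and device-to-host
+ * transfers respectively. */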
+static int nerv_matrix_(copy_fromd)(lua_State *L) {
+ Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname));
+ Matrix *b = luaT_checkudata(L, 2, nerv_matrix_(tname));
+ int nargs = lua_gettop(L);
+ int b_begin = nargs > 2 ? luaL_checkinteger(L, 3) : 0;
+ int b_end = nargs > 3 ? luaL_checkinteger(L, 4) : b->nrow;
+ int a_begin = nargs > 4 ? luaL_checkinteger(L, 5) : 0;
+ if (!(0 <= b_begin && b_begin < b_end && b_end <= b->nrow &&
+ a_begin + b_end - b_begin <= a->nrow))
+ nerv_error(L, "invalid copy interval");
+ if (a->ncol != b->ncol)
+ nerv_error(L, "matrices should be of the same dimension");
+ PROFILE_START
+ CUDA_SAFE_SYNC_CALL(
+ cudaMemcpy2D(MATRIX_ROW_PTR(a, a_begin), a->stride,
+ MATRIX_ROW_PTR(b, b_begin), b->stride,
+ sizeof(MATRIX_ELEM) * b->ncol, b_end - b_begin,
+ cudaMemcpyDeviceToDevice));
+ PROFILE_STOP
+ return 0;
+}
+
+extern const char *MATRIX_CUMATRIX_HOST_TNAME;
+static int nerv_matrix_(copy_fromh)(lua_State *L) {
+ Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname));
+ Matrix *b = luaT_checkudata(L, 2, MATRIX_CUMATRIX_HOST_TNAME);
+ int nargs = lua_gettop(L);
+ int b_begin = nargs > 2 ? luaL_checkinteger(L, 3) : 0;
+ int b_end = nargs > 3 ? luaL_checkinteger(L, 4) : b->nrow;
+ int a_begin = nargs > 4 ? luaL_checkinteger(L, 5) : 0;
+ if (!(0 <= b_begin && b_begin < b_end && b_end <= b->nrow &&
+ a_begin + b_end - b_begin <= a->nrow))
+ nerv_error(L, "invalid copy interval");
+ if (a->ncol != b->ncol)
+ nerv_error(L, "matrices should be of the same dimension");
+ PROFILE_START
+ CUDA_SAFE_SYNC_CALL(
+ cudaMemcpy2D(MATRIX_ROW_PTR(a, a_begin), a->stride,
+ MATRIX_ROW_PTR(b, b_begin), b->stride,
+ sizeof(MATRIX_ELEM) * b->ncol, b_end - b_begin,
+ cudaMemcpyHostToDevice));
+ PROFILE_STOP
+ return 0;
+}
+
+static int nerv_matrix_(copy_toh)(lua_State *L) {
+ Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname));
+ Matrix *b = luaT_checkudata(L, 2, MATRIX_CUMATRIX_HOST_TNAME);
+ int nargs = lua_gettop(L);
+ int a_begin = nargs > 2 ? luaL_checkinteger(L, 3) : 0;
+ int a_end = nargs > 3 ? luaL_checkinteger(L, 4) : a->nrow;
+ int b_begin = nargs > 4 ? luaL_checkinteger(L, 5) : 0;
+ if (!(0 <= a_begin && a_begin < a_end && a_end <= a->nrow &&
+ b_begin + a_end - a_begin <= b->nrow))
+ nerv_error(L, "invalid copy interval");
+ if (b->ncol != a->ncol)
+ nerv_error(L, "matrices should be of the same dimension");
+ PROFILE_START
+ CUDA_SAFE_SYNC_CALL(
+ cudaMemcpy2D(MATRIX_ROW_PTR(b, b_begin), b->stride,
+ MATRIX_ROW_PTR(a, a_begin), a->stride,
+ sizeof(MATRIX_ELEM) * a->ncol, a_end - a_begin,
+ cudaMemcpyDeviceToHost));
+ PROFILE_STOP
+ return 0;
+}
+
+static int nerv_matrix_(trans)(lua_State *L) {
+ Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname));
+ Matrix *b = nerv_matrix_(new_)(L, a->ncol, a->nrow);
+ MATRIX_ELEM alpha = 1, beta = 0;
+ /* FIXME: possible memory leak when lua error is raised */
+ PROFILE_START
+ CUBLAS_SAFE_SYNC_CALL(
+ NERV_CUBLAS_(geam)(cublas_handle, CUBLAS_OP_T, CUBLAS_OP_T,
+ a->nrow, a->ncol,
+ &alpha,
+ MATRIX_ELEM_PTR(a), a->stride / sizeof(MATRIX_ELEM),
+ &beta,
+ MATRIX_ELEM_PTR(a), a->stride / sizeof(MATRIX_ELEM),
+ MATRIX_ELEM_PTR(b), b->stride / sizeof(MATRIX_ELEM)));
+ PROFILE_STOP
+ luaT_pushudata(L, b, nerv_matrix_(tname));
+ return 1;
+}
+
+static int nerv_matrix_(mul_elem)(lua_State *L) {
+ Matrix *a = luaT_checkudata(L, 2, nerv_matrix_(tname));
+ Matrix *b = luaT_checkudata(L, 3, nerv_matrix_(tname));
+ Matrix *c = luaT_checkudata(L, 1, nerv_matrix_(tname));
+ CHECK_SAME_DIMENSION(a, b);
+ CHECK_SAME_DIMENSION(a, c);
+ PROFILE_START
+ cudak_(cuda_mul_elem)(a, b, c);
+ PROFILE_STOP
+ return 0;
+}
+
+static int nerv_matrix_(log_elem)(lua_State *L) {
+ Matrix *a = luaT_checkudata(L, 2, nerv_matrix_(tname));
+ Matrix *b = luaT_checkudata(L, 1, nerv_matrix_(tname));
+ CHECK_SAME_DIMENSION(a, b);
+ PROFILE_START
+ cudak_(cuda_log_elem)(a, b);
+ PROFILE_STOP
+ return 0;
+}
+
+static int nerv_matrix_(decompress)(lua_State *L) {
+ Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname));
+ Matrix *b;
+ int orig_col = luaL_checkinteger(L, 2);
+ if (a->ncol != 1)
+ nerv_error(L, "the compressed matrix must be a column vector");
+ b = nerv_matrix_(new_)(L, a->nrow, orig_col);
+ PROFILE_START
+ cudak_(cuda_fill)(b, 0.0);
+ cudak_(cuda_decompress)(a, b);
+ PROFILE_STOP
+ luaT_pushudata(L, b, nerv_matrix_(tname));
+ return 1;
+}
+
+extern const char *nerv_matrix_host_int_tname;
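+/* copy a->nrow rows from the host matrix b into the device matrix a: row i of
+ * a is taken from row idx[b_begin + i] of b; the transfers are issued on one
+ * CUDA stream per row so they can proceed asynchronously */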
+static int nerv_matrix_(copy_rows_fromh_by_idx)(lua_State *L) {
+ Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname));
+ Matrix *b = luaT_checkudata(L, 2, MATRIX_CUMATRIX_HOST_TNAME);
+ Matrix *idx = luaT_checkudata(L, 3, nerv_matrix_host_int_tname);
+ long nrow = a->nrow;
+ int b_begin = lua_gettop(L) > 3 ? luaL_checkinteger(L, 4) : 0;
+ if (!(0 <= b_begin && b_begin + nrow <= idx->ncol))
+ nerv_error(L, "invalid copy interval");
+ long *idx_ptr = idx->data.i;
+ int i;
+ if (idx->nrow != 1)
+ nerv_error(L, "index should be a vector");
+ if (a->ncol != b->ncol)
+ nerv_error(L, "source/destination dimension mismatch");
+ cudaStream_t *streams = (cudaStream_t*)malloc(sizeof(cudaStream_t) * nrow);
+ for (i = 0; i < nrow; i++)
+ {
+ int src_row = idx_ptr[b_begin + i];
+ if (!(0 <= src_row && src_row < b->nrow))
+ nerv_error(L, "invalid index");
+ CUDA_SAFE_CALL(cudaStreamCreate(streams + i));
+ CUDA_SAFE_CALL(cudaMemcpyAsync(MATRIX_ROW_PTR(a, i),
+ MATRIX_ROW_PTR(b, src_row),
+ b->stride,
+ cudaMemcpyHostToDevice, streams[i]));
+ }
+ for (i = 0; i < nrow; i++)
+ {
+ CUDA_SAFE_CALL(cudaStreamSynchronize(streams[i]));
+ CUDA_SAFE_CALL(cudaStreamDestroy(streams[i]));
+ }
+ free(streams);
+ return 0;
+}
+
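+/* splice each frame (row) of b with its `context` neighbouring frames on both
+ * sides, writing the resulting (2 * context + 1)-frame windows into a */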
+static int nerv_matrix_(expand_frm)(lua_State *L) {
+ Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname));
+ Matrix *b = luaT_checkudata(L, 2, nerv_matrix_(tname));
+ int context = luaL_checkinteger(L, 3);
+ if (a->nrow != b->nrow)
+ nerv_error(L, "mismatching number of frames");
+ if (a->ncol != b->ncol * (context * 2 + 1))
+ nerv_error(L, "the width should be 2 * context + 1");
+ PROFILE_START
+ cudak_(cuda_expand_frm)(b, a, context);
+ PROFILE_STOP
+ return 0;
+}
+
+static int nerv_matrix_(rearrange_frm)(lua_State *L) {
+ Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname));
+ Matrix *b = luaT_checkudata(L, 2, nerv_matrix_(tname));
+ int step = luaL_checkinteger(L, 3);
+ CHECK_SAME_DIMENSION(a, b);
+ if (b->ncol % step)
+ nerv_error(L, "the dimension of columns is not divisible by step");
+ PROFILE_START
+ cudak_(cuda_rearrange_frm)(b, a, step);
+ PROFILE_STOP
+ return 0;
+}
+
+static int nerv_matrix_(scale_rows_by_col)(lua_State *L) {
+ Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname));
+ Matrix *b = luaT_checkudata(L, 2, nerv_matrix_(tname));
+ if (a->nrow != b->nrow)
+ nerv_error(L, "the number of rows is not the same");
+ if (b->ncol != 1)
+ nerv_error(L, "a column vector is expected");
+ PROFILE_START
+ cudak_(cuda_scale_rows_by_col)(b, a);
+ PROFILE_STOP
+ return 0;
+}
+
+static int nerv_matrix_(scale_rows_by_row)(lua_State *L) {
+ Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname));
+ Matrix *b = luaT_checkudata(L, 2, nerv_matrix_(tname));
+ if (a->ncol != b->ncol)
+ nerv_error(L, "the number of columns is not the same");
+ if (b->nrow != 1)
+ nerv_error(L, "a row vector is expected");
+ PROFILE_START
+ cudak_(cuda_scale_rows_by_row)(b, a);
+ PROFILE_STOP
+ return 0;
+}
+
+static const luaL_Reg nerv_matrix_(extra_methods)[] = {
+ {"create", nerv_matrix_(create)},
+ {"colsum", nerv_matrix_(colsum)},
+ {"colsame", nerv_matrix_(colsame)},
+ {"rowsum", nerv_matrix_(rowsum)},
+ {"rowmax", nerv_matrix_(rowmax)},
+ {"rowmax_idx", nerv_matrix_(rowmax_idx)},
+ {"trans", nerv_matrix_(trans)},
+ {"decompress", nerv_matrix_(decompress)},
+ /* in-place calc */
+ {"copy_fromh", nerv_matrix_(copy_fromh)},
+ {"copy_fromd", nerv_matrix_(copy_fromd)},
+ {"copy_toh", nerv_matrix_(copy_toh)},
+ {"add", nerv_matrix_(add)},
+ {"mul", nerv_matrix_(mul)},
+ {"add_row", nerv_matrix_(add_row)},
+ {"fill", nerv_matrix_(fill)},
+ {"sigmoid", nerv_matrix_(sigmoid)},
+ {"sigmoid_grad", nerv_matrix_(sigmoid_grad)},
+ {"softmax", nerv_matrix_(softmax)},
+ {"mul_elem", nerv_matrix_(mul_elem)},
+ {"log_elem", nerv_matrix_(log_elem)},
+ {"copy_rows_fromh_by_idx", nerv_matrix_(copy_rows_fromh_by_idx)},
+ {"expand_frm", nerv_matrix_(expand_frm)},
+ {"rearrange_frm", nerv_matrix_(rearrange_frm)},
+ {"scale_rows_by_row", nerv_matrix_(scale_rows_by_row)},
+ {"scale_rows_by_col", nerv_matrix_(scale_rows_by_col)},
+ {NULL, NULL}
+};
+
+static void cuda_matrix_(init)(lua_State *L) {
+ luaN_append_methods(L, nerv_matrix_(extra_methods));
+}
+
+static void cuda_matrix_(free)(lua_State *L, MATRIX_ELEM *ptr) {
+ CUDA_SAFE_SYNC_CALL(cudaFree(ptr));
+}
+
+static void cuda_matrix_(alloc)(lua_State *L, MATRIX_ELEM **dptr,
+ size_t *stride, long width, long height) {
+ PROFILE_START
+ CUDA_SAFE_SYNC_CALL(cudaMallocPitch((void **)dptr, stride, width, height));
+ PROFILE_STOP
+}
+
+static MATRIX_ELEM cuda_matrix_(read)(lua_State *L, MATRIX_ELEM *data,
+ int idx) {
+ MATRIX_ELEM res;
+ CUDA_SAFE_SYNC_CALL(cudaMemcpy(&res, data + idx,
+ sizeof(MATRIX_ELEM), cudaMemcpyDeviceToHost));
+ return res;
+}
+
+static void cuda_matrix_(write)(lua_State *L, MATRIX_ELEM *data,
+ int idx, MATRIX_ELEM val) {
+ CUDA_SAFE_SYNC_CALL(cudaMemcpy(data + idx, &val,
+ sizeof(MATRIX_ELEM), cudaMemcpyHostToDevice));
+}
+
+int nerv_matrix_(get_elem)(lua_State *L) {
+ return nerv_error_method_not_implemented(L);
+}
+
+int nerv_matrix_(set_elem)(lua_State *L) {
+ return nerv_error_method_not_implemented(L);
+}
+
+#include "matrix.c"
+#endif
diff --git a/nerv/matrix/generic/elem_type.h b/nerv/matrix/generic/elem_type.h
new file mode 100644
index 0000000..bffe940
--- /dev/null
+++ b/nerv/matrix/generic/elem_type.h
@@ -0,0 +1,22 @@
+#ifdef MATRIX_USE_FLOAT
+
+#define MATRIX_ELEM float
+#define MATRIX_ELEM_FMT "%f"
+#define MATRIX_ELEM_WRITE_FMT "%.8f"
+#define MATRIX_ELEM_PTR(self) ((self)->data.f)
+
+#elif defined(MATRIX_USE_DOUBLE)
+
+#define MATRIX_ELEM double
+#define MATRIX_ELEM_FMT "%lf"
+#define MATRIX_ELEM_WRITE_FMT "%.8lf"
+#define MATRIX_ELEM_PTR(self) ((self)->data.d)
+
+#elif defined(MATRIX_USE_INT)
+
+#define MATRIX_ELEM long
+#define MATRIX_ELEM_FMT "%ld"
+#define MATRIX_ELEM_WRITE_FMT "%ld"
+#define MATRIX_ELEM_PTR(self) ((self)->data.i)
+
+#endif
diff --git a/nerv/matrix/generic/matrix.c b/nerv/matrix/generic/matrix.c
new file mode 100644
index 0000000..e17fb42
--- /dev/null
+++ b/nerv/matrix/generic/matrix.c
@@ -0,0 +1,155 @@
+#ifdef NERV_GENERIC_MATRIX
+#include "../../common.h"
+#include "matrix.h"
+
+extern const char *nerv_matrix_(tname);
+extern const char *MATRIX_BASE_TNAME;
+
+void nerv_matrix_(data_free)(lua_State *L, Matrix *self) {
+ (void)L;
+ assert(*self->data_ref > 0);
+ if (--(*self->data_ref) == 0)
+ {
+ /* free matrix data */
+ MATRIX_DATA_FREE(L, MATRIX_ELEM_PTR(self));
+ free(self->data_ref);
+ free(self);
+ }
+}
+
+void nerv_matrix_(data_retain)(Matrix *self) {
+ (*self->data_ref)++;
+}
+
+Matrix *nerv_matrix_(new_)(lua_State *L, long nrow, long ncol) {
+ Matrix *self = (Matrix *)malloc(sizeof(Matrix));
+ self->nrow = nrow;
+ self->ncol = ncol;
+ self->nmax = self->nrow * self->ncol;
+ MATRIX_DATA_ALLOC(L, &MATRIX_ELEM_PTR(self), &self->stride,
+ sizeof(MATRIX_ELEM) * self->ncol, self->nrow);
+ self->data_ref = (long *)malloc(sizeof(long));
+ *self->data_ref = 0;
+ nerv_matrix_(data_retain)(self);
+ return self;
+}
+
+int nerv_matrix_(new)(lua_State *L) {
+ luaT_pushudata(L, nerv_matrix_(new_)(L, luaL_checkinteger(L, 1),
+ luaL_checkinteger(L, 2)),
+ nerv_matrix_(tname));
+ return 1;
+}
+
+int nerv_matrix_(destroy)(lua_State *L) {
+ Matrix *self = luaT_checkudata(L, 1, nerv_matrix_(tname));
+ nerv_matrix_(data_free)(L, self);
+ return 1;
+}
+
+int nerv_matrix_(get_elem)(lua_State *L);
+int nerv_matrix_(set_elem)(lua_State *L);
+
+static Matrix *nerv_matrix_(getrow)(Matrix *self, int row) {
+ Matrix *prow = (Matrix *)malloc(sizeof(Matrix));
+ prow->ncol = self->ncol;
+ prow->nrow = 1;
+ prow->stride = self->stride;
+ prow->nmax = prow->ncol;
+ MATRIX_ELEM_PTR(prow) = MATRIX_ROW_PTR(self, row);
+ prow->data_ref = self->data_ref;
+ nerv_matrix_(data_retain)(prow);
+ return prow;
+}
+
+static int nerv_matrix_(newindex)(lua_State *L) {
+ Matrix *self = luaT_checkudata(L, 1, nerv_matrix_(tname));
+ if (lua_isnumber(L, 2))
+ {
+ int idx = luaL_checkinteger(L, 2);
+ if (self->nrow == 1)
+ {
+ if (idx < 0 || idx >= self->ncol)
+ nerv_error(L, "index must be within range [0, %d)", self->ncol);
+ MATRIX_DATA_WRITE(L, MATRIX_ELEM_PTR(self), idx,
+ luaL_checknumber(L, 3));
+ }
+ else
+ nerv_error(L, "cannot assign to row vector");
+ lua_pushboolean(L, 1);
+ return 1;
+ }
+ else
+ {
+ lua_pushboolean(L, 0);
+ return 1;
+ }
+}
+
+
+static int nerv_matrix_(index)(lua_State *L) {
+ Matrix *self = luaT_checkudata(L, 1, nerv_matrix_(tname));
+ if (lua_isnumber(L, 2))
+ {
+ int idx = luaL_checkinteger(L, 2);
+ if (self->nrow == 1)
+ {
+ if (idx < 0 || idx >= self->ncol)
+ nerv_error(L, "index must be within range [0, %d)", self->ncol);
+ lua_pushnumber(L, MATRIX_DATA_READ(L, MATRIX_ELEM_PTR(self), idx));
+ }
+ else
+ {
+ if (idx < 0 || idx >= self->nrow)
+ nerv_error(L, "index must be within range [0, %d)", self->nrow);
+ luaT_pushudata(L, nerv_matrix_(getrow)(self, idx), nerv_matrix_(tname));
+ }
+ lua_pushboolean(L, 1);
+ return 2;
+ }
+ else
+ {
+ lua_pushboolean(L, 0);
+ return 1;
+ }
+}
+
+static int nerv_matrix_(ncol)(lua_State *L) {
+ Matrix *self = luaT_checkudata(L, 1, nerv_matrix_(tname));
+ lua_pushinteger(L, self->ncol);
+ return 1;
+}
+
+static int nerv_matrix_(nrow)(lua_State *L) {
+ Matrix *self = luaT_checkudata(L, 1, nerv_matrix_(tname));
+ lua_pushinteger(L, self->nrow);
+ return 1;
+}
+
+static int nerv_matrix_(get_dataref_value)(lua_State *L) {
+ Matrix *self = luaT_checkudata(L, 1, nerv_matrix_(tname));
+ lua_pushinteger(L, *(self->data_ref));
+ return 1;
+}
+
+static const luaL_Reg nerv_matrix_(methods)[] = {
+ {"get_elem", nerv_matrix_(get_elem)},
+ {"set_elem", nerv_matrix_(set_elem)},
+ {"ncol", nerv_matrix_(ncol)},
+ {"nrow", nerv_matrix_(nrow)},
+ {"get_dataref_value", nerv_matrix_(get_dataref_value)},
+ {"__index__", nerv_matrix_(index)},
+ {"__newindex__", nerv_matrix_(newindex)},
+ {NULL, NULL}
+};
+
+void nerv_matrix_(init)(lua_State *L) {
+ luaT_newmetatable(L, nerv_matrix_(tname), MATRIX_BASE_TNAME,
+ nerv_matrix_(new), nerv_matrix_(destroy), NULL);
+ luaL_register(L, NULL, nerv_matrix_(methods));
+#ifdef MATRIX_INIT
+ MATRIX_INIT(L);
+#endif
+ lua_pop(L, 1);
+}
+#endif
diff --git a/nerv/matrix/generic/matrix.h b/nerv/matrix/generic/matrix.h
new file mode 100644
index 0000000..833724b
--- /dev/null
+++ b/nerv/matrix/generic/matrix.h
@@ -0,0 +1,19 @@
+#ifndef NERV_GENERIC_MATRIX_H
+#define NERV_GENERIC_MATRIX_H
+
+#include <stddef.h>
+typedef struct Matrix {
+ size_t stride; /* size of a row */
+ long ncol, nrow, nmax; /* dimension of the matrix */
+ union {
+ float *f;
+ double *d;
+ long *i;
+ } data; /* pointer to actual storage */
+ long *data_ref;
+} Matrix;
+
+#define MATRIX_ROW_PTR(self, row) \
+ (MATRIX_ELEM *)((char *)MATRIX_ELEM_PTR(self) + (row) * (self)->stride)
+
+#endif
diff --git a/nerv/matrix/generic/mmatrix.c b/nerv/matrix/generic/mmatrix.c
new file mode 100644
index 0000000..b0f0791
--- /dev/null
+++ b/nerv/matrix/generic/mmatrix.c
@@ -0,0 +1,122 @@
+#ifdef NERV_GENERIC_MMATRIX
+#include "matrix.h"
+#include "elem_type.h"
+#define MATRIX_DATA_FREE(L, ptr) free(ptr)
+#define MATRIX_DATA_ALLOC(L, dptr, stride, width, height) \
+ host_matrix_(alloc)(L, dptr, stride, width, height)
+#define MATRIX_DATA_WRITE(L, data, idx, val) (data[idx] = val)
+#define MATRIX_DATA_READ(L, data, idx) (data[idx])
+#define MATRIX_INIT(L) host_matrix_(init)(L)
+#define MATRIX_BASE_TNAME nerv_matrix_host_tname
+#define NERV_GENERIC_MATRIX
+#include "../../common.h"
+#include "../../io/chunk_file.h"
+#include "string.h"
+
+static void host_matrix_(alloc)(lua_State *L,
+ MATRIX_ELEM **dptr, size_t *stride,
+ long width, long height) {
+ if ((*dptr = (MATRIX_ELEM *)malloc(width * height)) == NULL)
+ nerv_error(L, "mmatrix insufficient memory");
+ *stride = width;
+}
+
+int nerv_matrix_(get_elem)(lua_State *L) {
+ Matrix *self = luaT_checkudata(L, 1, nerv_matrix_(tname));
+ int idx = luaL_checkinteger(L, 2);
+ if (idx < 0 || idx >= self->nmax)
+ nerv_error(L, "index must be within range [0, %d)", self->nmax);
+ lua_pushnumber(L, MATRIX_ELEM_PTR(self)[idx]);
+ return 1;
+}
+
+int nerv_matrix_(set_elem)(lua_State *L) {
+ Matrix *self = luaT_checkudata(L, 1, nerv_matrix_(tname));
+ int idx = luaL_checkinteger(L, 2);
+ MATRIX_ELEM v = luaL_checknumber(L, 3);
+ if (idx < 0 || idx >= self->nmax)
+ nerv_error(L, "index must be within range [0, %d)", self->nmax);
+ MATRIX_ELEM_PTR(self)[idx] = v;
+ return 0;
+}
+
+static const luaL_Reg nerv_matrix_(extra_methods)[];
+static void host_matrix_(init)(lua_State *L) {
+ luaN_append_methods(L, nerv_matrix_(extra_methods));
+#ifdef MMATRIX_INIT
+ MMATRIX_INIT(L);
+#endif
+}
+
+#include "matrix.c"
+
+int nerv_matrix_(load)(lua_State *L) {
+ ChunkData *chunk = luaT_checkudata(L, 1, nerv_chunk_data_tname);
+ Matrix *self;
+ int i, j;
+ long nrow, ncol;
+ FILE *fp = chunk->fp;
+ if (fscanf(fp, "%ld %ld", &nrow, &ncol) != 2)
+ return 0;
+ self = nerv_matrix_(new_)(L, nrow, ncol);
+ for (i = 0; i < nrow; i++)
+ {
+ MATRIX_ELEM *row = MATRIX_ROW_PTR(self, i);
+ for (j = 0; j < ncol; j++)
+ if (fscanf(fp, MATRIX_ELEM_FMT, row + j) != 1)
+ {
+ free(self);
+ return 0;
+ }
+ }
+ luaT_pushudata(L, self, nerv_matrix_(tname));
+ return 1;
+}
+
+int nerv_matrix_(save)(lua_State *L) {
+ ChunkFileHandle *chunk = luaT_checkudata(L, 2,
+ nerv_chunk_file_handle_tname);
+ Matrix *self = luaT_checkudata(L, 1, nerv_matrix_(tname));
+ int i, j;
+ long nrow = self->nrow, ncol = self->ncol;
+ FILE *fp = chunk->fp;
+ if (fprintf(fp, "%ld %ld\n", nrow, ncol) < 0)
+ return 0;
+ for (i = 0; i < nrow; i++)
+ {
+ MATRIX_ELEM *row = MATRIX_ROW_PTR(self, i);
+ for (j = 0; j < ncol; j++)
+ if (fprintf(fp, MATRIX_ELEM_WRITE_FMT " ", row[j]) < 0)
+ return 0;
+ if (fprintf(fp, "\n") < 0)
+ return 0;
+ }
+ return 0;
+}
+
+static int nerv_matrix_(copy_from)(lua_State *L) {
+ Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname));
+ Matrix *b = luaT_checkudata(L, 2, nerv_matrix_(tname));
+ int nargs = lua_gettop(L);
+ int b_begin = nargs > 2 ? luaL_checkinteger(L, 3) : 0;
+ int b_end = nargs > 3 ? luaL_checkinteger(L, 4) : b->nrow;
+ int a_begin = nargs > 4 ? luaL_checkinteger(L, 5) : 0;
+ if (!(0 <= b_begin && b_begin < b_end && b_end <= b->nrow &&
+ a_begin + b_end - b_begin <= a->nrow))
+ nerv_error(L, "invalid copy interval");
+ if (a->ncol != b->ncol)
+ nerv_error(L, "matrices should be of the same dimension");
+ memmove(MATRIX_ROW_PTR(a, a_begin),
+ MATRIX_ROW_PTR(b, b_begin),
+ sizeof(MATRIX_ELEM) * b->ncol * (b_end - b_begin));
+ return 0;
+}
+
+static const luaL_Reg nerv_matrix_(extra_methods)[] = {
+ {"load", nerv_matrix_(load)},
+ {"save", nerv_matrix_(save)},
+ {"copy_from", nerv_matrix_(copy_from)},
+ {NULL, NULL}
+};
+
+#endif
diff --git a/nerv/matrix/init.c b/nerv/matrix/init.c
new file mode 100644
index 0000000..c29d7e9
--- /dev/null
+++ b/nerv/matrix/init.c
@@ -0,0 +1,35 @@
+#include "../common.h"
+#include "generic/matrix.h"
+
+const char *nerv_matrix_tname = "nerv.Matrix";
+const char *nerv_matrix_cuda_tname = "nerv.CuMatrix";
+const char *nerv_matrix_host_tname = "nerv.MMatrix";
+
+void nerv_cumatrix_init(lua_State *L);
+void nerv_mmatrix_init(lua_State *L);
+
+static const luaL_Reg matrix_methods[] = {
+ {"__tostring__", nerv_error_method_not_implemented },
+ {"__add__", nerv_error_method_not_implemented },
+ {"__sub__", nerv_error_method_not_implemented },
+ {"__mul__", nerv_error_method_not_implemented },
+ {NULL, NULL}
+};
+
+void nerv_matrix_init(lua_State *L) {
+ /* abstract base class: Matrix */
+ luaT_newmetatable(L, nerv_matrix_tname, NULL, NULL, NULL, NULL);
+ luaL_register(L, NULL, matrix_methods);
+ lua_pop(L, 1);
+
+ /* CuMatrix inherits from Matrix */
+ luaT_newmetatable(L, nerv_matrix_cuda_tname, nerv_matrix_tname,
+ NULL, NULL, NULL);
+ nerv_cumatrix_init(L);
+ lua_pop(L, 1);
+ /* MMatrix inherits from Matrix */
+ luaT_newmetatable(L, nerv_matrix_host_tname, nerv_matrix_tname,
+ NULL, NULL, NULL);
+ nerv_mmatrix_init(L);
+ lua_pop(L, 1);
+}
diff --git a/nerv/matrix/init.lua b/nerv/matrix/init.lua
new file mode 100644
index 0000000..1a8925f
--- /dev/null
+++ b/nerv/matrix/init.lua
@@ -0,0 +1,77 @@
+function nerv.Matrix:__tostring__()
+ local ncol = self:ncol()
+ local nrow = self:nrow()
+ local strt = {}
+ local fmt
+ if self.fmt then
+ fmt = self.fmt
+ else
+ fmt = "%.8f "
+ end
+ if nrow == 1 then
+ for col = 0, ncol - 1 do
+ table.insert(strt, string.format(fmt, self[col]))
+ end
+ table.insert(strt, "\n")
+ else
+ for row = 0, nrow - 1 do
+ local rp = self[row]
+ for col = 0, ncol - 1 do
+ table.insert(strt, string.format(fmt, rp[col]))
+ end
+ table.insert(strt, "\n")
+ end
+ end
+ table.insert(strt, string.format(
+ "[%s %d x %d]", self.__typename, nrow, ncol))
+ return table.concat(strt)
+end
+
+-- gen: a function that takes the indices of an entry and returns the value
+-- to store there; every entry in the matrix will be assigned by calling gen(i, j)
+-- (or gen(j) for a row vector)
+function nerv.Matrix:generate(gen)
+ if (self:nrow() == 1) then
+ for j = 0, self:ncol() - 1 do
+ self[j] = gen(j)
+ end
+ else
+ for i = 0, self:nrow() - 1 do
+ local row = self[i]
+ for j = 0, self:ncol() - 1 do
+ row[j] = gen(i, j)
+ end
+ end
+ end
+end
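+-- usage sketch (matrix size is arbitrary): fill a 2 x 3 host matrix with i + j
+--     local m = nerv.MMatrixFloat(2, 3)
+--     m:generate(function (i, j) return i + j end)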
+
+nerv.MMatrixInt.fmt = "%d "
+
+function nerv.CuMatrix:__add__(b)
+    local c = self:create()
+ c:add(self, b, 1.0, 1.0)
+ return c
+end
+
+function nerv.CuMatrix:__sub__(b)
+    local c = self:create()
+ c:add(self, b, 1.0, -1.0)
+ return c
+end
+
+function nerv.CuMatrix:__mul__(b)
+    local c = nerv.get_type(self.__typename)(self:nrow(), b:ncol())
+ c:mul(self, b, 1.0, 0.0, 'N', 'N')
+ return c
+end
+
+function nerv.CuMatrixFloat.new_from_host(mat)
+ local res = nerv.CuMatrixFloat(mat:nrow(), mat:ncol())
+ res:copy_fromh(mat)
+ return res
+end
+
+function nerv.CuMatrixFloat:new_to_host()
+ local res = nerv.MMatrixFloat(self:nrow(), self:ncol())
+ self:copy_toh(res)
+ return res
+end
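+-- usage sketch: round-trip a host matrix through the GPU
+--     local h = nerv.MMatrixFloat(4, 4)
+--     h:generate(function (i, j) return i * 4 + j end)
+--     local d = nerv.CuMatrixFloat.new_from_host(h)
+--     local h2 = d:new_to_host()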
diff --git a/nerv/matrix/mmatrix.c b/nerv/matrix/mmatrix.c
new file mode 100644
index 0000000..d1d68b9
--- /dev/null
+++ b/nerv/matrix/mmatrix.c
@@ -0,0 +1,77 @@
+#define NERV_GENERIC_MMATRIX
+#include <stdlib.h>
+#include "../common.h"
+void nerv_matrix_host_float_init(lua_State *L);
+void nerv_matrix_host_double_init(lua_State *L);
+void nerv_matrix_host_int_init(lua_State *L);
+
+void nerv_mmatrix_init(lua_State *L) {
+ srand(1);
+ nerv_matrix_host_float_init(L);
+ nerv_matrix_host_double_init(L);
+ nerv_matrix_host_int_init(L);
+}
+
+#define MATRIX_USE_FLOAT
+#define host_matrix_(NAME) host_matrix_float_##NAME
+#define nerv_matrix_(NAME) nerv_matrix_host_float_##NAME
+const char *nerv_matrix_(tname) = "nerv.MMatrixFloat";
+#include "generic/mmatrix.c"
+#undef nerv_matrix_
+#undef host_matrix_
+#undef MATRIX_USE_FLOAT
+#undef MATRIX_ELEM
+#undef MATRIX_ELEM_PTR
+#undef MATRIX_ELEM_FMT
+#undef MATRIX_ELEM_WRITE_FMT
+
+#define NERV_GENERIC_MMATRIX
+#define MATRIX_USE_DOUBLE
+#define host_matrix_(NAME) host_matrix_double_##NAME
+#define nerv_matrix_(NAME) nerv_matrix_host_double_##NAME
+const char *nerv_matrix_(tname) = "nerv.MMatrixDouble";
+#include "generic/mmatrix.c"
+#undef nerv_matrix_
+#undef host_matrix_
+#undef MATRIX_USE_DOUBLE
+#undef MATRIX_ELEM
+#undef MATRIX_ELEM_PTR
+#undef MATRIX_ELEM_FMT
+#undef MATRIX_ELEM_WRITE_FMT
+
+#define NERV_GENERIC_MMATRIX
+#define MATRIX_USE_INT
+#define host_matrix_(NAME) host_matrix_int_##NAME
+#define nerv_matrix_(NAME) nerv_matrix_host_int_##NAME
+const char *nerv_matrix_(tname) = "nerv.MMatrixInt";
+#define MMATRIX_INIT(L) host_matrix_(init_extra)(L)
+
+static const luaL_Reg nerv_matrix_(extra_methods_int)[];
+static void host_matrix_(init_extra)(lua_State *L) {
+ luaN_append_methods(L, nerv_matrix_(extra_methods_int));
+}
+
+#include "generic/mmatrix.c"
+
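+/* generate a 1 x ncol row vector holding a random permutation of
+ * {0, ..., ncol - 1} via an in-place Fisher-Yates shuffle (seeded by the
+ * srand(1) call in nerv_mmatrix_init) */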
+static int nerv_matrix_(perm_gen)(lua_State *L) {
+ int i, ncol = luaL_checkinteger(L, 1);
+ Matrix *self = nerv_matrix_(new_)(L, 1, ncol);
+ long *prow = self->data.i;
+ for (i = 0; i < ncol; i++)
+ prow[i] = i;
+ for (i = ncol - 1; i >= 0; i--)
+ {
+ size_t j = rand() % (i + 1);
+ long tmp = prow[i];
+ prow[i] = prow[j];
+ prow[j] = tmp;
+ }
+ luaT_pushudata(L, self, nerv_matrix_(tname));
+ return 1;
+}
+
+static const luaL_Reg nerv_matrix_(extra_methods_int)[] = {
+ {"perm_gen", nerv_matrix_(perm_gen)},
+ {NULL, NULL}
+};
+
diff --git a/nerv/nerv b/nerv/nerv
new file mode 100644
index 0000000..7571659
--- /dev/null
+++ b/nerv/nerv
@@ -0,0 +1,13 @@
+#! /usr/bin/env luajit
+require 'nerv'
+print("Greetings")
+if #arg < 1 then
+ return
+end
+local script = arg[1]
+local script_arg = {}
+for i = 2, #arg do
+ table.insert(script_arg, arg[i])
+end
+arg = script_arg
+dofile(script)
diff --git a/nerv/nerv-scm-1.rockspec b/nerv/nerv-scm-1.rockspec
new file mode 100644
index 0000000..d14140a
--- /dev/null
+++ b/nerv/nerv-scm-1.rockspec
@@ -0,0 +1,38 @@
+package = "nerv"
+version = "scm-1"
+source = {
+ url = "..." -- We don't have one yet
+}
+description = {
+ summary = "An example for the LuaRocks tutorial.",
+ detailed = [[
+ ]],
+ homepage = "https://github.com/Determinant/nerv", -- We don't have one yet
+ license = "BSD" -- or whatever you like
+}
+dependencies = {
+ "lua >= 5.1"
+ -- If you depend on other rocks, add them here
+}
+build = {
+ -- We'll start here.
+ type = "make",
+ build_variables = {
+ CFLAGS="$(CFLAGS)",
+ LIBFLAG="$(LIBFLAG)",
+ LUA_LIBDIR="$(LUA_LIBDIR)",
+ LUA_BINDIR="$(LUA_BINDIR)",
+ LUA_INCDIR="$(LUA_INCDIR)",
+ LUA="$(LUA)",
+ },
+ install_variables = {
+ INST_PREFIX="$(PREFIX)",
+ INST_BINDIR="$(BINDIR)",
+ INST_LIBDIR="$(LIBDIR)",
+ INST_LUADIR="$(LUADIR)",
+ INST_CONFDIR="$(CONFDIR)",
+ },
+ install = {
+ bin = {"nerv"}
+ }
+}
diff --git a/nerv/nerv.c b/nerv/nerv.c
new file mode 100644
index 0000000..a59eadc
--- /dev/null
+++ b/nerv/nerv.c
@@ -0,0 +1,38 @@
+#include "common.h"
+
+extern void nerv_example_init(lua_State *L);
+extern void nerv_matrix_init(lua_State *L);
+extern void nerv_io_init(lua_State *L);
+
+static const luaL_Reg nerv_utils_methods[] = {
+ {"setmetatable", luaT_lua_setmetatable},
+ {"getmetatable", luaT_lua_getmetatable},
+ {"newmetatable", luaT_lua_newmetatable},
+ {"typename", luaT_lua_typename},
+ {NULL, NULL}
+};
+
+void nerv_utils_init(lua_State *L) {
+ luaL_register(L, NULL, nerv_utils_methods);
+}
+
+int luaopen_libnerv(lua_State *L) {
+ lua_newtable(L);
+ /* duplicate table */
+ lua_pushvalue(L, -1);
+ /* set table to global index */
+ lua_setfield(L, LUA_GLOBALSINDEX, "nerv");
+    /* A reference to the table still remains on the stack.
+     *
+     * The following initialization functions should obey the rule that they
+     * keep the stack balanced: the stack must look the same before and after
+     * each call.
+     *
+     * Also note that they can make use of the value at the top of the stack,
+     * which refers to the `nerv` global table. */
+ nerv_utils_init(L);
+ nerv_example_init(L);
+ nerv_matrix_init(L);
+ nerv_io_init(L);
+ return 1;
+}
diff --git a/nerv/nn/init.lua b/nerv/nn/init.lua
new file mode 100644
index 0000000..cbaf52b
--- /dev/null
+++ b/nerv/nn/init.lua
@@ -0,0 +1,3 @@
+nerv.include('layer_repo.lua')
+nerv.include('param_repo.lua')
+nerv.include('layer_dag.lua')
diff --git a/nerv/nn/layer_dag.lua b/nerv/nn/layer_dag.lua
new file mode 100644
index 0000000..8e30216
--- /dev/null
+++ b/nerv/nn/layer_dag.lua
@@ -0,0 +1,249 @@
+local DAGLayer = nerv.class("nerv.DAGLayer", "nerv.Layer")
+
+local function parse_id(str)
+ local id, port, _
+ _, _, id, port = string.find(str, "([a-zA-Z0-9_]+)%[([0-9]+)%]")
+ if id == nil or port == nil then
+ _, _, id, port = string.find(str, "(.+)%[([0-9]+)%]")
+ if not (id == "<input>" or id == "<output>") then
+ nerv.error("wrong format of connection id")
+ end
+ end
+ port = tonumber(port)
+ return id, port
+end
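+-- e.g. parse_id("affine0[1]") returns "affine0", 1 and parse_id("<input>[2]")
+-- returns "<input>", 2 ("affine0" here is just a hypothetical layer id)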
+
+local function discover(id, layers, layer_repo)
+ local ref = layers[id]
+ if id == "<input>" or id == "<output>" then
+ return nil
+ end
+ if ref == nil then
+ local layer = layer_repo:get_layer(id)
+ local dim_in, dim_out = layer:get_dim()
+ ref = {
+ layer = layer,
+ inputs = {},
+ outputs = {},
+ err_inputs = {},
+ err_outputs = {},
+ next_layers = {},
+ input_len = #dim_in,
+ output_len = #dim_out,
+ in_deg = 0,
+ visited = false
+ }
+ layers[id] = ref
+ end
+ return ref
+end
+
+function DAGLayer:__init(id, global_conf, layer_conf)
+ local layers = {}
+ local inputs = {}
+ local outputs = {}
+ local dim_in = layer_conf.dim_in
+ local dim_out = layer_conf.dim_out
+ local parsed_conn = {}
+ for from, to in pairs(layer_conf.connections) do
+ local id_from, port_from = parse_id(from)
+ local id_to, port_to = parse_id(to)
+ local ref_from = discover(id_from, layers, layer_conf.sub_layers)
+ local ref_to = discover(id_to, layers, layer_conf.sub_layers)
+ local input_dim, output_dim, _
+ if ref_from and ref_from.outputs[port_from] ~= nil then
+ nerv.error("%s has already been attached", from)
+ end
+ if ref_to and ref_to.inputs[port_to] ~= nil then
+ nerv.error("%s has already been attached", to)
+ end
+ if id_from == "<input>" then
+ input_dim, _ = ref_to.layer:get_dim()
+ if dim_in[port_from] ~= input_dim[port_to] then
+ nerv.error("mismatching data dimension between %s and %s", from, to)
+ end
+ inputs[port_from] = {ref_to, port_to}
+ ref_to.inputs[port_to] = inputs -- just a place holder
+ elseif id_to == "<output>" then
+ _, output_dim = ref_from.layer:get_dim()
+ if output_dim[port_from] ~= dim_out[port_to] then
+ nerv.error("mismatching data dimension between %s and %s", from, to)
+ end
+ outputs[port_to] = {ref_from, port_from}
+ ref_from.outputs[port_from] = outputs -- just a place holder
+ else
+ _, output_dim = ref_from.layer:get_dim()
+ input_dim, _ = ref_to.layer:get_dim()
+ if output_dim[port_from] ~= input_dim[port_to] then
+ nerv.error("mismatching data dimension between %s and %s", from, to)
+ end
+
+ table.insert(parsed_conn,
+ {{ref_from, port_from}, {ref_to, port_to}})
+ table.insert(ref_from.next_layers, ref_to) -- add edge
+ ref_to.in_deg = ref_to.in_deg + 1 -- increase the in-degree of the target layer
+ end
+ end
+
+    -- topological sort
+ local queue = {}
+ local l = 1
+ local r = 1
+ for id, ref in pairs(layers) do
+ if ref.in_deg == 0 then
+ table.insert(queue, ref)
+ nerv.info("adding source layer: %s", id)
+ r = r + 1
+ end
+ end
+ if l == r then
+ nerv.error("loop detected")
+ end
+ while l < r do
+ local cur = queue[l]
+ cur.visited = true
+ l = l + 1
+ for _, nl in pairs(cur.next_layers) do
+ nl.in_deg = nl.in_deg - 1
+ if nl.in_deg == 0 then
+ table.insert(queue, nl)
+ r = r + 1
+ end
+ end
+ end
+ for i = 1, #queue do
+ nerv.info("enqueued layer: %s", queue[i].layer.id)
+ end
+
+ for id, ref in pairs(layers) do
+        -- check whether the graph is connected
+ if ref.visited == false then
+ nerv.warning("layer %s is ignored", id)
+ end
+ end
+
+ self.layers = layers
+ self.inputs = inputs
+ self.outputs = outputs
+ self.dim_in = dim_in
+ self.dim_out = dim_out
+ self.parsed_conn = parsed_conn
+ self.queue = queue
+ self.gconf = global_conf
+end
+
+function DAGLayer:init(batch_size)
+ for i, conn in ipairs(self.parsed_conn) do
+ local _, output_dim
+ local ref_from, port_from, ref_to, port_to
+ ref_from, port_from = unpack(conn[1])
+ ref_to, port_to = unpack(conn[2])
+ _, output_dim = ref_from.layer:get_dim()
+ local mid = self.gconf.cumat_type(batch_size,
+ output_dim[port_from])
+ local err_mid = mid:create()
+
+ ref_from.outputs[port_from] = mid
+ ref_to.inputs[port_to] = mid
+
+ ref_from.err_inputs[port_from] = err_mid
+ ref_to.err_outputs[port_to] = err_mid
+ end
+ for id, ref in pairs(self.layers) do
+ for i = 1, ref.input_len do
+ if ref.inputs[i] == nil then
+ nerv.error("dangling input port %d of layer %s", i, id)
+ end
+ end
+ for i = 1, ref.output_len do
+ if ref.outputs[i] == nil then
+ nerv.error("dangling output port %d of layer %s", i, id)
+ end
+ end
+ -- initialize sub layers
+ ref.layer:init(batch_size)
+ end
+ for i = 1, #self.dim_in do
+ if self.inputs[i] == nil then
+ nerv.error("dangling port %d of layer <input>", i)
+ end
+ end
+ for i = 1, #self.dim_out do
+ if self.outputs[i] == nil then
+ nerv.error("dangling port %d of layer <output>", i)
+ end
+ end
+end
+
+function DAGLayer:set_inputs(input)
+ for i = 1, #self.dim_in do
+ local layer = self.inputs[i][1]
+ local port = self.inputs[i][2]
+ layer.inputs[port] = input[i]
+ end
+end
+
+function DAGLayer:set_outputs(output)
+ for i = 1, #self.dim_out do
+ local layer = self.outputs[i][1]
+ local port = self.outputs[i][2]
+ layer.outputs[port] = output[i]
+ end
+end
+
+function DAGLayer:set_err_inputs(bp_err)
+ for i = 1, #self.dim_out do
+ local layer = self.outputs[i][1]
+ local port = self.outputs[i][2]
+ layer.err_inputs[port] = bp_err[i]
+ end
+end
+
+function DAGLayer:set_err_outputs(next_bp_err)
+ for i = 1, #self.dim_in do
+ local layer = self.inputs[i][1]
+ local port = self.inputs[i][2]
+ layer.err_outputs[port] = next_bp_err[i]
+ end
+end
+
+function DAGLayer:update(bp_err, input, output)
+ self:set_err_inputs(bp_err)
+ self:set_inputs(input)
+ self:set_outputs(output)
+ -- print("update")
+    for _, ref in ipairs(self.queue) do
+ -- print(ref.layer.id)
+ ref.layer:update(ref.err_inputs, ref.inputs, ref.outputs)
+ end
+end
+
+function DAGLayer:propagate(input, output)
+ self:set_inputs(input)
+ self:set_outputs(output)
+ for i = 1, #self.queue do
+ local ref = self.queue[i]
+ -- print(ref.layer.id)
+ ref.layer:propagate(ref.inputs, ref.outputs)
+ end
+end
+
+function DAGLayer:back_propagate(bp_err, next_bp_err, input, output)
+ self:set_err_outputs(next_bp_err)
+ self:set_err_inputs(bp_err)
+ self:set_inputs(input)
+ self:set_outputs(output)
+ for i = #self.queue, 1, -1 do
+ local ref = self.queue[i]
+ -- print(ref.layer.id)
+ ref.layer:back_propagate(ref.err_inputs, ref.err_outputs, ref.inputs, ref.outputs)
+ end
+end
+
+function DAGLayer:get_params()
+ local param_repos = {}
+    for _, ref in ipairs(self.queue) do
+ table.insert(param_repos, ref.layer:get_params())
+ end
+ return nerv.ParamRepo.merge(param_repos)
+end
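+-- connection spec sketch (the layer ids are hypothetical): layer_conf.connections
+-- maps "from[port]" to "to[port]", with "<input>"/"<output>" denoting the ports
+-- of the DAG itself:
+--     connections = {
+--         ["<input>[1]"]  = "affine0[1]",
+--         ["affine0[1]"]  = "sigmoid0[1]",
+--         ["sigmoid0[1]"] = "<output>[1]",
+--     }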
diff --git a/nerv/nn/layer_repo.lua b/nerv/nn/layer_repo.lua
new file mode 100644
index 0000000..602c37c
--- /dev/null
+++ b/nerv/nn/layer_repo.lua
@@ -0,0 +1,34 @@
+local LayerRepo = nerv.class("nerv.LayerRepo")
+
+function LayerRepo:__init(layer_spec, param_repo, global_conf)
+ local layers = {}
+ for ltype, llist in pairs(layer_spec) do
+ local layer_type = nerv.get_type(ltype)
+ for id, spec in pairs(llist) do
+ if layers[id] ~= nil then
+ nerv.error("a layer with id %s already exists", id)
+ end
+ nerv.info("create layer: %s", id)
+            if type(spec[2]) ~= "table" then
+                nerv.error("layer config table is needed")
+            end
+            local layer_config = spec[2]
+ if type(spec[1]) ~= "table" then
+ nerv.error("parameter description table is needed")
+ end
+ for pname, pid in pairs(spec[1]) do
+ layer_config[pname] = param_repo:get_param(pid)
+ end
+ layers[id] = layer_type(id, global_conf, layer_config)
+ end
+ end
+ self.layers = layers
+end
+
+function LayerRepo:get_layer(lid)
+ local layer = self.layers[lid]
+ if layer == nil then
+ nerv.error("layer with id %s not found", lid)
+ end
+ return layer
+end
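+-- layer_spec sketch (type name, ids, parameter names and dimensions are all
+-- hypothetical): each entry maps a layer type to {id = {param_id_map, layer_config}}:
+--     {
+--         ["nerv.AffineLayer"] = {
+--             affine0 = {{ltp = "affine0_ltp", bp = "affine0_bp"},
+--                        {dim_in = {429}, dim_out = {2048}}}
+--         }
+--     }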
diff --git a/nerv/nn/param_repo.lua b/nerv/nn/param_repo.lua
new file mode 100644
index 0000000..ab971ba
--- /dev/null
+++ b/nerv/nn/param_repo.lua
@@ -0,0 +1,76 @@
+local ParamRepo = nerv.class("nerv.ParamRepo")
+function ParamRepo:__init(plist)
+ self.params = {}
+ if plist ~= nil then
+ for i, p in ipairs(plist) do
+ self.params[p.id] = p
+ end
+ end
+end
+
+function ParamRepo:add(pid, p)
+ if self.params[pid] ~= nil then
+ nerv.error("duplicate params with the same id: %s", pid)
+ end
+ self.params[pid] = p
+end
+
+function ParamRepo:remove(pid, p)
+    if self.params[pid] == nil then
+        nerv.error("param %s does not exist", pid)
+    end
+    self.params[pid] = nil
+end
+
+function ParamRepo.merge(repos)
+ local self = nerv.ParamRepo()
+ for i, repo in ipairs(repos) do
+ if not nerv.is_type(repo, "nerv.ParamRepo") then
+ nerv.error("nerv.ParamRepo objects expected, got %s", repo)
+ end
+ for pid, p in pairs(repo.params) do
+ self:add(pid, p)
+ end
+ end
+ return self
+end
+
+function ParamRepo:import(param_files, pids, gconf)
+ if type(param_files) ~= "table" then
+ nerv.error("param file table is need")
+ end
+ for i = 1, #param_files do
+ local pf = nerv.ChunkFile(param_files[i], "r")
+ for cid, cspec in pairs(pf.metadata) do
+ if pids == nil or pids[cid] ~= nil then
+ local p = pf:read_chunk(cid, gconf)
+ if not nerv.is_type(p, "nerv.Param") then
+ nerv.error("param chunk is expected")
+ end
+ self:add(cid, p)
+ end
+ end
+ end
+end
+
+function ParamRepo:export(param_file, pids)
+    local cf = nerv.ChunkFile(param_file, "w")
+ if pids == nil then
+ for id, p in pairs(self.params) do
+ cf:write_chunk(p)
+ end
+ else
+ for i, pid in ipairs(pids) do
+ cf:write_chunk(self:get_param(pid))
+ end
+ end
+ cf:close()
+end
+
+function ParamRepo:get_param(pid)
+ local p = self.params[pid]
+ if p == nil then
+ nerv.error("param with id %s not found", pid)
+ end
+ return p
+end
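+
+-- usage sketch (file and param names are hypothetical, gconf is the caller's
+-- global config table):
+--     local repo = nerv.ParamRepo()
+--     repo:import({"params.nerv"}, nil, gconf)
+--     local p = repo:get_param("affine0_ltp")
+--     repo:export("params_out.nerv", {"affine0_ltp"})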