83 files changed, 4169 insertions, 2124 deletions
diff --git a/.gitmodules b/.gitmodules index 9f556c5..2b346c4 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,6 +1,9 @@ [submodule "luajit-2.0"] path = luajit-2.0 - url = http://luajit.org/git/luajit-2.0.git + url = https://speechlab.sjtu.edu.cn/gitlab/nerv-dev/luajit.git [submodule "luarocks"] path = luarocks - url = https://github.com/keplerproject/luarocks.git + url = https://speechlab.sjtu.edu.cn/gitlab/nerv-dev/luarocks.git +[submodule "Penlight"] + path = Penlight + url = https://speechlab.sjtu.edu.cn/gitlab/nerv-dev/Penlight.git @@ -1,19 +1,42 @@ .PHONY: all clean install luajit luarocks speech +############## EDIT THESE LINES ##################### SHELL := /bin/bash PREFIX := $(CURDIR)/install/ -all: luajit luarocks install +#CUDA_BASE := /usr/local/cuda-7.0 +CUDA_BASE := /usr/local/cuda +BLAS_BASE := /usr/lib/ +BLAS_LDFLAGS := -L$(BLAS_BASE) -Wl,-rpath=$(BLAS_BASE) +BLAS_TYPE := atlas +KALDI_BASE := /speechlab/tools/KALDI/kaldi-master/ +####################################################### +MKL_LDFLAGS := -lmkl_rt +ATLAS_LDFLAGS := -lcblas -llapack_atlas +ifeq ($(BLAS_TYPE), mkl) +BLAS_LDFLAGS += $(MKL_LDFLAGS) +else ifeq ($(BLAS_TYPE), atlas) +BLAS_LDFLAGS += $(ATLAS_LDFLAGS) +else +$(error Invalid blas type) +endif +export CUDA_BASE +export KALDI_BASE +export BLAS_LDFLAGS + +.PHONY: nerv speech/speech_utils speech/htk_io speech/kaldi_io speech/kaldi_decode \ + nerv-clean speech/speech_utils-clean speech/htk_io-clean speech/kaldi_io-clean speech/kaldi_decode-clean \ + Penlight + +all: luajit luarocks Penlight nerv luajit: PREFIX=$(PREFIX) ./tools/build_luajit.sh luarocks: PREFIX=$(PREFIX) ./tools/build_luarocks.sh -install: - cd nerv; $(PREFIX)/bin/luarocks make CFLAGS=$(CFLAGS) -speech: - cd speech/speech_utils; $(PREFIX)/bin/luarocks make - cd speech/htk_io; $(PREFIX)/bin/luarocks make - cd speech/kaldi_io; $(PREFIX)/bin/luarocks make -clean: - cd nerv && make clean - cd speech/speech_utils && make clean - cd speech/htk_io && make clean - cd speech/kaldi_io && make clean + +speech: speech/speech_utils speech/htk_io speech/kaldi_io speech/kaldi_decode +speech-clean: speech/speech_utils-clean speech/htk_io-clean speech/kaldi_io-clean speech/kaldi_decode-clean +clean: nerv-clean speech-clean + +nerv Penlight speech/speech_utils speech/htk_io speech/kaldi_io speech/kaldi_decode: + cd $@; $(PREFIX)/bin/luarocks make +nerv-clean speech/speech_utils-clean speech/htk_io-clean speech/kaldi_io-clean speech/kaldi_decode-clean: + cd $(subst -clean,,$@); make clean LUA_BINDIR=$(PREFIX)/bin/ diff --git a/Penlight b/Penlight new file mode 160000 +Subproject 16d149338af9efc910528641c5240c5641aeb8d diff --git a/README.md b/README.md deleted file mode 100644 index fe9dfc1..0000000 --- a/README.md +++ /dev/null @@ -1,55 +0,0 @@ -#The Nerv Toolkit User Manual# -NOTE: This readme is obsolete and will be rearranged, for further information, please check http://nerv-sjtu.github.io/nerv/ - -This user manual will information about how to use __Nerv__ and __Nerv__'s interface. - -##How to make and start using## -First make sure you have __lua__ and __CUDA__ installed on your computer. -__Nerv__ is currently developed via github.You can download and make __Nerv__ by doing the following: -``` -cd ~ -git clone https://github.com/Nerv-SJTU/nerv.git -cd nerv -git submodule init && git submodule update -make -#To include some new CUDA feature(e.x. 
atomicCAS), use "make CFLAGS=-D__NERV_FUTURE_CUDA_7" - -#further, if you want the speech modules -git clone https://github.com/Nerv-SJTU/nerv-speech.git speech -make speech -``` -The `git submodule` command is for the __luajit__ repository inside __Nerv__. -Now, you can try to run some example scripts. -``` -./install/bin/nerv examples/cumatrix_example.lua -``` -To get an example of DNN(for ASR) training, run(this requires the speech modules) -You need to be at or (copy files from) `/slfs1`(SJTU speechlab cluster) to get this running. -``` -./install/bin/nerv nerv/examples/asr_trainer.lua nerv/examples/swb_baseline.lua -``` - -##How to contribute## -Fork the original repository, then use the __pull&merge__ function in github to contribute. -The pull&merge request can be found on your dashboard in github. See this [sync-help] to sync with the original repository. - -##Nerv Packages## -* __luaT__ -Nerv uses [luaT]\(a [Torch] library\) to define lua class in C. -* __[The Nerv OOP](nerv/doc/nerv_class.md)__ -Enables object-oriented programming in Nerv. -* __[The Nerv utility functions](nerv/doc/nerv.md)__ -Inlcudes some utility functions from luaT to implement __Nerv.Class__. -* __[The Nerv Matrix Package](nerv/doc/nerv_matrix.md)__ -The matrix package is a basic package in __Nerv__ that is used to store and manipulate matrices. -* __[The Nerv IO Package](nerv/doc/nerv_io.md)__ -The IO package is used to read and write parameters to file. -* __[The Nerv Parameter Package](nerv/doc/nerv_param.md)__ -The parameter package is used to store, read model parameters from file. -* __[The Nerv Layer Package](nerv/doc/nerv_layer.md)__ -The layer package is used to define propagation and backpropagation of different type of layers. -* __[The Nerv NN Package](nerv/doc/nerv_nn.md)__ -The nn package is for organizing a neural network, it contains __nerv.LayerRepo__, __nerv.ParamRepo__, and __nerv.DAGLayer__. -[luaT]:https://github.com/torch/torch7/tree/master/lib/luaT -[Torch]:https://github.com/torch -[sync-help]:https://help.github.com/articles/syncing-a-fork/ diff --git a/README.rst b/README.rst new file mode 100644 index 0000000..c00743c --- /dev/null +++ b/README.rst @@ -0,0 +1,64 @@ +NERV Toolkit +============ + +NOTE: This readme is in-progress. + +Installation +------------ +First, make sure you have at least one implementation of BLAS and CUDA installed +on your computer. + +- Checkout NERV: + + :: + + bash + git clone https://speechlab.sjtu.edu.cn/gitlab/nerv-dev/nerv.git + +- Checkout submodules (luajit, luarocks, Penlight, etc.): + + :: + + cd nerv + git submodule init && git submodule update + +- Build NERV: you can specify either ``mkl`` or ``atlas`` to ``BLAS_TYPE``. + ``BLAS_BASE`` is the directory containing BLAS ``.so`` files. By default, + ``atlas`` is used for ``BLAS_TYPE``, ``/usr/lib/`` is used for ``BLAS_BASE``, + and ``/usr/local/cuda`` is used for ``CUDA_BASE``. + + :: + + # an example for compiling on SJTU Speechlab major cluster + make BLAS_TYPE=mkl BLAS_BASE=/home/intel/mkl/lib/intel64/ CUDA_BASE=/usr/local/cuda + +- To include some new features (e.g. ``atomicCAS`` in CUDA), add corresponding flags to + ``NERV_FEAT`` (e.g. 
``NERV_FEAT=-D__NERV_FUTURE_CUDA_7``) while making: + + :: + + make NERV_FEAT=-D__NERV_FUTURE_CUDA_7 BLAS_TYPE=mkl BLAS_BASE=/home/intel/mkl/lib/intel64/ CUDA_BASE=/usr/local/cuda + +- For speech tasks, you need to install related lua rocks (Lua packages): + + :: + + # checkout speech repository to local directory nerv/speech (suppose you're + # still at the root directory of NERV repo) + git clone https://speechlab.sjtu.edu.cn/gitlab/nerv-dev/nerv-speech.git speech + # build and install HTK I/O support, Kaldi I/O support, Kaldi decoding support, etc. + make speech BLAS_TYPE=mkl BLAS_BASE=/home/intel/mkl/lib/intel64/ + +Example & Tutorial +------------------ +For speech tasks, please refer to ``tutorial/`` in ``nerv-speech`` repository. + +Contribution +------------ +The basic rule is simple: just fork the original repository, then create a pull +request (merge request) to the administrator of the project. If you want to fix +any bugs in existing code, don't hesitate to create a pull (merge) request to +the repository with clear and detailed analysis of the problem. If you want to +add additional task-specific functionalities (modules) for speech to NERV, +please create a luarocks-compliant package and also a pull (merge) request to +the ``nerv-speech`` repository instead of ``nerv``. diff --git a/nerv/Makefile b/nerv/Makefile index c0db53a..68465a1 100644 --- a/nerv/Makefile +++ b/nerv/Makefile @@ -1,3 +1,11 @@ +ifndef LUA_BINDIR +$(error Please build the package via luarocks: `luarocks make`) +endif + +ifndef CUDA_BASE +$(error CUDA_BASE is not set) +endif + .PHONY: build install clean SHELL := /bin/bash @@ -6,14 +14,15 @@ LIB_PATH := $(LUA_BINDIR)/../lib INC_PATH := $(LUA_BINDIR)/../include/nerv LUA_DIR = $(INST_LUADIR)/nerv OBJ_DIR := $(BUILD_DIR)/objs -ISUBDIR := io matrix luaT +ISUBDIR := lib matrix lib/io lib/matrix lib/luaT SUBDIR := matrix io layer examples nn tnn lib/io lib/luaT lib/matrix INC_SUBDIR := $(addprefix $(INC_PATH)/,$(ISUBDIR)) OBJ_SUBDIR := $(addprefix $(OBJ_DIR)/,$(SUBDIR)) LUA_SUBDIR := $(addprefix $(LUA_DIR)/,$(SUBDIR)) -INCS := common.h matrix/matrix.h io/chunk_file.h luaT/luaT.h +INCS := lib/common.h lib/matrix/matrix.h lib/matrix/mmatrix.h lib/io/chunk_file.h lib/luaT/luaT.h \ + matrix/matrix.h CORE_OBJS := lib/common.o lib/io/chunk_file.o \ lib/matrix/mmatrix.o lib/matrix/cumatrix.o lib/matrix/cukernel.o NERV_OBJS := nerv.o \ @@ -33,17 +42,17 @@ LUA_LIBS := matrix/init.lua io/init.lua init.lua \ layer/init.lua layer/affine.lua layer/sigmoid.lua layer/tanh.lua layer/softmax_ce.lua layer/softmax.lua \ layer/window.lua layer/bias.lua layer/combiner.lua layer/mse.lua \ layer/elem_mul.lua layer/lstm.lua layer/lstm_gate.lua layer/dropout.lua layer/gru.lua \ - nn/init.lua nn/layer_repo.lua nn/param_repo.lua nn/layer_dag.lua \ - io/sgd_buffer.lua \ - tnn/init.lua tnn/sutil.lua tnn/tnn.lua + layer/graph.lua layer/rnn.lua layer/duplicate.lua layer/identity.lua \ + nn/init.lua nn/layer_repo.lua nn/param_repo.lua nn/network.lua \ + io/sgd_buffer.lua io/seq_buffer.lua INCLUDE := -I $(LUA_INCDIR) -DLUA_USE_APICHECK -#CUDA_BASE := /usr/local/cuda-7.0 -CUDA_BASE := /usr/local/cuda CUDA_INCLUDE := -I $(CUDA_BASE)/include/ INCLUDE += $(CUDA_INCLUDE) -LDFLAGS := -L$(CUDA_BASE)/lib64/ -Wl,-rpath=$(CUDA_BASE)/lib64/ -lcudart -lcublas -lcurand +CUDA_LDFLAGS := -L$(CUDA_BASE)/lib64/ -Wl,-rpath=$(CUDA_BASE)/lib64/ -lcudart -lcuda -lcublas -lcurand +override CFLAGS += $(NERV_FEAT) + NVCC := $(CUDA_BASE)/bin/nvcc EMPTY := SPACE := $(EMPTY) $(EMPTY) @@ -64,11 +73,11 @@ $(LUA_DIR)/%.lua: 
%.lua cp $< $@ $(LIB_PATH)/libnervcore.so: $(CORE_OBJS) - gcc -shared -o $@ $^ $(LDFLAGS) -lcblas + gcc -shared -o $@ $^ $(LDFLAGS) $(CUDA_LDFLAGS) $(BLAS_LDFLAGS) $(LIB_PATH)/libluaT.so: $(LUAT_OBJS) - gcc -shared -o $@ $^ $(LDFLAGS) + gcc -shared -o $@ $^ $(INST_LIBDIR)/libnerv.so: $(NERV_OBJS) $(LIB_PATH)/libnervcore.so $(LIB_PATH)/libluaT.so - gcc -shared -o $@ $(NERV_OBJS) $(LDFLAGS) -Wl,-rpath=$(LIB_PATH) -L$(LIB_PATH) -lnervcore -lluaT + gcc -shared -o $@ $(NERV_OBJS) -Wl,-rpath=$(LIB_PATH) -L$(LIB_PATH) -lnervcore -lluaT $(OBJ_DIR)/matrix/cumatrix.o: matrix/generic/cumatrix.c matrix/generic/matrix.c $(OBJ_DIR)/matrix/mmatrix.o: matrix/generic/mmatrix.c matrix/generic/matrix.c @@ -82,5 +91,5 @@ clean: install: $(LIBS) $(LUA_DIR) $(LUA_SUBDIR) $(LUA_LIBS) $(INC_SUBDIR) $(INCS) -$(INC_PATH)/%.h: lib/%.h +$(INC_PATH)/%.h: %.h cp $< $@ diff --git a/nerv/doc/nerv.md b/nerv/doc/nerv.md index 28411f5..125928d 100644 --- a/nerv/doc/nerv.md +++ b/nerv/doc/nerv.md @@ -1,6 +1,6 @@ -#The Nerv utility functions# +# The Nerv utility functions Part of the [Nerv](../README.md) toolkit. -##Methods## +## Methods * __string = nerv.typename(obj a)__ A registered function, the original function is `luaT_lua_typename`. In some cases if you call `type(a)` for object of some class in __Nerv__(like __Nerv.CuMatrix__) it will only return "userdata"(because it is created in C), in this case you can use this method to get its type. @@ -14,4 +14,4 @@ A registered function, the original function is `luaT_newmetatable`, it returns * __string = nerv.setmetatable(table self, string tname)__ A registered function, the original function is `luaT_lua_setmetatable`. It assigns the metatable registered in __luaT__ by the name *tname* to the table *self*. And return *tname* to user. * __table = nerv.get_type(string typename)__ -Returns the type(`loadstring("return " .. typename)`).
\ No newline at end of file +Returns the type(`loadstring("return " .. typename)`). diff --git a/nerv/doc/nerv_class.md b/nerv/doc/nerv_class.md index 99f63e7..8314b12 100644 --- a/nerv/doc/nerv_class.md +++ b/nerv/doc/nerv_class.md @@ -1,10 +1,10 @@ -#The Nerv OOP# +# The Nerv OOP Part of the [Nerv](../README.md) toolkit. -##Methods## +## Methods * __metatable mt, metatable mpt = nerv.class(string tname, string parenttname)__ This method is used to create a class by the name `tname`, which inherits `parenttname` in __Nerv__, then you create a new instance of this class by calling `obj=tname(...)`. The `tname.__init(...)` method(if defined) will be called in the constructing. The metatable of the class and its parent class will be returned. -##Examples## +## Examples * This example implements a simple `nerv.Counter` class which is inherited by `nerv.BetterCounter`. ``` @@ -33,4 +33,4 @@ c1 = nerv.Counter(1) print(c1.c) bc1 = nerv.BetterCounter(1, 1) print(bc1.c, bc1.bc) -```
\ No newline at end of file +``` diff --git a/nerv/doc/nerv_io.md b/nerv/doc/nerv_io.md index 07589df..299362f 100644 --- a/nerv/doc/nerv_io.md +++ b/nerv/doc/nerv_io.md @@ -1,7 +1,7 @@ -#The Nerv IO Package# +# The Nerv IO Package Part of the [Nerv](../README.md) toolkit. -##Description## +## Description The main class that the user uses to store and read parameter object to and from files is __nerv.ChunkFile__. In the file, a parameter object will be saved using a standard format. First is the length(in byte) of this object, then a table which includes some meta information of the object, and a data area. Below is an example text file. ``` @@ -23,7 +23,7 @@ In the file, a parameter object will be saved using a standard format. First is 3.000000 3.000000 3.000000 ``` -##Methods## +## Methods * __ChunkFile ChunkFile(string fn, string mode)__ `mode` can be `r` or `w`, for reading or writing a file. The returned __ChunkFile__ will be ready to write or read objects which follows the __nerv.Param__ interface(using `write_chunk` and `read_chunk`). * __void ChunkFile.write_chunk(ChunkFile self, Param p)__ @@ -33,7 +33,7 @@ Read the __Param__ object by id `id` from the file `self`. It will be constructe * __void ChunkFile.close(ChunkFile self)__ Close the opened file. -##Examples## +## Examples * An example showing how to use __ChunkFile__ to store and read parameter objects. ``` require 'io' @@ -96,7 +96,7 @@ do end ``` -##Developer Notes## +## Developer Notes * There are four classes in to deal with chunk data, which are __nerv.ChunkFile__, __nerv.ChunkFileHandle__, __nerv.ChunkInfo__, __nerv.ChunkData__. Below is the underlying C structs. ``` typedef struct ChunkFileHandle { @@ -110,4 +110,5 @@ typedef struct ChunkData { char *data; } ChunkData; ``` -* In __Nerv.io__, a returned(by `ChunkFile.__init`) __nerv.ChunkFile__ will have a member `handle`, which is a __nerv.ChunkFileHandle__.
\ No newline at end of file + +* In __Nerv.io__, a returned(by `ChunkFile.__init`) __nerv.ChunkFile__ will have a member `handle`, which is a __nerv.ChunkFileHandle__. diff --git a/nerv/doc/nerv_layer.md b/nerv/doc/nerv_layer.md index de2fb12..dd7c9bb 100644 --- a/nerv/doc/nerv_layer.md +++ b/nerv/doc/nerv_layer.md @@ -1,9 +1,9 @@ -#The Nerv Layer Package# +# The Nerv Layer Package Part of the [Nerv](../README.md) toolkit. -##Description## +## Description __nerv.Layer__ is the base class and most of its methods are abstract. -###Class hierarchy and their members### +### Class hierarchy and their members * __nerv.Layer__. * `table dim_in` It specifies the dimensions of the inputs. * `table dim_out` It specifies the dimensions of the outputs. @@ -20,7 +20,7 @@ __nerv.Layer__ is the base class and most of its methods are abstract. * `int total_frams` Records how many frames have passed. * `bool compressed` The reference distribution can be a one-hot format. This feature is enabled by `layer_conf.compressed`. -##Methods## +## Methods * __void Layer.\_\_init(Layer self, string id, table global_conf, table layer_conf)__ Abstract method. The constructing method should assign `id` to `self.id` and `global_conf` to `self.gconf`, `layer_conf.dim_in` to `self.dim_in`, `layer_conf.dim_out` to `self.dim_out`. `dim_in` and `dim_out` are a list specifies the dimensions of the inputs and outputs. Also, `layer_conf` will include the parameters, which should also be properly saved. @@ -43,7 +43,7 @@ Check whether `#self.dim_in == len_in` and `#self.dim_out == len_out`, if violat Abstract method. The layer should return a list containing its parameters. -####nerv.Layer.get\_dim(self)#### +#### nerv.Layer.get\_dim(self) * Returns: `dim_in`: __table__. `dim_out`: __table__. @@ -52,7 +52,7 @@ The layer should return a list containing its parameters. * Description: Returns `self.dim_in, self.dim_out`. -##Examples## +## Examples * a basic example using __Nerv__ layers to a linear classification. ``` @@ -178,3 +178,4 @@ for l = 0, 10, 1 do end --[[end training]]-- ``` + diff --git a/nerv/doc/nerv_matrix.md b/nerv/doc/nerv_matrix.md index dfd843d..3782eb3 100644 --- a/nerv/doc/nerv_matrix.md +++ b/nerv/doc/nerv_matrix.md @@ -1,8 +1,8 @@ -#The Nerv Matrix Package# +# The Nerv Matrix Package Part of the [Nerv](../README.md) toolkit. -##Description## -###Underlying structure### +## Description +### Underlying structure In the begining is could be useful to know something about the underlying structure of a __Nerv__ matrix. Please keep in mind that matrice in __Nerv__ is row-major. Every matrix object is a encapsulation of a C struct that describes the attributes of this matrix. ``` @@ -20,12 +20,12 @@ typedef struct Matrix { It is worth mentioning that that `data_ref` is a counter which counts the number of references to its memory space, mind that it will also be increased when a row of the matrix is referenced(`col = m[2]`). A __Nerv__ matrix will deallocate its space when this counter is decreased to zero. Also note that all assigning operation in __Nerv__ is reference copy, you can use `copy_tod` or `copy_toh` method to copy value. Also, row assigning operations like `m1[2]=m2[3]` is forbidden in __Nerv__. -###Class hierarchy### +### Class hierarchy The class hierarchy of the matrix classes can be clearly observed in `matrix/init.c`. First there is a abstract base class __Nerv.Matrix__, which is inherited by __Nerv.CuMatrix__ and __Nerv.MMatrix__(also abstract). 
Finally, there is __Nerv.CuMatrixFloat__, __Nerv.CuMatrixDouble__, inheriting __Nerv.CuMatrix__, and __Nerv.MMatrixFloat__, __Nerv.MMatrixDouble__, __Nerv.MMatrixInt__ , inheriting __Nerv.MMatrix__. -##Methods## +## Methods Mind that usually a matrix object can only do calculation with matrix of its own type(a __Nerv.CuMatrixFloat__ matrix can only do add operation with a __Nerv.CuMatrixFloat__). In the methods description below, __Matrix__ could be __Nerv.CuMatrixFloat__, __Nerv.CuMatrixDouble__, __Nerv.MMatrixFloat__ or __Nerv.MMatrixDouble__. __Element_type__ could be `float` or `double`, respectively. * __Matrix = Matrix(int nrow, int ncol)__ @@ -53,6 +53,8 @@ Return a new __Matrix__ of size (1,`self.ncol`), which stores the sum of all col Return a new __Matrix__ of size (`self.nrow`,1), which stores the sum of all rows of __Matrix__ `self`. * __Matrix Matrix.rowmax(Matrix self)__ Return a new __Matrix__ of size (`self.nrow`,1), which stores the max value of all rows of __Matrix__ `self`. +* __Matrix Matrix.rowmax_idx(Matrix self)__ +Return two new __Matrix__ of size (`self.nrow`,1), which stores the max value of all rows of __Matrix__ `self`, and its corresponding column indices(start from zero). * __Matrix Matrix.trans(Matrix self)__ Return a new __Matrix__ of size (`self.ncol`,`self.nrow`), which stores the transpose of __Matrix__ `self`. * __void Matrix.copy_fromh(Matrix self, MMatrix a)__ @@ -81,8 +83,8 @@ Fill the content of __Matrix__ `self` to be `value`. Set the element of __Matrix__ `self` to be elementwise-sigmoid of `ma`. * __void Matrix.sigmoid_grad(Matrix self, Matrix err, Matrix output)__ Set the element of __Matrix__ `self`, to be `self[i][j]=err[i][j]*output[i][j]*(1-output[i][j])`. This function is used to propagate sigmoid layer error. -* __void Matrix.softmax(Matrix self, Matrix a)__ -Calculate a row-by-row softmax of __Matrix__ `a` and save the result in `self`. +* __Matrix Matrix.softmax(Matrix self, Matrix a)__ +Calculate a row-by-row softmax of __Matrix__ `a` and save the result in `self`. Returns a new `self.nrow*1` index matrix that stores the index of the maximum value of each row. * __void Matrix.mul_elem(Matrix self, Matrix ma, Matrix mb)__ Calculate element-wise multiplication of __Matrix__ `ma` and `mb`, store the result in `self`. * __void Matrix.log_elem(Matrix self, Matrix ma)__ @@ -113,7 +115,7 @@ Write `self` to the file position in `chunk`. * __void MMatrix.copy_from(MMatrix ma, MMatrix mb,[int b_bgein, int b_end, int a_begin])__ Copy a part of `mb`(rows of index `[b_begin..b_end)`) to `ma` beginning at row index `a_begin`. If not specified, `b_begin` will be `0`, `b_end` will be `b.nrow`, `a_begin` will be `0`. -##Examples## +## Examples * Use `get_dataref_value` to test __Nerv__'s matrix space allocation. ``` m = 10 @@ -134,6 +136,7 @@ print("test fm:get_dataref_value:", fm:get_dataref_value()) print(fm) print(dm) ``` + * Test some __Matrix__ calculations. ``` m = 4 @@ -167,3 +170,4 @@ print(a) a:log_elem(fs) print(a) ``` + diff --git a/nerv/doc/nerv_nn.md b/nerv/doc/nerv_nn.md index c57447d..63537fb 100644 --- a/nerv/doc/nerv_nn.md +++ b/nerv/doc/nerv_nn.md @@ -1,19 +1,19 @@ -#The Nerv NN Package# +# The Nerv NN Package Part of the [Nerv](../README.md) toolkit. -##Description## -###Class hierarchy### +## Description +### Class hierarchy it contains __nerv.LayerRepo__, __nerv.ParamRepo__, and __nerv.DAGLayer__(inherits __nerv.Layer__). 
-###Class hierarchy and their members### -####nerv.ParamRepo#### +### Class hierarchy and their members +#### nerv.ParamRepo Get parameter object by ID. * `table param_table` Contains the mapping of parameter ID to parameter file(__nerv.ChunkFile__) * __nerv.LayerRepo__ Get layer object by ID. * `table layers` Contains the mapping of layer ID to layer object. objects. -####__nerv.DAGLayer__#### +#### __nerv.DAGLayer__ Inherits __nerv.Layer__. * `layers`: __table__, a mapping from a layer ID to its "ref". A ref is a structure that contains reference to space allocations and other info of the layer. * `inputs`: __table__, a mapping from the inputs ports of the DAG layer to the input ports of the sublayer, the key is the port number, the value is `{ref, port}`. @@ -21,17 +21,17 @@ Inherits __nerv.Layer__. * `parsed_conn`: __table__, a list of parsed connections, each entry is of format `{{ref_from, port_from}, {ref_to, port_to}}`. * `queue`: __table__, a list of "ref"s, the propagation of the DAGLayer will follow this order, and back-propagation will follow a reverse order. -##Methods## +## Methods -###__nerv.ParamRepo__### +### __nerv.ParamRepo__ -####nerv.ParamRepo:\_\_init(param\_files)#### +#### nerv.ParamRepo:\_\_init(param\_files) * Parameters: `param_files`: __table__ * Description: `param_files` is a list of file names that stores parameters, the newed __ParamRepo__ will read them from file and store the mapping for future fetching. -####nerv.Param ParamRepo.get_param(ParamRepo self, string pid, table global_conf)#### +#### nerv.Param ParamRepo.get_param(ParamRepo self, string pid, table global_conf) * Returns: __nerv.Layer__ * Parameters: @@ -41,8 +41,8 @@ Inherits __nerv.Layer__. * Description: __ParamRepo__ will find the __nerv.ChunkFile__ `pf` that contains parameter of ID `pid` and return `pf:read_chunk(pid, global_conf)`. -###__nerv.LayerRepo__### -####nerv.LayerRepo:\_\_init(layer\_spec, param\_repo, global\_conf)#### +### __nerv.LayerRepo__ +#### nerv.LayerRepo:\_\_init(layer\_spec, param\_repo, global\_conf) * Returns: __nerv.LayerRepo__. * Parameters: @@ -60,7 +60,7 @@ Inherits __nerv.Layer__. __LayerRepo__ will merge `param_config` into `layer_config` and construct a layer by calling `layer_type(layerid, global_conf, layer_config)`. -####nerv.LayerRepo.get\_layer(self, lid)#### +#### nerv.LayerRepo.get\_layer(self, lid) * Returns: __nerv.LayerRepo__, the layer with ID `lid`. * Parameters: @@ -69,8 +69,8 @@ Inherits __nerv.Layer__. * Description: Returns the layer with ID `lid`. -###nerv.DAGLayer### -####nerv.DAGLayer:\_\_init(id, global\_conf, layer\_conf)#### +### nerv.DAGLayer +#### nerv.DAGLayer:\_\_init(id, global\_conf, layer\_conf) * Returns: __nerv.DAGLayer__ * Parameters: @@ -89,7 +89,7 @@ Inherits __nerv.Layer__. }}) ``` -####nerv.DAGLayer.init(self, batch\_size)#### +#### nerv.DAGLayer.init(self, batch\_size) * Parameters: `self`: __nerv.DAGLayer__ `batch_size`: __int__ @@ -97,7 +97,7 @@ Inherits __nerv.Layer__. This initialization method will allocate space for output and input matrice, and will call `init()` for each of its sub layers. -####nerv.DAGLayer.propagate(self, input, output)#### +#### nerv.DAGLayer.propagate(self, input, output) * Parameters: `self`: __nerv.DAGLayer__ `input`: __table__ @@ -105,7 +105,7 @@ Inherits __nerv.Layer__. * Description: The same function as __nerv.Layer.propagate__, do propagation for each layer in the order of `self.queue`. 
-####nerv.DAGLayer.back\_propagate(self, next\_bp\_err, bp\_err, input, output)#### +#### nerv.DAGLayer.back\_propagate(self, next\_bp\_err, bp\_err, input, output) * Parameters: `self`: __nerv.DAGLayer__ `next_bp_err`: __table__ @@ -115,7 +115,7 @@ Inherits __nerv.Layer__. * Description: The same function as __nerv.Layer.back_propagate__, do back-propagation for each layer in the reverse order of `self.queue`. -####nerv.DAGLayer.update(self, bp\_err, input, output)#### +#### nerv.DAGLayer.update(self, bp\_err, input, output) * Parameters: `self`: __nerv.DAGLayer__ `bp_err`: __table__ @@ -124,7 +124,7 @@ Inherits __nerv.Layer__. * Description: The same function as __nerv.Layer.update__, do update for each layer in the order of `self.queue`. -##Examples## +## Examples * aaa ``` @@ -253,4 +253,5 @@ for l = 0, 10, 1 do ce_last = softmaxL.total_ce end --[[end training]]-- -```
\ No newline at end of file +``` + diff --git a/nerv/doc/nerv_param.md b/nerv/doc/nerv_param.md index 167cb11..98793f0 100644 --- a/nerv/doc/nerv_param.md +++ b/nerv/doc/nerv_param.md @@ -1,17 +1,17 @@ -#The Nerv Parameter Package# +# The Nerv Parameter Package Part of the [Nerv](../README.md) toolkit. -##Description## -###Class hierarchy### +## Description +### Class hierarchy There is a base class __Nerv.Param__ defined in `layer/init.lua`. -###Class hierarchy and their members### +### Class hierarchy and their members * __nerv.MatrixParam__ inherits __nerv.Param__ * `Matrix trans` stores the parameter matrix. * __nerv.LinearTransParam__ inherits __Nerv.MatrixParam__. * __Nerv.BiasParam__ inherits __Nerv.MatrixParam__. -##Methods## +## Methods * __void Param.\_\_init(Param self, string id, table global_conf)__ Constructor of a __Param__, it will set `self.id` to be `id` and `self.gconf` to be `global_conf`. * __void Param.set_info(Param self, table info)__ diff --git a/nerv/examples/asr_trainer.lua b/nerv/examples/asr_trainer.lua index 3fa2653..6bdf57c 100644 --- a/nerv/examples/asr_trainer.lua +++ b/nerv/examples/asr_trainer.lua @@ -1,25 +1,48 @@ -function build_trainer(ifname) - local param_repo = nerv.ParamRepo() - param_repo:import(ifname, nil, gconf) - local layer_repo = make_layer_repo(param_repo) - local network = get_network(layer_repo) - local global_transf = get_global_transf(layer_repo) - local input_order = get_input_order() +require 'lfs' +require 'pl' +local function build_trainer(ifname) + local host_param_repo = nerv.ParamRepo() local mat_type + local src_loc_type + local train_loc_type + host_param_repo:import(ifname, nil, gconf) if gconf.use_cpu then mat_type = gconf.mmat_type + src_loc_type = nerv.ParamRepo.LOC_TYPES.ON_HOST + train_loc_type = nerv.ParamRepo.LOC_TYPES.ON_HOST else mat_type = gconf.cumat_type + src_loc_type = nerv.ParamRepo.LOC_TYPES.ON_HOST + train_loc_type = nerv.ParamRepo.LOC_TYPES.ON_DEVICE end - local iterative_trainer = function (prefix, scp_file, bp) + local param_repo = host_param_repo:copy(train_loc_type) + local layer_repo = make_layer_repo(param_repo) + local network = get_network(layer_repo) + local global_transf = get_global_transf(layer_repo) + local input_order = get_input_order() + + network = nerv.Network("nt", gconf, {network = network}) + network:init(gconf.batch_size, 1) + global_transf = nerv.Network("gt", gconf, {network = global_transf}) + global_transf:init(gconf.batch_size, 1) + + local iterative_trainer = function (prefix, scp_file, bp, rebind_param_repo) + -- rebind the params if necessary + if rebind_param_repo then + host_param_repo = rebind_param_repo + param_repo = host_param_repo:copy(train_loc_type) + layer_repo:rebind(param_repo) + rebind_param_repo = nil + end gconf.randomize = bp -- build buffer local buffer = make_buffer(make_readers(scp_file, layer_repo)) -- initialize the network - network:init(gconf.batch_size) gconf.cnt = 0 err_input = {mat_type(gconf.batch_size, 1)} err_input[1]:fill(1) + network:epoch_init() + global_transf:epoch_init() for data in buffer.get_data, buffer do -- prine stat periodically gconf.cnt = gconf.cnt + 1 @@ -53,10 +76,17 @@ function build_trainer(ifname) for i = 1, #input do table.insert(err_output, input[i]:create()) end - network:propagate(input, output) + network:mini_batch_init({seq_length = table.vector(gconf.batch_size, 1), + new_seq = {}, + do_train = bp, + input = {input}, + output = {output}, + err_input = {err_input}, + err_output = {err_output}}) + network:propagate() if bp 
then - network:back_propagate(err_input, err_output, input, output) - network:update(err_input, input, output) + network:back_propagate() + network:update() end -- collect garbage in-time to save GPU memory collectgarbage("collect") @@ -64,61 +94,193 @@ function build_trainer(ifname) print_stat(layer_repo) mat_type.print_profile() mat_type.clear_profile() - if (not bp) and prefix ~= nil then - nerv.info("writing back...") - local fname = string.format("%s_cv%.3f.nerv", - prefix, get_accuracy(layer_repo)) - network:get_params():export(fname, nil) + local fname + if (not bp) then + host_param_repo = param_repo:copy(src_loc_type) + if prefix ~= nil then + nerv.info("writing back...") + fname = string.format("%s_cv%.3f.nerv", + prefix, get_accuracy(layer_repo)) + host_param_repo:export(fname, nil) + end end - return get_accuracy(layer_repo) + return get_accuracy(layer_repo), host_param_repo, fname end return iterative_trainer end -dofile(arg[1]) -start_halving_inc = 0.5 -halving_factor = 0.6 -end_halving_inc = 0.1 -min_iter = 1 -max_iter = 20 -min_halving = 5 -gconf.batch_size = 256 -gconf.buffer_size = 81920 +local function check_and_add_defaults(spec, opts) + local function get_opt_val(k) + return opts[string.gsub(k, '_', '-')].val + end + local opt_v = get_opt_val("resume_from") + if opt_v then + gconf = dofile(opt_v) + else + for k, v in pairs(spec) do + local opt_v = get_opt_val(k) + if opt_v ~= nil then + gconf[k] = opt_v + elseif gconf[k] ~= nil then + elseif v ~= nil then + gconf[k] = v + end + end + end +end -local pf0 = gconf.initialized_param -local trainer = build_trainer(pf0) ---local trainer = build_trainer("c3.nerv") -local accu_best = trainer(nil, gconf.cv_scp, false) -local do_halving = false - -nerv.info("initial cross validation: %.3f", accu_best) -for i = 1, max_iter do - nerv.info("[NN] begin iteration %d with lrate = %.6f", i, gconf.lrate) - local accu_tr = trainer(nil, gconf.tr_scp, true) - nerv.info("[TR] training set %d: %.3f", i, accu_tr) - local accu_new = trainer( - string.format("%s_%s_iter_%d_lr%f_tr%.3f", - string.gsub( - (string.gsub(pf0[1], "(.*/)(.*)", "%2")), - "(.*)%..*", "%1"), - os.date("%Y%m%d%H%M%S"), - i, gconf.lrate, - accu_tr), - gconf.cv_scp, false) - nerv.info("[CV] cross validation %d: %.3f", i, accu_new) - -- TODO: revert the weights - local accu_diff = accu_new - accu_best - if do_halving and accu_diff < end_halving_inc and i > min_iter then - break +local function make_options(spec) + local options = {} + for k, v in pairs(spec) do + table.insert(options, + {string.gsub(k, '_', '-'), nil, type(v), default = v}) end - if accu_diff < start_halving_inc and i >= min_halving then - do_halving = true + return options +end + +local function print_help(options) + nerv.printf("Usage: <asr_trainer.lua> [options] network_config.lua\n") + nerv.print_usage(options) +end + +local function print_gconf() + local key_maxlen = 0 + for k, v in pairs(gconf) do + key_maxlen = math.max(key_maxlen, #k or 0) end - if do_halving then - gconf.lrate = gconf.lrate * halving_factor + local function pattern_gen() + return string.format("%%-%ds = %%s\n", key_maxlen) end - if accu_new > accu_best then - accu_best = accu_new + nerv.info("ready to train with the following gconf settings:") + nerv.printf(pattern_gen(), "Key", "Value") + for k, v in pairs(gconf) do + nerv.printf(pattern_gen(), k or "", v or "") end +end + +local function dump_gconf(fname) + local f = io.open(fname, "w") + f:write("return ") + f:write(table.tostring(gconf)) + f:close() +end + +local 
trainer_defaults = { + lrate = 0.8, + batch_size = 256, + buffer_size = 81920, + wcost = 1e-6, + momentum = 0.9, + start_halving_inc = 0.5, + halving_factor = 0.6, + end_halving_inc = 0.1, + cur_iter = 1, + min_iter = 1, + max_iter = 20, + min_halving = 5, + do_halving = false, + cumat_tname = "nerv.CuMatrixFloat", + mmat_tname = "nerv.MMatrixFloat", + debug = false, +} + +local options = make_options(trainer_defaults) +local extra_opt_spec = { + {"tr-scp", nil, "string"}, + {"cv-scp", nil, "string"}, + {"resume-from", nil, "string"}, + {"help", "h", "boolean", default = false, desc = "show this help information"}, + {"dir", nil, "string", desc = "specify the working directory"}, +} + +table.extend(options, extra_opt_spec) + +arg, opts = nerv.parse_args(arg, options) + +if #arg < 1 or opts["help"].val then + print_help(options) + return +end + +dofile(arg[1]) + +--[[ + +Rule: command-line option overrides network config overrides trainer default. +Note: config key like aaa_bbbb_cc could be overriden by specifying +--aaa-bbbb-cc to command-line arguments. + +]]-- + +check_and_add_defaults(trainer_defaults, opts) +gconf.mmat_type = nerv.get_type(gconf.mmat_tname) +gconf.cumat_type = nerv.get_type(gconf.cumat_tname) +gconf.use_cpu = econf.use_cpu or false + +local pf0 = gconf.initialized_param +local date_pattern = "%Y%m%d%H%M%S" +local logfile_name = "log" +local working_dir = opts["dir"].val or string.format("nerv_%s", os.date(date_pattern)) +local rebind_param_repo = nil + +print_gconf() +if not lfs.mkdir(working_dir) then + nerv.error("[asr_trainer] working directory already exists") +end +-- copy the network config +dir.copyfile(arg[1], working_dir) +-- set logfile path +nerv.set_logfile(path.join(working_dir, logfile_name)) +path.chdir(working_dir) + +-- start the training +local trainer = build_trainer(pf0) +local pr_prev +gconf.accu_best, pr_prev = trainer(nil, gconf.cv_scp, false) +nerv.info("initial cross validation: %.3f", gconf.accu_best) +for i = gconf.cur_iter, gconf.max_iter do + local stop = false + gconf.cur_iter = i + dump_gconf(string.format("iter_%d.meta", i)) + repeat -- trick to implement `continue` statement + nerv.info("[NN] begin iteration %d with lrate = %.6f", i, gconf.lrate) + local accu_tr = trainer(nil, gconf.tr_scp, true, rebind_param_repo) + nerv.info("[TR] training set %d: %.3f", i, accu_tr) + local param_prefix = string.format("%s_%s_iter_%d_lr%f_tr%.3f", + string.gsub( + (string.gsub(pf0[1], "(.*/)(.*)", "%2")), + "(.*)%..*", "%1"), + os.date(date_pattern), + i, gconf.lrate, + accu_tr) + local accu_new, pr_new, param_fname = trainer(param_prefix, gconf.cv_scp, false) + nerv.info("[CV] cross validation %d: %.3f", i, accu_new) + local accu_prev = gconf.accu_best + if accu_new < gconf.accu_best then + nerv.info("rejecting the trained params, rollback to the previous one") + file.move(param_fname, param_fname .. 
".rejected") + rebind_param_repo = pr_prev + break -- `continue` equivalent + else + nerv.info("accepting the trained params") + gconf.accu_best = accu_new + pr_prev = pr_new + gconf.initialized_param = {path.join(path.currentdir(), param_fname)} + end + if gconf.do_halving and + gconf.accu_best - accu_prev < gconf.end_halving_inc and + i > gconf.min_iter then + stop = true + break + end + if gconf.accu_best - accu_prev < gconf.start_halving_inc and + i >= gconf.min_halving then + gconf.do_halving = true + end + if gconf.do_halving then + gconf.lrate = gconf.lrate * gconf.halving_factor + end + until true + if stop then break end -- nerv.Matrix.print_profile() end diff --git a/nerv/examples/network_debug/config.lua b/nerv/examples/network_debug/config.lua new file mode 100644 index 0000000..e20d5a9 --- /dev/null +++ b/nerv/examples/network_debug/config.lua @@ -0,0 +1,62 @@ +function get_global_conf() + local global_conf = { + lrate = 0.15, + wcost = 1e-5, + momentum = 0, + clip = 5, + cumat_type = nerv.CuMatrixFloat, + mmat_type = nerv.MMatrixFloat, + vocab_size = 10000, + nn_act_default = 0, + hidden_size = 300, + layer_num = 1, + chunk_size = 15, + batch_size = 20, + max_iter = 35, + param_random = function() return (math.random() / 5 - 0.1) end, + dropout_rate = 0.5, + timer = nerv.Timer(), + pr = nerv.ParamRepo(), + } + return global_conf +end + +function get_layers(global_conf) + local pr = global_conf.pr + local layers = { + ['nerv.LSTMLayer'] = {}, + ['nerv.DropoutLayer'] = {}, + ['nerv.SelectLinearLayer'] = { + ['select'] = {dim_in = {1}, dim_out = {global_conf.hidden_size}, vocab = global_conf.vocab_size, pr = pr}, + }, + ['nerv.AffineLayer'] = { + output = {dim_in = {global_conf.hidden_size}, dim_out = {global_conf.vocab_size}, pr = pr} + }, + ['nerv.SoftmaxCELayer'] = { + softmax = {dim_in = {global_conf.vocab_size, global_conf.vocab_size}, dim_out = {1}, compressed = true}, + }, + } + for i = 1, global_conf.layer_num do + layers['nerv.LSTMLayer']['lstm' .. i] = {dim_in = {global_conf.hidden_size}, dim_out = {global_conf.hidden_size}, pr = pr} + layers['nerv.DropoutLayer']['dropout' .. i] = {dim_in = {global_conf.hidden_size}, dim_out = {global_conf.hidden_size}} + end + return layers +end + +function get_connections(global_conf) + local connections = { + {'<input>[1]', 'select[1]', 0}, + {'select[1]', 'lstm1[1]', 0}, + {'dropout' .. global_conf.layer_num .. '[1]', 'output[1]', 0}, + {'output[1]', 'softmax[1]', 0}, + {'<input>[2]', 'softmax[2]', 0}, + {'softmax[1]', '<output>[1]', 0}, + } + for i = 1, global_conf.layer_num do + table.insert(connections, {'lstm' .. i .. '[1]', 'dropout' .. i .. '[1]', 0}) + if i < 1 then + table.insert(connections, {'dropout' .. (i - 1) .. '[1]', 'lstm' .. i .. '[1]', 0}) + end + end + return connections +end diff --git a/nerv/examples/network_debug/main.lua b/nerv/examples/network_debug/main.lua new file mode 100644 index 0000000..790c404 --- /dev/null +++ b/nerv/examples/network_debug/main.lua @@ -0,0 +1,45 @@ +nerv.include('reader.lua') +nerv.include('timer.lua') +nerv.include('config.lua') +nerv.include(arg[1]) + +local global_conf = get_global_conf() +local timer = global_conf.timer + +timer:tic('IO') + +local data_path = 'examples/lmptb/PTBdata/' +local train_reader = nerv.Reader(data_path .. 'vocab', data_path .. 'ptb.train.txt.adds') +local val_reader = nerv.Reader(data_path .. 'vocab', data_path .. 
'ptb.valid.txt.adds') + +local train_data = train_reader:get_all_batch(global_conf) +local val_data = val_reader:get_all_batch(global_conf) + +local layers = get_layers(global_conf) +local connections = get_connections(global_conf) + +local NN = nerv.NN(global_conf, train_data, val_data, layers, connections) + +timer:toc('IO') +timer:check('IO') +io.flush() + +timer:tic('global') +local best_cv = 1e10 +for i = 1, global_conf.max_iter do + timer:tic('Epoch' .. i) + local train_ppl, val_ppl = NN:epoch() + if val_ppl < best_cv then + best_cv = val_ppl + else + global_conf.lrate = global_conf.lrate / 2.0 + end + nerv.printf('Epoch %d: %f %f %f\n', i, global_conf.lrate, train_ppl, val_ppl) + timer:toc('Epoch' .. i) + timer:check('Epoch' .. i) + io.flush() +end +timer:toc('global') +timer:check('global') +timer:check('network') +timer:check('gc') diff --git a/nerv/examples/network_debug/network.lua b/nerv/examples/network_debug/network.lua new file mode 100644 index 0000000..5518e27 --- /dev/null +++ b/nerv/examples/network_debug/network.lua @@ -0,0 +1,110 @@ +nerv.include('select_linear.lua') + +local nn = nerv.class('nerv.NN') + +function nn:__init(global_conf, train_data, val_data, layers, connections) + self.gconf = global_conf + self.network = self:get_network(layers, connections) + self.train_data = self:get_data(train_data) + self.val_data = self:get_data(val_data) +end + +function nn:get_network(layers, connections) + local layer_repo = nerv.LayerRepo(layers, self.gconf.pr, self.gconf) + local graph = nerv.GraphLayer('graph', self.gconf, + {dim_in = {1, self.gconf.vocab_size}, dim_out = {1}, + layer_repo = layer_repo, connections = connections}) + local network = nerv.Network('network', self.gconf, + {network = graph, clip = self.gconf.clip}) + network:init(self.gconf.batch_size, self.gconf.chunk_size) + return network +end + +function nn:get_data(data) + local err_output = {} + local softmax_output = {} + local output = {} + for i = 1, self.gconf.chunk_size do + err_output[i] = self.gconf.cumat_type(self.gconf.batch_size, 1) + softmax_output[i] = self.gconf.cumat_type(self.gconf.batch_size, self.gconf.vocab_size) + output[i] = self.gconf.cumat_type(self.gconf.batch_size, 1) + end + local ret = {} + for i = 1, #data do + ret[i] = {} + ret[i].input = {} + ret[i].output = {} + ret[i].err_input = {} + ret[i].err_output = {} + for t = 1, self.gconf.chunk_size do + ret[i].input[t] = {} + ret[i].output[t] = {} + ret[i].err_input[t] = {} + ret[i].err_output[t] = {} + ret[i].input[t][1] = data[i].input[t] + ret[i].input[t][2] = data[i].output[t] + ret[i].output[t][1] = output[t] + local err_input = self.gconf.mmat_type(self.gconf.batch_size, 1) + for j = 1, self.gconf.batch_size do + if t <= data[i].seq_len[j] then + err_input[j - 1][0] = 1 + else + err_input[j - 1][0] = 0 + end + end + ret[i].err_input[t][1] = self.gconf.cumat_type.new_from_host(err_input) + ret[i].err_output[t][1] = err_output[t] + ret[i].err_output[t][2] = softmax_output[t] + end + ret[i].seq_length = data[i].seq_len + ret[i].new_seq = {} + for j = 1, self.gconf.batch_size do + if data[i].seq_start[j] then + table.insert(ret[i].new_seq, j) + end + end + end + return ret +end + +function nn:process(data, do_train) + local timer = self.gconf.timer + local total_err = 0 + local total_frame = 0 + self.network:epoch_init() + for id = 1, #data do + data[id].do_train = do_train + timer:tic('network') + self.network:mini_batch_init(data[id]) + self.network:propagate() + timer:toc('network') + for t = 1, self.gconf.chunk_size do + 
local tmp = data[id].output[t][1]:new_to_host() + for i = 1, self.gconf.batch_size do + if t <= data[id].seq_length[i] then + total_err = total_err + math.log10(math.exp(tmp[i - 1][0])) + total_frame = total_frame + 1 + end + end + end + if do_train then + timer:tic('network') + self.network:back_propagate() + self.network:update() + timer:toc('network') + end + timer:tic('gc') + collectgarbage('collect') + timer:toc('gc') + end + return math.pow(10, - total_err / total_frame) +end + +function nn:epoch() + local train_error = self:process(self.train_data, true) + local tmp = self.gconf.dropout_rate + self.gconf.dropout_rate = 0 + local val_error = self:process(self.val_data, false) + self.gconf.dropout_rate = tmp + return train_error, val_error +end diff --git a/nerv/examples/network_debug/reader.lua b/nerv/examples/network_debug/reader.lua new file mode 100644 index 0000000..d2624d3 --- /dev/null +++ b/nerv/examples/network_debug/reader.lua @@ -0,0 +1,113 @@ +local Reader = nerv.class('nerv.Reader') + +function Reader:__init(vocab_file, input_file) + self:get_vocab(vocab_file) + self:get_seq(input_file) +end + +function Reader:get_vocab(vocab_file) + local f = io.open(vocab_file, 'r') + local id = 0 + self.vocab = {} + while true do + local word = f:read() + if word == nil then + break + end + self.vocab[word] = id + id = id + 1 + end + self.size = id +end + +function Reader:split(s, t) + local ret = {} + for x in (s .. t):gmatch('(.-)' .. t) do + table.insert(ret, x) + end + return ret +end + +function Reader:get_seq(input_file) + local f = io.open(input_file, 'r') + self.seq = {} + while true do + local seq = f:read() + if seq == nil then + break + end + seq = self:split(seq, ' ') + local tmp = {} + for i = 1, #seq do + if seq[i] ~= '' then + table.insert(tmp, self.vocab[seq[i]]) + end + end + table.insert(self.seq, tmp) + end +end + +function Reader:get_in_out(id, pos) + return self.seq[id][pos], self.seq[id][pos + 1], pos + 1 == #self.seq[id] +end + +function Reader:get_all_batch(global_conf) + local data = {} + local pos = {} + local offset = 1 + for i = 1, global_conf.batch_size do + pos[i] = nil + end + while true do + --for i = 1, 100 do + local input = {} + local output = {} + for i = 1, global_conf.chunk_size do + input[i] = global_conf.mmat_type(global_conf.batch_size, 1) + input[i]:fill(global_conf.nn_act_default) + output[i] = global_conf.mmat_type(global_conf.batch_size, 1) + output[i]:fill(global_conf.nn_act_default) + end + local seq_start = {} + local seq_end = {} + local seq_len = {} + for i = 1, global_conf.batch_size do + seq_start[i] = false + seq_end[i] = false + seq_len[i] = 0 + end + local has_new = false + for i = 1, global_conf.batch_size do + if pos[i] == nil then + if offset < #self.seq then + seq_start[i] = true + pos[i] = {offset, 1} + offset = offset + 1 + end + end + if pos[i] ~= nil then + has_new = true + for j = 1, global_conf.chunk_size do + local final + input[j][i-1][0], output[j][i-1][0], final = self:get_in_out(pos[i][1], pos[i][2]) + seq_len[i] = j + if final then + seq_end[i] = true + pos[i] = nil + break + end + pos[i][2] = pos[i][2] + 1 + end + end + end + if not has_new then + break + end + for i = 1, global_conf.chunk_size do + input[i] = global_conf.cumat_type.new_from_host(input[i]) + output[i] = global_conf.cumat_type.new_from_host(output[i]) + end + table.insert(data, {input = input, output = output, seq_start = seq_start, seq_end = seq_end, seq_len = seq_len}) + end + return data +end diff --git 
a/nerv/examples/network_debug/select_linear.lua b/nerv/examples/network_debug/select_linear.lua new file mode 100644 index 0000000..91beedf --- /dev/null +++ b/nerv/examples/network_debug/select_linear.lua @@ -0,0 +1,59 @@ +local SL = nerv.class('nerv.SelectLinearLayer', 'nerv.Layer') + +--id: string +--global_conf: table +--layer_conf: table +--Get Parameters +function SL:__init(id, global_conf, layer_conf) + nerv.Layer.__init(self, id, global_conf, layer_conf) + + self.vocab = layer_conf.vocab + self.ltp = self:find_param("ltp", layer_conf, global_conf, nerv.LinearTransParam, {self.vocab, self.dim_out[1]}) --layer_conf.ltp + + self:check_dim_len(1, 1) +end + +--Check parameter +function SL:init(batch_size) + if (self.dim_in[1] ~= 1) then --one word id + nerv.error("mismatching dimensions of ltp and input") + end + if (self.dim_out[1] ~= self.ltp.trans:ncol()) then + nerv.error("mismatching dimensions of bp and output") + end + + self.batch_size = bath_size + self.ltp:train_init() +end + +function SL:update(bp_err, input, output) + --use this to produce reproducable result, don't forget to set the dropout to zero! + --for i = 1, input[1]:nrow(), 1 do + -- local word_vec = self.ltp.trans[input[1][i - 1][0]] + -- word_vec:add(word_vec, bp_err[1][i - 1], 1, - self.gconf.lrate / self.gconf.batch_size) + --end + + --I tried the update_select_rows kernel which uses atomicAdd, but it generates unreproducable result + self.ltp.trans:update_select_rows_by_colidx(bp_err[1], input[1], - self.gconf.lrate / self.gconf.batch_size, 0) + self.ltp.trans:add(self.ltp.trans, self.ltp.trans, 1.0, - self.gconf.lrate * self.gconf.wcost) +end + +function SL:propagate(input, output) + --for i = 0, input[1]:ncol() - 1, 1 do + -- if (input[1][0][i] > 0) then + -- output[1][i]:copy_fromd(self.ltp.trans[input[1][0][i]]) + -- else + -- output[1][i]:fill(0) + -- end + --end + output[1]:copy_rows_fromd_by_colidx(self.ltp.trans, input[1]) +end + +function SL:back_propagate(bp_err, next_bp_err, input, output) + --input is compressed, do nothing +end + +function SL:get_params() + local paramRepo = nerv.ParamRepo({self.ltp}) + return paramRepo +end diff --git a/nerv/examples/network_debug/timer.lua b/nerv/examples/network_debug/timer.lua new file mode 100644 index 0000000..2c54ca8 --- /dev/null +++ b/nerv/examples/network_debug/timer.lua @@ -0,0 +1,33 @@ +local Timer = nerv.class("nerv.Timer") + +function Timer:__init() + self.last = {} + self.rec = {} +end + +function Timer:tic(item) + self.last[item] = os.clock() +end + +function Timer:toc(item) + if (self.last[item] == nil) then + nerv.error("item not there") + end + if (self.rec[item] == nil) then + self.rec[item] = 0 + end + self.rec[item] = self.rec[item] + os.clock() - self.last[item] +end + +function Timer:check(item) + if self.rec[item]==nil then + nerv.error('item not there') + end + nerv.printf('"%s" lasts for %f secs.\n',item,self.rec[item]) +end + +function Timer:flush() + for key, value in pairs(self.rec) do + self.rec[key] = nil + end +end diff --git a/nerv/examples/network_debug/tnn.lua b/nerv/examples/network_debug/tnn.lua new file mode 100644 index 0000000..bf9f118 --- /dev/null +++ b/nerv/examples/network_debug/tnn.lua @@ -0,0 +1,136 @@ +nerv.include('select_linear.lua') + +local reader = nerv.class('nerv.TNNReader') + +function reader:__init(global_conf, data) + self.gconf = global_conf + self.offset = 0 + self.data = data +end + +function reader:get_batch(feeds) + self.offset = self.offset + 1 + if self.offset > #self.data then + return false + end + 
for i = 1, self.gconf.chunk_size do + feeds.inputs_m[i][1]:copy_from(self.data[self.offset].input[i]) + feeds.inputs_m[i][2]:copy_from(self.data[self.offset].output[i]:decompress(self.gconf.vocab_size)) + end + feeds.flags_now = self.data[self.offset].flags + feeds.flagsPack_now = self.data[self.offset].flagsPack + return true +end + +function reader:has_data(t, i) + return t <= self.data[self.offset].seq_len[i] +end + +function reader:get_err_input() + return self.data[self.offset].err_input +end + +local nn = nerv.class('nerv.NN') + +function nn:__init(global_conf, train_data, val_data, layers, connections) + self.gconf = global_conf + self.tnn = self:get_tnn(layers, connections) + self.train_data = self:get_data(train_data) + self.val_data = self:get_data(val_data) +end + +function nn:get_tnn(layers, connections) + self.gconf.dropout_rate = 0 + local layer_repo = nerv.LayerRepo(layers, self.gconf.pr, self.gconf) + local tnn = nerv.TNN('TNN', self.gconf, {dim_in = {1, self.gconf.vocab_size}, + dim_out = {1}, sub_layers = layer_repo, connections = connections, + clip = self.gconf.clip}) + tnn:init(self.gconf.batch_size, self.gconf.chunk_size) + return tnn +end + +function nn:get_data(data) + local ret = {} + for i = 1, #data do + ret[i] = {} + ret[i].input = data[i].input + ret[i].output = data[i].output + ret[i].flags = {} + ret[i].err_input = {} + for t = 1, self.gconf.chunk_size do + ret[i].flags[t] = {} + local err_input = self.gconf.mmat_type(self.gconf.batch_size, 1) + for j = 1, self.gconf.batch_size do + if t <= data[i].seq_len[j] then + ret[i].flags[t][j] = nerv.TNN.FC.SEQ_NORM + err_input[j - 1][0] = 1 + else + ret[i].flags[t][j] = 0 + err_input[j - 1][0] = 0 + end + end + ret[i].err_input[t] = self.gconf.cumat_type.new_from_host(err_input) + end + for j = 1, self.gconf.batch_size do + if data[i].seq_start[j] then + ret[i].flags[1][j] = bit.bor(ret[i].flags[1][j], nerv.TNN.FC.SEQ_START) + end + if data[i].seq_end[j] then + local t = data[i].seq_len[j] + ret[i].flags[t][j] = bit.bor(ret[i].flags[t][j], nerv.TNN.FC.SEQ_END) + end + end + ret[i].flagsPack = {} + for t = 1, self.gconf.chunk_size do + ret[i].flagsPack[t] = 0 + for j = 1, self.gconf.batch_size do + ret[i].flagsPack[t] = bit.bor(ret[i].flagsPack[t], ret[i].flags[t][j]) + end + end + ret[i].seq_len = data[i].seq_len + end + return ret +end + +function nn:process(data, do_train) + local total_err = 0 + local total_frame = 0 + local reader = nerv.TNNReader(self.gconf, data) + while true do + local r, _ = self.tnn:getfeed_from_reader(reader) + if not r then + break + end + if do_train then + self.gconf.dropout_rate = self.gconf.dropout + else + self.gconf.dropout_rate = 0 + end + self.tnn:net_propagate() + for t = 1, self.gconf.chunk_size do + local tmp = self.tnn.outputs_m[t][1]:new_to_host() + for i = 1, self.gconf.batch_size do + if reader:has_data(t, i) then + total_err = total_err + math.log10(math.exp(tmp[i - 1][0])) + total_frame = total_frame + 1 + end + end + end + if do_train then + local err_input = reader:get_err_input() + for i = 1, self.gconf.chunk_size do + self.tnn.err_inputs_m[i][1]:copy_from(err_input[i]) + end + self.tnn:net_backpropagate(false) + self.tnn:net_backpropagate(true) + end + collectgarbage('collect') + end + return math.pow(10, - total_err / total_frame) +end + +function nn:epoch() + local train_error = self:process(self.train_data, true) + local val_error = self:process(self.val_data, false) + return train_error, val_error +end diff --git a/nerv/examples/swb_baseline.lua 
b/nerv/examples/swb_baseline.lua index 51052ba..0ce8468 100644 --- a/nerv/examples/swb_baseline.lua +++ b/nerv/examples/swb_baseline.lua @@ -1,7 +1,5 @@ require 'htk_io' gconf = {lrate = 0.8, wcost = 1e-6, momentum = 0.9, - cumat_type = nerv.CuMatrixFloat, - mmat_type = nerv.MMatrixFloat, rearrange = true, -- just to make the context order consistent with old results, deprecated frm_ext = 5, frm_trim = 5, -- trim the first and last 5 frames, TNet just does this, deprecated @@ -9,8 +7,7 @@ gconf = {lrate = 0.8, wcost = 1e-6, momentum = 0.9, cv_scp = "/slfs1/users/mfy43/swb_ivec/train_cv.scp", htk_conf = "/slfs1/users/mfy43/swb_ivec/plp_0_d_a.conf", initialized_param = {"/slfs1/users/mfy43/swb_init.nerv", - "/slfs1/users/mfy43/swb_global_transf.nerv"}, - debug = false} + "/slfs1/users/mfy43/swb_global_transf.nerv"}} function make_layer_repo(param_repo) local layer_repo = nerv.LayerRepo( @@ -18,51 +15,51 @@ function make_layer_repo(param_repo) -- global transf ["nerv.BiasLayer"] = { - blayer1 = {{bias = "bias1"}, {dim_in = {429}, dim_out = {429}}}, - blayer2 = {{bias = "bias2"}, {dim_in = {429}, dim_out = {429}}} + blayer1 = {dim_in = {429}, dim_out = {429}, params = {bias = "bias1"}}, + blayer2 = {dim_in = {429}, dim_out = {429}, params = {bias = "bias2"}} }, ["nerv.WindowLayer"] = { - wlayer1 = {{window = "window1"}, {dim_in = {429}, dim_out = {429}}}, - wlayer2 = {{window = "window2"}, {dim_in = {429}, dim_out = {429}}} + wlayer1 = {dim_in = {429}, dim_out = {429}, params = {window = "window1"}}, + wlayer2 = {dim_in = {429}, dim_out = {429}, params = {window = "window2"}} }, -- biased linearity ["nerv.AffineLayer"] = { - affine0 = {{ltp = "affine0_ltp", bp = "affine0_bp"}, - {dim_in = {429}, dim_out = {2048}}}, - affine1 = {{ltp = "affine1_ltp", bp = "affine1_bp"}, - {dim_in = {2048}, dim_out = {2048}}}, - affine2 = {{ltp = "affine2_ltp", bp = "affine2_bp"}, - {dim_in = {2048}, dim_out = {2048}}}, - affine3 = {{ltp = "affine3_ltp", bp = "affine3_bp"}, - {dim_in = {2048}, dim_out = {2048}}}, - affine4 = {{ltp = "affine4_ltp", bp = "affine4_bp"}, - {dim_in = {2048}, dim_out = {2048}}}, - affine5 = {{ltp = "affine5_ltp", bp = "affine5_bp"}, - {dim_in = {2048}, dim_out = {2048}}}, - affine6 = {{ltp = "affine6_ltp", bp = "affine6_bp"}, - {dim_in = {2048}, dim_out = {2048}}}, - affine7 = {{ltp = "affine7_ltp", bp = "affine7_bp"}, - {dim_in = {2048}, dim_out = {3001}}} + affine0 = {dim_in = {429}, dim_out = {2048}, + params = {ltp = "affine0_ltp", bp = "affine0_bp"}}, + affine1 = {dim_in = {2048}, dim_out = {2048}, + params = {ltp = "affine1_ltp", bp = "affine1_bp"}}, + affine2 = {dim_in = {2048}, dim_out = {2048}, + params = {ltp = "affine2_ltp", bp = "affine2_bp"}}, + affine3 = {dim_in = {2048}, dim_out = {2048}, + params = {ltp = "affine3_ltp", bp = "affine3_bp"}}, + affine4 = {dim_in = {2048}, dim_out = {2048}, + params = {ltp = "affine4_ltp", bp = "affine4_bp"}}, + affine5 = {dim_in = {2048}, dim_out = {2048}, + params = {ltp = "affine5_ltp", bp = "affine5_bp"}}, + affine6 = {dim_in = {2048}, dim_out = {2048}, + params = {ltp = "affine6_ltp", bp = "affine6_bp"}}, + affine7 = {dim_in = {2048}, dim_out = {3001}, + params = {ltp = "affine7_ltp", bp = "affine7_bp"}} }, ["nerv.SigmoidLayer"] = { - sigmoid0 = {{}, {dim_in = {2048}, dim_out = {2048}}}, - sigmoid1 = {{}, {dim_in = {2048}, dim_out = {2048}}}, - sigmoid2 = {{}, {dim_in = {2048}, dim_out = {2048}}}, - sigmoid3 = {{}, {dim_in = {2048}, dim_out = {2048}}}, - sigmoid4 = {{}, {dim_in = {2048}, dim_out = {2048}}}, - sigmoid5 = {{}, {dim_in 
= {2048}, dim_out = {2048}}}, - sigmoid6 = {{}, {dim_in = {2048}, dim_out = {2048}}} + sigmoid0 = {dim_in = {2048}, dim_out = {2048}}, + sigmoid1 = {dim_in = {2048}, dim_out = {2048}}, + sigmoid2 = {dim_in = {2048}, dim_out = {2048}}, + sigmoid3 = {dim_in = {2048}, dim_out = {2048}}, + sigmoid4 = {dim_in = {2048}, dim_out = {2048}}, + sigmoid5 = {dim_in = {2048}, dim_out = {2048}}, + sigmoid6 = {dim_in = {2048}, dim_out = {2048}} }, ["nerv.SoftmaxCELayer"] = -- softmax + ce criterion layer for finetune output { - ce_crit = {{}, {dim_in = {3001, 1}, dim_out = {1}, compressed = true}} + ce_crit = {dim_in = {3001, 1}, dim_out = {1}, compressed = true} }, ["nerv.SoftmaxLayer"] = -- softmax for decode output { - softmax = {{}, {dim_in = {3001}, dim_out = {3001}}} + softmax = {dim_in = {3001}, dim_out = {3001}} } }, param_repo, gconf) @@ -70,7 +67,7 @@ function make_layer_repo(param_repo) { ["nerv.DAGLayer"] = { - global_transf = {{}, { + global_transf = { dim_in = {429}, dim_out = {429}, sub_layers = layer_repo, connections = { @@ -80,8 +77,8 @@ function make_layer_repo(param_repo) ["blayer2[1]"] = "wlayer2[1]", ["wlayer2[1]"] = "<output>[1]" } - }}, - main = {{}, { + }, + main = { dim_in = {429}, dim_out = {3001}, sub_layers = layer_repo, connections = { @@ -102,7 +99,7 @@ function make_layer_repo(param_repo) ["sigmoid6[1]"] = "affine7[1]", ["affine7[1]"] = "<output>[1]" } - }} + } } }, param_repo, gconf) @@ -110,7 +107,7 @@ function make_layer_repo(param_repo) { ["nerv.DAGLayer"] = { - ce_output = {{}, { + ce_output = { dim_in = {429, 1}, dim_out = {1}, sub_layers = layer_repo, connections = { @@ -119,8 +116,8 @@ function make_layer_repo(param_repo) ["<input>[2]"] = "ce_crit[2]", ["ce_crit[1]"] = "<output>[1]" } - }}, - softmax_output = {{}, { + }, + softmax_output = { dim_in = {429}, dim_out = {3001}, sub_layers = layer_repo, connections = { @@ -128,7 +125,7 @@ function make_layer_repo(param_repo) ["main[1]"] = "softmax[1]", ["softmax[1]"] = "<output>[1]" } - }} + } } }, param_repo, gconf) @@ -173,6 +170,7 @@ function make_buffer(readers) return nerv.SGDBuffer(gconf, { buffer_size = gconf.buffer_size, + batch_size = gconf.batch_size, randomize = gconf.randomize, readers = readers, use_gpu = true @@ -184,6 +182,10 @@ function get_input_order() {id = "phone_state"}} end +function get_decode_input_order() + return {{id = "main_scp", global_transf = true}} +end + function get_accuracy(layer_repo) local ce_crit = layer_repo:get_layer("ce_crit") return ce_crit.total_correct / ce_crit.total_frames * 100 diff --git a/nerv/examples/swb_baseline2.lua b/nerv/examples/swb_baseline2.lua new file mode 100644 index 0000000..8b5ebb1 --- /dev/null +++ b/nerv/examples/swb_baseline2.lua @@ -0,0 +1,203 @@ +require 'htk_io' +gconf = {lrate = 0.8, wcost = 1e-6, momentum = 0.9, + rearrange = true, -- just to make the context order consistent with old results, deprecated + frm_ext = 5, + frm_trim = 5, -- trim the first and last 5 frames, TNet just does this, deprecated + tr_scp = "/speechlab/users/mfy43/swb50/train_bp.scp", + cv_scp = "/speechlab/users/mfy43/swb50/train_cv.scp", + htk_conf = "/speechlab/users/mfy43/swb50/plp_0_d_a.conf", + initialized_param = {"/speechlab/users/mfy43/swb50/swb_init.nerv", + "/speechlab/users/mfy43/swb50/swb_global_transf.nerv"}} + +function make_layer_repo(param_repo) + local layer_repo = nerv.LayerRepo( + { + -- global transf + ["nerv.BiasLayer"] = + { + blayer1 = {dim_in = {429}, dim_out = {429}, params = {bias = "bias1"}}, + blayer2 = {dim_in = {429}, dim_out = {429}, params = 
{bias = "bias2"}} + }, + ["nerv.WindowLayer"] = + { + wlayer1 = {dim_in = {429}, dim_out = {429}, params = {window = "window1"}}, + wlayer2 = {dim_in = {429}, dim_out = {429}, params = {window = "window2"}} + }, + -- biased linearity + ["nerv.AffineLayer"] = + { + affine0 = {dim_in = {429}, dim_out = {2048}, + params = {ltp = "affine0_ltp", bp = "affine0_bp"}}, + affine1 = {dim_in = {2048}, dim_out = {2048}, + params = {ltp = "affine1_ltp", bp = "affine1_bp"}}, + affine2 = {dim_in = {2048}, dim_out = {2048}, + params = {ltp = "affine2_ltp", bp = "affine2_bp"}}, + affine3 = {dim_in = {2048}, dim_out = {2048}, + params = {ltp = "affine3_ltp", bp = "affine3_bp"}}, + affine4 = {dim_in = {2048}, dim_out = {2048}, + params = {ltp = "affine4_ltp", bp = "affine4_bp"}}, + affine5 = {dim_in = {2048}, dim_out = {2048}, + params = {ltp = "affine5_ltp", bp = "affine5_bp"}}, + affine6 = {dim_in = {2048}, dim_out = {2048}, + params = {ltp = "affine6_ltp", bp = "affine6_bp"}}, + affine7 = {dim_in = {2048}, dim_out = {3001}, + params = {ltp = "affine7_ltp", bp = "affine7_bp"}} + }, + ["nerv.SigmoidLayer"] = + { + sigmoid0 = {dim_in = {2048}, dim_out = {2048}}, + sigmoid1 = {dim_in = {2048}, dim_out = {2048}}, + sigmoid2 = {dim_in = {2048}, dim_out = {2048}}, + sigmoid3 = {dim_in = {2048}, dim_out = {2048}}, + sigmoid4 = {dim_in = {2048}, dim_out = {2048}}, + sigmoid5 = {dim_in = {2048}, dim_out = {2048}}, + sigmoid6 = {dim_in = {2048}, dim_out = {2048}} + }, + ["nerv.SoftmaxCELayer"] = -- softmax + ce criterion layer for finetune output + { + ce_crit = {dim_in = {3001, 1}, dim_out = {1}, compressed = true} + }, + ["nerv.SoftmaxLayer"] = -- softmax for decode output + { + softmax = {dim_in = {3001}, dim_out = {3001}} + } + }, param_repo, gconf) + + layer_repo:add_layers( + { + ["nerv.DAGLayer"] = + { + global_transf = { + dim_in = {429}, dim_out = {429}, + sub_layers = layer_repo, + connections = { + ["<input>[1]"] = "blayer1[1]", + ["blayer1[1]"] = "wlayer1[1]", + ["wlayer1[1]"] = "blayer2[1]", + ["blayer2[1]"] = "wlayer2[1]", + ["wlayer2[1]"] = "<output>[1]" + } + }, + main = { + dim_in = {429}, dim_out = {3001}, + sub_layers = layer_repo, + connections = { + ["<input>[1]"] = "affine0[1]", + ["affine0[1]"] = "sigmoid0[1]", + ["sigmoid0[1]"] = "affine1[1]", + ["affine1[1]"] = "sigmoid1[1]", + ["sigmoid1[1]"] = "affine2[1]", + ["affine2[1]"] = "sigmoid2[1]", + ["sigmoid2[1]"] = "affine3[1]", + ["affine3[1]"] = "sigmoid3[1]", + ["sigmoid3[1]"] = "affine4[1]", + ["affine4[1]"] = "sigmoid4[1]", + ["sigmoid4[1]"] = "affine5[1]", + ["affine5[1]"] = "sigmoid5[1]", + ["sigmoid5[1]"] = "affine6[1]", + ["affine6[1]"] = "sigmoid6[1]", + ["sigmoid6[1]"] = "affine7[1]", + ["affine7[1]"] = "<output>[1]" + } + } + } + }, param_repo, gconf) + + layer_repo:add_layers( + { + ["nerv.DAGLayer"] = + { + ce_output = { + dim_in = {429, 1}, dim_out = {1}, + sub_layers = layer_repo, + connections = { + ["<input>[1]"] = "main[1]", + ["main[1]"] = "ce_crit[1]", + ["<input>[2]"] = "ce_crit[2]", + ["ce_crit[1]"] = "<output>[1]" + } + }, + softmax_output = { + dim_in = {429}, dim_out = {3001}, + sub_layers = layer_repo, + connections = { + ["<input>[1]"] = "main[1]", + ["main[1]"] = "softmax[1]", + ["softmax[1]"] = "<output>[1]" + } + } + } + }, param_repo, gconf) + + return layer_repo +end + +function get_network(layer_repo) + return layer_repo:get_layer("ce_output") +end + +function get_decode_network(layer_repo) + return layer_repo:get_layer("softmax_output") +end + +function get_global_transf(layer_repo) + return 
layer_repo:get_layer("global_transf") +end + +function make_readers(scp_file, layer_repo) + return { + {reader = nerv.TNetReader(gconf, + { + id = "main_scp", + scp_file = scp_file, + conf_file = gconf.htk_conf, + frm_ext = gconf.frm_ext, + mlfs = { + phone_state = { + file = "/speechlab/users/mfy43/swb50/ref.mlf", + format = "map", + format_arg = "/speechlab/users/mfy43/swb50/dict", + dir = "*/", + ext = "lab" + } + } + }), + data = {main_scp = 429, phone_state = 1}} + } +end + +function make_buffer(readers) + return nerv.SGDBuffer(gconf, + { + buffer_size = gconf.buffer_size, + batch_size = gconf.batch_size, + randomize = gconf.randomize, + readers = readers, + use_gpu = true + }) +end + +function get_input_order() + return {{id = "main_scp", global_transf = true}, + {id = "phone_state"}} +end + +function get_decode_input_order() + return {{id = "main_scp", global_transf = true}} +end + +function get_accuracy(layer_repo) + local ce_crit = layer_repo:get_layer("ce_crit") + return ce_crit.total_correct / ce_crit.total_frames * 100 +end + +function print_stat(layer_repo) + local ce_crit = layer_repo:get_layer("ce_crit") + nerv.info("*** training stat begin ***") + nerv.printf("cross entropy:\t\t%.8f\n", ce_crit.total_ce) + nerv.printf("correct:\t\t%d\n", ce_crit.total_correct) + nerv.printf("frames:\t\t\t%d\n", ce_crit.total_frames) + nerv.printf("err/frm:\t\t%.8f\n", ce_crit.total_ce / ce_crit.total_frames) + nerv.printf("accuracy:\t\t%.3f%%\n", get_accuracy(layer_repo)) + nerv.info("*** training stat end ***") +end diff --git a/nerv/examples/swb_baseline_basic.lua b/nerv/examples/swb_baseline_basic.lua deleted file mode 100644 index 71f04a3..0000000 --- a/nerv/examples/swb_baseline_basic.lua +++ /dev/null @@ -1,162 +0,0 @@ -require 'htk_io' -gconf = {lrate = 0.8, wcost = 1e-6, momentum = 0.9, - cumat_type = nerv.CuMatrixFloat, - mmat_type = nerv.MMatrixFloat, - frm_ext = 5, - frm_trim = 5, - tr_scp = "/slfs1/users/mfy43/swb_ivec/train_bp.scp", - cv_scp = "/slfs1/users/mfy43/swb_ivec/train_cv.scp", - htk_conf = "/slfs1/users/mfy43/swb_ivec/plp_0_d_a.conf", - initialized_param = {"/slfs1/users/mfy43/swb_init.nerv", - "/slfs1/users/mfy43/swb_global_transf.nerv"}, - debug = false} - -function make_layer_repo(param_repo) - local layer_repo = nerv.LayerRepo( - { - -- global transf - ["nerv.BiasLayer"] = - { - blayer1 = {{bias = "bias1"}, {dim_in = {429}, dim_out = {429}}}, - blayer2 = {{bias = "bias2"}, {dim_in = {429}, dim_out = {429}}} - }, - ["nerv.WindowLayer"] = - { - wlayer1 = {{window = "window1"}, {dim_in = {429}, dim_out = {429}}}, - wlayer2 = {{window = "window2"}, {dim_in = {429}, dim_out = {429}}} - }, - -- biased linearity - ["nerv.AffineLayer"] = - { - affine0 = {{ltp = "affine0_ltp", bp = "affine0_bp"}, - {dim_in = {429}, dim_out = {2048}}}, - affine1 = {{ltp = "affine1_ltp", bp = "affine1_bp"}, - {dim_in = {2048}, dim_out = {2048}}}, - affine2 = {{ltp = "affine2_ltp", bp = "affine2_bp"}, - {dim_in = {2048}, dim_out = {2048}}}, - affine3 = {{ltp = "affine3_ltp", bp = "affine3_bp"}, - {dim_in = {2048}, dim_out = {2048}}}, - affine4 = {{ltp = "affine4_ltp", bp = "affine4_bp"}, - {dim_in = {2048}, dim_out = {2048}}}, - affine5 = {{ltp = "affine5_ltp", bp = "affine5_bp"}, - {dim_in = {2048}, dim_out = {2048}}}, - affine6 = {{ltp = "affine6_ltp", bp = "affine6_bp"}, - {dim_in = {2048}, dim_out = {2048}}}, - affine7 = {{ltp = "affine7_ltp", bp = "affine7_bp"}, - {dim_in = {2048}, dim_out = {3001}}} - }, - ["nerv.SigmoidLayer"] = - { - sigmoid0 = {{}, {dim_in = {2048}, dim_out = 
{2048}}}, - sigmoid1 = {{}, {dim_in = {2048}, dim_out = {2048}}}, - sigmoid2 = {{}, {dim_in = {2048}, dim_out = {2048}}}, - sigmoid3 = {{}, {dim_in = {2048}, dim_out = {2048}}}, - sigmoid4 = {{}, {dim_in = {2048}, dim_out = {2048}}}, - sigmoid5 = {{}, {dim_in = {2048}, dim_out = {2048}}}, - sigmoid6 = {{}, {dim_in = {2048}, dim_out = {2048}}} - }, - ["nerv.SoftmaxCELayer"] = - { - ce_crit = {{}, {dim_in = {3001, 1}, dim_out = {1}, compressed = true}} - } - }, param_repo, gconf) - - layer_repo:add_layers( - { - ["nerv.DAGLayer"] = - { - global_transf = {{}, { - dim_in = {429}, dim_out = {429}, - sub_layers = layer_repo, - connections = { - ["<input>[1]"] = "blayer1[1]", - ["blayer1[1]"] = "wlayer1[1]", - ["wlayer1[1]"] = "blayer2[1]", - ["blayer2[1]"] = "wlayer2[1]", - ["wlayer2[1]"] = "<output>[1]" - } - }}, - main = {{}, { - dim_in = {429, 1}, dim_out = {1}, - sub_layers = layer_repo, - connections = { - ["<input>[1]"] = "affine0[1]", - ["affine0[1]"] = "sigmoid0[1]", - ["sigmoid0[1]"] = "affine1[1]", - ["affine1[1]"] = "sigmoid1[1]", - ["sigmoid1[1]"] = "affine2[1]", - ["affine2[1]"] = "sigmoid2[1]", - ["sigmoid2[1]"] = "affine3[1]", - ["affine3[1]"] = "sigmoid3[1]", - ["sigmoid3[1]"] = "affine4[1]", - ["affine4[1]"] = "sigmoid4[1]", - ["sigmoid4[1]"] = "affine5[1]", - ["affine5[1]"] = "sigmoid5[1]", - ["sigmoid5[1]"] = "affine6[1]", - ["affine6[1]"] = "sigmoid6[1]", - ["sigmoid6[1]"] = "affine7[1]", - ["affine7[1]"] = "ce_crit[1]", - ["<input>[2]"] = "ce_crit[2]", - ["ce_crit[1]"] = "<output>[1]" - } - }} - } - }, param_repo, gconf) - return layer_repo -end - -function get_network(layer_repo) - return layer_repo:get_layer("main") -end - -function make_readers(scp_file, layer_repo) - return { - {reader = nerv.TNetReader(gconf, - { - id = "main_scp", - scp_file = scp_file, - conf_file = gconf.htk_conf, - frm_ext = gconf.frm_ext, - mlfs = { - phone_state = { - file = "/slfs1/users/mfy43/swb_ivec/ref.mlf", - format = "map", - format_arg = "/slfs1/users/mfy43/swb_ivec/dict", - dir = "*/", - ext = "lab" - } - } - }), - data = {main_scp = 429, phone_state = 1}} - } -end - -function make_buffer(readers) - return nerv.SGDBuffer(gconf, - { - buffer_size = gconf.buffer_size, - randomize = gconf.randomize, - readers = readers - }) -end - -function get_input_order() - return {{id = "main_scp", global_transf = true}, - {id = "phone_state"}} -end - -function get_accuracy(layer_repo) - local ce_crit = layer_repo:get_layer("ce_crit") - return ce_crit.total_correct / ce_crit.total_frames * 100 -end - -function print_stat(layer_repo) - local ce_crit = layer_repo:get_layer("ce_crit") - nerv.info("*** training stat begin ***") - nerv.printf("cross entropy:\t\t%.8f\n", ce_crit.total_ce) - nerv.printf("correct:\t\t%d\n", ce_crit.total_correct) - nerv.printf("frames:\t\t\t%d\n", ce_crit.total_frames) - nerv.printf("err/frm:\t\t%.8f\n", ce_crit.total_ce / ce_crit.total_frames) - nerv.printf("accuracy:\t\t%.3f%%\n", get_accuracy(layer_repo)) - nerv.info("*** training stat end ***") -end diff --git a/nerv/examples/timit_baseline2.lua b/nerv/examples/timit_baseline2.lua new file mode 100644 index 0000000..d783c3d --- /dev/null +++ b/nerv/examples/timit_baseline2.lua @@ -0,0 +1,212 @@ +require 'kaldi_io' +gconf = {lrate = 0.8, wcost = 1e-6, momentum = 0.9, frm_ext = 5, + tr_scp = "ark:/speechlab/tools/KALDI/kaldi-master/src/featbin/copy-feats " .. + "scp:/speechlab/users/mfy43/timit/s5/exp/dnn4_nerv_prepare/train.scp ark:- |", + cv_scp = "ark:/speechlab/tools/KALDI/kaldi-master/src/featbin/copy-feats " .. 
+ "scp:/speechlab/users/mfy43/timit/s5/exp/dnn4_nerv_prepare/cv.scp ark:- |", + initialized_param = {"/speechlab/users/mfy43/timit/s5/exp/dnn4_nerv_prepare/nnet_init.nerv", + "/speechlab/users/mfy43/timit/s5/exp/dnn4_nerv_prepare/nnet_output.nerv", + "/speechlab/users/mfy43/timit/s5/exp/dnn4_nerv_prepare/nnet_trans.nerv"}, + decode_param = {"/speechlab/users/mfy43/timit/nnet_init_20160229015745_iter_13_lr0.013437_tr72.434_cv58.729.nerv", + "/speechlab/users/mfy43/timit/s5/exp/dnn4_nerv_prepare/nnet_trans.nerv"}} + +function make_layer_repo(param_repo) + local layer_repo = nerv.LayerRepo( + { + -- global transf + ["nerv.BiasLayer"] = + { + blayer1 = {dim_in = {440}, dim_out = {440}, params = {bias = "bias0"}} + }, + ["nerv.WindowLayer"] = + { + wlayer1 = {dim_in = {440}, dim_out = {440}, params = {window = "window0"}} + }, + -- biased linearity + ["nerv.AffineLayer"] = + { + affine0 = {dim_in = {440}, dim_out = {1024}, + params = {ltp = "affine0_ltp", bp = "affine0_bp"}}, + affine1 = {dim_in = {1024}, dim_out = {1024}, + params = {ltp = "affine1_ltp", bp = "affine1_bp"}}, + affine2 = {dim_in = {1024}, dim_out = {1024}, + params = {ltp = "affine2_ltp", bp = "affine2_bp"}}, + affine3 = {dim_in = {1024}, dim_out = {1024}, + params = {ltp = "affine3_ltp", bp = "affine3_bp"}}, + affine4 = {dim_in = {1024}, dim_out = {1024}, + params = {ltp = "affine4_ltp", bp = "affine4_bp"}}, + affine5 = {dim_in = {1024}, dim_out = {1024}, + params = {ltp = "affine5_ltp", bp = "affine5_bp"}}, + affine6 = {dim_in = {1024}, dim_out = {1959}, + params = {ltp = "affine6_ltp", bp = "affine6_bp"}} + }, + ["nerv.SigmoidLayer"] = + { + sigmoid0 = {dim_in = {1024}, dim_out = {1024}}, + sigmoid1 = {dim_in = {1024}, dim_out = {1024}}, + sigmoid2 = {dim_in = {1024}, dim_out = {1024}}, + sigmoid3 = {dim_in = {1024}, dim_out = {1024}}, + sigmoid4 = {dim_in = {1024}, dim_out = {1024}}, + sigmoid5 = {dim_in = {1024}, dim_out = {1024}} + }, + ["nerv.SoftmaxCELayer"] = -- softmax + ce criterion layer for finetune output + { + ce_crit = {dim_in = {1959, 1}, dim_out = {1}, compressed = true} + }, + ["nerv.SoftmaxLayer"] = -- softmax for decode output + { + softmax = {dim_in = {1959}, dim_out = {1959}} + } + }, param_repo, gconf) + + layer_repo:add_layers( + { + ["nerv.GraphLayer"] = + { + global_transf = { + dim_in = {440}, dim_out = {440}, + layer_repo = layer_repo, + connections = { + {"<input>[1]", "blayer1[1]", 0}, + {"blayer1[1]", "wlayer1[1]", 0}, + {"wlayer1[1]", "<output>[1]", 0} + } + }, + main = { + dim_in = {440}, dim_out = {1959}, + layer_repo = layer_repo, + connections = { + {"<input>[1]", "affine0[1]", 0}, + {"affine0[1]", "sigmoid0[1]", 0}, + {"sigmoid0[1]", "affine1[1]", 0}, + {"affine1[1]", "sigmoid1[1]", 0}, + {"sigmoid1[1]", "affine2[1]", 0}, + {"affine2[1]", "sigmoid2[1]", 0}, + {"sigmoid2[1]", "affine3[1]", 0}, + {"affine3[1]", "sigmoid3[1]", 0}, + {"sigmoid3[1]", "affine4[1]", 0}, + {"affine4[1]", "sigmoid4[1]", 0}, + {"sigmoid4[1]", "affine5[1]", 0}, + {"affine5[1]", "sigmoid5[1]", 0}, + {"sigmoid5[1]", "affine6[1]", 0}, + {"affine6[1]", "<output>[1]", 0} + } + } + } + }, param_repo, gconf) + + layer_repo:add_layers( + { + ["nerv.GraphLayer"] = + { + ce_output = { + dim_in = {440, 1}, dim_out = {1}, + layer_repo = layer_repo, + connections = { + {"<input>[1]", "main[1]", 0}, + {"main[1]", "ce_crit[1]", 0}, + {"<input>[2]", "ce_crit[2]", 0}, + {"ce_crit[1]", "<output>[1]", 0} + } + }, + softmax_output = { + dim_in = {440}, dim_out = {1959}, + layer_repo = layer_repo, + connections = { + {"<input>[1]", 
"main[1]", 0}, + {"main[1]", "softmax[1]", 0}, + {"softmax[1]", "<output>[1]", 0} + } + } + } + }, param_repo, gconf) + + return layer_repo +end + +function get_network(layer_repo) + return layer_repo:get_layer("ce_output") +end + +function get_decode_network(layer_repo) + return layer_repo:get_layer("softmax_output") +end + +function get_global_transf(layer_repo) + return layer_repo:get_layer("global_transf") +end + +function make_readers(scp_file, layer_repo) + return { + {reader = nerv.KaldiReader(gconf, + { + id = "main_scp", + feature_rspecifier = scp_file, + conf_file = gconf.htk_conf, + frm_ext = gconf.frm_ext, + mlfs = { + phone_state = { + targets_rspecifier = "ark:/speechlab/tools/KALDI/kaldi-master/src/bin/ali-to-pdf " .. + "/speechlab/users/mfy43/timit/s5/exp/tri3_ali/final.mdl " .. + "\"ark:gunzip -c /speechlab/users/mfy43/timit/s5/exp/tri3_ali/ali.*.gz |\" " .. + "ark:- | " .. + "/speechlab/tools/KALDI/kaldi-master/src/bin/ali-to-post " .. + "ark:- ark:- |", + format = "map" + } + } + }), + data = {main_scp = 440, phone_state = 1}} + } +end + +function make_decode_readers(scp_file, layer_repo) + return { + {reader = nerv.KaldiReader(gconf, + { + id = "main_scp", + feature_rspecifier = scp_file, + conf_file = gconf.htk_conf, + frm_ext = gconf.frm_ext, + mlfs = {}, + need_key = true + }), + data = {main_scp = 440, phone_state = 1}} + } +end + +function make_buffer(readers) + return nerv.SGDBuffer(gconf, + { + buffer_size = gconf.buffer_size, + batch_size = gconf.batch_size, + randomize = gconf.randomize, + readers = readers, + use_gpu = true + }) +end + +function get_input_order() + return {{id = "main_scp", global_transf = true}, + {id = "phone_state"}} +end + +function get_decode_input_order() + return {{id = "main_scp", global_transf = true}} +end + +function get_accuracy(layer_repo) + local ce_crit = layer_repo:get_layer("ce_crit") + return ce_crit.total_correct / ce_crit.total_frames * 100 +end + +function print_stat(layer_repo) + local ce_crit = layer_repo:get_layer("ce_crit") + nerv.info("*** training stat begin ***") + nerv.printf("cross entropy:\t\t%.8f\n", ce_crit.total_ce) + nerv.printf("correct:\t\t%d\n", ce_crit.total_correct) + nerv.printf("frames:\t\t\t%d\n", ce_crit.total_frames) + nerv.printf("err/frm:\t\t%.8f\n", ce_crit.total_ce / ce_crit.total_frames) + nerv.printf("accuracy:\t\t%.3f%%\n", get_accuracy(layer_repo)) + nerv.info("*** training stat end ***") +end diff --git a/nerv/init.lua b/nerv/init.lua index 6312df1..ff944b8 100644 --- a/nerv/init.lua +++ b/nerv/init.lua @@ -13,6 +13,10 @@ function nerv.error_method_not_implemented() nerv.error("method not implemented"); end +function nerv.set_logfile(filename) + nerv._logfile = io.open(filename, "w") +end + --- Format a string just like `sprintf` in C. -- @param fmt the format string -- @param ... args, the data to be formatted @@ -25,7 +29,13 @@ end -- @param fmt the format string -- @param ... args, the data to be formatted function nerv.printf(fmt, ...) - io.write(nerv.sprintf(fmt, ...)) + local line = nerv.sprintf(fmt, ...) + io.stderr:write(line) + -- duplicate the all output to the log file, if set + if nerv._logfile then + nerv._logfile:write(line) + nerv._logfile:flush() + end end --- Raise an global error with the formatted message. @@ -88,24 +98,27 @@ function nerv.class(tname, parenttname) end function table.val_to_str(v) - if "string" == type(v) then - v = string.gsub(v, "\n", "\\n") - if string.match(string.gsub(v,"[^'\"]",""), '^"+$') then - return "'" .. v .. 
"'" + if "string" == type(v) then + v = string.gsub(v, "\n", "\\n") + if string.match(string.gsub(v,"[^'\"]",""), '^"+$') then + return "'" .. v .. "'" + end + return '"' .. string.gsub(v,'"', '\\"') .. '"' + else + return "table" == type(v) and table.tostring(v) or + (("number" == type(v) or + "string" == type(v) or + "boolean" == type(v)) and tostring(v)) or + nil -- failed to serialize end - return '"' .. string.gsub(v,'"', '\\"') .. '"' - else - return "table" == type(v) and table.tostring(v) or - tostring(v) - end end function table.key_to_str (k) - if "string" == type(k) and string.match(k, "^[_%a][_%a%d]*$") then - return k - else - return "[" .. table.val_to_str(k) .. "]" - end + if "string" == type(k) and string.match(k, "^[_%a][_%a%d]*$") then + return k + else + return "[" .. table.val_to_str(k) .. "]" + end end --- Get the string representation of a table, which can be executed as a valid @@ -114,18 +127,18 @@ end -- @return the string representation which will result in a Lua table entity -- when evaluated function table.tostring(tbl) - local result, done = {}, {} - for k, v in ipairs(tbl) do - table.insert(result, table.val_to_str(v)) - done[k] = true - end - for k, v in pairs(tbl) do - if not done[k] then - table.insert(result, - table.key_to_str(k) .. "=" .. table.val_to_str(v)) + local result, done = {}, {} + for k, v in ipairs(tbl) do + table.insert(result, table.val_to_str(v)) + done[k] = true end - end - return "{" .. table.concat(result, ",") .. "}" + for k, v in pairs(tbl) do + if not done[k] then + table.insert(result, + table.key_to_str(k) .. "=" .. table.val_to_str(v)) + end + end + return "{" .. table.concat(result, ",") .. "}" end --- Get the class by name. @@ -172,10 +185,180 @@ function nerv.include(filename) return dofile(nerv.dirname(caller) .. filename) end +--- Parse the command-line options and arguments +-- @param argv the argrument list to parsed +-- @param options The specification of options, should be a list of tables, +-- each one for exactly one available option, say `v`, with `v[1]`, `v[2]`, +-- `v[3]` indicating the full name of the option, the short form of the option +-- (when it is a boolean option) and the type of the value controlled by the +-- option. `default` and `desc` keys can also be specified to set the default +-- value and description of the option. +-- +-- An example of specification: +-- {{"aaa", "a", "boolean", default = false, desc = "an option called aaa"}, +-- {"bbb", "b", "boolean", default = true, desc = "bbb is set to be true if --bbb=no does not present"}, +-- {"ccc", nil, "int", default = 0, desc = "ccc expects an integeral value"}}` +-- +-- @return args, opts The non-option arguments and parsed options. `opts` is +-- again a list of tables, each of which corresponds to one table in parameter +-- `options`. The parsed value could be accessed by `opts["aaa"].val` (which is +-- `true` if "--aaa" or "-a" is specified). 
+function nerv.parse_args(argv, options, unordered) + local is_opt_exp = "^[-](.*)$" + local sim_opt_exp = "^[-]([a-z]+)$" + local opt_exp = "^[-][-]([^=]+)$" + local opt_with_val_exp = "^[-][-]([^=]+)=([^=]+)$" + local opts = {} + local sopts = {} + local args = {} + local arg_start = false + local function err() + nerv.error("invalid format of option specification") + end + for _, v in ipairs(options) do + if type(v) ~= "table" or + (v[1] == nil and v[2] == nil) or + v[3] == nil then + err() + end + local opt_full = v[1] + local opt_short = v[2] + local opt_type = v[3] + local opt_meta = {type = opt_type, + desc = v.desc or "", + val = v.default} + if opt_short ~= nil then + if type(opt_short) ~= "string" or #opt_short ~= 1 then err() end + if opt_type ~= "boolean" then + nerv.error("only boolean option could have short form") + end + sopts[opt_short] = opt_meta + end + if opt_full ~= nil then + if type(opt_full) ~= "string" then err() end + opts[opt_full] = opt_meta + end + end + for _, token in ipairs(argv) do + if ((not arg_start) or unordered) and token:match(is_opt_exp) then + local k = token:match(sim_opt_exp) + if k then + for c in k:gmatch"." do + if sopts[c] then + sopts[c].val = true + else + nerv.error("invalid option -%s", c) + end + end + else + local k = token:match(opt_exp) + if k then + if opts[k] == nil then + nerv.error("invalid option %s", token) + end + if opts[k].type ~= "boolean" then + nerv.error("invalid option --%s: " .. + "a %s value needs to be specified", + k, opts[k].type) + else + opts[k].val = true + end + else + local k, v = token:match(opt_with_val_exp) + if k then + if opts[k] == nil then + nerv.error("invalid option %s", token) + end + if opts[k].type == "boolean" then + if v == "yes" then + opts[k].val = true + elseif v == "no" then + opts[k].val = false + else + nerv.error("boolean value should be \"yes\" or \"no\"") + end + elseif opts[k].type == "int" then + local t = tonumber(v) + opts[k].val = t + if t == nil or math.floor(t) ~= t then + nerv.error("int value is expected") + end + elseif opts[k].type == "number" then + local t = tonumber(v) + opts[k].val = t + if t == nil then + nerv.error("numeric value is expected") + end + elseif opts[k].type == "string" then + opts[k].val = v + else + nerv.error("unrecognized type %s", opts[k].type) + end + else + nerv.error("unrecognized option %s", token) + end + end + end + else + table.insert(args, token) + arg_start = true + end + end + return args, opts +end + +--- Print usage information of the command-line options +-- @param options the list of options used in `parse_args` +function nerv.print_usage(options) + local full_maxlen = 0 + local type_maxlen = 0 + local default_maxlen = 0 + for _, v in ipairs(options) do + local opt_full = v[1] + local opt_short = v[2] + local opt_type = v[3] + full_maxlen = math.max(full_maxlen, #opt_full or 0) + type_maxlen = math.max(full_maxlen, #opt_type or 0) + default_maxlen = math.max(full_maxlen, #tostring(v.default) or 0) + end + local function pattern_gen() + return string.format("\t%%-%ds\t%%-2s\t%%-%ds\t%%-%ds\t%%s\n", + full_maxlen, type_maxlen, default_maxlen) + end + nerv.printf("\n") + nerv.printf(pattern_gen(), "Option", "Abbr.", "Type", "Default", "Desc.") + for _, v in ipairs(options) do + local opt_full = v[1] + local opt_short = v[2] + local opt_type = v[3] + nerv.printf(pattern_gen(), + (opt_full and '--' .. opt_full) or "", + (opt_short and '-' .. 
opt_short) or "", + opt_type, + (v.default ~= nil and tostring(v.default)) or "", + v.desc or "") + end + nerv.printf("\n") +end + +function table.extend(tbl1, tbl2) + for _, v in ipairs(tbl2) do + table.insert(tbl1, v) + end +end + +function table.vector(len, fill) + local v = {} + fill = fill or 0 + for i = 1, len do + table.insert(v, fill) + end + return v +end + -- the following lines trigger the initialization of basic modules nerv.include('matrix/init.lua') nerv.include('io/init.lua') nerv.include('layer/init.lua') nerv.include('nn/init.lua') -nerv.include('tnn/init.lua') diff --git a/nerv/io/init.lua b/nerv/io/init.lua index eb2e3e5..c36d850 100644 --- a/nerv/io/init.lua +++ b/nerv/io/init.lua @@ -52,8 +52,9 @@ function DataBuffer:__init(global_conf, buffer_conf) nerv.error_method_not_implemented() end -function DataBuffer:get_batch() +function DataBuffer:get_data() nerv.error_method_not_implemented() end nerv.include('sgd_buffer.lua') +nerv.include('seq_buffer.lua') diff --git a/nerv/io/seq_buffer.lua b/nerv/io/seq_buffer.lua new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/nerv/io/seq_buffer.lua diff --git a/nerv/io/sgd_buffer.lua b/nerv/io/sgd_buffer.lua index 3cf4f5a..d78f6d1 100644 --- a/nerv/io/sgd_buffer.lua +++ b/nerv/io/sgd_buffer.lua @@ -2,8 +2,9 @@ local SGDBuffer = nerv.class("nerv.SGDBuffer", "nerv.DataBuffer") function SGDBuffer:__init(global_conf, buffer_conf) self.gconf = global_conf + self.batch_size = buffer_conf.batch_size self.buffer_size = math.floor(buffer_conf.buffer_size / - global_conf.batch_size) * global_conf.batch_size + self.batch_size) * self.batch_size self.randomize = buffer_conf.randomize self.consume = buffer_conf.consume local cumat_type = global_conf.cumat_type @@ -112,11 +113,11 @@ function SGDBuffer:saturate() end self.rand_map = self.perm_gen(self.tail) -- generate shuffled index collectgarbage("collect") - return self.tail >= self.gconf.batch_size + return self.tail >= self.batch_size end function SGDBuffer:get_data() - local batch_size = self.gconf.batch_size + local batch_size = self.batch_size if self.head >= self.tail then -- buffer is empty local t = os.clock() if (not self:saturate()) and (not self.consume) then diff --git a/nerv/layer/affine.lua b/nerv/layer/affine.lua index 4156dde..38743aa 100644 --- a/nerv/layer/affine.lua +++ b/nerv/layer/affine.lua @@ -8,21 +8,19 @@ local AffineLayer = nerv.class('nerv.AffineLayer', 'nerv.Layer') --- A parameter that consists of a single matrix -- @type nerv.MatrixParam +function MatrixParam:check(checker) + -- check trans matrix type + checker(self.trans) +end + --- Read from a file handle. -- @param handle the file handle function MatrixParam:read(handle) self.trans = self.gconf.mmat_type.load(handle) - if not self.gconf.use_cpu then - self.trans = self.gconf.cumat_type.new_from_host(self.trans) - end end function MatrixParam:write(handle) - local trans = self.trans - if not self.gconf.use_cpu then - trans = self.trans:new_to_host() - end - trans:save(handle) + self.trans:save(handle) end function MatrixParam:train_init() @@ -30,6 +28,12 @@ function MatrixParam:train_init() self.correction:fill(0) end +function MatrixParam:copy(copier) + local target = nerv.MatrixParam(self.id, self.gconf) + target.trans = copier(self.trans) + return target +end + function MatrixParam:_update_by_gradient(gradient, alpha, beta) local gconf = self.gconf -- momentum gain @@ -77,25 +81,24 @@ end --- The constructor. 
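-- A hedged configuration sketch for the constructor below, reflecting the new
-- convention used by the rewritten example scripts: parameter ids are listed
-- under `params` and resolved against the repo passed as `pr`. The ids,
-- dimensions and empty repo here are illustrative placeholders only.
local param_repo = nerv.ParamRepo({}, nerv.ParamRepo.LOC_TYPES.ON_HOST)
local affine_conf = {dim_in = {429}, dim_out = {2048},
                     pr = param_repo,
                     params = {ltp = "affine0_ltp", bp = "affine0_bp"}}
-- nerv.AffineLayer("affine0", gconf, affine_conf) would then look up
-- "affine0_ltp"/"affine0_bp" in `param_repo`; if they are absent,
-- `Layer:find_param` falls back to the auto-generated ids ("affine0_ltp1",
-- "affine0_bp") and finally creates fresh parameters with `gconf.param_random`,
-- so a fully populated `gconf` is assumed for actual use.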
function AffineLayer:__init(id, global_conf, layer_conf) - self.id = id - self.dim_in = layer_conf.dim_in - self.dim_out = layer_conf.dim_out - if layer_conf.ltp ~= nil and layer_conf.ltp1 == nil then - layer_conf.ltp1 = layer_conf.ltp - end + nerv.Layer.__init(self, id, global_conf, layer_conf) + self:check_dim_len(-1, 1) -- exactly one output, allow multiple inputs + self:bind_params() +end + +function AffineLayer:bind_params() for i = 1, #self.dim_in do local pid = "ltp" .. i local pid_list = i == 1 and {pid, "ltp"} or pid - self["ltp" .. i] = self:find_param(pid_list, layer_conf, global_conf, + self["ltp" .. i] = self:find_param(pid_list, self.lconf, self.gconf, nerv.LinearTransParam, - {self.dim_in[i], self.dim_out[1]}) + {self.dim_in[i], self.dim_out[1]}) end self.ltp = self.ltp1 -- alias of ltp1 - self.bp = self:find_param("bp", layer_conf, global_conf, + self.bp = self:find_param("bp", self.lconf, self.gconf, nerv.BiasParam, {1, self.dim_out[1]}) - self.gconf = global_conf - self:check_dim_len(-1, 1) -- exactly one output, allow multiple inputs + end function AffineLayer:init(batch_size) @@ -142,7 +145,7 @@ function AffineLayer:back_propagate(bp_err, next_bp_err, input, output) end function AffineLayer:get_params() - local pr = nerv.ParamRepo({self.ltp1, self.bp}) + local pr = nerv.ParamRepo({self.ltp1, self.bp}, self.loc_type) for i = 2, #self.dim_in do pr:add(self["ltp" .. i].id, self["ltp" .. i]) end diff --git a/nerv/layer/bias.lua b/nerv/layer/bias.lua index 924c3da..191be78 100644 --- a/nerv/layer/bias.lua +++ b/nerv/layer/bias.lua @@ -1,12 +1,15 @@ local BiasLayer = nerv.class("nerv.BiasLayer", "nerv.Layer") function BiasLayer:__init(id, global_conf, layer_conf) - self.id = id - self.gconf = global_conf - self.bias = layer_conf.bias - self.dim_in = layer_conf.dim_in - self.dim_out = layer_conf.dim_out + nerv.Layer.__init(self, id, global_conf, layer_conf) self:check_dim_len(1, 1) + self:bind_params() +end + +function BiasLayer:bind_params() + self.bias = self:find_param("bias", self.lconf, self.gconf, + nerv.BiasParam, + {1, self.dim_out[1]}) end function BiasLayer:init() @@ -28,5 +31,5 @@ function BiasLayer:propagate(input, output) end function BiasLayer:get_params() - return nerv.ParamRepo({self.bias}) + return nerv.ParamRepo({self.bias}, self.loc_type) end diff --git a/nerv/layer/combiner.lua b/nerv/layer/combiner.lua index 22e89a9..028c970 100644 --- a/nerv/layer/combiner.lua +++ b/nerv/layer/combiner.lua @@ -1,16 +1,8 @@ local CombinerLayer = nerv.class('nerv.CombinerLayer', 'nerv.Layer') function CombinerLayer:__init(id, global_conf, layer_conf) - self.id = id + nerv.Layer.__init(self, id, global_conf, layer_conf) self.lambda = layer_conf.lambda - self.dim_in = layer_conf.dim_in - self.dim_out = layer_conf.dim_out - self.gconf = global_conf - if self.gconf.use_cpu then - self.mat_type = self.gconf.mmat_type - else - self.mat_type = self.gconf.cumat_type - end self:check_dim_len(#self.lambda, -1) if #self.dim_in < 1 then nerv.error("no input specified") @@ -20,6 +12,10 @@ function CombinerLayer:__init(id, global_conf, layer_conf) end end +function CombinerLayer:bind_params() + -- do nothing +end + function CombinerLayer:init(batch_size) local dim = self.dim_in[1] for i = 2, #self.dim_in do @@ -66,5 +62,5 @@ function CombinerLayer:back_propagate(bp_err, next_bp_err, input, output) end function CombinerLayer:get_params() - return nerv.ParamRepo({}) + return nerv.ParamRepo({}, self.loc_type) end diff --git a/nerv/layer/dropout.lua b/nerv/layer/dropout.lua index 
42660cc..39a8963 100644 --- a/nerv/layer/dropout.lua +++ b/nerv/layer/dropout.lua @@ -1,22 +1,17 @@ local DropoutLayer = nerv.class("nerv.DropoutLayer", "nerv.Layer") function DropoutLayer:__init(id, global_conf, layer_conf) - self.id = id - self.gconf = global_conf - if self.gconf.use_cpu then - self.mat_type = self.gconf.mmat_type - else - self.mat_type = self.gconf.cumat_type - end - self.rate = layer_conf.dropout_rate or global_conf.dropout_rate - if self.rate == nil then + nerv.Layer.__init(self, id, global_conf, layer_conf) + if self.gconf.dropout_rate == nil then nerv.warning("[DropoutLayer:propagate] dropout rate is not set") end - self.dim_in = layer_conf.dim_in - self.dim_out = layer_conf.dim_out self:check_dim_len(1, 1) -- two inputs: nn output and label end +function DropoutLayer:bind_params() + -- do nothing +end + function DropoutLayer:init(batch_size, chunk_size) if self.dim_in[1] ~= self.dim_out[1] then nerv.error("mismatching dimensions of input and output") @@ -45,12 +40,12 @@ function DropoutLayer:propagate(input, output, t) if t == nil then t = 1 end - if self.rate then + if self.gconf.dropout_rate ~= 0 then self.mask[t]:rand_uniform() -- since we will lose a portion of the actvations, we multiply the -- activations by 1 / (1 - rate) to compensate - self.mask[t]:thres_mask(self.mask[t], self.rate, - 0, 1 / (1.0 - self.rate)) + self.mask[t]:thres_mask(self.mask[t], self.gconf.dropout_rate, + 0, 1 / (1.0 - self.gconf.dropout_rate)) output[1]:mul_elem(input[1], self.mask[t]) else output[1]:copy_fromd(input[1]) @@ -65,7 +60,7 @@ function DropoutLayer:back_propagate(bp_err, next_bp_err, input, output, t) if t == nil then t = 1 end - if self.rate then + if self.gconf.dropout_rate then next_bp_err[1]:mul_elem(bp_err[1], self.mask[t]) else next_bp_err[1]:copy_fromd(bp_err[1]) @@ -73,5 +68,5 @@ function DropoutLayer:back_propagate(bp_err, next_bp_err, input, output, t) end function DropoutLayer:get_params() - return nerv.ParamRepo({}) + return nerv.ParamRepo({}, self.loc_type) end diff --git a/nerv/layer/duplicate.lua b/nerv/layer/duplicate.lua new file mode 100644 index 0000000..137472b --- /dev/null +++ b/nerv/layer/duplicate.lua @@ -0,0 +1,41 @@ +local DuplicateLayer = nerv.class('nerv.DuplicateLayer', 'nerv.Layer') + +function DuplicateLayer:__init(id, global_conf, layer_conf) + nerv.Layer.__init(self, id, global_conf, layer_conf) + self:check_dim_len(1, -1) + if #self.dim_out < 1 then + nerv.error('no output specified') + end + for i = 1, #self.dim_out do + if self.dim_out[i] ~= self.dim_in[1] then + nerv.error('mismatching dimensions of outputs') + end + end +end + +function DuplicateLayer:init() +end + +function DuplicateLayer:batch_resize() +end + +function DuplicateLayer:propagate(input, output) + for i = 1, #self.dim_out do + output[i]:copy_from(input[1]) + -- FIXME: use reference copy to speed up + end +end + +function DuplicateLayer:back_propagate(bp_err, next_bp_err) + next_bp_err[1]:copy_from(bp_err[1]) + for i = 2, #self.dim_out do + next_bp_err[1]:add(next_bp_err[1], bp_err[i], 1.0, 1.0) + end +end + +function DuplicateLayer:update() +end + +function DuplicateLayer:get_params() + return nerv.ParamRepo({}, self.loc_type) +end diff --git a/nerv/layer/elem_mul.lua b/nerv/layer/elem_mul.lua index fe80a3f..f03649b 100644 --- a/nerv/layer/elem_mul.lua +++ b/nerv/layer/elem_mul.lua @@ -1,14 +1,15 @@ local ElemMulLayer = nerv.class('nerv.ElemMulLayer', 'nerv.Layer') function ElemMulLayer:__init(id, global_conf, layer_conf) - self.id = id - self.dim_in = layer_conf.dim_in 
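-- The boilerplate deleted here is the same pattern removed from the other
-- primitive layers in this diff; it now lives in the shared `nerv.Layer`
-- constructor. A sketch of the resulting pattern, using a hypothetical
-- parameter-free layer for illustration (not a layer shipped with NERV):
local MyLayer = nerv.class('nerv.MyLayer', 'nerv.Layer')
function MyLayer:__init(id, global_conf, layer_conf)
    -- sets self.id, self.gconf, self.lconf, self.dim_in/self.dim_out,
    -- self.mat_type and self.loc_type in one place
    nerv.Layer.__init(self, id, global_conf, layer_conf)
    self:check_dim_len(1, 1)
end
function MyLayer:bind_params()
    -- parameter-free layers simply do nothing here
end
function MyLayer:get_params()
    return nerv.ParamRepo({}, self.loc_type)
end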
- self.dim_out = layer_conf.dim_out - self.gconf = global_conf + nerv.Layer.__init(self, id, global_conf, layer_conf) -- element-wise multiplication of input[1] and input[2] self:check_dim_len(2, 1) end +function ElemMulLayer:bind_params() + -- do nothing +end + function ElemMulLayer:init(batch_size) if self.dim_in[1] ~= self.dim_in[2] or self.dim_in[1] ~= self.dim_out[1] then @@ -34,5 +35,5 @@ function ElemMulLayer:update(bp_err, input, output) end function ElemMulLayer:get_params() - return nerv.ParamRepo({}) + return nerv.ParamRepo({}, self.loc_type) end diff --git a/nerv/layer/graph.lua b/nerv/layer/graph.lua new file mode 100644 index 0000000..68d5f51 --- /dev/null +++ b/nerv/layer/graph.lua @@ -0,0 +1,156 @@ +local GraphLayer = nerv.class('nerv.GraphLayer', 'nerv.Layer') + +function GraphLayer:__init(id, global_conf, layer_conf) + nerv.Layer.__init(self, id, global_conf, layer_conf) + self:graph_init(layer_conf.layer_repo, layer_conf.connections) +end + +local function parse_id(str) + local id, port, _ + _, _, id, port = string.find(str, "([a-zA-Z0-9_.]+)%[([0-9]+)%]") + if id == nil or port == nil then + _, _, id, port = string.find(str, "(.+)%[([0-9]+)%]") + if not (id == "<input>" or id == "<output>") then + nerv.error("wrong format of connection id") + end + end + port = tonumber(port) + return id, port +end + +function GraphLayer:add_prefix(layers, connections) + local function ap(name) + return self.id .. '.' .. name + end + + for layer_type, sublayers in pairs(layers) do + local tmp = {} + for name, layer_config in pairs(sublayers) do + tmp[ap(name)] = layer_config + end + layers[layer_type] = tmp + end + + for i = 1, #connections do + local from, to = connections[i][1], connections[i][2] + if parse_id(from) ~= '<input>' then + connections[i][1] = ap(from) + end + if parse_id(to) ~= '<output>' then + connections[i][2] = ap(to) + end + end +end + +function GraphLayer:discover(id, layer_repo) + if id == '<output>' then + id = '<input>' + end + local layers = self.layers + local ref = layers[id] + if ref == nil then + local layer = layer_repo:get_layer(id) + local dim_in, dim_out = layer:get_dim() + self.layer_num = self.layer_num + 1 + ref = { + layer = layer, + inputs = {}, + outputs = {}, + dim_in = dim_in, + dim_out = dim_out, + id = self.layer_num, + } + layers[id] = ref + end + return ref +end + +function GraphLayer:graph_init(layer_repo, connections) + local layers = {} + layers['<input>'] = { + inputs = {}, + outputs = {}, + dim_in = self.dim_out, + dim_out = self.dim_in, + id = 0, + } + self.layers = layers + self.layer_num = 0 + self.connections = {} + + -- check data dimension between connected ports + for _, edge in pairs(connections) do + local from, to, time = edge[1], edge[2], edge[3] + local id_from, port_from = parse_id(from) + local id_to, port_to = parse_id(to) + local ref_from = self:discover(id_from, layer_repo) + local ref_to = self:discover(id_to, layer_repo) + if ref_from.outputs[port_from] ~= nil then + nerv.error('%s has already been attached', from) + end + if ref_to.inputs[port_to] ~= nil then + nerv.error('%s has already been attached', to) + end + if ref_from.dim_out[port_from] ~= ref_to.dim_in[port_to] then + nerv.error('mismatching data dimension between %s and %s', from, to) + end + if ref_from.id == 0 and ref_to.id == 0 then + nerv.error('short-circuit connection between <input> and <output>') + end + ref_from.outputs[port_from] = true + ref_to.inputs[port_to] = true + table.insert(self.connections, {ref_from.id, port_from, ref_to.id, port_to, 
time}) + end + + -- check dangling ports + for id, ref in pairs(layers) do + if id ~= '<input>' then + for i = 1, #ref.dim_in do + if ref.inputs[i] == nil then + nerv.error('dangling input port %d of layer %s', i, id) + end + end + for i = 1, #ref.dim_out do + if ref.outputs[i] == nil then + nerv.error('dangling output port %d of layer %s', i, id) + end + end + end + end + for i = 1, #self.dim_in do + if layers['<input>'].outputs[i] == nil then + nerv.error('dangling port %d of layer <input>', i) + end + end + for i = 1, #self.dim_out do + if layers['<input>'].inputs[i] == nil then + nerv.error('dangling port %d of layer <output>', i) + end + end +end + +function GraphLayer:set_attr(name, value) + self[name] = value + for id, ref in pairs(self.layers) do + if id ~= '<input>' then + ref.layer:set_attr(name, value) + end + end +end + +function GraphLayer:get_sublayer(id) + if self.layers[id] == nil or id == '<input>' then + nerv.error('layer with id %s not found', id) + end + return self.layers[id].layer +end + +function GraphLayer:get_params() + local param_repos = {} + for id, ref in pairs(self.layers) do + if id ~= '<input>' then + table.insert(param_repos, ref.layer:get_params()) + end + end + return nerv.ParamRepo.merge(param_repos, self.loc_type) +end diff --git a/nerv/layer/gru.lua b/nerv/layer/gru.lua index e81d21a..71718d7 100644 --- a/nerv/layer/gru.lua +++ b/nerv/layer/gru.lua @@ -4,11 +4,7 @@ function GRULayer:__init(id, global_conf, layer_conf) -- input1:x -- input2:h -- input3:c (h^~) - self.id = id - self.dim_in = layer_conf.dim_in - self.dim_out = layer_conf.dim_out - self.gconf = global_conf - + nerv.Layer.__init(self, id, global_conf, layer_conf) if self.dim_in[2] ~= self.dim_out[1] then nerv.error("dim_in[2](%d) mismatch with dim_out[1](%d)", self.dim_in[2], self.dim_out[1]) @@ -17,7 +13,7 @@ function GRULayer:__init(id, global_conf, layer_conf) -- prepare a DAGLayer to hold the lstm structure local pr = layer_conf.pr if pr == nil then - pr = nerv.ParamRepo() + pr = nerv.ParamRepo({}, self.loc_type) end local function ap(str) @@ -63,7 +59,7 @@ function GRULayer:__init(id, global_conf, layer_conf) }, } - local layerRepo = nerv.LayerRepo(layers, pr, global_conf) + self.lrepo = nerv.LayerRepo(layers, pr, global_conf) local connections = { ["<input>[1]"] = ap("inputXDup[1]"), @@ -97,12 +93,20 @@ function GRULayer:__init(id, global_conf, layer_conf) self.dag = nerv.DAGLayer(self.id, global_conf, {dim_in = self.dim_in, dim_out = self.dim_out, - sub_layers = layerRepo, + sub_layers = self.lrepo, connections = connections}) self:check_dim_len(2, 1) -- x, h and h end +function GRULayer:bind_params() + local pr = layer_conf.pr + if pr == nil then + pr = nerv.ParamRepo({}, self.loc_type) + end + self.lrepo:rebind(pr) +end + function GRULayer:init(batch_size, chunk_size) self.dag:init(batch_size, chunk_size) end diff --git a/nerv/layer/identity.lua b/nerv/layer/identity.lua new file mode 100644 index 0000000..d56337d --- /dev/null +++ b/nerv/layer/identity.lua @@ -0,0 +1,30 @@ +local IdentityLayer = nerv.class('nerv.IdentityLayer', 'nerv.Layer') + +function IdentityLayer:__init(id, global_conf, layer_conf) + nerv.Layer.__init(self, id, global_conf, layer_conf) + self:check_dim_len(1, 1) + if self.dim_in[1] ~= self.dim_out[1] then + nerv.error('mismatching dimensions of input and output') + end +end + +function IdentityLayer:init() +end + +function IdentityLayer:batch_resize() +end + +function IdentityLayer:propagate(input, output) + output[1]:copy_from(input[1]) +end + +function 
IdentityLayer:back_propagate(bp_err, next_bp_err) + next_bp_err[1]:copy_from(bp_err[1]) +end + +function IdentityLayer:update() +end + +function IdentityLayer:get_params() + return nerv.ParamRepo({}, self.loc_type) +end diff --git a/nerv/layer/init.lua b/nerv/layer/init.lua index 54f33ae..475ef62 100644 --- a/nerv/layer/init.lua +++ b/nerv/layer/init.lua @@ -30,7 +30,18 @@ end local Layer = nerv.class('nerv.Layer') function Layer:__init(id, global_conf, layer_conf) - nerv.error_method_not_implemented() + self.id = id + self.gconf = global_conf + self.lconf = layer_conf + if self.gconf.use_cpu then + self.mat_type = self.gconf.mmat_type + self.loc_type = nerv.ParamRepo.LOC_TYPES.ON_HOST + else + self.mat_type = self.gconf.cumat_type + self.loc_type = nerv.ParamRepo.LOC_TYPES.ON_DEVICE + end + self.dim_in = layer_conf.dim_in + self.dim_out = layer_conf.dim_out end function Layer:init(batch_size) @@ -66,34 +77,49 @@ function Layer:get_params() nerv.error_method_not_implemented() end +function Layer:bind_params() + nerv.error_method_not_implemented() +end + function Layer:get_dim() return self.dim_in, self.dim_out end -function Layer:find_param(pid_list, lconf, gconf, p_type, p_dim) - if type(pid_list) == "string" then - pid_list = {pid_list} +function Layer:set_attr(name, value) + self[name] = value +end + +function Layer:get_sublayer(id) + nerv.error('primitive layer does not have sublayers') +end + +function Layer:find_param(plist, lconf, gconf, p_type, p_dim) + if type(plist) == "string" then + plist = {plist} + end + if lconf.params == nil then + lconf.params = {} end - pid_list_str = table.tostring(pid_list) - for i, pid in ipairs(pid_list) do - if lconf[pid] ~= nil then - nerv.info("param [%s] of layer [%s] found in `layer_conf`.", pid, self.id) - return lconf[pid] + plist_str = table.tostring(plist) + local pid + for i, pname in ipairs(plist) do + if lconf.params[pname] ~= nil then + nerv.info("param id for [%s] of layer [%s] specified in `layer_conf.params`.", pname, self.id) + pid = lconf.params[pname] end - local pid_g = self.id .. '_' .. pid --global identifier - local pr = lconf.pr - local p - if pr ~= nil and pr:has_param(pid_g) == true then - nerv.info("param [%s] of layer [%s] found in `layer_conf.pr`.", pid_list_str, self.id) - p = pr:get_param(pid_g) - return p + if lconf.pr:has_param(pid) then + return lconf.pr:get_param(pid) end end - nerv.info("param [%s] of layer [%s] is not found in `layer_conf` or `layer_conf.pr`, " .. - "switch to auto-generate", pid_list_str, self.id) - local pid_g = self.id .. '_' .. pid_list[1] - p = p_type(pid_g, gconf) - p.trans = gconf.cumat_type(unpack(p_dim)) + pid = self.id .. '_' .. plist[1] + if lconf.pr:has_param(pid) then + nerv.info("param id for [%s] of layer [%s] is generated automatically.", pname, self.id) + return lconf.pr:get_param(pid) + end + nerv.info("param id for [%s] of layer [%s] is not found in the specified param repo, " .. 
+ "switch to auto-generate", plist_str, self.id) + local p = p_type(pid, gconf) + p.trans = self.mat_type(unpack(p_dim)) if type(gconf.param_random) ~= "function" then nerv.error("a param generate function is needed") end @@ -101,6 +127,7 @@ function Layer:find_param(pid_list, lconf, gconf, p_type, p_dim) return p end +nerv.include('graph.lua') nerv.include('affine.lua') nerv.include('sigmoid.lua') nerv.include('tanh.lua') @@ -115,6 +142,9 @@ nerv.include('lstm.lua') nerv.include('lstm_gate.lua') nerv.include('dropout.lua') nerv.include('gru.lua') +nerv.include('rnn.lua') +nerv.include('duplicate.lua') +nerv.include('identity.lua') -- The following lines are for backward compatibility, and will be removed in -- the future. The use of these names are deprecated. diff --git a/nerv/layer/lstm.lua b/nerv/layer/lstm.lua index d8eee71..56f674a 100644 --- a/nerv/layer/lstm.lua +++ b/nerv/layer/lstm.lua @@ -1,143 +1,85 @@ -local LSTMLayer = nerv.class('nerv.LSTMLayer', 'nerv.Layer') +local LSTMLayer = nerv.class('nerv.LSTMLayer', 'nerv.GraphLayer') function LSTMLayer:__init(id, global_conf, layer_conf) - -- input1:x - -- input2:h - -- input3:c - self.id = id - self.dim_in = layer_conf.dim_in - self.dim_out = layer_conf.dim_out - self.gconf = global_conf + nerv.Layer.__init(self, id, global_conf, layer_conf) + self:check_dim_len(1, 1) + + local din = layer_conf.dim_in[1] + local dout = layer_conf.dim_out[1] - -- prepare a DAGLayer to hold the lstm structure local pr = layer_conf.pr if pr == nil then - pr = nerv.ParamRepo() - end - - local function ap(str) - return self.id .. '.' .. str + pr = nerv.ParamRepo({}, self.loc_type) end - local din1, din2, din3 = self.dim_in[1], self.dim_in[2], self.dim_in[3] - local dout1, dout2 = self.dim_out[1], self.dim_out[2] - local layers = { - ["nerv.CombinerLayer"] = { - [ap("inputXDup")] = {{}, {dim_in = {din1}, - dim_out = {din1, din1, din1, din1}, - lambda = {1}}}, - [ap("inputHDup")] = {{}, {dim_in = {din2}, - dim_out = {din2, din2, din2, din2}, - lambda = {1}}}, - - [ap("inputCDup")] = {{}, {dim_in = {din3}, - dim_out = {din3, din3, din3}, - lambda = {1}}}, - - [ap("mainCDup")] = {{}, {dim_in = {din3, din3}, - dim_out = {din3, din3, din3}, - lambda = {1, 1}}}, + local layers = { + ['nerv.CombinerLayer'] = { + mainCombine = {dim_in = {dout, dout}, dim_out = {dout}, lambda = {1, 1}}, }, - ["nerv.AffineLayer"] = { - [ap("mainAffineL")] = {{}, {dim_in = {din1, din2}, - dim_out = {dout1}, - pr = pr}}, + ['nerv.DuplicateLayer'] = { + inputDup = {dim_in = {din}, dim_out = {din, din, din, din}}, + outputDup = {dim_in = {dout}, dim_out = {dout, dout, dout, dout, dout}}, + cellDup = {dim_in = {dout}, dim_out = {dout, dout, dout, dout, dout}}, }, - ["nerv.TanhLayer"] = { - [ap("mainTanhL")] = {{}, {dim_in = {dout1}, dim_out = {dout1}}}, - [ap("outputTanhL")] = {{}, {dim_in = {dout1}, dim_out = {dout1}}}, + ['nerv.AffineLayer'] = { + mainAffine = {dim_in = {din, dout}, dim_out = {dout}, pr = pr}, }, - ["nerv.LSTMGateLayer"] = { - [ap("forgetGateL")] = {{}, {dim_in = {din1, din2, din3}, - dim_out = {din3}, pr = pr, - param_type = {'N', 'N', 'D'}}}, - [ap("inputGateL")] = {{}, {dim_in = {din1, din2, din3}, - dim_out = {din3}, pr = pr, - param_type = {'N', 'N', 'D'}}}, - [ap("outputGateL")] = {{}, {dim_in = {din1, din2, din3}, - dim_out = {din3}, pr = pr, - param_type = {'N', 'N', 'D'}}}, - + ['nerv.TanhLayer'] = { + mainTanh = {dim_in = {dout}, dim_out = {dout}}, + outputTanh = {dim_in = {dout}, dim_out = {dout}}, }, - ["nerv.ElemMulLayer"] = { - [ap("inputGMulL")] = {{}, 
{dim_in = {din3, din3}, - dim_out = {din3}}}, - [ap("forgetGMulL")] = {{}, {dim_in = {din3, din3}, - dim_out = {din3}}}, - [ap("outputGMulL")] = {{}, {dim_in = {din3, din3}, - dim_out = {din3}}}, + ['nerv.LSTMGateLayer'] = { + forgetGate = {dim_in = {din, dout, dout}, dim_out = {dout}, param_type = {'N', 'N', 'D'}, pr = pr}, + inputGate = {dim_in = {din, dout, dout}, dim_out = {dout}, param_type = {'N', 'N', 'D'}, pr = pr}, + outputGate = {dim_in = {din, dout, dout}, dim_out = {dout}, param_type = {'N', 'N', 'D'}, pr = pr}, + }, + ['nerv.ElemMulLayer'] = { + inputGateMul = {dim_in = {dout, dout}, dim_out = {dout}}, + forgetGateMul = {dim_in = {dout, dout}, dim_out = {dout}}, + outputGateMul = {dim_in = {dout, dout}, dim_out = {dout}}, }, } - local layerRepo = nerv.LayerRepo(layers, pr, global_conf) - local connections = { - ["<input>[1]"] = ap("inputXDup[1]"), - ["<input>[2]"] = ap("inputHDup[1]"), - ["<input>[3]"] = ap("inputCDup[1]"), - - [ap("inputXDup[1]")] = ap("mainAffineL[1]"), - [ap("inputHDup[1]")] = ap("mainAffineL[2]"), - [ap("mainAffineL[1]")] = ap("mainTanhL[1]"), - - [ap("inputXDup[2]")] = ap("inputGateL[1]"), - [ap("inputHDup[2]")] = ap("inputGateL[2]"), - [ap("inputCDup[1]")] = ap("inputGateL[3]"), - - [ap("inputXDup[3]")] = ap("forgetGateL[1]"), - [ap("inputHDup[3]")] = ap("forgetGateL[2]"), - [ap("inputCDup[2]")] = ap("forgetGateL[3]"), - - [ap("mainTanhL[1]")] = ap("inputGMulL[1]"), - [ap("inputGateL[1]")] = ap("inputGMulL[2]"), - - [ap("inputCDup[3]")] = ap("forgetGMulL[1]"), - [ap("forgetGateL[1]")] = ap("forgetGMulL[2]"), - - [ap("inputGMulL[1]")] = ap("mainCDup[1]"), - [ap("forgetGMulL[1]")] = ap("mainCDup[2]"), - - [ap("inputXDup[4]")] = ap("outputGateL[1]"), - [ap("inputHDup[4]")] = ap("outputGateL[2]"), - [ap("mainCDup[3]")] = ap("outputGateL[3]"), - - [ap("mainCDup[2]")] = "<output>[2]", - [ap("mainCDup[1]")] = ap("outputTanhL[1]"), - - [ap("outputTanhL[1]")] = ap("outputGMulL[1]"), - [ap("outputGateL[1]")] = ap("outputGMulL[2]"), - - [ap("outputGMulL[1]")] = "<output>[1]", + -- lstm input + {'<input>[1]', 'inputDup[1]', 0}, + + -- input gate + {'inputDup[1]', 'inputGate[1]', 0}, + {'outputDup[1]', 'inputGate[2]', 1}, + {'cellDup[1]', 'inputGate[3]', 1}, + + -- forget gate + {'inputDup[2]', 'forgetGate[1]', 0}, + {'outputDup[2]', 'forgetGate[2]', 1}, + {'cellDup[2]', 'forgetGate[3]', 1}, + + -- lstm cell + {'forgetGate[1]', 'forgetGateMul[1]', 0}, + {'cellDup[3]', 'forgetGateMul[2]', 1}, + {'inputDup[3]', 'mainAffine[1]', 0}, + {'outputDup[3]', 'mainAffine[2]', 1}, + {'mainAffine[1]', 'mainTanh[1]', 0}, + {'inputGate[1]', 'inputGateMul[1]', 0}, + {'mainTanh[1]', 'inputGateMul[2]', 0}, + {'inputGateMul[1]', 'mainCombine[1]', 0}, + {'forgetGateMul[1]', 'mainCombine[2]', 0}, + {'mainCombine[1]', 'cellDup[1]', 0}, + + -- forget gate + {'inputDup[4]', 'outputGate[1]', 0}, + {'outputDup[4]', 'outputGate[2]', 1}, + {'cellDup[4]', 'outputGate[3]', 0}, + + -- lstm output + {'cellDup[5]', 'outputTanh[1]', 0}, + {'outputGate[1]', 'outputGateMul[1]', 0}, + {'outputTanh[1]', 'outputGateMul[2]', 0}, + {'outputGateMul[1]', 'outputDup[1]', 0}, + {'outputDup[5]', '<output>[1]', 0}, } - self.dag = nerv.DAGLayer(self.id, global_conf, - {dim_in = self.dim_in, - dim_out = self.dim_out, - sub_layers = layerRepo, - connections = connections}) - - self:check_dim_len(3, 2) -- x, h, c and h, c -end - -function LSTMLayer:init(batch_size, chunk_size) - self.dag:init(batch_size, chunk_size) -end - -function LSTMLayer:batch_resize(batch_size, chunk_size) - self.dag:batch_resize(batch_size, 
chunk_size) -end - -function LSTMLayer:update(bp_err, input, output, t) - self.dag:update(bp_err, input, output, t) -end - -function LSTMLayer:propagate(input, output, t) - self.dag:propagate(input, output, t) -end - -function LSTMLayer:back_propagate(bp_err, next_bp_err, input, output, t) - self.dag:back_propagate(bp_err, next_bp_err, input, output, t) -end -function LSTMLayer:get_params() - return self.dag:get_params() + self:add_prefix(layers, connections) + local layer_repo = nerv.LayerRepo(layers, pr, global_conf) + self:graph_init(layer_repo, connections) end diff --git a/nerv/layer/lstm_gate.lua b/nerv/layer/lstm_gate.lua index 8785b4f..e690721 100644 --- a/nerv/layer/lstm_gate.lua +++ b/nerv/layer/lstm_gate.lua @@ -2,24 +2,23 @@ local LSTMGateLayer = nerv.class('nerv.LSTMGateLayer', 'nerv.Layer') -- NOTE: this is a full matrix gate function LSTMGateLayer:__init(id, global_conf, layer_conf) - self.id = id - self.dim_in = layer_conf.dim_in - self.dim_out = layer_conf.dim_out + nerv.Layer.__init(self, id, global_conf, layer_conf) self.param_type = layer_conf.param_type - self.gconf = global_conf + self:check_dim_len(-1, 1) --accept multiple inputs + self:bind_params() +end +function LSTMGateLayer:bind_params() for i = 1, #self.dim_in do - self["ltp" .. i] = self:find_param("ltp" .. i, layer_conf, global_conf, + self["ltp" .. i] = self:find_param("ltp" .. i, self.lconf, self.gconf, nerv.LinearTransParam, {self.dim_in[i], self.dim_out[1]}) if self.param_type[i] == 'D' then self["ltp" .. i].trans:diagonalize() end end - self.bp = self:find_param("bp", layer_conf, global_conf, + self.bp = self:find_param("bp", self.lconf, self.gconf, nerv.BiasParam, {1, self.dim_out[1]}) - - self:check_dim_len(-1, 1) --accept multiple inputs end function LSTMGateLayer:init(batch_size) @@ -76,7 +75,7 @@ function LSTMGateLayer:update(bp_err, input, output) end function LSTMGateLayer:get_params() - local pr = nerv.ParamRepo({self.bp}) + local pr = nerv.ParamRepo({self.bp}, self.loc_type) for i = 1, #self.dim_in do pr:add(self["ltp" .. i].id, self["ltp" .. 
i]) end diff --git a/nerv/layer/mse.lua b/nerv/layer/mse.lua index 1c218d0..458d086 100644 --- a/nerv/layer/mse.lua +++ b/nerv/layer/mse.lua @@ -1,18 +1,14 @@ local MSELayer = nerv.class("nerv.MSELayer", "nerv.Layer") function MSELayer:__init(id, global_conf, layer_conf) - self.id = id - self.dim_in = layer_conf.dim_in - self.dim_out = layer_conf.dim_out - self.gconf = global_conf - if self.gconf.use_cpu then - self.mat_type = self.gconf.mmat_type - else - self.mat_type = self.gconf.cumat_type - end + nerv.Layer.__init(self, id, global_conf, layer_conf) self:check_dim_len(2, -1) end +function MSELayer:bind_params() + -- do nothing +end + function MSELayer:init(batch_size) if self.dim_in[1] ~= self.dim_in[2] then nerv.error("mismatching dimensions of previous network output and labels") @@ -61,5 +57,5 @@ function MSELayer:back_propagate(bp_err, next_bp_err, input, output) end function MSELayer:get_params() - return nerv.ParamRepo({}) + return nerv.ParamRepo({}, self.loc_type) end diff --git a/nerv/layer/rnn.lua b/nerv/layer/rnn.lua new file mode 100644 index 0000000..0b5ccaa --- /dev/null +++ b/nerv/layer/rnn.lua @@ -0,0 +1,42 @@ +local RNNLayer = nerv.class('nerv.RNNLayer', 'nerv.GraphLayer') + +function RNNLayer:__init(id, global_conf, layer_conf) + nerv.Layer.__init(self, id, global_conf, layer_conf) + self:check_dim_len(1, 1) + + if layer_conf.activation == nil then + layer_conf.activation = 'nerv.SigmoidLayer' + end + + local din = layer_conf.dim_in[1] + local dout = layer_conf.dim_out[1] + + local pr = layer_conf.pr + if pr == nil then + pr = nerv.ParamRepo({}, self.loc_type) + end + + local layers = { + ['nerv.AffineLayer'] = { + main = {dim_in = {din, dout}, dim_out = {dout}, pr = pr}, + }, + [layers.activation] = { + activation = {dim_in = {dout}, dim_out = {dout}}, + }, + ['nerv.DuplicateLayer'] = { + duplicate = {dim_in = {dout}, dim_out = {dout, dout}}, + }, + } + + local connections = { + {'<input>[1]', 'main[1]', 0}, + {'main[1]', 'activation[1]', 0}, + {'activation[1]', 'duplicate[1]', 0}, + {'duplicate[1]', 'main[2]', 1}, + {'duplicate[2]', '<output>[1]', 0}, + } + + self:add_prefix(layers, connections) + local layer_repo = nerv.LayerRepo(layers, pr, global_conf) + self:graph_init(layer_repo, connections) +end diff --git a/nerv/layer/sigmoid.lua b/nerv/layer/sigmoid.lua index 0a8bcdc..5974ffc 100644 --- a/nerv/layer/sigmoid.lua +++ b/nerv/layer/sigmoid.lua @@ -1,19 +1,20 @@ local SigmoidLayer = nerv.class("nerv.SigmoidLayer", "nerv.Layer") function SigmoidLayer:__init(id, global_conf, layer_conf) - self.id = id - self.gconf = global_conf - self.dim_in = layer_conf.dim_in - self.dim_out = layer_conf.dim_out + nerv.Layer.__init(self, id, global_conf, layer_conf) self:check_dim_len(1, 1) -end - -function SigmoidLayer:init() if self.dim_in[1] ~= self.dim_out[1] then nerv.error("mismatching dimensions of input and output") end end +function SigmoidLayer:bind_params() + -- do nothing +end + +function SigmoidLayer:init() +end + function SigmoidLayer:batch_resize(batch_size) -- do nothing end @@ -31,5 +32,5 @@ function SigmoidLayer:back_propagate(bp_err, next_bp_err, input, output) end function SigmoidLayer:get_params() - return nerv.ParamRepo({}) + return nerv.ParamRepo({}, self.loc_type) end diff --git a/nerv/layer/softmax.lua b/nerv/layer/softmax.lua index 4205b66..f7a5163 100644 --- a/nerv/layer/softmax.lua +++ b/nerv/layer/softmax.lua @@ -1,13 +1,14 @@ local SoftmaxLayer = nerv.class("nerv.SoftmaxLayer", "nerv.Layer") function SoftmaxLayer:__init(id, global_conf, layer_conf) - 
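-- Editor's note and sketch (illustration, not part of this commit): in the
-- connection lists used by the new graph layers, the third element of each triple
-- is a time shift, so {'duplicate[1]', 'main[2]', 1} in rnn.lua above feeds the
-- previous frame's hidden state back into the affine input, while 0 marks a
-- same-frame edge. The activation is configurable through layer_conf.activation
-- (the lookup written as layers.activation in the hunk above appears intended to
-- read layer_conf.activation), defaulting to nerv.SigmoidLayer; a hypothetical
-- declaration with a different activation:
--
--     ['nerv.RNNLayer'] = {
--         rnn1 = {dim_in = {429}, dim_out = {1024},
--                 activation = 'nerv.TanhLayer', pr = pr},
--     },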
self.id = id - self.gconf = global_conf - self.dim_in = layer_conf.dim_in - self.dim_out = layer_conf.dim_out + nerv.Layer.__init(self, id, global_conf, layer_conf) self:check_dim_len(1, 1) -- two inputs: nn output and label end +function SoftmaxLayer:bind_params() + -- do nothing +end + function SoftmaxLayer:init(batch_size) if self.dim_in[1] ~= self.dim_out[1] then nerv.error("mismatching dimensions of input and output") @@ -31,5 +32,5 @@ function SoftmaxLayer:back_propagate(bp_err, next_bp_err, input, output) end function SoftmaxLayer:get_params() - return nerv.ParamRepo({}) + return nerv.ParamRepo({}, self.loc_type) end diff --git a/nerv/layer/softmax_ce.lua b/nerv/layer/softmax_ce.lua index d7d650e..7b4a80c 100644 --- a/nerv/layer/softmax_ce.lua +++ b/nerv/layer/softmax_ce.lua @@ -1,15 +1,7 @@ local SoftmaxCELayer = nerv.class("nerv.SoftmaxCELayer", "nerv.Layer") function SoftmaxCELayer:__init(id, global_conf, layer_conf) - self.id = id - self.gconf = global_conf - if self.gconf.use_cpu then - self.mat_type = self.gconf.mmat_type - else - self.mat_type = self.gconf.cumat_type - end - self.dim_in = layer_conf.dim_in - self.dim_out = layer_conf.dim_out + nerv.Layer.__init(self, id, global_conf, layer_conf) self.compressed = layer_conf.compressed if self.compressed == nil then self.compressed = false @@ -17,6 +9,10 @@ function SoftmaxCELayer:__init(id, global_conf, layer_conf) self:check_dim_len(2, -1) -- two inputs: nn output and label end +function SoftmaxCELayer:bind_params() + -- do nothing +end + function SoftmaxCELayer:init(batch_size, chunk_size) if not self.compressed and (self.dim_in[1] ~= self.dim_in[2]) then nerv.error("mismatching dimensions of previous network output and labels") @@ -94,5 +90,5 @@ function SoftmaxCELayer:back_propagate(bp_err, next_bp_err, input, output, t) end function SoftmaxCELayer:get_params() - return nerv.ParamRepo({}) + return nerv.ParamRepo({}, self.loc_type) end diff --git a/nerv/layer/tanh.lua b/nerv/layer/tanh.lua index e1c32f2..7a19fc8 100644 --- a/nerv/layer/tanh.lua +++ b/nerv/layer/tanh.lua @@ -1,13 +1,14 @@ local TanhLayer = nerv.class("nerv.TanhLayer", "nerv.Layer") function TanhLayer:__init(id, global_conf, layer_conf) - self.id = id - self.gconf = global_conf - self.dim_in = layer_conf.dim_in - self.dim_out = layer_conf.dim_out + nerv.Layer.__init(self, id, global_conf, layer_conf) self:check_dim_len(1, 1) end +function TanhLayer:bind_params() + -- do nothing +end + function TanhLayer:init() if self.dim_in[1] ~= self.dim_out[1] then nerv.error("mismatching dimensions of input and output") @@ -31,5 +32,5 @@ function TanhLayer:back_propagate(bp_err, next_bp_err, input, output) end function TanhLayer:get_params() - return nerv.ParamRepo({}) + return nerv.ParamRepo({}, self.loc_type) end diff --git a/nerv/layer/window.lua b/nerv/layer/window.lua index 4933de0..364929f 100644 --- a/nerv/layer/window.lua +++ b/nerv/layer/window.lua @@ -1,12 +1,15 @@ local WindowLayer = nerv.class("nerv.WindowLayer", "nerv.Layer") function WindowLayer:__init(id, global_conf, layer_conf) - self.id = id - self.gconf = global_conf - self.window = layer_conf.window - self.dim_in = layer_conf.dim_in - self.dim_out = layer_conf.dim_out + nerv.Layer.__init(self, id, global_conf, layer_conf) self:check_dim_len(1, 1) + self:bind_params() +end + +function WindowLayer:bind_params() + self.window = self:find_param("window", self.lconf, self.gconf, + nerv.BiasParam, + {1, self.dim_out[1]}) end function WindowLayer:init() @@ -28,5 +31,5 @@ function WindowLayer:propagate(input, 
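-- Editor's sketch (illustration only, not a layer added by this commit): the
-- refactor above moves shared setup into nerv.Layer.__init and splits parameter
-- lookup into bind_params. A minimal parameter-free layer following the new
-- convention (propagate/back_propagate omitted) would look like:
--
--     local MyLayer = nerv.class("nerv.MyLayer", "nerv.Layer")
--     function MyLayer:__init(id, global_conf, layer_conf)
--         nerv.Layer.__init(self, id, global_conf, layer_conf)
--         self:check_dim_len(1, 1)
--     end
--     function MyLayer:bind_params()
--         -- nothing to bind
--     end
--     function MyLayer:get_params()
--         return nerv.ParamRepo({}, self.loc_type)
--     end
--
-- Layers that own parameters call self:bind_params() from __init and resolve them
-- with self:find_param(..., self.lconf, self.gconf, ...), as LSTMGateLayer and
-- WindowLayer do.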
output) end function WindowLayer:get_params() - return nerv.ParamRepo({self.window}) + return nerv.ParamRepo({self.window}, self.loc_type) end diff --git a/nerv/lib/cblas.h b/nerv/lib/cblas.h new file mode 100644 index 0000000..4087ffb --- /dev/null +++ b/nerv/lib/cblas.h @@ -0,0 +1,596 @@ +#ifndef CBLAS_H + +#ifndef CBLAS_ENUM_DEFINED_H + #define CBLAS_ENUM_DEFINED_H + enum CBLAS_ORDER {CblasRowMajor=101, CblasColMajor=102 }; + enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=113, + AtlasConj=114}; + enum CBLAS_UPLO {CblasUpper=121, CblasLower=122}; + enum CBLAS_DIAG {CblasNonUnit=131, CblasUnit=132}; + enum CBLAS_SIDE {CblasLeft=141, CblasRight=142}; +#endif + +#ifndef CBLAS_ENUM_ONLY +#define CBLAS_H +#define CBLAS_INDEX int + +int cblas_errprn(int ierr, int info, char *form, ...); + +/* + * =========================================================================== + * Prototypes for level 1 BLAS functions (complex are recast as routines) + * =========================================================================== + */ +float cblas_sdsdot(const int N, const float alpha, const float *X, + const int incX, const float *Y, const int incY); +double cblas_dsdot(const int N, const float *X, const int incX, const float *Y, + const int incY); +float cblas_sdot(const int N, const float *X, const int incX, + const float *Y, const int incY); +double cblas_ddot(const int N, const double *X, const int incX, + const double *Y, const int incY); +/* + * Functions having prefixes Z and C only + */ +void cblas_cdotu_sub(const int N, const void *X, const int incX, + const void *Y, const int incY, void *dotu); +void cblas_cdotc_sub(const int N, const void *X, const int incX, + const void *Y, const int incY, void *dotc); + +void cblas_zdotu_sub(const int N, const void *X, const int incX, + const void *Y, const int incY, void *dotu); +void cblas_zdotc_sub(const int N, const void *X, const int incX, + const void *Y, const int incY, void *dotc); + + +/* + * Functions having prefixes S D SC DZ + */ +float cblas_snrm2(const int N, const float *X, const int incX); +float cblas_sasum(const int N, const float *X, const int incX); + +double cblas_dnrm2(const int N, const double *X, const int incX); +double cblas_dasum(const int N, const double *X, const int incX); + +float cblas_scnrm2(const int N, const void *X, const int incX); +float cblas_scasum(const int N, const void *X, const int incX); + +double cblas_dznrm2(const int N, const void *X, const int incX); +double cblas_dzasum(const int N, const void *X, const int incX); + + +/* + * Functions having standard 4 prefixes (S D C Z) + */ +CBLAS_INDEX cblas_isamax(const int N, const float *X, const int incX); +CBLAS_INDEX cblas_idamax(const int N, const double *X, const int incX); +CBLAS_INDEX cblas_icamax(const int N, const void *X, const int incX); +CBLAS_INDEX cblas_izamax(const int N, const void *X, const int incX); + +/* + * =========================================================================== + * Prototypes for level 1 BLAS routines + * =========================================================================== + */ + +/* + * Routines with standard 4 prefixes (s, d, c, z) + */ +void cblas_sswap(const int N, float *X, const int incX, + float *Y, const int incY); +void cblas_scopy(const int N, const float *X, const int incX, + float *Y, const int incY); +void cblas_saxpy(const int N, const float alpha, const float *X, + const int incX, float *Y, const int incY); +void catlas_saxpby(const int N, const float alpha, const float *X, + 
const int incX, const float beta, float *Y, const int incY); +void catlas_sset + (const int N, const float alpha, float *X, const int incX); + +void cblas_dswap(const int N, double *X, const int incX, + double *Y, const int incY); +void cblas_dcopy(const int N, const double *X, const int incX, + double *Y, const int incY); +void cblas_daxpy(const int N, const double alpha, const double *X, + const int incX, double *Y, const int incY); +void catlas_daxpby(const int N, const double alpha, const double *X, + const int incX, const double beta, double *Y, const int incY); +void catlas_dset + (const int N, const double alpha, double *X, const int incX); + +void cblas_cswap(const int N, void *X, const int incX, + void *Y, const int incY); +void cblas_ccopy(const int N, const void *X, const int incX, + void *Y, const int incY); +void cblas_caxpy(const int N, const void *alpha, const void *X, + const int incX, void *Y, const int incY); +void catlas_caxpby(const int N, const void *alpha, const void *X, + const int incX, const void *beta, void *Y, const int incY); +void catlas_cset + (const int N, const void *alpha, void *X, const int incX); + +void cblas_zswap(const int N, void *X, const int incX, + void *Y, const int incY); +void cblas_zcopy(const int N, const void *X, const int incX, + void *Y, const int incY); +void cblas_zaxpy(const int N, const void *alpha, const void *X, + const int incX, void *Y, const int incY); +void catlas_zaxpby(const int N, const void *alpha, const void *X, + const int incX, const void *beta, void *Y, const int incY); +void catlas_zset + (const int N, const void *alpha, void *X, const int incX); + + +/* + * Routines with S and D prefix only + */ +void cblas_srotg(float *a, float *b, float *c, float *s); +void cblas_srotmg(float *d1, float *d2, float *b1, const float b2, float *P); +void cblas_srot(const int N, float *X, const int incX, + float *Y, const int incY, const float c, const float s); +void cblas_srotm(const int N, float *X, const int incX, + float *Y, const int incY, const float *P); + +void cblas_drotg(double *a, double *b, double *c, double *s); +void cblas_drotmg(double *d1, double *d2, double *b1, const double b2, double *P); +void cblas_drot(const int N, double *X, const int incX, + double *Y, const int incY, const double c, const double s); +void cblas_drotm(const int N, double *X, const int incX, + double *Y, const int incY, const double *P); + + +/* + * Routines with S D C Z CS and ZD prefixes + */ +void cblas_sscal(const int N, const float alpha, float *X, const int incX); +void cblas_dscal(const int N, const double alpha, double *X, const int incX); +void cblas_cscal(const int N, const void *alpha, void *X, const int incX); +void cblas_zscal(const int N, const void *alpha, void *X, const int incX); +void cblas_csscal(const int N, const float alpha, void *X, const int incX); +void cblas_zdscal(const int N, const double alpha, void *X, const int incX); + +/* + * Extra reference routines provided by ATLAS, but not mandated by the standard + */ +void cblas_crotg(void *a, void *b, void *c, void *s); +void cblas_zrotg(void *a, void *b, void *c, void *s); +void cblas_csrot(const int N, void *X, const int incX, void *Y, const int incY, + const float c, const float s); +void cblas_zdrot(const int N, void *X, const int incX, void *Y, const int incY, + const double c, const double s); + +/* + * =========================================================================== + * Prototypes for level 2 BLAS + * 
=========================================================================== + */ + +/* + * Routines with standard 4 prefixes (S, D, C, Z) + */ +void cblas_sgemv(const enum CBLAS_ORDER Order, + const enum CBLAS_TRANSPOSE TransA, const int M, const int N, + const float alpha, const float *A, const int lda, + const float *X, const int incX, const float beta, + float *Y, const int incY); +void cblas_sgbmv(const enum CBLAS_ORDER Order, + const enum CBLAS_TRANSPOSE TransA, const int M, const int N, + const int KL, const int KU, const float alpha, + const float *A, const int lda, const float *X, + const int incX, const float beta, float *Y, const int incY); +void cblas_strmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const int N, const float *A, const int lda, + float *X, const int incX); +void cblas_stbmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const int N, const int K, const float *A, const int lda, + float *X, const int incX); +void cblas_stpmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const int N, const float *Ap, float *X, const int incX); +void cblas_strsv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const int N, const float *A, const int lda, float *X, + const int incX); +void cblas_stbsv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const int N, const int K, const float *A, const int lda, + float *X, const int incX); +void cblas_stpsv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const int N, const float *Ap, float *X, const int incX); + +void cblas_dgemv(const enum CBLAS_ORDER Order, + const enum CBLAS_TRANSPOSE TransA, const int M, const int N, + const double alpha, const double *A, const int lda, + const double *X, const int incX, const double beta, + double *Y, const int incY); +void cblas_dgbmv(const enum CBLAS_ORDER Order, + const enum CBLAS_TRANSPOSE TransA, const int M, const int N, + const int KL, const int KU, const double alpha, + const double *A, const int lda, const double *X, + const int incX, const double beta, double *Y, const int incY); +void cblas_dtrmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const int N, const double *A, const int lda, + double *X, const int incX); +void cblas_dtbmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const int N, const int K, const double *A, const int lda, + double *X, const int incX); +void cblas_dtpmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const int N, const double *Ap, double *X, const int incX); +void cblas_dtrsv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const int N, const double *A, const int lda, double *X, + const int incX); +void cblas_dtbsv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const int N, const int K, const double *A, const int lda, + 
double *X, const int incX); +void cblas_dtpsv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const int N, const double *Ap, double *X, const int incX); + +void cblas_cgemv(const enum CBLAS_ORDER Order, + const enum CBLAS_TRANSPOSE TransA, const int M, const int N, + const void *alpha, const void *A, const int lda, + const void *X, const int incX, const void *beta, + void *Y, const int incY); +void cblas_cgbmv(const enum CBLAS_ORDER Order, + const enum CBLAS_TRANSPOSE TransA, const int M, const int N, + const int KL, const int KU, const void *alpha, + const void *A, const int lda, const void *X, + const int incX, const void *beta, void *Y, const int incY); +void cblas_ctrmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const int N, const void *A, const int lda, + void *X, const int incX); +void cblas_ctbmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const int N, const int K, const void *A, const int lda, + void *X, const int incX); +void cblas_ctpmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const int N, const void *Ap, void *X, const int incX); +void cblas_ctrsv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const int N, const void *A, const int lda, void *X, + const int incX); +void cblas_ctbsv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const int N, const int K, const void *A, const int lda, + void *X, const int incX); +void cblas_ctpsv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const int N, const void *Ap, void *X, const int incX); + +void cblas_zgemv(const enum CBLAS_ORDER Order, + const enum CBLAS_TRANSPOSE TransA, const int M, const int N, + const void *alpha, const void *A, const int lda, + const void *X, const int incX, const void *beta, + void *Y, const int incY); +void cblas_zgbmv(const enum CBLAS_ORDER Order, + const enum CBLAS_TRANSPOSE TransA, const int M, const int N, + const int KL, const int KU, const void *alpha, + const void *A, const int lda, const void *X, + const int incX, const void *beta, void *Y, const int incY); +void cblas_ztrmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const int N, const void *A, const int lda, + void *X, const int incX); +void cblas_ztbmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const int N, const int K, const void *A, const int lda, + void *X, const int incX); +void cblas_ztpmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const int N, const void *Ap, void *X, const int incX); +void cblas_ztrsv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const int N, const void *A, const int lda, void *X, + const int incX); +void cblas_ztbsv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const int N, const int K, 
const void *A, const int lda, + void *X, const int incX); +void cblas_ztpsv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const int N, const void *Ap, void *X, const int incX); + + +/* + * Routines with S and D prefixes only + */ +void cblas_ssymv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const int N, const float alpha, const float *A, + const int lda, const float *X, const int incX, + const float beta, float *Y, const int incY); +void cblas_ssbmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const int N, const int K, const float alpha, const float *A, + const int lda, const float *X, const int incX, + const float beta, float *Y, const int incY); +void cblas_sspmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const int N, const float alpha, const float *Ap, + const float *X, const int incX, + const float beta, float *Y, const int incY); +void cblas_sger(const enum CBLAS_ORDER Order, const int M, const int N, + const float alpha, const float *X, const int incX, + const float *Y, const int incY, float *A, const int lda); +void cblas_ssyr(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const int N, const float alpha, const float *X, + const int incX, float *A, const int lda); +void cblas_sspr(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const int N, const float alpha, const float *X, + const int incX, float *Ap); +void cblas_ssyr2(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const int N, const float alpha, const float *X, + const int incX, const float *Y, const int incY, float *A, + const int lda); +void cblas_sspr2(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const int N, const float alpha, const float *X, + const int incX, const float *Y, const int incY, float *A); + +void cblas_dsymv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const int N, const double alpha, const double *A, + const int lda, const double *X, const int incX, + const double beta, double *Y, const int incY); +void cblas_dsbmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const int N, const int K, const double alpha, const double *A, + const int lda, const double *X, const int incX, + const double beta, double *Y, const int incY); +void cblas_dspmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const int N, const double alpha, const double *Ap, + const double *X, const int incX, + const double beta, double *Y, const int incY); +void cblas_dger(const enum CBLAS_ORDER Order, const int M, const int N, + const double alpha, const double *X, const int incX, + const double *Y, const int incY, double *A, const int lda); +void cblas_dsyr(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const int N, const double alpha, const double *X, + const int incX, double *A, const int lda); +void cblas_dspr(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const int N, const double alpha, const double *X, + const int incX, double *Ap); +void cblas_dsyr2(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const int N, const double alpha, const double *X, + const int incX, const double *Y, const int incY, double *A, + const int lda); +void cblas_dspr2(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const int N, const double alpha, const double *X, + const int incX, const double *Y, const int incY, double *A); + + +/* + * Routines with C and Z prefixes only + */ +void cblas_chemv(const 
enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const int N, const void *alpha, const void *A, + const int lda, const void *X, const int incX, + const void *beta, void *Y, const int incY); +void cblas_chbmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const int N, const int K, const void *alpha, const void *A, + const int lda, const void *X, const int incX, + const void *beta, void *Y, const int incY); +void cblas_chpmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const int N, const void *alpha, const void *Ap, + const void *X, const int incX, + const void *beta, void *Y, const int incY); +void cblas_cgeru(const enum CBLAS_ORDER Order, const int M, const int N, + const void *alpha, const void *X, const int incX, + const void *Y, const int incY, void *A, const int lda); +void cblas_cgerc(const enum CBLAS_ORDER Order, const int M, const int N, + const void *alpha, const void *X, const int incX, + const void *Y, const int incY, void *A, const int lda); +void cblas_cher(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const int N, const float alpha, const void *X, const int incX, + void *A, const int lda); +void cblas_chpr(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const int N, const float alpha, const void *X, + const int incX, void *A); +void cblas_cher2(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const int N, + const void *alpha, const void *X, const int incX, + const void *Y, const int incY, void *A, const int lda); +void cblas_chpr2(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const int N, + const void *alpha, const void *X, const int incX, + const void *Y, const int incY, void *Ap); + +void cblas_zhemv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const int N, const void *alpha, const void *A, + const int lda, const void *X, const int incX, + const void *beta, void *Y, const int incY); +void cblas_zhbmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const int N, const int K, const void *alpha, const void *A, + const int lda, const void *X, const int incX, + const void *beta, void *Y, const int incY); +void cblas_zhpmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const int N, const void *alpha, const void *Ap, + const void *X, const int incX, + const void *beta, void *Y, const int incY); +void cblas_zgeru(const enum CBLAS_ORDER Order, const int M, const int N, + const void *alpha, const void *X, const int incX, + const void *Y, const int incY, void *A, const int lda); +void cblas_zgerc(const enum CBLAS_ORDER Order, const int M, const int N, + const void *alpha, const void *X, const int incX, + const void *Y, const int incY, void *A, const int lda); +void cblas_zher(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const int N, const double alpha, const void *X, const int incX, + void *A, const int lda); +void cblas_zhpr(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const int N, const double alpha, const void *X, + const int incX, void *A); +void cblas_zher2(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const int N, + const void *alpha, const void *X, const int incX, + const void *Y, const int incY, void *A, const int lda); +void cblas_zhpr2(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const int N, + const void *alpha, const void *X, const int incX, + const void *Y, const int incY, void *Ap); + +/* + * =========================================================================== + * Prototypes for level 3 BLAS + 
* =========================================================================== + */ + +/* + * Routines with standard 4 prefixes (S, D, C, Z) + */ +void cblas_sgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, + const enum CBLAS_TRANSPOSE TransB, const int M, const int N, + const int K, const float alpha, const float *A, + const int lda, const float *B, const int ldb, + const float beta, float *C, const int ldc); +void cblas_ssymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, + const enum CBLAS_UPLO Uplo, const int M, const int N, + const float alpha, const float *A, const int lda, + const float *B, const int ldb, const float beta, + float *C, const int ldc); +void cblas_ssyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const enum CBLAS_TRANSPOSE Trans, const int N, const int K, + const float alpha, const float *A, const int lda, + const float beta, float *C, const int ldc); +void cblas_ssyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const enum CBLAS_TRANSPOSE Trans, const int N, const int K, + const float alpha, const float *A, const int lda, + const float *B, const int ldb, const float beta, + float *C, const int ldc); +void cblas_strmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, + const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, + const enum CBLAS_DIAG Diag, const int M, const int N, + const float alpha, const float *A, const int lda, + float *B, const int ldb); +void cblas_strsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, + const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, + const enum CBLAS_DIAG Diag, const int M, const int N, + const float alpha, const float *A, const int lda, + float *B, const int ldb); + +void cblas_dgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, + const enum CBLAS_TRANSPOSE TransB, const int M, const int N, + const int K, const double alpha, const double *A, + const int lda, const double *B, const int ldb, + const double beta, double *C, const int ldc); +void cblas_dsymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, + const enum CBLAS_UPLO Uplo, const int M, const int N, + const double alpha, const double *A, const int lda, + const double *B, const int ldb, const double beta, + double *C, const int ldc); +void cblas_dsyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const enum CBLAS_TRANSPOSE Trans, const int N, const int K, + const double alpha, const double *A, const int lda, + const double beta, double *C, const int ldc); +void cblas_dsyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const enum CBLAS_TRANSPOSE Trans, const int N, const int K, + const double alpha, const double *A, const int lda, + const double *B, const int ldb, const double beta, + double *C, const int ldc); +void cblas_dtrmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, + const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, + const enum CBLAS_DIAG Diag, const int M, const int N, + const double alpha, const double *A, const int lda, + double *B, const int ldb); +void cblas_dtrsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, + const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, + const enum CBLAS_DIAG Diag, const int M, const int N, + const double alpha, const double *A, const int lda, + double *B, const int ldb); + +void cblas_cgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, + const enum CBLAS_TRANSPOSE TransB, const int M, const int N, + const 
int K, const void *alpha, const void *A, + const int lda, const void *B, const int ldb, + const void *beta, void *C, const int ldc); +void cblas_csymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, + const enum CBLAS_UPLO Uplo, const int M, const int N, + const void *alpha, const void *A, const int lda, + const void *B, const int ldb, const void *beta, + void *C, const int ldc); +void cblas_csyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const enum CBLAS_TRANSPOSE Trans, const int N, const int K, + const void *alpha, const void *A, const int lda, + const void *beta, void *C, const int ldc); +void cblas_csyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const enum CBLAS_TRANSPOSE Trans, const int N, const int K, + const void *alpha, const void *A, const int lda, + const void *B, const int ldb, const void *beta, + void *C, const int ldc); +void cblas_ctrmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, + const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, + const enum CBLAS_DIAG Diag, const int M, const int N, + const void *alpha, const void *A, const int lda, + void *B, const int ldb); +void cblas_ctrsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, + const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, + const enum CBLAS_DIAG Diag, const int M, const int N, + const void *alpha, const void *A, const int lda, + void *B, const int ldb); + +void cblas_zgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, + const enum CBLAS_TRANSPOSE TransB, const int M, const int N, + const int K, const void *alpha, const void *A, + const int lda, const void *B, const int ldb, + const void *beta, void *C, const int ldc); +void cblas_zsymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, + const enum CBLAS_UPLO Uplo, const int M, const int N, + const void *alpha, const void *A, const int lda, + const void *B, const int ldb, const void *beta, + void *C, const int ldc); +void cblas_zsyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const enum CBLAS_TRANSPOSE Trans, const int N, const int K, + const void *alpha, const void *A, const int lda, + const void *beta, void *C, const int ldc); +void cblas_zsyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const enum CBLAS_TRANSPOSE Trans, const int N, const int K, + const void *alpha, const void *A, const int lda, + const void *B, const int ldb, const void *beta, + void *C, const int ldc); +void cblas_ztrmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, + const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, + const enum CBLAS_DIAG Diag, const int M, const int N, + const void *alpha, const void *A, const int lda, + void *B, const int ldb); +void cblas_ztrsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, + const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, + const enum CBLAS_DIAG Diag, const int M, const int N, + const void *alpha, const void *A, const int lda, + void *B, const int ldb); + + +/* + * Routines with prefixes C and Z only + */ +void cblas_chemm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, + const enum CBLAS_UPLO Uplo, const int M, const int N, + const void *alpha, const void *A, const int lda, + const void *B, const int ldb, const void *beta, + void *C, const int ldc); +void cblas_cherk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const enum CBLAS_TRANSPOSE Trans, const int N, const int K, + const float alpha, const void *A, const int lda, + const 
float beta, void *C, const int ldc); +void cblas_cher2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const enum CBLAS_TRANSPOSE Trans, const int N, const int K, + const void *alpha, const void *A, const int lda, + const void *B, const int ldb, const float beta, + void *C, const int ldc); +void cblas_zhemm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, + const enum CBLAS_UPLO Uplo, const int M, const int N, + const void *alpha, const void *A, const int lda, + const void *B, const int ldb, const void *beta, + void *C, const int ldc); +void cblas_zherk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const enum CBLAS_TRANSPOSE Trans, const int N, const int K, + const double alpha, const void *A, const int lda, + const double beta, void *C, const int ldc); +void cblas_zher2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const enum CBLAS_TRANSPOSE Trans, const int N, const int K, + const void *alpha, const void *A, const int lda, + const void *B, const int ldb, const double beta, + void *C, const int ldc); + +int cblas_errprn(int ierr, int info, char *form, ...); + +#endif /* end #ifdef CBLAS_ENUM_ONLY */ +#endif diff --git a/nerv/lib/common.c b/nerv/lib/common.c index d977f8d..7fd2331 100644 --- a/nerv/lib/common.c +++ b/nerv/lib/common.c @@ -4,7 +4,7 @@ int nerv_error(lua_State *L, const char *err_mesg_fmt, ...) { va_list ap; va_start(ap, err_mesg_fmt); lua_pushstring(L, "[nerv] internal error: "); - lua_pushvfstring(L, err_mesg_fmt, ap); + lua_pushvfstring(L, err_mesg_fmt, ap); lua_concat(L, 2); lua_error(L); va_end(ap); @@ -56,7 +56,7 @@ void luaN_append_methods(lua_State *L, const luaL_Reg *mlist) { } } -HashMap *hashmap_create(size_t size, HashKey_t hfunc, HashMapCmp_t cmp) { +HashMap *nerv_hashmap_create(size_t size, HashKey_t hfunc, HashMapCmp_t cmp) { HashMap *res = (HashMap *)malloc(sizeof(HashMap)); res->bucket = calloc(size, sizeof(HashNode)); res->cmp = cmp; @@ -65,7 +65,7 @@ HashMap *hashmap_create(size_t size, HashKey_t hfunc, HashMapCmp_t cmp) { return res; } -void *hashmap_getval(HashMap *h, const char *key) { +void *nerv_hashmap_getval(HashMap *h, const char *key) { size_t idx = h->hfunc(key) % h->size; HashNode *ptr; for (ptr = h->bucket[idx]; ptr; ptr = ptr->next) @@ -76,7 +76,7 @@ void *hashmap_getval(HashMap *h, const char *key) { return NULL; } -void hashmap_setval(HashMap *h, const char *key, void *val) { +void nerv_hashmap_setval(HashMap *h, const char *key, void *val) { size_t idx = h->hfunc(key) % h->size; HashNode *ptr = malloc(sizeof(HashNode)); ptr->next = h->bucket[idx]; @@ -85,7 +85,7 @@ void hashmap_setval(HashMap *h, const char *key, void *val) { ptr->val = val; } -void hashmap_clear(HashMap *h) { +void nerv_hashmap_clear(HashMap *h) { size_t i; for (i = 0; i < h->size; i++) { @@ -100,6 +100,11 @@ void hashmap_clear(HashMap *h) { } } +void nerv_hashmap_destroy(HashMap *h) { + nerv_hashmap_clear(h); + free(h); +} + size_t bkdr_hash(const char *key) { unsigned int seed = 131; unsigned int res = 0; diff --git a/nerv/lib/common.h b/nerv/lib/common.h index 1c588d1..3d98574 100644 --- a/nerv/lib/common.h +++ b/nerv/lib/common.h @@ -7,6 +7,8 @@ #include <stdio.h> #include <stdlib.h> +#define PROFILE_HASHMAP_SIZE 123457 + typedef enum ErrCode { NERV_NORMAL, /* matrix err */ @@ -59,6 +61,21 @@ typedef struct Status { nerv_error_status(L, &status); \ } while (0) +#define CHECK_SAME_DIMENSION(a, b, status) \ + do { \ + if (!(a->nrow == b->nrow && a->ncol == b->ncol)) \ + NERV_EXIT_STATUS(status, MAT_MISMATCH_DIM, 0); \ + 
} while (0) + +#define CHECK_SAME_DIMENSION_RET(a, b, status) \ + do { \ + if (!(a->nrow == b->nrow && a->ncol == b->ncol)) \ + { \ + NERV_SET_STATUS(status, MAT_MISMATCH_DIM, 0); \ + return 0; \ + } \ + } while (0) + typedef struct HashNode { const char *key; void *val; @@ -75,10 +92,11 @@ typedef struct HashMap { size_t size; } HashMap; -HashMap *hashmap_create(size_t size, HashKey_t hfunc, HashMapCmp_t cmp); -void *hashmap_getval(HashMap *h, const char *key); -void hashmap_setval(HashMap *h, const char *key, void *val); -void hashmap_clear(HashMap *h); +HashMap *nerv_hashmap_create(size_t size, HashKey_t hfunc, HashMapCmp_t cmp); +void *nerv_hashmap_getval(HashMap *h, const char *key); +void nerv_hashmap_setval(HashMap *h, const char *key, void *val); +void nerv_hashmap_clear(HashMap *h); +void nerv_hashmap_destroy(HashMap *h); size_t bkdr_hash(const char *key); diff --git a/nerv/lib/matrix/cuda_helper.h b/nerv/lib/matrix/cuda_helper.h index 13d5728..2d18486 100644 --- a/nerv/lib/matrix/cuda_helper.h +++ b/nerv/lib/matrix/cuda_helper.h @@ -54,19 +54,26 @@ cudaDeviceSynchronize(); \ } while (0) -#define CHECK_SAME_DIMENSION(a, b, status) \ +#define CURAND_SAFE_SYNC_CALL(call, status) \ do { \ - if (!(a->nrow == b->nrow && a->ncol == b->ncol)) \ - NERV_EXIT_STATUS(status, MAT_MISMATCH_DIM, 0); \ + curandStatus_t err = (call); \ + if (err != CURAND_STATUS_SUCCESS) \ + { \ + NERV_SET_STATUS(status, MAT_CUBLAS_ERR, curandGetErrorString(err)); \ + return; \ + } \ + cudaDeviceSynchronize(); \ } while (0) -#define CHECK_SAME_DIMENSION_RET(a, b, status) \ +#define CURAND_SAFE_SYNC_CALL_RET(call, status) \ do { \ - if (!(a->nrow == b->nrow && a->ncol == b->ncol)) \ + curandStatus_t err = (call); \ + if (err != CURAND_STATUS_SUCCESS) \ { \ - NERV_SET_STATUS(status, MAT_MISMATCH_DIM, 0); \ + NERV_SET_STATUS(status, MAT_CUBLAS_ERR, curandGetErrorString(err)); \ return 0; \ } \ + cudaDeviceSynchronize(); \ } while (0) static const char *cublasGetErrorString(cublasStatus_t err) { @@ -96,15 +103,46 @@ static const char *cublasGetErrorString(cublasStatus_t err) { return "<unknown>"; } +static const char *curandGetErrorString(curandStatus_t err) { + switch (err) + { + case CURAND_STATUS_VERSION_MISMATCH: + return "Header file and linked library version do not match"; + case CURAND_STATUS_NOT_INITIALIZED: + return "Generator not initialized"; + case CURAND_STATUS_ALLOCATION_FAILED: + return "Memory allocation failed"; + case CURAND_STATUS_TYPE_ERROR: + return "Generator is wrong type"; + case CURAND_STATUS_OUT_OF_RANGE: + return "Argument out of range"; + case CURAND_STATUS_LENGTH_NOT_MULTIPLE: + return "Length requested is not a multple of dimension"; + case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED: + return "GPU does not have double precision required by MRG32k3a"; + case CURAND_STATUS_LAUNCH_FAILURE: + return "Kernel launch failure"; + case CURAND_STATUS_PREEXISTING_FAILURE: + return "Preexisting failure on library entry"; + case CURAND_STATUS_INITIALIZATION_FAILED: + return "Initialization of CUDA failed"; + case CURAND_STATUS_ARCH_MISMATCH: + return "Architecture mismatch, GPU does not support requested feature"; + case CURAND_STATUS_INTERNAL_ERROR: + return "Internal library error"; + } + return "<unknown>"; +} #define PROFILE_START \ do { \ - cudaEventRecord(profile_start, 0); + cudaEventRecord(context->profile_start, 0); #define PROFILE_STOP \ - cudaEventRecord(profile_stop, 0); \ - cudaEventSynchronize(profile_stop); \ + cudaEventRecord(context->profile_stop, 0); \ + 
cudaEventSynchronize(context->profile_stop); \ float milliseconds = 0; \ - cudaEventElapsedTime(&milliseconds, profile_start, profile_stop); \ - accu_profile(__func__, milliseconds / 1000); \ + cudaEventElapsedTime(&milliseconds, context->profile_start, \ + context->profile_stop); \ + nerv_cuda_context_accu_profile(context, __func__, milliseconds / 1000); \ } while (0); #define PROFILE_END diff --git a/nerv/lib/matrix/cukernel.h b/nerv/lib/matrix/cukernel.h index c84200e..d59a070 100644 --- a/nerv/lib/matrix/cukernel.h +++ b/nerv/lib/matrix/cukernel.h @@ -3,7 +3,7 @@ void cudak_(cuda_mul_elem)(const Matrix *a, const Matrix *b, Matrix *c); void cudak_(cuda_log_elem)(const Matrix *a, Matrix *b); void cudak_(cuda_sigmoid)(const Matrix *a, Matrix *b); void cudak_(cuda_sigmoid_grad)(const Matrix *output, const Matrix *err, Matrix *nerr); -void cudak_(cuda_rand_uniform)(const Matrix *a); /* a's curand_gen may be modified */ +void cudak_(cuda_rand_uniform)(const Matrix *a, CuContext *context); /* a's curand_gen may be modified */ void cudak_(cuda_thres_mask)(const Matrix *a, const Matrix *b, double thres, double low, double high); void cudak_(cuda_tanh)(const Matrix *a, Matrix *b); void cudak_(cuda_tanh_grad)(const Matrix *output, const Matrix *err, Matrix *nerr); diff --git a/nerv/lib/matrix/cumatrix.c b/nerv/lib/matrix/cumatrix.c index d998871..aec4d60 100644 --- a/nerv/lib/matrix/cumatrix.c +++ b/nerv/lib/matrix/cumatrix.c @@ -1,23 +1,13 @@ #define NERV_GENERIC_CUMATRIX -#include "cumatrix.h" -#include "cuda_helper.h" +#define MATRIX_CONTEXT CuContext #include <string.h> #include <time.h> -#define PROFILE_HASHMAP_SIZE 123457 -static cublasHandle_t cublas_handle; -static cudaEvent_t profile_start, profile_stop; -curandGenerator_t curand_gen; -static HashMap *profile; - -void nerv_cumatrix_select_gpu(int dev, Status *status) { - fprintf(stderr, "** selecting GPU %d\n", dev); - NERV_SET_STATUS(status, NERV_NORMAL, 0); - CUDA_SAFE_SYNC_CALL(cudaSetDevice(dev), status); - CUDA_SAFE_SYNC_CALL(cublasDestroy(cublas_handle), status); - CUDA_SAFE_SYNC_CALL(cublasCreate(&cublas_handle), status); -} +#include "../common.h" +#include "cumatrix.h" +#include "cuda_helper.h" -void nerv_cumatrix_print_profile() { +void nerv_cuda_context_print_profile(CuContext *context) { + HashMap *profile = context->profile; size_t i; fprintf(stderr, "*** [nerv cumatrix profile] **\n"); for (i = 0; i < profile->size; i++) @@ -30,28 +20,112 @@ void nerv_cumatrix_print_profile() { } } -void nerv_cumatrix_clear_profile() { - hashmap_clear(profile); +void nerv_cuda_context_clear_profile(CuContext *context) { + nerv_hashmap_clear(context->profile); } -void accu_profile(const char *name, float delta) { - float *val = hashmap_getval(profile, name); +void nerv_cuda_context_accu_profile(CuContext *context, + const char *name, float delta) { + HashMap *profile = context->profile; + float *val = nerv_hashmap_getval(profile, name); if (!val) { val = malloc(sizeof(float)); *val = 0; - hashmap_setval(profile, name, val); + nerv_hashmap_setval(profile, name, val); } *val += delta; } -void nerv_cumatrix_init() { - cublasCreate(&cublas_handle); - curandCreateGenerator(&curand_gen, CURAND_RNG_PSEUDO_DEFAULT); - curandSetPseudoRandomGeneratorSeed(curand_gen, time(NULL)); - cudaEventCreate(&profile_start); - cudaEventCreate(&profile_stop); - profile = hashmap_create(PROFILE_HASHMAP_SIZE, bkdr_hash, strcmp); +static void new_cuda_handles(CuContext *context, int dev, Status *status) { + if (context->has_handle) return; + 
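/*
 * Editor's sketch (hypothetical caller, not part of this commit) of the renamed
 * hash map helpers used by the profiling code above; values are stored as void
 * pointers, nerv_hashmap_getval returns NULL for a missing key, and
 * nerv_hashmap_destroy clears the buckets before freeing the map itself:
 *
 *     HashMap *h = nerv_hashmap_create(PROFILE_HASHMAP_SIZE, bkdr_hash, strcmp);
 *     float *v = malloc(sizeof(float));
 *     *v = 0.5f;
 *     nerv_hashmap_setval(h, "sgemm", v);
 *     float *hit = nerv_hashmap_getval(h, "sgemm");
 *     nerv_hashmap_destroy(h);
 */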
CUDA_SAFE_SYNC_CALL(cudaSetDevice(dev), status); + CUBLAS_SAFE_SYNC_CALL(cublasCreate(&(context->cublas_handle)), status); + CURAND_SAFE_SYNC_CALL(curandCreateGenerator(&(context->curand_gen), + CURAND_RNG_PSEUDO_DEFAULT), status); + CURAND_SAFE_SYNC_CALL( + curandSetPseudoRandomGeneratorSeed(context->curand_gen, time(NULL)), + status); + CUDA_SAFE_SYNC_CALL(cudaEventCreate(&(context->profile_start)), status); + CUDA_SAFE_SYNC_CALL(cudaEventCreate(&(context->profile_stop)), status); + NERV_SET_STATUS(status, NERV_NORMAL, 0); + context->has_handle = 1; +} + +static void free_cuda_handles(CuContext *context, Status *status) { + if (!context->has_handle) return; + context->has_handle = 0; + CUBLAS_SAFE_SYNC_CALL(cublasDestroy(context->cublas_handle), status); + CURAND_SAFE_SYNC_CALL(curandDestroyGenerator(context->curand_gen), status); + CUDA_SAFE_SYNC_CALL(cudaEventDestroy(context->profile_start), status); + CUDA_SAFE_SYNC_CALL(cudaEventDestroy(context->profile_stop), status); + NERV_SET_STATUS(status, NERV_NORMAL, 0); +} + +static int choose_best_gpu(Status *status) { + int i, n, dev = 0; + float best_ratio = 0; + fprintf(stderr, "*** select a GPU based on available space\n"); + CUDA_SAFE_CALL_RET(cudaGetDeviceCount(&n), status); + for (i = 0; i < n; i++) + { + size_t avail, total; + float ratio; + CUDA_SAFE_SYNC_CALL_RET(cudaSetDevice(i), status); + CUDA_SAFE_SYNC_CALL_RET(cuMemGetInfo(&avail, &total), status); + ratio = (float)avail/total * 100; + fprintf(stderr, "* card %d: %.2f%%\n", i, ratio); + if (ratio > best_ratio) + { + best_ratio = ratio; + dev = i; + } + CUDA_SAFE_SYNC_CALL_RET(cudaDeviceReset(), status); + } + fprintf(stderr, "*** final decision: GPU %d\n", dev); + NERV_SET_STATUS(status, NERV_NORMAL, 0); + return dev; +} + +CuContext *nerv_cuda_context_create(int dev, Status *status) { + CuContext *context = (CuContext *)malloc(sizeof(CuContext)); + context->has_handle = 0; /* this line must come first */ + if (dev == -1) + { + dev = choose_best_gpu(status); + if (status->err_code != NERV_NORMAL) + return NULL; + } + new_cuda_handles(context, dev, status); + if (status->err_code != NERV_NORMAL) + return NULL; + context->profile = nerv_hashmap_create(PROFILE_HASHMAP_SIZE, bkdr_hash, strcmp); + NERV_SET_STATUS(status, NERV_NORMAL, 0); + return context; +} + +void nerv_cuda_context_destroy(CuContext *context, Status *status) { + free_cuda_handles(context, status); + if (status->err_code != NERV_NORMAL) + return; + nerv_hashmap_destroy(context->profile); + free(context); + NERV_SET_STATUS(status, NERV_NORMAL, 0); +} + +void nerv_cuda_context_select_gpu(CuContext *context, + int dev, Status *status) { + /* free_cuda_handles(context, status); + if (status->err_code != NERV_NORMAL) + return; + */ + /* because of cudaDeviceReset */ + context->has_handle = 0; + CUDA_SAFE_SYNC_CALL(cudaDeviceReset(), status); + new_cuda_handles(context, dev, status); + if (status->err_code != NERV_NORMAL) + return; + NERV_SET_STATUS(status, NERV_NORMAL, 0); } #define MATRIX_USE_FLOAT @@ -59,7 +133,6 @@ void nerv_cumatrix_init() { #define nerv_matrix_(NAME) nerv_matrix_cuda_float_##NAME #define cudak_(NAME) cudak_float_ ## NAME #define NERV_CUBLAS_(NAME) cublasS##NAME -#define MATRIX_CUMATRIX_HOST_TNAME nerv_matrix_host_float_tname #include "generic/cumatrix.c" #undef NERV_CUBLAS_ @@ -72,12 +145,10 @@ void nerv_cumatrix_init() { #undef MATRIX_ELEM_PTR_BASE #undef MATRIX_ELEM_FMT #undef MATRIX_ELEM_WRITE_FMT -#undef MATRIX_CUMATRIX_HOST_TNAME #define MATRIX_USE_DOUBLE #define cuda_matrix_(NAME) 
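/*
 * Editor's sketch (hypothetical host-side caller, not part of this commit): the
 * per-context lifecycle that replaces the old global cublas/curand handles.
 * Passing dev == -1 lets nerv_cuda_context_create pick the GPU with the most
 * free memory via choose_best_gpu:
 *
 *     Status status;
 *     CuContext *ctx = nerv_cuda_context_create(-1, &status);
 *     if (status.err_code != NERV_NORMAL)
 *         (report the error and bail out)
 *     (run cumatrix operations, passing ctx as their new CuContext argument)
 *     nerv_cuda_context_print_profile(ctx);
 *     nerv_cuda_context_destroy(ctx, &status);
 */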
cuda_matrix_double_##NAME #define nerv_matrix_(NAME) nerv_matrix_cuda_double_##NAME #define cudak_(NAME) cudak_double_ ## NAME #define NERV_CUBLAS_(NAME) cublasD##NAME -#define MATRIX_CUMATRIX_HOST_TNAME nerv_matrix_host_double_tname #include "generic/cumatrix.c" diff --git a/nerv/lib/matrix/cumatrix.h b/nerv/lib/matrix/cumatrix.h index b47e14b..fd2a5ce 100644 --- a/nerv/lib/matrix/cumatrix.h +++ b/nerv/lib/matrix/cumatrix.h @@ -2,8 +2,20 @@ #define NERV_CUMATRIX_H #include "matrix.h" #include "../common.h" -void nerv_cumatrix_print_profile(); -void nerv_cumatrix_clear_profile(); -void nerv_cumatrix_init(); -void nerv_cumatrix_select_gpu(int dev, Status *status); +#include "cuda_helper.h" + +typedef struct CuContext { + int has_handle; + cublasHandle_t cublas_handle; + cudaEvent_t profile_start, profile_stop; + curandGenerator_t curand_gen; + HashMap *profile; +} CuContext; + +void nerv_cuda_context_print_profile(CuContext *context); +void nerv_cuda_context_clear_profile(CuContext *context); +void nerv_cuda_context_accu_profile(CuContext *context, const char *name, float delta); +void nerv_cuda_context_select_gpu(CuContext *context, int dev, Status *status); +CuContext *nerv_cuda_context_create(int dev, Status *status); +void nerv_cuda_context_destroy(CuContext *contex, Status *status); #endif diff --git a/nerv/lib/matrix/generic/cukernel.cu b/nerv/lib/matrix/generic/cukernel.cu index 311a6ce..93121dc 100644 --- a/nerv/lib/matrix/generic/cukernel.cu +++ b/nerv/lib/matrix/generic/cukernel.cu @@ -453,13 +453,12 @@ extern "C" { cudaStreamSynchronize(0); } - extern curandGenerator_t curand_gen; - void cudak_(cuda_rand_uniform)(const Matrix *a) { + void cudak_(cuda_rand_uniform)(const Matrix *a, CuContext *context) { #ifdef MATRIX_USE_FLOAT - curandGenerateUniform(curand_gen, MATRIX_ELEM_PTR(a), a->nrow * a->stride / sizeof(MATRIX_ELEM)); + curandGenerateUniform(context->curand_gen, MATRIX_ELEM_PTR(a), a->nrow * a->stride / sizeof(MATRIX_ELEM)); #endif #ifdef MATRIX_USE_DOUBLE - curandGenerateUniformDouble(curand_gen, MATRIX_ELEM_PTR(a), a->nrow * a->stride / sizeof(MATRIX_ELEM)); + curandGenerateUniformDouble(context->curand_gen, MATRIX_ELEM_PTR(a), a->nrow * a->stride / sizeof(MATRIX_ELEM)); #endif } diff --git a/nerv/lib/matrix/generic/cumatrix.c b/nerv/lib/matrix/generic/cumatrix.c index 1c74866..6d84663 100644 --- a/nerv/lib/matrix/generic/cumatrix.c +++ b/nerv/lib/matrix/generic/cumatrix.c @@ -1,10 +1,11 @@ #ifdef NERV_GENERIC_CUMATRIX #include "matrix.h" #include "elem_type.h" -#define MATRIX_DATA_FREE(ptr, status) cuda_matrix_(free)(ptr, status) -#define MATRIX_DATA_ALLOC(dptr, stride, width, height, status) \ - cuda_matrix_(alloc)(dptr, stride, width, height, status) - +#define MATRIX_DATA_FREE(ptr, context, status) \ + cuda_matrix_(free)(ptr, context, status) +#define MATRIX_DATA_ALLOC(dptr, stride, width, height, context, status) \ + cuda_matrix_(alloc)(dptr, stride, width, height, \ + context, status) #define NERV_GENERIC_MATRIX #define NERV_GENERIC_CUKERNEL #include "../../common.h" @@ -14,12 +15,13 @@ void nerv_matrix_(add)(Matrix *c, const Matrix *a, const Matrix *b, MATRIX_ELEM alpha, MATRIX_ELEM beta, + CuContext *context, Status *status) { CHECK_SAME_DIMENSION(a, b, status); CHECK_SAME_DIMENSION(a, c, status); PROFILE_START CUBLAS_SAFE_SYNC_CALL( - NERV_CUBLAS_(geam)(cublas_handle, CUBLAS_OP_N, CUBLAS_OP_N, + NERV_CUBLAS_(geam)(context->cublas_handle, CUBLAS_OP_N, CUBLAS_OP_N, a->ncol, a->nrow, &alpha, MATRIX_ELEM_PTR(a), a->stride / sizeof(MATRIX_ELEM), @@ -33,7 +35,8 @@ void 
nerv_matrix_(add)(Matrix *c, const Matrix *a, const Matrix *b, void nerv_matrix_(mul)(Matrix *c, const Matrix *a, const Matrix *b, MATRIX_ELEM alpha, MATRIX_ELEM beta, - int ta, int tb, Status *status) { + int ta, int tb, + CuContext *context, Status *status) { #define SWAP(a, b) \ do { int t = (a); (a) = (b); (b) = t; } while (0) @@ -46,7 +49,7 @@ void nerv_matrix_(mul)(Matrix *c, const Matrix *a, const Matrix *b, /* Because matrix in Nerv is row-major, here b comes first */ PROFILE_START CUBLAS_SAFE_SYNC_CALL( - NERV_CUBLAS_(gemm)(cublas_handle, tb, ta, + NERV_CUBLAS_(gemm)(context->cublas_handle, tb, ta, bn, am, bm, &alpha, MATRIX_ELEM_PTR(b), b->stride / sizeof(MATRIX_ELEM), @@ -58,7 +61,8 @@ void nerv_matrix_(mul)(Matrix *c, const Matrix *a, const Matrix *b, NERV_SET_STATUS(status, NERV_NORMAL, 0); } -void nerv_matrix_(sigmoid)(Matrix *a, const Matrix *b, Status *status) { +void nerv_matrix_(sigmoid)(Matrix *a, const Matrix *b, + CuContext *context, Status *status) { CHECK_SAME_DIMENSION(a, b, status); PROFILE_START cudak_(cuda_sigmoid)(b, a); @@ -67,7 +71,8 @@ void nerv_matrix_(sigmoid)(Matrix *a, const Matrix *b, Status *status) { } void nerv_matrix_(sigmoid_grad)(Matrix *nerr, const Matrix *err, - const Matrix *output, Status *status) { + const Matrix *output, + CuContext *context, Status *status) { CHECK_SAME_DIMENSION(nerr, err, status); CHECK_SAME_DIMENSION(nerr, output, status); PROFILE_START @@ -76,14 +81,16 @@ void nerv_matrix_(sigmoid_grad)(Matrix *nerr, const Matrix *err, NERV_SET_STATUS(status, NERV_NORMAL, 0); } -void nerv_matrix_(rand_uniform)(Matrix *a, Status *status) { +void nerv_matrix_(rand_uniform)(Matrix *a, CuContext *context, Status *status) { PROFILE_START - cudak_(cuda_rand_uniform)(a); + cudak_(cuda_rand_uniform)(a, context); PROFILE_STOP NERV_SET_STATUS(status, NERV_NORMAL, 0); } -void nerv_matrix_(thres_mask)(Matrix *a, Matrix *b, double thres, double low, double high, Status *status) { +void nerv_matrix_(thres_mask)(Matrix *a, Matrix *b, double thres, + double low, double high, + CuContext *context, Status *status) { CHECK_SAME_DIMENSION(a, b, status); PROFILE_START cudak_(cuda_thres_mask)(a, b, thres, low, high); @@ -91,7 +98,8 @@ void nerv_matrix_(thres_mask)(Matrix *a, Matrix *b, double thres, double low, do NERV_SET_STATUS(status, NERV_NORMAL, 0); } -void nerv_matrix_(tanh)(Matrix *a, const Matrix *b, Status *status) { +void nerv_matrix_(tanh)(Matrix *a, const Matrix *b, + CuContext *context, Status *status) { CHECK_SAME_DIMENSION(a, b, status); PROFILE_START cudak_(cuda_tanh)(b, a); @@ -99,8 +107,8 @@ void nerv_matrix_(tanh)(Matrix *a, const Matrix *b, Status *status) { NERV_SET_STATUS(status, NERV_NORMAL, 0); } -void nerv_matrix_(tanh_grad)(Matrix *nerr, const Matrix *err, - const Matrix *output, Status *status) { +void nerv_matrix_(tanh_grad)(Matrix *nerr, const Matrix *err, const Matrix *output, + CuContext *context, Status *status) { CHECK_SAME_DIMENSION(nerr, err, status); CHECK_SAME_DIMENSION(nerr, output, status); PROFILE_START @@ -109,24 +117,25 @@ void nerv_matrix_(tanh_grad)(Matrix *nerr, const Matrix *err, NERV_SET_STATUS(status, NERV_NORMAL, 0); } -Matrix *nerv_matrix_(softmax)(Matrix *b, const Matrix *a, Status *status) { +Matrix *nerv_matrix_(softmax)(Matrix *b, const Matrix *a, + CuContext *context, Status *status) { Matrix *max, *max_idx; Matrix *dno; CHECK_SAME_DIMENSION_RET(a, b, status); - max = nerv_matrix_(create)(a->nrow, 1, status); + max = nerv_matrix_(create)(a->nrow, 1, context, status); if (status->err_code != NERV_NORMAL) 
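/*
 * Editor's note on the "b comes first" comment above: cuBLAS assumes column-major
 * storage, and a row-major matrix reinterpreted as column-major is exactly its
 * transpose. Computing C = A * B for NERV's row-major matrices is therefore done
 * by asking cuBLAS for C^T = (A * B)^T = B^T * A^T, which is why the operands and
 * the row/column dimensions are swapped in the gemm call, with no data copied.
 */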
return NULL; - max_idx = nerv_matrix_(create)(a->nrow, 1, status); + max_idx = nerv_matrix_(create)(a->nrow, 1, context, status); if (status->err_code != NERV_NORMAL) { - nerv_matrix_(destroy)(max, status); + nerv_matrix_(destroy)(max, context, status); return NULL; } - dno = nerv_matrix_(create)(a->nrow, 1, status); + dno = nerv_matrix_(create)(a->nrow, 1, context, status); if (status->err_code != NERV_NORMAL) { /* FIXME: destroy may also fail? */ - nerv_matrix_(destroy)(max, status); - nerv_matrix_(destroy)(max_idx, status); + nerv_matrix_(destroy)(max, context, status); + nerv_matrix_(destroy)(max_idx, context, status); return NULL; } PROFILE_START @@ -134,14 +143,14 @@ Matrix *nerv_matrix_(softmax)(Matrix *b, const Matrix *a, Status *status) { cudak_(cuda_softmax_denominator)(a, max, dno); cudak_(cuda_softmax_final)(a, max, dno, b); PROFILE_STOP - nerv_matrix_(destroy)(max, status); - nerv_matrix_(destroy)(dno, status); + nerv_matrix_(destroy)(max, context, status); + nerv_matrix_(destroy)(dno, context, status); NERV_SET_STATUS(status, NERV_NORMAL, 0); return max_idx; } -Matrix *nerv_matrix_(rowsum)(Matrix *a, Status *status) { - Matrix *b = nerv_matrix_(create)(a->nrow, 1, status); +Matrix *nerv_matrix_(rowsum)(Matrix *a, CuContext *context, Status *status) { + Matrix *b = nerv_matrix_(create)(a->nrow, 1, context, status); if (status->err_code != NERV_NORMAL) return NULL; PROFILE_START @@ -151,8 +160,8 @@ Matrix *nerv_matrix_(rowsum)(Matrix *a, Status *status) { return b; } -Matrix *nerv_matrix_(colsum)(Matrix *a, Status *status) { - Matrix *b = nerv_matrix_(create)(1, a->ncol, status); +Matrix *nerv_matrix_(colsum)(Matrix *a, CuContext *context, Status *status) { + Matrix *b = nerv_matrix_(create)(1, a->ncol, context, status); if (status->err_code != NERV_NORMAL) return NULL; PROFILE_START @@ -163,8 +172,8 @@ Matrix *nerv_matrix_(colsum)(Matrix *a, Status *status) { } Matrix *nerv_matrix_(colsame)(Matrix *a, const Matrix *ref, - Status *status) { - Matrix *b = nerv_matrix_(create)(1, a->ncol, status); + CuContext *context, Status *status) { + Matrix *b = nerv_matrix_(create)(1, a->ncol, context, status); if (status->err_code != NERV_NORMAL) return NULL; CHECK_SAME_DIMENSION_RET(a, ref, status); @@ -175,8 +184,8 @@ Matrix *nerv_matrix_(colsame)(Matrix *a, const Matrix *ref, return b; } -Matrix *nerv_matrix_(rowmax)(Matrix *a, Status *status) { - Matrix *b = nerv_matrix_(create)(a->nrow, 1, status); +Matrix *nerv_matrix_(rowmax)(Matrix *a, CuContext *context, Status *status) { + Matrix *b = nerv_matrix_(create)(a->nrow, 1, context, status); if (status->err_code != NERV_NORMAL) return NULL; PROFILE_START @@ -187,15 +196,15 @@ Matrix *nerv_matrix_(rowmax)(Matrix *a, Status *status) { } void nerv_matrix_(rowmax_idx)(Matrix *a, Matrix **b, Matrix **idx, - Status *status) { - *b = nerv_matrix_(create)(a->nrow, 1, status); + CuContext *context, Status *status) { + *b = nerv_matrix_(create)(a->nrow, 1, context, status); if (status->err_code != NERV_NORMAL) return; - *idx = nerv_matrix_(create)(a->nrow, 1, status); + *idx = nerv_matrix_(create)(a->nrow, 1, context, status); if (status->err_code != NERV_NORMAL) { /* FIXME: destroy may also fail? 
*/ - nerv_matrix_(destroy)(*b, status); + nerv_matrix_(destroy)(*b, context, status); return; } PROFILE_START @@ -205,7 +214,7 @@ void nerv_matrix_(rowmax_idx)(Matrix *a, Matrix **b, Matrix **idx, } void nerv_matrix_(add_row)(Matrix *b, const Matrix *a, double beta, - Status *status) { + CuContext *context, Status *status) { if (a->ncol != b->ncol) NERV_EXIT_STATUS(status, MAT_MISMATCH_DIM, 0); if (a->nrow != 1) @@ -216,23 +225,25 @@ void nerv_matrix_(add_row)(Matrix *b, const Matrix *a, double beta, NERV_SET_STATUS(status, NERV_NORMAL, 0); } -void nerv_matrix_(fill)(Matrix *self, double val, Status *status) { +void nerv_matrix_(fill)(Matrix *self, double val, + CuContext *context, Status *status) { PROFILE_START cudak_(cuda_fill)(self, val); PROFILE_STOP NERV_SET_STATUS(status, NERV_NORMAL, 0); } -void nerv_matrix_(clip)(Matrix *self, double val_1, double val_2, Status *status) { +void nerv_matrix_(clip)(Matrix *self, double val1, double val2, + CuContext *context, Status *status) { PROFILE_START - cudak_(cuda_clip)(self, val_1, val_2); + cudak_(cuda_clip)(self, val1, val2); PROFILE_STOP NERV_SET_STATUS(status, NERV_NORMAL, 0); } void nerv_matrix_(copy_fromd)(Matrix *a, const Matrix *b, int a_begin, int b_begin, int b_end, - Status *status) { + CuContext *context, Status *status) { if (!(0 <= b_begin && b_begin < b_end && b_end <= b->nrow && a_begin + b_end - b_begin <= a->nrow)) NERV_EXIT_STATUS(status, MAT_INVALID_COPY_INTERVAL, 0); @@ -251,7 +262,7 @@ void nerv_matrix_(copy_fromd)(Matrix *a, const Matrix *b, void nerv_matrix_(copy_fromh)(Matrix *a, const Matrix *b, int a_begin, int b_begin, int b_end, - Status *status) { + CuContext *context, Status *status) { if (!(0 <= b_begin && b_begin < b_end && b_end <= b->nrow && a_begin + b_end - b_begin <= a->nrow)) NERV_EXIT_STATUS(status, MAT_INVALID_COPY_INTERVAL, 0); @@ -270,7 +281,7 @@ void nerv_matrix_(copy_fromh)(Matrix *a, const Matrix *b, void nerv_matrix_(copy_toh)(Matrix *a, const Matrix *b, int a_begin, int a_end, int b_begin, - Status *status) { + CuContext *context, Status *status) { if (!(0 <= a_begin && a_begin < a_end && a_end <= a->nrow && b_begin + a_end - a_begin <= b->nrow)) NERV_EXIT_STATUS(status, MAT_INVALID_COPY_INTERVAL, 0); @@ -287,15 +298,15 @@ void nerv_matrix_(copy_toh)(Matrix *a, const Matrix *b, NERV_SET_STATUS(status, NERV_NORMAL, 0); } -Matrix *nerv_matrix_(trans)(Matrix *a, Status *status) { +Matrix *nerv_matrix_(trans)(Matrix *a, CuContext *context, Status *status) { MATRIX_ELEM alpha = 1, beta = 0; - Matrix *b = nerv_matrix_(create)(a->ncol, a->nrow, status); + Matrix *b = nerv_matrix_(create)(a->ncol, a->nrow, context, status); if (status->err_code != NERV_NORMAL) return NULL; /* FIXME: possible memory leak when lua error is raised */ PROFILE_START CUBLAS_SAFE_SYNC_CALL_RET( - NERV_CUBLAS_(geam)(cublas_handle, CUBLAS_OP_T, CUBLAS_OP_T, + NERV_CUBLAS_(geam)(context->cublas_handle, CUBLAS_OP_T, CUBLAS_OP_T, a->nrow, a->ncol, &alpha, MATRIX_ELEM_PTR(a), a->stride / sizeof(MATRIX_ELEM), @@ -309,7 +320,7 @@ Matrix *nerv_matrix_(trans)(Matrix *a, Status *status) { } void nerv_matrix_(mul_elem)(Matrix *c, const Matrix *a, const Matrix *b, - Status *status) { + CuContext *context, Status *status) { CHECK_SAME_DIMENSION(a, b, status); CHECK_SAME_DIMENSION(a, c, status); PROFILE_START @@ -318,7 +329,8 @@ void nerv_matrix_(mul_elem)(Matrix *c, const Matrix *a, const Matrix *b, NERV_SET_STATUS(status, NERV_NORMAL, 0); } -void nerv_matrix_(log_elem)(Matrix *b, const Matrix *a, Status *status) { +void 
nerv_matrix_(log_elem)(Matrix *b, const Matrix *a, + CuContext *context, Status *status) { CHECK_SAME_DIMENSION(a, b, status); PROFILE_START cudak_(cuda_log_elem)(a, b); @@ -326,14 +338,15 @@ void nerv_matrix_(log_elem)(Matrix *b, const Matrix *a, Status *status) { NERV_SET_STATUS(status, NERV_NORMAL, 0); } -Matrix *nerv_matrix_(decompress)(const Matrix *a, int orig_col, Status *status) { +Matrix *nerv_matrix_(decompress)(const Matrix *a, int orig_col, + CuContext *context, Status *status) { Matrix *b; if (a->ncol != 1) { NERV_SET_STATUS(status, MAT_COL_VECTOR_EXP, 0); return NULL; } - b = nerv_matrix_(create)(a->nrow, orig_col, status); + b = nerv_matrix_(create)(a->nrow, orig_col, context, status); if (status->err_code != NERV_NORMAL) return NULL; PROFILE_START @@ -345,7 +358,8 @@ Matrix *nerv_matrix_(decompress)(const Matrix *a, int orig_col, Status *status) } void nerv_matrix_(copy_rows_fromh_by_idx)(Matrix *a, const Matrix *b, - const Matrix *idx, int b_begin, Status *status) { + const Matrix *idx, int b_begin, + CuContext *context, Status *status) { long nrow = a->nrow; if (!(0 <= b_begin && b_begin + nrow <= idx->ncol)) NERV_EXIT_STATUS(status, MAT_INVALID_COPY_INTERVAL, 0); @@ -379,7 +393,8 @@ void nerv_matrix_(copy_rows_fromh_by_idx)(Matrix *a, const Matrix *b, } void nerv_matrix_(copy_rows_fromd_by_idx)(Matrix *a, const Matrix *b, - const Matrix *idx, int b_begin, Status *status) { + const Matrix *idx, int b_begin, + CuContext *context, Status *status) { long nrow = a->nrow; if (!(0 <= b_begin && b_begin + nrow <= idx->ncol)) NERV_EXIT_STATUS(status, MAT_INVALID_COPY_INTERVAL, 0); @@ -394,7 +409,8 @@ void nerv_matrix_(copy_rows_fromd_by_idx)(Matrix *a, const Matrix *b, } void nerv_matrix_(copy_rows_fromd_by_colidx)(Matrix *a, const Matrix *b, - const Matrix *idx, int b_begin, Status *status) { + const Matrix *idx, int b_begin, + CuContext *context, Status *status) { long nrow = a->nrow; if (!(0 <= b_begin && b_begin + nrow <= idx->nrow)) NERV_EXIT_STATUS(status, MAT_INVALID_COPY_INTERVAL, 0); @@ -412,7 +428,9 @@ void nerv_matrix_(copy_rows_fromd_by_colidx)(Matrix *a, const Matrix *b, #ifdef __NERV_FUTURE_CUDA_7 -void nerv_matrix_(update_select_rows_by_rowidx)(Matrix *c, const Matrix *a, const Matrix *idx, double alpha, double beta, Status *status) { +void nerv_matrix_(update_select_rows_by_rowidx)(Matrix *c, const Matrix *a, + const Matrix *idx, double alpha, double beta, + CuContext *context, Status *status) { long nrow = a->nrow; if (idx->nrow != 1 || idx->ncol != a->nrow) NERV_EXIT_STATUS(status, MAT_IDX_VECTOR_EXP, 0); @@ -424,7 +442,9 @@ void nerv_matrix_(update_select_rows_by_rowidx)(Matrix *c, const Matrix *a, cons NERV_SET_STATUS(status, NERV_NORMAL, 0); } -void nerv_matrix_(update_select_rows_by_colidx)(Matrix *c, const Matrix *a, const Matrix *idx, double alpha, double beta, Status *status) { +void nerv_matrix_(update_select_rows_by_colidx)(Matrix *c, const Matrix *a, + const Matrix *idx, double alpha, double beta, + CuContext *context, Status *status) { long nrow = a->nrow; if (idx->ncol != 1 || idx->nrow != a->nrow) NERV_EXIT_STATUS(status, MAT_IDX_VECTOR_EXP, 0); @@ -438,20 +458,20 @@ void nerv_matrix_(update_select_rows_by_colidx)(Matrix *c, const Matrix *a, cons #endif void nerv_matrix_(expand_frm)(Matrix *a, const Matrix *b, - int context, Status *status) { + int cont, CuContext *context, Status *status) { if (a->nrow != b->nrow) NERV_EXIT_STATUS(status, MAT_MISMATCH_DIM, 0); - if (a->ncol != b->ncol * (context * 2 + 1)) + if (a->ncol != b->ncol * (cont * 2 + 1)) 
NERV_EXIT_STATUS(status, MAT_GENERAL_ERR, "the width should be 2 * context + 1"); PROFILE_START - cudak_(cuda_expand_frm)(b, a, context); + cudak_(cuda_expand_frm)(b, a, cont); PROFILE_STOP NERV_SET_STATUS(status, NERV_NORMAL, 0); } void nerv_matrix_(rearrange_frm)(Matrix *a, const Matrix *b, - int step, Status *status) { + int step, CuContext *context, Status *status) { CHECK_SAME_DIMENSION(a, b, status); if (b->ncol % step) NERV_EXIT_STATUS(status, MAT_GENERAL_ERR, @@ -463,7 +483,7 @@ void nerv_matrix_(rearrange_frm)(Matrix *a, const Matrix *b, } void nerv_matrix_(scale_rows_by_col)(Matrix *a, const Matrix *b, - Status *status) { + CuContext *context, Status *status) { if (a->nrow != b->nrow) NERV_EXIT_STATUS(status, MAT_MISMATCH_DIM, 0); if (b->ncol != 1) @@ -475,7 +495,7 @@ void nerv_matrix_(scale_rows_by_col)(Matrix *a, const Matrix *b, } void nerv_matrix_(scale_rows_by_row)(Matrix *a, const Matrix *b, - Status *status) { + CuContext *context, Status *status) { if (a->ncol != b->ncol) NERV_EXIT_STATUS(status, MAT_MISMATCH_DIM, 0); if (b->nrow != 1) @@ -486,7 +506,8 @@ void nerv_matrix_(scale_rows_by_row)(Matrix *a, const Matrix *b, NERV_SET_STATUS(status, NERV_NORMAL, 0); } -void nerv_matrix_(prefixsum_row)(Matrix *a, const Matrix *b, Status *status) { +void nerv_matrix_(prefixsum_row)(Matrix *a, const Matrix *b, + CuContext *context, Status *status) { CHECK_SAME_DIMENSION(a, b, status); PROFILE_START cudak_(cuda_prefixsum_row)(b, a); @@ -494,7 +515,7 @@ void nerv_matrix_(prefixsum_row)(Matrix *a, const Matrix *b, Status *status) { NERV_SET_STATUS(status, NERV_NORMAL, 0); } -void nerv_matrix_(diagonalize)(Matrix *a, Status *status) { +void nerv_matrix_(diagonalize)(Matrix *a, CuContext * context, Status *status) { if (a->nrow != a->ncol) NERV_EXIT_STATUS(status, MAT_MISMATCH_DIM, 0); PROFILE_START @@ -503,14 +524,14 @@ void nerv_matrix_(diagonalize)(Matrix *a, Status *status) { NERV_SET_STATUS(status, NERV_NORMAL, 0); } -static void cuda_matrix_(free)(MATRIX_ELEM *ptr, Status *status) { +static void cuda_matrix_(free)(MATRIX_ELEM *ptr, CuContext *context, Status *status) { CUDA_SAFE_SYNC_CALL(cudaFree(ptr), status); NERV_SET_STATUS(status, NERV_NORMAL, 0); } static void cuda_matrix_(alloc)(MATRIX_ELEM **dptr, size_t *stride, long width, long height, - Status *status) { + CuContext *context, Status *status) { PROFILE_START CUDA_SAFE_SYNC_CALL(cudaMallocPitch((void **)dptr, stride, width, height), status); diff --git a/nerv/lib/matrix/generic/cumatrix.h b/nerv/lib/matrix/generic/cumatrix.h index 48d1f13..de3a09e 100644 --- a/nerv/lib/matrix/generic/cumatrix.h +++ b/nerv/lib/matrix/generic/cumatrix.h @@ -2,77 +2,101 @@ void nerv_matrix_(add)(Matrix *c, const Matrix *a, const Matrix *b, MATRIX_ELEM alpha, MATRIX_ELEM beta, - Status *status); + CuContext *context, Status *status); void nerv_matrix_(mul)(Matrix *c, const Matrix *a, const Matrix *b, MATRIX_ELEM alpha, MATRIX_ELEM beta, - int ta, int tb, Status *status); -void nerv_matrix_(sigmoid)(Matrix *a, const Matrix *b, Status *status); + int ta, int tb, + CuContext *context, Status *status); +void nerv_matrix_(sigmoid)(Matrix *a, const Matrix *b, + CuContext *context, Status *status); void nerv_matrix_(sigmoid_grad)(Matrix *nerr, const Matrix *err, - const Matrix *output, Status *status); -void nerv_matrix_(tanh)(Matrix *a, const Matrix *b, Status *status); + const Matrix *output, + CuContext *context, Status *status); +void nerv_matrix_(tanh)(Matrix *a, const Matrix *b, + CuContext *context, Status *status); void 
nerv_matrix_(tanh_grad)(Matrix *nerr, const Matrix *err, - const Matrix *output, Status *status); + const Matrix *output, + CuContext *context, Status *status); -Matrix *nerv_matrix_(softmax)(Matrix *b, const Matrix *a, Status *status); -Matrix *nerv_matrix_(rowsum)(Matrix *a, Status *status); -Matrix *nerv_matrix_(colsum)(Matrix *a, Status *status); +Matrix *nerv_matrix_(softmax)(Matrix *b, const Matrix *a, + CuContext *context, Status *status); +Matrix *nerv_matrix_(rowsum)(Matrix *a, CuContext *context, Status *status); +Matrix *nerv_matrix_(colsum)(Matrix *a, CuContext *context, Status *status); Matrix *nerv_matrix_(colsame)(Matrix *a, const Matrix *ref, - Status *status); -Matrix *nerv_matrix_(rowmax)(Matrix *a, Status *status); + CuContext *context, Status *status); +Matrix *nerv_matrix_(rowmax)(Matrix *a, CuContext *context, Status *status); void nerv_matrix_(rowmax_idx)(Matrix *a, Matrix **b, Matrix **idx, - Status *status); + CuContext *context, Status *status); void nerv_matrix_(add_row)(Matrix *b, const Matrix *a, double beta, - Status *status); -void nerv_matrix_(clip)(Matrix *self, double val_1, double val_2, Status *status); -void nerv_matrix_(fill)(Matrix *self, double val, Status *status); -void nerv_matrix_(diagonalize)(Matrix *self, Status *statut); + CuContext *context, Status *status); +void nerv_matrix_(clip)(Matrix *self, double val1, double val2, + CuContext *context, Status *status); +void nerv_matrix_(fill)(Matrix *self, double val, + CuContext *context, Status *status); +void nerv_matrix_(diagonalize)(Matrix *self, + CuContext *context, Status *status); void nerv_matrix_(copy_fromd)(Matrix *a, const Matrix *b, int a_begin, int b_begin, int b_end, - Status *status); + CuContext *context, Status *status); void nerv_matrix_(copy_fromh)(Matrix *a, const Matrix *b, int a_begin, int b_begin, int b_end, - Status *status); + CuContext *context, Status *status); void nerv_matrix_(copy_toh)(Matrix *a, const Matrix *b, int a_begin, int a_end, int b_begin, - Status *status); -Matrix *nerv_matrix_(trans)(Matrix *a, Status *status); + CuContext *context, Status *status); +Matrix *nerv_matrix_(trans)(Matrix *a, CuContext *context, Status *status); void nerv_matrix_(mul_elem)(Matrix *c, const Matrix *a, const Matrix *b, - Status *status); + CuContext *context, Status *status); -void nerv_matrix_(log_elem)(Matrix *b, const Matrix *a, Status *status); +void nerv_matrix_(log_elem)(Matrix *b, const Matrix *a, + CuContext *context, Status *status); -Matrix *nerv_matrix_(decompress)(const Matrix *a, int orig_col, Status *status); +Matrix *nerv_matrix_(decompress)(const Matrix *a, int orig_col, + CuContext *context, Status *status); void nerv_matrix_(copy_rows_fromh_by_idx)(Matrix *a, const Matrix *b, - const Matrix *idx, int b_begin, Status *status); + const Matrix *idx, int b_begin, + CuContext *context, Status *status); void nerv_matrix_(copy_rows_fromd_by_idx)(Matrix *a, const Matrix *b, - const Matrix *idx, int b_begin, Status *status); + const Matrix *idx, int b_begin, + CuContext *context, Status *status); void nerv_matrix_(copy_rows_fromd_by_colidx)(Matrix *a, const Matrix *b, - const Matrix *idx, int b_begin, Status *status); + const Matrix *idx, int b_begin, + CuContext *context, Status *status); #ifdef __NERV_FUTURE_CUDA_7 -void nerv_matrix_(update_select_rows_by_rowidx)(Matrix *c, const Matrix *a, const Matrix *idx, double alpha, double beta, Status *status); -void nerv_matrix_(update_select_rows_by_colidx)(Matrix *c, const Matrix *a, const Matrix *idx, double alpha, double 
beta, Status *status); +void nerv_matrix_(update_select_rows_by_rowidx)(Matrix *c, const Matrix *a, + const Matrix *idx, double alpha, double beta, + CuContext *context, Status *status); +void nerv_matrix_(update_select_rows_by_colidx)(Matrix *c, const Matrix *a, + const Matrix *idx, double alpha, double beta, + CuContext *context, Status *status); #endif void nerv_matrix_(expand_frm)(Matrix *a, const Matrix *b, - int context, Status *status); + int cont, CuContext *context, Status *status); void nerv_matrix_(rearrange_frm)(Matrix *a, const Matrix *b, - int step, Status *status); + int step, CuContext *context, Status *status); void nerv_matrix_(scale_rows_by_col)(Matrix *a, const Matrix *b, - Status *status); + CuContext *context, Status *status); void nerv_matrix_(scale_rows_by_row)(Matrix *a, const Matrix *b, - Status *status); -void nerv_matrix_(prefixsum_row)(Matrix *a, const Matrix *b, Status *status); + CuContext *context, Status *status); +void nerv_matrix_(prefixsum_row)(Matrix *a, const Matrix *b, + CuContext *context, Status *status); void nerv_matrix_(thres_mask)(Matrix *a, Matrix *b, double thres, double low, double high, - Status *status); -void nerv_matrix_(rand_uniform)(Matrix *a, Status *status); + CuContext *context, Status *status); +void nerv_matrix_(rand_uniform)(Matrix *a, CuContext *context, Status *status); #ifdef __NERV_FUTURE_CUDA_7 -void nerv_matrix_(update_select_rows_by_rowidx)(Matrix *c, const Matrix *a, const Matrix *idx, - double alpha, double beta, Status *status); -void nerv_matrix_(update_select_rows_by_colidx)(Matrix *c, const Matrix *a, const Matrix *idx, - double alpha, double beta, Status *status); +void nerv_matrix_(update_select_rows_by_rowidx)(Matrix *c, const Matrix *a, + const Matrix *idx, + double alpha, double beta, + CuContext *context, Status *status); +void nerv_matrix_(update_select_rows_by_colidx)(Matrix *c, const Matrix *a, + const Matrix *idx, + double alpha, double beta, + CuContext *context, Status *status); #endif -void nerv_matrix_(prefixsum_row)(Matrix *a, const Matrix *b, Status *status); +void nerv_matrix_(prefixsum_row)(Matrix *a, const Matrix *b, + CuContext *context, Status *status); diff --git a/nerv/lib/matrix/generic/matrix.c b/nerv/lib/matrix/generic/matrix.c index 998d107..3bcc251 100644 --- a/nerv/lib/matrix/generic/matrix.c +++ b/nerv/lib/matrix/generic/matrix.c @@ -3,12 +3,12 @@ #include "matrix.h" /* FIXME: malloc failure detection */ -void nerv_matrix_(data_free)(Matrix *self, Status *status) { +void nerv_matrix_(data_free)(Matrix *self, MATRIX_CONTEXT *context, Status *status) { assert(*self->data_ref > 0); if (--(*self->data_ref) == 0) { /* free matrix data */ - MATRIX_DATA_FREE(MATRIX_ELEM_PTR_BASE(self), status); + MATRIX_DATA_FREE(MATRIX_ELEM_PTR_BASE(self), context, status); free(self->data_ref); free(self); } @@ -22,7 +22,8 @@ void nerv_matrix_(data_retain)(Matrix *self) { (*self->data_ref)++; } -Matrix *nerv_matrix_(create)(long nrow, long ncol, Status *status) { +Matrix *nerv_matrix_(create)(long nrow, long ncol, + MATRIX_CONTEXT *context, Status *status) { Matrix *self = (Matrix *)malloc(sizeof(Matrix)); self->nrow = nrow; self->ncol = ncol; @@ -30,7 +31,7 @@ Matrix *nerv_matrix_(create)(long nrow, long ncol, Status *status) { self->dim = 2; MATRIX_DATA_ALLOC(&MATRIX_ELEM_PTR_BASE(self), &self->stride, sizeof(MATRIX_ELEM) * self->ncol, self->nrow, - status); + context, status); if (status->err_code != NERV_NORMAL) { free(self); @@ -44,8 +45,8 @@ Matrix *nerv_matrix_(create)(long nrow, long ncol, Status 
*status) { return self; } -void nerv_matrix_(destroy)(Matrix *self, Status *status) { - nerv_matrix_(data_free)(self, status); +void nerv_matrix_(destroy)(Matrix *self, MATRIX_CONTEXT *context, Status *status) { + nerv_matrix_(data_free)(self, context, status); } Matrix *nerv_matrix_(getrow)(Matrix *self, int row) { diff --git a/nerv/lib/matrix/generic/matrix.h b/nerv/lib/matrix/generic/matrix.h index 69b4e6d..2770c3e 100644 --- a/nerv/lib/matrix/generic/matrix.h +++ b/nerv/lib/matrix/generic/matrix.h @@ -1,6 +1,7 @@ #include "../matrix.h" -Matrix *nerv_matrix_(create)(long nrow, long ncol, Status *status); -void nerv_matrix_(destroy)(Matrix *self, Status *status); +Matrix *nerv_matrix_(create)(long nrow, long ncol, + MATRIX_CONTEXT *context, Status *status); +void nerv_matrix_(destroy)(Matrix *self, MATRIX_CONTEXT *context, Status *status); Matrix *nerv_matrix_(getrow)(Matrix *self, int row); -void nerv_matrix_(data_free)(Matrix *self, Status *status); +void nerv_matrix_(data_free)(Matrix *self, MATRIX_CONTEXT *context, Status *status); void nerv_matrix_(data_retain)(Matrix *self); diff --git a/nerv/lib/matrix/generic/mmatrix.c b/nerv/lib/matrix/generic/mmatrix.c index 3dabe0e..6272cbe 100644 --- a/nerv/lib/matrix/generic/mmatrix.c +++ b/nerv/lib/matrix/generic/mmatrix.c @@ -1,19 +1,21 @@ #ifdef NERV_GENERIC_MMATRIX #include "matrix.h" #include "elem_type.h" -#define MATRIX_DATA_FREE(ptr, status) host_matrix_(free)(ptr, status) -#define MATRIX_DATA_ALLOC(dptr, stride, width, height, status) \ - host_matrix_(alloc)(dptr, stride, width, height, status) +#define MATRIX_DATA_FREE(ptr, context, status) \ + host_matrix_(free)(ptr, context, status) +#define MATRIX_DATA_ALLOC(dptr, stride, width, height, context, status) \ + host_matrix_(alloc)(dptr, stride, width, height, \ + context, status) #define NERV_GENERIC_MATRIX -#include "../cuda_helper.h" #include "../../common.h" +#include "../../cblas.h" #include "../../io/chunk_file.h" #include <string.h> -#include <cblas.h> +#include <math.h> #include <float.h> -Matrix *nerv_matrix_(colsum)(Matrix *a, Status *status) { - Matrix *b = nerv_matrix_(create)(1, a->ncol, status); +Matrix *nerv_matrix_(colsum)(Matrix *a, MContext *context, Status *status) { + Matrix *b = nerv_matrix_(create)(1, a->ncol, context, status); if (status->err_code != NERV_NORMAL) return NULL; MATRIX_ELEM *arow = MATRIX_ELEM_PTR(a), @@ -31,8 +33,9 @@ Matrix *nerv_matrix_(colsum)(Matrix *a, Status *status) { return b; } -Matrix *nerv_matrix_(colsame)(Matrix *a, const Matrix *ref, Status *status) { - Matrix *b = nerv_matrix_(create)(1, a->ncol, status); +Matrix *nerv_matrix_(colsame)(Matrix *a, const Matrix *ref, + MContext *context, Status *status) { + Matrix *b = nerv_matrix_(create)(1, a->ncol, context, status); if (status->err_code != NERV_NORMAL) return NULL; CHECK_SAME_DIMENSION_RET(a, ref, status); @@ -55,8 +58,8 @@ Matrix *nerv_matrix_(colsame)(Matrix *a, const Matrix *ref, Status *status) { return b; } -Matrix *nerv_matrix_(rowsum)(Matrix *a, Status *status) { - Matrix *b = nerv_matrix_(create)(a->nrow, 1, status); +Matrix *nerv_matrix_(rowsum)(Matrix *a, MContext *context, Status *status) { + Matrix *b = nerv_matrix_(create)(a->nrow, 1, context, status); if (status->err_code != NERV_NORMAL) return NULL; MATRIX_ELEM *arow = MATRIX_ELEM_PTR(a), @@ -75,8 +78,8 @@ Matrix *nerv_matrix_(rowsum)(Matrix *a, Status *status) { return b; } -Matrix *nerv_matrix_(rowmax)(Matrix *a, Status *status) { - Matrix *b = nerv_matrix_(create)(a->nrow, 1, status); +Matrix 
*nerv_matrix_(rowmax)(Matrix *a, MContext *context, Status *status) { + Matrix *b = nerv_matrix_(create)(a->nrow, 1, context, status); if (status->err_code != NERV_NORMAL) return NULL; MATRIX_ELEM *arow = MATRIX_ELEM_PTR(a), @@ -96,15 +99,16 @@ Matrix *nerv_matrix_(rowmax)(Matrix *a, Status *status) { return b; } -void nerv_matrix_(rowmax_idx)(Matrix *a, Matrix **b, Matrix **idx, Status *status) { - *b = nerv_matrix_(create)(a->nrow, 1, status); +void nerv_matrix_(rowmax_idx)(Matrix *a, Matrix **b, Matrix **idx, + MContext *context, Status *status) { + *b = nerv_matrix_(create)(a->nrow, 1, context, status); if (status->err_code != NERV_NORMAL) return; - *idx = nerv_matrix_(create)(a->nrow, 1, status); + *idx = nerv_matrix_(create)(a->nrow, 1, context, status); if (status->err_code != NERV_NORMAL) { /* FIXME: destroy may also fail! */ - nerv_matrix_(destroy)(*b, status); + nerv_matrix_(destroy)(*b, context, status); return; } MATRIX_ELEM *arow = MATRIX_ELEM_PTR(a), @@ -127,8 +131,8 @@ void nerv_matrix_(rowmax_idx)(Matrix *a, Matrix **b, Matrix **idx, Status *statu NERV_SET_STATUS(status, NERV_NORMAL, 0); } -Matrix *nerv_matrix_(trans)(Matrix *a, Status *status) { - Matrix *b = nerv_matrix_(create)(a->ncol, a->nrow, status); +Matrix *nerv_matrix_(trans)(Matrix *a, MContext *context, Status *status) { + Matrix *b = nerv_matrix_(create)(a->ncol, a->nrow, context, status); if (status->err_code != NERV_NORMAL) return NULL; MATRIX_ELEM *arow = MATRIX_ELEM_PTR(a); @@ -148,14 +152,15 @@ Matrix *nerv_matrix_(trans)(Matrix *a, Status *status) { return b; } -Matrix *nerv_matrix_(decompress)(const Matrix *a, int orig_col, Status *status) { +Matrix *nerv_matrix_(decompress)(const Matrix *a, int orig_col, + MContext *context, Status *status) { Matrix *b; if (a->ncol != 1) { NERV_SET_STATUS(status, MAT_COL_VECTOR_EXP, 0); return NULL; } - b = nerv_matrix_(create)(a->nrow, orig_col, status); + b = nerv_matrix_(create)(a->nrow, orig_col, context, status); if (status->err_code != NERV_NORMAL) return NULL; int i; @@ -173,7 +178,9 @@ Matrix *nerv_matrix_(decompress)(const Matrix *a, int orig_col, Status *status) return b; } -void nerv_matrix_(add)(Matrix *c, const Matrix *a, const Matrix *b, MATRIX_ELEM alpha, MATRIX_ELEM beta, Status *status) { +void nerv_matrix_(add)(Matrix *c, const Matrix *a, const Matrix *b, + MATRIX_ELEM alpha, MATRIX_ELEM beta, + MContext *context, Status *status) { CHECK_SAME_DIMENSION(a, b, status); CHECK_SAME_DIMENSION(a, c, status); int i, j; @@ -197,7 +204,7 @@ void nerv_matrix_(add)(Matrix *c, const Matrix *a, const Matrix *b, MATRIX_ELEM void nerv_matrix_(mul)(Matrix *c, const Matrix *a, const Matrix *b, MATRIX_ELEM alpha, MATRIX_ELEM beta, - int ta, int tb, Status *status) { + int ta, int tb, MContext *context, Status *status) { #define SWAP(a, b) \ do { int t = (a); (a) = (b); (b) = t; } while (0) @@ -218,7 +225,7 @@ void nerv_matrix_(mul)(Matrix *c, const Matrix *a, const Matrix *b, } void nerv_matrix_(add_row)(Matrix *b, const Matrix *a, double beta, - Status *status) { + MContext *context, Status *status) { if (a->ncol != b->ncol) NERV_EXIT_STATUS(status, MAT_MISMATCH_DIM, 0); if (a->nrow != 1) @@ -236,23 +243,25 @@ void nerv_matrix_(add_row)(Matrix *b, const Matrix *a, double beta, NERV_SET_STATUS(status, NERV_NORMAL, 0); } -void nerv_matrix_(clip)(Matrix *self, double val_1, double val_2, Status *status) { +void nerv_matrix_(clip)(Matrix *self, double val1, double val2, + MContext *context, Status *status) { int i, j; size_t astride = self->stride; MATRIX_ELEM *arow = 
MATRIX_ELEM_PTR(self); for (i = 0; i < self->nrow; i++) { for (j = 0; j < self->ncol; j++) - if (arow[j] > val_2) - arow[j] = val_2; - else if (arow[j] < val_1) - arow[j] = val_1; + if (arow[j] > val2) + arow[j] = val2; + else if (arow[j] < val1) + arow[j] = val1; arow = MATRIX_NEXT_ROW_PTR(arow, astride); } NERV_SET_STATUS(status, NERV_NORMAL, 0); } -void nerv_matrix_(fill)(Matrix *self, double val, Status *status) { +void nerv_matrix_(fill)(Matrix *self, double val, + MContext *context, Status *status) { int i, j; size_t astride = self->stride; MATRIX_ELEM *arow = MATRIX_ELEM_PTR(self); @@ -265,7 +274,8 @@ void nerv_matrix_(fill)(Matrix *self, double val, Status *status) { NERV_SET_STATUS(status, NERV_NORMAL, 0); } -void nerv_matrix_(diagonalize)(Matrix *self, Status *status) { +void nerv_matrix_(diagonalize)(Matrix *self, + MContext *context, Status *status) { if (self->nrow != self->ncol) NERV_EXIT_STATUS(status, MAT_MISMATCH_DIM, 0); int i, j; @@ -281,7 +291,8 @@ void nerv_matrix_(diagonalize)(Matrix *self, Status *status) { NERV_SET_STATUS(status, NERV_NORMAL, 0); } -void nerv_matrix_(sigmoid)(Matrix *a, const Matrix *b, Status *status) { +void nerv_matrix_(sigmoid)(Matrix *a, const Matrix *b, + MContext *context, Status *status) { CHECK_SAME_DIMENSION(a, b, status); int i, j; size_t astride = a->stride, bstride = b->stride; @@ -298,7 +309,8 @@ void nerv_matrix_(sigmoid)(Matrix *a, const Matrix *b, Status *status) { } void nerv_matrix_(sigmoid_grad)(Matrix *nerr, const Matrix *err, - const Matrix *output, Status *status) { + const Matrix *output, + MContext *context, Status *status) { CHECK_SAME_DIMENSION(nerr, err, status); CHECK_SAME_DIMENSION(nerr, output, status); int i, j; @@ -319,10 +331,11 @@ void nerv_matrix_(sigmoid_grad)(Matrix *nerr, const Matrix *err, NERV_SET_STATUS(status, NERV_NORMAL, 0); } -Matrix *nerv_matrix_(softmax)(Matrix *b, const Matrix *a, Status *status) { +Matrix *nerv_matrix_(softmax)(Matrix *b, const Matrix *a, + MContext *context, Status *status) { Matrix *max_idx; CHECK_SAME_DIMENSION_RET(a, b, status); - max_idx = nerv_matrix_(create)(a->nrow, 1, status); + max_idx = nerv_matrix_(create)(a->nrow, 1, context, status); if (status->err_code != NERV_NORMAL) return NULL; int i, j; @@ -353,7 +366,7 @@ Matrix *nerv_matrix_(softmax)(Matrix *b, const Matrix *a, Status *status) { } void nerv_matrix_(mul_elem)(Matrix *c, const Matrix *a, const Matrix *b, - Status *status) { + MContext *context, Status *status) { CHECK_SAME_DIMENSION(a, b, status); CHECK_SAME_DIMENSION(a, c, status); int i, j; @@ -374,7 +387,8 @@ void nerv_matrix_(mul_elem)(Matrix *c, const Matrix *a, const Matrix *b, NERV_SET_STATUS(status, NERV_NORMAL, 0); } -void nerv_matrix_(log_elem)(Matrix *b, const Matrix *a, Status *status) { +void nerv_matrix_(log_elem)(Matrix *b, const Matrix *a, + MContext *context, Status *status) { CHECK_SAME_DIMENSION(a, b, status); int i, j; size_t astride = a->stride, bstride = b->stride; @@ -399,10 +413,10 @@ void nerv_matrix_(log_elem)(Matrix *b, const Matrix *a, Status *status) { } void nerv_matrix_(expand_frm)(Matrix *a, const Matrix *b, - int context, Status *status) { + int cont, MContext *context, Status *status) { if (a->nrow != b->nrow) NERV_EXIT_STATUS(status, MAT_MISMATCH_DIM, 0); - if (a->ncol != b->ncol * (context * 2 + 1)) + if (a->ncol != b->ncol * (cont * 2 + 1)) NERV_EXIT_STATUS(status, MAT_GENERAL_ERR, "the width should be 2 * context + 1"); int i, j, k; @@ -411,10 +425,10 @@ void nerv_matrix_(expand_frm)(Matrix *a, const Matrix *b, for (i = 0; i
< a->nrow; i++) { MATRIX_ELEM *a_subrow = arow; - int start = i - context; + int start = i - cont; if (start < 0) start = 0; const MATRIX_ELEM *brow = MATRIX_ROW_PTR(b, start); - for (j = i - context; j <= i + context; j++) + for (j = i - cont; j <= i + cont; j++) { for (k = 0; k < b->ncol; k++) a_subrow[k] = brow[k]; @@ -428,7 +442,7 @@ void nerv_matrix_(expand_frm)(Matrix *a, const Matrix *b, } void nerv_matrix_(rearrange_frm)(Matrix *a, const Matrix *b, - int step, Status *status) { + int step, MContext *context, Status *status) { CHECK_SAME_DIMENSION(a, b, status); if (b->ncol % step) NERV_EXIT_STATUS(status, MAT_GENERAL_ERR, @@ -455,7 +469,7 @@ void nerv_matrix_(rearrange_frm)(Matrix *a, const Matrix *b, } void nerv_matrix_(scale_rows_by_row)(Matrix *a, const Matrix *b, - Status *status) { + MContext *context, Status *status) { if (a->ncol != b->ncol) NERV_EXIT_STATUS(status, MAT_MISMATCH_DIM, 0); if (b->nrow != 1) @@ -474,7 +488,7 @@ void nerv_matrix_(scale_rows_by_row)(Matrix *a, const Matrix *b, } void nerv_matrix_(scale_rows_by_col)(Matrix *a, const Matrix *b, - Status *status) { + MContext *context,Status *status) { if (a->nrow != b->nrow) NERV_EXIT_STATUS(status, MAT_MISMATCH_DIM, 0); if (b->ncol != 1) @@ -493,13 +507,14 @@ void nerv_matrix_(scale_rows_by_col)(Matrix *a, const Matrix *b, NERV_SET_STATUS(status, NERV_NORMAL, 0); } -static void host_matrix_(free)(MATRIX_ELEM *ptr, Status *status) { +static void host_matrix_(free)(MATRIX_ELEM *ptr, MContext *context, Status *status) { free(ptr); NERV_SET_STATUS(status, NERV_NORMAL, 0); } static void host_matrix_(alloc)(MATRIX_ELEM **dptr, size_t *stride, - long width, long height, Status *status) { + long width, long height, + MContext *context, Status *status) { if ((*dptr = (MATRIX_ELEM *)malloc(width * height)) == NULL) NERV_EXIT_STATUS(status, MAT_INSUF_MEM, 0); *stride = width; @@ -507,7 +522,7 @@ static void host_matrix_(alloc)(MATRIX_ELEM **dptr, size_t *stride, } #include "matrix.c" -Matrix *nerv_matrix_(load)(ChunkData *cdp, Status *status) { +Matrix *nerv_matrix_(load)(ChunkData *cdp, MContext *context, Status *status) { int i, j; long nrow, ncol; FILE *fp = cdp->fp; @@ -517,7 +532,7 @@ Matrix *nerv_matrix_(load)(ChunkData *cdp, Status *status) { NERV_SET_STATUS(status, MAT_INVALID_FORMAT, 0); return 0; } - self = nerv_matrix_(create)(nrow, ncol, status); + self = nerv_matrix_(create)(nrow, ncol, context, status); if (status->err_code != NERV_NORMAL) return NULL; for (i = 0; i < nrow; i++) @@ -535,7 +550,7 @@ Matrix *nerv_matrix_(load)(ChunkData *cdp, Status *status) { return self; } -void nerv_matrix_(save)(Matrix *self, ChunkFile *cfp, Status *status) { +void nerv_matrix_(save)(Matrix *self, ChunkFile *cfp, MContext *context, Status *status) { int i, j; long nrow = self->nrow, ncol = self->ncol; FILE *fp = cfp->fp; @@ -556,7 +571,7 @@ void nerv_matrix_(save)(Matrix *self, ChunkFile *cfp, Status *status) { void nerv_matrix_(copy_fromh)(Matrix *a, const Matrix *b, int a_begin, int b_begin, int b_end, - Status *status) { + MContext *context, Status *status) { if (!(0 <= b_begin && b_begin < b_end && b_end <= b->nrow && a_begin + b_end - b_begin <= a->nrow)) NERV_EXIT_STATUS(status, MAT_INVALID_COPY_INTERVAL, 0); @@ -569,7 +584,8 @@ void nerv_matrix_(copy_fromh)(Matrix *a, const Matrix *b, } void nerv_matrix_(copy_rows_fromh_by_idx)(Matrix *a, const Matrix *b, - const Matrix *idx, int b_begin, Status *status) { + const Matrix *idx, int b_begin, + MContext *context, Status *status) { if (!(0 <= b_begin && b_begin + a->nrow 
<= idx->ncol)) NERV_EXIT_STATUS(status, MAT_INVALID_COPY_INTERVAL, 0); if (idx->nrow != 1) diff --git a/nerv/lib/matrix/generic/mmatrix.h b/nerv/lib/matrix/generic/mmatrix.h index 2cbca47..6d17c99 100644 --- a/nerv/lib/matrix/generic/mmatrix.h +++ b/nerv/lib/matrix/generic/mmatrix.h @@ -3,46 +3,54 @@ void nerv_matrix_(add)(Matrix *c, const Matrix *a, const Matrix *b, MATRIX_ELEM alpha, MATRIX_ELEM beta, - Status *status); + MContext *context, Status *status); void nerv_matrix_(mul)(Matrix *c, const Matrix *a, const Matrix *b, MATRIX_ELEM alpha, MATRIX_ELEM beta, - int ta, int tb, Status *status); -void nerv_matrix_(sigmoid)(Matrix *a, const Matrix *b, Status *status); + int ta, int tb, + MContext *context, Status *status); +void nerv_matrix_(sigmoid)(Matrix *a, const Matrix *b, + MContext *context, Status *status); void nerv_matrix_(sigmoid_grad)(Matrix *nerr, const Matrix *err, - const Matrix *output, Status *status); + const Matrix *output, + MContext *context, Status *status); -Matrix *nerv_matrix_(softmax)(Matrix *b, const Matrix *a, Status *status); -Matrix *nerv_matrix_(rowsum)(Matrix *a, Status *status); -Matrix *nerv_matrix_(colsum)(Matrix *a, Status *status); +Matrix *nerv_matrix_(softmax)(Matrix *b, const Matrix *a, + MContext *context, Status *status); +Matrix *nerv_matrix_(rowsum)(Matrix *a, MContext *context, Status *status); +Matrix *nerv_matrix_(colsum)(Matrix *a, MContext *context, Status *status); Matrix *nerv_matrix_(colsame)(Matrix *a, const Matrix *ref, - Status *status); -Matrix *nerv_matrix_(rowmax)(Matrix *a, Status *status); + MContext *context, Status *status); +Matrix *nerv_matrix_(rowmax)(Matrix *a, MContext *context, Status *status); void nerv_matrix_(rowmax_idx)(Matrix *a, Matrix **b, Matrix **idx, - Status *status); + MContext *context, Status *status); void nerv_matrix_(add_row)(Matrix *b, const Matrix *a, double beta, - Status *status); -void nerv_matrix_(clip)(Matrix *self, double val_1, double val_2, Status *status); -void nerv_matrix_(fill)(Matrix *self, double val, Status *status); -void nerv_matrix_(diagonalize)(Matrix *self, Status *status); + MContext *context, Status *status); +void nerv_matrix_(clip)(Matrix *self, double val1, double val2, + MContext *context, Status *status); +void nerv_matrix_(diagonalize)(Matrix *self, MContext *context, Status *status); +void nerv_matrix_(fill)(Matrix *self, double val, MContext *context, Status *status); void nerv_matrix_(copy_fromh)(Matrix *a, const Matrix *b, int a_begin, int b_begin, int b_end, - Status *status); -Matrix *nerv_matrix_(trans)(Matrix *a, Status *status); + MContext *context, Status *status); +Matrix *nerv_matrix_(trans)(Matrix *a, MContext *context, Status *status); void nerv_matrix_(mul_elem)(Matrix *c, const Matrix *a, const Matrix *b, - Status *status); + MContext *context, Status *status); -void nerv_matrix_(log_elem)(Matrix *b, const Matrix *a, Status *status); +void nerv_matrix_(log_elem)(Matrix *b, const Matrix *a, + MContext *context, Status *status); -Matrix *nerv_matrix_(decompress)(const Matrix *a, int orig_col, Status *status); +Matrix *nerv_matrix_(decompress)(const Matrix *a, int orig_col, + MContext *context, Status *status); void nerv_matrix_(copy_rows_fromh_by_idx)(Matrix *a, const Matrix *b, - const Matrix *idx, int b_begin, Status *status); + const Matrix *idx, int b_begin, + MContext *context, Status *status); void nerv_matrix_(expand_frm)(Matrix *a, const Matrix *b, - int context, Status *status); + int cont, MContext *context, Status *status); void 
nerv_matrix_(rearrange_frm)(Matrix *a, const Matrix *b, - int step, Status *status); + int step, MContext *context, Status *status); void nerv_matrix_(scale_rows_by_col)(Matrix *a, const Matrix *b, - Status *status); + MContext *context, Status *status); void nerv_matrix_(scale_rows_by_row)(Matrix *a, const Matrix *b, - Status *status); -Matrix *nerv_matrix_(load)(ChunkData *cdp, Status *status); -void nerv_matrix_(save)(Matrix *self, ChunkFile *cfp, Status *status); + MContext *context, Status *status); +Matrix *nerv_matrix_(load)(ChunkData *cdp, MContext *context, Status *status); +void nerv_matrix_(save)(Matrix *self, ChunkFile *cfp, MContext *context, Status *status); diff --git a/nerv/lib/matrix/mmatrix.c b/nerv/lib/matrix/mmatrix.c index 3125ab6..e40b160 100644 --- a/nerv/lib/matrix/mmatrix.c +++ b/nerv/lib/matrix/mmatrix.c @@ -1,6 +1,40 @@ #define NERV_GENERIC_MMATRIX +#define MATRIX_CONTEXT MContext +#include <string.h> #include <stdlib.h> #include "../common.h" +#include "mmatrix.h" + +void nerv_host_context_print_profile(MContext *context) { + HashMap *profile = context->profile; + size_t i; + fprintf(stderr, "*** [nerv mmatrix profile] **\n"); + for (i = 0; i < profile->size; i++) + { + HashNode *ptr; + for (ptr = profile->bucket[i]; ptr; ptr = ptr->next) + { + fprintf(stderr, "%s:\t%.6f\n", ptr->key, *(float *)ptr->val); + } + } +} + +void nerv_host_context_clear_profile(MContext *context) { + nerv_hashmap_clear(context->profile); +} + +MContext *nerv_host_context_create(Status *status) { + MContext *context = (MContext *)malloc(sizeof(MContext)); + context->profile = nerv_hashmap_create(PROFILE_HASHMAP_SIZE, bkdr_hash, strcmp); + NERV_SET_STATUS(status, NERV_NORMAL, 0); + return context; +} + +void nerv_host_context_destroy(MContext *context, Status *status) { + nerv_hashmap_destroy(context->profile); + free(context); + NERV_SET_STATUS(status, NERV_NORMAL, 0); +} #define MATRIX_USE_FLOAT #define host_matrix_(NAME) host_matrix_float_##NAME @@ -10,9 +44,9 @@ #include "generic/elem_type.h" #include "generic/mmatrix.c" -Matrix *nerv_matrix_(perm_gen)(int ncol, Status *status) { +Matrix *nerv_matrix_(perm_gen)(int ncol, MContext *context, Status *status) { int i; - Matrix *self = nerv_matrix_(create)(1, ncol, status); + Matrix *self = nerv_matrix_(create)(1, ncol, context, status); if (status->err_code != NERV_NORMAL) return NULL; float *prow = MATRIX_ELEM_PTR_F(self); diff --git a/nerv/lib/matrix/mmatrix.h b/nerv/lib/matrix/mmatrix.h index 31e7984..6061683 100644 --- a/nerv/lib/matrix/mmatrix.h +++ b/nerv/lib/matrix/mmatrix.h @@ -1,5 +1,15 @@ #ifndef NERV_MMATRIX_H #define NERV_MMATRIX_H #include "matrix.h" -Matrix *nerv_matrix_host_float_perm_gen(int ncol, Status *status); +#include "../common.h" + +typedef struct MContext { + HashMap *profile; +} MContext; + +Matrix *nerv_matrix_host_float_perm_gen(int ncol, MContext *context, Status *status); +void nerv_host_context_print_profile(MContext *context); +void nerv_host_context_clear_profile(MContext *context); +MContext *nerv_host_context_create(Status *status); +void nerv_host_context_destroy(MContext *contex, Status *status); #endif diff --git a/nerv/matrix/cumatrix.c b/nerv/matrix/cumatrix.c index 7f22d68..b8eef9c 100644 --- a/nerv/matrix/cumatrix.c +++ b/nerv/matrix/cumatrix.c @@ -3,46 +3,75 @@ #include "../lib/matrix/cumatrix.h" #include "../lib/matrix/cuda_helper.h" #include <string.h> -#define PROFILE_HASHMAP_SIZE 123457 -static cublasHandle_t cublas_handle; -static cudaEvent_t profile_start, profile_stop; -static HashMap 
*profile; -static int select_gpu(lua_State *L) { +const char *nerv_cuda_context_tname = "nerv.CuContext"; + +int nerv_cuda_context_lua_select_gpu(lua_State *L) { Status status; - int dev = luaL_checkinteger(L, 1); - nerv_cumatrix_select_gpu(dev, &status); + nerv_cuda_context_select_gpu(luaT_checkudata(L, 1, nerv_cuda_context_tname), + luaL_checkinteger(L, 2), &status); NERV_LUA_CHECK_STATUS(L, status); return 0; } -static int print_profile(lua_State *L) { - nerv_cumatrix_print_profile(); +int nerv_cuda_context_lua_print_profile(lua_State *L) { + nerv_cuda_context_print_profile(luaT_checkudata(L, 1, nerv_cuda_context_tname)); return 0; } -static int clear_profile(lua_State *L) { - nerv_cumatrix_clear_profile(); +int nerv_cuda_context_lua_clear_profile(lua_State *L) { + nerv_cuda_context_clear_profile(luaT_checkudata(L, 1, nerv_cuda_context_tname)); return 0; } -static const luaL_Reg cumatrix_methods[] = { - {"print_profile", print_profile}, - {"clear_profile", clear_profile}, - {"select_gpu", select_gpu}, +int nerv_cuda_context_lua_new(lua_State *L) { + Status status; + int dev = lua_gettop(L) > 0 ? luaL_checkinteger(L, 1) : -1; + CuContext *self = nerv_cuda_context_create(dev, &status); + NERV_LUA_CHECK_STATUS(L, status); + luaT_pushudata(L, self, nerv_cuda_context_tname); + return 1; +} + +int nerv_cuda_context_lua_destroy(lua_State *L) { + Status status; + CuContext *self = luaT_checkudata(L, 1, nerv_cuda_context_tname); + nerv_cuda_context_destroy(self, &status); + NERV_LUA_CHECK_STATUS(L, status); + return 1; +} + +static const luaL_Reg nerv_cuda_context_methods[] = { + {"print_profile", nerv_cuda_context_lua_print_profile}, + {"clear_profile", nerv_cuda_context_lua_clear_profile}, + {"select_gpu", nerv_cuda_context_lua_select_gpu}, {NULL, NULL} }; +void nerv_cuda_context_lua_init(lua_State *L) { + luaT_newmetatable(L, nerv_cuda_context_tname, NULL, + nerv_cuda_context_lua_new, + nerv_cuda_context_lua_destroy, NULL); + luaL_register(L, NULL, nerv_cuda_context_methods); +} + extern void nerv_matrix_cuda_float_lua_init(lua_State *L); extern void nerv_matrix_cuda_double_lua_init(lua_State *L); +static const luaL_Reg cumatrix_methods[] = { + {NULL, NULL} +}; + void nerv_lua_cumatrix_init(lua_State *L) { luaL_register(L, NULL, cumatrix_methods); - nerv_cumatrix_init(); + nerv_cuda_context_lua_init(L); nerv_matrix_cuda_float_lua_init(L); nerv_matrix_cuda_double_lua_init(L); } +#define MATRIX_CONTEXT CuContext +#define MATRIX_CONTEXT_TNAME nerv_cuda_context_tname + #define MATRIX_USE_FLOAT #define cuda_matrix_(NAME) cuda_matrix_float_##NAME #define nerv_matrix_(NAME) nerv_matrix_cuda_float_##NAME diff --git a/nerv/matrix/generic/cumatrix.c b/nerv/matrix/generic/cumatrix.c index f8b8038..0c90d39 100644 --- a/nerv/matrix/generic/cumatrix.c +++ b/nerv/matrix/generic/cumatrix.c @@ -1,4 +1,6 @@ #ifdef NERV_GENERIC_CUMATRIX +#include "../matrix.h" +#include "../../lib/matrix/generic/matrix.h" #include "../../lib/matrix/generic/elem_type.h" #define MATRIX_DATA_WRITE(L, data, idx, val) cuda_matrix_(write)(L, data, idx, val) #define MATRIX_DATA_READ(L, data, idx) cuda_matrix_(read)(L, data, idx) @@ -7,7 +9,6 @@ #define NERV_GENERIC_MATRIX #define NERV_GENERIC_CUKERNEL #include "../../lib/common.h" -#include "../../lib/matrix/generic/matrix.h" #include "../../lib/matrix/generic/cumatrix.h" #define BLAS_OP_N CUBLAS_OP_N @@ -17,48 +18,58 @@ static int nerv_matrix_(lua_get_blas_op)(char ch) { static int nerv_matrix_(lua_prefixsum_row)(lua_State *L) { Status status; + MATRIX_CONTEXT *context; + 
MATRIX_GET_CONTEXT(L, 3); Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname)); Matrix *b = luaT_checkudata(L, 2, nerv_matrix_(tname)); - nerv_matrix_(prefixsum_row)(a, b, &status); + nerv_matrix_(prefixsum_row)(a, b, context, &status); NERV_LUA_CHECK_STATUS(L, status); return 0; } static int nerv_matrix_(lua_thres_mask)(lua_State *L) { Status status; + MATRIX_CONTEXT *context; + MATRIX_GET_CONTEXT(L, 6); Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname)); Matrix *b = luaT_checkudata(L, 2, nerv_matrix_(tname)); MATRIX_ELEM thres = luaL_checknumber(L, 3); MATRIX_ELEM low = luaL_checknumber(L, 4); MATRIX_ELEM high = luaL_checknumber(L, 5); - nerv_matrix_(thres_mask)(a, b, thres, low, high, &status); + nerv_matrix_(thres_mask)(a, b, thres, low, high, context, &status); NERV_LUA_CHECK_STATUS(L, status); return 0; } static int nerv_matrix_(lua_rand_uniform)(lua_State *L) { Status status; + MATRIX_CONTEXT *context; + MATRIX_GET_CONTEXT(L, 2); Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname)); - nerv_matrix_(rand_uniform)(a, &status); + nerv_matrix_(rand_uniform)(a, context, &status); NERV_LUA_CHECK_STATUS(L, status); return 0; } static int nerv_matrix_(lua_tanh)(lua_State *L) { Status status; + MATRIX_CONTEXT *context; + MATRIX_GET_CONTEXT(L, 3); Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname)); Matrix *b = luaT_checkudata(L, 2, nerv_matrix_(tname)); - nerv_matrix_(tanh)(a, b, &status); + nerv_matrix_(tanh)(a, b, context, &status); NERV_LUA_CHECK_STATUS(L, status); return 0; } static int nerv_matrix_(lua_tanh_grad)(lua_State *L) { Status status; + MATRIX_CONTEXT *context; + MATRIX_GET_CONTEXT(L, 4); Matrix *nerr = luaT_checkudata(L, 1, nerv_matrix_(tname)); Matrix *err = luaT_checkudata(L, 2, nerv_matrix_(tname)); Matrix *output = luaT_checkudata(L, 3, nerv_matrix_(tname)); - nerv_matrix_(tanh_grad)(nerr, err, output, &status); + nerv_matrix_(tanh_grad)(nerr, err, output, context, &status); NERV_LUA_CHECK_STATUS(L, status); return 0; } @@ -66,39 +77,45 @@ static int nerv_matrix_(lua_tanh_grad)(lua_State *L) { extern const char *MATRIX_CUMATRIX_HOST_TNAME; static int nerv_matrix_(lua_copy_fromh)(lua_State *L) { Status status; + MATRIX_CONTEXT *context; + MATRIX_GET_CONTEXT(L, 6); Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname)); const Matrix *b = luaT_checkudata(L, 2, MATRIX_CUMATRIX_HOST_TNAME); int nargs = lua_gettop(L); int b_begin = nargs > 2 ? luaL_checkinteger(L, 3) : 0; int b_end = nargs > 3 ? luaL_checkinteger(L, 4) : b->nrow; int a_begin = nargs > 4 ? luaL_checkinteger(L, 5) : 0; - nerv_matrix_(copy_fromh)(a, b, a_begin, b_begin, b_end, &status); + nerv_matrix_(copy_fromh)(a, b, a_begin, b_begin, b_end, context, &status); NERV_LUA_CHECK_STATUS(L, status); return 0; } static int nerv_matrix_(lua_copy_toh)(lua_State *L) { Status status; + MATRIX_CONTEXT *context; + MATRIX_GET_CONTEXT(L, 6); Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname)); const Matrix *b = luaT_checkudata(L, 2, MATRIX_CUMATRIX_HOST_TNAME); int nargs = lua_gettop(L); int a_begin = nargs > 2 ? luaL_checkinteger(L, 3) : 0; int a_end = nargs > 3 ? luaL_checkinteger(L, 4) : a->nrow; int b_begin = nargs > 4 ? 
luaL_checkinteger(L, 5) : 0; - nerv_matrix_(copy_toh)(a, b, a_begin, a_end, b_begin, &status); + nerv_matrix_(copy_toh)(a, b, a_begin, a_end, b_begin, context, &status); NERV_LUA_CHECK_STATUS(L, status); return 0; } static int nerv_matrix_(lua_copy_fromd)(lua_State *L) { Status status; + MATRIX_CONTEXT *context; + MATRIX_GET_CONTEXT(L, 6); Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname)); const Matrix *b = luaT_checkudata(L, 2, nerv_matrix_(tname)); int nargs = lua_gettop(L); int b_begin = nargs > 2 ? luaL_checkinteger(L, 3) : 0; int b_end = nargs > 3 ? luaL_checkinteger(L, 4) : b->nrow; int a_begin = nargs > 4 ? luaL_checkinteger(L, 5) : 0; - nerv_matrix_(copy_fromd)(a, b, a_begin, b_begin, b_end, &status); + nerv_matrix_(copy_fromd)(a, b, a_begin, b_begin, b_end, context, &status); NERV_LUA_CHECK_STATUS(L, status); return 0; } @@ -106,36 +123,42 @@ static int nerv_matrix_(lua_copy_fromd)(lua_State *L) { extern const char *nerv_matrix_host_float_tname; static int nerv_matrix_(lua_copy_rows_fromh_by_idx)(lua_State *L) { Status status; + MATRIX_CONTEXT *context; + MATRIX_GET_CONTEXT(L, 5); Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname)); const Matrix *b = luaT_checkudata(L, 2, MATRIX_CUMATRIX_HOST_TNAME); const Matrix *idx = luaT_checkudata(L, 3, nerv_matrix_host_float_tname); long nrow = a->nrow; int b_begin = lua_gettop(L) > 3 ? luaL_checkinteger(L, 4) : 0; - nerv_matrix_(copy_rows_fromh_by_idx)(a, b, idx, b_begin, &status); + nerv_matrix_(copy_rows_fromh_by_idx)(a, b, idx, b_begin, context, &status); NERV_LUA_CHECK_STATUS(L, status); return 0; } static int nerv_matrix_(lua_copy_rows_fromd_by_idx)(lua_State *L) { Status status; + MATRIX_CONTEXT *context; + MATRIX_GET_CONTEXT(L, 5); Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname)); const Matrix *b = luaT_checkudata(L, 2, nerv_matrix_(tname)); const Matrix *idx = luaT_checkudata(L, 3, nerv_matrix_(tname)); long nrow = a->nrow; int idx_begin = lua_gettop(L) > 3 ? luaL_checkinteger(L, 4) : 0; - nerv_matrix_(copy_rows_fromd_by_idx)(a, b, idx, idx_begin, &status); + nerv_matrix_(copy_rows_fromd_by_idx)(a, b, idx, idx_begin, context, &status); NERV_LUA_CHECK_STATUS(L, status); return 0; } static int nerv_matrix_(lua_copy_rows_fromd_by_colidx)(lua_State *L) { Status status; + MATRIX_CONTEXT *context; + MATRIX_GET_CONTEXT(L, 5); Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname)); const Matrix *b = luaT_checkudata(L, 2, nerv_matrix_(tname)); const Matrix *idx = luaT_checkudata(L, 3, nerv_matrix_(tname)); long nrow = a->nrow; int idx_begin = lua_gettop(L) > 3 ? luaL_checkinteger(L, 4) : 0; - nerv_matrix_(copy_rows_fromd_by_colidx)(a, b, idx, idx_begin, &status); + nerv_matrix_(copy_rows_fromd_by_colidx)(a, b, idx, idx_begin, context, &status); NERV_LUA_CHECK_STATUS(L, status); return 0; } @@ -145,12 +168,14 @@ static int nerv_matrix_(lua_update_select_rows_by_rowidx)(lua_State *L) { /* update c's select rows, * i.e. 
c[idx[i]] = c[idx[i]] * (1 - beta * alpha) + a[i] * alpha */ Status status; + MATRIX_CONTEXT *context; + MATRIX_GET_CONTEXT(L, 6); Matrix *c = luaT_checkudata(L, 1, nerv_matrix_(tname)); const Matrix *a = luaT_checkudata(L, 2, nerv_matrix_(tname)); const Matrix *idx = luaT_checkudata(L, 3, nerv_matrix_(tname)); MATRIX_ELEM alpha = luaL_checknumber(L, 4); MATRIX_ELEM beta = luaL_checknumber(L, 5); - nerv_matrix_(update_select_rows_by_rowidx)(c, a, idx, alpha, beta, &status); + nerv_matrix_(update_select_rows_by_rowidx)(c, a, idx, alpha, beta, context, &status); NERV_LUA_CHECK_STATUS(L, status); return 0; } @@ -159,12 +184,14 @@ static int nerv_matrix_(lua_update_select_rows_by_colidx)(lua_State *L) { /* update c's select rows, * i.e. c[idx[i]] = c[idx[i]] * (1 - beta * alpha) + a[i] * alpha */ Status status; + MATRIX_CONTEXT *context; + MATRIX_GET_CONTEXT(L, 6); Matrix *c = luaT_checkudata(L, 1, nerv_matrix_(tname)); const Matrix *a = luaT_checkudata(L, 2, nerv_matrix_(tname)); const Matrix *idx = luaT_checkudata(L, 3, nerv_matrix_(tname)); MATRIX_ELEM alpha = luaL_checknumber(L, 4); MATRIX_ELEM beta = luaL_checknumber(L, 5); - nerv_matrix_(update_select_rows_by_colidx)(c, a, idx, alpha, beta, &status); + nerv_matrix_(update_select_rows_by_colidx)(c, a, idx, alpha, beta, context, &status); NERV_LUA_CHECK_STATUS(L, status); return 0; } diff --git a/nerv/matrix/generic/matrix.c b/nerv/matrix/generic/matrix.c index 3162ffb..b544dd9 100644 --- a/nerv/matrix/generic/matrix.c +++ b/nerv/matrix/generic/matrix.c @@ -1,15 +1,18 @@ #ifdef NERV_GENERIC_MATRIX -#include "../../lib/common.h" +#include "../matrix.h" #include "../../lib/matrix/generic/matrix.h" +#include "../../lib/common.h" extern const char *nerv_matrix_(tname); extern const char *MATRIX_BASE_TNAME; - int nerv_matrix_(lua_new)(lua_State *L) { Status status; + MATRIX_CONTEXT *context; + MATRIX_GET_CONTEXT(L, 3); Matrix *self = nerv_matrix_(create)(luaL_checkinteger(L, 1), - luaL_checkinteger(L, 2), &status); + luaL_checkinteger(L, 2), + context, &status); NERV_LUA_CHECK_STATUS(L, status); luaT_pushudata(L, self, nerv_matrix_(tname)); return 1; @@ -17,8 +20,10 @@ int nerv_matrix_(lua_new)(lua_State *L) { int nerv_matrix_(lua_destroy)(lua_State *L) { Status status; + MATRIX_CONTEXT *context; + MATRIX_GET_CONTEXT(L, 2); Matrix *self = luaT_checkudata(L, 1, nerv_matrix_(tname)); - nerv_matrix_(destroy)(self, &status); + nerv_matrix_(destroy)(self, context, &status); NERV_LUA_CHECK_STATUS(L, status); return 1; } @@ -128,18 +133,22 @@ void nerv_matrix_(lua_init)(lua_State *L) { static int nerv_matrix_(lua_add)(lua_State *L) { Status status; + MATRIX_CONTEXT *context; + MATRIX_GET_CONTEXT(L, 6); Matrix *c = luaT_checkudata(L, 1, nerv_matrix_(tname)); const Matrix *a = luaT_checkudata(L, 2, nerv_matrix_(tname)); const Matrix *b = luaT_checkudata(L, 3, nerv_matrix_(tname)); MATRIX_ELEM alpha = luaL_checknumber(L, 4); MATRIX_ELEM beta = luaL_checknumber(L, 5); - nerv_matrix_(add)(c, a, b, alpha, beta, &status); + nerv_matrix_(add)(c, a, b, alpha, beta, context, &status); NERV_LUA_CHECK_STATUS(L, status); return 0; } static int nerv_matrix_(lua_mul)(lua_State *L) { Status status; + MATRIX_CONTEXT *context; + MATRIX_GET_CONTEXT(L, 8); Matrix *c = luaT_checkudata(L, 1, nerv_matrix_(tname)); Matrix *a = luaT_checkudata(L, 2, nerv_matrix_(tname)); Matrix *b = luaT_checkudata(L, 3, nerv_matrix_(tname)); @@ -150,35 +159,41 @@ static int nerv_matrix_(lua_mul)(lua_State *L) { : BLAS_OP_N; int tb = nargs > 6 ? 
nerv_matrix_(lua_get_blas_op)(*luaL_checkstring(L, 7)) \ : BLAS_OP_N; - nerv_matrix_(mul)(c, a, b, alpha, beta, ta, tb, &status); + nerv_matrix_(mul)(c, a, b, alpha, beta, ta, tb, context, &status); NERV_LUA_CHECK_STATUS(L, status); return 0; } static int nerv_matrix_(lua_sigmoid)(lua_State *L) { Status status; + MATRIX_CONTEXT *context; + MATRIX_GET_CONTEXT(L, 3); Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname)); Matrix *b = luaT_checkudata(L, 2, nerv_matrix_(tname)); - nerv_matrix_(sigmoid)(a, b, &status); + nerv_matrix_(sigmoid)(a, b, context, &status); NERV_LUA_CHECK_STATUS(L, status); return 0; } static int nerv_matrix_(lua_sigmoid_grad)(lua_State *L) { Status status; + MATRIX_CONTEXT *context; + MATRIX_GET_CONTEXT(L, 4); Matrix *nerr = luaT_checkudata(L, 1, nerv_matrix_(tname)); Matrix *err = luaT_checkudata(L, 2, nerv_matrix_(tname)); Matrix *output = luaT_checkudata(L, 3, nerv_matrix_(tname)); - nerv_matrix_(sigmoid_grad)(nerr, err, output, &status); + nerv_matrix_(sigmoid_grad)(nerr, err, output, context, &status); NERV_LUA_CHECK_STATUS(L, status); return 0; } static int nerv_matrix_(lua_softmax)(lua_State *L) { Status status; + MATRIX_CONTEXT *context; + MATRIX_GET_CONTEXT(L, 3); Matrix *a = luaT_checkudata(L, 2, nerv_matrix_(tname)); Matrix *b = luaT_checkudata(L, 1, nerv_matrix_(tname)); - Matrix *max_idx = nerv_matrix_(softmax)(b, a, &status); + Matrix *max_idx = nerv_matrix_(softmax)(b, a, context, &status); NERV_LUA_CHECK_STATUS(L, status); luaT_pushudata(L, max_idx, nerv_matrix_(tname)); return 1; @@ -186,8 +201,10 @@ static int nerv_matrix_(lua_softmax)(lua_State *L) { static int nerv_matrix_(lua_rowsum)(lua_State *L) { Status status; + MATRIX_CONTEXT *context; + MATRIX_GET_CONTEXT(L, 2); Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname)); - Matrix *b = nerv_matrix_(rowsum)(a, &status); + Matrix *b = nerv_matrix_(rowsum)(a, context, &status); NERV_LUA_CHECK_STATUS(L, status); luaT_pushudata(L, b, nerv_matrix_(tname)); return 1; @@ -195,8 +212,10 @@ static int nerv_matrix_(lua_rowsum)(lua_State *L) { static int nerv_matrix_(lua_colsum)(lua_State *L) { Status status; + MATRIX_CONTEXT *context; + MATRIX_GET_CONTEXT(L, 2); Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname)); - Matrix *b = nerv_matrix_(colsum)(a, &status); + Matrix *b = nerv_matrix_(colsum)(a, context, &status); NERV_LUA_CHECK_STATUS(L, status); luaT_pushudata(L, b, nerv_matrix_(tname)); return 1; @@ -204,9 +223,11 @@ static int nerv_matrix_(lua_colsum)(lua_State *L) { static int nerv_matrix_(lua_colsame)(lua_State *L) { Status status; + MATRIX_CONTEXT *context; + MATRIX_GET_CONTEXT(L, 3); Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname)); const Matrix *ref = luaT_checkudata(L, 2, nerv_matrix_(tname)); - Matrix *b = nerv_matrix_(colsame)(a, ref, &status); + Matrix *b = nerv_matrix_(colsame)(a, ref, context, &status); NERV_LUA_CHECK_STATUS(L, status); luaT_pushudata(L, b, nerv_matrix_(tname)); return 1; @@ -214,8 +235,10 @@ static int nerv_matrix_(lua_colsame)(lua_State *L) { static int nerv_matrix_(lua_rowmax)(lua_State *L) { Status status; + MATRIX_CONTEXT *context; + MATRIX_GET_CONTEXT(L, 2); Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname)); - Matrix *b = nerv_matrix_(rowmax)(a, &status); + Matrix *b = nerv_matrix_(rowmax)(a, context, &status); NERV_LUA_CHECK_STATUS(L, status); luaT_pushudata(L, b, nerv_matrix_(tname)); return 1; @@ -223,10 +246,12 @@ static int nerv_matrix_(lua_rowmax)(lua_State *L) { static int nerv_matrix_(lua_rowmax_idx)(lua_State *L) { Status status; + 
MATRIX_CONTEXT *context; + MATRIX_GET_CONTEXT(L, 2); Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname)); Matrix *b; Matrix *idx; - nerv_matrix_(rowmax_idx)(a, &b, &idx, &status); + nerv_matrix_(rowmax_idx)(a, &b, &idx, context, &status); NERV_LUA_CHECK_STATUS(L, status); luaT_pushudata(L, b, nerv_matrix_(tname)); luaT_pushudata(L, idx, nerv_matrix_(tname)); @@ -235,37 +260,45 @@ static int nerv_matrix_(lua_rowmax_idx)(lua_State *L) { static int nerv_matrix_(lua_add_row)(lua_State *L) { Status status; + MATRIX_CONTEXT *context; + MATRIX_GET_CONTEXT(L, 4); const Matrix *a = luaT_checkudata(L, 2, nerv_matrix_(tname)); Matrix *b = luaT_checkudata(L, 1, nerv_matrix_(tname)); double beta = luaL_checknumber(L, 3); - nerv_matrix_(add_row)(b, a, beta, &status); + nerv_matrix_(add_row)(b, a, beta, context, &status); NERV_LUA_CHECK_STATUS(L, status); return 0; } static int nerv_matrix_(lua_fill)(lua_State *L) { Status status; + MATRIX_CONTEXT *context; + MATRIX_GET_CONTEXT(L, 3); Matrix *self = luaT_checkudata(L, 1, nerv_matrix_(tname)); double val = luaL_checknumber(L, 2); - nerv_matrix_(fill)(self, val, &status); + nerv_matrix_(fill)(self, val, context, &status); NERV_LUA_CHECK_STATUS(L, status); return 0; } static int nerv_matrix_(lua_clip)(lua_State *L) { Status status; + MATRIX_CONTEXT *context; + MATRIX_GET_CONTEXT(L, 4); Matrix *self = luaT_checkudata(L, 1, nerv_matrix_(tname)); - double val_1 = luaL_checknumber(L, 2); - double val_2 = luaL_checknumber(L, 3); - nerv_matrix_(clip)(self, val_1, val_2, &status); + double val1 = luaL_checknumber(L, 2); + double val2 = luaL_checknumber(L, 3); + nerv_matrix_(clip)(self, val1, val2, context, &status); NERV_LUA_CHECK_STATUS(L, status); return 0; } static int nerv_matrix_(lua_trans)(lua_State *L) { Status status; + MATRIX_CONTEXT *context; + MATRIX_GET_CONTEXT(L, 2); Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname)); - Matrix *b = nerv_matrix_(trans)(a, &status); + Matrix *b = nerv_matrix_(trans)(a, context, &status); NERV_LUA_CHECK_STATUS(L, status); luaT_pushudata(L, b, nerv_matrix_(tname)); return 1; @@ -273,28 +306,34 @@ static int nerv_matrix_(lua_trans)(lua_State *L) { static int nerv_matrix_(lua_mul_elem)(lua_State *L) { Status status; + MATRIX_CONTEXT *context; + MATRIX_GET_CONTEXT(L, 4); const Matrix *a = luaT_checkudata(L, 2, nerv_matrix_(tname)); const Matrix *b = luaT_checkudata(L, 3, nerv_matrix_(tname)); Matrix *c = luaT_checkudata(L, 1, nerv_matrix_(tname)); - nerv_matrix_(mul_elem)(c, a, b, &status); + nerv_matrix_(mul_elem)(c, a, b, context, &status); NERV_LUA_CHECK_STATUS(L, status); return 0; } static int nerv_matrix_(lua_log_elem)(lua_State *L) { Status status; + MATRIX_CONTEXT *context; + MATRIX_GET_CONTEXT(L, 3); const Matrix *a = luaT_checkudata(L, 2, nerv_matrix_(tname)); Matrix *b = luaT_checkudata(L, 1, nerv_matrix_(tname)); - nerv_matrix_(log_elem)(b, a, &status); + nerv_matrix_(log_elem)(b, a, context, &status); NERV_LUA_CHECK_STATUS(L, status); return 0; } static int nerv_matrix_(lua_decompress)(lua_State *L) { Status status; + MATRIX_CONTEXT *context; + MATRIX_GET_CONTEXT(L, 3); const Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname)); int orig_col = luaL_checkinteger(L, 2); - Matrix *b = nerv_matrix_(decompress)(a, orig_col, &status); + Matrix *b = nerv_matrix_(decompress)(a, orig_col, context, &status); NERV_LUA_CHECK_STATUS(L, status); luaT_pushudata(L, b, nerv_matrix_(tname)); return 1; @@ -302,38 +341,46 @@ static int nerv_matrix_(lua_decompress)(lua_State *L) { static int 
nerv_matrix_(lua_expand_frm)(lua_State *L) { Status status; + MATRIX_CONTEXT *context; + MATRIX_GET_CONTEXT(L, 4); Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname)); const Matrix *b = luaT_checkudata(L, 2, nerv_matrix_(tname)); - int context = luaL_checkinteger(L, 3); - nerv_matrix_(expand_frm)(a, b, context, &status); + int cont = luaL_checkinteger(L, 3); + nerv_matrix_(expand_frm)(a, b, cont, context, &status); NERV_LUA_CHECK_STATUS(L, status); return 0; } static int nerv_matrix_(lua_rearrange_frm)(lua_State *L) { Status status; + MATRIX_CONTEXT *context; + MATRIX_GET_CONTEXT(L, 4); Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname)); const Matrix *b = luaT_checkudata(L, 2, nerv_matrix_(tname)); int step = luaL_checkinteger(L, 3); - nerv_matrix_(rearrange_frm)(a, b, step, &status); + nerv_matrix_(rearrange_frm)(a, b, step, context, &status); NERV_LUA_CHECK_STATUS(L, status); return 0; } static int nerv_matrix_(lua_scale_rows_by_col)(lua_State *L) { Status status; + MATRIX_CONTEXT *context; + MATRIX_GET_CONTEXT(L, 3); Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname)); const Matrix *b = luaT_checkudata(L, 2, nerv_matrix_(tname)); - nerv_matrix_(scale_rows_by_col)(a, b, &status); + nerv_matrix_(scale_rows_by_col)(a, b, context, &status); NERV_LUA_CHECK_STATUS(L, status); return 0; } static int nerv_matrix_(lua_scale_rows_by_row)(lua_State *L) { Status status; + MATRIX_CONTEXT *context; + MATRIX_GET_CONTEXT(L, 3); Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname)); const Matrix *b = luaT_checkudata(L, 2, nerv_matrix_(tname)); - nerv_matrix_(scale_rows_by_row)(a, b, &status); + nerv_matrix_(scale_rows_by_row)(a, b, context, &status); NERV_LUA_CHECK_STATUS(L, status); return 0; } diff --git a/nerv/matrix/generic/mmatrix.c b/nerv/matrix/generic/mmatrix.c index 1665eff..a5e5969 100644 --- a/nerv/matrix/generic/mmatrix.c +++ b/nerv/matrix/generic/mmatrix.c @@ -1,4 +1,5 @@ #ifdef NERV_GENERIC_MMATRIX +#include "../matrix.h" #include "../../lib/matrix/generic/matrix.h" #include "../../lib/matrix/generic/elem_type.h" #define MATRIX_DATA_WRITE(L, data, idx, val) (data[idx] = val) @@ -7,10 +8,10 @@ #define MATRIX_BASE_TNAME nerv_matrix_host_tname #define NERV_GENERIC_MATRIX #include "../../lib/common.h" +#include "../../lib/cblas.h" #include "../../lib/matrix/generic/mmatrix.h" #include "../../io/chunk_file.h" #include <string.h> -#include <cblas.h> #define BLAS_OP_N CblasNoTrans static int nerv_matrix_(lua_get_blas_op)(char ch) { @@ -48,8 +49,10 @@ static void host_matrix_(init)(lua_State *L) { static int nerv_matrix_(lua_load)(lua_State *L) { Status status; + MATRIX_CONTEXT *context; + MATRIX_GET_CONTEXT(L, 2); ChunkData *cdp = luaT_checkudata(L, 1, nerv_chunk_data_tname); - Matrix *self = nerv_matrix_(load)(cdp, &status); + Matrix *self = nerv_matrix_(load)(cdp, context, &status); NERV_LUA_CHECK_STATUS(L, status); luaT_pushudata(L, self, nerv_matrix_(tname)); return 1; @@ -57,23 +60,27 @@ static int nerv_matrix_(lua_load)(lua_State *L) { static int nerv_matrix_(lua_save)(lua_State *L) { Status status; + MATRIX_CONTEXT *context; + MATRIX_GET_CONTEXT(L, 3); ChunkFile *cfp = luaT_checkudata(L, 2, nerv_chunk_file_handle_tname); Matrix *self = luaT_checkudata(L, 1, nerv_matrix_(tname)); - nerv_matrix_(save)(self, cfp, &status); + nerv_matrix_(save)(self, cfp, context, &status); NERV_LUA_CHECK_STATUS(L, status); return 0; } static int nerv_matrix_(lua_copy_fromh)(lua_State *L) { Status status; + MATRIX_CONTEXT *context; + MATRIX_GET_CONTEXT(L, 6); Matrix *a = luaT_checkudata(L, 1, 
nerv_matrix_(tname)); const Matrix *b = luaT_checkudata(L, 2, nerv_matrix_(tname)); int nargs = lua_gettop(L); int b_begin = nargs > 2 ? luaL_checkinteger(L, 3) : 0; int b_end = nargs > 3 ? luaL_checkinteger(L, 4) : b->nrow; int a_begin = nargs > 4 ? luaL_checkinteger(L, 5) : 0; - nerv_matrix_(copy_fromh)(a, b, a_begin, b_begin, b_end, &status); + nerv_matrix_(copy_fromh)(a, b, a_begin, b_begin, b_end, context, &status); NERV_LUA_CHECK_STATUS(L, status); return 0; } @@ -81,12 +88,14 @@ static int nerv_matrix_(lua_copy_fromh)(lua_State *L) { static int nerv_matrix_(lua_copy_rows_fromh_by_idx)(lua_State *L) { Status status; - Matrix *a=luaT_checkudata(L,1,nerv_matrix_(tname)); - const Matrix *b=luaT_checkudata(L,2,nerv_matrix_(tname)); - const Matrix *idx=luaT_checkudata(L,3,nerv_matrix_(tname)); - int b_begin=lua_gettop(L)>3?luaL_checkinteger(L,4):0; - nerv_matrix_(copy_rows_fromh_by_idx)(a,b,idx,b_begin,&status); - NERV_LUA_CHECK_STATUS(L,status); + MATRIX_CONTEXT *context; + MATRIX_GET_CONTEXT(L, 5); + Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname)); + const Matrix *b = luaT_checkudata(L, 2, nerv_matrix_(tname)); + const Matrix *idx = luaT_checkudata(L, 3, nerv_matrix_(tname)); + int b_begin = lua_gettop(L) > 3 ? luaL_checkinteger(L, 4) : 0; + nerv_matrix_(copy_rows_fromh_by_idx)(a, b, idx, b_begin, context, &status); + NERV_LUA_CHECK_STATUS(L, status); return 0; } diff --git a/nerv/matrix/init.lua b/nerv/matrix/init.lua index da76e1b..722c780 100644 --- a/nerv/matrix/init.lua +++ b/nerv/matrix/init.lua @@ -40,7 +40,8 @@ end --- Assign each element in a matrix using the value returned by a callback `gen`. -- @param gen the callback used to generated the values in the matrix, to which -- the indices of row and column will be passed (e.g., `gen(i, j)`) -function nerv.Matrix:generate(gen) + +function nerv.Matrix:_generate(gen) if (self:dim() == 2) then for i = 0, self:nrow() - 1 do local row = self[i] @@ -55,6 +56,21 @@ function nerv.Matrix:generate(gen) end end +function nerv.Matrix:generate(gen) + local tmp + if nerv.is_type(self, 'nerv.CuMatrixFloat') then + tmp = nerv.MMatrixFloat(self:nrow(), self:ncol()) + elseif nerv.is_type(self, 'nerv.CuMatrixDouble') then + tmp = nerv.MMatrixDouble(self:nrow(), self:ncol()) + else + tmp = self + end + tmp:_generate(gen) + if nerv.is_type(self, 'nerv.CuMatrix') then + self:copy_fromh(tmp) + end +end + --- Create a fresh new matrix of the same matrix type (as `self`). -- @param nrow optional, the number of rows in the created matrix if specified, -- otherwise `self:nrow()` will be used @@ -87,6 +103,17 @@ function nerv.Matrix:__mul__(b) return c end +--- A wrapper function for `copy_from` +function nerv.Matrix:copy_to(b, ...) + b:copy_from(self, ...) +end + +--- The base class for all device (in-GPU) matrices +-- @type nerv.CuMatrix + +--- A wrapper function for `copy_fromd` +nerv.CuMatrix.copy_tod = nerv.Matrix.copy_to + --- CUDA float matrices -- @type nerv.CuMatrixFloat @@ -127,15 +154,14 @@ end -- @type nerv.MMatrix --- A wrapper function for `copy_fromh` -function nerv.MMatrix:copy_toh(b, ...) - b:copy_fromh(self, ...) -end +nerv.MMatrix.copy_toh = nerv.Matrix.copy_to ---- Print profiling info of host matrices -function nerv.MMatrix.print_profile() - nerv.info("mmatrix profile not available") +--- A wrapper function for `nerv.CuMatrix` copy +function nerv.MMatrix:copy_fromd(b, ...) + b:copy_toh(self, ...) 
end ---- Clear profiling info of host matrices -function nerv.MMatrix.clear_profile() +--- A wrapper function for `nerv.CuMatrix` copy +function nerv.MMatrix:copy_tod(b, ...) + b:copy_fromh(self, ...) end diff --git a/nerv/matrix/matrix.h b/nerv/matrix/matrix.h new file mode 100644 index 0000000..788f596 --- /dev/null +++ b/nerv/matrix/matrix.h @@ -0,0 +1,24 @@ +#ifndef NERV_LUA_MATRIX_H +#define NERV_LUA_MATRIX_H +#include "../lib/luaT/luaT.h" +#define _MATRIX_GET_CONTEXT(L, p, tname, ctname) \ + do { \ + if (lua_gettop(L) < p) \ + { \ + luaT_pushmetatable(L, tname); \ + lua_getfield(L, -1, "_default_context"); \ + context = luaT_checkudata(L, -1, ctname); \ + lua_pop(L, 2); \ + } \ + else \ + { \ + context = luaT_checkudata(L, p, ctname); \ + } \ + } while (0) + +extern const char *nerv_cuda_context_tname; +extern const char *nerv_host_context_tname; +extern const char *nerv_matrix_host_tname; +#define MATRIX_GET_CONTEXT(L, p) _MATRIX_GET_CONTEXT(L, p, nerv_matrix_(tname), MATRIX_CONTEXT_TNAME) +#define MMATRIX_GET_CONTEXT(L, p) _MATRIX_GET_CONTEXT(L, p, nerv_matrix_host_tname, nerv_host_context_tname) +#endif diff --git a/nerv/matrix/mmatrix.c b/nerv/matrix/mmatrix.c index a68506d..45cb238 100644 --- a/nerv/matrix/mmatrix.c +++ b/nerv/matrix/mmatrix.c @@ -1,17 +1,64 @@ #define NERV_GENERIC_MMATRIX #include <stdlib.h> +#include "../lib/matrix/mmatrix.h" #include "../lib/common.h" + +const char *nerv_host_context_tname = "nerv.MContext"; + +int nerv_host_context_lua_print_profile(lua_State *L) { + nerv_host_context_print_profile(luaT_checkudata(L, 1, nerv_host_context_tname)); + return 0; +} + +int nerv_host_context_lua_clear_profile(lua_State *L) { + nerv_host_context_clear_profile(luaT_checkudata(L, 1, nerv_host_context_tname)); + return 0; +} + +int nerv_host_context_lua_new(lua_State *L) { + Status status; + MContext *self = nerv_host_context_create(&status); + NERV_LUA_CHECK_STATUS(L, status); + luaT_pushudata(L, self, nerv_host_context_tname); + return 1; +} + +int nerv_host_context_lua_destroy(lua_State *L) { + Status status; + MContext *self = luaT_checkudata(L, 1, nerv_host_context_tname); + nerv_host_context_destroy(self, &status); + NERV_LUA_CHECK_STATUS(L, status); + return 1; +} + +static const luaL_Reg nerv_host_context_methods[] = { + {"print_profile", nerv_host_context_lua_print_profile}, + {"clear_profile", nerv_host_context_lua_clear_profile}, + {NULL, NULL} +}; + +void nerv_host_context_lua_init(lua_State *L) { + luaT_newmetatable(L, nerv_host_context_tname, NULL, + nerv_host_context_lua_new, + nerv_host_context_lua_destroy, NULL); + luaL_register(L, NULL, nerv_host_context_methods); +} + void nerv_matrix_host_float_lua_init(lua_State *L); void nerv_matrix_host_double_lua_init(lua_State *L); void nerv_matrix_host_int_lua_init(lua_State *L); void nerv_lua_mmatrix_init(lua_State *L) { srand(1); + nerv_host_context_lua_init(L); nerv_matrix_host_float_lua_init(L); nerv_matrix_host_double_lua_init(L); nerv_matrix_host_int_lua_init(L); } +#define MATRIX_CONTEXT MContext +#define MATRIX_CONTEXT_TNAME nerv_host_context_tname + #define MATRIX_USE_FLOAT #define host_matrix_(NAME) host_matrix_float_##NAME #define nerv_matrix_(NAME) nerv_matrix_host_float_##NAME @@ -29,8 +76,10 @@ static void host_matrix_(init_extra)(lua_State *L) { static int nerv_matrix_(lua_perm_gen)(lua_State *L) { Status status; + MATRIX_CONTEXT *context; + MATRIX_GET_CONTEXT(L, 2); int i, ncol = luaL_checkinteger(L, 1); - Matrix *self = nerv_matrix_(perm_gen)(ncol, &status); + Matrix *self = 
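A quick illustration of the reworked generate() above (not part of the patch): the callback now always runs on a host matrix, and the result is pushed to the device with copy_fromh only when self is a CuMatrix. The constructors and the 0-based gen(i, j) signature are taken from the surrounding code; the device variant assumes a CUDA-enabled build.

    -- host matrix: filled in place, no copy involved
    local h = nerv.MMatrixFloat(3, 3)
    h:generate(function (i, j) return i + j end)

    -- device matrix: generate() builds a temporary MMatrixFloat, runs the
    -- callback on it, then copies the values over with copy_fromh()
    local d = nerv.CuMatrixFloat(3, 3)
    d:generate(function (i, j) return (i == j) and 1.0 or 0.0 end)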
nerv_matrix_(perm_gen)(ncol, context, &status); NERV_LUA_CHECK_STATUS(L, status); luaT_pushudata(L, self, nerv_matrix_(tname)); return 1; @@ -1,9 +1,47 @@ #! /usr/bin/env luajit require 'nerv' -print("Greetings") -if #arg < 1 then +local options = {{"help", "h", "boolean", default = false, desc = "print this help message"}, + {"use-cpu", "c", "boolean", default = false, desc = "use CPU by default (instead of gpu by default)"}, + {"select-gpu", nil, "int", default = -1, desc = "select the GPU for computation, fallback to auto mode if not specified"}} +econf = {} -- environment configuration + +local function print_help() + nerv.printf("Usage: <nerv_prog> [options] script.lua\n") + nerv.print_usage(options) +end + +nerv.printf("*** NERV: A Lua-based toolkit for high-performance deep learning (alpha) ***\n") +arg, opts = nerv.parse_args(arg, options) +if #arg < 1 or opts["help"].val then + print_help() return end + +-- only for backward compatibilty, will be removed in the future +local function _add_profile_method(cls) + local c = cls._default_context + cls.print_profile = function () c:print_profile() end + cls.clear_profile = function () c:clear_profile() end +end + +if not opts["use-cpu"].val then + local dev = opts["select-gpu"].val + nerv.info("automatically initialize a default CuContext...") + nerv.CuMatrix._default_context = nerv.CuContext(dev) + nerv.info("the default CuContext is ok") + _add_profile_method(nerv.CuMatrix) + nerv.CuMatrix.select_gpu = + function (dev) nerv.CuMatrix._default_context:select_gpu(dev) end + econf.use_cpu = false +else + econf.use_cpu = true +end + +nerv.info("automatically initialize a default MContext...") +nerv.MMatrix._default_context = nerv.MContext() +nerv.info("the default MContext is ok") +_add_profile_method(nerv.MMatrix) + local script = arg[1] local script_arg = {} for i = 2, #arg do diff --git a/nerv/nerv-scm-1.rockspec b/nerv/nerv-scm-1.rockspec index 6949f54..d039e85 100644 --- a/nerv/nerv-scm-1.rockspec +++ b/nerv/nerv-scm-1.rockspec @@ -11,12 +11,14 @@ description = { license = "BSD" } dependencies = { - "lua >= 5.1" + "lua >= 5.1", + "penlight >= 1.3.2" } build = { type = "make", build_variables = { CFLAGS="$(CFLAGS) -Wall -Wextra -g -O2", + --CFLAGS="$(CFLAGS) -Wall -Wextra -g", LIBFLAG="$(LIBFLAG)", LUA_LIBDIR="$(LUA_LIBDIR)", LUA_BINDIR="$(LUA_BINDIR)", diff --git a/nerv/nn/init.lua b/nerv/nn/init.lua index cbaf52b..1037d05 100644 --- a/nerv/nn/init.lua +++ b/nerv/nn/init.lua @@ -1,3 +1,3 @@ nerv.include('layer_repo.lua') nerv.include('param_repo.lua') -nerv.include('layer_dag.lua') +nerv.include('network.lua') diff --git a/nerv/nn/layer_dag.lua b/nerv/nn/layer_dag.lua deleted file mode 100644 index 6896878..0000000 --- a/nerv/nn/layer_dag.lua +++ /dev/null @@ -1,356 +0,0 @@ -local DAGLayer = nerv.class("nerv.DAGLayer", "nerv.Layer") - -local function parse_id(str) - local id, port, _ - _, _, id, port = string.find(str, "([a-zA-Z0-9_.]+)%[([0-9]+)%]") - if id == nil or port == nil then - _, _, id, port = string.find(str, "(.+)%[([0-9]+)%]") - if not (id == "<input>" or id == "<output>") then - nerv.error("wrong format of connection id") - end - end - port = tonumber(port) - return id, port -end - -local function discover(id, layers, layer_repo) - local ref = layers[id] - if id == "<input>" or id == "<output>" then - return nil - end - if ref == nil then - local layer = layer_repo:get_layer(id) - local dim_in, dim_out = layer:get_dim() - ref = { - layer = layer, - inputs = {}, - outputs = {}, - err_inputs = {}, - err_outputs = {}, - 
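To make the context plumbing concrete, here is a small sketch (not from the patch) of the two ways an operation can obtain its context, following the optional trailing-argument convention of the generic bindings and the default contexts installed by the launcher above; it assumes the script is started through that launcher, so nerv.MMatrix._default_context already exists.

    -- an explicit host context, passed as the last argument of an operation
    local ctx = nerv.MContext()
    local m = nerv.MMatrixFloat(2, 2)
    m:fill(0.5, ctx)   -- uses ctx
    m:fill(0.5)        -- omitted: falls back to nerv.MMatrix._default_context

    -- profiling data now lives on the context object
    ctx:print_profile()
    ctx:clear_profile()

    -- the launcher keeps the old class-level calls working by forwarding them
    -- to the default context (_add_profile_method)
    nerv.MMatrix.print_profile()

The same pattern applies to device matrices with nerv.CuContext, whose default instance is created by the launcher unless --use-cpu is given.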
next_layers = {}, - input_len = #dim_in, - output_len = #dim_out, - in_deg = 0, - visited = false - } - layers[id] = ref - end - return ref -end - -local function touch_list_by_idx(list, idx) - if list[idx] == nil then - list[idx] = {} - end -end - -function DAGLayer:__init(id, global_conf, layer_conf) - local layers = {} - local inputs = {} - local outputs = {} - local dim_in = layer_conf.dim_in - local dim_out = layer_conf.dim_out - local parsed_conn = {} - for from, to in pairs(layer_conf.connections) do - local id_from, port_from = parse_id(from) - local id_to, port_to = parse_id(to) - local ref_from = discover(id_from, layers, layer_conf.sub_layers) - local ref_to = discover(id_to, layers, layer_conf.sub_layers) - local input_dim, output_dim, _ - if ref_from then - touch_list_by_idx(ref_from.outputs, 1) - if ref_from.outputs[1][port_from] ~= nil then - nerv.error("%s has already been attached", from) - end - end - if ref_to then - touch_list_by_idx(ref_to.inputs, 1) - if ref_to.inputs[1][port_to] ~= nil then - nerv.error("%s has already been attached", to) - end - end - if id_from == "<input>" then - input_dim, _ = ref_to.layer:get_dim() - if dim_in[port_from] ~= input_dim[port_to] then - nerv.error("mismatching data dimension between %s and %s", from, to) - end - inputs[port_from] = {ref_to, port_to} - ref_to.inputs[1][port_to] = inputs -- just a place holder - elseif id_to == "<output>" then - _, output_dim = ref_from.layer:get_dim() - if output_dim[port_from] ~= dim_out[port_to] then - nerv.error("mismatching data dimension between %s and %s", from, to) - end - outputs[port_to] = {ref_from, port_from} - ref_from.outputs[1][port_from] = outputs -- just a place holder - else - _, output_dim = ref_from.layer:get_dim() - input_dim, _ = ref_to.layer:get_dim() - if output_dim[port_from] ~= input_dim[port_to] then - nerv.error("mismatching data dimension between %s and %s", from, to) - end - - table.insert(parsed_conn, - {{ref_from, port_from}, {ref_to, port_to}}) - table.insert(ref_from.next_layers, ref_to) -- add edge - ref_to.in_deg = ref_to.in_deg + 1 -- increase the in-degree of the target layer - end - end - - -- topology sort - local queue = {} - local l = 1 - local r = 1 - for id, ref in pairs(layers) do - if ref.in_deg == 0 then - table.insert(queue, ref) - nerv.info("adding source layer: %s", id) - r = r + 1 - end - end - if l == r then - nerv.error("loop detected") - end - while l < r do - local cur = queue[l] - cur.visited = true - l = l + 1 - for _, nl in pairs(cur.next_layers) do - nl.in_deg = nl.in_deg - 1 - if nl.in_deg == 0 then - table.insert(queue, nl) - r = r + 1 - end - end - end - for i = 1, #queue do - nerv.info("enqueued layer: %s %s", queue[i].layer, queue[i].layer.id) - end - - for id, ref in pairs(layers) do - -- check wether the graph is connected - if ref.visited == false then - nerv.warning("layer %s is ignored", id) - end - end - - self.layers = layers - self.inputs = inputs - self.outputs = outputs - self.id = id - self.dim_in = dim_in - self.dim_out = dim_out - self.parsed_conn = parsed_conn - self.queue = queue - self.gconf = global_conf - if self.gconf.use_cpu then - self.mat_type = self.gconf.mmat_type - else - self.mat_type = self.gconf.cumat_type - end -end - -function DAGLayer:init(batch_size, chunk_size) - if chunk_size == nil then - chunk_size = 1 - end - for i, conn in ipairs(self.parsed_conn) do - local _, output_dim - local ref_from, port_from, ref_to, port_to - ref_from, port_from = unpack(conn[1]) - ref_to, port_to = unpack(conn[2]) - _, 
output_dim = ref_from.layer:get_dim() - local dim = 1 - if output_dim[port_from] > 0 then - dim = output_dim[port_from] - end - - for t = 1, chunk_size do - local mid = self.mat_type(batch_size, dim) - local err_mid = mid:create() - touch_list_by_idx(ref_to.inputs, t) - touch_list_by_idx(ref_from.outputs, t) - touch_list_by_idx(ref_from.err_inputs, t) - touch_list_by_idx(ref_to.err_outputs, t) - - ref_from.outputs[t][port_from] = mid - ref_to.inputs[t][port_to] = mid - - ref_from.err_inputs[t][port_from] = err_mid - ref_to.err_outputs[t][port_to] = err_mid - end - end - for id, ref in pairs(self.layers) do - for i = 1, ref.input_len do - if ref.inputs[1][i] == nil then - nerv.error("dangling input port %d of layer %s", i, id) - end - end - for i = 1, ref.output_len do - if ref.outputs[1][i] == nil then - nerv.error("dangling output port %d of layer %s", i, id) - end - end - -- initialize sub layers - ref.layer:init(batch_size, chunk_size) - end - for i = 1, #self.dim_in do - if self.inputs[i] == nil then - nerv.error("dangling port %d of layer <input>", i) - end - end - for i = 1, #self.dim_out do - if self.outputs[i] == nil then - nerv.error("dangling port %d of layer <output>", i) - end - end -end - -function DAGLayer:batch_resize(batch_size, chunk_size) - if chunk_size == nil then - chunk_size = 1 - end - - for i, conn in ipairs(self.parsed_conn) do - local _, output_dim - local ref_from, port_from, ref_to, port_to - ref_from, port_from = unpack(conn[1]) - ref_to, port_to = unpack(conn[2]) - _, output_dim = ref_from.layer:get_dim() - - if ref_from.outputs[1][port_from]:nrow() ~= batch_size - and output_dim[port_from] > 0 then - for t = 1, chunk_size do - local mid = self.mat_type(batch_size, output_dim[port_from]) - local err_mid = mid:create() - - ref_from.outputs[t][port_from] = mid - ref_to.inputs[t][port_to] = mid - - ref_from.err_inputs[t][port_from] = err_mid - ref_to.err_outputs[t][port_to] = err_mid - end - end - end - for id, ref in pairs(self.layers) do - ref.layer:batch_resize(batch_size, chunk_size) - end - collectgarbage("collect") -end - -function DAGLayer:set_inputs(input, t) - for i = 1, #self.dim_in do - if input[i] == nil then - nerv.error("some input is not provided"); - end - local layer = self.inputs[i][1] - local port = self.inputs[i][2] - touch_list_by_idx(layer.inputs, t) - layer.inputs[t][port] = input[i] - end -end - -function DAGLayer:set_outputs(output, t) - for i = 1, #self.dim_out do - if output[i] == nil then - nerv.error("some output is not provided"); - end - local layer = self.outputs[i][1] - local port = self.outputs[i][2] - touch_list_by_idx(layer.outputs, t) - layer.outputs[t][port] = output[i] - end -end - -function DAGLayer:set_err_inputs(bp_err, t) - for i = 1, #self.dim_out do - local layer = self.outputs[i][1] - local port = self.outputs[i][2] - touch_list_by_idx(layer.err_inputs, t) - layer.err_inputs[t][port] = bp_err[i] - end -end - -function DAGLayer:set_err_outputs(next_bp_err, t) - for i = 1, #self.dim_in do - local layer = self.inputs[i][1] - local port = self.inputs[i][2] - touch_list_by_idx(layer.err_outputs, t) - layer.err_outputs[t][port] = next_bp_err[i] - end -end - -function DAGLayer:update(bp_err, input, output, t) - if t == nil then - t = 1 - end - self:set_err_inputs(bp_err, t) - self:set_inputs(input, t) - self:set_outputs(output, t) - for id, ref in pairs(self.queue) do - ref.layer:update(ref.err_inputs[t], ref.inputs[t], ref.outputs[t], t) - end -end - -function DAGLayer:propagate(input, output, t) - if t == nil then - t = 1 
- end - self:set_inputs(input, t) - self:set_outputs(output, t) - local ret = false - for i = 1, #self.queue do - local ref = self.queue[i] - ret = ref.layer:propagate(ref.inputs[t], ref.outputs[t], t) - end - return ret -end - -function DAGLayer:back_propagate(bp_err, next_bp_err, input, output, t) - if t == nil then - t = 1 - end - self:set_err_outputs(next_bp_err, t) - self:set_err_inputs(bp_err, t) - self:set_inputs(input, t) - self:set_outputs(output, t) - for i = #self.queue, 1, -1 do - local ref = self.queue[i] - ref.layer:back_propagate(ref.err_inputs[t], ref.err_outputs[t], ref.inputs[t], ref.outputs[t], t) - end -end - -function DAGLayer:get_params() - local param_repos = {} - for id, ref in pairs(self.queue) do - table.insert(param_repos, ref.layer:get_params()) - end - return nerv.ParamRepo.merge(param_repos) -end - -DAGLayer.PORT_TYPES = { - INPUT = {}, - OUTPUT = {}, - ERR_INPUT = {}, - ERR_OUTPUT = {} -} - -function DAGLayer:get_intermediate(id, port_type) - if id == "<input>" or id == "<output>" then - nerv.error("an actual real layer id is expected") - end - local layer = self.layers[id] - if layer == nil then - nerv.error("layer id %s not found", id) - end - if port_type == DAGLayer.PORT_TYPES.INPUT then - return layer.inputs - elseif port_type == DAGLayer.PORT_TYPES.OUTPUT then - return layer.outputs - elseif port_type == DAGLayer.PORT_TYPES.ERR_INPUT then - return layer.err_inputs - elseif port_type == DAGLayer.PORT_TYPES.ERR_OUTPUT then - return layer.err_outputs - end - nerv.error("unrecognized port type") -end diff --git a/nerv/nn/layer_repo.lua b/nerv/nn/layer_repo.lua index 3d3a79f..acef54a 100644 --- a/nerv/nn/layer_repo.lua +++ b/nerv/nn/layer_repo.lua @@ -12,29 +12,29 @@ function LayerRepo:add_layers(layer_spec, param_repo, global_conf) if layer_type == nil then nerv.error('layer type `%s` not found', ltype) end - for id, spec in pairs(llist) do - if layers[id] ~= nil then - nerv.error("a layer with id %s already exists", id) - end - nerv.info("create layer: %s", id) - if type(spec[2]) ~= "table" then + for id, lconf in pairs(llist) do + if type(lconf) ~= "table" then nerv.error("layer config table is need") end - layer_config = spec[2] - if type(spec[1]) ~= "table" then - nerv.error("parameter description table is needed") - end - for pname, pid in pairs(spec[1]) do - layer_config[pname] = param_repo:get_param(pid) + if lconf.pr == nil then + lconf.pr = param_repo end - if layer_config.pr == nil then - layer_config.pr = param_repo + if layers[id] ~= nil then + nerv.error("a layer with id %s already exists", id) end - layers[id] = layer_type(id, global_conf, layer_config) + nerv.info("create layer: %s", id) + layers[id] = layer_type(id, global_conf, lconf) end end end +function LayerRepo:rebind(param_repo) + for id, layer in pairs(self.layers) do + layer.lconf.pr = param_repo + layer:bind_params() + end +end + function LayerRepo:get_layer(lid) local layer = self.layers[lid] if layer == nil then diff --git a/nerv/nn/network.lua b/nerv/nn/network.lua new file mode 100644 index 0000000..2cb83ce --- /dev/null +++ b/nerv/nn/network.lua @@ -0,0 +1,500 @@ +local network = nerv.class('nerv.Network') + +function network:__init(id, global_conf, network_conf) + self.id = id + self.network = network_conf.network + self.dim_in = self.network.dim_in + self.dim_out = self.network.dim_out + self.gconf = global_conf + if self.gconf.use_cpu then + self.mat_type = self.gconf.mmat_type + else + self.mat_type = self.gconf.cumat_type + end + self.clip = network_conf.clip + 
self.nn_act_default = network_conf.nn_act_default + if self.nn_act_default == nil then + self.nn_act_default = 0 + end + self.layers = {} + self.input_conn = {} + self.output_conn = {} + self.socket = self:compile(self.network) + for i = 1, #self.dim_in do + local edge = self.socket.inputs[i] + local id, port, time = edge[1], edge[2], edge[3] + if self.input_conn[id][port] ~= nil then + nerv.error('duplicate edge') + end + self.input_conn[id][port] = {0, i, time} + end + for i = 1, #self.dim_out do + local edge = self.socket.outputs[i] + local id, port, time = edge[1], edge[2], edge[3] + if self.output_conn[id][port] ~= nil then + nerv.error('duplicate edge') + end + self.output_conn[id][port] = {0, i, time} + end + self.delay = 0 + for i = 1, #self.layers do + local dim_in, _ = self.layers[i]:get_dim() + for j = 1, #dim_in do + local time = self.input_conn[i][j][3] + if math.abs(time) > self.delay then + self.delay = math.abs(time) + end + end + end +end + +function network:compile(layer) + local socket = {inputs = {}, outputs = {}} + if not nerv.is_type(layer, 'nerv.GraphLayer') then + table.insert(self.layers, layer) + local id = #self.layers + self.input_conn[id] = {} + self.output_conn[id] = {} + local dim_in, dim_out = layer:get_dim() + for i = 1, #dim_in do + socket.inputs[i] = {id, i, 0} + end + for i = 1, #dim_out do + socket.outputs[i] = {id, i, 0} + end + else + local sublayer_socket = {} + for id, sublayer in pairs(layer.layers) do + if id ~= '<input>' then + sublayer_socket[sublayer.id] = self:compile(sublayer.layer) + end + end + for _, edge in pairs(layer.connections) do + -- id = 0 means <input> or <output> + local id_from, port_from = edge[1], edge[2] + local id_to, port_to = edge[3], edge[4] + local time = edge[5] + if id_from == 0 then + if socket.inputs[port_from] ~= nil then + nerv.error('duplicate input socket') + end + local input = sublayer_socket[id_to].inputs[port_to] + local id, port, t = input[1], input[2], input[3] + time + socket.inputs[port_from] = {id, port, t} + else + local output = sublayer_socket[id_from].outputs[port_from] + local id, port, t = output[1], output[2], output[3] + time + if id_to == 0 then + if socket.outputs[port_to] ~= nil then + nerv.error('duplicate output socket') + end + socket.outputs[port_to] = {id, port, t} + else + local input = sublayer_socket[id_to].inputs[port_to] + local id1, port1, t1 = input[1], input[2], input[3] + if self.input_conn[id1][port1] ~= nil or self.output_conn[id][port] ~= nil then + nerv.error('duplicate edge') + end + self.input_conn[id1][port1] = {id, port, t + t1} + self.output_conn[id][port] = {id1, port1, t + t1} + end + end + end + end + return socket +end + +function network:init(batch_size, chunk_size) + self.batch_size = batch_size + self.chunk_size = chunk_size + + self:topsort() + + self:make_initial_store() + collectgarbage('collect') +end + +function network:epoch_init() + for i = 1, #self.layers do + self.layers[i]:init(self.batch_size, self.chunk_size) + end +end + +function network:topsort() + nerv.info('network topology sort') + local degree = {} + for t = 1, self.chunk_size do + degree[t] = {} + for i = 1, #self.layers do + degree[t][i] = 0 + end + end + + for t = 1, self.chunk_size do + for i = 1, #self.layers do + local _, dim_out = self.layers[i]:get_dim() + for j = 1, #dim_out do + if self.output_conn[i][j] ~= nil then + local edge = self.output_conn[i][j] + local id, time = edge[1], edge[3] + t + if time >= 1 and time <= self.chunk_size and id ~= 0 then + degree[time][id] = 
degree[time][id] + 1 + end + end + end + end + end + + self.queue = {} + local l = 1 + local r = 0 + for t = 1, self.chunk_size do + for i = 1, #self.layers do + if degree[t][i] == 0 then + r = r + 1 + self.queue[r] = {chunk = t, id = i} + end + end + end + while l <= r do + local t, i = self.queue[l].chunk, self.queue[l].id + l = l + 1 + local _, dim_out = self.layers[i]:get_dim() + for j = 1, #dim_out do + if self.output_conn[i][j] ~= nil then + local edge = self.output_conn[i][j] + local id, time = edge[1], edge[3] + t + if time >= 1 and time <= self.chunk_size and id ~= 0 then + degree[time][id] = degree[time][id] - 1 + if degree[time][id] == 0 then + r = r + 1 + self.queue[r] = {chunk = time, id = id} + end + end + end + end + end + + if r ~= self.chunk_size * #self.layers then + nerv.error('loop detected') + end +end + +function network:make_initial_store() + nerv.info('network initing storage') + + -- allocate memory + local memory = {} + local err_memory = {} + for t = 1 - self.delay, self.chunk_size + self.delay do + memory[t] = {} + err_memory[t] = {} + for i = 1, #self.layers do + memory[t][i] = {} + err_memory[t][i] = {} + local dim_in, dim_out = self.layers[i]:get_dim() + for j = 1, #dim_in do + err_memory[t][i][j] = self.mat_type(self.batch_size, dim_in[j]) + err_memory[t][i][j]:fill(0) + end + for j = 1, #dim_out do + memory[t][i][j] = self.mat_type(self.batch_size, dim_out[j]) + memory[t][i][j]:fill(self.nn_act_default) + end + end + -- memory[t][0] stores network input + memory[t][0] = {} + for j = 1, #self.dim_in do + memory[t][0][j] = self.mat_type(self.batch_size, self.dim_in[j]) + memory[t][0][j]:fill(self.nn_act_default) + end + -- err_memory[t][0] stores network err_input + err_memory[t][0] = {} + for j = 1, #self.dim_out do + err_memory[t][0][j] = self.mat_type(self.batch_size, self.dim_out[j]) + err_memory[t][0][j]:fill(0) + end + end + + -- connect memory and reference + self.input = {} + self.output = {} + self.err_input = {} + self.err_output = {} + for t = 1, self.chunk_size do + self.input[t] = {} + self.output[t] = {} + self.err_input[t] = {} + self.err_output[t] = {} + for i = 1, #self.layers do + self.input[t][i] = {} + self.output[t][i] = {} + self.err_input[t][i] = {} + self.err_output[t][i] = {} + local dim_in, dim_out = self.layers[i]:get_dim() + for j = 1, #dim_in do + local edge = self.input_conn[i][j] + local id, port, time = edge[1], edge[2], edge[3] + if id ~= 0 or t - time < 1 or t - time > self.chunk_size then + self.input[t][i][j] = memory[t - time][id][port] + end + if id ~= 0 then + self.err_output[t][i][j] = err_memory[t][i][j] + end + end + for j = 1, #dim_out do + local edge = self.output_conn[i][j] + local id, port, time = edge[1], edge[2], edge[3] + if id ~= 0 then + self.output[t][i][j] = memory[t][i][j] + end + if id ~= 0 or t + time < 1 or t + time > self.chunk_size then + self.err_input[t][i][j] = err_memory[t + time][id][port] + end + end + end + end + + -- check dangling reference + for t = 1, self.chunk_size do + for i = 1, #self.dim_in do + local edge = self.socket.inputs[i] + local id, port, time = edge[1], edge[2], edge[3] + if t + time >= 1 and t + time <= self.chunk_size then + if self.input[t + time][id][port] ~= nil then + nerv.error('input reference not nil') + end + self.input[t + time][id][port] = true -- just a place holder + if self.err_output[t + time][id][port] ~= nil then + nerv.error('err_output reference not nil') + end + self.err_output[t + time][id][port] = true -- just a place holder + end + end + for i = 1, 
#self.dim_out do + local edge = self.socket.outputs[i] + local id, port, time = edge[1], edge[2], edge[3] + if t - time >= 1 and t - time <= self.chunk_size then + if self.output[t - time][id][port] ~= nil then + nerv.error('output reference not nil') + end + self.output[t - time][id][port] = true -- just a place holder + if self.err_input[t - time][id][port] ~= nil then + nerv.error('err_output reference not nil') + end + self.err_input[t - time][id][port] = true -- just a place holder + end + end + end + for t = 1, self.chunk_size do + for i = 1, #self.layers do + local dim_in, dim_out = self.layers[i]:get_dim() + for j = 1, #dim_in do + if self.input[t][i][j] == nil then + nerv.error('input reference dangling') + end + if self.err_output[t][i][j] == nil then + nerv.error('err_output reference dangling') + end + end + for j = 1, #dim_out do + if self.output[t][i][j] == nil then + nerv.error('output reference dangling') + end + if self.err_input[t][i][j] == nil then + nerv.error('err_input reference dangling') + end + end + end + end + + -- allocate reference for legacy of previous mini-batch + self.legacy = {} + for t = 1 - self.delay, 0 do + self.legacy[t] = {} + for i = 1, #self.layers do + self.legacy[t][i] = {} + local _, dim_out = self.layers[i]:get_dim() + for j = 1, #dim_out do + self.legacy[t][i][j] = memory[t][i][j] + end + end + end +end + +function network:set_input(input) + for t = 1, self.chunk_size do + for i = 1, #self.dim_in do + local edge = self.socket.inputs[i] + local id, port, time = edge[1], edge[2], edge[3] + if t + time >= 1 and t + time <= self.chunk_size then + self.input[t + time][id][port] = input[t][i] + end + end + end +end + +function network:set_output(output) + for t = 1, self.chunk_size do + for i = 1, #self.dim_out do + local edge = self.socket.outputs[i] + local id, port, time = edge[1], edge[2], edge[3] + if t - time >= 1 and t - time <= self.chunk_size then + self.output[t - time][id][port] = output[t][i] + end + end + end +end + +function network:set_err_input(err_input) + for t = 1, self.chunk_size do + for i = 1, #self.dim_out do + local edge = self.socket.outputs[i] + local id, port, time = edge[1], edge[2], edge[3] + if t - time >= 1 and t - time <= self.chunk_size then + self.err_input[t - time][id][port] = err_input[t][i] + end + end + end +end + +function network:set_err_output(err_output) + for t = 1, self.chunk_size do + for i = 1, #self.dim_in do + local edge = self.socket.inputs[i] + local id, port, time = edge[1], edge[2], edge[3] + if t + time >= 1 and t + time <= self.chunk_size then + self.err_output[t + time][id][port] = err_output[t][i] + end + end + end +end + +--[[ + [info] is a table that contains information of current mini-batch. 
These fields must be contained: + [input], [output] : matrix array which stores the network input and output + [seq_length] : a table contains the length of every sequences + [new_seq]: a table contains the batch number of new sequences + [do_train]: a bool value indicates do train or not + if [do_train] is true, these fileds also must be contained: + [err_input], [err_output] : matrix array which stores the network err_input and err_output +--]] +function network:mini_batch_init(info) + self.info = info + self:set_input(self.info.input) + self:set_output(self.info.output) + + -- calculate border + self.max_length = 0 + self.border = {} + for i = 1, self.chunk_size do + self.border[i] = {} + end + for i = 1, self.batch_size do + if self.info.seq_length[i] > self.max_length then + self.max_length = self.info.seq_length[i] + end + for t = 1, self.delay do + local chunk = self.info.seq_length[i] + t + if chunk > self.chunk_size then + break + end + table.insert(self.border[chunk], i) + end + end + + -- copy legacy + for t = 1 - self.delay, 0 do + for i = 1, #self.layers do + local _, dim_out = self.layers[i]:get_dim() + for j = 1, #dim_out do + if t + self.chunk_size >= 1 and self.output_conn[i][j][1] ~= 0 then + self.legacy[t][i][j]:copy_from(self.output[t + self.chunk_size][i][j]) + end + for k = 1, #self.info.new_seq do + local batch = self.info.new_seq[k] + self.legacy[t][i][j][batch - 1]:fill(self.nn_act_default) + end + end + end + end + + if self.info.do_train then + self:set_err_input(self.info.err_input) + self:set_err_output(self.info.err_output) + + -- flush border gradient + for t = self.max_length + 1, self.max_length + self.delay do + if t > self.chunk_size then + break + end + for i = 1, #self.layers do + local dim_in, _ = self.layers[i]:get_dim() + for j = 1, #dim_in do + self.err_output[t][i][j]:fill(0) + end + end + end + end +end + +function network:propagate() + for i = 1, #self.queue do + local t, id = self.queue[i].chunk, self.queue[i].id + if t <= self.max_length then + self.layers[id]:propagate(self.input[t][id], self.output[t][id], t) + end + -- flush border activation + for j = 1, #self.border[t] do + local batch = self.border[t][j] + local _, dim_out = self.layers[id]:get_dim() + for k = 1, #dim_out do + self.output[t][id][k][batch - 1]:fill(self.nn_act_default) + end + end + end +end + +function network:back_propagate() + for i = #self.queue, 1, -1 do + local t, id = self.queue[i].chunk, self.queue[i].id + if t <= self.max_length then + -- flush border gradient + for j = 1, #self.border[t] do + local batch = self.border[t][j] + local _, dim_out = self.layers[id]:get_dim() + for k = 1, #dim_out do + self.err_input[t][id][k][batch - 1]:fill(0) + end + end + self.layers[id]:back_propagate(self.err_input[t][id], self.err_output[t][id], self.input[t][id], self.output[t][id], t) + if self.clip ~= nil then + local dim_in, _ = self.layers[id]:get_dim() + for j = 1, #dim_in do + self.err_output[t][id][j]:clip(-self.clip, self.clip) + end + end + end + end +end + +function network:update() + for i = 1, #self.queue do + local t, id = self.queue[i].chunk, self.queue[i].id + if t <= self.max_length then + self.layers[id]:update(self.err_input[t][id], self.input[t][id], self.output[t][id], t) + end + end +end + +function network:set_attr(name, value) + self.network:set_attr(name, value) +end + +function network:get_sublayer(id) + return self.network:get_sublayer(id) +end + +function network:get_params() + return self.network:get_params() +end diff --git a/nerv/nn/param_repo.lua 
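For orientation, a condensed sketch of how the new nerv.Network is meant to be driven, pieced together from the methods above; it is not part of the patch. graph (a nerv.GraphLayer), gconf, the batch/chunk sizes, and the matrices inside info are placeholders supplied by the caller, and the info fields follow the comment on mini_batch_init.

    local net = nerv.Network('demo', gconf, {network = graph, clip = 5})
    net:init(batch_size, chunk_size)   -- topological sort + buffer allocation
    net:epoch_init()                   -- initialize the wrapped layers

    -- one training step; input/output/err_input/err_output are tables indexed
    -- by time step, each holding one matrix per port, and seq_length/new_seq
    -- describe the sequences in the current mini-batch
    net:mini_batch_init{input = input, output = output,
                        seq_length = seq_length, new_seq = new_seq,
                        do_train = true,
                        err_input = err_input, err_output = err_output}
    net:propagate()
    net:back_propagate()
    net:update()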
b/nerv/nn/param_repo.lua index c124e08..aba7765 100644 --- a/nerv/nn/param_repo.lua +++ b/nerv/nn/param_repo.lua @@ -1,8 +1,37 @@ local ParamRepo = nerv.class("nerv.ParamRepo") -function ParamRepo:__init(plist) + +ParamRepo.LOC_TYPES = { + ON_DEVICE = {}, + ON_HOST = {} +} + +function ParamRepo:__init(plist, loc_type) self.params = {} + self.loc_type = loc_type or ParamRepo.LOC_TYPES.ON_HOST + local function make_checker(tname) + return function (mat) + if not nerv.is_type(mat, tname) then + nerv.error("unexpected param type in repo specification") + end + end + end + self.make_copier = function (mat_type, copy_method) + return function (mat) + local target = mat_type(mat:nrow(), mat:ncol()) + mat[copy_method](mat, target) + return target + end + end + + if self.loc_type == ParamRepo.LOC_TYPES.ON_HOST then + self.checker = make_checker("nerv.MMatrix") + else + self.checker = make_checker("nerv.CuMatrix") + end + if plist ~= nil then for i, p in ipairs(plist) do + p:check(self.checker) self.params[p.id] = p end end @@ -12,6 +41,7 @@ function ParamRepo:add(pid, p) if self.params[pid] ~= nil then nerv.error("duplicate params with the same id: %s", pid) end + p:check(self.checker) self.params[pid] = p end @@ -22,8 +52,8 @@ function ParamRepo:remove(pid, p) table.remove(self.params, pid) end -function ParamRepo.merge(repos) - local self = nerv.ParamRepo() +function ParamRepo.merge(repos, loc_type) + local self = nerv.ParamRepo(nil, loc_type) for i, repo in ipairs(repos) do if not nerv.is_type(repo, "nerv.ParamRepo") then nerv.error("nerv.ParamRepo objects expected, got %s", repo) @@ -78,3 +108,26 @@ function ParamRepo:get_param(pid) end return p end + +function ParamRepo:copy(loc_type, pids) + local copier + local target = nerv.ParamRepo(nil, loc_type) + if loc_type == nil then + loc_type = self.loc_type + end + if loc_type == ParamRepo.LOC_TYPES.ON_HOST then + copier = self.make_copier(gconf.mmat_type, 'copy_toh') + else + copier = self.make_copier(gconf.cumat_type, 'copy_tod') + end + if pids == nil then + for id, p in pairs(self.params) do + target.params[id] = p:copy(copier) + end + else + for i, pid in ipairs(pids) do + target.params[pid] = self:get_param(pid):copy(copier) + end + end + return target +end diff --git a/nerv/test/matrix_func.lua b/nerv/test/matrix_func.lua index 3750ddd..07ddf9c 100644 --- a/nerv/test/matrix_func.lua +++ b/nerv/test/matrix_func.lua @@ -128,7 +128,7 @@ function _test_all_shape(mat_type, m, n, k, fill) print(a) b:copy_rows_from_by_idx(a, idx) b = mat_type(2, m) - b:copy_rows_from_by_idx(a, idx, 2, 2) + b:copy_rows_from_by_idx(a, idx, 2) print(a) print(b) -- test expand_frm diff --git a/nerv/test/parse_args.lua b/nerv/test/parse_args.lua new file mode 100644 index 0000000..34ad55e --- /dev/null +++ b/nerv/test/parse_args.lua @@ -0,0 +1,15 @@ +local options = {{"abandon", "a", "boolean", default = false, desc = "abandon your belief"}, + {"bullshit", "b", "boolean", default = false, desc = "start to bullshit"}, + {"cheat", "c", "boolean", default = false, desc = "try to cheat"}, + {"delete", "d", "boolean", default = false, desc = "remove everything"}, + {"hehe", "h", "boolean", default = false, desc = "233333"}, + {"oh", "o", "boolean", default = true, desc = "oh yes!"}, + {"uid", nil, "int", desc = "user uid"}, + {"str", nil, "string", desc = "test string"}} + +args, opts = nerv.parse_args({"arg1", "arg2", "-abcd", "arg3", + "--hehe", "--oh=no", "--uid=43", + "highfive", "--str=hello"}, options) + +nerv.print_usage(options) +print(table.tostring(args), 
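A short sketch of the new ParamRepo locality handling (illustrative only, not from the patch). It assumes a global gconf exposing mmat_type and cumat_type, which is what copy() reads; the parameter ids and the layer_repo variable are hypothetical.

    -- a repo whose parameters live in host memory (the default location)
    local host_repo = nerv.ParamRepo(nil, nerv.ParamRepo.LOC_TYPES.ON_HOST)
    -- ... parameters are added elsewhere, e.g. host_repo:add(pid, p) ...

    -- make a device-side copy of two (hypothetical) parameters before training;
    -- for ON_DEVICE, copy() allocates gconf.cumat_type matrices via copy_tod
    local dev_repo = host_repo:copy(nerv.ParamRepo.LOC_TYPES.ON_DEVICE,
                                    {'affine0_ltp', 'affine0_bp'})

    -- and point an existing LayerRepo at the copied parameters in one call
    layer_repo:rebind(dev_repo)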
table.tostring(opts)) diff --git a/nerv/tnn/init.lua b/nerv/tnn/init.lua deleted file mode 100644 index 44ce26b..0000000 --- a/nerv/tnn/init.lua +++ /dev/null @@ -1,47 +0,0 @@ -local LayerT = nerv.class('nerv.LayerT') - -function LayerT:__init(id, global_conf, layer_conf) - nerv.error_method_not_implemented() -end - -function LayerT:init(batch_size, chunk_size) - nerv.error_method_not_implemented() -end - -function LayerT:update(bp_err, input, output, t) - nerv.error_method_not_implemented() -end - -function LayerT:propagate(input, output, t) - nerv.error_method_not_implemented() -end - -function LayerT:back_propagate(bp_err, next_bp_err, input, output, t) - nerv.error_method_not_implemented() -end - -function LayerT:check_dim_len(len_in, len_out) - local expected_in = #self.dim_in - local expected_out = #self.dim_out - if len_in > 0 and expected_in ~= len_in then - nerv.error("layer %s expects %d inputs, %d given", - self.id, len_in, expected_in) - end - if len_out > 0 and expected_out ~= len_out then - nerv.error("layer %s expects %d outputs, %d given", - self.id, len_out, expected_out) - end -end - -LayerT.find_param = nerv.Layer.find_param - -function LayerT:get_params() - nerv.error_method_not_implemented() -end - -function LayerT:get_dim() - return self.dim_in, self.dim_out -end - -nerv.include('sutil.lua') -nerv.include('tnn.lua') diff --git a/nerv/tnn/sutil.lua b/nerv/tnn/sutil.lua deleted file mode 100644 index 6a968b7..0000000 --- a/nerv/tnn/sutil.lua +++ /dev/null @@ -1,80 +0,0 @@ -local Util = nerv.class("nerv.SUtil") --Scheduler Utility - -function Util.simple_split(inputstr, sep) - if sep == nil then - sep = "%s" - end - local t={} ; i=1 - for str in string.gmatch(inputstr, "([^"..sep.."]+)") do - t[i] = str - i = i + 1 - end - return t -end - -function Util.parse_schedule(str) - --parse a string like "1.2*10:1.5" to a list of numbers - local sch = {} - local s = Util.simple_split(str, ':') - for i = 1, #s do - local p = Util.simple_split(s[i], "%*") - if #p ~= 2 and #p ~= 1 then - nerv.error("nerv.SUtil:parse_schedule error, unit(%s) not proper, has %d components.", s[i], #p) - end - if p[2] == nil then - p[2] = "1" - end - p[1] = tonumber(p[1]) - p[2] = tonumber(p[2]) - for j = 1, p[2] do - table.insert(sch, p[1]) - end - end - - --for i = 1, #sch do - -- print(sch[i]) - --end - return sch -end - -function Util.sche_get(s, it) - --get s[it] - if s == nil then - nerv.info("Util.sche_get: warning, scheule is nil, returning zero...") - return 0 - end - if #s >= it then - return s[it] - else - nerv.info("Util.sche_get: warning, it(%d) > #schedule(%d), returning the last one of schedule(%f)...", it, #s, s[#s]) - return s[#s] - end -end - -function Util.parse_commands_set(str) - local coms = {} - local s = Util.simple_split(str, ':,') - for i = 1 ,#s do - if coms[s[i]] == 1 then - nerv.warning("nerv.SUtil.parse_commands_set command(%s) appered more than once in command_set(%s)", s[i], str) - end - coms[s[i]] = 1 - end - return coms -end - -function Util.log_redirect(fn) - nerv.log_fh = assert(io.open(fn, "w")) - nerv.info("CAUTION[LOG_REDIRECT], all nerv.printf/info/warning/error calls will be double-written to %s", fn) - nerv.printf = - function (fmt, ...) - io.write(nerv.sprintf(fmt, ...)) - nerv.log_fh:write(nerv.sprintf(fmt, ...)) - nerv.log_fh:flush() - end - nerv.error = - function (fmt, ...) - nerv.log_fh:write(nerv.sprintf("[nerv] internal error:" .. fmt .. "\n", ...)) - error(nerv.sprintf("[nerv] internal error: " .. fmt .. 
"\n", ...)) - end -end diff --git a/nerv/tnn/tnn.lua b/nerv/tnn/tnn.lua deleted file mode 100644 index d527fe6..0000000 --- a/nerv/tnn/tnn.lua +++ /dev/null @@ -1,596 +0,0 @@ -local TNN = nerv.class("nerv.TNN") - -local function parse_id(str) - --used to parse layerid[portid],time - local id, port, time, _ - _, _, id, port, time = string.find(str, "([a-zA-Z0-9_]+)%[([0-9]+)%][,]*([0-9]*)") - if id == nil or port == nil then - _, _, id, port, time = string.find(str, "(.+)%[([0-9]+)%][,]*([0-9]*)") - if not (id == "<input>" or id == "<output>") then - nerv.error("wrong format of connection id") - end - end - --print(str, id, port, time) - port = tonumber(port) - if (time == nil) then - time = 0 - else - time = tonumber(time) - end - --now time don't need to be parsed - return id, port -end - -local function discover(id, layers, layer_repo) - local ref = layers[id] - if id == "<input>" or id == "<output>" then - return nil - end - if ref == nil then - local layer = layer_repo:get_layer(id) - local dim_in, dim_out = layer:get_dim() - ref = { - layer = layer, - id = layer.id, - inputs_m = {}, --storage for computation, inputs_m[time][port] - inputs_b = {}, --inputs_g[time][port], whether this input can been computed - inputs_matbak_p = {}, --which is a back-up space to handle some cross-border computation, inputs_p_matbak[port] - outputs_m = {}, - outputs_b = {}, - err_inputs_m = {}, - err_inputs_matbak_p = {}, --which is a back-up space to handle some cross-border computation - err_inputs_b = {}, - err_outputs_m = {}, - err_outputs_b = {}, - i_conns_p = {}, --list of inputing connections - o_conns_p = {}, --list of outputing connections - dim_in = dim_in, --list of dimensions of ports - dim_out = dim_out, - } - layers[id] = ref - end - return ref -end - -nerv.TNN.FC = {} --flag const -nerv.TNN.FC.SEQ_START = 4 -nerv.TNN.FC.SEQ_END = 8 -nerv.TNN.FC.HAS_INPUT = 1 -nerv.TNN.FC.HAS_LABEL = 2 -nerv.TNN.FC.SEQ_NORM = bit.bor(nerv.TNN.FC.HAS_INPUT, nerv.TNN.FC.HAS_LABEL) --This instance have both input and label - -function TNN.make_initial_store(st, p, dim, batch_size, chunk_size, extend_t, global_conf, st_c, p_c, t_c) - --Return a table of matrix storage from time (1-extend_t)..(chunk_size+extend_t) - if (type(st) ~= "table") then - nerv.error("st should be a table") - end - for i = 1 - extend_t - 2, chunk_size + extend_t + 2 do --intentionally allocated more time - if (st[i] == nil) then - st[i] = {} - end - st[i][p] = global_conf.cumat_type(batch_size, dim) - st[i][p]:fill(0) - if (st_c ~= nil) then - if (st_c[i + t_c] == nil) then - st_c[i + t_c] = {} - end - st_c[i + t_c][p_c] = st[i][p] - end - end - collectgarbage("collect") --free the old one to save memory -end - -function TNN:out_of_feedrange(t) --out of chunk, or no input, for the current feed - if (t < 1 or t > self.chunk_size) then - return true - end - if (self.feeds_now.flagsPack_now[t] == 0 or self.feeds_now.flagsPack_now[t] == nil) then - return true - end - return false -end - -function TNN:__init(id, global_conf, layer_conf) - self.clip_t = layer_conf.clip_t - if self.clip_t == nil then - self.clip_t = 0 - end - if self.clip_t > 0 then - nerv.info("tnn(%s) will clip gradient across time with %f...", id, self.clip_t) - end - - self.extend_t = layer_conf.extend_t --TNN will allocate storage of time for 1-extend_t .. 
chunk_size+extend_t - if self.extend_t == nil then - self.extend_t = 5 - end - nerv.info("tnn(%s) will extend storage beyond MB border for time steps %d...", id, self.extend_t) - - local layers = {} - local inputs_p = {} --map:port of the TNN to layer ref and port - local outputs_p = {} - local dim_in = layer_conf.dim_in - local dim_out = layer_conf.dim_out - local parsed_conns = {} - local _ - - for id, _ in pairs(layer_conf.sub_layers.layers) do --caution: with this line, some layer not connected will be included - discover(id, layers, layer_conf.sub_layers) - end - - for _, ll in pairs(layer_conf.connections) do - local id_from, port_from = parse_id(ll[1]) - local id_to, port_to = parse_id(ll[2]) - local time_to = ll[3] - - print(id_from, id_to, time_to) - - local ref_from = discover(id_from, layers, layer_conf.sub_layers) - local ref_to = discover(id_to, layers, layer_conf.sub_layers) - - if (id_from == "<input>") then - if (dim_in[port_from] ~= ref_to.dim_in[port_to] or time_to ~= 0) then - nerv.error("mismatch dimension or wrong time %s,%s,%d", ll[1], ll[2], ll[3]) - end - inputs_p[port_from] = {["ref"] = ref_to, ["port"] = port_to} - ref_to.inputs_m[port_to] = {} --just a place holder - elseif (id_to == "<output>") then - if (dim_out[port_to] ~= ref_from.dim_out[port_from] or time_to ~= 0) then - nerv.error("mismatch dimension or wrong time %s,%s,%d", ll[1], ll[2], ll[3]) - end - outputs_p[port_to] = {["ref"] = ref_from, ["port"] = port_from} - ref_from.outputs_m[port_from] = {} --just a place holder - else - local conn_now = { - ["src"] = {["ref"] = ref_from, ["port"] = port_from}, - ["dst"] = {["ref"] = ref_to, ["port"] = port_to}, - ["time"] = time_to - } - if (ref_to.dim_in[port_to] ~= ref_from.dim_out[port_from]) then - nerv.error("mismatch dimension or wrong time %s,%s,%d", ll[1], ll[2], ll[3]) - end - table.insert(parsed_conns, conn_now) - ref_to.i_conns_p[conn_now.dst.port] = conn_now - ref_from.o_conns_p[conn_now.src.port] = conn_now - end - end - - for id, ref in pairs(layers) do - print(id, "#dim_in:", #ref.dim_in, "#dim_out:", #ref.dim_out, "#i_conns_p:", #ref.i_conns_p, "#o_conns_p", #ref.o_conns_p) - end - - self.layers = layers - self.inputs_p = inputs_p - self.outputs_p = outputs_p - self.id = id - self.dim_in = dim_in - self.dim_out = dim_out - self.parsed_conns = parsed_conns - self.gconf = global_conf -end - -function TNN:init(batch_size, chunk_size) - self.batch_size = batch_size - self.chunk_size = chunk_size - for i, conn in ipairs(self.parsed_conns) do --init storage for connections inside the NN - local _, output_dim - local ref_from, port_from, ref_to, port_to, time - ref_from, port_from = conn.src.ref, conn.src.port - ref_to, port_to = conn.dst.ref, conn.dst.port - time = conn.time - - local dim = ref_from.dim_out[port_from] - if (dim == 0) then - nerv.error("layer %s has a zero dim port", ref_from.layer.id) - end - - nerv.info("TNN initing storage %s->%s", ref_from.layer.id, ref_to.layer.id) - ref_to.inputs_matbak_p[port_to] = self.gconf.cumat_type(batch_size, dim) - self.make_initial_store(ref_from.outputs_m, port_from, dim, batch_size, chunk_size, self.extend_t, self.gconf, ref_to.inputs_m, port_to, time) - ref_from.err_inputs_matbak_p[port_from] = self.gconf.cumat_type(batch_size, dim) - self.make_initial_store(ref_from.err_inputs_m, port_from, dim, batch_size, chunk_size, self.extend_t, self.gconf, ref_to.err_outputs_m, port_to, time) - end - - self.outputs_m = {} - self.err_inputs_m = {} - for i = 1, #self.dim_out do --Init storage for output ports - 
-        local ref = self.outputs_p[i].ref
-        local p = self.outputs_p[i].port
-        self.make_initial_store(ref.outputs_m, p, self.dim_out[i], batch_size, chunk_size, self.extend_t, self.gconf, self.outputs_m, i, 0)
-        self.make_initial_store(ref.err_inputs_m, p, self.dim_out[i], batch_size, chunk_size, self.extend_t, self.gconf, self.err_inputs_m, i, 0)
-    end
-
-    self.inputs_m = {}
-    self.err_outputs_m = {}
-    for i = 1, #self.dim_in do --Init storage for input ports
-        local ref = self.inputs_p[i].ref
-        local p = self.inputs_p[i].port
-        self.make_initial_store(ref.inputs_m, p, self.dim_in[i], batch_size, chunk_size, self.extend_t, self.gconf, self.inputs_m, i, 0)
-        self.make_initial_store(ref.err_outputs_m, p, self.dim_in[i], batch_size, chunk_size, self.extend_t, self.gconf, self.err_outputs_m, i, 0)
-    end
-
-    for id, ref in pairs(self.layers) do --Calling init for child layers
-        for i = 1, #ref.dim_in do
-            if (ref.inputs_m[i] == nil or ref.err_outputs_m[i] == nil) then
-                nerv.error("dangling input port %d of layer %s", i, id)
-            end
-        end
-        for i = 1, #ref.dim_out do
-            if (ref.outputs_m[i] == nil or ref.err_inputs_m[i] == nil) then
-                nerv.error("dangling output port %d of layer %s", i, id)
-            end
-        end
-        -- initialize sub layers
-        nerv.info("TNN initing sub-layer %s", ref.id)
-        ref.layer:init(batch_size, chunk_size)
-        collectgarbage("collect")
-    end
-
-    local flags_now = {}
-    local flagsPack_now = {}
-    for i = 1, chunk_size do
-        flags_now[i] = {}
-        flagsPack_now[i] = 0
-    end
-
-    self.feeds_now = {} --feeds is for the reader to fill
-    self.feeds_now.inputs_m = self.inputs_m
-    self.feeds_now.flags_now = flags_now
-    self.feeds_now.flagsPack_now = flagsPack_now
-
-    self:flush_all()
-end
-
---[[
-function DAGLayer:batch_resize(batch_size)
-    self.gconf.batch_size = batch_size
-
-    for i, conn in ipairs(self.parsed_conn) do
-        local _, output_dim
-        local ref_from, port_from, ref_to, port_to
-        ref_from, port_from = unpack(conn[1])
-        ref_to, port_to = unpack(conn[2])
-        _, output_dim = ref_from.layer:get_dim()
-
-        if ref_from.outputs[port_from]:nrow() ~= batch_size and output_dim[port_from] > 0 then
-            local mid = self.gconf.cumat_type(batch_size, output_dim[port_from])
-            local err_mid = mid:create()
-
-            ref_from.outputs[port_from] = mid
-            ref_to.inputs[port_to] = mid
-
-            ref_from.err_inputs[port_from] = err_mid
-            ref_to.err_outputs[port_to] = err_mid
-        end
-    end
-    for id, ref in pairs(self.layers) do
-        ref.layer:batch_resize(batch_size)
-    end
-    collectgarbage("collect")
-end
-]]--
-
-function TNN:flush_all() --flush all history and activation
-    local _, ref
-    for _, ref in pairs(self.layers) do
-        for i = 1, #ref.dim_in do
-            for t = 1 - self.extend_t, self.chunk_size + self.extend_t do
-                ref.inputs_m[t][i]:fill(self.gconf.nn_act_default)
-                if (ref.inputs_b[t] == nil) then
-                    ref.inputs_b[t] = {}
-                end
-                ref.inputs_b[t][i] = false
-                ref.err_outputs_m[t][i]:fill(0)
-                if (ref.err_outputs_b[t] == nil) then
-                    ref.err_outputs_b[t] = {}
-                end
-                ref.err_outputs_b[t][i] = false
-            end
-        end
-        for i = 1, #ref.dim_out do
-            for t = 1 - self.extend_t, self.chunk_size + self.extend_t do
-                ref.outputs_m[t][i]:fill(self.gconf.nn_act_default)
-                if (ref.outputs_b[t] == nil) then
-                    ref.outputs_b[t] = {}
-                end
-                ref.outputs_b[t][i] = false
-                ref.err_inputs_m[t][i]:fill(0)
-                if (ref.err_inputs_b[t] == nil) then
-                    ref.err_inputs_b[t] = {}
-                end
-                ref.err_inputs_b[t][i] = false
-            end
-        end
-    end
-end
-
---reader: some reader
---Returns: bool, whether has new feed
---Returns: feeds, a table that will be filled with the reader's feeds
-function TNN:getfeed_from_reader(reader)
-    local feeds_now = self.feeds_now
-    local got_new = reader:get_batch(feeds_now)
-    return got_new, feeds_now
-end
-
-function TNN:move_right_to_nextmb(list_t) --move output history activations of 1..chunk_size to 1-chunk_size..0
-    if list_t == nil then
-        list_t = {}
-        for i = self.extend_t, 1, -1 do
-            list_t[i] = 1 - i
-        end
-    end
-    for i = 1, #list_t do
-        t = list_t[i]
-        if t < 1 - self.extend_t or t > 0 then
-            nerv.error("MB move range error")
-        end
-        for id, ref in pairs(self.layers) do
-            for p = 1, #ref.dim_out do
-                ref.outputs_m[t][p]:copy_fromd(ref.outputs_m[t + self.chunk_size][p])
-            end
-        end
-    end
-end
-
-function TNN:net_propagate() --propagate according to feeds_now
-    for t = 1, self.chunk_size, 1 do
-        for id, ref in pairs(self.layers) do
-            for p = 1, #ref.dim_out do
-                ref.outputs_b[t][p] = false
-            end
-            for p = 1, #ref.dim_in do
-                ref.inputs_b[t][p] = false
-            end
-        end
-    end
-
-    local feeds_now = self.feeds_now
-    for t = 1, self.chunk_size do --some layer maybe do not have inputs from time 1..chunk_size
-        for id, ref in pairs(self.layers) do
-            if #ref.dim_in > 0 then --some layer is just there(only to save some parameter)
-                self:propagate_dfs(ref, t)
-            end
-        end
-    end
-    for t = 1, self.chunk_size do
-        if (bit.band(feeds_now.flagsPack_now[t], nerv.TNN.FC.HAS_INPUT) > 0) then
-            for i = 1, #self.dim_in do
-                local ref = self.inputs_p[i].ref
-                local p = self.inputs_p[i].port
-                ref.inputs_b[t][p] = true
-                self:propagate_dfs(ref, t)
-            end
-        end
-    end
-
-    local flag_out = true
-    for t = 1, self.chunk_size do --check whether every output has been computed
-        if (bit.band(feeds_now.flagsPack_now[t], nerv.TNN.FC.HAS_LABEL) > 0) then
-            for i = 1, #self.dim_out do
-                local ref = self.outputs_p[i].ref
-                if (ref.outputs_b[t][1] ~= true) then
-                    flag_out = false
-                    break
-                end
-            end
-        end
-    end
-
-    if (flag_out == false) then
-        nerv.error("some thing wrong, some labeled output is not propagated")
-    end
-end
-
---ref: the TNN_ref of a layer
---t: the current time to propagate
-function TNN:propagate_dfs(ref, t)
-    if (self:out_of_feedrange(t)) then
-        return
-    end
-    if (ref.outputs_b[t][1] == true) then --already propagated, 1 is just a random port
-        return
-    end
-
-    --print("debug dfs", ref.layer.id, t)
-
-    local flag = true --whether have all inputs
-    for _, conn in pairs(ref.i_conns_p) do
-        local p = conn.dst.port
-        if (not (ref.inputs_b[t][p] or self:out_of_feedrange(t - conn.time))) then
-            flag = false
-            break
-        end
-    end
-    if (flag == false) then
-        return
-    end
-
-    --ok, do propagate
-    --print("debug ok, propagating");
-    --The MB moving will cause bordering history to be changed, so it is more wise to flush the input activation
-    if (bit.band(self.feeds_now.flagsPack_now[t], bit.bor(nerv.TNN.FC.SEQ_START, nerv.TNN.FC.SEQ_END)) > 0) then --flush cross-border history
-        for i = 1, self.batch_size do
-            local seq_start = bit.band(self.feeds_now.flags_now[t][i], nerv.TNN.FC.SEQ_START)
-            local seq_end = bit.band(self.feeds_now.flags_now[t][i], nerv.TNN.FC.SEQ_END)
-            if (seq_start > 0 or seq_end > 0) then
-                for p, conn in pairs(ref.i_conns_p) do
-                    if ((ref.i_conns_p[p].time > 0 and seq_start > 0) or (ref.i_conns_p[p].time < 0 and seq_end > 0)) then --cross-border, set to default
-                        ref.inputs_m[t][p][i - 1]:fill(self.gconf.nn_act_default)
-                    end
-                end
-            end
-        end
-    end
-    self.gconf.timer:tic("tnn_actual_layer_propagate")
-    ref.layer:propagate(ref.inputs_m[t], ref.outputs_m[t], t) --propagate!
-    self.gconf.timer:toc("tnn_actual_layer_propagate")
-    --[[
-    if (bit.band(self.feeds_now.flagsPack_now[t], bit.bor(nerv.TNN.FC.SEQ_START, nerv.TNN.FC.SEQ_END)) > 0) then --restore cross-border history
-        for i = 1, self.batch_size do
-            local seq_start = bit.band(self.feeds_now.flags_now[t][i], nerv.TNN.FC.SEQ_START)
-            local seq_end = bit.band(self.feeds_now.flags_now[t][i], nerv.TNN.FC.SEQ_END)
-            if (seq_start > 0 or seq_end > 0) then
-                for p, conn in pairs(ref.o_conns_p) do
-                    if ((ref.o_conns_p[p].time > 0 and seq_end > 0) or (ref.o_conns_p[p].time < 0 and seq_start > 0)) then
-                        ref.outputs_m[t][p][i - 1]:fill(self.gconf.nn_act_default)
-                    end
-                end
-            end
-        end
-    end
-    ]]--
-    --set input flag for future layers
-    for i = 1, #ref.dim_out do
-        if (ref.outputs_b[t][i] == true) then
-            nerv.error("this time's outputs_b should be false")
-        end
-        ref.outputs_b[t][i] = true
-    end
-
-    --try dfs for further layers
-    for _, conn in pairs(ref.o_conns_p) do
-        --print("debug dfs-searching", conn.dst.ref.layer.id)
-        conn.dst.ref.inputs_b[t + conn.time][conn.dst.port] = true
-        self:propagate_dfs(conn.dst.ref, t + conn.time)
-    end
-end
-
---do_update: bool, whether we are doing back-propagate or updating the parameters
-function TNN:net_backpropagate(do_update) --propagate according to feeds_now
-    if do_update == nil then
-        nerv.error("do_update should not be nil")
-    end
-    for t = 1, self.chunk_size, 1 do
-        for id, ref in pairs(self.layers) do
-            for p = 1, #ref.dim_out do
-                ref.err_inputs_b[t][p] = false
-            end
-            for p = 1, #ref.dim_in do
-                ref.err_outputs_b[t][p] = false
-            end
-        end
-    end
-
-    local feeds_now = self.feeds_now
-    for t = 1, self.chunk_size do --some layer maybe do not have outputs from time 1..chunk_size
-        for id, ref in pairs(self.layers) do
-            if #ref.dim_out > 0 then --some layer is just there(only to save some parameter)
-                self:backpropagate_dfs(ref, t, do_update)
-            end
-        end
-    end
-    for t = 1, self.chunk_size do
-        if bit.band(feeds_now.flagsPack_now[t], nerv.TNN.FC.HAS_LABEL) > 0 then
-            for i = 1, #self.dim_out do
-                local ref = self.outputs_p[i].ref
-                local p = self.outputs_p[i].port
-                ref.err_inputs_b[t][p] = true
-                self:backpropagate_dfs(ref, t, do_update)
-            end
-        end
-    end
-
-    local flag_out = true
-    for t = 1, self.chunk_size do --check whether every output has been computed
-        if bit.band(feeds_now.flagsPack_now[t], nerv.TNN.FC.HAS_INPUT) > 0 then
-            for i = 1, #self.dim_in do
-                local ref = self.inputs_p[i].ref
-                if ref.err_outputs_b[t][1] ~= true then
-                    flag_out = false
-                    break
-                end
-            end
-        end
-    end
-    if (flag_out == false) then
-        nerv.error("some thing wrong, some input is not back_propagated")
-    end
-end
-
---ref: the TNN_ref of a layer
---t: the current time to propagate
-function TNN:backpropagate_dfs(ref, t, do_update)
-    if do_update == nil then
-        nerv.error("got a nil do_update")
-    end
-    if self:out_of_feedrange(t) then
-        return
-    end
-    if ref.err_outputs_b[t][1] == true then --already back_propagated, 1 is just a random port
-        return
-    end
-
-    --print("debug dfs", ref.layer.id, t)
-
-    local flag = true --whether have all inputs
-    for _, conn in pairs(ref.o_conns_p) do
-        local p = conn.src.port
-        if (not (ref.err_inputs_b[t][p] or self:out_of_feedrange(t + conn.time))) then
-            flag = false
-            break
-        end
-    end
-    if (flag == false) then
-        return
-    end
-
-    --ok, do back_propagate
-    --print("debug ok, back-propagating(or updating)")
-    if (do_update == false) then
-        self.gconf.timer:tic("tnn_actual_layer_backpropagate")
-        ref.layer:back_propagate(ref.err_inputs_m[t], ref.err_outputs_m[t], ref.inputs_m[t], ref.outputs_m[t], t)
-        self.gconf.timer:toc("tnn_actual_layer_backpropagate")
-        if self.clip_t > 0 then
-            for _, conn in pairs(ref.i_conns_p) do
-                local p = conn.dst.port --port for ref
-                if conn.time ~= 0 then
-                    --print("debug clip_t tnn", ref.id, "port:", p, "clip:", self.clip_t)
-                    ref.err_outputs_m[t][p]:clip(-self.clip_t, self.clip_t)
-                end
-            end
-        end
-    else
-        --print(ref.err_inputs_m[t][1])
-        self.gconf.timer:tic("tnn_actual_layer_update")
-        ref.layer:update(ref.err_inputs_m[t], ref.inputs_m[t], ref.outputs_m[t], t)
-        self.gconf.timer:toc("tnn_actual_layer_update")
-    end
-
-    if (do_update == false and bit.band(self.feeds_now.flagsPack_now[t], bit.bor(nerv.TNN.FC.SEQ_START, nerv.TNN.FC.SEQ_END)) > 0) then --flush cross-border errors
-        for i = 1, self.batch_size do
-            local seq_start = bit.band(self.feeds_now.flags_now[t][i], nerv.TNN.FC.SEQ_START)
-            local seq_end = bit.band(self.feeds_now.flags_now[t][i], nerv.TNN.FC.SEQ_END)
-            if (seq_start > 0 or seq_end > 0) then
-                for p, conn in pairs(ref.i_conns_p) do
-                    if ((ref.i_conns_p[p].time > 0 and seq_start > 0) or (ref.i_conns_p[p].time < 0 and seq_end > 0)) then --cross-border, set to zero
-                        ref.err_outputs_m[t][p][i - 1]:fill(0)
-                    end
-                end
-            end
-        end
-    end
-
-    for i = 1, #ref.dim_in do
-        if (ref.err_outputs_b[t][i] == true) then
-            nerv.error("this time's outputs_b should be false")
-        end
-        ref.err_outputs_b[t][i] = true
-    end
-
-    --try dfs for further layers
-    for _, conn in pairs(ref.i_conns_p) do
-        --print("debug dfs-searching", conn.src.ref.layer.id)
-        conn.src.ref.err_inputs_b[t - conn.time][conn.src.port] = true
-        self:backpropagate_dfs(conn.src.ref, t - conn.time, do_update)
-    end
-end
-
---Return: nerv.ParamRepo
-function TNN:get_params()
-    local param_repos = {}
-    for id, ref in pairs(self.layers) do
-        table.insert(param_repos, ref.layer:get_params())
-    end
-    return nerv.ParamRepo.merge(param_repos)
-end
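
For orientation, the TNN methods removed in the hunk above appear to be driven in roughly the following order by a training loop. This is a minimal sketch, not code from the repository: the ``tnn`` and ``reader`` objects and their construction are assumptions, and only the method names shown in the hunk are taken from the source::

    -- hypothetical driver loop; `tnn` and `reader` are assumed to be
    -- constructed elsewhere (their setup is not part of this hunk)
    tnn:flush_all()                      -- clear history and activations
    while true do
        local got_new, feeds = tnn:getfeed_from_reader(reader)
        if not got_new then
            break
        end
        tnn:net_propagate()              -- forward pass over the current chunk
        tnn:net_backpropagate(false)     -- back-propagate errors only
        tnn:net_backpropagate(true)      -- second pass applies parameter updates
        tnn:move_right_to_nextmb()       -- carry border history into the next minibatch
    end
    local params = tnn:get_params()      -- merged nerv.ParamRepo of all sub-layers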