59 files changed, 2008 insertions, 1196 deletions
diff --git a/.gitmodules b/.gitmodules index 9f556c5..2b346c4 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,6 +1,9 @@ [submodule "luajit-2.0"] path = luajit-2.0 - url = http://luajit.org/git/luajit-2.0.git + url = https://speechlab.sjtu.edu.cn/gitlab/nerv-dev/luajit.git [submodule "luarocks"] path = luarocks - url = https://github.com/keplerproject/luarocks.git + url = https://speechlab.sjtu.edu.cn/gitlab/nerv-dev/luarocks.git +[submodule "Penlight"] + path = Penlight + url = https://speechlab.sjtu.edu.cn/gitlab/nerv-dev/Penlight.git @@ -1,19 +1,42 @@ .PHONY: all clean install luajit luarocks speech +############## EDIT THESE LINES ##################### SHELL := /bin/bash PREFIX := $(CURDIR)/install/ -all: luajit luarocks install +#CUDA_BASE := /usr/local/cuda-7.0 +CUDA_BASE := /usr/local/cuda +BLAS_BASE := /usr/lib/ +BLAS_LDFLAGS := -L$(BLAS_BASE) -Wl,-rpath=$(BLAS_BASE) +BLAS_TYPE := atlas +KALDI_BASE := /speechlab/tools/KALDI/kaldi-master/ +####################################################### +MKL_LDFLAGS := -lmkl_rt +ATLAS_LDFLAGS := -lcblas -llapack_atlas +ifeq ($(BLAS_TYPE), mkl) +BLAS_LDFLAGS += $(MKL_LDFLAGS) +else ifeq ($(BLAS_TYPE), atlas) +BLAS_LDFLAGS += $(ATLAS_LDFLAGS) +else +$(error Invalid blas type) +endif +export CUDA_BASE +export KALDI_BASE +export BLAS_LDFLAGS + +.PHONY: nerv speech/speech_utils speech/htk_io speech/kaldi_io speech/kaldi_decode \ + nerv-clean speech/speech_utils-clean speech/htk_io-clean speech/kaldi_io-clean speech/kaldi_decode-clean \ + Penlight + +all: luajit luarocks Penlight nerv luajit: PREFIX=$(PREFIX) ./tools/build_luajit.sh luarocks: PREFIX=$(PREFIX) ./tools/build_luarocks.sh -install: - cd nerv; $(PREFIX)/bin/luarocks make CFLAGS=$(CFLAGS) -speech: - cd speech/speech_utils; $(PREFIX)/bin/luarocks make - cd speech/htk_io; $(PREFIX)/bin/luarocks make - cd speech/kaldi_io; $(PREFIX)/bin/luarocks make -clean: - cd nerv && make clean - cd speech/speech_utils && make clean - cd speech/htk_io && make clean - cd speech/kaldi_io && make clean + +speech: speech/speech_utils speech/htk_io speech/kaldi_io speech/kaldi_decode +speech-clean: speech/speech_utils-clean speech/htk_io-clean speech/kaldi_io-clean speech/kaldi_decode-clean +clean: nerv-clean speech-clean + +nerv Penlight speech/speech_utils speech/htk_io speech/kaldi_io speech/kaldi_decode: + cd $@; $(PREFIX)/bin/luarocks make +nerv-clean speech/speech_utils-clean speech/htk_io-clean speech/kaldi_io-clean speech/kaldi_decode-clean: + cd $(subst -clean,,$@); make clean LUA_BINDIR=$(PREFIX)/bin/ diff --git a/Penlight b/Penlight new file mode 160000 +Subproject 16d149338af9efc910528641c5240c5641aeb8d diff --git a/README.md b/README.md deleted file mode 100644 index fe9dfc1..0000000 --- a/README.md +++ /dev/null @@ -1,55 +0,0 @@ -#The Nerv Toolkit User Manual# -NOTE: This readme is obsolete and will be rearranged, for further information, please check http://nerv-sjtu.github.io/nerv/ - -This user manual will information about how to use __Nerv__ and __Nerv__'s interface. - -##How to make and start using## -First make sure you have __lua__ and __CUDA__ installed on your computer. -__Nerv__ is currently developed via github.You can download and make __Nerv__ by doing the following: -``` -cd ~ -git clone https://github.com/Nerv-SJTU/nerv.git -cd nerv -git submodule init && git submodule update -make -#To include some new CUDA feature(e.x. 
atomicCAS), use "make CFLAGS=-D__NERV_FUTURE_CUDA_7" - -#further, if you want the speech modules -git clone https://github.com/Nerv-SJTU/nerv-speech.git speech -make speech -``` -The `git submodule` command is for the __luajit__ repository inside __Nerv__. -Now, you can try to run some example scripts. -``` -./install/bin/nerv examples/cumatrix_example.lua -``` -To get an example of DNN(for ASR) training, run(this requires the speech modules) -You need to be at or (copy files from) `/slfs1`(SJTU speechlab cluster) to get this running. -``` -./install/bin/nerv nerv/examples/asr_trainer.lua nerv/examples/swb_baseline.lua -``` - -##How to contribute## -Fork the original repository, then use the __pull&merge__ function in github to contribute. -The pull&merge request can be found on your dashboard in github. See this [sync-help] to sync with the original repository. - -##Nerv Packages## -* __luaT__ -Nerv uses [luaT]\(a [Torch] library\) to define lua class in C. -* __[The Nerv OOP](nerv/doc/nerv_class.md)__ -Enables object-oriented programming in Nerv. -* __[The Nerv utility functions](nerv/doc/nerv.md)__ -Inlcudes some utility functions from luaT to implement __Nerv.Class__. -* __[The Nerv Matrix Package](nerv/doc/nerv_matrix.md)__ -The matrix package is a basic package in __Nerv__ that is used to store and manipulate matrices. -* __[The Nerv IO Package](nerv/doc/nerv_io.md)__ -The IO package is used to read and write parameters to file. -* __[The Nerv Parameter Package](nerv/doc/nerv_param.md)__ -The parameter package is used to store, read model parameters from file. -* __[The Nerv Layer Package](nerv/doc/nerv_layer.md)__ -The layer package is used to define propagation and backpropagation of different type of layers. -* __[The Nerv NN Package](nerv/doc/nerv_nn.md)__ -The nn package is for organizing a neural network, it contains __nerv.LayerRepo__, __nerv.ParamRepo__, and __nerv.DAGLayer__. -[luaT]:https://github.com/torch/torch7/tree/master/lib/luaT -[Torch]:https://github.com/torch -[sync-help]:https://help.github.com/articles/syncing-a-fork/ diff --git a/README.rst b/README.rst new file mode 100644 index 0000000..c00743c --- /dev/null +++ b/README.rst @@ -0,0 +1,64 @@ +NERV Toolkit +============ + +NOTE: This readme is in-progress. + +Installation +------------ +First, make sure you have at least one implementation of BLAS and CUDA installed +on your computer. + +- Checkout NERV: + + :: + + bash + git clone https://speechlab.sjtu.edu.cn/gitlab/nerv-dev/nerv.git + +- Checkout submodules (luajit, luarocks, Penlight, etc.): + + :: + + cd nerv + git submodule init && git submodule update + +- Build NERV: you can specify either ``mkl`` or ``atlas`` to ``BLAS_TYPE``. + ``BLAS_BASE`` is the directory containing BLAS ``.so`` files. By default, + ``atlas`` is used for ``BLAS_TYPE``, ``/usr/lib/`` is used for ``BLAS_BASE``, + and ``/usr/local/cuda`` is used for ``CUDA_BASE``. + + :: + + # an example for compiling on SJTU Speechlab major cluster + make BLAS_TYPE=mkl BLAS_BASE=/home/intel/mkl/lib/intel64/ CUDA_BASE=/usr/local/cuda + +- To include some new features (e.g. ``atomicCAS`` in CUDA), add corresponding flags to + ``NERV_FEAT`` (e.g. 
``NERV_FEAT=-D__NERV_FUTURE_CUDA_7``) while making: + + :: + + make NERV_FEAT=-D__NERV_FUTURE_CUDA_7 BLAS_TYPE=mkl BLAS_BASE=/home/intel/mkl/lib/intel64/ CUDA_BASE=/usr/local/cuda + +- For speech tasks, you need to install related lua rocks (Lua packages): + + :: + + # checkout speech repository to local directory nerv/speech (suppose you're + # still at the root directory of NERV repo) + git clone https://speechlab.sjtu.edu.cn/gitlab/nerv-dev/nerv-speech.git speech + # build and install HTK I/O support, Kaldi I/O support, Kaldi decoding support, etc. + make speech BLAS_TYPE=mkl BLAS_BASE=/home/intel/mkl/lib/intel64/ + +Example & Tutorial +------------------ +For speech tasks, please refer to ``tutorial/`` in ``nerv-speech`` repository. + +Contribution +------------ +The basic rule is simple: just fork the original repository, then create a pull +request (merge request) to the administrator of the project. If you want to fix +any bugs in existing code, don't hesitate to create a pull (merge) request to +the repository with clear and detailed analysis of the problem. If you want to +add additional task-specific functionalities (modules) for speech to NERV, +please create a luarocks-compliant package and also a pull (merge) request to +the ``nerv-speech`` repository instead of ``nerv``. diff --git a/lua/config.lua b/lua/config.lua deleted file mode 100644 index 1ec1198..0000000 --- a/lua/config.lua +++ /dev/null @@ -1,67 +0,0 @@ -function get_global_conf() - local global_conf = { - lrate = 0.15, - wcost = 1e-5, - momentum = 0, - clip = 5, - cumat_type = nerv.CuMatrixFloat, - mmat_type = nerv.MMatrixFloat, - vocab_size = 10000, - nn_act_default = 0, - hidden_size = 300, - layer_num = 1, - chunk_size = 15, - batch_size = 20, - max_iter = 3, - param_random = function() return (math.random() / 5 - 0.1) end, - dropout = 0.5, - timer = nerv.Timer(), - pr = nerv.ParamRepo(), - } - return global_conf -end - -function get_layers(global_conf) - local pr = global_conf.pr - local layers = { - ['nerv.LSTMLayer'] = {}, - ['nerv.DropoutLayer'] = {}, - ['nerv.SelectLinearLayer'] = { - ['select'] = {dim_in = {1}, dim_out = {global_conf.hidden_size}, vocab = global_conf.vocab_size, pr = pr}, - }, - ['nerv.CombinerLayer'] = {}, - ['nerv.AffineLayer'] = { - output = {dim_in = {global_conf.hidden_size}, dim_out = {global_conf.vocab_size}, pr = pr} - }, - ['nerv.SoftmaxCELayer'] = { - softmax = {dim_in = {global_conf.vocab_size, global_conf.vocab_size}, dim_out = {1}, compressed = true}, - }, - } - for i = 1, global_conf.layer_num do - layers['nerv.LSTMLayer']['lstm' .. i] = {dim_in = {global_conf.hidden_size, global_conf.hidden_size, global_conf.hidden_size}, dim_out = {global_conf.hidden_size, global_conf.hidden_size}, pr = pr} - layers['nerv.DropoutLayer']['dropout' .. i] = {dim_in = {global_conf.hidden_size}, dim_out = {global_conf.hidden_size}} - layers['nerv.CombinerLayer']['dup' .. i] = {dim_in = {global_conf.hidden_size}, dim_out = {global_conf.hidden_size, global_conf.hidden_size}, lambda = {1}} - end - return layers -end - -function get_connections(global_conf) - local connections = { - {'<input>[1]', 'select[1]', 0}, - {'select[1]', 'lstm1[1]', 0}, - {'dropout' .. global_conf.layer_num .. '[1]', 'output[1]', 0}, - {'output[1]', 'softmax[1]', 0}, - {'<input>[2]', 'softmax[2]', 0}, - {'softmax[1]', '<output>[1]', 0}, - } - for i = 1, global_conf.layer_num do - table.insert(connections, {'lstm' .. i .. '[1]', 'dup' .. i .. '[1]', 0}) - table.insert(connections, {'lstm' .. i .. '[2]', 'lstm' .. i .. 
'[3]', 1}) - table.insert(connections, {'dup' .. i .. '[1]', 'lstm' .. i .. '[2]', 1}) - table.insert(connections, {'dup' .. i .. '[2]', 'dropout' .. i .. '[1]', 0}) - if i > 1 then - table.insert(connections, {'dropout' .. (i - 1) .. '[1]', 'lstm' .. i .. '[1]', 0}) - end - end - return connections -end diff --git a/lua/main.lua b/lua/main.lua deleted file mode 100644 index ce0270a..0000000 --- a/lua/main.lua +++ /dev/null @@ -1,45 +0,0 @@ -nerv.include('reader.lua') -nerv.include('timer.lua') -nerv.include('config.lua') -nerv.include(arg[1]) - -local global_conf = get_global_conf() -local timer = global_conf.timer - -timer:tic('IO') - -local data_path = 'nerv/nerv/examples/lmptb/PTBdata/' -local train_reader = nerv.Reader(data_path .. 'vocab', data_path .. 'ptb.valid.txt.adds') -local val_reader = nerv.Reader(data_path .. 'vocab', data_path .. 'ptb.valid.txt.adds') - -local train_data = train_reader:get_all_batch(global_conf) -local val_data = val_reader:get_all_batch(global_conf) - -local layers = get_layers(global_conf) -local connections = get_connections(global_conf) - -local NN = nerv.NN(global_conf, train_data, val_data, layers, connections) - -timer:toc('IO') -timer:check('IO') -io.flush() - -timer:tic('global') -local best_cv = 1e10 -for i = 1, global_conf.max_iter do - timer:tic('Epoch' .. i) - local train_ppl, val_ppl = NN:epoch() - if val_ppl < best_cv then - best_cv = val_ppl - else - global_conf.lrate = global_conf.lrate / 2.0 - end - nerv.printf('Epoch %d: %f %f %f\n', i, global_conf.lrate, train_ppl, val_ppl) - timer:toc('Epoch' .. i) - timer:check('Epoch' .. i) - io.flush() -end -timer:toc('global') -timer:check('global') -timer:check('network') -timer:check('gc') diff --git a/lua/network.lua b/lua/network.lua deleted file mode 100644 index 0c11321..0000000 --- a/lua/network.lua +++ /dev/null @@ -1,113 +0,0 @@ -nerv.include('select_linear.lua') - -local nn = nerv.class('nerv.NN') - -function nn:__init(global_conf, train_data, val_data, layers, connections) - self.gconf = global_conf - self.network = self:get_network(layers, connections) - self.train_data = self:get_data(train_data) - self.val_data = self:get_data(val_data) -end - -function nn:get_network(layers, connections) - self.gconf.dropout_rate = 0 - local layer_repo = nerv.LayerRepo(layers, self.gconf.pr, self.gconf) - local graph = nerv.GraphLayer('graph', self.gconf, - {dim_in = {1, self.gconf.vocab_size}, dim_out = {1}, - layer_repo = layer_repo, connections = connections}) - local network = nerv.Network('network', self.gconf, - {network = graph, clip = self.gconf.clip}) - network:init(self.gconf.batch_size, self.gconf.chunk_size) - return network -end - -function nn:get_data(data) - local err_output = {} - local softmax_output = {} - local output = {} - for i = 1, self.gconf.chunk_size do - err_output[i] = self.gconf.cumat_type(self.gconf.batch_size, 1) - softmax_output[i] = self.gconf.cumat_type(self.gconf.batch_size, self.gconf.vocab_size) - output[i] = self.gconf.cumat_type(self.gconf.batch_size, 1) - end - local ret = {} - for i = 1, #data do - ret[i] = {} - ret[i].input = {} - ret[i].output = {} - ret[i].err_input = {} - ret[i].err_output = {} - for t = 1, self.gconf.chunk_size do - ret[i].input[t] = {} - ret[i].output[t] = {} - ret[i].err_input[t] = {} - ret[i].err_output[t] = {} - ret[i].input[t][1] = data[i].input[t] - ret[i].input[t][2] = data[i].output[t] - ret[i].output[t][1] = output[t] - local err_input = self.gconf.mmat_type(self.gconf.batch_size, 1) - for j = 1, self.gconf.batch_size do - if t <= 
data[i].seq_len[j] then - err_input[j - 1][0] = 1 - else - err_input[j - 1][0] = 0 - end - end - ret[i].err_input[t][1] = self.gconf.cumat_type.new_from_host(err_input) - ret[i].err_output[t][1] = err_output[t] - ret[i].err_output[t][2] = softmax_output[t] - end - ret[i].seq_length = data[i].seq_len - ret[i].new_seq = {} - for j = 1, self.gconf.batch_size do - if data[i].seq_start[j] then - table.insert(ret[i].new_seq, j) - end - end - end - return ret -end - -function nn:process(data, do_train) - local timer = self.gconf.timer - local total_err = 0 - local total_frame = 0 - for id = 1, #data do - if do_train then - self.gconf.dropout_rate = self.gconf.dropout - data[id].do_train = true - else - self.gconf.dropout_rate = 0 - data[id].do_train = false - end - timer:tic('network') - self.network:mini_batch_init(data[id]) - self.network:propagate() - timer:toc('network') - for t = 1, self.gconf.chunk_size do - local tmp = data[id].output[t][1]:new_to_host() - for i = 1, self.gconf.batch_size do - if t <= data[id].seq_length[i] then - total_err = total_err + math.log10(math.exp(tmp[i - 1][0])) - total_frame = total_frame + 1 - end - end - end - if do_train then - timer:tic('network') - self.network:back_propagate() - self.network:update() - timer:toc('network') - end - timer:tic('gc') - collectgarbage('collect') - timer:toc('gc') - end - return math.pow(10, - total_err / total_frame) -end - -function nn:epoch() - local train_error = self:process(self.train_data, true) - local val_error = self:process(self.val_data, false) - return train_error, val_error -end diff --git a/lua/reader.lua b/lua/reader.lua deleted file mode 100644 index 0c7bcb6..0000000 --- a/lua/reader.lua +++ /dev/null @@ -1,113 +0,0 @@ -local Reader = nerv.class('nerv.Reader') - -function Reader:__init(vocab_file, input_file) - self:get_vocab(vocab_file) - self:get_seq(input_file) -end - -function Reader:get_vocab(vocab_file) - local f = io.open(vocab_file, 'r') - local id = 0 - self.vocab = {} - while true do - local word = f:read() - if word == nil then - break - end - self.vocab[word] = id - id = id + 1 - end - self.size = id -end - -function Reader:split(s, t) - local ret = {} - for x in (s .. t):gmatch('(.-)' .. 
t) do - table.insert(ret, x) - end - return ret -end - -function Reader:get_seq(input_file) - local f = io.open(input_file, 'r') - self.seq = {} - while true do - local seq = f:read() - if seq == nil then - break - end - seq = self:split(seq, ' ') - local tmp = {} - for i = 1, #seq do - if seq[i] ~= '' then - table.insert(tmp, self.vocab[seq[i]]) - end - end - table.insert(self.seq, tmp) - end -end - -function Reader:get_in_out(id, pos) - return self.seq[id][pos], self.seq[id][pos + 1], pos + 1 == #self.seq[id] -end - -function Reader:get_all_batch(global_conf) - local data = {} - local pos = {} - local offset = 1 - for i = 1, global_conf.batch_size do - pos[i] = nil - end - --while true do - for i = 1, 100 do - local input = {} - local output = {} - for i = 1, global_conf.chunk_size do - input[i] = global_conf.mmat_type(global_conf.batch_size, 1) - input[i]:fill(global_conf.nn_act_default) - output[i] = global_conf.mmat_type(global_conf.batch_size, 1) - output[i]:fill(global_conf.nn_act_default) - end - local seq_start = {} - local seq_end = {} - local seq_len = {} - for i = 1, global_conf.batch_size do - seq_start[i] = false - seq_end[i] = false - seq_len[i] = 0 - end - local has_new = false - for i = 1, global_conf.batch_size do - if pos[i] == nil then - if offset < #self.seq then - seq_start[i] = true - pos[i] = {offset, 1} - offset = offset + 1 - end - end - if pos[i] ~= nil then - has_new = true - for j = 1, global_conf.chunk_size do - local final - input[j][i-1][0], output[j][i-1][0], final = self:get_in_out(pos[i][1], pos[i][2]) - seq_len[i] = j - if final then - seq_end[i] = true - pos[i] = nil - break - end - pos[i][2] = pos[i][2] + 1 - end - end - end - if not has_new then - break - end - for i = 1, global_conf.chunk_size do - input[i] = global_conf.cumat_type.new_from_host(input[i]) - output[i] = global_conf.cumat_type.new_from_host(output[i]) - end - table.insert(data, {input = input, output = output, seq_start = seq_start, seq_end = seq_end, seq_len = seq_len}) - end - return data -end diff --git a/lua/select_linear.lua b/lua/select_linear.lua deleted file mode 100644 index a7e20cc..0000000 --- a/lua/select_linear.lua +++ /dev/null @@ -1,62 +0,0 @@ -local SL = nerv.class('nerv.SelectLinearLayer', 'nerv.Layer') - ---id: string ---global_conf: table ---layer_conf: table ---Get Parameters -function SL:__init(id, global_conf, layer_conf) - self.id = id - self.dim_in = layer_conf.dim_in - self.dim_out = layer_conf.dim_out - self.gconf = global_conf - - self.vocab = layer_conf.vocab - self.ltp = self:find_param("ltp", layer_conf, global_conf, nerv.LinearTransParam, {self.vocab, self.dim_out[1]}) --layer_conf.ltp - - self:check_dim_len(1, 1) -end - ---Check parameter -function SL:init(batch_size) - if (self.dim_in[1] ~= 1) then --one word id - nerv.error("mismatching dimensions of ltp and input") - end - if (self.dim_out[1] ~= self.ltp.trans:ncol()) then - nerv.error("mismatching dimensions of bp and output") - end - - self.batch_size = bath_size - self.ltp:train_init() -end - -function SL:update(bp_err, input, output) - --use this to produce reproducable result, don't forget to set the dropout to zero! 
- --for i = 1, input[1]:nrow(), 1 do - -- local word_vec = self.ltp.trans[input[1][i - 1][0]] - -- word_vec:add(word_vec, bp_err[1][i - 1], 1, - self.gconf.lrate / self.gconf.batch_size) - --end - - --I tried the update_select_rows kernel which uses atomicAdd, but it generates unreproducable result - self.ltp.trans:update_select_rows_by_colidx(bp_err[1], input[1], - self.gconf.lrate / self.gconf.batch_size, 0) - self.ltp.trans:add(self.ltp.trans, self.ltp.trans, 1.0, - self.gconf.lrate * self.gconf.wcost) -end - -function SL:propagate(input, output) - --for i = 0, input[1]:ncol() - 1, 1 do - -- if (input[1][0][i] > 0) then - -- output[1][i]:copy_fromd(self.ltp.trans[input[1][0][i]]) - -- else - -- output[1][i]:fill(0) - -- end - --end - output[1]:copy_rows_fromd_by_colidx(self.ltp.trans, input[1]) -end - -function SL:back_propagate(bp_err, next_bp_err, input, output) - --input is compressed, do nothing -end - -function SL:get_params() - local paramRepo = nerv.ParamRepo({self.ltp}) - return paramRepo -end diff --git a/lua/timer.lua b/lua/timer.lua deleted file mode 100644 index 2c54ca8..0000000 --- a/lua/timer.lua +++ /dev/null @@ -1,33 +0,0 @@ -local Timer = nerv.class("nerv.Timer") - -function Timer:__init() - self.last = {} - self.rec = {} -end - -function Timer:tic(item) - self.last[item] = os.clock() -end - -function Timer:toc(item) - if (self.last[item] == nil) then - nerv.error("item not there") - end - if (self.rec[item] == nil) then - self.rec[item] = 0 - end - self.rec[item] = self.rec[item] + os.clock() - self.last[item] -end - -function Timer:check(item) - if self.rec[item]==nil then - nerv.error('item not there') - end - nerv.printf('"%s" lasts for %f secs.\n',item,self.rec[item]) -end - -function Timer:flush() - for key, value in pairs(self.rec) do - self.rec[key] = nil - end -end diff --git a/lua/tnn.lua b/lua/tnn.lua deleted file mode 100644 index bf9f118..0000000 --- a/lua/tnn.lua +++ /dev/null @@ -1,136 +0,0 @@ -nerv.include('select_linear.lua') - -local reader = nerv.class('nerv.TNNReader') - -function reader:__init(global_conf, data) - self.gconf = global_conf - self.offset = 0 - self.data = data -end - -function reader:get_batch(feeds) - self.offset = self.offset + 1 - if self.offset > #self.data then - return false - end - for i = 1, self.gconf.chunk_size do - feeds.inputs_m[i][1]:copy_from(self.data[self.offset].input[i]) - feeds.inputs_m[i][2]:copy_from(self.data[self.offset].output[i]:decompress(self.gconf.vocab_size)) - end - feeds.flags_now = self.data[self.offset].flags - feeds.flagsPack_now = self.data[self.offset].flagsPack - return true -end - -function reader:has_data(t, i) - return t <= self.data[self.offset].seq_len[i] -end - -function reader:get_err_input() - return self.data[self.offset].err_input -end - -local nn = nerv.class('nerv.NN') - -function nn:__init(global_conf, train_data, val_data, layers, connections) - self.gconf = global_conf - self.tnn = self:get_tnn(layers, connections) - self.train_data = self:get_data(train_data) - self.val_data = self:get_data(val_data) -end - -function nn:get_tnn(layers, connections) - self.gconf.dropout_rate = 0 - local layer_repo = nerv.LayerRepo(layers, self.gconf.pr, self.gconf) - local tnn = nerv.TNN('TNN', self.gconf, {dim_in = {1, self.gconf.vocab_size}, - dim_out = {1}, sub_layers = layer_repo, connections = connections, - clip = self.gconf.clip}) - tnn:init(self.gconf.batch_size, self.gconf.chunk_size) - return tnn -end - -function nn:get_data(data) - local ret = {} - for i = 1, #data do - ret[i] = {} - 
ret[i].input = data[i].input - ret[i].output = data[i].output - ret[i].flags = {} - ret[i].err_input = {} - for t = 1, self.gconf.chunk_size do - ret[i].flags[t] = {} - local err_input = self.gconf.mmat_type(self.gconf.batch_size, 1) - for j = 1, self.gconf.batch_size do - if t <= data[i].seq_len[j] then - ret[i].flags[t][j] = nerv.TNN.FC.SEQ_NORM - err_input[j - 1][0] = 1 - else - ret[i].flags[t][j] = 0 - err_input[j - 1][0] = 0 - end - end - ret[i].err_input[t] = self.gconf.cumat_type.new_from_host(err_input) - end - for j = 1, self.gconf.batch_size do - if data[i].seq_start[j] then - ret[i].flags[1][j] = bit.bor(ret[i].flags[1][j], nerv.TNN.FC.SEQ_START) - end - if data[i].seq_end[j] then - local t = data[i].seq_len[j] - ret[i].flags[t][j] = bit.bor(ret[i].flags[t][j], nerv.TNN.FC.SEQ_END) - end - end - ret[i].flagsPack = {} - for t = 1, self.gconf.chunk_size do - ret[i].flagsPack[t] = 0 - for j = 1, self.gconf.batch_size do - ret[i].flagsPack[t] = bit.bor(ret[i].flagsPack[t], ret[i].flags[t][j]) - end - end - ret[i].seq_len = data[i].seq_len - end - return ret -end - -function nn:process(data, do_train) - local total_err = 0 - local total_frame = 0 - local reader = nerv.TNNReader(self.gconf, data) - while true do - local r, _ = self.tnn:getfeed_from_reader(reader) - if not r then - break - end - if do_train then - self.gconf.dropout_rate = self.gconf.dropout - else - self.gconf.dropout_rate = 0 - end - self.tnn:net_propagate() - for t = 1, self.gconf.chunk_size do - local tmp = self.tnn.outputs_m[t][1]:new_to_host() - for i = 1, self.gconf.batch_size do - if reader:has_data(t, i) then - total_err = total_err + math.log10(math.exp(tmp[i - 1][0])) - total_frame = total_frame + 1 - end - end - end - if do_train then - local err_input = reader:get_err_input() - for i = 1, self.gconf.chunk_size do - self.tnn.err_inputs_m[i][1]:copy_from(err_input[i]) - end - self.tnn:net_backpropagate(false) - self.tnn:net_backpropagate(true) - end - collectgarbage('collect') - end - return math.pow(10, - total_err / total_frame) -end - -function nn:epoch() - local train_error = self:process(self.train_data, true) - local val_error = self:process(self.val_data, false) - return train_error, val_error -end diff --git a/nerv/Makefile b/nerv/Makefile index a9b4baf..421eda0 100644 --- a/nerv/Makefile +++ b/nerv/Makefile @@ -1,3 +1,11 @@ +ifndef LUA_BINDIR +$(error Please build the package via luarocks: `luarocks make`) +endif + +ifndef CUDA_BASE +$(error CUDA_BASE is not set) +endif + .PHONY: build install clean SHELL := /bin/bash @@ -34,18 +42,18 @@ LUA_LIBS := matrix/init.lua io/init.lua init.lua \ layer/init.lua layer/affine.lua layer/sigmoid.lua layer/tanh.lua layer/softmax_ce.lua layer/softmax.lua \ layer/window.lua layer/bias.lua layer/combiner.lua layer/mse.lua \ layer/elem_mul.lua layer/lstm.lua layer/lstm_gate.lua layer/dropout.lua layer/gru.lua \ - layer/graph.lua layer/rnn.lua layer/duplicate.lua layer/identity.lua \ + layer/graph.lua layer/rnn.lua layer/duplicate.lua layer/identity.lua \ nn/init.lua nn/layer_repo.lua nn/param_repo.lua nn/layer_dag.lua nn/network.lua \ io/sgd_buffer.lua \ tnn/init.lua tnn/sutil.lua tnn/tnn.lua INCLUDE := -I $(LUA_INCDIR) -DLUA_USE_APICHECK -#CUDA_BASE := /usr/local/cuda-7.0 -CUDA_BASE := /usr/local/cuda CUDA_INCLUDE := -I $(CUDA_BASE)/include/ INCLUDE += $(CUDA_INCLUDE) -LDFLAGS := -L$(CUDA_BASE)/lib64/ -Wl,-rpath=$(CUDA_BASE)/lib64/ -lcudart -lcublas -lcurand +CUDA_LDFLAGS := -L$(CUDA_BASE)/lib64/ -Wl,-rpath=$(CUDA_BASE)/lib64/ -lcudart -lcuda -lcublas -lcurand 
+override CFLAGS += $(NERV_FEAT) + NVCC := $(CUDA_BASE)/bin/nvcc EMPTY := SPACE := $(EMPTY) $(EMPTY) @@ -66,11 +74,11 @@ $(LUA_DIR)/%.lua: %.lua cp $< $@ $(LIB_PATH)/libnervcore.so: $(CORE_OBJS) - gcc -shared -o $@ $^ $(LDFLAGS) -lcblas + gcc -shared -o $@ $^ $(LDFLAGS) $(CUDA_LDFLAGS) $(BLAS_LDFLAGS) $(LIB_PATH)/libluaT.so: $(LUAT_OBJS) - gcc -shared -o $@ $^ $(LDFLAGS) + gcc -shared -o $@ $^ $(INST_LIBDIR)/libnerv.so: $(NERV_OBJS) $(LIB_PATH)/libnervcore.so $(LIB_PATH)/libluaT.so - gcc -shared -o $@ $(NERV_OBJS) $(LDFLAGS) -Wl,-rpath=$(LIB_PATH) -L$(LIB_PATH) -lnervcore -lluaT + gcc -shared -o $@ $(NERV_OBJS) -Wl,-rpath=$(LIB_PATH) -L$(LIB_PATH) -lnervcore -lluaT $(OBJ_DIR)/matrix/cumatrix.o: matrix/generic/cumatrix.c matrix/generic/matrix.c $(OBJ_DIR)/matrix/mmatrix.o: matrix/generic/mmatrix.c matrix/generic/matrix.c diff --git a/nerv/doc/nerv.md b/nerv/doc/nerv.md index 28411f5..125928d 100644 --- a/nerv/doc/nerv.md +++ b/nerv/doc/nerv.md @@ -1,6 +1,6 @@ -#The Nerv utility functions# +# The Nerv utility functions Part of the [Nerv](../README.md) toolkit. -##Methods## +## Methods * __string = nerv.typename(obj a)__ A registered function, the original function is `luaT_lua_typename`. In some cases if you call `type(a)` for object of some class in __Nerv__(like __Nerv.CuMatrix__) it will only return "userdata"(because it is created in C), in this case you can use this method to get its type. @@ -14,4 +14,4 @@ A registered function, the original function is `luaT_newmetatable`, it returns * __string = nerv.setmetatable(table self, string tname)__ A registered function, the original function is `luaT_lua_setmetatable`. It assigns the metatable registered in __luaT__ by the name *tname* to the table *self*. And return *tname* to user. * __table = nerv.get_type(string typename)__ -Returns the type(`loadstring("return " .. typename)`).
\ No newline at end of file +Returns the type(`loadstring("return " .. typename)`). diff --git a/nerv/doc/nerv_class.md b/nerv/doc/nerv_class.md index 99f63e7..8314b12 100644 --- a/nerv/doc/nerv_class.md +++ b/nerv/doc/nerv_class.md @@ -1,10 +1,10 @@ -#The Nerv OOP# +# The Nerv OOP Part of the [Nerv](../README.md) toolkit. -##Methods## +## Methods * __metatable mt, metatable mpt = nerv.class(string tname, string parenttname)__ This method is used to create a class by the name `tname`, which inherits `parenttname` in __Nerv__, then you create a new instance of this class by calling `obj=tname(...)`. The `tname.__init(...)` method(if defined) will be called in the constructing. The metatable of the class and its parent class will be returned. -##Examples## +## Examples * This example implements a simple `nerv.Counter` class which is inherited by `nerv.BetterCounter`. ``` @@ -33,4 +33,4 @@ c1 = nerv.Counter(1) print(c1.c) bc1 = nerv.BetterCounter(1, 1) print(bc1.c, bc1.bc) -```
\ No newline at end of file +``` diff --git a/nerv/doc/nerv_io.md b/nerv/doc/nerv_io.md index 07589df..299362f 100644 --- a/nerv/doc/nerv_io.md +++ b/nerv/doc/nerv_io.md @@ -1,7 +1,7 @@ -#The Nerv IO Package# +# The Nerv IO Package Part of the [Nerv](../README.md) toolkit. -##Description## +## Description The main class that the user uses to store and read parameter object to and from files is __nerv.ChunkFile__. In the file, a parameter object will be saved using a standard format. First is the length(in byte) of this object, then a table which includes some meta information of the object, and a data area. Below is an example text file. ``` @@ -23,7 +23,7 @@ In the file, a parameter object will be saved using a standard format. First is 3.000000 3.000000 3.000000 ``` -##Methods## +## Methods * __ChunkFile ChunkFile(string fn, string mode)__ `mode` can be `r` or `w`, for reading or writing a file. The returned __ChunkFile__ will be ready to write or read objects which follows the __nerv.Param__ interface(using `write_chunk` and `read_chunk`). * __void ChunkFile.write_chunk(ChunkFile self, Param p)__ @@ -33,7 +33,7 @@ Read the __Param__ object by id `id` from the file `self`. It will be constructe * __void ChunkFile.close(ChunkFile self)__ Close the opened file. -##Examples## +## Examples * An example showing how to use __ChunkFile__ to store and read parameter objects. ``` require 'io' @@ -96,7 +96,7 @@ do end ``` -##Developer Notes## +## Developer Notes * There are four classes in to deal with chunk data, which are __nerv.ChunkFile__, __nerv.ChunkFileHandle__, __nerv.ChunkInfo__, __nerv.ChunkData__. Below is the underlying C structs. ``` typedef struct ChunkFileHandle { @@ -110,4 +110,5 @@ typedef struct ChunkData { char *data; } ChunkData; ``` -* In __Nerv.io__, a returned(by `ChunkFile.__init`) __nerv.ChunkFile__ will have a member `handle`, which is a __nerv.ChunkFileHandle__.
\ No newline at end of file + +* In __Nerv.io__, a returned(by `ChunkFile.__init`) __nerv.ChunkFile__ will have a member `handle`, which is a __nerv.ChunkFileHandle__. diff --git a/nerv/doc/nerv_layer.md b/nerv/doc/nerv_layer.md index de2fb12..dd7c9bb 100644 --- a/nerv/doc/nerv_layer.md +++ b/nerv/doc/nerv_layer.md @@ -1,9 +1,9 @@ -#The Nerv Layer Package# +# The Nerv Layer Package Part of the [Nerv](../README.md) toolkit. -##Description## +## Description __nerv.Layer__ is the base class and most of its methods are abstract. -###Class hierarchy and their members### +### Class hierarchy and their members * __nerv.Layer__. * `table dim_in` It specifies the dimensions of the inputs. * `table dim_out` It specifies the dimensions of the outputs. @@ -20,7 +20,7 @@ __nerv.Layer__ is the base class and most of its methods are abstract. * `int total_frams` Records how many frames have passed. * `bool compressed` The reference distribution can be a one-hot format. This feature is enabled by `layer_conf.compressed`. -##Methods## +## Methods * __void Layer.\_\_init(Layer self, string id, table global_conf, table layer_conf)__ Abstract method. The constructing method should assign `id` to `self.id` and `global_conf` to `self.gconf`, `layer_conf.dim_in` to `self.dim_in`, `layer_conf.dim_out` to `self.dim_out`. `dim_in` and `dim_out` are a list specifies the dimensions of the inputs and outputs. Also, `layer_conf` will include the parameters, which should also be properly saved. @@ -43,7 +43,7 @@ Check whether `#self.dim_in == len_in` and `#self.dim_out == len_out`, if violat Abstract method. The layer should return a list containing its parameters. -####nerv.Layer.get\_dim(self)#### +#### nerv.Layer.get\_dim(self) * Returns: `dim_in`: __table__. `dim_out`: __table__. @@ -52,7 +52,7 @@ The layer should return a list containing its parameters. * Description: Returns `self.dim_in, self.dim_out`. -##Examples## +## Examples * a basic example using __Nerv__ layers to a linear classification. ``` @@ -178,3 +178,4 @@ for l = 0, 10, 1 do end --[[end training]]-- ``` + diff --git a/nerv/doc/nerv_matrix.md b/nerv/doc/nerv_matrix.md index dfd843d..3782eb3 100644 --- a/nerv/doc/nerv_matrix.md +++ b/nerv/doc/nerv_matrix.md @@ -1,8 +1,8 @@ -#The Nerv Matrix Package# +# The Nerv Matrix Package Part of the [Nerv](../README.md) toolkit. -##Description## -###Underlying structure### +## Description +### Underlying structure In the begining is could be useful to know something about the underlying structure of a __Nerv__ matrix. Please keep in mind that matrice in __Nerv__ is row-major. Every matrix object is a encapsulation of a C struct that describes the attributes of this matrix. ``` @@ -20,12 +20,12 @@ typedef struct Matrix { It is worth mentioning that that `data_ref` is a counter which counts the number of references to its memory space, mind that it will also be increased when a row of the matrix is referenced(`col = m[2]`). A __Nerv__ matrix will deallocate its space when this counter is decreased to zero. Also note that all assigning operation in __Nerv__ is reference copy, you can use `copy_tod` or `copy_toh` method to copy value. Also, row assigning operations like `m1[2]=m2[3]` is forbidden in __Nerv__. -###Class hierarchy### +### Class hierarchy The class hierarchy of the matrix classes can be clearly observed in `matrix/init.c`. First there is a abstract base class __Nerv.Matrix__, which is inherited by __Nerv.CuMatrix__ and __Nerv.MMatrix__(also abstract). 
Finally, there is __Nerv.CuMatrixFloat__, __Nerv.CuMatrixDouble__, inheriting __Nerv.CuMatrix__, and __Nerv.MMatrixFloat__, __Nerv.MMatrixDouble__, __Nerv.MMatrixInt__ , inheriting __Nerv.MMatrix__. -##Methods## +## Methods Mind that usually a matrix object can only do calculation with matrix of its own type(a __Nerv.CuMatrixFloat__ matrix can only do add operation with a __Nerv.CuMatrixFloat__). In the methods description below, __Matrix__ could be __Nerv.CuMatrixFloat__, __Nerv.CuMatrixDouble__, __Nerv.MMatrixFloat__ or __Nerv.MMatrixDouble__. __Element_type__ could be `float` or `double`, respectively. * __Matrix = Matrix(int nrow, int ncol)__ @@ -53,6 +53,8 @@ Return a new __Matrix__ of size (1,`self.ncol`), which stores the sum of all col Return a new __Matrix__ of size (`self.nrow`,1), which stores the sum of all rows of __Matrix__ `self`. * __Matrix Matrix.rowmax(Matrix self)__ Return a new __Matrix__ of size (`self.nrow`,1), which stores the max value of all rows of __Matrix__ `self`. +* __Matrix Matrix.rowmax_idx(Matrix self)__ +Return two new __Matrix__ of size (`self.nrow`,1), which stores the max value of all rows of __Matrix__ `self`, and its corresponding column indices(start from zero). * __Matrix Matrix.trans(Matrix self)__ Return a new __Matrix__ of size (`self.ncol`,`self.nrow`), which stores the transpose of __Matrix__ `self`. * __void Matrix.copy_fromh(Matrix self, MMatrix a)__ @@ -81,8 +83,8 @@ Fill the content of __Matrix__ `self` to be `value`. Set the element of __Matrix__ `self` to be elementwise-sigmoid of `ma`. * __void Matrix.sigmoid_grad(Matrix self, Matrix err, Matrix output)__ Set the element of __Matrix__ `self`, to be `self[i][j]=err[i][j]*output[i][j]*(1-output[i][j])`. This function is used to propagate sigmoid layer error. -* __void Matrix.softmax(Matrix self, Matrix a)__ -Calculate a row-by-row softmax of __Matrix__ `a` and save the result in `self`. +* __Matrix Matrix.softmax(Matrix self, Matrix a)__ +Calculate a row-by-row softmax of __Matrix__ `a` and save the result in `self`. Returns a new `self.nrow*1` index matrix that stores the index of the maximum value of each row. * __void Matrix.mul_elem(Matrix self, Matrix ma, Matrix mb)__ Calculate element-wise multiplication of __Matrix__ `ma` and `mb`, store the result in `self`. * __void Matrix.log_elem(Matrix self, Matrix ma)__ @@ -113,7 +115,7 @@ Write `self` to the file position in `chunk`. * __void MMatrix.copy_from(MMatrix ma, MMatrix mb,[int b_bgein, int b_end, int a_begin])__ Copy a part of `mb`(rows of index `[b_begin..b_end)`) to `ma` beginning at row index `a_begin`. If not specified, `b_begin` will be `0`, `b_end` will be `b.nrow`, `a_begin` will be `0`. -##Examples## +## Examples * Use `get_dataref_value` to test __Nerv__'s matrix space allocation. ``` m = 10 @@ -134,6 +136,7 @@ print("test fm:get_dataref_value:", fm:get_dataref_value()) print(fm) print(dm) ``` + * Test some __Matrix__ calculations. ``` m = 4 @@ -167,3 +170,4 @@ print(a) a:log_elem(fs) print(a) ``` + diff --git a/nerv/doc/nerv_nn.md b/nerv/doc/nerv_nn.md index c57447d..63537fb 100644 --- a/nerv/doc/nerv_nn.md +++ b/nerv/doc/nerv_nn.md @@ -1,19 +1,19 @@ -#The Nerv NN Package# +# The Nerv NN Package Part of the [Nerv](../README.md) toolkit. -##Description## -###Class hierarchy### +## Description +### Class hierarchy it contains __nerv.LayerRepo__, __nerv.ParamRepo__, and __nerv.DAGLayer__(inherits __nerv.Layer__). 
-###Class hierarchy and their members### -####nerv.ParamRepo#### +### Class hierarchy and their members +#### nerv.ParamRepo Get parameter object by ID. * `table param_table` Contains the mapping of parameter ID to parameter file(__nerv.ChunkFile__) * __nerv.LayerRepo__ Get layer object by ID. * `table layers` Contains the mapping of layer ID to layer object. objects. -####__nerv.DAGLayer__#### +#### __nerv.DAGLayer__ Inherits __nerv.Layer__. * `layers`: __table__, a mapping from a layer ID to its "ref". A ref is a structure that contains reference to space allocations and other info of the layer. * `inputs`: __table__, a mapping from the inputs ports of the DAG layer to the input ports of the sublayer, the key is the port number, the value is `{ref, port}`. @@ -21,17 +21,17 @@ Inherits __nerv.Layer__. * `parsed_conn`: __table__, a list of parsed connections, each entry is of format `{{ref_from, port_from}, {ref_to, port_to}}`. * `queue`: __table__, a list of "ref"s, the propagation of the DAGLayer will follow this order, and back-propagation will follow a reverse order. -##Methods## +## Methods -###__nerv.ParamRepo__### +### __nerv.ParamRepo__ -####nerv.ParamRepo:\_\_init(param\_files)#### +#### nerv.ParamRepo:\_\_init(param\_files) * Parameters: `param_files`: __table__ * Description: `param_files` is a list of file names that stores parameters, the newed __ParamRepo__ will read them from file and store the mapping for future fetching. -####nerv.Param ParamRepo.get_param(ParamRepo self, string pid, table global_conf)#### +#### nerv.Param ParamRepo.get_param(ParamRepo self, string pid, table global_conf) * Returns: __nerv.Layer__ * Parameters: @@ -41,8 +41,8 @@ Inherits __nerv.Layer__. * Description: __ParamRepo__ will find the __nerv.ChunkFile__ `pf` that contains parameter of ID `pid` and return `pf:read_chunk(pid, global_conf)`. -###__nerv.LayerRepo__### -####nerv.LayerRepo:\_\_init(layer\_spec, param\_repo, global\_conf)#### +### __nerv.LayerRepo__ +#### nerv.LayerRepo:\_\_init(layer\_spec, param\_repo, global\_conf) * Returns: __nerv.LayerRepo__. * Parameters: @@ -60,7 +60,7 @@ Inherits __nerv.Layer__. __LayerRepo__ will merge `param_config` into `layer_config` and construct a layer by calling `layer_type(layerid, global_conf, layer_config)`. -####nerv.LayerRepo.get\_layer(self, lid)#### +#### nerv.LayerRepo.get\_layer(self, lid) * Returns: __nerv.LayerRepo__, the layer with ID `lid`. * Parameters: @@ -69,8 +69,8 @@ Inherits __nerv.Layer__. * Description: Returns the layer with ID `lid`. -###nerv.DAGLayer### -####nerv.DAGLayer:\_\_init(id, global\_conf, layer\_conf)#### +### nerv.DAGLayer +#### nerv.DAGLayer:\_\_init(id, global\_conf, layer\_conf) * Returns: __nerv.DAGLayer__ * Parameters: @@ -89,7 +89,7 @@ Inherits __nerv.Layer__. }}) ``` -####nerv.DAGLayer.init(self, batch\_size)#### +#### nerv.DAGLayer.init(self, batch\_size) * Parameters: `self`: __nerv.DAGLayer__ `batch_size`: __int__ @@ -97,7 +97,7 @@ Inherits __nerv.Layer__. This initialization method will allocate space for output and input matrice, and will call `init()` for each of its sub layers. -####nerv.DAGLayer.propagate(self, input, output)#### +#### nerv.DAGLayer.propagate(self, input, output) * Parameters: `self`: __nerv.DAGLayer__ `input`: __table__ @@ -105,7 +105,7 @@ Inherits __nerv.Layer__. * Description: The same function as __nerv.Layer.propagate__, do propagation for each layer in the order of `self.queue`. 
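As a quick orientation for the `DAGLayer` methods documented in this section, the sketch below chains them in their usual order. It is only an illustration: `gconf`, the layer repo, the layer ID and the dimensions are invented placeholders (modeled on the swb_baseline examples elsewhere in this commit), not part of the nerv_nn.md text itself.
```
-- Hypothetical sketch: assumes `gconf` and a `layer_repo` (built with nerv.LayerRepo as
-- documented above) that contains an "affine0" layer with dim_in = {429}, dim_out = {2048}.
local dag = nerv.DAGLayer("dag", gconf,
    {dim_in = {429}, dim_out = {2048},
     sub_layers = layer_repo,
     connections = {
         ["<input>[1]"] = "affine0[1]",
         ["affine0[1]"] = "<output>[1]"
     }})
dag:init(256)                       -- allocate input/output space for batch_size = 256
-- input, output, bp_err and next_bp_err are tables of matrices matching dim_in/dim_out
dag:propagate(input, output)                            -- forward pass, in self.queue order
dag:back_propagate(next_bp_err, bp_err, input, output)  -- backward pass, in reverse order
dag:update(bp_err, input, output)                       -- parameter update for each sub layer
```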
-####nerv.DAGLayer.back\_propagate(self, next\_bp\_err, bp\_err, input, output)#### +#### nerv.DAGLayer.back\_propagate(self, next\_bp\_err, bp\_err, input, output) * Parameters: `self`: __nerv.DAGLayer__ `next_bp_err`: __table__ @@ -115,7 +115,7 @@ Inherits __nerv.Layer__. * Description: The same function as __nerv.Layer.back_propagate__, do back-propagation for each layer in the reverse order of `self.queue`. -####nerv.DAGLayer.update(self, bp\_err, input, output)#### +#### nerv.DAGLayer.update(self, bp\_err, input, output) * Parameters: `self`: __nerv.DAGLayer__ `bp_err`: __table__ @@ -124,7 +124,7 @@ Inherits __nerv.Layer__. * Description: The same function as __nerv.Layer.update__, do update for each layer in the order of `self.queue`. -##Examples## +## Examples * aaa ``` @@ -253,4 +253,5 @@ for l = 0, 10, 1 do ce_last = softmaxL.total_ce end --[[end training]]-- -```
\ No newline at end of file +``` + diff --git a/nerv/doc/nerv_param.md b/nerv/doc/nerv_param.md index 167cb11..98793f0 100644 --- a/nerv/doc/nerv_param.md +++ b/nerv/doc/nerv_param.md @@ -1,17 +1,17 @@ -#The Nerv Parameter Package# +# The Nerv Parameter Package Part of the [Nerv](../README.md) toolkit. -##Description## -###Class hierarchy### +## Description +### Class hierarchy There is a base class __Nerv.Param__ defined in `layer/init.lua`. -###Class hierarchy and their members### +### Class hierarchy and their members * __nerv.MatrixParam__ inherits __nerv.Param__ * `Matrix trans` stores the parameter matrix. * __nerv.LinearTransParam__ inherits __Nerv.MatrixParam__. * __Nerv.BiasParam__ inherits __Nerv.MatrixParam__. -##Methods## +## Methods * __void Param.\_\_init(Param self, string id, table global_conf)__ Constructor of a __Param__, it will set `self.id` to be `id` and `self.gconf` to be `global_conf`. * __void Param.set_info(Param self, table info)__ diff --git a/nerv/examples/asr_trainer.lua b/nerv/examples/asr_trainer.lua index 3fa2653..5bf28bd 100644 --- a/nerv/examples/asr_trainer.lua +++ b/nerv/examples/asr_trainer.lua @@ -1,17 +1,33 @@ -function build_trainer(ifname) - local param_repo = nerv.ParamRepo() - param_repo:import(ifname, nil, gconf) - local layer_repo = make_layer_repo(param_repo) - local network = get_network(layer_repo) - local global_transf = get_global_transf(layer_repo) - local input_order = get_input_order() +require 'lfs' +require 'pl' +local function build_trainer(ifname) + local host_param_repo = nerv.ParamRepo() local mat_type + local src_loc_type + local train_loc_type + host_param_repo:import(ifname, nil, gconf) if gconf.use_cpu then mat_type = gconf.mmat_type + src_loc_type = nerv.ParamRepo.LOC_TYPES.ON_HOST + train_loc_type = nerv.ParamRepo.LOC_TYPES.ON_HOST else mat_type = gconf.cumat_type + src_loc_type = nerv.ParamRepo.LOC_TYPES.ON_HOST + train_loc_type = nerv.ParamRepo.LOC_TYPES.ON_DEVICE end - local iterative_trainer = function (prefix, scp_file, bp) + local param_repo = host_param_repo:copy(train_loc_type) + local layer_repo = make_layer_repo(param_repo) + local network = get_network(layer_repo) + local global_transf = get_global_transf(layer_repo) + local input_order = get_input_order() + local iterative_trainer = function (prefix, scp_file, bp, rebind_param_repo) + -- rebind the params if necessary + if rebind_param_repo then + host_param_repo = rebind_param_repo + param_repo = host_param_repo:copy(train_loc_type) + layer_repo:rebind(param_repo) + rebind_param_repo = nil + end gconf.randomize = bp -- build buffer local buffer = make_buffer(make_readers(scp_file, layer_repo)) @@ -64,61 +80,193 @@ function build_trainer(ifname) print_stat(layer_repo) mat_type.print_profile() mat_type.clear_profile() - if (not bp) and prefix ~= nil then - nerv.info("writing back...") - local fname = string.format("%s_cv%.3f.nerv", - prefix, get_accuracy(layer_repo)) - network:get_params():export(fname, nil) + local fname + if (not bp) then + host_param_repo = param_repo:copy(src_loc_type) + if prefix ~= nil then + nerv.info("writing back...") + fname = string.format("%s_cv%.3f.nerv", + prefix, get_accuracy(layer_repo)) + host_param_repo:export(fname, nil) + end end - return get_accuracy(layer_repo) + return get_accuracy(layer_repo), host_param_repo, fname end return iterative_trainer end -dofile(arg[1]) -start_halving_inc = 0.5 -halving_factor = 0.6 -end_halving_inc = 0.1 -min_iter = 1 -max_iter = 20 -min_halving = 5 -gconf.batch_size = 256 -gconf.buffer_size = 
81920 +local function check_and_add_defaults(spec, opts) + local function get_opt_val(k) + return opts[string.gsub(k, '_', '-')].val + end + local opt_v = get_opt_val("resume_from") + if opt_v then + gconf = dofile(opt_v) + else + for k, v in pairs(spec) do + local opt_v = get_opt_val(k) + if opt_v ~= nil then + gconf[k] = opt_v + elseif gconf[k] ~= nil then + elseif v ~= nil then + gconf[k] = v + end + end + end +end -local pf0 = gconf.initialized_param -local trainer = build_trainer(pf0) ---local trainer = build_trainer("c3.nerv") -local accu_best = trainer(nil, gconf.cv_scp, false) -local do_halving = false - -nerv.info("initial cross validation: %.3f", accu_best) -for i = 1, max_iter do - nerv.info("[NN] begin iteration %d with lrate = %.6f", i, gconf.lrate) - local accu_tr = trainer(nil, gconf.tr_scp, true) - nerv.info("[TR] training set %d: %.3f", i, accu_tr) - local accu_new = trainer( - string.format("%s_%s_iter_%d_lr%f_tr%.3f", - string.gsub( - (string.gsub(pf0[1], "(.*/)(.*)", "%2")), - "(.*)%..*", "%1"), - os.date("%Y%m%d%H%M%S"), - i, gconf.lrate, - accu_tr), - gconf.cv_scp, false) - nerv.info("[CV] cross validation %d: %.3f", i, accu_new) - -- TODO: revert the weights - local accu_diff = accu_new - accu_best - if do_halving and accu_diff < end_halving_inc and i > min_iter then - break +local function make_options(spec) + local options = {} + for k, v in pairs(spec) do + table.insert(options, + {string.gsub(k, '_', '-'), nil, type(v), default = v}) end - if accu_diff < start_halving_inc and i >= min_halving then - do_halving = true + return options +end + +local function print_help(options) + nerv.printf("Usage: <asr_trainer.lua> [options] network_config.lua\n") + nerv.print_usage(options) +end + +local function print_gconf() + local key_maxlen = 0 + for k, v in pairs(gconf) do + key_maxlen = math.max(key_maxlen, #k or 0) end - if do_halving then - gconf.lrate = gconf.lrate * halving_factor + local function pattern_gen() + return string.format("%%-%ds = %%s\n", key_maxlen) end - if accu_new > accu_best then - accu_best = accu_new + nerv.info("ready to train with the following gconf settings:") + nerv.printf(pattern_gen(), "Key", "Value") + for k, v in pairs(gconf) do + nerv.printf(pattern_gen(), k or "", v or "") end +end + +local function dump_gconf(fname) + local f = io.open(fname, "w") + f:write("return ") + f:write(table.tostring(gconf)) + f:close() +end + +local trainer_defaults = { + lrate = 0.8, + batch_size = 256, + buffer_size = 81920, + wcost = 1e-6, + momentum = 0.9, + start_halving_inc = 0.5, + halving_factor = 0.6, + end_halving_inc = 0.1, + cur_iter = 1, + min_iter = 1, + max_iter = 20, + min_halving = 5, + do_halving = false, + cumat_tname = "nerv.CuMatrixFloat", + mmat_tname = "nerv.MMatrixFloat", + debug = false, +} + +local options = make_options(trainer_defaults) +local extra_opt_spec = { + {"tr-scp", nil, "string"}, + {"cv-scp", nil, "string"}, + {"resume-from", nil, "string"}, + {"help", "h", "boolean", default = false, desc = "show this help information"}, + {"dir", nil, "string", desc = "specify the working directory"}, +} + +table.extend(options, extra_opt_spec) + +arg, opts = nerv.parse_args(arg, options) + +if #arg < 1 or opts["help"].val then + print_help(options) + return +end + +dofile(arg[1]) + +--[[ + +Rule: command-line option overrides network config overrides trainer default. +Note: config key like aaa_bbbb_cc could be overriden by specifying +--aaa-bbbb-cc to command-line arguments. 
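To make the precedence rule stated above concrete, here is a small plain-Lua illustration. It is not the actual NERV helper (`check_and_add_defaults` writes into `gconf` rather than returning a value), and the sample key and numbers are invented:
```
-- Plain-Lua rephrasing of the precedence implemented above:
-- command-line option > network config > trainer default.
local function resolve(key, defaults, gconf, opts)
    local opt = opts[string.gsub(key, '_', '-')]   -- e.g. batch_size -> "batch-size"
    if opt ~= nil and opt.val ~= nil then
        return opt.val                             -- explicit command-line flag wins
    elseif gconf[key] ~= nil then
        return gconf[key]                          -- value set in the network config file
    else
        return defaults[key]                       -- fall back to the trainer default
    end
end

-- invented sample values:
local defaults = {batch_size = 256}
local gconf    = {batch_size = 128}
print(resolve("batch_size", defaults, gconf, {["batch-size"] = {val = 64}}))  -- 64
print(resolve("batch_size", defaults, gconf, {}))                             -- 128
print(resolve("batch_size", defaults, {},    {}))                             -- 256
```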
+ +]]-- + +check_and_add_defaults(trainer_defaults, opts) +gconf.mmat_type = nerv.get_type(gconf.mmat_tname) +gconf.cumat_type = nerv.get_type(gconf.cumat_tname) +gconf.use_cpu = econf.use_cpu or false + +local pf0 = gconf.initialized_param +local date_pattern = "%Y%m%d%H%M%S" +local logfile_name = "log" +local working_dir = opts["dir"].val or string.format("nerv_%s", os.date(date_pattern)) +local rebind_param_repo = nil + +print_gconf() +if not lfs.mkdir(working_dir) then + nerv.error("[asr_trainer] working directory already exists") +end +-- copy the network config +dir.copyfile(arg[1], working_dir) +-- set logfile path +nerv.set_logfile(path.join(working_dir, logfile_name)) +path.chdir(working_dir) + +-- start the training +local trainer = build_trainer(pf0) +local pr_prev +gconf.accu_best, pr_prev = trainer(nil, gconf.cv_scp, false) +nerv.info("initial cross validation: %.3f", gconf.accu_best) +for i = gconf.cur_iter, gconf.max_iter do + local stop = false + gconf.cur_iter = i + dump_gconf(string.format("iter_%d.meta", i)) + repeat -- trick to implement `continue` statement + nerv.info("[NN] begin iteration %d with lrate = %.6f", i, gconf.lrate) + local accu_tr = trainer(nil, gconf.tr_scp, true, rebind_param_repo) + nerv.info("[TR] training set %d: %.3f", i, accu_tr) + local param_prefix = string.format("%s_%s_iter_%d_lr%f_tr%.3f", + string.gsub( + (string.gsub(pf0[1], "(.*/)(.*)", "%2")), + "(.*)%..*", "%1"), + os.date(date_pattern), + i, gconf.lrate, + accu_tr) + local accu_new, pr_new, param_fname = trainer(param_prefix, gconf.cv_scp, false) + nerv.info("[CV] cross validation %d: %.3f", i, accu_new) + local accu_prev = gconf.accu_best + if accu_new < gconf.accu_best then + nerv.info("rejecting the trained params, rollback to the previous one") + file.move(param_fname, param_fname .. 
".rejected") + rebind_param_repo = pr_prev + break -- `continue` equivalent + else + nerv.info("accepting the trained params") + gconf.accu_best = accu_new + pr_prev = pr_new + gconf.initialized_param = {path.join(path.currentdir(), param_fname)} + end + if gconf.do_halving and + gconf.accu_best - accu_prev < gconf.end_halving_inc and + i > gconf.min_iter then + stop = true + break + end + if gconf.accu_best - accu_prev < gconf.start_halving_inc and + i >= gconf.min_halving then + gconf.do_halving = true + end + if gconf.do_halving then + gconf.lrate = gconf.lrate * gconf.halving_factor + end + until true + if stop then break end -- nerv.Matrix.print_profile() end diff --git a/nerv/examples/swb_baseline.lua b/nerv/examples/swb_baseline.lua index 51052ba..0ce8468 100644 --- a/nerv/examples/swb_baseline.lua +++ b/nerv/examples/swb_baseline.lua @@ -1,7 +1,5 @@ require 'htk_io' gconf = {lrate = 0.8, wcost = 1e-6, momentum = 0.9, - cumat_type = nerv.CuMatrixFloat, - mmat_type = nerv.MMatrixFloat, rearrange = true, -- just to make the context order consistent with old results, deprecated frm_ext = 5, frm_trim = 5, -- trim the first and last 5 frames, TNet just does this, deprecated @@ -9,8 +7,7 @@ gconf = {lrate = 0.8, wcost = 1e-6, momentum = 0.9, cv_scp = "/slfs1/users/mfy43/swb_ivec/train_cv.scp", htk_conf = "/slfs1/users/mfy43/swb_ivec/plp_0_d_a.conf", initialized_param = {"/slfs1/users/mfy43/swb_init.nerv", - "/slfs1/users/mfy43/swb_global_transf.nerv"}, - debug = false} + "/slfs1/users/mfy43/swb_global_transf.nerv"}} function make_layer_repo(param_repo) local layer_repo = nerv.LayerRepo( @@ -18,51 +15,51 @@ function make_layer_repo(param_repo) -- global transf ["nerv.BiasLayer"] = { - blayer1 = {{bias = "bias1"}, {dim_in = {429}, dim_out = {429}}}, - blayer2 = {{bias = "bias2"}, {dim_in = {429}, dim_out = {429}}} + blayer1 = {dim_in = {429}, dim_out = {429}, params = {bias = "bias1"}}, + blayer2 = {dim_in = {429}, dim_out = {429}, params = {bias = "bias2"}} }, ["nerv.WindowLayer"] = { - wlayer1 = {{window = "window1"}, {dim_in = {429}, dim_out = {429}}}, - wlayer2 = {{window = "window2"}, {dim_in = {429}, dim_out = {429}}} + wlayer1 = {dim_in = {429}, dim_out = {429}, params = {window = "window1"}}, + wlayer2 = {dim_in = {429}, dim_out = {429}, params = {window = "window2"}} }, -- biased linearity ["nerv.AffineLayer"] = { - affine0 = {{ltp = "affine0_ltp", bp = "affine0_bp"}, - {dim_in = {429}, dim_out = {2048}}}, - affine1 = {{ltp = "affine1_ltp", bp = "affine1_bp"}, - {dim_in = {2048}, dim_out = {2048}}}, - affine2 = {{ltp = "affine2_ltp", bp = "affine2_bp"}, - {dim_in = {2048}, dim_out = {2048}}}, - affine3 = {{ltp = "affine3_ltp", bp = "affine3_bp"}, - {dim_in = {2048}, dim_out = {2048}}}, - affine4 = {{ltp = "affine4_ltp", bp = "affine4_bp"}, - {dim_in = {2048}, dim_out = {2048}}}, - affine5 = {{ltp = "affine5_ltp", bp = "affine5_bp"}, - {dim_in = {2048}, dim_out = {2048}}}, - affine6 = {{ltp = "affine6_ltp", bp = "affine6_bp"}, - {dim_in = {2048}, dim_out = {2048}}}, - affine7 = {{ltp = "affine7_ltp", bp = "affine7_bp"}, - {dim_in = {2048}, dim_out = {3001}}} + affine0 = {dim_in = {429}, dim_out = {2048}, + params = {ltp = "affine0_ltp", bp = "affine0_bp"}}, + affine1 = {dim_in = {2048}, dim_out = {2048}, + params = {ltp = "affine1_ltp", bp = "affine1_bp"}}, + affine2 = {dim_in = {2048}, dim_out = {2048}, + params = {ltp = "affine2_ltp", bp = "affine2_bp"}}, + affine3 = {dim_in = {2048}, dim_out = {2048}, + params = {ltp = "affine3_ltp", bp = "affine3_bp"}}, + affine4 = {dim_in = 
{2048}, dim_out = {2048}, + params = {ltp = "affine4_ltp", bp = "affine4_bp"}}, + affine5 = {dim_in = {2048}, dim_out = {2048}, + params = {ltp = "affine5_ltp", bp = "affine5_bp"}}, + affine6 = {dim_in = {2048}, dim_out = {2048}, + params = {ltp = "affine6_ltp", bp = "affine6_bp"}}, + affine7 = {dim_in = {2048}, dim_out = {3001}, + params = {ltp = "affine7_ltp", bp = "affine7_bp"}} }, ["nerv.SigmoidLayer"] = { - sigmoid0 = {{}, {dim_in = {2048}, dim_out = {2048}}}, - sigmoid1 = {{}, {dim_in = {2048}, dim_out = {2048}}}, - sigmoid2 = {{}, {dim_in = {2048}, dim_out = {2048}}}, - sigmoid3 = {{}, {dim_in = {2048}, dim_out = {2048}}}, - sigmoid4 = {{}, {dim_in = {2048}, dim_out = {2048}}}, - sigmoid5 = {{}, {dim_in = {2048}, dim_out = {2048}}}, - sigmoid6 = {{}, {dim_in = {2048}, dim_out = {2048}}} + sigmoid0 = {dim_in = {2048}, dim_out = {2048}}, + sigmoid1 = {dim_in = {2048}, dim_out = {2048}}, + sigmoid2 = {dim_in = {2048}, dim_out = {2048}}, + sigmoid3 = {dim_in = {2048}, dim_out = {2048}}, + sigmoid4 = {dim_in = {2048}, dim_out = {2048}}, + sigmoid5 = {dim_in = {2048}, dim_out = {2048}}, + sigmoid6 = {dim_in = {2048}, dim_out = {2048}} }, ["nerv.SoftmaxCELayer"] = -- softmax + ce criterion layer for finetune output { - ce_crit = {{}, {dim_in = {3001, 1}, dim_out = {1}, compressed = true}} + ce_crit = {dim_in = {3001, 1}, dim_out = {1}, compressed = true} }, ["nerv.SoftmaxLayer"] = -- softmax for decode output { - softmax = {{}, {dim_in = {3001}, dim_out = {3001}}} + softmax = {dim_in = {3001}, dim_out = {3001}} } }, param_repo, gconf) @@ -70,7 +67,7 @@ function make_layer_repo(param_repo) { ["nerv.DAGLayer"] = { - global_transf = {{}, { + global_transf = { dim_in = {429}, dim_out = {429}, sub_layers = layer_repo, connections = { @@ -80,8 +77,8 @@ function make_layer_repo(param_repo) ["blayer2[1]"] = "wlayer2[1]", ["wlayer2[1]"] = "<output>[1]" } - }}, - main = {{}, { + }, + main = { dim_in = {429}, dim_out = {3001}, sub_layers = layer_repo, connections = { @@ -102,7 +99,7 @@ function make_layer_repo(param_repo) ["sigmoid6[1]"] = "affine7[1]", ["affine7[1]"] = "<output>[1]" } - }} + } } }, param_repo, gconf) @@ -110,7 +107,7 @@ function make_layer_repo(param_repo) { ["nerv.DAGLayer"] = { - ce_output = {{}, { + ce_output = { dim_in = {429, 1}, dim_out = {1}, sub_layers = layer_repo, connections = { @@ -119,8 +116,8 @@ function make_layer_repo(param_repo) ["<input>[2]"] = "ce_crit[2]", ["ce_crit[1]"] = "<output>[1]" } - }}, - softmax_output = {{}, { + }, + softmax_output = { dim_in = {429}, dim_out = {3001}, sub_layers = layer_repo, connections = { @@ -128,7 +125,7 @@ function make_layer_repo(param_repo) ["main[1]"] = "softmax[1]", ["softmax[1]"] = "<output>[1]" } - }} + } } }, param_repo, gconf) @@ -173,6 +170,7 @@ function make_buffer(readers) return nerv.SGDBuffer(gconf, { buffer_size = gconf.buffer_size, + batch_size = gconf.batch_size, randomize = gconf.randomize, readers = readers, use_gpu = true @@ -184,6 +182,10 @@ function get_input_order() {id = "phone_state"}} end +function get_decode_input_order() + return {{id = "main_scp", global_transf = true}} +end + function get_accuracy(layer_repo) local ce_crit = layer_repo:get_layer("ce_crit") return ce_crit.total_correct / ce_crit.total_frames * 100 diff --git a/nerv/examples/swb_baseline2.lua b/nerv/examples/swb_baseline2.lua new file mode 100644 index 0000000..8b5ebb1 --- /dev/null +++ b/nerv/examples/swb_baseline2.lua @@ -0,0 +1,203 @@ +require 'htk_io' +gconf = {lrate = 0.8, wcost = 1e-6, momentum = 0.9, + rearrange = true, -- just to 
make the context order consistent with old results, deprecated + frm_ext = 5, + frm_trim = 5, -- trim the first and last 5 frames, TNet just does this, deprecated + tr_scp = "/speechlab/users/mfy43/swb50/train_bp.scp", + cv_scp = "/speechlab/users/mfy43/swb50/train_cv.scp", + htk_conf = "/speechlab/users/mfy43/swb50/plp_0_d_a.conf", + initialized_param = {"/speechlab/users/mfy43/swb50/swb_init.nerv", + "/speechlab/users/mfy43/swb50/swb_global_transf.nerv"}} + +function make_layer_repo(param_repo) + local layer_repo = nerv.LayerRepo( + { + -- global transf + ["nerv.BiasLayer"] = + { + blayer1 = {dim_in = {429}, dim_out = {429}, params = {bias = "bias1"}}, + blayer2 = {dim_in = {429}, dim_out = {429}, params = {bias = "bias2"}} + }, + ["nerv.WindowLayer"] = + { + wlayer1 = {dim_in = {429}, dim_out = {429}, params = {window = "window1"}}, + wlayer2 = {dim_in = {429}, dim_out = {429}, params = {window = "window2"}} + }, + -- biased linearity + ["nerv.AffineLayer"] = + { + affine0 = {dim_in = {429}, dim_out = {2048}, + params = {ltp = "affine0_ltp", bp = "affine0_bp"}}, + affine1 = {dim_in = {2048}, dim_out = {2048}, + params = {ltp = "affine1_ltp", bp = "affine1_bp"}}, + affine2 = {dim_in = {2048}, dim_out = {2048}, + params = {ltp = "affine2_ltp", bp = "affine2_bp"}}, + affine3 = {dim_in = {2048}, dim_out = {2048}, + params = {ltp = "affine3_ltp", bp = "affine3_bp"}}, + affine4 = {dim_in = {2048}, dim_out = {2048}, + params = {ltp = "affine4_ltp", bp = "affine4_bp"}}, + affine5 = {dim_in = {2048}, dim_out = {2048}, + params = {ltp = "affine5_ltp", bp = "affine5_bp"}}, + affine6 = {dim_in = {2048}, dim_out = {2048}, + params = {ltp = "affine6_ltp", bp = "affine6_bp"}}, + affine7 = {dim_in = {2048}, dim_out = {3001}, + params = {ltp = "affine7_ltp", bp = "affine7_bp"}} + }, + ["nerv.SigmoidLayer"] = + { + sigmoid0 = {dim_in = {2048}, dim_out = {2048}}, + sigmoid1 = {dim_in = {2048}, dim_out = {2048}}, + sigmoid2 = {dim_in = {2048}, dim_out = {2048}}, + sigmoid3 = {dim_in = {2048}, dim_out = {2048}}, + sigmoid4 = {dim_in = {2048}, dim_out = {2048}}, + sigmoid5 = {dim_in = {2048}, dim_out = {2048}}, + sigmoid6 = {dim_in = {2048}, dim_out = {2048}} + }, + ["nerv.SoftmaxCELayer"] = -- softmax + ce criterion layer for finetune output + { + ce_crit = {dim_in = {3001, 1}, dim_out = {1}, compressed = true} + }, + ["nerv.SoftmaxLayer"] = -- softmax for decode output + { + softmax = {dim_in = {3001}, dim_out = {3001}} + } + }, param_repo, gconf) + + layer_repo:add_layers( + { + ["nerv.DAGLayer"] = + { + global_transf = { + dim_in = {429}, dim_out = {429}, + sub_layers = layer_repo, + connections = { + ["<input>[1]"] = "blayer1[1]", + ["blayer1[1]"] = "wlayer1[1]", + ["wlayer1[1]"] = "blayer2[1]", + ["blayer2[1]"] = "wlayer2[1]", + ["wlayer2[1]"] = "<output>[1]" + } + }, + main = { + dim_in = {429}, dim_out = {3001}, + sub_layers = layer_repo, + connections = { + ["<input>[1]"] = "affine0[1]", + ["affine0[1]"] = "sigmoid0[1]", + ["sigmoid0[1]"] = "affine1[1]", + ["affine1[1]"] = "sigmoid1[1]", + ["sigmoid1[1]"] = "affine2[1]", + ["affine2[1]"] = "sigmoid2[1]", + ["sigmoid2[1]"] = "affine3[1]", + ["affine3[1]"] = "sigmoid3[1]", + ["sigmoid3[1]"] = "affine4[1]", + ["affine4[1]"] = "sigmoid4[1]", + ["sigmoid4[1]"] = "affine5[1]", + ["affine5[1]"] = "sigmoid5[1]", + ["sigmoid5[1]"] = "affine6[1]", + ["affine6[1]"] = "sigmoid6[1]", + ["sigmoid6[1]"] = "affine7[1]", + ["affine7[1]"] = "<output>[1]" + } + } + } + }, param_repo, gconf) + + layer_repo:add_layers( + { + ["nerv.DAGLayer"] = + { + ce_output = { + 
dim_in = {429, 1}, dim_out = {1}, + sub_layers = layer_repo, + connections = { + ["<input>[1]"] = "main[1]", + ["main[1]"] = "ce_crit[1]", + ["<input>[2]"] = "ce_crit[2]", + ["ce_crit[1]"] = "<output>[1]" + } + }, + softmax_output = { + dim_in = {429}, dim_out = {3001}, + sub_layers = layer_repo, + connections = { + ["<input>[1]"] = "main[1]", + ["main[1]"] = "softmax[1]", + ["softmax[1]"] = "<output>[1]" + } + } + } + }, param_repo, gconf) + + return layer_repo +end + +function get_network(layer_repo) + return layer_repo:get_layer("ce_output") +end + +function get_decode_network(layer_repo) + return layer_repo:get_layer("softmax_output") +end + +function get_global_transf(layer_repo) + return layer_repo:get_layer("global_transf") +end + +function make_readers(scp_file, layer_repo) + return { + {reader = nerv.TNetReader(gconf, + { + id = "main_scp", + scp_file = scp_file, + conf_file = gconf.htk_conf, + frm_ext = gconf.frm_ext, + mlfs = { + phone_state = { + file = "/speechlab/users/mfy43/swb50/ref.mlf", + format = "map", + format_arg = "/speechlab/users/mfy43/swb50/dict", + dir = "*/", + ext = "lab" + } + } + }), + data = {main_scp = 429, phone_state = 1}} + } +end + +function make_buffer(readers) + return nerv.SGDBuffer(gconf, + { + buffer_size = gconf.buffer_size, + batch_size = gconf.batch_size, + randomize = gconf.randomize, + readers = readers, + use_gpu = true + }) +end + +function get_input_order() + return {{id = "main_scp", global_transf = true}, + {id = "phone_state"}} +end + +function get_decode_input_order() + return {{id = "main_scp", global_transf = true}} +end + +function get_accuracy(layer_repo) + local ce_crit = layer_repo:get_layer("ce_crit") + return ce_crit.total_correct / ce_crit.total_frames * 100 +end + +function print_stat(layer_repo) + local ce_crit = layer_repo:get_layer("ce_crit") + nerv.info("*** training stat begin ***") + nerv.printf("cross entropy:\t\t%.8f\n", ce_crit.total_ce) + nerv.printf("correct:\t\t%d\n", ce_crit.total_correct) + nerv.printf("frames:\t\t\t%d\n", ce_crit.total_frames) + nerv.printf("err/frm:\t\t%.8f\n", ce_crit.total_ce / ce_crit.total_frames) + nerv.printf("accuracy:\t\t%.3f%%\n", get_accuracy(layer_repo)) + nerv.info("*** training stat end ***") +end diff --git a/nerv/examples/swb_baseline_basic.lua b/nerv/examples/swb_baseline_basic.lua deleted file mode 100644 index 71f04a3..0000000 --- a/nerv/examples/swb_baseline_basic.lua +++ /dev/null @@ -1,162 +0,0 @@ -require 'htk_io' -gconf = {lrate = 0.8, wcost = 1e-6, momentum = 0.9, - cumat_type = nerv.CuMatrixFloat, - mmat_type = nerv.MMatrixFloat, - frm_ext = 5, - frm_trim = 5, - tr_scp = "/slfs1/users/mfy43/swb_ivec/train_bp.scp", - cv_scp = "/slfs1/users/mfy43/swb_ivec/train_cv.scp", - htk_conf = "/slfs1/users/mfy43/swb_ivec/plp_0_d_a.conf", - initialized_param = {"/slfs1/users/mfy43/swb_init.nerv", - "/slfs1/users/mfy43/swb_global_transf.nerv"}, - debug = false} - -function make_layer_repo(param_repo) - local layer_repo = nerv.LayerRepo( - { - -- global transf - ["nerv.BiasLayer"] = - { - blayer1 = {{bias = "bias1"}, {dim_in = {429}, dim_out = {429}}}, - blayer2 = {{bias = "bias2"}, {dim_in = {429}, dim_out = {429}}} - }, - ["nerv.WindowLayer"] = - { - wlayer1 = {{window = "window1"}, {dim_in = {429}, dim_out = {429}}}, - wlayer2 = {{window = "window2"}, {dim_in = {429}, dim_out = {429}}} - }, - -- biased linearity - ["nerv.AffineLayer"] = - { - affine0 = {{ltp = "affine0_ltp", bp = "affine0_bp"}, - {dim_in = {429}, dim_out = {2048}}}, - affine1 = {{ltp = "affine1_ltp", bp = 
"affine1_bp"}, - {dim_in = {2048}, dim_out = {2048}}}, - affine2 = {{ltp = "affine2_ltp", bp = "affine2_bp"}, - {dim_in = {2048}, dim_out = {2048}}}, - affine3 = {{ltp = "affine3_ltp", bp = "affine3_bp"}, - {dim_in = {2048}, dim_out = {2048}}}, - affine4 = {{ltp = "affine4_ltp", bp = "affine4_bp"}, - {dim_in = {2048}, dim_out = {2048}}}, - affine5 = {{ltp = "affine5_ltp", bp = "affine5_bp"}, - {dim_in = {2048}, dim_out = {2048}}}, - affine6 = {{ltp = "affine6_ltp", bp = "affine6_bp"}, - {dim_in = {2048}, dim_out = {2048}}}, - affine7 = {{ltp = "affine7_ltp", bp = "affine7_bp"}, - {dim_in = {2048}, dim_out = {3001}}} - }, - ["nerv.SigmoidLayer"] = - { - sigmoid0 = {{}, {dim_in = {2048}, dim_out = {2048}}}, - sigmoid1 = {{}, {dim_in = {2048}, dim_out = {2048}}}, - sigmoid2 = {{}, {dim_in = {2048}, dim_out = {2048}}}, - sigmoid3 = {{}, {dim_in = {2048}, dim_out = {2048}}}, - sigmoid4 = {{}, {dim_in = {2048}, dim_out = {2048}}}, - sigmoid5 = {{}, {dim_in = {2048}, dim_out = {2048}}}, - sigmoid6 = {{}, {dim_in = {2048}, dim_out = {2048}}} - }, - ["nerv.SoftmaxCELayer"] = - { - ce_crit = {{}, {dim_in = {3001, 1}, dim_out = {1}, compressed = true}} - } - }, param_repo, gconf) - - layer_repo:add_layers( - { - ["nerv.DAGLayer"] = - { - global_transf = {{}, { - dim_in = {429}, dim_out = {429}, - sub_layers = layer_repo, - connections = { - ["<input>[1]"] = "blayer1[1]", - ["blayer1[1]"] = "wlayer1[1]", - ["wlayer1[1]"] = "blayer2[1]", - ["blayer2[1]"] = "wlayer2[1]", - ["wlayer2[1]"] = "<output>[1]" - } - }}, - main = {{}, { - dim_in = {429, 1}, dim_out = {1}, - sub_layers = layer_repo, - connections = { - ["<input>[1]"] = "affine0[1]", - ["affine0[1]"] = "sigmoid0[1]", - ["sigmoid0[1]"] = "affine1[1]", - ["affine1[1]"] = "sigmoid1[1]", - ["sigmoid1[1]"] = "affine2[1]", - ["affine2[1]"] = "sigmoid2[1]", - ["sigmoid2[1]"] = "affine3[1]", - ["affine3[1]"] = "sigmoid3[1]", - ["sigmoid3[1]"] = "affine4[1]", - ["affine4[1]"] = "sigmoid4[1]", - ["sigmoid4[1]"] = "affine5[1]", - ["affine5[1]"] = "sigmoid5[1]", - ["sigmoid5[1]"] = "affine6[1]", - ["affine6[1]"] = "sigmoid6[1]", - ["sigmoid6[1]"] = "affine7[1]", - ["affine7[1]"] = "ce_crit[1]", - ["<input>[2]"] = "ce_crit[2]", - ["ce_crit[1]"] = "<output>[1]" - } - }} - } - }, param_repo, gconf) - return layer_repo -end - -function get_network(layer_repo) - return layer_repo:get_layer("main") -end - -function make_readers(scp_file, layer_repo) - return { - {reader = nerv.TNetReader(gconf, - { - id = "main_scp", - scp_file = scp_file, - conf_file = gconf.htk_conf, - frm_ext = gconf.frm_ext, - mlfs = { - phone_state = { - file = "/slfs1/users/mfy43/swb_ivec/ref.mlf", - format = "map", - format_arg = "/slfs1/users/mfy43/swb_ivec/dict", - dir = "*/", - ext = "lab" - } - } - }), - data = {main_scp = 429, phone_state = 1}} - } -end - -function make_buffer(readers) - return nerv.SGDBuffer(gconf, - { - buffer_size = gconf.buffer_size, - randomize = gconf.randomize, - readers = readers - }) -end - -function get_input_order() - return {{id = "main_scp", global_transf = true}, - {id = "phone_state"}} -end - -function get_accuracy(layer_repo) - local ce_crit = layer_repo:get_layer("ce_crit") - return ce_crit.total_correct / ce_crit.total_frames * 100 -end - -function print_stat(layer_repo) - local ce_crit = layer_repo:get_layer("ce_crit") - nerv.info("*** training stat begin ***") - nerv.printf("cross entropy:\t\t%.8f\n", ce_crit.total_ce) - nerv.printf("correct:\t\t%d\n", ce_crit.total_correct) - nerv.printf("frames:\t\t\t%d\n", ce_crit.total_frames) - 
nerv.printf("err/frm:\t\t%.8f\n", ce_crit.total_ce / ce_crit.total_frames) - nerv.printf("accuracy:\t\t%.3f%%\n", get_accuracy(layer_repo)) - nerv.info("*** training stat end ***") -end diff --git a/nerv/examples/timit_baseline2.lua b/nerv/examples/timit_baseline2.lua new file mode 100644 index 0000000..2d144b5 --- /dev/null +++ b/nerv/examples/timit_baseline2.lua @@ -0,0 +1,212 @@ +require 'kaldi_io' +gconf = {lrate = 0.8, wcost = 1e-6, momentum = 0.9, frm_ext = 5, + tr_scp = "ark:/speechlab/tools/KALDI/kaldi-master/src/featbin/copy-feats " .. + "scp:/speechlab/users/mfy43/timit/s5/exp/dnn4_nerv_prepare/train.scp ark:- |", + cv_scp = "ark:/speechlab/tools/KALDI/kaldi-master/src/featbin/copy-feats " .. + "scp:/speechlab/users/mfy43/timit/s5/exp/dnn4_nerv_prepare/cv.scp ark:- |", + initialized_param = {"/speechlab/users/mfy43/timit/s5/exp/dnn4_nerv_prepare/nnet_init.nerv", + "/speechlab/users/mfy43/timit/s5/exp/dnn4_nerv_prepare/nnet_output.nerv", + "/speechlab/users/mfy43/timit/s5/exp/dnn4_nerv_prepare/nnet_trans.nerv"}, + decode_param = {"/speechlab/users/mfy43/timit/nnet_init_20160229015745_iter_13_lr0.013437_tr72.434_cv58.729.nerv", + "/speechlab/users/mfy43/timit/s5/exp/dnn4_nerv_prepare/nnet_trans.nerv"}} + +function make_layer_repo(param_repo) + local layer_repo = nerv.LayerRepo( + { + -- global transf + ["nerv.BiasLayer"] = + { + blayer1 = {dim_in = {440}, dim_out = {440}, params = {bias = "bias0"}} + }, + ["nerv.WindowLayer"] = + { + wlayer1 = {dim_in = {440}, dim_out = {440}, params = {window = "window0"}} + }, + -- biased linearity + ["nerv.AffineLayer"] = + { + affine0 = {dim_in = {440}, dim_out = {1024}, + params = {ltp = "affine0_ltp", bp = "affine0_bp"}}, + affine1 = {dim_in = {1024}, dim_out = {1024}, + params = {ltp = "affine1_ltp", bp = "affine1_bp"}}, + affine2 = {dim_in = {1024}, dim_out = {1024}, + params = {ltp = "affine2_ltp", bp = "affine2_bp"}}, + affine3 = {dim_in = {1024}, dim_out = {1024}, + params = {ltp = "affine3_ltp", bp = "affine3_bp"}}, + affine4 = {dim_in = {1024}, dim_out = {1024}, + params = {ltp = "affine4_ltp", bp = "affine4_bp"}}, + affine5 = {dim_in = {1024}, dim_out = {1024}, + params = {ltp = "affine5_ltp", bp = "affine5_bp"}}, + affine6 = {dim_in = {1024}, dim_out = {1959}, + params = {ltp = "affine6_ltp", bp = "affine6_bp"}} + }, + ["nerv.SigmoidLayer"] = + { + sigmoid0 = {dim_in = {1024}, dim_out = {1024}}, + sigmoid1 = {dim_in = {1024}, dim_out = {1024}}, + sigmoid2 = {dim_in = {1024}, dim_out = {1024}}, + sigmoid3 = {dim_in = {1024}, dim_out = {1024}}, + sigmoid4 = {dim_in = {1024}, dim_out = {1024}}, + sigmoid5 = {dim_in = {1024}, dim_out = {1024}} + }, + ["nerv.SoftmaxCELayer"] = -- softmax + ce criterion layer for finetune output + { + ce_crit = {dim_in = {1959, 1}, dim_out = {1}, compressed = true} + }, + ["nerv.SoftmaxLayer"] = -- softmax for decode output + { + softmax = {dim_in = {1959}, dim_out = {1959}} + } + }, param_repo, gconf) + + layer_repo:add_layers( + { + ["nerv.DAGLayer"] = + { + global_transf = { + dim_in = {440}, dim_out = {440}, + sub_layers = layer_repo, + connections = { + ["<input>[1]"] = "blayer1[1]", + ["blayer1[1]"] = "wlayer1[1]", + ["wlayer1[1]"] = "<output>[1]" + } + }, + main = { + dim_in = {440}, dim_out = {1959}, + sub_layers = layer_repo, + connections = { + ["<input>[1]"] = "affine0[1]", + ["affine0[1]"] = "sigmoid0[1]", + ["sigmoid0[1]"] = "affine1[1]", + ["affine1[1]"] = "sigmoid1[1]", + ["sigmoid1[1]"] = "affine2[1]", + ["affine2[1]"] = "sigmoid2[1]", + ["sigmoid2[1]"] = "affine3[1]", + ["affine3[1]"] = 
"sigmoid3[1]", + ["sigmoid3[1]"] = "affine4[1]", + ["affine4[1]"] = "sigmoid4[1]", + ["sigmoid4[1]"] = "affine5[1]", + ["affine5[1]"] = "sigmoid5[1]", + ["sigmoid5[1]"] = "affine6[1]", + ["affine6[1]"] = "<output>[1]" + } + } + } + }, param_repo, gconf) + + layer_repo:add_layers( + { + ["nerv.DAGLayer"] = + { + ce_output = { + dim_in = {440, 1}, dim_out = {1}, + sub_layers = layer_repo, + connections = { + ["<input>[1]"] = "main[1]", + ["main[1]"] = "ce_crit[1]", + ["<input>[2]"] = "ce_crit[2]", + ["ce_crit[1]"] = "<output>[1]" + } + }, + softmax_output = { + dim_in = {440}, dim_out = {1959}, + sub_layers = layer_repo, + connections = { + ["<input>[1]"] = "main[1]", + ["main[1]"] = "softmax[1]", + ["softmax[1]"] = "<output>[1]" + } + } + } + }, param_repo, gconf) + + return layer_repo +end + +function get_network(layer_repo) + return layer_repo:get_layer("ce_output") +end + +function get_decode_network(layer_repo) + return layer_repo:get_layer("softmax_output") +end + +function get_global_transf(layer_repo) + return layer_repo:get_layer("global_transf") +end + +function make_readers(scp_file, layer_repo) + return { + {reader = nerv.KaldiReader(gconf, + { + id = "main_scp", + feature_rspecifier = scp_file, + conf_file = gconf.htk_conf, + frm_ext = gconf.frm_ext, + mlfs = { + phone_state = { + targets_rspecifier = "ark:/speechlab/tools/KALDI/kaldi-master/src/bin/ali-to-pdf " .. + "/speechlab/users/mfy43/timit/s5/exp/tri3_ali/final.mdl " .. + "\"ark:gunzip -c /speechlab/users/mfy43/timit/s5/exp/tri3_ali/ali.*.gz |\" " .. + "ark:- | " .. + "/speechlab/tools/KALDI/kaldi-master/src/bin/ali-to-post " .. + "ark:- ark:- |", + format = "map" + } + } + }), + data = {main_scp = 440, phone_state = 1}} + } +end + +function make_decode_readers(scp_file, layer_repo) + return { + {reader = nerv.KaldiReader(gconf, + { + id = "main_scp", + feature_rspecifier = scp_file, + conf_file = gconf.htk_conf, + frm_ext = gconf.frm_ext, + mlfs = {}, + need_key = true + }), + data = {main_scp = 440, phone_state = 1}} + } +end + +function make_buffer(readers) + return nerv.SGDBuffer(gconf, + { + buffer_size = gconf.buffer_size, + batch_size = gconf.batch_size, + randomize = gconf.randomize, + readers = readers, + use_gpu = true + }) +end + +function get_input_order() + return {{id = "main_scp", global_transf = true}, + {id = "phone_state"}} +end + +function get_decode_input_order() + return {{id = "main_scp", global_transf = true}} +end + +function get_accuracy(layer_repo) + local ce_crit = layer_repo:get_layer("ce_crit") + return ce_crit.total_correct / ce_crit.total_frames * 100 +end + +function print_stat(layer_repo) + local ce_crit = layer_repo:get_layer("ce_crit") + nerv.info("*** training stat begin ***") + nerv.printf("cross entropy:\t\t%.8f\n", ce_crit.total_ce) + nerv.printf("correct:\t\t%d\n", ce_crit.total_correct) + nerv.printf("frames:\t\t\t%d\n", ce_crit.total_frames) + nerv.printf("err/frm:\t\t%.8f\n", ce_crit.total_ce / ce_crit.total_frames) + nerv.printf("accuracy:\t\t%.3f%%\n", get_accuracy(layer_repo)) + nerv.info("*** training stat end ***") +end diff --git a/nerv/init.lua b/nerv/init.lua index e7d668c..da7df29 100644 --- a/nerv/init.lua +++ b/nerv/init.lua @@ -13,6 +13,10 @@ function nerv.error_method_not_implemented() nerv.error("method not implemented"); end +function nerv.set_logfile(filename) + nerv._logfile = io.open(filename, "w") +end + --- Format a string just like `sprintf` in C. -- @param fmt the format string -- @param ... 
args, the data to be formatted @@ -25,7 +29,13 @@ end -- @param fmt the format string -- @param ... args, the data to be formatted function nerv.printf(fmt, ...) - io.write(nerv.sprintf(fmt, ...)) + local line = nerv.sprintf(fmt, ...) + io.stderr:write(line) + -- duplicate the all output to the log file, if set + if nerv._logfile then + nerv._logfile:write(line) + nerv._logfile:flush() + end end --- Raise an global error with the formatted message. @@ -54,7 +64,7 @@ end function nerv.warning(fmt, ...) nerv.printf( string.format("(%s)[nerv] warning: %s\n", - os.date("%H:%M:%S.%N %F"), fmt), ...) + os.date("%H:%M:%S %F"), fmt), ...) end --- Create a class (Torch-compatible). @@ -88,24 +98,27 @@ function nerv.class(tname, parenttname) end function table.val_to_str(v) - if "string" == type(v) then - v = string.gsub(v, "\n", "\\n") - if string.match(string.gsub(v,"[^'\"]",""), '^"+$') then - return "'" .. v .. "'" + if "string" == type(v) then + v = string.gsub(v, "\n", "\\n") + if string.match(string.gsub(v,"[^'\"]",""), '^"+$') then + return "'" .. v .. "'" + end + return '"' .. string.gsub(v,'"', '\\"') .. '"' + else + return "table" == type(v) and table.tostring(v) or + (("number" == type(v) or + "string" == type(v) or + "boolean" == type(v)) and tostring(v)) or + nil -- failed to serialize end - return '"' .. string.gsub(v,'"', '\\"') .. '"' - else - return "table" == type(v) and table.tostring(v) or - tostring(v) - end end function table.key_to_str (k) - if "string" == type(k) and string.match(k, "^[_%a][_%a%d]*$") then - return k - else - return "[" .. table.val_to_str(k) .. "]" - end + if "string" == type(k) and string.match(k, "^[_%a][_%a%d]*$") then + return k + else + return "[" .. table.val_to_str(k) .. "]" + end end --- Get the string representation of a table, which can be executed as a valid @@ -114,18 +127,18 @@ end -- @return the string representation which will result in a Lua table entity -- when evaluated function table.tostring(tbl) - local result, done = {}, {} - for k, v in ipairs(tbl) do - table.insert(result, table.val_to_str(v)) - done[k] = true - end - for k, v in pairs(tbl) do - if not done[k] then - table.insert(result, - table.key_to_str(k) .. "=" .. table.val_to_str(v)) + local result, done = {}, {} + for k, v in ipairs(tbl) do + table.insert(result, table.val_to_str(v)) + done[k] = true + end + for k, v in pairs(tbl) do + if not done[k] then + table.insert(result, + table.key_to_str(k) .. "=" .. table.val_to_str(v)) + end end - end - return "{" .. table.concat(result, ",") .. "}" + return "{" .. table.concat(result, ",") .. "}" end --- Get the class by name. @@ -172,6 +185,168 @@ function nerv.include(filename) return dofile(nerv.dirname(caller) .. filename) end +--- Parse the command-line options and arguments +-- @param argv the argrument list to parsed +-- @param options The specification of options, should be a list of tables, +-- each one for exactly one available option, say `v`, with `v[1]`, `v[2]`, +-- `v[3]` indicating the full name of the option, the short form of the option +-- (when it is a boolean option) and the type of the value controlled by the +-- option. `default` and `desc` keys can also be specified to set the default +-- value and description of the option. 
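A minimal usage sketch of this interface, assuming the specification format described above and Lua's standard ``arg`` table as the argument source (the option names below are hypothetical and only illustrate the resolution of ``--opt=value``, short boolean flags and defaults)::

    -- hypothetical option specification, in the format documented above
    local options = {{"verbose", "v", "boolean", default = false, desc = "print extra information"},
                     {"batch-size", nil, "int", default = 256, desc = "frames per mini-batch"}}
    -- parse the command line; non-option tokens are collected in `args`
    local args, opts = nerv.parse_args(arg, options)
    if opts["verbose"].val then
        nerv.printf("batch size = %d\n", opts["batch-size"].val)
    end
    -- print a table of all options with their abbreviations, types, defaults and descriptions
    nerv.print_usage(options)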
+-- +-- An example of specification: +-- {{"aaa", "a", "boolean", default = false, desc = "an option called aaa"}, +-- {"bbb", "b", "boolean", default = true, desc = "bbb is set to be true if --bbb=no does not present"}, +-- {"ccc", nil, "int", default = 0, desc = "ccc expects an integeral value"}}` +-- +-- @return args, opts The non-option arguments and parsed options. `opts` is +-- again a list of tables, each of which corresponds to one table in parameter +-- `options`. The parsed value could be accessed by `opts["aaa"].val` (which is +-- `true` if "--aaa" or "-a" is specified). +function nerv.parse_args(argv, options, unordered) + local is_opt_exp = "^[-](.*)$" + local sim_opt_exp = "^[-]([a-z]+)$" + local opt_exp = "^[-][-]([^=]+)$" + local opt_with_val_exp = "^[-][-]([^=]+)=([^=]+)$" + local opts = {} + local sopts = {} + local args = {} + local arg_start = false + local function err() + nerv.error("invalid format of option specification") + end + for _, v in ipairs(options) do + if type(v) ~= "table" or + (v[1] == nil and v[2] == nil) or + v[3] == nil then + err() + end + local opt_full = v[1] + local opt_short = v[2] + local opt_type = v[3] + local opt_meta = {type = opt_type, + desc = v.desc or "", + val = v.default} + if opt_short ~= nil then + if type(opt_short) ~= "string" or #opt_short ~= 1 then err() end + if opt_type ~= "boolean" then + nerv.error("only boolean option could have short form") + end + sopts[opt_short] = opt_meta + end + if opt_full ~= nil then + if type(opt_full) ~= "string" then err() end + opts[opt_full] = opt_meta + end + end + for _, token in ipairs(argv) do + if ((not arg_start) or unordered) and token:match(is_opt_exp) then + local k = token:match(sim_opt_exp) + if k then + for c in k:gmatch"." do + if sopts[c] then + sopts[c].val = true + else + nerv.error("invalid option -%s", c) + end + end + else + local k = token:match(opt_exp) + if k then + if opts[k] == nil then + nerv.error("invalid option %s", token) + end + if opts[k].type ~= "boolean" then + nerv.error("invalid option --%s: " .. 
+ "a %s value needs to be specified", + k, opts[k].type) + else + opts[k].val = true + end + else + local k, v = token:match(opt_with_val_exp) + if k then + if opts[k] == nil then + nerv.error("invalid option %s", token) + end + if opts[k].type == "boolean" then + if v == "yes" then + opts[k].val = true + elseif v == "no" then + opts[k].val = false + else + nerv.error("boolean value should be \"yes\" or \"no\"") + end + elseif opts[k].type == "int" then + local t = tonumber(v) + opts[k].val = t + if t == nil or math.floor(t) ~= t then + nerv.error("int value is expected") + end + elseif opts[k].type == "number" then + local t = tonumber(v) + opts[k].val = t + if t == nil then + nerv.error("numeric value is expected") + end + elseif opts[k].type == "string" then + opts[k].val = v + else + nerv.error("unrecognized type %s", opts[k].type) + end + else + nerv.error("unrecognized option %s", token) + end + end + end + else + table.insert(args, token) + arg_start = true + end + end + return args, opts +end + +--- Print usage information of the command-line options +-- @param options the list of options used in `parse_args` +function nerv.print_usage(options) + local full_maxlen = 0 + local type_maxlen = 0 + local default_maxlen = 0 + for _, v in ipairs(options) do + local opt_full = v[1] + local opt_short = v[2] + local opt_type = v[3] + full_maxlen = math.max(full_maxlen, #opt_full or 0) + type_maxlen = math.max(full_maxlen, #opt_type or 0) + default_maxlen = math.max(full_maxlen, #tostring(v.default) or 0) + end + local function pattern_gen() + return string.format("\t%%-%ds\t%%-2s\t%%-%ds\t%%-%ds\t%%s\n", + full_maxlen, type_maxlen, default_maxlen) + end + nerv.printf("\n") + nerv.printf(pattern_gen(), "Option", "Abbr.", "Type", "Default", "Desc.") + for _, v in ipairs(options) do + local opt_full = v[1] + local opt_short = v[2] + local opt_type = v[3] + nerv.printf(pattern_gen(), + (opt_full and '--' .. opt_full) or "", + (opt_short and '-' .. 
opt_short) or "", + opt_type, + (v.default ~= nil and tostring(v.default)) or "", + v.desc or "") + end + nerv.printf("\n") +end + +function table.extend(tbl1, tbl2) + for _, v in ipairs(tbl2) do + table.insert(tbl1, v) + end +end + -- the following lines trigger the initialization of basic modules nerv.include('matrix/init.lua') diff --git a/nerv/io/sgd_buffer.lua b/nerv/io/sgd_buffer.lua index 3cf4f5a..d78f6d1 100644 --- a/nerv/io/sgd_buffer.lua +++ b/nerv/io/sgd_buffer.lua @@ -2,8 +2,9 @@ local SGDBuffer = nerv.class("nerv.SGDBuffer", "nerv.DataBuffer") function SGDBuffer:__init(global_conf, buffer_conf) self.gconf = global_conf + self.batch_size = buffer_conf.batch_size self.buffer_size = math.floor(buffer_conf.buffer_size / - global_conf.batch_size) * global_conf.batch_size + self.batch_size) * self.batch_size self.randomize = buffer_conf.randomize self.consume = buffer_conf.consume local cumat_type = global_conf.cumat_type @@ -112,11 +113,11 @@ function SGDBuffer:saturate() end self.rand_map = self.perm_gen(self.tail) -- generate shuffled index collectgarbage("collect") - return self.tail >= self.gconf.batch_size + return self.tail >= self.batch_size end function SGDBuffer:get_data() - local batch_size = self.gconf.batch_size + local batch_size = self.batch_size if self.head >= self.tail then -- buffer is empty local t = os.clock() if (not self:saturate()) and (not self.consume) then diff --git a/nerv/layer/affine.lua b/nerv/layer/affine.lua index 4156dde..38743aa 100644 --- a/nerv/layer/affine.lua +++ b/nerv/layer/affine.lua @@ -8,21 +8,19 @@ local AffineLayer = nerv.class('nerv.AffineLayer', 'nerv.Layer') --- A parameter that consists of a single matrix -- @type nerv.MatrixParam +function MatrixParam:check(checker) + -- check trans matrix type + checker(self.trans) +end + --- Read from a file handle. -- @param handle the file handle function MatrixParam:read(handle) self.trans = self.gconf.mmat_type.load(handle) - if not self.gconf.use_cpu then - self.trans = self.gconf.cumat_type.new_from_host(self.trans) - end end function MatrixParam:write(handle) - local trans = self.trans - if not self.gconf.use_cpu then - trans = self.trans:new_to_host() - end - trans:save(handle) + self.trans:save(handle) end function MatrixParam:train_init() @@ -30,6 +28,12 @@ function MatrixParam:train_init() self.correction:fill(0) end +function MatrixParam:copy(copier) + local target = nerv.MatrixParam(self.id, self.gconf) + target.trans = copier(self.trans) + return target +end + function MatrixParam:_update_by_gradient(gradient, alpha, beta) local gconf = self.gconf -- momentum gain @@ -77,25 +81,24 @@ end --- The constructor. function AffineLayer:__init(id, global_conf, layer_conf) - self.id = id - self.dim_in = layer_conf.dim_in - self.dim_out = layer_conf.dim_out - if layer_conf.ltp ~= nil and layer_conf.ltp1 == nil then - layer_conf.ltp1 = layer_conf.ltp - end + nerv.Layer.__init(self, id, global_conf, layer_conf) + self:check_dim_len(-1, 1) -- exactly one output, allow multiple inputs + self:bind_params() +end + +function AffineLayer:bind_params() for i = 1, #self.dim_in do local pid = "ltp" .. i local pid_list = i == 1 and {pid, "ltp"} or pid - self["ltp" .. i] = self:find_param(pid_list, layer_conf, global_conf, + self["ltp" .. 
i] = self:find_param(pid_list, self.lconf, self.gconf, nerv.LinearTransParam, - {self.dim_in[i], self.dim_out[1]}) + {self.dim_in[i], self.dim_out[1]}) end self.ltp = self.ltp1 -- alias of ltp1 - self.bp = self:find_param("bp", layer_conf, global_conf, + self.bp = self:find_param("bp", self.lconf, self.gconf, nerv.BiasParam, {1, self.dim_out[1]}) - self.gconf = global_conf - self:check_dim_len(-1, 1) -- exactly one output, allow multiple inputs + end function AffineLayer:init(batch_size) @@ -142,7 +145,7 @@ function AffineLayer:back_propagate(bp_err, next_bp_err, input, output) end function AffineLayer:get_params() - local pr = nerv.ParamRepo({self.ltp1, self.bp}) + local pr = nerv.ParamRepo({self.ltp1, self.bp}, self.loc_type) for i = 2, #self.dim_in do pr:add(self["ltp" .. i].id, self["ltp" .. i]) end diff --git a/nerv/layer/bias.lua b/nerv/layer/bias.lua index 924c3da..191be78 100644 --- a/nerv/layer/bias.lua +++ b/nerv/layer/bias.lua @@ -1,12 +1,15 @@ local BiasLayer = nerv.class("nerv.BiasLayer", "nerv.Layer") function BiasLayer:__init(id, global_conf, layer_conf) - self.id = id - self.gconf = global_conf - self.bias = layer_conf.bias - self.dim_in = layer_conf.dim_in - self.dim_out = layer_conf.dim_out + nerv.Layer.__init(self, id, global_conf, layer_conf) self:check_dim_len(1, 1) + self:bind_params() +end + +function BiasLayer:bind_params() + self.bias = self:find_param("bias", self.lconf, self.gconf, + nerv.BiasParam, + {1, self.dim_out[1]}) end function BiasLayer:init() @@ -28,5 +31,5 @@ function BiasLayer:propagate(input, output) end function BiasLayer:get_params() - return nerv.ParamRepo({self.bias}) + return nerv.ParamRepo({self.bias}, self.loc_type) end diff --git a/nerv/layer/combiner.lua b/nerv/layer/combiner.lua index 22e89a9..028c970 100644 --- a/nerv/layer/combiner.lua +++ b/nerv/layer/combiner.lua @@ -1,16 +1,8 @@ local CombinerLayer = nerv.class('nerv.CombinerLayer', 'nerv.Layer') function CombinerLayer:__init(id, global_conf, layer_conf) - self.id = id + nerv.Layer.__init(self, id, global_conf, layer_conf) self.lambda = layer_conf.lambda - self.dim_in = layer_conf.dim_in - self.dim_out = layer_conf.dim_out - self.gconf = global_conf - if self.gconf.use_cpu then - self.mat_type = self.gconf.mmat_type - else - self.mat_type = self.gconf.cumat_type - end self:check_dim_len(#self.lambda, -1) if #self.dim_in < 1 then nerv.error("no input specified") @@ -20,6 +12,10 @@ function CombinerLayer:__init(id, global_conf, layer_conf) end end +function CombinerLayer:bind_params() + -- do nothing +end + function CombinerLayer:init(batch_size) local dim = self.dim_in[1] for i = 2, #self.dim_in do @@ -66,5 +62,5 @@ function CombinerLayer:back_propagate(bp_err, next_bp_err, input, output) end function CombinerLayer:get_params() - return nerv.ParamRepo({}) + return nerv.ParamRepo({}, self.loc_type) end diff --git a/nerv/layer/dropout.lua b/nerv/layer/dropout.lua index 42660cc..1a379c9 100644 --- a/nerv/layer/dropout.lua +++ b/nerv/layer/dropout.lua @@ -1,22 +1,18 @@ local DropoutLayer = nerv.class("nerv.DropoutLayer", "nerv.Layer") function DropoutLayer:__init(id, global_conf, layer_conf) - self.id = id - self.gconf = global_conf - if self.gconf.use_cpu then - self.mat_type = self.gconf.mmat_type - else - self.mat_type = self.gconf.cumat_type - end + nerv.Layer.__init(self, id, global_conf, layer_conf) self.rate = layer_conf.dropout_rate or global_conf.dropout_rate if self.rate == nil then nerv.warning("[DropoutLayer:propagate] dropout rate is not set") end - self.dim_in = 
layer_conf.dim_in - self.dim_out = layer_conf.dim_out self:check_dim_len(1, 1) -- two inputs: nn output and label end +function DropoutLayer:bind_params() + -- do nothing +end + function DropoutLayer:init(batch_size, chunk_size) if self.dim_in[1] ~= self.dim_out[1] then nerv.error("mismatching dimensions of input and output") @@ -73,5 +69,5 @@ function DropoutLayer:back_propagate(bp_err, next_bp_err, input, output, t) end function DropoutLayer:get_params() - return nerv.ParamRepo({}) + return nerv.ParamRepo({}, self.loc_type) end diff --git a/nerv/layer/duplicate.lua b/nerv/layer/duplicate.lua index 8988617..137472b 100644 --- a/nerv/layer/duplicate.lua +++ b/nerv/layer/duplicate.lua @@ -1,10 +1,7 @@ local DuplicateLayer = nerv.class('nerv.DuplicateLayer', 'nerv.Layer') function DuplicateLayer:__init(id, global_conf, layer_conf) - self.id = id - self.dim_in = layer_conf.dim_in - self.dim_out = layer_conf.dim_out - self.gconf = global_conf + nerv.Layer.__init(self, id, global_conf, layer_conf) self:check_dim_len(1, -1) if #self.dim_out < 1 then nerv.error('no output specified') @@ -40,5 +37,5 @@ function DuplicateLayer:update() end function DuplicateLayer:get_params() - return nerv.ParamRepo({}) + return nerv.ParamRepo({}, self.loc_type) end diff --git a/nerv/layer/elem_mul.lua b/nerv/layer/elem_mul.lua index fe80a3f..f03649b 100644 --- a/nerv/layer/elem_mul.lua +++ b/nerv/layer/elem_mul.lua @@ -1,14 +1,15 @@ local ElemMulLayer = nerv.class('nerv.ElemMulLayer', 'nerv.Layer') function ElemMulLayer:__init(id, global_conf, layer_conf) - self.id = id - self.dim_in = layer_conf.dim_in - self.dim_out = layer_conf.dim_out - self.gconf = global_conf + nerv.Layer.__init(self, id, global_conf, layer_conf) -- element-wise multiplication of input[1] and input[2] self:check_dim_len(2, 1) end +function ElemMulLayer:bind_params() + -- do nothing +end + function ElemMulLayer:init(batch_size) if self.dim_in[1] ~= self.dim_in[2] or self.dim_in[1] ~= self.dim_out[1] then @@ -34,5 +35,5 @@ function ElemMulLayer:update(bp_err, input, output) end function ElemMulLayer:get_params() - return nerv.ParamRepo({}) + return nerv.ParamRepo({}, self.loc_type) end diff --git a/nerv/layer/graph.lua b/nerv/layer/graph.lua index 1406eff..5f42fca 100644 --- a/nerv/layer/graph.lua +++ b/nerv/layer/graph.lua @@ -1,10 +1,7 @@ local GraphLayer = nerv.class('nerv.GraphLayer', 'nerv.Layer') function GraphLayer:__init(id, global_conf, layer_conf) - self.id = id - self.dim_in = layer_conf.dim_in - self.dim_out = layer_conf.dim_out - self.gconf = global_conf + nerv.Layer.__init(self, id, global_conf, layer_conf) self:graph_init(layer_conf.layer_repo, layer_conf.connections) end @@ -155,5 +152,5 @@ function GraphLayer:get_params() table.insert(param_repos, ref.layer:get_params()) end end - return nerv.ParamRepo.merge(param_repos) + return nerv.ParamRepo.merge(param_repos, self.loc_type) end diff --git a/nerv/layer/gru.lua b/nerv/layer/gru.lua index e81d21a..71718d7 100644 --- a/nerv/layer/gru.lua +++ b/nerv/layer/gru.lua @@ -4,11 +4,7 @@ function GRULayer:__init(id, global_conf, layer_conf) -- input1:x -- input2:h -- input3:c (h^~) - self.id = id - self.dim_in = layer_conf.dim_in - self.dim_out = layer_conf.dim_out - self.gconf = global_conf - + nerv.Layer.__init(self, id, global_conf, layer_conf) if self.dim_in[2] ~= self.dim_out[1] then nerv.error("dim_in[2](%d) mismatch with dim_out[1](%d)", self.dim_in[2], self.dim_out[1]) @@ -17,7 +13,7 @@ function GRULayer:__init(id, global_conf, layer_conf) -- prepare a DAGLayer to hold the lstm 
structure local pr = layer_conf.pr if pr == nil then - pr = nerv.ParamRepo() + pr = nerv.ParamRepo({}, self.loc_type) end local function ap(str) @@ -63,7 +59,7 @@ function GRULayer:__init(id, global_conf, layer_conf) }, } - local layerRepo = nerv.LayerRepo(layers, pr, global_conf) + self.lrepo = nerv.LayerRepo(layers, pr, global_conf) local connections = { ["<input>[1]"] = ap("inputXDup[1]"), @@ -97,12 +93,20 @@ function GRULayer:__init(id, global_conf, layer_conf) self.dag = nerv.DAGLayer(self.id, global_conf, {dim_in = self.dim_in, dim_out = self.dim_out, - sub_layers = layerRepo, + sub_layers = self.lrepo, connections = connections}) self:check_dim_len(2, 1) -- x, h and h end +function GRULayer:bind_params() + local pr = layer_conf.pr + if pr == nil then + pr = nerv.ParamRepo({}, self.loc_type) + end + self.lrepo:rebind(pr) +end + function GRULayer:init(batch_size, chunk_size) self.dag:init(batch_size, chunk_size) end diff --git a/nerv/layer/identity.lua b/nerv/layer/identity.lua index aeeff89..d56337d 100644 --- a/nerv/layer/identity.lua +++ b/nerv/layer/identity.lua @@ -1,10 +1,7 @@ local IdentityLayer = nerv.class('nerv.IdentityLayer', 'nerv.Layer') function IdentityLayer:__init(id, global_conf, layer_conf) - self.id = id - self.dim_in = layer_conf.dim_in - self.dim_out = layer_conf.dim_out - self.gconf = global_conf + nerv.Layer.__init(self, id, global_conf, layer_conf) self:check_dim_len(1, 1) if self.dim_in[1] ~= self.dim_out[1] then nerv.error('mismatching dimensions of input and output') @@ -29,5 +26,5 @@ function IdentityLayer:update() end function IdentityLayer:get_params() - return nerv.ParamRepo({}) + return nerv.ParamRepo({}, self.loc_type) end diff --git a/nerv/layer/init.lua b/nerv/layer/init.lua index 4fabefa..475ef62 100644 --- a/nerv/layer/init.lua +++ b/nerv/layer/init.lua @@ -30,7 +30,18 @@ end local Layer = nerv.class('nerv.Layer') function Layer:__init(id, global_conf, layer_conf) - nerv.error_method_not_implemented() + self.id = id + self.gconf = global_conf + self.lconf = layer_conf + if self.gconf.use_cpu then + self.mat_type = self.gconf.mmat_type + self.loc_type = nerv.ParamRepo.LOC_TYPES.ON_HOST + else + self.mat_type = self.gconf.cumat_type + self.loc_type = nerv.ParamRepo.LOC_TYPES.ON_DEVICE + end + self.dim_in = layer_conf.dim_in + self.dim_out = layer_conf.dim_out end function Layer:init(batch_size) @@ -66,6 +77,10 @@ function Layer:get_params() nerv.error_method_not_implemented() end +function Layer:bind_params() + nerv.error_method_not_implemented() +end + function Layer:get_dim() return self.dim_in, self.dim_out end @@ -78,30 +93,33 @@ function Layer:get_sublayer(id) nerv.error('primitive layer does not have sublayers') end -function Layer:find_param(pid_list, lconf, gconf, p_type, p_dim) - if type(pid_list) == "string" then - pid_list = {pid_list} +function Layer:find_param(plist, lconf, gconf, p_type, p_dim) + if type(plist) == "string" then + plist = {plist} end - pid_list_str = table.tostring(pid_list) - for i, pid in ipairs(pid_list) do - if lconf[pid] ~= nil then - nerv.info("param [%s] of layer [%s] found in `layer_conf`.", pid, self.id) - return lconf[pid] + if lconf.params == nil then + lconf.params = {} + end + plist_str = table.tostring(plist) + local pid + for i, pname in ipairs(plist) do + if lconf.params[pname] ~= nil then + nerv.info("param id for [%s] of layer [%s] specified in `layer_conf.params`.", pname, self.id) + pid = lconf.params[pname] end - local pid_g = self.id .. '_' .. 
pid --global identifier - local pr = lconf.pr - local p - if pr ~= nil and pr:has_param(pid_g) == true then - nerv.info("param [%s] of layer [%s] found in `layer_conf.pr`.", pid_list_str, self.id) - p = pr:get_param(pid_g) - return p + if lconf.pr:has_param(pid) then + return lconf.pr:get_param(pid) end end - nerv.info("param [%s] of layer [%s] is not found in `layer_conf` or `layer_conf.pr`, " .. - "switch to auto-generate", pid_list_str, self.id) - local pid_g = self.id .. '_' .. pid_list[1] - p = p_type(pid_g, gconf) - p.trans = gconf.cumat_type(unpack(p_dim)) + pid = self.id .. '_' .. plist[1] + if lconf.pr:has_param(pid) then + nerv.info("param id for [%s] of layer [%s] is generated automatically.", pname, self.id) + return lconf.pr:get_param(pid) + end + nerv.info("param id for [%s] of layer [%s] is not found in the specified param repo, " .. + "switch to auto-generate", plist_str, self.id) + local p = p_type(pid, gconf) + p.trans = self.mat_type(unpack(p_dim)) if type(gconf.param_random) ~= "function" then nerv.error("a param generate function is needed") end diff --git a/nerv/layer/lstm.lua b/nerv/layer/lstm.lua index caa7569..641d5dc 100644 --- a/nerv/layer/lstm.lua +++ b/nerv/layer/lstm.lua @@ -4,15 +4,11 @@ function LSTMLayer:__init(id, global_conf, layer_conf) -- input1:x -- input2:h -- input3:c - self.id = id - self.dim_in = layer_conf.dim_in - self.dim_out = layer_conf.dim_out - self.gconf = global_conf - + nerv.Layer.__init(self, id, global_conf, layer_conf) -- prepare a DAGLayer to hold the lstm structure local pr = layer_conf.pr if pr == nil then - pr = nerv.ParamRepo() + pr = nerv.ParamRepo({}, self.loc_type) end local function ap(str) @@ -66,7 +62,7 @@ function LSTMLayer:__init(id, global_conf, layer_conf) }, } - local layerRepo = nerv.LayerRepo(layers, pr, global_conf) + self.lrepo = nerv.LayerRepo(layers, pr, global_conf) local connections = { ["<input>[1]"] = ap("inputXDup[1]"), @@ -109,12 +105,20 @@ function LSTMLayer:__init(id, global_conf, layer_conf) self.dag = nerv.DAGLayer(self.id, global_conf, {dim_in = self.dim_in, dim_out = self.dim_out, - sub_layers = layerRepo, + sub_layers = self.lrepo, connections = connections}) self:check_dim_len(3, 2) -- x, h, c and h, c end +function LSTMLayer:bind_params() + local pr = layer_conf.pr + if pr == nil then + pr = nerv.ParamRepo({}, self.loc_type) + end + self.lrepo:rebind(pr) +end + function LSTMLayer:init(batch_size, chunk_size) self.dag:init(batch_size, chunk_size) end diff --git a/nerv/layer/lstm_gate.lua b/nerv/layer/lstm_gate.lua index 1963eba..7a27bab 100644 --- a/nerv/layer/lstm_gate.lua +++ b/nerv/layer/lstm_gate.lua @@ -2,20 +2,19 @@ local LSTMGateLayer = nerv.class('nerv.LSTMGateLayer', 'nerv.Layer') -- NOTE: this is a full matrix gate function LSTMGateLayer:__init(id, global_conf, layer_conf) - self.id = id - self.dim_in = layer_conf.dim_in - self.dim_out = layer_conf.dim_out - self.gconf = global_conf + nerv.Layer.__init(self, id, global_conf, layer_conf) + self:check_dim_len(-1, 1) --accept multiple inputs + self:bind_params() +end +function LSTMGateLayer:bind_params() for i = 1, #self.dim_in do - self["ltp" .. i] = self:find_param("ltp" .. i, layer_conf, global_conf, + self["ltp" .. i] = self:find_param("ltp" .. 
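The lookup order implemented above can be summarised with a small, hypothetical layer configuration (the ids and the repo variable ``pr`` are made up for illustration; only the resolution logic mirrors the code)::

    -- layer_conf for a layer whose id is "affine0", drawing params from the repo `pr`
    local layer_conf = {
        dim_in = {440}, dim_out = {1024},
        params = {ltp = "shared_ltp"},  -- 1. explicit mapping: "shared_ltp" is looked up in layer_conf.pr
        pr = pr,
    }
    -- 2. if no mapping is given for a name, the id `self.id .. '_' .. plist[1]`
    --    (e.g. "affine0_bp" for the bias) is tried in layer_conf.pr
    -- 3. if that also fails, a fresh param with that id is created and initialized
    --    with gconf.param_random, which must be provided as a function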
i, self.lconf, self.gconf, nerv.LinearTransParam, {self.dim_in[i], self.dim_out[1]}) end - self.bp = self:find_param("bp", layer_conf, global_conf, + self.bp = self:find_param("bp", self.lconf, self.gconf, nerv.BiasParam, {1, self.dim_out[1]}) - - self:check_dim_len(-1, 1) --accept multiple inputs end function LSTMGateLayer:init(batch_size) @@ -69,7 +68,7 @@ function LSTMGateLayer:update(bp_err, input, output) end function LSTMGateLayer:get_params() - local pr = nerv.ParamRepo({self.bp}) + local pr = nerv.ParamRepo({self.bp}, self.loc_type) for i = 1, #self.dim_in do pr:add(self["ltp" .. i].id, self["ltp" .. i]) end diff --git a/nerv/layer/mse.lua b/nerv/layer/mse.lua index 1c218d0..458d086 100644 --- a/nerv/layer/mse.lua +++ b/nerv/layer/mse.lua @@ -1,18 +1,14 @@ local MSELayer = nerv.class("nerv.MSELayer", "nerv.Layer") function MSELayer:__init(id, global_conf, layer_conf) - self.id = id - self.dim_in = layer_conf.dim_in - self.dim_out = layer_conf.dim_out - self.gconf = global_conf - if self.gconf.use_cpu then - self.mat_type = self.gconf.mmat_type - else - self.mat_type = self.gconf.cumat_type - end + nerv.Layer.__init(self, id, global_conf, layer_conf) self:check_dim_len(2, -1) end +function MSELayer:bind_params() + -- do nothing +end + function MSELayer:init(batch_size) if self.dim_in[1] ~= self.dim_in[2] then nerv.error("mismatching dimensions of previous network output and labels") @@ -61,5 +57,5 @@ function MSELayer:back_propagate(bp_err, next_bp_err, input, output) end function MSELayer:get_params() - return nerv.ParamRepo({}) + return nerv.ParamRepo({}, self.loc_type) end diff --git a/nerv/layer/rnn.lua b/nerv/layer/rnn.lua index 38f2326..e59cf5b 100644 --- a/nerv/layer/rnn.lua +++ b/nerv/layer/rnn.lua @@ -1,10 +1,7 @@ local RNNLayer = nerv.class('nerv.RNNLayer', 'nerv.GraphLayer') function RNNLayer:__init(id, global_conf, layer_conf) - self.id = id - self.dim_in = layer_conf.dim_in - self.dim_out = layer_conf.dim_out - self.gconf = layer_conf.gconf + nerv.Layer.__init(self, id, global_conf, layer_conf) self:check_dim_len(1, 1) local din = layer_conf.dim_in[1] @@ -12,7 +9,7 @@ function RNNLayer:__init(id, global_conf, layer_conf) local pr = layer_conf.pr if pr == nil then - pr = nerv.ParamRepo() + pr = nerv.ParamRepo({}, self.loc_type) end local layers = { diff --git a/nerv/layer/sigmoid.lua b/nerv/layer/sigmoid.lua index 0a8bcdc..a9f9749 100644 --- a/nerv/layer/sigmoid.lua +++ b/nerv/layer/sigmoid.lua @@ -1,13 +1,14 @@ local SigmoidLayer = nerv.class("nerv.SigmoidLayer", "nerv.Layer") function SigmoidLayer:__init(id, global_conf, layer_conf) - self.id = id - self.gconf = global_conf - self.dim_in = layer_conf.dim_in - self.dim_out = layer_conf.dim_out + nerv.Layer.__init(self, id, global_conf, layer_conf) self:check_dim_len(1, 1) end +function SigmoidLayer:bind_params() + -- do nothing +end + function SigmoidLayer:init() if self.dim_in[1] ~= self.dim_out[1] then nerv.error("mismatching dimensions of input and output") @@ -31,5 +32,5 @@ function SigmoidLayer:back_propagate(bp_err, next_bp_err, input, output) end function SigmoidLayer:get_params() - return nerv.ParamRepo({}) + return nerv.ParamRepo({}, self.loc_type) end diff --git a/nerv/layer/softmax.lua b/nerv/layer/softmax.lua index 4205b66..f7a5163 100644 --- a/nerv/layer/softmax.lua +++ b/nerv/layer/softmax.lua @@ -1,13 +1,14 @@ local SoftmaxLayer = nerv.class("nerv.SoftmaxLayer", "nerv.Layer") function SoftmaxLayer:__init(id, global_conf, layer_conf) - self.id = id - self.gconf = global_conf - self.dim_in = 
layer_conf.dim_in - self.dim_out = layer_conf.dim_out + nerv.Layer.__init(self, id, global_conf, layer_conf) self:check_dim_len(1, 1) -- two inputs: nn output and label end +function SoftmaxLayer:bind_params() + -- do nothing +end + function SoftmaxLayer:init(batch_size) if self.dim_in[1] ~= self.dim_out[1] then nerv.error("mismatching dimensions of input and output") @@ -31,5 +32,5 @@ function SoftmaxLayer:back_propagate(bp_err, next_bp_err, input, output) end function SoftmaxLayer:get_params() - return nerv.ParamRepo({}) + return nerv.ParamRepo({}, self.loc_type) end diff --git a/nerv/layer/softmax_ce.lua b/nerv/layer/softmax_ce.lua index d7d650e..7b4a80c 100644 --- a/nerv/layer/softmax_ce.lua +++ b/nerv/layer/softmax_ce.lua @@ -1,15 +1,7 @@ local SoftmaxCELayer = nerv.class("nerv.SoftmaxCELayer", "nerv.Layer") function SoftmaxCELayer:__init(id, global_conf, layer_conf) - self.id = id - self.gconf = global_conf - if self.gconf.use_cpu then - self.mat_type = self.gconf.mmat_type - else - self.mat_type = self.gconf.cumat_type - end - self.dim_in = layer_conf.dim_in - self.dim_out = layer_conf.dim_out + nerv.Layer.__init(self, id, global_conf, layer_conf) self.compressed = layer_conf.compressed if self.compressed == nil then self.compressed = false @@ -17,6 +9,10 @@ function SoftmaxCELayer:__init(id, global_conf, layer_conf) self:check_dim_len(2, -1) -- two inputs: nn output and label end +function SoftmaxCELayer:bind_params() + -- do nothing +end + function SoftmaxCELayer:init(batch_size, chunk_size) if not self.compressed and (self.dim_in[1] ~= self.dim_in[2]) then nerv.error("mismatching dimensions of previous network output and labels") @@ -94,5 +90,5 @@ function SoftmaxCELayer:back_propagate(bp_err, next_bp_err, input, output, t) end function SoftmaxCELayer:get_params() - return nerv.ParamRepo({}) + return nerv.ParamRepo({}, self.loc_type) end diff --git a/nerv/layer/tanh.lua b/nerv/layer/tanh.lua index e1c32f2..7a19fc8 100644 --- a/nerv/layer/tanh.lua +++ b/nerv/layer/tanh.lua @@ -1,13 +1,14 @@ local TanhLayer = nerv.class("nerv.TanhLayer", "nerv.Layer") function TanhLayer:__init(id, global_conf, layer_conf) - self.id = id - self.gconf = global_conf - self.dim_in = layer_conf.dim_in - self.dim_out = layer_conf.dim_out + nerv.Layer.__init(self, id, global_conf, layer_conf) self:check_dim_len(1, 1) end +function TanhLayer:bind_params() + -- do nothing +end + function TanhLayer:init() if self.dim_in[1] ~= self.dim_out[1] then nerv.error("mismatching dimensions of input and output") @@ -31,5 +32,5 @@ function TanhLayer:back_propagate(bp_err, next_bp_err, input, output) end function TanhLayer:get_params() - return nerv.ParamRepo({}) + return nerv.ParamRepo({}, self.loc_type) end diff --git a/nerv/layer/window.lua b/nerv/layer/window.lua index 4933de0..364929f 100644 --- a/nerv/layer/window.lua +++ b/nerv/layer/window.lua @@ -1,12 +1,15 @@ local WindowLayer = nerv.class("nerv.WindowLayer", "nerv.Layer") function WindowLayer:__init(id, global_conf, layer_conf) - self.id = id - self.gconf = global_conf - self.window = layer_conf.window - self.dim_in = layer_conf.dim_in - self.dim_out = layer_conf.dim_out + nerv.Layer.__init(self, id, global_conf, layer_conf) self:check_dim_len(1, 1) + self:bind_params() +end + +function WindowLayer:bind_params() + self.window = self:find_param("window", self.lconf, self.gconf, + nerv.BiasParam, + {1, self.dim_out[1]}) end function WindowLayer:init() @@ -28,5 +31,5 @@ function WindowLayer:propagate(input, output) end function WindowLayer:get_params() - return 
nerv.ParamRepo({self.window}) + return nerv.ParamRepo({self.window}, self.loc_type) end diff --git a/nerv/lib/cblas.h b/nerv/lib/cblas.h new file mode 100644 index 0000000..4087ffb --- /dev/null +++ b/nerv/lib/cblas.h @@ -0,0 +1,596 @@ +#ifndef CBLAS_H + +#ifndef CBLAS_ENUM_DEFINED_H + #define CBLAS_ENUM_DEFINED_H + enum CBLAS_ORDER {CblasRowMajor=101, CblasColMajor=102 }; + enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=113, + AtlasConj=114}; + enum CBLAS_UPLO {CblasUpper=121, CblasLower=122}; + enum CBLAS_DIAG {CblasNonUnit=131, CblasUnit=132}; + enum CBLAS_SIDE {CblasLeft=141, CblasRight=142}; +#endif + +#ifndef CBLAS_ENUM_ONLY +#define CBLAS_H +#define CBLAS_INDEX int + +int cblas_errprn(int ierr, int info, char *form, ...); + +/* + * =========================================================================== + * Prototypes for level 1 BLAS functions (complex are recast as routines) + * =========================================================================== + */ +float cblas_sdsdot(const int N, const float alpha, const float *X, + const int incX, const float *Y, const int incY); +double cblas_dsdot(const int N, const float *X, const int incX, const float *Y, + const int incY); +float cblas_sdot(const int N, const float *X, const int incX, + const float *Y, const int incY); +double cblas_ddot(const int N, const double *X, const int incX, + const double *Y, const int incY); +/* + * Functions having prefixes Z and C only + */ +void cblas_cdotu_sub(const int N, const void *X, const int incX, + const void *Y, const int incY, void *dotu); +void cblas_cdotc_sub(const int N, const void *X, const int incX, + const void *Y, const int incY, void *dotc); + +void cblas_zdotu_sub(const int N, const void *X, const int incX, + const void *Y, const int incY, void *dotu); +void cblas_zdotc_sub(const int N, const void *X, const int incX, + const void *Y, const int incY, void *dotc); + + +/* + * Functions having prefixes S D SC DZ + */ +float cblas_snrm2(const int N, const float *X, const int incX); +float cblas_sasum(const int N, const float *X, const int incX); + +double cblas_dnrm2(const int N, const double *X, const int incX); +double cblas_dasum(const int N, const double *X, const int incX); + +float cblas_scnrm2(const int N, const void *X, const int incX); +float cblas_scasum(const int N, const void *X, const int incX); + +double cblas_dznrm2(const int N, const void *X, const int incX); +double cblas_dzasum(const int N, const void *X, const int incX); + + +/* + * Functions having standard 4 prefixes (S D C Z) + */ +CBLAS_INDEX cblas_isamax(const int N, const float *X, const int incX); +CBLAS_INDEX cblas_idamax(const int N, const double *X, const int incX); +CBLAS_INDEX cblas_icamax(const int N, const void *X, const int incX); +CBLAS_INDEX cblas_izamax(const int N, const void *X, const int incX); + +/* + * =========================================================================== + * Prototypes for level 1 BLAS routines + * =========================================================================== + */ + +/* + * Routines with standard 4 prefixes (s, d, c, z) + */ +void cblas_sswap(const int N, float *X, const int incX, + float *Y, const int incY); +void cblas_scopy(const int N, const float *X, const int incX, + float *Y, const int incY); +void cblas_saxpy(const int N, const float alpha, const float *X, + const int incX, float *Y, const int incY); +void catlas_saxpby(const int N, const float alpha, const float *X, + const int incX, const float beta, float *Y, const int 
incY); +void catlas_sset + (const int N, const float alpha, float *X, const int incX); + +void cblas_dswap(const int N, double *X, const int incX, + double *Y, const int incY); +void cblas_dcopy(const int N, const double *X, const int incX, + double *Y, const int incY); +void cblas_daxpy(const int N, const double alpha, const double *X, + const int incX, double *Y, const int incY); +void catlas_daxpby(const int N, const double alpha, const double *X, + const int incX, const double beta, double *Y, const int incY); +void catlas_dset + (const int N, const double alpha, double *X, const int incX); + +void cblas_cswap(const int N, void *X, const int incX, + void *Y, const int incY); +void cblas_ccopy(const int N, const void *X, const int incX, + void *Y, const int incY); +void cblas_caxpy(const int N, const void *alpha, const void *X, + const int incX, void *Y, const int incY); +void catlas_caxpby(const int N, const void *alpha, const void *X, + const int incX, const void *beta, void *Y, const int incY); +void catlas_cset + (const int N, const void *alpha, void *X, const int incX); + +void cblas_zswap(const int N, void *X, const int incX, + void *Y, const int incY); +void cblas_zcopy(const int N, const void *X, const int incX, + void *Y, const int incY); +void cblas_zaxpy(const int N, const void *alpha, const void *X, + const int incX, void *Y, const int incY); +void catlas_zaxpby(const int N, const void *alpha, const void *X, + const int incX, const void *beta, void *Y, const int incY); +void catlas_zset + (const int N, const void *alpha, void *X, const int incX); + + +/* + * Routines with S and D prefix only + */ +void cblas_srotg(float *a, float *b, float *c, float *s); +void cblas_srotmg(float *d1, float *d2, float *b1, const float b2, float *P); +void cblas_srot(const int N, float *X, const int incX, + float *Y, const int incY, const float c, const float s); +void cblas_srotm(const int N, float *X, const int incX, + float *Y, const int incY, const float *P); + +void cblas_drotg(double *a, double *b, double *c, double *s); +void cblas_drotmg(double *d1, double *d2, double *b1, const double b2, double *P); +void cblas_drot(const int N, double *X, const int incX, + double *Y, const int incY, const double c, const double s); +void cblas_drotm(const int N, double *X, const int incX, + double *Y, const int incY, const double *P); + + +/* + * Routines with S D C Z CS and ZD prefixes + */ +void cblas_sscal(const int N, const float alpha, float *X, const int incX); +void cblas_dscal(const int N, const double alpha, double *X, const int incX); +void cblas_cscal(const int N, const void *alpha, void *X, const int incX); +void cblas_zscal(const int N, const void *alpha, void *X, const int incX); +void cblas_csscal(const int N, const float alpha, void *X, const int incX); +void cblas_zdscal(const int N, const double alpha, void *X, const int incX); + +/* + * Extra reference routines provided by ATLAS, but not mandated by the standard + */ +void cblas_crotg(void *a, void *b, void *c, void *s); +void cblas_zrotg(void *a, void *b, void *c, void *s); +void cblas_csrot(const int N, void *X, const int incX, void *Y, const int incY, + const float c, const float s); +void cblas_zdrot(const int N, void *X, const int incX, void *Y, const int incY, + const double c, const double s); + +/* + * =========================================================================== + * Prototypes for level 2 BLAS + * =========================================================================== + */ + +/* + * Routines with 
standard 4 prefixes (S, D, C, Z) + */ +void cblas_sgemv(const enum CBLAS_ORDER Order, + const enum CBLAS_TRANSPOSE TransA, const int M, const int N, + const float alpha, const float *A, const int lda, + const float *X, const int incX, const float beta, + float *Y, const int incY); +void cblas_sgbmv(const enum CBLAS_ORDER Order, + const enum CBLAS_TRANSPOSE TransA, const int M, const int N, + const int KL, const int KU, const float alpha, + const float *A, const int lda, const float *X, + const int incX, const float beta, float *Y, const int incY); +void cblas_strmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const int N, const float *A, const int lda, + float *X, const int incX); +void cblas_stbmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const int N, const int K, const float *A, const int lda, + float *X, const int incX); +void cblas_stpmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const int N, const float *Ap, float *X, const int incX); +void cblas_strsv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const int N, const float *A, const int lda, float *X, + const int incX); +void cblas_stbsv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const int N, const int K, const float *A, const int lda, + float *X, const int incX); +void cblas_stpsv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const int N, const float *Ap, float *X, const int incX); + +void cblas_dgemv(const enum CBLAS_ORDER Order, + const enum CBLAS_TRANSPOSE TransA, const int M, const int N, + const double alpha, const double *A, const int lda, + const double *X, const int incX, const double beta, + double *Y, const int incY); +void cblas_dgbmv(const enum CBLAS_ORDER Order, + const enum CBLAS_TRANSPOSE TransA, const int M, const int N, + const int KL, const int KU, const double alpha, + const double *A, const int lda, const double *X, + const int incX, const double beta, double *Y, const int incY); +void cblas_dtrmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const int N, const double *A, const int lda, + double *X, const int incX); +void cblas_dtbmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const int N, const int K, const double *A, const int lda, + double *X, const int incX); +void cblas_dtpmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const int N, const double *Ap, double *X, const int incX); +void cblas_dtrsv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const int N, const double *A, const int lda, double *X, + const int incX); +void cblas_dtbsv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const int N, const int K, const double *A, const int lda, + double *X, const int incX); +void cblas_dtpsv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + 
const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const int N, const double *Ap, double *X, const int incX); + +void cblas_cgemv(const enum CBLAS_ORDER Order, + const enum CBLAS_TRANSPOSE TransA, const int M, const int N, + const void *alpha, const void *A, const int lda, + const void *X, const int incX, const void *beta, + void *Y, const int incY); +void cblas_cgbmv(const enum CBLAS_ORDER Order, + const enum CBLAS_TRANSPOSE TransA, const int M, const int N, + const int KL, const int KU, const void *alpha, + const void *A, const int lda, const void *X, + const int incX, const void *beta, void *Y, const int incY); +void cblas_ctrmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const int N, const void *A, const int lda, + void *X, const int incX); +void cblas_ctbmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const int N, const int K, const void *A, const int lda, + void *X, const int incX); +void cblas_ctpmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const int N, const void *Ap, void *X, const int incX); +void cblas_ctrsv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const int N, const void *A, const int lda, void *X, + const int incX); +void cblas_ctbsv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const int N, const int K, const void *A, const int lda, + void *X, const int incX); +void cblas_ctpsv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const int N, const void *Ap, void *X, const int incX); + +void cblas_zgemv(const enum CBLAS_ORDER Order, + const enum CBLAS_TRANSPOSE TransA, const int M, const int N, + const void *alpha, const void *A, const int lda, + const void *X, const int incX, const void *beta, + void *Y, const int incY); +void cblas_zgbmv(const enum CBLAS_ORDER Order, + const enum CBLAS_TRANSPOSE TransA, const int M, const int N, + const int KL, const int KU, const void *alpha, + const void *A, const int lda, const void *X, + const int incX, const void *beta, void *Y, const int incY); +void cblas_ztrmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const int N, const void *A, const int lda, + void *X, const int incX); +void cblas_ztbmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const int N, const int K, const void *A, const int lda, + void *X, const int incX); +void cblas_ztpmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const int N, const void *Ap, void *X, const int incX); +void cblas_ztrsv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const int N, const void *A, const int lda, void *X, + const int incX); +void cblas_ztbsv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const int N, const int K, const void *A, const int lda, + void *X, const int incX); +void cblas_ztpsv(const enum CBLAS_ORDER Order, 
const enum CBLAS_UPLO Uplo, + const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const int N, const void *Ap, void *X, const int incX); + + +/* + * Routines with S and D prefixes only + */ +void cblas_ssymv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const int N, const float alpha, const float *A, + const int lda, const float *X, const int incX, + const float beta, float *Y, const int incY); +void cblas_ssbmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const int N, const int K, const float alpha, const float *A, + const int lda, const float *X, const int incX, + const float beta, float *Y, const int incY); +void cblas_sspmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const int N, const float alpha, const float *Ap, + const float *X, const int incX, + const float beta, float *Y, const int incY); +void cblas_sger(const enum CBLAS_ORDER Order, const int M, const int N, + const float alpha, const float *X, const int incX, + const float *Y, const int incY, float *A, const int lda); +void cblas_ssyr(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const int N, const float alpha, const float *X, + const int incX, float *A, const int lda); +void cblas_sspr(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const int N, const float alpha, const float *X, + const int incX, float *Ap); +void cblas_ssyr2(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const int N, const float alpha, const float *X, + const int incX, const float *Y, const int incY, float *A, + const int lda); +void cblas_sspr2(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const int N, const float alpha, const float *X, + const int incX, const float *Y, const int incY, float *A); + +void cblas_dsymv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const int N, const double alpha, const double *A, + const int lda, const double *X, const int incX, + const double beta, double *Y, const int incY); +void cblas_dsbmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const int N, const int K, const double alpha, const double *A, + const int lda, const double *X, const int incX, + const double beta, double *Y, const int incY); +void cblas_dspmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const int N, const double alpha, const double *Ap, + const double *X, const int incX, + const double beta, double *Y, const int incY); +void cblas_dger(const enum CBLAS_ORDER Order, const int M, const int N, + const double alpha, const double *X, const int incX, + const double *Y, const int incY, double *A, const int lda); +void cblas_dsyr(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const int N, const double alpha, const double *X, + const int incX, double *A, const int lda); +void cblas_dspr(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const int N, const double alpha, const double *X, + const int incX, double *Ap); +void cblas_dsyr2(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const int N, const double alpha, const double *X, + const int incX, const double *Y, const int incY, double *A, + const int lda); +void cblas_dspr2(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const int N, const double alpha, const double *X, + const int incX, const double *Y, const int incY, double *A); + + +/* + * Routines with C and Z prefixes only + */ +void cblas_chemv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const int N, const void *alpha, const void *A, + 
const int lda, const void *X, const int incX, + const void *beta, void *Y, const int incY); +void cblas_chbmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const int N, const int K, const void *alpha, const void *A, + const int lda, const void *X, const int incX, + const void *beta, void *Y, const int incY); +void cblas_chpmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const int N, const void *alpha, const void *Ap, + const void *X, const int incX, + const void *beta, void *Y, const int incY); +void cblas_cgeru(const enum CBLAS_ORDER Order, const int M, const int N, + const void *alpha, const void *X, const int incX, + const void *Y, const int incY, void *A, const int lda); +void cblas_cgerc(const enum CBLAS_ORDER Order, const int M, const int N, + const void *alpha, const void *X, const int incX, + const void *Y, const int incY, void *A, const int lda); +void cblas_cher(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const int N, const float alpha, const void *X, const int incX, + void *A, const int lda); +void cblas_chpr(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const int N, const float alpha, const void *X, + const int incX, void *A); +void cblas_cher2(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const int N, + const void *alpha, const void *X, const int incX, + const void *Y, const int incY, void *A, const int lda); +void cblas_chpr2(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const int N, + const void *alpha, const void *X, const int incX, + const void *Y, const int incY, void *Ap); + +void cblas_zhemv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const int N, const void *alpha, const void *A, + const int lda, const void *X, const int incX, + const void *beta, void *Y, const int incY); +void cblas_zhbmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const int N, const int K, const void *alpha, const void *A, + const int lda, const void *X, const int incX, + const void *beta, void *Y, const int incY); +void cblas_zhpmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const int N, const void *alpha, const void *Ap, + const void *X, const int incX, + const void *beta, void *Y, const int incY); +void cblas_zgeru(const enum CBLAS_ORDER Order, const int M, const int N, + const void *alpha, const void *X, const int incX, + const void *Y, const int incY, void *A, const int lda); +void cblas_zgerc(const enum CBLAS_ORDER Order, const int M, const int N, + const void *alpha, const void *X, const int incX, + const void *Y, const int incY, void *A, const int lda); +void cblas_zher(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const int N, const double alpha, const void *X, const int incX, + void *A, const int lda); +void cblas_zhpr(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const int N, const double alpha, const void *X, + const int incX, void *A); +void cblas_zher2(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const int N, + const void *alpha, const void *X, const int incX, + const void *Y, const int incY, void *A, const int lda); +void cblas_zhpr2(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const int N, + const void *alpha, const void *X, const int incX, + const void *Y, const int incY, void *Ap); + +/* + * =========================================================================== + * Prototypes for level 3 BLAS + * =========================================================================== + */ + +/* + * Routines 
with standard 4 prefixes (S, D, C, Z) + */ +void cblas_sgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, + const enum CBLAS_TRANSPOSE TransB, const int M, const int N, + const int K, const float alpha, const float *A, + const int lda, const float *B, const int ldb, + const float beta, float *C, const int ldc); +void cblas_ssymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, + const enum CBLAS_UPLO Uplo, const int M, const int N, + const float alpha, const float *A, const int lda, + const float *B, const int ldb, const float beta, + float *C, const int ldc); +void cblas_ssyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const enum CBLAS_TRANSPOSE Trans, const int N, const int K, + const float alpha, const float *A, const int lda, + const float beta, float *C, const int ldc); +void cblas_ssyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const enum CBLAS_TRANSPOSE Trans, const int N, const int K, + const float alpha, const float *A, const int lda, + const float *B, const int ldb, const float beta, + float *C, const int ldc); +void cblas_strmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, + const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, + const enum CBLAS_DIAG Diag, const int M, const int N, + const float alpha, const float *A, const int lda, + float *B, const int ldb); +void cblas_strsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, + const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, + const enum CBLAS_DIAG Diag, const int M, const int N, + const float alpha, const float *A, const int lda, + float *B, const int ldb); + +void cblas_dgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, + const enum CBLAS_TRANSPOSE TransB, const int M, const int N, + const int K, const double alpha, const double *A, + const int lda, const double *B, const int ldb, + const double beta, double *C, const int ldc); +void cblas_dsymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, + const enum CBLAS_UPLO Uplo, const int M, const int N, + const double alpha, const double *A, const int lda, + const double *B, const int ldb, const double beta, + double *C, const int ldc); +void cblas_dsyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const enum CBLAS_TRANSPOSE Trans, const int N, const int K, + const double alpha, const double *A, const int lda, + const double beta, double *C, const int ldc); +void cblas_dsyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const enum CBLAS_TRANSPOSE Trans, const int N, const int K, + const double alpha, const double *A, const int lda, + const double *B, const int ldb, const double beta, + double *C, const int ldc); +void cblas_dtrmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, + const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, + const enum CBLAS_DIAG Diag, const int M, const int N, + const double alpha, const double *A, const int lda, + double *B, const int ldb); +void cblas_dtrsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, + const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, + const enum CBLAS_DIAG Diag, const int M, const int N, + const double alpha, const double *A, const int lda, + double *B, const int ldb); + +void cblas_cgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, + const enum CBLAS_TRANSPOSE TransB, const int M, const int N, + const int K, const void *alpha, const void *A, + const int lda, const void *B, const int ldb, + const void 
*beta, void *C, const int ldc); +void cblas_csymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, + const enum CBLAS_UPLO Uplo, const int M, const int N, + const void *alpha, const void *A, const int lda, + const void *B, const int ldb, const void *beta, + void *C, const int ldc); +void cblas_csyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const enum CBLAS_TRANSPOSE Trans, const int N, const int K, + const void *alpha, const void *A, const int lda, + const void *beta, void *C, const int ldc); +void cblas_csyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const enum CBLAS_TRANSPOSE Trans, const int N, const int K, + const void *alpha, const void *A, const int lda, + const void *B, const int ldb, const void *beta, + void *C, const int ldc); +void cblas_ctrmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, + const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, + const enum CBLAS_DIAG Diag, const int M, const int N, + const void *alpha, const void *A, const int lda, + void *B, const int ldb); +void cblas_ctrsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, + const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, + const enum CBLAS_DIAG Diag, const int M, const int N, + const void *alpha, const void *A, const int lda, + void *B, const int ldb); + +void cblas_zgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, + const enum CBLAS_TRANSPOSE TransB, const int M, const int N, + const int K, const void *alpha, const void *A, + const int lda, const void *B, const int ldb, + const void *beta, void *C, const int ldc); +void cblas_zsymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, + const enum CBLAS_UPLO Uplo, const int M, const int N, + const void *alpha, const void *A, const int lda, + const void *B, const int ldb, const void *beta, + void *C, const int ldc); +void cblas_zsyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const enum CBLAS_TRANSPOSE Trans, const int N, const int K, + const void *alpha, const void *A, const int lda, + const void *beta, void *C, const int ldc); +void cblas_zsyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const enum CBLAS_TRANSPOSE Trans, const int N, const int K, + const void *alpha, const void *A, const int lda, + const void *B, const int ldb, const void *beta, + void *C, const int ldc); +void cblas_ztrmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, + const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, + const enum CBLAS_DIAG Diag, const int M, const int N, + const void *alpha, const void *A, const int lda, + void *B, const int ldb); +void cblas_ztrsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, + const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, + const enum CBLAS_DIAG Diag, const int M, const int N, + const void *alpha, const void *A, const int lda, + void *B, const int ldb); + + +/* + * Routines with prefixes C and Z only + */ +void cblas_chemm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, + const enum CBLAS_UPLO Uplo, const int M, const int N, + const void *alpha, const void *A, const int lda, + const void *B, const int ldb, const void *beta, + void *C, const int ldc); +void cblas_cherk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const enum CBLAS_TRANSPOSE Trans, const int N, const int K, + const float alpha, const void *A, const int lda, + const float beta, void *C, const int ldc); +void cblas_cher2k(const enum CBLAS_ORDER Order, const enum 
CBLAS_UPLO Uplo, + const enum CBLAS_TRANSPOSE Trans, const int N, const int K, + const void *alpha, const void *A, const int lda, + const void *B, const int ldb, const float beta, + void *C, const int ldc); +void cblas_zhemm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, + const enum CBLAS_UPLO Uplo, const int M, const int N, + const void *alpha, const void *A, const int lda, + const void *B, const int ldb, const void *beta, + void *C, const int ldc); +void cblas_zherk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const enum CBLAS_TRANSPOSE Trans, const int N, const int K, + const double alpha, const void *A, const int lda, + const double beta, void *C, const int ldc); +void cblas_zher2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, + const enum CBLAS_TRANSPOSE Trans, const int N, const int K, + const void *alpha, const void *A, const int lda, + const void *B, const int ldb, const double beta, + void *C, const int ldc); + +int cblas_errprn(int ierr, int info, char *form, ...); + +#endif /* end #ifdef CBLAS_ENUM_ONLY */ +#endif diff --git a/nerv/lib/matrix/cumatrix.c b/nerv/lib/matrix/cumatrix.c index ff2ea22..aec4d60 100644 --- a/nerv/lib/matrix/cumatrix.c +++ b/nerv/lib/matrix/cumatrix.c @@ -37,7 +37,9 @@ void nerv_cuda_context_accu_profile(CuContext *context, *val += delta; } -static void new_cuda_handles(CuContext *context, Status *status) { +static void new_cuda_handles(CuContext *context, int dev, Status *status) { + if (context->has_handle) return; + CUDA_SAFE_SYNC_CALL(cudaSetDevice(dev), status); CUBLAS_SAFE_SYNC_CALL(cublasCreate(&(context->cublas_handle)), status); CURAND_SAFE_SYNC_CALL(curandCreateGenerator(&(context->curand_gen), CURAND_RNG_PSEUDO_DEFAULT), status); @@ -47,9 +49,12 @@ static void new_cuda_handles(CuContext *context, Status *status) { CUDA_SAFE_SYNC_CALL(cudaEventCreate(&(context->profile_start)), status); CUDA_SAFE_SYNC_CALL(cudaEventCreate(&(context->profile_stop)), status); NERV_SET_STATUS(status, NERV_NORMAL, 0); + context->has_handle = 1; } static void free_cuda_handles(CuContext *context, Status *status) { + if (!context->has_handle) return; + context->has_handle = 0; CUBLAS_SAFE_SYNC_CALL(cublasDestroy(context->cublas_handle), status); CURAND_SAFE_SYNC_CALL(curandDestroyGenerator(context->curand_gen), status); CUDA_SAFE_SYNC_CALL(cudaEventDestroy(context->profile_start), status); @@ -57,9 +62,41 @@ static void free_cuda_handles(CuContext *context, Status *status) { NERV_SET_STATUS(status, NERV_NORMAL, 0); } -CuContext *nerv_cuda_context_create(Status *status) { +static int choose_best_gpu(Status *status) { + int i, n, dev = 0; + float best_ratio = 0; + fprintf(stderr, "*** select a GPU based on available space\n"); + CUDA_SAFE_CALL_RET(cudaGetDeviceCount(&n), status); + for (i = 0; i < n; i++) + { + size_t avail, total; + float ratio; + CUDA_SAFE_SYNC_CALL_RET(cudaSetDevice(i), status); + CUDA_SAFE_SYNC_CALL_RET(cuMemGetInfo(&avail, &total), status); + ratio = (float)avail/total * 100; + fprintf(stderr, "* card %d: %.2f%%\n", i, ratio); + if (ratio > best_ratio) + { + best_ratio = ratio; + dev = i; + } + CUDA_SAFE_SYNC_CALL_RET(cudaDeviceReset(), status); + } + fprintf(stderr, "*** final decision: GPU %d\n", dev); + NERV_SET_STATUS(status, NERV_NORMAL, 0); + return dev; +} + +CuContext *nerv_cuda_context_create(int dev, Status *status) { CuContext *context = (CuContext *)malloc(sizeof(CuContext)); - new_cuda_handles(context, status); + context->has_handle = 0; /* this line must come first */ + if (dev == -1) + { + dev 
= choose_best_gpu(status); + if (status->err_code != NERV_NORMAL) + return NULL; + } + new_cuda_handles(context, dev, status); if (status->err_code != NERV_NORMAL) return NULL; context->profile = nerv_hashmap_create(PROFILE_HASHMAP_SIZE, bkdr_hash, strcmp); @@ -78,11 +115,14 @@ void nerv_cuda_context_destroy(CuContext *context, Status *status) { void nerv_cuda_context_select_gpu(CuContext *context, int dev, Status *status) { - free_cuda_handles(context, status); + /* free_cuda_handles(context, status); if (status->err_code != NERV_NORMAL) return; - CUDA_SAFE_SYNC_CALL(cudaSetDevice(dev), status); - new_cuda_handles(context, status); + */ + /* because of cudaDeviceReset */ + context->has_handle = 0; + CUDA_SAFE_SYNC_CALL(cudaDeviceReset(), status); + new_cuda_handles(context, dev, status); if (status->err_code != NERV_NORMAL) return; NERV_SET_STATUS(status, NERV_NORMAL, 0); diff --git a/nerv/lib/matrix/cumatrix.h b/nerv/lib/matrix/cumatrix.h index 280035b..fd2a5ce 100644 --- a/nerv/lib/matrix/cumatrix.h +++ b/nerv/lib/matrix/cumatrix.h @@ -5,6 +5,7 @@ #include "cuda_helper.h" typedef struct CuContext { + int has_handle; cublasHandle_t cublas_handle; cudaEvent_t profile_start, profile_stop; curandGenerator_t curand_gen; @@ -15,6 +16,6 @@ void nerv_cuda_context_print_profile(CuContext *context); void nerv_cuda_context_clear_profile(CuContext *context); void nerv_cuda_context_accu_profile(CuContext *context, const char *name, float delta); void nerv_cuda_context_select_gpu(CuContext *context, int dev, Status *status); -CuContext *nerv_cuda_context_create(Status *status); +CuContext *nerv_cuda_context_create(int dev, Status *status); void nerv_cuda_context_destroy(CuContext *contex, Status *status); #endif diff --git a/nerv/lib/matrix/generic/mmatrix.c b/nerv/lib/matrix/generic/mmatrix.c index 485d778..fb99b53 100644 --- a/nerv/lib/matrix/generic/mmatrix.c +++ b/nerv/lib/matrix/generic/mmatrix.c @@ -8,10 +8,10 @@ context, status) #define NERV_GENERIC_MATRIX #include "../../common.h" +#include "../../cblas.h" #include "../../io/chunk_file.h" #include <string.h> #include <math.h> -#include <cblas.h> #include <float.h> Matrix *nerv_matrix_(colsum)(Matrix *a, MContext *context, Status *status) { diff --git a/nerv/matrix/cumatrix.c b/nerv/matrix/cumatrix.c index f6a4ed5..b8eef9c 100644 --- a/nerv/matrix/cumatrix.c +++ b/nerv/matrix/cumatrix.c @@ -9,7 +9,7 @@ const char *nerv_cuda_context_tname = "nerv.CuContext"; int nerv_cuda_context_lua_select_gpu(lua_State *L) { Status status; nerv_cuda_context_select_gpu(luaT_checkudata(L, 1, nerv_cuda_context_tname), - luaL_checkinteger(L, 1), &status); + luaL_checkinteger(L, 2), &status); NERV_LUA_CHECK_STATUS(L, status); return 0; } @@ -26,7 +26,8 @@ int nerv_cuda_context_lua_clear_profile(lua_State *L) { int nerv_cuda_context_lua_new(lua_State *L) { Status status; - CuContext *self = nerv_cuda_context_create(&status); + int dev = lua_gettop(L) > 0 ? 
luaL_checkinteger(L, 1) : -1; + CuContext *self = nerv_cuda_context_create(dev, &status); NERV_LUA_CHECK_STATUS(L, status); luaT_pushudata(L, self, nerv_cuda_context_tname); return 1; diff --git a/nerv/matrix/generic/mmatrix.c b/nerv/matrix/generic/mmatrix.c index 69000b7..1f37173 100644 --- a/nerv/matrix/generic/mmatrix.c +++ b/nerv/matrix/generic/mmatrix.c @@ -8,10 +8,10 @@ #define MATRIX_BASE_TNAME nerv_matrix_host_tname #define NERV_GENERIC_MATRIX #include "../../lib/common.h" +#include "../../lib/cblas.h" #include "../../lib/matrix/generic/mmatrix.h" #include "../../io/chunk_file.h" #include <string.h> -#include <cblas.h> #define BLAS_OP_N CblasNoTrans static int nerv_matrix_(lua_get_blas_op)(char ch) { diff --git a/nerv/matrix/init.lua b/nerv/matrix/init.lua index ef2fb6b..cf85004 100644 --- a/nerv/matrix/init.lua +++ b/nerv/matrix/init.lua @@ -87,6 +87,17 @@ function nerv.Matrix:__mul__(b) return c end +--- A wrapper function for `copy_from` +function nerv.Matrix:copy_to(b, ...) + b:copy_from(self, ...) +end + +--- The base class for all device (in-GPU) matrices +-- @type nerv.CuMatrix + +--- A wrapper function for `copy_fromd` +nerv.CuMatrix.copy_tod = nerv.Matrix.copy_to + --- CUDA float matrices -- @type nerv.CuMatrixFloat @@ -127,6 +138,14 @@ end -- @type nerv.MMatrix --- A wrapper function for `copy_fromh` -function nerv.MMatrix:copy_toh(b, ...) +nerv.MMatrix.copy_toh = nerv.Matrix.copy_to + +--- A wrapper function for `nerv.CuMatrix` copy +function nerv.MMatrix:copy_fromd(b, ...) + b:copy_toh(self, ...) +end + +--- A wrapper function for `nerv.CuMatrix` copy +function nerv.MMatrix:copy_tod(b, ...) b:copy_fromh(self, ...) end @@ -1,13 +1,21 @@ #! /usr/bin/env luajit require 'nerv' -nerv.printf("*** NERV: A Lua-based toolkit for high-performance deep learning (alpha) ***\n") -nerv.info("automatically initialize a default CuContext...") -nerv.CuMatrix._default_context = nerv.CuContext() -nerv.info("the default CuContext is ok") +local options = {{"help", "h", "boolean", default = false, desc = "print this help message"}, + {"use-cpu", "c", "boolean", default = false, desc = "use CPU by default (instead of gpu by default)"}, + {"select-gpu", nil, "int", default = -1, desc = "select the GPU for computation, fallback to auto mode if not specified"}} +econf = {} -- environment configuration -nerv.info("automatically initialize a default MContext...") -nerv.MMatrix._default_context = nerv.MContext() -nerv.info("the default MContext is ok") +local function print_help() + nerv.printf("Usage: <nerv_prog> [options] script.lua\n") + nerv.print_usage(options) +end + +nerv.printf("*** NERV: A Lua-based toolkit for high-performance deep learning (alpha) ***\n") +arg, opts = nerv.parse_args(arg, options) +if #arg < 1 or opts["help"].val then + print_help() + return +end -- only for backward compatibilty, will be removed in the future local function _add_profile_method(cls) @@ -15,13 +23,25 @@ local function _add_profile_method(cls) cls.print_profile = function () c:print_profile() end cls.clear_profile = function () c:clear_profile() end end -_add_profile_method(nerv.CuMatrix) -_add_profile_method(nerv.MMatrix) - -if #arg < 1 then - return +if not opts["use-cpu"].val then + local dev = opts["select-gpu"].val + nerv.info("automatically initialize a default CuContext...") + nerv.CuMatrix._default_context = nerv.CuContext(dev) + nerv.info("the default CuContext is ok") + _add_profile_method(nerv.CuMatrix) + nerv.CuMatrix.select_gpu = + function (dev) nerv.CuMatrix._default_context:select_gpu(dev) 
end + econf.use_cpu = false +else + econf.use_cpu = true end + +nerv.info("automatically initialize a default MContext...") +nerv.MMatrix._default_context = nerv.MContext() +nerv.info("the default MContext is ok") +_add_profile_method(nerv.MMatrix) + local script = arg[1] local script_arg = {} for i = 2, #arg do @@ -29,5 +49,3 @@ for i = 2, #arg do end arg = script_arg dofile(script) -nerv.CuMatrix.print_profile() -nerv.MMatrix.print_profile() diff --git a/nerv/nerv-scm-1.rockspec b/nerv/nerv-scm-1.rockspec index 0e1e47f..d039e85 100644 --- a/nerv/nerv-scm-1.rockspec +++ b/nerv/nerv-scm-1.rockspec @@ -11,7 +11,8 @@ description = { license = "BSD" } dependencies = { - "lua >= 5.1" + "lua >= 5.1", + "penlight >= 1.3.2" } build = { type = "make", diff --git a/nerv/nn/layer_dag.lua b/nerv/nn/layer_dag.lua index 6896878..f999752 100644 --- a/nerv/nn/layer_dag.lua +++ b/nerv/nn/layer_dag.lua @@ -134,20 +134,16 @@ function DAGLayer:__init(id, global_conf, layer_conf) end end + nerv.Layer.__init(self, id, global_conf, layer_conf) self.layers = layers self.inputs = inputs self.outputs = outputs - self.id = id - self.dim_in = dim_in - self.dim_out = dim_out self.parsed_conn = parsed_conn self.queue = queue - self.gconf = global_conf - if self.gconf.use_cpu then - self.mat_type = self.gconf.mmat_type - else - self.mat_type = self.gconf.cumat_type - end +end + +function DAGLayer:bind_params() + -- do nothing (instead of rebinding params for each layer) end function DAGLayer:init(batch_size, chunk_size) @@ -325,7 +321,7 @@ function DAGLayer:get_params() for id, ref in pairs(self.queue) do table.insert(param_repos, ref.layer:get_params()) end - return nerv.ParamRepo.merge(param_repos) + return nerv.ParamRepo.merge(param_repos, self.loc_type) end DAGLayer.PORT_TYPES = { diff --git a/nerv/nn/layer_repo.lua b/nerv/nn/layer_repo.lua index a169b2b..acef54a 100644 --- a/nerv/nn/layer_repo.lua +++ b/nerv/nn/layer_repo.lua @@ -12,27 +12,29 @@ function LayerRepo:add_layers(layer_spec, param_repo, global_conf) if layer_type == nil then nerv.error('layer type `%s` not found', ltype) end - for id, layer_config in pairs(llist) do - if layers[id] ~= nil then - nerv.error("a layer with id %s already exists", id) - end - nerv.info("create layer: %s", id) - if type(layer_config) ~= "table" then + for id, lconf in pairs(llist) do + if type(lconf) ~= "table" then nerv.error("layer config table is need") end - if type(layer_config.params) == "table" then - for pname, pid in pairs(layer_config.params) do - layer_config[pname] = param_repo:get_param(pid) - end + if lconf.pr == nil then + lconf.pr = param_repo end - if layer_config.pr == nil then - layer_config.pr = param_repo + if layers[id] ~= nil then + nerv.error("a layer with id %s already exists", id) end - layers[id] = layer_type(id, global_conf, layer_config) + nerv.info("create layer: %s", id) + layers[id] = layer_type(id, global_conf, lconf) end end end +function LayerRepo:rebind(param_repo) + for id, layer in pairs(self.layers) do + layer.lconf.pr = param_repo + layer:bind_params() + end +end + function LayerRepo:get_layer(lid) local layer = self.layers[lid] if layer == nil then diff --git a/nerv/nn/param_repo.lua b/nerv/nn/param_repo.lua index c124e08..aba7765 100644 --- a/nerv/nn/param_repo.lua +++ b/nerv/nn/param_repo.lua @@ -1,8 +1,37 @@ local ParamRepo = nerv.class("nerv.ParamRepo") -function ParamRepo:__init(plist) + +ParamRepo.LOC_TYPES = { + ON_DEVICE = {}, + ON_HOST = {} +} + +function ParamRepo:__init(plist, loc_type) self.params = {} + self.loc_type = 
loc_type or ParamRepo.LOC_TYPES.ON_HOST + local function make_checker(tname) + return function (mat) + if not nerv.is_type(mat, tname) then + nerv.error("unexpected param type in repo specification") + end + end + end + self.make_copier = function (mat_type, copy_method) + return function (mat) + local target = mat_type(mat:nrow(), mat:ncol()) + mat[copy_method](mat, target) + return target + end + end + + if self.loc_type == ParamRepo.LOC_TYPES.ON_HOST then + self.checker = make_checker("nerv.MMatrix") + else + self.checker = make_checker("nerv.CuMatrix") + end + if plist ~= nil then for i, p in ipairs(plist) do + p:check(self.checker) self.params[p.id] = p end end @@ -12,6 +41,7 @@ function ParamRepo:add(pid, p) if self.params[pid] ~= nil then nerv.error("duplicate params with the same id: %s", pid) end + p:check(self.checker) self.params[pid] = p end @@ -22,8 +52,8 @@ function ParamRepo:remove(pid, p) table.remove(self.params, pid) end -function ParamRepo.merge(repos) - local self = nerv.ParamRepo() +function ParamRepo.merge(repos, loc_type) + local self = nerv.ParamRepo(nil, loc_type) for i, repo in ipairs(repos) do if not nerv.is_type(repo, "nerv.ParamRepo") then nerv.error("nerv.ParamRepo objects expected, got %s", repo) @@ -78,3 +108,26 @@ function ParamRepo:get_param(pid) end return p end + +function ParamRepo:copy(loc_type, pids) + local copier + local target = nerv.ParamRepo(nil, loc_type) + if loc_type == nil then + loc_type = self.loc_type + end + if loc_type == ParamRepo.LOC_TYPES.ON_HOST then + copier = self.make_copier(gconf.mmat_type, 'copy_toh') + else + copier = self.make_copier(gconf.cumat_type, 'copy_tod') + end + if pids == nil then + for id, p in pairs(self.params) do + target.params[id] = p:copy(copier) + end + else + for i, pid in ipairs(pids) do + target.params[pid] = self:get_param(pid):copy(copier) + end + end + return target +end diff --git a/nerv/test/parse_args.lua b/nerv/test/parse_args.lua new file mode 100644 index 0000000..34ad55e --- /dev/null +++ b/nerv/test/parse_args.lua @@ -0,0 +1,15 @@ +local options = {{"abandon", "a", "boolean", default = false, desc = "abandon your belief"}, + {"bullshit", "b", "boolean", default = false, desc = "start to bullshit"}, + {"cheat", "c", "boolean", default = false, desc = "try to cheat"}, + {"delete", "d", "boolean", default = false, desc = "remove everything"}, + {"hehe", "h", "boolean", default = false, desc = "233333"}, + {"oh", "o", "boolean", default = true, desc = "oh yes!"}, + {"uid", nil, "int", desc = "user uid"}, + {"str", nil, "string", desc = "test string"}} + +args, opts = nerv.parse_args({"arg1", "arg2", "-abcd", "arg3", + "--hehe", "--oh=no", "--uid=43", + "highfive", "--str=hello"}, options) + +nerv.print_usage(options) +print(table.tostring(args), table.tostring(opts)) |
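
The last hunk above adds nerv/test/parse_args.lua, which exercises the option parser that the reworked launcher now relies on. As a rough usage sketch (not part of the patch: the {long, short, type, default, desc} option format, nerv.parse_args, nerv.print_usage and the opts[...].val accessors are taken from the launcher and test code above, while the option names "verbose" and "batch-size" and the literal argument table are purely hypothetical), a script could drive the same helper like this:

    -- minimal sketch, assuming the parse_args behaviour shown in
    -- nerv/test/parse_args.lua; option names here are made up
    local options = {{"verbose", "v", "boolean", default = false, desc = "print extra information"},
                     {"batch-size", nil, "int", default = 256, desc = "rows per minibatch"}}

    -- parse a literal argument table, as the test above does
    local args, opts = nerv.parse_args({"train.lua", "--batch-size=128", "-v", "data.scp"}, options)

    print(#args)                   -- positional arguments left over after parsing
    print(opts["batch-size"].val)  -- value given on the "command line" above
    print(opts["verbose"].val)     -- boolean flag set via its short form
    nerv.print_usage(options)      -- prints the generated help text

The same pattern is what the launcher diff uses for its new --use-cpu and --select-gpu flags before handing the remaining arguments to the user script.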