From 619cd9f2d27fa06dd6de774c8970c2eaa2f889c6 Mon Sep 17 00:00:00 2001
From: Yimmon Zhuang <yimmon.zhuang@gmail.com>
Date: Wed, 14 Oct 2015 15:33:48 +0800
Subject: support kaldi decoder

---
 nerv/examples/mpe_chime3.lua | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)
diff --git a/nerv/examples/mpe_chime3.lua b/nerv/examples/mpe_chime3.lua
index ec095b0..0615ddc 100644
--- a/nerv/examples/mpe_chime3.lua
+++ b/nerv/examples/mpe_chime3.lua
@@ -7,6 +7,8 @@ gconf = {lrate = 0.00001, wcost = 0, momentum = 0.0,
         tr_scp = "ark,s,cs:/slfs6/users/ymz09/kaldi/src/featbin/copy-feats scp:/slfs5/users/ymz09/chime/baseline/ASR/exp/tri4a_dnn_tr05_multi_enhanced_smbr/train.scp ark:- |",
         initialized_param = {"/slfs6/users/ymz09/nerv-project/nerv/nerv-speech/kaldi_seq/test/chime3_init.nerv",
         "/slfs6/users/ymz09/nerv-project/nerv/nerv-speech/kaldi_seq/test/chime3_global_transf.nerv"},
+        decode_param = {"/slfs6/users/ymz09/nerv-project/test_mpe/1.nerv",
+        "/slfs6/users/ymz09/nerv-project/nerv/nerv-speech/kaldi_seq/test/chime3_global_transf.nerv"},
         debug = false}
 
 function make_layer_repo(param_repo)
@@ -125,13 +127,12 @@ function make_layer_repo(param_repo)
                     ["mpe_crit[1]"] = "<output>[1]"
                 }
             }},
-            softmax_output = {{}, {
+            decode_output = {{}, {
                 dim_in = {440}, dim_out = {2011},
                 sub_layers = layer_repo,
                 connections = {
                     ["<input>[1]"] = "main[1]",
-                    ["main[1]"] = "softmax[1]",
-                    ["softmax[1]"] = "<output>[1]"
+                    ["main[1]"] = "<output>[1]"
                 }
             }}
         }
@@ -145,7 +146,7 @@ function get_network(layer_repo)
 end
 
 function get_decode_network(layer_repo)
-    return layer_repo:get_layer("softmax_output")
+    return layer_repo:get_layer("decode_output")
 end
 
 function get_global_transf(layer_repo)
-- 
cgit v1.2.3


From 21a5f8e7e0c92fb8ba249a1784ac315bd3178855 Mon Sep 17 00:00:00 2001
From: Yimmon Zhuang <yimmon.zhuang@gmail.com>
Date: Mon, 23 Nov 2015 16:15:37 +0800
Subject: add need_key for sequence training

---
 nerv/examples/mmi_chime3.lua | 1 +
 nerv/examples/mpe_chime3.lua | 1 +
 2 files changed, 2 insertions(+)

diff --git a/nerv/examples/mmi_chime3.lua b/nerv/examples/mmi_chime3.lua
index 6ac7f28..3daaafa 100644
--- a/nerv/examples/mmi_chime3.lua
+++ b/nerv/examples/mmi_chime3.lua
@@ -160,6 +160,7 @@ function make_readers(feature_rspecifier, layer_repo)
                         feature_rspecifier = feature_rspecifier,
                         frm_ext = gconf.frm_ext,
                         global_transf = layer_repo:get_layer("global_transf"),
+                        need_key = true,
                         mlfs = {}
                     })
                 }
diff --git a/nerv/examples/mpe_chime3.lua b/nerv/examples/mpe_chime3.lua
index 0615ddc..f9a2855 100644
--- a/nerv/examples/mpe_chime3.lua
+++ b/nerv/examples/mpe_chime3.lua
@@ -161,6 +161,7 @@ function make_readers(feature_rspecifier, layer_repo)
                         feature_rspecifier = feature_rspecifier,
                         frm_ext = gconf.frm_ext,
                         global_transf = layer_repo:get_layer("global_transf"),
+                        need_key = true,
                         mlfs = {}
                     })
                 }
-- 
cgit v1.2.3


From bcead9a4063190d5573604e7ca1981368162d565 Mon Sep 17 00:00:00 2001
From: Determinant <ted.sybil@gmail.com>
Date: Mon, 23 Nov 2015 17:01:00 +0800
Subject: use rearrange = true and frm_trim = 5 in settings and reproduce the
 results again

---
 nerv/examples/swb_baseline.lua | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/nerv/examples/swb_baseline.lua b/nerv/examples/swb_baseline.lua
index 8f72200..51052ba 100644
--- a/nerv/examples/swb_baseline.lua
+++ b/nerv/examples/swb_baseline.lua
@@ -2,9 +2,9 @@ require 'htk_io'
 gconf = {lrate = 0.8, wcost = 1e-6, momentum = 0.9,
         cumat_type = nerv.CuMatrixFloat,
         mmat_type = nerv.MMatrixFloat,
-        direct_update = true,
+        rearrange = true, -- just to make the context order consistent with old results, deprecated
         frm_ext = 5,
-        frm_trim = 5,
+        frm_trim = 5, -- trim the first and last 5 frames, TNet just does this, deprecated
         tr_scp = "/slfs1/users/mfy43/swb_ivec/train_bp.scp",
         cv_scp = "/slfs1/users/mfy43/swb_ivec/train_cv.scp",
         htk_conf = "/slfs1/users/mfy43/swb_ivec/plp_0_d_a.conf",
-- 
cgit v1.2.3


From 00c3f11361967a0f78fd770d20a2af3e9e7c1f50 Mon Sep 17 00:00:00 2001
From: txh18 <cloudygooseg@gmail.com>
Date: Thu, 10 Dec 2015 00:15:38 +0800
Subject: bilstm_v2 did not run well

---
 nerv/examples/lmptb/lm_trainer.lua |  2 +-
 nerv/tnn/tnn.lua                   | 13 ++++++++++++-
 2 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/nerv/examples/lmptb/lm_trainer.lua b/nerv/examples/lmptb/lm_trainer.lua
index 3b8b5c3..ecedc9f 100644
--- a/nerv/examples/lmptb/lm_trainer.lua
+++ b/nerv/examples/lmptb/lm_trainer.lua
@@ -196,7 +196,6 @@ function LMTrainer.lm_process_file_birnn(global_conf, fn, tnn, do_train, p_conf)
         if r == false then 
             break 
         end
-    
         for t = 1, chunk_size do
             tnn.err_inputs_m[t][1]:fill(1)
             for i = 1, batch_size do
@@ -269,6 +268,7 @@ function LMTrainer.lm_process_file_birnn(global_conf, fn, tnn, do_train, p_conf)
 
         collectgarbage("collect")                                              
 
+        tnn:flush_all()
         --break --debug
     end
     
diff --git a/nerv/tnn/tnn.lua b/nerv/tnn/tnn.lua
index cf02123..bcfeb40 100644
--- a/nerv/tnn/tnn.lua
+++ b/nerv/tnn/tnn.lua
@@ -64,7 +64,7 @@ function TNN.make_initial_store(st, p, dim, batch_size, chunk_size, extend_t, gl
     if (type(st) ~= "table") then
         nerv.error("st should be a table")
     end
-    for i = 1 - extend_t - 1, chunk_size + extend_t + 1 do --intentionally allocated more time
+    for i = 1 - extend_t - 2, chunk_size + extend_t + 2 do --intentionally allocated more time
         if (st[i] == nil) then
             st[i] = {}
         end
@@ -339,6 +339,11 @@ function TNN:net_propagate() --propagate according to feeds_now
     end
 
     local feeds_now = self.feeds_now
+    for t = 1, self.chunk_size do --some layer maybe do not have inputs from time 1..chunk_size
+        for id, ref in pairs(self.layers) do
+            self:propagate_dfs(ref, t)
+        end
+    end
     for t = 1, self.chunk_size do
         if (bit.band(feeds_now.flagsPack_now[t], nerv.TNN.FC.HAS_INPUT) > 0) then
             for i = 1, #self.dim_in do
@@ -362,6 +367,7 @@ function TNN:net_propagate() --propagate according to feeds_now
             end
         end
     end
+
     if (flag_out == false) then
         nerv.error("some thing wrong, some labeled output is not propagated")
     end
@@ -458,6 +464,11 @@ function TNN:net_backpropagate(do_update) --propagate according to feeds_now
     end
 
     local feeds_now = self.feeds_now
+    for t = 1, self.chunk_size do --some layer maybe do not have outputs from time 1..chunk_size
+        for id, ref in pairs(self.layers) do
+            self:backpropagate_dfs(ref, t)
+        end
+    end
     for t = 1, self.chunk_size do
         if bit.band(feeds_now.flagsPack_now[t], nerv.TNN.FC.HAS_LABEL) > 0 then
             for i = 1, #self.dim_out do
-- 
cgit v1.2.3


From 62169f73b935dd6df8fe0c5628beed58820d186e Mon Sep 17 00:00:00 2001
From: txh18 <cloudygooseg@gmail.com>
Date: Thu, 10 Dec 2015 00:16:26 +0800
Subject: ...

---
 nerv/examples/lmptb/bilstmlm_v2_ptb_main.lua | 522 +++++++++++++++++++++++++++
 1 file changed, 522 insertions(+)
 create mode 100644 nerv/examples/lmptb/bilstmlm_v2_ptb_main.lua

diff --git a/nerv/examples/lmptb/bilstmlm_v2_ptb_main.lua b/nerv/examples/lmptb/bilstmlm_v2_ptb_main.lua
new file mode 100644
index 0000000..4f52f29
--- /dev/null
+++ b/nerv/examples/lmptb/bilstmlm_v2_ptb_main.lua
@@ -0,0 +1,522 @@
+--[[
+The bilstm_v2 slightly changed the structure of tnn so that the current prediction won't have info about the very word to predict, so we should not get an amazingly low PPL
+]]--
+require 'lmptb.lmvocab'
+require 'lmptb.lmfeeder'
+require 'lmptb.lmutil'
+require 'lmptb.layer.init'
+--require 'tnn.init'
+require 'lmptb.lmseqreader'
+require 'lm_trainer'
+
+--[[global function rename]]--
+--local printf = nerv.printf
+local LMTrainer = nerv.LMTrainer
+--[[global function rename ends]]--
+
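+--global_conf: table
+--iter: -1 on the very first run (no parameters are imported), otherwise the suffix of the params file to import
+--Returns: nil (the new ParamRepo is stored in global_conf.paramRepo)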
+function prepare_parameters(global_conf, iter) --iter: -1 on the very first run, otherwise the suffix of the params file to import
+    nerv.printf("%s preparing parameters...\n", global_conf.sche_log_pre) 
+    
+    global_conf.paramRepo = nerv.ParamRepo() --the repo is handed back through global_conf, not through the return value
+    local paramRepo = global_conf.paramRepo
+
+    if iter == -1 then --first time: nothing to import yet
+        nerv.printf("%s first time, prepare some pre-set parameters, and leaving other parameters to auto-generation...\n", global_conf.sche_log_pre) 
+        local f = nerv.ChunkFile(global_conf.param_fn .. '.0', 'w') --opened for writing and closed right away; nothing is written to it here
+        f:close()
+        --[[ disabled hand-written parameter initialisation, kept for reference
+        ltp_ih = nerv.LinearTransParam("ltp_ih", global_conf)  
+        ltp_ih.trans = global_conf.cumat_type(global_conf.vocab:size(), global_conf.hidden_size) --index 0 is for zero, others correspond to vocab index(starting from 1)
+        ltp_ih.trans:generate(global_conf.param_random)
+
+        ltp_hh = nerv.LinearTransParam("ltp_hh", global_conf)
+        ltp_hh.trans = global_conf.cumat_type(global_conf.hidden_size, global_conf.hidden_size)
+        ltp_hh.trans:generate(global_conf.param_random) 
+
+        --ltp_ho = nerv.LinearTransParam("ltp_ho", global_conf)
+        --ltp_ho.trans = global_conf.cumat_type(global_conf.hidden_size, global_conf.vocab:size())
+        --ltp_ho.trans:generate(global_conf.param_random)
+
+        bp_h = nerv.BiasParam("bp_h", global_conf)
+        bp_h.trans = global_conf.cumat_type(1, global_conf.hidden_size)
+        bp_h.trans:generate(global_conf.param_random)
+
+        --bp_o = nerv.BiasParam("bp_o", global_conf)
+        --bp_o.trans = global_conf.cumat_type(1, global_conf.vocab:size())
+        --bp_o.trans:generate(global_conf.param_random)
+
+        local f = nerv.ChunkFile(global_conf.param_fn .. '.0', 'w')
+        f:write_chunk(ltp_ih)
+        f:write_chunk(ltp_hh)
+        --f:write_chunk(ltp_ho)
+        f:write_chunk(bp_h)
+        --f:write_chunk(bp_o)
+        f:close()
+        ]]--
+        return nil --the (still empty) repo stays available in global_conf.paramRepo
+    end
+    
+    nerv.printf("%s loading parameter from file %s...\n", global_conf.sche_log_pre, global_conf.param_fn .. '.' .. tostring(iter)) 
+    paramRepo:import({global_conf.param_fn .. '.' .. tostring(iter)}, nil, global_conf) --file name is param_fn .. '.' .. iter; iter is passed through tostring
+
+    nerv.printf("%s preparing parameters end.\n", global_conf.sche_log_pre)
+
+    return nil
+end
+
+--global_conf: table
+--Returns: nerv.LayerRepo
+function prepare_layers(global_conf) --reads global_conf.paramRepo, hidden_size, vocab and layer_num
+    nerv.printf("%s preparing layers...\n", global_conf.sche_log_pre)
+    
+    local pr = global_conf.paramRepo
+
+    local du = false --only used as the direct_update setting of outputL below
+
+    local layers = { --layer type name -> layer id -> spec; the layer ids are the names used in prepare_tnn's connections
+        ["nerv.LSTMLayerT"] = {
+            ["lstmFL1"] = {{}, {["dim_in"] = {global_conf.hidden_size, global_conf.hidden_size, global_conf.hidden_size}, ["dim_out"] = {global_conf.hidden_size, global_conf.hidden_size}, ["pr"] = pr}}, 
+            ["lstmRL1"] = {{}, {["dim_in"] = {global_conf.hidden_size, global_conf.hidden_size, global_conf.hidden_size}, ["dim_out"] = {global_conf.hidden_size, global_conf.hidden_size}, ["pr"] = pr}}, 
+        },
+        
+        ["nerv.DropoutLayerT"] = {
+            ["dropoutL1"] = {{}, {["dim_in"] = {global_conf.hidden_size}, ["dim_out"] = {global_conf.hidden_size}}}, 
+        },
+
+        ["nerv.SelectLinearLayer"] = {
+            ["selectL1"] = {{}, {["dim_in"] = {1}, ["dim_out"] = {global_conf.hidden_size}, ["vocab"] = global_conf.vocab, ["pr"] = pr}},
+        },
+        
+        ["nerv.CombinerLayer"] = {
+            ["combinerXL1"] = {{}, {["dim_in"] = {global_conf.hidden_size}, ["dim_out"] = {global_conf.hidden_size, global_conf.hidden_size}, ["lambda"] = {1}}},
+            ["combinerHFL1"] = {{}, {["dim_in"] = {global_conf.hidden_size}, ["dim_out"] = {global_conf.hidden_size, global_conf.hidden_size}, ["lambda"] = {1}}},
+            ["combinerHRL1"] = {{}, {["dim_in"] = {global_conf.hidden_size}, ["dim_out"] = {global_conf.hidden_size, global_conf.hidden_size}, ["lambda"] = {1}}},
+        },
+
+        ["nerv.AffineLayer"] = {
+            ["biAffineL1"] = {{}, {["dim_in"] = {global_conf.hidden_size, global_conf.hidden_size}, ["dim_out"] = {global_conf.hidden_size}, ["pr"] = pr, ["lambda"] = {1, 1}}},
+            ["outputL"] = {{}, {["dim_in"] = {global_conf.hidden_size}, ["dim_out"] = {global_conf.vocab:size()}, ["direct_update"] = du, ["pr"] = pr}},
+        },
+
+        ["nerv.TanhLayer"] = {
+            ["biTanhL1"] = {{}, {["dim_in"] = {global_conf.hidden_size}, ["dim_out"] = {global_conf.hidden_size}}},
+        },
+
+        ["nerv.SoftmaxCELayerT"] = {
+            ["softmaxL"] = {{}, {["dim_in"] = {global_conf.vocab:size(), global_conf.vocab:size()}, ["dim_out"] = {1}}},
+        },
+    }
+
+    if global_conf.layer_num > 1 then --the stacking code below is commented out, so deeper nets are rejected
+        nerv.error("this script currently do not support more than one layer")
+    end
+    --[[ disabled: per-layer specs for layer_num > 1
+    for l = 2, global_conf.layer_num do 
+        layers["nerv.DropoutLayerT"]["dropoutL" .. l] = {{}, {["dim_in"] = {global_conf.hidden_size}, ["dim_out"] = {global_conf.hidden_size}}}
+        layers["nerv.LSTMLayerT"]["lstmL" .. l] = {{}, {["dim_in"] = {global_conf.hidden_size, global_conf.hidden_size, global_conf.hidden_size}, ["dim_out"] = {global_conf.hidden_size, global_conf.hidden_size}, ["pr"] = pr}}
+        layers["nerv.CombinerLayer"]["combinerL" .. l] = {{}, {["dim_in"] = {global_conf.hidden_size}, ["dim_out"] = {global_conf.hidden_size, global_conf.hidden_size}, ["lambda"] = {1}}}
+    end
+    ]]--
+
+    local layerRepo = nerv.LayerRepo(layers, pr, global_conf)
+    nerv.printf("%s preparing layers end.\n", global_conf.sche_log_pre)
+    return layerRepo
+end
+
+--global_conf: table
+--layerRepo: nerv.LayerRepo
+--Returns: a nerv.TNN
+function prepare_tnn(global_conf, layerRepo) --wires the layers from layerRepo into a nerv.TNN and initialises it
+    nerv.printf("%s Generate and initing TNN ...\n", global_conf.sche_log_pre)
+
+    --<input>[1] feeds selectL1 and <input>[2] feeds softmaxL[2]; the third field of each entry is presumably a time offset -- TODO confirm against nerv.TNN
+    local connections_t = {
+        {"<input>[1]", "selectL1[1]", 0},
+        
+        --{"selectL1[1]", "recurrentL1[1]", 0},  
+        --{"recurrentL1[1]", "sigmoidL1[1]", 0},
+        --{"sigmoidL1[1]", "combinerL1[1]", 0},
+        --{"combinerL1[1]", "recurrentL1[2]", 1},
+        
+        {"selectL1[1]", "combinerXL1[1]", 0},
+        {"combinerXL1[1]", "lstmFL1[1]", 0},
+        {"lstmFL1[1]", "combinerHFL1[1]", 0},
+        {"combinerHFL1[1]", "lstmFL1[2]", 1},
+        {"lstmFL1[2]", "lstmFL1[3]", 1},
+        {"combinerXL1[2]", "lstmRL1[1]", 0},
+        {"lstmRL1[1]", "combinerHRL1[1]", 0},
+        {"combinerHRL1[1]", "lstmRL1[2]", -1},
+        {"lstmRL1[2]", "lstmRL1[3]", -1},
+        {"combinerHFL1[2]", "biAffineL1[1]", 0},
+        {"combinerHRL1[2]", "biAffineL1[2]", -2}, --the reverse branch enters with -2 while the forward branch uses 0; see the note at the top of this file
+        {"biAffineL1[1]", "biTanhL1[1]", 0},
+        {"biTanhL1[1]", "dropoutL1[1]", 0},
+
+        {"dropoutL"..global_conf.layer_num.."[1]", "outputL[1]", 0}, --resolves to dropoutL1, since prepare_layers rejects layer_num > 1
+        {"outputL[1]", "softmaxL[1]", 0},
+        {"<input>[2]", "softmaxL[2]", 0},
+        {"softmaxL[1]", "<output>[1]", 0}
+    }
+
+    --[[ disabled: connections for layer_num > 1
+    for l = 2, global_conf.layer_num do
+        table.insert(connections_t, {"dropoutL"..(l-1).."[1]", "lstmL"..l.."[1]", 0})
+        table.insert(connections_t, {"lstmL"..l.."[2]", "lstmL"..l.."[3]", 1})
+        table.insert(connections_t, {"lstmL"..l.."[1]", "combinerL"..l.."[1]", 0})
+        table.insert(connections_t, {"combinerL"..l.."[1]", "lstmL"..l.."[2]", 1})
+        table.insert(connections_t, {"combinerL"..l.."[2]", "dropoutL"..l.."[1]", 0})
+    end
+    ]]--
+
+    --[[
+    printf("%s printing DAG connections:\n", global_conf.sche_log_pre)
+    for key, value in pairs(connections_t) do
+        printf("\t%s->%s\n", key, value)
+    end
+    ]]--
+
+    local tnn = nerv.TNN("TNN", global_conf, {["dim_in"] = {1, global_conf.vocab:size()}, 
+            ["dim_out"] = {1}, ["sub_layers"] = layerRepo,
+            ["connections"] = connections_t, ["clip_t"] = global_conf.clip_t, --clip_t is nil for the setups that do not define it
+        })
+
+    tnn:init(global_conf.batch_size, global_conf.chunk_size)
+
+    nerv.printf("%s Initing TNN end.\n", global_conf.sche_log_pre)
+    return tnn
+end
+
+function load_net(global_conf, next_iter) --next_iter is forwarded to prepare_parameters as the params file suffix (-1 = first run)
+    prepare_parameters(global_conf, next_iter) --fills global_conf.paramRepo, which prepare_layers reads
+    local layerRepo = prepare_layers(global_conf)
+    local tnn = prepare_tnn(global_conf, layerRepo)
+    return tnn --a freshly built and initialised nerv.TNN
+end
+
+local train_fn, valid_fn, test_fn --vocab_fn, data_dir, root_dir and qdata_dir below are assigned as globals
+global_conf = {} --placeholder; every branch below replaces it with a full table
+local set = arg[1] --dataset selector: "ptb", "msr_sc" or "twitter"; anything else selects the small test setup
+
+if (set == "ptb") then
+
+root_dir = '/home/slhome/txh18/workspace'
+data_dir = root_dir .. '/ptb/DATA'
+train_fn = data_dir .. '/ptb.train.txt.adds'
+valid_fn = data_dir .. '/ptb.valid.txt.adds'
+test_fn = data_dir .. '/ptb.test.txt.adds'
+vocab_fn = data_dir .. '/vocab'
+
+qdata_dir = root_dir .. '/ptb/questionGen/gen' --only referenced from a comment in the testout section
+
+global_conf = {
+    lrate = 0.015, wcost = 1e-5, momentum = 0, clip_t = 5,
+    cumat_type = nerv.CuMatrixFloat,
+    mmat_type = nerv.MMatrixFloat,
+    nn_act_default = 0, 
+
+    hidden_size = 300,
+    layer_num = 1,
+    chunk_size = 90,
+    batch_size = 20, 
+    max_iter = 35,
+    lr_decay = 1.003,
+    decay_iter = 10,
+    param_random = function() return (math.random() / 5 - 0.1) end,
+    dropout_str = "0",
+
+    train_fn = train_fn,
+    valid_fn = valid_fn,
+    test_fn = test_fn,
+    vocab_fn = vocab_fn,
+    max_sen_len = 90,
+    sche_log_pre = "[SCHEDULER]:",
+    log_w_num = 40000, --give a message when log_w_num words have been processed
+    timer = nerv.Timer(),
+    work_dir_base = '/home/slhome/txh18/workspace/ptb/EXP-nerv/bilstmlm_v2.0'
+}
+
+elseif (set == "msr_sc") then
+
+data_dir = '/home/slhome/txh18/workspace/sentenceCompletion/DATA_PV2'
+train_fn = data_dir .. '/normed_all.sf.len60.adds.train'
+valid_fn = data_dir .. '/normed_all.sf.len60.adds.dev'
+test_fn = data_dir .. '/answer_normed.adds'
+vocab_fn = data_dir .. '/normed_all.choose.vocab30000.addqvocab'
+
+global_conf = {
+    lrate = 1, wcost = 1e-6, momentum = 0, --NOTE(review): no clip_t and no max_sen_len in this setup, unlike ptb/twitter -- confirm
+    cumat_type = nerv.CuMatrixFloat,
+    mmat_type = nerv.MMatrixFloat,
+    nn_act_default = 0, 
+
+    hidden_size = 300,
+    layer_num = 1,
+    chunk_size = 15,
+    batch_size = 10, 
+    max_iter = 30,
+    decay_iter = 10,
+    lr_decay = 1.003,
+    param_random = function() return (math.random() / 5 - 0.1) end,
+    dropout_str = "0",
+
+    train_fn = train_fn,
+    valid_fn = valid_fn,
+    test_fn = test_fn,
+    vocab_fn = vocab_fn,
+    sche_log_pre = "[SCHEDULER]:",
+    log_w_num = 400000, --give a message when log_w_num words have been processed
+    timer = nerv.Timer(),
+    work_dir_base = '/home/slhome/txh18/workspace/sentenceCompletion/EXP-Nerv/rnnlm_test' --NOTE(review): directory is named rnnlm_test in this bilstm script -- confirm
+}
+
+elseif (set == "twitter") then
+
+root_dir = '/home/slhome/txh18/workspace'
+data_dir = root_dir .. '/twitter_new/DATA'
+train_fn = data_dir .. '/twitter.choose.adds'
+valid_fn = data_dir .. '/twitter.valid.adds'
+test_fn = data_dir .. '/comm.test.choose-ppl.adds'
+vocab_fn = data_dir .. '/twitter.choose.train.vocab'
+
+--qdata_dir = root_dir .. '/ptb/questionGen/gen'
+
+global_conf = {
+    lrate = 0.15, wcost = 1e-5, momentum = 0, clip_t = 5,
+    cumat_type = nerv.CuMatrixFloat,
+    mmat_type = nerv.MMatrixFloat,
+    nn_act_default = 0, 
+
+    hidden_size = 300,
+    layer_num = 1,
+    chunk_size = 15,
+    batch_size = 20, 
+    max_iter = 35,
+    lr_decay = 1.003,
+    decay_iter = 10,
+    param_random = function() return (math.random() / 5 - 0.1) end,
+    dropout_str = "0",
+
+    train_fn = train_fn,
+    valid_fn = valid_fn,
+    test_fn = test_fn,
+    vocab_fn = vocab_fn,
+    max_sen_len = 90,
+    sche_log_pre = "[SCHEDULER]:",
+    log_w_num = 40000, --give a message when log_w_num words have been processed
+    timer = nerv.Timer(),
+    work_dir_base = root_dir .. '/twitter_new/EXP-nerv/bilstmlm_v1.0' --NOTE(review): says bilstmlm_v1.0 while the ptb setup uses bilstmlm_v2.0 -- confirm
+}
+
+else --any other (or missing) arg[1]: small test setup
+
+valid_fn = '/home/slhome/txh18/workspace/nerv/nerv/nerv/examples/lmptb/m-tests/some-text'
+train_fn = '/home/slhome/txh18/workspace/nerv/nerv/nerv/examples/lmptb/m-tests/some-text'
+test_fn = '/home/slhome/txh18/workspace/nerv/nerv/nerv/examples/lmptb/m-tests/some-text'
+vocab_fn = '/home/slhome/txh18/workspace/nerv/nerv/nerv/examples/lmptb/m-tests/some-text' --the same text file serves as train/valid/test data and as the vocab source
+
+global_conf = {
+    lrate = 0.01, wcost = 1e-5, momentum = 0, --NOTE(review): no clip_t in this setup either -- confirm
+    cumat_type = nerv.CuMatrixFloat,
+    mmat_type = nerv.MMatrixFloat,
+    nn_act_default = 0, 
+
+    hidden_size = 20,
+    layer_num = 1,
+    chunk_size = 20,
+    batch_size = 10, 
+    max_iter = 2,
+    param_random = function() return (math.random() / 5 - 0.1) end,
+    dropout_str = "0",
+
+    train_fn = train_fn,
+    valid_fn = valid_fn,
+    test_fn = test_fn,
+    max_sen_len = 80,
+    lr_decay = 1.003,
+    decay_iter = 10,
+    vocab_fn = vocab_fn, 
+    sche_log_pre = "[SCHEDULER]:",
+    log_w_num = 10, --give a message when log_w_num words have been processed
+    timer = nerv.Timer(),
+    work_dir_base = '/home/slhome/txh18/workspace/nerv/play/testEXP/tnn_bilstmlm_test'
+}
+
+end
+
+lr_half = false --can not be local, to be set by loadstring
+start_iter = -1 -- -1: start from scratch (param.0 is created and an initial validation is run)
+ppl_last = 100000
+commands_str = "train:test" --parsed into `commands` below; this script checks for train, test and testout
+commands = {}
+test_iter = -1 -- -1: test the last trained iteration, or params.final when nothing was trained in this run
+start_lr = nil --when set through arg[2], overrides global_conf.lrate
+
+--for testout(question)
+q_file = "/home/slhome/txh18/workspace/ptb/questionGen/gen/ptb.test.txt.q10rs1_Msss.adds"
+
+if arg[2] ~= nil then
+    nerv.printf("%s applying arg[2](%s)...\n", global_conf.sche_log_pre, arg[2])
+    loadstring(arg[2])() --runs arg[2] as Lua code, so it can overwrite any of the globals above (and global_conf fields)
+    nerv.LMUtil.wait(0.5)
+else
+    nerv.printf("%s no user setting, all default...\n", global_conf.sche_log_pre)
+end
+
+
+global_conf.work_dir = global_conf.work_dir_base .. 'h' .. global_conf.hidden_size .. 'l' .. global_conf.layer_num --.. 'ch' .. global_conf.chunk_size .. 'ba' .. global_conf.batch_size .. 'slr' ..   global_conf.lrate .. 'wc' .. global_conf.wcost .. 'dr' .. global_conf.dropout_str 
+global_conf.train_fn_shuf = global_conf.work_dir .. '/train_fn_shuf'
+global_conf.train_fn_shuf_bak = global_conf.train_fn_shuf .. '_bak'
+global_conf.param_fn = global_conf.work_dir .. "/params"
+global_conf.dropout_list = nerv.SUtil.parse_schedule(global_conf.dropout_str)
+global_conf.log_fn = global_conf.work_dir .. '/log_lstm_tnn_' .. commands_str ..os.date("_TT%m_%d_%X",os.time())
+global_conf.log_fn, _ = string.gsub(global_conf.log_fn, ':', '-') --gsub's second result (the substitution count) lands in the global `_`
+commands = nerv.SUtil.parse_commands_set(commands_str)
+if start_lr ~= nil then
+    global_conf.lrate = start_lr --starting lr can be set by user(arg[2])
+end
+
+nerv.printf("%s creating work_dir(%s)...\n", global_conf.sche_log_pre, global_conf.work_dir)
+nerv.LMUtil.wait(2)
+os.execute("mkdir -p "..global_conf.work_dir)
+os.execute("cp " .. global_conf.train_fn .. " " .. global_conf.train_fn_shuf) --working copy of the training file; the training loop reshuffles it after every iteration
+
+--redirecting log outputs!
+nerv.SUtil.log_redirect(global_conf.log_fn)
+nerv.LMUtil.wait(2)
+
+----------------printing options---------------------------------
+nerv.printf("%s printing global_conf...\n", global_conf.sche_log_pre)
+for id, value in pairs(global_conf) do
+    nerv.printf("%s:\t%s\n", id, tostring(value))
+end
+nerv.LMUtil.wait(2)
+
+nerv.printf("%s printing training scheduling options...\n", global_conf.sche_log_pre)
+nerv.printf("lr_half:\t%s\n", tostring(lr_half))
+nerv.printf("start_iter:\t%s\n", tostring(start_iter))
+nerv.printf("ppl_last:\t%s\n", tostring(ppl_last))
+nerv.printf("commands_str:\t%s\n", commands_str)
+nerv.printf("test_iter:\t%s\n", tostring(test_iter))
+nerv.printf("%s printing training scheduling end.\n", global_conf.sche_log_pre)
+nerv.LMUtil.wait(2)
+------------------printing options end------------------------------
+
+math.randomseed(1) --fixed seed for math.random (used by param_random)
+
+local vocab = nerv.LMVocab()
+global_conf["vocab"] = vocab
+nerv.printf("%s building vocab...\n", global_conf.sche_log_pre)
+global_conf.vocab:build_file(global_conf.vocab_fn, false)
+ppl_rec = {} --per-iteration record: ppl_rec[iter] = {train, valid, test, lr}
+
+local final_iter = -1 --last iteration that ran; stays -1 if the training loop never executes
+if commands["train"] == 1 then
+    if start_iter == -1 then 
+        prepare_parameters(global_conf, -1) --write pre_generated params to param.0 file
+    end
+    
+    if start_iter == -1 or start_iter == 0 then
+        nerv.printf("===INITIAL VALIDATION===\n") 
+        local tnn = load_net(global_conf, 0)
+        global_conf.paramRepo = tnn:get_params() --get auto-generated params
+        global_conf.paramRepo:export(global_conf.param_fn .. '.0', nil)  --some parameters are auto-generated, saved again to param.0 file
+        global_conf.dropout_rate = 0
+        local result = LMTrainer.lm_process_file_birnn(global_conf, global_conf.valid_fn, tnn, false) --false update!
+        nerv.LMUtil.wait(1)
+        ppl_rec[0] = {} 
+        ppl_rec[0].valid = result:ppl_all("birnn")  
+        ppl_last = ppl_rec[0].valid 
+        ppl_rec[0].train = 0 
+        ppl_rec[0].test = 0
+        ppl_rec[0].lr = 0 
+    
+        start_iter = 1
+    
+        nerv.printf("\n") 
+    end
+    
+    for iter = start_iter, global_conf.max_iter, 1 do
+        final_iter = iter --for final testing
+        global_conf.sche_log_pre = "[SCHEDULER ITER"..iter.." LR"..global_conf.lrate.."]:" 
+        tnn = load_net(global_conf, iter - 1) --reloads the params saved for the previous iteration; note tnn (and result below) are globals here
+        nerv.printf("===ITERATION %d LR %f===\n", iter, global_conf.lrate) 
+        global_conf.dropout_rate = nerv.SUtil.sche_get(global_conf.dropout_list, iter)
+        result = LMTrainer.lm_process_file_birnn(global_conf, global_conf.train_fn_shuf, tnn, true) --true update!
+        global_conf.dropout_rate = 0
+        ppl_rec[iter] = {}
+        ppl_rec[iter].train = result:ppl_all("birnn")
+        --shuffling training file
+        nerv.printf("%s shuffling training file\n", global_conf.sche_log_pre)
+        os.execute('cp ' .. global_conf.train_fn_shuf .. ' ' .. global_conf.train_fn_shuf_bak)
+        os.execute('cat ' .. global_conf.train_fn_shuf_bak .. ' | sort -R --random-source=/dev/zero > ' .. global_conf.train_fn_shuf) --constant random source, so the shuffle is presumably reproducible
+        nerv.printf("===PEEK ON TEST %d===\n", iter) 
+        result = LMTrainer.lm_process_file_birnn(global_conf, global_conf.test_fn, tnn, false) --false update!
+        ppl_rec[iter].test = result:ppl_all("birnn")  
+        nerv.printf("===VALIDATION %d===\n", iter) 
+        result = LMTrainer.lm_process_file_birnn(global_conf, global_conf.valid_fn, tnn, false) --false update!
+        ppl_rec[iter].valid = result:ppl_all("birnn") 
+        ppl_rec[iter].lr = global_conf.lrate 
+        if ((ppl_last / ppl_rec[iter].valid < global_conf.lr_decay or lr_half == true) and iter > global_conf.decay_iter) then 
+            global_conf.lrate = (global_conf.lrate * 0.6) --only after decay_iter iterations: shrink lrate when validation PPL improved by less than a factor of lr_decay, or lr_half is already set
+        end 
+        if ppl_rec[iter].valid < ppl_last then 
+            nerv.printf("%s PPL improves, saving net to file %s.%d...\n", global_conf.sche_log_pre, global_conf.param_fn, iter) 
+            global_conf.paramRepo:export(global_conf.param_fn .. '.' .. tostring(iter), nil) 
+        else 
+            nerv.printf("%s PPL did not improve, rejected, copying param file of last iter...\n", global_conf.sche_log_pre) 
+            os.execute('cp ' .. global_conf.param_fn..'.'..tostring(iter - 1) .. ' ' .. global_conf.param_fn..'.'..tostring(iter)) --keeps params.<iter> present so the next iteration can load it
+        end 
+        if ppl_last / ppl_rec[iter].valid < global_conf.lr_decay or lr_half == true then
+            lr_half = true --once set, it is never reset in this script
+        end
+        if ppl_rec[iter].valid < ppl_last then
+            ppl_last = ppl_rec[iter].valid
+        end
+        nerv.printf("\n") 
+        nerv.LMUtil.wait(2) 
+    end
+    nerv.info("saving final nn to param.final")
+    os.execute('cp ' .. global_conf.param_fn .. '.' .. tostring(final_iter) .. ' ' .. global_conf.param_fn .. '.final') --NOTE(review): if the loop never ran, final_iter is still -1 here -- confirm that case cannot occur
+    
+    nerv.printf("===VALIDATION PPL record===\n") 
+    for i, _ in pairs(ppl_rec) do --pairs() gives no ordering guarantee, so records may print out of order
+        nerv.printf("<ITER%d LR%.5f train:%.3f valid:%.3f test:%.3f> \n", i, ppl_rec[i].lr, ppl_rec[i].train, ppl_rec[i].valid, ppl_rec[i].test) 
+    end 
+    nerv.printf("\n")
+end --if commands["train"]
+
+if commands["test"] == 1 then
+    nerv.printf("===FINAL TEST===\n") 
+    global_conf.sche_log_pre = "[SCHEDULER FINAL_TEST]:" 
+    if final_iter ~= -1 and test_iter == -1 then --no explicit test_iter: use the last iteration trained in this run
+        test_iter = final_iter
+    end
+    if test_iter == -1 then --nothing trained in this run and no explicit test_iter
+        test_iter = "final"
+    end
+    tnn = load_net(global_conf, test_iter) --test_iter may be the string "final", which selects the params.final file
+    global_conf.dropout_rate = 0
+    LMTrainer.lm_process_file_birnn(global_conf, global_conf.test_fn, tnn, false) --false update!
+end --if commands["test"]
+
+if commands["testout"] == 1 then
+    nerv.printf("===TEST OUT===\n") 
+    nerv.printf("q_file:\t%s\n", q_file)    
+    local q_fn = q_file --qdata_dir .. '/' .. q_file
+    global_conf.sche_log_pre = "[SCHEDULER FINAL_TEST]:" --same log prefix as the final test section
+    if final_iter ~= -1 and test_iter == -1 then --no explicit test_iter: use the last iteration trained in this run
+        test_iter = final_iter
+    end
+    if test_iter == -1 then --nothing trained in this run and no explicit test_iter
+        test_iter = "final"
+    end
+    tnn = load_net(global_conf, test_iter) --test_iter may be the string "final", which selects the params.final file
+    global_conf.dropout_rate = 0
+    LMTrainer.lm_process_file_birnn(global_conf, q_fn, tnn, false,
+            {["one_sen_report"] = true}) --false update!
+end --if commands["testout"]
+
+
-- 
cgit v1.2.3


From 91075c34160fa24e484148b26c1178e05c2212a4 Mon Sep 17 00:00:00 2001
From: txh18 <cloudygooseg@gmail.com>
Date: Thu, 10 Dec 2015 13:28:13 +0800
Subject: bug fix for recent changes in tnn

---
 nerv/examples/lmptb/lmptb/layer/select_linear.lua | 2 +-
 nerv/tnn/tnn.lua                                  | 5 ++++-
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/nerv/examples/lmptb/lmptb/layer/select_linear.lua b/nerv/examples/lmptb/lmptb/layer/select_linear.lua
index 580b9c5..431ef3a 100644
--- a/nerv/examples/lmptb/lmptb/layer/select_linear.lua
+++ b/nerv/examples/lmptb/lmptb/layer/select_linear.lua
@@ -30,7 +30,7 @@ function SL:init(batch_size)
 end
 
 function SL:update(bp_err, input, output)
-    --use this to produce reproducable result
+    --use this to produce reproducable result, don't forget to set the dropout to zero!
     --for i = 1, input[1]:nrow(), 1 do
     --    local word_vec = self.ltp.trans[input[1][i - 1][0]]
     --    word_vec:add(word_vec, bp_err[1][i - 1], 1, - self.gconf.lrate / self.gconf.batch_size)
diff --git a/nerv/tnn/tnn.lua b/nerv/tnn/tnn.lua
index bcfeb40..7ae3172 100644
--- a/nerv/tnn/tnn.lua
+++ b/nerv/tnn/tnn.lua
@@ -466,7 +466,7 @@ function TNN:net_backpropagate(do_update) --propagate according to feeds_now
     local feeds_now = self.feeds_now
     for t = 1, self.chunk_size do --some layer maybe do not have outputs from time 1..chunk_size
         for id, ref in pairs(self.layers) do
-            self:backpropagate_dfs(ref, t)
+            self:backpropagate_dfs(ref, t, do_update)
         end
     end
     for t = 1, self.chunk_size do
@@ -500,6 +500,9 @@ end
 --ref: the TNN_ref of a layer
 --t: the current time to propagate
 function TNN:backpropagate_dfs(ref, t, do_update)
+    if do_update == nil then
+        nerv.error("got a nil do_update")
+    end
     if self:out_of_feedrange(t) then
         return
     end
-- 
cgit v1.2.3


From 5cf7e88df3aa4cf60819e955f0f537d2cfeccaac Mon Sep 17 00:00:00 2001
From: txh18 <cloudygooseg@gmail.com>
Date: Thu, 10 Dec 2015 17:20:31 +0800
Subject: removed flush_all for every mb in process_birnn

---
 nerv/examples/lmptb/lm_trainer.lua | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/nerv/examples/lmptb/lm_trainer.lua b/nerv/examples/lmptb/lm_trainer.lua
index ecedc9f..eab6e2d 100644
--- a/nerv/examples/lmptb/lm_trainer.lua
+++ b/nerv/examples/lmptb/lm_trainer.lua
@@ -246,6 +246,8 @@ function LMTrainer.lm_process_file_birnn(global_conf, fn, tnn, do_train, p_conf)
         --tnn:move_right_to_nextmb({0}) --do not need history for bi directional model
         global_conf.timer:toc('tnn_afterprocess')
 
+        --tnn:flush_all() --you need this for bilstmlm_ptb_v2, because it has connection across 2 time steps
+
         global_conf.timer:toc('most_out_loop_lmprocessfile')
 
         --print log
@@ -268,7 +270,6 @@ function LMTrainer.lm_process_file_birnn(global_conf, fn, tnn, do_train, p_conf)
 
         collectgarbage("collect")                                              
 
-        tnn:flush_all()
         --break --debug
     end
     
-- 
cgit v1.2.3


From 1b0f2ab768e34f126ce946e1689cd47c86f41645 Mon Sep 17 00:00:00 2001
From: Determinant <ted.sybil@gmail.com>
Date: Thu, 10 Dec 2015 21:46:11 +0800
Subject: fix the bug in freeing submatrix; add `offset` to Matrix structure

---
 nerv/lib/matrix/cukernel.cu         |  1 +
 nerv/lib/matrix/cumatrix.c          |  1 +
 nerv/lib/matrix/generic/cumatrix.c  |  2 +-
 nerv/lib/matrix/generic/elem_type.h | 13 ++++++++++---
 nerv/lib/matrix/generic/matrix.c    |  9 +++++----
 nerv/lib/matrix/matrix.h            |  1 +
 nerv/lib/matrix/mmatrix.c           |  5 ++++-
 nerv/matrix/cumatrix.c              |  1 +
 nerv/matrix/generic/cumatrix.c      |  2 +-
 nerv/matrix/generic/elem_type.h     | 22 ----------------------
 nerv/matrix/generic/mmatrix.c       |  2 +-
 nerv/matrix/mmatrix.c               |  2 ++
 12 files changed, 28 insertions(+), 33 deletions(-)
 delete mode 100644 nerv/matrix/generic/elem_type.h

diff --git a/nerv/lib/matrix/cukernel.cu b/nerv/lib/matrix/cukernel.cu
index 210e6bf..c20e538 100644
--- a/nerv/lib/matrix/cukernel.cu
+++ b/nerv/lib/matrix/cukernel.cu
@@ -44,6 +44,7 @@ __device__ float atomicAdd_nvidia(float* address, float val) {
 #undef MATRIX_USE_FLOAT
 #undef MATRIX_ELEM
 #undef MATRIX_ELEM_PTR
+#undef MATRIX_ELEM_PTR_BASE
 #undef MATRIX_ELEM_FMT
 #undef MATRIX_ELEM_WRITE_FMT
 
diff --git a/nerv/lib/matrix/cumatrix.c b/nerv/lib/matrix/cumatrix.c
index ff1168d..a8ed075 100644
--- a/nerv/lib/matrix/cumatrix.c
+++ b/nerv/lib/matrix/cumatrix.c
@@ -57,6 +57,7 @@ void nerv_cumatrix_init() {
 #undef MATRIX_USE_FLOAT
 #undef MATRIX_ELEM
 #undef MATRIX_ELEM_PTR
+#undef MATRIX_ELEM_PTR_BASE
 #undef MATRIX_ELEM_FMT
 #undef MATRIX_ELEM_WRITE_FMT
 #undef MATRIX_CUMATRIX_HOST_TNAME
diff --git a/nerv/lib/matrix/generic/cumatrix.c b/nerv/lib/matrix/generic/cumatrix.c
index 00af895..7643c01 100644
--- a/nerv/lib/matrix/generic/cumatrix.c
+++ b/nerv/lib/matrix/generic/cumatrix.c
@@ -315,7 +315,7 @@ void nerv_matrix_(copy_rows_fromh_by_idx)(Matrix *a, const Matrix *b,
     long nrow = a->nrow;
     if (!(0 <= b_begin && b_begin + nrow <= idx->ncol))
         NERV_EXIT_STATUS(status, MAT_INVALID_COPY_INTERVAL, 0);
-    float *idx_ptr = idx->data.f;
+    float *idx_ptr = MATRIX_ELEM_PTR_F(idx);
     int i;
     if (idx->nrow != 1)
         NERV_EXIT_STATUS(status, MAT_IDX_VECTOR_EXP, 0);
diff --git a/nerv/lib/matrix/generic/elem_type.h b/nerv/lib/matrix/generic/elem_type.h
index bffe940..07f6355 100644
--- a/nerv/lib/matrix/generic/elem_type.h
+++ b/nerv/lib/matrix/generic/elem_type.h
@@ -1,22 +1,29 @@
+#define MATRIX_ELEM_PTR_F(self) ((float *)((char *)((self)->data.f) + (self)->offset))
+#define MATRIX_ELEM_PTR_D(self) ((double *)((char *)((self)->data.d) + (self)->offset))
+#define MATRIX_ELEM_PTR_I(self) ((long *)((char *)((self)->data.i) + (self)->offset))
+
 #ifdef MATRIX_USE_FLOAT
 
 #define MATRIX_ELEM float
 #define MATRIX_ELEM_FMT "%f"
 #define MATRIX_ELEM_WRITE_FMT "%.8f"
-#define MATRIX_ELEM_PTR(self) ((self)->data.f)
+#define MATRIX_ELEM_PTR(self) MATRIX_ELEM_PTR_F(self)
+#define MATRIX_ELEM_PTR_BASE(self) ((self)->data.f)
 
 #elif defined(MATRIX_USE_DOUBLE)
 
 #define MATRIX_ELEM double
 #define MATRIX_ELEM_FMT "%lf"
 #define MATRIX_ELEM_WRITE_FMT "%.8lf"
-#define MATRIX_ELEM_PTR(self) ((self)->data.d)
+#define MATRIX_ELEM_PTR(self) MATRIX_ELEM_PTR_D(self)
+#define MATRIX_ELEM_PTR_BASE(self) ((self)->data.d)
 
 #elif defined(MATRIX_USE_INT)
 
 #define MATRIX_ELEM long
 #define MATRIX_ELEM_FMT "%ld"
 #define MATRIX_ELEM_WRITE_FMT "%ld"
-#define MATRIX_ELEM_PTR(self) ((self)->data.i)
+#define MATRIX_ELEM_PTR(self) MATRIX_ELEM_PTR_I(self)
+#define MATRIX_ELEM_PTR_BASE(self) ((self)->data.i)
 
 #endif
diff --git a/nerv/lib/matrix/generic/matrix.c b/nerv/lib/matrix/generic/matrix.c
index 4246751..998d107 100644
--- a/nerv/lib/matrix/generic/matrix.c
+++ b/nerv/lib/matrix/generic/matrix.c
@@ -4,12 +4,11 @@
 /* FIXME: malloc failure detection */
 
 void nerv_matrix_(data_free)(Matrix *self, Status *status) {
-    if(*self->data_ref == 0) return; /* FIXME: repeat free memory */
     assert(*self->data_ref > 0);
     if (--(*self->data_ref) == 0)
     {
         /* free matrix data */
-        MATRIX_DATA_FREE(MATRIX_ELEM_PTR(self), status);
+        MATRIX_DATA_FREE(MATRIX_ELEM_PTR_BASE(self), status);
         free(self->data_ref);
         free(self);
     }
@@ -29,7 +28,7 @@ Matrix *nerv_matrix_(create)(long nrow, long ncol, Status *status) {
     self->ncol = ncol;
     self->nmax = self->nrow * self->ncol;
     self->dim = 2;
-    MATRIX_DATA_ALLOC(&MATRIX_ELEM_PTR(self), &self->stride,
+    MATRIX_DATA_ALLOC(&MATRIX_ELEM_PTR_BASE(self), &self->stride,
                      sizeof(MATRIX_ELEM) * self->ncol, self->nrow,
                      status);
     if (status->err_code != NERV_NORMAL)
@@ -39,6 +38,7 @@ Matrix *nerv_matrix_(create)(long nrow, long ncol, Status *status) {
     }
     self->data_ref = (long *)malloc(sizeof(long));
     *self->data_ref = 0;
+    self->offset = 0;
     nerv_matrix_(data_retain)(self);
     NERV_SET_STATUS(status, NERV_NORMAL, 0);
     return self;
@@ -55,8 +55,9 @@ Matrix *nerv_matrix_(getrow)(Matrix *self, int row) {
     prow->dim = 1;
     prow->stride = self->stride;
     prow->nmax = prow->ncol;
-    MATRIX_ELEM_PTR(prow) = MATRIX_ROW_PTR(self, row);
+    prow->data = self->data;
     prow->data_ref = self->data_ref;
+    prow->offset = row * self->stride;
     nerv_matrix_(data_retain)(prow);
     return prow;
 }
diff --git a/nerv/lib/matrix/matrix.h b/nerv/lib/matrix/matrix.h
index 67a6e30..073bd13 100644
--- a/nerv/lib/matrix/matrix.h
+++ b/nerv/lib/matrix/matrix.h
@@ -12,6 +12,7 @@ typedef struct Matrix {
         double *d;
         long *i;
     } data;                   /* pointer to actual storage */
+    unsigned long offset;              /* the actual beginning of the matrix */
     long *data_ref;
 } Matrix;
 
diff --git a/nerv/lib/matrix/mmatrix.c b/nerv/lib/matrix/mmatrix.c
index b8157eb..b5670f2 100644
--- a/nerv/lib/matrix/mmatrix.c
+++ b/nerv/lib/matrix/mmatrix.c
@@ -6,6 +6,7 @@
 #define host_matrix_(NAME) host_matrix_float_##NAME
 #define nerv_matrix_(NAME) nerv_matrix_host_float_##NAME
 #include "generic/matrix.h"
+#include "generic/elem_type.h"
 #include "generic/mmatrix.c"
 
 Matrix *nerv_matrix_(perm_gen)(int ncol, Status *status) {
@@ -13,7 +14,7 @@ Matrix *nerv_matrix_(perm_gen)(int ncol, Status *status) {
     Matrix *self = nerv_matrix_(create)(1, ncol, status);
     if (status->err_code != NERV_NORMAL)
         return NULL;
-    float *prow = self->data.f;
+    float *prow = MATRIX_ELEM_PTR_F(self);
     for (i = 0; i < ncol; i++)
         prow[i] = i;
     for (i = ncol - 1; i >= 0; i--)
@@ -31,6 +32,7 @@ Matrix *nerv_matrix_(perm_gen)(int ncol, Status *status) {
 #undef MATRIX_USE_FLOAT
 #undef MATRIX_ELEM
 #undef MATRIX_ELEM_PTR
+#undef MATRIX_ELEM_PTR_BASE
 #undef MATRIX_ELEM_FMT
 #undef MATRIX_ELEM_WRITE_FMT
 
@@ -44,6 +46,7 @@ Matrix *nerv_matrix_(perm_gen)(int ncol, Status *status) {
 #undef MATRIX_USE_DOUBLE
 #undef MATRIX_ELEM
 #undef MATRIX_ELEM_PTR
+#undef MATRIX_ELEM_PTR_BASE
 #undef MATRIX_ELEM_FMT
 #undef MATRIX_ELEM_WRITE_FMT
 
diff --git a/nerv/matrix/cumatrix.c b/nerv/matrix/cumatrix.c
index fef03fc..bf92f92 100644
--- a/nerv/matrix/cumatrix.c
+++ b/nerv/matrix/cumatrix.c
@@ -49,6 +49,7 @@ const char *nerv_matrix_(tname) = "nerv.CuMatrixFloat";
 #undef MATRIX_USE_FLOAT
 #undef MATRIX_ELEM
 #undef MATRIX_ELEM_PTR
+#undef MATRIX_ELEM_PTR_BASE
 #undef MATRIX_ELEM_FMT
 #undef MATRIX_ELEM_WRITE_FMT
 #undef MATRIX_CUMATRIX_HOST_TNAME
diff --git a/nerv/matrix/generic/cumatrix.c b/nerv/matrix/generic/cumatrix.c
index e1519b0..be3d627 100644
--- a/nerv/matrix/generic/cumatrix.c
+++ b/nerv/matrix/generic/cumatrix.c
@@ -1,5 +1,5 @@
 #ifdef NERV_GENERIC_CUMATRIX
-#include "elem_type.h"
+#include "../../lib/matrix/generic/elem_type.h"
 #define MATRIX_DATA_WRITE(L, data, idx, val) cuda_matrix_(write)(L, data, idx, val)
 #define MATRIX_DATA_READ(L, data, idx) cuda_matrix_(read)(L, data, idx)
 #define MATRIX_INIT(L) cuda_matrix_(init)(L)
diff --git a/nerv/matrix/generic/elem_type.h b/nerv/matrix/generic/elem_type.h
deleted file mode 100644
index bffe940..0000000
--- a/nerv/matrix/generic/elem_type.h
+++ /dev/null
@@ -1,22 +0,0 @@
-#ifdef MATRIX_USE_FLOAT
-
-#define MATRIX_ELEM float
-#define MATRIX_ELEM_FMT "%f"
-#define MATRIX_ELEM_WRITE_FMT "%.8f"
-#define MATRIX_ELEM_PTR(self) ((self)->data.f)
-
-#elif defined(MATRIX_USE_DOUBLE)
-
-#define MATRIX_ELEM double
-#define MATRIX_ELEM_FMT "%lf"
-#define MATRIX_ELEM_WRITE_FMT "%.8lf"
-#define MATRIX_ELEM_PTR(self) ((self)->data.d)
-
-#elif defined(MATRIX_USE_INT)
-
-#define MATRIX_ELEM long
-#define MATRIX_ELEM_FMT "%ld"
-#define MATRIX_ELEM_WRITE_FMT "%ld"
-#define MATRIX_ELEM_PTR(self) ((self)->data.i)
-
-#endif
diff --git a/nerv/matrix/generic/mmatrix.c b/nerv/matrix/generic/mmatrix.c
index 01dd9e5..a4e8489 100644
--- a/nerv/matrix/generic/mmatrix.c
+++ b/nerv/matrix/generic/mmatrix.c
@@ -1,6 +1,6 @@
 #ifdef NERV_GENERIC_MMATRIX
 #include "../../lib/matrix/generic/matrix.h"
-#include "elem_type.h"
+#include "../../lib/matrix/generic/elem_type.h"
 #define MATRIX_DATA_WRITE(L, data, idx, val) (data[idx] = val)
 #define MATRIX_DATA_READ(L, data, idx) (data[idx])
 #define MATRIX_INIT(L) host_matrix_(init)(L)
diff --git a/nerv/matrix/mmatrix.c b/nerv/matrix/mmatrix.c
index 961059c..20c31d6 100644
--- a/nerv/matrix/mmatrix.c
+++ b/nerv/matrix/mmatrix.c
@@ -45,6 +45,7 @@ static const luaL_Reg nerv_matrix_(extra_methods_int)[] = {
 #undef MATRIX_USE_FLOAT
 #undef MATRIX_ELEM
 #undef MATRIX_ELEM_PTR
+#undef MATRIX_ELEM_PTR_BASE
 #undef MATRIX_ELEM_FMT
 #undef MATRIX_ELEM_WRITE_FMT
 #undef MMATRIX_INIT
@@ -60,6 +61,7 @@ const char *nerv_matrix_(tname) = "nerv.MMatrixDouble";
 #undef MATRIX_USE_DOUBLE
 #undef MATRIX_ELEM
 #undef MATRIX_ELEM_PTR
+#undef MATRIX_ELEM_PTR_BASE
 #undef MATRIX_ELEM_FMT
 #undef MATRIX_ELEM_WRITE_FMT
 
-- 
cgit v1.2.3


From 7fd5c2c8672c8ac75348e2d51f56a72b5fd21b7b Mon Sep 17 00:00:00 2001
From: txh18 <cloudygooseg@gmail.com>
Date: Thu, 10 Dec 2015 23:13:29 +0800
Subject: ...

---
 nerv/examples/lmptb/bilstmlm_ptb_main.lua | 6 ++++--
 nerv/examples/lmptb/lstmlm_ptb_main.lua   | 6 ++++--
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/nerv/examples/lmptb/bilstmlm_ptb_main.lua b/nerv/examples/lmptb/bilstmlm_ptb_main.lua
index 0472588..e88eea2 100644
--- a/nerv/examples/lmptb/bilstmlm_ptb_main.lua
+++ b/nerv/examples/lmptb/bilstmlm_ptb_main.lua
@@ -345,7 +345,7 @@ end
 
 lr_half = false --can not be local, to be set by loadstring
 start_iter = -1
-start_lr = global_conf.lrate
+start_lr = nil
 ppl_last = 100000
 commands_str = "train:test"
 commands = {}
@@ -371,7 +371,9 @@ global_conf.log_fn = global_conf.work_dir .. '/log_lstm_tnn_' .. commands_str ..
 global_conf.log_fn, _ = string.gsub(global_conf.log_fn, ':', '-')
 commands = nerv.SUtil.parse_commands_set(commands_str)
 
-global_conf.lrate = start_lr --starting lr can be set by user(arg[2])
+if start_lr ~= nil then
+    global_conf.lrate = start_lr --starting lr can be set by user(arg[2])
+end
 
 nerv.printf("%s creating work_dir(%s)...\n", global_conf.sche_log_pre, global_conf.work_dir)
 nerv.LMUtil.wait(2)
diff --git a/nerv/examples/lmptb/lstmlm_ptb_main.lua b/nerv/examples/lmptb/lstmlm_ptb_main.lua
index 6e3fab9..9bdd5ff 100644
--- a/nerv/examples/lmptb/lstmlm_ptb_main.lua
+++ b/nerv/examples/lmptb/lstmlm_ptb_main.lua
@@ -333,7 +333,7 @@ end
 
 lr_half = false --can not be local, to be set by loadstring
 start_iter = -1
-start_lr = global_conf.lrate
+start_lr = nil
 ppl_last = 100000
 commands_str = "train:test"
 commands = {}
@@ -358,7 +358,9 @@ global_conf.log_fn = global_conf.work_dir .. '/log_lstm_tnn_' .. commands_str ..
 global_conf.log_fn, _ = string.gsub(global_conf.log_fn, ':', '-')
 commands = nerv.SUtil.parse_commands_set(commands_str)
 
-global_conf.lrate = start_lr
+if start_lr ~= nil then
+    global_conf.lrate = start_lr
+end
  
 nerv.printf("%s creating work_dir(%s)...\n", global_conf.sche_log_pre, global_conf.work_dir)
 nerv.LMUtil.wait(2)
-- 
cgit v1.2.3


From 28bb2edd5ee81688f245cc89f872150db1e01e44 Mon Sep 17 00:00:00 2001
From: txh18 <cloudygooseg@gmail.com>
Date: Fri, 11 Dec 2015 21:43:55 +0800
Subject: bug fix: added check in the select_linear kernel

---
 nerv/lib/matrix/generic/cukernel.cu | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/nerv/lib/matrix/generic/cukernel.cu b/nerv/lib/matrix/generic/cukernel.cu
index aa830b5..552f7a4 100644
--- a/nerv/lib/matrix/generic/cukernel.cu
+++ b/nerv/lib/matrix/generic/cukernel.cu
@@ -263,11 +263,14 @@ __global__ void cudak_(clip)(MATRIX_ELEM *a,
 
 #ifdef __NERV_FUTURE_CUDA_7
 __global__ void cudak_(update_select_rows)(MATRIX_ELEM *c, const MATRIX_ELEM *a, const MATRIX_ELEM *idx,
-                            int nrow_a, int ncol_a, int stride_c, int stride_a, double alpha, double beta) {
+                            int nrow_a, int ncol_a, int nrow_c, int stride_c, int stride_a, double alpha, double beta) {
     int j = blockIdx.x * blockDim.x + threadIdx.x;
     int i = blockIdx.y * blockDim.y + threadIdx.y;
     if (i >= nrow_a || j >= ncol_a) return;
     int i_c = lrintf(idx[i]);
+    if (i_c < 0 || i_c >= nrow_c) {
+        printf("ERROR inside kernel update_select_rows, i_c(%d) out of range!", i_c);
+    }
     //critical: i_c could conflict among threads(same index in the idx array), so atomicAdd is used
     //c[j + i_c * stride_c] = c[j + i_c * stride_c] * (1 - beta * alpha) + a[j + i * stride_a] * alpha; 
     atomicAdd_nvidia(c + j + i_c * stride_c, c[j + i_c * stride_c] * (- beta * alpha) + a[j + i * stride_a] * alpha);
@@ -335,11 +338,15 @@ __global__ void cudak_(gen_col_idx)(MATRIX_ELEM *b,
 
 __global__ void cudak_(copy_rows_by_idx)(const MATRIX_ELEM *a, MATRIX_ELEM *b,
                                     const MATRIX_ELEM *idx,
-                                    int nrow, int ncol, int stride) {
+                                    int nrow, int ncol, int a_nrow, int stride) {
     int j = blockIdx.x * blockDim.x + threadIdx.x;
     int i = blockIdx.y * blockDim.y + threadIdx.y;
     if (i >= nrow || j >= ncol) return;
-    b[j + i * stride] = a[j + lrintf(idx[i]) * stride];
+    int k = lrintf(idx[i]);
+    if (k < 0 || k >= a_nrow) {
+        printf("error in kernel copy_rows_by_idx k(%d) out of range\n", k);
+    }
+    b[j + i * stride] = a[j + k * stride];
 }
 
 extern "C" {
@@ -639,7 +646,7 @@ extern "C" {
                 CEIL_DIV(a->nrow, threadsPerBlock.y));
         cudak_(update_select_rows)<<<numBlocks, threadsPerBlock>>> \
             (MATRIX_ELEM_PTR(c), MATRIX_ELEM_PTR(a), MATRIX_ELEM_PTR(idx), 
-             a->nrow, a->ncol, c->stride / sizeof(MATRIX_ELEM), 
+             a->nrow, a->ncol, c->nrow, c->stride / sizeof(MATRIX_ELEM), 
              a->stride / sizeof(MATRIX_ELEM), alpha, beta);
         cudaStreamSynchronize(0);
     }
@@ -710,7 +717,7 @@ extern "C" {
         cudak_(copy_rows_by_idx)<<<numBlocks, threadsPerBlock>>> \
             (MATRIX_ELEM_PTR(a), MATRIX_ELEM_PTR(b),
             MATRIX_ELEM_PTR(idx) + idx_begin,
-            b->nrow, b->ncol, b->stride / sizeof(MATRIX_ELEM));
+            b->nrow, b->ncol, a->nrow, b->stride / sizeof(MATRIX_ELEM));
         cudaStreamSynchronize(0);
     }
 }
-- 
cgit v1.2.3


From 32c39bcaf72a7dc08968909d8b6a8b108ac923b4 Mon Sep 17 00:00:00 2001
From: txh18 <cloudygooseg@gmail.com>
Date: Sun, 13 Dec 2015 20:58:02 +0800
Subject: move lmvocab:read_line to lmutil

---
 nerv/examples/lmptb/lmptb/lmfeeder.lua    |  3 ++-
 nerv/examples/lmptb/lmptb/lmseqreader.lua |  3 ++-
 nerv/examples/lmptb/lmptb/lmutil.lua      | 27 +++++++++++++++++++++++++++
 nerv/examples/lmptb/lmptb/lmvocab.lua     | 12 ------------
 4 files changed, 31 insertions(+), 14 deletions(-)

diff --git a/nerv/examples/lmptb/lmptb/lmfeeder.lua b/nerv/examples/lmptb/lmptb/lmfeeder.lua
index 34631bf..e140f38 100644
--- a/nerv/examples/lmptb/lmptb/lmfeeder.lua
+++ b/nerv/examples/lmptb/lmptb/lmfeeder.lua
@@ -1,4 +1,5 @@
 require 'lmptb.lmvocab'
+require 'lmptb.lmutil'
 
 local Feeder = nerv.class("nerv.LMFeeder")
 
@@ -39,7 +40,7 @@ function Feeder:refresh_stream(id)
     local st = self.streams[id]
     if (st.store[st.head] ~= nil) then return end
     if (self.fh == nil) then return end
-    local list = self.vocab:read_line(self.fh)
+    local list = nerv.LMUtil.read_line(self.fh)
     if (list == nil) then --file has end
         printf("%s file expires, closing.\n", self.log_pre)
         self.fh:close() 
diff --git a/nerv/examples/lmptb/lmptb/lmseqreader.lua b/nerv/examples/lmptb/lmptb/lmseqreader.lua
index ed791d2..b603911 100644
--- a/nerv/examples/lmptb/lmptb/lmseqreader.lua
+++ b/nerv/examples/lmptb/lmptb/lmseqreader.lua
@@ -1,4 +1,5 @@
 require 'lmptb.lmvocab'
+require 'lmptb.lmutil'
 --require 'tnn.init'
 
 local LMReader = nerv.class("nerv.LMSeqReader")
@@ -58,7 +59,7 @@ function LMReader:refresh_stream(id)
     local st = self.streams[id]
     if (st.store[st.head] ~= nil) then return end
     if (self.fh == nil) then return end
-    local list = self.vocab:read_line(self.fh)
+    local list = nerv.LMUtil.read_line(self.fh)
     if (list == nil) then --file has end
         printf("%s file expires, closing.\n", self.log_pre)
         self.fh:close() 
diff --git a/nerv/examples/lmptb/lmptb/lmutil.lua b/nerv/examples/lmptb/lmptb/lmutil.lua
index 71e8e17..27b4b10 100644
--- a/nerv/examples/lmptb/lmptb/lmutil.lua
+++ b/nerv/examples/lmptb/lmptb/lmutil.lua
@@ -1,11 +1,38 @@
 local Util = nerv.class("nerv.LMUtil")
 
+local mysplit = function(inputstr, sep) --split inputstr into a list of non-empty tokens; sep is a Lua character-class body
+    if sep == nil then
+        sep = "%s" --default: split on whitespace
+    end
+    local t, i = {}, 1 --keep the counter local so it does not create or clobber a global i
+    for str in string.gmatch(inputstr, "([^"..sep.."]+)") do --each maximal run of non-separator characters is one token
+        t[i] = str
+        i = i + 1
+    end
+    return t
+end
+
 --function rounds a number to the given number of decimal places.
 function Util.round(num, idp)
     local mult = 10^(idp or 0)
     return math.floor(num * mult + 0.5) / mult
 end
 
+--fh: file_handle
+--Returns: a list of the whitespace-separated tokens(string) of the next non-empty line; blank lines are skipped and no "</s>" is appended; if nothing is left to read, returns nil
+function Util.read_line(fh)
+    local l_str, list
+    
+    repeat
+        l_str = fh:read("*line")
+        if (l_str == nil) then return nil end --end of file reached
+        list = mysplit(l_str)
+    until #list >= 1 --keep reading until a line yields at least one token
+
+    return list
+end
+
+
 --list: table, list of string(word)
 --vocab: nerv.LMVocab
 --ty: nerv.CuMatrix
diff --git a/nerv/examples/lmptb/lmptb/lmvocab.lua b/nerv/examples/lmptb/lmptb/lmvocab.lua
index 3d256c0..2ad0e7e 100644
--- a/nerv/examples/lmptb/lmptb/lmvocab.lua
+++ b/nerv/examples/lmptb/lmptb/lmvocab.lua
@@ -101,18 +101,6 @@ function Vocab:get_word_id(key)
     return self.map_id(key)
 end
 
---fh: file_handle
---Returns: a list of tokens(string) in the line, if there is no "</s>" at the end, the function will at it, if nothing to read, returns nil
-function Vocab:read_line(fh)
-    local l_str = fh:read("*line")
-    if (l_str == nil) then return nil end
-    local list = mysplit(l_str)
-    if (list[(#list)] ~= self.sen_end_token) then
-        list[#list + 1] = self.sen_end_token
-    end
-    return list
-end
-
 --fn: string
 --Add all words in fn to the vocab
 function Vocab:build_file(fn)
-- 
cgit v1.2.3


From cbcdec35ae17511d7ff022f290e97c518d7a1f1b Mon Sep 17 00:00:00 2001
From: txh18 <cloudygooseg@gmail.com>
Date: Sun, 13 Dec 2015 21:03:26 +0800
Subject: ...

---
 nerv/examples/lmptb/lmptb/lmvocab.lua | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/nerv/examples/lmptb/lmptb/lmvocab.lua b/nerv/examples/lmptb/lmptb/lmvocab.lua
index 2ad0e7e..7f512fd 100644
--- a/nerv/examples/lmptb/lmptb/lmvocab.lua
+++ b/nerv/examples/lmptb/lmptb/lmvocab.lua
@@ -1,3 +1,5 @@
+require 'lmptb.lmutil'
+
 local Vocab = nerv.class("nerv.LMVocab")
 
 local printf = nerv.printf
@@ -107,7 +109,7 @@ function Vocab:build_file(fn)
     printf("%s Vocab building on file %s...\n", self.log_pre, fn)
     local file = io.open(fn, "r")
     while (true) do
-        local list = self:read_line(file)
+        local list = nerv.LMUtil.read_line(file)
         if (list == nil) then
             break
         else
-- 
cgit v1.2.3


From 60ddaa4807978af9277edd0ad6758e1006d25223 Mon Sep 17 00:00:00 2001
From: txh18 <cloudygooseg@gmail.com>
Date: Sun, 13 Dec 2015 21:22:29 +0800
Subject: bug fix in lmptb.vocab

---
 nerv/examples/lmptb/lmptb/lmvocab.lua | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nerv/examples/lmptb/lmptb/lmvocab.lua b/nerv/examples/lmptb/lmptb/lmvocab.lua
index 7f512fd..0e7ef3e 100644
--- a/nerv/examples/lmptb/lmptb/lmvocab.lua
+++ b/nerv/examples/lmptb/lmptb/lmvocab.lua
@@ -100,7 +100,7 @@ function Vocab:get_word_id(key)
     if (self.map_id[key] == nil) then
         nerv.error("id key %d does not exist.", key) 
     end
-    return self.map_id(key)
+    return self.map_id[key]
 end
 
 --fn: string
-- 
cgit v1.2.3


From 2be64c382aa8d2fedd6aaf69dff212e7afef22b5 Mon Sep 17 00:00:00 2001
From: txh18 <cloudygooseg@gmail.com>
Date: Sun, 20 Dec 2015 22:07:02 +0800
Subject: added find_param method for layersT

---
 nerv/tnn/init.lua | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/nerv/tnn/init.lua b/nerv/tnn/init.lua
index 979f5d8..4bbff12 100644
--- a/nerv/tnn/init.lua
+++ b/nerv/tnn/init.lua
@@ -33,6 +33,8 @@ function LayerT:check_dim_len(len_in, len_out)
     end
 end
 
+layerT.find_param = nerv.layer.find_param
+
 function LayerT:get_params()
     nerv.error_method_not_implemented()
 end
-- 
cgit v1.2.3


From 996472e76c31ba560622841b4b31318244317c84 Mon Sep 17 00:00:00 2001
From: txh18 <cloudygooseg@gmail.com>
Date: Sun, 20 Dec 2015 22:08:54 +0800
Subject: small mistake in layersT

---
 nerv/tnn/init.lua | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nerv/tnn/init.lua b/nerv/tnn/init.lua
index 4bbff12..b375fa8 100644
--- a/nerv/tnn/init.lua
+++ b/nerv/tnn/init.lua
@@ -33,7 +33,7 @@ function LayerT:check_dim_len(len_in, len_out)
     end
 end
 
-layerT.find_param = nerv.layer.find_param
+LayerT.find_param = nerv.Layer.find_param
 
 function LayerT:get_params()
     nerv.error_method_not_implemented()
-- 
cgit v1.2.3


From 7f03ce8da24870f2757473385a75ed990b36d817 Mon Sep 17 00:00:00 2001
From: txh18 <cloudygooseg@gmail.com>
Date: Mon, 21 Dec 2015 13:36:54 +0800
Subject: added compressed_label support in the reader

---
 nerv/examples/lmptb/lm_trainer.lua        | 12 ++++++++++--
 nerv/examples/lmptb/lmptb/lmseqreader.lua | 19 ++++++++++++++++++-
 2 files changed, 28 insertions(+), 3 deletions(-)

diff --git a/nerv/examples/lmptb/lm_trainer.lua b/nerv/examples/lmptb/lm_trainer.lua
index eab6e2d..06c1a4c 100644
--- a/nerv/examples/lmptb/lm_trainer.lua
+++ b/nerv/examples/lmptb/lm_trainer.lua
@@ -23,6 +23,9 @@ function LMTrainer.lm_process_file_rnn(global_conf, fn, tnn, do_train, p_conf)
     end
     local reader
     local r_conf = {}
+    if p_conf.compressed_label ~= nil then
+        r_conf.compressed_label = p_conf.compressed_label
+    end
     local chunk_size, batch_size
     if p_conf.one_sen_report == true then --report log prob one by one sentence
         if do_train == true then
@@ -156,13 +159,16 @@ function LMTrainer.lm_process_file_birnn(global_conf, fn, tnn, do_train, p_conf)
     local reader
     local chunk_size, batch_size
     local r_conf = {["se_mode"] = true}
+    if p_conf.compressed_label ~= nil then
+        r_conf.compressed_label = p_conf.compressed_label
+    end
     if p_conf.one_sen_report == true then --report log prob one by one sentence
         if do_train == true then
             nerv.warning("LMTrainer.lm_process_file_birnn: warning, one_sen_report is true while do_train is also true, strange")
         end
         nerv.printf("lm_process_file_birnn: one_sen report mode, set batch_size to 1 and chunk_size to max_sen_len(%d)\n", 
                 global_conf.max_sen_len)
-        batch_size = 1 
+        batch_size = global_conf.batch_size
         chunk_size = global_conf.max_sen_len
     else
         batch_size = global_conf.batch_size
@@ -239,7 +245,9 @@ function LMTrainer.lm_process_file_birnn(global_conf, fn, tnn, do_train, p_conf)
         end
         if p_conf.one_sen_report == true then
             for i = 1, batch_size do
-                nerv.printf("LMTrainer.lm_process_file_birnn: one_sen_report_output, %f\n", sen_logp[i])    
+                if sen_logp[i] ~= nil then
+                    nerv.printf("LMTrainer.lm_process_file_birnn: one_sen_report_output, %f\n", sen_logp[i])    
+                end
             end
         end
 
diff --git a/nerv/examples/lmptb/lmptb/lmseqreader.lua b/nerv/examples/lmptb/lmptb/lmseqreader.lua
index b603911..0f29f8b 100644
--- a/nerv/examples/lmptb/lmptb/lmseqreader.lua
+++ b/nerv/examples/lmptb/lmptb/lmseqreader.lua
@@ -24,6 +24,10 @@ function LMReader:__init(global_conf, batch_size, chunk_size, vocab, r_conf)
     if r_conf.se_mode == true then
         self.se_mode = true
     end
+    self.compressed_label = false
+    if r_conf.compressed_label == true then
+        self.compressed_label = true
+    end
 end
 
 --fn: string
@@ -46,6 +50,9 @@ function LMReader:open_file(fn)
     for j = 1, self.chunk_size, 1 do
         self.bak_inputs_m[j] = {}
         self.bak_inputs_m[j][1] = self.gconf.mmat_type(self.batch_size, 1)
+        if self.compressed_label == true then
+            self.bak_inputs_m[j][2] = self.gconf.mmat_type(self.batch_size, 1)
+        end
         --self.bak_inputs_m[j][2] = self.gconf.mmat_type(self.batch_size, self.vocab:size()) --since MMatrix does not yet have fill, this m[j][2] is not used
     end
 end
@@ -118,6 +125,9 @@ function LMReader:get_batch(feeds)
                 end
                 inputs_s[j][i] = self.vocab.null_token
                 self.bak_inputs_m[j][1][i - 1][0] = 0
+                if self.compressed_label == true then
+                    self.bak_inputs_m[j][2][i - 1][0] = 0
+                end
                 labels_s[j][i] = self.vocab.null_token
             else
                 self:refresh_stream(i)
@@ -132,7 +142,11 @@ function LMReader:get_batch(feeds)
                 end
                 if st.store[st.head + 1] ~= nil then
                     labels_s[j][i] = st.store[st.head + 1]
-                    inputs_m[j][2][i - 1][self.vocab:get_word_str(st.store[st.head + 1]).id - 1] = 1
+                    if self.compressed_label == true then
+                        self.bak_inputs_m[j][2][i - 1][0] = self.vocab:get_word_str(st.store[st.head + 1]).id - 1
+                    else
+                        inputs_m[j][2][i - 1][self.vocab:get_word_str(st.store[st.head + 1]).id - 1] = 1
+                    end
                 else
                     if (inputs_s[j][i] ~= self.vocab.null_token) then
                         nerv.error("reader error : input not null but label is null_token")
@@ -169,6 +183,9 @@ function LMReader:get_batch(feeds)
             flagsPack[j] = bit.bor(flagsPack[j], flags[j][i])
         end
         inputs_m[j][1]:copy_fromh(self.bak_inputs_m[j][1])
+        if self.compressed_label == true then
+            inputs_m[j][2]:copy_fromh(self.bak_inputs_m[j][2])
+        end
     end
 
     --check for self.al_sen_start
-- 
cgit v1.2.3


From 95ef51432218683ec90fdbaa2e92007f4fbd4610 Mon Sep 17 00:00:00 2001
From: txh18 <cloudygooseg@gmail.com>
Date: Mon, 21 Dec 2015 16:58:35 +0800
Subject: added a garbage collect in tnn

---
 nerv/tnn/tnn.lua | 1 +
 1 file changed, 1 insertion(+)

diff --git a/nerv/tnn/tnn.lua b/nerv/tnn/tnn.lua
index 7ae3172..5351053 100644
--- a/nerv/tnn/tnn.lua
+++ b/nerv/tnn/tnn.lua
@@ -77,6 +77,7 @@ function TNN.make_initial_store(st, p, dim, batch_size, chunk_size, extend_t, gl
             st_c[i + t_c][p_c] = st[i][p]
         end
     end
+    collectgarbage("collect") --free the old one to save memory
 end
 
 function TNN:out_of_feedrange(t) --out of chunk, or no input, for the current feed
-- 
cgit v1.2.3


From 9a172678f824351840283363161e8b38d2d5cfb2 Mon Sep 17 00:00:00 2001
From: txh18 <cloudygooseg@gmail.com>
Date: Tue, 22 Dec 2015 13:34:10 +0800
Subject: give update_select_rows a more proper name

---
 nerv/lib/matrix/generic/cukernel.cu | 30 +++++++++++++++++++++++++++---
 nerv/lib/matrix/generic/cumatrix.c  | 18 +++++++++++++++---
 nerv/lib/matrix/generic/cumatrix.h  |  6 +++++-
 nerv/matrix/generic/cumatrix.c      | 21 ++++++++++++++++++---
 4 files changed, 65 insertions(+), 10 deletions(-)

diff --git a/nerv/lib/matrix/generic/cukernel.cu b/nerv/lib/matrix/generic/cukernel.cu
index 552f7a4..9244783 100644
--- a/nerv/lib/matrix/generic/cukernel.cu
+++ b/nerv/lib/matrix/generic/cukernel.cu
@@ -262,7 +262,7 @@ __global__ void cudak_(clip)(MATRIX_ELEM *a,
 }
 
 #ifdef __NERV_FUTURE_CUDA_7
-__global__ void cudak_(update_select_rows)(MATRIX_ELEM *c, const MATRIX_ELEM *a, const MATRIX_ELEM *idx,
+__global__ void cudak_(update_select_rows_by_rowidx)(MATRIX_ELEM *c, const MATRIX_ELEM *a, const MATRIX_ELEM *idx,
                             int nrow_a, int ncol_a, int nrow_c, int stride_c, int stride_a, double alpha, double beta) {
     int j = blockIdx.x * blockDim.x + threadIdx.x;
     int i = blockIdx.y * blockDim.y + threadIdx.y;
@@ -275,6 +275,20 @@ __global__ void cudak_(update_select_rows)(MATRIX_ELEM *c, const MATRIX_ELEM *a,
     //c[j + i_c * stride_c] = c[j + i_c * stride_c] * (1 - beta * alpha) + a[j + i * stride_a] * alpha; 
     atomicAdd_nvidia(c + j + i_c * stride_c, c[j + i_c * stride_c] * (- beta * alpha) + a[j + i * stride_a] * alpha);
 }
+
+__global__ void cudak_(update_select_rows_by_colidx)(MATRIX_ELEM *c, const MATRIX_ELEM *a, const MATRIX_ELEM *idx,
+                            int nrow_a, int ncol_a, int nrow_c, int stride_c, int stride_a, int stride_idx, double alpha, double beta) { //all strides are counted in elements
+    int j = blockIdx.x * blockDim.x + threadIdx.x; //column of a (and of c) handled by this thread
+    int i = blockIdx.y * blockDim.y + threadIdx.y; //row of a handled by this thread
+    if (i >= nrow_a || j >= ncol_a) return;
+    int i_c = lrintf(idx[stride_idx * i]); //destination row in c, taken from the first element of row i of idx
+    if (i_c < 0 || i_c >= nrow_c) {
+        printf("ERROR inside kernel update_select_rows_by_colidx, i_c(%d) out of range!\n", i_c);
+        return; //skip this element instead of touching memory outside c
+    }
+    //critical: i_c could conflict among threads(same index in the idx array), so atomicAdd is used to apply c[i_c][j] += alpha * (a[i][j] - beta * c[i_c][j])
+    atomicAdd_nvidia(c + j + i_c * stride_c, c[j + i_c * stride_c] * (- beta * alpha) + a[j + i * stride_a] * alpha);
+}
 #endif
 
 __global__ void cudak_(expand_frm)(const MATRIX_ELEM *a, MATRIX_ELEM *b,
@@ -640,16 +654,26 @@ extern "C" {
     }
     
 #ifdef __NERV_FUTURE_CUDA_7
-    void cudak_(cuda_update_select_rows)(Matrix *c, const Matrix *a, const Matrix *idx, double alpha, double beta) {
+    void cudak_(cuda_update_select_rows_by_rowidx)(Matrix *c, const Matrix *a, const Matrix *idx, double alpha, double beta) {
         dim3 threadsPerBlock(CUDA_THREADS_N, CUDA_THREADS_N);
         dim3 numBlocks(CEIL_DIV(a->ncol, threadsPerBlock.x),
                 CEIL_DIV(a->nrow, threadsPerBlock.y));
-        cudak_(update_select_rows)<<<numBlocks, threadsPerBlock>>> \
+        cudak_(update_select_rows_by_rowidx)<<<numBlocks, threadsPerBlock>>> \
             (MATRIX_ELEM_PTR(c), MATRIX_ELEM_PTR(a), MATRIX_ELEM_PTR(idx), 
              a->nrow, a->ncol, c->nrow, c->stride / sizeof(MATRIX_ELEM), 
              a->stride / sizeof(MATRIX_ELEM), alpha, beta);
         cudaStreamSynchronize(0);
     }
+    void cudak_(cuda_update_select_rows_by_colidx)(Matrix *c, const Matrix *a, const Matrix *idx, double alpha, double beta) { //host-side launcher for the update_select_rows_by_colidx kernel
+        dim3 threadsPerBlock(CUDA_THREADS_N, CUDA_THREADS_N);
+        dim3 numBlocks(CEIL_DIV(a->ncol, threadsPerBlock.x),
+                CEIL_DIV(a->nrow, threadsPerBlock.y)); //2D grid sized to cover every element of a
+        cudak_(update_select_rows_by_colidx)<<<numBlocks, threadsPerBlock>>> \
+            (MATRIX_ELEM_PTR(c), MATRIX_ELEM_PTR(a), MATRIX_ELEM_PTR(idx), 
+             a->nrow, a->ncol, c->nrow, c->stride / sizeof(MATRIX_ELEM), 
+             a->stride / sizeof(MATRIX_ELEM), idx->stride / sizeof(MATRIX_ELEM), alpha, beta); //each stride is divided by the element size before being handed to the kernel
+        cudaStreamSynchronize(0); //wait for the work queued on stream 0 to finish before returning
+    }
 #endif
 
     void cudak_(cuda_expand_frm)(const Matrix *a, Matrix *b, int context) {
diff --git a/nerv/lib/matrix/generic/cumatrix.c b/nerv/lib/matrix/generic/cumatrix.c
index 68889ad..31d6b06 100644
--- a/nerv/lib/matrix/generic/cumatrix.c
+++ b/nerv/lib/matrix/generic/cumatrix.c
@@ -394,14 +394,26 @@ void nerv_matrix_(copy_rows_fromd_by_idx)(Matrix *a, const Matrix *b,
 }