20 files changed, 321 insertions, 119 deletions
diff --git a/nerv/Makefile b/nerv/Makefile
index a29309a..a472cfc 100644
--- a/nerv/Makefile
+++ b/nerv/Makefile
@@ -36,7 +36,7 @@ LUA_LIBS := matrix/init.lua io/init.lua init.lua \
 			nn/init.lua nn/layer_repo.lua nn/param_repo.lua nn/layer_dag.lua \
 			io/sgd_buffer.lua \
 			tnn/init.lua tnn/layer_dag_t.lua tnn/sutil.lua tnn/tnn.lua \
-			tnn/layersT/dropout_t.lua tnn/layersT/lstm_t.lua tnn/layersT/softmax_ce_t.lua
+			tnn/layersT/dropout_t.lua tnn/layersT/lstm_t.lua tnn/layersT/gru_t.lua tnn/layersT/softmax_ce_t.lua
 
 INCLUDE := -I $(LUA_INCDIR) -DLUA_USE_APICHECK
 #CUDA_BASE := /usr/local/cuda-7.0
diff --git a/nerv/examples/lmptb/grulm_ptb_main.lua b/nerv/examples/lmptb/grulm_ptb_main.lua
index ef5d7f9..4a3f39f 100644
--- a/nerv/examples/lmptb/grulm_ptb_main.lua
+++ b/nerv/examples/lmptb/grulm_ptb_main.lua
@@ -198,6 +198,7 @@ qdata_dir = root_dir .. '/ptb/questionGen/gen'
 global_conf = {
     lrate = 0.15, wcost = 1e-5, momentum = 0, clip_t = 5,
     cumat_type = nerv.CuMatrixFloat,
+    select_gpu = 0,
     mmat_type = nerv.MMatrixFloat,
     nn_act_default = 0, 
 
@@ -259,7 +260,7 @@ global_conf = {
 elseif (set == "twitter") then
 
 data_dir = root_dir .. '/twitter_new/DATA'
-train_fn = data_dir .. '/twitter.choose2.adds'
+train_fn = data_dir .. '/twitter.choose.adds'
 valid_fn = data_dir .. '/twitter.valid.adds'
 test_fn = data_dir .. '/comm.test.choose-ppl.adds'
 vocab_fn = data_dir .. '/twitter.choose.train.vocab'
@@ -359,7 +360,14 @@ commands = nerv.SUtil.parse_commands_set(commands_str)
 if start_lr ~= nil then
     global_conf.lrate = start_lr
 end
- 
+
+nerv.printf("detecting gconf.select_gpu...\n")
+if global_conf.select_gpu then
+    nerv.printf("select gpu to %d\n", global_conf.select_gpu)
+    global_conf.cumat_type.select_gpu(global_conf.select_gpu)
+    nerv.LMUtil.wait(1)
+end
+
 nerv.printf("%s creating work_dir(%s)...\n", global_conf.sche_log_pre, global_conf.work_dir)
 nerv.LMUtil.wait(2)
 os.execute("mkdir -p "..global_conf.work_dir)
@@ -388,10 +396,10 @@ nerv.LMUtil.wait(2)
 
 math.randomseed(1)
 
-local vocab = nerv.LMVocab()
+local vocab = nerv.LMVocab(global_conf)
 global_conf["vocab"] = vocab
 nerv.printf("%s building vocab...\n", global_conf.sche_log_pre)
-global_conf.vocab:build_file(global_conf.vocab_fn, false)
+global_conf.vocab:build_file(global_conf.vocab_fn)
 ppl_rec = {} 
 
 local final_iter = -1
diff --git a/nerv/examples/lmptb/lm_sampler.lua b/nerv/examples/lmptb/lm_sampler.lua
index c25a75c..c9adf85 100644
--- a/nerv/examples/lmptb/lm_sampler.lua
+++ b/nerv/examples/lmptb/lm_sampler.lua
@@ -3,31 +3,34 @@ local LMSampler = nerv.class('nerv.LMSampler')
 function LMSampler:__init(global_conf)
     self.log_pre = "LMSampler"
     self.gconf = global_conf
+    self.batch_size = self.gconf.batch_size --row count of the sampler matrices; load_dagL creates one self.store slot per row
+    self.chunk_size = self.gconf.chunk_size --largest sample sentence length
     self.vocab = self.gconf.vocab
     self.sen_end_token = self.vocab.sen_end_token
     self.sen_end_id = self.vocab:get_word_str(self.sen_end_token).id 
+
+    self.loaded = false --set to true at the end of load_dagL; lm_sample_rnn_dagL asserts it
 end
 
-function LMSampler:load_dagL(dagL)
-    self.batch_size = self.gconf.batch_size
-    self.chunk_size = self.gconf.chunk_size
-    
+function LMSampler:load_dagL(dagL)   
     nerv.printf("%s loading dagL\n", self.log_pre)
 
     self.dagL = dagL
+    self.dagL:init(self.batch_size)
 
     self.dagL_inputs = {}
-    self.dagL_inputs[1] = global_conf.cumat_type(global_conf.batch_size, 1)
+    self.dagL_inputs[1] = self.gconf.cumat_type(self.gconf.batch_size, 1)
     self.dagL_inputs[1]:fill(self.sen_end_id - 1)
-    self.dagL_inputs[2] = global_conf.cumat_type(global_conf.batch_size, global_conf.hidden_size)
+    self.dagL_inputs[2] = self.gconf.cumat_type(self.gconf.batch_size, self.gconf.hidden_size)
     self.dagL_inputs[2]:fill(0)
     
     self.dagL_outputs = {}
-    self.dagL_outputs[1] = global_conf.cumat_type(global_conf.batch_size, global_conf.vocab:size())
-    self.dagL_outputs[2] = global_conf.cumat_type(global_conf.batch_size, global_conf.hidden_size)
+    self.dagL_outputs[1] = self.gconf.cumat_type(self.gconf.batch_size, self.gconf.vocab:size())
+    self.dagL_outputs[2] = self.gconf.cumat_type(self.gconf.batch_size, self.gconf.hidden_size)
     
-    self.smout_d = global_conf.cumat_type(self.batch_size, self.vocab:size())
-    self.smout_h = global_conf.mmat_type(self.batch_size, self.vocab:size())
+    self.smout_d = self.gconf.cumat_type(self.batch_size, self.vocab:size())
+    self.ssout_d = self.gconf.cumat_type(self.batch_size, self.vocab:size())
+    self.ssout_h = self.gconf.mmat_type(self.batch_size, self.vocab:size())
 
     self.store = {}
     for i = 1, self.batch_size do
@@ -38,11 +41,31 @@ function LMSampler:load_dagL(dagL)
         self.store[i][1].p = 0
     end
     self.repo = {}
+
+    self.loaded = true
 end
 
-function LMSampler:sample_to_store(smout)
+function LMSampler:sample_to_store(ssout) --private
     for i = 1, self.batch_size do
         local ran = math.random()
+        local id = 1
+        local low = 0
+        local high = ssout:ncol() - 1
+        if ssout[i - 1][high] < 0.9999 or ssout[i - 1][high] > 1.0001 then
+            nerv.error("%s ERROR, softmax output summation(%f) seems to have some problem", self.log_pre, ssout[i - 1][high])
+        end
+        if ssout[i - 1][low] < ran then
+            while low + 1 < high do
+                local mid = math.floor((low + high) / 2)
+                if ssout[i - 1][mid] < ran then
+                    low = mid
+                else
+                    high = mid
+                end
+            end
+            id = high + 1
+        end
+        --[[
         local s = 0
         local id = self.vocab:size()
         for j = 0, self.vocab:size() - 1 do
@@ -52,19 +75,25 @@ function LMSampler:sample_to_store(smout)
                 break
             end
         end
+        ]]--
         if #self.store[i] >= self.chunk_size - 2 then
             id = self.sen_end_id
         end
         local tmp = {}
         tmp.w = self.vocab:get_word_id(id).str
         tmp.id = id
-        tmp.p = smout[i - 1][id - 1]
+        if id == 1 then
+            tmp.p = ssout[i - 1][id - 1]
+        else
+            tmp.p = ssout[i - 1][id - 1] - ssout[i - 1][id - 2] 
+        end
         table.insert(self.store[i], tmp)
     end
 end
 
---Returns: LMResult
 function LMSampler:lm_sample_rnn_dagL(sample_num, p_conf)
+    assert(self.loaded == true)
+
     local dagL = self.dagL
     local inputs = self.dagL_inputs
     local outputs = self.dagL_outputs
@@ -74,9 +103,10 @@ function LMSampler:lm_sample_rnn_dagL(sample_num, p_conf)
         inputs[2]:copy_fromd(outputs[2]) --copy hidden activation
     
         self.smout_d:softmax(outputs[1])
-        self.smout_d:copy_toh(self.smout_h)
+        self.ssout_d:prefixsum_row(self.smout_d)
+        self.ssout_d:copy_toh(self.ssout_h)
         
-        self:sample_to_store(self.smout_h)
+        self:sample_to_store(self.ssout_h)
         for i = 1, self.batch_size do
             inputs[1][i - 1][0] = self.store[i][#self.store[i]].id - 1
             if self.store[i][#self.store[i]].id == self.sen_end_id then --meet a sentence end
diff --git a/nerv/examples/lmptb/lmptb/layer/init.lua b/nerv/examples/lmptb/lmptb/layer/init.lua
index b345244..ceae009 100644
--- a/nerv/examples/lmptb/lmptb/layer/init.lua
+++ b/nerv/examples/lmptb/lmptb/layer/init.lua
@@ -1,6 +1,6 @@
 require 'lmptb.layer.select_linear'
 require 'lmptb.layer.affine_recurrent_plusvec'
-require 'lmptb.layer.gru_t'
+--require 'lmptb.layer.gru_t'
 require 'lmptb.layer.lm_affine_recurrent'
 
 
diff --git a/nerv/examples/lmptb/lmptb/lmseqreader.lua b/nerv/examples/lmptb/lmptb/lmseqreader.lua
index 0f29f8b..1272929 100644
--- a/nerv/examples/lmptb/lmptb/lmseqreader.lua
+++ b/nerv/examples/lmptb/lmptb/lmseqreader.lua
@@ -28,6 +28,10 @@ function LMReader:__init(global_conf, batch_size, chunk_size, vocab, r_conf)
     if r_conf.compressed_label == true then
         self.compressed_label = true
     end
+    self.same_io = false
+    if r_conf.same_io == true then --can be used to train P(wi|w1..(i-1),(i+1)..n)
+        self.same_io = true
+    end
 end
 
 --fn: string
@@ -36,9 +40,9 @@ function LMReader:open_file(fn)
     if (self.fh ~= nil) then
         nerv.error("%s error: in open_file(fn is %s), file handle not nil.", self.log_pre, fn)
     end
-    printf("%s opening file %s...\n", self.log_pre, fn)
-    print(self.log_pre, "batch_size:", self.batch_size, "chunk_size", self.chunk_size)
-    print(self.log_pre, "se_mode:", self.se_mode)
+    nerv.printf("%s opening file %s...\n", self.log_pre, fn)
+    nerv.printf("%s batch_size:%d chunk_size:%d\n", self.log_pre, self.batch_size, self.chunk_size)
+    nerv.printf("%s se_mode:%s same_io:%s\n", self.log_pre, tostring(self.se_mode), tostring(self.same_io))
     self.fh = io.open(fn, "r")
     self.streams = {}
     for i = 1, self.batch_size, 1 do
@@ -132,12 +136,15 @@ function LMReader:get_batch(feeds)
             else
                 self:refresh_stream(i)
                 if st.store[st.head] ~= nil then
-                    inputs_s[j][i] = st.store[st.head]
-                    --inputs_m[j][1][i - 1][0] = self.vocab:get_word_str(st.store[st.head]).id - 1
-                    self.bak_inputs_m[j][1][i - 1][0] = self.vocab:get_word_str(st.store[st.head]).id - 1
+                    if self.same_io == false then 
+                        inputs_s[j][i] = st.store[st.head]
+                        self.bak_inputs_m[j][1][i - 1][0] = self.vocab:get_word_str(st.store[st.head]).id - 1
+                    else
+                        inputs_s[j][i] = st.store[st.head + 1]
+                        self.bak_inputs_m[j][1][i - 1][0] = self.vocab:get_word_str(st.store[st.head + 1]).id - 1                      
+                    end
                 else
                     inputs_s[j][i] = self.vocab.null_token
-                    --inputs_m[j][1][i - 1][0] = 0
                     self.bak_inputs_m[j][1][i - 1][0] = 0
                 end
                 if st.store[st.head + 1] ~= nil then
@@ -148,7 +155,7 @@ function LMReader:get_batch(feeds)
                         inputs_m[j][2][i - 1][self.vocab:get_word_str(st.store[st.head + 1]).id - 1] = 1
                     end
                 else
-                    if (inputs_s[j][i] ~= self.vocab.null_token) then
+                    if inputs_s[j][i] ~= self.vocab.null_token then
                         nerv.error("reader error : input not null but label is null_token")
                     end
                     labels_s[j][i] = self.vocab.null_token
@@ -159,6 +166,9 @@ function LMReader:get_batch(feeds)
                     end
                     flags[j][i] = bit.bor(flags[j][i], nerv.TNN.FC.SEQ_NORM) --has both input and label
                     got_new = true
+                    if st.store[st.head] == self.vocab.sen_end_token then
+                        flags[j][i] = bit.bor(flags[j][i], nerv.TNN.FC.SEQ_START)
+                    end
                     st.store[st.head] = nil
                     st.head = st.head + 1
                     if labels_s[j][i] == self.vocab.sen_end_token then
@@ -169,10 +179,7 @@ function LMReader:get_batch(feeds)
                             end_stream = true --meet sentence end, this stream ends now
                         end
                     end
-                    if inputs_s[j][i] == self.vocab.sen_end_token then
-                        flags[j][i] = bit.bor(flags[j][i], nerv.TNN.FC.SEQ_START)
-                    end
-                end 
+               end 
             end
         end
     end
@@ -190,7 +197,7 @@ function LMReader:get_batch(feeds)
 
     --check for self.al_sen_start
     for i = 1, self.batch_size do
-        if inputs_s[1][i] ~= self.vocab.sen_end_token and inputs_s[1][i] ~= self.vocab.null_token then
+        if bit.band(flags[1][i], nerv.TNN.FC.SEQ_START) == 0 and flags[1][i] > 0 then
             self.stat.al_sen_start = false
         end
     end
@@ -198,7 +205,6 @@ function LMReader:get_batch(feeds)
     if got_new == false then
         nerv.info("lmseqreader file ends, printing stats...")
         nerv.printf("al_sen_start:%s\n", tostring(self.stat.al_sen_start))
-
         return false
     else
         return true
diff --git a/nerv/examples/lmptb/lmptb/lmutil.lua b/nerv/examples/lmptb/lmptb/lmutil.lua
index 6d66d6e..13a5c45 100644
--- a/nerv/examples/lmptb/lmptb/lmutil.lua
+++ b/nerv/examples/lmptb/lmptb/lmutil.lua
@@ -112,10 +112,17 @@ end
 --cla:string
 --w:string
 --prob:float, the probability
-function Result:add(cla, w, prob)
-    self[cla].logp_all = self[cla].logp_all + math.log10(prob)
+function Result:add(cla, w, prob, log10ed)
+    local lp
+    if log10ed == true then
+        lp = prob
+    else
+        lp = math.log10(prob)
+    end
+
+    self[cla].logp_all = self[cla].logp_all + lp
     if (self.vocab:is_unk_str(w)) then
-        self[cla].logp_unk = self[cla].logp_unk + math.log10(prob)
+        self[cla].logp_unk = self[cla].logp_unk + lp
         self[cla].cn_unk = self[cla].cn_unk + 1
     end
     if (w == self.vocab.sen_end_token) then
diff --git a/nerv/examples/lmptb/lmptb/lmvocab.lua b/nerv/examples/lmptb/lmptb/lmvocab.lua
index 0e7ef3e..38bb18e 100644
--- a/nerv/examples/lmptb/lmptb/lmvocab.lua
+++ b/nerv/examples/lmptb/lmptb/lmvocab.lua
@@ -2,8 +2,6 @@ require 'lmptb.lmutil'
 
 local Vocab = nerv.class("nerv.LMVocab")
 
-local printf = nerv.printf
-
 local mysplit = function(inputstr, sep)
     if sep == nil then
         sep = "%s"
@@ -106,7 +104,7 @@ end
 --fn: string
 --Add all words in fn to the vocab
 function Vocab:build_file(fn)
-    printf("%s Vocab building on file %s...\n", self.log_pre, fn)
+    nerv.printf("%s Vocab building on file %s...\n", self.log_pre, fn)
     local file = io.open(fn, "r")
     while (true) do
         local list = nerv.LMUtil.read_line(file)
@@ -119,7 +117,7 @@ function Vocab:build_file(fn)
         end
     end
     file:close()
-    printf("%s Building finished, vocab size now is %d.\n", self.log_pre, self:size())
+    nerv.printf("%s Building finished, vocab size now is %d.\n", self.log_pre, self:size())
 end
 
 --[[test
diff --git a/nerv/examples/lmptb/lstmlm_ptb_main.lua b/nerv/examples/lmptb/lstmlm_ptb_main.lua
index 9bdd5ff..b576834 100644
--- a/nerv/examples/lmptb/lstmlm_ptb_main.lua
+++ b/nerv/examples/lmptb/lstmlm_ptb_main.lua
@@ -277,7 +277,7 @@ global_conf = {
     hidden_size = 300,
     layer_num = 1,
     chunk_size = 15,
-    batch_size = 20, 
+    batch_size = 32, 
     max_iter = 35,
     lr_decay = 1.003,
     decay_iter = 10,
@@ -390,10 +390,10 @@ nerv.LMUtil.wait(2)
 
 math.randomseed(1)
 
-local vocab = nerv.LMVocab()
+local vocab = nerv.LMVocab(global_conf)
 global_conf["vocab"] = vocab
 nerv.printf("%s building vocab...\n", global_conf.sche_log_pre)
-global_conf.vocab:build_file(global_conf.vocab_fn, false)
+global_conf.vocab:build_file(global_conf.vocab_fn)
 ppl_rec = {} 
 
 local final_iter = -1
diff --git a/nerv/examples/lmptb/sample_grulm_ptb_main.lua b/nerv/examples/lmptb/m-tests/lm_sampler_test.lua
index 9a13d36..effb2ad 100644
--- a/nerv/examples/lmptb/sample_grulm_ptb_main.lua
+++ b/nerv/examples/lmptb/m-tests/lm_sampler_test.lua
@@ -134,10 +134,39 @@ function prepare_tnn(global_conf, layerRepo)
     return tnn
 end
 
-function prepare_dagL(global_conf, layerRepo)
-    nerv.printf("%s Generate and initing dagL ...\n", global_conf.sche_log_pre)
+function load_net_tnn(global_conf, fn)
+    --parameters are prepared from fn before the layers and the TNN are built
+    prepare_parameters(global_conf, fn)
+    local repo = prepare_layers(global_conf)
+    return prepare_tnn(global_conf, repo)
+end
+
+function prepare_sampler(sm_conf)
+    sm_conf.pr = nerv.ParamRepo()
+    sm_conf.pr:import({sm_conf.fn_to_sample}, nil, sm_conf)
+
+    local layers = {
+        ["nerv.GRULayerT"] = {
+            ["gruL1"] = {{}, {["dim_in"] = {sm_conf.hidden_size, sm_conf.hidden_size}, ["dim_out"] = {sm_conf.hidden_size}, ["pr"] = sm_conf.pr}}, 
+        },
+        ["nerv.DropoutLayerT"] = {
+            ["dropoutL1"] = {{}, {["dim_in"] = {sm_conf.hidden_size}, ["dim_out"] = {sm_conf.hidden_size}}}, 
+        },
+        ["nerv.SelectLinearLayer"] = {
+            ["selectL1"] = {{}, {["dim_in"] = {1}, ["dim_out"] = {sm_conf.hidden_size}, ["vocab"] = sm_conf.vocab, ["pr"] = sm_conf.pr}},
+        },
+        ["nerv.CombinerLayer"] = {
+           ["combinerL1"] = {{}, {["dim_in"] = {sm_conf.hidden_size}, ["dim_out"] = {sm_conf.hidden_size, sm_conf.hidden_size}, ["lambda"] = {1}}},
+        },
+        ["nerv.AffineLayer"] = {
+            ["outputL"] = {{}, {["dim_in"] = {sm_conf.hidden_size}, ["dim_out"] = {sm_conf.vocab:size()},  ["pr"] = sm_conf.pr}},
+        },
+        ["nerv.SoftmaxCELayerT"] = {
+            ["softmaxL"] = {{}, {["dim_in"] = {sm_conf.vocab:size(), sm_conf.vocab:size()}, ["dim_out"] = {1}}},
+        },
+    }
+    local layerRepo = nerv.LayerRepo(layers, sm_conf.pr, sm_conf)
 
-    --input: input_w, input_w, ... input_w_now, last_activation
     local connections_t = {
         ["<input>[1]"] = "selectL1[1]",
        
@@ -151,48 +180,19 @@ function prepare_dagL(global_conf, layerRepo)
         ["combinerL1[2]"] = "<output>[2]",
     }
     
-    if global_conf.layer_num > 1 then
+    if sm_conf.layer_num > 1 then
         nerv.error("multiple layer is currently not supported(not hard to implement though)")
     end
-    --[[
-    for l = 2, global_conf.layer_num do
-        table.insert(connections_t, {"dropoutL"..(l-1).."[1]", "gruL"..l.."[1]", 0})
-        table.insert(connections_t, {"gruL"..l.."[1]", "combinerL"..l.."[1]", 0})
-        table.insert(connections_t, {"combinerL"..l.."[1]", "gruL"..l.."[2]", 1})
-        table.insert(connections_t, {"combinerL"..l.."[2]", "dropoutL"..l.."[1]", 0})
-    end
-    ]]--
-
-    --[[
-    printf("%s printing DAG connections:\n", global_conf.sche_log_pre)
-    for key, value in pairs(connections_t) do
-        printf("\t%s->%s\n", key, value)
-    end
-    ]]--
 
-    local dagL = nerv.DAGLayerT("dagL", global_conf, {["dim_in"] = {1, global_conf.hidden_size}, 
-            ["dim_out"] = {global_conf.vocab:size(), global_conf.hidden_size}, ["sub_layers"] = layerRepo,
+    local dagL = nerv.DAGLayerT("dagL", sm_conf, {["dim_in"] = {1, sm_conf.hidden_size}, 
+            ["dim_out"] = {sm_conf.vocab:size(), sm_conf.hidden_size}, ["sub_layers"] = layerRepo,
             ["connections"] = connections_t
         })
+    
+    local sampler = nerv.LMSampler(sm_conf)
+    sampler:load_dagL(dagL)
 
-    dagL:init(global_conf.batch_size)
-
-    nerv.printf("%s Initing DAGL end.\n", global_conf.sche_log_pre)
-    return dagL
-end
-
-function load_net_tnn(global_conf, fn)
-    prepare_parameters(global_conf, fn)
-    local layerRepo = prepare_layers(global_conf)
-    local tnn = prepare_tnn(global_conf, layerRepo)
-    return tnn
-end
-
-function load_net_dagL(global_conf, fn)
-    prepare_parameters(global_conf, fn)
-    local layerRepo = prepare_layers(global_conf)
-    local dagL = prepare_dagL(global_conf, layerRepo)
-    return dagL
+    return sampler
 end
 
 local train_fn, valid_fn, test_fn
@@ -240,6 +240,23 @@ global_conf = {
     fn_to_sample = root_dir .. '/ptb/EXP-nerv/grulm_v1.0h300l1ch15ba32slr0.15wc1e-05dr0.5/params.final',
 }
 
+sm_conf = {
+    cumat_type = nerv.CuMatrixFloat,
+    mmat_type = nerv.MMatrixFloat,
+    nn_act_default = 0, 
+
+    hidden_size = 300,
+    layer_num = 1,
+    batch_size = 32, 
+    chunk_size = 85, --largest sample sentence length
+    max_iter = 35,
+    max_sen_len = 90,
+    sche_log_pre = "[SAMPLER_S]:",
+
+    timer = global_conf.timer,
+    fn_to_sample = root_dir .. '/ptb/EXP-nerv/grulm_v1.0h300l1ch15ba32slr0.15wc1e-05dr0.5/params.final',
+}
+
 elseif (set == "msr_sc") then
 
 data_dir = '/home/slhome/txh18/workspace/sentenceCompletion/DATA_PV2'
@@ -276,15 +293,13 @@ global_conf = {
 
 elseif (set == "twitter") then
 
-data_dir = root_dir .. '/twitter_new/DATA'
-train_fn = data_dir .. '/twitter.choose2.adds'
-valid_fn = data_dir .. '/twitter.valid.adds'
-test_fn = data_dir .. '/comm.test.choose-ppl.adds'
-vocab_fn = data_dir .. '/twitter.choose.train.vocab'
-
---qdata_dir = root_dir .. '/ptb/questionGen/gen'
-
-global_conf = {
+    data_dir = root_dir .. '/twitter_new/DATA'
+    train_fn = data_dir .. '/twitter.choose2.adds'
+    valid_fn = data_dir .. '/twitter.valid.adds'
+    test_fn = data_dir .. '/comm.test.choose-ppl.adds'
+    vocab_fn = data_dir .. '/twitter.choose.train.vocab'
+    --qdata_dir = root_dir .. '/ptb/questionGen/gen'
+    global_conf = {
     lrate = 0.15, wcost = 1e-5, momentum = 0, clip_t = 5,
     cumat_type = nerv.CuMatrixFloat,
     mmat_type = nerv.MMatrixFloat,
@@ -309,7 +324,7 @@ global_conf = {
     log_w_num = 40000, --give a message when log_w_num words have been processed
     timer = nerv.Timer(),
     work_dir_base = root_dir .. '/twitter_new/EXP-nerv/grulm_v1.0'
-}
+    }
 
 else
 
@@ -347,15 +362,12 @@ global_conf = {
 
 end
 
-lr_half = false --can not be local, to be set by loadstring
-start_iter = -1
-start_lr = nil
-ppl_last = 100000
 commands_str = "sampling" --"train:test"
 commands = {}
-test_iter = -1
---for testout(question)
-q_file = "/home/slhome/txh18/workspace/ptb/questionGen/gen/ptb.test.txt.q10rs1_Msss.adds"
+test_iter = -1 --obsolete
+random_seed = 1 --passed to math.randomseed below
+sample_num = 10 --number of sampler calls made by the "sampling" command
+out_fn = nil --optional output file for the samples; when nil they are printed with nerv.printf
 
 if arg[2] ~= nil then
     nerv.printf("%s applying arg[2](%s)...\n", global_conf.sche_log_pre, arg[2])
@@ -385,26 +397,27 @@ nerv.LMUtil.wait(2)
 ]]--
 
 ----------------printing options---------------------------------
-nerv.printf("%s printing global_conf...\n", global_conf.sche_log_pre)
-for id, value in pairs(global_conf) do
+nerv.printf("%s printing sm_conf...\n", sm_conf.sche_log_pre)
+for id, value in pairs(sm_conf) do
     nerv.printf("%s:\t%s\n", id, tostring(value))
 end
 nerv.LMUtil.wait(2)
 
 nerv.printf("%s printing training scheduling options...\n", global_conf.sche_log_pre)
-nerv.printf("lr_half:\t%s\n", tostring(lr_half))
-nerv.printf("start_iter:\t%s\n", tostring(start_iter))
-nerv.printf("ppl_last:\t%s\n", tostring(ppl_last))
 nerv.printf("commands_str:\t%s\n", commands_str)
 nerv.printf("test_iter:\t%s\n", tostring(test_iter))
+nerv.printf("random_seed:\t%s\n", tostring(random_seed))
+nerv.printf("sample_num:\t%s\n", tostring(sample_num))
+nerv.printf("out_fn:\t%s\n", tostring(out_fn))
 nerv.printf("%s printing training scheduling end.\n", global_conf.sche_log_pre)
 nerv.LMUtil.wait(2)
 ------------------printing options end------------------------------
 
-math.randomseed(1)
+math.randomseed(random_seed)
 
 local vocab = nerv.LMVocab()
 global_conf["vocab"] = vocab
+sm_conf["vocab"] = global_conf.vocab
 nerv.printf("%s building vocab...\n", global_conf.sche_log_pre)
 global_conf.vocab:build_file(global_conf.vocab_fn, false)
 ppl_rec = {} 
@@ -421,18 +434,34 @@ end --if commands["test"]
 if commands["sampling"] == 1 then
     nerv.printf("===SAMPLE===\n") 
     global_conf.sche_log_pre = "[SCHEDULER SAMPLING]:" 
-    local dagL = load_net_dagL(global_conf, global_conf.fn_to_sample) 
-    local sampler = nerv.LMSampler(global_conf)
-    sampler:load_dagL(dagL)
-    for k = 1, 5 do
-        local res = sampler:lm_sample_rnn_dagL(10, {})
+    local sampler = prepare_sampler(sm_conf)
+    local out_fh = nil
+    if out_fn ~= nil then
+        out_fh = assert(io.open(out_fn, "w"))
+        nerv.printf("%s outputing samples to file \"%s\"...\n", global_conf.sche_log_pre, out_fn)
+    end
+    for k = 1, sample_num do
+        local res = sampler:lm_sample_rnn_dagL(1, {})
         for i = 1, #res do
+            if out_fh == nil then nerv.printf("lm_sampler_output_sample: ") end
             for j = 1, #res[i] do
-                nerv.printf("%s ", res[i][j].w)
+                if out_fh == nil then
+                    nerv.printf("%s %f ", res[i][j].w, res[i][j].p)
+                else
+                    out_fh:write(nerv.sprintf("%s %f ", res[i][j].w, res[i][j].p))
+                end
+            end
+            if out_fh == nil then
+                nerv.printf("\n")
+            else
+                out_fh:write(nerv.sprintf("\n"))
             end
-            nerv.printf("\n")
         end
+        if k % 10000 == 0 and out_fh ~= nil then nerv.printf("%s %d sample done\n", global_conf.sche_log_pre, k) end
     end
+
+    if out_fh ~= nil then out_fh:close() end
+    nerv.printf("%s complete,bye\n", global_conf.sche_log_pre)
     --global_conf.dropout_rate = 0
     --LMTrainer.lm_process_file_rnn(global_conf, global_conf.test_fn, tnn, false) --false update!
 end --if commands["sampling"]
diff --git a/nerv/examples/lmptb/m-tests/lmseqreader_test.lua b/nerv/examples/lmptb/m-tests/lmseqreader_test.lua
index 9127559..3f99741 100644
--- a/nerv/examples/lmptb/m-tests/lmseqreader_test.lua
+++ b/nerv/examples/lmptb/m-tests/lmseqreader_test.lua
@@ -7,7 +7,7 @@ local test_fn = "/home/slhome/txh18/workspace/nerv/nerv/nerv/examples/lmptb/m-te
 --local test_fn = "/home/slhome/txh18/workspace/nerv-project/nerv/examples/lmptb/PTBdata/ptb.train.txt"
 local vocab = nerv.LMVocab()
 vocab:build_file(test_fn)
-local chunk_size = 20
+local chunk_size = 15
 local batch_size = 3
 local global_conf = {
     lrate = 1, wcost = 1e-6, momentum = 0,
@@ -30,7 +30,8 @@ local global_conf = {
     vocab = vocab
 }
 
-local reader = nerv.LMSeqReader(global_conf, batch_size, chunk_size, vocab, {["se_mode"] = true})
+local reader = nerv.LMSeqReader(global_conf, batch_size, chunk_size, vocab, 
+        {["se_mode"] = true, ["same_io"] = true})
 reader:open_file(test_fn)
 local feeds = {}
 feeds.flags_now = {}
@@ -40,14 +41,15 @@ for j = 1, chunk_size do
     feeds.inputs_m[j] = {global_conf.cumat_type(batch_size, 1), global_conf.cumat_type(batch_size, global_conf.vocab:size())}
     feeds.flags_now[j] = {}
 end
-while (1) do
+for k = 1, 5 do
     local r = reader:get_batch(feeds)
     if (r == false) then break end
     for j = 1, chunk_size, 1 do
         for i = 1, batch_size, 1 do
-            printf("%s[L(%s)] ", feeds.inputs_s[j][i], feeds.labels_s[j][i])   --vocab:get_word_str(input[i][j]).id
+            printf("%s[L(%s)]F%d ", feeds.inputs_s[j][i], feeds.labels_s[j][i], feeds.flags_now[j][i])   --vocab:get_word_str(input[i][j]).id
         end
         printf("\n")
     end
     printf("\n")
 end
+printf("reader.sen_start %s\n", tostring(reader.stat.al_sen_start))
diff --git a/nerv/examples/lmptb/m-tests/some-text b/nerv/examples/lmptb/m-tests/some-text
index da4bea9..6756fa0 100644
--- a/nerv/examples/lmptb/m-tests/some-text
+++ b/nerv/examples/lmptb/m-tests/some-text
@@ -1,4 +1,4 @@
-</s> aa bb cc aa bb cc aa bb cc aa bb cc aa bb cc aa </s>
+</s> aa bb cc aa bb cc aa bb cc aa bb cc aa </s>
 </s> aa bb cc aa bb cc aa bb cc aa </s>
 </s> bb cc aa bb cc aa bb cc aa </s>
 </s> aa bb cc aa </s>
diff --git a/nerv/examples/lmptb/rnnlm_ptb_main.lua b/nerv/examples/lmptb/rnnlm_ptb_main.lua
index dc011fb..a1d9471 100644
--- a/nerv/examples/lmptb/rnnlm_ptb_main.lua
+++ b/nerv/examples/lmptb/rnnlm_ptb_main.lua
@@ -197,6 +197,43 @@ global_conf = {
     work_dir_base = root_dir .. '/ptb/EXP-nerv/rnnlm_tnn'
 }
 
+elseif (set == "twitter") then
+
+data_dir = root_dir .. '/twitter_new/DATA'
+train_fn = data_dir .. '/twitter.choose.adds'
+valid_fn = data_dir .. '/twitter.valid.adds'
+test_fn = data_dir .. '/comm.test.choose-ppl.adds'
+vocab_fn = data_dir .. '/twitter.choose.train.vocab'
+
+--qdata_dir = root_dir .. '/ptb/questionGen/gen'
+
+global_conf = {
+    lrate = 0.15, wcost = 1e-5, momentum = 0, clip_t = 5,
+    cumat_type = nerv.CuMatrixFloat,
+    mmat_type = nerv.MMatrixFloat,
+    nn_act_default = 0, 
+
+    hidden_size = 300,
+    layer_num = 1,
+    chunk_size = 15,
+    batch_size = 32, 
+    max_iter = 30,
+    lr_decay = 1.003,
+    decay_iter = 10,
+    param_random = function() return (math.random() / 5 - 0.1) end,
+    dropout_str = "0.5",
+
+    train_fn = train_fn,
+    valid_fn = valid_fn,
+    test_fn = test_fn,
+    vocab_fn = vocab_fn,
+    max_sen_len = 32,
+    sche_log_pre = "[SCHEDULER]:",
+    log_w_num = 40000, --give a message when log_w_num words have been processed
+    timer = nerv.Timer(),
+    work_dir_base = root_dir .. '/twitter_new/EXP-nerv/rnnlm_v1.0'
+}
+
 elseif (set == "msr_sc") then
 
 data_dir = '/home/slhome/txh18/workspace/sentenceCompletion/DATA_PV2'
diff --git a/nerv/lib/matrix/cumatrix.c b/nerv/lib/matrix/cumatrix.c
index 04205e4..58bdfe7 100644
--- a/nerv/lib/matrix/cumatrix.c
+++ b/nerv/lib/matrix/cumatrix.c
@@ -9,6 +9,14 @@ static cudaEvent_t profile_start, profile_stop;
 curandGenerator_t curand_gen;
 static HashMap *profile;
 
+void nerv_cumatrix_select_gpu(int dev, Status *status) { /* make CUDA device `dev` current and rebuild the module's cuBLAS handle */
+    fprintf(stderr, "** selecting GPU %d\n", dev);
+    NERV_SET_STATUS(status, NERV_NORMAL, 0); /* start from "normal"; presumably a failing call below overwrites it (macro not shown here) */
+    CUDA_SAFE_SYNC_CALL(cudaSetDevice(dev), status);
+    CUDA_SAFE_SYNC_CALL(cublasDestroy(cublas_handle), status); /* NOTE(review): the old handle is destroyed after the device switch; cuBLAS assumes the device stays fixed between create and destroy -- confirm this order is safe */
+    CUDA_SAFE_SYNC_CALL(cublasCreate(&cublas_handle), status); /* new handle is created with `dev` current; NOTE(review): curand_gen is not recreated here -- verify */
+}
+
 void nerv_cumatrix_print_profile() {
     size_t i;
     fprintf(stderr, "*** [nerv cumatrix profile] **\n");
diff --git a/nerv/lib/matrix/generic/cukernel.cu b/nerv/lib/matrix/generic/cukernel.cu
index 8fbe05d..51e3b6a 100644
--- a/nerv/lib/matrix/generic/cukernel.cu
+++ b/nerv/lib/matrix/generic/cukernel.cu
@@ -383,6 +383,20 @@ __global__ void cudak_(copy_rows_by_colidx)(const MATRIX_ELEM *a, MATRIX_ELEM *b
     b[j + i * stride] = a[j + k * stride];
 }
 
+__global__ void cudak_(prefixsum_row_reduce)(const MATRIX_ELEM *a, MATRIX_ELEM *b,
+                        int nrow, int ncol, int stride_a, int stride_b, int offset) {
+    int j = blockIdx.x * blockDim.x + threadIdx.x; // column handled by this thread
+    int i = blockIdx.y * blockDim.y + threadIdx.y; // row handled by this thread
+    long idx_a, idx_b;
+    if (i >= nrow || j >= ncol) return; // threads outside the matrix do nothing
+    idx_b = j + i * stride_b; // strides are counted in elements (the host wrapper divides byte strides by sizeof(MATRIX_ELEM))
+    idx_a = j + i * stride_a;
+    // One scan step: b[i][j] = a[i][j] + a[i][j - offset] when that column exists, otherwise a plain copy.
+    if (j >= offset) 
+        b[idx_b] = a[idx_a] + a[idx_a - offset];
+    else
+        b[idx_b] = a[idx_a];
+}
 
 extern "C" {
 #include "../cukernel.h"
@@ -745,6 +759,40 @@ extern "C" {
         cudaStreamSynchronize(0);
     }
 
+    void cudak_(cuda_prefixsum_row)(const Matrix *a, Matrix *b) { // b := row-wise inclusive prefix sum of a, built from repeated prefixsum_row_reduce launches
+        dim3 threadsPerBlock(CUDA_THREADS_N, CUDA_THREADS_N);
+        dim3 numBlocks(CEIL_DIV(b->ncol, threadsPerBlock.x),
+                CEIL_DIV(b->nrow, threadsPerBlock.y));
+        
+        MATRIX_ELEM *tmp[2]; // two scratch buffers used alternately as input and output of the scan steps
+        size_t tmp_stride[2]; // pitch of each scratch buffer in bytes, as returned by cudaMallocPitch
+        cudaMallocPitch(tmp, tmp_stride + 0, a->ncol * sizeof(MATRIX_ELEM), a->nrow); // NOTE(review): return value is not checked
+        cudaMallocPitch(tmp + 1, tmp_stride + 1, a->ncol * sizeof(MATRIX_ELEM), a->nrow); // NOTE(review): return value is not checked
+        
+        int offset = 1; // the first step reads a directly and writes tmp[0]
+        cudak_(prefixsum_row_reduce)<<<numBlocks, threadsPerBlock>>> \
+            (MATRIX_ELEM_PTR(a), tmp[0], b->nrow, b->ncol,
+            a->stride / sizeof(MATRIX_ELEM), tmp_stride[0] / sizeof(MATRIX_ELEM), offset);
+        int pin = 0, pout = 1; // pin: buffer holding the latest result; pout: buffer the next step writes
+
+        for (offset = 2;offset <= a->ncol / 2;offset *= 2) { // the offset doubles each step while the buffers swap roles
+            cudak_(prefixsum_row_reduce)<<<numBlocks, threadsPerBlock>>> \
+                (tmp[pin], tmp[pout], b->nrow, b->ncol,
+                tmp_stride[pin] / sizeof(MATRIX_ELEM), tmp_stride[pout] / sizeof(MATRIX_ELEM), offset);
+            pin = 1 - pin; 
+            pout = 1 - pout;
+        }
+
+        cudak_(prefixsum_row_reduce)<<<numBlocks, threadsPerBlock>>> \
+            (tmp[pin], MATRIX_ELEM_PTR(b), b->nrow, b->ncol,
+            tmp_stride[pin] / sizeof(MATRIX_ELEM), b->stride / sizeof(MATRIX_ELEM), offset); // last step uses the offset left by the loop and writes into b
+        
+        cudaFree(tmp[0]);
+        cudaFree(tmp[1]);
+        
+        cudaStreamSynchronize(0);
+    }
+
     void cudak_(cuda_decompress)(const Matrix *a, Matrix *b) {
         dim3 threadsPerBlock(1, CUDA_THREADS_NN);
         dim3 numBlocks(1, CEIL_DIV(a->nrow, threadsPerBlock.y));
diff --git a/nerv/lib/matrix/generic/cumatrix.c b/nerv/lib/matrix/generic/cumatrix.c
index bf93b77..7b70607 100644
--- a/nerv/lib/matrix/generic/cumatrix.c
+++ b/nerv/lib/matrix/generic/cumatrix.c
@@ -486,6 +486,14 @@ void nerv_matrix_(scale_rows_by_row)(Matrix *a, const Matrix *b,
     NERV_SET_STATUS(status, NERV_NORMAL, 0);
 }
 
+void nerv_matrix_(prefixsum_row)(Matrix *a, const Matrix *b, Status *status) { /* a is the destination, b the source: the kernel wrapper is invoked as (b, a) */
+    CHECK_SAME_DIMENSION(a, b, status); /* dimension check on a and b; what happens on mismatch is defined by the macro (not shown here) */
+    PROFILE_START
+    cudak_(cuda_prefixsum_row)(b, a); /* note the swapped order relative to this function's own parameters */
+    PROFILE_STOP
+    NERV_SET_STATUS(status, NERV_NORMAL, 0); /* report NERV_NORMAL through status */
+}
+
 static void cuda_matrix_(free)(MATRIX_ELEM *ptr, Status *status) {
     CUDA_SAFE_SYNC_CALL(cudaFree(ptr), status);
     NERV_SET_STATUS(status, NERV_NORMAL, 0);
diff --git a/nerv/lib/matrix/generic/cumatrix.h b/nerv/lib/matrix/generic/cumatrix.h
index 4f66a2c..5b8076f 100644
--- a/nerv/lib/matrix/generic/cumatrix.h
+++ b/nerv/lib/matrix/generic/cumatrix.h
@@ -61,6 +61,7 @@ void nerv_matrix_(scale_rows_by_col)(Matrix *a, const Matrix *b,
                                     Status *status);
 void nerv_matrix_(scale_rows_by_row)(Matrix *a, const Matrix *b,
                                     Status *status);
+void nerv_matrix_(prefixsum_row)(Matrix *a, const Matrix *b, Status *status); /* a: destination, b: source; outcome reported via status */
 void nerv_matrix_(thres_mask)(Matrix *a, Matrix *b,
                                 double thres, double low, double high,
                                 Status *status);
diff --git a/nerv/matrix/cumatrix.c b/nerv/matrix/cumatrix.c
index bf92f92..7f22d68 100644
--- a/nerv/matrix/cumatrix.c
+++ b/nerv/matrix/cumatrix.c
@@ -8,6 +8,14 @@ static cublasHandle_t cublas_handle;
 static cudaEvent_t profile_start, profile_stop;
 static HashMap *profile;
 
+static int select_gpu(lua_State *L) { /* Lua C function: takes one integer argument (device index) and forwards it to nerv_cumatrix_select_gpu */
+    Status status;
+    int dev = luaL_checkinteger(L, 1); /* argument 1 must be an integer; luaL_checkinteger raises a Lua error otherwise */
+    nerv_cumatrix_select_gpu(dev, &status); /* defined elsewhere; presumably switches the active CUDA device - TODO confirm */
+    NERV_LUA_CHECK_STATUS(L, status); /* status check; macro defined elsewhere */
+    return 0; /* pushes no results onto the Lua stack */
+}
+
 static int print_profile(lua_State *L) {
     nerv_cumatrix_print_profile();
     return 0;
@@ -21,6 +29,7 @@ static int clear_profile(lua_State *L) {
 static const luaL_Reg cumatrix_methods[] = {
     {"print_profile", print_profile},
     {"clear_profile", clear_profile},
+    {"select_gpu", select_gpu}, /* binds the Lua-visible name "select_gpu" to the C function select_gpu */
     {NULL, NULL}
 };
 
diff --git a/nerv/matrix/generic/cumatrix.c b/nerv/matrix/generic/cumatrix.c
index cb55901..b706c21 100644
--- a/nerv/matrix/generic/cumatrix.c
+++ b/nerv/matrix/generic/cumatrix.c
@@ -15,6 +15,15 @@ static int nerv_matrix_(lua_get_blas_op)(char ch) {
     return (ch == 'T' || ch == 't') ? CUBLAS_OP_T : CUBLAS_OP_N;
 }
 
+static int nerv_matrix_(lua_prefixsum_row)(lua_State *L) { /* Lua C function: forwards two matrix userdata arguments to nerv_matrix_(prefixsum_row) */
+    Status status;
+    Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname)); /* argument 1, checked against this matrix type name; passed on as a */
+    Matrix *b = luaT_checkudata(L, 2, nerv_matrix_(tname)); /* argument 2, same type check; passed on as b */
+    nerv_matrix_(prefixsum_row)(a, b, &status);
+    NERV_LUA_CHECK_STATUS(L, status); /* status check; macro defined elsewhere */
+    return 0; /* pushes no results onto the Lua stack */
+}
+
 static int nerv_matrix_(lua_thres_mask)(lua_State *L) {
     Status status;
     Matrix *a = luaT_checkudata(L, 1, nerv_matrix_(tname));
@@ -230,6 +239,7 @@ static const luaL_Reg nerv_matrix_(extra_methods)[] = {
     {"rearrange_frm", nerv_matrix_(lua_rearrange_frm)},
     {"scale_rows_by_row", nerv_matrix_(lua_scale_rows_by_row)},
     {"scale_rows_by_col", nerv_matrix_(lua_scale_rows_by_col)},
+    {"prefixsum_row", nerv_matrix_(lua_prefixsum_row)},
 #ifdef __NERV_FUTURE_CUDA_7
     {"update_select_rows_by_rowidx", nerv_matrix_(lua_update_select_rows_by_rowidx)},
     {"update_select_rows_by_colidx", nerv_matrix_(lua_update_select_rows_by_colidx)},
diff --git a/nerv/tnn/init.lua b/nerv/tnn/init.lua
index b375fa8..7faca31 100644
--- a/nerv/tnn/init.lua
+++ b/nerv/tnn/init.lua
@@ -47,5 +47,6 @@ nerv.include('sutil.lua')
 nerv.include('tnn.lua')
 nerv.include('layersT/softmax_ce_t.lua')
 nerv.include('layersT/lstm_t.lua')
+nerv.include('layersT/gru_t.lua') -- gru_t.lua now lives under tnn/layersT (moved from examples/lmptb/lmptb/layer)
 nerv.include('layersT/dropout_t.lua')
 nerv.include('layer_dag_t.lua')
diff --git a/nerv/examples/lmptb/lmptb/layer/gru_t.lua b/nerv/tnn/layersT/gru_t.lua
index 8f15cc8..8f15cc8 100644
--- a/nerv/examples/lmptb/lmptb/layer/gru_t.lua
+++ b/nerv/tnn/layersT/gru_t.lua