-rw-r--r--  nerv/examples/lmptb/lm_trainer.lua   | 10 ++++++++--
-rw-r--r--  nerv/examples/lmptb/lmptb/lmutil.lua |  6 +++---
-rw-r--r--  nerv/layer/affine.lua                | 17 +++++++++--------
3 files changed, 20 insertions(+), 13 deletions(-)
diff --git a/nerv/examples/lmptb/lm_trainer.lua b/nerv/examples/lmptb/lm_trainer.lua
index 2be97c8..7c11a34 100644
--- a/nerv/examples/lmptb/lm_trainer.lua
+++ b/nerv/examples/lmptb/lm_trainer.lua
@@ -22,6 +22,8 @@ function LMTrainer.lm_process_file(global_conf, fn, tnn, do_train)
     local next_log_wcn = global_conf.log_w_num
 
     while (1) do
+        global_conf.timer:tic('most_out_loop_lmprocessfile')
+
         local r, feeds
 
         r, feeds = tnn:getFeedFromReader(reader)
@@ -60,12 +62,17 @@ function LMTrainer.lm_process_file(global_conf, fn, tnn, do_train)
                 end
             end
         end
+
+        tnn:moveRightToNextMB()
+        global_conf.timer:toc('most_out_loop_lmprocessfile')
+
+        --print log
         if (result["rnn"].cn_w > next_log_wcn) then
             next_log_wcn = next_log_wcn + global_conf.log_w_num
             printf("%s %d words processed %s.\n", global_conf.sche_log_pre, result["rnn"].cn_w, os.date())
             printf("\t%s log prob per sample :%f.\n", global_conf.sche_log_pre, result:logp_sample("rnn"))
             for key, value in pairs(global_conf.timer.rec) do
-                printf("\t [global_conf.timer]: time spent on %s:%.5fs\n", key, value)
+                printf("\t [global_conf.timer]: time spent on %s:%.5f clock time\n", key, value)
             end
             global_conf.timer:flush()
             nerv.LMUtil.wait(0.1)
@@ -77,7 +84,6 @@ function LMTrainer.lm_process_file(global_conf, fn, tnn, do_train)
         end
         ]]--
 
-        tnn:moveRightToNextMB()
 
         collectgarbage("collect")
 
diff --git a/nerv/examples/lmptb/lmptb/lmutil.lua b/nerv/examples/lmptb/lmptb/lmutil.lua
index 77babef..821aa94 100644
--- a/nerv/examples/lmptb/lmptb/lmutil.lua
+++ b/nerv/examples/lmptb/lmptb/lmutil.lua
@@ -124,7 +124,7 @@ function Timer:__init()
 end
 
 function Timer:tic(item)
-    self.last[item] = os.time()
+    self.last[item] = os.clock()
 end
 
 function Timer:toc(item)
@@ -134,11 +134,11 @@ function Timer:toc(item)
     if (self.rec[item] == nil) then
         self.rec[item] = 0
     end
-    self.rec[item] = self.rec[item] + os.difftime(os.time(), self.last[item])
+    self.rec[item] = self.rec[item] + os.clock() - self.last[item]
 end
 
 function Timer:flush()
     for key, value in pairs(self.rec) do
-        self.rec[key] = 0
+        self.rec[key] = nil
     end
 end
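The lmutil.lua hunks switch the Timer from os.time(), which only has whole-second resolution, to os.clock(), and make flush() drop entries instead of zeroing them. A minimal plain-Lua sketch of how the revised tic/toc pair is meant to be used; the `Timer.new` constructor here is a hypothetical stand-in for nerv's class machinery:

```lua
-- Plain-Lua stand-in for lmutil's Timer (nerv's real class system omitted).
-- tic() stamps a start point; toc() accumulates elapsed os.clock() time,
-- which has sub-second resolution, unlike the whole seconds of os.time().
local Timer = {}
Timer.__index = Timer

function Timer.new()
    return setmetatable({last = {}, rec = {}}, Timer)
end

function Timer:tic(item)
    self.last[item] = os.clock()
end

function Timer:toc(item)
    if self.last[item] == nil then return end
    if self.rec[item] == nil then self.rec[item] = 0 end
    self.rec[item] = self.rec[item] + os.clock() - self.last[item]
end

function Timer:flush()
    -- nil (rather than 0) removes stale items, so a reporting loop like
    -- the one in lm_trainer.lua only prints entries timed since the flush
    for key, _ in pairs(self.rec) do
        self.rec[key] = nil
    end
end

-- usage mirroring the instrumentation added to lm_process_file
local timer = Timer.new()
timer:tic('most_out_loop_lmprocessfile')
for _ = 1, 1000000 do end -- stand-in workload
timer:toc('most_out_loop_lmprocessfile')
for key, value in pairs(timer.rec) do
    print(("time spent on %s: %.5f clock time"):format(key, value))
end
```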
diff --git a/nerv/layer/affine.lua b/nerv/layer/affine.lua
index a2809bf..0fcff36 100644
--- a/nerv/layer/affine.lua
+++ b/nerv/layer/affine.lua
@@ -24,21 +24,21 @@ function MatrixParam:update(gradient)
     local mmt_gain = 1.0 / (1.0 - gconf.momentum);
     local n = self.gconf.batch_size * mmt_gain
     -- perform update
-    self.trans:add(self.trans, self.correction, 1.0, -gconf.lrate / n)
+    self.trans:add(self.trans, self.correction, 1.0 - gconf.lrate*gconf.wcost/gconf.batch_size, -gconf.lrate / n)
 end
 
 function LinearTransParam:update(gradient)
     MatrixParam.update(self, gradient)
-    local gconf = self.gconf
-    -- weight decay
-    self.trans:add(self.trans, self.trans, 1.0, -gconf.lrate * gconf.wcost / gconf.batch_size)
+    -- local gconf = self.gconf
+    -- weight decay(put into MatrixParam:update)
+    -- self.trans:add(self.trans, self.trans, 1.0, -gconf.lrate * gconf.wcost / gconf.batch_size)
 end
 
 function BiasParam:update(gradient)
     MatrixParam.update(self, gradient)
-    local gconf = self.gconf
+    -- local gconf = self.gconf
     -- weight decay
-    self.trans:add(self.trans, self.trans, 1.0, -gconf.lrate * gconf.wcost / gconf.batch_size)
+    -- self.trans:add(self.trans, self.trans, 1.0, -gconf.lrate * gconf.wcost / gconf.batch_size)
 end
 
 function AffineLayer:__init(id, global_conf, layer_conf)
@@ -76,12 +76,13 @@ function AffineLayer:update(bp_err, input, output)
     local gconf = self.gconf
     if (gconf.momentum > 0) then
         self.ltp.correction:mul(input[1], bp_err[1], 1.0, gconf.momentum, 'T', 'N')
+        self.bp.correction:add(self.bp.correction, bp_err[1]:colsum(), gconf.momentum, 1)
         -- momentum gain
         local mmt_gain = 1.0 / (1.0 - gconf.momentum);
         local n = self.gconf.batch_size * mmt_gain
         -- perform update
-        self.ltp.trans:add(self.ltp.trans, self.ltp.correction, 1.0, -gconf.lrate / n)
-        self.bp.trans:add(self.bp.trans, bp_err[1]:colsum(), 1.0-gconf.lrate*gconf.wcost, -gconf.lrate / gconf.batch_size)
+        self.ltp.trans:add(self.ltp.trans, self.ltp.correction, 1.0-gconf.lrate*gconf.wcost/gconf.batch_size, -gconf.lrate / n)
+        self.bp.trans:add(self.bp.trans, self.bp.correction, 1.0-gconf.lrate*gconf.wcost/gconf.batch_size, -gconf.lrate / n)
     else
         self.ltp.trans:mul(input[1], bp_err[1], -gconf.lrate / gconf.batch_size, 1.0-gconf.lrate*gconf.wcost/gconf.batch_size, 'T', 'N')
         self.bp.trans:add(self.bp.trans, bp_err[1]:colsum(), 1.0-gconf.lrate*gconf.wcost/gconf.batch_size, -gconf.lrate / gconf.batch_size)
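The affine.lua hunks fold L2 weight decay into every parameter update, which is why the per-parameter decay in LinearTransParam and BiasParam is commented out. Assuming nerv's `m:add(a, b, alpha, beta)` computes m = alpha*a + beta*b, as the surrounding calls suggest, the new coefficients implement trans = (1 - lrate*wcost/batch_size)*trans - (lrate/n)*correction with n = batch_size/(1 - momentum). A scalar plain-Lua sketch of one step (the gconf numbers are illustrative, not from the commit):

```lua
-- Scalar sketch of the new update rule shared by MatrixParam:update and
-- AffineLayer:update. nerv's m:add(a, b, alpha, beta) is modeled here as
-- m = alpha*a + beta*b on plain numbers; gconf values are made up.
local gconf = {lrate = 0.1, wcost = 1e-4, batch_size = 32, momentum = 0.9}

local w = 0.5            -- one weight of trans
local correction = 0.0   -- its momentum buffer
local grad = 0.02        -- gradient summed over the minibatch

-- correction:add(correction, gradient, gconf.momentum, 1.0)
correction = gconf.momentum * correction + 1.0 * grad

-- momentum gain: with momentum m, the buffer's steady-state magnitude is
-- 1/(1-m) times a single gradient, so the step is rescaled by n
local mmt_gain = 1.0 / (1.0 - gconf.momentum)
local n = gconf.batch_size * mmt_gain

-- trans:add(trans, correction, 1.0 - lrate*wcost/batch_size, -lrate/n):
-- the alpha term shrinks the weight (L2 decay), beta applies the step
w = (1.0 - gconf.lrate * gconf.wcost / gconf.batch_size) * w -
    (gconf.lrate / n) * correction

print(("updated weight: %.6f"):format(w))
```

Folding the decay into the alpha coefficient of the single `add` call keeps the update to one pass over the parameters instead of a separate decay pass per parameter type.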