From c589c3aabaae7f3867bdfed994c8179a87f42675 Mon Sep 17 00:00:00 2001
From: Qi Liu
Date: Tue, 29 Mar 2016 10:05:29 +0800
Subject: fix bug of momentum & update mse layer

---
 nerv/layer/affine.lua    | 43 +++++++++++++++++++------------------------
 nerv/layer/lstm_gate.lua |  9 +++++----
 nerv/layer/mse.lua       | 33 +++++++++++++++++++++++----------
 nerv/nn/network.lua      |  6 ++----
 4 files changed, 49 insertions(+), 42 deletions(-)

diff --git a/nerv/layer/affine.lua b/nerv/layer/affine.lua
index 38743aa..a05ae17 100644
--- a/nerv/layer/affine.lua
+++ b/nerv/layer/affine.lua
@@ -25,7 +25,9 @@ end
 
 function MatrixParam:train_init()
     self.correction = self.trans:create()
+    self.correction_acc = self.correction:create()
     self.correction:fill(0)
+    self.correction_acc:fill(0)
 end
 
 function MatrixParam:copy(copier)
@@ -34,46 +36,37 @@ function MatrixParam:copy(copier)
     return target
 end
 
-function MatrixParam:_update_by_gradient(gradient, alpha, beta)
+function MatrixParam:_update(alpha, beta)
     local gconf = self.gconf
     -- momentum gain
     local mmt_gain = 1.0 / (1.0 - gconf.momentum)
     local n = gconf.batch_size * mmt_gain
     -- perform update
     if gconf.momentum > 0 then
-        self.correction:add(self.correction, gradient, gconf.momentum, 1.0)
+        self.correction:add(self.correction, self.correction_acc, gconf.momentum, 1.0)
         self.trans:add(self.trans, self.correction, alpha, -gconf.lrate / n * beta)
     else
-        self.trans:add(self.trans, gradient, alpha, -gconf.lrate / n * beta)
+        self.trans:add(self.trans, self.correction_acc, alpha, -gconf.lrate / n * beta)
     end
+    self.correction_acc:fill(0)
 end
 
-function MatrixParam:_update_by_err_input(err, input, alpha, beta)
-    local gconf = self.gconf
-    -- momentum gain
-    local mmt_gain = 1.0 / (1.0 - gconf.momentum)
-    local n = gconf.batch_size * mmt_gain
-    -- perform update
-    if gconf.momentum > 0 then
-        self.correction:mul(input, err, 1.0, gconf.momentum, 'T', 'N')
-        self.trans:add(self.trans, self.correction, alpha, -gconf.lrate / n * beta)
-    else
-        self.trans:mul(input, err, -gconf.lrate / n * beta, alpha, 'T', 'N')
-    end
+function MatrixParam:back_propagate_by_gradient(gradient)
+    self.correction_acc:add(self.correction_acc, gradient, 1.0, 1.0)
 end
 
-function MatrixParam:update_by_gradient(gradient)
-    self:_update_by_gradient(gradient, 1.0, 1.0)
+function MatrixParam:back_propagate_by_err_input(err, input)
+    self.correction_acc:mul(input, err, 1.0, 1.0, 'T', 'N')
 end
 
-function MatrixParam:update_by_err_input(err, input)
-    self:_update_by_err_input(err, input, 1.0, 1.0)
+function MatrixParam:update_by_gradient()
+    self:_update(1.0, 1.0)
 end
 
-function LinearTransParam:update_by_err_input(err, input)
+function MatrixParam:update_by_err_input()
     local gconf = self.gconf
     local l2 = 1 - gconf.lrate * gconf.wcost
-    self:_update_by_err_input(err, input, l2, l2)
+    self:_update(l2, l2)
 end
 
 --- A fully-connected linear transform layer.
@@ -121,11 +114,11 @@ function AffineLayer:batch_resize(batch_size)
     -- do nothing
 end
 
-function AffineLayer:update(bp_err, input, output)
+function AffineLayer:update()
     for i = 1, #self.dim_in do
-        self["ltp" .. i]:update_by_err_input(bp_err[1], input[i])
+        self["ltp" .. i]:update_by_err_input()
     end
-    self.bp:update_by_gradient(bp_err[1]:colsum())
+    self.bp:update_by_gradient()
 end
 
 function AffineLayer:propagate(input, output)
@@ -141,7 +134,9 @@ end
 function AffineLayer:back_propagate(bp_err, next_bp_err, input, output)
     for i = 1, #self.dim_in do
         next_bp_err[i]:mul(bp_err[1], self["ltp" .. i].trans, 1.0, 0.0, 'N', 'T')
+        self["ltp" .. i]:back_propagate_by_err_input(bp_err[1], input[i])
     end
+    self.bp:back_propagate_by_gradient(bp_err[1]:colsum())
 end
 
 function AffineLayer:get_params()
diff --git a/nerv/layer/lstm_gate.lua b/nerv/layer/lstm_gate.lua
index e690721..9d79b04 100644
--- a/nerv/layer/lstm_gate.lua
+++ b/nerv/layer/lstm_gate.lua
@@ -60,18 +60,19 @@ function LSTMGateLayer:back_propagate(bp_err, next_bp_err, input, output)
     self.err_bakm:sigmoid_grad(bp_err[1], output[1])
     for i = 1, #self.dim_in do
         next_bp_err[i]:mul(self.err_bakm, self["ltp" .. i].trans, 1.0, 0.0, 'N', 'T')
+        self["ltp" .. i]:back_propagate_by_err_input(self.err_bakm, input[i])
     end
+    self.bp:back_propagate_by_gradient(self.err_bakm:colsum())
 end
 
-function LSTMGateLayer:update(bp_err, input, output)
-    self.err_bakm:sigmoid_grad(bp_err[1], output[1])
+function LSTMGateLayer:update()
     for i = 1, #self.dim_in do
-        self["ltp" .. i]:update_by_err_input(self.err_bakm, input[i])
+        self["ltp" .. i]:update_by_err_input()
         if self.param_type[i] == 'D' then
             self["ltp" .. i].trans:diagonalize()
         end
     end
-    self.bp:update_by_gradient(self.err_bakm:colsum())
+    self.bp:update_by_gradient()
 end
 
 function LSTMGateLayer:get_params()
diff --git a/nerv/layer/mse.lua b/nerv/layer/mse.lua
index 458d086..c1ea596 100644
--- a/nerv/layer/mse.lua
+++ b/nerv/layer/mse.lua
@@ -9,23 +9,28 @@ function MSELayer:bind_params()
     -- do nothing
 end
 
-function MSELayer:init(batch_size)
+function MSELayer:init(batch_size, chunk_size)
     if self.dim_in[1] ~= self.dim_in[2] then
         nerv.error("mismatching dimensions of previous network output and labels")
     end
-    self.scale = 1 / self.dim_in[1]
+    self.scale = 1.0 / self.dim_in[1]
     self.total_mse = 0.0
     self.total_frames = 0
     self.mse = self.mat_type(batch_size, self.dim_in[1])
     self.mse_sum = self.mat_type(batch_size, 1)
-    self.diff = self.mse:create()
+    self.diff = {}
+    for t = 1, chunk_size do
+        self.diff[t] = self.mse:create()
+    end
 end
 
 function MSELayer:batch_resize(batch_size)
     if self.mse:nrow() ~= batch_resize then
         self.mse = self.mat_type(batch_size, self.dim_in[1])
         self.mse_sum = self.mat_type(batch_size, 1)
-        self.diff = self.mse:create()
+        for t = 1, chunk_size do
+            self.diff[t] = self.mse:create()
+        end
     end
 end
 
@@ -33,24 +38,32 @@ function MSELayer:update(bp_err, input, output)
     -- no params, therefore do nothing
 end
 
-function MSELayer:propagate(input, output)
+function MSELayer:propagate(input, output, t)
+    if t == nil then
+        t = 1
+    end
     local mse = self.mse
     local mse_sum = self.mse_sum
+    local diff = self.diff[t]
     mse:add(input[1], input[2], 1.0, -1.0)
-    self.diff:copy_from(mse)
+    mse:set_values_by_mask(self.gconf.mask[t], 0)
+    diff:copy_from(mse)
     mse:mul_elem(mse, mse)
-    mse_sum:add(mse_sum, mse:rowsum(mse), 0.0, self.scale)
+    mse_sum:add(mse_sum, mse:rowsum(), 0.0, self.scale * 0.5)
     if output[1] ~= nil then
         output[1]:copy_from(mse_sum)
     end
     self.total_mse = self.total_mse + mse_sum:colsum()[0][0]
-    self.total_frames = self.total_frames + mse_sum:nrow()
+    self.total_frames = self.total_frames + self.gconf.mask[t]:colsum()[0][0]
 end
 
 -- NOTE: must call propagate before back_propagate
-function MSELayer:back_propagate(bp_err, next_bp_err, input, output)
+function MSELayer:back_propagate(bp_err, next_bp_err, input, output, t)
+    if t == nil then
+        t = 1
+    end
     local nbe = next_bp_err[1]
-    nbe:add(nbe, self.diff, 0.0, 2 * self.scale)
+    nbe:add(nbe, self.diff[t], 0.0, self.scale)
     if bp_err[1] ~= nil then
         nbe:scale_rows_by_col(bp_err[1])
     end
diff --git a/nerv/nn/network.lua b/nerv/nn/network.lua
index bb03be4..cf6a4d3 100644
--- a/nerv/nn/network.lua
+++ b/nerv/nn/network.lua
@@ -615,10 +615,8 @@ function network:back_propagate()
 end
 
 function network:update()
-    for t = 1, self.max_length do
-        for i = 1, #self.layers do
-            self.layers[i]:update(self.err_input[t][i], self.input[t][i], self.output[t][i], t)
-        end
+    for i = 1, #self.layers do
+        self.layers[i]:update()
     end
 end
 
--
cgit v1.2.3
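
For reference, here is a minimal plain-Lua sketch of the per-element arithmetic this patch makes MatrixParam perform: the back_propagate_by_* methods only accumulate gradients into correction_acc, and a single update_by_err_input() call at the end of the mini-batch applies the momentum-smoothed step and clears the accumulator. The gconf values and gradient numbers below are invented for illustration and are not part of the patch.

-- Scalar sketch of the patched update rule (illustrative values only).
local gconf = {lrate = 0.1, momentum = 0.9, batch_size = 4, wcost = 1e-6}

-- momentum gain and effective batch size, as computed in MatrixParam:_update
local mmt_gain = 1.0 / (1.0 - gconf.momentum)
local n = gconf.batch_size * mmt_gain

local weight, correction, correction_acc = 0.5, 0.0, 0.0

-- back_propagate_by_* only accumulates per-timestep gradients (hypothetical numbers)
for _, g in ipairs({0.20, -0.10, 0.05}) do
    correction_acc = correction_acc + g
end

-- update_by_err_input() then applies one momentum-smoothed step with L2 decay
local l2 = 1 - gconf.lrate * gconf.wcost
correction = gconf.momentum * correction + correction_acc
weight = l2 * weight - gconf.lrate / n * l2 * correction
correction_acc = 0.0    -- mirrors self.correction_acc:fill(0)

print(string.format("updated weight: %.6f", weight))

The point of the split is that the momentum term is now folded in once per mini-batch over the accumulated gradient, instead of once per timestep as in the old update path.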