diff options
Diffstat (limited to 'layer')
-rw-r--r-- | layer/affine.lua | 75 | ||||
-rw-r--r-- | layer/bias.lua | 2 | ||||
-rw-r--r-- | layer/combiner.lua | 26 | ||||
-rw-r--r-- | layer/init.lua | 12 | ||||
-rw-r--r-- | layer/mse.lua | 28 | ||||
-rw-r--r-- | layer/sigmoid.lua | 4 | ||||
-rw-r--r-- | layer/softmax_ce.lua | 21 | ||||
-rw-r--r-- | layer/window.lua | 2 |
8 files changed, 97 insertions, 73 deletions
diff --git a/layer/affine.lua b/layer/affine.lua index 2cd7acb..00cbcfb 100644 --- a/layer/affine.lua +++ b/layer/affine.lua @@ -3,13 +3,35 @@ local LinearTransParam = nerv.class('nerv.LinearTransParam', 'nerv.MatrixParam') local BiasParam = nerv.class('nerv.BiasParam', 'nerv.MatrixParam') local AffineLayer = nerv.class('nerv.AffineLayer', 'nerv.Layer') -function MatrixParam:read(pcdata) +function MatrixParam:read(handle) self.trans = self.gconf.cumat_type.new_from_host( - nerv.MMatrixFloat.load(pcdata)) + nerv.MMatrixFloat.load(handle)) end -function MatrixParam:write(pfhandle) - self.trans:new_to_host():save(pfhandle) +function MatrixParam:write(handle) + self.trans:new_to_host():save(handle) +end + +function MatrixParam:train_init() + self.correction = self.trans:create() + self.correction:fill(0) +end + +function MatrixParam:update(gradient) + local gconf = self.gconf + self.correction:add(self.correction, gradient, gconf.momentum, 1.0) + -- momentum gain + local mmt_gain = 1.0 / (1.0 - gconf.momentum); + local n = self.gconf.batch_size * mmt_gain + -- perform update + self.trans:add(self.trans, self.correction, 1.0, -gconf.lrate / n) +end + +function LinearTransParam:update(gradient) + MatrixParam.update(self, gradient) + local gconf = self.gconf + -- weight decay + self.trans:add(self.trans, self.trans, 1.0, -gconf.lrate * gconf.wcost) end function AffineLayer:__init(id, global_conf, layer_conf) @@ -20,9 +42,10 @@ function AffineLayer:__init(id, global_conf, layer_conf) self.dim_out = layer_conf.dim_out self.gconf = global_conf self:check_dim_len(1, 1) -- exactly one input and one output + self.direct_update = layer_conf.direct_update end -function AffineLayer:init() +function AffineLayer:init(batch_size) if self.ltp.trans:ncol() ~= self.bp.trans:ncol() then nerv.error("mismatching dimensions of linear transform and bias paramter") end @@ -32,32 +55,24 @@ function AffineLayer:init() if self.dim_out[1] ~= self.ltp.trans:ncol() then nerv.error("mismatching dimensions of linear transform parameter and output") end - - -- linear transform correction - self.ltc = self.ltp.trans:create() - self.ltc:fill(0) - -- bias correction - self.bc = self.bp.trans:create() - self.bc:fill(0) + self.ltp_grad = self.ltp.trans:create() + self.ltp:train_init() + self.bp:train_init() end function AffineLayer:update(bp_err, input, output) - local ltp = self.ltp.trans - local bp = self.bp.trans - local ltc = self.ltc - local bc = self.bc - local gconf = self.gconf - -- momentum gain - local mmt_gain = 1.0 / (1.0 - gconf.momentum); - local n = input[1]:nrow() * mmt_gain - -- update corrections (accumulated errors) - ltc:mul(input[1], bp_err[1], 1.0, gconf.momentum, 'T', 'N') - bc:add(bc, bp_err[1]:colsum(), gconf.momentum, 1.0) - -- perform update - ltp:add(ltp, ltc, 1.0, -gconf.lrate / n) - bp:add(bp, bc, 1.0, -gconf.lrate / n) - -- weight decay - ltp:add(ltp, ltp, 1.0, -gconf.lrate * gconf.wcost) + if self.direct_update then + self.ltp.correction:mul(input[1], bp_err[1], 1.0, gconf.momentum, 'T', 'N') + -- momentum gain + local mmt_gain = 1.0 / (1.0 - gconf.momentum); + local n = self.gconf.batch_size * mmt_gain + -- perform update + self.ltp.trans:add(self.ltp.trans, self.ltp.correction, 1.0, -gconf.lrate / n) + else + self.ltp_grad:mul(input[1], bp_err[1], 1.0, 0.0, 'T', 'N') + self.ltp:update(self.ltp_grad) + end + self.bp:update(bp_err[1]:colsum()) end function AffineLayer:propagate(input, output) @@ -67,10 +82,10 @@ function AffineLayer:propagate(input, output) output[1]:add_row(self.bp.trans, 1.0) end -function AffineLayer:back_propagate(next_bp_err, bp_err, input, output) +function AffineLayer:back_propagate(bp_err, next_bp_err, input, output) next_bp_err[1]:mul(bp_err[1], self.ltp.trans, 1.0, 0.0, 'N', 'T') end function AffineLayer:get_params() - return {self.ltp, self.bp} + return nerv.ParamRepo({self.ltp, self.bp}) end diff --git a/layer/bias.lua b/layer/bias.lua index 8cd326b..c99274d 100644 --- a/layer/bias.lua +++ b/layer/bias.lua @@ -24,5 +24,5 @@ function BiasLayer:propagate(input, output) end function BiasLayer:get_params() - return {self.bias} + return nerv.ParamRepo({self.bias}) end diff --git a/layer/combiner.lua b/layer/combiner.lua index 75e47e2..7bd7617 100644 --- a/layer/combiner.lua +++ b/layer/combiner.lua @@ -7,9 +7,15 @@ function CombinerLayer:__init(id, global_conf, layer_conf) self.dim_out = layer_conf.dim_out self.gconf = global_conf self:check_dim_len(#self.lambda, -1) + if #self.dim_in < 1 then + nerv.error("no input specified") + end + if #self.dim_out < 1 then + nerv.error("no output specified") + end end -function CombinerLayer:init() +function CombinerLayer:init(batch_size) local dim = self.dim_in[1] for i = 2, #self.dim_in do if self.dim_in[i] ~= dim then @@ -21,6 +27,7 @@ function CombinerLayer:init() nerv.error("mismatching dimensions of inputs/outputs") end end + self.sum = self.gconf.cumat_type(batch_size, dim) end function CombinerLayer:update(bp_err, input, output) @@ -32,24 +39,21 @@ function CombinerLayer:propagate(input, output) output[1]:add(output[1], input[i], 1.0, self.lambda[i]) end for i = 2, #self.dim_out do - output[i]:copy_fromd(output[1]) + output[i]:copy_fromd(output[1]) end end -function CombinerLayer:back_propagate(next_bp_err, bp_err, input, output) - local sum = bp_err[1]:create() - sum:fill(0) - for i = 1, #self.dim_out do +function CombinerLayer:back_propagate(bp_err, next_bp_err, input, output) + local sum = self.sum + sum:copy_fromd(bp_err[1]) + for i = 2, #self.dim_out do sum:add(sum, bp_err[i], 1.0, 1.0) end for i = 1, #self.dim_in do - local scale = nerv.CuMatrixFloat(sum:nrow(), 1) - scale:fill(self.lambda[i]) - next_bp_err[i]:copy_fromd(sum) - next_bp_err[i]:scale_rows_by_col(scale) + next_bp_err[i]:add(next_bp_err[i], sum, 0.0, self.lambda[i]) end end function CombinerLayer:get_params() - return {} + return nerv.ParamRepo({}) end diff --git a/layer/init.lua b/layer/init.lua index 169427d..e39af94 100644 --- a/layer/init.lua +++ b/layer/init.lua @@ -15,11 +15,15 @@ function Param:set_info(info) self.info = info end -function Param:read(pfhandle) +function Param:read(handle) nerv.error_method_not_implemented() end -function Param:write(pfhandle) +function Param:write(handle) + nerv.error_method_not_implemented() +end + +function Param:update(gradient) nerv.error_method_not_implemented() end @@ -29,7 +33,7 @@ function Layer:__init(id, global_conf, layer_conf) nerv.error_method_not_implemented() end -function Layer:init() +function Layer:init(batch_size) nerv.error_method_not_implemented() end @@ -41,7 +45,7 @@ function Layer:propagate(input, output) nerv.error_method_not_implemented() end -function Layer:back_propagate(next_bp_err, bp_err, input, output) +function Layer:back_propagate(bp_err, next_bp_err, input, output) nerv.error_method_not_implemented() end diff --git a/layer/mse.lua b/layer/mse.lua index da5b24d..9a97add 100644 --- a/layer/mse.lua +++ b/layer/mse.lua @@ -8,12 +8,16 @@ function MSELayer:__init(id, global_conf, layer_conf) self:check_dim_len(2, -1) end -function MSELayer:init() +function MSELayer:init(batch_size) if self.dim_in[1] ~= self.dim_in[2] then nerv.error("mismatching dimensions of previous network output and labels") end + self.scale = 1 / self.dim_in[1] self.total_mse = 0.0 self.total_frames = 0 + self.mse = self.gconf.cumat_type(batch_size, self.dim_in[1]) + self.mse_sum = self.gconf.cumat_type(batch_size, 1) + self.diff = self.mse:create() end function MSELayer:update(bp_err, input, output) @@ -21,32 +25,28 @@ function MSELayer:update(bp_err, input, output) end function MSELayer:propagate(input, output) - local mse = input[1]:create() + local mse = self.mse + local mse_sum = self.mse_sum mse:add(input[1], input[2], 1.0, -1.0) - self.diff = mse:create() self.diff:copy_fromd(mse) mse:mul_elem(mse, mse) - mse = mse:rowsum(mse) - local scale = nerv.CuMatrixFloat(mse:nrow(), 1) - scale:fill(1 / input[1]:ncol()) - mse:scale_rows_by_col(scale) + mse_sum:add(mse_sum, mse:rowsum(mse), 0.0, self.scale) if output[1] ~= nil then - output[1]:copy_fromd(mse) + output[1]:copy_fromd(mse_sum) end - self.total_mse = self.total_mse + mse:colsum()[0] - self.total_frames = self.total_frames + mse:nrow() + self.total_mse = self.total_mse + mse_sum:colsum()[0] + self.total_frames = self.total_frames + mse_sum:nrow() end -- NOTE: must call propagate before back_propagate -function MSELayer:back_propagate(next_bp_err, bp_err, input, output) +function MSELayer:back_propagate(bp_err, next_bp_err, input, output) local nbe = next_bp_err[1] - nbe:copy_fromd(self.diff) - self.diff = nil + nbe:add(nbe, self.diff, 0.0, 2 * self.scale) if bp_err[1] ~= nil then nbe:scale_rows_by_col(bp_err[1]) end end function MSELayer:get_params() - return {} + return nerv.ParamRepo({}) end diff --git a/layer/sigmoid.lua b/layer/sigmoid.lua index dd10fb9..dfd09eb 100644 --- a/layer/sigmoid.lua +++ b/layer/sigmoid.lua @@ -22,10 +22,10 @@ function SigmoidLayer:propagate(input, output) output[1]:sigmoid(input[1]) end -function SigmoidLayer:back_propagate(next_bp_err, bp_err, input, output) +function SigmoidLayer:back_propagate(bp_err, next_bp_err, input, output) next_bp_err[1]:sigmoid_grad(bp_err[1], output[1]) end function SigmoidLayer:get_params() - return {} + return nerv.ParamRepo({}) end diff --git a/layer/softmax_ce.lua b/layer/softmax_ce.lua index 7888540..daf891e 100644 --- a/layer/softmax_ce.lua +++ b/layer/softmax_ce.lua @@ -12,13 +12,15 @@ function SoftmaxCELayer:__init(id, global_conf, layer_conf) self:check_dim_len(2, -1) -- two inputs: nn output and label end -function SoftmaxCELayer:init() +function SoftmaxCELayer:init(batch_size) if not self.compressed and (self.dim_in[1] ~= self.dim_in[2]) then nerv.error("mismatching dimensions of previous network output and labels") end self.total_ce = 0.0 self.total_correct = 0 self.total_frames = 0 + self.softmax = self.gconf.cumat_type(batch_size, self.dim_in[1]) + self.ce = self.softmax:create() end function SoftmaxCELayer:update(bp_err, input, output) @@ -26,12 +28,11 @@ function SoftmaxCELayer:update(bp_err, input, output) end function SoftmaxCELayer:propagate(input, output) - local soutput = input[1]:create() -- temporary value for calc softmax - self.soutput = soutput - local classified = soutput:softmax(input[1]) - local ce = soutput:create() - ce:log_elem(soutput) + local softmax = self.softmax + local ce = self.ce + local classified = softmax:softmax(input[1]) local label = input[2] + ce:log_elem(softmax) if self.compressed then label = label:decompress(input[1]:ncol()) end @@ -42,26 +43,26 @@ function SoftmaxCELayer:propagate(input, output) end -- add total ce self.total_ce = self.total_ce - ce:colsum()[0] - self.total_frames = self.total_frames + soutput:nrow() + self.total_frames = self.total_frames + softmax:nrow() -- TODO: add colsame for uncompressed label if self.compressed then self.total_correct = self.total_correct + classified:colsame(input[2])[0] end end -function SoftmaxCELayer:back_propagate(next_bp_err, bp_err, input, output) +function SoftmaxCELayer:back_propagate(bp_err, next_bp_err, input, output) -- softmax output - label local label = input[2] if self.compressed then label = label:decompress(input[1]:ncol()) end local nbe = next_bp_err[1] - nbe:add(self.soutput, label, 1.0, -1.0) + nbe:add(self.softmax, label, 1.0, -1.0) if bp_err[1] ~= nil then nbe:scale_rows_by_col(bp_err[1]) end end function SoftmaxCELayer:get_params() - return {} + return nerv.ParamRepo({}) end diff --git a/layer/window.lua b/layer/window.lua index 3a093f4..4e9a3b1 100644 --- a/layer/window.lua +++ b/layer/window.lua @@ -24,5 +24,5 @@ function WindowLayer:propagate(input, output) end function WindowLayer:get_params() - return {self.window} + return nerv.ParamRepo({self.window}) end |