-- nerv/layer/affine.lua
local MatrixParam = nerv.class('nerv.MatrixParam', 'nerv.Param')
local LinearTransParam = nerv.class('nerv.LinearTransParam', 'nerv.MatrixParam')
local BiasParam = nerv.class('nerv.BiasParam', 'nerv.MatrixParam')
local AffineLayer = nerv.class('nerv.AffineLayer', 'nerv.Layer')
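
-- MatrixParam holds its data in a device matrix created through
-- gconf.cumat_type; reading and writing go through the host matrix type
-- (nerv.MMatrixFloat), so parameters are loaded from and saved to file
-- handles in host format. LinearTransParam and BiasParam are thin
-- specializations for the weight matrix and the bias row vector of the
-- affine (fully-connected) layer below.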

function MatrixParam:read(handle)
    self.trans = self.gconf.cumat_type.new_from_host(
                    nerv.MMatrixFloat.load(handle))
end

function MatrixParam:write(handle)
    self.trans:new_to_host():save(handle)
end

function MatrixParam:train_init()
    self.correction = self.trans:create()
    self.correction:fill(0)
end
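
-- Effective update rule for MatrixParam:update (a sketch of what the calls
-- below compute, assuming the usual NERV semantics
-- c:add(a, b, alpha, beta) => c = alpha * a + beta * b).
-- Writing B = batch_size, eta = lrate, lambda = wcost, m = momentum:
--
--   with momentum:    correction = m * correction + gradient
--                     trans      = (1 - eta*lambda/B) * trans
--                                      - eta*(1 - m)/B * correction
--   without momentum: trans      = (1 - eta*lambda/B) * trans
--                                      - eta/B * gradient
--
-- i.e. mini-batch SGD with L2 weight decay folded into the scaling of trans;
-- the momentum term is rescaled by (1 - m) through n = B / (1 - m).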

function MatrixParam:update(gradient)
    local gconf = self.gconf
    if gconf.momentum > 0 then
        self.correction:add(self.correction, gradient, gconf.momentum, 1.0)
        -- momentum gain
        local mmt_gain = 1.0 / (1.0 - gconf.momentum)
        local n = self.gconf.batch_size * mmt_gain
        -- perform update
        self.trans:add(self.trans, self.correction, 1.0 - gconf.lrate * gconf.wcost / gconf.batch_size, - gconf.lrate / n)
    else
        self.trans:add(self.trans, gradient, 1.0 - gconf.lrate * gconf.wcost / gconf.batch_size, - gconf.lrate / gconf.batch_size)
    end
end
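
-- MatrixParam:updateEI fuses gradient computation and update: for an affine
-- layer the weight gradient is input^T * err. Assuming GEMM-like semantics
-- c:mul(a, b, alpha, beta, 'T', 'N') => c = alpha * a^T * b + beta * c,
-- the calls below amount to
--
--   with momentum:    correction = input^T * err + m * correction
--                     trans      = (1 - eta*lambda/B) * trans
--                                      - eta*(1 - m)/B * correction
--   without momentum: trans      = (1 - eta*lambda/B) * trans
--                                      - eta/B * (input^T * err)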

function MatrixParam:updateEI(err, input)
    local gconf = self.gconf
    if gconf.momentum > 0 then
        self.correction:mul(input, err, 1.0, gconf.momentum, 'T', 'N')
        -- momentum gain
        local mmt_gain = 1.0 / (1.0 - gconf.momentum)
        local n = self.gconf.batch_size * mmt_gain
        -- perform update
        self.trans:add(self.trans, self.correction, 1.0 - gconf.lrate * gconf.wcost / gconf.batch_size, - gconf.lrate / n)
    else
        self.trans:mul(input, err, - gconf.lrate / gconf.batch_size, 1.0 - gconf.lrate * gconf.wcost / gconf.batch_size, 'T', 'N')
    end
end

--[[ --these updates are the same
function LinearTransParam:update(gradient)
    MatrixParam.update(self, gradient)
    -- local gconf = self.gconf
    -- weight decay(put into MatrixParam:update)
    -- self.trans:add(self.trans, self.trans, 1.0, -gconf.lrate * gconf.wcost / gconf.batch_size)
end

function BiasParam:update(gradient)
    MatrixParam.update(self, gradient)
    --local gconf = self.gconf
    -- weight decay
    -- self.trans:add(self.trans, self.trans, 1.0, -gconf.lrate * gconf.wcost / gconf.batch_size)
end
]]--
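
-- layer_conf is expected to provide:
--   ltp             -- nerv.LinearTransParam, a dim_in[1] x dim_out[1] weight matrix
--   bp              -- nerv.BiasParam, a bias row vector with dim_out[1] columns
--   dim_in, dim_out -- one-element tables (the layer has one input and one output)
--   direct_update   -- optional flag; may also be set globally in global_conf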

function AffineLayer:__init(id, global_conf, layer_conf)
    self.id = id
    self.ltp = layer_conf.ltp
    self.bp = layer_conf.bp
    self.dim_in = layer_conf.dim_in
    self.dim_out = layer_conf.dim_out
    self.gconf = global_conf
    self:check_dim_len(1, 1) -- exactly one input and one output
    self.direct_update = layer_conf.direct_update or global_conf.direct_update
end

function AffineLayer:init(batch_size)
    if self.ltp.trans:ncol() ~= self.bp.trans:ncol() then
        nerv.error("mismatching dimensions of linear transform and bias paramter")
    end
    if self.dim_in[1] ~= self.ltp.trans:nrow() then
        nerv.error("mismatching dimensions of linear transform parameter and input")
    end
    if self.dim_out[1] ~= self.ltp.trans:ncol() then
        nerv.error("mismatching dimensions of linear transform parameter and output")
    end
    self.ltp_grad = self.ltp.trans:create()
    self.ltp:train_init()
    self.bp:train_init()
end

function AffineLayer:batch_resize(batch_size)
    -- do nothing
end
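
-- Two update paths: with direct_update set, the layer applies the same SGD /
-- momentum formulas inline on ltp and bp in one place; otherwise it delegates
-- to MatrixParam:updateEI for the weights (gradient input^T * err) and to
-- MatrixParam:update for the bias (gradient colsum(err), the per-column sum
-- of the error over the mini-batch).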

function AffineLayer:update(bp_err, input, output)
    if self.direct_update == true then
        local gconf = self.gconf
        if gconf.momentum > 0 then
            self.ltp.correction:mul(input[1], bp_err[1], 1.0, gconf.momentum, 'T', 'N')
            self.bp.correction:add(self.bp.correction, bp_err[1]:colsum(), gconf.momentum, 1)
            -- momentum gain
            local mmt_gain = 1.0 / (1.0 - gconf.momentum)
            local n = self.gconf.batch_size * mmt_gain
            -- perform update
            self.ltp.trans:add(self.ltp.trans, self.ltp.correction, 1.0 - gconf.lrate * gconf.wcost / gconf.batch_size, - gconf.lrate / n)
            self.bp.trans:add(self.bp.trans, self.bp.correction, 1.0 - gconf.lrate * gconf.wcost / gconf.batch_size, - gconf.lrate / n)
        else
            self.ltp.trans:mul(input[1], bp_err[1], - gconf.lrate / gconf.batch_size, 1.0 - gconf.lrate * gconf.wcost / gconf.batch_size, 'T', 'N')
            self.bp.trans:add(self.bp.trans, bp_err[1]:colsum(), 1.0 - gconf.lrate * gconf.wcost / gconf.batch_size, - gconf.lrate / gconf.batch_size)
        end
    else
        self.ltp:updateEI(bp_err[1], input[1])
        self.bp:update(bp_err[1]:colsum())
    end
end
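
-- Forward pass: output = input * W + b, where the bias row is broadcast over
-- every row (frame) of the mini-batch by add_row.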

function AffineLayer:propagate(input, output)
    -- apply linear transform
    output[1]:mul(input[1], self.ltp.trans, 1.0, 0.0, 'N', 'N')
    -- add bias
    output[1]:add_row(self.bp.trans, 1.0)
end
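
-- Backward pass through the linear map only: next_bp_err = bp_err * W^T
-- (the chain rule dL/dx = dL/dy * W^T); the parameter gradients themselves
-- are handled in AffineLayer:update.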

function AffineLayer:back_propagate(bp_err, next_bp_err, input, output)
    next_bp_err[1]:mul(bp_err[1], self.ltp.trans, 1.0, 0.0, 'N', 'T')
end

function AffineLayer:get_params()
    return nerv.ParamRepo({self.ltp, self.bp})
end
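
--[[ A minimal usage sketch, not part of the layer itself. It assumes the
usual NERV conventions: classes created with nerv.class are constructed by
calling them directly, nerv.CuMatrixFloat is available as the device matrix
type, ltp / bp have already been loaded (e.g. from a parameter chunk file) as
nerv.LinearTransParam and nerv.BiasParam, and input / output / bp_err /
next_bp_err are one-element tables of device matrices of matching sizes.
The id, dimensions and hyperparameters below are placeholders.

    local gconf = {lrate = 0.8, wcost = 1e-6, momentum = 0.9,
                   batch_size = 256, cumat_type = nerv.CuMatrixFloat}
    local affine = nerv.AffineLayer("affine0", gconf,
                                    {ltp = ltp, bp = bp,
                                     dim_in = {429}, dim_out = {2048}})
    affine:init(gconf.batch_size)
    affine:propagate(input, output)          -- output[1] = input[1] * W + b
    affine:back_propagate(bp_err, next_bp_err, input, output)
    affine:update(bp_err, input, output)
]]--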