--- Contains parameter and layer classes related to the linear (or affine)
-- transform.

--- The class for all matrix-based parameters. The class has a single matrix
-- which can be accessed by `self.trans`.
-- @type nerv.MatrixParam

local MatrixParam = nerv.class('nerv.MatrixParam', 'nerv.Param')

--- Check the storage location of the contained matrix. This function is
-- required by `nerv.ParamRepo`.
-- @param checker the callback function for checking
function MatrixParam:check(checker)
    -- check trans matrix type
    checker(self.trans)
end

--- Read from a file handle. See `nerv.Param.read`.
-- @param handle the file handle
function MatrixParam:read(handle)
    self.trans = self.gconf.mmat_type.load(handle)
end

--- Write to a file handle. See `nerv.Param.write`.
-- @param handle the file handle
function MatrixParam:write(handle)
    self.trans:save(handle)
end

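--- Allocate and zero the matrices used during training: `correction` (the
-- momentum buffer) and `correction_acc` (the accumulated gradient).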
function MatrixParam:train_init()
    self.correction = self.trans:create()
    self.correction_acc = self.correction:create()
    self.correction:fill(0)
    self.correction_acc:fill(0)
end

function MatrixParam:copy(copier)
    local target = nerv.MatrixParam(self.id, self.gconf)
    target.trans = copier(self.trans)
    return target
end

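--- Apply the accumulated correction to the parameter matrix. The update
-- implemented below amounts to stochastic gradient descent with momentum:
--
--     correction <- momentum * correction + correction_acc
--     trans      <- alpha * trans - (lrate / n) * beta * correction
--
-- where n = batch_size / (1 - momentum). If `gconf.clip` is set, the
-- accumulated gradient is clipped to [-clip, clip] beforehand. `alpha` and
-- `beta` are extra scaling factors supplied by the caller:
-- `update_by_gradient` passes 1.0 for both, while `update_by_err_input`
-- passes the L2 weight-decay factor. When `gconf.momentum` is zero,
-- `correction_acc` is applied directly.
-- @param alpha scaling factor for the existing parameter values
-- @param beta scaling factor for the learning-rate term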
function MatrixParam:_update(alpha, beta)
    if self.no_update then
        return
    end
    local gconf = self.gconf
    -- momentum gain
    local mmt_gain = 1.0 / (1.0 - gconf.momentum)
    local n = gconf.batch_size * mmt_gain
    -- clip gradient
    if gconf.clip then
        self.correction_acc:clip(-gconf.clip, gconf.clip)
    end
    -- perform update
    if gconf.momentum > 0 then
        self.correction:add(self.correction, self.correction_acc, gconf.momentum, 1.0)
        self.trans:add(self.trans, self.correction, alpha, -gconf.lrate / n * beta)
    else
        self.trans:add(self.trans, self.correction_acc, alpha, -gconf.lrate / n * beta)
    end
    self.correction_acc:fill(0)
end

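--- Accumulate a gradient that has already been computed elsewhere:
-- correction_acc <- correction_acc + gradient.
-- @param gradient the gradient matrix to be accumulated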
function MatrixParam:back_propagate_by_gradient(gradient)
    self.correction_acc:add(self.correction_acc, gradient, 1.0, 1.0)
end

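--- Accumulate the gradient of a linear transform from the error signal and
-- the corresponding input: correction_acc <- correction_acc + input^T * err.
-- @param err the back-propagated error at the output
-- @param input the input that produced that output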
function MatrixParam:back_propagate_by_err_input(err, input)
    self.correction_acc:mul(input, err, 1.0, 1.0, 'T', 'N')
end

function MatrixParam:update_by_gradient()
    self:_update(1.0, 1.0)
end

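--- Apply the accumulated correction with L2 regularization (weight decay):
-- both scaling factors passed to `_update` are (1 - lrate * wcost), so the
-- old parameter values as well as the gradient step are shrunk by that
-- factor.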
function MatrixParam:update_by_err_input()
    local gconf = self.gconf
    local l2 = 1 - gconf.lrate * gconf.wcost
    self:_update(l2, l2)
end

--- The affine layer that computes Wx + b, also known as a fully
-- connected linear transform layer.
-- @type nerv.AffineLayer

local AffineLayer = nerv.class('nerv.AffineLayer', 'nerv.Layer')

--- The constructor.
-- @param id the identifier
-- @param global_conf see `self.gconf` of `nerv.Layer.__init`
-- @param layer_conf a table providing settings dedicated to the layer. For
-- `layer_conf` fields that are shared by all layers, see
-- `nerv.Layer.__init`. The following fields can be specified:
-- * `activation`: the type of the activation function layer, the \sigma in \sigma(Wx + b). The activation function layer must guarantee that it does not use the parameter `input` in its `back_propagate` function. Defaults to none (no activation function).
-- * `no_bias`: a boolean value indicating whether the bias parameter is used. Defaults to false.
-- * `param_type`: a table of strings with the same length as `dim_in`, indicating the parameter type for each input: 'D' for a diagonal weight matrix, 'N' for a normal weight matrix. Defaults to 'N' for every input.
-- The affine layer requires parameters to be bound; the
-- following parameter names will be looked up while binding:
--
-- * `ltp` (or `ltp1`, `ltp2`, ... when there are multiple inputs): the linear transformation parameter, also known as the weight matrix, the W in Wx + b
-- * `bp`: the bias parameter, also known as the bias matrix, the b in Wx + b
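--
-- A minimal construction sketch (the identifiers below, the `gconf` table
-- and the choice of `nerv.SigmoidLayer` as the activation are illustrative
-- assumptions, not requirements imposed by this file):
--
--     local layer_conf = {dim_in = {429}, dim_out = {2048},
--                         activation = nerv.SigmoidLayer}
--     local affine = nerv.AffineLayer("affine0", gconf, layer_conf)
--     -- "ltp"/"ltp1" and "bp" are then looked up (or generated with
--     -- default values) by `bind_params` through `find_param`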

function AffineLayer:__init(id, global_conf, layer_conf)
    nerv.Layer.__init(self, id, global_conf, layer_conf)
    self:check_dim_len(-1, 1) -- exactly one output, allow multiple inputs
    self.param_type = layer_conf.param_type or table.vector(#self.dim_in, 'N')
    if layer_conf.activation then
        self.activation = layer_conf.activation('', global_conf, {dim_in = {self.dim_out[1]}, dim_out = {self.dim_out[1]}})
    end
    self.no_bias = layer_conf.no_bias
    self:bind_params()
end

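--- Bind the layer to its parameters. For the i-th input the parameter named
-- "ltp<i>" is looked up (for the first input the plain name "ltp" is also
-- accepted), and unless `no_bias` is set a bias parameter named "bp" is
-- looked up or generated as zeros. Setting `no_update_ltp<i>`, `no_update_bp`
-- or `no_update_all` in the layer configuration freezes the corresponding
-- parameters. Inputs marked 'D' in `param_type` have their weight matrices
-- diagonalized after binding.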
function AffineLayer:bind_params()
    local lconf = self.lconf
    lconf.no_update_ltp1 = lconf.no_update_ltp1 or lconf.no_update_ltp
    for i = 1, #self.dim_in do
        local pid = "ltp" .. i
        local pid_list = i == 1 and {pid, "ltp"} or pid
        self["ltp" .. i] = self:find_param(pid_list, lconf, self.gconf,
                                            nerv.LinearTransParam,
                                            {self.dim_in[i], self.dim_out[1]})
        if self.param_type[i] == 'D' then
            self['ltp' .. i].trans:diagonalize()
        end
        local no_update = lconf["no_update_ltp" .. i]
        if (no_update ~= nil) and no_update or lconf.no_update_all then
            self["ltp" .. i].no_update = true
        end
    end
    self.ltp = self.ltp1 -- alias of ltp1
    if not self.no_bias then
        self.bp = self:find_param("bp", lconf, self.gconf,
                                  nerv.BiasParam,
                                  {1, self.dim_out[1]},
                                  nerv.Param.gen_zero)
        local no_update = lconf["no_update_bp"]
        if (no_update ~= nil) and no_update or lconf.no_update_all then
            self.bp.no_update = true
        end
    end
end

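--- Initialize the layer for training: check that the dimensions of the bound
-- parameters match `dim_in`/`dim_out`, set up their gradient buffers via
-- `train_init`, and, when an activation is configured, allocate the scratch
-- matrix `self.bak_mat` that will hold the pre-activation values.
-- @param batch_size the number of rows of the scratch matrix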
function AffineLayer:init(batch_size)
    if not self.no_bias and self.dim_out[1] ~= self.bp.trans:ncol() then
        nerv.error("mismatching dimensions of linear transform and bias paramter")
    end
    for i = 1, #self.dim_in do
        if self.dim_in[i] ~= self["ltp" .. i].trans:nrow() then
            nerv.error("mismatching dimensions of linear transform parameter and input")
        end
        if self.dim_out[1] ~= self["ltp" .. i].trans:ncol() then
            nerv.error("mismatching dimensions of linear transform parameter and output")
        end
        self["ltp" .. i]:train_init()
    end
    if not self.no_bias then
        self.bp:train_init()
    end
    if self.activation then
        self.bak_mat = self.mat_type(batch_size, self.dim_out[1])
        self.bak_mat:fill(0)
    end
end

function AffineLayer:batch_resize(batch_size)
    -- do nothing
end

function AffineLayer:update()
    for i = 1, #self.dim_in do
        self["ltp" .. i]:update_by_err_input()
        if self.param_type[i] == 'D' then
            self['ltp' .. i].trans:diagonalize()
        end
    end
    if not self.no_bias then
        self.bp:update_by_gradient()
    end
end

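--- Forward computation: result = input[1] * ltp1 + ... + input[n] * ltpn,
-- plus the bias row if present. When an activation is configured, the
-- pre-activation result is written to the scratch matrix `self.bak_mat` and
-- the activation then produces the final output.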
function AffineLayer:propagate(input, output)
    local result = self.activation and self.bak_mat or output[1]
    -- apply linear transform
    result:mul(input[1], self.ltp1.trans, 1.0, 0.0, 'N', 'N')
    for i = 2, #self.dim_in do
        result:mul(input[i], self["ltp" .. i].trans, 1.0, 1.0, 'N', 'N')
    end
    -- add bias
    if not self.no_bias then
        result:add_row(self.bp.trans, 1.0)
    end
    if self.activation then
        self.activation:propagate({result}, output)
    end
end

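--- Backward computation: the error is first propagated through the optional
-- activation (reusing `self.bak_mat` as the error buffer, which is why the
-- activation must not read its `input` argument in `back_propagate`), then
-- next_bp_err[i] = err * ltp<i>^T is computed for every input and the weight
-- and bias gradients are accumulated via `back_propagate_by_err_input` and
-- `back_propagate_by_gradient`.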
function AffineLayer:back_propagate(bp_err, next_bp_err, input, output)
    local result = self.activation and self.bak_mat or bp_err[1]
    if self.activation then
        self.activation:back_propagate(bp_err, {result}, {result}, output)
    end
    for i = 1, #self.dim_in do
        next_bp_err[i]:mul(result, self["ltp" .. i].trans, 1.0, 0.0, 'N', 'T')
        self["ltp" .. i]:back_propagate_by_err_input(result, input[i])
    end
    if not self.no_bias then
        self.bp:back_propagate_by_gradient(result:colsum())
    end
end

function AffineLayer:get_params()
    local pr = nerv.ParamRepo({self.ltp1, self.bp}, self.loc_type)
    for i = 2, #self.dim_in do
        pr:add(self["ltp" .. i])
    end
    return pr
end

--- The class for linear transform parameter.
-- @type nerv.LinearTransParam

local LinearTransParam = nerv.class('nerv.LinearTransParam', 'nerv.MatrixParam')

--- The class for bias parameter (currently implemented as a one-row matrix).
-- @type nerv.BiasParam

local BiasParam = nerv.class('nerv.BiasParam', 'nerv.MatrixParam')