--- Parameter and layer classes related to linear transform.
local MatrixParam = nerv.class('nerv.MatrixParam', 'nerv.Param')
local LinearTransParam = nerv.class('nerv.LinearTransParam', 'nerv.MatrixParam')
local BiasParam = nerv.class('nerv.BiasParam', 'nerv.MatrixParam')
local AffineLayer = nerv.class('nerv.AffineLayer', 'nerv.Layer')

--- A parameter that consists of a single matrix.
-- @type nerv.MatrixParam

--- Read the matrix from a file handle.
-- @param handle the file handle
function MatrixParam:read(handle)
    self.trans = self.gconf.mmat_type.load(handle)
    if not self.gconf.use_cpu then
        -- copy the host matrix onto the CUDA device
        self.trans = self.gconf.cumat_type.new_from_host(self.trans)
    end
end

--- Write the matrix to a file handle.
-- @param handle the file handle
function MatrixParam:write(handle)
    local trans = self.trans
    if not self.gconf.use_cpu then
        -- copy back to host memory before saving
        trans = self.trans:new_to_host()
    end
    trans:save(handle)
end

--- Allocate and zero the momentum (correction) buffer before training.
function MatrixParam:train_init()
    self.correction = self.trans:create()
    self.correction:fill(0)
end

function MatrixParam:_update_by_gradient(gradient, alpha, beta)
    local gconf = self.gconf
    -- momentum gain
    local mmt_gain = 1.0 / (1.0 - gconf.momentum)
    local n = gconf.batch_size * mmt_gain
    -- perform update
    if gconf.momentum > 0 then
        self.correction:add(self.correction, gradient, gconf.momentum, 1.0)
        self.trans:add(self.trans, self.correction, alpha, -gconf.lrate / n * beta)
    else
        self.trans:add(self.trans, gradient, alpha, -gconf.lrate / n * beta)
    end
end
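
-- A worked sketch of one _update_by_gradient step, assuming illustrative
-- values gconf.momentum = 0.9, gconf.lrate = 0.1, gconf.batch_size = 256
-- (none of these are fixed by this file) and alpha = beta = 1.0:
--
--   mmt_gain   = 1.0 / (1.0 - 0.9)   --> 10
--   n          = 256 * 10            --> 2560
--   correction <- 0.9 * correction + gradient
--   trans      <- trans - (0.1 / 2560) * correction
--
-- The mmt_gain factor shrinks the effective learning rate so that the
-- momentum-accumulated step keeps roughly the magnitude of a plain step.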

function MatrixParam:_update_by_err_input(err, input, alpha, beta)
    local gconf = self.gconf
    -- momentum gain
    local mmt_gain = 1.0 / (1.0 - gconf.momentum)
    local n = gconf.batch_size * mmt_gain
    -- perform update
    if gconf.momentum > 0 then
        self.correction:mul(input, err, 1.0, gconf.momentum, 'T', 'N')
        self.trans:add(self.trans, self.correction, alpha, -gconf.lrate / n * beta)
    else
        self.trans:mul(input, err, -gconf.lrate / n * beta, alpha, 'T', 'N')
    end
end
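
-- Note on the err/input form above: mul(input, err, ..., 'T', 'N')
-- computes input^T * err, the (dim_in x dim_out) gradient summed over the
-- mini-batch, directly into the correction (or weight) matrix, so the
-- gradient never has to be materialized separately. With momentum the
-- step is the same as in _update_by_gradient:
--
--   correction <- input^T * err + momentum * correction
--   trans      <- alpha * trans - (lrate / n) * beta * correction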

--- Update the parameter by a gradient matrix.
function MatrixParam:update_by_gradient(gradient)
    self:_update_by_gradient(gradient, 1.0, 1.0)
end

--- Update the parameter by the back-propagated error and the layer input.
function MatrixParam:update_by_err_input(err, input)
    self:_update_by_err_input(err, input, 1.0, 1.0)
end

--- Update with L2 regularization (weight decay) folded into the step.
function LinearTransParam:update_by_err_input(err, input)
    local gconf = self.gconf
    local l2 = 1 - gconf.lrate * gconf.wcost
    self:_update_by_err_input(err, input, l2, l2)
end
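
-- Passing l2 as both alpha and beta scales the old weights and the
-- gradient step by (1 - lrate * wcost): scaling the weights is standard
-- L2 weight decay, and the extra factor on the step is negligibly close
-- to one. An illustrative example (values assumed, not fixed here):
--
--   lrate = 0.1, wcost = 1e-3  ==>  l2 = 1 - 0.1 * 1e-3 = 0.9999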

--- A fully-connected linear transform layer.
-- @type nerv.AffineLayer

--- The constructor.
function AffineLayer:__init(id, global_conf, layer_conf)
    self.id = id
    self.dim_in = layer_conf.dim_in
    self.dim_out = layer_conf.dim_out
    -- backward compatibility: accept `ltp` as an alias of `ltp1`
    if layer_conf.ltp ~= nil and layer_conf.ltp1 == nil then
        layer_conf.ltp1 = layer_conf.ltp
    end
    -- one linear transform parameter per input port
    for i = 1, #self.dim_in do
        local pid = "ltp" .. i
        local pid_list = i == 1 and {"ltp", pid} or pid
        self["ltp" .. i] = self:find_param(pid_list, layer_conf, global_conf,
                                           nerv.LinearTransParam,
                                           {self.dim_in[i], self.dim_out[1]}, pid)
    end
    self.ltp = self.ltp1 -- alias of ltp1
    self.bp = self:find_param("bp", layer_conf, global_conf,
                              nerv.BiasParam,
                              {1, self.dim_out[1]}, "bp")
    self.gconf = global_conf
    self:check_dim_len(-1, 1) -- exactly one output, allow multiple inputs
end
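
-- A hypothetical construction sketch (the dimensions and the gconf table
-- are illustrative assumptions, not values required by this file). With
-- two input ports the layer looks up parameters ltp1, ltp2 and bp:
--
--   local layer = nerv.AffineLayer('affine0', gconf,
--                                  {dim_in = {429, 128}, dim_out = {2048}})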

--- Initialize for training: validate dimensions and allocate buffers.
function AffineLayer:init(batch_size)
    if self.ltp.trans:ncol() ~= self.bp.trans:ncol() then
        nerv.error("mismatching dimensions of linear transform and bias parameter")
    end
    for i = 1, #self.dim_in do
        if self.dim_in[i] ~= self["ltp" .. i].trans:nrow() then
            nerv.error("mismatching dimensions of linear transform parameter and input")
        end
        if self.dim_out[1] ~= self["ltp" .. i].trans:ncol() then
            nerv.error("mismatching dimensions of linear transform parameter and output")
        end
        self["ltp" .. i]:train_init()
    end
    self.bp:train_init()
end

function AffineLayer:batch_resize(batch_size)
    -- do nothing
end

function AffineLayer:update(bp_err, input, output)
    for i = 1, #self.dim_in do
        self["ltp" .. i]:update_by_err_input(bp_err[1], input[i])
    end
    self.bp:update_by_gradient(bp_err[1]:colsum())
end
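
-- The bias has no incoming weight matrix, so its gradient is simply the
-- back-propagated error summed over the rows (frames) of the mini-batch;
-- colsum() yields exactly that 1 x dim_out row vector.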

function AffineLayer:propagate(input, output)
    -- apply linear transform
    output[1]:mul(input[1], self.ltp1.trans, 1.0, 0.0, 'N', 'N')
    for i = 2, #self.dim_in do
        output[1]:mul(input[i], self["ltp" .. i].trans, 1.0, 1.0, 'N', 'N')
    end
    -- add bias
    output[1]:add_row(self.bp.trans, 1.0)
end
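
-- In matrix notation, with X_i the batch matrix on input port i, the
-- forward pass above computes:
--
--   output = X_1 * ltp1 + X_2 * ltp2 + ... + bp   (bp broadcast per row)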

function AffineLayer:back_propagate(bp_err, next_bp_err, input, output)
    for i = 1, #self.dim_in do
        next_bp_err[i]:mul(bp_err[1], self["ltp" .. i].trans, 1.0, 0.0, 'N', 'T')
    end
end
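
-- The backward pass is the transpose counterpart of the forward pass:
--
--   next_bp_err_i = bp_err * ltp_i^T
--
-- The error w.r.t. the parameters is not computed here; it is recomputed
-- from (bp_err, input) in update() above.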

function AffineLayer:get_params()
    local pr = nerv.ParamRepo({self.ltp, self.bp})
    for i = 2, #self.dim_in do
        pr:add(self["ltp" .. i].id, self["ltp" .. i])
    end
    return pr
end
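
-- A minimal end-to-end usage sketch (the call sequence follows this file;
-- constructing the actual input/output matrices depends on the
-- surrounding setup and is omitted here):
--
--   layer:init(batch_size)
--   layer:propagate(input, output)
--   layer:back_propagate(bp_err, next_bp_err, input, output)
--   layer:update(bp_err, input, output)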