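-- nerv.Trainer: the skeleton of the training procedure. It imports the
-- initial parameters, compiles the network, runs the epoch/mini-batch loop
-- (forward only, or forward + backward + update), and accepts or rejects
-- the resulting parameters based on the cross-validation error. Concrete
-- trainers subclass it and implement the abstract methods at the bottom of
-- this file.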
local trainer = nerv.class('nerv.Trainer')

function trainer:__init(gconf)
    local mat_type
    self.gconf = gconf
    self.src_loc_type = nerv.ParamRepo.LOC_TYPES.ON_HOST
    local src_loc_type = self.src_loc_type
    if gconf.use_cpu then
        mat_type = gconf.mmat_type
        self.train_loc_type = nerv.ParamRepo.LOC_TYPES.ON_HOST
    else
        mat_type = gconf.cumat_type
        self.train_loc_type = nerv.ParamRepo.LOC_TYPES.ON_DEVICE
    end
    local train_loc_type = self.train_loc_type
    local host_param_repo = nerv.ParamRepo()
    -- import the parameters from chunk files
    host_param_repo:import(gconf.initialized_param, gconf)
    -- copy them to the training location (device memory unless use_cpu is set)
    local param_repo = host_param_repo:copy(train_loc_type, gconf)
    -- create layers and establish initial bindings
    self.layer_repo = self:make_layer_repo(param_repo)
    local layer_repo = self.layer_repo
    -- compile the network to be trained
    local graph = self:get_network(layer_repo)
    self.input_order = self:get_input_order()
    self.network = nerv.Network('network', gconf,
                                {network = graph,
                                 nn_act_default = gconf.nn_act_default})
    local network = self.network
    network:init(gconf.batch_size, gconf.chunk_size)
    local dim_in, dim_out = network.dim_in, network.dim_out
    -- the error signals propagated back to the network inputs are never
    -- read by the trainer, so all time steps of one input can share a
    -- single dummy matrix
    self.err_output = {}
    local err_output = self.err_output
    for i = 1, #dim_in do
        err_output[i] = {}
        local dummy = mat_type(gconf.batch_size, dim_in[i])
        for t = 1, gconf.chunk_size do
            table.insert(err_output[i], dummy)
        end
    end
    -- preallocate the output buffers and the error inputs that seed the
    -- backward pass
    self.output = {}
    self.err_input = {}
    local output = self.output
    local err_input = self.err_input
    for i = 1, #dim_out do
        output[i] = {}
        for t = 1, gconf.chunk_size do
            table.insert(output[i], mat_type(gconf.batch_size, dim_out[i]))
        end
        err_input[i] = {}
        if dim_out[i] ~= 1 then
            nerv.warning("the output has multiple heads, so the default " ..
                         "`err_input` will be all-zero")
        end
        for t = 1, gconf.chunk_size do
            if dim_out[i] == 1 then
                -- a scalar (loss) output: the frame mask serves as its
                -- error input
                table.insert(err_input[i], gconf.mask[t])
            else
                table.insert(err_input[i], mat_type(gconf.batch_size, dim_out[i]))
                err_input[i][t]:fill(0)
            end
        end
    end
end

-- construct a data buffer over the dataset readers: a frame-level buffer
-- when chunk_size == 1, a sequence-preserving buffer otherwise
function trainer:make_buffer(readers)
    local gconf = self.gconf
    if gconf.chunk_size == 1 then
        return nerv.FrmBuffer(gconf, {
            buffer_size = gconf.buffer_size,
            batch_size = gconf.batch_size,
            chunk_size = gconf.chunk_size,
            randomize = gconf.randomize,
            readers = readers,
            use_gpu = true,
        })
    else
        return nerv.SeqBuffer(gconf, {
            buffer_size = gconf.buffer_size,
            batch_size = gconf.batch_size,
            chunk_size = gconf.chunk_size,
            randomize = gconf.randomize,
            readers = readers,
            nn_act_default = gconf.nn_act_default,
        })
    end
end

-- run one pass over `dataset`; only forward propagation is performed unless
-- `do_train` is true, in which case back propagation and updates follow
function trainer:process(dataset, do_train)
    self:epoch_preprocess(dataset, do_train)
    local buffer = self:make_buffer(self:get_readers(dataset))
    local cnt = 0
    local network = self.network
    local input_order = self.input_order
    local output = self.output
    local err_input = self.err_input
    local err_output = self.err_output
    network:epoch_init()
    -- iterate over the mini-batches produced by the buffer
    for data in buffer.get_data, buffer do
        cnt = cnt + 1
        local info = {input = {},
                      output = output,
                      err_input = err_input,
                      err_output = err_output,
                      do_train = do_train,
                      seq_length = data.seq_length,
                      new_seq = data.new_seq}
        -- reorder the raw readings to match the network input order
        for i = 1, #network.dim_in do
            info.input[i] = data.data[input_order[i]]
        end
        self:mini_batch_preprocess(cnt, info)
        network:mini_batch_init(info)
        network:propagate()
        self:mini_batch_inprocess(cnt, info)
        if do_train then
            network:back_propagate()
            network:update()
        end
        self:mini_batch_afterprocess(cnt, info)
        -- force a full GC cycle to release the garbage produced by this
        -- mini-batch promptly
        collectgarbage('collect')
    end
    self:epoch_afterprocess(dataset, do_train)
    return self:get_error()
end

-- accept the new parameters only if the cross-validation error improves
function trainer:if_accept(cv_err)
    return cv_err < self.gconf.best_cv
end

-- shrink the learning rate by the halving factor after a rejected iteration
function trainer:do_halving()
    local gconf = self.gconf
    gconf.lrate = gconf.lrate * gconf.hfactor
end

-- export the parameters after an iteration and decide whether to keep them:
-- on acceptance the saved file becomes the new starting point, otherwise it
-- is renamed `.rejected`, the previous parameters are restored and the
-- learning rate is halved
function trainer:save_params(train_err, cv_err)
    local gconf = self.gconf
    local src_loc_type = self.src_loc_type
    local train_loc_type = self.train_loc_type
    local layer_repo = self.layer_repo
    local param_fname = string.format('%s_iter_%d_lr%f_tr%.3f_cv%.3f.nerv',
                                      os.date(gconf.date_pattern),
                                      gconf.cur_iter,
                                      gconf.lrate,
                                      train_err,
                                      cv_err)
    param_fname = path.join(gconf.working_dir, param_fname)
    local network = self.network
    -- copy the trained parameters back to host memory before exporting
    local host_param_repo = network:get_params():copy(src_loc_type, gconf)
    host_param_repo:export(param_fname)
    if self:if_accept(cv_err) then
        nerv.info("accepting the trained params")
        gconf.best_cv = cv_err
        gconf.initialized_param = {param_fname}
    else
        nerv.info("rejecting the trained params, rolling back to the previous ones")
        file.move(param_fname, param_fname .. '.rejected')
        host_param_repo = nerv.ParamRepo()
        host_param_repo:import(gconf.initialized_param, gconf)
        local param_repo = host_param_repo:copy(train_loc_type, gconf)
        -- rebind the parameters
        layer_repo:rebind(param_repo)
        self:do_halving()
    end
end

-- hooks: subclasses may override any of these to run custom logic around
-- the whole training process, an epoch, or a mini-batch; the defaults do
-- nothing
function trainer:training_preprocess()
end

function trainer:training_afterprocess()
end

function trainer:epoch_preprocess(dataset, do_train)
end

function trainer:epoch_afterprocess(dataset, do_train)
end

function trainer:mini_batch_preprocess(cnt, info)
end

function trainer:mini_batch_inprocess(cnt, info)
end

function trainer:mini_batch_afterprocess(cnt, info)
end

-- abstract methods: every concrete trainer must implement these
function trainer:make_layer_repo(param_repo)
    nerv.error_method_not_implemented()
end

function trainer:get_network(layer_repo)
    nerv.error_method_not_implemented()
end

function trainer:get_readers(dataset)
    nerv.error_method_not_implemented()
end

function trainer:get_input_order()
    nerv.error_method_not_implemented()
end

function trainer:get_error()
    nerv.error_method_not_implemented()
end
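
--[[
Usage sketch (illustration only, kept as a comment so the file's behavior is
unchanged): a concrete trainer subclasses nerv.Trainer and fills in the
abstract methods above. All names below (MyTrainer, the 'frames' and
'labels' data ids) are hypothetical placeholders, not part of this file.

    local MyTrainer = nerv.class('nerv.MyTrainer', 'nerv.Trainer')

    function MyTrainer:make_layer_repo(param_repo)
        -- build a layer repo describing the layers, binding them to the
        -- entries of param_repo
    end

    function MyTrainer:get_network(layer_repo)
        -- return the graph (a layer from layer_repo) that __init compiles
        -- into the nerv.Network
    end

    function MyTrainer:get_readers(dataset)
        -- return the readers that yield the data fields named below
    end

    function MyTrainer:get_input_order()
        return {'frames', 'labels'}
    end

    function MyTrainer:get_error()
        -- e.g. fetch the accumulated error from a criterion layer
    end

A typical driver would then run, per iteration:

    local t = nerv.MyTrainer(gconf)
    local train_err = t:process(train_set, true)
    local cv_err = t:process(cv_set, false)
    t:save_params(train_err, cv_err)
]]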