From 9b2fa6b357d441afbd6ccf41b9e039f5dc34eb05 Mon Sep 17 00:00:00 2001
From: Qi Liu
Date: Wed, 30 Mar 2016 16:53:52 +0800
Subject: update general trainer

---
 nerv/examples/ptb/main.lua          | 118 ++++++++++++++++++++++
 nerv/examples/ptb/reader.lua        |  67 ++++++++++++
 nerv/examples/ptb/select_linear.lua |  63 +++++++++++
 nerv/examples/trainer.lua           | 166 +++++++++++++++++++++++++++
 nerv/examples/trainer_class.lua     | 183 +++++++++++++++++++++++++++++++
 5 files changed, 597 insertions(+)
 create mode 100644 nerv/examples/ptb/main.lua
 create mode 100644 nerv/examples/ptb/reader.lua
 create mode 100644 nerv/examples/ptb/select_linear.lua
 create mode 100644 nerv/examples/trainer.lua
 create mode 100644 nerv/examples/trainer_class.lua

diff --git a/nerv/examples/ptb/main.lua b/nerv/examples/ptb/main.lua
new file mode 100644
index 0000000..688716b
--- /dev/null
+++ b/nerv/examples/ptb/main.lua
@@ -0,0 +1,118 @@
+nerv.include('reader.lua')
+nerv.include('select_linear.lua')
+
+gconf = {
+    chunk_size = 5,
+    dropout_rate = 0,
+    lrate = 1.5,
+    wcost = 1e-5,
+    max_iter = 3,
+    clip = 5,
+    momentum = 0,
+    batch_size = 200,
+    test = true,
+}
+
+local hidden_size = 300
+local vocab_size = 10000
+local layer_num = 1
+local dropout_rate = 0.5
+local trainer = nerv.Trainer
+
+function trainer:make_layer_repo(param_repo)
+    local layers = {
+        ['nerv.LSTMLayer'] = {},
+        ['nerv.DropoutLayer'] = {},
+        ['nerv.SelectLinearLayer'] = {
+            ['select'] = {dim_in = {1}, dim_out = {hidden_size}, vocab = vocab_size, pr = param_repo},
+        },
+        ['nerv.AffineLayer'] = {
+            output = {dim_in = {hidden_size}, dim_out = {vocab_size}, pr = param_repo},
+        },
+        ['nerv.SoftmaxCELayer'] = {
+            softmax = {dim_in = {vocab_size, 1}, dim_out = {1}, compressed = true},
+        },
+    }
+    for i = 1, layer_num do
+        layers['nerv.LSTMLayer']['lstm' .. i] = {dim_in = {hidden_size}, dim_out = {hidden_size}, pr = param_repo}
+        layers['nerv.DropoutLayer']['dropout' .. i] = {dim_in = {hidden_size}, dim_out = {hidden_size}}
+    end
+    return nerv.LayerRepo(layers, param_repo, gconf)
+end
+
+function trainer:get_network(layer_repo)
+    local connections = {
+        {'<input>[1]', 'select[1]', 0},
+        {'select[1]', 'lstm1[1]', 0},
+        {'dropout' .. layer_num .. '[1]', 'output[1]', 0},
+        {'output[1]', 'softmax[1]', 0},
+        {'<input>[2]', 'softmax[2]', 0},
+        {'softmax[1]', '<output>[1]', 0},
+    }
+    for i = 1, layer_num do
+        table.insert(connections, {'lstm' .. i .. '[1]', 'dropout' .. i .. '[1]', 0})
+        if i > 1 then
+            table.insert(connections, {'dropout' .. (i - 1) .. '[1]', 'lstm' .. i .. '[1]', 0})
+        end
+    end
+    return nerv.GraphLayer('graph', gconf, {dim_in = {1, 1}, dim_out = {1}, layer_repo = layer_repo, connections = connections})
+end
+
+function trainer:get_input_order()
+    return {'input', 'label'}
+end
+
+function trainer:get_readers(dataset)
+    local data_path = 'nerv/nerv/examples/lmptb/PTBdata/'
+    local vocab_file = data_path .. 'vocab'
+    local train_file = data_path .. 'ptb.train.txt.adds'
+    local cv_file = data_path .. 'ptb.valid.txt.adds'
+    local test_file = data_path .. 'ptb.test.txt.adds'
+    local reader
+    if dataset == 'train' then
+        reader = nerv.Reader(vocab_file, train_file)
+    elseif dataset == 'validate' then
+        reader = nerv.Reader(vocab_file, cv_file)
+    elseif dataset == 'test' then
+        reader = nerv.Reader(vocab_file, test_file)
+    else
+        nerv.error('no such dataset')
+    end
+    return {{reader = reader, data = {input = 1, label = 1}}}
+end
+
+local total_err
+local total_frame
+
+function trainer:get_error()
+    return math.pow(10, -total_err / total_frame)
+end
+
+function trainer:epoch_preprocess(dataset, do_train)
+    if dataset == 'train' then
+        gconf.dropout_rate = dropout_rate
+        nerv.info('set dropout rate to %f', dropout_rate)
+    end
+    if dataset == 'validate' then
+        gconf.dropout_rate = 0
+        nerv.info('set dropout rate to 0')
+    end
+    if dataset == 'test' then
+        gconf.dropout_rate = 0
+        nerv.info('set dropout rate to 0')
+    end
+    total_err = 0
+    total_frame = 0
+end
+
+function trainer:mini_batch_middleprocess(cnt, info)
+    for t = 1, gconf.chunk_size do
+        local tmp = info.output[1][t]:new_to_host()
+        for i = 1, gconf.batch_size do
+            total_err = total_err + math.log10(math.exp(tmp[i - 1][0]))
+        end
+    end
+    for i = 1, gconf.batch_size do
+        total_frame = total_frame + info.seq_length[i]
+    end
+end
diff --git a/nerv/examples/ptb/reader.lua b/nerv/examples/ptb/reader.lua
new file mode 100644
index 0000000..70c0c97
--- /dev/null
+++ b/nerv/examples/ptb/reader.lua
@@ -0,0 +1,67 @@
+local Reader = nerv.class('nerv.Reader')
+
+function Reader:__init(vocab_file, input_file)
+    self:get_vocab(vocab_file)
+    self:get_seq(input_file)
+    self.offset = 1
+end
+
+function Reader:get_vocab(vocab_file)
+    local f = io.open(vocab_file, 'r')
+    local id = 0
+    self.vocab = {}
+    while true do
+        local word = f:read()
+        if word == nil then
+            break
+        end
+        self.vocab[word] = id
+        id = id + 1
+    end
+    self.size = id
+end
+
+function Reader:split(s, t)
+    local ret = {}
+    for x in (s .. t):gmatch('(.-)' .. t) do
+        table.insert(ret, x)
+    end
+    return ret
+end
+
+function Reader:get_seq(input_file)
+    local f = io.open(input_file, 'r')
+    self.seq = {}
+    -- while true do
+    for i = 1, 26 do
+        local seq = f:read()
+        if seq == nil then
+            break
+        end
+        seq = self:split(seq, ' ')
+        local tmp = {}
+        for i = 1, #seq do
+            if seq[i] ~= '' then
+                table.insert(tmp, self.vocab[seq[i]])
+            end
+        end
+        table.insert(self.seq, tmp)
+    end
+end
+
+function Reader:get_data()
+    if self.offset > #self.seq then
+        return nil
+    end
+    local tmp = self.seq[self.offset]
+    local res = {
+        input = nerv.MMatrixFloat(#tmp - 1, 1),
+        label = nerv.MMatrixFloat(#tmp - 1, 1),
+    }
+    for i = 1, #tmp - 1 do
+        res.input[i - 1][0] = tmp[i]
+        res.label[i - 1][0] = tmp[i + 1]
+    end
+    self.offset = self.offset + 1
+    return res
+end
diff --git a/nerv/examples/ptb/select_linear.lua b/nerv/examples/ptb/select_linear.lua
new file mode 100644
index 0000000..42778f8
--- /dev/null
+++ b/nerv/examples/ptb/select_linear.lua
@@ -0,0 +1,63 @@
+local SL = nerv.class('nerv.SelectLinearLayer', 'nerv.Layer')
+
+--id: string
+--global_conf: table
+--layer_conf: table
+--Get Parameters
+function SL:__init(id, global_conf, layer_conf)
+    nerv.Layer.__init(self, id, global_conf, layer_conf)
+
+    self.vocab = layer_conf.vocab
+
+    self:check_dim_len(1, 1)
+    self:bind_params()
+end
+
+function SL:bind_params()
+    self.ltp = self:find_param("ltp", self.lconf, self.gconf, nerv.LinearTransParam, {self.vocab, self.dim_out[1]}) --layer_conf.ltp
+end
+
+--Check parameter
+function SL:init(batch_size)
+    if (self.dim_in[1] ~= 1) then --one word id
+        nerv.error("mismatching dimensions of ltp and input")
+    end
+    if (self.dim_out[1] ~= self.ltp.trans:ncol()) then
+        nerv.error("mismatching dimensions of ltp and output")
+    end
+
+    self.batch_size = batch_size
+    self.ltp:train_init()
+end
+
+function SL:update()
+    --use this to produce reproducible results, don't forget to set the dropout to zero!
+    --for i = 1, input[1]:nrow(), 1 do
+    --    local word_vec = self.ltp.trans[input[1][i - 1][0]]
+    --    word_vec:add(word_vec, bp_err[1][i - 1], 1, - self.gconf.lrate / self.gconf.batch_size)
+    --end
+
+    --I tried the update_select_rows kernel which uses atomicAdd, but it generates unreproducible results
+    self.ltp:update_by_err_input()
+end
+
+function SL:propagate(input, output)
+    --for i = 0, input[1]:ncol() - 1, 1 do
+    --    if (input[1][0][i] > 0) then
+    --        output[1][i]:copy_fromd(self.ltp.trans[input[1][0][i]])
+    --    else
+    --        output[1][i]:fill(0)
+    --    end
+    --end
+    output[1]:copy_rows_fromd_by_colidx(self.ltp.trans, input[1])
+end
+
+function SL:back_propagate(bp_err, next_bp_err, input, output)
+    --input is compressed, do nothing
+    self.ltp:back_propagate_by_err_input(bp_err[1], input[1]:decompress(self.vocab))
+end
+
+function SL:get_params()
+    local paramRepo = nerv.ParamRepo({self.ltp}, self.loc_type)
+    return paramRepo
+end
diff --git a/nerv/examples/trainer.lua b/nerv/examples/trainer.lua
new file mode 100644
index 0000000..b691f5b
--- /dev/null
+++ b/nerv/examples/trainer.lua
@@ -0,0 +1,166 @@
+nerv.include('trainer_class.lua')
+
+require 'lfs'
+require 'pl'
+
+-- =======================================================
+-- Deal with command line input & init training environment
+-- =======================================================
+
+local function check_and_add_defaults(spec, opts)
+    local function get_opt_val(k)
+        local k = string.gsub(k, '_', '-')
+        return opts[k].val, opts[k].specified
+    end
+    local opt_v = get_opt_val("resume_from")
+    if opt_v then
+        nerv.info("resuming from previous training state")
+        gconf = dofile(opt_v)
+    else
+        for k, v in pairs(spec) do
+            local opt_v, specified = get_opt_val(k)
+            if (not specified) and gconf[k] ~= nil then
+                nerv.info("using setting in network config file: %s = %s", k, gconf[k])
+            elseif opt_v ~= nil then
+                nerv.info("using setting in options: %s = %s", k, opt_v)
+                gconf[k] = opt_v
+            end
+        end
+    end
+end
+
+local function make_options(spec)
+    local options = {}
+    for k, v in pairs(spec) do
+        table.insert(options,
+                     {string.gsub(k, '_', '-'), nil, type(v), default = v})
+    end
+    return options
+end
+
+local function print_help(options)
+    nerv.printf("Usage: [options] network_config.lua\n")
+    nerv.print_usage(options)
+end
+
+local function print_gconf()
+    local key_maxlen = 0
+    for k, v in pairs(gconf) do
+        key_maxlen = math.max(key_maxlen, #k or 0)
+    end
+    local function pattern_gen()
+        return string.format("%%-%ds = %%s\n", key_maxlen)
+    end
+    nerv.info("ready to train with the following gconf settings:")
+    nerv.printf(pattern_gen(), "Key", "Value")
+    for k, v in pairs(gconf) do
+        nerv.printf(pattern_gen(), k or "", v or "")
+    end
+end
+
+local function dump_gconf(fname)
+    local f = io.open(fname, "w")
+    f:write("return ")
+    f:write(table.tostring(gconf))
+    f:close()
+end
+
+local trainer_defaults = {
+    lrate = 0.8,
+    batch_size = 256,
+    chunk_size = 1,
+    buffer_size = 81920,
+    wcost = 1e-6,
+    momentum = 0.9,
+    cur_iter = 1,
+    max_iter = 20,
+    cumat_tname = "nerv.CuMatrixFloat",
+    mmat_tname = "nerv.MMatrixFloat",
+    trainer_tname = "nerv.Trainer",
+}
+
+local options = make_options(trainer_defaults)
+local extra_opt_spec = {
+    {"resume-from", nil, "string"},
+    {"help", "h", "boolean", default = false, desc = "show this help information"},
+    {"dir", nil, "string", desc = "specify the working directory"},
+}
+
+table.extend(options, extra_opt_spec)
+
+local opts
+arg, opts = nerv.parse_args(arg, options)
+
opts["help"].val then + print_help(options) + return +end + +local script = arg[1] +local script_arg = {} +for i = 2, #arg do + table.insert(script_arg, arg[i]) +end +arg = script_arg +dofile(script) + +--[[ + +Rule: command-line option overrides network config overrides trainer default. +Note: config key like aaa_bbbb_cc could be overriden by specifying +--aaa-bbbb-cc to command-line arguments. + +]]-- + +check_and_add_defaults(trainer_defaults, opts) +gconf.mmat_type = nerv.get_type(gconf.mmat_tname) +gconf.cumat_type = nerv.get_type(gconf.cumat_tname) +gconf.trainer = nerv.get_type(gconf.trainer_tname) +gconf.use_cpu = econf.use_cpu or false +if gconf.initialized_param == nil then + gconf.initialized_param = {} +end +if gconf.param_random == nil then + gconf.param_random = function() return math.random() / 5 - 0.1 end +end + +local date_pattern = "%Y-%m-%d_%H:%M:%S" +local logfile_name = "log" +local working_dir = opts["dir"].val or string.format("nerv_%s", os.date(date_pattern)) +gconf.working_dir = working_dir +gconf.date_pattern = date_pattern + +print_gconf() +if not lfs.mkdir(working_dir) then + nerv.error("[trainer] working directory already exists") +end + +-- copy the network config +dir.copyfile(script, working_dir) +-- set logfile path +nerv.set_logfile(path.join(working_dir, logfile_name)) + +-- ============= +-- main function +-- ============= + +local trainer = gconf.trainer(gconf) +trainer:training_preprocess() +gconf.best_cv = trainer:process('validate', false) +nerv.info("initial cross validation: %.3f", gconf.best_cv) + +for i = gconf.cur_iter, gconf.max_iter do + gconf.cur_iter = i + dump_gconf(path.join(working_dir, string.format("iter_%d.meta", i))) + nerv.info("[NN] begin iteration %d with lrate = %.6f", i, gconf.lrate) + local train_err = trainer:process('train', true) + nerv.info("[TR] training set %d: %.3f", i, train_err) + local cv_err = trainer:process('validate', false) + nerv.info("[CV] cross validation %d: %.3f", i, cv_err) + if gconf.test then + local test_err = trainer:process('test', false) + nerv.info('[TE] testset error %d: %.3f', i, test_err) + end + trainer:halving(train_err, cv_err) +end +trainer:training_afterprocess() diff --git a/nerv/examples/trainer_class.lua b/nerv/examples/trainer_class.lua new file mode 100644 index 0000000..4ae08d9 --- /dev/null +++ b/nerv/examples/trainer_class.lua @@ -0,0 +1,183 @@ +local trainer = nerv.class('nerv.Trainer') + +function trainer:__init(gconf) + self.gconf = gconf + local mat_type + self.src_loc_type = nerv.ParamRepo.LOC_TYPES.ON_HOST + local src_loc_type = self.src_loc_type + if gconf.use_cpu then + mat_type = gconf.mmat_type + self.train_loc_type = nerv.ParamRepo.LOC_TYPES.ON_HOST + else + mat_type = gconf.cumat_type + self.train_loc_type = nerv.ParamRepo.LOC_TYPES.ON_DEVICE + end + local train_loc_type = self.train_loc_type + + local host_param_repo = nerv.ParamRepo() + host_param_repo:import(gconf.initialized_param, gconf) + local param_repo = host_param_repo:copy(train_loc_type, gconf) + self.layer_repo = self:make_layer_repo(param_repo) + local layer_repo = self.layer_repo + local graph = self:get_network(layer_repo) + self.input_order = self:get_input_order() + + self.network = nerv.Network('network', gconf, {network = graph, clip = gconf.clip}) + local network = self.network + network:init(gconf.batch_size, gconf.chunk_size) + + local dim_in, dim_out = network.dim_in, network.dim_out + self.err_output = {} + local err_output = self.err_output + for i = 1, #dim_in do + err_output[i] = {} + local tmp = 
mat_type(gconf.batch_size, dim_in[i]) + for t = 1, gconf.chunk_size do + err_output[i][t] = tmp + end + end + self.output = {} + self.err_input = {} + local output = self.output + local err_input = self.err_input + for i = 1, #dim_out do + output[i] = {} + for t = 1, gconf.chunk_size do + output[i][t] = mat_type(gconf.batch_size, dim_out[i]) + end + err_input[i] = {} + local tmp = mat_type(gconf.batch_size, dim_out[i]) + tmp:fill(0) + for t = 1, gconf.chunk_size do + if dim_out[i] == 1 then + err_input[i][t] = gconf.mask[t] + else + err_input[i][t] = tmp + end + end + end +end + +function trainer:make_buffer(readers) + local gconf = self.gconf + if gconf.chunk_size == 1 then + return nerv.FrmBuffer(gconf, { + buffer_size = gconf.buffer_size, + batch_size = gconf.batch_size, + chunk_size = gconf.chunk_size, + randomize = gconf.randomize, + readers = readers, + use_gpu = true, + }) + else + return nerv.SeqBuffer(gconf, { + batch_size = gconf.batch_size, + chunk_size = gconf.chunk_size, + readers = readers, + }) + end +end + +function trainer:process(dataset, do_train) + self:epoch_preprocess(dataset, do_train) + local buffer = self:make_buffer(self:get_readers(dataset)) + local cnt = 0 + local network = self.network + local input_order = self.input_order + local output = self.output + local err_input = self.err_input + local err_output = self.err_output + network:epoch_init() + + while true do + local data = buffer:get_data() + if data == nil then + break + end + + cnt = cnt + 1 + local info = {input = {}, output = output, err_input = err_input, err_output = err_output, + do_train = do_train, seq_length = data.seq_length, new_seq = data.new_seq} + for i = 1, #network.dim_in do + info.input[i] = data.data[input_order[i]] + end + + self:mini_batch_preprocess(cnt, info) + network:mini_batch_init(info) + network:propagate() + self:mini_batch_middleprocess(cnt, info) + if do_train then + network:back_propagate() + network:update() + end + self:mini_batch_afterprocess(cnt, info) + + collectgarbage('collect') + end + + self:epoch_afterprocess(dataset, do_train) + return self:get_error() +end + +function trainer:halving(train_err, cv_err) + local gconf = self.gconf + local src_loc_type = self.src_loc_type + local train_loc_type = self.train_loc_type + local layer_repo = self.layer_repo + local param_fname = string.format('%s_iter_%d_lr%f_tr%.3f_cv%.3f.nerv', os.date(gconf.date_pattern), gconf.cur_iter, gconf.lrate, train_err, cv_err) + param_fname = path.join(gconf.working_dir, param_fname) + local network = self.network + local host_param_repo = network:get_params():copy(src_loc_type, gconf) + host_param_repo:export(param_fname) + + if cv_err < gconf.best_cv then + nerv.info("accepting the trained params") + gconf.best_cv = cv_err + gconf.initialized_param = {param_fname} + else + nerv.info("rejecting the trained params, rollback to the previous one") + file.move(param_fname, param_fname .. 
'.rejected') + host_param_repo = nerv.ParamRepo() + host_param_repo:import(gconf.initialized_param, gconf) + local param_repo = host_param_repo:copy(train_loc_type, gconf) + layer_repo:rebind(param_repo) + gconf.lrate = gconf.lrate * 0.5 + end +end + +function trainer:training_preprocess() +end + +function trainer:training_afterprocess() +end + +function trainer:epoch_preprocess(dataset, do_train) +end + +function trainer:epoch_afterprocess(dataset, do_train) +end + +function trainer:mini_batch_preprocess(cnt, info) +end + +function trainer:mini_batch_middleprocess(cnt, info) +end + +function trainer:mini_batch_afterprocess(cnt, info) +end + +function trainer:make_layer_repo(param_repo) + nerv.error_method_not_implemented() +end + +function trainer:get_network(layer_repo) + nerv.error_method_not_implemented() +end + +function trainer:get_readers(dataset) + nerv.error_method_not_implemented() +end + +function trainer:get_input_order() + nerv.error_method_not_implemented() +end -- cgit v1.2.3-70-g09d2 From 78643f5127d86b54894f46a64d9593cdf6048d51 Mon Sep 17 00:00:00 2001 From: Qi Liu Date: Thu, 31 Mar 2016 16:48:07 +0800 Subject: update general trainer --- nerv/Makefile | 2 +- nerv/examples/trainer.lua | 2 - nerv/examples/trainer_class.lua | 183 ---------------------------------------- nerv/nn/init.lua | 1 + nerv/nn/trainer.lua | 183 ++++++++++++++++++++++++++++++++++++++++ 5 files changed, 185 insertions(+), 186 deletions(-) delete mode 100644 nerv/examples/trainer_class.lua create mode 100644 nerv/nn/trainer.lua diff --git a/nerv/Makefile b/nerv/Makefile index a5e4f66..dde8fe7 100644 --- a/nerv/Makefile +++ b/nerv/Makefile @@ -43,7 +43,7 @@ LUA_LIBS := matrix/init.lua io/init.lua init.lua \ layer/window.lua layer/bias.lua layer/combiner.lua layer/mse.lua \ layer/elem_mul.lua layer/lstm.lua layer/lstm_gate.lua layer/dropout.lua layer/gru.lua \ layer/graph.lua layer/rnn.lua layer/duplicate.lua layer/identity.lua \ - nn/init.lua nn/layer_repo.lua nn/param_repo.lua nn/network.lua \ + nn/init.lua nn/layer_repo.lua nn/param_repo.lua nn/network.lua nn/trainer.lua\ io/frm_buffer.lua io/seq_buffer.lua INCLUDE := -I $(LUA_INCDIR) -DLUA_USE_APICHECK diff --git a/nerv/examples/trainer.lua b/nerv/examples/trainer.lua index b691f5b..7af628e 100644 --- a/nerv/examples/trainer.lua +++ b/nerv/examples/trainer.lua @@ -1,5 +1,3 @@ -nerv.include('trainer_class.lua') - require 'lfs' require 'pl' diff --git a/nerv/examples/trainer_class.lua b/nerv/examples/trainer_class.lua deleted file mode 100644 index 4ae08d9..0000000 --- a/nerv/examples/trainer_class.lua +++ /dev/null @@ -1,183 +0,0 @@ -local trainer = nerv.class('nerv.Trainer') - -function trainer:__init(gconf) - self.gconf = gconf - local mat_type - self.src_loc_type = nerv.ParamRepo.LOC_TYPES.ON_HOST - local src_loc_type = self.src_loc_type - if gconf.use_cpu then - mat_type = gconf.mmat_type - self.train_loc_type = nerv.ParamRepo.LOC_TYPES.ON_HOST - else - mat_type = gconf.cumat_type - self.train_loc_type = nerv.ParamRepo.LOC_TYPES.ON_DEVICE - end - local train_loc_type = self.train_loc_type - - local host_param_repo = nerv.ParamRepo() - host_param_repo:import(gconf.initialized_param, gconf) - local param_repo = host_param_repo:copy(train_loc_type, gconf) - self.layer_repo = self:make_layer_repo(param_repo) - local layer_repo = self.layer_repo - local graph = self:get_network(layer_repo) - self.input_order = self:get_input_order() - - self.network = nerv.Network('network', gconf, {network = graph, clip = gconf.clip}) - local network = self.network - 
network:init(gconf.batch_size, gconf.chunk_size) - - local dim_in, dim_out = network.dim_in, network.dim_out - self.err_output = {} - local err_output = self.err_output - for i = 1, #dim_in do - err_output[i] = {} - local tmp = mat_type(gconf.batch_size, dim_in[i]) - for t = 1, gconf.chunk_size do - err_output[i][t] = tmp - end - end - self.output = {} - self.err_input = {} - local output = self.output - local err_input = self.err_input - for i = 1, #dim_out do - output[i] = {} - for t = 1, gconf.chunk_size do - output[i][t] = mat_type(gconf.batch_size, dim_out[i]) - end - err_input[i] = {} - local tmp = mat_type(gconf.batch_size, dim_out[i]) - tmp:fill(0) - for t = 1, gconf.chunk_size do - if dim_out[i] == 1 then - err_input[i][t] = gconf.mask[t] - else - err_input[i][t] = tmp - end - end - end -end - -function trainer:make_buffer(readers) - local gconf = self.gconf - if gconf.chunk_size == 1 then - return nerv.FrmBuffer(gconf, { - buffer_size = gconf.buffer_size, - batch_size = gconf.batch_size, - chunk_size = gconf.chunk_size, - randomize = gconf.randomize, - readers = readers, - use_gpu = true, - }) - else - return nerv.SeqBuffer(gconf, { - batch_size = gconf.batch_size, - chunk_size = gconf.chunk_size, - readers = readers, - }) - end -end - -function trainer:process(dataset, do_train) - self:epoch_preprocess(dataset, do_train) - local buffer = self:make_buffer(self:get_readers(dataset)) - local cnt = 0 - local network = self.network - local input_order = self.input_order - local output = self.output - local err_input = self.err_input - local err_output = self.err_output - network:epoch_init() - - while true do - local data = buffer:get_data() - if data == nil then - break - end - - cnt = cnt + 1 - local info = {input = {}, output = output, err_input = err_input, err_output = err_output, - do_train = do_train, seq_length = data.seq_length, new_seq = data.new_seq} - for i = 1, #network.dim_in do - info.input[i] = data.data[input_order[i]] - end - - self:mini_batch_preprocess(cnt, info) - network:mini_batch_init(info) - network:propagate() - self:mini_batch_middleprocess(cnt, info) - if do_train then - network:back_propagate() - network:update() - end - self:mini_batch_afterprocess(cnt, info) - - collectgarbage('collect') - end - - self:epoch_afterprocess(dataset, do_train) - return self:get_error() -end - -function trainer:halving(train_err, cv_err) - local gconf = self.gconf - local src_loc_type = self.src_loc_type - local train_loc_type = self.train_loc_type - local layer_repo = self.layer_repo - local param_fname = string.format('%s_iter_%d_lr%f_tr%.3f_cv%.3f.nerv', os.date(gconf.date_pattern), gconf.cur_iter, gconf.lrate, train_err, cv_err) - param_fname = path.join(gconf.working_dir, param_fname) - local network = self.network - local host_param_repo = network:get_params():copy(src_loc_type, gconf) - host_param_repo:export(param_fname) - - if cv_err < gconf.best_cv then - nerv.info("accepting the trained params") - gconf.best_cv = cv_err - gconf.initialized_param = {param_fname} - else - nerv.info("rejecting the trained params, rollback to the previous one") - file.move(param_fname, param_fname .. 
'.rejected') - host_param_repo = nerv.ParamRepo() - host_param_repo:import(gconf.initialized_param, gconf) - local param_repo = host_param_repo:copy(train_loc_type, gconf) - layer_repo:rebind(param_repo) - gconf.lrate = gconf.lrate * 0.5 - end -end - -function trainer:training_preprocess() -end - -function trainer:training_afterprocess() -end - -function trainer:epoch_preprocess(dataset, do_train) -end - -function trainer:epoch_afterprocess(dataset, do_train) -end - -function trainer:mini_batch_preprocess(cnt, info) -end - -function trainer:mini_batch_middleprocess(cnt, info) -end - -function trainer:mini_batch_afterprocess(cnt, info) -end - -function trainer:make_layer_repo(param_repo) - nerv.error_method_not_implemented() -end - -function trainer:get_network(layer_repo) - nerv.error_method_not_implemented() -end - -function trainer:get_readers(dataset) - nerv.error_method_not_implemented() -end - -function trainer:get_input_order() - nerv.error_method_not_implemented() -end diff --git a/nerv/nn/init.lua b/nerv/nn/init.lua index 1037d05..34b05cf 100644 --- a/nerv/nn/init.lua +++ b/nerv/nn/init.lua @@ -1,3 +1,4 @@ nerv.include('layer_repo.lua') nerv.include('param_repo.lua') nerv.include('network.lua') +nerv.include('trainer.lua') diff --git a/nerv/nn/trainer.lua b/nerv/nn/trainer.lua new file mode 100644 index 0000000..4ae08d9 --- /dev/null +++ b/nerv/nn/trainer.lua @@ -0,0 +1,183 @@ +local trainer = nerv.class('nerv.Trainer') + +function trainer:__init(gconf) + self.gconf = gconf + local mat_type + self.src_loc_type = nerv.ParamRepo.LOC_TYPES.ON_HOST + local src_loc_type = self.src_loc_type + if gconf.use_cpu then + mat_type = gconf.mmat_type + self.train_loc_type = nerv.ParamRepo.LOC_TYPES.ON_HOST + else + mat_type = gconf.cumat_type + self.train_loc_type = nerv.ParamRepo.LOC_TYPES.ON_DEVICE + end + local train_loc_type = self.train_loc_type + + local host_param_repo = nerv.ParamRepo() + host_param_repo:import(gconf.initialized_param, gconf) + local param_repo = host_param_repo:copy(train_loc_type, gconf) + self.layer_repo = self:make_layer_repo(param_repo) + local layer_repo = self.layer_repo + local graph = self:get_network(layer_repo) + self.input_order = self:get_input_order() + + self.network = nerv.Network('network', gconf, {network = graph, clip = gconf.clip}) + local network = self.network + network:init(gconf.batch_size, gconf.chunk_size) + + local dim_in, dim_out = network.dim_in, network.dim_out + self.err_output = {} + local err_output = self.err_output + for i = 1, #dim_in do + err_output[i] = {} + local tmp = mat_type(gconf.batch_size, dim_in[i]) + for t = 1, gconf.chunk_size do + err_output[i][t] = tmp + end + end + self.output = {} + self.err_input = {} + local output = self.output + local err_input = self.err_input + for i = 1, #dim_out do + output[i] = {} + for t = 1, gconf.chunk_size do + output[i][t] = mat_type(gconf.batch_size, dim_out[i]) + end + err_input[i] = {} + local tmp = mat_type(gconf.batch_size, dim_out[i]) + tmp:fill(0) + for t = 1, gconf.chunk_size do + if dim_out[i] == 1 then + err_input[i][t] = gconf.mask[t] + else + err_input[i][t] = tmp + end + end + end +end + +function trainer:make_buffer(readers) + local gconf = self.gconf + if gconf.chunk_size == 1 then + return nerv.FrmBuffer(gconf, { + buffer_size = gconf.buffer_size, + batch_size = gconf.batch_size, + chunk_size = gconf.chunk_size, + randomize = gconf.randomize, + readers = readers, + use_gpu = true, + }) + else + return nerv.SeqBuffer(gconf, { + batch_size = gconf.batch_size, + chunk_size = 
gconf.chunk_size, + readers = readers, + }) + end +end + +function trainer:process(dataset, do_train) + self:epoch_preprocess(dataset, do_train) + local buffer = self:make_buffer(self:get_readers(dataset)) + local cnt = 0 + local network = self.network + local input_order = self.input_order + local output = self.output + local err_input = self.err_input + local err_output = self.err_output + network:epoch_init() + + while true do + local data = buffer:get_data() + if data == nil then + break + end + + cnt = cnt + 1 + local info = {input = {}, output = output, err_input = err_input, err_output = err_output, + do_train = do_train, seq_length = data.seq_length, new_seq = data.new_seq} + for i = 1, #network.dim_in do + info.input[i] = data.data[input_order[i]] + end + + self:mini_batch_preprocess(cnt, info) + network:mini_batch_init(info) + network:propagate() + self:mini_batch_middleprocess(cnt, info) + if do_train then + network:back_propagate() + network:update() + end + self:mini_batch_afterprocess(cnt, info) + + collectgarbage('collect') + end + + self:epoch_afterprocess(dataset, do_train) + return self:get_error() +end + +function trainer:halving(train_err, cv_err) + local gconf = self.gconf + local src_loc_type = self.src_loc_type + local train_loc_type = self.train_loc_type + local layer_repo = self.layer_repo + local param_fname = string.format('%s_iter_%d_lr%f_tr%.3f_cv%.3f.nerv', os.date(gconf.date_pattern), gconf.cur_iter, gconf.lrate, train_err, cv_err) + param_fname = path.join(gconf.working_dir, param_fname) + local network = self.network + local host_param_repo = network:get_params():copy(src_loc_type, gconf) + host_param_repo:export(param_fname) + + if cv_err < gconf.best_cv then + nerv.info("accepting the trained params") + gconf.best_cv = cv_err + gconf.initialized_param = {param_fname} + else + nerv.info("rejecting the trained params, rollback to the previous one") + file.move(param_fname, param_fname .. 
'.rejected') + host_param_repo = nerv.ParamRepo() + host_param_repo:import(gconf.initialized_param, gconf) + local param_repo = host_param_repo:copy(train_loc_type, gconf) + layer_repo:rebind(param_repo) + gconf.lrate = gconf.lrate * 0.5 + end +end + +function trainer:training_preprocess() +end + +function trainer:training_afterprocess() +end + +function trainer:epoch_preprocess(dataset, do_train) +end + +function trainer:epoch_afterprocess(dataset, do_train) +end + +function trainer:mini_batch_preprocess(cnt, info) +end + +function trainer:mini_batch_middleprocess(cnt, info) +end + +function trainer:mini_batch_afterprocess(cnt, info) +end + +function trainer:make_layer_repo(param_repo) + nerv.error_method_not_implemented() +end + +function trainer:get_network(layer_repo) + nerv.error_method_not_implemented() +end + +function trainer:get_readers(dataset) + nerv.error_method_not_implemented() +end + +function trainer:get_input_order() + nerv.error_method_not_implemented() +end -- cgit v1.2.3-70-g09d2 From c5d07f0fc076d6cece255cf2372df9ef7c63126b Mon Sep 17 00:00:00 2001 From: Qi Liu Date: Thu, 31 Mar 2016 16:56:26 +0800 Subject: change ptb example --- nerv/examples/ptb/main.lua | 6 +++--- nerv/examples/ptb/reader.lua | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/nerv/examples/ptb/main.lua b/nerv/examples/ptb/main.lua index 688716b..5d1a326 100644 --- a/nerv/examples/ptb/main.lua +++ b/nerv/examples/ptb/main.lua @@ -2,13 +2,13 @@ nerv.include('reader.lua') nerv.include('select_linear.lua') gconf = { - chunk_size = 5, + chunk_size = 15, dropout_rate = 0, lrate = 1.5, wcost = 1e-5, - max_iter = 3, + max_iter = 35, clip = 5, - momentum = 0, + momentum = 0.9, batch_size = 200, test = true, } diff --git a/nerv/examples/ptb/reader.lua b/nerv/examples/ptb/reader.lua index 70c0c97..76a78cf 100644 --- a/nerv/examples/ptb/reader.lua +++ b/nerv/examples/ptb/reader.lua @@ -32,8 +32,8 @@ end function Reader:get_seq(input_file) local f = io.open(input_file, 'r') self.seq = {} - -- while true do - for i = 1, 26 do + while true do + -- for i = 1, 26 do local seq = f:read() if seq == nil then break -- cgit v1.2.3-70-g09d2 From 74d6956dc79b387289d911d9cbea5b7245405b62 Mon Sep 17 00:00:00 2001 From: Qi Liu Date: Thu, 31 Mar 2016 20:30:44 +0800 Subject: add randomize into default setting --- nerv/examples/trainer.lua | 1 + 1 file changed, 1 insertion(+) diff --git a/nerv/examples/trainer.lua b/nerv/examples/trainer.lua index 7af628e..783ff1d 100644 --- a/nerv/examples/trainer.lua +++ b/nerv/examples/trainer.lua @@ -72,6 +72,7 @@ local trainer_defaults = { momentum = 0.9, cur_iter = 1, max_iter = 20, + randomize = true, cumat_tname = "nerv.CuMatrixFloat", mmat_tname = "nerv.MMatrixFloat", trainer_tname = "nerv.Trainer", -- cgit v1.2.3-70-g09d2
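
For reference, a network config consumed by the new trainer front-end is an ordinary Lua script that defines a global gconf table and overrides the abstract nerv.Trainer methods, exactly as nerv/examples/ptb/main.lua does above. A minimal sketch follows; the file name mynet.lua and every value in it are illustrative assumptions, not part of this patch series:

-- mynet.lua: hypothetical skeleton config for the generic trainer front-end
gconf = {
    lrate = 1.0,
    batch_size = 128,
    chunk_size = 1,
    max_iter = 10,
    initialized_param = {},        -- or a list of .nerv parameter files to start from
}

local trainer = nerv.Trainer       -- methods defined below override the stubs in nn/trainer.lua

function trainer:make_layer_repo(param_repo)
    -- build and return a nerv.LayerRepo describing the layers of the model
end

function trainer:get_network(layer_repo)
    -- return a nerv.GraphLayer that wires the layers together
end

function trainer:get_input_order()
    return {'input', 'label'}      -- must match the field names produced by the readers
end

function trainer:get_readers(dataset)
    -- dataset is 'train', 'validate' or 'test';
    -- return {{reader = some_reader, data = {input = 1, label = 1}}}
end

function trainer:get_error()
    -- return the per-epoch error figure that trainer:process() reports
end

The config is passed to the front-end as its first positional argument, and any key in trainer_defaults can be overridden on the command line with underscores written as dashes (e.g. a --batch-size option), as described in the Rule comment inside trainer.lua.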