aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorTed Yin <[email protected]>2016-03-31 20:44:17 +0800
committerTed Yin <[email protected]>2016-03-31 20:44:17 +0800
commit7829db926609d3e0498349e1a09634531244e0e5 (patch)
tree08b2efe303b3aa8da3199641106419641296d8fc
parent89d57b6fae6bcb0195a73fb97ab6870ee0d0ce20 (diff)
parent74d6956dc79b387289d911d9cbea5b7245405b62 (diff)
Merge pull request #37 from liuq901/master
add general trainer
-rw-r--r--nerv/Makefile2
-rw-r--r--nerv/examples/ptb/main.lua118
-rw-r--r--nerv/examples/ptb/reader.lua67
-rw-r--r--nerv/examples/ptb/select_linear.lua63
-rw-r--r--nerv/examples/trainer.lua165
-rw-r--r--nerv/nn/init.lua1
-rw-r--r--nerv/nn/trainer.lua183
7 files changed, 598 insertions, 1 deletions
diff --git a/nerv/Makefile b/nerv/Makefile
index a5e4f66..dde8fe7 100644
--- a/nerv/Makefile
+++ b/nerv/Makefile
@@ -43,7 +43,7 @@ LUA_LIBS := matrix/init.lua io/init.lua init.lua \
layer/window.lua layer/bias.lua layer/combiner.lua layer/mse.lua \
layer/elem_mul.lua layer/lstm.lua layer/lstm_gate.lua layer/dropout.lua layer/gru.lua \
layer/graph.lua layer/rnn.lua layer/duplicate.lua layer/identity.lua \
- nn/init.lua nn/layer_repo.lua nn/param_repo.lua nn/network.lua \
+ nn/init.lua nn/layer_repo.lua nn/param_repo.lua nn/network.lua nn/trainer.lua\
io/frm_buffer.lua io/seq_buffer.lua
INCLUDE := -I $(LUA_INCDIR) -DLUA_USE_APICHECK
diff --git a/nerv/examples/ptb/main.lua b/nerv/examples/ptb/main.lua
new file mode 100644
index 0000000..5d1a326
--- /dev/null
+++ b/nerv/examples/ptb/main.lua
@@ -0,0 +1,118 @@
+nerv.include('reader.lua')
+nerv.include('select_linear.lua')
+
+-- Training configuration for the PTB example. `gconf` is intentionally a
+-- global: the methods below read it directly, and epoch_preprocess rewrites
+-- gconf.dropout_rate at the start of every epoch.
+-- NOTE(review): presumably the script that loads this file also reads `gconf`
+-- after loading -- confirm against the launcher.
+gconf = {
+ chunk_size = 15,
+ dropout_rate = 0,
+ lrate = 1.5,
+ wcost = 1e-5,
+ max_iter = 35,
+ clip = 5,
+ momentum = 0.9,
+ batch_size = 200,
+ test = true,
+}
+
+-- Model hyper-parameters shared by make_layer_repo / get_network below.
+local hidden_size = 300
+local vocab_size = 10000
+local layer_num = 1
+-- Dropout rate applied while training; epoch_preprocess copies it into
+-- gconf.dropout_rate for the 'train' dataset and uses 0 for validate/test.
+local dropout_rate = 0.5
+-- The methods defined below are attached to nerv.Trainer itself (this is an
+-- alias, not a subclass).
+local trainer = nerv.Trainer
+
+--- Build the layer repository for the PTB language model.
+-- Declares the `select` embedding layer, `layer_num` LSTM/dropout pairs, the
+-- affine `output` layer and the compressed softmax cross-entropy layer.
+-- @param param_repo parameter repository passed to the parameterised layers
+-- @return the nerv.LayerRepo built from the specification
+function trainer:make_layer_repo(param_repo)
+    local spec = {
+        ['nerv.LSTMLayer'] = {},
+        ['nerv.DropoutLayer'] = {},
+        ['nerv.SelectLinearLayer'] = {
+            ['select'] = {dim_in = {1}, dim_out = {hidden_size}, vocab = vocab_size, pr = param_repo},
+        },
+        ['nerv.AffineLayer'] = {
+            output = {dim_in = {hidden_size}, dim_out = {vocab_size}, pr = param_repo},
+        },
+        ['nerv.SoftmaxCELayer'] = {
+            softmax = {dim_in = {vocab_size, 1}, dim_out = {1}, compressed = true},
+        },
+    }
+    -- one LSTM layer and one dropout layer per stacked level
+    local lstm_spec = spec['nerv.LSTMLayer']
+    local dropout_spec = spec['nerv.DropoutLayer']
+    for level = 1, layer_num do
+        lstm_spec['lstm' .. level] = {dim_in = {hidden_size}, dim_out = {hidden_size}, pr = param_repo}
+        dropout_spec['dropout' .. level] = {dim_in = {hidden_size}, dim_out = {hidden_size}}
+    end
+    return nerv.LayerRepo(spec, param_repo, gconf)
+end
+
+--- Wire the layers of the PTB language model into a graph.
+-- Data flow: <input>[1] -> select -> lstm1 -> dropout1 -> ... -> lstmN ->
+-- dropoutN -> output -> softmax -> <output>[1], with <input>[2] feeding the
+-- second port of softmax.
+-- @param layer_repo layer repository produced by make_layer_repo
+-- @return the nerv.GraphLayer describing the network
+function trainer:get_network(layer_repo)
+    local connections = {
+        {'<input>[1]', 'select[1]', 0},
+        {'select[1]', 'lstm1[1]', 0},
+        {'dropout' .. layer_num .. '[1]', 'output[1]', 0},
+        {'output[1]', 'softmax[1]', 0},
+        {'<input>[2]', 'softmax[2]', 0},
+        {'softmax[1]', '<output>[1]', 0},
+    }
+    for i = 1, layer_num do
+        table.insert(connections, {'lstm' .. i .. '[1]', 'dropout' .. i .. '[1]', 0})
+        -- Feed every LSTM above the first from the dropout layer below it.
+        -- The condition used to be `i < 1`, which is never true inside this
+        -- loop, so stacked layers were left without an input connection.
+        if i > 1 then
+            table.insert(connections, {'dropout' .. (i - 1) .. '[1]', 'lstm' .. i .. '[1]', 0})
+        end
+    end
+    return nerv.GraphLayer('graph', gconf, {dim_in = {1, 1}, dim_out = {1}, layer_repo = layer_repo, connections = connections})
+end
+
+--- Names of the reader fields, in the order they feed the network inputs.
+-- @return a list with 'input' first and 'label' second
+function trainer:get_input_order()
+    local order = {'input', 'label'}
+    return order
+end
+
+--- Create the reader list for one dataset.
+-- @param dataset 'train', 'validate' or 'test'; anything else is reported
+--        through nerv.error
+-- @return a one-element list describing the reader and the fields it provides
+function trainer:get_readers(dataset)
+    local data_path = 'nerv/nerv/examples/lmptb/PTBdata/'
+    -- corpus file for each supported dataset name
+    local corpus = {
+        train = 'ptb.train.txt.adds',
+        validate = 'ptb.valid.txt.adds',
+        test = 'ptb.test.txt.adds',
+    }
+    local reader
+    local fname = corpus[dataset]
+    if fname == nil then
+        nerv.error('no such dataset')
+    else
+        reader = nerv.Reader(data_path .. 'vocab', data_path .. fname)
+    end
+    return {{reader = reader, data = {input = 1, label = 1}}}
+end
+
+-- Per-epoch accumulators: reset in epoch_preprocess, filled in
+-- mini_batch_middleprocess and consumed by get_error.
+local total_err, total_frame
+
+--- Error figure for the epoch that just finished.
+-- @return 10 raised to the negated average of total_err per frame
+function trainer:get_error()
+    local avg = -total_err / total_frame
+    return math.pow(10, avg)
+end
+
+--- Prepare an epoch: choose the dropout rate and reset the accumulators.
+-- Dropout is enabled only for the 'train' dataset.
+-- @param dataset name of the dataset about to be processed
+-- @param do_train unused here
+function trainer:epoch_preprocess(dataset, do_train)
+    if dataset == 'train' then
+        gconf.dropout_rate = dropout_rate
+        nerv.info('set dropout rate to %f', dropout_rate)
+    elseif dataset == 'validate' or dataset == 'test' then
+        gconf.dropout_rate = 0
+        nerv.info('set dropout rate to 0')
+    end
+    total_err, total_frame = 0, 0
+end
+
+--- Accumulate the error and the frame count of one mini-batch.
+-- Runs after propagation; every entry of the first network output is
+-- converted from a natural-log value to base 10 and added to total_err.
+-- @param cnt index of the mini-batch (unused)
+-- @param info mini-batch record; reads info.output and info.seq_length
+function trainer:mini_batch_middleprocess(cnt, info)
+    local batch_size = gconf.batch_size
+    for t = 1, gconf.chunk_size do
+        local host = info.output[1][t]:new_to_host()
+        -- matrix rows are indexed from 0
+        for row = 0, batch_size - 1 do
+            total_err = total_err + math.log10(math.exp(host[row][0]))
+        end
+    end
+    for stream = 1, batch_size do
+        total_frame = total_frame + info.seq_length[stream]
+    end
+end
diff --git a/nerv/examples/ptb/reader.lua b/nerv/examples/ptb/reader.lua
new file mode 100644
index 0000000..76a78cf
--- /dev/null
+++ b/nerv/examples/ptb/reader.lua
@@ -0,0 +1,67 @@
+-- Reader that serves the sentences of a text corpus as word-id sequences.
+local Reader = nerv.class('nerv.Reader')
+
+-- Load the vocabulary, then the whole corpus, and start serving from the
+-- first sentence.
+-- vocab_file: path of the vocabulary file (one word per line)
+-- input_file: path of the corpus file (one sentence per line)
+function Reader:__init(vocab_file, input_file)
+ self:get_vocab(vocab_file)
+ self:get_seq(input_file)
+ -- 1-based index of the next sentence returned by get_data
+ self.offset = 1
+end
+
+--- Load the vocabulary: one word per line, ids assigned from 0 in file order.
+-- Fills self.vocab (word -> id) and self.size (number of lines read).
+-- @param vocab_file path of the vocabulary file
+-- Raises an error naming the file when it cannot be opened.
+function Reader:get_vocab(vocab_file)
+    -- assert turns a failed open into a readable error instead of a later
+    -- "attempt to index a nil value"
+    local f = assert(io.open(vocab_file, 'r'))
+    local id = 0
+    self.vocab = {}
+    for word in f:lines() do
+        self.vocab[word] = id
+        id = id + 1
+    end
+    -- release the handle; it used to be left open
+    f:close()
+    self.size = id
+end
+
+--- Split a string on a separator.
+-- The separator is used as a Lua pattern, not as plain text. Empty fields are
+-- kept in the result.
+-- @param s string to split
+-- @param t separator pattern
+-- @return list of the fields of s
+function Reader:split(s, t)
+    local fields = {}
+    -- a trailing separator lets the lazy capture also match the last field
+    local terminated = s .. t
+    local field_pattern = '(.-)' .. t
+    for field in terminated:gmatch(field_pattern) do
+        fields[#fields + 1] = field
+    end
+    return fields
+end
+
+--- Load the corpus: one sentence per line, words separated by single spaces.
+-- Fills self.seq with one list of word ids per line, looked up in self.vocab.
+-- @param input_file path of the corpus file
+-- Raises an error naming the file when it cannot be opened.
+function Reader:get_seq(input_file)
+    -- assert turns a failed open into a readable error
+    local f = assert(io.open(input_file, 'r'))
+    self.seq = {}
+    for line in f:lines() do
+        local words = self:split(line, ' ')
+        local ids = {}
+        for i = 1, #words do
+            -- consecutive spaces produce empty fields; skip them
+            if words[i] ~= '' then
+                table.insert(ids, self.vocab[words[i]])
+            end
+        end
+        table.insert(self.seq, ids)
+    end
+    -- release the handle; it used to be left open
+    f:close()
+end
+
+--- Return the next sentence as an (input, label) pair of column matrices.
+-- Row r of `input` holds word r of the sentence and row r of `label` holds
+-- the word that follows it, so both have one row fewer than the sentence.
+-- @return table with `input` and `label`, or nil once every sentence is served
+function Reader:get_data()
+    local pos = self.offset
+    if pos > #self.seq then
+        return nil
+    end
+    local words = self.seq[pos]
+    local rows = #words - 1
+    local input = nerv.MMatrixFloat(rows, 1)
+    local label = nerv.MMatrixFloat(rows, 1)
+    -- matrix rows are 0-based, the word list is 1-based
+    for row = 0, rows - 1 do
+        input[row][0] = words[row + 1]
+        label[row][0] = words[row + 2]
+    end
+    self.offset = pos + 1
+    return {input = input, label = label}
+end
diff --git a/nerv/examples/ptb/select_linear.lua b/nerv/examples/ptb/select_linear.lua
new file mode 100644
index 0000000..42778f8
--- /dev/null
+++ b/nerv/examples/ptb/select_linear.lua
@@ -0,0 +1,63 @@
+-- Layer that selects rows of a linear-transform parameter by word id.
+local SL = nerv.class('nerv.SelectLinearLayer', 'nerv.Layer')
+
+-- Construct the layer and bind its parameter.
+--id: string
+--global_conf: table
+--layer_conf: table; layer_conf.vocab gives the number of rows of the
+--            parameter created in bind_params
+function SL:__init(id, global_conf, layer_conf)
+ nerv.Layer.__init(self, id, global_conf, layer_conf)
+
+ self.vocab = layer_conf.vocab
+
+ -- NOTE(review): presumably asserts exactly one input and one output port --
+ -- confirm against nerv.Layer
+ self:check_dim_len(1, 1)
+ self:bind_params()
+end
+
+--- Look up (or create) the "ltp" parameter with one row per vocabulary entry
+-- and dim_out[1] columns, and store it in self.ltp.
+function SL:bind_params()
+    local shape = {self.vocab, self.dim_out[1]}
+    self.ltp = self:find_param("ltp", self.lconf, self.gconf, nerv.LinearTransParam, shape) --layer_conf.ltp
+end
+
+--- Validate the layer dimensions and prepare the parameter for training.
+-- @param batch_size number of rows per mini-batch, stored in self.batch_size
+-- Reports through nerv.error when the input is not a single word id or when
+-- the output width differs from the parameter's column count.
+function SL:init(batch_size)
+    if (self.dim_in[1] ~= 1) then --one word id
+        nerv.error("mismatching dimensions of ltp and input")
+    end
+    if (self.dim_out[1] ~= self.ltp.trans:ncol()) then
+        nerv.error("mismatching dimensions of bp and output")
+    end
+
+    -- was `bath_size`: an undefined global, so batch_size was always nil
+    self.batch_size = batch_size
+    self.ltp:train_init()
+end
+
+-- Apply the pending update to the parameter by delegating to
+-- ltp:update_by_err_input().
+function SL:update()
+ --use this to produce reproducible results, don't forget to set the dropout to zero!
+ --for i = 1, input[1]:nrow(), 1 do
+ -- local word_vec = self.ltp.trans[input[1][i - 1][0]]
+ -- word_vec:add(word_vec, bp_err[1][i - 1], 1, - self.gconf.lrate / self.gconf.batch_size)
+ --end
+
+ --I tried the update_select_rows kernel which uses atomicAdd, but it generates irreproducible results
+ self.ltp:update_by_err_input()
+end
+
+-- Forward pass: fill output[1] with rows of self.ltp.trans chosen by input[1].
+-- NOTE(review): presumably input[1] holds one word id per batch row -- confirm
+-- against copy_rows_fromd_by_colidx.
+-- The commented-out loop below is an earlier row-by-row version, kept for
+-- reference.
+function SL:propagate(input, output)
+ --for i = 0, input[1]:ncol() - 1, 1 do
+ -- if (input[1][0][i] > 0) then
+ -- output[1][i]:copy_fromd(self.ltp.trans[input[1][0][i]])
+ -- else
+ -- output[1][i]:fill(0)
+ -- end
+ --end
+ output[1]:copy_rows_fromd_by_colidx(self.ltp.trans, input[1])
+end
+
+-- Backward pass: hand the incoming error and the decompressed input to the
+-- parameter. next_bp_err is never written, so no error is passed further
+-- back from this layer.
+function SL:back_propagate(bp_err, next_bp_err, input, output)
+ --input is compressed (word ids); decompress it to self.vocab columns for the parameter
+ self.ltp:back_propagate_by_err_input(bp_err[1], input[1]:decompress(self.vocab))
+end
+
+--- Expose the layer's single parameter.
+-- @return a nerv.ParamRepo holding self.ltp
+function SL:get_params()
+    return nerv.ParamRepo({self.ltp}, self.loc_type)
+end
diff --git a/nerv/examples/trainer.lua b/nerv/examples/trainer.lua
new file mode 100644
index 0000000..783ff1d
--- /dev/null
+++ b/nerv/examples/trainer.lua
@@ -0,0 +1,165 @@
+require 'lfs'
+require 'pl'
+
+-- =======================================================
+-- Deal with command line input & init training environment
+-- =======================================================
+
+--- Merge command-line options into the global gconf.
+-- With --resume-from, gconf is replaced by the table returned from that file.
+-- Otherwise, for every key of `spec`: a value already present in gconf is
+-- kept unless the option was given explicitly on the command line.
+-- @param spec table whose keys are the configuration names to consider
+-- @param opts parsed options, indexed by dashed name
+local function check_and_add_defaults(spec, opts)
+    -- fetch the parsed option for a config key ('a_b' is stored as 'a-b')
+    local function lookup(key)
+        local entry = opts[(string.gsub(key, '_', '-'))]
+        return entry.val, entry.specified
+    end
+    local resume_file = lookup("resume_from")
+    if resume_file then
+        nerv.info("resuming from previous training state")
+        gconf = dofile(resume_file)
+        return
+    end
+    for key in pairs(spec) do
+        local value, given = lookup(key)
+        if (not given) and gconf[key] ~= nil then
+            nerv.info("using setting in network config file: %s = %s", key, gconf[key])
+        elseif value ~= nil then
+            nerv.info("using setting in options: %s = %s", key, value)
+            gconf[key] = value
+        end
+    end
+end
+
+--- Turn a table of defaults into a list of option descriptors.
+-- Each key becomes an option whose name uses '-' instead of '_', with no
+-- short name, typed after its default value.
+-- @param spec table mapping configuration names to default values
+-- @return list of descriptors {name, nil, type, default = value}
+local function make_options(spec)
+    local result = {}
+    for key, default in pairs(spec) do
+        -- keep only the first result of gsub (the rewritten name)
+        local name = string.gsub(key, '_', '-')
+        result[#result + 1] = {name, nil, type(default), default = default}
+    end
+    return result
+end
+
+--- Print the usage line followed by the description of every option.
+-- @param options option descriptors passed on to nerv.print_usage
+local function print_help(options)
+    -- the usage line used to name asr_trainer.lua; this script is trainer.lua
+    nerv.printf("Usage: <trainer.lua> [options] network_config.lua\n")
+    nerv.print_usage(options)
+end
+
+--- Print every gconf entry as an aligned "key = value" table.
+-- Keys and values are passed through tostring, so settings that are `false`
+-- are shown as "false" (they used to be printed as an empty string).
+local function print_gconf()
+    local key_maxlen = 0
+    for k in pairs(gconf) do
+        key_maxlen = math.max(key_maxlen, #tostring(k))
+    end
+    -- left-align keys to the widest one; built once, reused for every row
+    local row_pattern = string.format("%%-%ds = %%s\n", key_maxlen)
+    nerv.info("ready to train with the following gconf settings:")
+    nerv.printf(row_pattern, "Key", "Value")
+    for k, v in pairs(gconf) do
+        nerv.printf(row_pattern, tostring(k), tostring(v))
+    end
+end
+
+--- Write gconf to a file as a Lua chunk ("return <table>") that dofile can
+-- load again.
+-- @param fname path of the file to write
+-- Raises an error naming the file when it cannot be opened for writing.
+local function dump_gconf(fname)
+    -- assert turns a failed open into a readable error
+    local f = assert(io.open(fname, "w"))
+    f:write("return ")
+    f:write(table.tostring(gconf))
+    f:close()
+end
+
+-- Built-in defaults. Every key also becomes a command-line option (see
+-- make_options) and is merged into gconf by check_and_add_defaults.
+local trainer_defaults = {
+ lrate = 0.8,
+ batch_size = 256,
+ chunk_size = 1,
+ buffer_size = 81920,
+ wcost = 1e-6,
+ momentum = 0.9,
+ cur_iter = 1,
+ max_iter = 20,
+ randomize = true,
+ cumat_tname = "nerv.CuMatrixFloat",
+ mmat_tname = "nerv.MMatrixFloat",
+ trainer_tname = "nerv.Trainer",
+}
+
+local options = make_options(trainer_defaults)
+-- options that have no counterpart in gconf
+local extra_opt_spec = {
+ {"resume-from", nil, "string"},
+ {"help", "h", "boolean", default = false, desc = "show this help information"},
+ {"dir", nil, "string", desc = "specify the working directory"},
+}
+
+table.extend(options, extra_opt_spec)
+
+local opts
+-- `arg` is replaced by what nerv.parse_args returns first
+-- NOTE(review): presumably the remaining positional arguments -- confirm
+arg, opts = nerv.parse_args(arg, options)
+
+-- a network config script is mandatory
+if #arg < 1 or opts["help"].val then
+ print_help(options)
+ return
+end
+
+-- the first positional argument is the network config script; the rest are
+-- exposed to it through the global `arg`
+local script = arg[1]
+local script_arg = {}
+for i = 2, #arg do
+ table.insert(script_arg, arg[i])
+end
+arg = script_arg
+-- run the network config; the code below expects it to define the global gconf
+dofile(script)
+
+--[[
+
+Rule: command-line option overrides network config overrides trainer default.
+Note: a config key like aaa_bbbb_cc can be overridden by passing
+--aaa-bbbb-cc to command-line arguments.
+
+]]--
+
+-- Merge command-line options into gconf, then resolve the configured type
+-- names into the actual types.
+check_and_add_defaults(trainer_defaults, opts)
+gconf.mmat_type = nerv.get_type(gconf.mmat_tname)
+gconf.cumat_type = nerv.get_type(gconf.cumat_tname)
+gconf.trainer = nerv.get_type(gconf.trainer_tname)
+-- NOTE(review): `econf` is not defined in this file; presumably a global set
+-- by the launcher that runs this script -- confirm, otherwise this line errors
+gconf.use_cpu = econf.use_cpu or false
+if gconf.initialized_param == nil then
+ gconf.initialized_param = {}
+end
+-- default initialiser: math.random() / 5 - 0.1, i.e. a value in [-0.1, 0.1)
+if gconf.param_random == nil then
+ gconf.param_random = function() return math.random() / 5 - 0.1 end
+end
+
+local date_pattern = "%Y-%m-%d_%H:%M:%S"
+local logfile_name = "log"
+-- use --dir when given, otherwise a directory named after the current time
+local working_dir = opts["dir"].val or string.format("nerv_%s", os.date(date_pattern))
+gconf.working_dir = working_dir
+gconf.date_pattern = date_pattern
+
+print_gconf()
+-- any mkdir failure is reported as "already exists"
+if not lfs.mkdir(working_dir) then
+ nerv.error("[trainer] working directory already exists")
+end
+
+-- copy the network config
+dir.copyfile(script, working_dir)
+-- set logfile path
+nerv.set_logfile(path.join(working_dir, logfile_name))
+
+-- =============
+-- main function
+-- =============
+
+-- Instantiate the configured trainer and take a baseline validation score
+-- before any training.
+local trainer = gconf.trainer(gconf)
+trainer:training_preprocess()
+gconf.best_cv = trainer:process('validate', false)
+nerv.info("initial cross validation: %.3f", gconf.best_cv)
+
+-- One pass per iteration: dump the state, train, validate, optionally test,
+-- then let the trainer accept or reject the new parameters (halving).
+for i = gconf.cur_iter, gconf.max_iter do
+ gconf.cur_iter = i
+ -- the dump is a Lua chunk returning gconf (see dump_gconf), which is the
+ -- form --resume-from loads with dofile
+ dump_gconf(path.join(working_dir, string.format("iter_%d.meta", i)))
+ nerv.info("[NN] begin iteration %d with lrate = %.6f", i, gconf.lrate)
+ local train_err = trainer:process('train', true)
+ nerv.info("[TR] training set %d: %.3f", i, train_err)
+ local cv_err = trainer:process('validate', false)
+ nerv.info("[CV] cross validation %d: %.3f", i, cv_err)
+ -- test-set evaluation is opt-in through gconf.test
+ if gconf.test then
+ local test_err = trainer:process('test', false)
+ nerv.info('[TE] testset error %d: %.3f', i, test_err)
+ end
+ trainer:halving(train_err, cv_err)
+end
+trainer:training_afterprocess()
diff --git a/nerv/nn/init.lua b/nerv/nn/init.lua
index 1037d05..34b05cf 100644
--- a/nerv/nn/init.lua
+++ b/nerv/nn/init.lua
@@ -1,3 +1,4 @@
nerv.include('layer_repo.lua')
nerv.include('param_repo.lua')
nerv.include('network.lua')
+nerv.include('trainer.lua')
diff --git a/nerv/nn/trainer.lua b/nerv/nn/trainer.lua
new file mode 100644
index 0000000..4ae08d9
--- /dev/null
+++ b/nerv/nn/trainer.lua
@@ -0,0 +1,183 @@
+-- Generic trainer. make_layer_repo, get_network, get_readers and
+-- get_input_order must be provided by the user; the *_preprocess,
+-- *_middleprocess and *_afterprocess methods are optional hooks.
+local trainer = nerv.class('nerv.Trainer')
+
+-- Build the network described by the user-provided methods and allocate the
+-- output and error buffers used by process().
+-- gconf: global configuration table; reads use_cpu, mmat_type, cumat_type,
+--        initialized_param, batch_size, chunk_size, clip and mask
+function trainer:__init(gconf)
+ self.gconf = gconf
+ local mat_type
+ -- parameters are imported/exported on the host; training runs on the host
+ -- or on the device depending on gconf.use_cpu
+ self.src_loc_type = nerv.ParamRepo.LOC_TYPES.ON_HOST
+ -- NOTE(review): this local is not used in the rest of the function
+ local src_loc_type = self.src_loc_type
+ if gconf.use_cpu then
+ mat_type = gconf.mmat_type
+ self.train_loc_type = nerv.ParamRepo.LOC_TYPES.ON_HOST
+ else
+ mat_type = gconf.cumat_type
+ self.train_loc_type = nerv.ParamRepo.LOC_TYPES.ON_DEVICE
+ end
+ local train_loc_type = self.train_loc_type
+
+ -- import the initial parameters on the host and copy them to the training
+ -- location
+ local host_param_repo = nerv.ParamRepo()
+ host_param_repo:import(gconf.initialized_param, gconf)
+ local param_repo = host_param_repo:copy(train_loc_type, gconf)
+ self.layer_repo = self:make_layer_repo(param_repo)
+ local layer_repo = self.layer_repo
+ local graph = self:get_network(layer_repo)
+ self.input_order = self:get_input_order()
+
+ self.network = nerv.Network('network', gconf, {network = graph, clip = gconf.clip})
+ local network = self.network
+ network:init(gconf.batch_size, gconf.chunk_size)
+
+ local dim_in, dim_out = network.dim_in, network.dim_out
+ -- err_output: for each network input, one matrix shared by all time steps
+ self.err_output = {}
+ local err_output = self.err_output
+ for i = 1, #dim_in do
+ err_output[i] = {}
+ local tmp = mat_type(gconf.batch_size, dim_in[i])
+ for t = 1, gconf.chunk_size do
+ err_output[i][t] = tmp
+ end
+ end
+ -- output: a separate matrix per network output and time step
+ self.output = {}
+ self.err_input = {}
+ local output = self.output
+ local err_input = self.err_input
+ for i = 1, #dim_out do
+ output[i] = {}
+ for t = 1, gconf.chunk_size do
+ output[i][t] = mat_type(gconf.batch_size, dim_out[i])
+ end
+ err_input[i] = {}
+ local tmp = mat_type(gconf.batch_size, dim_out[i])
+ tmp:fill(0)
+ -- err_input: outputs of width 1 use gconf.mask[t]; all others share one
+ -- zero-filled matrix
+ -- NOTE(review): gconf.mask is not assigned in this file; presumably filled
+ -- in by network:init above -- confirm
+ for t = 1, gconf.chunk_size do
+ if dim_out[i] == 1 then
+ err_input[i][t] = gconf.mask[t]
+ else
+ err_input[i][t] = tmp
+ end
+ end
+ end
+end
+
+--- Create the data buffer that feeds process().
+-- A chunk size of 1 selects the frame buffer, any other value the sequence
+-- buffer.
+-- @param readers reader list as returned by get_readers
+-- @return a nerv.FrmBuffer or a nerv.SeqBuffer
+function trainer:make_buffer(readers)
+    local gconf = self.gconf
+    if gconf.chunk_size ~= 1 then
+        local seq_conf = {
+            batch_size = gconf.batch_size,
+            chunk_size = gconf.chunk_size,
+            readers = readers,
+        }
+        return nerv.SeqBuffer(gconf, seq_conf)
+    end
+    local frm_conf = {
+        buffer_size = gconf.buffer_size,
+        batch_size = gconf.batch_size,
+        chunk_size = gconf.chunk_size,
+        randomize = gconf.randomize,
+        readers = readers,
+        use_gpu = true,
+    }
+    return nerv.FrmBuffer(gconf, frm_conf)
+end
+
+-- Run one pass over a dataset.
+-- dataset: dataset name handed to epoch_preprocess, get_readers and
+--          epoch_afterprocess
+-- do_train: when true, back-propagate and update after every mini-batch
+-- returns: the value of self:get_error() after the pass
+-- NOTE(review): get_error is not defined in this file; it must be supplied by
+-- the user of the class
+function trainer:process(dataset, do_train)
+ self:epoch_preprocess(dataset, do_train)
+ local buffer = self:make_buffer(self:get_readers(dataset))
+ local cnt = 0
+ local network = self.network
+ local input_order = self.input_order
+ local output = self.output
+ local err_input = self.err_input
+ local err_output = self.err_output
+ network:epoch_init()
+
+ while true do
+ -- a nil batch marks the end of the data
+ local data = buffer:get_data()
+ if data == nil then
+ break
+ end
+
+ cnt = cnt + 1
+ -- everything the network and the hooks need for this mini-batch
+ local info = {input = {}, output = output, err_input = err_input, err_output = err_output,
+ do_train = do_train, seq_length = data.seq_length, new_seq = data.new_seq}
+ -- map the reader fields to network inputs following get_input_order()
+ for i = 1, #network.dim_in do
+ info.input[i] = data.data[input_order[i]]
+ end
+
+ -- hook order: preprocess -> propagate -> middleprocess ->
+ -- (back_propagate, update when training) -> afterprocess
+ self:mini_batch_preprocess(cnt, info)
+ network:mini_batch_init(info)
+ network:propagate()
+ self:mini_batch_middleprocess(cnt, info)
+ if do_train then
+ network:back_propagate()
+ network:update()
+ end
+ self:mini_batch_afterprocess(cnt, info)
+
+ -- force a full garbage collection after every mini-batch
+ collectgarbage('collect')
+ end
+
+ self:epoch_afterprocess(dataset, do_train)
+ return self:get_error()
+end
+
+--- Save the parameters of the finished iteration and accept or reject them.
+-- The parameters are always exported. They are accepted when cv_err improves
+-- on gconf.best_cv; otherwise the file is renamed with a '.rejected' suffix,
+-- the last accepted parameters are reloaded and the learning rate is halved.
+-- @param train_err training error of the iteration (used in the file name)
+-- @param cv_err cross-validation error of the iteration
+function trainer:halving(train_err, cv_err)
+    local gconf = self.gconf
+    local stamp = os.date(gconf.date_pattern)
+    local param_fname = path.join(gconf.working_dir,
+        string.format('%s_iter_%d_lr%f_tr%.3f_cv%.3f.nerv', stamp, gconf.cur_iter, gconf.lrate, train_err, cv_err))
+    -- copy the trained parameters back to the source location and save them
+    self.network:get_params():copy(self.src_loc_type, gconf):export(param_fname)
+
+    if cv_err < gconf.best_cv then
+        nerv.info("accepting the trained params")
+        gconf.best_cv = cv_err
+        gconf.initialized_param = {param_fname}
+        return
+    end
+
+    nerv.info("rejecting the trained params, rollback to the previous one")
+    file.move(param_fname, param_fname .. '.rejected')
+    local restored = nerv.ParamRepo()
+    restored:import(gconf.initialized_param, gconf)
+    self.layer_repo:rebind(restored:copy(self.train_loc_type, gconf))
+    gconf.lrate = gconf.lrate * 0.5
+end
+
+-- Optional hooks. All of them are no-ops here and are meant to be overridden.
+
+-- Hook for work to do before training starts.
+function trainer:training_preprocess()
+end
+
+-- Hook for work to do after training has finished.
+function trainer:training_afterprocess()
+end
+
+-- Called by process() before any data of the dataset is read.
+function trainer:epoch_preprocess(dataset, do_train)
+end
+
+-- Called by process() after the last mini-batch of the dataset.
+function trainer:epoch_afterprocess(dataset, do_train)
+end
+
+-- Called by process() for every mini-batch, before the network sees it.
+function trainer:mini_batch_preprocess(cnt, info)
+end
+
+-- Called by process() after propagation and before back-propagation.
+function trainer:mini_batch_middleprocess(cnt, info)
+end
+
+-- Called by process() once the mini-batch is finished (after the update when
+-- training).
+function trainer:mini_batch_afterprocess(cnt, info)
+end
+
+-- Methods the user must provide; the versions here only report that the
+-- method is not implemented.
+
+-- Must return the layer repository built on top of param_repo (called from
+-- __init).
+function trainer:make_layer_repo(param_repo)
+ nerv.error_method_not_implemented()
+end
+
+-- Must return the network graph built from layer_repo (called from __init).
+function trainer:get_network(layer_repo)
+ nerv.error_method_not_implemented()
+end
+
+-- Must return the reader list for the given dataset (called from process).
+function trainer:get_readers(dataset)
+ nerv.error_method_not_implemented()
+end
+
+-- Must return the reader field names in network input order (called from
+-- __init).
+function trainer:get_input_order()
+ nerv.error_method_not_implemented()
+end