Merge pull request #37 from liuq901/master

add general trainer
author: Ted Yin <Determinant@users.noreply.github.com> 2016-03-31 20:44:17 +0800
committer: Ted Yin <Determinant@users.noreply.github.com> 2016-03-31 20:44:17 +0800
commit: 7829db926609d3e0498349e1a09634531244e0e5 (patch)
tree: 08b2efe303b3aa8da3199641106419641296d8fc
parent: 89d57b6fae6bcb0195a73fb97ab6870ee0d0ce20 (diff)
parent: 74d6956dc79b387289d911d9cbea5b7245405b62 (diff)
7 files changed, 598 insertions, 1 deletions
diff --git a/nerv/Makefile b/nerv/Makefile
index a5e4f66..dde8fe7 100644
--- a/nerv/Makefile
+++ b/nerv/Makefile
@@ -43,7 +43,7 @@ LUA_LIBS := matrix/init.lua io/init.lua init.lua \
 			layer/window.lua layer/bias.lua layer/combiner.lua layer/mse.lua \
 			layer/elem_mul.lua layer/lstm.lua layer/lstm_gate.lua layer/dropout.lua layer/gru.lua \
 			layer/graph.lua layer/rnn.lua layer/duplicate.lua layer/identity.lua \
-			nn/init.lua nn/layer_repo.lua nn/param_repo.lua nn/network.lua \
+			nn/init.lua nn/layer_repo.lua nn/param_repo.lua nn/network.lua nn/trainer.lua\
 			io/frm_buffer.lua io/seq_buffer.lua
 
 INCLUDE := -I $(LUA_INCDIR) -DLUA_USE_APICHECK
diff --git a/nerv/examples/ptb/main.lua b/nerv/examples/ptb/main.lua
new file mode 100644
index 0000000..5d1a326
--- /dev/null
+++ b/nerv/examples/ptb/main.lua
@@ -0,0 +1,118 @@
+nerv.include('reader.lua')
+nerv.include('select_linear.lua')
+
+gconf = {
+    chunk_size = 15,
+    dropout_rate = 0,
+    lrate = 1.5,
+    wcost = 1e-5,
+    max_iter = 35,
+    clip = 5,
+    momentum = 0.9,
+    batch_size = 200,
+    test = true,
+}
+
+local hidden_size = 300
+local vocab_size = 10000
+local layer_num = 1
+local dropout_rate = 0.5
+local trainer = nerv.Trainer
+
+function trainer:make_layer_repo(param_repo)
+    local layers = {
+        ['nerv.LSTMLayer'] = {},
+        ['nerv.DropoutLayer'] = {},
+        ['nerv.SelectLinearLayer'] = {
+            ['select'] = {dim_in = {1}, dim_out = {hidden_size}, vocab = vocab_size, pr = param_repo},
+        },
+        ['nerv.AffineLayer'] = {
+            output = {dim_in = {hidden_size}, dim_out = {vocab_size}, pr = param_repo},
+        },
+        ['nerv.SoftmaxCELayer'] = {
+            softmax = {dim_in = {vocab_size, 1}, dim_out = {1}, compressed = true},
+        },
+    }
+    for i = 1, layer_num do
+        layers['nerv.LSTMLayer']['lstm' .. i] = {dim_in = {hidden_size}, dim_out = {hidden_size}, pr = param_repo}
+        layers['nerv.DropoutLayer']['dropout' .. i] = {dim_in = {hidden_size}, dim_out = {hidden_size}}
+    end
+    return nerv.LayerRepo(layers, param_repo, gconf)
+end
+
+function trainer:get_network(layer_repo) -- wires select -> (lstm_i -> dropout_i) stack -> output -> softmax as one GraphLayer
+    local connections = {
+        {'<input>[1]', 'select[1]', 0},
+        {'select[1]', 'lstm1[1]', 0},
+        {'dropout' .. layer_num .. '[1]', 'output[1]', 0}, -- only the last dropout feeds the output layer
+        {'output[1]', 'softmax[1]', 0},
+        {'<input>[2]', 'softmax[2]', 0},
+        {'softmax[1]', '<output>[1]', 0},
+    }
+    for i = 1, layer_num do
+        table.insert(connections, {'lstm' .. i .. '[1]', 'dropout' .. i .. '[1]', 0})
+        if i > 1 then -- layer 1 is fed by 'select' above; deeper layers take the previous layer's dropout output
+            table.insert(connections, {'dropout' .. (i - 1) .. '[1]', 'lstm' .. i .. '[1]', 0})
+        end
+    end
+    return nerv.GraphLayer('graph', gconf, {dim_in = {1, 1}, dim_out = {1}, layer_repo = layer_repo, connections = connections})
+end
+
+function trainer:get_input_order()
+    return {'input', 'label'}
+end
+
+function trainer:get_readers(dataset)
+    local data_path = 'nerv/nerv/examples/lmptb/PTBdata/'
+    local vocab_file = data_path .. 'vocab'
+    local train_file = data_path .. 'ptb.train.txt.adds'
+    local cv_file = data_path .. 'ptb.valid.txt.adds'
+    local test_file = data_path .. 'ptb.test.txt.adds'
+    local reader
+    if dataset == 'train' then  
+        reader = nerv.Reader(vocab_file, train_file)
+    elseif dataset == 'validate' then
+        reader = nerv.Reader(vocab_file, cv_file)
+    elseif dataset == 'test' then
+        reader = nerv.Reader(vocab_file, test_file)
+    else
+        nerv.error('no such dataset')
+    end
+    return {{reader = reader, data = {input = 1, label = 1}}}
+end
+
+local total_err
+local total_frame
+
+function trainer:get_error()
+    return math.pow(10, -total_err / total_frame)
+end
+
+function trainer:epoch_preprocess(dataset, do_train)
+    if dataset == 'train' then
+        gconf.dropout_rate = dropout_rate
+        nerv.info('set dropout rate to %f', dropout_rate)
+    end
+    if dataset == 'validate' then
+        gconf.dropout_rate = 0
+        nerv.info('set dropout rate to 0')
+    end
+    if dataset == 'test' then
+        gconf.dropout_rate = 0
+        nerv.info('set dropout rate to 0')
+    end
+    total_err = 0
+    total_frame = 0
+end
+
+function trainer:mini_batch_middleprocess(cnt, info)
+    for t = 1, gconf.chunk_size do
+        local tmp = info.output[1][t]:new_to_host()
+        for i = 1, gconf.batch_size do
+            total_err = total_err + math.log10(math.exp(tmp[i - 1][0]))
+        end
+    end
+    for i = 1, gconf.batch_size do
+        total_frame = total_frame + info.seq_length[i]
+    end
+end
diff --git a/nerv/examples/ptb/reader.lua b/nerv/examples/ptb/reader.lua
new file mode 100644
index 0000000..76a78cf
--- /dev/null
+++ b/nerv/examples/ptb/reader.lua
@@ -0,0 +1,67 @@
+local Reader = nerv.class('nerv.Reader')
+
+function Reader:__init(vocab_file, input_file)
+    self:get_vocab(vocab_file)
+    self:get_seq(input_file)
+    self.offset = 1
+end
+
+function Reader:get_vocab(vocab_file)
+    -- Build self.vocab (word -> 0-based id, one word per line of vocab_file)
+    -- and record the number of entries in self.size.
+    local f = assert(io.open(vocab_file, 'r'))
+    local id = 0
+    self.vocab = {}
+    for word in f:lines() do
+        self.vocab[word] = id
+        id = id + 1
+    end
+    -- f:lines() does not close the handle by itself, so release it explicitly
+    f:close()
+    self.size = id
+end
+
+function Reader:split(s, t)
+    local ret = {}
+    for x in (s .. t):gmatch('(.-)' .. t) do
+        table.insert(ret, x)
+    end
+    return ret
+end
+
+function Reader:get_seq(input_file)
+    -- Read input_file line by line; each line becomes one table of word ids
+    -- (looked up in self.vocab) appended to self.seq. Fails loudly if the
+    -- file cannot be opened, and closes the handle once loading is done.
+    local f = assert(io.open(input_file, 'r'))
+    self.seq = {}
+    for line in f:lines() do
+        local words = self:split(line, ' ')
+        local ids = {}
+        for i = 1, #words do
+            -- consecutive spaces make split() emit empty tokens; skip them
+            if words[i] ~= '' then
+                table.insert(ids, self.vocab[words[i]])
+            end
+        end
+        table.insert(self.seq, ids)
+    end
+    f:close()
+end
+
+function Reader:get_data()
+    if self.offset > #self.seq then
+        return nil
+    end
+    local tmp = self.seq[self.offset]
+    local res = {
+        input = nerv.MMatrixFloat(#tmp - 1, 1),
+        label = nerv.MMatrixFloat(#tmp - 1, 1),
+    }
+    for i = 1, #tmp - 1 do
+        res.input[i - 1][0] = tmp[i]
+        res.label[i - 1][0] = tmp[i + 1]
+    end
+    self.offset = self.offset + 1
+    return res
+end
diff --git a/nerv/examples/ptb/select_linear.lua b/nerv/examples/ptb/select_linear.lua
new file mode 100644
index 0000000..42778f8
--- /dev/null
+++ b/nerv/examples/ptb/select_linear.lua
@@ -0,0 +1,63 @@
+local SL = nerv.class('nerv.SelectLinearLayer', 'nerv.Layer')
+
+--id: string
+--global_conf: table
+--layer_conf: table
+--Get Parameters
+function SL:__init(id, global_conf, layer_conf)
+    nerv.Layer.__init(self, id, global_conf, layer_conf)
+
+    self.vocab = layer_conf.vocab
+ 
+    self:check_dim_len(1, 1)
+    self:bind_params()
+end
+
+function SL:bind_params()
+    self.ltp = self:find_param("ltp", self.lconf, self.gconf, nerv.LinearTransParam, {self.vocab, self.dim_out[1]}) --layer_conf.ltp
+end
+
+--Check parameter 
+function SL:init(batch_size) -- validates the layer dims against ltp, then calls ltp:train_init()
+    if (self.dim_in[1] ~= 1) then --one word id 
+        nerv.error("mismatching dimensions of ltp and input")
+    end
+    if (self.dim_out[1] ~= self.ltp.trans:ncol()) then
+        nerv.error("mismatching dimensions of bp and output")
+    end
+    -- store the argument itself; `bath_size` was a misspelled (nil) global
+    self.batch_size = batch_size
+    self.ltp:train_init()
+end
+
+function SL:update() -- delegates the parameter update to the bound ltp param
+    --use this to produce reproducible results; don't forget to set the dropout to zero!
+    --for i = 1, input[1]:nrow(), 1 do
+    --    local word_vec = self.ltp.trans[input[1][i - 1][0]]
+    --    word_vec:add(word_vec, bp_err[1][i - 1], 1, - self.gconf.lrate / self.gconf.batch_size)
+    --end 
+    
+    --I tried the update_select_rows kernel which uses atomicAdd, but it generates non-reproducible results
+    self.ltp:update_by_err_input()
+end
+
+function SL:propagate(input, output)
+    --for i = 0, input[1]:ncol() - 1, 1 do
+    --    if (input[1][0][i] > 0) then
+    --        output[1][i]:copy_fromd(self.ltp.trans[input[1][0][i]])
+    --    else
+    --        output[1][i]:fill(0)
+    --    end
+    --end
+    output[1]:copy_rows_fromd_by_colidx(self.ltp.trans, input[1])
+end
+
+function SL:back_propagate(bp_err, next_bp_err, input, output)
+    --input is compressed, do nothing
+    self.ltp:back_propagate_by_err_input(bp_err[1], input[1]:decompress(self.vocab))
+end
+
+function SL:get_params()
+    local paramRepo = nerv.ParamRepo({self.ltp}, self.loc_type)
+    return paramRepo
+end
diff --git a/nerv/examples/trainer.lua b/nerv/examples/trainer.lua
new file mode 100644
index 0000000..783ff1d
--- /dev/null
+++ b/nerv/examples/trainer.lua
@@ -0,0 +1,165 @@
+require 'lfs'
+require 'pl'
+
+-- =======================================================
+-- Deal with command line input & init training environment
+-- =======================================================
+
+local function check_and_add_defaults(spec, opts)
+    local function get_opt_val(k)
+        local k = string.gsub(k, '_', '-')
+        return opts[k].val, opts[k].specified
+    end
+    local opt_v = get_opt_val("resume_from")
+    if opt_v then
+        nerv.info("resuming from previous training state")
+        gconf = dofile(opt_v)
+    else
+        for k, v in pairs(spec) do
+            local opt_v, specified = get_opt_val(k)
+            if (not specified) and gconf[k] ~= nil then
+                nerv.info("using setting in network config file: %s = %s", k, gconf[k])
+            elseif opt_v ~= nil then
+                nerv.info("using setting in options: %s = %s", k, opt_v)
+                gconf[k] = opt_v
+            end
+        end
+    end
+end
+
+local function make_options(spec)
+    local options = {}
+    for k, v in pairs(spec) do
+        table.insert(options,
+                    {string.gsub(k, '_', '-'), nil, type(v), default = v})
+    end
+    return options
+end
+
+local function print_help(options)
+    nerv.printf("Usage: <asr_trainer.lua> [options] network_config.lua\n")
+    nerv.print_usage(options)
+end
+
+local function print_gconf()
+    local key_maxlen = 0
+    for k, v in pairs(gconf) do
+        key_maxlen = math.max(key_maxlen, #tostring(k))
+    end
+    -- left-align keys to the longest one; built once since it never changes
+    local pattern = string.format("%%-%ds = %%s\n", key_maxlen)
+    nerv.info("ready to train with the following gconf settings:")
+    nerv.printf(pattern, "Key", "Value")
+    -- tostring() keeps `false` settings visible instead of printing them as ""
+    for k, v in pairs(gconf) do
+        nerv.printf(pattern, tostring(k), tostring(v))
+    end
+end
+
+local function dump_gconf(fname)
+    local f = io.open(fname, "w")
+    f:write("return ")
+    f:write(table.tostring(gconf))
+    f:close()
+end
+
+local trainer_defaults = {
+    lrate = 0.8,
+    batch_size = 256,
+    chunk_size = 1,
+    buffer_size = 81920,
+    wcost = 1e-6,
+    momentum = 0.9,
+    cur_iter = 1,
+    max_iter = 20,
+    randomize = true,
+    cumat_tname = "nerv.CuMatrixFloat",
+    mmat_tname = "nerv.MMatrixFloat",
+    trainer_tname = "nerv.Trainer",
+}
+
+local options = make_options(trainer_defaults)
+local extra_opt_spec = {
+    {"resume-from", nil, "string"},
+    {"help", "h", "boolean", default = false, desc = "show this help information"},
+    {"dir", nil, "string", desc = "specify the working directory"},
+}
+
+table.extend(options, extra_opt_spec)
+
+local opts
+arg, opts = nerv.parse_args(arg, options)
+
+if #arg < 1 or opts["help"].val then
+    print_help(options)
+    return
+end
+
+local script = arg[1]
+local script_arg = {}
+for i = 2, #arg do
+    table.insert(script_arg, arg[i])
+end
+arg = script_arg
+dofile(script)
+
+--[[
+
+Rule: command-line option overrides network config overrides trainer default.
+Note: config key like aaa_bbbb_cc could be overridden by specifying
+--aaa-bbbb-cc to command-line arguments.
+
+]]--
+
+check_and_add_defaults(trainer_defaults, opts)
+gconf.mmat_type = nerv.get_type(gconf.mmat_tname)
+gconf.cumat_type = nerv.get_type(gconf.cumat_tname)
+gconf.trainer = nerv.get_type(gconf.trainer_tname)
+gconf.use_cpu = econf.use_cpu or false
+if gconf.initialized_param == nil then
+    gconf.initialized_param = {}
+end
+if gconf.param_random == nil then
+    gconf.param_random = function() return math.random() / 5 - 0.1 end
+end
+
+local date_pattern = "%Y-%m-%d_%H:%M:%S"
+local logfile_name = "log"
+local working_dir = opts["dir"].val or string.format("nerv_%s", os.date(date_pattern))
+gconf.working_dir = working_dir
+gconf.date_pattern = date_pattern
+
+print_gconf()
+if not lfs.mkdir(working_dir) then
+    nerv.error("[trainer] working directory already exists")
+end
+
+-- copy the network config
+dir.copyfile(script, working_dir)
+-- set logfile path
+nerv.set_logfile(path.join(working_dir, logfile_name))
+
+-- =============
+-- main function
+-- =============
+
+local trainer = gconf.trainer(gconf)
+trainer:training_preprocess()
+gconf.best_cv = trainer:process('validate', false)
+nerv.info("initial cross validation: %.3f", gconf.best_cv)
+
+for i = gconf.cur_iter, gconf.max_iter do
+    gconf.cur_iter = i
+    dump_gconf(path.join(working_dir, string.format("iter_%d.meta", i)))
+    nerv.info("[NN] begin iteration %d with lrate = %.6f", i, gconf.lrate)
+    local train_err = trainer:process('train', true)
+    nerv.info("[TR] training set %d: %.3f", i, train_err)
+    local cv_err = trainer:process('validate', false)
+    nerv.info("[CV] cross validation %d: %.3f", i, cv_err)
+    if gconf.test then
+        local test_err = trainer:process('test', false)
+        nerv.info('[TE] testset error %d: %.3f', i, test_err)
+    end
+    trainer:halving(train_err, cv_err)
+end
+trainer:training_afterprocess()
diff --git a/nerv/nn/init.lua b/nerv/nn/init.lua
index 1037d05..34b05cf 100644
--- a/nerv/nn/init.lua
+++ b/nerv/nn/init.lua
@@ -1,3 +1,4 @@
 nerv.include('layer_repo.lua')
 nerv.include('param_repo.lua')
 nerv.include('network.lua')
+nerv.include('trainer.lua')
diff --git a/nerv/nn/trainer.lua b/nerv/nn/trainer.lua
new file mode 100644
index 0000000..4ae08d9
--- /dev/null
+++ b/nerv/nn/trainer.lua
@@ -0,0 +1,183 @@
+local trainer = nerv.class('nerv.Trainer')
+
+function trainer:__init(gconf)
+    self.gconf = gconf
+    local mat_type
+    self.src_loc_type = nerv.ParamRepo.LOC_TYPES.ON_HOST
+    local src_loc_type = self.src_loc_type
+    if gconf.use_cpu then
+        mat_type = gconf.mmat_type
+        self.train_loc_type = nerv.ParamRepo.LOC_TYPES.ON_HOST
+    else
+        mat_type = gconf.cumat_type
+        self.train_loc_type = nerv.ParamRepo.LOC_TYPES.ON_DEVICE
+    end
+    local train_loc_type = self.train_loc_type
+
+    local host_param_repo = nerv.ParamRepo()
+    host_param_repo:import(gconf.initialized_param, gconf)
+    local param_repo = host_param_repo:copy(train_loc_type, gconf)
+    self.layer_repo = self:make_layer_repo(param_repo)
+    local layer_repo = self.layer_repo
+    local graph = self:get_network(layer_repo)
+    self.input_order = self:get_input_order()
+
+    self.network = nerv.Network('network', gconf, {network = graph, clip = gconf.clip})
+    local network = self.network
+    network:init(gconf.batch_size, gconf.chunk_size)
+
+    local dim_in, dim_out = network.dim_in, network.dim_out
+    self.err_output = {}
+    local err_output = self.err_output
+    for i = 1, #dim_in do
+        err_output[i] = {}
+        local tmp = mat_type(gconf.batch_size, dim_in[i])
+        for t = 1, gconf.chunk_size do
+            err_output[i][t] = tmp
+        end
+    end
+    self.output = {}
+    self.err_input = {}
+    local output = self.output
+    local err_input = self.err_input
+    for i = 1, #dim_out do
+        output[i] = {}
+        for t = 1, gconf.chunk_size do
+            output[i][t] = mat_type(gconf.batch_size, dim_out[i])
+        end
+        err_input[i] = {}
+        local tmp = mat_type(gconf.batch_size, dim_out[i])
+        tmp:fill(0)
+        for t = 1, gconf.chunk_size do
+            if dim_out[i] == 1 then
+                err_input[i][t] = gconf.mask[t]
+            else
+                err_input[i][t] = tmp
+            end
+        end
+    end
+end
+
+function trainer:make_buffer(readers)
+    local gconf = self.gconf
+    if gconf.chunk_size == 1 then
+        return nerv.FrmBuffer(gconf, {
+            buffer_size = gconf.buffer_size,
+            batch_size = gconf.batch_size,
+            chunk_size = gconf.chunk_size,
+            randomize = gconf.randomize,
+            readers = readers,
+            use_gpu = true,
+        })
+    else
+        return nerv.SeqBuffer(gconf, {
+            batch_size = gconf.batch_size,
+            chunk_size = gconf.chunk_size,
+            readers = readers,
+        })
+    end
+end
+
+function trainer:process(dataset, do_train)
+    self:epoch_preprocess(dataset, do_train)
+    local buffer = self:make_buffer(self:get_readers(dataset))
+    local cnt = 0
+    local network = self.network
+    local input_order = self.input_order
+    local output = self.output
+    local err_input = self.err_input
+    local err_output = self.err_output
+    network:epoch_init()
+
+    while true do
+        local data = buffer:get_data()
+        if data == nil then
+            break
+        end
+
+        cnt = cnt + 1
+        local info = {input = {}, output = output, err_input = err_input, err_output = err_output,
+            do_train = do_train, seq_length = data.seq_length, new_seq = data.new_seq}
+        for i = 1, #network.dim_in do
+            info.input[i] = data.data[input_order[i]]
+        end
+
+        self:mini_batch_preprocess(cnt, info)
+        network:mini_batch_init(info)
+        network:propagate()
+        self:mini_batch_middleprocess(cnt, info)
+        if do_train then
+            network:back_propagate()
+            network:update()
+        end
+        self:mini_batch_afterprocess(cnt, info)
+
+        collectgarbage('collect')
+    end
+
+    self:epoch_afterprocess(dataset, do_train)
+    return self:get_error()
+end
+
+function trainer:halving(train_err, cv_err)
+    local gconf = self.gconf
+    local src_loc_type = self.src_loc_type
+    local train_loc_type = self.train_loc_type
+    local layer_repo = self.layer_repo
+    local param_fname = string.format('%s_iter_%d_lr%f_tr%.3f_cv%.3f.nerv', os.date(gconf.date_pattern), gconf.cur_iter, gconf.lrate, train_err, cv_err)
+    param_fname = path.join(gconf.working_dir, param_fname)
+    local network = self.network
+    local host_param_repo = network:get_params():copy(src_loc_type, gconf)
+    host_param_repo:export(param_fname)
+
+    if cv_err < gconf.best_cv then
+        nerv.info("accepting the trained params")
+        gconf.best_cv = cv_err
+        gconf.initialized_param = {param_fname}
+    else
+        nerv.info("rejecting the trained params, rollback to the previous one")
+        file.move(param_fname, param_fname .. '.rejected')
+        host_param_repo = nerv.ParamRepo()
+        host_param_repo:import(gconf.initialized_param, gconf)
+        local param_repo = host_param_repo:copy(train_loc_type, gconf)
+        layer_repo:rebind(param_repo)
+        gconf.lrate = gconf.lrate * 0.5
+    end
+end
+
+function trainer:training_preprocess()
+end
+
+function trainer:training_afterprocess()
+end
+
+function trainer:epoch_preprocess(dataset, do_train)
+end
+
+function trainer:epoch_afterprocess(dataset, do_train)
+end
+
+function trainer:mini_batch_preprocess(cnt, info)
+end
+
+function trainer:mini_batch_middleprocess(cnt, info)
+end
+
+function trainer:mini_batch_afterprocess(cnt, info)
+end
+
+function trainer:make_layer_repo(param_repo)
+    nerv.error_method_not_implemented()
+end
+
+function trainer:get_network(layer_repo)
+    nerv.error_method_not_implemented()
+end
+
+function trainer:get_readers(dataset)
+    nerv.error_method_not_implemented()
+end
+
+function trainer:get_input_order()
+    nerv.error_method_not_implemented()
+end
author	Ted Yin <Determinant@users.noreply.github.com>	2016-03-31 20:44:17 +0800
committer	Ted Yin <Determinant@users.noreply.github.com>	2016-03-31 20:44:17 +0800
commit	7829db926609d3e0498349e1a09634531244e0e5 (patch)
tree	08b2efe303b3aa8da3199641106419641296d8fc
parent	89d57b6fae6bcb0195a73fb97ab6870ee0d0ce20 (diff)
parent	74d6956dc79b387289d911d9cbea5b7245405b62 (diff)