--author: txh18(Tianxing) --This recipe is rnnlm with bptt, unfolding for each time instance --The training framework is the same with Mikolov's rnnlm, Tianxing's XRNN-CPU and Wengong's XRNN-GPU --It uses DAGLayer to simulate RNNLM unfold --TODO: the select_linear now accepts a column vector, instead of a row vector require 'lmptb.lmvocab' require 'lmptb.lmfeeder' require 'lmptb.lmutil' require 'tnn.init' nerv.include('lmptb/layer/init.lua') --[[global function rename]]-- printf = nerv.printf --[[global function rename ends]]-- --global_conf: table --first_time: bool --Returns: a ParamRepo function prepare_parameters(global_conf, first_time) printf("%s preparing parameters...\n", global_conf.sche_log_pre) if (first_time) then ltp_ih = nerv.LinearTransParam("ltp_ih", global_conf) ltp_ih.trans = global_conf.cumat_type(global_conf.vocab:size() + 1, global_conf.hidden_size) --index 0 is for zero, others correspond to vocab index(starting from 1) ltp_ih.trans:generate(global_conf.param_random) ltp_ih.trans[0]:fill(0) ltp_hh = nerv.LinearTransParam("ltp_hh", global_conf) ltp_hh.trans = global_conf.cumat_type(global_conf.hidden_size, global_conf.hidden_size) ltp_hh.trans:generate(global_conf.param_random) ltp_ho = nerv.LinearTransParam("ltp_ho", global_conf) ltp_ho.trans = global_conf.cumat_type(global_conf.hidden_size, global_conf.vocab:size()) ltp_ho.trans:generate(global_conf.param_random) bp_h = nerv.BiasParam("bp_h", global_conf) bp_h.trans = global_conf.cumat_type(1, global_conf.hidden_size) bp_h.trans:generate(global_conf.param_random) bp_o = nerv.BiasParam("bp_o", global_conf) bp_o.trans = global_conf.cumat_type(1, global_conf.vocab:size()) bp_o.trans:generate(global_conf.param_random) local f = nerv.ChunkFile(global_conf.param_fn, 'w') f:write_chunk(ltp_ih) f:write_chunk(ltp_hh) f:write_chunk(ltp_ho) f:write_chunk(bp_h) f:write_chunk(bp_o) f:close() end local paramRepo = nerv.ParamRepo() paramRepo:import({global_conf.param_fn}, nil, global_conf) printf("%s preparing parameters end.\n", global_conf.sche_log_pre) return paramRepo end --global_conf: table --Returns: nerv.LayerRepo function prepare_layers(global_conf, paramRepo) printf("%s preparing layers...\n", global_conf.sche_log_pre) local recurrentLconfig = {{["bp"] = "bp_h", ["ltp_hh"] = "ltp_hh"}, {["dim_in"] = {global_conf.hidden_size, global_conf.hidden_size}, ["dim_out"] = {global_conf.hidden_size}, ["break_id"] = global_conf.vocab:get_sen_entry().id, ["independent"] = global_conf.independent, ["clip"] = 10}} local layers = { ["nerv.IndRecurrentLayer"] = { ["recurrentL1"] = recurrentLconfig, }, ["nerv.SelectLinearLayer"] = { ["selectL1"] = {{["ltp"] = "ltp_ih"}, {["dim_in"] = {1}, ["dim_out"] = {global_conf.hidden_size}, ["vocab"] = global_conf.vocab}}, }, ["nerv.SigmoidLayer"] = { ["sigmoidL1"] = {{}, {["dim_in"] = {global_conf.hidden_size}, ["dim_out"] = {global_conf.hidden_size}}} }, ["nerv.AffineLayer"] = { ["outputL"] = {{["ltp"] = "ltp_ho", ["bp"] = "bp_o"}, {["dim_in"] = {global_conf.hidden_size}, ["dim_out"] = {global_conf.vocab:size()}}}, }, ["nerv.SoftmaxCELayer"] = { ["softmaxL"] = {{}, {["dim_in"] = {global_conf.vocab:size(), global_conf.vocab:size()}, ["dim_out"] = {1}}}, }, } printf("%s adding %d bptt layers...\n", global_conf.sche_log_pre, global_conf.bptt) for i = 1, global_conf.bptt do layers["nerv.IndRecurrentLayer"]["recurrentL" .. (i + 1)] = recurrentLconfig layers["nerv.SigmoidLayer"]["sigmoidL" .. (i + 1)] = {{}, {["dim_in"] = {global_conf.hidden_size}, ["dim_out"] = {global_conf.hidden_size}}} layers["nerv.SelectLinearLayer"]["selectL" .. (i + 1)] = {{["ltp"] = "ltp_ih"}, {["dim_in"] = {1}, ["dim_out"] = {global_conf.hidden_size}, ["vocab"] = global_conf.vocab}} end local layerRepo = nerv.LayerRepo(layers, paramRepo, global_conf) printf("%s preparing layers end.\n", global_conf.sche_log_pre) return layerRepo end --global_conf: table --layerRepo: nerv.LayerRepo --Returns: a nerv.DAGLayer function prepare_dagLayer(global_conf, layerRepo) printf("%s Initing daglayer ...\n", global_conf.sche_log_pre) --input: input_w, input_w, ... input_w_now, last_activation local dim_in_t = {} for i = 1, global_conf.bptt + 1 do dim_in_t[i] = 1 end dim_in_t[global_conf.bptt + 2] = global_conf.hidden_size dim_in_t[global_conf.bptt + 3] = global_conf.vocab:size() --[[ softmax | \ ouptut i(bptt+3) | recurrentL(bptt+1)... recurrentL2-recurrentL1 selectL(bptt+1) selectL2 selectL1 / | | | i(bptt+2) i(bptt+1) i2 i1 ]]-- local connections_t = { ["selectL1[1]"] = "recurrentL1[1]", ["recurrentL1[1]"] = "sigmoidL1[1]", ["sigmoidL1[1]"] = "outputL[1]", ["outputL[1]"] = "softmaxL[1]", ["softmaxL[1]"] = "[1]" } for i = 1, global_conf.bptt, 1 do connections_t["["..i.."]"] = "selectL"..i.."[1]" connections_t["selectL"..(i+1).."[1]"] = "recurrentL"..(i+1).."[1]" connections_t["recurrentL"..(i+1).."[1]"] = "sigmoidL"..(i+1).."[1]" connections_t["sigmoidL"..(i+1).."[1]"] = "recurrentL"..i.."[2]" end connections_t["["..(global_conf.bptt+1).."]"] = "selectL"..(global_conf.bptt+1).."[1]" connections_t["["..(global_conf.bptt+2).."]"] = "recurrentL"..(global_conf.bptt+1).."[2]" connections_t["["..(global_conf.bptt+3).."]"] = "softmaxL[2]" printf("%s printing DAG connections:\n", global_conf.sche_log_pre) for key, value in pairs(connections_t) do printf("\t%s->%s\n", key, value) end local dagL = nerv.DAGLayerT("dagL", global_conf, {["dim_in"] = dim_in_t, ["dim_out"] = {1}, ["sub_layers"] = layerRepo, ["connections"] = connections_t, }) dagL:init(global_conf.batch_size) printf("%s Initing DAGLayer end.\n", global_conf.sche_log_pre) return dagL end --global_conf: table --dagL: nerv.DAGLayer --fn: string --config: table --Returns: table, result function propagateFile(global_conf, dagL, fn, config) printf("%s Begining doing on %s...\n", global_conf.sche_log_pre, fn) if (config.do_train == true) then printf("%s do_train in config is true.\n", global_conf.sche_log_pre) end local feeder = nerv.LMFeeder(global_conf, global_conf.batch_size, global_conf.vocab) feeder:open_file(fn) local tnow = 1 local token_store = {} local hidden_store = {} local sigmoidL_ref = dagL.layers["sigmoidL1"] local inputL_ref = dagL.layers["selectL1"] token_store[tnow] = feeder:get_batch() for i = 1, global_conf.bptt + 1 do hidden_store[tnow - i] = global_conf.cumat_type(global_conf.batch_size, global_conf.hidden_size) hidden_store[tnow - i]:fill(0) token_store[tnow - i] = {} for j = 1, global_conf.batch_size do token_store[tnow - i][j] = global_conf.vocab.null_token end end local dagL_input = {} for i = 1, global_conf.bptt + 1 do dagL_input[i] = global_conf.cumat_type(global_conf.batch_size, 1) --changed to row vector, debughtx end dagL_input[global_conf.bptt + 2] = global_conf.cumat_type(global_conf.batch_size, global_conf.hidden_size) dagL_input[global_conf.bptt + 3] = global_conf.cumat_type(global_conf.batch_size, global_conf.vocab:size()) local dagL_output = {global_conf.cumat_type(global_conf.batch_size, 1)} local dagL_err = {nil} --{global_conf.cumat_type(global_conf.batch_size, 1)} local dagL_input_err = {} for i = 1, global_conf.bptt + 1 do dagL_input_err[i] = nil --global_conf.cumat_type(global_conf.batch_size, global_conf.vocab:size()) end dagL_input_err[global_conf.bptt + 2] = global_conf.cumat_type(global_conf.batch_size, global_conf.hidden_size) dagL_input_err[global_conf.bptt + 3] = global_conf.cumat_type(global_conf.batch_size, global_conf.vocab:size()) local result = nerv.LMResult(global_conf, global_conf.vocab) result:init("rnn") global_conf.input_word_id = {} while (1) do token_store[tnow + 1] = feeder:get_batch() --The next word(to predict) if (token_store[tnow + 1] == nil) then break end --dagL:propagate(dagL_input, dagL_output) for i = 1, global_conf.bptt + 1 do nerv.LMUtil.set_id(dagL_input[i], token_store[tnow - i + 1], global_conf.vocab) global_conf.input_word_id["recurrentL"..i] = dagL_input[i] --for IndRecurrent end dagL_input[global_conf.bptt + 2]:copy_fromd(hidden_store[tnow - global_conf.bptt - 1]) nerv.LMUtil.set_onehot(dagL_input[global_conf.bptt + 3], token_store[tnow + 1], global_conf.vocab) --for softmax --local dagL_input = create_dag_input(global_conf, token_store, hidden_store, tnow) global_conf.timer:tic("dagL-propagate") dagL:propagate(dagL_input, dagL_output) global_conf.timer:toc("dagL-propagate") hidden_store[tnow] = global_conf.cumat_type(global_conf.batch_size, global_conf.hidden_size) hidden_store[tnow]:copy_fromd(sigmoidL_ref.outputs[1][1]) if (config.do_train == true) then global_conf.timer:tic("dagL-back_propagate") dagL:back_propagate(dagL_err, dagL_input_err, dagL_input, dagL_output) global_conf.timer:toc("dagL-back_propagate") global_conf.timer:tic("dagL-update") dagL:update(dagL_err, dagL_input, dagL_output) global_conf.timer:toc("dagL-update") inputL_ref.layer.ltp.trans[0]:fill(0) --afraid that this will be updated in select_linear:update end for i = 1, global_conf.batch_size, 1 do if (token_store[tnow + 1][i] ~= global_conf.vocab.null_token) then result:add("rnn", token_store[tnow + 1][i], math.exp(dagL_output[1][i - 1][0])) if (config.report_word == true) then printf("%s %s: \n", global_conf.sche_log_pre, token_store[tnow + 1][i], i, math.exp(dagL_output[1][i - 1][0])) end end if (result["rnn"].cn_w % global_conf.log_w_num == 0) then printf("%s %d words processed %s.\n", global_conf.sche_log_pre, result["rnn"].cn_w, os.date()) printf("\t%s log prob per sample :%f.\n", global_conf.sche_log_pre, result:logp_sample("rnn")); --[[ for key, value in pairs(global_conf.timer.rec) do printf("\t [global_conf.timer]: time spent on %s:%.5fs\n", key, value) end ]]-- --comment this for debughtx global_conf.timer:flush() --nerv.CuMatrix.print_profile() --nerv.CuMatrix.clear_profile() end end token_store[tnow - 2 - global_conf.bptt] = nil hidden_store[tnow - 2 - global_conf.bptt] = nil collectgarbage("collect") tnow = tnow + 1 end printf("%s Displaying result:\n", global_conf.sche_log_pre) printf("%s %s\n", global_conf.sche_log_pre, result:status("rnn")) printf("%s Doing on %s end.\n", global_conf.sche_log_pre, fn) return result end --returns dagL, paramRepo function load_net(global_conf) local paramRepo = prepare_parameters(global_conf, false) local layerRepo = prepare_layers(global_conf, paramRepo) local dagL = prepare_dagLayer(global_conf, layerRepo) return dagL, paramRepo end --[[global settings]]-- local set = "ptb" if (set == "ptb") then data_dir = "/home/slhome/txh18/workspace/nerv/nerv/nerv/examples/lmptb/PTBdata" train_fn = data_dir.."/ptb.train.txt" valid_fn = data_dir.."/ptb.valid.txt" test_fn = data_dir.."/ptb.test.txt" work_dir_base = "/home/slhome/txh18/workspace/nerv/lmptb-work" global_conf = { lrate = 1, wcost = 1e-6, momentum = 0, cumat_type = nerv.CuMatrixFloat, mmat_type = nerv.MMatrixFloat, hidden_size = 50, batch_size = 10, bptt = 6, --train bptt_block's words. could be set to zero max_iter = 18, param_random = function() return (math.random() / 5 - 0.1) end, independent = true, train_fn = train_fn, valid_fn = valid_fn, test_fn = test_fn, sche_log_pre = "[SCHEDULER]:", log_w_num = 1000, --give a message when log_w_num words have been processed timer = nerv.Timer() } global_conf.work_dir = work_dir_base.."/h"..global_conf.hidden_size.."bp"..global_conf.bptt.."slr"..global_conf.lrate --..os.date("_%bD%dH%H") --comment this for testing global_conf.param_fn = global_conf.work_dir.."/params" elseif (set == "test") then train_fn = "/slfs1/users/txh18/workspace/nerv-project/some-text" valid_fn = "/slfs1/users/txh18/workspace/nerv-project/some-text" test_fn = "/slfs1/users/txh18/workspace/nerv-project/some-text" work_dir = "/slfs1/users/txh18/workspace/nerv-project/lmptb-work-play" global_conf = { lrate = 0.1, wcost = 1e-6, momentum = 0, cumat_type = nerv.CuMatrixFloat, mmat_type = nerv.MMatrixFloat, hidden_size = 5, batch_size = 1, bptt = 0, --train bptt_block's words. could be set to zero max_iter = 15, param_random = function() return (math.random() / 5 - 0.1) end, independent = true, train_fn = train_fn, valid_fn = valid_fn, test_fn = test_fn, work_dir = work_dir, param_fn = work_dir .. "/params", sche_log_pre = "[SCHEDULER]:", log_w_num = 80000, --give a message when log_w_num words have been processed timer = nerv.Timer() } end local vocab = nerv.LMVocab() global_conf["vocab"] = vocab printf("%s printing global_conf...\n", global_conf.sche_log_pre) for key, value in pairs(global_conf) do printf("\t%s=%s\n", key, value) end printf("%s wait 3 seconds...\n", global_conf.sche_log_pre) nerv.LMUtil.wait(3) printf("%s creating work_dir...\n", global_conf.sche_log_pre) os.execute("mkdir -p "..global_conf.work_dir) scheduler = " printf(\"===INITIAL VALIDATION===\\n\") \ dagL, paramRepo = load_net(global_conf) \ printf(\"===INITIAL VALIDATION===\\n\") \ local result = propagateFile(global_conf, dagL, global_conf.valid_fn, {do_train = false, report_word = false}) \ ppl_rec = {} \ lr_rec = {} \ ppl_rec[0] = result:ppl_net(\"rnn\") ppl_last = ppl_rec[0] \ lr_rec[0] = 0 \ printf(\"\\n\") \ local lr_half = false \ for iter = 1, global_conf.max_iter, 1 do \ printf(\"===ITERATION %d LR %f===\\n\", iter, global_conf.lrate) \ global_conf.sche_log_pre = \"[SCHEDULER ITER\"..iter..\" LR\"..global_conf.lrate..\"]:\" \ dagL, paramRepo = load_net(global_conf) \ propagateFile(global_conf, dagL, global_conf.train_fn, {do_train = true, report_word = false}) \ printf(\"===VALIDATION %d===\\n\", iter) \ local result = propagateFile(global_conf, dagL, global_conf.valid_fn, {do_train = false, report_word = false}) \ ppl_rec[iter] = result:ppl_net(\"rnn\") \ lr_rec[iter] = global_conf.lrate \ if (ppl_last / ppl_rec[iter] < 1.03 or lr_half == true) then \ global_conf.lrate = (global_conf.lrate / 2) \ lr_half = true \ end \ if (ppl_rec[iter] < ppl_last) then \ printf(\"%s saving net to file %s...\\n\", global_conf.sche_log_pre, global_conf.param_fn) \ paramRepo:export(global_conf.param_fn, nil) \ ppl_last = ppl_rec[iter] \ else \ printf(\"%s PPL did not improve, rejected...\\n\", global_conf.sche_log_pre) \ end \ printf(\"\\n\") \ nerv.LMUtil.wait(2) \ end \ printf(\"===VALIDATION PPL record===\\n\") \ for i = 0, #ppl_rec do printf(\" \", i, lr_rec[i], ppl_rec[i]) end \ printf(\"\\n\") \ printf(\"===FINAL TEST===\\n\") \ global_conf.sche_log_pre = \"[SCHEDULER FINAL_TEST]:\" \ dagL, _ = load_net(global_conf) \ propagateFile(global_conf, dagL, global_conf.test_fn, {do_train = false, report_word = false})" printf("%s printing schedule:\n", global_conf.sche_log_pre) printf("%s\n", scheduler) printf("%s wait 3 seconds...\n", global_conf.sche_log_pre) nerv.LMUtil.wait(3) --[[global settings end]]-- global_conf.vocab:build_file(global_conf.train_fn) prepare_parameters(global_conf, true) assert(loadstring(scheduler))()