From b08da1fef90e93b188704056cdae651d7865f98d Mon Sep 17 00:00:00 2001
From: Qi Liu
Date: Mon, 14 Mar 2016 20:07:15 +0800
Subject: speedup border flush

---
 Makefile                               |   5 +-
 nerv/examples/network_debug/config.lua |   2 +-
 nerv/examples/network_debug/main.lua   |   2 +-
 nerv/examples/network_debug/reader.lua |   4 +-
 nerv/nn/network.lua                    | 215 ++++++++++++++++++++-------------
 5 files changed, 141 insertions(+), 87 deletions(-)

diff --git a/Makefile b/Makefile
index 28012da..3995d21 100644
--- a/Makefile
+++ b/Makefile
@@ -1,4 +1,4 @@
-.PHONY: all clean install luajit luarocks speech
+.PHONY: all clean install luajit luarocks speech submodule
 ############## EDIT THESE LINES #####################
 SHELL := /bin/bash
 PREFIX := $(CURDIR)/install/
@@ -26,7 +26,8 @@ export BLAS_LDFLAGS
 	nerv-clean speech/speech_utils-clean speech/htk_io-clean speech/kaldi_io-clean speech/kaldi_decode-clean \
 	Penlight
 
-all: luajit luarocks Penlight nerv
+all: nerv
+submodule: luajit luarocks Penlight
 luajit:
 	PREFIX=$(PREFIX) ./tools/build_luajit.sh
 luarocks:
diff --git a/nerv/examples/network_debug/config.lua b/nerv/examples/network_debug/config.lua
index 9025b78..093bde2 100644
--- a/nerv/examples/network_debug/config.lua
+++ b/nerv/examples/network_debug/config.lua
@@ -12,7 +12,7 @@ function get_global_conf()
         layer_num = 1,
         chunk_size = 15,
         batch_size = 20,
-        max_iter = 3,
+        max_iter = 1,
         param_random = function() return (math.random() / 5 - 0.1) end,
         dropout_rate = 0.5,
         timer = nerv.Timer(),
diff --git a/nerv/examples/network_debug/main.lua b/nerv/examples/network_debug/main.lua
index 1bee43c..bbcdb6c 100644
--- a/nerv/examples/network_debug/main.lua
+++ b/nerv/examples/network_debug/main.lua
@@ -20,12 +20,12 @@ for i = 1, global_conf.max_iter do
     local train_reader = nerv.Reader(data_path .. 'vocab', data_path .. 'ptb.train.txt.adds')
     local val_reader = nerv.Reader(data_path .. 'vocab', data_path .. 'ptb.valid.txt.adds')
     local train_ppl, val_ppl = NN:epoch(train_reader, val_reader)
+    nerv.printf('Epoch %d: %f %f %f\n', i, global_conf.lrate, train_ppl, val_ppl)
     if val_ppl < best_cv then
         best_cv = val_ppl
     else
         global_conf.lrate = global_conf.lrate / 2.0
     end
-    nerv.printf('Epoch %d: %f %f %f\n', i, global_conf.lrate, train_ppl, val_ppl)
     timer:toc('Epoch' .. i)
     timer:check('Epoch' .. i)
     io.flush()
diff --git a/nerv/examples/network_debug/reader.lua b/nerv/examples/network_debug/reader.lua
index 70c0c97..76a78cf 100644
--- a/nerv/examples/network_debug/reader.lua
+++ b/nerv/examples/network_debug/reader.lua
@@ -32,8 +32,8 @@ end
 function Reader:get_seq(input_file)
     local f = io.open(input_file, 'r')
     self.seq = {}
-    -- while true do
-    for i = 1, 26 do
+    while true do
+    -- for i = 1, 26 do
         local seq = f:read()
         if seq == nil then
             break
diff --git a/nerv/nn/network.lua b/nerv/nn/network.lua
index 2cb83ce..910cdad 100644
--- a/nerv/nn/network.lua
+++ b/nerv/nn/network.lua
@@ -16,6 +16,7 @@ function network:__init(id, global_conf, network_conf)
     if self.nn_act_default == nil then
         self.nn_act_default = 0
     end
+
     self.layers = {}
     self.input_conn = {}
     self.output_conn = {}
@@ -36,16 +37,41 @@ function network:__init(id, global_conf, network_conf)
         end
         self.output_conn[id][port] = {0, i, time}
     end
+
     self.delay = 0
     for i = 1, #self.layers do
         local dim_in, _ = self.layers[i]:get_dim()
         for j = 1, #dim_in do
+            if self.input_conn[i][j] == nil then
+                nerv.error('dangling input')
+            end
             local time = self.input_conn[i][j][3]
             if math.abs(time) > self.delay then
                 self.delay = math.abs(time)
             end
         end
     end
+
+    self.input_edge = {}
+    self.output_edge = {}
+    for t = -self.delay, self.delay do
+        self.input_edge[t] = {}
+        self.output_edge[t] = {}
+    end
+    for i = 1, #self.layers do
+        local dim_in, dim_out = self.layers[i]:get_dim()
+        for j = 1, #dim_in do
+            local time = self.input_conn[i][j][3]
+            table.insert(self.input_edge[time], {i, j})
+        end
+        for j = 1, #dim_out do
+            if self.output_conn[i][j] == nil then
+                nerv.error('dangling output')
+            end
+            local time = self.output_conn[i][j][3]
+            table.insert(self.output_edge[time], {i, j})
+        end
+    end
 end
 
 function network:compile(layer)
@@ -112,11 +138,20 @@ function network:init(batch_size, chunk_size)
 
     self:make_initial_store()
     collectgarbage('collect')
+
+    self.flush = {}
+    for t = 1, self.chunk_size do
+        self.flush[t] = {}
+    end
 end
 
 function network:epoch_init()
+    self.timestamp = 0
     for i = 1, #self.layers do
         self.layers[i]:init(self.batch_size, self.chunk_size)
+        for t = 1, self.chunk_size do
+            self.flush[t][i] = {timestamp = 0, input = {}, output = {}}
+        end
     end
 end
 
@@ -134,12 +169,10 @@ function network:topsort()
         for i = 1, #self.layers do
             local _, dim_out = self.layers[i]:get_dim()
             for j = 1, #dim_out do
-                if self.output_conn[i][j] ~= nil then
-                    local edge = self.output_conn[i][j]
-                    local id, time = edge[1], edge[3] + t
-                    if time >= 1 and time <= self.chunk_size and id ~= 0 then
-                        degree[time][id] = degree[time][id] + 1
-                    end
+                local edge = self.output_conn[i][j]
+                local id, time = edge[1], edge[3] + t
+                if time >= 1 and time <= self.chunk_size and id ~= 0 then
+                    degree[time][id] = degree[time][id] + 1
                 end
             end
         end
@@ -161,15 +194,13 @@
             l = l + 1
             local _, dim_out = self.layers[i]:get_dim()
             for j = 1, #dim_out do
-                if self.output_conn[i][j] ~= nil then
-                    local edge = self.output_conn[i][j]
-                    local id, time = edge[1], edge[3] + t
-                    if time >= 1 and time <= self.chunk_size and id ~= 0 then
-                        degree[time][id] = degree[time][id] - 1
-                        if degree[time][id] == 0 then
-                            r = r + 1
-                            self.queue[r] = {chunk = time, id = id}
-                        end
+                local edge = self.output_conn[i][j]
+                local id, time = edge[1], edge[3] + t
+                if time >= 1 and time <= self.chunk_size and id ~= 0 then
+                    degree[time][id] = degree[time][id] - 1
+                    if degree[time][id] == 0 then
+                        r = r + 1
+                        self.queue[r] = {chunk = time, id = id}
                     end
                 end
             end
@@ -202,17 +233,19 @@ function network:make_initial_store()
                 memory[t][i][j]:fill(self.nn_act_default)
             end
         end
-        -- memory[t][0] stores network input
-        memory[t][0] = {}
-        for j = 1, #self.dim_in do
-            memory[t][0][j] = self.mat_type(self.batch_size, self.dim_in[j])
-            memory[t][0][j]:fill(self.nn_act_default)
-        end
-        -- err_memory[t][0] stores network err_input
-        err_memory[t][0] = {}
-        for j = 1, #self.dim_out do
-            err_memory[t][0][j] = self.mat_type(self.batch_size, self.dim_out[j])
-            err_memory[t][0][j]:fill(0)
+        if t < 1 or t > self.chunk_size then
+            -- memory[t][0] stores network input
+            memory[t][0] = {}
+            for j = 1, #self.dim_in do
+                memory[t][0][j] = self.mat_type(self.batch_size, self.dim_in[j])
+                memory[t][0][j]:fill(self.nn_act_default)
+            end
+            -- err_memory[t][0] stores network err_input
+            err_memory[t][0] = {}
+            for j = 1, #self.dim_out do
+                err_memory[t][0][j] = self.mat_type(self.batch_size, self.dim_out[j])
+                err_memory[t][0][j]:fill(0)
+            end
         end
     end
 
@@ -314,9 +347,14 @@
         self.legacy[t] = {}
         for i = 1, #self.layers do
             self.legacy[t][i] = {}
-            local _, dim_out = self.layers[i]:get_dim()
-            for j = 1, #dim_out do
-                self.legacy[t][i][j] = memory[t][i][j]
+        end
+    end
+    for d = 1, self.delay do
+        for t = 1 - d, 0 do
+            for i = 1, #self.output_edge[d] do
+                local edge = self.output_edge[d][i]
+                local id, port = edge[1], edge[2]
+                self.legacy[t][id][port] = memory[t][id][port]
             end
         end
     end
@@ -383,59 +421,74 @@ function network:mini_batch_init(info)
     self.info = info
     self:set_input(self.info.input)
     self:set_output(self.info.output)
+    if self.info.do_train then
+        self:set_err_input(self.info.err_input)
+        self:set_err_output(self.info.err_output)
+    end
 
     -- calculate border
     self.max_length = 0
-    self.border = {}
-    for i = 1, self.chunk_size do
-        self.border[i] = {}
-    end
+    self.timestamp = self.timestamp + 1
     for i = 1, self.batch_size do
         if self.info.seq_length[i] > self.max_length then
             self.max_length = self.info.seq_length[i]
         end
-        for t = 1, self.delay do
-            local chunk = self.info.seq_length[i] + t
-            if chunk > self.chunk_size then
-                break
+        local border = self.info.seq_length[i]
+        for d = 1, self.delay do
+            for t = border + 1, border + d do
+                if t > self.chunk_size then
+                    break
+                end
+                for j = 1, #self.output_edge[-d] do
+                    local edge = self.output_edge[-d][j]
+                    local id, port = edge[1], edge[2]
+                    local flush = self.flush[t][id]
+                    if flush.timestamp ~= self.timestamp then
+                        flush.timestamp = self.timestamp
+                        flush.input = {}
+                        flush.output = {}
+                    end
+                    table.insert(flush.output, {port, i})
+                end
+            end
+            if self.info.do_train then
+                for t = border, border - d + 1, -1 do
+                    if t < 1 then
+                        break
+                    end
+                    for j = 1, #self.input_edge[-d] do
+                        local edge = self.input_edge[-d][j]
+                        local id, port = edge[1], edge[2]
+                        local flush = self.flush[t][id]
+                        if flush.timestamp ~= self.timestamp then
+                            flush.timestamp = self.timestamp
+                            flush.input = {}
+                            flush.output = {}
+                        end
+                        table.insert(flush.input, {port, i})
+                    end
+                end
             end
-            table.insert(self.border[chunk], i)
         end
     end
 
     -- copy legacy
-    for t = 1 - self.delay, 0 do
-        for i = 1, #self.layers do
-            local _, dim_out = self.layers[i]:get_dim()
-            for j = 1, #dim_out do
-                if t + self.chunk_size >= 1 and self.output_conn[i][j][1] ~= 0 then
-                    self.legacy[t][i][j]:copy_from(self.output[t + self.chunk_size][i][j])
+    for d = 1, self.delay do
+        for t = 1 - d, 0 do
+            for i = 1, #self.output_edge[d] do
+                local edge = self.output_edge[d][i]
+                local id, port = edge[1], edge[2]
+                if t + self.chunk_size >= 1 and self.output_conn[id][port][1] ~= 0 then
+                    self.legacy[t][id][port]:copy_from(self.output[t + self.chunk_size][id][port])
                 end
-                for k = 1, #self.info.new_seq do
-                    local batch = self.info.new_seq[k]
-                    self.legacy[t][i][j][batch - 1]:fill(self.nn_act_default)
+                for j = 1, #self.info.new_seq do
+                    local batch = self.info.new_seq[j]
+                    self.legacy[t][id][port][batch - 1]:fill(self.nn_act_default)
                 end
             end
         end
     end
 
-    if self.info.do_train then
-        self:set_err_input(self.info.err_input)
-        self:set_err_output(self.info.err_output)
-
-        -- flush border gradient
-        for t = self.max_length + 1, self.max_length + self.delay do
-            if t > self.chunk_size then
-                break
-            end
-            for i = 1, #self.layers do
-                local dim_in, _ = self.layers[i]:get_dim()
-                for j = 1, #dim_in do
-                    self.err_output[t][i][j]:fill(0)
-                end
-            end
-        end
-    end
 end
 
 function network:propagate()
@@ -445,11 +498,11 @@
             self.layers[id]:propagate(self.input[t][id], self.output[t][id], t)
         end
         -- flush border activation
-        for j = 1, #self.border[t] do
-            local batch = self.border[t][j]
-            local _, dim_out = self.layers[id]:get_dim()
-            for k = 1, #dim_out do
-                self.output[t][id][k][batch - 1]:fill(self.nn_act_default)
+        if self.flush[t][id].timestamp == self.timestamp then
+            for j = 1, #self.flush[t][id].output do
+                local border = self.flush[t][id].output[j]
+                local port, batch = border[1], border[2]
+                self.output[t][id][port][batch - 1]:fill(self.nn_act_default)
             end
         end
     end
@@ -459,15 +512,8 @@ function network:back_propagate()
     for i = #self.queue, 1, -1 do
         local t, id = self.queue[i].chunk, self.queue[i].id
         if t <= self.max_length then
-            -- flush border gradient
-            for j = 1, #self.border[t] do
-                local batch = self.border[t][j]
-                local _, dim_out = self.layers[id]:get_dim()
-                for k = 1, #dim_out do
-                    self.err_input[t][id][k][batch - 1]:fill(0)
-                end
-            end
             self.layers[id]:back_propagate(self.err_input[t][id], self.err_output[t][id], self.input[t][id], self.output[t][id], t)
+            -- gradient clip
             if self.clip ~= nil then
                 local dim_in, _ = self.layers[id]:get_dim()
                 for j = 1, #dim_in do
@@ -475,14 +521,21 @@ function network:back_propagate()
                 end
             end
         end
+        -- flush border gradient
+        if self.flush[t][id].timestamp == self.timestamp then
+            for j = 1, #self.flush[t][id].input do
+                local border = self.flush[t][id].input[j]
+                local port, batch = border[1], border[2]
+                self.err_output[t][id][port][batch - 1]:fill(0)
+            end
+        end
     end
 end
 
 function network:update()
-    for i = 1, #self.queue do
-        local t, id = self.queue[i].chunk, self.queue[i].id
-        if t <= self.max_length then
-            self.layers[id]:update(self.err_input[t][id], self.input[t][id], self.output[t][id], t)
+    for t = 1, self.max_length do
+        for i = 1, #self.layers do
+            self.layers[i]:update(self.err_input[t][i], self.input[t][i], self.output[t][i], t)
         end
     end
 end
--
cgit v1.2.3-70-g09d2
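Editor's note (not part of the patch): below is a minimal, standalone Lua sketch of the bookkeeping idea this commit introduces. Instead of rebuilding a per-chunk border list every mini-batch and wiping every output port of every layer at a sequence border, each (chunk, layer) slot keeps a flush record that is reset lazily when its timestamp falls behind the network-wide one, and only the recorded (port, batch) entries are cleared. The helper names (new_net, mark_flush, apply_flush) are illustrative assumptions, not the nerv API.

-- Illustrative sketch of the lazy, timestamp-based border flush.
-- This is NOT nerv code; it only mirrors the idea used in network.lua.
local function new_net(chunk_size, n_layers)
    local net = {timestamp = 0, flush = {}}
    for t = 1, chunk_size do
        net.flush[t] = {}
        for i = 1, n_layers do
            -- one record per (chunk, layer); reused across mini-batches
            net.flush[t][i] = {timestamp = 0, output = {}}
        end
    end
    return net
end

-- Record that output `port` of layer `id` at chunk `t` must be cleared
-- for sequence `batch`. The record is reset lazily: only when it is
-- first touched in the current mini-batch (timestamp mismatch).
local function mark_flush(net, t, id, port, batch)
    local flush = net.flush[t][id]
    if flush.timestamp ~= net.timestamp then
        flush.timestamp = net.timestamp
        flush.output = {}
    end
    table.insert(flush.output, {port, batch})
end

-- During propagation, only records stamped in the current mini-batch are
-- consulted, so stale entries from earlier mini-batches are simply skipped.
local function apply_flush(net, t, id, clear)
    local flush = net.flush[t][id]
    if flush.timestamp == net.timestamp then
        for _, border in ipairs(flush.output) do
            clear(border[1], border[2])  -- (port, batch) to zero out
        end
    end
end

-- usage: one mini-batch
local net = new_net(15, 3)
net.timestamp = net.timestamp + 1      -- network:mini_batch_init does this
mark_flush(net, 4, 2, 1, 7)            -- sequence 7 ends right before chunk 4
apply_flush(net, 4, 2, function(port, batch)
    print(('clear layer 2, chunk 4, port %d, batch %d'):format(port, batch))
end)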