--- Implements the concept of computable but opaque networks built ("compiled")
-- from nested layers.
-- @author Qi Liu
-- @author Ted Yin

--- The class describing a computable but opaque network built from nested
-- layers.
-- @type nerv.Network

local network = nerv.class('nerv.Network')

--- The constructor.
-- @param id the identifier of the network (currently having no effect)
-- @param global_conf a table describing the computation state and providing
-- some global settings
--
-- The following fields in `global_conf` will be used:
--
-- * `use_cpu`: whether to use the CPU for the computation
-- * `mmat_type`: the class used for creating matrices in CPU computation
-- * `cumat_type` (if `use_cpu = false`): the class used for creating matrices
-- in GPU computation
--
-- The following fields in `global_conf` will be altered:
--
-- * `mask`: an array of `chunk_size` length containing binary column vectors
-- indicating whether each frame in a *batch matrix* (i.e. one matrix in a BPTT
-- chunk/"mini-batch") contains valid data (1 indicates data, 0 indicates a
-- hole)
--
-- @param network_conf a table providing settings dedicated to the network.
-- Available fields include:
--
-- * `network`: a `nerv.Layer` instance describing the structure of the network
-- to be compiled
-- * `clip`: a `number` value indicating the clipping threshold (i.e. preserve
-- the values within [-clip, +clip])
-- * `nn_act_default`: a `number` value indicating the value used for filling
-- "holes" in activation values of a batch matrix (0 by default)
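--
-- A minimal construction sketch (not from the original source; `gconf` is
-- assumed to be a prepared `global_conf` table and `graph` a `nerv.GraphLayer`
-- describing the structure to be compiled):
--
--     local net = nerv.Network('network', gconf,
--                              {network = graph,
--                               clip = 5.0,
--                               nn_act_default = 0})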
function network:__init(id, global_conf, network_conf)
    self.id = id
    self.network = network_conf.network
    self.dim_in = self.network.dim_in
    self.dim_out = self.network.dim_out
    self.gconf = global_conf
    if self.gconf.use_cpu then
        self.mat_type = self.gconf.mmat_type
    else
        self.mat_type = self.gconf.cumat_type
    end
    self.clip = network_conf.clip
    self.nn_act_default = network_conf.nn_act_default
    if self.nn_act_default == nil then
        self.nn_act_default = 0
    end

    self.layers = {}
    self.input_conn = {}
    self.output_conn = {}
    self.socket = self:compile(self.network)
    for i = 1, #self.dim_in do
        local edge = self.socket.inputs[i]
        local id, port, time = edge[1], edge[2], edge[3]
        if self.input_conn[id][port] ~= nil then
            nerv.error('duplicate edge')
        end
        if nerv.is_type(self.layers[id], 'nerv.DuplicateLayer') then
            local tmp = nerv.IdentityLayer('', self.gconf,
                            {dim_in = {self.dim_in[i]},
                             dim_out = {self.dim_in[i]}})
            table.insert(self.layers, tmp)
            local new_id = #self.layers
            self.input_conn[new_id] = {{0, i, time}}
            self.output_conn[new_id] = {{id, port, 0}}
            self.input_conn[id][port] = {new_id, 1, 0}
            self.socket.inputs[i] = {new_id, 1, time}
        else
            self.input_conn[id][port] = {0, i, time}
        end
    end
    for i = 1, #self.dim_out do
        local edge = self.socket.outputs[i]
        local id, port, time = edge[1], edge[2], edge[3]
        if self.output_conn[id][port] ~= nil then
            nerv.error('duplicate edge')
        end
        if nerv.is_type(self.layers[id], 'nerv.DuplicateLayer') then
            local tmp = nerv.IdentityLayer('', self.gconf,
                            {dim_in = {self.dim_out[i]},
                             dim_out = {self.dim_out[i]}})
            table.insert(self.layers, tmp)
            local new_id = #self.layers
            self.input_conn[new_id] = {{id, port, 0}}
            self.output_conn[new_id] = {{0, i, time}}
            self.output_conn[id][port] = {new_id, 1, 0}
            self.socket.outputs[i] = {new_id, 1, time}
        else
            self.output_conn[id][port] = {0, i, time}
        end
    end

    self.delay = 0
    for i = 1, #self.layers do
        local dim_in, _ = self.layers[i]:get_dim()
        for j = 1, #dim_in do
            if self.input_conn[i][j] == nil then
                nerv.error('dangling input')
            end
            local time = self.input_conn[i][j][3]
            if math.abs(time) > self.delay then
                self.delay = math.abs(time)
            end
        end
    end

    self.input_edge = {}
    self.output_edge = {}
    for t = -self.delay, self.delay do
        self.input_edge[t] = {}
        self.output_edge[t] = {}
    end
    for i = 1, #self.layers do
        local dim_in, dim_out = self.layers[i]:get_dim()
        for j = 1, #dim_in do
            local time = self.input_conn[i][j][3]
            table.insert(self.input_edge[time], {i, j})
        end
        for j = 1, #dim_out do
            if self.output_conn[i][j] == nil then
                nerv.error('dangling output')
            end
            local time = self.output_conn[i][j][3]
            table.insert(self.output_edge[time], {i, j})
        end
    end
end

function network:compile(layer)
    local socket = {inputs = {}, outputs = {}}
    if not nerv.is_type(layer, 'nerv.GraphLayer') then
        table.insert(self.layers, layer)
        local id = #self.layers
        self.input_conn[id] = {}
        self.output_conn[id] = {}
        local dim_in, dim_out = layer:get_dim()
        for i = 1, #dim_in do
            socket.inputs[i] = {id, i, 0}
        end
        for i = 1, #dim_out do
            socket.outputs[i] = {id, i, 0}
        end
    else
        local sublayer_socket = {}
        for id, sublayer in pairs(layer.layers) do
            if id ~= '<input>' then
                sublayer_socket[sublayer.id] = self:compile(sublayer.layer)
            end
        end
        for _, edge in pairs(layer.connections) do
            -- id = 0 means <input> or <output>
            local id_from, port_from = edge[1], edge[2]
            local id_to, port_to = edge[3], edge[4]
            local time = edge[5]
            if id_from == 0 then
                if socket.inputs[port_from] ~= nil then
                    nerv.error('duplicate input socket')
                end
                local input = sublayer_socket[id_to].inputs[port_to]
                local id, port, t = input[1], input[2], input[3] + time
                socket.inputs[port_from] = {id, port, t}
            else
                local output = sublayer_socket[id_from].outputs[port_from]
                local id, port, t = output[1], output[2], output[3] + time
                if id_to == 0 then
                    if socket.outputs[port_to] ~= nil then
                        nerv.error('duplicate output socket')
                    end
                    socket.outputs[port_to] = {id, port, t}
                else
                    local input = sublayer_socket[id_to].inputs[port_to]
                    local id1, port1, t1 = input[1], input[2], input[3]
                    if self.input_conn[id1][port1] ~= nil or
                        self.output_conn[id][port] ~= nil then
                        nerv.error('duplicate edge')
                    end
                    self.input_conn[id1][port1] = {id, port, t + t1}
                    self.output_conn[id][port] = {id1, port1, t + t1}
                end
            end
        end
    end
    return socket
end

--- Initialize the network for training.
-- To be called once before all epochs; it resolves the structure of the
-- network and allocates the memory for storing temporary values.
-- @param batch_size the number of rows in a batch matrix
-- @param chunk_size the number of batch matrices in a BPTT chunk
function network:init(batch_size, chunk_size)
    self.batch_size = batch_size
    self.chunk_size = chunk_size

    self:topsort()

    self:make_initial_store()
    collectgarbage('collect')

    self.flush = {}
    self.gconf.mask = {}
    for t = 1, self.chunk_size do
        self.flush[t] = {}
        self.gconf.mask[t] = self.mat_type(self.batch_size, 1)
    end
end

--- Initialize the internal state of the network for the new epoch.
-- To be called before each new epoch.
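--
-- A sketch of how `init` and `epoch_init` are typically sequenced (`net`,
-- `batch_size`, `chunk_size` and `n_epoch` are assumed to come from the
-- caller's training setup):
--
--     net:init(batch_size, chunk_size)  -- once, before all epochs
--     for epoch = 1, n_epoch do
--         net:epoch_init()              -- once per epoch
--         -- ... iterate over mini-batches ...
--     end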
function network:epoch_init()
    self.timestamp = 0
    for i = 1, #self.layers do
        self.layers[i]:init(self.batch_size, self.chunk_size)
        for t = 1, self.chunk_size do
            self.flush[t][i] = {timestamp = 0, input = {}, output = {}}
        end
    end
end

function network:topsort()
    nerv.info('network topology sort')
    local degree = {}
    for t = 1, self.chunk_size do
        degree[t] = {}
        for i = 1, #self.layers do
            degree[t][i] = 0
        end
    end

    for t = 1, self.chunk_size do
        for i = 1, #self.layers do
            local _, dim_out = self.layers[i]:get_dim()
            for j = 1, #dim_out do
                local edge = self.output_conn[i][j]
                local id, time = edge[1], edge[3] + t
                if time >= 1 and time <= self.chunk_size and id ~= 0 then
                    degree[time][id] = degree[time][id] + 1
                end
            end
        end
    end

    self.queue = {}
    local l = 1
    local r = 0
    for t = 1, self.chunk_size do
        for i = 1, #self.layers do
            if degree[t][i] == 0 then
                r = r + 1
                self.queue[r] = {chunk = t, id = i}
            end
        end
    end
    while l <= r do
        local t, i = self.queue[l].chunk, self.queue[l].id
        l = l + 1
        local _, dim_out = self.layers[i]:get_dim()
        for j = 1, #dim_out do
            local edge = self.output_conn[i][j]
            local id, time = edge[1], edge[3] + t
            if time >= 1 and time <= self.chunk_size and id ~= 0 then
                degree[time][id] = degree[time][id] - 1
                if degree[time][id] == 0 then
                    r = r + 1
                    self.queue[r] = {chunk = time, id = id}
                end
            end
        end
    end

    if r ~= self.chunk_size * #self.layers then
        nerv.error('loop detected')
    end
end

function network:make_initial_store()
    nerv.info('network initing storage')

    -- allocate memory
    local memory = {}
    local err_memory = {}
    for t = 1 - self.delay, self.chunk_size + self.delay do
        memory[t] = {}
        err_memory[t] = {}
        for i = 1, #self.layers do
            memory[t][i] = {}
            err_memory[t][i] = {}
            local dim_in, dim_out = self.layers[i]:get_dim()
            for j = 1, #dim_in do
                err_memory[t][i][j] = self.mat_type(self.batch_size, dim_in[j])
                err_memory[t][i][j]:fill(0)
            end
            if t < 1 or t > self.chunk_size or
                not nerv.is_type(self.layers[i], 'nerv.DuplicateLayer') then
                for j = 1, #dim_out do
                    memory[t][i][j] = self.mat_type(self.batch_size, dim_out[j])
                    memory[t][i][j]:fill(self.nn_act_default)
                end
            end
        end
        if t < 1 or t > self.chunk_size then
            -- memory[t][0] stores network input
            memory[t][0] = {}
            for j = 1, #self.dim_in do
                memory[t][0][j] = self.mat_type(self.batch_size, self.dim_in[j])
                memory[t][0][j]:fill(self.nn_act_default)
            end
            -- err_memory[t][0] stores network err_input
            err_memory[t][0] = {}
            for j = 1, #self.dim_out do
                err_memory[t][0][j] = self.mat_type(self.batch_size, self.dim_out[j])
                err_memory[t][0][j]:fill(0)
            end
        end
    end

    -- connect memory and reference
    self.input = {}
    self.output = {}
    self.err_input = {}
    self.err_output = {}
    for t = 1, self.chunk_size do
        self.input[t] = {}
        self.output[t] = {}
        self.err_input[t] = {}
        self.err_output[t] = {}
        for i = 1, #self.layers do
            self.input[t][i] = {}
            self.output[t][i] = {}
            self.err_input[t][i] = {}
            self.err_output[t][i] = {}
            local dim_in, dim_out = self.layers[i]:get_dim()
            for j = 1, #dim_in do
                local edge = self.input_conn[i][j]
                local id, port, time = edge[1], edge[2], edge[3]
                if id ~= 0 or t - time < 1 or t - time > self.chunk_size then
                    self.input[t][i][j] = memory[t - time][id][port]
                end
                if id ~= 0 then
                    self.err_output[t][i][j] = err_memory[t][i][j]
                end
            end
            for j = 1, #dim_out do
                local edge = self.output_conn[i][j]
                local id, port, time = edge[1], edge[2], edge[3]
                if id ~= 0 then
                    self.output[t][i][j] = memory[t][i][j]
                end
                if id ~= 0 or t + time < 1 or t + time > self.chunk_size then
                    self.err_input[t][i][j] = err_memory[t + time][id][port]
                end
            end
        end
    end

    -- reference copy for duplicate layer
    for i = 1, #self.queue do
        local t, id = self.queue[i].chunk, self.queue[i].id
        if nerv.is_type(self.layers[id], 'nerv.DuplicateLayer') then
            local _, dim_out = self.layers[id]:get_dim()
            for j = 1, #dim_out do
                if self.output[t][id][j] ~= nil then
                    nerv.error('duplicate output reference not nil')
                end
                self.output[t][id][j] = self.input[t][id][1]
                local edge = self.output_conn[id][j]
                local to, port, time = edge[1], edge[2], edge[3] + t
                if time >= 1 and time <= self.chunk_size then
                    if self.input[time][to][port] ~= nil then
                        nerv.error('duplicate input reference not nil')
                    end
                    self.input[time][to][port] = self.output[t][id][j]
                end
            end
        end
    end

    -- check dangling reference
    for t = 1, self.chunk_size do
        for i = 1, #self.dim_in do
            local edge = self.socket.inputs[i]
            local id, port, time = edge[1], edge[2], edge[3]
            if t + time >= 1 and t + time <= self.chunk_size then
                if self.input[t + time][id][port] ~= nil then
                    nerv.error('input reference not nil')
                end
                self.input[t + time][id][port] = true -- just a placeholder
                if self.err_output[t + time][id][port] ~= nil then
                    nerv.error('err_output reference not nil')
                end
                self.err_output[t + time][id][port] = true -- just a placeholder
            end
        end
        for i = 1, #self.dim_out do
            local edge = self.socket.outputs[i]
            local id, port, time = edge[1], edge[2], edge[3]
            if t - time >= 1 and t - time <= self.chunk_size then
                if self.output[t - time][id][port] ~= nil then
                    nerv.error('output reference not nil')
                end
                self.output[t - time][id][port] = true -- just a placeholder
                if self.err_input[t - time][id][port] ~= nil then
                    nerv.error('err_input reference not nil')
                end
                self.err_input[t - time][id][port] = true -- just a placeholder
            end
        end
    end
    for t = 1, self.chunk_size do
        for i = 1, #self.layers do
            local dim_in, dim_out = self.layers[i]:get_dim()
            for j = 1, #dim_in do
                if self.input[t][i][j] == nil then
                    print(t, i, j, self.layers[i].id)
                    nerv.error('input reference dangling')
                end
                if self.err_output[t][i][j] == nil then
                    nerv.error('err_output reference dangling')
                end
            end
            for j = 1, #dim_out do
                if self.output[t][i][j] == nil then
                    nerv.error('output reference dangling')
                end
                if self.err_input[t][i][j] == nil then
                    nerv.error('err_input reference dangling')
                end
            end
        end
    end

    -- allocate reference for legacy of previous mini-batch
    self.legacy = {}
    for t = 1 - self.delay, 0 do
        self.legacy[t] = {}
        for i = 1, #self.layers do
            self.legacy[t][i] = {}
        end
    end
    for d = 1, self.delay do
        for t = 1 - d, 0 do
            if t + self.chunk_size >= 1 then
                for i = 1, #self.output_edge[d] do
                    local edge = self.output_edge[d][i]
                    local id, port = edge[1], edge[2]
                    self.legacy[t][id][port] = memory[t][id][port]
                end
            end
        end
    end
end

function network:set_input(input)
    for t = 1, self.chunk_size do
        for i = 1, #self.dim_in do
            local edge = self.socket.inputs[i]
            local id, port, time = edge[1], edge[2], edge[3]
            if t + time >= 1 and t + time <= self.chunk_size then
                self.input[t + time][id][port] = input[i][t]
            end
        end
    end
end

function network:set_output(output)
    for t = 1, self.chunk_size do
        for i = 1, #self.dim_out do
            local edge = self.socket.outputs[i]
            local id, port, time = edge[1], edge[2], edge[3]
            if t - time >= 1 and t - time <= self.chunk_size then
                self.output[t - time][id][port] = output[i][t]
            end
        end
    end
end

function network:set_err_input(err_input)
    for t = 1, self.chunk_size do
        for i = 1, #self.dim_out do
            local edge = self.socket.outputs[i]
            local id, port, time = edge[1], edge[2], edge[3]
            if t - time >= 1 and t - time <= self.chunk_size then
                self.err_input[t - time][id][port] = err_input[i][t]
            end
        end
    end
end

function network:set_err_output(err_output)
    for t = 1, self.chunk_size do
        for i = 1, #self.dim_in do
            local edge = self.socket.inputs[i]
            local id, port, time = edge[1], edge[2], edge[3]
            if t + time >= 1 and t + time <= self.chunk_size then
                self.err_output[t + time][id][port] = err_output[i][t]
            end
        end
    end
end

--- Initialize the internal state of the network for the new mini-batch (a
-- BPTT chunk).
-- To be called before each propagation/back-propagation.
-- @param info a table containing information needed for the current
-- mini-batch computation. The following fields must be supplied:
--
-- * `input`: an array of `chunk_size` row-major batch matrices, each with
-- `batch_size` rows
-- * `output`: similar to `input`, but the matrices have a different number of
-- columns (matching the width of the network output, which is typically 1 for
-- criteria, i.e. a single column holding the error); used to hold the output
-- of the network
-- * `seq_length`: a table containing the length (number of frames) of each
-- sequence (utterance)
-- * `new_seq`: a table containing the indices of batch matrix rows that are
-- the first frames of a sequence
-- * `do_train`: a bool value indicating whether to update the network
--
-- If `do_train` is true, two additional fields are required:
--
-- * `err_input`: an array with the same structure as `output` but containing
-- the initial values for computing errors in back-propagation (when the width
-- of the output is 1, `gconf.mask` is typically used here to ignore the
-- invalid values produced by "holes" in the mini-batch)
-- * `err_output`: an array with the same structure as `input`. Although its
-- values are mostly of no interest, it must be allocated to unify the
-- computation and ease the implementation.
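--
-- A hedged sketch of a typical `info` table (illustrative only; `input_data`,
-- `output_data`, `seq_length`, `new_seq`, `err_input_data` and
-- `err_output_data` are assumed to be prepared by the caller as described
-- above):
--
--     local info = {input = input_data,
--                   output = output_data,
--                   seq_length = seq_length,
--                   new_seq = new_seq,
--                   do_train = true,
--                   err_input = err_input_data,
--                   err_output = err_output_data}
--     net:mini_batch_init(info)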
function network:mini_batch_init(info)
    self.info = info
    self:set_input(self.info.input)
    self:set_output(self.info.output)
    if self.info.do_train then
        self:set_err_input(self.info.err_input)
        self:set_err_output(self.info.err_output)
    end

    -- calculate mask
    for t = 1, self.chunk_size do
        local tmp = self.gconf.mmat_type(self.batch_size, 1)
        for i = 1, self.batch_size do
            if t <= self.info.seq_length[i] then
                tmp[i - 1][0] = 1
            else
                tmp[i - 1][0] = 0
            end
        end
        self.gconf.mask[t]:copy_fromh(tmp)
    end

    -- calculate max length
    self.max_length = 0
    for i = 1, self.batch_size do
        self.max_length = math.max(self.max_length, self.info.seq_length[i])
    end

    -- calculate border
    self.timestamp = self.timestamp + 1
    for i = 1, self.batch_size do
        local border = self.info.seq_length[i]
        for d = 1, self.delay do
            for t = border + 1, border + d do
                if t > self.max_length then
                    break
                end
                for j = 1, #self.output_edge[-d] do
                    local edge = self.output_edge[-d][j]
                    local id, port = edge[1], edge[2]
                    local flush = self.flush[t][id]
                    if flush.timestamp ~= self.timestamp then
                        flush.timestamp = self.timestamp
                        flush.input = {}
                        flush.output = {}
                    end
                    table.insert(flush.output, {port, i})
                end
            end
            if self.info.do_train then
                for t = border, border - d + 1, -1 do
                    if t < 1 then
                        break
                    end
                    for j = 1, #self.input_edge[-d] do
                        local edge = self.input_edge[-d][j]
                        local id, port = edge[1], edge[2]
                        local flush = self.flush[t][id]
                        if flush.timestamp ~= self.timestamp then
                            flush.timestamp = self.timestamp
                            flush.input = {}
                            flush.output = {}
                        end
                        table.insert(flush.input, {port, i})
                    end
                end
            end
        end
    end

    -- flush border gradient
    if self.info.do_train then
        local border = self.max_length
        for d = 1, self.delay do
            for t = border + 1, border + d do
                if t > self.chunk_size then
                    break
                end
                for j = 1, #self.input_edge[d] do
                    local edge = self.input_edge[d][j]
                    local id, port = edge[1], edge[2]
                    self.err_output[t][id][port]:fill(0)
                end
            end
        end
    end

    -- copy legacy
    for d = 1, self.delay do
        for t = 1 - d, 0 do
            if t + self.chunk_size >= 1 then
                for i = 1, #self.output_edge[d] do
                    local edge = self.output_edge[d][i]
                    local id, port = edge[1], edge[2]
                    if self.output_conn[id][port][1] ~= 0 then
                        self.legacy[t][id][port]:copy_from(self.output[t + self.chunk_size][id][port])
                    end
                    for j = 1, #self.info.new_seq do
                        local batch = self.info.new_seq[j]
                        self.legacy[t][id][port][batch - 1]:fill(self.nn_act_default)
                    end
                end
            end
        end
    end
end

--- Perform a forward propagation.
function network:propagate()
    for i = 1, #self.queue do
        local t, id = self.queue[i].chunk, self.queue[i].id
        if t <= self.max_length then
            self.layers[id]:propagate(self.input[t][id], self.output[t][id], t)
        end
        -- flush border activation
        if self.flush[t][id].timestamp == self.timestamp then
            for j = 1, #self.flush[t][id].output do
                local border = self.flush[t][id].output[j]
                local port, batch = border[1], border[2]
                self.output[t][id][port][batch - 1]:fill(self.nn_act_default)
            end
        end
    end
end

--- Perform a backward propagation to calculate the gradients used for update.
function network:back_propagate()
    for i = #self.queue, 1, -1 do
        local t, id = self.queue[i].chunk, self.queue[i].id
        if t <= self.max_length then
            self.layers[id]:back_propagate(self.err_input[t][id], self.err_output[t][id],
                                           self.input[t][id], self.output[t][id], t)
            -- gradient clip
            if self.clip ~= nil then
                local dim_in, _ = self.layers[id]:get_dim()
                for j = 1, #dim_in do
                    self.err_output[t][id][j]:clip(-self.clip, self.clip)
                end
            end
        end
        -- flush border gradient
        if self.flush[t][id].timestamp == self.timestamp then
            for j = 1, #self.flush[t][id].input do
                local border = self.flush[t][id].input[j]
                local port, batch = border[1], border[2]
                self.err_output[t][id][port][batch - 1]:fill(0)
            end
        end
    end
end

--- Update the parameters bound to each layer.
function network:update()
    for i = 1, #self.layers do
        self.layers[i]:update()
    end
end

--- Set an attribute of the underlying network (delegated to the compiled layer).
function network:set_attr(name, value)
    self.network:set_attr(name, value)
end

--- Get a sublayer of the underlying network by its id (delegated to the compiled layer).
function network:get_sublayer(id)
    return self.network:get_sublayer(id)
end

--- Get the parameters bound to the underlying network (delegated to the compiled layer).
function network:get_params()
    return self.network:get_params()
end
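
-- A per-mini-batch driver sketch (assumptions: `net` is an initialized
-- `nerv.Network` and `info` is prepared as documented for `mini_batch_init`):
--
--     net:mini_batch_init(info)
--     net:propagate()
--     if info.do_train then
--         net:back_propagate()
--         net:update()
--     end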