aboutsummaryrefslogtreecommitdiff
path: root/nerv/examples/asr_trainer.lua
blob: 5001e1239a2b4fa4c30e59bea6b11cd51618765a (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
require 'lfs'
require 'pl'
--- Build a closure that runs one full pass over a data set.
--
-- The parameters found in `ifname` are imported into a fresh nerv.ParamRepo,
-- from which the layer repo, the network and the global transformation are
-- created.  The helpers called here (make_layer_repo, get_network,
-- get_global_transf, get_input_order, make_readers, make_buffer, print_stat,
-- get_accuracy) are globals that are not defined in this file; presumably
-- they are provided by the network config loaded by the caller -- TODO
-- confirm.
--
-- @param ifname parameter source handed to ParamRepo:import
-- @return a function (prefix, scp_file, bp) returning get_accuracy(layer_repo):
--   prefix   -- when non-nil and `bp` is false, the network parameters are
--               exported to "<prefix>_cv<accuracy>.nerv" after the pass
--   scp_file -- handed to make_readers to build the data buffer
--   bp       -- when true, back-propagate and update after every batch;
--               also stored in gconf.randomize
local function build_trainer(ifname)
    local param_repo = nerv.ParamRepo()
    param_repo:import(ifname, nil, gconf)
    local layer_repo = make_layer_repo(param_repo)
    local network = get_network(layer_repo)
    local global_transf = get_global_transf(layer_repo)
    local input_order = get_input_order()
    -- pick the host or the CUDA matrix class once for all passes
    local mat_type
    if gconf.use_cpu then
        mat_type = gconf.mmat_type
    else
        mat_type = gconf.cumat_type
    end
    local iterative_trainer = function (prefix, scp_file, bp)
        gconf.randomize = bp
        -- build buffer
        local buffer = make_buffer(make_readers(scp_file, layer_repo))
        -- initialize the network
        network:init(gconf.batch_size)
        gconf.cnt = 0
        -- kept local: these used to leak into the global namespace
        local err_input = {mat_type(gconf.batch_size, 1)}
        err_input[1]:fill(1)
        for data in buffer.get_data, buffer do
            -- print stat periodically (every 1000 batches)
            gconf.cnt = gconf.cnt + 1
            if gconf.cnt == 1000 then
                print_stat(layer_repo)
                mat_type.print_profile()
                mat_type.clear_profile()
                gconf.cnt = 0
                -- break
            end
            local input = {}
--            if gconf.cnt == 1000 then break end
            -- collect the network inputs in the order given by input_order
            for _, e in ipairs(input_order) do
                local id = e.id
                if data[id] == nil then
                    nerv.error("input data %s not found", id)
                end
                local transformed
                if e.global_transf then
                    transformed = nerv.speech_utils.global_transf(data[id],
                                        global_transf,
                                        gconf.frm_ext or 0, 0,
                                        gconf)
                else
                    transformed = data[id]
                end
                table.insert(input, transformed)
            end
            local output = {mat_type(gconf.batch_size, 1)}
            -- one error matrix per input, created from the input itself
            local err_output = {}
            for i = 1, #input do
                table.insert(err_output, input[i]:create())
            end
            network:propagate(input, output)
            if bp then
                network:back_propagate(err_input, err_output, input, output)
                network:update(err_input, input, output)
            end
            -- collect garbage in-time to save GPU memory
            collectgarbage("collect")
        end
        print_stat(layer_repo)
        mat_type.print_profile()
        mat_type.clear_profile()
        -- only passes without back-propagation write the parameters back
        if (not bp) and prefix ~= nil then
            nerv.info("writing back...")
            local fname = string.format("%s_cv%.3f.nerv",
                            prefix, get_accuracy(layer_repo))
            network:get_params():export(fname, nil)
        end
        return get_accuracy(layer_repo)
    end
    return iterative_trainer
end

--- Resolve every setting named in `spec` and store the result in `gconf`.
-- For each key the first truthy candidate wins, in this order: the value of
-- the matching command-line option (underscores in the key replaced by
-- dashes), the value already present in `gconf`, the default from `spec`.
-- NOTE(review): the candidates are tested for truthiness, so an option or
-- config value of `false` falls through to the next candidate.
-- @param spec table mapping setting names to their default values
local function check_and_add_defaults(spec)
    for key, default in pairs(spec) do
        local opt_name = string.gsub(key, '_', '-')
        local chosen = opts[opt_name].val
        if not chosen then
            chosen = gconf[key]
        end
        if not chosen then
            chosen = default
        end
        gconf[key] = chosen
    end
end

--- Turn a table of defaults into a list of option specifications.
-- Every key of `spec` yields one entry of the form
-- {long_name, nil, type_name, default = value}, where long_name is the key
-- with underscores replaced by dashes and type_name is type(value).
-- The entries are sorted by long_name so that the resulting list (and
-- anything printed from it) does not depend on the unspecified traversal
-- order of pairs().
-- @param spec table mapping setting names to their default values
-- @return array of option specifications
local function make_options(spec)
    local options = {}
    for k, v in pairs(spec) do
        -- keep only the first result of gsub (the substituted string)
        local long_name = string.gsub(k, '_', '-')
        options[#options + 1] = {long_name, nil, type(v), default = v}
    end
    table.sort(options, function (a, b) return a[1] < b[1] end)
    return options
end

--- Print the usage banner, then let nerv describe the given options.
-- @param options option specification list handed to nerv.print_usage
local function print_help(options)
    local banner = "Usage: <asr_trainer.lua> [options] network_config.lua\n"
    nerv.printf(banner)
    nerv.print_usage(options)
end

--- Print every entry of the global `gconf` as an aligned "key = value" table.
-- Keys and values are converted with tostring(), so boolean settings are
-- shown as "true"/"false" (a `false` value used to be printed as an empty
-- string) and non-string keys do not break the width computation.
local function print_gconf()
    -- width of the key column = length of the longest key
    local key_maxlen = 0
    for k in pairs(gconf) do
        key_maxlen = math.max(key_maxlen, #tostring(k))
    end
    -- e.g. "%-12s = %s\n": left-aligned key padded to the column width
    local pattern = string.format("%%-%ds = %%s\n", key_maxlen)
    nerv.info("ready to train with the following gconf settings:")
    nerv.printf(pattern, "Key", "Value")
    for k, v in pairs(gconf) do
        nerv.printf(pattern, tostring(k), tostring(v))
    end
end

-- Built-in fallback values for the trainer settings.  Each entry also becomes
-- a command-line option (see make_options) and is merged into `gconf` by
-- check_and_add_defaults.
local trainer_defaults = {
    -- learning rate; multiplied by halving_factor after an iteration once
    -- halving is active
    lrate = 0.8,
    -- number of rows of the matrices created per batch and the size the
    -- network is initialized with
    batch_size = 256,
    -- not read in this file; presumably consumed by the network config
    -- (buffer / layer setup) -- TODO confirm
    buffer_size = 81920,
    wcost = 1e-6,
    momentum = 0.9,
    -- halving is switched on when the cross-validation gain drops below this
    -- value (and at least min_halving iterations have run)
    start_halving_inc = 0.5,
    halving_factor = 0.6,
    -- with halving active, training stops when the gain drops below this
    -- value (and more than min_iter iterations have run)
    end_halving_inc = 0.1,
    min_iter = 1,
    max_iter = 20,
    min_halving = 5,
    do_halving = false,
    -- nil fields create no table entry, so no option or default is generated
    -- for these two; they are listed only to document the keys that are read
    -- from gconf as the training / cross-validation scp files
    tr_scp = nil,
    cv_scp = nil,
    -- matrix classes; build_trainer picks one depending on gconf.use_cpu
    cumat_type = nerv.CuMatrixFloat,
    mmat_type = nerv.MMatrixFloat,
    -- not read in this file
    debug = false
}

-- Option table: one option per trainer default plus the two options that
-- have no corresponding gconf setting.
local options = make_options(trainer_defaults)
options[#options + 1] = {
    "help", "h", "boolean",
    default = false,
    desc = "show this help information",
}
options[#options + 1] = {
    "dir", nil, "string",
    desc = "specify the working directory",
}

-- `arg` is replaced by what nerv.parse_args returns first (used below as the
-- positional arguments); `opts` is deliberately global because
-- check_and_add_defaults reads it.
arg, opts = nerv.parse_args(arg, options)

-- a network config file is mandatory; show the usage when it is missing or
-- when help was requested
local missing_config = #arg < 1
if missing_config or opts.help.val then
    print_help(options)
    return
end

-- Run the network config.  `gconf` is never created in this file, so the
-- config is expected to define it (the helpers used by build_trainer are
-- presumably defined there as well -- TODO confirm).
dofile(arg[1])

--[[

Rule: a command-line option overrides the network config, which in turn
overrides the trainer default.
Note: a config key like aaa_bbbb_cc can be overridden by passing
--aaa-bbbb-cc on the command line.

]]--

check_and_add_defaults(trainer_defaults)

local pf0 = gconf.initialized_param
local trainer = build_trainer(pf0)
-- baseline: one cross-validation pass with the initial parameters
local accu_best = trainer(nil, gconf.cv_scp, false)
local date_pattern = "%Y%m%d%H%M%S"
local logfile_name = "log"
-- default working directory: "nerv_<timestamp>" below the current directory
local working_dir = opts["dir"].val or string.format("nerv_%s", os.date(date_pattern))

print_gconf()
-- lfs.mkdir returns nil plus an error message on failure; report that
-- message instead of assuming that the directory already exists
local mkdir_ok, mkdir_err = lfs.mkdir(working_dir)
if not mkdir_ok then
    nerv.error("[asr_trainer] cannot create working directory %s: %s",
                working_dir, tostring(mkdir_err))
end
-- copy the network config
dir.copyfile(arg[1], working_dir)
-- set logfile path
nerv.set_logfile(path.join(working_dir, logfile_name))
-- from here on relative paths are resolved against the working directory
path.chdir(working_dir)
nerv.info("initial cross validation: %.3f", accu_best)
-- Main loop: one training pass followed by one cross-validation pass per
-- iteration, with learning-rate halving driven by the cross-validation gain.
for iter = 1, gconf.max_iter do
    nerv.info("[NN] begin iteration %d with lrate = %.6f", iter, gconf.lrate)
    local accu_tr = trainer(nil, gconf.tr_scp, true)
    nerv.info("[TR] training set %d: %.3f", iter, accu_tr)
    -- name of the first initial parameter file without directory ...
    local model_file = string.gsub(pf0[1], "(.*/)(.*)", "%2")
    -- ... and without extension
    local model_name = string.gsub(model_file, "(.*)%..*", "%1")
    local cv_prefix = string.format("%s_%s_iter_%d_lr%f_tr%.3f",
                                    model_name,
                                    os.date(date_pattern),
                                    iter, gconf.lrate,
                                    accu_tr)
    -- the cross-validation pass also writes the parameters back, using
    -- cv_prefix for the file name
    local accu_new = trainer(cv_prefix, gconf.cv_scp, false)
    nerv.info("[CV] cross validation %d: %.3f", iter, accu_new)
    -- TODO: revert the weights
    local gain = accu_new - accu_best
    -- stop once halving is active and the gain has become too small
    local converged = gconf.do_halving and
                        gain < gconf.end_halving_inc and
                        iter > gconf.min_iter
    if converged then
        break
    end
    -- switch halving on when the gain falls below the start threshold
    if gain < gconf.start_halving_inc and iter >= gconf.min_halving then
        gconf.do_halving = true
    end
    if gconf.do_halving then
        gconf.lrate = gconf.lrate * gconf.halving_factor
    end
    if accu_new > accu_best then
        accu_best = accu_new
    end
--    nerv.Matrix.print_profile()
end