diff options
author | Ted Yin <[email protected]> | 2015-10-12 09:26:53 +0800 |
---|---|---|
committer | Ted Yin <[email protected]> | 2015-10-12 09:26:53 +0800 |
commit | 0dba4c998fcccb4bae29582b7d8be94de476dd0b (patch) | |
tree | b8529d4f0c2ea0a91ee4b7a4b21a14c0616fc081 | |
parent | 7acd14eca701deaffb2d16262528da37ee23263a (diff) | |
parent | e39fb231f64ddc8b79a6eb5434f529aadb3165fe (diff) |
Merge pull request #6 from yimmon/master
add kaldi_seq
-rw-r--r-- | kaldi_io/Makefile | 2 | ||||
-rw-r--r-- | kaldi_io/example/swb_baseline.lua | 3 | ||||
-rw-r--r-- | kaldi_io/example/swb_baseline_basic.lua | 157 | ||||
-rw-r--r-- | kaldi_io/init.lua | 1 | ||||
-rw-r--r-- | kaldi_io/kaldi.mk | 70 | ||||
-rw-r--r-- | kaldi_seq/.valgrind | 0 | ||||
-rw-r--r-- | kaldi_seq/Makefile | 47 | ||||
-rw-r--r-- | kaldi_seq/init.c | 8 | ||||
-rw-r--r-- | kaldi_seq/init.lua | 2 | ||||
-rw-r--r-- | kaldi_seq/kaldi_seq-scm-1.rockspec | 36 | ||||
-rw-r--r-- | kaldi_seq/layer/mmi.lua | 50 | ||||
-rw-r--r-- | kaldi_seq/layer/mpe.lua | 52 | ||||
-rw-r--r-- | kaldi_seq/src/init.c | 131 | ||||
-rw-r--r-- | kaldi_seq/src/kaldi_mmi.cpp | 427 | ||||
-rw-r--r-- | kaldi_seq/src/kaldi_mmi.h | 20 | ||||
-rw-r--r-- | kaldi_seq/src/kaldi_mpe.cpp | 411 | ||||
-rw-r--r-- | kaldi_seq/src/kaldi_mpe.h | 21 | ||||
-rw-r--r-- | kaldi_seq/tools/net_kaldi2nerv.cpp | 85 | ||||
-rw-r--r-- | kaldi_seq/tools/transf_kaldi2nerv.cpp | 106 |
19 files changed, 1400 insertions, 229 deletions
diff --git a/kaldi_io/Makefile b/kaldi_io/Makefile index 1066fc5..7b0c0bd 100644 --- a/kaldi_io/Makefile +++ b/kaldi_io/Makefile @@ -1,5 +1,5 @@ # Change KDIR to `kaldi-trunk' path (Kaldi must be compiled with --share) -KDIR := /home/stuymf/kaldi-trunk/ +KDIR := /slfs6/users/ymz09/kaldi/ SHELL := /bin/bash BUILD_DIR := $(CURDIR)/build diff --git a/kaldi_io/example/swb_baseline.lua b/kaldi_io/example/swb_baseline.lua index 8b1e122..3ef6c65 100644 --- a/kaldi_io/example/swb_baseline.lua +++ b/kaldi_io/example/swb_baseline.lua @@ -173,7 +173,8 @@ function make_buffer(readers) end function get_input_order() - return {"main_scp", "phone_state"} + return {{id = "main_scp", global_transf = true}, + {id = "phone_state"}} end function get_accuracy(layer_repo) diff --git a/kaldi_io/example/swb_baseline_basic.lua b/kaldi_io/example/swb_baseline_basic.lua deleted file mode 100644 index e6c8145..0000000 --- a/kaldi_io/example/swb_baseline_basic.lua +++ /dev/null @@ -1,157 +0,0 @@ -require 'kaldi_io' -gconf = {lrate = 0.8, wcost = 1e-6, momentum = 0.9, - cumat_type = nerv.CuMatrixFloat, - mmat_type = nerv.MMatrixFloat, - frm_ext = 5, - tr_rspecifier = "ark:/slfs6/users/ymz09/kaldi/src/featbin/copy-feats scp:/slfs6/users/ymz09/swb_ivec/train_bp.scp ark:- |", - cv_rspecifier = "ark:/slfs6/users/ymz09/kaldi/src/featbin/copy-feats scp:/slfs6/users/ymz09/swb_ivec/train_cv.scp ark:- |", - initialized_param = {"/slfs6/users/ymz09/swb_ivec/swb_init.nerv", - "/slfs6/users/ymz09/swb_ivec/swb_global_transf.nerv"}, - debug = false} - -function make_sublayer_repo(param_repo) - return nerv.LayerRepo( - { - -- global transf - ["nerv.BiasLayer"] = - { - blayer1 = {{bias = "bias1"}, {dim_in = {429}, dim_out = {429}}}, - blayer2 = {{bias = "bias2"}, {dim_in = {429}, dim_out = {429}}} - }, - ["nerv.WindowLayer"] = - { - wlayer1 = {{window = "window1"}, {dim_in = {429}, dim_out = {429}}}, - wlayer2 = {{window = "window2"}, {dim_in = {429}, dim_out = {429}}} - }, - -- biased linearity - 
["nerv.AffineLayer"] = - { - affine0 = {{ltp = "affine0_ltp", bp = "affine0_bp"}, - {dim_in = {429}, dim_out = {2048}}}, - affine1 = {{ltp = "affine1_ltp", bp = "affine1_bp"}, - {dim_in = {2048}, dim_out = {2048}}}, - affine2 = {{ltp = "affine2_ltp", bp = "affine2_bp"}, - {dim_in = {2048}, dim_out = {2048}}}, - affine3 = {{ltp = "affine3_ltp", bp = "affine3_bp"}, - {dim_in = {2048}, dim_out = {2048}}}, - affine4 = {{ltp = "affine4_ltp", bp = "affine4_bp"}, - {dim_in = {2048}, dim_out = {2048}}}, - affine5 = {{ltp = "affine5_ltp", bp = "affine5_bp"}, - {dim_in = {2048}, dim_out = {2048}}}, - affine6 = {{ltp = "affine6_ltp", bp = "affine6_bp"}, - {dim_in = {2048}, dim_out = {2048}}}, - affine7 = {{ltp = "affine7_ltp", bp = "affine7_bp"}, - {dim_in = {2048}, dim_out = {3001}}} - }, - ["nerv.SigmoidLayer"] = - { - sigmoid0 = {{}, {dim_in = {2048}, dim_out = {2048}}}, - sigmoid1 = {{}, {dim_in = {2048}, dim_out = {2048}}}, - sigmoid2 = {{}, {dim_in = {2048}, dim_out = {2048}}}, - sigmoid3 = {{}, {dim_in = {2048}, dim_out = {2048}}}, - sigmoid4 = {{}, {dim_in = {2048}, dim_out = {2048}}}, - sigmoid5 = {{}, {dim_in = {2048}, dim_out = {2048}}}, - sigmoid6 = {{}, {dim_in = {2048}, dim_out = {2048}}} - }, - ["nerv.SoftmaxCELayer"] = - { - ce_crit = {{}, {dim_in = {3001, 1}, dim_out = {1}, compressed = true}} - } - }, param_repo, gconf) -end - -function make_layer_repo(sublayer_repo, param_repo) - return nerv.LayerRepo( - { - ["nerv.DAGLayer"] = - { - global_transf = {{}, { - dim_in = {429}, dim_out = {429}, - sub_layers = sublayer_repo, - connections = { - ["<input>[1]"] = "blayer1[1]", - ["blayer1[1]"] = "wlayer1[1]", - ["wlayer1[1]"] = "blayer2[1]", - ["blayer2[1]"] = "wlayer2[1]", - ["wlayer2[1]"] = "<output>[1]" - } - }}, - main = {{}, { - dim_in = {429, 1}, dim_out = {1}, - sub_layers = sublayer_repo, - connections = { - ["<input>[1]"] = "affine0[1]", - ["affine0[1]"] = "sigmoid0[1]", - ["sigmoid0[1]"] = "affine1[1]", - ["affine1[1]"] = "sigmoid1[1]", - ["sigmoid1[1]"] 
= "affine2[1]", - ["affine2[1]"] = "sigmoid2[1]", - ["sigmoid2[1]"] = "affine3[1]", - ["affine3[1]"] = "sigmoid3[1]", - ["sigmoid3[1]"] = "affine4[1]", - ["affine4[1]"] = "sigmoid4[1]", - ["sigmoid4[1]"] = "affine5[1]", - ["affine5[1]"] = "sigmoid5[1]", - ["sigmoid5[1]"] = "affine6[1]", - ["affine6[1]"] = "sigmoid6[1]", - ["sigmoid6[1]"] = "affine7[1]", - ["affine7[1]"] = "ce_crit[1]", - ["<input>[2]"] = "ce_crit[2]", - ["ce_crit[1]"] = "<output>[1]" - } - }} - } - }, param_repo, gconf) -end - -function get_network(layer_repo) - return layer_repo:get_layer("main") -end - -function make_readers(feature_rspecifier, layer_repo) - return { - {reader = nerv.KaldiReader(gconf, - { - id = "main_scp", - feature_rspecifier = feature_rspecifier, - frm_ext = gconf.frm_ext, - mlfs = { - phone_state = { - targets_rspecifier = "ark:/slfs6/users/ymz09/kaldi/src/bin/ali-to-pdf /slfs6/users/ymz09/swb_ivec/final.mdl \"ark:gunzip -c /slfs6/users/ymz09/swb_ivec/ali.*.gz |\" ark:- | /slfs6/users/ymz09/kaldi/src/bin/ali-to-post ark:- ark:- |", - format = "map" - } - }, - global_transf = layer_repo:get_layer("global_transf") - }), - data = {main_scp = 429, phone_state = 1}} - } -end - -function make_buffer(readers) - return nerv.SGDBuffer(gconf, - { - buffer_size = gconf.buffer_size, - randomize = gconf.randomize, - readers = readers - }) -end - -function get_input_order() - return {"main_scp", "phone_state"} -end - -function get_accuracy(sublayer_repo) - local ce_crit = sublayer_repo:get_layer("ce_crit") - return ce_crit.total_correct / ce_crit.total_frames * 100 -end - -function print_stat(sublayer_repo) - local ce_crit = sublayer_repo:get_layer("ce_crit") - nerv.info("*** training stat begin ***") - nerv.printf("cross entropy:\t\t%.8f\n", ce_crit.total_ce) - nerv.printf("correct:\t\t%d\n", ce_crit.total_correct) - nerv.printf("frames:\t\t\t%d\n", ce_crit.total_frames) - nerv.printf("err/frm:\t\t%.8f\n", ce_crit.total_ce / ce_crit.total_frames) - nerv.printf("accuracy:\t\t%.3f%%\n", 
get_accuracy(sublayer_repo)) - nerv.info("*** training stat end ***") -end diff --git a/kaldi_io/init.lua b/kaldi_io/init.lua index 3fc5b10..b7e6da8 100644 --- a/kaldi_io/init.lua +++ b/kaldi_io/init.lua @@ -66,6 +66,7 @@ function KaldiReader:get_data() rearranged:copy_toh(feat_utter) end res[self.feat_id] = feat_utter + res["key"] = self.feat_repo:key() -- add corresponding labels for id, repo in pairs(self.lab_repo) do local lab_utter = repo:get_utter(self.feat_repo, diff --git a/kaldi_io/kaldi.mk b/kaldi_io/kaldi.mk deleted file mode 100644 index 4a397f0..0000000 --- a/kaldi_io/kaldi.mk +++ /dev/null @@ -1,70 +0,0 @@ -# This file was generated using the following command: -# ./configure - -# Rules that enable valgrind debugging ("make valgrind") - -valgrind: .valgrind - -.valgrind: - echo -n > valgrind.out - for x in $(TESTFILES); do echo $$x>>valgrind.out; valgrind ./$$x >/dev/null 2>> valgrind.out; done - ! ( grep 'ERROR SUMMARY' valgrind.out | grep -v '0 errors' ) - ! ( grep 'definitely lost' valgrind.out | grep -v -w 0 ) - rm valgrind.out - touch .valgrind - - -CONFIGURE_VERSION := 2 -OPENFSTLIBS = -L/slwork/users/wd007/src/kaldi/tools/openfst/lib -lfst -OPENFSTLDFLAGS = -Wl,-rpath=/slwork/users/wd007/src/kaldi/tools/openfst/lib -FSTROOT = /slwork/users/wd007/src/kaldi/tools/openfst -ATLASINC = /slwork/users/wd007/src/kaldi/tools/ATLAS/include -ATLASLIBS = -L/usr/lib -llapack -lcblas -latlas -lf77blas -# You have to make sure ATLASLIBS is set... - -ifndef FSTROOT -$(error FSTROOT not defined.) -endif - -ifndef ATLASINC -$(error ATLASINC not defined.) -endif - -ifndef ATLASLIBS -$(error ATLASLIBS not defined.) -endif - - -CXXFLAGS = -msse -msse2 -Wall -I.. 
\ - -fPIC \ - -DKALDI_DOUBLEPRECISION=0 -DHAVE_POSIX_MEMALIGN \ - -Wno-sign-compare -Wno-unused-local-typedefs -Winit-self \ - -DHAVE_EXECINFO_H=1 -rdynamic -DHAVE_CXXABI_H \ - -DHAVE_ATLAS -I$(ATLASINC) \ - -I$(FSTROOT)/include \ - $(EXTRA_CXXFLAGS) \ - -g # -O0 -DKALDI_PARANOID - -ifeq ($(KALDI_FLAVOR), dynamic) -CXXFLAGS += -fPIC -endif - -LDFLAGS = -rdynamic $(OPENFSTLDFLAGS) -LDLIBS = $(EXTRA_LDLIBS) $(OPENFSTLIBS) $(ATLASLIBS) -lm -lpthread -ldl -CC = g++ -CXX = g++ -AR = ar -AS = as -RANLIB = ranlib - -#Next section enables CUDA for compilation -CUDA = true -CUDATKDIR = /usr/local/cuda - -CUDA_INCLUDE= -I$(CUDATKDIR)/include -CUDA_FLAGS = -g -Xcompiler -fPIC --verbose --machine 64 -DHAVE_CUDA - -CXXFLAGS += -DHAVE_CUDA -I$(CUDATKDIR)/include -CUDA_LDFLAGS += -L$(CUDATKDIR)/lib64 -Wl,-rpath,$(CUDATKDIR)/lib64 -CUDA_LDLIBS += -lcublas -lcudart #LDLIBS : The libs are loaded later than static libs in implicit rule - diff --git a/kaldi_seq/.valgrind b/kaldi_seq/.valgrind new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/kaldi_seq/.valgrind diff --git a/kaldi_seq/Makefile b/kaldi_seq/Makefile new file mode 100644 index 0000000..e76eea8 --- /dev/null +++ b/kaldi_seq/Makefile @@ -0,0 +1,47 @@ +# Change KDIR to `kaldi-trunk' path (Kaldi must be compiled with --share) +KDIR := /slfs6/users/ymz09/kaldi/ + +SHELL := /bin/bash +BUILD_DIR := $(CURDIR)/build +INC_PATH := $(LUA_BINDIR)/../include/ +OBJS := init.o src/kaldi_mpe.o src/kaldi_mmi.o src/init.o +LIBS := libkaldiseq.so +LUA_LIBS := init.lua layer/mpe.lua layer/mmi.lua +INCLUDE := -I $(LUA_INCDIR) -I $(INC_PATH) -DLUA_USE_APICHECK + +SUBDIR := src layer +OBJ_DIR := $(BUILD_DIR)/objs +LUA_DIR := $(INST_LUADIR)/kaldi_seq +KALDIINCLUDE := -I $(KDIR)/tools/ATLAS/include/ -I $(KDIR)/tools/openfst/include/ -I $(KDIR)/src/ + +OBJS := $(addprefix $(OBJ_DIR)/,$(OBJS)) +LIBS := $(addprefix $(INST_LIBDIR)/,$(LIBS)) +OBJ_SUBDIR := $(addprefix $(OBJ_DIR)/,$(SUBDIR)) +LUA_SUBDIR := $(addprefix 
$(LUA_DIR)/,$(SUBDIR)) +LUA_LIBS := $(addprefix $(LUA_DIR)/,$(LUA_LIBS)) +LIB_PATH := $(LUA_BINDIR)/../lib + +build: $(OBJ_DIR) $(OBJ_SUBDIR) $(OBJS) +install: $(LUA_DIR) $(LUA_SUBDIR) $(LUA_LIBS) $(LIBS) + +include $(KDIR)/src/kaldi.mk + +KL1 := -rdynamic -Wl,-rpath=$(KDIR)/tools/openfst/lib -L/usr/local/cuda/lib64 -Wl,-rpath,/usr/local/cuda/lib64 -Wl,-rpath=$(KDIR)/src/lib -L. -L$(KDIR)/src/nnet/ -L$(KDIR)/src/cudamatrix/ -L$(KDIR)/src/lat/ -L$(KDIR)/src/hmm/ -L$(KDIR)/src/tree/ -L$(KDIR)/src/matrix/ -L$(KDIR)/src/util/ -L$(KDIR)/src/base/ $(KDIR)/src/nnet//libkaldi-nnet.so $(KDIR)/src/cudamatrix//libkaldi-cudamatrix.so $(KDIR)/src/lat//libkaldi-lat.so $(KDIR)/src/hmm//libkaldi-hmm.so $(KDIR)/src/tree//libkaldi-tree.so $(KDIR)/src/matrix//libkaldi-matrix.so $(KDIR)/src/util//libkaldi-util.so $(KDIR)/src/base//libkaldi-base.so -L$(KDIR)/tools/openfst/lib -lfst /usr/lib/liblapack.so /usr/lib/libcblas.so /usr/lib/libatlas.so /usr/lib/libf77blas.so -lm -lpthread -ldl -lcublas -lcudart -lkaldi-nnet -lkaldi-cudamatrix -lkaldi-lat -lkaldi-hmm -lkaldi-tree -lkaldi-matrix -lkaldi-util -lkaldi-base + +KL2 := -msse -msse2 -Wall -pthread -DKALDI_DOUBLEPRECISION=0 -DHAVE_POSIX_MEMALIGN -Wno-sign-compare -Wno-unused-local-typedefs -Winit-self -DHAVE_EXECINFO_H=1 -rdynamic -DHAVE_CXXABI_H -DHAVE_ATLAS -I$(KDIR)/tools/ATLAS/include -I$(KDIR)/tools/openfst/include -Wno-sign-compare -g -fPIC -I/usr/local/cuda/include -L/usr/local/cuda/lib64 -DKALDI_NO_EXPF + +$(OBJ_DIR) $(LUA_DIR) $(OBJ_SUBDIR) $(LUA_SUBDIR): + -mkdir -p $@ +$(LUA_DIR)/%.lua: %.lua + cp $< $@ +$(LIBS): $(OBJ_DIR)/src/kaldi_mpe.o $(OBJ_DIR)/src/kaldi_mmi.o $(OBJ_DIR)/init.o $(OBJ_DIR)/src/init.o + gcc -shared -fPIC -o $@ $(OBJ_DIR)/src/kaldi_mpe.o $(OBJ_DIR)/src/kaldi_mmi.o $(OBJ_DIR)/init.o $(OBJ_DIR)/src/init.o -lstdc++ -Wl,-rpath=$(LIB_PATH) -L$(LIB_PATH) -lnervcore -lluaT $(KL1) +$(OBJ_DIR)/src/kaldi_mpe.o: src/kaldi_mpe.cpp + g++ -o $@ -c $< $(KALDIINCLUDE) -g -fPIC $(INCLUDE) $(KL2) 
+$(OBJ_DIR)/src/kaldi_mmi.o: src/kaldi_mmi.cpp + g++ -o $@ -c $< $(KALDIINCLUDE) -g -fPIC $(INCLUDE) $(KL2) +$(OBJ_DIR)/%.o: %.c + gcc -o $@ -c $< -g $(INCLUDE) -fPIC +clean: + -rm $(OBJ_DIR)/src/*.o + diff --git a/kaldi_seq/init.c b/kaldi_seq/init.c new file mode 100644 index 0000000..ed89473 --- /dev/null +++ b/kaldi_seq/init.c @@ -0,0 +1,8 @@ +#include "../nerv/common.h" +#include <stdio.h> + +extern void kaldi_seq_init(lua_State *L); +int luaopen_libkaldiseq(lua_State *L) { + kaldi_seq_init(L); + return 1; +} diff --git a/kaldi_seq/init.lua b/kaldi_seq/init.lua new file mode 100644 index 0000000..39f4cb3 --- /dev/null +++ b/kaldi_seq/init.lua @@ -0,0 +1,2 @@ +nerv.include('layer/mpe.lua') +nerv.include('layer/mmi.lua') diff --git a/kaldi_seq/kaldi_seq-scm-1.rockspec b/kaldi_seq/kaldi_seq-scm-1.rockspec new file mode 100644 index 0000000..41e34f0 --- /dev/null +++ b/kaldi_seq/kaldi_seq-scm-1.rockspec @@ -0,0 +1,36 @@ +package = "kaldi_seq" +version = "scm-1" +source = { + url = "https://github.com/Nerv-SJTU/nerv-speech.git" +} +description = { + summary = "Kaldi sequence training support for Nerv", + detailed = [[ + ]], + homepage = "https://github.com/Nerv-SJTU/nerv-speech", + license = "BSD" +} +dependencies = { + "nerv >= scm-1", + "lua >= 5.1" +} +build = { + type = "make", + build_variables = { + CFLAGS="$(CFLAGS)", + LIBFLAG="$(LIBFLAG)", + LUA_LIBDIR="$(LUA_LIBDIR)", + LUA_BINDIR="$(LUA_BINDIR)", + LUA_INCDIR="$(LUA_INCDIR)", + INST_PREFIX="$(PREFIX)", + LUA="$(LUA)", + }, + install_variables = { + LUA_BINDIR="$(LUA_BINDIR)", + INST_PREFIX="$(PREFIX)", + INST_BINDIR="$(BINDIR)", + INST_LIBDIR="$(LIBDIR)", + INST_LUADIR="$(LUADIR)", + INST_CONFDIR="$(CONFDIR)", + }, +} diff --git a/kaldi_seq/layer/mmi.lua b/kaldi_seq/layer/mmi.lua new file mode 100644 index 0000000..ecc7f48 --- /dev/null +++ b/kaldi_seq/layer/mmi.lua @@ -0,0 +1,50 @@ +require 'libkaldiseq' +local MMILayer = nerv.class("nerv.MMILayer", "nerv.Layer") + +function MMILayer:__init(id, 
global_conf, layer_conf) + self.id = id + self.gconf = global_conf + self.dim_in = layer_conf.dim_in + self.dim_out = layer_conf.dim_out + self.arg = layer_conf.cmd.arg + self.mdl = layer_conf.cmd.mdl + self.lat = layer_conf.cmd.lat + self.ali = layer_conf.cmd.ali + self:check_dim_len(2, -1) -- two inputs: nn output and utt key +end + +function MMILayer:init(batch_size) + self.total_frames = 0 + self.kaldi_mmi = nerv.KaldiMMI(self.arg, self.mdl, self.lat, self.ali) + if self.kaldi_mmi == nil then + nerv.error("kaldi arguments is expected: %s %s %s %s", self.arg, + self.mdl, self.lat, self.ali) + end +end + +function MMILayer:batch_resize(batch_size) + -- do nothing +end + +function MMILayer:update(bp_err, input, output) + -- no params, therefore do nothing +end + +function MMILayer:propagate(input, output) + self.valid = false + self.valid = self.kaldi_mmi:check(input[1], input[2]) + return self.valid +end + +function MMILayer:back_propagate(bp_err, next_bp_err, input, output) + if self.valid ~= true then + nerv.error("kaldi sequence training back_propagate fail") + end + local mmat = input[1]:new_to_host() + next_bp_err[1]:copy_fromh(self.kaldi_mmi:calc_diff(mmat, input[2])) + self.total_frames = self.total_frames + self.kaldi_mmi:get_num_frames() +end + +function MMILayer:get_params() + return nerv.ParamRepo({}) +end diff --git a/kaldi_seq/layer/mpe.lua b/kaldi_seq/layer/mpe.lua new file mode 100644 index 0000000..ec8a8f3 --- /dev/null +++ b/kaldi_seq/layer/mpe.lua @@ -0,0 +1,52 @@ +require 'libkaldiseq' +local MPELayer = nerv.class("nerv.MPELayer", "nerv.Layer") + +function MPELayer:__init(id, global_conf, layer_conf) + self.id = id + self.gconf = global_conf + self.dim_in = layer_conf.dim_in + self.dim_out = layer_conf.dim_out + self.arg = layer_conf.cmd.arg + self.mdl = layer_conf.cmd.mdl + self.lat = layer_conf.cmd.lat + self.ali = layer_conf.cmd.ali + self:check_dim_len(2, -1) -- two inputs: nn output and utt key +end + +function MPELayer:init(batch_size) + 
self.total_correct = 0 + self.total_frames = 0 + self.kaldi_mpe = nerv.KaldiMPE(self.arg, self.mdl, self.lat, self.ali) + if self.kaldi_mpe == nil then + nerv.error("kaldi arguments is expected: %s %s %s %s", self.arg, + self.mdl, self.lat, self.ali) + end +end + +function MPELayer:batch_resize(batch_size) + -- do nothing +end + +function MPELayer:update(bp_err, input, output) + -- no params, therefore do nothing +end + +function MPELayer:propagate(input, output) + self.valid = false + self.valid = self.kaldi_mpe:check(input[1], input[2]) + return self.valid +end + +function MPELayer:back_propagate(bp_err, next_bp_err, input, output) + if self.valid ~= true then + nerv.error("kaldi sequence training back_propagate fail") + end + local mmat = input[1]:new_to_host() + next_bp_err[1]:copy_fromh(self.kaldi_mpe:calc_diff(mmat, input[2])) + self.total_frames = self.total_frames + self.kaldi_mpe:get_num_frames() + self.total_correct = self.total_correct + self.kaldi_mpe:get_utt_frame_acc() +end + +function MPELayer:get_params() + return nerv.ParamRepo({}) +end diff --git a/kaldi_seq/src/init.c b/kaldi_seq/src/init.c new file mode 100644 index 0000000..9b38056 --- /dev/null +++ b/kaldi_seq/src/init.c @@ -0,0 +1,131 @@ +#include "nerv/common.h" +#include "kaldi_mpe.h" +#include "kaldi_mmi.h" +#include <stdio.h> + +const char *nerv_kaldi_mpe_tname = "nerv.KaldiMPE"; +const char *nerv_kaldi_mmi_tname = "nerv.KaldiMMI"; +const char *nerv_matrix_cuda_float_tname = "nerv.CuMatrixFloat"; +const char *nerv_matrix_host_float_tname = "nerv.MMatrixFloat"; + +static int mpe_new(lua_State *L) { + const char *arg = luaL_checkstring(L, 1); + const char *mdl = luaL_checkstring(L, 2); + const char *lat = luaL_checkstring(L, 3); + const char *ali = luaL_checkstring(L, 4); + KaldiMPE *mpe = new_KaldiMPE(arg, mdl, lat, ali); + luaT_pushudata(L, mpe, nerv_kaldi_mpe_tname); + return 1; +} + +static int mpe_destroy(lua_State *L) { + KaldiMPE *mpe = luaT_checkudata(L, 1, nerv_kaldi_mpe_tname); + 
destroy_KaldiMPE(mpe); + return 0; +} + +static int mpe_check(lua_State *L) { + KaldiMPE *mpe = luaT_checkudata(L, 1, nerv_kaldi_mpe_tname); + const Matrix *cumat = luaT_checkudata(L, 2, nerv_matrix_cuda_float_tname); + const char *utt = luaL_checkstring(L, 3); + + lua_pushboolean(L, check_mpe(mpe, cumat, utt)); + return 1; +} + +static int mpe_calc_diff(lua_State *L) { + KaldiMPE *mpe = luaT_checkudata(L, 1, nerv_kaldi_mpe_tname); + Matrix *mat = luaT_checkudata(L, 2, nerv_matrix_host_float_tname); + const char *utt = luaL_checkstring(L, 3); + + Matrix *diff = calc_diff_mpe(mpe, mat, utt); + luaT_pushudata(L, diff, nerv_matrix_host_float_tname); + return 1; +} + +static int mpe_get_num_frames(lua_State *L) { + KaldiMPE *mpe = luaT_checkudata(L, 1, nerv_kaldi_mpe_tname); + lua_pushnumber(L, get_num_frames_mpe(mpe)); + return 1; +} + +static int mpe_get_utt_frame_acc(lua_State *L) { + KaldiMPE *mpe = luaT_checkudata(L, 1, nerv_kaldi_mpe_tname); + lua_pushnumber(L, get_utt_frame_acc_mpe(mpe)); + return 1; +} + +static const luaL_Reg mpe_methods[] = { + {"check", mpe_check}, + {"calc_diff", mpe_calc_diff}, + {"get_num_frames", mpe_get_num_frames}, + {"get_utt_frame_acc", mpe_get_utt_frame_acc}, + {NULL, NULL} +}; + +static void mpe_init(lua_State *L) { + luaT_newmetatable(L, nerv_kaldi_mpe_tname, NULL, + mpe_new, mpe_destroy, NULL); + luaL_register(L, NULL, mpe_methods); + lua_pop(L, 1); +} + +static int mmi_new(lua_State *L) { + const char *arg = luaL_checkstring(L, 1); + const char *mdl = luaL_checkstring(L, 2); + const char *lat = luaL_checkstring(L, 3); + const char *ali = luaL_checkstring(L, 4); + KaldiMMI *mmi = new_KaldiMMI(arg, mdl, lat, ali); + luaT_pushudata(L, mmi, nerv_kaldi_mmi_tname); + return 1; +} + +static int mmi_destroy(lua_State *L) { + KaldiMMI *mmi = luaT_checkudata(L, 1, nerv_kaldi_mmi_tname); + destroy_KaldiMMI(mmi); + return 0; +} + +static int mmi_check(lua_State *L) { + KaldiMMI *mmi = luaT_checkudata(L, 1, nerv_kaldi_mmi_tname); + const 
Matrix *cumat = luaT_checkudata(L, 2, nerv_matrix_cuda_float_tname); + const char *utt = luaL_checkstring(L, 3); + + lua_pushboolean(L, check_mmi(mmi, cumat, utt)); + return 1; +} + +static int mmi_calc_diff(lua_State *L) { + KaldiMMI *mmi = luaT_checkudata(L, 1, nerv_kaldi_mmi_tname); + Matrix *mat = luaT_checkudata(L, 2, nerv_matrix_host_float_tname); + const char *utt = luaL_checkstring(L, 3); + + Matrix *diff = calc_diff_mmi(mmi, mat, utt); + luaT_pushudata(L, diff, nerv_matrix_host_float_tname); + return 1; +} + +static int mmi_get_num_frames(lua_State *L) { + KaldiMMI *mmi = luaT_checkudata(L, 1, nerv_kaldi_mmi_tname); + lua_pushnumber(L, get_num_frames_mmi(mmi)); + return 1; +} + +static const luaL_Reg mmi_methods[] = { + {"check", mmi_check}, + {"calc_diff", mmi_calc_diff}, + {"get_num_frames", mmi_get_num_frames}, + {NULL, NULL} +}; + +static void mmi_init(lua_State *L) { + luaT_newmetatable(L, nerv_kaldi_mmi_tname, NULL, + mmi_new, mmi_destroy, NULL); + luaL_register(L, NULL, mmi_methods); + lua_pop(L, 1); +} + +void kaldi_seq_init(lua_State *L) { + mpe_init(L); + mmi_init(L); +} diff --git a/kaldi_seq/src/kaldi_mmi.cpp b/kaldi_seq/src/kaldi_mmi.cpp new file mode 100644 index 0000000..ea9b4f1 --- /dev/null +++ b/kaldi_seq/src/kaldi_mmi.cpp @@ -0,0 +1,427 @@ +#include <string> +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "tree/context-dep.h" +#include "hmm/transition-model.h" +#include "fstext/fstext-lib.h" +#include "decoder/faster-decoder.h" +#include "decoder/decodable-matrix.h" +#include "lat/kaldi-lattice.h" +#include "lat/lattice-functions.h" + +#include "nnet/nnet-trnopts.h" +#include "nnet/nnet-component.h" +#include "nnet/nnet-activation.h" +#include "nnet/nnet-nnet.h" +#include "nnet/nnet-pdf-prior.h" +#include "nnet/nnet-utils.h" +#include "base/timer.h" +#include "cudamatrix/cu-device.h" + +#include <iomanip> + +typedef kaldi::BaseFloat BaseFloat; +typedef struct Matrix NervMatrix; + +namespace kaldi{ + namespace 
nnet1{ + void LatticeAcousticRescore(const kaldi::Matrix<BaseFloat> &log_like, + const TransitionModel &trans_model, + const std::vector<int32> &state_times, + Lattice *lat); + } +} + +extern "C" { +#include "kaldi_mmi.h" +#include "string.h" +#include "assert.h" +#include "nerv/common.h" + + extern NervMatrix *nerv_matrix_host_float_create(long nrow, long ncol, Status *status); + extern void nerv_matrix_host_float_copy_fromd(NervMatrix *mat, const NervMatrix *cumat, int, int, int, Status *); + using namespace kaldi; + using namespace kaldi::nnet1; + typedef kaldi::int32 int32; + + struct KaldiMMI { + TransitionModel *trans_model; + RandomAccessLatticeReader *den_lat_reader; + RandomAccessInt32VectorReader *ref_ali_reader; + + Lattice den_lat; + vector<int32> state_times; + + PdfPriorOptions *prior_opts; + PdfPrior *log_prior; + + std::vector<int32> ref_ali; + + Timer *time; + double time_now; + + int32 num_done, num_no_ref_ali, num_no_den_lat, num_other_error; + int32 num_frm_drop; + + kaldi::int64 total_frames; + double lat_like; // total likelihood of the lattice + double lat_ac_like; // acoustic likelihood weighted by posterior. 
+ double total_mmi_obj, mmi_obj; + double total_post_on_ali, post_on_ali; + + int32 num_frames; + + bool binary; + BaseFloat acoustic_scale, lm_scale, old_acoustic_scale; + kaldi::int32 max_frames; + bool drop_frames; + std::string use_gpu; + }; + + KaldiMMI * new_KaldiMMI(const char* arg, const char* mdl, const char* lat, const char* ali) + { + KaldiMMI * mmi = new KaldiMMI; + + const char *usage = + "Perform one iteration of DNN-MMI training by stochastic " + "gradient descent.\n" + "The network weights are updated on each utterance.\n" + "Usage: nnet-train-mmi-sequential [options] <model-in> <transition-model-in> " + "<feature-rspecifier> <den-lat-rspecifier> <ali-rspecifier> [<model-out>]\n" + "e.g.: \n" + " nnet-train-mmi-sequential nnet.init trans.mdl scp:train.scp scp:denlats.scp ark:train.ali " + "nnet.iter1\n"; + + ParseOptions po(usage); + + NnetTrainOptions trn_opts; trn_opts.learn_rate=0.00001; + trn_opts.Register(&po); + + mmi->binary = true; + po.Register("binary", &(mmi->binary), "Write output in binary mode"); + + std::string feature_transform; + po.Register("feature-transform", &feature_transform, + "Feature transform in Nnet format"); + + mmi->prior_opts = new PdfPriorOptions; + PdfPriorOptions &prior_opts = *(mmi->prior_opts); + prior_opts.Register(&po); + + mmi->acoustic_scale = 1.0, + mmi->lm_scale = 1.0, + mmi->old_acoustic_scale = 0.0; + po.Register("acoustic-scale", &(mmi->acoustic_scale), + "Scaling factor for acoustic likelihoods"); + po.Register("lm-scale", &(mmi->lm_scale), + "Scaling factor for \"graph costs\" (including LM costs)"); + po.Register("old-acoustic-scale", &(mmi->old_acoustic_scale), + "Add in the scores in the input lattices with this scale, rather " + "than discarding them."); + mmi->max_frames = 6000; // Allow segments maximum of one minute by default + po.Register("max-frames",&(mmi->max_frames), "Maximum number of frames a segment can have to be processed"); + + mmi->drop_frames = true; + po.Register("drop-frames", 
&(mmi->drop_frames), + "Drop frames, where is zero den-posterior under numerator path " + "(ie. path not in lattice)"); + + mmi->use_gpu=std::string("yes"); + po.Register("use-gpu", &(mmi->use_gpu), "yes|no|optional, only has effect if compiled with CUDA"); + + int narg = 0; + char args[64][1024]; + char *token; + char *saveptr = NULL; + char tmpstr[1024]; + + strcpy(tmpstr, arg); + strcpy(args[0], "nnet-train-mmi-sequential"); + for(narg = 1, token = strtok_r(tmpstr, " ", &saveptr); token; token = strtok_r(NULL, " ", &saveptr)) + strcpy(args[narg++], token); + strcpy(args[narg++], "0.nnet"); + strcpy(args[narg++], mdl); + strcpy(args[narg++], "feat"); + strcpy(args[narg++], lat); + strcpy(args[narg++], ali); + strcpy(args[narg++], "1.nnet"); + + char **argsv = new char*[narg]; + for(int _i = 0; _i < narg; _i++) + argsv[_i] = args[_i]; + + po.Read(narg, argsv); + delete [] argsv; + + if (po.NumArgs() != 6) { + po.PrintUsage(); + exit(1); + } + + std::string transition_model_filename = po.GetArg(2), + den_lat_rspecifier = po.GetArg(4), + ref_ali_rspecifier = po.GetArg(5); + + // Select the GPU +#if HAVE_CUDA == 1 + CuDevice::Instantiate().SelectGpuId(mmi->use_gpu); +#endif + + // Read the class-frame-counts, compute priors + mmi->log_prior = new PdfPrior(prior_opts); + + // Read transition model + mmi->trans_model = new TransitionModel; + ReadKaldiObject(transition_model_filename, mmi->trans_model); + + mmi->den_lat_reader = new RandomAccessLatticeReader(den_lat_rspecifier); + mmi->ref_ali_reader = new RandomAccessInt32VectorReader(ref_ali_rspecifier); + + if (mmi->drop_frames) { + KALDI_LOG << "--drop-frames=true :" + " we will zero gradient for frames with total den/num mismatch." + " The mismatch is likely to be caused by missing correct path " + " from den-lattice due wrong annotation or search error." 
+ " Leaving such frames out stabilizes the training."; + } + + mmi->time = new Timer; + mmi->time_now = 0; + mmi->num_done =0; + mmi->num_no_ref_ali = 0; + mmi->num_no_den_lat = 0; + mmi->num_other_error = 0; + mmi->total_frames = 0; + mmi->num_frm_drop = 0; + + mmi->total_mmi_obj = 0.0, mmi->mmi_obj = 0.0; + mmi->total_post_on_ali = 0.0, mmi->post_on_ali = 0.0; + return mmi; + } + + void destroy_KaldiMMI(KaldiMMI *mmi) + { + delete mmi->trans_model; + delete mmi->den_lat_reader; + delete mmi->ref_ali_reader; + delete mmi->time; + delete mmi->prior_opts; + delete mmi->log_prior; + } + + int check_mmi(KaldiMMI *mmi, const NervMatrix* mat, const char *key) + { + std::string utt(key); + if (!mmi->den_lat_reader->HasKey(utt)) { + KALDI_WARN << "Utterance " << utt << ": found no lattice."; + mmi->num_no_den_lat++; + return 0; + } + if (!mmi->ref_ali_reader->HasKey(utt)) { + KALDI_WARN << "Utterance " << utt << ": found no reference alignment."; + mmi->num_no_ref_ali++; + return 0; + } + + assert(sizeof(BaseFloat) == sizeof(float)); + // 1) get the features, numerator alignment + mmi->ref_ali = mmi->ref_ali_reader->Value(utt); + long mat_nrow = mat->nrow, mat_ncol = mat->ncol; + // check for temporal length of numerator alignments + if (static_cast<MatrixIndexT>(mmi->ref_ali.size()) != mat_nrow) { + KALDI_WARN << "Numerator alignment has wrong length " + << mmi->ref_ali.size() << " vs. 
"<< mat_nrow; + mmi->num_other_error++; + return 0; + } + if (mat_nrow > mmi->max_frames) { + KALDI_WARN << "Utterance " << utt << ": Skipped because it has " << mat_nrow << + " frames, which is more than " << mmi->max_frames << "."; + mmi->num_other_error++; + return 0; + } + // 2) get the denominator lattice, preprocess + mmi->den_lat = mmi->den_lat_reader->Value(utt); + Lattice &den_lat = mmi->den_lat; + if (den_lat.Start() == -1) { + KALDI_WARN << "Empty lattice for utt " << utt; + mmi->num_other_error++; + return 0; + } + if (mmi->old_acoustic_scale != 1.0) { + fst::ScaleLattice(fst::AcousticLatticeScale(mmi->old_acoustic_scale), + &den_lat); + } + // optional sort it topologically + kaldi::uint64 props = den_lat.Properties(fst::kFstProperties, false); + if (!(props & fst::kTopSorted)) { + if (fst::TopSort(&den_lat) == false) + KALDI_ERR << "Cycles detected in lattice."; + } + // get the lattice length and times of states + mmi->state_times.clear(); + vector<int32> &state_times = mmi->state_times; + int32 max_time = kaldi::LatticeStateTimes(den_lat, &state_times); + // check for temporal length of denominator lattices + if (max_time != mat_nrow) { + KALDI_WARN << "Denominator lattice has wrong length " + << max_time << " vs. 
" << mat_nrow; + mmi->num_other_error++; + return 0; + } + + return 1; + } + + NervMatrix * calc_diff_mmi(KaldiMMI * mmi, NervMatrix * mat, const char * key) + { + std::string utt(key); + assert(sizeof(BaseFloat) == sizeof(float)); + + kaldi::Matrix<BaseFloat> nnet_out_h, nnet_diff_h; + nnet_out_h.Resize(mat->nrow, mat->ncol, kUndefined); + + size_t stride = mat->stride; + for (int i = 0; i < mat->nrow; i++) + { + const BaseFloat *nerv_row = (BaseFloat *)((char *)mat->data.f + i * stride); + BaseFloat *row = nnet_out_h.RowData(i); + memmove(row, nerv_row, sizeof(BaseFloat) * mat->ncol); + } + + mmi->num_frames = nnet_out_h.NumRows(); + + PdfPriorOptions &prior_opts = *(mmi->prior_opts); + if (prior_opts.class_frame_counts != "") { + CuMatrix<BaseFloat> nnet_out; + nnet_out.Resize(mat->nrow, mat->ncol, kUndefined); + nnet_out.CopyFromMat(nnet_out_h); + mmi->log_prior->SubtractOnLogpost(&nnet_out); + nnet_out.CopyToMat(&nnet_out_h); + nnet_out.Resize(0,0); + } + + // 4) rescore the latice + LatticeAcousticRescore(nnet_out_h, *(mmi->trans_model), mmi->state_times, &(mmi->den_lat)); + if (mmi->acoustic_scale != 1.0 || mmi->lm_scale != 1.0) + fst::ScaleLattice(fst::LatticeScale(mmi->lm_scale, mmi->acoustic_scale), &(mmi->den_lat)); + + kaldi::Posterior post; + mmi->lat_like = kaldi::LatticeForwardBackward(mmi->den_lat, &post, &(mmi->lat_ac_like)); + + nnet_diff_h.Resize(mat->nrow, mat->ncol, kSetZero); + for (int32 t = 0; t < post.size(); t++) { + for (int32 arc = 0; arc < post[t].size(); arc++) { + int32 pdf = mmi->trans_model->TransitionIdToPdf(post[t][arc].first); + nnet_diff_h(t, pdf) += post[t][arc].second; + } + } + + double path_ac_like = 0.0; + for(int32 t=0; t<mmi->num_frames; t++) { + int32 pdf = mmi->trans_model->TransitionIdToPdf(mmi->ref_ali[t]); + path_ac_like += nnet_out_h(t,pdf); + } + path_ac_like *= mmi->acoustic_scale; + mmi->mmi_obj = path_ac_like - mmi->lat_like; + + mmi->post_on_ali = 0.0; + for(int32 t=0; t<mmi->num_frames; t++) { + int32 pdf = 
mmi->trans_model->TransitionIdToPdf(mmi->ref_ali[t]); + double posterior = nnet_diff_h(t, pdf); + mmi->post_on_ali += posterior; + } + + KALDI_VLOG(1) << "Lattice #" << mmi->num_done + 1 << " processed" + << " (" << utt << "): found " << mmi->den_lat.NumStates() + << " states and " << fst::NumArcs(mmi->den_lat) << " arcs."; + + KALDI_VLOG(1) << "Utterance " << utt << ": Average MMI obj. value = " + << (mmi->mmi_obj/mmi->num_frames) << " over " << mmi->num_frames + << " frames," + << " (Avg. den-posterior on ali " << mmi->post_on_ali/mmi->num_frames << ")"; + + // 7a) Search for the frames with num/den mismatch + int32 frm_drop = 0; + std::vector<int32> frm_drop_vec; + for(int32 t=0; t<mmi->num_frames; t++) { + int32 pdf = mmi->trans_model->TransitionIdToPdf(mmi->ref_ali[t]); + double posterior = nnet_diff_h(t, pdf); + /* a reference pdf whose denominator posterior is below 1e-20 marks the frame as mismatched */ if(posterior < 1e-20) { + frm_drop++; + frm_drop_vec.push_back(t); + } + } + + // 8) subtract the pdf-Viterbi-path + for(int32 t=0; t<nnet_diff_h.NumRows(); t++) { + int32 pdf = mmi->trans_model->TransitionIdToPdf(mmi->ref_ali[t]); + nnet_diff_h(t, pdf) -= 1.0; + } + + // 9) Drop mismatched frames from the training by zeroing the derivative + if(mmi->drop_frames) { + for(int32 i=0; i<frm_drop_vec.size(); i++) { + nnet_diff_h.Row(frm_drop_vec[i]).Set(0.0); + } + mmi->num_frm_drop += frm_drop; + } + + // Report the frame dropping + if (frm_drop > 0) { + std::stringstream ss; + ss << (mmi->drop_frames?"Dropped":"[dropping disabled] Would drop") + << " frames in " << utt << " " << frm_drop << "/" << mmi->num_frames << ","; + //get frame intervals from vec frm_drop_vec + ss << " intervals :"; + //search for streaks of consecutive numbers: + /* frm_drop > 0 means frm_drop_vec has at least one element, so [0] is valid */ int32 beg_streak=frm_drop_vec[0]; + int32 len_streak=0; + int32 i; + for(i=0; i<frm_drop_vec.size(); i++,len_streak++) { + if(beg_streak + len_streak != frm_drop_vec[i]) { + ss << " " << beg_streak << ".." 
<< frm_drop_vec[i-1] << "frm"; + beg_streak = frm_drop_vec[i]; + len_streak = 0; + } + } + ss << " " << beg_streak << ".." << frm_drop_vec[i-1] << "frm"; + //print + KALDI_WARN << ss.str(); + } + + /* the derivative is written back over the caller's matrix row by row; stride is applied as a byte offset */ assert(mat->nrow == nnet_diff_h.NumRows() && mat->ncol == nnet_diff_h.NumCols()); + stride = mat->stride; + for (int i = 0; i < mat->nrow; i++) + { + const BaseFloat *row = nnet_diff_h.RowData(i); + BaseFloat *nerv_row = (BaseFloat *)((char *)mat->data.f + i * stride); + memmove(nerv_row, row, sizeof(BaseFloat) * mat->ncol); + } + nnet_diff_h.Resize(0,0); + + // increase time counter + mmi->total_mmi_obj += mmi->mmi_obj; + mmi->total_post_on_ali += mmi->post_on_ali; + mmi->total_frames += mmi->num_frames; + mmi->num_done++; + + if (mmi->num_done % 100 == 0) { + mmi->time_now = mmi->time->Elapsed(); + KALDI_VLOG(1) << "After " << mmi->num_done << " utterances: time elapsed = " + << mmi->time_now/60 << " min; processed " << mmi->total_frames/mmi->time_now + << " frames per second."; +#if HAVE_CUDA==1 + // check the GPU is not overheated + CuDevice::Instantiate().CheckGpuHealth(); +#endif + } + return mat; + } + + /* Returns mmi->num_frames (set by calc_diff_mmi) converted to double. */ double get_num_frames_mmi(const KaldiMMI *mmi) + { + return (double)mmi->num_frames; + } + +} diff --git a/kaldi_seq/src/kaldi_mmi.h b/kaldi_seq/src/kaldi_mmi.h new file mode 100644 index 0000000..ce6787c --- /dev/null +++ b/kaldi_seq/src/kaldi_mmi.h @@ -0,0 +1,20 @@ +#ifndef NERV_kaldi_KALDI_MMI +#define NERV_kaldi_KALDI_MMI +#include "nerv/matrix/matrix.h" +#include "nerv/common.h" +#ifdef __cplusplus +extern "C" { +#endif + + /* forward declaration only: this header does not expose the struct layout */ typedef struct KaldiMMI KaldiMMI; + + KaldiMMI * new_KaldiMMI(const char*, const char*, const char*, const char*); + void destroy_KaldiMMI(KaldiMMI *); + int check_mmi(KaldiMMI *, const Matrix*, const char *); + Matrix * calc_diff_mmi(KaldiMMI *, Matrix *, const char *); + double get_num_frames_mmi(const KaldiMMI *); + +#ifdef __cplusplus +} +#endif +#endif diff --git a/kaldi_seq/src/kaldi_mpe.cpp b/kaldi_seq/src/kaldi_mpe.cpp new file 
mode 100644 index 0000000..60384e2 --- /dev/null +++ b/kaldi_seq/src/kaldi_mpe.cpp @@ -0,0 +1,411 @@ +#include <string> +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "tree/context-dep.h" +#include "hmm/transition-model.h" +#include "fstext/fstext-lib.h" +#include "decoder/faster-decoder.h" +#include "decoder/decodable-matrix.h" +#include "lat/kaldi-lattice.h" +#include "lat/lattice-functions.h" + +#include "nnet/nnet-trnopts.h" +#include "nnet/nnet-component.h" +#include "nnet/nnet-activation.h" +#include "nnet/nnet-nnet.h" +#include "nnet/nnet-pdf-prior.h" +#include "nnet/nnet-utils.h" +#include "base/timer.h" +#include "cudamatrix/cu-device.h" + +typedef kaldi::BaseFloat BaseFloat; +typedef struct Matrix NervMatrix; + +namespace kaldi { + namespace nnet1 { + + /* For every frame t and every lattice state whose time is t, adds -log_like(t, pdf) to the second weight component of each non-epsilon arc leaving that state. state_times supplies the time of each state; lat must already be topologically sorted (otherwise KALDI_ERR is raised). */ void LatticeAcousticRescore(const Matrix<BaseFloat> &log_like, + const TransitionModel &trans_model, + const std::vector<int32> &state_times, + Lattice *lat) { + kaldi::uint64 props = lat->Properties(fst::kFstProperties, false); + if (!(props & fst::kTopSorted)) + KALDI_ERR << "Input lattice must be topologically sorted."; + + KALDI_ASSERT(!state_times.empty()); + std::vector<std::vector<int32> > time_to_state(log_like.NumRows()); + for (size_t i = 0; i < state_times.size(); i++) { + KALDI_ASSERT(state_times[i] >= 0); + if (state_times[i] < log_like.NumRows()) // end state may be past this.. 
+ time_to_state[state_times[i]].push_back(i); + else + KALDI_ASSERT(state_times[i] == log_like.NumRows() + && "There appears to be lattice/feature mismatch."); + } + + for (int32 t = 0; t < log_like.NumRows(); t++) { + for (size_t i = 0; i < time_to_state[t].size(); i++) { + int32 state = time_to_state[t][i]; + for (fst::MutableArcIterator<Lattice> aiter(lat, state); !aiter.Done(); + aiter.Next()) { + LatticeArc arc = aiter.Value(); + int32 trans_id = arc.ilabel; + if (trans_id != 0) { // Non-epsilon input label on arc + int32 pdf_id = trans_model.TransitionIdToPdf(trans_id); + arc.weight.SetValue2(-log_like(t, pdf_id) + arc.weight.Value2()); + aiter.SetValue(arc); + } + } + } + } + } + + } // namespace nnet1 +} // namespace kaldi + + +extern "C" { +#include "kaldi_mpe.h" +#include "string.h" +#include "assert.h" +#include "nerv/common.h" + + extern NervMatrix *nerv_matrix_host_float_create(long nrow, long ncol, Status *status); + extern void nerv_matrix_host_float_copy_fromd(NervMatrix *mat, const NervMatrix *cumat, int, int, int, Status *); + using namespace kaldi; + using namespace kaldi::nnet1; + typedef kaldi::int32 int32; + + /* State shared by new_KaldiMPE, check_mpe and calc_diff_mpe: the model and table readers, the current utterance's lattice, state times and reference alignment, the parsed option values, and running counters. */ struct KaldiMPE { + TransitionModel *trans_model; + RandomAccessLatticeReader *den_lat_reader; + RandomAccessInt32VectorReader *ref_ali_reader; + + Lattice den_lat; + vector<int32> state_times; + + PdfPriorOptions *prior_opts; + PdfPrior *log_prior; + + std::vector<int32> silence_phones; + std::vector<int32> ref_ali; + + Timer *time; + double time_now; + + int32 num_done, num_no_ref_ali, num_no_den_lat, num_other_error; + + kaldi::int64 total_frames; + int32 num_frames; + double total_frame_acc, utt_frame_acc; + + bool binary; + bool one_silence_class; + BaseFloat acoustic_scale, lm_scale, old_acoustic_scale; + kaldi::int32 max_frames; + bool do_smbr; + std::string use_gpu; + }; + + /* Creates a KaldiMPE handle. arg is a space-separated option string; mdl, lat and ali are forwarded as the transition-model, denominator-lattice and alignment positional arguments. */ KaldiMPE * new_KaldiMPE(const char* arg, const char* mdl, const char* lat, const char* ali) + { + KaldiMPE * mpe = new KaldiMPE; + + const char *usage = 
"Perform iteration of Neural Network MPE/sMBR training by stochastic " + "gradient descent.\n" + "The network weights are updated on each utterance.\n" + "Usage: nnet-train-mpe-sequential [options] <model-in> <transition-model-in> " + "<feature-rspecifier> <den-lat-rspecifier> <ali-rspecifier> [<model-out>]\n" + "e.g.: \n" + " nnet-train-mpe-sequential nnet.init trans.mdl scp:train.scp scp:denlats.scp ark:train.ali " + "nnet.iter1\n"; + + ParseOptions po(usage); + + NnetTrainOptions trn_opts; trn_opts.learn_rate=0.00001; + trn_opts.Register(&po); + + mpe->binary = true; + po.Register("binary", &(mpe->binary), "Write output in binary mode"); + + std::string feature_transform; + po.Register("feature-transform", &feature_transform, + "Feature transform in Nnet format"); + std::string silence_phones_str; + po.Register("silence-phones", &silence_phones_str, "Colon-separated list " + "of integer id's of silence phones, e.g. 46:47"); + + mpe->prior_opts = new PdfPriorOptions; + PdfPriorOptions &prior_opts = *(mpe->prior_opts); + prior_opts.Register(&po); + + mpe->one_silence_class = false; + /* the next three defaults are chained with the comma operator and form one statement */ mpe->acoustic_scale = 1.0, + mpe->lm_scale = 1.0, + mpe->old_acoustic_scale = 0.0; + po.Register("acoustic-scale", &(mpe->acoustic_scale), + "Scaling factor for acoustic likelihoods"); + po.Register("lm-scale", &(mpe->lm_scale), + "Scaling factor for \"graph costs\" (including LM costs)"); + po.Register("old-acoustic-scale", &(mpe->old_acoustic_scale), + "Add in the scores in the input lattices with this scale, rather " + "than discarding them."); + po.Register("one-silence-class", &(mpe->one_silence_class), "If true, newer " + "behavior which will tend to reduce insertions."); + mpe->max_frames = 6000; // Allow segments maximum of one minute by default + po.Register("max-frames",&(mpe->max_frames), "Maximum number of frames a segment can have to be processed"); + mpe->do_smbr = false; + po.Register("do-smbr", &(mpe->do_smbr), "Use state-level accuracies instead of " + "phone 
accuracies."); + + mpe->use_gpu=std::string("yes"); + po.Register("use-gpu", &(mpe->use_gpu), "yes|no|optional, only has effect if compiled with CUDA"); + + int narg = 0; + char args[64][1024]; + char *token; + char *saveptr = NULL; + char tmpstr[1024]; + + strcpy(tmpstr, arg); + strcpy(args[0], "nnet-train-mpe-sequential"); + for(narg = 1, token = strtok_r(tmpstr, " ", &saveptr); token; token = strtok_r(NULL, " ", &saveptr)) + strcpy(args[narg++], token); + strcpy(args[narg++], "0.nnet"); + strcpy(args[narg++], mdl); + strcpy(args[narg++], "feat"); + strcpy(args[narg++], lat); + strcpy(args[narg++], ali); + strcpy(args[narg++], "1.nnet"); + + char **argsv = new char*[narg]; + for(int _i = 0; _i < narg; _i++) + argsv[_i] = args[_i]; + + po.Read(narg, argsv); + delete [] argsv; + + if (po.NumArgs() != 6) { + po.PrintUsage(); + exit(1); + } + + std::string transition_model_filename = po.GetArg(2), + den_lat_rspecifier = po.GetArg(4), + ref_ali_rspecifier = po.GetArg(5); + + std::vector<int32> &silence_phones = mpe->silence_phones; + if (!kaldi::SplitStringToIntegers(silence_phones_str, ":", false, + &silence_phones)) + KALDI_ERR << "Invalid silence-phones string " << silence_phones_str; + kaldi::SortAndUniq(&silence_phones); + if (silence_phones.empty()) + KALDI_LOG << "No silence phones specified."; + + // Select the GPU +#if HAVE_CUDA == 1 + CuDevice::Instantiate().SelectGpuId(mpe->use_gpu); +#endif + + // Read the class-frame-counts, compute priors + mpe->log_prior = new PdfPrior(prior_opts); + + // Read transition model + mpe->trans_model = new TransitionModel; + ReadKaldiObject(transition_model_filename, mpe->trans_model); + + mpe->den_lat_reader = new RandomAccessLatticeReader(den_lat_rspecifier); + mpe->ref_ali_reader = new RandomAccessInt32VectorReader(ref_ali_rspecifier); + + mpe->time = new Timer; + mpe->time_now = 0; + mpe->num_done =0; + mpe->num_no_ref_ali = 0; + mpe->num_no_den_lat = 0; + mpe->num_other_error = 0; + mpe->total_frames = 0; + 
mpe->total_frame_acc = 0.0; + mpe->utt_frame_acc = 0.0; + + return mpe; + } + + void destroy_KaldiMPE(KaldiMPE *mpe) + { + delete mpe->trans_model; + delete mpe->den_lat_reader; + delete mpe->ref_ali_reader; + delete mpe->time; + delete mpe->prior_opts; + delete mpe->log_prior; + } + + int check_mpe(KaldiMPE *mpe, const NervMatrix* mat, const char *key) + { + std::string utt(key); + if (!mpe->den_lat_reader->HasKey(utt)) { + KALDI_WARN << "Utterance " << utt << ": found no lattice."; + mpe->num_no_den_lat++; + return 0; + } + if (!mpe->ref_ali_reader->HasKey(utt)) { + KALDI_WARN << "Utterance " << utt << ": found no reference alignment."; + mpe->num_no_ref_ali++; + return 0; + } + + //assert(sizeof(BaseFloat) == sizeof(float)); + // 1) get the features, numerator alignment + mpe->ref_ali = mpe->ref_ali_reader->Value(utt); + long mat_nrow = mat->nrow, mat_ncol = mat->ncol; + // check for temporal length of numerator alignments + if (static_cast<MatrixIndexT>(mpe->ref_ali.size()) != mat_nrow) { + KALDI_WARN << "Numerator alignment has wrong length " + << mpe->ref_ali.size() << " vs. 
"<< mat_nrow; + mpe->num_other_error++; + return 0; + } + if (mat_nrow > mpe->max_frames) { + KALDI_WARN << "Utterance " << utt << ": Skipped because it has " << mat_nrow << + " frames, which is more than " << mpe->max_frames << "."; + mpe->num_other_error++; + return 0; + } + // 2) get the denominator lattice, preprocess + mpe->den_lat = mpe->den_lat_reader->Value(utt); + Lattice &den_lat = mpe->den_lat; + if (den_lat.Start() == -1) { + KALDI_WARN << "Empty lattice for utt " << utt; + mpe->num_other_error++; + return 0; + } + if (mpe->old_acoustic_scale != 1.0) { + fst::ScaleLattice(fst::AcousticLatticeScale(mpe->old_acoustic_scale), + &den_lat); + } + // optional sort it topologically + kaldi::uint64 props = den_lat.Properties(fst::kFstProperties, false); + if (!(props & fst::kTopSorted)) { + if (fst::TopSort(&den_lat) == false) + KALDI_ERR << "Cycles detected in lattice."; + } + // get the lattice length and times of states + mpe->state_times.clear(); + vector<int32> &state_times = mpe->state_times; + int32 max_time = kaldi::LatticeStateTimes(den_lat, &state_times); + // check for temporal length of denominator lattices + if (max_time != mat_nrow) { + KALDI_WARN << "Denominator lattice has wrong length " + << max_time << " vs. 
" << mat_nrow; + mpe->num_other_error++; + return 0; + } + + return 1; + } + + NervMatrix * calc_diff_mpe(KaldiMPE * mpe, NervMatrix * mat, const char * key) + { + std::string utt(key); + //assert(sizeof(BaseFloat) == sizeof(float)); + + CuMatrix<BaseFloat> nnet_diff; + kaldi::Matrix<BaseFloat> nnet_out_h; + nnet_out_h.Resize(mat->nrow, mat->ncol, kUndefined); + + size_t stride = mat->stride; + for (int i = 0; i < mat->nrow; i++) + { + const BaseFloat *nerv_row = (BaseFloat *)((char *)mat->data.f + i * stride); + BaseFloat *row = nnet_out_h.RowData(i); + memmove(row, nerv_row, sizeof(BaseFloat) * mat->ncol); + } + + mpe->num_frames = nnet_out_h.NumRows(); + + PdfPriorOptions &prior_opts = *(mpe->prior_opts); + if (prior_opts.class_frame_counts != "") { + CuMatrix<BaseFloat> nnet_out; + nnet_out.Resize(nnet_out_h.NumRows(), nnet_out_h.NumCols(), kUndefined); + nnet_out.CopyFromMat(nnet_out_h); + mpe->log_prior->SubtractOnLogpost(&nnet_out); + nnet_out_h.Resize(nnet_out.NumRows(), nnet_out.NumCols(), kUndefined); + nnet_out.CopyToMat(&nnet_out_h); + nnet_out.Resize(0,0); + } + + // 4) rescore the latice + LatticeAcousticRescore(nnet_out_h, *(mpe->trans_model), mpe->state_times, &(mpe->den_lat)); + if (mpe->acoustic_scale != 1.0 || mpe->lm_scale != 1.0) + fst::ScaleLattice(fst::LatticeScale(mpe->lm_scale, mpe->acoustic_scale), &(mpe->den_lat)); + + kaldi::Posterior post; + std::vector<int32> &silence_phones = mpe->silence_phones; + + if (mpe->do_smbr) { // use state-level accuracies, i.e. sMBR estimation + mpe->utt_frame_acc = LatticeForwardBackwardMpeVariants( + *(mpe->trans_model), silence_phones, mpe->den_lat, mpe->ref_ali, "smbr", + mpe->one_silence_class, &post); + } else { // use phone-level accuracies, i.e. 
MPFE (minimum phone frame error) + mpe->utt_frame_acc = LatticeForwardBackwardMpeVariants( + *(mpe->trans_model), silence_phones, mpe->den_lat, mpe->ref_ali, "mpfe", + mpe->one_silence_class, &post); + } + + // 6) convert the Posterior to a matrix, + PosteriorToMatrixMapped(post, *(mpe->trans_model), &nnet_diff); + nnet_diff.Scale(-1.0); // need to flip the sign of derivative, + + KALDI_VLOG(1) << "Lattice #" << mpe->num_done + 1 << " processed" + << " (" << utt << "): found " << mpe->den_lat.NumStates() + << " states and " << fst::NumArcs(mpe->den_lat) << " arcs."; + + KALDI_VLOG(1) << "Utterance " << utt << ": Average frame accuracy = " + << (mpe->utt_frame_acc/mpe->num_frames) << " over " << mpe->num_frames + << " frames," + << " diff-range(" << nnet_diff.Min() << "," << nnet_diff.Max() << ")"; + + /* nnet_out_h is reused here as the host-side buffer for the derivative */ nnet_out_h.Resize(nnet_diff.NumRows(), nnet_diff.NumCols(), kUndefined); + nnet_diff.CopyToMat(&nnet_out_h); + nnet_diff.Resize(0,0); // release GPU memory, + + /* the derivative overwrites the caller's matrix row by row; this assert is the only guard on the shapes matching */ assert(mat->nrow == nnet_out_h.NumRows() && mat->ncol == nnet_out_h.NumCols()); + stride = mat->stride; + for (int i = 0; i < mat->nrow; i++) + { + const BaseFloat *row = nnet_out_h.RowData(i); + BaseFloat *nerv_row = (BaseFloat *)((char *)mat->data.f + i * stride); + memmove(nerv_row, row, sizeof(BaseFloat) * mat->ncol); + } + nnet_out_h.Resize(0,0); + + // increase time counter + mpe->total_frame_acc += mpe->utt_frame_acc; + mpe->total_frames += mpe->num_frames; + mpe->num_done++; + + if (mpe->num_done % 100 == 0) { + mpe->time_now = mpe->time->Elapsed(); + KALDI_VLOG(1) << "After " << mpe->num_done << " utterances: time elapsed = " + << mpe->time_now/60 << " min; processed " << mpe->total_frames/mpe->time_now + << " frames per second."; +#if HAVE_CUDA==1 + // check the GPU is not overheated + CuDevice::Instantiate().CheckGpuHealth(); +#endif + } + return mat; + } + + /* Returns mpe->num_frames (set by calc_diff_mpe) converted to double. */ double get_num_frames_mpe(const KaldiMPE *mpe) + { + return (double)mpe->num_frames; + } + + /* Returns mpe->utt_frame_acc, the value calc_diff_mpe stored for the last utterance. */ double get_utt_frame_acc_mpe(const KaldiMPE *mpe) 
+ { + return (double)mpe->utt_frame_acc; + } + +} diff --git a/kaldi_seq/src/kaldi_mpe.h b/kaldi_seq/src/kaldi_mpe.h new file mode 100644 index 0000000..fd09574 --- /dev/null +++ b/kaldi_seq/src/kaldi_mpe.h @@ -0,0 +1,21 @@ +#ifndef NERV_kaldi_KALDI_MPE +#define NERV_kaldi_KALDI_MPE +#include "nerv/matrix/matrix.h" +#include "nerv/common.h" +#ifdef __cplusplus +extern "C" { +#endif + + typedef struct KaldiMPE KaldiMPE; + + KaldiMPE * new_KaldiMPE(const char*, const char*, const char*, const char*); + void destroy_KaldiMPE(KaldiMPE *); + int check_mpe(KaldiMPE *, const Matrix*, const char *); + Matrix * calc_diff_mpe(KaldiMPE *, Matrix *, const char *); + double get_num_frames_mpe(const KaldiMPE *); + double get_utt_frame_acc_mpe(const KaldiMPE *); + +#ifdef __cplusplus +} +#endif +#endif diff --git a/kaldi_seq/tools/net_kaldi2nerv.cpp b/kaldi_seq/tools/net_kaldi2nerv.cpp new file mode 100644 index 0000000..bbac3db --- /dev/null +++ b/kaldi_seq/tools/net_kaldi2nerv.cpp @@ -0,0 +1,85 @@ +#include <iostream> +#include <cstdio> +#include <cstring> +#include <cstdlib> +#include <cassert> +using namespace std; + +const char fmt[] = "[%013d]\n"; + +int main(int argc, char *argv[]) +{ + if(argc < 3){ + printf("Usage: %s kaldi_nnet nerv_output\n", argv[0]); + exit(0); + } + + FILE *fin = fopen(argv[1], "r"); + FILE *fout = fopen(argv[2], "w"); + + if(!fin || !fout){ + printf("fopen error\n"); + exit(1); + } + + char buf[1024], tag[64]; + int a, b; + char ***arr; + long start, size; + int affine_ltp = 0, affine_bp = 0; + + while(fgets(buf, 1024, fin)){ + if(sscanf(buf, "%s%d%d", tag, &b, &a) == 3 && strcmp(tag, "<AffineTransform>") == 0){ + fgets(buf, 1024, fin); + arr = new char**[a]; + for(int i = 0; i < a; i++) + arr[i] = new char*[b]; + for(int j = 0; j < b; j++) + for(int i = 0; i < a; i++){ + arr[i][j] = new char[16]; + fscanf(fin, "%s", arr[i][j]); + } + + start = ftell(fout); + fprintf(fout, fmt, 0); + fprintf(fout, 
"{type=\"nerv.LinearTransParam\",id=\"affine%d_ltp\"}\n", affine_ltp++); + fprintf(fout, "%d %d\n", a, b); + for(int i = 0; i < a; i++){ + for(int j = 0; j < b; j++){ + fprintf(fout, "%s ", arr[i][j]); + delete [] arr[i][j]; + } + fprintf(fout, "\n"); + delete [] arr[i]; + } + delete [] arr; + + size = ftell(fout) - start; + fseek(fout, start, SEEK_SET); + fprintf(fout, fmt, (int)size); + fseek(fout, 0, SEEK_END); + + fgets(buf, 1024, fin); + fscanf(fin, "%*s"); + + start = ftell(fout); + fprintf(fout, fmt, 0); + fprintf(fout, "{type=\"nerv.BiasParam\",id=\"affine%d_bp\"}\n", affine_bp++); + fprintf(fout, "%d %d\n", 1, b); + for(int i = 0; i < b; i++){ + fscanf(fin, "%s", buf); + fprintf(fout, "%s ", buf); + } + fputs("\n", fout); + size = ftell(fout) - start; + fseek(fout, start, SEEK_SET); + fprintf(fout, fmt, (int)size); + fseek(fout, 0, SEEK_END); + } + } + + fclose(fin); + fclose(fout); + + return 0; +} diff --git a/kaldi_seq/tools/transf_kaldi2nerv.cpp b/kaldi_seq/tools/transf_kaldi2nerv.cpp new file mode 100644 index 0000000..525bcda --- /dev/null +++ b/kaldi_seq/tools/transf_kaldi2nerv.cpp @@ -0,0 +1,106 @@ +#include <iostream> +#include <cstdio> +#include <cstring> +#include <cstdlib> +#include <cassert> +using namespace std; + +const char fmt[] = "[%013d]\n"; + +int main(int argc, char *argv[]) +{ + if(argc < 3){ + printf("Usage: %s kaldi_transf nerv_output\n", argv[0]); + exit(1); + } + + FILE *fin = fopen(argv[1], "r"); + FILE *fout = fopen(argv[2], "w"); + if(!fin || !fout){ + puts("fopen error"); + exit(1); + } + + char buf[1024], tag[64]; + int a, b; + int size_window, size_bias; + char **window, **bias; + + while(fgets(buf, sizeof(buf), fin)) + { + if(sscanf(buf, "%s%d%d", tag, &a, &b) == 3){ + if(strcmp(tag, "<AddShift>") == 0){ + assert(a == b); + size_bias = a; + fscanf(fin, "%*s%*s%*s"); + bias = new char*[size_bias]; + for(int i = 0; i < size_bias; i++){ + bias[i] = new char[16]; + fscanf(fin, "%s", bias[i]); + } + } else if(strcmp(tag, 
"<Rescale>") == 0){ + assert(a == b); + size_window = a; + fscanf(fin, "%*s%*s%*s"); + window = new char*[size_window]; + for(int i = 0; i < size_window; i++){ + window[i] = new char[16]; + fscanf(fin, "%s", window[i]); + } + } + } + } + + long start = ftell(fout), size; + fprintf(fout, fmt, 0); + fprintf(fout, "{id = \"bias1\", type = \"nerv.MatrixParam\"}\n"); + fprintf(fout, "1 %d\n", size_bias); + for(int i = 0; i<size_bias; i++) + fprintf(fout, "0 "); + fputs("\n", fout); + size = ftell(fout) - start; + fseek(fout, start, SEEK_SET); + fprintf(fout, fmt, (int)size); + fseek(fout, 0, SEEK_END); + + start = ftell(fout); + fprintf(fout, fmt, 0); + fprintf(fout, "{id = \"window1\", type = \"nerv.MatrixParam\"}\n"); + fprintf(fout, "1 %d\n", size_window); + for(int i = 0; i<size_window; i++) + fprintf(fout, "1 "); + fputs("\n", fout); + size = ftell(fout) - start; + fseek(fout, start, SEEK_SET); + fprintf(fout, fmt, (int)size); + fseek(fout, 0, SEEK_END); + + start = ftell(fout); + fprintf(fout, fmt, 0); + fprintf(fout, "{id = \"bias2\", type = \"nerv.MatrixParam\"}\n"); + fprintf(fout, "1 %d\n", size_bias); + for(int i = 0; i<size_bias; i++) + fprintf(fout, "%s ", bias[i]); + fputs("\n", fout); + size = ftell(fout) - start; + fseek(fout, start, SEEK_SET); + fprintf(fout, fmt, (int)size); + fseek(fout, 0, SEEK_END); + + start = ftell(fout); + fprintf(fout, fmt, 0); + fprintf(fout, "{id = \"window2\", type = \"nerv.MatrixParam\"}\n"); + fprintf(fout, "1 %d\n", size_window); + for(int i = 0; i<size_window; i++) + fprintf(fout, "%s ", window[i]); + fputs("\n", fout); + size = ftell(fout) - start; + fseek(fout, start, SEEK_SET); + fprintf(fout, fmt, (int)size); + fseek(fout, 0, SEEK_END); + + fclose(fin); + fclose(fout); + + return 0; +} |