13 files changed, 250 insertions, 164 deletions
diff --git a/kaldi_decode/Makefile b/kaldi_decode/Makefile
new file mode 100644
index 0000000..e3a7c2d
--- /dev/null
+++ b/kaldi_decode/Makefile
@@ -0,0 +1,67 @@
+# Builds the `nnet-forward` decoder binary against Kaldi, LuaJIT and CUDA and
+# installs it together with its wrapper scripts. Meant to be driven by
+# luarocks: the rockspec passes LUA_BINDIR, LUA_INCDIR, LUA_LIBDIR and CFLAGS
+# for the build, and LUA_BINDIR and INST_LUADIR for the install.
+# KALDI_BASE and CUDA_BASE have to be supplied by the caller.
+
+ifndef LUA_BINDIR
+$(error Please build the package via luarocks: `luarocks make`)
+endif
+
+ifndef KALDI_BASE
+$(error KALDI_BASE is not set)
+endif
+
+ifndef CUDA_BASE
+$(error CUDA_BASE is not set)
+endif
+
+KDIR := $(KALDI_BASE)
+BUILD_DIR := $(CURDIR)/build
+INC_PATH := $(LUA_BINDIR)/../include/
+OBJS := src/nnet-forward.o nnet-forward
+
+SUBDIR := src
+OBJ_DIR := $(BUILD_DIR)/objs
+# INST_LUADIR is only passed for the install step, so LUA_DIR must not be used
+# by the build rules (it would expand to /kaldi_decode there).
+LUA_DIR = $(INST_LUADIR)/kaldi_decode
+KALDIINCLUDE := -I $(KDIR)/tools/ATLAS/include/ -I $(KDIR)/tools/openfst/include/ -I $(KDIR)/src/
+
+OBJS := $(addprefix $(OBJ_DIR)/,$(OBJS))
+OBJ_SUBDIR := $(addprefix $(OBJ_DIR)/,$(SUBDIR))
+
+# Kaldi static archives to link, followed by the BLAS flags from BLAS_LDFLAGS.
+KL := $(KDIR)/src/feat/kaldi-feat.a $(KDIR)/src/cudamatrix/kaldi-cudamatrix.a $(KDIR)/src/matrix/kaldi-matrix.a $(KDIR)/src/base/kaldi-base.a $(KDIR)/src/util/kaldi-util.a $(KDIR)/src/hmm/kaldi-hmm.a $(KDIR)/src/tree/kaldi-tree.a $(KDIR)/src/nnet/kaldi-nnet.a $(BLAS_LDFLAGS)
+
+.PHONY: build install clean
+# Do not leave a half-written object or binary behind when a recipe fails.
+.DELETE_ON_ERROR:
+
+# Default goal: the rockspec sets no build_target, so luarocks runs this one.
+build: $(OBJS)
+
+# Output directories are order-only prerequisites: they must exist before
+# compiling or linking (also under `make -j`), but their timestamps must not
+# trigger rebuilds.
+$(OBJ_DIR)/%.o: %.cc | $(OBJ_SUBDIR)
+	$(CXX) -c -o $@ $< -Wall $(KALDIINCLUDE) -DHAVE_ATLAS -DKALDI_DOUBLEPRECISION=0 -DHAVE_POSIX_MEMALIGN -DLUA_USE_APICHECK -I $(LUA_INCDIR) -I $(INC_PATH) $(CFLAGS)
+
+$(OBJ_DIR)/nnet-forward: $(OBJ_DIR)/src/nnet-forward.o | $(OBJ_DIR)
+	$(CXX) -o $@ $< $(KL) -L$(LUA_LIBDIR) -Wl,-rpath=$(LUA_LIBDIR) -lluajit-5.1 -L$(CUDA_BASE)/lib64/ -Wl,-rpath=$(CUDA_BASE)/lib64/ -lcudart -lcublas -ldl
+
+$(OBJ_DIR) $(LUA_DIR) $(OBJ_SUBDIR):
+	mkdir -p $@
+
+# Copies the binary and the Lua propagator, then generates the two wrapper
+# scripts in LUA_BINDIR with the installed paths substituted in by sed.
+install: $(LUA_DIR)
+	cp $(OBJ_DIR)/nnet-forward $(LUA_BINDIR)/nnet-forward-with-nerv
+	cp src/asr_propagator.lua $(LUA_DIR)/
+	sed 's*nnet_forward=*nnet_forward=$(LUA_BINDIR)/nnet-forward-with-nerv.sh*g;s*asr_propagator=*asr_propagator=$(LUA_BINDIR)/../share/lua/5.1/kaldi_decode/asr_propagator.lua*g' decode_with_nerv.sh > $(LUA_BINDIR)/decode_with_nerv.sh
+	echo '$(LUA_BINDIR)/nnet-forward-with-nerv "$$@"' | cat nnet-forward-with-nerv.sh - | sed 's*\.\./\.\./install/bin/luarocks*$(LUA_BINDIR)/luarocks*g' > $(LUA_BINDIR)/nnet-forward-with-nerv.sh
+	chmod +x $(LUA_BINDIR)/nnet-forward-with-nerv.sh
+	chmod +x $(LUA_BINDIR)/decode_with_nerv.sh
+
+clean:
+	$(RM) -r $(OBJ_DIR)
diff --git a/kaldi_decode/README b/kaldi_decode/README
deleted file mode 100755
index 8d0a95b..0000000
--- a/kaldi_decode/README
+++ /dev/null
@@ -1,13 +0,0 @@
-source path.sh
-source cmd.sh
-
-acwt=0.1
-dir=/slfs5/users/ymz09/chime/baseline/ASR/exp/nerv_seq/
-graph=/slfs5/users/ymz09/chime/baseline/ASR/exp/tri4a_dnn_tr05_multi_enhanced/graph_tgpr_5k
-data=/slfs5/users/ymz09/chime/baseline/ASR/data-fbank/et05_real_enhanced
-config=/slfs6/users/ymz09/nerv-project/nerv/nerv/examples/mpe_chime3.lua
-
-decode.sh --nj 4 --cmd "$decode_cmd" --config conf/decode_dnn.config --acwt $acwt \
-        $graph $data $config \
-        $dir/decode_tgpr_5k_et05_real_enhanced_nerv
-
diff --git a/kaldi_decode/README.timit b/kaldi_decode/README.timit
new file mode 100755
index 0000000..7fac918
--- /dev/null
+++ b/kaldi_decode/README.timit
@@ -0,0 +1,15 @@
+#!/bin/bash
+source path.sh  # NOTE(review): path.sh and cmd.sh are no longer shipped in kaldi_decode/; presumably taken from the directory this is run in - confirm
+source cmd.sh   # presumably defines $decode_cmd, which is used below - confirm
+
+gmmdir=/speechlab/users/mfy43/timit/s5/exp/tri3/  # its graph/ subdirectory is passed as <graph-dir>
+data_fmllr=/speechlab/users/mfy43/timit/s5/data-fmllr-tri3/  # its test/ and dev/ subdirectories are passed as <data-dir>
+dir=/speechlab/users/mfy43/timit/s5/exp/dnn4_nerv_prepare/  # parent of the decode output directories
+nerv_config=/speechlab/users/mfy43/nerv/nerv/examples/timit_baseline2.lua  # passed as <nerv-model-config>
+decode=/speechlab/users/mfy43/nerv/install/bin/decode_with_nerv.sh  # the installed decode_with_nerv.sh
+
+# Decode the test set, then the dev set (both reuse the same HCLG graph); stop at the first failure
+$decode --nj 20 --cmd "$decode_cmd" --acwt 0.2 \
+    $gmmdir/graph $data_fmllr/test $nerv_config $dir/decode_test || exit 1;
+$decode --nj 20 --cmd "$decode_cmd" --acwt 0.2 \
+    $gmmdir/graph $data_fmllr/dev $nerv_config $dir/decode_dev || exit 1;
diff --git a/kaldi_decode/cmd.sh b/kaldi_decode/cmd.sh
deleted file mode 100755
index be10905..0000000
--- a/kaldi_decode/cmd.sh
+++ /dev/null
@@ -1,37 +0,0 @@
-#!/bin/bash
-# "queue.pl" uses qsub.  The options to it are
-# options to qsub.  If you have GridEngine installed,
-# change this to a queue you have access to.
-# Otherwise, use "run.pl", which will run jobs locally
-# (make sure your --num-jobs options are no more than
-# the number of cpus on your machine.
-
-#a) JHU cluster options
-#export train_cmd="queue.pl -l arch=*64"
-#export decode_cmd="queue.pl -l arch=*64,mem_free=2G,ram_free=2G"
-#export mkgraph_cmd="queue.pl -l arch=*64,ram_free=4G,mem_free=4G"
-
-#export cuda_cmd="..."
-
-
-#b) BUT cluster options
-#export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M"
-#export decode_cmd="queue.pl -q all.q@@blade -l ram_free=1700M,mem_free=1700M"
-#export decodebig_cmd="queue.pl -q all.q@@blade -l ram_free=4G,mem_free=4G"
-
-#export cuda_cmd="queue.pl -q long.q@@pco203 -l gpu=1"
-#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu"
-#export mkgraph_cmd="queue.pl -q all.q@@servers -l ram_free=4G,mem_free=4G"
-
-#c) run it locally...
-export train_cmd=run.pl
-#export decode_cmd=run.pl
-export decode_cmd='queue.pl -l hostname="markov"'
-export cuda_cmd=run.pl
-export mkgraph_cmd=run.pl
-
-#export train_cmd='queue.pl'
-#export decode_cmd='queue.pl'
-#export cuda_cmd='queue.pl -l gpu=1 -l hostname="markov|date|hamming"'
-#export mkgraph_cmd='queue.pl"'
-
diff --git a/kaldi_decode/conf/decode_dnn.config b/kaldi_decode/conf/decode_dnn.config
deleted file mode 100644
index 89dd992..0000000
--- a/kaldi_decode/conf/decode_dnn.config
+++ /dev/null
@@ -1,2 +0,0 @@
-beam=18.0 # beam for decoding.  Was 13.0 in the scripts.
-lattice_beam=10.0 # this has most effect on size of the lattices.
diff --git a/kaldi_decode/decode_with_nerv.sh b/kaldi_decode/decode_with_nerv.sh
new file mode 100755
index 0000000..5554b2e
--- /dev/null
+++ b/kaldi_decode/decode_with_nerv.sh
@@ -0,0 +1,134 @@
+#!/bin/bash
+# Copyright 2012-2013 Karel Vesely, Daniel Povey
+# Apache 2.0
+
+# Begin configuration section.
+nnet=               # non-default location of DNN (optional; not referenced further down in this script)
+feature_transform=  # non-default location of feature_transform (optional; not referenced further down in this script)
+model=              # non-default location of transition model (optional)
+class_frame_counts= # non-default location of PDF counts (optional)
+srcdir=             # non-default location of DNN-dir (decouples model dir from decode dir)
+
+stage=0 # stage=1 skips lattice generation
+nj=4
+cmd=run.pl
+
+acwt=0.10 # note: only really affects pruning (scoring is on lattices).
+beam=13.0
+lattice_beam=8.0
+min_active=200
+max_active=7000 # limit of active tokens
+max_mem=50000000 # approx. limit to memory consumption during minimization in bytes
+nnet_forward_opts="--apply-log=true" # IMPORTANT: apply log before subtracting the log-prior; note that the modified 'nnet-forward' removed the '--no-softmax' option
+
+skip_scoring=false
+scoring_opts="--min-lmwt 4 --max-lmwt 15"
+
+num_threads=1 # if >1, will use latgen-faster-parallel
+parallel_opts=   # Ignored now.
+use_gpu="no" # yes|no|optional (was spelled "optionaly"; TODO confirm the value nnet-forward accepts)
+
+cmvn_opts=
+splice_opts=
+delta_opts=
+
+# The two paths below are left empty here; the Makefile's install step fills them in with sed.
+asr_propagator=
+nnet_forward=
+# End configuration section.
+
+echo "$0 $@"  # Log the full command line this script was invoked with
+
+[ -f ./path.sh ] && . ./path.sh; # source ./path.sh from the current directory, if present
+. parse_options.sh || exit 1; # presumably Kaldi's option parser, looked up via PATH; applies --option overrides - confirm
+
+if [ $# != 4 ]; then
+   echo "Usage: $0 [options] <graph-dir> <data-dir> <nerv-model-config> <decode-dir>"
+   echo "... where <decode-dir> is assumed to be a sub-directory of the directory"
+   echo " where the DNN and transition model is."
+   echo "e.g.: $0 exp/dnn1/graph_tgpr data/test config.lua exp/dnn1/decode_tgpr"
+   echo ""
+   echo "This script works on plain or modified features (CMN,delta+delta-delta),"
+   echo "which are then sent through feature-transform. It works out what type"
+   echo "of features you used from content of srcdir."
+   echo ""
+   echo "main options (for others, see top of script file)"
+   echo "  --config <config-file>                           # config containing options"
+   echo "  --nj <nj>                                        # number of parallel jobs"
+   echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
+   echo ""
+   echo "  --srcdir <dir>                                   # non-default dir with DNN/models, can be different"
+   echo "                                                   # from parent dir of <decode-dir>' (opt.)"
+   echo ""
+   echo "  --acwt <float>                                   # select acoustic scale for decoding"
+   echo "  --scoring-opts <opts>                            # options forwarded to local/score.sh"
+   echo "  --num-threads <N>                                # N>1: run multi-threaded decoder"
+   exit 1;
+fi
+
+
+graphdir=$1    # <graph-dir>: must contain HCLG.fst (checked below)
+data=$2        # <data-dir>: split into $nj parts under $data/split$nj
+model_conf=$3  # <nerv-model-config>: handed to $nnet_forward
+dir=$4         # <decode-dir>: receives log/, num_jobs and the lattices
+
+[ -z "$srcdir" ] && srcdir=$(dirname "$dir"); # Default model directory one level up from decoding directory.
+sdata=$data/split$nj;
+
+mkdir -p "$dir/log"
+
+[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh "$data" "$nj" || exit 1;
+echo "$nj" > "$dir/num_jobs"
+
+# Select default locations to model files (if not already set externally)
+[ -z "$model" ] && model=$srcdir/final.mdl
+# prior_counts takes priority when it exists; otherwise fall back to ali_train_pdf.counts
+[ -z "$class_frame_counts" ] && [ -f "$srcdir/prior_counts" ] && class_frame_counts=$srcdir/prior_counts
+[ -z "$class_frame_counts" ] && class_frame_counts=$srcdir/ali_train_pdf.counts
+
+# Check that files exist
+for f in "$sdata/1/feats.scp" "$model" "$class_frame_counts" "$graphdir/HCLG.fst"; do
+  [ ! -f "$f" ] && echo "$0: missing file $f" && exit 1;
+done
+
+# Decoder variant: latgen-faster-mapped gets the "-parallel" suffix when num_threads > 1.
+thread_string=
+if [ $num_threads -gt 1 ]; then thread_string="-parallel --num-threads=$num_threads"; fi
+
+
+# Assemble the feature extraction pipeline in $feats.
+# Options come from files in the model directory; a later line overrides an earlier one.
+D=$srcdir
+if [ -e $D/norm_vars ]; then cmvn_opts="--norm-means=true --norm-vars=$(cat $D/norm_vars)"; fi # backward compatibility
+if [ -e $D/cmvn_opts ]; then cmvn_opts=$(cat $D/cmvn_opts); fi
+if [ -e $D/splice_opts ]; then splice_opts=$(cat $D/splice_opts); fi
+if [ -e $D/delta_order ]; then delta_opts="--delta-order=$(cat $D/delta_order)"; fi # backward compatibility
+if [ -e $D/delta_opts ]; then delta_opts=$(cat $D/delta_opts); fi
+#
+# Base stream: copy the per-job features,
+feats="ark,s,cs:copy-feats scp:$sdata/JOB/feats.scp ark:- |"
+# then apply-cmvn when cmvn options are set (this needs the per-job cmvn.scp),
+if [ -n "$cmvn_opts" ] && [ ! -f $sdata/1/cmvn.scp ]; then echo "$0: Missing $sdata/1/cmvn.scp" && exit 1; fi
+if [ -n "$cmvn_opts" ]; then feats="$feats apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp ark:- ark:- |"; fi
+# then splice-feats when splice options are set,
+if [ -n "$splice_opts" ]; then feats="$feats splice-feats $splice_opts ark:- ark:- |"; fi
+# then add-deltas when delta options are set.
+if [ -n "$delta_opts" ]; then feats="$feats add-deltas $delta_opts ark:- ark:- |"; fi
+#
+# Stage 0: run $nj decoding jobs through $cmd; $nnet_forward output is piped into latgen-faster-mapped and the lattices are gzipped to $dir/lat.JOB.gz
+if [ $stage -le 0 ]; then
+  # The earlier form of this command requested $((num_threads+1)) threads from $cmd;
+  # the extra thread was dropped to avoid the SMP requirement.
+  $cmd --num-threads $((num_threads)) JOB=1:$nj $dir/log/decode.JOB.log \
+    $nnet_forward $nnet_forward_opts --class-frame-counts=$class_frame_counts --use-gpu=$use_gpu $model_conf "$feats" ark:- $asr_propagator \| \
+    latgen-faster-mapped$thread_string --min-active=$min_active --max-active=$max_active --max-mem=$max_mem --beam=$beam \
+    --lattice-beam=$lattice_beam --acoustic-scale=$acwt --allow-partial=true --word-symbol-table=$graphdir/words.txt \
+    $model $graphdir/HCLG.fst ark:- "ark:|gzip -c > $dir/lat.JOB.gz" || exit 1;
+fi
+# Scoring: local/score.sh is resolved relative to the current directory; if it is missing or not executable the script exits with status 1 despite the "Not scoring" wording
+if ! $skip_scoring ; then
+  [ ! -x local/score.sh ] && \
+    echo "Not scoring because local/score.sh does not exist or not executable." && exit 1;
+  local/score.sh $scoring_opts --cmd "$cmd" $data $graphdir $dir || exit 1;
+fi
+
+exit 0;
diff --git a/kaldi_decode/kaldi_decode-scm-1.rockspec b/kaldi_decode/kaldi_decode-scm-1.rockspec
new file mode 100644
index 0000000..cc533ae
--- /dev/null
+++ b/kaldi_decode/kaldi_decode-scm-1.rockspec
@@ -0,0 +1,36 @@
+package = "kaldi_decode"
+version = "scm-1"
+source = {
+    url = "https://github.com/Nerv-SJTU/nerv-speech.git"
+}
+description = {
+    summary = "Kaldi decode support for NERV",
+    detailed = [[
+    ]],
+    homepage = "https://github.com/Determinant/nerv-speech",  -- NOTE(review): owner differs from source.url (Determinant vs Nerv-SJTU) - confirm which is canonical
+    license = "BSD"
+}
+dependencies = {
+    "nerv >= scm-1",
+    "lua >= 5.1"
+}
+build = {
+    type = "make",  -- delegates to kaldi_decode/Makefile
+    build_variables = {  -- variables handed to make for the build step
+        CFLAGS="$(CFLAGS) -Wall -Wextra -g -O2",  -- appended to the compile command by the Makefile
+        -- alternative without optimization: CFLAGS="$(CFLAGS) -Wall -Wextra -g",
+        LIBFLAG="$(LIBFLAG)",
+        LUA_LIBDIR="$(LUA_LIBDIR)",  -- link search path and rpath for libluajit in the Makefile
+        LUA_BINDIR="$(LUA_BINDIR)",  -- required: the Makefile aborts when it is not defined
+        LUA_INCDIR="$(LUA_INCDIR)",  -- include path for the Lua headers in the Makefile
+        LUA="$(LUA)",
+    },
+    install_variables = {  -- variables handed to make for the install step
+        LUA_BINDIR="$(LUA_BINDIR)",  -- destination of the binary and the wrapper scripts
+        INST_PREFIX="$(PREFIX)",
+        INST_BINDIR="$(BINDIR)",
+        INST_LIBDIR="$(LIBDIR)",
+        INST_LUADIR="$(LUADIR)",  -- the Makefile installs asr_propagator.lua under $(INST_LUADIR)/kaldi_decode
+        INST_CONFDIR="$(CONFDIR)",
+    },
+}
diff --git a/kaldi_decode/local/score.sh b/kaldi_decode/local/score.sh
deleted file mode 100755
index b18f350..0000000
--- a/kaldi_decode/local/score.sh
+++ /dev/null
@@ -1,67 +0,0 @@
-#!/bin/bash
-# Copyright 2012  Johns Hopkins University (Author: Daniel Povey)
-# Apache 2.0
-
-[ -f ./path.sh ] && . ./path.sh
-
-# begin configuration section.
-cmd=run.pl
-stage=0
-decode_mbr=true
-reverse=false
-word_ins_penalty=0.0
-min_lmwt=5
-max_lmwt=20
-#end configuration section.
-
-[ -f ./path.sh ] && . ./path.sh
-. parse_options.sh || exit 1;
-
-if [ $# -ne 3 ]; then
-  echo "Usage: local/score.sh [--cmd (run.pl|queue.pl...)] <data-dir> <lang-dir|graph-dir> <decode-dir>"
-  echo " Options:"
-  echo "    --cmd (run.pl|queue.pl...)      # specify how to run the sub-processes."
-  echo "    --stage (0|1|2)                 # start scoring script from part-way through."
-  echo "    --decode_mbr (true/false)       # maximum bayes risk decoding (confusion network)."
-  echo "    --min_lmwt <int>                # minumum LM-weight for lattice rescoring "
-  echo "    --max_lmwt <int>                # maximum LM-weight for lattice rescoring "
-  echo "    --reverse (true/false)          # score with time reversed features "
-  exit 1;
-fi
-
-data=$1
-lang_or_graph=$2
-dir=$3
-
-symtab=$lang_or_graph/words.txt
-
-for f in $symtab $dir/lat.1.gz $data/text; do
-  [ ! -f $f ] && echo "score.sh: no such file $f" && exit 1;
-done
-
-mkdir -p $dir/scoring/log
-
-cat $data/text | sed 's:<NOISE>::g' | sed 's:<SPOKEN_NOISE>::g' > $dir/scoring/test_filt.txt
-
-$cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/best_path.LMWT.log \
-  lattice-scale --inv-acoustic-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \
-  lattice-add-penalty --word-ins-penalty=$word_ins_penalty ark:- ark:- \| \
-  lattice-best-path --word-symbol-table=$symtab \
-    ark:- ark,t:$dir/scoring/LMWT.tra || exit 1;
-
-if $reverse; then
-  for lmwt in `seq $min_lmwt $max_lmwt`; do
-    mv $dir/scoring/$lmwt.tra $dir/scoring/$lmwt.tra.orig
-    awk '{ printf("%s ",$1); for(i=NF; i>1; i--){ printf("%s ",$i); } printf("\n"); }' \
-       <$dir/scoring/$lmwt.tra.orig >$dir/scoring/$lmwt.tra
-  done
-fi
-
-# Note: the double level of quoting for the sed command
-$cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/score.LMWT.log \
-   cat $dir/scoring/LMWT.tra \| \
-    utils/int2sym.pl -f 2- $symtab \| sed 's:\<UNK\>::g' \| \
-    compute-wer --text --mode=present \
-     ark:$dir/scoring/test_filt.txt  ark,p:- ">&" $dir/wer_LMWT || exit 1;
-
-exit 0;
diff --git a/kaldi_decode/nnet-forward-with-nerv.sh b/kaldi_decode/nnet-forward-with-nerv.sh
new file mode 100644
index 0000000..71bf239
--- /dev/null
+++ b/kaldi_decode/nnet-forward-with-nerv.sh
@@ -0,0 +1,2 @@
+#!/bin/bash
+source <(../../install/bin/luarocks path)  # load the output of "luarocks path" into this shell; the Makefile's install step rewrites the luarocks location and appends the line that runs the binary
diff --git a/kaldi_decode/path.sh b/kaldi_decode/path.sh
deleted file mode 100755
index 5e9bd2a..0000000
--- a/kaldi_decode/path.sh
+++ /dev/null
@@ -1,11 +0,0 @@
-### change this line to your kaldi repo
-export KALDI_ROOT=/speechlab/tools/KALDI/kaldi-master/
-### the following lines should not be changed in most cases
-
-# setup kaldi path
-[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh 
-export PATH=$PWD/utils/:$KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/tools/irstlm/bin/:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lm/:$KALDI_ROOT/src/sgmmbin/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$KALDI_ROOT/src/nnetbin:$KALDI_ROOT/src/nnet2bin/:$KALDI_ROOT/src/kwsbin:$PWD:$PATH
-export LC_ALL=C
-
-# setup luarocks path and cpath for NERV (important)
-source <(../../install/bin/luarocks path)
diff --git a/kaldi_decode/src/Makefile b/kaldi_decode/src/Makefile
deleted file mode 100644
index 0897798..0000000
--- a/kaldi_decode/src/Makefile
+++ /dev/null
@@ -1,12 +0,0 @@
-# Change KDIR to `kaldi-trunk' path (Kaldi must be compiled with --share)
-KDIR := /speechlab/tools/KALDI/kaldi-master/
-NERVDIR := /speechlab/users/mfy43/nerv/
-CUDADIR := /usr/local/cuda/
-
-nnet-forward:
-	g++ -msse -msse2 -Wall -I $(KDIR)/src/ -pthread -DKALDI_DOUBLEPRECISION=0 -DHAVE_POSIX_MEMALIGN -Wno-sign-compare -Wno-unused-local-typedefs -Winit-self -DHAVE_EXECINFO_H=1 -rdynamic -DHAVE_CXXABI_H -DHAVE_ATLAS -I $(KDIR)/tools/ATLAS/include -I $(KDIR)/tools/openfst/include -Wno-sign-compare -g  -fPIC -DHAVE_CUDA -I $(CUDADIR)/include  -DKALDI_NO_EXPF  -I $(NERVDIR)/install//include/luajit-2.0/ -I $(NERVDIR)/install/include/ -DLUA_USE_APICHECK -c -o nnet-forward.o nnet-forward.cc
-	g++ -rdynamic -Wl,-rpath=$(KDIR)/tools/openfst/lib -L$(CUDADIR)/lib64 -Wl,-rpath,$(CUDADIR)/lib64 -Wl,-rpath=$(KDIR)/src/lib -L.  -L$(KDIR)/src/nnet/   -L$(KDIR)/src/cudamatrix/   -L$(KDIR)/src/lat/   -L$(KDIR)/src/hmm/   -L$(KDIR)/src/tree/   -L$(KDIR)/src/matrix/   -L$(KDIR)/src/util/   -L$(KDIR)/src/base/ nnet-forward.o $(KDIR)/src/nnet//libkaldi-nnet.so $(KDIR)/src/cudamatrix//libkaldi-cudamatrix.so $(KDIR)/src/lat//libkaldi-lat.so $(KDIR)/src/hmm//libkaldi-hmm.so $(KDIR)/src/tree//libkaldi-tree.so $(KDIR)/src/matrix//libkaldi-matrix.so $(KDIR)/src/util//libkaldi-util.so $(KDIR)/src/base//libkaldi-base.so   -L$(KDIR)/tools/openfst/lib -lfst -lm -lpthread -ldl   -lkaldi-nnet   -lkaldi-cudamatrix   -lkaldi-lat   -lkaldi-hmm   -lkaldi-tree   -lkaldi-matrix   -lkaldi-util   -lkaldi-base -lstdc++ -L$(NERVDIR)/install/lib -Wl,-rpath=$(NERVDIR)/install/lib -lnervcore -lluaT -rdynamic -Wl,-rpath=$(KDIR)//tools/openfst/lib -L$(DUDADIR)/lib64 -Wl,-rpath,$(CUDADIR)/lib64 -Wl,-rpath=$(KDIR)//src/lib -lfst -lm -lpthread -ldl -L $(NERVDIR)/luajit-2.0/src/ -lluajit -o nnet-forward -L/home/intel/mkl/lib/intel64/ -Wl,-rpath=/home/intel/mkl/lib/intel64/ -lmkl_rt
-
-clean:
-	-rm nnet-forward.o nnet-forward
-
diff --git a/kaldi_decode/src/nerv4decode.lua b/kaldi_decode/src/asr_propagator.lua
index 898b5a8..5d0ad7c 100644
--- a/kaldi_decode/src/nerv4decode.lua
+++ b/kaldi_decode/src/asr_propagator.lua
@@ -15,19 +15,18 @@ local function _add_profile_method(cls)
 end
 _add_profile_method(nerv.MMatrix)
 
-function build_trainer(ifname, feature)
+function build_propagator(ifname, feature)
     local param_repo = nerv.ParamRepo()
     param_repo:import(ifname, nil, gconf)
     local layer_repo = make_layer_repo(param_repo)
     local network = get_decode_network(layer_repo)
     local global_transf = get_global_transf(layer_repo)
-    local input_order = get_input_order()
-    local readers = make_readers(feature, layer_repo)
-    network:init(1)
+    local input_order = get_decode_input_order()
+    local readers = make_decode_readers(feature, layer_repo)
 
-    local iterative_trainer = function()
+    local batch_propagator = function()
         local data = nil
-        for ri = 1, #readers, 1 do
+        for ri = 1, #readers do
             data = readers[ri].reader:get_data()
             if data ~= nil then
                 break
@@ -38,6 +37,9 @@ function build_trainer(ifname, feature)
             return "", nil
         end
 
+        gconf.batch_size = data[input_order[1].id]:nrow()
+        network:init(gconf.batch_size)
+
         local input = {}
         for i, e in ipairs(input_order) do
             local id = e.id
@@ -47,16 +49,15 @@ function build_trainer(ifname, feature)
             local transformed
             if e.global_transf then
                 transformed = nerv.speech_utils.global_transf(data[id],
-                global_transf,
-                gconf.frm_ext or 0, 0,
-                gconf)
+                                    global_transf,
+                                    gconf.frm_ext or 0, 0,
+                                    gconf)
             else
                 transformed = data[id]
             end
             table.insert(input, transformed)
         end
         local output = {nerv.MMatrixFloat(input[1]:nrow(), network.dim_out[1])}
-        network:batch_resize(input[1]:nrow())
         network:propagate(input, output)
         
         local utt = data["key"]
@@ -64,20 +65,17 @@ function build_trainer(ifname, feature)
             nerv.error("no key found.")
         end
 
-        local mat = nerv.MMatrixFloat(output[1]:nrow(), output[1]:ncol())
-        output[1]:copy_toh(mat)
-
         collectgarbage("collect")
-        return utt, mat
+        return utt, output[1]
     end
 
-    return iterative_trainer
+    return batch_propagator
 end
 
 function init(config, feature)
     dofile(config)
     gconf.use_cpu = true -- use CPU to decode
-    trainer = build_trainer(gconf.decode_param, feature)
+    trainer = build_propagator(gconf.decode_param, feature)
 end
 
 function feed()
diff --git a/kaldi_decode/src/nnet-forward.cc b/kaldi_decode/src/nnet-forward.cc
index 4911791..8781705 100644
--- a/kaldi_decode/src/nnet-forward.cc
+++ b/kaldi_decode/src/nnet-forward.cc
@@ -46,7 +46,7 @@ int main(int argc, char *argv[]) {
         const char *usage =
             "Perform forward pass through Neural Network.\n"
             "\n"
-            "Usage:  nnet-forward [options] <nerv-config> <feature-rspecifier> <feature-wspecifier> [nerv4decode.lua]\n"
+            "Usage:  nnet-forward [options] <nerv-config> <feature-rspecifier> <feature-wspecifier> [asr_propagator.lua]\n"
             "e.g.: \n"
             " nnet-forward config.lua ark:features.ark ark:mlpoutput.ark\n";
 
@@ -78,9 +78,9 @@ int main(int argc, char *argv[]) {
         std::string config = po.GetArg(1),
             feature_rspecifier = po.GetArg(2),
             feature_wspecifier = po.GetArg(3),
-            nerv4decode = "src/nerv4decode.lua";
-        if(po.NumArgs() >= 4)
-            nerv4decode = po.GetArg(4);
+            propagator = "src/asr_propagator.lua";
+            if(po.NumArgs() >= 4)
+                propagator = po.GetArg(4);
 
         //Select the GPU
 #if HAVE_CUDA==1
@@ -99,8 +99,8 @@ int main(int argc, char *argv[]) {
 
         lua_State *L = lua_open();
         luaL_openlibs(L);
-        if(luaL_loadfile(L, nerv4decode.c_str()))
-            KALDI_ERR << "luaL_loadfile() " << nerv4decode << " failed " << lua_tostring(L, -1);
+        if(luaL_loadfile(L, propagator.c_str()))
+            KALDI_ERR << "luaL_loadfile() " << propagator << " failed " << lua_tostring(L, -1);
 
         if(lua_pcall(L, 0, 0, 0))
             KALDI_ERR << "lua_pall failed " << lua_tostring(L, -1);