#!/bin/bash
# Copyright 2010-2013 Microsoft Corporation
#                Johns Hopkins University (Author: Daniel Povey)

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.

# Provenance: added as kaldi_decode/utils/split_data.sh by commit
# b33b3a6732c6b6a66bd5c44c615be56d66f4ed67 ("support kaldi decoder",
# Yimmon Zhuang, 2015-10-14).
#
# Purpose: split the data directory <data-dir> into <num-to-split> pieces,
# written to <data-dir>/split<num-to-split>/1 .. <num-to-split>.
#
# Usage: split_data.sh [--per-utt] <data-dir> <num-to-split>
#
#   --per-utt  do not pass --utt2spk to utils/split_scp.pl, i.e. ignore the
#              speaker information while splitting.
#
# Must be run from a directory in which the helpers utils/split_scp.pl,
# utils/utt2spk_to_spk2utt.pl, utils/filter_scps.pl and utils/filter_scp.pl
# are reachable under those relative paths.
#
# Exit status: 0 when the split is up to date or was produced successfully,
# 1 on a usage error or when any step fails.

split_per_spk=true
if [ "$1" == "--per-utt" ]; then
  split_per_spk=false
  shift
fi

if [ $# != 2 ]; then
  echo "Usage: split_data.sh [--per-utt] <data-dir> <num-to-split>"
  echo "This script will not split the data-dir if it detects that the output is newer than the input."
  echo "By default it splits per speaker (so each speaker is in only one split dir),"
  echo "but with the --per-utt option it will ignore the speaker information while splitting."
  exit 1
fi

data=$1
numsplit=$2

# Reject anything that is not a positive integer up front; a non-numeric value
# would otherwise only make the numeric test itself error out and fall through.
if ! [[ "$numsplit" =~ ^[0-9]+$ ]] || [ "$numsplit" -le 0 ]; then
  echo "Invalid num-split argument $numsplit"
  exit 1
fi

split_root="$data/split$numsplit"

# Line counts; a missing feats.scp or text file counts as zero lines.
nu=$(cat "$data/utt2spk" | wc -l)
nf=$(cat "$data/feats.scp" 2>/dev/null | wc -l)
nt=$(cat "$data/text" 2>/dev/null | wc -l) # take it as zero if no such file
if [ -f "$data/feats.scp" ] && [ "$nu" -ne "$nf" ]; then
  echo "** split_data.sh: warning, #lines is (utt2spk,feats.scp) is ($nu,$nf); you can "
  echo "** use utils/fix_data_dir.sh $data to fix this."
fi
if [ -f "$data/text" ] && [ "$nu" -ne "$nt" ]; then
  echo "** split_data.sh: warning, #lines is (utt2spk,text) is ($nu,$nt); you can "
  echo "** use utils/fix_data_dir.sh $data to fix this."
fi

# Decide whether the work is needed: split again only if the first split
# directory is absent, or if some input file has no counterpart there or is
# newer than its counterpart.
s1="$split_root/1"
if [ ! -d "$s1" ]; then
  need_to_split=true
else
  need_to_split=false
  for f in utt2spk spk2utt spk2warp feats.scp text wav.scp cmvn.scp spk2gender \
    vad.scp segments reco2file_and_channel utt2lang; do
    if [[ -f "$data/$f" && ( ! -f "$s1/$f" || "$s1/$f" -ot "$data/$f" ) ]]; then
      need_to_split=true
    fi
  done
fi

if ! $need_to_split; then
  exit 0
fi

# Fail with a clear message before creating any output directories.
if [ ! -f "$data/utt2spk" ]; then
  echo "split_data.sh: no such file $data/utt2spk"
  exit 1
fi

# One output directory, and one utt2spk target, per split.
utt2spks=()
for n in $(seq "$numsplit"); do
  mkdir -p "$split_root/$n" || exit 1
  utt2spks+=("$split_root/$n/utt2spk")
done

# An empty array expands to no argument at all in --per-utt mode.
utt2spk_opt=()
if $split_per_spk; then
  utt2spk_opt=(--utt2spk="$data/utt2spk")
fi

# Release the lock and drop the scratch file on every exit path, including the
# '|| exit 1' failures below. The lock is removed only if this run acquired it.
lock_file="$data/.split_lock"
have_lock=false
tmp_reco=""
cleanup() {
  if [ -n "$tmp_reco" ]; then
    rm -f -- "$tmp_reco"
  fi
  if $have_lock; then
    rm -f -- "$lock_file"
  fi
}
trap cleanup EXIT

# If lockfile is not installed, just don't lock it.  It's not a big deal.
if command -v lockfile >/dev/null 2>&1 && lockfile -l 60 "$lock_file"; then
  have_lock=true
fi

utils/split_scp.pl "${utt2spk_opt[@]}" "$data/utt2spk" "${utt2spks[@]}" || exit 1

for n in $(seq "$numsplit"); do
  dsn="$split_root/$n"
  utils/utt2spk_to_spk2utt.pl "$dsn/utt2spk" > "$dsn/spk2utt" || exit 1
done

maybe_wav_scp=
if [ ! -f "$data/segments" ]; then
  maybe_wav_scp=wav.scp # If there is no segments file, then wav file is
                        # indexed per utt.
fi

# split some things that are indexed by utterance.
# $maybe_wav_scp is deliberately unquoted so that it vanishes when empty.
for f in feats.scp text vad.scp utt2lang $maybe_wav_scp; do
  if [ -f "$data/$f" ]; then
    utils/filter_scps.pl JOB=1:"$numsplit" \
      "$split_root/JOB/utt2spk" "$data/$f" "$split_root/JOB/$f" || exit 1
  fi
done

# split some things that are indexed by speaker
for f in spk2gender spk2warp cmvn.scp; do
  if [ -f "$data/$f" ]; then
    utils/filter_scps.pl JOB=1:"$numsplit" \
      "$split_root/JOB/spk2utt" "$data/$f" "$split_root/JOB/$f" || exit 1
  fi
done

# With a segments file, wav.scp and reco2file_and_channel are filtered by the
# second field of each split's segments (the recording-ids); otherwise wav.scp
# was already handled above, see maybe_wav_scp.
if [ -f "$data/segments" ]; then
  for n in $(seq "$numsplit"); do
    dsn="$split_root/$n"
    utils/filter_scp.pl "$dsn/utt2spk" "$data/segments" > "$dsn/segments" || exit 1

    # Per-split scratch file, so the splits do not share one temporary path.
    tmp_reco="$dsn/tmp.reco"
    awk '{print $2;}' "$dsn/segments" | sort -u > "$tmp_reco" || exit 1 # recording-ids.

    if [ -f "$data/reco2file_and_channel" ]; then
      utils/filter_scp.pl "$tmp_reco" "$data/reco2file_and_channel" \
        > "$dsn/reco2file_and_channel" || exit 1
    fi
    if [ -f "$data/wav.scp" ]; then
      utils/filter_scp.pl "$tmp_reco" "$data/wav.scp" > "$dsn/wav.scp" || exit 1
    fi

    rm -f -- "$tmp_reco"
    tmp_reco=""
  done
fi

# The lock file, if held, is removed by the EXIT trap.
exit 0