diff options
author | Determinant <[email protected]> | 2016-02-29 20:06:25 +0800 |
---|---|---|
committer | Determinant <[email protected]> | 2016-02-29 20:06:25 +0800 |
commit | 534b039d297b9f2f83f889e2592686d79569e141 (patch) | |
tree | 66f73999d98427d2e4c2ed09eeec3d6d845fb975 /kaldi_decode/utils/split_data.sh | |
parent | 1e0ac0fb5c9f517e7325deb16004de1054454da7 (diff) |
...
Diffstat (limited to 'kaldi_decode/utils/split_data.sh')
-rwxr-xr-x | kaldi_decode/utils/split_data.sh | 135 |
1 files changed, 0 insertions, 135 deletions
diff --git a/kaldi_decode/utils/split_data.sh b/kaldi_decode/utils/split_data.sh deleted file mode 100755 index 941890c..0000000 --- a/kaldi_decode/utils/split_data.sh +++ /dev/null @@ -1,135 +0,0 @@ -#!/bin/bash -# Copyright 2010-2013 Microsoft Corporation -# Johns Hopkins University (Author: Daniel Povey) - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -# MERCHANTABLITY OR NON-INFRINGEMENT. -# See the Apache 2 License for the specific language governing permissions and -# limitations under the License. - -split_per_spk=true -if [ "$1" == "--per-utt" ]; then - split_per_spk=false - shift -fi - -if [ $# != 2 ]; then - echo "Usage: split_data.sh [--per-utt] <data-dir> <num-to-split>" - echo "This script will not split the data-dir if it detects that the output is newer than the input." - echo "By default it splits per speaker (so each speaker is in only one split dir)," - echo "but with the --per-utt option it will ignore the speaker information while splitting." - exit 1 -fi - -data=$1 -numsplit=$2 - -if [ $numsplit -le 0 ]; then - echo "Invalid num-split argument $numsplit"; - exit 1; -fi - -n=0; -feats="" -wavs="" -utt2spks="" -texts="" - -nu=`cat $data/utt2spk | wc -l` -nf=`cat $data/feats.scp 2>/dev/null | wc -l` -nt=`cat $data/text 2>/dev/null | wc -l` # take it as zero if no such file -if [ -f $data/feats.scp ] && [ $nu -ne $nf ]; then - echo "** split_data.sh: warning, #lines is (utt2spk,feats.scp) is ($nu,$nf); you can " - echo "** use utils/fix_data_dir.sh $data to fix this." -fi -if [ -f $data/text ] && [ $nu -ne $nt ]; then - echo "** split_data.sh: warning, #lines is (utt2spk,text) is ($nu,$nt); you can " - echo "** use utils/fix_data_dir.sh to fix this." -fi - -s1=$data/split$numsplit/1 -if [ ! -d $s1 ]; then - need_to_split=true -else - need_to_split=false - for f in utt2spk spk2utt spk2warp feats.scp text wav.scp cmvn.scp spk2gender \ - vad.scp segments reco2file_and_channel utt2lang; do - if [[ -f $data/$f && ( ! -f $s1/$f || $s1/$f -ot $data/$f ) ]]; then - need_to_split=true - fi - done -fi - -if ! $need_to_split; then - exit 0; -fi - -for n in `seq $numsplit`; do - mkdir -p $data/split$numsplit/$n - utt2spks="$utt2spks $data/split$numsplit/$n/utt2spk" -done - -if $split_per_spk; then - utt2spk_opt="--utt2spk=$data/utt2spk" -else - utt2spk_opt= -fi - -# If lockfile is not installed, just don't lock it. It's not a big deal. -which lockfile >&/dev/null && lockfile -l 60 $data/.split_lock - -utils/split_scp.pl $utt2spk_opt $data/utt2spk $utt2spks || exit 1 - -for n in `seq $numsplit`; do - dsn=$data/split$numsplit/$n - utils/utt2spk_to_spk2utt.pl $dsn/utt2spk > $dsn/spk2utt || exit 1; -done - -maybe_wav_scp= -if [ ! -f $data/segments ]; then - maybe_wav_scp=wav.scp # If there is no segments file, then wav file is - # indexed per utt. -fi - -# split some things that are indexed by utterance. -for f in feats.scp text vad.scp utt2lang $maybe_wav_scp; do - if [ -f $data/$f ]; then - utils/filter_scps.pl JOB=1:$numsplit \ - $data/split$numsplit/JOB/utt2spk $data/$f $data/split$numsplit/JOB/$f || exit 1; - fi -done - -# split some things that are indexed by speaker -for f in spk2gender spk2warp cmvn.scp; do - if [ -f $data/$f ]; then - utils/filter_scps.pl JOB=1:$numsplit \ - $data/split$numsplit/JOB/spk2utt $data/$f $data/split$numsplit/JOB/$f || exit 1; - fi -done - -for n in `seq $numsplit`; do - dsn=$data/split$numsplit/$n - if [ -f $data/segments ]; then - utils/filter_scp.pl $dsn/utt2spk $data/segments > $dsn/segments - awk '{print $2;}' $dsn/segments | sort | uniq > $data/tmp.reco # recording-ids. - if [ -f $data/reco2file_and_channel ]; then - utils/filter_scp.pl $data/tmp.reco $data/reco2file_and_channel > $dsn/reco2file_and_channel - fi - if [ -f $data/wav.scp ]; then - utils/filter_scp.pl $data/tmp.reco $data/wav.scp >$dsn/wav.scp - fi - rm $data/tmp.reco - fi # else it would have been handled above, see maybe_wav. -done - -rm -f $data/.split_lock - -exit 0 |