summary refs log tree commit diff
path: root/kaldi_decode/utils/split_data.sh
diff options
context:
space:
mode:
Diffstat (limited to 'kaldi_decode/utils/split_data.sh')
-rwxr-xr-x kaldi_decode/utils/split_data.sh 135
1 files changed, 135 insertions, 0 deletions
diff --git a/kaldi_decode/utils/split_data.sh b/kaldi_decode/utils/split_data.sh
new file mode 100755
index 0000000..941890c
--- /dev/null
+++ b/kaldi_decode/utils/split_data.sh
@@ -0,0 +1,135 @@
+#!/bin/bash
+# Copyright 2010-2013 Microsoft Corporation
+# Johns Hopkins University (Author: Daniel Povey)
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED ON AN *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABILITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+# Splitting mode: "true" keeps every speaker inside a single split directory;
+# "false" (selected with --per-utt) ignores the speaker information.
+split_per_spk=true
+case "$1" in
+  --per-utt)
+    split_per_spk=false
+    shift
+    ;;
+esac
+
+# Exactly two positional arguments must remain: <data-dir> and <num-to-split>.
+if [ $# -ne 2 ]; then
+  printf '%s\n' \
+    "Usage: split_data.sh [--per-utt] <data-dir> <num-to-split>" \
+    "This script will not split the data-dir if it detects that the output is newer than the input." \
+    "By default it splits per speaker (so each speaker is in only one split dir)," \
+    "but with the --per-utt option it will ignore the speaker information while splitting."
+  exit 1
+fi
+
+# Positional arguments: the data directory to split and the number of pieces.
+data=$1
+numsplit=$2
+
+# num-to-split must be a positive decimal integer.  A bare numeric comparison
+# is not enough: for an empty or non-numeric value the test itself fails with
+# an error status, the error branch is skipped and the script would carry on
+# with a bogus value.  Hence the explicit digits-only check first, and the
+# negated "-gt 0" so that a value the test cannot evaluate is rejected too.
+if [[ ! "$numsplit" =~ ^[0-9]+$ ]] || ! [ "$numsplit" -gt 0 ]; then
+  echo "Invalid num-split argument $numsplit";
+  exit 1;
+fi
+
+# Space-separated list of the per-split utt2spk output files; it is appended
+# to when the split directories are created further down.
+utt2spks=""
+
+# Sanity check: feats.scp and text, when present, should have as many lines as
+# utt2spk.  A mismatch only produces a warning; splitting still goes ahead.
+nu=$(cat "$data/utt2spk" | wc -l)
+for f in feats.scp text; do
+  [ -f "$data/$f" ] || continue
+  # cat with stderr discarded: an unreadable file is counted as zero lines.
+  nl=$(cat "$data/$f" 2>/dev/null | wc -l)
+  if [ $nu -ne $nl ]; then
+    echo "** split_data.sh: warning, #lines in (utt2spk,$f) is ($nu,$nl); you can "
+    echo "** use utils/fix_data_dir.sh $data to fix this."
+  fi
+done
+
+# Work out whether the split directories have to be (re)generated.  Only the
+# first split directory is examined: a split is needed when it does not exist,
+# or when any of the listed files is present in $data but is missing from, or
+# newer than its copy in, that first split directory.
+first_split=$data/split$numsplit/1
+need_to_split=false
+if [ -d $first_split ]; then
+  for name in utt2spk spk2utt spk2warp feats.scp text wav.scp cmvn.scp spk2gender \
+    vad.scp segments reco2file_and_channel utt2lang; do
+    [[ -f $data/$name ]] || continue
+    if [[ ! -f $first_split/$name || $first_split/$name -ot $data/$name ]]; then
+      need_to_split=true
+    fi
+  done
+else
+  need_to_split=true
+fi
+
+# Nothing to do when the existing split is up to date.
+$need_to_split || exit 0
+
+# Create one output directory per split and collect the utt2spk path of each,
+# space-separated, in $utt2spks.
+for n in $(seq $numsplit); do
+  split_dir=$data/split$numsplit/$n
+  mkdir -p $split_dir
+  utt2spks="$utt2spks $split_dir/utt2spk"
+done
+
+# The --utt2spk option is only set when splitting per speaker; otherwise the
+# variable stays empty.
+utt2spk_opt=
+if $split_per_spk; then
+  utt2spk_opt="--utt2spk=$data/utt2spk"
+fi
+
+# Serialize concurrent splits of the same data directory.
+# If lockfile is not installed, just don't lock it. It's not a big deal.
+# Once the lock has been taken, an EXIT trap releases it on every way out of
+# the script, including the "|| exit 1" failure paths, so that a failed run
+# does not leave the lock file behind for later runs to wait on.
+if command -v lockfile >/dev/null 2>&1 && lockfile -l 60 "$data/.split_lock"; then
+  trap 'rm -f "$data/.split_lock"' EXIT
+fi
+
+# Write the per-split utt2spk files.  $utt2spk_opt and $utt2spks are left
+# unquoted on purpose: the former may be empty and the latter is a
+# space-separated list of output paths.
+utils/split_scp.pl $utt2spk_opt "$data/utt2spk" $utt2spks || exit 1
+
+# Derive each split's spk2utt from its utt2spk.
+for n in $(seq $numsplit); do
+  dsn=$data/split$numsplit/$n
+  utils/utt2spk_to_spk2utt.pl "$dsn/utt2spk" > "$dsn/spk2utt" || exit 1;
+done
+
+# filter_into_splits <id-file> <name>...
+# For every <name> that exists in $data, call utils/filter_scps.pl once for
+# all splits (JOB=1:$numsplit), passing split$numsplit/JOB/<id-file>,
+# $data/<name> and split$numsplit/JOB/<name>.  Exits the script with status 1
+# if that call fails.
+filter_into_splits() {
+  local id_file=$1
+  shift
+  local name
+  for name in "$@"; do
+    [ -f $data/$name ] || continue
+    utils/filter_scps.pl JOB=1:$numsplit \
+      $data/split$numsplit/JOB/$id_file $data/$name $data/split$numsplit/JOB/$name || exit 1;
+  done
+}
+
+# If there is no segments file, then the wav file is indexed per utt and is
+# split together with the other per-utterance files.
+maybe_wav_scp=
+[ -f $data/segments ] || maybe_wav_scp=wav.scp
+
+# split some things that are indexed by utterance.
+filter_into_splits utt2spk feats.scp text vad.scp utt2lang $maybe_wav_scp
+
+# split some things that are indexed by speaker
+filter_into_splits spk2utt spk2gender spk2warp cmvn.scp
+
+# split_by_recording <split-dir> <scratch-file>
+# Writes segments for one split and, when they exist in $data, the matching
+# reco2file_and_channel and wav.scp.  <scratch-file> receives the list of
+# recording-ids used by that split.  Returns non-zero as soon as a step fails,
+# so a failed filter is no longer silently ignored.
+split_by_recording() {
+  local dsn=$1 reco_list=$2
+  utils/filter_scp.pl "$dsn/utt2spk" "$data/segments" > "$dsn/segments" || return 1
+  # Field 2 of segments holds the recording-ids; keep each one once.
+  awk '{print $2;}' "$dsn/segments" | sort -u > "$reco_list" || return 1
+  if [ -f "$data/reco2file_and_channel" ]; then
+    utils/filter_scp.pl "$reco_list" "$data/reco2file_and_channel" \
+      > "$dsn/reco2file_and_channel" || return 1
+  fi
+  if [ -f "$data/wav.scp" ]; then
+    utils/filter_scp.pl "$reco_list" "$data/wav.scp" > "$dsn/wav.scp" || return 1
+  fi
+}
+
+# Only needed when a segments file exists; otherwise wav.scp has already been
+# handled with the per-utterance files (see maybe_wav_scp).
+if [ -f "$data/segments" ]; then
+  # Private scratch file instead of a fixed $data/tmp.reco, so that runs which
+  # are not serialized by the lock cannot overwrite each other's list.
+  reco_list=$(mktemp "$data/tmp.reco.XXXXXX") || { rm -f "$data/.split_lock"; exit 1; }
+  for n in $(seq $numsplit); do
+    if ! split_by_recording "$data/split$numsplit/$n" "$reco_list"; then
+      # Leave neither the scratch file nor the lock behind on failure.
+      rm -f "$reco_list" "$data/.split_lock"
+      exit 1
+    fi
+  done
+  rm -f "$reco_list"
+fi
+
+rm -f "$data/.split_lock"
+
+exit 0