summaryrefslogtreecommitdiff
path: root/kaldi_decode/utils/split_data.sh
diff options
context:
space:
mode:
Diffstat (limited to 'kaldi_decode/utils/split_data.sh')
-rwxr-xr-xkaldi_decode/utils/split_data.sh135
1 files changed, 0 insertions, 135 deletions
diff --git a/kaldi_decode/utils/split_data.sh b/kaldi_decode/utils/split_data.sh
deleted file mode 100755
index 941890c..0000000
--- a/kaldi_decode/utils/split_data.sh
+++ /dev/null
@@ -1,135 +0,0 @@
-#!/bin/bash
-# Copyright 2010-2013 Microsoft Corporation
-# Johns Hopkins University (Author: Daniel Povey)
-
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-# MERCHANTABLITY OR NON-INFRINGEMENT.
-# See the Apache 2 License for the specific language governing permissions and
-# limitations under the License.
-
-split_per_spk=true
-if [ "$1" == "--per-utt" ]; then
- split_per_spk=false
- shift
-fi
-
-if [ $# != 2 ]; then
- echo "Usage: split_data.sh [--per-utt] <data-dir> <num-to-split>"
- echo "This script will not split the data-dir if it detects that the output is newer than the input."
- echo "By default it splits per speaker (so each speaker is in only one split dir),"
- echo "but with the --per-utt option it will ignore the speaker information while splitting."
- exit 1
-fi
-
-data=$1
-numsplit=$2
-
-if [ $numsplit -le 0 ]; then
- echo "Invalid num-split argument $numsplit";
- exit 1;
-fi
-
-n=0;
-feats=""
-wavs=""
-utt2spks=""
-texts=""
-
-nu=`cat $data/utt2spk | wc -l`
-nf=`cat $data/feats.scp 2>/dev/null | wc -l`
-nt=`cat $data/text 2>/dev/null | wc -l` # take it as zero if no such file
-if [ -f $data/feats.scp ] && [ $nu -ne $nf ]; then
- echo "** split_data.sh: warning, #lines is (utt2spk,feats.scp) is ($nu,$nf); you can "
- echo "** use utils/fix_data_dir.sh $data to fix this."
-fi
-if [ -f $data/text ] && [ $nu -ne $nt ]; then
- echo "** split_data.sh: warning, #lines is (utt2spk,text) is ($nu,$nt); you can "
- echo "** use utils/fix_data_dir.sh to fix this."
-fi
-
-s1=$data/split$numsplit/1
-if [ ! -d $s1 ]; then
- need_to_split=true
-else
- need_to_split=false
- for f in utt2spk spk2utt spk2warp feats.scp text wav.scp cmvn.scp spk2gender \
- vad.scp segments reco2file_and_channel utt2lang; do
- if [[ -f $data/$f && ( ! -f $s1/$f || $s1/$f -ot $data/$f ) ]]; then
- need_to_split=true
- fi
- done
-fi
-
-if ! $need_to_split; then
- exit 0;
-fi
-
-for n in `seq $numsplit`; do
- mkdir -p $data/split$numsplit/$n
- utt2spks="$utt2spks $data/split$numsplit/$n/utt2spk"
-done
-
-if $split_per_spk; then
- utt2spk_opt="--utt2spk=$data/utt2spk"
-else
- utt2spk_opt=
-fi
-
-# If lockfile is not installed, just don't lock it. It's not a big deal.
-which lockfile >&/dev/null && lockfile -l 60 $data/.split_lock
-
-utils/split_scp.pl $utt2spk_opt $data/utt2spk $utt2spks || exit 1
-
-for n in `seq $numsplit`; do
- dsn=$data/split$numsplit/$n
- utils/utt2spk_to_spk2utt.pl $dsn/utt2spk > $dsn/spk2utt || exit 1;
-done
-
-maybe_wav_scp=
-if [ ! -f $data/segments ]; then
- maybe_wav_scp=wav.scp # If there is no segments file, then wav file is
- # indexed per utt.
-fi
-
-# split some things that are indexed by utterance.
-for f in feats.scp text vad.scp utt2lang $maybe_wav_scp; do
- if [ -f $data/$f ]; then
- utils/filter_scps.pl JOB=1:$numsplit \
- $data/split$numsplit/JOB/utt2spk $data/$f $data/split$numsplit/JOB/$f || exit 1;
- fi
-done
-
-# split some things that are indexed by speaker
-for f in spk2gender spk2warp cmvn.scp; do
- if [ -f $data/$f ]; then
- utils/filter_scps.pl JOB=1:$numsplit \
- $data/split$numsplit/JOB/spk2utt $data/$f $data/split$numsplit/JOB/$f || exit 1;
- fi
-done
-
-for n in `seq $numsplit`; do
- dsn=$data/split$numsplit/$n
- if [ -f $data/segments ]; then
- utils/filter_scp.pl $dsn/utt2spk $data/segments > $dsn/segments
- awk '{print $2;}' $dsn/segments | sort | uniq > $data/tmp.reco # recording-ids.
- if [ -f $data/reco2file_and_channel ]; then
- utils/filter_scp.pl $data/tmp.reco $data/reco2file_and_channel > $dsn/reco2file_and_channel
- fi
- if [ -f $data/wav.scp ]; then
- utils/filter_scp.pl $data/tmp.reco $data/wav.scp >$dsn/wav.scp
- fi
- rm $data/tmp.reco
- fi # else it would have been handled above, see maybe_wav.
-done
-
-rm -f $data/.split_lock
-
-exit 0