#!/bin/bash

# Copyright 2010-2013  Microsoft Corporation
#                      Johns Hopkins University (Author: Daniel Povey)

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.

# Splits the files of a data directory into <num-to-split> pieces, written to
# <data-dir>/split<num-to-split>/1 ... <data-dir>/split<num-to-split>/<num-to-split>.
#
# Usage: split_data.sh [--per-utt] <data-dir> <num-to-split>
#
# The helper programs are invoked by the relative path utils/..., so the
# script has to be run from a directory that contains utils/.
#
# Exit status: 0 on success (including "nothing to do"), 1 on a usage error
# or when any of the helper programs fails.

split_per_spk=true
if [ "$1" == "--per-utt" ]; then
  split_per_spk=false
  shift
fi

if [ $# != 2 ]; then
  {
    echo "Usage: split_data.sh [--per-utt] <data-dir> <num-to-split>"
    echo "This script will not split the data-dir if it detects that the output is newer than the input."
    echo "By default it splits per speaker (so each speaker is in only one split dir),"
    echo "but with the --per-utt option it will ignore the speaker information while splitting."
  } >&2
  exit 1
fi

data=$1
numsplit=$2

# Require a positive decimal integer.  A plain `[ $numsplit -le 0 ]` lets a
# non-numeric argument slip through, because the failing test is read as
# "false".  The 10# prefix stops a leading zero from being parsed as octal.
if ! [[ "$numsplit" =~ ^[0-9]+$ ]] || (( 10#$numsplit <= 0 )); then
  echo "Invalid num-split argument $numsplit" >&2
  exit 1
fi
njobs=$(( 10#$numsplit ))   # numeric form, used only as a loop bound

# Everything below is driven by utt2spk, so fail early if it is absent.
if [ ! -f "$data/utt2spk" ]; then
  echo "split_data.sh: no such file $data/utt2spk" >&2
  exit 1
fi

# Prints the number of lines in file $1, or 0 if the file does not exist.
count_lines() {
  local n=0
  if [ -f "$1" ]; then
    n=$(wc -l < "$1") || n=0
  fi
  echo $(( n ))   # arithmetic expansion strips any padding wc may add
}

nu=$(count_lines "$data/utt2spk")
nf=$(count_lines "$data/feats.scp")
nt=$(count_lines "$data/text")

if [ -f "$data/feats.scp" ] && [ "$nu" -ne "$nf" ]; then
  echo "** split_data.sh: warning, #lines is (utt2spk,feats.scp) is ($nu,$nf); you can "
  echo "**  use utils/fix_data_dir.sh $data to fix this."
fi
if [ -f "$data/text" ] && [ "$nu" -ne "$nt" ]; then
  echo "** split_data.sh: warning, #lines is (utt2spk,text) is ($nu,$nt); you can "
  echo "**  use utils/fix_data_dir.sh $data to fix this."
fi

splitdir=$data/split$numsplit

# Decide whether there is anything to do: split if the first output directory
# does not exist yet, or if any input file is missing from it or is newer than
# the copy in it.
s1=$splitdir/1
if [ ! -d "$s1" ]; then
  need_to_split=true
else
  need_to_split=false
  for f in utt2spk spk2utt spk2warp feats.scp text wav.scp cmvn.scp spk2gender \
    vad.scp segments reco2file_and_channel utt2lang; do
    if [[ -f $data/$f && ( ! -f $s1/$f || $s1/$f -ot $data/$f ) ]]; then
      need_to_split=true
      break
    fi
  done
fi

if ! $need_to_split; then
  exit 0
fi

lock=$data/.split_lock
reco_tmp=$data/tmp.reco   # scratch list of recording-ids, one split at a time
have_lock=false

# Runs on every exit path, so a failure half-way through no longer leaves the
# lock (or the scratch file) behind.  The lock is removed only if this process
# is the one that created it.
cleanup() {
  rm -f -- "$reco_tmp"
  if $have_lock; then
    rm -f -- "$lock"
  fi
}
trap cleanup EXIT

# If lockfile is not installed, just don't lock it.  It's not a big deal.
if command -v lockfile >/dev/null 2>&1; then
  if lockfile -l 60 "$lock"; then
    have_lock=true
  fi
fi

utt2spks=()
for (( n = 1; n <= njobs; n++ )); do
  mkdir -p "$splitdir/$n" || exit 1
  utt2spks+=("$splitdir/$n/utt2spk")
done

split_opts=()
if $split_per_spk; then
  split_opts+=("--utt2spk=$data/utt2spk")
fi

utils/split_scp.pl "${split_opts[@]}" "$data/utt2spk" "${utt2spks[@]}" || exit 1

for (( n = 1; n <= njobs; n++ )); do
  dsn=$splitdir/$n
  utils/utt2spk_to_spk2utt.pl "$dsn/utt2spk" > "$dsn/spk2utt" || exit 1
done

# Files that are indexed by utterance.  Without a segments file wav.scp is
# indexed per utterance as well; with one, it is handled further down.
utt_files=(feats.scp text vad.scp utt2lang)
if [ ! -f "$data/segments" ]; then
  utt_files+=(wav.scp)
fi

# split some things that are indexed by utterance.
for f in "${utt_files[@]}"; do
  if [ -f "$data/$f" ]; then
    utils/filter_scps.pl JOB=1:"$numsplit" \
      "$splitdir/JOB/utt2spk" "$data/$f" "$splitdir/JOB/$f" || exit 1
  fi
done

# split some things that are indexed by speaker
for f in spk2gender spk2warp cmvn.scp; do
  if [ -f "$data/$f" ]; then
    utils/filter_scps.pl JOB=1:"$numsplit" \
      "$splitdir/JOB/spk2utt" "$data/$f" "$splitdir/JOB/$f" || exit 1
  fi
done

# With a segments file, the recording-level files are filtered by the
# recording-ids (second field of segments) that occur in each split.
if [ -f "$data/segments" ]; then
  for (( n = 1; n <= njobs; n++ )); do
    dsn=$splitdir/$n
    utils/filter_scp.pl "$dsn/utt2spk" "$data/segments" > "$dsn/segments" || exit 1
    awk '{print $2;}' "$dsn/segments" | sort -u > "$reco_tmp" || exit 1  # recording-ids.
    if [ -f "$data/reco2file_and_channel" ]; then
      utils/filter_scp.pl "$reco_tmp" "$data/reco2file_and_channel" \
        > "$dsn/reco2file_and_channel" || exit 1
    fi
    if [ -f "$data/wav.scp" ]; then
      utils/filter_scp.pl "$reco_tmp" "$data/wav.scp" > "$dsn/wav.scp" || exit 1
    fi
    rm -f -- "$reco_tmp"
  done
fi

# The lock and the scratch file are released by the EXIT trap.
exit 0