diff options
author | Ted Yin <[email protected]> | 2015-10-14 15:43:43 +0800 |
---|---|---|
committer | Ted Yin <[email protected]> | 2015-10-14 15:43:43 +0800 |
commit | fd2148540efd7a5e5e2c054280d53d3eb3b055cc (patch) | |
tree | 47501412a3324e4c13b1238eeb913aae02b2024a /kaldi_decode/utils/run.pl | |
parent | 0dba4c998fcccb4bae29582b7d8be94de476dd0b (diff) | |
parent | b33b3a6732c6b6a66bd5c44c615be56d66f4ed67 (diff) |
Merge pull request #7 from yimmon/master
support kaldi decoder
Diffstat (limited to 'kaldi_decode/utils/run.pl')
-rwxr-xr-x | kaldi_decode/utils/run.pl | 264 |
1 files changed, 264 insertions, 0 deletions
#!/usr/bin/env perl
use strict;
use warnings; #sed replacement for -w perl parameter
use POSIX ":sys_wait_h";

# Origin: kaldi_decode/utils/run.pl (added as a new file, mode 100755).

# In general, doing
#  run.pl some.log a b c is like running the command a b c in
# the bash shell, and putting the standard error and output into some.log.
# To run parallel jobs (backgrounded on the host machine), you can do (e.g.)
#  run.pl JOB=1:4 some.JOB.log a b c JOB is like running the command a b c JOB
# and putting it in some.JOB.log, for each one. [Note: JOB can be any identifier].
# If any of the jobs fails, this script will fail.

# A typical example is:
#  run.pl some.log my-prog "--opt=foo bar" foo \| other-prog baz
# and run.pl will run something like:
# ( my-prog '--opt=foo bar' foo | other-prog baz ) >& some.log
#
# Basically it takes the command-line arguments, quotes them
# as necessary to preserve spaces, and evaluates them with bash.
# In addition it puts the command line at the top of the log, and
# the start and end times of the command at the beginning and end.
# The reason why this is useful is so that we can create a different
# version of this program that uses a queueing system instead.
#
# Overall flow of the script:
#   1. discard queue-style options (only --max-jobs-run / -tc is honoured);
#   2. parse an optional NAME=start:end (or NAME=n) job specification;
#   3. pick a concurrency limit from the CPU count if none was given;
#   4. fork one child per job id; each child writes a log header, pipes the
#      command into bash, appends timing information and exits 0 or 1;
#   5. reap all children, cross-check the failure count and exit 1 if any
#      job failed, 0 otherwise.

# use Data::Dumper;

@ARGV < 2 && die "usage: run.pl log-file command-line arguments...";


my $max_jobs_run = -1;  # -1 means "not given on the command line".
my $jobstart = 1;
my $jobend = 1;
my $jobname;            # stays undefined unless a NAME=... spec is given.
my $ignored_opts = "";  # These will be ignored.

# First parse an option like JOB=1:4, and any
# options that would normally be given to
# queue.pl, which we will just discard.

if (@ARGV > 0) {
  while (@ARGV >= 2 && $ARGV[0] =~ m:^-:) { # parse any options
    # that would normally go to qsub, but which will be ignored here.
    my $switch = shift @ARGV;
    if ($switch eq "-V") {
      $ignored_opts .= "-V ";
    } elsif ($switch eq "--max-jobs-run" || $switch eq "-tc") {
      # we do support the option --max-jobs-run n, and its GridEngine form -tc n.
      $max_jobs_run = shift @ARGV;
      if (! ($max_jobs_run > 0)) {
        die "run.pl: invalid option --max-jobs-run $max_jobs_run";
      }
    } else {
      my $option = shift @ARGV;
      if ($switch eq "-sync" && $option =~ m/^[yY]/) {
        $ignored_opts .= "-sync "; # Note: in the
        # corresponding code in queue.pl it says instead, just "$sync = 1;".
      }
      $ignored_opts .= "$switch $option ";
      if ($switch eq "-pe") { # e.g. -pe smp 5
        my $option2 = shift @ARGV;
        $ignored_opts .= "$option2 ";
      }
    }
  }
  # The identifier is matched with a plain (\w+): it accepts exactly the same
  # strings as the former ([\w_][\w\d_]*)+ but cannot backtrack exponentially
  # on a long name followed by a non-numeric range.
  if ($ARGV[0] =~ m/^(\w+)=(\d+):(\d+)$/) { # e.g. JOB=1:10
    $jobname = $1;
    $jobstart = $2;
    $jobend = $3;
    # Remember the spec before shifting it away, so that the error messages
    # below show the offending argument rather than the log-file name.
    my $jobspec = shift @ARGV;
    if ($jobstart > $jobend) {
      die "run.pl: invalid job range $jobspec";
    }
    if ($jobstart <= 0) {
      die "run.pl: invalid job range $jobspec, start must be strictly positive (this is required for GridEngine compatibility).";
    }
  } elsif ($ARGV[0] =~ m/^(\w+)=(\d+)$/) { # e.g. JOB=1.
    $jobname = $1;
    $jobstart = $2;
    $jobend = $2;
    shift @ARGV;
  } elsif ($ARGV[0] =~ m/.+\=.*\:.*$/) {
    print STDERR "run.pl: Warning: suspicious first argument to run.pl: $ARGV[0]\n";
  }
}

# Users found this message confusing so we are removing it.
# if ($ignored_opts ne "") {
#   print STDERR "run.pl: Warning: ignoring options \"$ignored_opts\"\n";
# }

if ($max_jobs_run == -1) { # If --max-jobs-run option not set,
                           # then work out the number of processors if possible,
                           # and set it based on that.
  $max_jobs_run = 0;
  if (open(my $cpuinfo, '<', '/proc/cpuinfo')) { # Linux
    while (my $line = <$cpuinfo>) {
      if ($line =~ m/^processor/) { $max_jobs_run++; }
    }
    if ($max_jobs_run == 0) {
      print STDERR "run.pl: Warning: failed to detect any processors from /proc/cpuinfo\n";
      $max_jobs_run = 10; # reasonable default.
    }
    close($cpuinfo);
  } elsif (open(my $sysctl, '-|', 'sysctl -a')) { # BSD/Darwin
    while (my $line = <$sysctl>) {
      if ($line =~ m/hw\.ncpu\s*[:=]\s*(\d+)/) { # hw.ncpu = 4, or hw.ncpu: 4
        $max_jobs_run = $1;
        last;
      }
    }
    close($sysctl);
    if ($max_jobs_run == 0) {
      print STDERR "run.pl: Warning: failed to detect any processors from sysctl -a\n";
      $max_jobs_run = 10; # reasonable default.
    }
  } else {
    # allow at most 32 jobs at once, on non-UNIX systems; change this code
    # if you need to change this default.
    $max_jobs_run = 32;
  }
  # The just-computed value of $max_jobs_run is just the number of processors
  # (or our best guess); and if it happens that the number of jobs we need to
  # run is just slightly above $max_jobs_run, it will make sense to increase
  # $max_jobs_run to equal the number of jobs, so we don't have a small number
  # of leftover jobs.
  my $num_jobs = $jobend - $jobstart + 1;
  if ($num_jobs > $max_jobs_run && $num_jobs < 1.4 * $max_jobs_run) {
    $max_jobs_run = $num_jobs;
  }
}

my $logfile = shift @ARGV;

# A multi-job run must have the job name in the log path, otherwise every
# job would write to the same file.
if (defined $jobname && $logfile !~ m/$jobname/ &&
    $jobend > $jobstart) {
  print STDERR "run.pl: you are trying to run a parallel job but "
    . "you are putting the output into just one log file ($logfile)\n";
  exit(1);
}

# Rebuild the command line for bash: arguments without whitespace are passed
# through verbatim, arguments containing a double quote are wrapped in single
# quotes, and everything else is wrapped in double quotes.
my $cmd = "";

foreach my $x (@ARGV) {
  if ($x =~ m/^\S+$/) { $cmd .= $x . " "; }
  elsif ($x =~ m:\":) { $cmd .= "'$x' "; }
  else { $cmd .= "\"$x\" "; }
}

#$Data::Dumper::Indent=0;
my $ret = 0;
my $numfail = 0;
my %active_pids = ();  # child pid => job id, for children not yet reaped.
my @fail;              # job id => wait status ($?) of the finished child.
my @pid;               # job id => child pid.

# Note: a C-style loop is kept on purpose so that the first job id is exactly
# the text given on the command line.
for (my $jobid = $jobstart; $jobid <= $jobend; $jobid++) {
  if (scalar(keys %active_pids) >= $max_jobs_run) {

    # Let's wait for a change in any child's status
    # Then we have to work out which child finished
    my $r = waitpid(-1, 0);
    my $code = $?;
    if ($r < 0 ) { die "run.pl: Error waiting for child process"; } # should never happen.
    if ( defined $active_pids{$r} ) {
      my $jid = $active_pids{$r};
      $fail[$jid] = $code;
      if ($code != 0) { $numfail++; }
      delete $active_pids{$r};
      # print STDERR "Finished: $r/$jid " .  Dumper(\%active_pids) . "\n";
    } else {
      die "run.pl: Cannot find the PID of the child process that just finished.";
    }

    # In theory we could do a non-blocking waitpid over all jobs running just
    # to find out if only one or more jobs finished during the previous waitpid()
    # However, we just omit this and will reap the next one in the next pass
    # through the for(;;) cycle
  }
  my $childpid = fork();
  if (!defined $childpid) { die "run.pl: Error forking in run.pl (writing to $logfile)"; }
  if ($childpid == 0) { # We're in the child... this branch
    # executes the job and returns (possibly with an error status).
    # The substitutions below only affect this child's copies of the strings.
    if (defined $jobname) {
      $cmd =~ s/$jobname/$jobid/g;
      $logfile =~ s/$jobname/$jobid/g;
    }
    system("mkdir -p `dirname $logfile` 2>/dev/null");
    open(my $log_head, '>', $logfile) or die "run.pl: Error opening log file $logfile";
    print {$log_head} "# " . $cmd . "\n";
    print {$log_head} "# Started at " . `date`;
    my $starttime = `date +'%s'`;
    print {$log_head} "#\n";
    close($log_head);

    # Pipe into bash.. make sure we're not using any other shell.
    open(my $bash, '|-', 'bash') or die "run.pl: Error opening shell command";
    print {$bash} "( " . $cmd . ") 2>>$logfile >> $logfile";
    close($bash);                   # If there was an error, exit status is in $?
    my $status = $?;

    my $lowbits = $status & 127;    # signal number, if any
    my $highbits = $status >> 8;    # exit code of bash
    my $return_str;
    if ($lowbits != 0) { $return_str = "code $highbits; signal $lowbits" }
    else { $return_str = "code $highbits"; }

    my $endtime = `date +'%s'`;
    open(my $log_tail, '>>', $logfile) or die "run.pl: Error opening log file $logfile (again)";
    my $enddate = `date`;
    chomp $enddate;
    print {$log_tail} "# Accounting: time=" . ($endtime - $starttime) . " threads=1\n";
    print {$log_tail} "# Ended ($return_str) at " . $enddate . ", elapsed time " . ($endtime-$starttime) . " seconds\n";
    close($log_tail);
    exit($status == 0 ? 0 : 1);
  } else {
    $pid[$jobid] = $childpid;
    $active_pids{$childpid} = $jobid;
    # print STDERR "Queued: " . Dumper(\%active_pids) . "\n";
  }
}

# Now we have submitted all the jobs, let's wait until all the jobs finish
foreach my $child (keys %active_pids) {
  my $jobid = $active_pids{$child};
  my $r = waitpid($pid[$jobid], 0);
  my $code = $?;
  if ($r == -1) { die "run.pl: Error waiting for child process"; } # should never happen.
  if ($r != 0) { $fail[$jobid] = $code; $numfail++ if $code != 0; } # child was reaped; record its status
}

# Some sanity checks:
# The $fail array should not contain undefined codes
# The number of non-zeros in that array should be equal to $numfail
# We cannot do foreach() here, as the JOB ids do not necessarily start by zero
my $failed_jids = 0;
for (my $jobid = $jobstart; $jobid <= $jobend; $jobid++) {
  my $job_return = $fail[$jobid];
  if (not defined $job_return ) {
    # print Dumper(\@fail);

    die "run.pl: Sanity check failed: we have indication that some jobs are running " .
      "even after we waited for all jobs to finish" ;
  }
  if ($job_return != 0 ){ $failed_jids++; }
}
if ($failed_jids != $numfail) {
  die "run.pl: Sanity check failed: cannot find out how many jobs failed ($failed_jids x $numfail)."
}
if ($numfail > 0) { $ret = 1; }

if ($ret != 0) {
  my $njobs = $jobend - $jobstart + 1;
  if ($njobs == 1) {
    if (defined $jobname) {
      $logfile =~ s/$jobname/$jobstart/; # only one numbered job, so replace name with
                                         # that job.
    }
    print STDERR "run.pl: job failed, log is in $logfile\n";
    if ($logfile =~ m/JOB/) {
      print STDERR "run.pl: probably you forgot to put JOB=1:\$nj in your script.\n";
    }
  }
  else {
    # More than one job implies a job name was given, so $jobname is defined here.
    $logfile =~ s/$jobname/*/g;
    print STDERR "run.pl: $numfail / $njobs failed, log is in $logfile\n";
  }
}


exit ($ret);