summaryrefslogtreecommitdiff
path: root/kaldi_io/src/kaldi/util/kaldi-table-inl.h
diff options
context:
space:
mode:
Diffstat (limited to 'kaldi_io/src/kaldi/util/kaldi-table-inl.h')
-rw-r--r--kaldi_io/src/kaldi/util/kaldi-table-inl.h2246
1 files changed, 2246 insertions, 0 deletions
diff --git a/kaldi_io/src/kaldi/util/kaldi-table-inl.h b/kaldi_io/src/kaldi/util/kaldi-table-inl.h
new file mode 100644
index 0000000..6b73c88
--- /dev/null
+++ b/kaldi_io/src/kaldi/util/kaldi-table-inl.h
@@ -0,0 +1,2246 @@
+// util/kaldi-table-inl.h
+
+// Copyright 2009-2011 Microsoft Corporation
+// 2013 Johns Hopkins University (author: Daniel Povey)
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+
+#ifndef KALDI_UTIL_KALDI_TABLE_INL_H_
+#define KALDI_UTIL_KALDI_TABLE_INL_H_
+
+#include <algorithm>
+#include "util/kaldi-io.h"
+#include "util/text-utils.h"
+#include "util/stl-utils.h" // for StringHasher.
+
+
+namespace kaldi {
+
+/// \addtogroup table_impl_types
+/// @{
+
+/// Abstract interface implemented by the concrete sequential table readers
+/// in this file (one for script files, one for archives).  T is the type of
+/// object exposed by the Holder.
+template<class Holder>
+class SequentialTableReaderImplBase {
+ public:
+  typedef typename Holder::T T;
+
+  SequentialTableReaderImplBase() { }
+  virtual ~SequentialTableReaderImplBase() { }
+
+  // ---- Opening and closing ----
+  // The argument keeps its historical name; the implementations in this file
+  // are handed a complete rspecifier and classify it themselves.
+  virtual bool Open(const std::string &rxfilename) = 0;
+  virtual bool IsOpen() const = 0;
+  virtual bool Close() = 0;
+
+  // ---- Iteration ----
+  virtual bool Done() const = 0;
+  virtual void Next() = 0;
+
+  // ---- Access to the current entry ----
+  virtual std::string Key() = 0;
+  virtual const T &Value() = 0;
+  virtual void FreeCurrent() = 0;
+
+ private:
+  KALDI_DISALLOW_COPY_AND_ASSIGN(SequentialTableReaderImplBase);
+};
+
+
+// This is the implementation for SequentialTableReader
+// when it's actually a script file.
+/// Implementation of SequentialTableReader used when the rspecifier refers to
+/// a script (.scp) file.  Each line of the script is split on its first space
+/// into a key and the rxfilename of the object for that key.  Objects are
+/// loaded lazily by Value(), except in permissive mode, where Next() loads
+/// them eagerly so that entries which cannot be read are skipped.
+template<class Holder> class SequentialTableReaderScriptImpl:
+      public SequentialTableReaderImplBase<Holder> {
+ public:
+  typedef typename Holder::T T;
+
+  SequentialTableReaderScriptImpl(): state_(kUninitialized) { }
+
+  /// Opens the script file named in `rspecifier` and advances to its first
+  /// entry.  Returns false (after printing a warning) if the script cannot be
+  /// opened, is a binary file, or its first line cannot be parsed.  On every
+  /// failure path the object is left in the kUninitialized state, so it can
+  /// be destroyed or re-opened without the destructor raising an error.
+  virtual bool Open(const std::string &rspecifier) {
+    if (state_ != kUninitialized)
+      if (!Close())  // call Close() yourself to suppress this exception.
+        KALDI_ERR << "TableReader::Open, error closing previous input: "
+                  << "rspecifier was " << rspecifier_;
+    bool binary;
+    rspecifier_ = rspecifier;
+    RspecifierType rs = ClassifyRspecifier(rspecifier, &script_rxfilename_,
+                                           &opts_);
+    KALDI_ASSERT(rs == kScriptRspecifier);
+    if (!script_input_.Open(script_rxfilename_, &binary)) {  // Failure on Open
+      KALDI_WARN << "Failed to open script file "
+                 << PrintableRxfilename(script_rxfilename_);
+      state_ = kUninitialized;
+      return false;
+    }
+    if (binary) {  // script file should not be binary file..
+      KALDI_WARN << "Script file should not be a binary file: "
+                 << PrintableRxfilename(script_rxfilename_);
+      script_input_.Close();
+      // Not kError: the caller sees the failure through the return value, and
+      // leaving kError behind would make the destructor raise an error.
+      state_ = kUninitialized;
+      return false;
+    }
+    state_ = kFileStart;
+    Next();
+    if (state_ == kError) {
+      KALDI_WARN << "Error beginning to read script file: "
+                 << PrintableRxfilename(script_rxfilename_);
+      script_input_.Close();
+      // In permissive mode Next() may already have opened a data file.
+      if (data_input_.IsOpen())
+        data_input_.Close();
+      state_ = kUninitialized;  // same reasoning as for the binary case.
+      return false;
+    }
+    if (opts_.permissive) {  // Next() will have preloaded.
+      KALDI_ASSERT(state_ == kLoadSucceeded || state_ == kEof);
+    } else {
+      KALDI_ASSERT(state_ == kHaveScpLine || state_ == kEof);
+    }
+    return true;  // Success.
+  }
+
+  /// True in every state except kUninitialized.  kFileStart is internal only,
+  /// so being asked in that state is reported as an error.
+  virtual bool IsOpen() const {
+    switch (state_) {
+      case kEof: case kError: case kHaveScpLine: case kLoadSucceeded:
+      case kLoadFailed:
+        return true;
+      case kUninitialized:
+        return false;
+      default:
+        // kFileStart is not a valid state for the user to call something on.
+        KALDI_ERR << "IsOpen() called on invalid object.";
+        return false;
+    }
+  }
+
+  /// True once the script is exhausted or an error has occurred.
+  virtual bool Done() const {
+    switch (state_) {
+      case kHaveScpLine:
+        return false;
+      case kLoadSucceeded: case kLoadFailed:
+        // Loading (or freeing) the current object must not change the Done()
+        // status; only Next() should change it.
+        return false;
+      case kEof: case kError:
+        // Error condition, like Eof, counts as Done(); the destructor or
+        // Close() will inform the user of the error.
+        return true;
+      default:
+        KALDI_ERR << "Done() called on TableReader object at the wrong time.";
+        return false;
+    }
+  }
+
+  /// Key of the current entry.  Valid to call whenever Done() returns false.
+  virtual std::string Key() {
+    switch (state_) {
+      case kHaveScpLine: case kLoadSucceeded: case kLoadFailed:
+        break;
+      default:
+        // coding error.
+        KALDI_ERR << "Key() called on TableReader object at the wrong time.";
+    }
+    return key_;
+  }
+
+  /// Returns the object for the current key, loading it first if that has not
+  /// happened yet.  Raises an error if the object cannot be loaded, if it was
+  /// released with FreeCurrent(), or if there is no current entry.
+  virtual const T &Value() {
+    StateType orig_state = state_;
+    if (state_ == kHaveScpLine)
+      LoadCurrent();  // Takes state_ to kLoadSucceeded or kLoadFailed.
+    if (state_ == kLoadFailed) {
+      // This can happen due to a file listed in an scp file not existing, or
+      // read failure, failure of a command, etc.
+      if (orig_state == kHaveScpLine)
+        KALDI_ERR << "TableReader: failed to load object from "
+                  << PrintableRxfilename(data_rxfilename_)
+                  << " (to suppress this error, add the permissive "
+                  << "(p, ) option to the rspecifier.";
+      else  // orig_state was kLoadFailed, which only could have happened
+            // if the user called FreeCurrent().
+        KALDI_ERR << "TableReader: you called Value() after FreeCurrent().";
+    } else if (state_ != kLoadSucceeded) {
+      // This would be a coding error.
+      KALDI_ERR << "TableReader: Value() called at the wrong time.";
+    }
+    return holder_.Value();
+  }
+
+  /// Releases the currently loaded object.  Only warns if nothing is loaded.
+  virtual void FreeCurrent() {
+    if (state_ == kLoadSucceeded) {
+      holder_.Clear();
+      state_ = kLoadFailed;
+    } else {
+      KALDI_WARN << "TableReader: FreeCurrent called at the wrong time.";
+    }
+  }
+
+  /// Advances to the next script line.  In permissive mode, entries whose
+  /// object cannot be loaded are skipped as if they did not exist.
+  virtual void Next() {
+    while (1) {
+      NextScpLine();
+      if (Done()) return;
+      if (opts_.permissive) {
+        // Permissive mode means, when reading scp files, we treat keys whose
+        // scp entry cannot be read as nonexistent.  This means trying to read.
+        if (LoadCurrent()) return;  // Success.
+        // else try the next scp line.
+      } else {
+        return;  // We go to the next key; Value() will crash if we can't
+                 // read the scp line.
+      }
+    }
+  }
+
+  /// Closes the reader.  Returns false only if the reader was in the error
+  /// state and permissive mode was not requested.  To clean up, it also
+  /// closes the Input objects if they are open.
+  virtual bool Close() {
+    if (script_input_.IsOpen())
+      script_input_.Close();
+    if (data_input_.IsOpen())
+      data_input_.Close();
+    if (state_ == kLoadSucceeded)
+      holder_.Clear();
+    if (!this->IsOpen())
+      KALDI_ERR << "Close() called on input that was not open.";
+    StateType old_state = state_;
+    state_ = kUninitialized;
+    if (old_state == kError) {
+      if (opts_.permissive) {
+        KALDI_WARN << "Close() called on scp file with read error, ignoring the "
+            "error because permissive mode specified.";
+        return true;
+      } else return false;  // User will do something with the error status.
+    } else return true;
+  }
+
+  virtual ~SequentialTableReaderScriptImpl() {
+    if (state_ == kError)
+      KALDI_ERR << "TableReader: reading script file failed: from scp "
+                << PrintableRxfilename(script_rxfilename_);
+    // If you don't want this exception to be thrown you can
+    // call Close() and check the status.
+    if (state_ == kLoadSucceeded)
+      holder_.Clear();
+  }
+
+ private:
+  /// Attempts to load the object whose rxfilename is on the current scp line.
+  /// Moves state_ from kHaveScpLine to kLoadSucceeded or kLoadFailed and
+  /// returns true on success.
+  bool LoadCurrent() {
+    if (state_ != kHaveScpLine)
+      KALDI_ERR << "TableReader: LoadCurrent() called at the wrong time.";
+    bool ans;
+    // note, NULL means it doesn't read the binary-mode header
+    if (Holder::IsReadInBinary())
+      ans = data_input_.Open(data_rxfilename_, NULL);
+    else
+      ans = data_input_.OpenTextMode(data_rxfilename_);
+    if (!ans) {
+      // May want to make this warning a VLOG at some point
+      KALDI_WARN << "TableReader: failed to open file "
+                 << PrintableRxfilename(data_rxfilename_);
+      state_ = kLoadFailed;
+      return false;
+    }
+    if (holder_.Read(data_input_.Stream())) {
+      state_ = kLoadSucceeded;
+      return true;
+    }
+    // holder_ will not contain data.
+    KALDI_WARN << "TableReader: failed to load object from "
+               << PrintableRxfilename(data_rxfilename_);
+    state_ = kLoadFailed;
+    return false;
+  }
+
+  /// Reads the next line of the script file into key_ / data_rxfilename_.
+  /// Leaves state_ at kHaveScpLine, kError (unparseable line) or kEof.
+  void NextScpLine() {
+    switch (state_) {
+      case kLoadSucceeded:
+        holder_.Clear();
+        break;
+      case kHaveScpLine: case kLoadFailed: case kFileStart:
+        break;
+      default:
+        // No other states are valid to call Next() from.
+        KALDI_ERR << "Reading script file: Next called wrongly.";
+    }
+    std::string line;
+    if (getline(script_input_.Stream(), line)) {
+      SplitStringOnFirstSpace(line, &key_, &data_rxfilename_);
+      if (!key_.empty() && !data_rxfilename_.empty()) {
+        // Got a valid line.
+        state_ = kHaveScpLine;
+      } else {
+        // Got an invalid line: we can't make sense of this scp file.
+        state_ = kError;
+      }
+    } else {
+      state_ = kEof;  // nothing more in the scp file.
+      // Might as well close the input streams as don't need them.
+      script_input_.Close();
+      if (data_input_.IsOpen())
+        data_input_.Close();
+    }
+  }
+
+
+  Input script_input_;  // Input object for the .scp file
+  Input data_input_;    // Input object for the entries in
+                        // the script file.
+  Holder holder_;       // Holds the object.
+  std::string key_;
+  std::string rspecifier_;
+  std::string script_rxfilename_;  // of the script file.
+  RspecifierOptions opts_;         // options.
+  std::string data_rxfilename_;    // of the file we're reading.
+  enum StateType {
+    //    [The state of the reading process]   [does holder_  [is script_inp_
+    //                                          have object]   open]
+    kUninitialized,  // Uninitialized or closed.        no        no
+    kEof,            // We did Next() and found eof in script file. no   no
+    kError,          // Some other error                no        yes
+    kHaveScpLine,    // Just called Open() or Next() and have a   no   yes
+                     // line of the script file but no data.
+    kLoadSucceeded,  // Called LoadCurrent() and it succeeded.   yes  yes
+    kLoadFailed,     // Called LoadCurrent() and it failed,      no   yes
+                     // or the user called FreeCurrent().. note,
+                     // if when called by user we are in this state,
+                     // it means the user called FreeCurrent().
+    kFileStart,      // [state we only use internally]  no        yes
+  } state_;
+};
+
+
+// This is the implementation for SequentialTableReader
+// when it's an archive. Note that the archive format is:
+// key1 [space] object1 key2 [space]
+// object2 ... eof.
+// "object1" is the output of the Holder::Write function and will
+// typically contain a binary header (in binary mode) and then
+// the output of object.Write(os, binary).
+// The archive itself does not care whether it is in binary
+// or text mode, for reading purposes.
+
+/// Implementation of SequentialTableReader used when the rspecifier refers to
+/// an archive.  Entries are read one at a time from a single input stream:
+/// a whitespace-delimited key, one separator character, then whatever
+/// Holder::Read consumes.
+template<class Holder> class SequentialTableReaderArchiveImpl:
+      public SequentialTableReaderImplBase<Holder> {
+ public:
+  typedef typename Holder::T T;
+
+  SequentialTableReaderArchiveImpl(): state_(kUninitialized) { }
+
+  /// Opens the archive named in `rspecifier` and reads its first entry.
+  /// Returns false (after a warning) if the stream cannot be opened or the
+  /// first entry cannot be read; the object is then left kUninitialized.
+  virtual bool Open(const std::string &rspecifier) {
+    if (state_ != kUninitialized) {
+      if (!Close()) {  // call Close() yourself to suppress this exception.
+        if (opts_.permissive)
+          KALDI_WARN << "TableReader::Open, error closing previous input "
+              "(only warning, since permissive mode).";
+        else
+          KALDI_ERR << "TableReader::Open, error closing previous input.";
+      }
+    }
+    rspecifier_ = rspecifier;
+    RspecifierType rs = ClassifyRspecifier(rspecifier,
+                                           &archive_rxfilename_,
+                                           &opts_);
+    KALDI_ASSERT(rs == kArchiveRspecifier);
+
+    bool ans;
+    // NULL means don't expect binary-mode header
+    if (Holder::IsReadInBinary())
+      ans = input_.Open(archive_rxfilename_, NULL);
+    else
+      ans = input_.OpenTextMode(archive_rxfilename_);
+    if (!ans) {
+      KALDI_WARN << "TableReader: failed to open stream "
+                 << PrintableRxfilename(archive_rxfilename_);
+      state_ = kUninitialized;  // Failure on Open
+      return false;  // User should print the error message.
+    }
+    state_ = kFileStart;
+    Next();
+    if (state_ == kError) {
+      KALDI_WARN << "Error beginning to read archive file (wrong filename?): "
+                 << PrintableRxfilename(archive_rxfilename_);
+      input_.Close();
+      state_ = kUninitialized;
+      return false;
+    }
+    KALDI_ASSERT(state_ == kHaveObject || state_ == kEof);
+    return true;
+  }
+
+  /// Discards the current object (if any) and reads the next key and object.
+  /// Leaves state_ at kHaveObject, kEof or kError.
+  virtual void Next() {
+    switch (state_) {
+      case kHaveObject:
+        holder_.Clear();
+        break;
+      case kFileStart: case kFreedObject:
+        break;
+      default:
+        KALDI_ERR << "TableReader: Next() called wrongly.";
+    }
+    std::istream &is = input_.Stream();
+    is.clear();  // Clear any fail bits that may have been set... just in case
+                 // this happened in the Read function.
+    is >> key_;  // This eats up any leading whitespace and gets the string.
+    if (is.eof()) {
+      state_ = kEof;
+      return;
+    }
+    if (is.fail()) {  // This shouldn't really happen, barring file-system
+                      // errors.
+      KALDI_WARN << "Error reading archive "
+                 << PrintableRxfilename(archive_rxfilename_);
+      state_ = kError;
+      return;
+    }
+    // We expect a space ' ' after the key.  We also allow tab [which is
+    // consumed] and newline [which is not], just so we can read archives
+    // generated by scripts that may not be fully aware of how this format
+    // works.
+    const int c = is.peek();
+    if (c != ' ' && c != '\t' && c != '\n') {
+      KALDI_WARN << "Invalid archive file format: expected space after key "
+                 << key_ << ", got character "
+                 << CharToString(static_cast<char>(c)) << ", reading "
+                 << PrintableRxfilename(archive_rxfilename_);
+      state_ = kError;
+      return;
+    }
+    if (c != '\n') is.get();  // Consume the space or tab.
+    if (holder_.Read(is)) {
+      state_ = kHaveObject;
+    } else {
+      KALDI_WARN << "Object read failed, reading archive "
+                 << PrintableRxfilename(archive_rxfilename_);
+      state_ = kError;
+    }
+  }
+
+  /// True in every state except kUninitialized.  kFileStart is internal only,
+  /// so being asked in that state is reported as an error.
+  virtual bool IsOpen() const {
+    switch (state_) {
+      case kEof: case kError: case kHaveObject: case kFreedObject:
+        return true;
+      case kUninitialized:
+        return false;
+      default:
+        // kFileStart is not a valid state for the user to call something on.
+        KALDI_ERR << "IsOpen() called on invalid object.";
+        return false;
+    }
+  }
+
+  /// True once the archive is exhausted or an error has occurred.
+  virtual bool Done() const {
+    switch (state_) {
+      case kHaveObject:
+        return false;
+      case kEof: case kError:
+        return true;  // Error-state counts as Done(), but destructor
+                      // will fail (unless you check the status with Close()).
+      default:
+        KALDI_ERR << "Done() called on TableReader object at the wrong time.";
+        return false;
+    }
+  }
+
+  /// Key of the current entry.  Valid to call whenever Done() returns false.
+  virtual std::string Key() {
+    switch (state_) {
+      case kHaveObject:
+        break;  // only valid case.
+      default:
+        // coding error.
+        KALDI_ERR << "Key() called on TableReader object at the wrong time.";
+    }
+    return key_;
+  }
+
+  /// The current object.  Only valid while an object is held (kHaveObject).
+  virtual const T &Value() {
+    switch (state_) {
+      case kHaveObject:
+        break;  // only valid case.
+      default:
+        // coding error.
+        KALDI_ERR << "Value() called on TableReader object at the wrong time.";
+    }
+    return holder_.Value();
+  }
+
+  /// Releases the current object.  Only warns if no object is held.
+  virtual void FreeCurrent() {
+    if (state_ == kHaveObject) {
+      holder_.Clear();
+      state_ = kFreedObject;
+    } else {
+      KALDI_WARN << "TableReader: FreeCurrent called at the wrong time.";
+    }
+  }
+
+  /// Closes the reader.  Returns false only if the reader was in the error
+  /// state and permissive mode was not requested.
+  virtual bool Close() {
+    if (!this->IsOpen())
+      KALDI_ERR << "Close() called on TableReader twice or otherwise wrongly.";
+    if (input_.IsOpen())
+      input_.Close();
+    if (state_ == kHaveObject)
+      holder_.Clear();
+    bool ans;
+    if (opts_.permissive) {
+      ans = true;  // always return success.
+      if (state_ == kError)
+        KALDI_WARN << "Error detected closing TableReader for archive "
+                   << PrintableRxfilename(archive_rxfilename_) << " but ignoring "
+                   << "it as permissive mode specified.";
+    } else {
+      ans = (state_ != kError);  // If error state, user should detect it.
+    }
+    state_ = kUninitialized;
+    return ans;
+  }
+
+  virtual ~SequentialTableReaderArchiveImpl() {
+    if (state_ == kError) {
+      if (opts_.permissive)
+        KALDI_WARN << "Error detected closing TableReader for archive "
+                   << PrintableRxfilename(archive_rxfilename_) << " but ignoring "
+                   << "it as permissive mode specified.";
+      else
+        KALDI_ERR << "TableReader: error detected closing archive "
+                  << PrintableRxfilename(archive_rxfilename_);
+    }
+    // If you don't want this exception to be thrown you can
+    // call Close() and check the status.
+    if (state_ == kHaveObject)
+      holder_.Clear();
+  }
+
+ private:
+  Input input_;    // Input object for the archive
+  Holder holder_;  // Holds the object.
+  std::string key_;
+  std::string rspecifier_;
+  std::string archive_rxfilename_;
+  RspecifierOptions opts_;
+  enum {  //   [The state of the reading process]   [does holder_  [is input_
+          //                                          have object]   open]
+    kUninitialized,  // Uninitialized or closed.                no   no
+    kFileStart,      // [state we use internally: just opened.] no   yes
+    kEof,            // We did Next() and found eof in archive  no   no
+    kError,          // Some other error                        no   no
+    kHaveObject,     // We read the key and the object after it. yes yes
+    kFreedObject,    // The user called FreeCurrent().          no   yes
+  } state_;
+};
+
+
+/// Constructs a reader.  An empty rspecifier leaves it closed; otherwise the
+/// table is opened immediately and failure to open is reported as an error.
+template<class Holder>
+SequentialTableReader<Holder>::SequentialTableReader(
+    const std::string &rspecifier): impl_(NULL) {
+  if (rspecifier.empty()) return;  // nothing to open yet.
+  if (!Open(rspecifier))
+    KALDI_ERR << "Error constructing TableReader: rspecifier is " << rspecifier;
+}
+
+/// Opens the table described by rspecifier, first closing whatever was open.
+/// Picks the archive or script implementation according to the rspecifier
+/// type.  Returns false if the rspecifier is invalid or the implementation
+/// fails to open it.
+template<class Holder>
+bool SequentialTableReader<Holder>::Open(const std::string &rspecifier) {
+  if (IsOpen() && !Close())
+    KALDI_ERR << "Could not close previously open object.";
+  // Whether or not we had to close anything, impl_ is NULL at this point.
+
+  const RspecifierType kind = ClassifyRspecifier(rspecifier, NULL, NULL);
+  if (kind == kArchiveRspecifier) {
+    impl_ = new SequentialTableReaderArchiveImpl<Holder>();
+  } else if (kind == kScriptRspecifier) {
+    impl_ = new SequentialTableReaderScriptImpl<Holder>();
+  } else {  // kNoRspecifier, or anything we do not recognize.
+    KALDI_WARN << "Invalid rspecifier " << rspecifier;
+    return false;
+  }
+
+  if (impl_->Open(rspecifier))
+    return true;
+  // The sub-object will have printed warnings; we only discard it.
+  delete impl_;
+  impl_ = NULL;
+  return false;
+}
+
+// Closes the underlying implementation and destroys it, returning the status
+// reported by the implementation's Close().  CheckImpl() is defined outside
+// this view; presumably it reports an error when impl_ is NULL -- confirm.
+template<class Holder>
+bool SequentialTableReader<Holder>::Close() {
+ CheckImpl();
+ bool ans = impl_->Close();
+ delete impl_; // We don't keep around empty impl_ objects.
+ impl_ = NULL;
+ return ans;
+}
+
+
+// Returns true iff an implementation object currently exists.
+template<class Holder>
+bool SequentialTableReader<Holder>::IsOpen() const {
+ return (impl_ != NULL); // Because we delete the object whenever
+ // that object is not open. Thus, the IsOpen functions of the
+ // Impl objects are not really needed.
+}
+
+// Returns the key of the current entry by forwarding to the implementation.
+template<class Holder>
+std::string SequentialTableReader<Holder>::Key() {
+ CheckImpl();
+ return impl_->Key(); // this call may throw if called wrongly in other ways,
+ // e.g. eof.
+}
+
+
+// Asks the implementation to release the object held for the current key.
+// The implementations in this file only print a warning if there is no
+// loaded object to release.
+template<class Holder>
+void SequentialTableReader<Holder>::FreeCurrent() {
+ CheckImpl();
+ impl_->FreeCurrent();
+}
+
+
+// Returns a reference to the object for the current key by forwarding to the
+// implementation.
+template<class Holder>
+const typename SequentialTableReader<Holder>::T &
+SequentialTableReader<Holder>::Value() {
+ CheckImpl();
+ return impl_->Value(); // This may throw (if LoadCurrent() returned false you are safe.).
+}
+
+
+// Advances the implementation to the next entry of the table.
+template<class Holder>
+void SequentialTableReader<Holder>::Next() {
+ CheckImpl();
+ impl_->Next();
+}
+
+// Forwards to the implementation's Done().  In the implementations in this
+// file, Done() is true at end of input and also after an error.
+template<class Holder>
+bool SequentialTableReader<Holder>::Done() {
+ CheckImpl();
+ return impl_->Done();
+}
+
+
+/// Destroys the implementation object, if there is one.
+template<class Holder>
+SequentialTableReader<Holder>::~SequentialTableReader() {
+  // Deleting a NULL pointer is a no-op, so no explicit check is needed.
+  // Note that the destructor of the implementation may throw.
+  delete impl_;
+}
+
+
+
+/// Abstract interface implemented by the concrete table writers in this file.
+/// T is the type of object accepted by the Holder.
+template<class Holder>
+class TableWriterImplBase {
+ public:
+  typedef typename Holder::T T;
+
+  TableWriterImplBase() { }
+
+  // Derived-class destructors may throw on write error if Close() was not
+  // called.
+  virtual ~TableWriterImplBase() { }
+
+  // ---- Opening and closing ----
+  virtual bool Open(const std::string &wspecifier) = 0;
+  virtual bool IsOpen() const = 0;
+  virtual bool Close() = 0;
+
+  // ---- Output ----
+  // Write() returns true on success and false on failure, although some
+  // errors may only be detected by Close().  It throws (via KALDI_ERR) if it
+  // is called wrongly.  Throwing on every error would also have been
+  // possible, since that is what TableWriter does; the bool return dates from
+  // when TableWriter::Write returned an exit status.
+  virtual bool Write(const std::string &key, const T &value) = 0;
+
+  // Flush() flushes any archive.  It returns no status: errors are reported
+  // by the next Write() or Close().
+  virtual void Flush() = 0;
+
+ private:
+  KALDI_DISALLOW_COPY_AND_ASSIGN(TableWriterImplBase);
+};
+
+
+// The implementation of TableWriter we use when writing directly
+// to an archive with no associated scp.
+/// Implementation of TableWriter that appends "key object" entries to a
+/// single archive stream.  After a failed write the object stays open in the
+/// kWriteError state, and the failure is reported again by Close().
+template<class Holder>
+class TableWriterArchiveImpl: public TableWriterImplBase<Holder> {
+ public:
+  typedef typename Holder::T T;
+
+  /// Opens the archive named in `wspecifier`, closing any previously open
+  /// stream first.  Returns false if the output cannot be opened.
+  virtual bool Open(const std::string &wspecifier) {
+    switch (state_) {
+      case kUninitialized:
+        break;
+      case kWriteError:
+        KALDI_ERR << "TableWriter: opening stream, already open with write error.";
+      case kOpen: default:
+        if (!Close())  // throw because this error may not have been previously
+                       // detected by the user.
+          KALDI_ERR << "TableWriter: opening stream, error closing previously open stream.";
+    }
+    wspecifier_ = wspecifier;
+    WspecifierType ws = ClassifyWspecifier(wspecifier,
+                                           &archive_wxfilename_,
+                                           NULL,
+                                           &opts_);
+    KALDI_ASSERT(ws == kArchiveWspecifier);  // or wrongly called.
+
+    // The final "false" means no binary header.
+    if (output_.Open(archive_wxfilename_, opts_.binary, false)) {
+      state_ = kOpen;
+      return true;
+    } else {
+      // stream will not be open.  User will report this error
+      // (we return bool), so don't bother printing anything.
+      state_ = kUninitialized;
+      return false;
+    }
+  }
+
+  /// True while a stream is held, including after a write error.
+  virtual bool IsOpen() const {
+    switch (state_) {
+      case kUninitialized: return false;
+      case kOpen: case kWriteError: return true;
+      default: KALDI_ERR << "IsOpen() called on TableWriter in invalid state.";
+    }
+    return false;
+  }
+
+  /// Writes one "key value" entry.  Returns true on success, false on
+  /// failure, but some errors may not be detected till we call Close().
+  /// Raises an error if the writer is not open or the key is not a token.
+  virtual bool Write(const std::string &key, const T &value) {
+    switch (state_) {
+      case kOpen: break;
+      case kWriteError:
+        // user should have known from the last
+        // call to Write that there was a problem.
+        KALDI_WARN << "TableWriter: attempting to write to invalid stream.";
+        return false;
+      case kUninitialized: default:
+        KALDI_ERR << "TableWriter: Write called on invalid stream";
+    }
+    // The kWriteError case returned above, so a previous failure can never
+    // reach the code below; no second check of the state is needed.
+    if (!IsToken(key))  // e.g. empty string or has spaces...
+      KALDI_ERR << "TableWriter: using invalid key " << key;
+    output_.Stream() << key << ' ';
+    if (!Holder::Write(output_.Stream(), opts_.binary, value)) {
+      KALDI_WARN << "TableWriter: write failure to "
+                 << PrintableWxfilename(archive_wxfilename_);
+      state_ = kWriteError;
+      return false;
+    }
+    if (opts_.flush)
+      Flush();
+    return true;
+  }
+
+  /// Flushes the archive stream.  It does not return an error status; any
+  /// errors will be reported on the next Write or Close.
+  virtual void Flush() {
+    switch (state_) {
+      case kWriteError: case kOpen:
+        output_.Stream().flush();  // Don't check error status.
+        return;
+      default:
+        KALDI_WARN << "TableWriter: Flush called on not-open writer.";
+    }
+  }
+
+  /// Closes the stream.  Returns false if closing fails or if an earlier
+  /// write failed; either way the writer ends up kUninitialized.
+  virtual bool Close() {
+    if (!this->IsOpen() || !output_.IsOpen())
+      KALDI_ERR << "TableWriter: Close called on a stream that was not open." << this->IsOpen() << ", " << output_.IsOpen();
+    bool close_success = output_.Close();
+    if (!close_success) {
+      KALDI_WARN << "TableWriter: error closing stream: wspecifier is "
+                 << wspecifier_;
+      state_ = kUninitialized;
+      return false;
+    }
+    if (state_ == kWriteError) {
+      KALDI_WARN << "TableWriter: closing writer in error state: wspecifier is "
+                 << wspecifier_;
+      state_ = kUninitialized;
+      return false;
+    }
+    state_ = kUninitialized;
+    return true;
+  }
+
+  TableWriterArchiveImpl(): state_(kUninitialized) {}
+
+  // May throw on write error if Close was not called.
+  virtual ~TableWriterArchiveImpl() {
+    if (!IsOpen()) return;
+    else if (!Close())
+      KALDI_ERR << "At TableWriter destructor: Write failed or stream close "
+                << "failed: wspecifier is "<< wspecifier_;
+  }
+
+ private:
+  Output output_;
+  WspecifierOptions opts_;
+  std::string wspecifier_;
+  std::string archive_wxfilename_;
+  enum {             // is stream open?
+    kUninitialized,  // no
+    kOpen,           // yes
+    kWriteError,     // yes
+  } state_;
+};
+
+
+
+
+// The implementation of TableWriter we use when writing to
+// individual files (more generally, wxfilenames) specified
+// in an scp file that we read.
+
+// Note: the code for this class is similar to RandomAccessTableReaderScriptImpl;
+// try to keep them in sync.
+
+/// Implementation of TableWriter that writes each object to its own
+/// wxfilename, looked up by key in a script file that is read by Open().
+template<class Holder>
+class TableWriterScriptImpl: public TableWriterImplBase<Holder> {
+ public:
+  typedef typename Holder::T T;
+
+  TableWriterScriptImpl(): last_found_(0), state_(kUninitialized) {}
+
+  /// Reads and sorts the script file named in `wspecifier`.  Returns false
+  /// if the script cannot be read or contains a duplicate key.  After a
+  /// failure no entries are retained, so Open() may safely be tried again.
+  virtual bool Open(const std::string &wspecifier) {
+    switch (state_) {
+      case kReadScript:
+        KALDI_ERR << " Opening already open TableWriter: call Close first.";
+      case kUninitialized: case kNotReadScript:
+        break;
+    }
+    wspecifier_ = wspecifier;
+    WspecifierType ws = ClassifyWspecifier(wspecifier,
+                                           NULL,
+                                           &script_rxfilename_,
+                                           &opts_);
+    KALDI_ASSERT(ws == kScriptWspecifier);  // or wrongly called.
+    KALDI_ASSERT(script_.empty());  // no way it could be nonempty at this point.
+
+    if (!ReadScriptFile(script_rxfilename_,
+                        true,  // print any warnings
+                        &script_)) {  // error reading script file or invalid format
+      // ReadScriptFile is defined elsewhere and may have stored some entries
+      // before failing; drop them so that the assertion above still holds if
+      // Open() is called again.
+      script_.clear();
+      state_ = kNotReadScript;
+      return false;  // no need to print further warnings.  user gets the error.
+    }
+    std::sort(script_.begin(), script_.end());
+    for (size_t i = 0; i+1 < script_.size(); i++) {
+      if (script_[i].first.compare(script_[i+1].first) >= 0) {
+        // script[i] not < script[i+1] in lexical order...
+        KALDI_WARN << "Script file " << PrintableRxfilename(script_rxfilename_)
+                   << " contains duplicate key " << script_[i].first;
+        // Discard the entries: leaving them here would make a later call to
+        // Open() fail the script_.empty() assertion above.
+        script_.clear();
+        state_ = kNotReadScript;
+        return false;
+      }
+    }
+    state_ = kReadScript;
+    return true;
+  }
+
+  virtual bool IsOpen() const { return (state_ == kReadScript); }
+
+  /// Forgets the script contents.  Always returns true; raises an error if
+  /// the writer was not open.
+  virtual bool Close() {
+    if (!IsOpen())
+      KALDI_ERR << "Close() called on TableWriter that was not open.";
+    state_ = kUninitialized;
+    last_found_ = 0;
+    script_.clear();
+    return true;
+  }
+
+  /// Writes `value` to the wxfilename that the script associates with `key`.
+  /// Returns true on success and false on failure.  A key that is missing
+  /// from the script is a failure, except in permissive mode, where it is
+  /// silently ignored.
+  virtual bool Write(const std::string &key, const T &value) {
+    if (!IsOpen())
+      KALDI_ERR << "TableWriter: Write called on invalid stream";
+
+    if (!IsToken(key))  // e.g. empty string or has spaces...
+      KALDI_ERR << "TableWriter: using invalid key " << key;
+
+    std::string wxfilename;
+    if (!LookupFilename(key, &wxfilename)) {
+      if (opts_.permissive) {
+        return true;  // In permissive mode, it's as if we're writing to
+                      // /dev/null for missing keys.
+      } else {
+        KALDI_WARN << "TableWriter: script file "
+                   << PrintableRxfilename(script_rxfilename_)
+                   << " has no entry for key "<<key;
+        return false;
+      }
+    }
+    Output output;
+    if (!output.Open(wxfilename, opts_.binary, false)) {
+      // Open in the text/binary mode (on Windows) given by member var. "binary"
+      // (obtained from wspecifier), but do not put the binary-mode header (it
+      // will be written, if needed, by the Holder::Write function.)
+      KALDI_WARN << "TableWriter: failed to open stream: "
+                 << PrintableWxfilename(wxfilename);
+      return false;
+    }
+    if (!Holder::Write(output.Stream(), opts_.binary, value)
+        || !output.Close()) {
+      KALDI_WARN << "TableWriter: failed to write data to "
+                 << PrintableWxfilename(wxfilename);
+      return false;
+    }
+    return true;
+  }
+
+  // Flush does nothing in this implementation, there is nothing to flush.
+  virtual void Flush() { }
+
+
+  virtual ~TableWriterScriptImpl() {
+    // Nothing to do in destructor.
+  }
+
+ private:
+  /// Finds the wxfilename stored for `key`.  Returns false if the key is not
+  /// in the script.
+  // Note: this function is almost the same as in
+  // RandomAccessTableReaderScriptImpl.
+  bool LookupFilename(const std::string &key, std::string *wxfilename) {
+    // First, an optimization: if we're going consecutively, this will
+    // make the lookup very fast.
+    last_found_++;
+    if (last_found_ < script_.size() && script_[last_found_].first == key) {
+      *wxfilename = script_[last_found_].second;
+      return true;
+    }
+    std::pair<std::string, std::string> pr(key, "");  // Important that ""
+    // compares less than or equal to any string, so lower_bound points to the
+    // element that has the same key.
+    typedef typename std::vector<std::pair<std::string, std::string> >::const_iterator
+        IterType;
+    IterType iter = std::lower_bound(script_.begin(), script_.end(), pr);
+    if (iter != script_.end() && iter->first == key) {
+      last_found_ = iter - script_.begin();
+      *wxfilename = iter->second;
+      return true;
+    } else {
+      return false;
+    }
+  }
+
+
+  WspecifierOptions opts_;
+  std::string wspecifier_;
+  std::string script_rxfilename_;
+
+  // the script_ variable contains pairs of (key, filename), sorted using
+  // std::sort.  This can be used with binary_search to look up filenames for
+  // writing.  If this becomes inefficient we can use std::unordered_map (but I
+  // suspect this wouldn't be significantly faster & would use more memory).
+  // If memory becomes a problem here, the user should probably be passing
+  // only the relevant part of the scp file rather than expecting us to get too
+  // clever in the code.
+  std::vector<std::pair<std::string, std::string> > script_;
+  size_t last_found_;  // This is for an optimization used in LookupFilename.
+
+  enum {
+    kUninitialized,
+    kReadScript,
+    kNotReadScript,  // read of script failed.
+  } state_;
+};
+
+
+// The implementation of TableWriter we use when writing directly
+// to an archive plus an associated scp.
+template<class Holder>
+class TableWriterBothImpl: public TableWriterImplBase<Holder> {
+ public:
+ typedef typename Holder::T T;
+
+ virtual bool Open(const std::string &wspecifier) {
+ switch (state_) {
+ case kUninitialized:
+ break;
+ case kWriteError:
+ KALDI_ERR << "TableWriter: opening stream, already open with write error.";
+ case kOpen: default:
+ if (!Close()) // throw because this error may not have been previously detected by user.
+ KALDI_ERR << "TableWriter: opening stream, error closing previously open stream.";
+ }
+ wspecifier_ = wspecifier;
+ WspecifierType ws = ClassifyWspecifier(wspecifier,
+ &archive_wxfilename_,
+ &script_wxfilename_,
+ &opts_);
+ KALDI_ASSERT(ws == kBothWspecifier); // or wrongly called.
+ if (ClassifyWxfilename(archive_wxfilename_) != kFileOutput)
+ KALDI_WARN << "When writing to both archive and script, the script file "
+ "will generally not be interpreted correctly unless the archive is "
+ "an actual file: wspecifier = " << wspecifier;
+
+ if (!archive_output_.Open(archive_wxfilename_, opts_.binary, false)) { // false means no binary header.
+ state_ = kUninitialized;
+ return false;
+ }
+ if (!script_output_.Open(script_wxfilename_, false, false)) { // first false means text mode:
+ // script