From 96a32415ab43377cf1575bd3f4f2980f58028209 Mon Sep 17 00:00:00 2001 From: Determinant Date: Fri, 14 Aug 2015 11:51:42 +0800 Subject: add implementation for kaldi io (by ymz) --- .../include/fst/extensions/far/compile-strings.h | 304 ++++++++++++ .../openfst/include/fst/extensions/far/create.h | 87 ++++ .../openfst/include/fst/extensions/far/equal.h | 99 ++++ .../openfst/include/fst/extensions/far/extract.h | 140 ++++++ .../tools/openfst/include/fst/extensions/far/far.h | 532 +++++++++++++++++++++ .../openfst/include/fst/extensions/far/farlib.h | 31 ++ .../openfst/include/fst/extensions/far/farscript.h | 273 +++++++++++ .../openfst/include/fst/extensions/far/info.h | 128 +++++ .../openfst/include/fst/extensions/far/main.h | 43 ++ .../include/fst/extensions/far/print-strings.h | 138 ++++++ .../openfst/include/fst/extensions/far/stlist.h | 305 ++++++++++++ .../openfst/include/fst/extensions/far/sttable.h | 371 ++++++++++++++ 12 files changed, 2451 insertions(+) create mode 100644 kaldi_io/src/tools/openfst/include/fst/extensions/far/compile-strings.h create mode 100644 kaldi_io/src/tools/openfst/include/fst/extensions/far/create.h create mode 100644 kaldi_io/src/tools/openfst/include/fst/extensions/far/equal.h create mode 100644 kaldi_io/src/tools/openfst/include/fst/extensions/far/extract.h create mode 100644 kaldi_io/src/tools/openfst/include/fst/extensions/far/far.h create mode 100644 kaldi_io/src/tools/openfst/include/fst/extensions/far/farlib.h create mode 100644 kaldi_io/src/tools/openfst/include/fst/extensions/far/farscript.h create mode 100644 kaldi_io/src/tools/openfst/include/fst/extensions/far/info.h create mode 100644 kaldi_io/src/tools/openfst/include/fst/extensions/far/main.h create mode 100644 kaldi_io/src/tools/openfst/include/fst/extensions/far/print-strings.h create mode 100644 kaldi_io/src/tools/openfst/include/fst/extensions/far/stlist.h create mode 100644 kaldi_io/src/tools/openfst/include/fst/extensions/far/sttable.h (limited to 'kaldi_io/src/tools/openfst/include/fst/extensions/far') diff --git a/kaldi_io/src/tools/openfst/include/fst/extensions/far/compile-strings.h b/kaldi_io/src/tools/openfst/include/fst/extensions/far/compile-strings.h new file mode 100644 index 0000000..ca247db --- /dev/null +++ b/kaldi_io/src/tools/openfst/include/fst/extensions/far/compile-strings.h @@ -0,0 +1,304 @@ + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Copyright 2005-2010 Google, Inc. +// Authors: allauzen@google.com (Cyril Allauzen) +// ttai@google.com (Terry Tai) +// jpr@google.com (Jake Ratkiewicz) + + +#ifndef FST_EXTENSIONS_FAR_COMPILE_STRINGS_H_ +#define FST_EXTENSIONS_FAR_COMPILE_STRINGS_H_ + +#include +#include +#include +using std::vector; + +#include +#include + +namespace fst { + +// Construct a reader that provides FSTs from a file (stream) either on a +// line-by-line basis or on a per-stream basis. Note that the freshly +// constructed reader is already set to the first input. +// +// Sample Usage: +// for (StringReader reader(...); !reader.Done(); reader.Next()) { +// Fst *fst = reader.GetVectorFst(); +// } +template +class StringReader { + public: + typedef A Arc; + typedef typename A::Label Label; + typedef typename A::Weight Weight; + typedef typename StringCompiler::TokenType TokenType; + + enum EntryType { LINE = 1, FILE = 2 }; + + StringReader(istream &istrm, + const string &source, + EntryType entry_type, + TokenType token_type, + bool allow_negative_labels, + const SymbolTable *syms = 0, + Label unknown_label = kNoStateId) + : nline_(0), strm_(istrm), source_(source), entry_type_(entry_type), + token_type_(token_type), symbols_(syms), done_(false), + compiler_(token_type, syms, unknown_label, allow_negative_labels) { + Next(); // Initialize the reader to the first input. + } + + bool Done() { + return done_; + } + + void Next() { + VLOG(1) << "Processing source " << source_ << " at line " << nline_; + if (!strm_) { // We're done if we have no more input. + done_ = true; + return; + } + if (entry_type_ == LINE) { + getline(strm_, content_); + ++nline_; + } else { + content_.clear(); + string line; + while (getline(strm_, line)) { + ++nline_; + content_.append(line); + content_.append("\n"); + } + } + if (!strm_ && content_.empty()) // We're also done if we read off all the + done_ = true; // whitespace at the end of a file. + } + + VectorFst *GetVectorFst(bool keep_symbols = false) { + VectorFst *fst = new VectorFst; + if (keep_symbols) { + fst->SetInputSymbols(symbols_); + fst->SetOutputSymbols(symbols_); + } + if (compiler_(content_, fst)) { + return fst; + } else { + delete fst; + return NULL; + } + } + + CompactFst > *GetCompactFst(bool keep_symbols = false) { + CompactFst > *fst; + if (keep_symbols) { + VectorFst tmp; + tmp.SetInputSymbols(symbols_); + tmp.SetOutputSymbols(symbols_); + fst = new CompactFst >(tmp); + } else { + fst = new CompactFst >; + } + if (compiler_(content_, fst)) { + return fst; + } else { + delete fst; + return NULL; + } + } + + private: + size_t nline_; + istream &strm_; + string source_; + EntryType entry_type_; + TokenType token_type_; + const SymbolTable *symbols_; + bool done_; + StringCompiler compiler_; + string content_; // The actual content of the input stream's next FST. + + DISALLOW_COPY_AND_ASSIGN(StringReader); +}; + +// Compute the minimal length required to encode each line number as a decimal +// number. +int KeySize(const char *filename); + +template +void FarCompileStrings(const vector &in_fnames, + const string &out_fname, + const string &fst_type, + const FarType &far_type, + int32 generate_keys, + FarEntryType fet, + FarTokenType tt, + const string &symbols_fname, + const string &unknown_symbol, + bool keep_symbols, + bool initial_symbols, + bool allow_negative_labels, + bool file_list_input, + const string &key_prefix, + const string &key_suffix) { + typename StringReader::EntryType entry_type; + if (fet == FET_LINE) { + entry_type = StringReader::LINE; + } else if (fet == FET_FILE) { + entry_type = StringReader::FILE; + } else { + FSTERROR() << "FarCompileStrings: unknown entry type"; + return; + } + + typename StringCompiler::TokenType token_type; + if (tt == FTT_SYMBOL) { + token_type = StringCompiler::SYMBOL; + } else if (tt == FTT_BYTE) { + token_type = StringCompiler::BYTE; + } else if (tt == FTT_UTF8) { + token_type = StringCompiler::UTF8; + } else { + FSTERROR() << "FarCompileStrings: unknown token type"; + return; + } + + bool compact; + if (fst_type.empty() || (fst_type == "vector")) { + compact = false; + } else if (fst_type == "compact") { + compact = true; + } else { + FSTERROR() << "FarCompileStrings: unknown fst type: " + << fst_type; + return; + } + + const SymbolTable *syms = 0; + typename Arc::Label unknown_label = kNoLabel; + if (!symbols_fname.empty()) { + SymbolTableTextOptions opts; + opts.allow_negative = allow_negative_labels; + syms = SymbolTable::ReadText(symbols_fname, opts); + if (!syms) { + FSTERROR() << "FarCompileStrings: error reading symbol table: " + << symbols_fname; + return; + } + if (!unknown_symbol.empty()) { + unknown_label = syms->Find(unknown_symbol); + if (unknown_label == kNoLabel) { + FSTERROR() << "FarCompileStrings: unknown label \"" << unknown_label + << "\" missing from symbol table: " << symbols_fname; + return; + } + } + } + + FarWriter *far_writer = + FarWriter::Create(out_fname, far_type); + if (!far_writer) return; + + vector inputs; + if (file_list_input) { + for (int i = 1; i < in_fnames.size(); ++i) { + istream *istrm = in_fnames.empty() ? &cin : + new ifstream(in_fnames[i].c_str()); + string str; + while (getline(*istrm, str)) + inputs.push_back(str); + if (!in_fnames.empty()) + delete istrm; + } + } else { + inputs = in_fnames; + } + + for (int i = 0, n = 0; i < inputs.size(); ++i) { + if (generate_keys == 0 && inputs[i].empty()) { + FSTERROR() << "FarCompileStrings: read from a file instead of stdin or" + << " set the --generate_keys flags."; + delete far_writer; + delete syms; + return; + } + int key_size = generate_keys ? generate_keys : + (entry_type == StringReader::FILE ? 1 : + KeySize(inputs[i].c_str())); + istream *istrm = inputs[i].empty() ? &cin : + new ifstream(inputs[i].c_str()); + + bool keep_syms = keep_symbols; + for (StringReader reader( + *istrm, inputs[i].empty() ? "stdin" : inputs[i], + entry_type, token_type, allow_negative_labels, + syms, unknown_label); + !reader.Done(); + reader.Next()) { + ++n; + const Fst *fst; + if (compact) + fst = reader.GetCompactFst(keep_syms); + else + fst = reader.GetVectorFst(keep_syms); + if (initial_symbols) + keep_syms = false; + if (!fst) { + FSTERROR() << "FarCompileStrings: compiling string number " << n + << " in file " << inputs[i] << " failed with token_type = " + << (tt == FTT_BYTE ? "byte" : + (tt == FTT_UTF8 ? "utf8" : + (tt == FTT_SYMBOL ? "symbol" : "unknown"))) + << " and entry_type = " + << (fet == FET_LINE ? "line" : + (fet == FET_FILE ? "file" : "unknown")); + delete far_writer; + delete syms; + if (!inputs[i].empty()) delete istrm; + return; + } + ostringstream keybuf; + keybuf.width(key_size); + keybuf.fill('0'); + keybuf << n; + string key; + if (generate_keys > 0) { + key = keybuf.str(); + } else { + char* filename = new char[inputs[i].size() + 1]; + strcpy(filename, inputs[i].c_str()); + key = basename(filename); + if (entry_type != StringReader::FILE) { + key += "-"; + key += keybuf.str(); + } + delete[] filename; + } + far_writer->Add(key_prefix + key + key_suffix, *fst); + delete fst; + } + if (generate_keys == 0) + n = 0; + if (!inputs[i].empty()) + delete istrm; + } + + delete far_writer; +} + +} // namespace fst + + +#endif // FST_EXTENSIONS_FAR_COMPILE_STRINGS_H_ diff --git a/kaldi_io/src/tools/openfst/include/fst/extensions/far/create.h b/kaldi_io/src/tools/openfst/include/fst/extensions/far/create.h new file mode 100644 index 0000000..edb31e7 --- /dev/null +++ b/kaldi_io/src/tools/openfst/include/fst/extensions/far/create.h @@ -0,0 +1,87 @@ +// create-main.h + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Copyright 2005-2010 Google, Inc. +// Author: riley@google.com (Michael Riley) +// Modified: jpr@google.com (Jake Ratkiewicz) to use new dispatch +// +// \file +// Creates a finite-state archive from component FSTs. Includes +// helper function for farcreate.cc that templates the main on the arc +// type to support multiple and extensible arc types. +// + +#ifndef FST_EXTENSIONS_FAR_CREATE_H__ +#define FST_EXTENSIONS_FAR_CREATE_H__ + +#include +#include +#include +using std::vector; + +#include + +namespace fst { + +template +void FarCreate(const vector &in_fnames, + const string &out_fname, + const int32 generate_keys, + const bool file_list_input, + const FarType &far_type, + const string &key_prefix, + const string &key_suffix) { + FarWriter *far_writer = + FarWriter::Create(out_fname, far_type); + if (!far_writer) return; + + vector inputs; + if (file_list_input) { + for (int i = 1; i < in_fnames.size(); ++i) { + ifstream istrm(in_fnames[i].c_str()); + string str; + while (getline(istrm, str)) + inputs.push_back(str); + } + } else { + inputs = in_fnames; + } + + for (int i = 0; i < inputs.size(); ++i) { + Fst *ifst = Fst::Read(inputs[i]); + if (!ifst) return; + string key; + if (generate_keys > 0) { + ostringstream keybuf; + keybuf.width(generate_keys); + keybuf.fill('0'); + keybuf << i + 1; + key = keybuf.str(); + } else { + char* filename = new char[inputs[i].size() + 1]; + strcpy(filename, inputs[i].c_str()); + key = basename(filename); + delete[] filename; + } + + far_writer->Add(key_prefix + key + key_suffix, *ifst); + delete ifst; + } + + delete far_writer; +} + +} // namespace fst + +#endif // FST_EXTENSIONS_FAR_CREATE_H__ diff --git a/kaldi_io/src/tools/openfst/include/fst/extensions/far/equal.h b/kaldi_io/src/tools/openfst/include/fst/extensions/far/equal.h new file mode 100644 index 0000000..be82e2d --- /dev/null +++ b/kaldi_io/src/tools/openfst/include/fst/extensions/far/equal.h @@ -0,0 +1,99 @@ + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Copyright 2005-2010 Google, Inc. +// Author: allauzen@google.com (Cyril Allauzen) + +#ifndef FST_EXTENSIONS_FAR_EQUAL_H_ +#define FST_EXTENSIONS_FAR_EQUAL_H_ + +#include + +#include +#include + +namespace fst { + +template +bool FarEqual(const string &filename1, + const string &filename2, + float delta = kDelta, + const string &begin_key = string(), + const string &end_key = string()) { + + FarReader *reader1 = FarReader::Open(filename1); + FarReader *reader2 = FarReader::Open(filename2); + if (!reader1 || !reader2) { + delete reader1; + delete reader2; + VLOG(1) << "FarEqual: cannot open input Far file(s)"; + return false; + } + + if (!begin_key.empty()) { + bool find_begin1 = reader1->Find(begin_key); + bool find_begin2 = reader2->Find(begin_key); + if (!find_begin1 || !find_begin2) { + bool ret = !find_begin1 && !find_begin2; + if (!ret) { + VLOG(1) << "FarEqual: key \"" << begin_key << "\" missing from " + << (find_begin1 ? "second" : "first") << " archive."; + } + delete reader1; + delete reader2; + return ret; + } + } + + for(; !reader1->Done() && !reader2->Done(); + reader1->Next(), reader2->Next()) { + const string key1 = reader1->GetKey(); + const string key2 = reader2->GetKey(); + if (!end_key.empty() && end_key < key1 && end_key < key2) { + delete reader1; + delete reader2; + return true; + } + if (key1 != key2) { + VLOG(1) << "FarEqual: mismatched keys \"" + << key1 << "\" <> \"" << key2 << "\"."; + delete reader1; + delete reader2; + return false; + } + if (!Equal(reader1->GetFst(), reader2->GetFst(), delta)) { + VLOG(1) << "FarEqual: Fsts for key \"" << key1 << "\" are not equal."; + delete reader1; + delete reader2; + return false; + } + } + + if (!reader1->Done() || !reader2->Done()) { + VLOG(1) << "FarEqual: key \"" + << (reader1->Done() ? reader2->GetKey() : reader1->GetKey()) + << "\" missing form " << (reader2->Done() ? "first" : "second") + << " archive."; + delete reader1; + delete reader2; + return false; + } + + delete reader1; + delete reader2; + return true; +} + +} // namespace fst + +#endif // FST_EXTENSIONS_FAR_EQUAL_H_ diff --git a/kaldi_io/src/tools/openfst/include/fst/extensions/far/extract.h b/kaldi_io/src/tools/openfst/include/fst/extensions/far/extract.h new file mode 100644 index 0000000..95866de --- /dev/null +++ b/kaldi_io/src/tools/openfst/include/fst/extensions/far/extract.h @@ -0,0 +1,140 @@ +// extract-main.h + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Copyright 2005-2010 Google, Inc. +// Author: riley@google.com (Michael Riley) +// Modified: jpr@google.com (Jake Ratkiewicz) to use the new arc-dispatch + +// \file +// Extracts component FSTs from an finite-state archive. +// + +#ifndef FST_EXTENSIONS_FAR_EXTRACT_H__ +#define FST_EXTENSIONS_FAR_EXTRACT_H__ + +#include +#include +using std::vector; + +#include + +namespace fst { + +template +inline void FarWriteFst(const Fst* fst, string key, + string* okey, int* nrep, + const int32 &generate_filenames, int i, + const string &filename_prefix, + const string &filename_suffix) { + if (key == *okey) + ++*nrep; + else + *nrep = 0; + + *okey = key; + + string ofilename; + if (generate_filenames) { + ostringstream tmp; + tmp.width(generate_filenames); + tmp.fill('0'); + tmp << i; + ofilename = tmp.str(); + } else { + if (*nrep > 0) { + ostringstream tmp; + tmp << '.' << nrep; + key.append(tmp.str().data(), tmp.str().size()); + } + ofilename = key; + } + fst->Write(filename_prefix + ofilename + filename_suffix); +} + +template +void FarExtract(const vector &ifilenames, + const int32 &generate_filenames, + const string &keys, + const string &key_separator, + const string &range_delimiter, + const string &filename_prefix, + const string &filename_suffix) { + FarReader *far_reader = FarReader::Open(ifilenames); + if (!far_reader) return; + + string okey; + int nrep = 0; + + vector key_vector; + // User has specified a set of fsts to extract, where some of the "fsts" could + // be ranges. + if (!keys.empty()) { + char *keys_cstr = new char[keys.size()+1]; + strcpy(keys_cstr, keys.c_str()); + SplitToVector(keys_cstr, key_separator.c_str(), &key_vector, true); + int i = 0; + for (int k = 0; k < key_vector.size(); ++k, ++i) { + string key = string(key_vector[k]); + char *key_cstr = new char[key.size()+1]; + strcpy(key_cstr, key.c_str()); + vector range_vector; + SplitToVector(key_cstr, range_delimiter.c_str(), &range_vector, false); + if (range_vector.size() == 1) { // Not a range + if (!far_reader->Find(key)) { + LOG(ERROR) << "FarExtract: Cannot find key: " << key; + return; + } + const Fst &fst = far_reader->GetFst(); + FarWriteFst(&fst, key, &okey, &nrep, generate_filenames, i, + filename_prefix, filename_suffix); + } else if (range_vector.size() == 2) { // A legal range + string begin_key = string(range_vector[0]); + string end_key = string(range_vector[1]); + if (begin_key.empty() || end_key.empty()) { + LOG(ERROR) << "FarExtract: Illegal range specification: " << key; + return; + } + if (!far_reader->Find(begin_key)) { + LOG(ERROR) << "FarExtract: Cannot find key: " << begin_key; + return; + } + for ( ; !far_reader->Done(); far_reader->Next(), ++i) { + string ikey = far_reader->GetKey(); + if (end_key < ikey) break; + const Fst &fst = far_reader->GetFst(); + FarWriteFst(&fst, ikey, &okey, &nrep, generate_filenames, i, + filename_prefix, filename_suffix); + } + } else { + LOG(ERROR) << "FarExtract: Illegal range specification: " << key; + return; + } + delete key_cstr; + } + delete keys_cstr; + return; + } + // Nothing specified: extract everything. + for (int i = 1; !far_reader->Done(); far_reader->Next(), ++i) { + string key = far_reader->GetKey(); + const Fst &fst = far_reader->GetFst(); + FarWriteFst(&fst, key, &okey, &nrep, generate_filenames, i, + filename_prefix, filename_suffix); + } + return; +} + +} // namespace fst + +#endif // FST_EXTENSIONS_FAR_EXTRACT_H__ diff --git a/kaldi_io/src/tools/openfst/include/fst/extensions/far/far.h b/kaldi_io/src/tools/openfst/include/fst/extensions/far/far.h new file mode 100644 index 0000000..acce76e --- /dev/null +++ b/kaldi_io/src/tools/openfst/include/fst/extensions/far/far.h @@ -0,0 +1,532 @@ +// far.h + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Copyright 2005-2010 Google, Inc. +// Author: riley@google.com (Michael Riley) +// +// \file +// Finite-State Transducer (FST) archive classes. +// + +#ifndef FST_EXTENSIONS_FAR_FAR_H__ +#define FST_EXTENSIONS_FAR_FAR_H__ + +#include +#include +#include +#include + +namespace fst { + +enum FarEntryType { FET_LINE, FET_FILE }; +enum FarTokenType { FTT_SYMBOL, FTT_BYTE, FTT_UTF8 }; + +inline bool IsFst(const string &filename) { + ifstream strm(filename.c_str()); + if (!strm) + return false; + return IsFstHeader(strm, filename); +} + +// FST archive header class +class FarHeader { + public: + const string &FarType() const { return fartype_; } + const string &ArcType() const { return arctype_; } + + bool Read(const string &filename) { + FstHeader fsthdr; + if (filename.empty()) { + // Header reading unsupported on stdin. Assumes STList and StdArc. + fartype_ = "stlist"; + arctype_ = "standard"; + return true; + } else if (IsSTTable(filename)) { // Check if STTable + ReadSTTableHeader(filename, &fsthdr); + fartype_ = "sttable"; + arctype_ = fsthdr.ArcType().empty() ? "unknown" : fsthdr.ArcType(); + return true; + } else if (IsSTList(filename)) { // Check if STList + ReadSTListHeader(filename, &fsthdr); + fartype_ = "sttable"; + arctype_ = fsthdr.ArcType().empty() ? "unknown" : fsthdr.ArcType(); + return true; + } else if (IsFst(filename)) { // Check if Fst + ifstream istrm(filename.c_str()); + fsthdr.Read(istrm, filename); + fartype_ = "fst"; + arctype_ = fsthdr.ArcType().empty() ? "unknown" : fsthdr.ArcType(); + return true; + } + return false; + } + + private: + string fartype_; + string arctype_; +}; + +enum FarType { + FAR_DEFAULT = 0, + FAR_STTABLE = 1, + FAR_STLIST = 2, + FAR_FST = 3, +}; + +// This class creates an archive of FSTs. +template +class FarWriter { + public: + typedef A Arc; + + // Creates a new (empty) FST archive; returns NULL on error. + static FarWriter *Create(const string &filename, FarType type = FAR_DEFAULT); + + // Adds an FST to the end of an archive. Keys must be non-empty and + // in lexicographic order. FSTs must have a suitable write method. + virtual void Add(const string &key, const Fst &fst) = 0; + + virtual FarType Type() const = 0; + + virtual bool Error() const = 0; + + virtual ~FarWriter() {} + + protected: + FarWriter() {} + + private: + DISALLOW_COPY_AND_ASSIGN(FarWriter); +}; + + +// This class iterates through an existing archive of FSTs. +template +class FarReader { + public: + typedef A Arc; + + // Opens an existing FST archive in a single file; returns NULL on error. + // Sets current position to the beginning of the achive. + static FarReader *Open(const string &filename); + + // Opens an existing FST archive in multiple files; returns NULL on error. + // Sets current position to the beginning of the achive. + static FarReader *Open(const vector &filenames); + + // Resets current posision to beginning of archive. + virtual void Reset() = 0; + + // Sets current position to first entry >= key. Returns true if a match. + virtual bool Find(const string &key) = 0; + + // Current position at end of archive? + virtual bool Done() const = 0; + + // Move current position to next FST. + virtual void Next() = 0; + + // Returns key at the current position. This reference is invalidated if + // the current position in the archive is changed. + virtual const string &GetKey() const = 0; + + // Returns FST at the current position. This reference is invalidated if + // the current position in the archive is changed. + virtual const Fst &GetFst() const = 0; + + virtual FarType Type() const = 0; + + virtual bool Error() const = 0; + + virtual ~FarReader() {} + + protected: + FarReader() {} + + private: + DISALLOW_COPY_AND_ASSIGN(FarReader); +}; + + +template +class FstWriter { + public: + void operator()(ostream &strm, const Fst &fst) const { + fst.Write(strm, FstWriteOptions()); + } +}; + + +template +class STTableFarWriter : public FarWriter { + public: + typedef A Arc; + + static STTableFarWriter *Create(const string &filename) { + STTableWriter, FstWriter > *writer = + STTableWriter, FstWriter >::Create(filename); + return new STTableFarWriter(writer); + } + + void Add(const string &key, const Fst &fst) { writer_->Add(key, fst); } + + FarType Type() const { return FAR_STTABLE; } + + bool Error() const { return writer_->Error(); } + + ~STTableFarWriter() { delete writer_; } + + private: + explicit STTableFarWriter(STTableWriter, FstWriter > *writer) + : writer_(writer) {} + + private: + STTableWriter, FstWriter > *writer_; + + DISALLOW_COPY_AND_ASSIGN(STTableFarWriter); +}; + + +template +class STListFarWriter : public FarWriter { + public: + typedef A Arc; + + static STListFarWriter *Create(const string &filename) { + STListWriter, FstWriter > *writer = + STListWriter, FstWriter >::Create(filename); + return new STListFarWriter(writer); + } + + void Add(const string &key, const Fst &fst) { writer_->Add(key, fst); } + + FarType Type() const { return FAR_STLIST; } + + bool Error() const { return writer_->Error(); } + + ~STListFarWriter() { delete writer_; } + + private: + explicit STListFarWriter(STListWriter, FstWriter > *writer) + : writer_(writer) {} + + private: + STListWriter, FstWriter > *writer_; + + DISALLOW_COPY_AND_ASSIGN(STListFarWriter); +}; + + +template +class FstFarWriter : public FarWriter { + public: + typedef A Arc; + + explicit FstFarWriter(const string &filename) + : filename_(filename), error_(false), written_(false) {} + + static FstFarWriter *Create(const string &filename) { + return new FstFarWriter(filename); + } + + void Add(const string &key, const Fst &fst) { + if (written_) { + LOG(WARNING) << "FstFarWriter::Add: only one Fst supported," + << " subsequent entries discarded."; + } else { + error_ = !fst.Write(filename_); + written_ = true; + } + } + + FarType Type() const { return FAR_FST; } + + bool Error() const { return error_; } + + ~FstFarWriter() {} + + private: + string filename_; + bool error_; + bool written_; + + DISALLOW_COPY_AND_ASSIGN(FstFarWriter); +}; + + +template +FarWriter *FarWriter::Create(const string &filename, FarType type) { + switch(type) { + case FAR_DEFAULT: + if (filename.empty()) + return STListFarWriter::Create(filename); + case FAR_STTABLE: + return STTableFarWriter::Create(filename); + break; + case FAR_STLIST: + return STListFarWriter::Create(filename); + break; + case FAR_FST: + return FstFarWriter::Create(filename); + break; + default: + LOG(ERROR) << "FarWriter::Create: unknown far type"; + return 0; + } +} + + +template +class FstReader { + public: + Fst *operator()(istream &strm) const { + return Fst::Read(strm, FstReadOptions()); + } +}; + + +template +class STTableFarReader : public FarReader { + public: + typedef A Arc; + + static STTableFarReader *Open(const string &filename) { + STTableReader, FstReader > *reader = + STTableReader, FstReader >::Open(filename); + // TODO: error check + return new STTableFarReader(reader); + } + + static STTableFarReader *Open(const vector &filenames) { + STTableReader, FstReader > *reader = + STTableReader, FstReader >::Open(filenames); + // TODO: error check + return new STTableFarReader(reader); + } + + void Reset() { reader_->Reset(); } + + bool Find(const string &key) { return reader_->Find(key); } + + bool Done() const { return reader_->Done(); } + + void Next() { return reader_->Next(); } + + const string &GetKey() const { return reader_->GetKey(); } + + const Fst &GetFst() const { return reader_->GetEntry(); } + + FarType Type() const { return FAR_STTABLE; } + + bool Error() const { return reader_->Error(); } + + ~STTableFarReader() { delete reader_; } + + private: + explicit STTableFarReader(STTableReader, FstReader > *reader) + : reader_(reader) {} + + private: + STTableReader, FstReader > *reader_; + + DISALLOW_COPY_AND_ASSIGN(STTableFarReader); +}; + + +template +class STListFarReader : public FarReader { + public: + typedef A Arc; + + static STListFarReader *Open(const string &filename) { + STListReader, FstReader > *reader = + STListReader, FstReader >::Open(filename); + // TODO: error check + return new STListFarReader(reader); + } + + static STListFarReader *Open(const vector &filenames) { + STListReader, FstReader > *reader = + STListReader, FstReader >::Open(filenames); + // TODO: error check + return new STListFarReader(reader); + } + + void Reset() { reader_->Reset(); } + + bool Find(const string &key) { return reader_->Find(key); } + + bool Done() const { return reader_->Done(); } + + void Next() { return reader_->Next(); } + + const string &GetKey() const { return reader_->GetKey(); } + + const Fst &GetFst() const { return reader_->GetEntry(); } + + FarType Type() const { return FAR_STLIST; } + + bool Error() const { return reader_->Error(); } + + ~STListFarReader() { delete reader_; } + + private: + explicit STListFarReader(STListReader, FstReader > *reader) + : reader_(reader) {} + + private: + STListReader, FstReader > *reader_; + + DISALLOW_COPY_AND_ASSIGN(STListFarReader); +}; + +template +class FstFarReader : public FarReader { + public: + typedef A Arc; + + static FstFarReader *Open(const string &filename) { + vector filenames; + filenames.push_back(filename); + return new FstFarReader(filenames); + } + + static FstFarReader *Open(const vector &filenames) { + return new FstFarReader(filenames); + } + + FstFarReader(const vector &filenames) + : keys_(filenames), has_stdin_(false), pos_(0), fst_(0), error_(false) { + sort(keys_.begin(), keys_.end()); + streams_.resize(keys_.size(), 0); + for (size_t i = 0; i < keys_.size(); ++i) { + if (keys_[i].empty()) { + if (!has_stdin_) { + streams_[i] = &cin; + //sources_[i] = "stdin"; + has_stdin_ = true; + } else { + FSTERROR() << "FstFarReader::FstFarReader: stdin should only " + << "appear once in the input file list."; + error_ = true; + return; + } + } else { + streams_[i] = new ifstream( + keys_[i].c_str(), ifstream::in | ifstream::binary); + } + } + if (pos_ >= keys_.size()) return; + ReadFst(); + } + + void Reset() { + if (has_stdin_) { + FSTERROR() << "FstFarReader::Reset: operation not supported on stdin"; + error_ = true; + return; + } + pos_ = 0; + ReadFst(); + } + + bool Find(const string &key) { + if (has_stdin_) { + FSTERROR() << "FstFarReader::Find: operation not supported on stdin"; + error_ = true; + return false; + } + pos_ = 0;//TODO + ReadFst(); + return true; + } + + bool Done() const { return error_ || pos_ >= keys_.size(); } + + void Next() { + ++pos_; + ReadFst(); + } + + const string &GetKey() const { + return keys_[pos_]; + } + + const Fst &GetFst() const { + return *fst_; + } + + FarType Type() const { return FAR_FST; } + + bool Error() const { return error_; } + + ~FstFarReader() { + if (fst_) delete fst_; + for (size_t i = 0; i < keys_.size(); ++i) + delete streams_[i]; + } + + private: + void ReadFst() { + if (fst_) delete fst_; + if (pos_ >= keys_.size()) return; + streams_[pos_]->seekg(0); + fst_ = Fst::Read(*streams_[pos_], FstReadOptions()); + if (!fst_) { + FSTERROR() << "FstFarReader: error reading Fst from: " << keys_[pos_]; + error_ = true; + } + } + + private: + vector keys_; + vector streams_; + bool has_stdin_; + size_t pos_; + mutable Fst *fst_; + mutable bool error_; + + DISALLOW_COPY_AND_ASSIGN(FstFarReader); +}; + +template +FarReader *FarReader::Open(const string &filename) { + if (filename.empty()) + return STListFarReader::Open(filename); + else if (IsSTTable(filename)) + return STTableFarReader::Open(filename); + else if (IsSTList(filename)) + return STListFarReader::Open(filename); + else if (IsFst(filename)) + return FstFarReader::Open(filename); + return 0; +} + + +template +FarReader *FarReader::Open(const vector &filenames) { + if (!filenames.empty() && filenames[0].empty()) + return STListFarReader::Open(filenames); + else if (!filenames.empty() && IsSTTable(filenames[0])) + return STTableFarReader::Open(filenames); + else if (!filenames.empty() && IsSTList(filenames[0])) + return STListFarReader::Open(filenames); + else if (!filenames.empty() && IsFst(filenames[0])) + return FstFarReader::Open(filenames); + return 0; +} + +} // namespace fst + +#endif // FST_EXTENSIONS_FAR_FAR_H__ diff --git a/kaldi_io/src/tools/openfst/include/fst/extensions/far/farlib.h b/kaldi_io/src/tools/openfst/include/fst/extensions/far/farlib.h new file mode 100644 index 0000000..91ba224 --- /dev/null +++ b/kaldi_io/src/tools/openfst/include/fst/extensions/far/farlib.h @@ -0,0 +1,31 @@ + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Copyright 2005-2010 Google, Inc. +// Author: jpr@google.com (Jake Ratkiewicz) + +// A finite-state archive (FAR) is used to store an indexable collection of +// FSTs in a single file. Utilities are provided to create FARs from FSTs, +// to iterate over FARs, and to extract specific FSTs from FARs. + +#ifndef FST_EXTENSIONS_FAR_FARLIB_H_ +#define FST_EXTENSIONS_FAR_FARLIB_H_ + +#include +#include +#include +#include +#include +#include + +#endif // FST_EXTENSIONS_FAR_FARLIB_H_ diff --git a/kaldi_io/src/tools/openfst/include/fst/extensions/far/farscript.h b/kaldi_io/src/tools/openfst/include/fst/extensions/far/farscript.h new file mode 100644 index 0000000..cfd9167 --- /dev/null +++ b/kaldi_io/src/tools/openfst/include/fst/extensions/far/farscript.h @@ -0,0 +1,273 @@ + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Copyright 2005-2010 Google, Inc. +// Author: jpr@google.com (Jake Ratkiewicz) + +// Convenience file for including all of the FAR operations, +// or registering them for new arc types. + +#ifndef FST_EXTENSIONS_FAR_FARSCRIPT_H_ +#define FST_EXTENSIONS_FAR_FARSCRIPT_H_ + +#include +using std::vector; +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace fst { +namespace script { + +// Note: it is safe to pass these strings as references because +// this struct is only used to pass them deeper in the call graph. +// Be sure you understand why this is so before using this struct +// for anything else! +struct FarCompileStringsArgs { + const vector &in_fnames; + const string &out_fname; + const string &fst_type; + const FarType &far_type; + const int32 generate_keys; + const FarEntryType fet; + const FarTokenType tt; + const string &symbols_fname; + const string &unknown_symbol; + const bool keep_symbols; + const bool initial_symbols; + const bool allow_negative_labels; + const bool file_list_input; + const string &key_prefix; + const string &key_suffix; + + FarCompileStringsArgs(const vector &in_fnames, + const string &out_fname, + const string &fst_type, + const FarType &far_type, + int32 generate_keys, + FarEntryType fet, + FarTokenType tt, + const string &symbols_fname, + const string &unknown_symbol, + bool keep_symbols, + bool initial_symbols, + bool allow_negative_labels, + bool file_list_input, + const string &key_prefix, + const string &key_suffix) : + in_fnames(in_fnames), out_fname(out_fname), fst_type(fst_type), + far_type(far_type), generate_keys(generate_keys), fet(fet), + tt(tt), symbols_fname(symbols_fname), unknown_symbol(unknown_symbol), + keep_symbols(keep_symbols), initial_symbols(initial_symbols), + allow_negative_labels(allow_negative_labels), + file_list_input(file_list_input), key_prefix(key_prefix), + key_suffix(key_suffix) { } +}; + +template +void FarCompileStrings(FarCompileStringsArgs *args) { + fst::FarCompileStrings( + args->in_fnames, args->out_fname, args->fst_type, args->far_type, + args->generate_keys, args->fet, args->tt, args->symbols_fname, + args->unknown_symbol, args->keep_symbols, args->initial_symbols, + args->allow_negative_labels, args->file_list_input, + args->key_prefix, args->key_suffix); +} + +void FarCompileStrings( + const vector &in_fnames, + const string &out_fname, + const string &arc_type, + const string &fst_type, + const FarType &far_type, + int32 generate_keys, + FarEntryType fet, + FarTokenType tt, + const string &symbols_fname, + const string &unknown_symbol, + bool keep_symbols, + bool initial_symbols, + bool allow_negative_labels, + bool file_list_input, + const string &key_prefix, + const string &key_suffix); + + +// Note: it is safe to pass these strings as references because +// this struct is only used to pass them deeper in the call graph. +// Be sure you understand why this is so before using this struct +// for anything else! +struct FarCreateArgs { + const vector &in_fnames; + const string &out_fname; + const int32 generate_keys; + const bool file_list_input; + const FarType &far_type; + const string &key_prefix; + const string &key_suffix; + + FarCreateArgs( + const vector &in_fnames, const string &out_fname, + const int32 generate_keys, const bool file_list_input, + const FarType &far_type, const string &key_prefix, + const string &key_suffix) + : in_fnames(in_fnames), out_fname(out_fname), + generate_keys(generate_keys), file_list_input(file_list_input), + far_type(far_type), key_prefix(key_prefix), key_suffix(key_suffix) { } +}; + +template +void FarCreate(FarCreateArgs *args) { + fst::FarCreate(args->in_fnames, args->out_fname, args->generate_keys, + args->file_list_input, args->far_type, + args->key_prefix, args->key_suffix); +} + +void FarCreate(const vector &in_fnames, + const string &out_fname, + const string &arc_type, + const int32 generate_keys, + const bool file_list_input, + const FarType &far_type, + const string &key_prefix, + const string &key_suffix); + + +typedef args::Package FarEqualInnerArgs; +typedef args::WithReturnValue FarEqualArgs; + +template +void FarEqual(FarEqualArgs *args) { + args->retval = fst::FarEqual( + args->args.arg1, args->args.arg2, args->args.arg3, + args->args.arg4, args->args.arg5); +} + +bool FarEqual(const string &filename1, + const string &filename2, + const string &arc_type, + float delta = kDelta, + const string &begin_key = string(), + const string &end_key = string()); + + +typedef args::Package &, int32, + const string&, const string&, const string&, + const string&, const string&> FarExtractArgs; + +template +void FarExtract(FarExtractArgs *args) { + fst::FarExtract( + args->arg1, args->arg2, args->arg3, args->arg4, args->arg5, args->arg6, + args->arg7); +} + +void FarExtract(const vector &ifilenames, + const string &arc_type, + int32 generate_filenames, + const string &keys, + const string &key_separator, + const string &range_delimiter, + const string &filename_prefix, + const string &filename_suffix); + +typedef args::Package &, const string &, + const string &, const bool> FarInfoArgs; + +template +void FarInfo(FarInfoArgs *args) { + fst::FarInfo(args->arg1, args->arg2, args->arg3, args->arg4); +} + +void FarInfo(const vector &filenames, + const string &arc_type, + const string &begin_key, + const string &end_key, + const bool list_fsts); + +struct FarPrintStringsArgs { + const vector &ifilenames; + const FarEntryType entry_type; + const FarTokenType token_type; + const string &begin_key; + const string &end_key; + const bool print_key; + const bool print_weight; + const string &symbols_fname; + const bool initial_symbols; + const int32 generate_filenames; + const string &filename_prefix; + const string &filename_suffix; + + FarPrintStringsArgs( + const vector &ifilenames, const FarEntryType entry_type, + const FarTokenType token_type, const string &begin_key, + const string &end_key, const bool print_key, const bool print_weight, + const string &symbols_fname, const bool initial_symbols, + const int32 generate_filenames, + const string &filename_prefix, const string &filename_suffix) : + ifilenames(ifilenames), entry_type(entry_type), token_type(token_type), + begin_key(begin_key), end_key(end_key), + print_key(print_key), print_weight(print_weight), + symbols_fname(symbols_fname), initial_symbols(initial_symbols), + generate_filenames(generate_filenames), filename_prefix(filename_prefix), + filename_suffix(filename_suffix) { } +}; + +template +void FarPrintStrings(FarPrintStringsArgs *args) { + fst::FarPrintStrings( + args->ifilenames, args->entry_type, args->token_type, + args->begin_key, args->end_key, args->print_key, args->print_weight, + args->symbols_fname, args->initial_symbols, args->generate_filenames, + args->filename_prefix, args->filename_suffix); +} + + +void FarPrintStrings(const vector &ifilenames, + const string &arc_type, + const FarEntryType entry_type, + const FarTokenType token_type, + const string &begin_key, + const string &end_key, + const bool print_key, + const bool print_weight, + const string &symbols_fname, + const bool initial_symbols, + const int32 generate_filenames, + const string &filename_prefix, + const string &filename_suffix); + +} // namespace script +} // namespace fst + + +#define REGISTER_FST_FAR_OPERATIONS(ArcType) \ + REGISTER_FST_OPERATION(FarCompileStrings, ArcType, FarCompileStringsArgs); \ + REGISTER_FST_OPERATION(FarCreate, ArcType, FarCreateArgs); \ + REGISTER_FST_OPERATION(FarEqual, ArcType, FarEqualArgs); \ + REGISTER_FST_OPERATION(FarExtract, ArcType, FarExtractArgs); \ + REGISTER_FST_OPERATION(FarInfo, ArcType, FarInfoArgs); \ + REGISTER_FST_OPERATION(FarPrintStrings, ArcType, FarPrintStringsArgs) + +#endif // FST_EXTENSIONS_FAR_FARSCRIPT_H_ diff --git a/kaldi_io/src/tools/openfst/include/fst/extensions/far/info.h b/kaldi_io/src/tools/openfst/include/fst/extensions/far/info.h new file mode 100644 index 0000000..100fe68 --- /dev/null +++ b/kaldi_io/src/tools/openfst/include/fst/extensions/far/info.h @@ -0,0 +1,128 @@ + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Copyright 2005-2010 Google, Inc. +// Author: allauzen@google.com (Cyril Allauzen) +// Modified: jpr@google.com (Jake Ratkiewicz) + +#ifndef FST_EXTENSIONS_FAR_INFO_H_ +#define FST_EXTENSIONS_FAR_INFO_H_ + +#include +#include +#include +#include +using std::vector; + +#include +#include // For FarTypeToString + +namespace fst { + +template +void CountStatesAndArcs(const Fst &fst, size_t *nstate, size_t *narc) { + StateIterator > siter(fst); + for (; !siter.Done(); siter.Next(), ++(*nstate)) { + ArcIterator > aiter(fst, siter.Value()); + for (; !aiter.Done(); aiter.Next(), ++(*narc)) {} + } +} + +struct KeyInfo { + string key; + string type; + size_t nstate; + size_t narc; + + KeyInfo(string k, string t, int64 ns = 0, int64 na = 0) + : key(k), type(t), nstate(ns), narc(na) {} +}; + +template +void FarInfo(const vector &filenames, const string &begin_key, + const string &end_key, const bool list_fsts) { + FarReader *far_reader = FarReader::Open(filenames); + if (!far_reader) return; + + if (!begin_key.empty()) + far_reader->Find(begin_key); + + vector *infos = list_fsts ? new vector() : 0; + size_t nfst = 0, nstate = 0, narc = 0; + set fst_types; + for (; !far_reader->Done(); far_reader->Next()) { + string key = far_reader->GetKey(); + if (!end_key.empty() && end_key < key) + break; + ++nfst; + const Fst &fst = far_reader->GetFst(); + fst_types.insert(fst.Type()); + if (infos) { + KeyInfo info(key, fst.Type()); + CountStatesAndArcs(fst, &info.nstate, &info.narc); + nstate += info.nstate; + nstate += info.narc; + infos->push_back(info); + } else { + CountStatesAndArcs(fst, &nstate, &narc); + } + } + + if (!infos) { + cout << std::left << setw(50) << "far type" + << FarTypeToString(far_reader->Type()) << endl; + cout << std::left << setw(50) << "arc type" << Arc::Type() << endl; + cout << std::left << setw(50) << "fst type"; + for (set::const_iterator iter = fst_types.begin(); + iter != fst_types.end(); + ++iter) { + if (iter != fst_types.begin()) + cout << ","; + cout << *iter; + } + cout << endl; + cout << std::left << setw(50) << "# of FSTs" << nfst << endl; + cout << std::left << setw(50) << "total # of states" << nstate << endl; + cout << std::left << setw(50) << "total # of arcs" << narc << endl; + + } else { + int wkey = 10, wtype = 10, wnstate = 16, wnarc = 16; + for (size_t i = 0; i < infos->size(); ++i) { + const KeyInfo &info = (*infos)[i]; + if (info.key.size() + 2 > wkey) + wkey = info.key.size() + 2; + if (info.type.size() + 2 > wtype) + wtype = info.type.size() + 2; + if (ceil(log10(info.nstate)) + 2 > wnstate) + wnstate = ceil(log10(info.nstate)) + 2; + if (ceil(log10(info.narc)) + 2 > wnarc) + wnarc = ceil(log10(info.narc)) + 2; + } + + cout << std::left << setw(wkey) << "key" << setw(wtype) << "type" + << std::right << setw(wnstate) << "# of states" + << setw(wnarc) << "# of arcs" << endl; + + for (size_t i = 0; i < infos->size(); ++i) { + const KeyInfo &info = (*infos)[i]; + cout << std::left << setw(wkey) << info.key << setw(wtype) << info.type + << std::right << setw(wnstate) << info.nstate + << setw(wnarc) << info.narc << endl; + } + } +} + +} // namespace fst + + +#endif // FST_EXTENSIONS_FAR_INFO_H_ diff --git a/kaldi_io/src/tools/openfst/include/fst/extensions/far/main.h b/kaldi_io/src/tools/openfst/include/fst/extensions/far/main.h new file mode 100644 index 0000000..00ccfef --- /dev/null +++ b/kaldi_io/src/tools/openfst/include/fst/extensions/far/main.h @@ -0,0 +1,43 @@ +// main.h + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Copyright 2005-2010 Google, Inc. +// Author: riley@google.com (Michael Riley) +// +// \file +// Classes and functions for registering and invoking Far main +// functions that support multiple and extensible arc types. + +#ifndef FST_EXTENSIONS_FAR_MAIN_H__ +#define FST_EXTENSIONS_FAR_MAIN_H__ + +#include + +namespace fst { + +FarEntryType StringToFarEntryType(const string &s); +FarTokenType StringToFarTokenType(const string &s); + +// Return the 'FarType' value corresponding to a far type name. +FarType FarTypeFromString(const string &str); + +// Return the textual name corresponding to a 'FarType;. +string FarTypeToString(FarType type); + +string LoadArcTypeFromFar(const string& far_fname); +string LoadArcTypeFromFst(const string& far_fname); + +} // namespace fst + +#endif // FST_EXTENSIONS_FAR_MAIN_H__ diff --git a/kaldi_io/src/tools/openfst/include/fst/extensions/far/print-strings.h b/kaldi_io/src/tools/openfst/include/fst/extensions/far/print-strings.h new file mode 100644 index 0000000..dcc7351 --- /dev/null +++ b/kaldi_io/src/tools/openfst/include/fst/extensions/far/print-strings.h @@ -0,0 +1,138 @@ +// printstrings-main.h + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Copyright 2005-2010 Google, Inc. +// Author: allauzen@google.com (Cyril Allauzen) +// Modified by: jpr@google.com (Jake Ratkiewicz) +// +// \file +// Output as strings the string FSTs in a finite-state archive. + +#ifndef FST_EXTENSIONS_FAR_PRINT_STRINGS_H__ +#define FST_EXTENSIONS_FAR_PRINT_STRINGS_H__ + +#include +#include +using std::vector; + +#include +#include +#include + +DECLARE_string(far_field_separator); + +namespace fst { + +template +void FarPrintStrings( + const vector &ifilenames, const FarEntryType entry_type, + const FarTokenType far_token_type, const string &begin_key, + const string &end_key, const bool print_key, const bool print_weight, + const string &symbols_fname, const bool initial_symbols, + const int32 generate_filenames, + const string &filename_prefix, const string &filename_suffix) { + + typename StringPrinter::TokenType token_type; + if (far_token_type == FTT_SYMBOL) { + token_type = StringPrinter::SYMBOL; + } else if (far_token_type == FTT_BYTE) { + token_type = StringPrinter::BYTE; + } else if (far_token_type == FTT_UTF8) { + token_type = StringPrinter::UTF8; + } else { + FSTERROR() << "FarPrintStrings: unknown token type"; + return; + } + + const SymbolTable *syms = 0; + if (!symbols_fname.empty()) { + // allow negative flag? + SymbolTableTextOptions opts; + opts.allow_negative = true; + syms = SymbolTable::ReadText(symbols_fname, opts); + if (!syms) { + FSTERROR() << "FarPrintStrings: error reading symbol table: " + << symbols_fname; + return; + } + } + + FarReader *far_reader = FarReader::Open(ifilenames); + if (!far_reader) return; + + if (!begin_key.empty()) + far_reader->Find(begin_key); + + string okey; + int nrep = 0; + for (int i = 1; !far_reader->Done(); far_reader->Next(), ++i) { + string key = far_reader->GetKey(); + if (!end_key.empty() && end_key < key) + break; + if (okey == key) + ++nrep; + else + nrep = 0; + okey = key; + + const Fst &fst = far_reader->GetFst(); + if (i == 1 && initial_symbols && syms == 0 && fst.InputSymbols() != 0) + syms = fst.InputSymbols()->Copy(); + string str; + VLOG(2) << "Handling key: " << key; + StringPrinter string_printer( + token_type, syms ? syms : fst.InputSymbols()); + string_printer(fst, &str); + + if (entry_type == FET_LINE) { + if (print_key) + cout << key << FLAGS_far_field_separator[0]; + cout << str; + if (print_weight) + cout << FLAGS_far_field_separator[0] << ShortestDistance(fst); + cout << endl; + } else if (entry_type == FET_FILE) { + stringstream sstrm; + if (generate_filenames) { + sstrm.fill('0'); + sstrm << std::right << setw(generate_filenames) << i; + } else { + sstrm << key; + if (nrep > 0) + sstrm << "." << nrep; + } + + string filename; + filename = filename_prefix + sstrm.str() + filename_suffix; + + ofstream ostrm(filename.c_str()); + if (!ostrm) { + FSTERROR() << "FarPrintStrings: Can't open file:" << filename; + delete syms; + delete far_reader; + return; + } + ostrm << str; + if (token_type == StringPrinter::SYMBOL) + ostrm << "\n"; + } + } + delete syms; +} + + + +} // namespace fst + +#endif // FST_EXTENSIONS_FAR_PRINT_STRINGS_H__ diff --git a/kaldi_io/src/tools/openfst/include/fst/extensions/far/stlist.h b/kaldi_io/src/tools/openfst/include/fst/extensions/far/stlist.h new file mode 100644 index 0000000..ff3d98b --- /dev/null +++ b/kaldi_io/src/tools/openfst/include/fst/extensions/far/stlist.h @@ -0,0 +1,305 @@ + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Copyright 2005-2010 Google, Inc. +// Author: allauzen@google.com (Cyril Allauzen) +// +// \file +// A generic (string,type) list file format. +// +// This is a stripped-down version of STTable that does +// not support the Find() operation but that does support +// reading/writting from standard in/out. + +#ifndef FST_EXTENSIONS_FAR_STLIST_H_ +#define FST_EXTENSIONS_FAR_STLIST_H_ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +using std::pair; using std::make_pair; +#include +using std::vector; + +namespace fst { + +static const int32 kSTListMagicNumber = 5656924; +static const int32 kSTListFileVersion = 1; + +// String-type list writing class for object of type 'T' using functor 'W' +// to write an object of type 'T' from a stream. 'W' must conform to the +// following interface: +// +// struct Writer { +// void operator()(ostream &, const T &) const; +// }; +// +template +class STListWriter { + public: + typedef T EntryType; + typedef W EntryWriter; + + explicit STListWriter(const string filename) + : stream_( + filename.empty() ? &cout : + new ofstream(filename.c_str(), ofstream::out | ofstream::binary)), + error_(false) { + WriteType(*stream_, kSTListMagicNumber); + WriteType(*stream_, kSTListFileVersion); + if (!stream_) { + FSTERROR() << "STListWriter::STListWriter: error writing to file: " + << filename; + error_ = true; + } + } + + static STListWriter *Create(const string &filename) { + return new STListWriter(filename); + } + + void Add(const string &key, const T &t) { + if (key == "") { + FSTERROR() << "STListWriter::Add: key empty: " << key; + error_ = true; + } else if (key < last_key_) { + FSTERROR() << "STListWriter::Add: key disorder: " << key; + error_ = true; + } + if (error_) return; + last_key_ = key; + WriteType(*stream_, key); + entry_writer_(*stream_, t); + } + + bool Error() const { return error_; } + + ~STListWriter() { + WriteType(*stream_, string()); + if (stream_ != &cout) + delete stream_; + } + + private: + EntryWriter entry_writer_; // Write functor for 'EntryType' + ostream *stream_; // Output stream + string last_key_; // Last key + bool error_; + + DISALLOW_COPY_AND_ASSIGN(STListWriter); +}; + + +// String-type list reading class for object of type 'T' using functor 'R' +// to read an object of type 'T' form a stream. 'R' must conform to the +// following interface: +// +// struct Reader { +// T *operator()(istream &) const; +// }; +// +template +class STListReader { + public: + typedef T EntryType; + typedef R EntryReader; + + explicit STListReader(const vector &filenames) + : sources_(filenames), entry_(0), error_(false) { + streams_.resize(filenames.size(), 0); + bool has_stdin = false; + for (size_t i = 0; i < filenames.size(); ++i) { + if (filenames[i].empty()) { + if (!has_stdin) { + streams_[i] = &cin; + sources_[i] = "stdin"; + has_stdin = true; + } else { + FSTERROR() << "STListReader::STListReader: stdin should only " + << "appear once in the input file list."; + error_ = true; + return; + } + } else { + streams_[i] = new ifstream( + filenames[i].c_str(), ifstream::in | ifstream::binary); + } + int32 magic_number = 0, file_version = 0; + ReadType(*streams_[i], &magic_number); + ReadType(*streams_[i], &file_version); + if (