summaryrefslogtreecommitdiff
path: root/kaldi_io/src/tools/openfst/include/fst/extensions/ngram
diff options
context:
space:
mode:
Diffstat (limited to 'kaldi_io/src/tools/openfst/include/fst/extensions/ngram')
-rw-r--r--kaldi_io/src/tools/openfst/include/fst/extensions/ngram/bitmap-index.h183
-rw-r--r--kaldi_io/src/tools/openfst/include/fst/extensions/ngram/ngram-fst.h934
-rw-r--r--kaldi_io/src/tools/openfst/include/fst/extensions/ngram/nthbit.h46
3 files changed, 1163 insertions, 0 deletions
diff --git a/kaldi_io/src/tools/openfst/include/fst/extensions/ngram/bitmap-index.h b/kaldi_io/src/tools/openfst/include/fst/extensions/ngram/bitmap-index.h
new file mode 100644
index 0000000..f5a5ba7
--- /dev/null
+++ b/kaldi_io/src/tools/openfst/include/fst/extensions/ngram/bitmap-index.h
@@ -0,0 +1,183 @@
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Copyright 2005-2010 Google, Inc.
+// Author: sorenj@google.com (Jeffrey Sorensen)
+
+#ifndef FST_EXTENSIONS_NGRAM_BITMAP_INDEX_H_
+#define FST_EXTENSIONS_NGRAM_BITMAP_INDEX_H_
+
+#include <vector>
+using std::vector;
+
+#include <fst/compat.h>
+
+// This class is a bitstring storage class with an index that allows
+// seeking to the Nth set or clear bit in time O(Log(N)) where N is
+// the length of the bit vector. In addition, it allows counting set or
+// clear bits over ranges in constant time.
+//
+// This is accomplished by maintaining an "secondary" index of limited
+// size in bits that maintains a running count of the number of bits set
+// in each block of bitmap data. A block is defined as the number of
+// uint64 values that can fit in the secondary index before an overflow
+// occurs.
+//
+// To handle overflows, a "primary" index containing a running count of
+// bits set in each block is created using the type uint64.
+
+namespace fst {
+
+class BitmapIndex {
+ public:
+ static size_t StorageSize(size_t size) {
+ return ((size + kStorageBlockMask) >> kStorageLogBitSize);
+ }
+
+ BitmapIndex() : bits_(NULL), size_(0) { }
+
+ bool Get(size_t index) const {
+ return (bits_[index >> kStorageLogBitSize] &
+ (kOne << (index & kStorageBlockMask))) != 0;
+ }
+
+ static void Set(uint64* bits, size_t index) {
+ bits[index >> kStorageLogBitSize] |= (kOne << (index & kStorageBlockMask));
+ }
+
+ static void Clear(uint64* bits, size_t index) {
+ bits[index >> kStorageLogBitSize] &= ~(kOne << (index & kStorageBlockMask));
+ }
+
+ size_t Bits() const {
+ return size_;
+ }
+
+ size_t ArraySize() const {
+ return StorageSize(size_);
+ }
+
+ // Returns the number of one bits in the bitmap
+ size_t GetOnesCount() const {
+ return primary_index_[primary_index_size() - 1];
+ }
+
+ // Returns the number of one bits in positions 0 to limit - 1.
+ // REQUIRES: limit <= Bits()
+ size_t Rank1(size_t end) const;
+
+ // Returns the number of one bits in the range start to end - 1.
+ // REQUIRES: limit <= Bits()
+ size_t GetOnesCountInRange(size_t start, size_t end) const {
+ return Rank1(end) - Rank1(start);
+ }
+
+ // Returns the number of zero bits in positions 0 to limit - 1.
+ // REQUIRES: limit <= Bits()
+ size_t Rank0(size_t end) const {
+ return end - Rank1(end);
+ }
+
+ // Returns the number of zero bits in the range start to end - 1.
+ // REQUIRES: limit <= Bits()
+ size_t GetZeroesCountInRange(size_t start, size_t end) const {
+ return end - start - GetOnesCountInRange(start, end);
+ }
+
+ // Return true if any bit between begin inclusive and end exclusive
+ // is set. 0 <= begin <= end <= Bits() is required.
+ //
+ bool TestRange(size_t start, size_t end) const {
+ return Rank1(end) > Rank1(start);
+ }
+
+ // Returns the offset to the nth set bit (zero based)
+ // or Bits() if index >= number of ones
+ size_t Select1(size_t bit_index) const;
+
+ // Returns the offset to the nth clear bit (zero based)
+ // or Bits() if index > number of
+ size_t Select0(size_t bit_index) const;
+
+ // Rebuilds from index for the associated Bitmap, should be called
+ // whenever changes have been made to the Bitmap or else behavior
+ // of the indexed bitmap methods will be undefined.
+ void BuildIndex(const uint64 *bits, size_t size);
+
+ // the secondary index accumulates counts until it can possibly overflow
+ // this constant computes the number of uint64 units that can fit into
+ // units the size of uint16.
+ static const uint64 kOne = 1;
+ static const uint32 kStorageBitSize = 64;
+ static const uint32 kStorageLogBitSize = 6;
+ static const uint32 kSecondaryBlockSize = ((1 << 16) - 1)
+ >> kStorageLogBitSize;
+
+ private:
+ static const uint32 kStorageBlockMask = kStorageBitSize - 1;
+
+ // returns, from the index, the count of ones up to array_index
+ size_t get_index_ones_count(size_t array_index) const;
+
+ // because the indexes, both primary and secondary, contain a running
+ // count of the population of one bits contained in [0,i), there is
+ // no reason to have an element in the zeroth position as this value would
+ // necessarily be zero. (The bits are indexed in a zero based way.) Thus
+ // we don't store the 0th element in either index. Both of the following
+ // functions, if greater than 0, must be decremented by one before retreiving
+ // the value from the corresponding array.
+ // returns the 1 + the block that contains the bitindex in question
+ // the inverted version works the same but looks for zeros using an inverted
+ // view of the index
+ size_t find_primary_block(size_t bit_index) const;
+
+ size_t find_inverted_primary_block(size_t bit_index) const;
+
+ // similarly, the secondary index (which resets its count to zero at
+ // the end of every kSecondaryBlockSize entries) does not store the element
+ // at 0. Note that the rem_bit_index parameter is the number of bits
+ // within the secondary block, after the bits accounted for by the primary
+ // block have been removed (i.e. the remaining bits) And, because we
+ // reset to zero with each new block, there is no need to store those
+ // actual zeros.
+ // returns 1 + the secondary block that contains the bitindex in question
+ size_t find_secondary_block(size_t block, size_t rem_bit_index) const;
+
+ size_t find_inverted_secondary_block(size_t block, size_t rem_bit_index)
+ const;
+
+ // We create a primary index based upon the number of secondary index
+ // blocks. The primary index uses fields wide enough to accomodate any
+ // index of the bitarray so cannot overflow
+ // The primary index is the actual running
+ // count of one bits set for all blocks (and, thus, all uint64s).
+ size_t primary_index_size() const {
+ return (ArraySize() + kSecondaryBlockSize - 1) / kSecondaryBlockSize;
+ }
+
+ const uint64* bits_;
+ size_t size_;
+
+ // The primary index contains the running popcount of all blocks
+ // which means the nth value contains the popcounts of
+ // [0,n*kSecondaryBlockSize], however, the 0th element is omitted.
+ vector<uint32> primary_index_;
+ // The secondary index contains the running popcount of the associated
+ // bitmap. It is the same length (in units of uint16) as the
+ // bitmap's map is in units of uint64s.
+ vector<uint16> secondary_index_;
+};
+
+} // end namespace fst
+
+#endif // FST_EXTENSIONS_NGRAM_BITMAP_INDEX_H_
diff --git a/kaldi_io/src/tools/openfst/include/fst/extensions/ngram/ngram-fst.h b/kaldi_io/src/tools/openfst/include/fst/extensions/ngram/ngram-fst.h
new file mode 100644
index 0000000..d113fb3
--- /dev/null
+++ b/kaldi_io/src/tools/openfst/include/fst/extensions/ngram/ngram-fst.h
@@ -0,0 +1,934 @@
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Copyright 2005-2010 Google, Inc.
+// Author: sorenj@google.com (Jeffrey Sorensen)
+//
+#ifndef FST_EXTENSIONS_NGRAM_NGRAM_FST_H_
+#define FST_EXTENSIONS_NGRAM_NGRAM_FST_H_
+
+#include <stddef.h>
+#include <string.h>
+#include <algorithm>
+#include <string>
+#include <vector>
+using std::vector;
+
+#include <fst/compat.h>
+#include <fst/fstlib.h>
+#include <fst/mapped-file.h>
+#include <fst/extensions/ngram/bitmap-index.h>
+
+// NgramFst implements a n-gram language model based upon the LOUDS data
+// structure. Please refer to "Unary Data Strucutres for Language Models"
+// http://research.google.com/pubs/archive/37218.pdf
+
+namespace fst {
+template <class A> class NGramFst;
+template <class A> class NGramFstMatcher;
+
+// Instance data containing mutable state for bookkeeping repeated access to
+// the same state.
+template <class A>
+struct NGramFstInst {
+ typedef typename A::Label Label;
+ typedef typename A::StateId StateId;
+ typedef typename A::Weight Weight;
+ StateId state_;
+ size_t num_futures_;
+ size_t offset_;
+ size_t node_;
+ StateId node_state_;
+ vector<Label> context_;
+ StateId context_state_;
+ NGramFstInst()
+ : state_(kNoStateId), node_state_(kNoStateId),
+ context_state_(kNoStateId) { }
+};
+
+// Implementation class for LOUDS based NgramFst interface
+template <class A>
+class NGramFstImpl : public FstImpl<A> {
+ using FstImpl<A>::SetInputSymbols;
+ using FstImpl<A>::SetOutputSymbols;
+ using FstImpl<A>::SetType;
+ using FstImpl<A>::WriteHeader;
+
+ friend class ArcIterator<NGramFst<A> >;
+ friend class NGramFstMatcher<A>;
+
+ public:
+ using FstImpl<A>::InputSymbols;
+ using FstImpl<A>::SetProperties;
+ using FstImpl<A>::Properties;
+
+ typedef A Arc;
+ typedef typename A::Label Label;
+ typedef typename A::StateId StateId;
+ typedef typename A::Weight Weight;
+
+ NGramFstImpl() : data_region_(0), data_(0), owned_(false) {
+ SetType("ngram");
+ SetInputSymbols(NULL);
+ SetOutputSymbols(NULL);
+ SetProperties(kStaticProperties);
+ }
+
+ NGramFstImpl(const Fst<A> &fst, vector<StateId>* order_out);
+
+ ~NGramFstImpl() {
+ if (owned_) {
+ delete [] data_;
+ }
+ delete data_region_;
+ }
+
+ static NGramFstImpl<A>* Read(istream &strm, // NOLINT
+ const FstReadOptions &opts) {
+ NGramFstImpl<A>* impl = new NGramFstImpl();
+ FstHeader hdr;
+ if (!impl->ReadHeader(strm, opts, kMinFileVersion, &hdr)) return 0;
+ uint64 num_states, num_futures, num_final;
+ const size_t offset = sizeof(num_states) + sizeof(num_futures) +
+ sizeof(num_final);
+ // Peek at num_states and num_futures to see how much more needs to be read.
+ strm.read(reinterpret_cast<char *>(&num_states), sizeof(num_states));
+ strm.read(reinterpret_cast<char *>(&num_futures), sizeof(num_futures));
+ strm.read(reinterpret_cast<char *>(&num_final), sizeof(num_final));
+ size_t size = Storage(num_states, num_futures, num_final);
+ MappedFile *data_region = MappedFile::Allocate(size);
+ char *data = reinterpret_cast<char *>(data_region->mutable_data());
+ // Copy num_states, num_futures and num_final back into data.
+ memcpy(data, reinterpret_cast<char *>(&num_states), sizeof(num_states));
+ memcpy(data + sizeof(num_states), reinterpret_cast<char *>(&num_futures),
+ sizeof(num_futures));
+ memcpy(data + sizeof(num_states) + sizeof(num_futures),
+ reinterpret_cast<char *>(&num_final), sizeof(num_final));
+ strm.read(data + offset, size - offset);
+ if (!strm) {
+ delete impl;
+ return NULL;
+ }
+ impl->Init(data, false, data_region);
+ return impl;
+ }
+
+ bool Write(ostream &strm, // NOLINT
+ const FstWriteOptions &opts) const {
+ FstHeader hdr;
+ hdr.SetStart(Start());
+ hdr.SetNumStates(num_states_);
+ WriteHeader(strm, opts, kFileVersion, &hdr);
+ strm.write(data_, Storage(num_states_, num_futures_, num_final_));
+ return strm;
+ }
+
+ StateId Start() const {
+ return 1;
+ }
+
+ Weight Final(StateId state) const {
+ if (final_index_.Get(state)) {
+ return final_probs_[final_index_.Rank1(state)];
+ } else {
+ return Weight::Zero();
+ }
+ }
+
+ size_t NumArcs(StateId state, NGramFstInst<A> *inst = NULL) const {
+ if (inst == NULL) {
+ const size_t next_zero = future_index_.Select0(state + 1);
+ const size_t this_zero = future_index_.Select0(state);
+ return next_zero - this_zero - 1;
+ }
+ SetInstFuture(state, inst);
+ return inst->num_futures_ + ((state == 0) ? 0 : 1);
+ }
+
+ size_t NumInputEpsilons(StateId state) const {
+ // State 0 has no parent, thus no backoff.
+ if (state == 0) return 0;
+ return 1;
+ }
+
+ size_t NumOutputEpsilons(StateId state) const {
+ return NumInputEpsilons(state);
+ }
+
+ StateId NumStates() const {
+ return num_states_;
+ }
+
+ void InitStateIterator(StateIteratorData<A>* data) const {
+ data->base = 0;
+ data->nstates = num_states_;
+ }
+
+ static size_t Storage(uint64 num_states, uint64 num_futures,
+ uint64 num_final) {
+ uint64 b64;
+ Weight weight;
+ Label label;
+ size_t offset = sizeof(num_states) + sizeof(num_futures) +
+ sizeof(num_final);
+ offset += sizeof(b64) * (
+ BitmapIndex::StorageSize(num_states * 2 + 1) +
+ BitmapIndex::StorageSize(num_futures + num_states + 1) +
+ BitmapIndex::StorageSize(num_states));
+ offset += (num_states + 1) * sizeof(label) + num_futures * sizeof(label);
+ // Pad for alignemnt, see
+ // http://en.wikipedia.org/wiki/Data_structure_alignment#Computing_padding
+ offset = (offset + sizeof(weight) - 1) & ~(sizeof(weight) - 1);
+ offset += (num_states + 1) * sizeof(weight) + num_final * sizeof(weight) +
+ (num_futures + 1) * sizeof(weight);
+ return offset;
+ }
+
+ void SetInstFuture(StateId state, NGramFstInst<A> *inst) const {
+ if (inst->state_ != state) {
+ inst->state_ = state;
+ const size_t next_zero = future_index_.Select0(state + 1);
+ const size_t this_zero = future_index_.Select0(state);
+ inst->num_futures_ = next_zero - this_zero - 1;
+ inst->offset_ = future_index_.Rank1(future_index_.Select0(state) + 1);
+ }
+ }
+
+ void SetInstNode(NGramFstInst<A> *inst) const {
+ if (inst->node_state_ != inst->state_) {
+ inst->node_state_ = inst->state_;
+ inst->node_ = context_index_.Select1(inst->state_);
+ }
+ }
+
+ void SetInstContext(NGramFstInst<A> *inst) const {
+ SetInstNode(inst);
+ if (inst->context_state_ != inst->state_) {
+ inst->context_state_ = inst->state_;
+ inst->context_.clear();
+ size_t node = inst->node_;
+ while (node != 0) {
+ inst->context_.push_back(context_words_[context_index_.Rank1(node)]);
+ node = context_index_.Select1(context_index_.Rank0(node) - 1);
+ }
+ }
+ }
+
+ // Access to the underlying representation
+ const char* GetData(size_t* data_size) const {
+ *data_size = Storage(num_states_, num_futures_, num_final_);
+ return data_;
+ }
+
+ void Init(const char* data, bool owned, MappedFile *file = 0);
+
+ const vector<Label> &GetContext(StateId s, NGramFstInst<A> *inst) const {
+ SetInstFuture(s, inst);
+ SetInstContext(inst);
+ return inst->context_;
+ }
+
+ private:
+ StateId Transition(const vector<Label> &context, Label future) const;
+
+ // Properties always true for this Fst class.
+ static const uint64 kStaticProperties = kAcceptor | kIDeterministic |
+ kODeterministic | kEpsilons | kIEpsilons | kOEpsilons | kILabelSorted |
+ kOLabelSorted | kWeighted | kCyclic | kInitialAcyclic | kNotTopSorted |
+ kAccessible | kCoAccessible | kNotString | kExpanded;
+ // Current file format version.
+ static const int kFileVersion = 4;
+ // Minimum file format version supported.
+ static const int kMinFileVersion = 4;
+
+ MappedFile *data_region_;
+ const char* data_;
+ bool owned_; // True if we own data_
+ uint64 num_states_, num_futures_, num_final_;
+ size_t root_num_children_;
+ const Label *root_children_;
+ size_t root_first_child_;
+ // borrowed references
+ const uint64 *context_, *future_, *final_;
+ const Label *context_words_, *future_words_;
+ const Weight *backoff_, *final_probs_, *future_probs_;
+ BitmapIndex context_index_;
+ BitmapIndex future_index_;
+ BitmapIndex final_index_;
+
+ void operator=(const NGramFstImpl<A> &); // Disallow
+};
+
+template<typename A>
+NGramFstImpl<A>::NGramFstImpl(const Fst<A> &fst, vector<StateId>* order_out)
+ : data_region_(0), data_(0), owned_(false) {
+ typedef A Arc;
+ typedef typename Arc::Label Label;
+ typedef typename Arc::Weight Weight;
+ typedef typename Arc::StateId StateId;
+ SetType("ngram");
+ SetInputSymbols(fst.InputSymbols());
+ SetOutputSymbols(fst.OutputSymbols());
+ SetProperties(kStaticProperties);
+
+ // Check basic requirements for an OpenGRM language model Fst.
+ int64 props = kAcceptor | kIDeterministic | kIEpsilons | kILabelSorted;
+ if (fst.Properties(props, true) != props) {
+ FSTERROR() << "NGramFst only accepts OpenGRM langauge models as input";
+ SetProperties(kError, kError);
+ return;
+ }
+
+ int64 num_states = CountStates(fst);
+ Label* context = new Label[num_states];
+
+ // Find the unigram state by starting from the start state, following
+ // epsilons.
+ StateId unigram = fst.Start();
+ while (1) {
+ if (unigram == kNoStateId) {
+ FSTERROR() << "Could not identify unigram state.";
+ SetProperties(kError, kError);
+ return;
+ }
+ ArcIterator<Fst<A> > aiter(fst, unigram);
+ if (aiter.Done()) {
+ LOG(WARNING) << "Unigram state " << unigram << " has no arcs.";
+ break;
+ }
+ if (aiter.Value().ilabel != 0) break;
+ unigram = aiter.Value().nextstate;
+ }
+
+ // Each state's context is determined by the subtree it is under from the
+ // unigram state.
+ queue<pair<StateId, Label> > label_queue;
+ vector<bool> visited(num_states);
+ // Force an epsilon link to the start state.
+ label_queue.push(make_pair(fst.Start(), 0));
+ for (ArcIterator<Fst<A> > aiter(fst, unigram);
+ !aiter.Done(); aiter.Next()) {
+ label_queue.push(make_pair(aiter.Value().nextstate, aiter.Value().ilabel));
+ }
+ // investigate states in breadth first fashion to assign context words.
+ while (!label_queue.empty()) {
+ pair<StateId, Label> &now = label_queue.front();
+ if (!visited[now.first]) {
+ context[now.first] = now.second;
+ visited[now.first] = true;
+ for (ArcIterator<Fst<A> > aiter(fst, now.first);
+ !aiter.Done(); aiter.Next()) {
+ const Arc &arc = aiter.Value();
+ if (arc.ilabel != 0) {
+ label_queue.push(make_pair(arc.nextstate, now.second));
+ }
+ }
+ }
+ label_queue.pop();
+ }
+ visited.clear();
+
+ // The arc from the start state should be assigned an epsilon to put it
+ // in front of the all other labels (which makes Start state 1 after
+ // unigram which is state 0).
+ context[fst.Start()] = 0;
+
+ // Build the tree of contexts fst by reversing the epsilon arcs from fst.
+ VectorFst<Arc> context_fst;
+ uint64 num_final = 0;
+ for (int i = 0; i < num_states; ++i) {
+ if (fst.Final(i) != Weight::Zero()) {
+ ++num_final;
+ }
+ context_fst.SetFinal(context_fst.AddState(), fst.Final(i));
+ }
+ context_fst.SetStart(unigram);
+ context_fst.SetInputSymbols(fst.InputSymbols());
+ context_fst.SetOutputSymbols(fst.OutputSymbols());
+ int64 num_context_arcs = 0;
+ int64 num_futures = 0;
+ for (StateIterator<Fst<A> > siter(fst); !siter.Done(); siter.Next()) {
+ const StateId &state = siter.Value();
+ num_futures += fst.NumArcs(state) - fst.NumInputEpsilons(state);
+ ArcIterator<Fst<A> > aiter(fst, state);
+ if (!aiter.Done()) {
+ const Arc &arc = aiter.Value();
+ // this arc goes from state to arc.nextstate, so create an arc from
+ // arc.nextstate to state to reverse it.
+ if (arc.ilabel == 0) {
+ context_fst.AddArc(arc.nextstate, Arc(context[state], context[state],
+ arc.weight, state));
+ num_context_arcs++;
+ }
+ }
+ }
+ if (num_context_arcs != context_fst.NumStates() - 1) {
+ FSTERROR() << "Number of contexts arcs != number of states - 1";
+ SetProperties(kError, kError);
+ return;
+ }
+ if (context_fst.NumStates() != num_states) {
+ FSTERROR() << "Number of contexts != number of states";
+ SetProperties(kError, kError);
+ return;
+ }
+ int64 context_props = context_fst.Properties(kIDeterministic |
+ kILabelSorted, true);
+ if (!(context_props & kIDeterministic)) {
+ FSTERROR() << "Input fst is not structured properly";
+ SetProperties(kError, kError);
+ return;
+ }
+ if (!(context_props & kILabelSorted)) {
+ ArcSort(&context_fst, ILabelCompare<Arc>());
+ }
+
+ delete [] context;
+
+ uint64 b64;
+ Weight weight;
+ Label label = kNoLabel;
+ const size_t storage = Storage(num_states, num_futures, num_final);
+ MappedFile *data_region = MappedFile::Allocate(storage);
+ char *data = reinterpret_cast<char *>(data_region->mutable_data());
+ memset(data, 0, storage);
+ size_t offset = 0;
+ memcpy(data + offset, reinterpret_cast<char *>(&num_states),
+ sizeof(num_states));
+ offset += sizeof(num_states);
+ memcpy(data + offset, reinterpret_cast<char *>(&num_futures),
+ sizeof(num_futures));
+ offset += sizeof(num_futures);
+ memcpy(data + offset, reinterpret_cast<char *>(&num_final),
+ sizeof(num_final));
+ offset += sizeof(num_final);
+ uint64* context_bits = reinterpret_cast<uint64*>(data + offset);
+ offset += BitmapIndex::StorageSize(num_states * 2 + 1) * sizeof(b64);
+ uint64* future_bits = reinterpret_cast<uint64*>(data + offset);
+ offset +=
+ BitmapIndex::StorageSize(num_futures + num_states + 1) * sizeof(b64);
+ uint64* final_bits = reinterpret_cast<uint64*>(data + offset);
+ offset += BitmapIndex::StorageSize(num_states) * sizeof(b64);
+ Label* context_words = reinterpret_cast<Label*>(data + offset);
+ offset += (num_states + 1) * sizeof(label);
+ Label* future_words = reinterpret_cast<Label*>(data + offset);
+ offset += num_futures * sizeof(label);
+ offset = (offset + sizeof(weight) - 1) & ~(sizeof(weight) - 1);
+ Weight* backoff = reinterpret_cast<Weight*>(data + offset);
+ offset += (num_states + 1) * sizeof(weight);
+ Weight* final_probs = reinterpret_cast<Weight*>(data + offset);
+ offset += num_final * sizeof(weight);
+ Weight* future_probs = reinterpret_cast<Weight*>(data + offset);
+ int64 context_arc = 0, future_arc = 0, context_bit = 0, future_bit = 0,
+ final_bit = 0;
+
+ // pseudo-root bits
+ BitmapIndex::Set(context_bits, context_bit++);
+ ++context_bit;
+ context_words[context_arc] = label;
+ backoff[context_arc] = Weight::Zero();
+ context_arc++;
+
+ ++future_bit;
+ if (order_out) {
+ order_out->clear();
+ order_out->resize(num_states);
+ }
+
+ queue<StateId> context_q;
+ context_q.push(context_fst.Start());
+ StateId state_number = 0;
+ while (!context_q.empty()) {
+ const StateId &state = context_q.front();
+ if (order_out) {
+ (*order_out)[state] = state_number;
+ }
+
+ const Weight &final = context_fst.Final(state);
+ if (final != Weight::Zero()) {
+ BitmapIndex::Set(final_bits, state_number);
+ final_probs[final_bit] = final;
+ ++final_bit;
+ }
+
+ for (ArcIterator<VectorFst<A> > aiter(context_fst, state);
+ !aiter.Done(); aiter.Next()) {
+ const Arc &arc = aiter.Value();
+ context_words[context_arc] = arc.ilabel;
+ backoff[context_arc] = arc.weight;
+ ++context_arc;
+ BitmapIndex::Set(context_bits, context_bit++);
+ context_q.push(arc.nextstate);
+ }
+ ++context_bit;
+
+ for (ArcIterator<Fst<A> > aiter(fst, state); !aiter.Done(); aiter.Next()) {
+ const Arc &arc = aiter.Value();
+ if (arc.ilabel != 0) {
+ future_words[future_arc] = arc.ilabel;
+ future_probs[future_arc] = arc.weight;
+ ++future_arc;
+ BitmapIndex::Set(future_bits, future_bit++);
+ }
+ }
+ ++future_bit;
+ ++state_number;
+ context_q.pop();
+ }
+
+ if ((state_number != num_states) ||
+ (context_bit != num_states * 2 + 1) ||
+ (context_arc != num_states) ||
+ (future_arc != num_futures) ||
+ (future_bit != num_futures + num_states + 1) ||
+ (final_bit != num_final)) {
+ FSTERROR() << "Structure problems detected during construction";
+ SetProperties(kError, kError);
+ return;
+ }
+
+ Init(data, false, data_region);
+}
+
+template<typename A>
+inline void NGramFstImpl<A>::Init(const char* data, bool owned,
+ MappedFile *data_region) {
+ if (owned_) {
+ delete [] data_;
+ }
+ delete data_region_;
+ data_region_ = data_region;
+ owned_ = owned;
+ data_ = data;
+ size_t offset = 0;
+ num_states_ = *(reinterpret_cast<const uint64*>(data_ + offset));
+ offset += sizeof(num_states_);
+ num_futures_ = *(reinterpret_cast<const uint64*>(data_ + offset));
+ offset += sizeof(num_futures_);
+ num_final_ = *(reinterpret_cast<const uint64*>(data_ + offset));
+ offset += sizeof(num_final_);
+ uint64 bits;
+ size_t context_bits = num_states_ * 2 + 1;
+ size_t future_bits = num_futures_ + num_states_ + 1;
+ context_ = reinterpret_cast<const uint64*>(data_ + offset);
+ offset += BitmapIndex::StorageSize(context_bits) * sizeof(bits);
+ future_ = reinterpret_cast<const uint64*>(data_ + offset);
+ offset += BitmapIndex::StorageSize(future_bits) * sizeof(bits);
+ final_ = reinterpret_cast<const uint64*>(data_ + offset);
+ offset += BitmapIndex::StorageSize(num_states_) * sizeof(bits);
+ context_words_ = reinterpret_cast<const Label*>(data_ + offset);
+ offset += (num_states_ + 1) * sizeof(*context_words_);
+ future_words_ = reinterpret_cast<const Label*>(data_ + offset);
+ offset += num_futures_ * sizeof(*future_words_);
+ offset = (offset + sizeof(*backoff_) - 1) & ~(sizeof(*backoff_) - 1);
+ backoff_ = reinterpret_cast<const Weight*>(data_ + offset);
+ offset += (num_states_ + 1) * sizeof(*backoff_);
+ final_probs_ = reinterpret_cast<const Weight*>(data_ + offset);
+ offset += num_final_ * sizeof(*final_probs_);
+ future_probs_ = reinterpret_cast<const Weight*>(data_ + offset);
+
+ context_index_.BuildIndex(context_, context_bits);
+ future_index_.BuildIndex(future_, future_bits);
+ final_index_.BuildIndex(final_, num_states_);
+
+ const size_t node_rank = context_index_.Rank1(0);
+ root_first_child_ = context_index_.Select0(node_rank) + 1;
+ if (context_index_.Get(root_first_child_) == false) {
+ FSTERROR() << "Missing unigrams";
+ SetProperties(kError, kError);
+ return;
+ }
+ const size_t last_child = context_index_.Select0(node_rank + 1) - 1;
+ root_num_children_ = last_child - root_first_child_ + 1;
+ root_children_ = context_words_ + context_index_.Rank1(root_first_child_);
+}
+
+template<typename A>
+inline typename A::StateId NGramFstImpl<A>::Transition(
+ const vector<Label> &context, Label future) const {
+ size_t num_children = root_num_children_;
+ const Label *children = root_children_;
+ const Label *loc = lower_bound(children, children + num_children, future);
+ if (loc == children + num_children || *loc != future) {
+ return context_index_.Rank1(0);
+ }
+ size_t node = root_first_child_ + loc - children;
+ size_t node_rank = context_index_.Rank1(node);
+ size_t first_child = context_index_.Select0(node_rank) + 1;
+ if (context_index_.Get(first_child) == false) {
+ return context_index_.Rank1(node);
+ }
+ size_t last_child = context_index_.Select0(node_rank + 1) - 1;
+ num_children = last_child - first_child + 1;
+ for (int word = context.size() - 1; word >= 0; --word) {
+ children = context_words_ + context_index_.Rank1(first_child);
+ loc = lower_bound(children, children + last_child - first_child + 1,
+ context[word]);
+ if (loc == children + last_child - first_child + 1 ||
+ *loc != context[word]) {
+ break;
+ }
+ node = first_child + loc - children;
+ node_rank = context_index_.Rank1(node);
+ first_child = context_index_.Select0(node_rank) + 1;
+ if (context_index_.Get(first_child) == false) break;
+ last_child = context_index_.Select0(node_rank + 1) - 1;
+ }
+ return context_index_.Rank1(node);
+}
+
+/*****************************************************************************/
+template<class A>
+class NGramFst : public ImplToExpandedFst<NGramFstImpl<A> > {
+ friend class ArcIterator<NGramFst<A> >;
+ friend class NGramFstMatcher<A>;
+
+ public:
+ typedef A Arc;
+ typedef typename A::StateId StateId;
+ typedef typename A::Label Label;
+ typedef typename A::Weight Weight;
+ typedef NGramFstImpl<A> Impl;
+
+ explicit NGramFst(const Fst<A> &dst)
+ : ImplToExpandedFst<Impl>(new Impl(dst, NULL)) {}
+
+ NGramFst(const Fst<A> &fst, vector<StateId>* order_out)
+ : ImplToExpandedFst<Impl>(new Impl(fst, order_out)) {}
+
+ // Because the NGramFstImpl is a const stateless data structure, there
+ // is never a need to do anything beside copy the reference.
+ NGramFst(const NGramFst<A> &fst, bool safe = false)
+ : ImplToExpandedFst<Impl>(fst, false) {}
+
+ NGramFst() : ImplToExpandedFst<Impl>(new Impl()) {}
+
+ // Non-standard constructor to initialize NGramFst directly from data.
+ NGramFst(const char* data, bool owned) : ImplToExpandedFst<Impl>(new Impl()) {
+ GetImpl()->Init(data, owned, NULL);
+ }
+
+ // Get method that gets the data associated with Init().
+ const char* GetData(size_t* data_size) const {
+ return GetImpl()->GetData(data_size);
+ }
+
+ const vector<Label> GetContext(StateId s) const {
+ return GetImpl()->GetContext(s, &inst_);
+ }
+
+ virtual size_t NumArcs(StateId s) const {
+ return GetImpl()->NumArcs(s, &inst_);
+ }
+
+ virtual NGramFst<A>* Copy(bool safe = false) const {
+ return new NGramFst(*this, safe);
+ }
+
+ static NGramFst<A>* Read(istream &strm, const FstReadOptions &opts) {
+ Impl* impl = Impl::Read(strm, opts);
+ return impl ? new NGramFst<A>(impl) : 0;
+ }
+
+ static NGramFst<A>* Read(const string &filename) {
+ if (!filename.empty()) {
+ ifstream strm(filename.c_str(), ifstream::in | ifstream::binary);
+ if (!strm) {
+ LOG(ERROR) << "NGramFst::Read: Can't open file: " << filename;
+ return 0;
+ }
+ return Read(strm, FstReadOptions(filename));
+ } else {
+ return Read(cin, FstReadOptions("standard input"));
+ }
+ }
+
+ virtual bool Write(ostream &strm, const FstWriteOptions &opts) const {
+ return GetImpl()->Write(strm, opts);
+ }
+
+ virtual bool Write(const string &filename) const {
+ return Fst<A>::WriteFile(filename);
+ }
+
+ virtual inline void InitStateIterator(StateIteratorData<A>* data) const {
+ GetImpl()->InitStateIterator(data);
+ }
+
+ virtual inline void InitArcIterator(
+ StateId s, ArcIteratorData<A>* data) const;
+
+ virtual MatcherBase<A>* InitMatcher(MatchType match_type) const {
+ return new NGramFstMatcher<A>(*this, match_type);
+ }
+
+ private:
+ explicit NGramFst(Impl* impl) : ImplToExpandedFst<Impl>(impl) {}
+
+ Impl* GetImpl() const {
+ return
+ ImplToExpandedFst<Impl, ExpandedFst<A> >::GetImpl();
+ }
+
+ void SetImpl(Impl* impl, bool own_impl = true) {
+ ImplToExpandedFst<Impl, Fst<A> >::SetImpl(impl, own_impl);
+ }
+
+ mutable NGramFstInst<A> inst_;
+};
+
+template <class A> inline void
+NGramFst<A>::InitArcIterator(StateId s, ArcIteratorData<A>* data) const {
+ GetImpl()->SetInstFuture(s, &inst_);
+ GetImpl()->SetInstNode(&inst_);
+ data->base = new ArcIterator<NGramFst<A> >(*this, s);
+}
+
+/*****************************************************************************/
+template <class A>
+class NGramFstMatcher : public MatcherBase<A> {
+ public:
+ typedef A Arc;
+ typedef typename A::Label Label;
+ typedef typename A::StateId StateId;
+ typedef typename A::Weight Weight;
+
+ NGramFstMatcher(const NGramFst<A> &fst, MatchType match_type)
+ : fst_(fst), inst_(fst.inst_), match_type_(match_type),
+ current_loop_(false),
+ loop_(kNoLabel, 0, A::Weight::One(), kNoStateId) {
+ if (match_type_ == MATCH_OUTPUT) {
+ swap(loop_.ilabel, loop_.olabel);
+ }
+ }
+
+ NGramFstMatcher(const NGramFstMatcher<A> &matcher, bool safe = false)
+ : fst_(matcher.fst_), inst_(matcher.inst_),
+ match_type_(matcher.match_type_), current_loop_(false),
+ loop_(kNoLabel, 0, A::Weight::One(), kNoStateId) {
+ if (match_type_ == MATCH_OUTPUT) {