// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Copyright 2005-2010 Google, Inc.
// Author: sorenj@google.com (Jeffrey Sorensen)
//
#ifndef FST_EXTENSIONS_NGRAM_NGRAM_FST_H_
#define FST_EXTENSIONS_NGRAM_NGRAM_FST_H_
#include <stddef.h>
#include <string.h>
#include <algorithm>
#include <string>
#include <vector>
using std::vector;
#include <fst/compat.h>
#include <fst/fstlib.h>
#include <fst/mapped-file.h>
#include <fst/extensions/ngram/bitmap-index.h>
// NGramFst implements an n-gram language model based upon the LOUDS data
// structure. Please refer to "Unary Data Structures for Language Models"
// http://research.google.com/pubs/archive/37218.pdf
namespace fst {
// Forward declarations: NGramFstImpl below refers to both of these class
// templates in its friend declarations.
template <class A> class NGramFst;
template <class A> class NGramFstMatcher;
// Instance data containing mutable state for bookkeeping repeated access to
// the same state.
template <class A>
struct NGramFstInst {
  typedef typename A::Label Label;
  typedef typename A::StateId StateId;
  typedef typename A::Weight Weight;

  // State id this instance data refers to; kNoStateId until assigned.
  StateId state_;
  // Bookkeeping values associated with state_.
  // NOTE(review): exact meaning is defined by the code that fills them in,
  // which is not part of this struct -- confirm against the users.
  size_t num_futures_;
  size_t offset_;
  size_t node_;
  // State id that node_ was recorded for; kNoStateId until assigned.
  StateId node_state_;
  // Label sequence recorded for context_state_; starts out empty.
  vector<Label> context_;
  // State id that context_ was recorded for; kNoStateId until assigned.
  StateId context_state_;

  // Marks all three state ids as kNoStateId and zero-initializes the size_t
  // fields so that no member ever holds an indeterminate value.  The
  // initializers are listed in declaration order.
  NGramFstInst()
      : state_(kNoStateId),
        num_futures_(0),
        offset_(0),
        node_(0),
        node_state_(kNoStateId),
        context_state_(kNoStateId) { }
};
// Implementation class for LOUDS based NgramFst interface
template <class A>
class NGramFstImpl : public FstImpl<A> {
using FstImpl<A>::SetInputSymbols;
using FstImpl<A>::SetOutputSymbols;
using FstImpl<A>::SetType;
using FstImpl<A>::WriteHeader;
friend class ArcIterator<NGramFst<A> >;
friend class NGramFstMatcher<A>;
public:
using FstImpl<A>::InputSymbols;
using FstImpl<A>::SetProperties;
using FstImpl<A>::Properties;
typedef A Arc;
typedef typename A::Label Label;
typedef typename A::StateId StateId;
typedef typename A::Weight Weight;
NGramFstImpl() : data_region_(0), data_(0), owned_(false) {
SetType("ngram");
SetInputSymbols(NULL);
SetOutputSymbols(NULL);
SetProperties(kStaticProperties);
}
NGramFstImpl(const Fst<A> &fst, vector<StateId>* order_out);
// Destructor.  data_ is released with delete[] only when owned_ is set;
// data_region_ is deleted unconditionally (deleting a null pointer is a
// no-op).
~NGramFstImpl() {
  if (owned_) delete [] data_;
  delete data_region_;
}
// Reads an NGramFstImpl from `strm`.
//
// After the FST header the stream is expected to hold three uint64 counts
// (num_states, num_futures, num_final) followed by the remainder of the data
// block.  The counts are read first because Storage() needs them to compute
// the total size of the block; the whole block, counts included, is then
// placed in a freshly allocated MappedFile and passed to Init().
//
// Returns a newly allocated implementation, or NULL when the header, the
// counts, or the remaining data cannot be read.  Every failure path frees
// whatever was allocated up to that point.
static NGramFstImpl<A>* Read(istream &strm,  // NOLINT
                             const FstReadOptions &opts) {
  NGramFstImpl<A>* impl = new NGramFstImpl();
  FstHeader hdr;
  if (!impl->ReadHeader(strm, opts, kMinFileVersion, &hdr)) {
    // impl is not returned on this path, so it has to be freed here.
    delete impl;
    return NULL;
  }
  uint64 num_states = 0, num_futures = 0, num_final = 0;
  const size_t offset = sizeof(num_states) + sizeof(num_futures) +
                        sizeof(num_final);
  // Peek at num_states and num_futures to see how much more needs to be read.
  strm.read(reinterpret_cast<char *>(&num_states), sizeof(num_states));
  strm.read(reinterpret_cast<char *>(&num_futures), sizeof(num_futures));
  strm.read(reinterpret_cast<char *>(&num_final), sizeof(num_final));
  if (!strm) {
    // A short read leaves the counts unreliable; do not size an allocation
    // from them.
    delete impl;
    return NULL;
  }
  size_t size = Storage(num_states, num_futures, num_final);
  MappedFile *data_region = MappedFile::Allocate(size);
  char *data = reinterpret_cast<char *>(data_region->mutable_data());
  // Copy num_states, num_futures and num_final back into data.
  memcpy(data, reinterpret_cast<char *>(&num_states), sizeof(num_states));
  memcpy(data + sizeof(num_states), reinterpret_cast<char *>(&num_futures),
         sizeof(num_futures));
  memcpy(data + sizeof(num_states) + sizeof(num_futures),
         reinterpret_cast<char *>(&num_final), sizeof(num_final));
  // The counts already occupy the first `offset` bytes; read the rest.
  strm.read(data + offset, size - offset);
  if (!strm) {
    // data_region has not been handed to impl yet, so deleting impl alone
    // would leak it.
    delete data_region;
    delete impl;
    return NULL;
  }
  // NOTE(review): the `false` argument presumably tells Init() that `data`
  // is not separately owned (it lives inside data_region) -- confirm against
  // Init().
  impl->Init(data, false, data_region);
  return impl;
}
bool Write(ostream &strm, // NOLINT
const FstWriteOptions &opts) const {
FstHeader hdr;
hdr.SetStart(Start());
hdr.SetNumStates(num_states_);
WriteHeader(strm, opts, kFileVersion, &hdr);
strm.write(data_, Storage(num_states_, num_futures_,