From 96a32415ab43377cf1575bd3f4f2980f58028209 Mon Sep 17 00:00:00 2001 From: Determinant Date: Fri, 14 Aug 2015 11:51:42 +0800 Subject: add implementation for kaldi io (by ymz) --- .../src/tools/openfst/include/fst/determinize.h | 1015 ++++++++++++++++++++ 1 file changed, 1015 insertions(+) create mode 100644 kaldi_io/src/tools/openfst/include/fst/determinize.h (limited to 'kaldi_io/src/tools/openfst/include/fst/determinize.h') diff --git a/kaldi_io/src/tools/openfst/include/fst/determinize.h b/kaldi_io/src/tools/openfst/include/fst/determinize.h new file mode 100644 index 0000000..9ff8723 --- /dev/null +++ b/kaldi_io/src/tools/openfst/include/fst/determinize.h @@ -0,0 +1,1015 @@ +// determinize.h + + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Copyright 2005-2010 Google, Inc. +// Author: riley@google.com (Michael Riley) +// +// \file +// Functions and classes to determinize an FST. + +#ifndef FST_LIB_DETERMINIZE_H__ +#define FST_LIB_DETERMINIZE_H__ + +#include +#include +#include +using std::tr1::unordered_map; +using std::tr1::unordered_multimap; +#include +#include +#include +#include +using std::vector; + +#include +#include +#include +#include +#include +#include + + +namespace fst { + +// +// COMMON DIVISORS - these are used in determinization to compute +// the transition weights. In the simplest case, it is just the same +// as the semiring Plus(). However, other choices permit more efficient +// determinization when the output contains strings. +// + +// The default common divisor uses the semiring Plus. +template +class DefaultCommonDivisor { + public: + typedef W Weight; + + W operator()(const W &w1, const W &w2) const { return Plus(w1, w2); } +}; + + +// The label common divisor for a (left) string semiring selects a +// single letter common prefix or the empty string. This is used in +// the determinization of output strings so that at most a single +// letter will appear in the output of a transtion. +template +class LabelCommonDivisor { + public: + typedef StringWeight Weight; + + Weight operator()(const Weight &w1, const Weight &w2) const { + StringWeightIterator iter1(w1); + StringWeightIterator iter2(w2); + + if (!(StringWeight::Properties() & kLeftSemiring)) { + FSTERROR() << "LabelCommonDivisor: Weight needs to be left semiring"; + return Weight::NoWeight(); + } else if (w1.Size() == 0 || w2.Size() == 0) { + return Weight::One(); + } else if (w1 == Weight::Zero()) { + return Weight(iter2.Value()); + } else if (w2 == Weight::Zero()) { + return Weight(iter1.Value()); + } else if (iter1.Value() == iter2.Value()) { + return Weight(iter1.Value()); + } else { + return Weight::One(); + } + } +}; + + +// The gallic common divisor uses the label common divisor on the +// string component and the template argument D common divisor on the +// weight component, which defaults to the default common divisor. +template > +class GallicCommonDivisor { + public: + typedef GallicWeight Weight; + + Weight operator()(const Weight &w1, const Weight &w2) const { + return Weight(label_common_divisor_(w1.Value1(), w2.Value1()), + weight_common_divisor_(w1.Value2(), w2.Value2())); + } + + private: + LabelCommonDivisor label_common_divisor_; + D weight_common_divisor_; +}; + + +// Represents an element in a subset +template +struct DeterminizeElement { + typedef typename A::StateId StateId; + typedef typename A::Weight Weight; + + DeterminizeElement() {} + + DeterminizeElement(StateId s, Weight w) : state_id(s), weight(w) {} + + bool operator==(const DeterminizeElement & element) const { + return state_id == element.state_id && weight == element.weight; + } + + bool operator<(const DeterminizeElement & element) const { + return state_id < element.state_id || + (state_id == element.state_id && weight == element.weight); + } + + StateId state_id; // Input state Id + Weight weight; // Residual weight +}; + + +// +// DETERMINIZE FILTERS - these can be used in determinization to compute +// transformations on the subsets prior to their being added as destination +// states. The filter operates on a map between a label and the +// corresponding destination subsets. The possibly modified map is +// then used to construct the destination states for arcs exiting state 's'. +// It must define the ordered map type LabelMap and have a default +// and copy constructor. + +// A determinize filter that does not modify its input. +template +struct IdentityDeterminizeFilter { + typedef typename Arc::StateId StateId; + typedef typename Arc::Label Label; + typedef slist< DeterminizeElement > Subset; + typedef map LabelMap; + + static uint64 Properties(uint64 props) { return props; } + + void operator()(StateId s, LabelMap *label_map) {} +}; + + +// +// DETERMINIZATION STATE TABLES +// +// The determiziation state table has the form: +// +// template +// class DeterminizeStateTable { +// public: +// typedef typename Arc::StateId StateId; +// typedef DeterminizeElement Element; +// typedef slist Subset; +// +// // Required constuctor +// DeterminizeStateTable(); +// +// // Required copy constructor that does not copy state +// DeterminizeStateTable(const DeterminizeStateTable &table); +// +// // Lookup state ID by subset (not depending of the element order). +// // If it doesn't exist, then add it. FindState takes +// // ownership of the subset argument (so that it doesn't have to +// // copy it if it creates a new state). +// StateId FindState(Subset *subset); +// +// // Lookup subset by ID. +// const Subset *FindSubset(StateId id) const; +// }; +// + +// The default determinization state table based on the +// compact hash bi-table. +template +class DefaultDeterminizeStateTable { + public: + typedef typename Arc::StateId StateId; + typedef typename Arc::Label Label; + typedef typename Arc::Weight Weight; + typedef DeterminizeElement Element; + typedef slist Subset; + + explicit DefaultDeterminizeStateTable(size_t table_size = 0) + : table_size_(table_size), + subsets_(table_size_, new SubsetKey(), new SubsetEqual(&elements_)) { } + + DefaultDeterminizeStateTable(const DefaultDeterminizeStateTable &table) + : table_size_(table.table_size_), + subsets_(table_size_, new SubsetKey(), new SubsetEqual(&elements_)) { } + + ~DefaultDeterminizeStateTable() { + for (StateId s = 0; s < subsets_.Size(); ++s) + delete subsets_.FindEntry(s); + } + + // Finds the state corresponding to a subset. Only creates a new + // state if the subset is not found. FindState takes ownership of + // the subset argument (so that it doesn't have to copy it if it + // creates a new state). + StateId FindState(Subset *subset) { + StateId ns = subsets_.Size(); + StateId s = subsets_.FindId(subset); + if (s != ns) delete subset; // subset found + return s; + } + + const Subset* FindSubset(StateId s) { return subsets_.FindEntry(s); } + + private: + // Comparison object for hashing Subset(s). Subsets are not sorted in this + // implementation, so ordering must not be assumed in the equivalence + // test. + class SubsetEqual { + public: + SubsetEqual() { // needed for compilation but should never be called + FSTERROR() << "SubsetEqual: default constructor not implemented"; + } + + // Constructor takes vector needed to check equality. See immediately + // below for constraints on it. + explicit SubsetEqual(vector *elements) + : elements_(elements) {} + + // At each call to operator(), the elements_ vector should contain + // only NULLs. When this operator returns, elements_ will still + // have this property. + bool operator()(Subset* subset1, Subset* subset2) const { + if (!subset1 && !subset2) + return true; + if ((subset1 && !subset2) || (!subset1 && subset2)) + return false; + + if (subset1->size() != subset2->size()) + return false; + + // Loads first subset elements in element vector. + for (typename Subset::iterator iter1 = subset1->begin(); + iter1 != subset1->end(); + ++iter1) { + Element &element1 = *iter1; + while (elements_->size() <= element1.state_id) + elements_->push_back(0); + (*elements_)[element1.state_id] = &element1; + } + + // Checks second subset matches first via element vector. + for (typename Subset::iterator iter2 = subset2->begin(); + iter2 != subset2->end(); + ++iter2) { + Element &element2 = *iter2; + while (elements_->size() <= element2.state_id) + elements_->push_back(0); + Element *element1 = (*elements_)[element2.state_id]; + if (!element1 || element1->weight != element2.weight) { + // Mismatch found. Resets element vector before returning false. + for (typename Subset::iterator iter1 = subset1->begin(); + iter1 != subset1->end(); + ++iter1) + (*elements_)[iter1->state_id] = 0; + return false; + } else { + (*elements_)[element2.state_id] = 0; // Clears entry + } + } + return true; + } + private: + vector *elements_; + }; + + // Hash function for Subset to Fst states. Subset elements are not + // sorted in this implementation, so the hash must be invariant + // under subset reordering. + class SubsetKey { + public: + size_t operator()(const Subset* subset) const { + size_t hash = 0; + if (subset) { + for (typename Subset::const_iterator iter = subset->begin(); + iter != subset->end(); + ++iter) { + const Element &element = *iter; + int lshift = element.state_id % (CHAR_BIT * sizeof(size_t) - 1) + 1; + int rshift = CHAR_BIT * sizeof(size_t) - lshift; + size_t n = element.state_id; + hash ^= n << lshift ^ n >> rshift ^ element.weight.Hash(); + } + } + return hash; + } + }; + + size_t table_size_; + + typedef CompactHashBiTable SubsetTable; + + SubsetTable subsets_; + vector elements_; + + void operator=(const DefaultDeterminizeStateTable &); // disallow +}; + +// Options for finite-state transducer determinization templated on +// the arc type, common divisor, the determinization filter and the +// state table. DeterminizeFst takes ownership of the determinization +// filter and state table if provided. +template , + class F = IdentityDeterminizeFilter, + class T = DefaultDeterminizeStateTable > +struct DeterminizeFstOptions : CacheOptions { + typedef typename Arc::Label Label; + float delta; // Quantization delta for subset weights + Label subsequential_label; // Label used for residual final output + // when producing subsequential transducers. + F *filter; // Determinization filter + T *state_table; // Determinization state table + + explicit DeterminizeFstOptions(const CacheOptions &opts, + float del = kDelta, Label lab = 0, + F *filt = 0, + T *table = 0) + : CacheOptions(opts), delta(del), subsequential_label(lab), + filter(filt), state_table(table) {} + + explicit DeterminizeFstOptions(float del = kDelta, Label lab = 0, + F *filt = 0, T *table = 0) + : delta(del), subsequential_label(lab), filter(filt), + state_table(table) {} +}; + +// Implementation of delayed DeterminizeFst. This base class is +// common to the variants that implement acceptor and transducer +// determinization. +template +class DeterminizeFstImplBase : public CacheImpl { + public: + using FstImpl::SetType; + using FstImpl::SetProperties; + using FstImpl::Properties; + using FstImpl::SetInputSymbols; + using FstImpl::SetOutputSymbols; + + using CacheBaseImpl< CacheState >::HasStart; + using CacheBaseImpl< CacheState >::HasFinal; + using CacheBaseImpl< CacheState >::HasArcs; + using CacheBaseImpl< CacheState >::SetFinal; + using CacheBaseImpl< CacheState >::SetStart; + + typedef typename A::Label Label; + typedef typename A::Weight Weight; + typedef typename A::StateId StateId; + typedef CacheState State; + + template + DeterminizeFstImplBase(const Fst &fst, + const DeterminizeFstOptions &opts) + : CacheImpl(opts), fst_(fst.Copy()) { + SetType("determinize"); + uint64 iprops = fst.Properties(kFstProperties, false); + uint64 dprops = DeterminizeProperties(iprops, + opts.subsequential_label != 0); + SetProperties(F::Properties(dprops), kCopyProperties); + SetInputSymbols(fst.InputSymbols()); + SetOutputSymbols(fst.OutputSymbols()); + } + + DeterminizeFstImplBase(const DeterminizeFstImplBase &impl) + : CacheImpl(impl), + fst_(impl.fst_->Copy(true)) { + SetType("determinize"); + SetProperties(impl.Properties(), kCopyProperties); + SetInputSymbols(impl.InputSymbols()); + SetOutputSymbols(impl.OutputSymbols()); + } + + virtual ~DeterminizeFstImplBase() { delete fst_; } + + virtual DeterminizeFstImplBase *Copy() = 0; + + StateId Start() { + if (!HasStart()) { + StateId start = ComputeStart(); + if (start != kNoStateId) { + SetStart(start); + } + } + return CacheImpl::Start(); + } + + Weight Final(StateId s) { + if (!HasFinal(s)) { + Weight final = ComputeFinal(s); + SetFinal(s, final); + } + return CacheImpl::Final(s); + } + + virtual void Expand(StateId s) = 0; + + size_t NumArcs(StateId s) { + if (!HasArcs(s)) + Expand(s); + return CacheImpl::NumArcs(s); + } + + size_t NumInputEpsilons(StateId s) { + if (!HasArcs(s)) + Expand(s); + return CacheImpl::NumInputEpsilons(s); + } + + size_t NumOutputEpsilons(StateId s) { + if (!HasArcs(s)) + Expand(s); + return CacheImpl::NumOutputEpsilons(s); + } + + void InitArcIterator(StateId s, ArcIteratorData *data) { + if (!HasArcs(s)) + Expand(s); + CacheImpl::InitArcIterator(s, data); + } + + virtual StateId ComputeStart() = 0; + + virtual Weight ComputeFinal(StateId s) = 0; + + const Fst &GetFst() const { return *fst_; } + + private: + const Fst *fst_; // Input Fst + + void operator=(const DeterminizeFstImplBase &); // disallow +}; + + +// Implementation of delayed determinization for weighted acceptors. +// It is templated on the arc type A and the common divisor D. +template +class DeterminizeFsaImpl : public DeterminizeFstImplBase { + public: + using FstImpl::SetProperties; + using DeterminizeFstImplBase::GetFst; + using DeterminizeFstImplBase::SetArcs; + + typedef typename A::Label Label; + typedef typename A::Weight Weight; + typedef typename A::StateId StateId; + typedef DeterminizeElement Element; + typedef slist Subset; + typedef typename F::LabelMap LabelMap; + + DeterminizeFsaImpl(const Fst &fst, + const vector *in_dist, vector *out_dist, + const DeterminizeFstOptions &opts) + : DeterminizeFstImplBase(fst, opts), + delta_(opts.delta), + in_dist_(in_dist), + out_dist_(out_dist), + filter_(opts.filter ? opts.filter : new F()), + state_table_(opts.state_table ? opts.state_table : new T()) { + if (!fst.Properties(kAcceptor, true)) { + FSTERROR() << "DeterminizeFst: argument not an acceptor"; + SetProperties(kError, kError); + } + if (!(Weight::Properties() & kLeftSemiring)) { + FSTERROR() << "DeterminizeFst: Weight needs to be left distributive: " + << Weight::Type(); + SetProperties(kError, kError); + } + if (out_dist_) + out_dist_->clear(); + } + + DeterminizeFsaImpl(const DeterminizeFsaImpl &impl) + : DeterminizeFstImplBase(impl), + delta_(impl.delta_), + in_dist_(0), + out_dist_(0), + filter_(new F(*impl.filter_)), + state_table_(new T(*impl.state_table_)) { + if (impl.out_dist_) { + FSTERROR() << "DeterminizeFsaImpl: cannot copy with out_dist vector"; + SetProperties(kError, kError); + } + } + + virtual ~DeterminizeFsaImpl() { + delete filter_; + delete state_table_; + } + + virtual DeterminizeFsaImpl *Copy() { + return new DeterminizeFsaImpl(*this); + } + + uint64 Properties() const { return Properties(kFstProperties); } + + // Set error if found; return FST impl properties. + uint64 Properties(uint64 mask) const { + if ((mask & kError) && (GetFst().Properties(kError, false))) + SetProperties(kError, kError); + return FstImpl::Properties(mask); + } + + virtual StateId ComputeStart() { + StateId s = GetFst().Start(); + if (s == kNoStateId) + return kNoStateId; + Element element(s, Weight::One()); + Subset *subset = new Subset; + subset->push_front(element); + return FindState(subset); + } + + virtual Weight ComputeFinal(StateId s) { + const Subset *subset = state_table_->FindSubset(s); + Weight final = Weight::Zero(); + for (typename Subset::const_iterator siter = subset->begin(); + siter != subset->end(); + ++siter) { + const Element &element = *siter; + final = Plus(final, Times(element.weight, + GetFst().Final(element.state_id))); + if (!final.Member()) + SetProperties(kError, kError); + } + return final; + } + + StateId FindState(Subset *subset) { + StateId s = state_table_->FindState(subset); + if (in_dist_ && out_dist_->size() <= s) + out_dist_->push_back(ComputeDistance(subset)); + return s; + } + + // Compute distance from a state to the final states in the DFA + // given the distances in the NFA. + Weight ComputeDistance(const Subset *subset) { + Weight outd = Weight::Zero(); + for (typename Subset::const_iterator siter = subset->begin(); + siter != subset->end(); ++siter) { + const Element &element = *siter; + Weight ind = element.state_id < in_dist_->size() ? + (*in_dist_)[element.state_id] : Weight::Zero(); + outd = Plus(outd, Times(element.weight, ind)); + } + return outd; + } + + // Computes the outgoing transitions from a state, creating new destination + // states as needed. + virtual void Expand(StateId s) { + + LabelMap label_map; + LabelSubsets(s, &label_map); + + for (typename LabelMap::iterator liter = label_map.begin(); + liter != label_map.end(); + ++liter) + AddArc(s, liter->first, liter->second); + SetArcs(s); + } + + private: + // Constructs destination subsets per label. At return, subset + // element weights include the input automaton label weights and the + // subsets may contain duplicate states. + void LabelSubsets(StateId s, LabelMap *label_map) { + const Subset *src_subset = state_table_->FindSubset(s); + + for (typename Subset::const_iterator siter = src_subset->begin(); + siter != src_subset->end(); + ++siter) { + const Element &src_element = *siter; + for (ArcIterator< Fst > aiter(GetFst(), src_element.state_id); + !aiter.Done(); + aiter.Next()) { + const A &arc = aiter.Value(); + Element dest_element(arc.nextstate, + Times(src_element.weight, arc.weight)); + + // The LabelMap may be a e.g. multimap with more complex + // determinization filters, so we insert efficiently w/o using []. + typename LabelMap::iterator liter = label_map->lower_bound(arc.ilabel); + Subset* dest_subset; + if (liter == label_map->end() || liter->first != arc.ilabel) { + dest_subset = new Subset; + label_map->insert(liter, make_pair(arc.ilabel, dest_subset)); + } else { + dest_subset = liter->second; + } + + dest_subset->push_front(dest_element); + } + } + // Applies the determinization filter + (*filter_)(s, label_map); + } + + // Adds an arc from state S to the destination state associated + // with subset DEST_SUBSET (as created by LabelSubsets). + void AddArc(StateId s, Label label, Subset *dest_subset) { + A arc; + arc.ilabel = label; + arc.olabel = label; + arc.weight = Weight::Zero(); + + typename Subset::iterator oiter; + for (typename Subset::iterator diter = dest_subset->begin(); + diter != dest_subset->end();) { + Element &dest_element = *diter; + // Computes label weight. + arc.weight = common_divisor_(arc.weight, dest_element.weight); + + while (elements_.size() <= dest_element.state_id) + elements_.push_back(0); + Element *matching_element = elements_[dest_element.state_id]; + if (matching_element) { + // Found duplicate state: sums state weight and deletes dup. + matching_element->weight = Plus(matching_element->weight, + dest_element.weight); + if (!matching_element->weight.Member()) + SetProperties(kError, kError); + ++diter; + dest_subset->erase_after(oiter); + } else { + // Saves element so we can check for duplicate for this state. + elements_[dest_element.state_id] = &dest_element; + oiter = diter; + ++diter; + } + } + + // Divides out label weight from destination subset elements. + // Quantizes to ensure comparisons are effective. + // Clears element vector. + for (typename Subset::iterator diter = dest_subset->begin(); + diter != dest_subset->end(); + ++diter) { + Element &dest_element = *diter; + dest_element.weight = Divide(dest_element.weight, arc.weight, + DIVIDE_LEFT); + dest_element.weight = dest_element.weight.Quantize(delta_); + elements_[dest_element.state_id] = 0; + } + + arc.nextstate = FindState(dest_subset); + CacheImpl::PushArc(s, arc); + } + + float delta_; // Quantization delta for subset weights + const vector *in_dist_; // Distance to final NFA states + vector *out_dist_; // Distance to final DFA states + + D common_divisor_; + F *filter_; + T *state_table_; + + vector elements_; + + void operator=(const DeterminizeFsaImpl &); // disallow +}; + + +// Implementation of delayed determinization for transducers. +// Transducer determinization is implemented by mapping the input to +// the Gallic semiring as an acceptor whose weights contain the output +// strings and using acceptor determinization above to determinize +// that acceptor. +template +class DeterminizeFstImpl : public DeterminizeFstImplBase { + public: + using FstImpl::SetProperties; + using DeterminizeFstImplBase::GetFst; + using CacheBaseImpl< CacheState >::GetCacheGc; + using CacheBaseImpl< CacheState >::GetCacheLimit; + + typedef typename A::Label Label; + typedef typename A::Weight Weight; + typedef typename A::StateId StateId; + + typedef ToGallicMapper ToMapper; + typedef FromGallicMapper FromMapper; + + typedef typename ToMapper::ToArc ToArc; + typedef ArcMapFst ToFst; + typedef ArcMapFst FromFst; + + typedef GallicCommonDivisor CommonDivisor; + typedef GallicFactor FactorIterator; + + DeterminizeFstImpl(const Fst &fst, + const DeterminizeFstOptions &opts) + : DeterminizeFstImplBase(fst, opts), + delta_(opts.delta), + subsequential_label_(opts.subsequential_label) { + Init(GetFst()); + } + + DeterminizeFstImpl(const DeterminizeFstImpl &impl) + : DeterminizeFstImplBase(impl), + delta_(impl.delta_), + subsequential_label_(impl.subsequential_label_) { + Init(GetFst()); + } + + ~DeterminizeFstImpl() { delete from_fst_; } + + virtual DeterminizeFstImpl *Copy() { + return new DeterminizeFstImpl(*this); + } + + uint64 Properties() const { return Properties(kFstProperties); } + + // Set error if found; return FST impl properties. + uint64 Properties(uint64 mask) const { + if ((mask & kError) && (GetFst().Properties(kError, false) || + from_fst_->Properties(kError, false))) + SetProperties(kError, kError); + return FstImpl::Properties(mask); + } + + virtual StateId ComputeStart() { return from_fst_->Start(); } + + virtual Weight ComputeFinal(StateId s) { return from_fst_->Final(s); } + + virtual void Expand(StateId s) { + for (ArcIterator aiter(*from_fst_, s); + !aiter.Done(); + aiter.Next()) + CacheImpl::PushArc(s, aiter.Value()); + CacheImpl::SetArcs(s); + } + + private: + // Initialization of transducer determinization implementation, which + // is defined after DeterminizeFst since it calls it. + void Init(const Fst &fst); + + float delta_; + Label subsequential_label_; + FromFst *from_fst_; + + void operator=(const DeterminizeFstImpl &); // disallow +}; + + +// Determinizes a weighted transducer. This version is a delayed +// Fst. The result will be an equivalent FST that has the property +// that no state has two transitions with the same input label. +// For this algorithm, epsilon transitions are treated as regular +// symbols (cf. RmEpsilon). +// +// The transducer must be functional. The weights must be (weakly) +// left divisible (valid for TropicalWeight and LogWeight for instance) +// and be zero-sum-free if for all a,b: (Plus(a, b) = 0 => a = b = 0. +// +// Complexity: +// - Determinizable: exponential (polynomial in the size of the output) +// - Non-determinizable) does not terminate +// +// The determinizable automata include all unweighted and all acyclic input. +// +// References: +// - Mehryar Mohri, "Finite-State Transducers in Language and Speech +// Processing". Computational Linguistics, 23:2, 1997. +// +// This class attaches interface to implementation and handles +// reference counting, delegating most methods to ImplToFst. +template +class DeterminizeFst : public ImplToFst< DeterminizeFstImplBase > { + public: + friend class ArcIterator< DeterminizeFst >; + friend class StateIterator< DeterminizeFst >; + template + friend class DeterminizeFstImpl; + + typedef A Arc; + typedef typename A::Weight Weight; + typedef typename A::StateId StateId; + typedef typename A::Label Label; + typedef CacheState State; + typedef DeterminizeFstImplBase Impl; + + using ImplToFst::SetImpl; + + explicit DeterminizeFst(const Fst &fst) { + typedef DefaultCommonDivisor D; + typedef IdentityDeterminizeFilter F; + typedef DefaultDeterminizeStateTable T; + DeterminizeFstOptions opts; + if (fst.Properties(kAcceptor, true)) { + // Calls implementation for acceptors. + SetImpl(new DeterminizeFsaImpl(fst, 0, 0, opts)); + } else { + // Calls implementation for transducers. + SetImpl(new + DeterminizeFstImpl(fst, opts)); + } + } + + template + DeterminizeFst(const Fst &fst, + const DeterminizeFstOptions &opts) { + if (fst.Properties(kAcceptor, true)) { + // Calls implementation for acceptors. + SetImpl(new DeterminizeFsaImpl(fst, 0, 0, opts)); + } else { + // Calls implementation for transducers. + SetImpl(new + DeterminizeFstImpl(fst, opts)); + } + } + + // This acceptor-only version additionally computes the distance to + // final states in the output if provided with those distances for the + // input. Useful for e.g. unique N-shortest paths. + template + DeterminizeFst(const Fst &fst, + const vector *in_dist, vector *out_dist, + const DeterminizeFstOptions &opts) { + if (!fst.Properties(kAcceptor, true)) { + FSTERROR() << "DeterminizeFst:" + << " distance to final states computed for acceptors only"; + GetImpl()->SetProperties(kError, kError); + } + SetImpl(new DeterminizeFsaImpl(fst, in_dist, out_dist, opts)); + } + + // See Fst<>::Copy() for doc. + DeterminizeFst(const DeterminizeFst &fst, bool safe = false) { + if (safe) + SetImpl(fst.GetImpl()->Copy()); + else + SetImpl(fst.GetImpl(), false); + } + + // Get a copy of this DeterminizeFst. See Fst<>::Copy() for further doc. + virtual DeterminizeFst *Copy(bool safe = false) const { + return new DeterminizeFst(*this, safe); + } + + virtual inline void InitStateIterator(StateIteratorData *data) const; + + virtual void InitArcIterator(StateId s, ArcIteratorData *data) const { + GetImpl()->InitArcIterator(s, data); + } + + private: + // Makes visible to friends. + Impl *GetImpl() const { return ImplToFst::GetImpl(); } + + void operator=(const DeterminizeFst &fst); // Disallow +}; + + +// Initialization of transducer determinization implementation. which +// is defined after DeterminizeFst since it calls it. +template +void DeterminizeFstImpl::Init(const Fst &fst) { + // Mapper to an acceptor. + ToFst to_fst(fst, ToMapper()); + + // Determinizes acceptor. + // This recursive call terminates since it passes the common divisor + // to a private constructor. + CacheOptions copts(GetCacheGc(), GetCacheLimit()); + DeterminizeFstOptions dopts(copts, delta_); + // Uses acceptor-only constructor to avoid template recursion + DeterminizeFst det_fsa(to_fst, 0, 0, dopts); + + // Mapper back to transducer. + FactorWeightOptions fopts(CacheOptions(true, 0), delta_, + kFactorFinalWeights, + subsequential_label_, + subsequential_label_); + FactorWeightFst factored_fst(det_fsa, fopts); + from_fst_ = new FromFst(factored_fst, FromMapper(subsequential_label_)); +} + + +// Specialization for DeterminizeFst. +template +class StateIterator< DeterminizeFst > + : public CacheStateIterator< DeterminizeFst > { + public: + explicit StateIterator(const DeterminizeFst &fst) + : CacheStateIterator< DeterminizeFst >(fst, fst.GetImpl()) {} +}; + + +// Specialization for DeterminizeFst. +template +class ArcIterator< DeterminizeFst > + : public CacheArcIterator< DeterminizeFst > { + public: + typedef typename A::StateId StateId; + + ArcIterator(const DeterminizeFst &fst, StateId s) + : CacheArcIterator< DeterminizeFst >(fst.GetImpl(), s) { + if (!fst.GetImpl()->HasArcs(s)) + fst.GetImpl()->Expand(s); + } + + private: + DISALLOW_COPY_AND_ASSIGN(ArcIterator); +}; + + +template inline +void DeterminizeFst::InitStateIterator(StateIteratorData *data) const +{ + data->base = new StateIterator< DeterminizeFst >(*this); +} + + +// Useful aliases when using StdArc. +typedef DeterminizeFst StdDeterminizeFst; + + +template +struct DeterminizeOptions { + typedef typename Arc::StateId StateId; + typedef typename Arc::Weight Weight; + typedef typename Arc::Label Label; + + float delta; // Quantization delta for subset weights. + Weight weight_threshold; // Pruning weight threshold. + StateId state_threshold; // Pruning state threshold. + Label subsequential_label; // Label used for residual final output + // when producing subsequential transducers. + + explicit DeterminizeOptions(float d = kDelta, Weight w = Weight::Zero(), + StateId n = kNoStateId, Label l = 0) + : delta(d), weight_threshold(w), state_threshold(n), + subsequential_label(l) {} +}; + + +// Determinizes a weighted transducer. This version writes the +// determinized Fst to an output MutableFst. The result will be an +// equivalent FST that has the property that no state has two +// transitions with the same input label. For this algorithm, epsilon +// transitions are treated as regular symbols (cf. RmEpsilon). +// +// The transducer must be functional. The weights must be (weakly) +// left divisible (valid for TropicalWeight and LogWeight). +// +// Complexity: +// - Determinizable: exponential (polynomial in the size of the output) +// - Non-determinizable: does not terminate +// +// The determinizable automata include all unweighted and all acyclic input. +// +// References: +// - Mehryar Mohri, "Finite-State Transducers in Language and Speech +// Processing". Computational Linguistics, 23:2, 1997. +template +void Determinize(const Fst &ifst, MutableFst *ofst, + const DeterminizeOptions &opts + = DeterminizeOptions()) { + typedef typename Arc::StateId StateId; + typedef typename Arc::Weight Weight; + + DeterminizeFstOptions nopts; + nopts.delta = opts.delta; + nopts.subsequential_label = opts.subsequential_label; + + nopts.gc_limit = 0; // Cache only the last state for fastest copy. + + if (opts.weight_threshold != Weight::Zero() || + opts.state_threshold != kNoStateId) { + if (ifst.Properties(kAcceptor, false)) { + vector idistance, odistance; + ShortestDistance(ifst, &idistance, true); + DeterminizeFst dfst(ifst, &idistance, &odistance, nopts); + PruneOptions< Arc, AnyArcFilter > popts(opts.weight_threshold, + opts.state_threshold, + AnyArcFilter(), + &odistance); + Prune(dfst, ofst, popts); + } else { + *ofst = DeterminizeFst(ifst, nopts); + Prune(ofst, opts.weight_threshold, opts.state_threshold); + } + } else { + *ofst = DeterminizeFst(ifst, nopts); + } +} + + +} // namespace fst + +#endif // FST_LIB_DETERMINIZE_H__ -- cgit v1.2.3