diff options
Diffstat (limited to 'kaldi_io/src/tools/openfst/include/fst/label-reachable.h')
-rw-r--r-- | kaldi_io/src/tools/openfst/include/fst/label-reachable.h | 565 |
1 files changed, 565 insertions, 0 deletions
diff --git a/kaldi_io/src/tools/openfst/include/fst/label-reachable.h b/kaldi_io/src/tools/openfst/include/fst/label-reachable.h new file mode 100644 index 0000000..af06eef --- /dev/null +++ b/kaldi_io/src/tools/openfst/include/fst/label-reachable.h @@ -0,0 +1,565 @@ +// label_reachable.h + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Copyright 2005-2010 Google, Inc. +// Author: [email protected] (Michael Riley) +// +// \file +// Class to determine if a non-epsilon label can be read as the +// first non-epsilon symbol along some path from a given state. + + +#ifndef FST_LIB_LABEL_REACHABLE_H__ +#define FST_LIB_LABEL_REACHABLE_H__ + +#include <tr1/unordered_map> +using std::tr1::unordered_map; +using std::tr1::unordered_multimap; +#include <vector> +using std::vector; + +#include <fst/accumulator.h> +#include <fst/arcsort.h> +#include <fst/interval-set.h> +#include <fst/state-reachable.h> +#include <fst/vector-fst.h> + + +namespace fst { + +// Stores shareable data for label reachable class copies. +template <typename L> +class LabelReachableData { + public: + typedef L Label; + typedef typename IntervalSet<L>::Interval Interval; + + explicit LabelReachableData(bool reach_input, bool keep_relabel_data = true) + : reach_input_(reach_input), + keep_relabel_data_(keep_relabel_data), + have_relabel_data_(true), + final_label_(kNoLabel) {} + + ~LabelReachableData() {} + + bool ReachInput() const { return reach_input_; } + + vector< IntervalSet<L> > *IntervalSets() { return &isets_; } + + unordered_map<L, L> *Label2Index() { + if (!have_relabel_data_) + FSTERROR() << "LabelReachableData: no relabeling data"; + return &label2index_; + } + + Label FinalLabel() { + if (final_label_ == kNoLabel) + final_label_ = label2index_[kNoLabel]; + return final_label_; + } + + static LabelReachableData<L> *Read(istream &istrm) { + LabelReachableData<L> *data = new LabelReachableData<L>(); + + ReadType(istrm, &data->reach_input_); + ReadType(istrm, &data->keep_relabel_data_); + data->have_relabel_data_ = data->keep_relabel_data_; + if (data->keep_relabel_data_) + ReadType(istrm, &data->label2index_); + ReadType(istrm, &data->final_label_); + ReadType(istrm, &data->isets_); + return data; + } + + bool Write(ostream &ostrm) { + WriteType(ostrm, reach_input_); + WriteType(ostrm, keep_relabel_data_); + if (keep_relabel_data_) + WriteType(ostrm, label2index_); + WriteType(ostrm, FinalLabel()); + WriteType(ostrm, isets_); + return true; + } + + int RefCount() const { return ref_count_.count(); } + int IncrRefCount() { return ref_count_.Incr(); } + int DecrRefCount() { return ref_count_.Decr(); } + + private: + LabelReachableData() {} + + bool reach_input_; // Input or output labels considered? + bool keep_relabel_data_; // Save label2index_ to file? + bool have_relabel_data_; // Using label2index_? + Label final_label_; // Final label + RefCounter ref_count_; // Reference count. + unordered_map<L, L> label2index_; // Finds index for a label. + vector<IntervalSet <L> > isets_; // Interval sets per state. + + DISALLOW_COPY_AND_ASSIGN(LabelReachableData); +}; + + +// Tests reachability of labels from a given state. If reach_input = +// true, then input labels are considered, o.w. output labels are +// considered. To test for reachability from a state s, first do +// SetState(s). Then a label l can be reached from state s of FST f +// iff Reach(r) is true where r = Relabel(l). The relabeling is +// required to ensure a compact representation of the reachable +// labels. + +// The whole FST can be relabeled instead with Relabel(&f, +// reach_input) so that the test Reach(r) applies directly to the +// labels of the transformed FST f. The relabeled FST will also be +// sorted appropriately for composition. +// +// Reachablity of a final state from state s (via an epsilon path) +// can be tested with ReachFinal(); +// +// Reachability can also be tested on the set of labels specified by +// an arc iterator, useful for FST composition. In particular, +// Reach(aiter, ...) is true if labels on the input (output) side of +// the transitions of the arc iterator, when iter_input is true +// (false), can be reached from the state s. The iterator labels must +// have already been relabeled. +// +// With the arc iterator test of reachability, the begin position, end +// position and accumulated arc weight of the matches can be +// returned. The optional template argument controls how reachable arc +// weights are accumulated. The default uses the semiring +// Plus(). Alternative ones can be used to distribute the weights in +// composition in various ways. +template <class A, class S = DefaultAccumulator<A> > +class LabelReachable { + public: + typedef A Arc; + typedef typename A::StateId StateId; + typedef typename A::Label Label; + typedef typename A::Weight Weight; + typedef typename IntervalSet<Label>::Interval Interval; + + LabelReachable(const Fst<A> &fst, bool reach_input, S *s = 0, + bool keep_relabel_data = true) + : fst_(new VectorFst<Arc>(fst)), + s_(kNoStateId), + data_(new LabelReachableData<Label>(reach_input, keep_relabel_data)), + accumulator_(s ? s : new S()), + ncalls_(0), + nintervals_(0), + error_(false) { + StateId ins = fst_->NumStates(); + TransformFst(); + FindIntervals(ins); + delete fst_; + } + + explicit LabelReachable(LabelReachableData<Label> *data, S *s = 0) + : fst_(0), + s_(kNoStateId), + data_(data), + accumulator_(s ? s : new S()), + ncalls_(0), + nintervals_(0), + error_(false) { + data_->IncrRefCount(); + } + + LabelReachable(const LabelReachable<A, S> &reachable) : + fst_(0), + s_(kNoStateId), + data_(reachable.data_), + accumulator_(new S(*reachable.accumulator_)), + ncalls_(0), + nintervals_(0), + error_(reachable.error_) { + data_->IncrRefCount(); + } + + ~LabelReachable() { + if (!data_->DecrRefCount()) + delete data_; + delete accumulator_; + if (ncalls_ > 0) { + VLOG(2) << "# of calls: " << ncalls_; + VLOG(2) << "# of intervals/call: " << (nintervals_ / ncalls_); + } + } + + // Relabels w.r.t labels that give compact label sets. + Label Relabel(Label label) { + if (label == 0 || error_) + return label; + unordered_map<Label, Label> &label2index = *data_->Label2Index(); + Label &relabel = label2index[label]; + if (!relabel) // Add new label + relabel = label2index.size() + 1; + return relabel; + } + + // Relabels Fst w.r.t to labels that give compact label sets. + void Relabel(MutableFst<Arc> *fst, bool relabel_input) { + for (StateIterator< MutableFst<Arc> > siter(*fst); + !siter.Done(); siter.Next()) { + StateId s = siter.Value(); + for (MutableArcIterator< MutableFst<Arc> > aiter(fst, s); + !aiter.Done(); + aiter.Next()) { + Arc arc = aiter.Value(); + if (relabel_input) + arc.ilabel = Relabel(arc.ilabel); + else + arc.olabel = Relabel(arc.olabel); + aiter.SetValue(arc); + } + } + if (relabel_input) { + ArcSort(fst, ILabelCompare<Arc>()); + fst->SetInputSymbols(0); + } else { + ArcSort(fst, OLabelCompare<Arc>()); + fst->SetOutputSymbols(0); + } + } + + // Returns relabeling pairs (cf. relabel.h::Relabel()). + // If 'avoid_collisions' is true, extra pairs are added to + // ensure no collisions when relabeling automata that have + // labels unseen here. + void RelabelPairs(vector<pair<Label, Label> > *pairs, + bool avoid_collisions = false) { + pairs->clear(); + unordered_map<Label, Label> &label2index = *data_->Label2Index(); + // Maps labels to their new values in [1, label2index().size()] + for (typename unordered_map<Label, Label>::const_iterator + it = label2index.begin(); it != label2index.end(); ++it) + if (it->second != data_->FinalLabel()) + pairs->push_back(pair<Label, Label>(it->first, it->second)); + if (avoid_collisions) { + // Ensures any label in [1, label2index().size()] is mapped either + // by the above step or to label2index() + 1 (to avoid collisions). + for (int i = 1; i <= label2index.size(); ++i) { + typename unordered_map<Label, Label>::const_iterator + it = label2index.find(i); + if (it == label2index.end() || it->second == data_->FinalLabel()) + pairs->push_back(pair<Label, Label>(i, label2index.size() + 1)); + } + } + } + + // Set current state. Optionally set state associated + // with arc iterator to be passed to Reach. + void SetState(StateId s, StateId aiter_s = kNoStateId) { + s_ = s; + if (aiter_s != kNoStateId) { + accumulator_->SetState(aiter_s); + if (accumulator_->Error()) error_ = true; + } + } + + // Can reach this label from current state? + // Original labels must be transformed by the Relabel methods above. + bool Reach(Label label) { + if (label == 0 || error_) + return false; + vector< IntervalSet<Label> > &isets = *data_->IntervalSets(); + return isets[s_].Member(label); + + } + + // Can reach final state (via epsilon transitions) from this state? + bool ReachFinal() { + if (error_) return false; + vector< IntervalSet<Label> > &isets = *data_->IntervalSets(); + return isets[s_].Member(data_->FinalLabel()); + } + + // Initialize with secondary FST to be used with Reach(Iterator,...). + // If copy is true, then 'fst' is a copy of the FST used in the + // previous call to this method (useful to avoid unnecessary updates). + template <class F> + void ReachInit(const F &fst, bool copy = false) { + accumulator_->Init(fst, copy); + if (accumulator_->Error()) error_ = true; + } + + // Can reach any arc iterator label between iterator positions + // aiter_begin and aiter_end? If aiter_input = true, then iterator + // input labels are considered, o.w. output labels are considered. + // Arc iterator labels must be transformed by the Relabel methods + // above. If compute_weight is true, user may call ReachWeight(). + template <class Iterator> + bool Reach(Iterator *aiter, ssize_t aiter_begin, + ssize_t aiter_end, bool aiter_input, bool compute_weight) { + if (error_) return false; + vector< IntervalSet<Label> > &isets = *data_->IntervalSets(); + const vector<Interval> *intervals = isets[s_].Intervals(); + ++ncalls_; + nintervals_ += intervals->size(); + + reach_begin_ = -1; + reach_end_ = -1; + reach_weight_ = Weight::Zero(); + + uint32 flags = aiter->Flags(); // save flags to restore them on exit + aiter->SetFlags(kArcNoCache, kArcNoCache); // make caching optional + aiter->Seek(aiter_begin); + + if (2 * (aiter_end - aiter_begin) < intervals->size()) { + // Check each arc against intervals. + // Set arc iterator flags to only compute the ilabel or olabel values, + // since they are the only values required for most of the arcs processed. + aiter->SetFlags(aiter_input ? kArcILabelValue : kArcOLabelValue, + kArcValueFlags); + Label reach_label = kNoLabel; + for (ssize_t aiter_pos = aiter_begin; + aiter_pos < aiter_end; aiter->Next(), ++aiter_pos) { + const A &arc = aiter->Value(); + Label label = aiter_input ? arc.ilabel : arc.olabel; + if (label == reach_label || Reach(label)) { + reach_label = label; + if (reach_begin_ < 0) + reach_begin_ = aiter_pos; + reach_end_ = aiter_pos + 1; + if (compute_weight) { + if (!(aiter->Flags() & kArcWeightValue)) { + // If the 'arc.weight' wasn't computed by the call + // to 'aiter->Value()' above, we need to call + // 'aiter->Value()' again after having set the arc iterator + // flags to compute the arc weight value. + aiter->SetFlags(kArcWeightValue, kArcValueFlags); + const A &arcb = aiter->Value(); + // Call the accumulator. + reach_weight_ = accumulator_->Sum(reach_weight_, arcb.weight); + // Only ilabel or olabel required to process the following + // arcs. + aiter->SetFlags(aiter_input ? kArcILabelValue : kArcOLabelValue, + kArcValueFlags); + } else { + // Call the accumulator. + reach_weight_ = accumulator_->Sum(reach_weight_, arc.weight); + } + } + } + } + } else { + // Check each interval against arcs + ssize_t begin_low, end_low = aiter_begin; + for (typename vector<Interval>::const_iterator + iiter = intervals->begin(); + iiter != intervals->end(); ++iiter) { + begin_low = LowerBound(aiter, end_low, aiter_end, + aiter_input, iiter->begin); + end_low = LowerBound(aiter, begin_low, aiter_end, + aiter_input, iiter->end); + if (end_low - begin_low > 0) { + if (reach_begin_ < 0) + reach_begin_ = begin_low; + reach_end_ = end_low; + if (compute_weight) { + aiter->SetFlags(kArcWeightValue, kArcValueFlags); + reach_weight_ = accumulator_->Sum(reach_weight_, aiter, + begin_low, end_low); + } + } + } + } + + aiter->SetFlags(flags, kArcFlags); // restore original flag values + return reach_begin_ >= 0; + } + + // Returns iterator position of first matching arc. + ssize_t ReachBegin() const { return reach_begin_; } + + // Returns iterator position one past last matching arc. + ssize_t ReachEnd() const { return reach_end_; } + + // Return the sum of the weights for matching arcs. + // Valid only if compute_weight was true in Reach() call. + Weight ReachWeight() const { return reach_weight_; } + + // Access to the relabeling map. Excludes epsilon (0) label but + // includes kNoLabel that is used internally for super-final + // transitons. + const unordered_map<Label, Label>& Label2Index() const { + return *data_->Label2Index(); + } + + LabelReachableData<Label> *GetData() const { return data_; } + + bool Error() const { return error_ || accumulator_->Error(); } + + private: + // Redirects labeled arcs (input or output labels determined by + // ReachInput()) to new label-specific final states. Each original + // final state is redirected via a transition labeled with kNoLabel + // to a new kNoLabel-specific final state. Creates super-initial + // state for all states with zero in-degree. + void TransformFst() { + StateId ins = fst_->NumStates(); + StateId ons = ins; + + vector<ssize_t> indeg(ins, 0); + + // Redirects labeled arcs to new final states. + for (StateId s = 0; s < ins; ++s) { + for (MutableArcIterator< VectorFst<Arc> > aiter(fst_, s); + !aiter.Done(); + aiter.Next()) { + Arc arc = aiter.Value(); + Label label = data_->ReachInput() ? arc.ilabel : arc.olabel; + if (label) { + if (label2state_.find(label) == label2state_.end()) { + label2state_[label] = ons; + indeg.push_back(0); + ++ons; + } + arc.nextstate = label2state_[label]; + aiter.SetValue(arc); + } + ++indeg[arc.nextstate]; // Finds in-degrees for next step. + } + + // Redirects final weights to new final state. + Weight final = fst_->Final(s); + if (final != Weight::Zero()) { + if (label2state_.find(kNoLabel) == label2state_.end()) { + label2state_[kNoLabel] = ons; + indeg.push_back(0); + ++ons; + } + Arc arc(kNoLabel, kNoLabel, final, label2state_[kNoLabel]); + fst_->AddArc(s, arc); + ++indeg[arc.nextstate]; // Finds in-degrees for next step. + + fst_->SetFinal(s, Weight::Zero()); + } + } + + // Add new final states to Fst. + while (fst_->NumStates() < ons) { + StateId s = fst_->AddState(); + fst_->SetFinal(s, Weight::One()); + } + + // Creates a super-initial state for all states with zero in-degree. + StateId start = fst_->AddState(); + fst_->SetStart(start); + for (StateId s = 0; s < start; ++s) { + if (indeg[s] == 0) { + Arc arc(0, 0, Weight::One(), s); + fst_->AddArc(start, arc); + } + } + } + + void FindIntervals(StateId ins) { + StateReachable<A, Label> state_reachable(*fst_); + if (state_reachable.Error()) { + error_ = true; + return; + } + + vector<Label> &state2index = state_reachable.State2Index(); + vector< IntervalSet<Label> > &isets = *data_->IntervalSets(); + isets = state_reachable.IntervalSets(); + isets.resize(ins); + + unordered_map<Label, Label> &label2index = *data_->Label2Index(); + for (typename unordered_map<Label, StateId>::const_iterator + it = label2state_.begin(); + it != label2state_.end(); + ++it) { + Label l = it->first; + StateId s = it->second; + Label i = state2index[s]; + label2index[l] = i; + } + label2state_.clear(); + + double nintervals = 0; + ssize_t non_intervals = 0; + for (ssize_t s = 0; s < ins; ++s) { + nintervals += isets[s].Size(); + if (isets[s].Size() > 1) { + ++non_intervals; + VLOG(3) << "state: " << s << " # of intervals: " << isets[s].Size(); + } + } + VLOG(2) << "# of states: " << ins; + VLOG(2) << "# of intervals: " << nintervals; + VLOG(2) << "# of intervals/state: " << nintervals/ins; + VLOG(2) << "# of non-interval states: " << non_intervals; + } + + template <class Iterator> + ssize_t LowerBound(Iterator *aiter, ssize_t aiter_begin, + ssize_t aiter_end, bool aiter_input, + Label match_label) const { + // Only need to compute the ilabel or olabel of arcs when + // performing the binary search. + aiter->SetFlags(aiter_input ? kArcILabelValue : kArcOLabelValue, + kArcValueFlags); + ssize_t low = aiter_begin; + ssize_t high = aiter_end; + while (low < high) { + ssize_t mid = (low + high) / 2; + aiter->Seek(mid); + Label label = aiter_input ? + aiter->Value().ilabel : aiter->Value().olabel; + if (label > match_label) { + high = mid; + } else if (label < match_label) { + low = mid + 1; + } else { + // Find first matching label (when non-deterministic) + for (ssize_t i = mid; i > low; --i) { + aiter->Seek(i - 1); + label = aiter_input ? aiter->Value().ilabel : aiter->Value().olabel; + if (label != match_label) { + aiter->Seek(i); + aiter->SetFlags(kArcValueFlags, kArcValueFlags); + return i; + } + } + aiter->SetFlags(kArcValueFlags, kArcValueFlags); + return low; + } + } + aiter->Seek(low); + aiter->SetFlags(kArcValueFlags, kArcValueFlags); + return low; + } + + VectorFst<Arc> *fst_; + StateId s_; // Current state + unordered_map<Label, StateId> label2state_; // Finds final state for a label + + ssize_t reach_begin_; // Iterator pos of first match + ssize_t reach_end_; // Iterator pos after last match + Weight reach_weight_; // Gives weight sum of arc iterator + // arcs with reachable labels. + LabelReachableData<Label> *data_; // Shareable data between copies + S *accumulator_; // Sums arc weights + + double ncalls_; + double nintervals_; + bool error_; + + void operator=(const LabelReachable<A, S> &); // Disallow +}; + +} // namespace fst + +#endif // FST_LIB_LABEL_REACHABLE_H__ |