diff options
author | Determinant <[email protected]> | 2015-08-14 11:51:42 +0800 |
---|---|---|
committer | Determinant <[email protected]> | 2015-08-14 11:51:42 +0800 |
commit | 96a32415ab43377cf1575bd3f4f2980f58028209 (patch) | |
tree | 30a2d92d73e8f40ac87b79f6f56e227bfc4eea6e /kaldi_io/src/kaldi/hmm | |
parent | c177a7549bd90670af4b29fa813ddea32cfe0f78 (diff) |
add implementation for kaldi io (by ymz)
Diffstat (limited to 'kaldi_io/src/kaldi/hmm')
-rw-r--r-- | kaldi_io/src/kaldi/hmm/hmm-topology.h | 172 | ||||
-rw-r--r-- | kaldi_io/src/kaldi/hmm/hmm-utils.h | 295 | ||||
-rw-r--r-- | kaldi_io/src/kaldi/hmm/posterior.h | 214 | ||||
-rw-r--r-- | kaldi_io/src/kaldi/hmm/transition-model.h | 345 | ||||
-rw-r--r-- | kaldi_io/src/kaldi/hmm/tree-accu.h | 69 |
5 files changed, 1095 insertions, 0 deletions
diff --git a/kaldi_io/src/kaldi/hmm/hmm-topology.h b/kaldi_io/src/kaldi/hmm/hmm-topology.h new file mode 100644 index 0000000..53ca427 --- /dev/null +++ b/kaldi_io/src/kaldi/hmm/hmm-topology.h @@ -0,0 +1,172 @@ +// hmm/hmm-topology.h + +// Copyright 2009-2011 Microsoft Corporation + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#ifndef KALDI_HMM_HMM_TOPOLOGY_H_ +#define KALDI_HMM_HMM_TOPOLOGY_H_ + +#include "base/kaldi-common.h" +#include "tree/context-dep.h" +#include "util/const-integer-set.h" + + +namespace kaldi { + + +/// \addtogroup hmm_group +/// @{ + +/* + // The following would be the text form for the "normal" HMM topology. + // Note that the first state is the start state, and the final state, + // which must have no output transitions and must be nonemitting, has + // an exit probability of one (no other state can have nonzero exit + // probability; you can treat the transition probability to the final + // state as an exit probability). + // Note also that it's valid to omit the "<PdfClass>" entry of the <State>, which + // will mean we won't have a pdf on that state [non-emitting state]. This is equivalent + // to setting the <PdfClass> to -1. We do this normally just for the final state. + // The Topology object can have multiple <TopologyEntry> blocks. + // This is useful if there are multiple types of topology in the system. + + <Topology> + <TopologyEntry> + <ForPhones> 1 2 3 4 5 6 7 8 </ForPhones> + <State> 0 <PdfClass> 0 + <Transition> 0 0.5 + <Transition> 1 0.5 + </State> + <State> 1 <PdfClass> 1 + <Transition> 1 0.5 + <Transition> 2 0.5 + </State> + <State> 2 <PdfClass> 2 + <Transition> 2 0.5 + <Transition> 3 0.5 + <Final> 0.5 + </State> + <State> 3 + </State> + </TopologyEntry> + </Topology> +*/ + +// kNoPdf is used where pdf_class or pdf would be used, to indicate, +// none is there. Mainly useful in skippable models, but also used +// for end states. +// A caveat with nonemitting states is that their out-transitions +// are not trainable, due to technical issues with the way +// we decided to accumulate the stats. Any transitions arising from (*) +// HMM states with "kNoPdf" as the label are second-class transitions, +// They do not have "transition-states" or "transition-ids" associated +// with them. They are used to create the FST version of the +// HMMs, where they lead to epsilon arcs. +// (*) "arising from" is a bit of a technical term here, due to the way +// (if reorder == true), we put the transition-id associated with the +// outward arcs of the state, on the input transition to the state. + +/// A constant used in the HmmTopology class as the \ref pdf_class "pdf-class" +/// kNoPdf, which is used when a HMM-state is nonemitting (has no associated +/// PDF). + +static const int32 kNoPdf = -1; + +/// A class for storing topology information for phones. See \ref hmm for context. +/// This object is sometimes accessed in a file by itself, but more often +/// as a class member of the Transition class (this is for convenience to reduce +/// the number of files programs have to access). + +class HmmTopology { + public: + /// A structure defined inside HmmTopology to represent a HMM state. + struct HmmState { + /// The \ref pdf_class pdf-class, typically 0, 1 or 2 (the same as the HMM-state index), + /// but may be different to enable us to hardwire sharing of state, and may be + /// equal to \ref kNoPdf == -1 in order to specify nonemitting states (unusual). + int32 pdf_class; + + /// A list of transitions. The first member of each pair is the index of + /// the next HmmState, and the second is the default transition probability + /// (before training). + std::vector<std::pair<int32, BaseFloat> > transitions; + + explicit HmmState(int32 p): pdf_class(p) { } + + bool operator == (const HmmState &other) const { + return (pdf_class == other.pdf_class && transitions == other.transitions); + } + + HmmState(): pdf_class(-1) { } + }; + + /// TopologyEntry is a typedef that represents the topology of + /// a single (prototype) state. + typedef std::vector<HmmState> TopologyEntry; + + void Read(std::istream &is, bool binary); + void Write(std::ostream &os, bool binary) const; + + // Checks that the object is valid, and throw exception otherwise. + void Check(); + + + /// Returns the topology entry (i.e. vector of HmmState) for this phone; + /// will throw exception if phone not covered by the topology. + const TopologyEntry &TopologyForPhone(int32 phone) const; + + /// Returns the number of \ref pdf_class "pdf-classes" for this phone; + /// throws exception if phone not covered by this topology. + int32 NumPdfClasses(int32 phone) const; + + /// Returns a reference to a sorted, unique list of phones covered by + /// the topology (these phones will be positive integers, and usually + /// contiguous and starting from one but the toolkit doesn't assume + /// they are contiguous). + const std::vector<int32> &GetPhones() const { return phones_; }; + + /// Outputs a vector of int32, indexed by phone, that gives the + /// number of \ref pdf_class pdf-classes for the phones; this is + /// used by tree-building code such as BuildTree(). + void GetPhoneToNumPdfClasses(std::vector<int32> *phone2num_pdf_classes) const; + + HmmTopology() {} + + bool operator == (const HmmTopology &other) const { + return phones_ == other.phones_ && phone2idx_ == other.phone2idx_ + && entries_ == other.entries_; + } + // Allow default assignment operator and copy constructor. + private: + std::vector<int32> phones_; // list of all phones we have topology for. Sorted, uniq. no epsilon (zero) phone. + std::vector<int32> phone2idx_; // map from phones to indexes into the entries vector (or -1 for not present). + std::vector<TopologyEntry> entries_; +}; + + +/// This function returns a HmmTopology object giving a normal 3-state topology, +/// covering all phones in the list "phones". This is mainly of use in testing +/// code. +HmmTopology GetDefaultTopology(const std::vector<int32> &phones); + +/// @} end "addtogroup hmm_group" + + +} // end namespace kaldi + + +#endif diff --git a/kaldi_io/src/kaldi/hmm/hmm-utils.h b/kaldi_io/src/kaldi/hmm/hmm-utils.h new file mode 100644 index 0000000..240f706 --- /dev/null +++ b/kaldi_io/src/kaldi/hmm/hmm-utils.h @@ -0,0 +1,295 @@ +// hmm/hmm-utils.h + +// Copyright 2009-2011 Microsoft Corporation + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#ifndef KALDI_HMM_HMM_UTILS_H_ +#define KALDI_HMM_HMM_UTILS_H_ + +#include "hmm/hmm-topology.h" +#include "hmm/transition-model.h" +#include "lat/kaldi-lattice.h" + +namespace kaldi { + + +/// \defgroup hmm_group_graph Classes and functions for creating FSTs from HMMs +/// \ingroup hmm_group +/// @{ + +/// Configuration class for the GetHTransducer() function; see +/// \ref hmm_graph_config for context. +struct HTransducerConfig { + /// Transition log-prob scale, see \ref hmm_scale. + /// Note this doesn't apply to self-loops; GetHTransducer() does + /// not include self-loops. + BaseFloat transition_scale; + + /// if true, we are constructing time-reversed FST: phone-seqs in ilabel_info + /// are backwards, and we want to output a backwards version of the HMM + /// corresponding to each phone. If reverse == true, + bool reverse; + + /// This variable is only looked at if reverse == true. If reverse == true + /// and push_weights == true, then we push the weights in the reversed FSTs we create for each + /// phone HMM. This is only safe if the HMMs are probabilistic (i.e. not discriminatively + bool push_weights; + + /// delta used if we do push_weights [only relevant if reverse == true + /// and push_weights == true]. + BaseFloat push_delta; + + HTransducerConfig(): + transition_scale(1.0), + reverse(false), + push_weights(true), + push_delta(0.001) + { } + + // Note-- this Register registers the easy-to-register options + // but not the "sym_type" which is an enum and should be handled + // separately in main(). + void Register (OptionsItf *po) { + po->Register("transition-scale", &transition_scale, + "Scale of transition probs (relative to LM)"); + po->Register("reverse", &reverse, + "Set true to build time-reversed FST."); + po->Register("push-weights", &push_weights, + "Push weights (only applicable if reverse == true)"); + po->Register("push-delta", &push_delta, + "Delta used in pushing weights (only applicable if " + "reverse && push-weights"); + } +}; + + +struct HmmCacheHash { + int operator () (const std::pair<int32, std::vector<int32> >&p) const { + VectorHasher<int32> v; + int32 prime = 103049; + return prime*p.first + v(p.second); + } +}; + +/// HmmCacheType is a map from (central-phone, sequence of pdf-ids) to FST, used +/// as cache in GetHmmAsFst, as an optimization. +typedef unordered_map<std::pair<int32, std::vector<int32> >, + fst::VectorFst<fst::StdArc>*, + HmmCacheHash> HmmCacheType; + + +/// Called by GetHTransducer() and probably will not need to be called directly; +/// it creates the FST corresponding to the phone. Does not include self-loops; +/// you have to call AddSelfLoops() for that. Result owned by caller. +/// Returns an acceptor (i.e. ilabels, olabels identical) with transition-ids +/// as the symbols. +/// For documentation in context, see \ref hmm_graph_get_hmm_as_fst +/// @param context_window A vector representing the phonetic context; see +/// \ref tree_window "here" for explanation. +/// @param ctx_dep The object that contains the phonetic decision-tree +/// @param trans_model The transition-model object, which provides +/// the mappings to transition-ids and also the transition +/// probabilities. +/// @param config Configuration object, see \ref HTransducerConfig. +/// @param cache Object used as a lookaside buffer to save computation; +/// if it finds that the object it needs is already there, it will +/// just return a pointer value from "cache"-- not that this means +/// you have to be careful not to delete things twice. + +fst::VectorFst<fst::StdArc> *GetHmmAsFst( + std::vector<int32> context_window, + const ContextDependencyInterface &ctx_dep, + const TransitionModel &trans_model, + const HTransducerConfig &config, + HmmCacheType *cache = NULL); + +/// Included mainly as a form of documentation, not used in any other code +/// currently. Creates the FST with self-loops, and with fewer options. +fst::VectorFst<fst::StdArc>* +GetHmmAsFstSimple(std::vector<int32> context_window, + const ContextDependencyInterface &ctx_dep, + const TransitionModel &trans_model, + BaseFloat prob_scale); + + +/** + * Returns the H tranducer; result owned by caller. + * See \ref hmm_graph_get_h_transducer. The H transducer has on the + * input transition-ids, and also possibly some disambiguation symbols, which + * will be put in disambig_syms. The output side contains the identifiers that + * are indexes into "ilabel_info" (these represent phones-in-context or + * disambiguation symbols). The ilabel_info vector allows GetHTransducer to map + * from symbols to phones-in-context (i.e. phonetic context windows). Any + * singleton symbols in the ilabel_info vector which are not phones, will be + * treated as disambiguation symbols. [Not all recipes use these]. The output + * "disambig_syms_left" will be set to a list of the disambiguation symbols on + * the input of the transducer (i.e. same symbol type as whatever is on the + * input of the transducer + */ +fst::VectorFst<fst::StdArc>* +GetHTransducer (const std::vector<std::vector<int32> > &ilabel_info, + const ContextDependencyInterface &ctx_dep, + const TransitionModel &trans_model, + const HTransducerConfig &config, + std::vector<int32> *disambig_syms_left); + +/** + * GetIlabelMapping produces a mapping that's similar to HTK's logical-to-physical + * model mapping (i.e. the xwrd.clustered.mlist files). It groups together + * "logical HMMs" (i.e. in our world, phonetic context windows) that share the + * same sequence of transition-ids. This can be used in an + * optional graph-creation step that produces a remapped form of CLG that can be + * more productively determinized and minimized. This is used in the command-line program + * make-ilabel-transducer.cc. + * @param ilabel_info_old [in] The original \ref tree_ilabel "ilabel_info" vector + * @param ctx_dep [in] The tree + * @param trans_model [in] The transition-model object + * @param old2new_map [out] The output; this vector, which is of size equal to the + * number of new labels, is a mapping to the old labels such that we could + * create a vector ilabel_info_new such that + * ilabel_info_new[i] == ilabel_info_old[old2new_map[i]] + */ +void GetIlabelMapping (const std::vector<std::vector<int32> > &ilabel_info_old, + const ContextDependencyInterface &ctx_dep, + const TransitionModel &trans_model, + std::vector<int32> *old2new_map); + + + +/** + * For context, see \ref hmm_graph_add_self_loops. Expands an FST that has been + * built without self-loops, and adds the self-loops (it also needs to modify + * the probability of the non-self-loop ones, as the graph without self-loops + * was created in such a way that it was stochastic). Note that the + * disambig_syms will be empty in some recipes (e.g. if you already removed + * the disambiguation symbols). + * @param trans_model [in] Transition model + * @param disambig_syms [in] Sorted, uniq list of disambiguation symbols, required + * if the graph contains disambiguation symbols but only needed for sanity checks. + * @param self_loop_scale [in] Transition-probability scale for self-loops; c.f. + * \ref hmm_scale + * @param reorder [in] If true, reorders the transitions (see \ref hmm_reorder). + * @param fst [in, out] The FST to be modified. + */ +void AddSelfLoops(const TransitionModel &trans_model, + const std::vector<int32> &disambig_syms, // used as a check only. + BaseFloat self_loop_scale, + bool reorder, // true->dan-style, false->lukas-style. + fst::VectorFst<fst::StdArc> *fst); + +/** + * Adds transition-probs, with the supplied + * scales (see \ref hmm_scale), to the graph. + * Useful if you want to create a graph without transition probs, then possibly + * train the model (including the transition probs) but keep the graph fixed, + * and add back in the transition probs. It assumes the fst has transition-ids + * on it. It is not an error if the FST has no states (nothing will be done). + * @param trans_model [in] The transition model + * @param disambig_syms [in] A list of disambiguation symbols, required if the + * graph has disambiguation symbols on its input but only + * used for checks. + * @param transition_scale [in] A scale on transition-probabilities apart from + * those involving self-loops; see \ref hmm_scale. + * @param self_loop_scale [in] A scale on self-loop transition probabilities; + * see \ref hmm_scale. + * @param fst [in, out] The FST to be modified. + */ +void AddTransitionProbs(const TransitionModel &trans_model, + const std::vector<int32> &disambig_syms, + BaseFloat transition_scale, + BaseFloat self_loop_scale, + fst::VectorFst<fst::StdArc> *fst); + +/** + This is as AddSelfLoops(), but operates on a Lattice, where + it affects the graph part of the weight (the first element + of the pair). */ +void AddTransitionProbs(const TransitionModel &trans_model, + BaseFloat transition_scale, + BaseFloat self_loop_scale, + Lattice *lat); + + +/// Returns a transducer from pdfs plus one (input) to transition-ids (output). +/// Currenly of use only for testing. +fst::VectorFst<fst::StdArc>* +GetPdfToTransitionIdTransducer(const TransitionModel &trans_model); + +/// Converts all transition-ids in the FST to pdfs plus one. +/// Placeholder: not implemented yet! +void ConvertTransitionIdsToPdfs(const TransitionModel &trans_model, + const std::vector<int32> &disambig_syms, + fst::VectorFst<fst::StdArc> *fst); + +/// @} end "defgroup hmm_group_graph" + +/// \addtogroup hmm_group +/// @{ + +/// SplitToPhones splits up the TransitionIds in "alignment" into their +/// individual phones (one vector per instance of a phone). At output, +/// the sum of the sizes of the vectors in split_alignment will be the same +/// as the corresponding sum for "alignment". The function returns +/// true on success. If the alignment appears to be incomplete, e.g. +/// not ending at the end-state of a phone, it will still break it up into +/// phones but it will return false. For more serious errors it will +/// die or throw an exception. +/// This function works out by itself whether the graph was created +/// with "reordering" (dan-style graph), and just does the right thing. + +bool SplitToPhones(const TransitionModel &trans_model, + const std::vector<int32> &alignment, + std::vector<std::vector<int32> > *split_alignment); + +/// ConvertAlignment converts an alignment that was created using one +/// model, to another model. They must use a compatible topology (so we +/// know the state alignments of the new model). +/// It returns false if it could not be split to phones (probably +/// because the alignment was partial), but for other kinds of +/// error that are more likely a coding error, it will throw +/// an exception. +bool ConvertAlignment(const TransitionModel &old_trans_model, + const TransitionModel &new_trans_model, + const ContextDependencyInterface &new_ctx_dep, + const std::vector<int32> &old_alignment, + const std::vector<int32> *phone_map, // may be NULL + std::vector<int32> *new_alignment); + +// ConvertPhnxToProns is only needed in bin/phones-to-prons.cc and +// isn't closely related with HMMs, but we put it here as there isn't +// any other obvious place for it and it needs to be tested. +// This function takes a phone-sequence with word-start and word-end +// markers in it, and a word-sequence, and outputs the pronunciations +// "prons"... the format of "prons" is, each element is a vector, +// where the first element is the word (or zero meaning no word, e.g. +// for optional silence introduced by the lexicon), and the remaining +// elements are the phones in the word's pronunciation. +// It returns false if it encounters a problem of some kind, e.g. +// if the phone-sequence doesn't seem to have the right number of +// words in it. +bool ConvertPhnxToProns(const std::vector<int32> &phnx, + const std::vector<int32> &words, + int32 word_start_sym, + int32 word_end_sym, + std::vector<std::vector<int32> > *prons); + +/// @} end "addtogroup hmm_group" + +} // end namespace kaldi + + +#endif diff --git a/kaldi_io/src/kaldi/hmm/posterior.h b/kaldi_io/src/kaldi/hmm/posterior.h new file mode 100644 index 0000000..be73be9 --- /dev/null +++ b/kaldi_io/src/kaldi/hmm/posterior.h @@ -0,0 +1,214 @@ +// hmm/posterior.h + +// Copyright 2009-2011 Microsoft Corporation +// 2013-2014 Johns Hopkins University (author: Daniel Povey) +// 2014 Guoguo Chen + + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#ifndef KALDI_HMM_POSTERIOR_H_ +#define KALDI_HMM_POSTERIOR_H_ + +#include "base/kaldi-common.h" +#include "tree/context-dep.h" +#include "util/const-integer-set.h" +#include "util/kaldi-table.h" +#include "hmm/transition-model.h" + + +namespace kaldi { + + +/// \addtogroup posterior_group +/// @{ + +/// Posterior is a typedef for storing acoustic-state (actually, transition-id) +/// posteriors over an utterance. The "int32" is a transition-id, and the BaseFloat +/// is a probability (typically between zero and one). +typedef std::vector<std::vector<std::pair<int32, BaseFloat> > > Posterior; + +/// GaussPost is a typedef for storing Gaussian-level posteriors for an utterance. +/// the "int32" is a transition-id, and the Vector<BaseFloat> is a vector of +/// Gaussian posteriors. +/// WARNING: We changed "int32" from transition-id to pdf-id, and the change is +/// applied for all programs using GaussPost. This is for efficiency purpose. We +/// also changed the name slightly from GauPost to GaussPost to reduce the +/// chance that the change will go un-noticed in downstream code. +typedef std::vector<std::vector<std::pair<int32, Vector<BaseFloat> > > > GaussPost; + + +// PosteriorHolder is a holder for Posterior, which is +// std::vector<std::vector<std::pair<int32, BaseFloat> > > +// This is used for storing posteriors of transition id's for an +// utterance. +class PosteriorHolder { + public: + typedef Posterior T; + + PosteriorHolder() { } + + static bool Write(std::ostream &os, bool binary, const T &t); + + void Clear() { Posterior tmp; std::swap(tmp, t_); } + + // Reads into the holder. + bool Read(std::istream &is); + + // Kaldi objects always have the stream open in binary mode for + // reading. + static bool IsReadInBinary() { return true; } + + const T &Value() const { return t_; } + + private: + KALDI_DISALLOW_COPY_AND_ASSIGN(PosteriorHolder); + T t_; +}; + + +// GaussPostHolder is a holder for GaussPost, which is +// std::vector<std::vector<std::pair<int32, Vector<BaseFloat> > > > +// This is used for storing posteriors of transition id's for an +// utterance. +class GaussPostHolder { + public: + typedef GaussPost T; + + GaussPostHolder() { } + + static bool Write(std::ostream &os, bool binary, const T &t); + + void Clear() { GaussPost tmp; std::swap(tmp, t_); } + + // Reads into the holder. + bool Read(std::istream &is); + + // Kaldi objects always have the stream open in binary mode for + // reading. + static bool IsReadInBinary() { return true; } + + const T &Value() const { return t_; } + + private: + KALDI_DISALLOW_COPY_AND_ASSIGN(GaussPostHolder); + T t_; +}; + + +// Posterior is a typedef: vector<vector<pair<int32, BaseFloat> > >, +// representing posteriors over (typically) transition-ids for an +// utterance. +typedef TableWriter<PosteriorHolder> PosteriorWriter; +typedef SequentialTableReader<PosteriorHolder> SequentialPosteriorReader; +typedef RandomAccessTableReader<PosteriorHolder> RandomAccessPosteriorReader; + + +// typedef std::vector<std::vector<std::pair<int32, Vector<BaseFloat> > > > GaussPost; +typedef TableWriter<GaussPostHolder> GaussPostWriter; +typedef SequentialTableReader<GaussPostHolder> SequentialGaussPostReader; +typedef RandomAccessTableReader<GaussPostHolder> RandomAccessGaussPostReader; + + +/// Scales the BaseFloat (weight) element in the posterior entries. +void ScalePosterior(BaseFloat scale, Posterior *post); + +/// Returns the total of all the weights in "post". +BaseFloat TotalPosterior(const Posterior &post); + +/// Returns true if the two lists of pairs have no common .first element. +bool PosteriorEntriesAreDisjoint( + const std::vector<std::pair<int32, BaseFloat> > &post_elem1, + const std::vector<std::pair<int32, BaseFloat> > &post_elem2); + + +/// Merge two sets of posteriors, which must have the same length. If "merge" +/// is true, it will make a common entry whenever there are duplicated entries, +/// adding up the weights. If "drop_frames" is true, for frames where the +/// two sets of posteriors were originally disjoint, makes no entries for that +/// frame (relates to frame dropping, or drop_frames, see Vesely et al, ICASSP +/// 2013). Returns the number of frames for which the two posteriors were +/// disjoint (i.e. no common transition-ids or whatever index we are using). +int32 MergePosteriors(const Posterior &post1, + const Posterior &post2, + bool merge, + bool drop_frames, + Posterior *post); + +/// Given a vector of log-likelihoods (typically of Gaussians in a GMM +/// but could be of pdf-ids), a number gselect >= 1 and a minimum posterior +/// 0 <= min_post < 1, it gets the posterior for each element of log-likes +/// by applying Softmax(), then prunes the posteriors using "gselect" and +/// "min_post" (keeping at least one), and outputs the result into +/// "post_entry", sorted from greatest to least posterior. +/// Returns the total log-likelihood (the output of calling ApplySoftMax() +/// on a copy of log_likes). +BaseFloat VectorToPosteriorEntry( + const VectorBase<BaseFloat> &log_likes, + int32 num_gselect, + BaseFloat min_post, + std::vector<std::pair<int32, BaseFloat> > *post_entry); + +/// Convert an alignment to a posterior (with a scale of 1.0 on +/// each entry). +void AlignmentToPosterior(const std::vector<int32> &ali, + Posterior *post); + +/// Sorts posterior entries so that transition-ids with same pdf-id are next to +/// each other. +void SortPosteriorByPdfs(const TransitionModel &tmodel, + Posterior *post); + + +/// Converts a posterior over transition-ids to be a posterior +/// over pdf-ids. +void ConvertPosteriorToPdfs(const TransitionModel &tmodel, + const Posterior &post_in, + Posterior *post_out); + +/// Converts a posterior over transition-ids to be a posterior +/// over phones. +void ConvertPosteriorToPhones(const TransitionModel &tmodel, + const Posterior &post_in, + Posterior *post_out); + +/// Weight any silence phones in the posterior (i.e. any phones +/// in the set "silence_set" by scale "silence_scale". +/// The interface was changed in Feb 2014 to do the modification +/// "in-place" rather than having separate input and output. +void WeightSilencePost(const TransitionModel &trans_model, + const ConstIntegerSet<int32> &silence_set, + BaseFloat silence_scale, + Posterior *post); + +/// This is similar to WeightSilencePost, except that on each frame it +/// works out the amount by which the overall posterior would be reduced, +/// and scales down everything on that frame by the same amount. It +/// has the effect that frames that are mostly silence get down-weighted. +/// The interface was changed in Feb 2014 to do the modification +/// "in-place" rather than having separate input and output. +void WeightSilencePostDistributed(const TransitionModel &trans_model, + const ConstIntegerSet<int32> &silence_set, + BaseFloat silence_scale, + Posterior *post); + +/// @} end "addtogroup posterior_group" + + +} // end namespace kaldi + + +#endif diff --git a/kaldi_io/src/kaldi/hmm/transition-model.h b/kaldi_io/src/kaldi/hmm/transition-model.h new file mode 100644 index 0000000..ccc4f11 --- /dev/null +++ b/kaldi_io/src/kaldi/hmm/transition-model.h @@ -0,0 +1,345 @@ +// hmm/transition-model.h + +// Copyright 2009-2012 Microsoft Corporation +// Johns Hopkins University (author: Guoguo Chen) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#ifndef KALDI_HMM_TRANSITION_MODEL_H_ +#define KALDI_HMM_TRANSITION_MODEL_H_ + +#include "base/kaldi-common.h" +#include "tree/context-dep.h" +#include "util/const-integer-set.h" +#include "fst/fst-decl.h" // forward declarations. +#include "hmm/hmm-topology.h" +#include "itf/options-itf.h" + +namespace kaldi { + +/// \addtogroup hmm_group +/// @{ + +// The class TransitionModel is a repository for the transition probabilities. +// It also handles certain integer mappings. +// The basic model is as follows. Each phone has a HMM topology defined in +// hmm-topology.h. Each HMM-state of each of these phones has a number of +// transitions (and final-probs) out of it. Each HMM-state defined in the +// HmmTopology class has an associated "pdf_class". This gets replaced with +// an actual pdf-id via the tree. The transition model associates the +// transition probs with the (phone, HMM-state, pdf-id). We associate with +// each such triple a transition-state. Each +// transition-state has a number of associated probabilities to estimate; +// this depends on the number of transitions/final-probs in the topology for +// that (phone, HMM-state). Each probability has an associated transition-index. +// We associate with each (transition-state, transition-index) a unique transition-id. +// Each individual probability estimated by the transition-model is asociated with a +// transition-id. +// +// List of the various types of quantity referred to here and what they mean: +// phone: a phone index (1, 2, 3 ...) +// HMM-state: a number (0, 1, 2...) that indexes TopologyEntry (see hmm-topology.h) +// pdf-id: a number output by the Compute function of ContextDependency (it +// indexes pdf's). Zero-based. +// transition-state: the states for which we estimate transition probabilities for transitions +// out of them. In some topologies, will map one-to-one with pdf-ids. +// One-based, since it appears on FSTs. +// transition-index: identifier of a transition (or final-prob) in the HMM. Indexes the +// "transitions" vector in HmmTopology::HmmState. [if it is out of range, +// equal to transitions.size(), it refers to the final-prob.] +// Zero-based. +// transition-id: identifier of a unique parameter of the TransitionModel. +// Associated with a (transition-state, transition-index) pair. +// One-based, since it appears on FSTs. +// +// List of the possible mappings TransitionModel can do: +// (phone, HMM-state, pdf-id) -> transition-state +// (transition-state, transition-index) -> transition-id +// Reverse mappings: +// transition-id -> transition-state +// transition-id -> transition-index +// transition-state -> phone +// transition-state -> HMM-state +// transition-state -> pdf-id +// +// The main things the TransitionModel object can do are: +// Get initialized (need ContextDependency and HmmTopology objects). +// Read/write. +// Update [given a vector of counts indexed by transition-id]. +// Do the various integer mappings mentioned above. +// Get the probability (or log-probability) associated with a particular transition-id. + + +// Note: this was previously called TransitionUpdateConfig. +struct MleTransitionUpdateConfig { + BaseFloat floor; + BaseFloat mincount; + bool share_for_pdfs; // If true, share all transition parameters that have the same pdf. + MleTransitionUpdateConfig(BaseFloat floor = 0.01, + BaseFloat mincount = 5.0, + bool share_for_pdfs = false): + floor(floor), mincount(mincount), share_for_pdfs(share_for_pdfs) {} + + void Register (OptionsItf *po) { + po->Register("transition-floor", &floor, + "Floor for transition probabilities"); + po->Register("transition-min-count", &mincount, + "Minimum count required to update transitions from a state"); + po->Register("share-for-pdfs", &share_for_pdfs, + "If true, share all transition parameters where the states " + "have the same pdf."); + } +}; + +struct MapTransitionUpdateConfig { + BaseFloat tau; + bool share_for_pdfs; // If true, share all transition parameters that have the same pdf. + MapTransitionUpdateConfig(): tau(5.0), share_for_pdfs(false) { } + + void Register (OptionsItf *po) { + po->Register("transition-tau", &tau, "Tau value for MAP estimation of transition " + "probabilities."); + po->Register("share-for-pdfs", &share_for_pdfs, + "If true, share all transition parameters where the states " + "have the same pdf."); + } +}; + +class TransitionModel { + + public: + /// Initialize the object [e.g. at the start of training]. + /// The class keeps a copy of the HmmTopology object, but not + /// the ContextDependency object. + TransitionModel(const ContextDependency &ctx_dep, + const HmmTopology &hmm_topo); + + + /// Constructor that takes no arguments: typically used prior to calling Read. + TransitionModel() { } + + void Read(std::istream &is, bool binary); // note, no symbol table: topo object always read/written w/o symbols. + void Write(std::ostream &os, bool binary) const; + + + /// return reference to HMM-topology object. + const HmmTopology &GetTopo() const { return topo_; } + + /// \name Integer mapping functions + /// @{ + + int32 TripleToTransitionState(int32 phone, int32 hmm_state, int32 pdf) const; + int32 PairToTransitionId(int32 trans_state, int32 trans_index) const; + int32 TransitionIdToTransitionState(int32 trans_id) const; + int32 TransitionIdToTransitionIndex(int32 trans_id) const; + int32 TransitionStateToPhone(int32 trans_state) const; + int32 TransitionStateToHmmState(int32 trans_state) const; + int32 TransitionStateToPdf(int32 trans_state) const; + int32 SelfLoopOf(int32 trans_state) const; // returns the self-loop transition-id, or zero if + // this state doesn't have a self-loop. + + inline int32 TransitionIdToPdf(int32 trans_id) const; + int32 TransitionIdToPhone(int32 trans_id) const; + int32 TransitionIdToPdfClass(int32 trans_id) const; + int32 TransitionIdToHmmState(int32 trans_id) const; + + /// @} + + bool IsFinal(int32 trans_id) const; // returns true if this trans_id goes to the final state + // (which is bound to be nonemitting). + bool IsSelfLoop(int32 trans_id) const; // return true if this trans_id corresponds to a self-loop. + + /// Returns the total number of transition-ids (note, these are one-based). + inline int32 NumTransitionIds() const { return id2state_.size()-1; } + + /// Returns the number of transition-indices for a particular transition-state. + /// Note: "Indices" is the plural of "index". Index is not the same as "id", + /// here. A transition-index is a zero-based offset into the transitions + /// out of a particular transition state. + int32 NumTransitionIndices(int32 trans_state) const; + + /// Returns the total number of transition-states (note, these are one-based). + int32 NumTransitionStates() const { return triples_.size(); } + + // NumPdfs() actually returns the highest-numbered pdf we ever saw, plus one. + // In normal cases this should equal the number of pdfs in the system, but if you + // initialized this object with fewer than all the phones, and it happens that + // an unseen phone has the highest-numbered pdf, this might be different. + int32 NumPdfs() const { return num_pdfs_; } + + // This loops over the triples and finds the highest phone index present. If + // the FST symbol table for the phones is created in the expected way, i.e.: + // starting from 1 (<eps> is 0) and numbered contiguously till the last phone, + // this will be the total number of phones. + int32 NumPhones() const; + + /// Returns a sorted, unique list of phones. + const std::vector<int32> &GetPhones() const { return topo_.GetPhones(); } + + // Transition-parameter-getting functions: + BaseFloat GetTransitionProb(int32 trans_id) const; + BaseFloat GetTransitionLogProb(int32 trans_id) const; + + // The following functions are more specialized functions for getting + // transition probabilities, that are provided for convenience. + + /// Returns the log-probability of a particular non-self-loop transition + /// after subtracting the probability mass of the self-loop and renormalizing; + /// will crash if called on a self-loop. Specifically: + /// for non-self-loops it returns the log of that prob divided by (1 minus + /// self-loop-prob-for-that-state). + BaseFloat GetTransitionLogProbIgnoringSelfLoops(int32 trans_id) const; + + /// Returns the log-prob of the non-self-loop probability + /// mass for this transition state. (you can get the self-loop prob, if a self-loop + /// exists, by calling GetTransitionLogProb(SelfLoopOf(trans_state)). + BaseFloat GetNonSelfLoopLogProb(int32 trans_state) const; + + /// Does Maximum Likelihood estimation. The stats are counts/weights, indexed + /// by transition-id. This was previously called Update(). + void MleUpdate(const Vector<double> &stats, + const MleTransitionUpdateConfig &cfg, + BaseFloat *objf_impr_out, + BaseFloat *count_out); + + /// Does Maximum A Posteriori (MAP) estimation. The stats are counts/weights, + /// indexed by transition-id. + void MapUpdate(const Vector<double> &stats, + const MapTransitionUpdateConfig &cfg, + BaseFloat *objf_impr_out, + BaseFloat *count_out); + + /// Print will print the transition model in a human-readable way, for purposes of human + /// inspection. The "occs" are optional (they are indexed by pdf-id). + void Print(std::ostream &os, + const std::vector<std::string> &phone_names, + const Vector<double> *occs = NULL); + + + void InitStats(Vector<double> *stats) const { stats->Resize(NumTransitionIds()+1); } + + void Accumulate(BaseFloat prob, int32 trans_id, Vector<double> *stats) const { + KALDI_ASSERT(trans_id <= NumTransitionIds()); + (*stats)(trans_id) += prob; + // This is trivial and doesn't require class members, but leaves us more open + // to design changes than doing it manually. + } + + /// returns true if all the integer class members are identical (but does not + /// compare the transition probabilities. + bool Compatible(const TransitionModel &other) const; + + private: + void MleUpdateShared(const Vector<double> &stats, + const MleTransitionUpdateConfig &cfg, + BaseFloat *objf_impr_out, BaseFloat *count_out); + void MapUpdateShared(const Vector<double> &stats, + const MapTransitionUpdateConfig &cfg, + BaseFloat *objf_impr_out, BaseFloat *count_out); + void ComputeTriples(const ContextDependency &ctx_dep); // called from constructor. initializes triples_. + void ComputeDerived(); // called from constructor and Read function: computes state2id_ and id2state_. + void ComputeDerivedOfProbs(); // computes quantities derived from log-probs (currently just + // non_self_loop_log_probs_; called whenever log-probs change. + void InitializeProbs(); // called from constructor. + void Check() const; + + struct Triple { + int32 phone; + int32 hmm_state; + int32 pdf; + Triple() { } + Triple(int32 phone, int32 hmm_state, int32 pdf): + phone(phone), hmm_state(hmm_state), pdf(pdf) { } + bool operator < (const Triple &other) const { + if (phone < other.phone) return true; + else if (phone > other.phone) return false; + else if (hmm_state < other.hmm_state) return true; + else if (hmm_state > other.hmm_state) return false; + else return pdf < other.pdf; + } + bool operator == (const Triple &other) const { + return (phone == other.phone && hmm_state == other.hmm_state + && pdf == other.pdf); + } + }; + + HmmTopology topo_; + + /// Triples indexed by transition state minus one; + /// the triples are in sorted order which allows us to do the reverse mapping from + /// triple to transition state + std::vector<Triple> triples_; + + /// Gives the first transition_id of each transition-state; indexed by + /// the transition-state. Array indexed 1..num-transition-states+1 (the last one + /// is needed so we can know the num-transitions of the last transition-state. + std::vector<int32> state2id_; + + /// For each transition-id, the corresponding transition + /// state (indexed by transition-id). + std::vector<int32> id2state_; + + /// For each transition-id, the corresponding log-prob. Indexed by transition-id. + Vector<BaseFloat> log_probs_; + + /// For each transition-state, the log of (1 - self-loop-prob). Indexed by + /// transition-state. + Vector<BaseFloat> non_self_loop_log_probs_; + + /// This is actually one plus the highest-numbered pdf we ever got back from the + /// tree (but the tree numbers pdfs contiguously from zero so this is the number + /// of pdfs). + int32 num_pdfs_; + + + DISALLOW_COPY_AND_ASSIGN(TransitionModel); + +}; + +inline int32 TransitionModel::TransitionIdToPdf(int32 trans_id) const { + // If a lot of time is spent here we may create an extra array + // to handle this. + KALDI_ASSERT(static_cast<size_t>(trans_id) < id2state_.size() && + "Likely graph/model mismatch (graph built from wrong model?)"); + int32 trans_state = id2state_[trans_id]; + return triples_[trans_state-1].pdf; +} + +/// Works out which pdfs might correspond to the given phones. Will return true +/// if these pdfs correspond *just* to these phones, false if these pdfs are also +/// used by other phones. +/// @param trans_model [in] Transition-model used to work out this information +/// @param phones [in] A sorted, uniq vector that represents a set of phones +/// @param pdfs [out] Will be set to a sorted, uniq list of pdf-ids that correspond +/// to one of this set of phones. +/// @return Returns true if all of the pdfs output to "pdfs" correspond to phones from +/// just this set (false if they may be shared with phones outside this set). +bool GetPdfsForPhones(const TransitionModel &trans_model, + const std::vector<int32> &phones, + std::vector<int32> *pdfs); + +/// Works out which phones might correspond to the given pdfs. Similar to the +/// above GetPdfsForPhones(, ,) +bool GetPhonesForPdfs(const TransitionModel &trans_model, + const std::vector<int32> &pdfs, + std::vector<int32> *phones); +/// @} + + +} // end namespace kaldi + + +#endif diff --git a/kaldi_io/src/kaldi/hmm/tree-accu.h b/kaldi_io/src/kaldi/hmm/tree-accu.h new file mode 100644 index 0000000..d571762 --- /dev/null +++ b/kaldi_io/src/kaldi/hmm/tree-accu.h @@ -0,0 +1,69 @@ +// hmm/tree-accu.h + +// Copyright 2009-2011 Microsoft Corporation +// 2013 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at + +// http://www.apache.org/licenses/LICENSE-2.0 + +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. +#ifndef KALDI_HMM_TREE_ACCU_H_ +#define KALDI_HMM_TREE_ACCU_H_ + +#include <cctype> // For isspace. +#include <limits> +#include "base/kaldi-common.h" +#include "hmm/transition-model.h" +#include "tree/clusterable-classes.h" +#include "tree/build-tree-questions.h" // needed for this typedef: +// typedef std::vector<std::pair<EventVector, Clusterable*> > BuildTreeStatsType; + +namespace kaldi { + +/// \ingroup tree_group_top +/// @{ + + +/// Accumulates the stats needed for training context-dependency trees (in the +/// "normal" way). It adds to 'stats' the stats obtained from this file. Any +/// new GaussClusterable* pointers in "stats" will be allocated with "new". + +void AccumulateTreeStats(const TransitionModel &trans_model, + BaseFloat var_floor, + int N, // context window size. + int P, // central position. + const std::vector<int32> &ci_phones, // sorted + const std::vector<int32> &alignment, + const Matrix<BaseFloat> &features, + const std::vector<int32> *phone_map, // or NULL + std::map<EventType, GaussClusterable*> *stats); + + + +/*** Read a mapping from one phone set to another. The phone map file has lines + of the form <old-phone> <new-phone>, where both entries are integers, usually + nonzero (but this is not enforced). This program will crash if the input is + invalid, e.g. there are multiple inconsistent entries for the same old phone. + The output vector "phone_map" will be indexed by old-phone and will contain + the corresponding new-phone, or -1 for any entry that was not defined. */ + +void ReadPhoneMap(std::string phone_map_rxfilename, + std::vector<int32> *phone_map); + + + +/// @} + +} // end namespace kaldi. + +#endif |