summaryrefslogtreecommitdiff
path: root/kaldi_io/src/kaldi/hmm/hmm-utils.h
blob: 240f70632da77d683ee4d3c65dd4b5a90fed335d (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
// hmm/hmm-utils.h

// Copyright 2009-2011  Microsoft Corporation

// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//  http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.

#ifndef KALDI_HMM_HMM_UTILS_H_
#define KALDI_HMM_HMM_UTILS_H_

#include "hmm/hmm-topology.h"
#include "hmm/transition-model.h"
#include "lat/kaldi-lattice.h"

namespace kaldi {


/// \defgroup hmm_group_graph Classes and functions for creating FSTs from HMMs
/// \ingroup hmm_group
/// @{

/// Configuration class for the GetHTransducer() function; see
/// \ref hmm_graph_config for context.
struct HTransducerConfig {
  /// Transition log-prob scale, see \ref hmm_scale.
  /// Note this doesn't apply to self-loops; GetHTransducer() does
  /// not include self-loops.
  BaseFloat transition_scale;

  /// If true, we are constructing a time-reversed FST: the phone sequences in
  /// ilabel_info are backwards, and we want to output a backwards version of
  /// the HMM corresponding to each phone.
  bool reverse;

  /// This variable is only looked at if reverse == true.  If reverse == true
  /// and push_weights == true, then we push the weights in the reversed FSTs
  /// we create for each phone HMM.  This is only safe if the HMMs are
  /// probabilistic.
  /// NOTE(review): the earlier wording broke off after "(i.e. not
  /// discriminatively"; presumably "trained" was meant -- confirm.
  bool push_weights;

  /// Delta used if we do push_weights [only relevant if reverse == true
  /// and push_weights == true].
  BaseFloat push_delta;

  /// Defaults: transition_scale = 1.0, reverse = false, push_weights = true,
  /// push_delta = 0.001.
  HTransducerConfig():
      transition_scale(1.0),
      reverse(false),
      push_weights(true),
      push_delta(0.001)
  { }

  /// Registers the four members above as command-line options
  /// ("transition-scale", "reverse", "push-weights", "push-delta") with the
  /// supplied options object.
  ///   @param po Options object to register with; it is dereferenced
  ///          unconditionally, so it must not be NULL.
  void Register (OptionsItf *po) {
    po->Register("transition-scale", &transition_scale,
                 "Scale of transition probs (relative to LM)");
    po->Register("reverse", &reverse,
                 "Set true to build time-reversed FST.");
    po->Register("push-weights", &push_weights,
                 "Push weights (only applicable if reverse == true)");
    // The closing parenthesis was missing from this help text.
    po->Register("push-delta", &push_delta,
                 "Delta used in pushing weights (only applicable if "
                 "reverse && push-weights)");
  }
};


/// Hash functor for the keys of HmmCacheType, i.e. pairs of an int32 and a
/// vector of int32.  The hash combines the first element, multiplied by a
/// fixed prime, with the VectorHasher<int32> hash of the second element.
struct HmmCacheHash {
  int operator () (const std::pair<int32, std::vector<int32> >&p) const {
    VectorHasher<int32> v;
    // Do the multiplication in unsigned arithmetic.  As a product of two
    // signed 32-bit values it overflows (undefined behaviour) as soon as
    // p.first exceeds 20839; unsigned multiplication simply wraps around.
    // The low-order bits of the sum -- all that survives the conversion to
    // the int return type -- are the same as before for every input on which
    // the signed product was well defined, so existing hash values are kept.
    const unsigned int prime = 103049;
    return static_cast<int>(prime * static_cast<unsigned int>(p.first) +
                            v(p.second));
  }
};

/// HmmCacheType is a map from (central-phone, sequence of pdf-ids) to FST, used
/// as cache in GetHmmAsFst, as an optimization.  Keys are hashed with
/// HmmCacheHash.  The mapped values are raw pointers; the map type itself does
/// nothing to free them.
/// NOTE(review): who deletes the cached FSTs is not stated here -- presumably
/// whoever owns the cache object; confirm against the callers.
typedef unordered_map<std::pair<int32, std::vector<int32> >,
                      fst::VectorFst<fst::StdArc>*,
                      HmmCacheHash> HmmCacheType;


/// Called by GetHTransducer() and probably will not need to be called directly;
/// it creates the FST corresponding to the phone.  Does not include self-loops;
/// you have to call AddSelfLoops() for that.  Result owned by caller.
/// Returns an acceptor (i.e. ilabels, olabels identical) with transition-ids
/// as the symbols.
/// For documentation in context, see \ref hmm_graph_get_hmm_as_fst
///   @param context_window  A vector representing the phonetic context; see
///            \ref tree_window "here" for explanation.
///   @param ctx_dep The object that contains the phonetic decision-tree
///   @param trans_model The transition-model object, which provides
///         the mappings to transition-ids and also the transition
///         probabilities.
///   @param config Configuration object, see \ref HTransducerConfig.
///   @param cache Object used as a lookaside buffer to save computation;
///       may be NULL (the default).  If the function finds that the object
///       it needs is already there, it will just return a pointer value from
///       "cache" -- note that this means you have to be careful not to delete
///       things twice.
///   @return Pointer to the FST for the phone.
///       NOTE(review): "result owned by caller" and "pointer value from
///       cache" pull in different directions; presumably when a cache is
///       supplied the returned pointer is shared with it -- confirm against
///       the implementation before deleting the result.

fst::VectorFst<fst::StdArc> *GetHmmAsFst(
    std::vector<int32> context_window,
    const ContextDependencyInterface &ctx_dep,
    const TransitionModel &trans_model,
    const HTransducerConfig &config,
    HmmCacheType *cache = NULL);

/// Included mainly as a form of documentation, not used in any other code
/// currently.  Creates the FST with self-loops, and with fewer options
/// (compare GetHmmAsFst(), which takes an HTransducerConfig and an optional
/// cache).
///   @param context_window  A vector representing the phonetic context.
///   @param ctx_dep The object that contains the phonetic decision-tree.
///   @param trans_model The transition-model object.
///   @param prob_scale Scale applied to the probabilities.
///       NOTE(review): presumably applies to all transition log-probs,
///       self-loops included -- confirm against the implementation.
///   @return Pointer to the newly created FST.
///       NOTE(review): presumably owned by the caller, as for GetHmmAsFst()
///       -- confirm.
fst::VectorFst<fst::StdArc>*
GetHmmAsFstSimple(std::vector<int32> context_window,
                  const ContextDependencyInterface &ctx_dep,
                  const TransitionModel &trans_model,
                  BaseFloat prob_scale);


/**
  * Returns the H transducer; result owned by caller.
  * See \ref hmm_graph_get_h_transducer.  The H transducer has on the
  * input transition-ids, and also possibly some disambiguation symbols, which
  * will be put in disambig_syms_left.  The output side contains the
  * identifiers that are indexes into "ilabel_info" (these represent
  * phones-in-context or disambiguation symbols).  The ilabel_info vector
  * allows GetHTransducer to map from symbols to phones-in-context (i.e.
  * phonetic context windows).  Any singleton symbols in the ilabel_info vector
  * which are not phones, will be treated as disambiguation symbols.  [Not all
  * recipes use these].  The output "disambig_syms_left" will be set to a list
  * of the disambiguation symbols on the input of the transducer (i.e. same
  * symbol type as whatever is on the input of the transducer).
  * @param ilabel_info [in] Maps output-side symbols to phonetic context
  *       windows or disambiguation symbols, as described above.
  * @param ctx_dep [in] The object that contains the phonetic decision-tree
  * @param trans_model [in] The transition-model object
  * @param config [in] Configuration object, see \ref HTransducerConfig.
  * @param disambig_syms_left [out] Disambiguation symbols on the input side
  *       of the returned transducer.
  */
fst::VectorFst<fst::StdArc>*
GetHTransducer (const std::vector<std::vector<int32> > &ilabel_info,
                const ContextDependencyInterface &ctx_dep,
                const TransitionModel &trans_model,
                const HTransducerConfig &config,
                std::vector<int32> *disambig_syms_left);

/**
  * GetIlabelMapping produces a mapping that's similar to HTK's
  * logical-to-physical model mapping (i.e. the xwrd.clustered.mlist files).
  * It groups together "logical HMMs" (i.e. in our world, phonetic context
  * windows) that share the same sequence of transition-ids.  This can be used
  * in an optional graph-creation step that produces a remapped form of CLG
  * that can be more productively determinized and minimized.  This is used in
  * the command-line program make-ilabel-transducer.cc.
  * @param ilabel_info_old [in] The original \ref tree_ilabel "ilabel_info" vector
  * @param ctx_dep [in] The tree
  * @param trans_model [in] The transition-model object
  * @param old2new_map [out] The output; this vector, which is of size equal to the
  *       number of new labels, is a mapping to the old labels such that we could
  *       create a vector ilabel_info_new such that
  *       ilabel_info_new[i] == ilabel_info_old[old2new_map[i]]
  *       NOTE(review): the parameter name suggests an old-to-new mapping
  *       (indexed by old label), whereas the description above reads as
  *       new-to-old; confirm the direction and size against the
  *       implementation before relying on either.
  */
void GetIlabelMapping (const std::vector<std::vector<int32> > &ilabel_info_old,
                       const ContextDependencyInterface &ctx_dep,
                       const TransitionModel &trans_model,
                       std::vector<int32> *old2new_map);



/**
  * For context, see \ref hmm_graph_add_self_loops.  Expands an FST that has been
  * built without self-loops, and adds the self-loops (it also needs to modify
  * the probability of the non-self-loop ones, as the graph without self-loops
  * was created in such a way that it was stochastic).  Note that the
  * disambig_syms will be empty in some recipes (e.g. if you already removed
  * the disambiguation symbols).
  * @param trans_model [in] Transition model
  * @param disambig_syms [in] Sorted, unique list of disambiguation symbols,
  *       required if the graph contains disambiguation symbols but only
  *       needed for sanity checks.
  * @param self_loop_scale [in] Transition-probability scale for self-loops; c.f.
  *                    \ref hmm_scale
  * @param reorder [in] If true, reorders the transitions (see \ref hmm_reorder).
  * @param fst [in, out] The FST to be modified.
  */
void AddSelfLoops(const TransitionModel &trans_model,
                  const std::vector<int32> &disambig_syms,  // used as a check only.
                  BaseFloat self_loop_scale,
                  bool reorder,  // true->dan-style, false->lukas-style.
                  fst::VectorFst<fst::StdArc> *fst);

/**
  * Adds transition-probs, with the supplied scales (see \ref hmm_scale), to
  * the graph.  Useful if you want to create a graph without transition probs,
  * then possibly train the model (including the transition probs) but keep
  * the graph fixed, and add back in the transition probs.  It assumes the fst
  * has transition-ids on it.  It is not an error if the FST has no states
  * (nothing will be done).
  * @param trans_model [in] The transition model
  * @param disambig_syms [in] A list of disambiguation symbols, required if the
  *                       graph has disambiguation symbols on its input but only
  *                       used for checks.
  * @param transition_scale [in] A scale on transition-probabilities apart from
  *                      those involving self-loops; see \ref hmm_scale.
  * @param self_loop_scale [in] A scale on self-loop transition probabilities;
  *                      see \ref hmm_scale.
  * @param fst [in, out] The FST to be modified.
  */
void AddTransitionProbs(const TransitionModel &trans_model,
                        const std::vector<int32> &disambig_syms,
                        BaseFloat transition_scale,
                        BaseFloat self_loop_scale,
                        fst::VectorFst<fst::StdArc> *fst);

/**
   Lattice overload of AddTransitionProbs(): operates on a Lattice, where
   it affects the graph part of the weight (the first element
   of the pair).
   NOTE(review): this comment used to say "as AddSelfLoops()"; the name and
   the two scale parameters match the FST overload of AddTransitionProbs(),
   so that is assumed to be what was meant -- confirm.
   @param trans_model [in] The transition model
   @param transition_scale [in] A scale on transition-probabilities apart
                          from those involving self-loops (presumably as in
                          the FST overload; see \ref hmm_scale).
   @param self_loop_scale [in] A scale on self-loop transition probabilities
                          (presumably as in the FST overload).
   @param lat [in, out] The lattice to be modified. */
void AddTransitionProbs(const TransitionModel &trans_model,
                        BaseFloat transition_scale,
                        BaseFloat self_loop_scale,
                        Lattice *lat);


/// Returns a transducer from pdfs plus one (input) to transition-ids (output).
/// Currently of use only for testing.
///   @param trans_model The transition-model object.
///   @return Pointer to the transducer.
///       NOTE(review): presumably newly allocated and owned by the caller,
///       like the other FST-returning functions in this file -- confirm.
fst::VectorFst<fst::StdArc>*
GetPdfToTransitionIdTransducer(const TransitionModel &trans_model);

/// Converts all transition-ids in the FST to pdfs plus one.
/// Placeholder: not implemented yet!
///   @param trans_model The transition-model object.
///   @param disambig_syms List of disambiguation symbols.
///       NOTE(review): presumably these are left unconverted -- confirm
///       once an implementation exists.
///   @param fst The FST to be modified.
void ConvertTransitionIdsToPdfs(const TransitionModel &trans_model,
                                const std::vector<int32> &disambig_syms,
                                fst::VectorFst<fst::StdArc> *fst);

/// @} end "defgroup hmm_group_graph"

/// \addtogroup hmm_group
/// @{

/// SplitToPhones splits up the TransitionIds in "alignment" into their
/// individual phones (one vector per instance of a phone).  At output,
/// the sum of the sizes of the vectors in split_alignment will be the same
/// as the corresponding sum for "alignment".  The function returns
/// true on success.  If the alignment appears to be incomplete, e.g.
/// not ending at the end-state of a phone, it will still break it up into
/// phones but it will return false.  For more serious errors it will
/// die or throw an exception.
/// This function works out by itself whether the graph was created
/// with "reordering" (dan-style graph), and just does the right thing.
///   @param trans_model The transition-model object.
///   @param alignment The sequence of transition-ids to split.
///   @param split_alignment Output: one vector of transition-ids per phone
///          instance.
///   @return true on success, false if the alignment appears incomplete.

bool SplitToPhones(const TransitionModel &trans_model,
                   const std::vector<int32> &alignment,
                   std::vector<std::vector<int32> > *split_alignment);

/// ConvertAlignment converts an alignment that was created using one
/// model, to another model.  They must use a compatible topology (so we
/// know the state alignments of the new model).
/// It returns false if it could not be split to phones (probably
/// because the alignment was partial), but for other kinds of
/// error that are more likely a coding error, it will throw
/// an exception.
///   @param old_trans_model The transition model the alignment was created
///          with.
///   @param new_trans_model The transition model to convert to.
///   @param new_ctx_dep The context-dependency object for the new model.
///   @param old_alignment The alignment to be converted.
///   @param phone_map Optional phone mapping; may be NULL.
///          NOTE(review): presumably maps phones of the old model to phones
///          of the new model -- confirm against the implementation.
///   @param new_alignment Output: the converted alignment.
///   @return false if the old alignment could not be split to phones.
bool ConvertAlignment(const TransitionModel &old_trans_model,
                      const TransitionModel &new_trans_model,
                      const ContextDependencyInterface &new_ctx_dep,
                      const std::vector<int32> &old_alignment,
                      const std::vector<int32> *phone_map,  // may be NULL
                      std::vector<int32> *new_alignment);

// ConvertPhnxToProns is only needed in bin/phones-to-prons.cc and
// isn't closely related with HMMs, but we put it here as there isn't
// any other obvious place for it and it needs to be tested.
// This function takes a phone-sequence with word-start and word-end
// markers in it, and a word-sequence, and outputs the pronunciations
// "prons".  The format of "prons" is: each element is a vector,
// where the first element is the word (or zero meaning no word, e.g.
// for optional silence introduced by the lexicon), and the remaining
// elements are the phones in the word's pronunciation.
// It returns false if it encounters a problem of some kind, e.g.
// if the phone-sequence doesn't seem to have the right number of
// words in it.
//   phnx            [in]  phone sequence containing the word-start and
//                         word-end markers.
//   words           [in]  the word sequence.
//   word_start_sym  [in]  symbol used in phnx as the word-start marker.
//   word_end_sym    [in]  symbol used in phnx as the word-end marker.
//   prons           [out] the pronunciations, in the format described above.
bool ConvertPhnxToProns(const std::vector<int32> &phnx,
                        const std::vector<int32> &words,
                        int32 word_start_sym,
                        int32 word_end_sym,
                        std::vector<std::vector<int32> > *prons);

/// @} end "addtogroup hmm_group"

} // end namespace kaldi


#endif