summaryrefslogtreecommitdiff
path: root/kaldi_io/src/kaldi/hmm/posterior.h
blob: be73be91a3c371fe76123ff7613b35172adaf09a (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
// hmm/posterior.h

// Copyright 2009-2011     Microsoft Corporation
//           2013-2014     Johns Hopkins University (author: Daniel Povey)
//                2014     Guoguo Chen


// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//  http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.

#ifndef KALDI_HMM_POSTERIOR_H_
#define KALDI_HMM_POSTERIOR_H_

#include "base/kaldi-common.h"
#include "tree/context-dep.h"
#include "util/const-integer-set.h"
#include "util/kaldi-table.h"
#include "hmm/transition-model.h"


namespace kaldi {


/// \addtogroup posterior_group
/// @{

/// Posterior is a typedef for storing acoustic-state (actually, transition-id)
/// posteriors over an utterance.  Each inner vector is a list of
/// (index, weight) pairs.  The "int32" is typically a transition-id (but it can
/// also be a pdf-id or a phone, e.g. in the output of ConvertPosteriorToPdfs()
/// or ConvertPosteriorToPhones()), and the BaseFloat is a probability
/// (typically between zero and one).
typedef std::vector<std::vector<std::pair<int32, BaseFloat> > > Posterior;

/// GaussPost is a typedef for storing Gaussian-level posteriors for an utterance.
/// The "int32" is a pdf-id, and the Vector<BaseFloat> is a vector of
/// Gaussian posteriors.
/// WARNING: the "int32" used to be a transition-id; it was changed to a pdf-id
/// for efficiency, and the change applies to all programs using GaussPost.  The
/// name was also changed slightly, from GauPost to GaussPost, to reduce the
/// chance that the change will go unnoticed in downstream code.
typedef std::vector<std::vector<std::pair<int32, Vector<BaseFloat> > > > GaussPost;


// PosteriorHolder is a holder for Posterior, which is
// std::vector<std::vector<std::pair<int32, BaseFloat> > >
// This is used for storing posteriors of transition id's for an
// utterance.
class PosteriorHolder {
 public:
  typedef Posterior T;

  PosteriorHolder() { }

  static bool Write(std::ostream &os, bool binary, const T &t);
  
  void Clear() { Posterior tmp; std::swap(tmp, t_); }

  // Reads into the holder.
  bool Read(std::istream &is);
  
  // Kaldi objects always have the stream open in binary mode for
  // reading.
  static bool IsReadInBinary() { return true; }

  const T &Value() const { return t_; }
  
 private:
  KALDI_DISALLOW_COPY_AND_ASSIGN(PosteriorHolder);
  T t_;
};


// GaussPostHolder is a holder for GaussPost, which is
// std::vector<std::vector<std::pair<int32, Vector<BaseFloat> > > >
// This is used for storing posteriors of transition id's for an
// utterance.
class GaussPostHolder {
 public:
  typedef GaussPost T;

  GaussPostHolder() { }

  static bool Write(std::ostream &os, bool binary, const T &t);  

  void Clear() {  GaussPost tmp;  std::swap(tmp, t_); }

  // Reads into the holder.
  bool Read(std::istream &is);
  
  // Kaldi objects always have the stream open in binary mode for
  // reading.
  static bool IsReadInBinary() { return true; }

  const T &Value() const { return t_; }
  
 private:
  KALDI_DISALLOW_COPY_AND_ASSIGN(GaussPostHolder);
  T t_;
};


// Table writer/reader types for Posterior.  Posterior is a typedef:
// vector<vector<pair<int32, BaseFloat> > >, representing posteriors
// over (typically) transition-ids for an utterance.
typedef TableWriter<PosteriorHolder> PosteriorWriter;
typedef SequentialTableReader<PosteriorHolder> SequentialPosteriorReader;
typedef RandomAccessTableReader<PosteriorHolder> RandomAccessPosteriorReader;


// Table writer/reader types for GaussPost.  GaussPost is a typedef:
// std::vector<std::vector<std::pair<int32, Vector<BaseFloat> > > >.
typedef TableWriter<GaussPostHolder> GaussPostWriter;
typedef SequentialTableReader<GaussPostHolder> SequentialGaussPostReader;
typedef RandomAccessTableReader<GaussPostHolder> RandomAccessGaussPostReader;


/// Scales the BaseFloat (weight) element in the posterior entries of "post"
/// by "scale".  "post" is modified in place.
void ScalePosterior(BaseFloat scale, Posterior *post);

/// Returns the total of all the weights (the BaseFloat elements of the
/// entries) in "post".
BaseFloat TotalPosterior(const Posterior &post);

/// Returns true if the two lists of pairs have no common .first element,
/// i.e. no integer index (e.g. transition-id) appears in both "post_elem1"
/// and "post_elem2".
bool PosteriorEntriesAreDisjoint(
    const std::vector<std::pair<int32, BaseFloat> > &post_elem1,
    const std::vector<std::pair<int32, BaseFloat> > &post_elem2);


/// Merge two sets of posteriors, "post1" and "post2", which must have the same
/// length; the output goes to "post".  If "merge" is true, it will make a
/// common entry whenever there are duplicated entries, adding up the weights.
/// If "drop_frames" is true, for frames where the two sets of posteriors were
/// originally disjoint, makes no entries for that frame (relates to frame
/// dropping, or drop_frames, see Vesely et al, ICASSP 2013).  Returns the
/// number of frames for which the two posteriors were disjoint (i.e. no common
/// transition-ids or whatever index we are using).
int32 MergePosteriors(const Posterior &post1,
                      const Posterior &post2,
                      bool merge,
                      bool drop_frames,
                      Posterior *post);

/// Given a vector of log-likelihoods "log_likes" (typically of Gaussians in a
/// GMM but could be of pdf-ids), a number num_gselect >= 1 and a minimum
/// posterior 0 <= min_post < 1, it gets the posterior for each element of
/// "log_likes" by applying Softmax(), then prunes the posteriors using
/// "num_gselect" and "min_post" (keeping at least one), and outputs the result
/// into "post_entry", sorted from greatest to least posterior.
/// Returns the total log-likelihood (the output of calling ApplySoftMax()
/// on a copy of log_likes).
BaseFloat VectorToPosteriorEntry(
    const VectorBase<BaseFloat> &log_likes,
    int32 num_gselect,
    BaseFloat min_post,
    std::vector<std::pair<int32, BaseFloat> > *post_entry);

/// Convert an alignment "ali" to a posterior (with a scale of 1.0 on
/// each entry); the result is output to "post".
void AlignmentToPosterior(const std::vector<int32> &ali,
                          Posterior *post);

/// Sorts the posterior entries in "post" so that transition-ids with the same
/// pdf-id are next to each other.  "tmodel" is the transition model passed in
/// for this purpose (presumably to map transition-ids to pdf-ids).
void SortPosteriorByPdfs(const TransitionModel &tmodel,
                         Posterior *post);


/// Converts a posterior over transition-ids ("post_in") to be a posterior
/// over pdf-ids, which is output to "post_out".
void ConvertPosteriorToPdfs(const TransitionModel &tmodel,
                            const Posterior &post_in,
                            Posterior *post_out);

/// Converts a posterior over transition-ids ("post_in") to be a posterior
/// over phones, which is output to "post_out".
void ConvertPosteriorToPhones(const TransitionModel &tmodel,
                              const Posterior &post_in,
                              Posterior *post_out);

/// Weight any silence phones in the posterior (i.e. any phones
/// in the set "silence_set") by scale "silence_scale".
/// The interface was changed in Feb 2014 to do the modification
/// "in-place" (on "post") rather than having separate input and output.
void WeightSilencePost(const TransitionModel &trans_model,
                       const ConstIntegerSet<int32> &silence_set,
                       BaseFloat silence_scale,
                       Posterior *post);

/// This is similar to WeightSilencePost, except that on each frame it
/// works out the amount by which the overall posterior would be reduced,
/// and scales down everything on that frame by the same amount.  It
/// has the effect that frames that are mostly silence get down-weighted.
/// The interface was changed in Feb 2014 to do the modification
/// "in-place" (on "post") rather than having separate input and output.
void WeightSilencePostDistributed(const TransitionModel &trans_model,
                                  const ConstIntegerSet<int32> &silence_set,
                                  BaseFloat silence_scale,
                                  Posterior *post);

/// @} end "addtogroup posterior_group"


} // end namespace kaldi


#endif