kaldi_io/src/kaldi/tree/cluster-utils.h


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291

// tree/cluster-utils.h

// Copyright 2012   Arnab Ghoshal
// Copyright 2009-2011  Microsoft Corporation;  Saarland University

// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//  http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.

#ifndef KALDI_TREE_CLUSTER_UTILS_H_
#define KALDI_TREE_CLUSTER_UTILS_H_

#include <vector>
#include "matrix/matrix-lib.h"
#include "itf/clusterable-itf.h"

namespace kaldi {

/// \addtogroup clustering_group_simple
/// @{

/// Returns the total objective function after adding up all the
/// statistics in the vector (pointers may be NULL).
BaseFloat SumClusterableObjf(const std::vector<Clusterable*> &vec);

/// Returns the total normalizer (usually count) of the cluster (pointers may be NULL).
BaseFloat SumClusterableNormalizer(const std::vector<Clusterable*> &vec);

/// Sums stats (ptrs may be NULL). Returns NULL if no non-NULL stats present.
Clusterable *SumClusterable(const std::vector<Clusterable*> &vec);

/** Fills in any (NULL) holes in "stats" vector, with empty stats, because
 *  certain algorithms require non-NULL stats.  If "stats" nonempty, requires it
 *  to contain at least one non-NULL pointer that we can call Copy() on.
 */
void EnsureClusterableVectorNotNull(std::vector<Clusterable*> *stats);


/** Given stats and a vector "assignments" of the same size (that maps to
 * cluster indices), sums the stats up into "clusters."  It will add to any
 * stats already present in "clusters" (although typically "clusters" will be
 * empty when called), and it will extend with NULL pointers for any unseen
 * indices. Call EnsureClusterableStatsNotNull afterwards if you want to ensure
 * all non-NULL clusters. Pointer in "clusters" are owned by caller. Pointers in
 * "stats" do not have to be non-NULL.
 */
void AddToClusters(const std::vector<Clusterable*> &stats,
                   const std::vector<int32> &assignments,
                   std::vector<Clusterable*> *clusters);


/// AddToClustersOptimized does the same as AddToClusters (it sums up the stats
/// within each cluster, except it uses the sum of all the stats ("total") to
/// optimize the computation for speed, if possible.  This will generally only be
/// a significant speedup in the case where there are just two clusters, which
/// can happen in algorithms that are doing binary splits; the idea is that we
/// sum up all the stats in one cluster (the one with the fewest points in it),
/// and then subtract from the total.
void AddToClustersOptimized(const std::vector<Clusterable*> &stats,
                            const std::vector<int32> &assignments,
                            const Clusterable &total,
                            std::vector<Clusterable*> *clusters);

/// @} end "addtogroup clustering_group_simple"

/// \addtogroup clustering_group_algo
/// @{

// Note, in the algorithms below, it is assumed that the input "points" (which
// is std::vector<Clusterable*>) is all non-NULL.

/** A bottom-up clustering algorithm. There are two parameters that control how
 *  many clusters we get: a "max_merge_thresh" which is a threshold for merging
 *  clusters, and a min_clust which puts a floor on the number of clusters we want. Set
 *  max_merge_thresh = large to use the min_clust only, or min_clust to 0 to use
 *  the max_merge_thresh only.
 *
 *  The algorithm is:
 *  \code
 *      while (num-clusters > min_clust && smallest_merge_cost <= max_merge_thresh)
 *          merge the closest two clusters.
 *  \endcode
 *
 *  @param points [in] Points to be clustered (may not contain NULL pointers)
 *  @param thresh [in] Threshold on cost change from merging clusters; clusters
 *               won't be merged if the cost is more than this
 *  @param min_clust [in] Minimum number of clusters desired; we'll stop merging
 *                  after reaching this number.
 *  @param clusters_out [out] If non-NULL, will be set to a vector of size equal
 *                 to the number of output clusters, containing the clustered
 *                 statistics.  Must be empty when called.
 *  @param assignments_out [out] If non-NULL, will be resized to the number of
 *                 points, and each element is the index of the cluster that point
 *                 was assigned to.
 *  @return Returns the total objf change relative to all clusters being separate, which is
 *    a negative.  Note that this is not the same as what the other clustering algorithms return.
 */
BaseFloat ClusterBottomUp(const std::vector<Clusterable*> &points,
                          BaseFloat thresh,
                          int32 min_clust,
                          std::vector<Clusterable*> *clusters_out,
                          std::vector<int32> *assignments_out);

/** This is a bottom-up clustering where the points are pre-clustered in a set
 *  of compartments, such that only points in the same compartment are clustered
 *  together. The compartment and pair of points with the smallest merge cost
 *  is selected and the points are clustered. The result stays in the same
 *  compartment. The code does not merge compartments, and hence assumes that
 *  the number of compartments is smaller than the 'min_clust' option.
 *  The clusters in "clusters_out" are newly allocated and owned by the caller.
 */
BaseFloat ClusterBottomUpCompartmentalized(
    const std::vector< std::vector<Clusterable*> > &points, BaseFloat thresh,
    int32 min_clust, std::vector< std::vector<Clusterable*> > *clusters_out,
    std::vector< std::vector<int32> > *assignments_out);


struct RefineClustersOptions {
  int32 num_iters;  // must be >= 0.  If zero, does nothing.
  int32 top_n;  // must be >= 2.
  RefineClustersOptions() : num_iters(100), top_n(5) {}
  RefineClustersOptions(int32 num_iters_in, int32 top_n_in)
      : num_iters(num_iters_in), top_n(top_n_in) {}
  // include Write and Read functions because this object gets written/read as
  // part of the QuestionsForKeyOptions class.
  void Write(std::ostream &os, bool binary) const;
  void Read(std::istream &is, bool binary);
};

/** RefineClusters is mainly used internally by other clustering algorithms.
 *
 *  It starts with a given assignment of points to clusters and
 *  keeps trying to improve it by moving points from cluster to cluster, up to
 *  a maximum number of iterations.
 *
 *  "clusters" and "assignments" are both input and output variables, and so
 *  both MUST be non-NULL.
 *
 *  "top_n" (>=2) is a pruning value: more is more exact, fewer is faster. The
 *  algorithm initially finds the "top_n" closest clusters to any given point,
 *  and from that point only consider move to those "top_n" clusters. Since
 *  RefineClusters is called multiple times from ClusterKMeans (for instance),
 *  this is not really a limitation.
 */
BaseFloat RefineClusters(const std::vector<Clusterable*> &points,
                         std::vector<Clusterable*> *clusters /*non-NULL*/,
                         std::vector<int32> *assignments /*non-NULL*/,
                         RefineClustersOptions cfg = RefineClustersOptions());

struct ClusterKMeansOptions {
  RefineClustersOptions refine_cfg;
  int32 num_iters;
  int32 num_tries;  // if >1, try whole procedure >once and pick best.
  bool verbose;
  ClusterKMeansOptions()
      : refine_cfg(), num_iters(20), num_tries(2), verbose(true)  {}
};

/** ClusterKMeans is a K-means-like clustering algorithm. It starts with
 *  pseudo-random initialization of points to clusters and uses RefineClusters
 *  to iteratively improve the cluster assignments.  It does this for
 *  multiple iterations and picks the result with the best objective function.
 *
 *
 *  ClusterKMeans implicitly uses Rand(). It will not necessarily return
 *  the same value on different calls.  Use sRand() if you want consistent
 *  results.
 *  The algorithm used in ClusterKMeans is a "k-means-like" algorithm that tries
 *  to be as efficient as possible.  Firstly, since the algorithm it uses
 *  includes random initialization, it tries the whole thing cfg.num_tries times
 *  and picks the one with the best objective function.  Each try, it does as
 *  follows: it randomly initializes points to clusters, and then for
 *  cfg.num_iters iterations it calls RefineClusters().  The options to
 *  RefineClusters() are given by cfg.refine_cfg.  Calling RefineClusters once
 *  will always be at least as good as doing one iteration of reassigning points to
 *  clusters, but will generally be quite a bit better (without taking too
 *  much extra time).
 *
 *  @param points [in]  points to be clustered (must be all non-NULL).
 *  @param num_clust [in] number of clusters requested (it will always return exactly
 *                 this many, or will fail if num_clust > points.size()).
 *  @param clusters_out [out] may be NULL; if non-NULL, should be empty when called.
 *          Will be set to a vector of statistics corresponding to the output clusters.
 *  @param assignments_out [out] may be NULL; if non-NULL, will be set to a vector of
 *             same size as "points", which says for each point which cluster
 *              it is assigned to.
 *  @param cfg [in] configuration class specifying options to the algorithm.
 *  @return Returns the objective function improvement versus everything being
 *     in the same cluster.
 *
 */
BaseFloat ClusterKMeans(const std::vector<Clusterable*> &points,
                        int32 num_clust,  // exact number of clusters
                        std::vector<Clusterable*> *clusters_out,  // may be NULL
                        std::vector<int32> *assignments_out,  // may be NULL
                        ClusterKMeansOptions cfg = ClusterKMeansOptions());

struct TreeClusterOptions  {
  ClusterKMeansOptions kmeans_cfg;
  int32 branch_factor;
  BaseFloat thresh;  // Objf change: if >0, may be used to control number of leaves.
  TreeClusterOptions()
      : kmeans_cfg(), branch_factor(2), thresh(0) {
    kmeans_cfg.verbose = false;
  }
};

/** TreeCluster is a top-down clustering algorithm, using a binary tree (not
 *  necessarily balanced). Returns objf improvement versus having all points
 *  in one cluster.  The algorithm is:
 *     - Initialize to 1 cluster (tree with 1 node).
 *     - Maintain, for each cluster, a "best-binary-split" (using ClusterKMeans
 *       to do so). Always split the highest scoring cluster, until we can do no
 *       more splits.
 *
 *  @param points [in] Data points to be clustered
 *  @param max_clust  [in] Maximum number of clusters (you will get exactly this number,
 *                if there are at least this many points, except if you set the
 *                cfg.thresh value nonzero, in which case that threshold may limit
 *                the number of clusters.
 *  @param clusters_out [out] If non-NULL, will be set to the a vector whose first
 *                (*num_leaves_out) elements are the leaf clusters, and whose
 *                subsequent elements are the nonleaf nodes in the tree, in
 *                topological order with the root node last.  Must be empty vector
 *                when this function is called.
 *  @param assignments_out [out] If non-NULL, will be set to a vector to a vector the
 *               same size as "points", where assignments[i] is the leaf node index i
 *               to which the i'th point gets clustered.
 *  @param clust_assignments_out [out] If non-NULL, will be set to a vector the same size
 *                as clusters_out  which says for each node (leaf or nonleaf), the
 *                index of its parent.  For the root node (which is last),
 *                assignments_out[i] == i.  For each i, assignments_out[i]>=i, i.e.
 *                any node's parent is higher numbered than itself.  If you don't need
 *                this information, consider using instead the ClusterTopDown function.
 *  @param num_leaves_out [out] If non-NULL, will be set to the number of leaf nodes
 *                in the tree.
 *  @param cfg [in] Configuration object that controls clustering behavior.  Most
 *                 important value is "thresh", which provides an alternative mechanism
 *                 [other than max_clust] to limit the number of leaves.
 */
BaseFloat TreeCluster(const std::vector<Clusterable*> &points,
                      int32 max_clust,  // max number of leaf-level clusters.
                      std::vector<Clusterable*> *clusters_out,
                      std::vector<int32> *assignments_out,
                      std::vector<int32> *clust_assignments_out,
                      int32 *num_leaves_out,
                      TreeClusterOptions cfg = TreeClusterOptions());


/**
 *  A clustering algorithm that internally uses TreeCluster,
 *  but does not give you the information about the structure of the tree.
 *  The "clusters_out" and "assignments_out" may be NULL if the outputs are not
 *  needed.
 *
 *  @param points [in]  points to be clustered (must be all non-NULL).
 *  @param max_clust [in] Maximum number of clusters (you will get exactly this number,
 *                if there are at least this many points, except if you set the
 *                cfg.thresh value nonzero, in which case that threshold may limit
 *                the number of clusters.
 *  @param clusters_out [out] may be NULL; if non-NULL, should be empty when called.
 *           Will be set to a vector of statistics corresponding to the output clusters.
 *  @param assignments_out [out] may be NULL; if non-NULL, will be set to a vector of
 *           same size as "points", which says for each point which cluster
 *            it is assigned to.
 *  @param cfg [in] Configuration object that controls clustering behavior.  Most
 *                important value is "thresh", which provides an alternative mechanism
 *                [other than max_clust] to limit the number of leaves.
*/
BaseFloat ClusterTopDown(const std::vector<Clusterable*> &points,
                         int32 max_clust,  // max number of clusters.
                         std::vector<Clusterable*> *clusters_out,
                         std::vector<int32> *assignments_out,
                         TreeClusterOptions cfg = TreeClusterOptions());

/// @} end of "addtogroup clustering_group_algo"

}  // end namespace kaldi.

#endif  // KALDI_TREE_CLUSTER_UTILS_H_