diff options
author | Ted Yin <[email protected]> | 2015-08-14 17:42:26 +0800 |
---|---|---|
committer | Ted Yin <[email protected]> | 2015-08-14 17:42:26 +0800 |
commit | c3cffb58b9921d78753336421b52b9ffdaa5515c (patch) | |
tree | bfea20e97c200cf734021e3756d749c892e658a4 /kaldi_io/src/kaldi/tree/cluster-utils.h | |
parent | 10cce5f6a5c9e2f8e00d5a2a4d87c9cb7c26bf4c (diff) | |
parent | dfdd17afc2e984ec6c32ea01290f5c76309a456a (diff) |
Merge pull request #2 from yimmon/master
remove needless files
Diffstat (limited to 'kaldi_io/src/kaldi/tree/cluster-utils.h')
-rw-r--r-- | kaldi_io/src/kaldi/tree/cluster-utils.h | 291 |
1 files changed, 0 insertions, 291 deletions
diff --git a/kaldi_io/src/kaldi/tree/cluster-utils.h b/kaldi_io/src/kaldi/tree/cluster-utils.h deleted file mode 100644 index 55583a2..0000000 --- a/kaldi_io/src/kaldi/tree/cluster-utils.h +++ /dev/null @@ -1,291 +0,0 @@ -// tree/cluster-utils.h - -// Copyright 2012 Arnab Ghoshal -// Copyright 2009-2011 Microsoft Corporation; Saarland University - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#ifndef KALDI_TREE_CLUSTER_UTILS_H_ -#define KALDI_TREE_CLUSTER_UTILS_H_ - -#include <vector> -#include "matrix/matrix-lib.h" -#include "itf/clusterable-itf.h" - -namespace kaldi { - -/// \addtogroup clustering_group_simple -/// @{ - -/// Returns the total objective function after adding up all the -/// statistics in the vector (pointers may be NULL). -BaseFloat SumClusterableObjf(const std::vector<Clusterable*> &vec); - -/// Returns the total normalizer (usually count) of the cluster (pointers may be NULL). -BaseFloat SumClusterableNormalizer(const std::vector<Clusterable*> &vec); - -/// Sums stats (ptrs may be NULL). Returns NULL if no non-NULL stats present. -Clusterable *SumClusterable(const std::vector<Clusterable*> &vec); - -/** Fills in any (NULL) holes in "stats" vector, with empty stats, because - * certain algorithms require non-NULL stats. If "stats" nonempty, requires it - * to contain at least one non-NULL pointer that we can call Copy() on. - */ -void EnsureClusterableVectorNotNull(std::vector<Clusterable*> *stats); - - -/** Given stats and a vector "assignments" of the same size (that maps to - * cluster indices), sums the stats up into "clusters." It will add to any - * stats already present in "clusters" (although typically "clusters" will be - * empty when called), and it will extend with NULL pointers for any unseen - * indices. Call EnsureClusterableStatsNotNull afterwards if you want to ensure - * all non-NULL clusters. Pointer in "clusters" are owned by caller. Pointers in - * "stats" do not have to be non-NULL. - */ -void AddToClusters(const std::vector<Clusterable*> &stats, - const std::vector<int32> &assignments, - std::vector<Clusterable*> *clusters); - - -/// AddToClustersOptimized does the same as AddToClusters (it sums up the stats -/// within each cluster, except it uses the sum of all the stats ("total") to -/// optimize the computation for speed, if possible. This will generally only be -/// a significant speedup in the case where there are just two clusters, which -/// can happen in algorithms that are doing binary splits; the idea is that we -/// sum up all the stats in one cluster (the one with the fewest points in it), -/// and then subtract from the total. -void AddToClustersOptimized(const std::vector<Clusterable*> &stats, - const std::vector<int32> &assignments, - const Clusterable &total, - std::vector<Clusterable*> *clusters); - -/// @} end "addtogroup clustering_group_simple" - -/// \addtogroup clustering_group_algo -/// @{ - -// Note, in the algorithms below, it is assumed that the input "points" (which -// is std::vector<Clusterable*>) is all non-NULL. - -/** A bottom-up clustering algorithm. There are two parameters that control how - * many clusters we get: a "max_merge_thresh" which is a threshold for merging - * clusters, and a min_clust which puts a floor on the number of clusters we want. Set - * max_merge_thresh = large to use the min_clust only, or min_clust to 0 to use - * the max_merge_thresh only. - * - * The algorithm is: - * \code - * while (num-clusters > min_clust && smallest_merge_cost <= max_merge_thresh) - * merge the closest two clusters. - * \endcode - * - * @param points [in] Points to be clustered (may not contain NULL pointers) - * @param thresh [in] Threshold on cost change from merging clusters; clusters - * won't be merged if the cost is more than this - * @param min_clust [in] Minimum number of clusters desired; we'll stop merging - * after reaching this number. - * @param clusters_out [out] If non-NULL, will be set to a vector of size equal - * to the number of output clusters, containing the clustered - * statistics. Must be empty when called. - * @param assignments_out [out] If non-NULL, will be resized to the number of - * points, and each element is the index of the cluster that point - * was assigned to. - * @return Returns the total objf change relative to all clusters being separate, which is - * a negative. Note that this is not the same as what the other clustering algorithms return. - */ -BaseFloat ClusterBottomUp(const std::vector<Clusterable*> &points, - BaseFloat thresh, - int32 min_clust, - std::vector<Clusterable*> *clusters_out, - std::vector<int32> *assignments_out); - -/** This is a bottom-up clustering where the points are pre-clustered in a set - * of compartments, such that only points in the same compartment are clustered - * together. The compartment and pair of points with the smallest merge cost - * is selected and the points are clustered. The result stays in the same - * compartment. The code does not merge compartments, and hence assumes that - * the number of compartments is smaller than the 'min_clust' option. - * The clusters in "clusters_out" are newly allocated and owned by the caller. - */ -BaseFloat ClusterBottomUpCompartmentalized( - const std::vector< std::vector<Clusterable*> > &points, BaseFloat thresh, - int32 min_clust, std::vector< std::vector<Clusterable*> > *clusters_out, - std::vector< std::vector<int32> > *assignments_out); - - -struct RefineClustersOptions { - int32 num_iters; // must be >= 0. If zero, does nothing. - int32 top_n; // must be >= 2. - RefineClustersOptions() : num_iters(100), top_n(5) {} - RefineClustersOptions(int32 num_iters_in, int32 top_n_in) - : num_iters(num_iters_in), top_n(top_n_in) {} - // include Write and Read functions because this object gets written/read as - // part of the QuestionsForKeyOptions class. - void Write(std::ostream &os, bool binary) const; - void Read(std::istream &is, bool binary); -}; - -/** RefineClusters is mainly used internally by other clustering algorithms. - * - * It starts with a given assignment of points to clusters and - * keeps trying to improve it by moving points from cluster to cluster, up to - * a maximum number of iterations. - * - * "clusters" and "assignments" are both input and output variables, and so - * both MUST be non-NULL. - * - * "top_n" (>=2) is a pruning value: more is more exact, fewer is faster. The - * algorithm initially finds the "top_n" closest clusters to any given point, - * and from that point only consider move to those "top_n" clusters. Since - * RefineClusters is called multiple times from ClusterKMeans (for instance), - * this is not really a limitation. - */ -BaseFloat RefineClusters(const std::vector<Clusterable*> &points, - std::vector<Clusterable*> *clusters /*non-NULL*/, - std::vector<int32> *assignments /*non-NULL*/, - RefineClustersOptions cfg = RefineClustersOptions()); - -struct ClusterKMeansOptions { - RefineClustersOptions refine_cfg; - int32 num_iters; - int32 num_tries; // if >1, try whole procedure >once and pick best. - bool verbose; - ClusterKMeansOptions() - : refine_cfg(), num_iters(20), num_tries(2), verbose(true) {} -}; - -/** ClusterKMeans is a K-means-like clustering algorithm. It starts with - * pseudo-random initialization of points to clusters and uses RefineClusters - * to iteratively improve the cluster assignments. It does this for - * multiple iterations and picks the result with the best objective function. - * - * - * ClusterKMeans implicitly uses Rand(). It will not necessarily return - * the same value on different calls. Use sRand() if you want consistent - * results. - * The algorithm used in ClusterKMeans is a "k-means-like" algorithm that tries - * to be as efficient as possible. Firstly, since the algorithm it uses - * includes random initialization, it tries the whole thing cfg.num_tries times - * and picks the one with the best objective function. Each try, it does as - * follows: it randomly initializes points to clusters, and then for - * cfg.num_iters iterations it calls RefineClusters(). The options to - * RefineClusters() are given by cfg.refine_cfg. Calling RefineClusters once - * will always be at least as good as doing one iteration of reassigning points to - * clusters, but will generally be quite a bit better (without taking too - * much extra time). - * - * @param points [in] points to be clustered (must be all non-NULL). - * @param num_clust [in] number of clusters requested (it will always return exactly - * this many, or will fail if num_clust > points.size()). - * @param clusters_out [out] may be NULL; if non-NULL, should be empty when called. - * Will be set to a vector of statistics corresponding to the output clusters. - * @param assignments_out [out] may be NULL; if non-NULL, will be set to a vector of - * same size as "points", which says for each point which cluster - * it is assigned to. - * @param cfg [in] configuration class specifying options to the algorithm. - * @return Returns the objective function improvement versus everything being - * in the same cluster. - * - */ -BaseFloat ClusterKMeans(const std::vector<Clusterable*> &points, - int32 num_clust, // exact number of clusters - std::vector<Clusterable*> *clusters_out, // may be NULL - std::vector<int32> *assignments_out, // may be NULL - ClusterKMeansOptions cfg = ClusterKMeansOptions()); - -struct TreeClusterOptions { - ClusterKMeansOptions kmeans_cfg; - int32 branch_factor; - BaseFloat thresh; // Objf change: if >0, may be used to control number of leaves. - TreeClusterOptions() - : kmeans_cfg(), branch_factor(2), thresh(0) { - kmeans_cfg.verbose = false; - } -}; - -/** TreeCluster is a top-down clustering algorithm, using a binary tree (not - * necessarily balanced). Returns objf improvement versus having all points - * in one cluster. The algorithm is: - * - Initialize to 1 cluster (tree with 1 node). - * - Maintain, for each cluster, a "best-binary-split" (using ClusterKMeans - * to do so). Always split the highest scoring cluster, until we can do no - * more splits. - * - * @param points [in] Data points to be clustered - * @param max_clust [in] Maximum number of clusters (you will get exactly this number, - * if there are at least this many points, except if you set the - * cfg.thresh value nonzero, in which case that threshold may limit - * the number of clusters. - * @param clusters_out [out] If non-NULL, will be set to the a vector whose first - * (*num_leaves_out) elements are the leaf clusters, and whose - * subsequent elements are the nonleaf nodes in the tree, in - * topological order with the root node last. Must be empty vector - * when this function is called. - * @param assignments_out [out] If non-NULL, will be set to a vector to a vector the - * same size as "points", where assignments[i] is the leaf node index i - * to which the i'th point gets clustered. - * @param clust_assignments_out [out] If non-NULL, will be set to a vector the same size - * as clusters_out which says for each node (leaf or nonleaf), the - * index of its parent. For the root node (which is last), - * assignments_out[i] == i. For each i, assignments_out[i]>=i, i.e. - * any node's parent is higher numbered than itself. If you don't need - * this information, consider using instead the ClusterTopDown function. - * @param num_leaves_out [out] If non-NULL, will be set to the number of leaf nodes - * in the tree. - * @param cfg [in] Configuration object that controls clustering behavior. Most - * important value is "thresh", which provides an alternative mechanism - * [other than max_clust] to limit the number of leaves. - */ -BaseFloat TreeCluster(const std::vector<Clusterable*> &points, - int32 max_clust, // max number of leaf-level clusters. - std::vector<Clusterable*> *clusters_out, - std::vector<int32> *assignments_out, - std::vector<int32> *clust_assignments_out, - int32 *num_leaves_out, - TreeClusterOptions cfg = TreeClusterOptions()); - - -/** - * A clustering algorithm that internally uses TreeCluster, - * but does not give you the information about the structure of the tree. - * The "clusters_out" and "assignments_out" may be NULL if the outputs are not - * needed. - * - * @param points [in] points to be clustered (must be all non-NULL). - * @param max_clust [in] Maximum number of clusters (you will get exactly this number, - * if there are at least this many points, except if you set the - * cfg.thresh value nonzero, in which case that threshold may limit - * the number of clusters. - * @param clusters_out [out] may be NULL; if non-NULL, should be empty when called. - * Will be set to a vector of statistics corresponding to the output clusters. - * @param assignments_out [out] may be NULL; if non-NULL, will be set to a vector of - * same size as "points", which says for each point which cluster - * it is assigned to. - * @param cfg [in] Configuration object that controls clustering behavior. Most - * important value is "thresh", which provides an alternative mechanism - * [other than max_clust] to limit the number of leaves. -*/ -BaseFloat ClusterTopDown(const std::vector<Clusterable*> &points, - int32 max_clust, // max number of clusters. - std::vector<Clusterable*> *clusters_out, - std::vector<int32> *assignments_out, - TreeClusterOptions cfg = TreeClusterOptions()); - -/// @} end of "addtogroup clustering_group_algo" - -} // end namespace kaldi. - -#endif // KALDI_TREE_CLUSTER_UTILS_H_ |