Merge pull request #2 from yimmon/master

remove needless files
author: Ted Yin <[email protected]> 2015-08-14 17:42:26 +0800
committer: Ted Yin <[email protected]> 2015-08-14 17:42:26 +0800
commit: c3cffb58b9921d78753336421b52b9ffdaa5515c (patch)
tree: bfea20e97c200cf734021e3756d749c892e658a4 /kaldi_io/src/kaldi/tree/cluster-utils.h
parent: 10cce5f6a5c9e2f8e00d5a2a4d87c9cb7c26bf4c (diff)
parent: dfdd17afc2e984ec6c32ea01290f5c76309a456a (diff)
1 files changed, 0 insertions, 291 deletions
diff --git a/kaldi_io/src/kaldi/tree/cluster-utils.h b/kaldi_io/src/kaldi/tree/cluster-utils.h
deleted file mode 100644
index 55583a2..0000000
--- a/kaldi_io/src/kaldi/tree/cluster-utils.h
+++ /dev/null
@@ -1,291 +0,0 @@
-// tree/cluster-utils.h
-
-// Copyright 2012   Arnab Ghoshal
-// Copyright 2009-2011  Microsoft Corporation;  Saarland University
-
-// See ../../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABLITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef KALDI_TREE_CLUSTER_UTILS_H_
-#define KALDI_TREE_CLUSTER_UTILS_H_
-
-#include <vector>
-#include "matrix/matrix-lib.h"
-#include "itf/clusterable-itf.h"
-
-namespace kaldi {
-
-/// \addtogroup clustering_group_simple
-/// @{
-
-/// Returns the total objective function after adding up all the
-/// statistics in the vector (pointers may be NULL).
-BaseFloat SumClusterableObjf(const std::vector<Clusterable*> &vec);
-
-/// Returns the total normalizer (usually count) of the cluster (pointers may be NULL).
-BaseFloat SumClusterableNormalizer(const std::vector<Clusterable*> &vec);
-
-/// Sums stats (ptrs may be NULL). Returns NULL if no non-NULL stats present.
-Clusterable *SumClusterable(const std::vector<Clusterable*> &vec);
-
-/** Fills in any (NULL) holes in "stats" vector, with empty stats, because
- *  certain algorithms require non-NULL stats.  If "stats" nonempty, requires it
- *  to contain at least one non-NULL pointer that we can call Copy() on.
- */
-void EnsureClusterableVectorNotNull(std::vector<Clusterable*> *stats);
-
-
-/** Given stats and a vector "assignments" of the same size (that maps to
- * cluster indices), sums the stats up into "clusters."  It will add to any
- * stats already present in "clusters" (although typically "clusters" will be
- * empty when called), and it will extend with NULL pointers for any unseen
- * indices. Call EnsureClusterableStatsNotNull afterwards if you want to ensure
- * all non-NULL clusters. Pointer in "clusters" are owned by caller. Pointers in
- * "stats" do not have to be non-NULL.
- */
-void AddToClusters(const std::vector<Clusterable*> &stats,
-                   const std::vector<int32> &assignments,
-                   std::vector<Clusterable*> *clusters);
-
-
-/// AddToClustersOptimized does the same as AddToClusters (it sums up the stats
-/// within each cluster, except it uses the sum of all the stats ("total") to
-/// optimize the computation for speed, if possible.  This will generally only be
-/// a significant speedup in the case where there are just two clusters, which
-/// can happen in algorithms that are doing binary splits; the idea is that we
-/// sum up all the stats in one cluster (the one with the fewest points in it),
-/// and then subtract from the total.
-void AddToClustersOptimized(const std::vector<Clusterable*> &stats,
-                            const std::vector<int32> &assignments,
-                            const Clusterable &total,
-                            std::vector<Clusterable*> *clusters);
-
-/// @} end "addtogroup clustering_group_simple"
-
-/// \addtogroup clustering_group_algo
-/// @{
-
-// Note, in the algorithms below, it is assumed that the input "points" (which
-// is std::vector<Clusterable*>) is all non-NULL.
-
-/** A bottom-up clustering algorithm. There are two parameters that control how
- *  many clusters we get: a "max_merge_thresh" which is a threshold for merging
- *  clusters, and a min_clust which puts a floor on the number of clusters we want. Set
- *  max_merge_thresh = large to use the min_clust only, or min_clust to 0 to use
- *  the max_merge_thresh only.
- *
- *  The algorithm is:
- *  \code
- *      while (num-clusters > min_clust && smallest_merge_cost <= max_merge_thresh)
- *          merge the closest two clusters.
- *  \endcode
- *
- *  @param points [in] Points to be clustered (may not contain NULL pointers)
- *  @param thresh [in] Threshold on cost change from merging clusters; clusters
- *               won't be merged if the cost is more than this
- *  @param min_clust [in] Minimum number of clusters desired; we'll stop merging
- *                  after reaching this number.
- *  @param clusters_out [out] If non-NULL, will be set to a vector of size equal
- *                 to the number of output clusters, containing the clustered
- *                 statistics.  Must be empty when called.
- *  @param assignments_out [out] If non-NULL, will be resized to the number of
- *                 points, and each element is the index of the cluster that point
- *                 was assigned to.
- *  @return Returns the total objf change relative to all clusters being separate, which is
- *    a negative.  Note that this is not the same as what the other clustering algorithms return.
- */
-BaseFloat ClusterBottomUp(const std::vector<Clusterable*> &points,
-                          BaseFloat thresh,
-                          int32 min_clust,
-                          std::vector<Clusterable*> *clusters_out,
-                          std::vector<int32> *assignments_out);
-
-/** This is a bottom-up clustering where the points are pre-clustered in a set
- *  of compartments, such that only points in the same compartment are clustered
- *  together. The compartment and pair of points with the smallest merge cost
- *  is selected and the points are clustered. The result stays in the same
- *  compartment. The code does not merge compartments, and hence assumes that
- *  the number of compartments is smaller than the 'min_clust' option.
- *  The clusters in "clusters_out" are newly allocated and owned by the caller.
- */
-BaseFloat ClusterBottomUpCompartmentalized(
-    const std::vector< std::vector<Clusterable*> > &points, BaseFloat thresh,
-    int32 min_clust, std::vector< std::vector<Clusterable*> > *clusters_out,
-    std::vector< std::vector<int32> > *assignments_out);
-
-
-struct RefineClustersOptions {
-  int32 num_iters;  // must be >= 0.  If zero, does nothing.
-  int32 top_n;  // must be >= 2.
-  RefineClustersOptions() : num_iters(100), top_n(5) {}
-  RefineClustersOptions(int32 num_iters_in, int32 top_n_in)
-      : num_iters(num_iters_in), top_n(top_n_in) {}
-  // include Write and Read functions because this object gets written/read as
-  // part of the QuestionsForKeyOptions class.
-  void Write(std::ostream &os, bool binary) const;
-  void Read(std::istream &is, bool binary);
-};
-
-/** RefineClusters is mainly used internally by other clustering algorithms.
- *
- *  It starts with a given assignment of points to clusters and
- *  keeps trying to improve it by moving points from cluster to cluster, up to
- *  a maximum number of iterations.
- *
- *  "clusters" and "assignments" are both input and output variables, and so
- *  both MUST be non-NULL.
- *
- *  "top_n" (>=2) is a pruning value: more is more exact, fewer is faster. The
- *  algorithm initially finds the "top_n" closest clusters to any given point,
- *  and from that point only consider move to those "top_n" clusters. Since
- *  RefineClusters is called multiple times from ClusterKMeans (for instance),
- *  this is not really a limitation.
- */
-BaseFloat RefineClusters(const std::vector<Clusterable*> &points,
-                         std::vector<Clusterable*> *clusters /*non-NULL*/,
-                         std::vector<int32> *assignments /*non-NULL*/,
-                         RefineClustersOptions cfg = RefineClustersOptions());
-
-struct ClusterKMeansOptions {
-  RefineClustersOptions refine_cfg;
-  int32 num_iters;
-  int32 num_tries;  // if >1, try whole procedure >once and pick best.
-  bool verbose;
-  ClusterKMeansOptions()
-      : refine_cfg(), num_iters(20), num_tries(2), verbose(true)  {}
-};
-
-/** ClusterKMeans is a K-means-like clustering algorithm. It starts with
- *  pseudo-random initialization of points to clusters and uses RefineClusters
- *  to iteratively improve the cluster assignments.  It does this for
- *  multiple iterations and picks the result with the best objective function.
- *
- *
- *  ClusterKMeans implicitly uses Rand(). It will not necessarily return
- *  the same value on different calls.  Use sRand() if you want consistent
- *  results.
- *  The algorithm used in ClusterKMeans is a "k-means-like" algorithm that tries
- *  to be as efficient as possible.  Firstly, since the algorithm it uses
- *  includes random initialization, it tries the whole thing cfg.num_tries times
- *  and picks the one with the best objective function.  Each try, it does as
- *  follows: it randomly initializes points to clusters, and then for
- *  cfg.num_iters iterations it calls RefineClusters().  The options to
- *  RefineClusters() are given by cfg.refine_cfg.  Calling RefineClusters once
- *  will always be at least as good as doing one iteration of reassigning points to
- *  clusters, but will generally be quite a bit better (without taking too
- *  much extra time).
- *
- *  @param points [in]  points to be clustered (must be all non-NULL).
- *  @param num_clust [in] number of clusters requested (it will always return exactly
- *                 this many, or will fail if num_clust > points.size()).
- *  @param clusters_out [out] may be NULL; if non-NULL, should be empty when called.
- *          Will be set to a vector of statistics corresponding to the output clusters.
- *  @param assignments_out [out] may be NULL; if non-NULL, will be set to a vector of
- *             same size as "points", which says for each point which cluster
- *              it is assigned to.
- *  @param cfg [in] configuration class specifying options to the algorithm.
- *  @return Returns the objective function improvement versus everything being
- *     in the same cluster.
- *
- */
-BaseFloat ClusterKMeans(const std::vector<Clusterable*> &points,
-                        int32 num_clust,  // exact number of clusters
-                        std::vector<Clusterable*> *clusters_out,  // may be NULL
-                        std::vector<int32> *assignments_out,  // may be NULL
-                        ClusterKMeansOptions cfg = ClusterKMeansOptions());
-
-struct TreeClusterOptions  {
-  ClusterKMeansOptions kmeans_cfg;
-  int32 branch_factor;
-  BaseFloat thresh;  // Objf change: if >0, may be used to control number of leaves.
-  TreeClusterOptions()
-      : kmeans_cfg(), branch_factor(2), thresh(0) {
-    kmeans_cfg.verbose = false;
-  }
-};
-
-/** TreeCluster is a top-down clustering algorithm, using a binary tree (not
- *  necessarily balanced). Returns objf improvement versus having all points
- *  in one cluster.  The algorithm is:
- *     - Initialize to 1 cluster (tree with 1 node).
- *     - Maintain, for each cluster, a "best-binary-split" (using ClusterKMeans
- *       to do so). Always split the highest scoring cluster, until we can do no
- *       more splits.
- *
- *  @param points [in] Data points to be clustered
- *  @param max_clust  [in] Maximum number of clusters (you will get exactly this number,
- *                if there are at least this many points, except if you set the
- *                cfg.thresh value nonzero, in which case that threshold may limit
- *                the number of clusters.
- *  @param clusters_out [out] If non-NULL, will be set to the a vector whose first
- *                (*num_leaves_out) elements are the leaf clusters, and whose
- *                subsequent elements are the nonleaf nodes in the tree, in
- *                topological order with the root node last.  Must be empty vector
- *                when this function is called.
- *  @param assignments_out [out] If non-NULL, will be set to a vector to a vector the
- *               same size as "points", where assignments[i] is the leaf node index i
- *               to which the i'th point gets clustered.
- *  @param clust_assignments_out [out] If non-NULL, will be set to a vector the same size
- *                as clusters_out  which says for each node (leaf or nonleaf), the
- *                index of its parent.  For the root node (which is last),
- *                assignments_out[i] == i.  For each i, assignments_out[i]>=i, i.e.
- *                any node's parent is higher numbered than itself.  If you don't need
- *                this information, consider using instead the ClusterTopDown function.
- *  @param num_leaves_out [out] If non-NULL, will be set to the number of leaf nodes
- *                in the tree.
- *  @param cfg [in] Configuration object that controls clustering behavior.  Most
- *                 important value is "thresh", which provides an alternative mechanism
- *                 [other than max_clust] to limit the number of leaves.
- */
-BaseFloat TreeCluster(const std::vector<Clusterable*> &points,
-                      int32 max_clust,  // max number of leaf-level clusters.
-                      std::vector<Clusterable*> *clusters_out,
-                      std::vector<int32> *assignments_out,
-                      std::vector<int32> *clust_assignments_out,
-                      int32 *num_leaves_out,
-                      TreeClusterOptions cfg = TreeClusterOptions());
-
-
-/**
- *  A clustering algorithm that internally uses TreeCluster,
- *  but does not give you the information about the structure of the tree.
- *  The "clusters_out" and "assignments_out" may be NULL if the outputs are not
- *  needed.
- *
- *  @param points [in]  points to be clustered (must be all non-NULL).
- *  @param max_clust [in] Maximum number of clusters (you will get exactly this number,
- *                if there are at least this many points, except if you set the
- *                cfg.thresh value nonzero, in which case that threshold may limit
- *                the number of clusters.
- *  @param clusters_out [out] may be NULL; if non-NULL, should be empty when called.
- *           Will be set to a vector of statistics corresponding to the output clusters.
- *  @param assignments_out [out] may be NULL; if non-NULL, will be set to a vector of
- *           same size as "points", which says for each point which cluster
- *            it is assigned to.
- *  @param cfg [in] Configuration object that controls clustering behavior.  Most
- *                important value is "thresh", which provides an alternative mechanism
- *                [other than max_clust] to limit the number of leaves.
-*/
-BaseFloat ClusterTopDown(const std::vector<Clusterable*> &points,
-                         int32 max_clust,  // max number of clusters.
-                         std::vector<Clusterable*> *clusters_out,
-                         std::vector<int32> *assignments_out,
-                         TreeClusterOptions cfg = TreeClusterOptions());
-
-/// @} end of "addtogroup clustering_group_algo"
-
-}  // end namespace kaldi.
-
-#endif  // KALDI_TREE_CLUSTER_UTILS_H_
author	Ted Yin <[email protected]>	2015-08-14 17:42:26 +0800
committer	Ted Yin <[email protected]>	2015-08-14 17:42:26 +0800
commit	c3cffb58b9921d78753336421b52b9ffdaa5515c (patch)
tree	bfea20e97c200cf734021e3756d749c892e658a4 /kaldi_io/src/kaldi/tree/cluster-utils.h
parent	10cce5f6a5c9e2f8e00d5a2a4d87c9cb7c26bf4c (diff)
parent	dfdd17afc2e984ec6c32ea01290f5c76309a456a (diff)