summaryrefslogtreecommitdiff
path: root/kaldi_io/src/kaldi/tree
diff options
context:
space:
mode:
authorDeterminant <ted.sybil@gmail.com>2015-08-14 11:51:42 +0800
committerDeterminant <ted.sybil@gmail.com>2015-08-14 11:51:42 +0800
commit96a32415ab43377cf1575bd3f4f2980f58028209 (patch)
tree30a2d92d73e8f40ac87b79f6f56e227bfc4eea6e /kaldi_io/src/kaldi/tree
parentc177a7549bd90670af4b29fa813ddea32cfe0f78 (diff)
add implementation for kaldi io (by ymz)
Diffstat (limited to 'kaldi_io/src/kaldi/tree')
-rw-r--r--kaldi_io/src/kaldi/tree/build-tree-questions.h133
-rw-r--r--kaldi_io/src/kaldi/tree/build-tree-utils.h324
-rw-r--r--kaldi_io/src/kaldi/tree/build-tree.h250
-rw-r--r--kaldi_io/src/kaldi/tree/cluster-utils.h291
-rw-r--r--kaldi_io/src/kaldi/tree/clusterable-classes.h158
-rw-r--r--kaldi_io/src/kaldi/tree/context-dep.h166
-rw-r--r--kaldi_io/src/kaldi/tree/event-map.h365
-rw-r--r--kaldi_io/src/kaldi/tree/tree-renderer.h84
8 files changed, 1771 insertions, 0 deletions
diff --git a/kaldi_io/src/kaldi/tree/build-tree-questions.h b/kaldi_io/src/kaldi/tree/build-tree-questions.h
new file mode 100644
index 0000000..a6bcfdd
--- /dev/null
+++ b/kaldi_io/src/kaldi/tree/build-tree-questions.h
@@ -0,0 +1,133 @@
+// tree/build-tree-questions.h
+
+// Copyright 2009-2011 Microsoft Corporation
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef KALDI_TREE_BUILD_TREE_QUESTIONS_H_
+#define KALDI_TREE_BUILD_TREE_QUESTIONS_H_
+
+#include "util/stl-utils.h"
+#include "tree/context-dep.h"
+
+namespace kaldi {
+
+
+/// \addtogroup tree_group
+/// @{
+/// Typedef for statistics to build trees.
+typedef std::vector<std::pair<EventType, Clusterable*> > BuildTreeStatsType;
+
+/// Typedef used when we get "all keys" from a set of stats-- used in specifying
+/// which kinds of questions to ask.
+typedef enum { kAllKeysInsistIdentical, kAllKeysIntersection, kAllKeysUnion } AllKeysType;
+
+/// @}
+
+/// \defgroup tree_group_questions Question sets for decision-tree clustering
+/// See \ref tree_internals (and specifically \ref treei_func_questions) for context.
+/// \ingroup tree_group
+/// @{
+
+/// QuestionsForKey is a class used to define the questions for a key,
+/// and also options that allow us to refine the question during tree-building
+/// (i.e. make a question specific to the location in the tree).
+/// The Questions class handles aggregating these options for a set
+/// of different keys.
+struct QuestionsForKey { // Configuration class associated with a particular key
+ // (of type EventKeyType). It also contains the questions themselves.
+ std::vector<std::vector<EventValueType> > initial_questions;
+ RefineClustersOptions refine_opts; // if refine_opts.max_iter == 0,
+ // we just pick from the initial questions.
+
+ QuestionsForKey(int32 num_iters = 5): refine_opts(num_iters, 2) {
+ // refine_cfg with 5 iters and top-n = 2 (this is no restriction because
+ // RefineClusters called with 2 clusters; would get set to that anyway as
+ // it's the only possible value for 2 clusters). User has to add questions.
+ // This config won't work as-is, as it has no questions.
+ }
+
+ void Check() const {
+ for (size_t i = 0;i < initial_questions.size();i++) KALDI_ASSERT(IsSorted(initial_questions[i]));
+ }
+
+ void Write(std::ostream &os, bool binary) const;
+ void Read(std::istream &is, bool binary);
+
+ // copy and assign allowed.
+};
+
+/// This class defines, for each EventKeyType, a set of initial questions that
+/// it tries and also a number of iterations for which to refine the questions to increase
+/// likelihood. It is perhaps a bit more than an options class, as it contains the
+/// actual questions.
+class Questions { // careful, this is a class.
+ public:
+ const QuestionsForKey &GetQuestionsOf(EventKeyType key) const {
+ std::map<EventKeyType, size_t>::const_iterator iter;
+ if ( (iter = key_idx_.find(key)) == key_idx_.end()) {
+ KALDI_ERR << "Questions: no options for key "<< key;
+ }
+ size_t idx = iter->second;
+ KALDI_ASSERT(idx < key_options_.size());
+ key_options_[idx]->Check();
+ return *(key_options_[idx]);
+ }
+ void SetQuestionsOf(EventKeyType key, const QuestionsForKey &options_of_key) {
+ options_of_key.Check();
+ if (key_idx_.count(key) == 0) {
+ key_idx_[key] = key_options_.size();
+ key_options_.push_back(new QuestionsForKey());
+ *(key_options_.back()) = options_of_key;
+ } else {
+ size_t idx = key_idx_[key];
+ KALDI_ASSERT(idx < key_options_.size());
+ *(key_options_[idx]) = options_of_key;
+ }
+ }
+ void GetKeysWithQuestions(std::vector<EventKeyType> *keys_out) const {
+ KALDI_ASSERT(keys_out != NULL);
+ CopyMapKeysToVector(key_idx_, keys_out);
+ }
+ const bool HasQuestionsForKey(EventKeyType key) const { return (key_idx_.count(key) != 0); }
+ ~Questions() { kaldi::DeletePointers(&key_options_); }
+
+
+ /// Initializer with arguments. After using this you would have to set up the config for each key you
+ /// are going to use, or use InitRand().
+ Questions() { }
+
+
+ /// InitRand attempts to generate "reasonable" random questions. Only
+ /// of use for debugging. This initializer creates a config that is
+ /// ready to use.
+ /// e.g. num_iters_refine = 0 means just use stated questions (if >1, will use
+ /// different questions at each split of the tree).
+ void InitRand(const BuildTreeStatsType &stats, int32 num_quest, int32 num_iters_refine, AllKeysType all_keys_type);
+
+ void Write(std::ostream &os, bool binary) const;
+ void Read(std::istream &is, bool binary);
+ private:
+ std::vector<QuestionsForKey*> key_options_;
+ std::map<EventKeyType, size_t> key_idx_;
+ KALDI_DISALLOW_COPY_AND_ASSIGN(Questions);
+};
+
+/// @}
+
+}// end namespace kaldi
+
+#endif // KALDI_TREE_BUILD_TREE_QUESTIONS_H_
diff --git a/kaldi_io/src/kaldi/tree/build-tree-utils.h b/kaldi_io/src/kaldi/tree/build-tree-utils.h
new file mode 100644
index 0000000..464fc6b
--- /dev/null
+++ b/kaldi_io/src/kaldi/tree/build-tree-utils.h
@@ -0,0 +1,324 @@
+// tree/build-tree-utils.h
+
+// Copyright 2009-2011 Microsoft Corporation
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef KALDI_TREE_BUILD_TREE_UTILS_H_
+#define KALDI_TREE_BUILD_TREE_UTILS_H_
+
+#include "tree/build-tree-questions.h"
+
+// build-tree-questions.h needed for this typedef:
+// typedef std::vector<std::pair<EventType, Clusterable*> > BuildTreeStatsType;
+// and for other #includes.
+
+namespace kaldi {
+
+
+/// \defgroup tree_group_lower Low-level functions for manipulating statistics and event-maps
+/// See \ref tree_internals and specifically \ref treei_func for context.
+/// \ingroup tree_group
+///
+/// @{
+
+
+
+/// This frees the Clusterable* pointers in "stats", where non-NULL, and sets them to NULL.
+/// Does not delete the pointer "stats" itself.
+void DeleteBuildTreeStats(BuildTreeStatsType *stats);
+
+/// Writes BuildTreeStats object. This works even if pointers are NULL.
+void WriteBuildTreeStats(std::ostream &os, bool binary,
+ const BuildTreeStatsType &stats);
+
+/// Reads BuildTreeStats object. The "example" argument must be of the same
+/// type as the stats on disk, and is needed for access to the correct "Read"
+/// function. It was organized this way for easier extensibility (so adding new
+/// Clusterable derived classes isn't painful)
+void ReadBuildTreeStats(std::istream &is, bool binary,
+ const Clusterable &example, BuildTreeStatsType *stats);
+
+/// Convenience function e.g. to work out possible values of the phones from just the stats.
+/// Returns true if key was always defined inside the stats.
+/// May be used with and == NULL to find out of key was always defined.
+bool PossibleValues(EventKeyType key, const BuildTreeStatsType &stats,
+ std::vector<EventValueType> *ans);
+
+
+/// Splits stats according to the EventMap, indexing them at output by the
+/// leaf type. A utility function. NOTE-- pointers in stats_out point to
+/// the same memory location as those in stats. No copying of Clusterable*
+/// objects happens. Will add to stats in stats_out if non-empty at input.
+/// This function may increase the size of vector stats_out as necessary
+/// to accommodate stats, but will never decrease the size.
+void SplitStatsByMap(const BuildTreeStatsType &stats_in, const EventMap &e,
+ std::vector<BuildTreeStatsType> *stats_out);
+
+/// SplitStatsByKey splits stats up according to the value of a particular key,
+/// which must be always defined and nonnegative. Like MapStats. Pointers to
+/// Clusterable* in stats_out are not newly allocated-- they are the same as the
+/// ones in stats_in. Generally they will still be owned at stats_in (user can
+/// decide where to allocate ownership).
+void SplitStatsByKey(const BuildTreeStatsType &stats_in, EventKeyType key,
+ std::vector<BuildTreeStatsType> *stats_out);
+
+
+/// Converts stats from a given context-window (N) and central-position (P) to a
+/// different N and P, by possibly reducing context. This function does a job
+/// that's quite specific to the "normal" stats format we use. See \ref
+/// tree_window for background. This function may delete some keys and change
+/// others, depending on the N and P values. It expects that at input, all keys
+/// will either be -1 or lie between 0 and oldN-1. At output, keys will be
+/// either -1 or between 0 and newN-1.
+/// Returns false if we could not convert the stats (e.g. because newN is larger
+/// than oldN).
+bool ConvertStats(int32 oldN, int32 oldP, int32 newN, int32 newP,
+ BuildTreeStatsType *stats);
+
+
+/// FilterStatsByKey filters the stats according the value of a specified key.
+/// If include_if_present == true, it only outputs the stats whose key is in
+/// "values"; otherwise it only outputs the stats whose key is not in "values".
+/// At input, "values" must be sorted and unique, and all stats in "stats_in"
+/// must have "key" defined. At output, pointers to Clusterable* in stats_out
+/// are not newly allocated-- they are the same as the ones in stats_in.
+void FilterStatsByKey(const BuildTreeStatsType &stats_in,
+ EventKeyType key,
+ std::vector<EventValueType> &values,
+ bool include_if_present, // true-> retain only if in "values",
+ // false-> retain only if not in "values".
+ BuildTreeStatsType *stats_out);
+
+
+/// Sums stats, or returns NULL stats_in has no non-NULL stats.
+/// Stats are newly allocated, owned by caller.
+Clusterable *SumStats(const BuildTreeStatsType &stats_in);
+
+/// Sums the normalizer [typically, data-count] over the stats.
+BaseFloat SumNormalizer(const BuildTreeStatsType &stats_in);
+
+/// Sums the objective function over the stats.
+BaseFloat SumObjf(const BuildTreeStatsType &stats_in);
+
+
+/// Sum a vector of stats. Leaves NULL as pointer if no stats available.
+/// The pointers in stats_out are owned by caller. At output, there may be
+/// NULLs in the vector stats_out.
+void SumStatsVec(const std::vector<BuildTreeStatsType> &stats_in, std::vector<Clusterable*> *stats_out);
+
+/// Cluster the stats given the event map return the total objf given those clusters.
+BaseFloat ObjfGivenMap(const BuildTreeStatsType &stats_in, const EventMap &e);
+
+
+/// FindAllKeys puts in *keys the (sorted, unique) list of all key identities in the stats.
+/// If type == kAllKeysInsistIdentical, it will insist that this set of keys is the same for all the
+/// stats (else exception is thrown).
+/// if type == kAllKeysIntersection, it will return the smallest common set of keys present in
+/// the set of stats
+/// if type== kAllKeysUnion (currently probably not so useful since maps will return "undefined"
+/// if key is not present), it will return the union of all the keys present in the stats.
+void FindAllKeys(const BuildTreeStatsType &stats, AllKeysType keys_type,
+ std::vector<EventKeyType> *keys);
+
+
+/// @}
+
+
+/**
+ \defgroup tree_group_intermediate Intermediate-level functions used in building the tree
+ These functions are are used in top-level tree-building code (\ref tree_group_top); see
+ \ref tree_internals for documentation.
+ \ingroup tree_group
+ @{
+*/
+
+
+/// Returns a tree with just one node. Used @ start of tree-building process.
+/// Not really used in current recipes.
+inline EventMap *TrivialTree(int32 *num_leaves) {
+ KALDI_ASSERT(*num_leaves == 0); // in envisaged usage.
+ return new ConstantEventMap( (*num_leaves)++ );
+}
+
+/// DoTableSplit does a complete split on this key (e.g. might correspond to central phone
+/// (key = P-1), or HMM-state position (key == kPdfClass == -1). Stats used to work out possible
+/// values of the event. "num_leaves" is used to allocate new leaves. All stats must have
+/// this key defined, or this function will crash.
+EventMap *DoTableSplit(const EventMap &orig, EventKeyType key,
+ const BuildTreeStatsType &stats, int32 *num_leaves);
+
+
+/// DoTableSplitMultiple does a complete split on all the keys, in order from keys[0],
+/// keys[1]
+/// and so on. The stats are used to work out possible values corresponding to the key.
+/// "num_leaves" is used to allocate new leaves. All stats must have
+/// the keys defined, or this function will crash.
+/// Returns a newly allocated event map.
+EventMap *DoTableSplitMultiple(const EventMap &orig,
+ const std::vector<EventKeyType> &keys,
+ const BuildTreeStatsType &stats,
+ int32 *num_leaves);
+
+
+/// "ClusterEventMapGetMapping" clusters the leaves of the EventMap, with "thresh" a delta-likelihood
+/// threshold to control how many leaves we combine (might be the same as the delta-like
+/// threshold used in splitting.
+// The function returns the #leaves we combined. The same leaf-ids of the leaves being clustered
+// will be used for the clustered leaves (but other than that there is no special rule which
+// leaf-ids should be used at output).
+// It outputs the mapping for leaves, in "mapping", which may be empty at the start
+// but may also contain mappings for other parts of the tree, which must contain
+// disjoint leaves from this part. This is so that Cluster can
+// be called multiple times for sub-parts of the tree (with disjoint sets of leaves),
+// e.g. if we want to avoid sharing across phones. Afterwards you can use Copy function
+// of EventMap to apply the mapping, i.e. call e_in.Copy(mapping) to get the new map.
+// Note that the application of Cluster creates gaps in the leaves. You should then
+// call RenumberEventMap(e_in.Copy(mapping), num_leaves).
+// *If you only want to cluster a subset of the leaves (e.g. just non-silence, or just
+// a particular phone, do this by providing a set of "stats" that correspond to just
+// this subset of leaves*. Leaves with no stats will not be clustered.
+// See build-tree.cc for an example of usage.
+int ClusterEventMapGetMapping(const EventMap &e_in, const BuildTreeStatsType &stats,
+ BaseFloat thresh, std::vector<EventMap*> *mapping);
+
+/// This is as ClusterEventMapGetMapping but a more convenient interface
+/// that exposes less of the internals. It uses a bottom-up clustering to
+/// combine the leaves, until the log-likelihood decrease from combinging two
+/// leaves exceeds the threshold.
+EventMap *ClusterEventMap(const EventMap &e_in, const BuildTreeStatsType &stats,
+ BaseFloat thresh, int32 *num_removed);
+
+/// This is as ClusterEventMap, but first splits the stats on the keys specified
+/// in "keys" (e.g. typically keys = [ -1, P ]), and only clusters within the
+/// classes defined by that splitting.
+/// Note-- leaves will be non-consecutive at output, use RenumberEventMap.
+EventMap *ClusterEventMapRestrictedByKeys(const EventMap &e_in,
+ const BuildTreeStatsType &stats,
+ BaseFloat thresh,
+ const std::vector<EventKeyType> &keys,
+ int32 *num_removed);
+
+
+/// This version of ClusterEventMapRestricted restricts the clustering to only
+/// allow things that "e_restrict" maps to the same value to be clustered
+/// together.
+EventMap *ClusterEventMapRestrictedByMap(const EventMap &e_in,
+ const BuildTreeStatsType &stats,
+ BaseFloat thresh,
+ const EventMap &e_restrict,
+ int32 *num_removed);
+
+
+/// RenumberEventMap [intended to be used after calling ClusterEventMap] renumbers
+/// an EventMap so its leaves are consecutive.
+/// It puts the number of leaves in *num_leaves. If later you need the mapping of
+/// the leaves, modify the function and add a new argument.
+EventMap *RenumberEventMap(const EventMap &e_in, int32 *num_leaves);
+
+/// This function remaps the event-map leaves using this mapping,
+/// indexed by the number at leaf.
+EventMap *MapEventMapLeaves(const EventMap &e_in,
+ const std::vector<int32> &mapping);
+
+
+
+/// ShareEventMapLeaves performs a quite specific function that allows us to
+/// generate trees where, for a certain list of phones, and for all states in
+/// the phone, all the pdf's are shared.
+/// Each element of "values" contains a list of phones (may be just one phone),
+/// all states of which we want shared together). Typically at input, "key" will
+/// equal P, the central-phone position, and "values" will contain just one
+/// list containing the silence phone.
+/// This function renumbers the event map leaves after doing the sharing, to
+/// make the event-map leaves contiguous.
+EventMap *ShareEventMapLeaves(const EventMap &e_in, EventKeyType key,
+ std::vector<std::vector<EventValueType> > &values,
+ int32 *num_leaves);
+
+
+
+/// Does a decision-tree split at the leaves of an EventMap.
+/// @param orig [in] The EventMap whose leaves we want to split. [may be either a trivial or a
+/// non-trivial one].
+/// @param stats [in] The statistics for splitting the tree; if you do not want a particular
+/// subset of leaves to be split, make sure the stats corresponding to those leaves
+/// are not present in "stats".
+/// @param qcfg [in] Configuration class that contains initial questions (e.g. sets of phones)
+/// for each key and says whether to refine these questions during tree building.
+/// @param thresh [in] A log-likelihood threshold (e.g. 300) that can be used to
+/// limit the number of leaves; you can use zero and set max_leaves instead.
+/// @param max_leaves [in] Will stop leaves being split after they reach this number.
+/// @param num_leaves [in,out] A pointer used to allocate leaves; always corresponds to the
+/// current number of leaves (is incremented when this is increased).
+/// @param objf_impr_out [out] If non-NULL, will be set to the objective improvement due to splitting
+/// (not normalized by the number of frames).
+/// @param smallest_split_change_out If non-NULL, will be set to the smallest objective-function
+/// improvement that we got from splitting any leaf; useful to provide a threshold
+/// for ClusterEventMap.
+/// @return The EventMap after splitting is returned; pointer is owned by caller.
+EventMap *SplitDecisionTree(const EventMap &orig,
+ const BuildTreeStatsType &stats,
+ Questions &qcfg,
+ BaseFloat thresh,
+ int32 max_leaves, // max_leaves<=0 -> no maximum.
+ int32 *num_leaves,
+ BaseFloat *objf_impr_out,
+ BaseFloat *smallest_split_change_out);
+
+/// CreateRandomQuestions will initialize a Questions randomly, in a reasonable
+/// way [for testing purposes, or when hand-designed questions are not available].
+/// e.g. num_quest = 5 might be a reasonable value if num_iters > 0, or num_quest = 20 otherwise.
+void CreateRandomQuestions(const BuildTreeStatsType &stats, int32 num_quest, Questions *cfg_out);
+
+
+/// FindBestSplitForKey is a function used in DoDecisionTreeSplit.
+/// It finds the best split for this key, given these stats.
+/// It will return 0 if the key was not always defined for the stats.
+BaseFloat FindBestSplitForKey(const BuildTreeStatsType &stats,
+ const Questions &qcfg,
+ EventKeyType key,
+ std::vector<EventValueType> *yes_set);
+
+
+/// GetStubMap is used in tree-building functions to get the initial
+/// to-states map, before the decision-tree-building process. It creates
+/// a simple map that splits on groups of phones. For the set of phones in
+/// phone_sets[i] it creates either: if share_roots[i] == true, a single
+/// leaf node, or if share_roots[i] == false, separate root nodes for
+/// each HMM-position (it goes up to the highest position for any
+/// phone in the set, although it will warn if you share roots between
+/// phones with different numbers of states, which is a weird thing to
+/// do but should still work. If any phone is present
+/// in "phone_sets" but "phone2num_pdf_classes" does not map it to a length,
+/// it is an error. Note that the behaviour of the resulting map is
+/// undefined for phones not present in "phone_sets".
+/// At entry, this function should be called with (*num_leaves == 0).
+/// It will number the leaves starting from (*num_leaves).
+
+EventMap *GetStubMap(int32 P,
+ const std::vector<std::vector<int32> > &phone_sets,
+ const std::vector<int32> &phone2num_pdf_classes,
+ const std::vector<bool> &share_roots, // indexed by index into phone_sets.
+ int32 *num_leaves);
+/// Note: GetStubMap with P = 0 can be used to get a standard monophone system.
+
+/// @}
+
+
+}// end namespace kaldi
+
+#endif
diff --git a/kaldi_io/src/kaldi/tree/build-tree.h b/kaldi_io/src/kaldi/tree/build-tree.h
new file mode 100644
index 0000000..37bb108
--- /dev/null
+++ b/kaldi_io/src/kaldi/tree/build-tree.h
@@ -0,0 +1,250 @@
+// tree/build-tree.h
+
+// Copyright 2009-2011 Microsoft Corporation
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef KALDI_TREE_BUILD_TREE_H_
+#define KALDI_TREE_BUILD_TREE_H_
+
+// The file build-tree.h contains outer-level routines used in tree-building
+// and related tasks, that are directly called by the command-line tools.
+
+#include "tree/build-tree-utils.h"
+#include "tree/context-dep.h"
+namespace kaldi {
+
+/// \defgroup tree_group_top Top-level tree-building functions
+/// See \ref tree_internals for context.
+/// \ingroup tree_group
+/// @{
+
+// Note, in tree_group_top we also include AccumulateTreeStats, in
+// ../hmm/tree-accu.h (it has some extra dependencies so we didn't
+// want to include it here).
+
+/**
+ * BuildTree is the normal way to build a set of decision trees.
+ * The sets "phone_sets" dictate how we set up the roots of the decision trees.
+ * each set of phones phone_sets[i] has shared decision-tree roots, and if
+ * the corresponding variable share_roots[i] is true, the root will be shared
+ * for the different HMM-positions in the phone. All phones in "phone_sets"
+ * should be in the stats (use FixUnseenPhones to ensure this).
+ * if for any i, do_split[i] is false, we will not do any tree splitting for
+ * phones in that set.
+ * @param qopts [in] Questions options class, contains questions for each key
+ * (e.g. each phone position)
+ * @param phone_sets [in] Each element of phone_sets is a set of phones whose
+ * roots are shared together (prior to decision-tree splitting).
+ * @param phone2num_pdf_classes [in] A map from phones to the number of
+ * \ref pdf_class "pdf-classes"
+ * in the phone (this info is derived from the HmmTopology object)
+ * @param share_roots [in] A vector the same size as phone_sets; says for each
+ * phone set whether the root should be shared among all the
+ * pdf-classes or not.
+ * @param do_split [in] A vector the same size as phone_sets; says for each
+ * phone set whether decision-tree splitting should be done
+ * (generally true for non-silence phones).
+ * @param stats [in] The statistics used in tree-building.
+ * @param thresh [in] Threshold used in decision-tree splitting (e.g. 1000),
+ * or you may use 0 in which case max_leaves becomes the
+ * constraint.
+ * @param max_leaves [in] Maximum number of leaves it will create; set this
+ * to a large number if you want to just specify "thresh".
+ * @param cluster_thresh [in] Threshold for clustering leaves after decision-tree
+ * splitting (only within each phone-set); leaves will be combined
+ * if log-likelihood change is less than this. A value about equal
+ * to "thresh" is suitable
+ * if thresh != 0; otherwise, zero will mean no clustering is done,
+ * or a negative value (e.g. -1) sets it to the smallest likelihood
+ * change seen during the splitting algorithm; this typically causes
+ * about a 20% reduction in the number of leaves.
+
+ * @param P [in] The central position of the phone context window, e.g. 1 for a
+ * triphone system.
+ * @return Returns a pointer to an EventMap object that is the tree.
+
+*/
+
+EventMap *BuildTree(Questions &qopts,
+ const std::vector<std::vector<int32> > &phone_sets,
+ const std::vector<int32> &phone2num_pdf_classes,
+ const std::vector<bool> &share_roots,
+ const std::vector<bool> &do_split,
+ const BuildTreeStatsType &stats,
+ BaseFloat thresh,
+ int32 max_leaves,
+ BaseFloat cluster_thresh, // typically == thresh. If negative, use smallest split.
+ int32 P);
+
+
+/**
+ *
+ * BuildTreeTwoLevel builds a two-level tree, useful for example in building tied mixture
+ * systems with multiple codebooks. It first builds a small tree by splitting to
+ * "max_leaves_first". It then splits at the leaves of "max_leaves_first" (think of this
+ * as creating multiple little trees at the leaves of the first tree), until the total
+ * number of leaves reaches "max_leaves_second". It then outputs the second tree, along
+ * with a mapping from the leaf-ids of the second tree to the leaf-ids of the first tree.
+ * Note that the interface is similar to BuildTree, and in fact it calls BuildTree
+ * internally.
+ *
+ * The sets "phone_sets" dictate how we set up the roots of the decision trees.
+ * each set of phones phone_sets[i] has shared decision-tree roots, and if
+ * the corresponding variable share_roots[i] is true, the root will be shared
+ * for the different HMM-positions in the phone. All phones in "phone_sets"
+ * should be in the stats (use FixUnseenPhones to ensure this).
+ * if for any i, do_split[i] is false, we will not do any tree splitting for
+ * phones in that set.
+ *
+ * @param qopts [in] Questions options class, contains questions for each key
+ * (e.g. each phone position)
+ * @param phone_sets [in] Each element of phone_sets is a set of phones whose
+ * roots are shared together (prior to decision-tree splitting).
+ * @param phone2num_pdf_classes [in] A map from phones to the number of
+ * \ref pdf_class "pdf-classes"
+ * in the phone (this info is derived from the HmmTopology object)
+ * @param share_roots [in] A vector the same size as phone_sets; says for each
+ * phone set whether the root should be shared among all the
+ * pdf-classes or not.
+ * @param do_split [in] A vector the same size as phone_sets; says for each
+ * phone set whether decision-tree splitting should be done
+ * (generally true for non-silence phones).
+ * @param stats [in] The statistics used in tree-building.
+ * @param max_leaves_first [in] Maximum number of leaves it will create in first
+ * level of decision tree.
+ * @param max_leaves_second [in] Maximum number of leaves it will create in second
+ * level of decision tree. Must be > max_leaves_first.
+ * @param cluster_leaves [in] Boolean value; if true, we post-cluster the leaves produced
+ * in the second level of decision-tree split; if false, we don't.
+ * The threshold for post-clustering is the log-like change of the last
+ * decision-tree split; this typically causes about a 20% reduction in
+ * the number of leaves.
+ * @param P [in] The central position of the phone context window, e.g. 1 for a
+ * triphone system.
+ * @param leaf_map [out] Will be set to be a mapping from the leaves of the
+ * "big" tree to the leaves of the "little" tree, which you can
+ * view as cluster centers.
+ * @return Returns a pointer to an EventMap object that is the (big) tree.
+
+*/
+
+EventMap *BuildTreeTwoLevel(Questions &qopts,
+ const std::vector<std::vector<int32> > &phone_sets,
+ const std::vector<int32> &phone2num_pdf_classes,
+ const std::vector<bool> &share_roots,
+ const std::vector<bool> &do_split,
+ const BuildTreeStatsType &stats,
+ int32 max_leaves_first,
+ int32 max_leaves_second,
+ bool cluster_leaves,
+ int32 P,
+ std::vector<int32> *leaf_map);
+
+
+/// GenRandStats generates random statistics of the form used by BuildTree.
+/// It tries to do so in such a way that they mimic "real" stats. The event keys
+/// and their corresponding values are:
+/// - key == -1 == kPdfClass -> pdf-class, generally corresponds to
+/// zero-based position in HMM (0, 1, 2 .. hmm_lengths[phone]-1)
+/// - key == 0 -> phone-id of left-most context phone.
+/// - key == 1 -> phone-id of one-from-left-most context phone.
+/// - key == P-1 -> phone-id of central phone.
+/// - key == N-1 -> phone-id of right-most context phone.
+/// GenRandStats is useful only for testing but it serves to document the format of
+/// stats used by BuildTreeDefault.
+/// if is_ctx_dep[phone] is set to false, GenRandStats will not define the keys for
+/// other than the P-1'th phone.
+
+/// @param dim [in] dimension of features.
+/// @param num_stats [in] approximate number of separate phones-in-context wanted.
+/// @param N [in] context-size (typically 3)
+/// @param P [in] central-phone position in zero-based numbering (typically 1)
+/// @param phone_ids [in] integer ids of phones
+/// @param hmm_lengths [in] lengths of hmm for phone, indexed by phone.
+/// @param is_ctx_dep [in] boolean array indexed by phone, saying whether each phone
+/// is context dependent.
+/// @param ensure_all_phones_covered [in] Boolean argument: if true, GenRandStats
+/// ensures that every phone is seen at least once in the central position (P).
+/// @param stats_out [out] The statistics that this routine outputs.
+
+void GenRandStats(int32 dim, int32 num_stats, int32 N, int32 P,
+ const std::vector<int32> &phone_ids,
+ const std::vector<int32> &hmm_lengths,
+ const std::vector<bool> &is_ctx_dep,
+ bool ensure_all_phones_covered,
+ BuildTreeStatsType *stats_out);
+
+
+/// included here because it's used in some tree-building
+/// calling code. Reads an OpenFst symbl table,
+/// discards the symbols and outputs the integers
+void ReadSymbolTableAsIntegers(std::string filename,
+ bool include_eps,
+ std::vector<int32> *syms);
+
+
+
+/**
+ * Outputs sets of phones that are reasonable for questions
+ * to ask in the tree-building algorithm. These are obtained by tree
+ * clustering of the phones; for each node in the tree, all the leaves
+ * accessible from that node form one of the sets of phones.
+ * @param stats [in] The statistics as used for normal tree-building.
+ * @param phone_sets_in [in] All the phones, pre-partitioned into sets.
+ * The output sets will be various unions of these sets. These sets
+ * will normally correspond to "real phones", in cases where the phones
+ * have stress and position markings.
+ * @param all_pdf_classes_in [in] All the \ref pdf_class "pdf-classes"
+ * that we consider for clustering. In the normal case this is the singleton
+ * set {1}, which means that we only consider the central hmm-position
+ * of the standard 3-state HMM, for clustering purposes.
+ * @param P [in] The central position in the phone context window; normally
+ * 1 for triphone system.s
+ * @param questions_out [out] The questions (sets of phones) are output to here.
+ **/
+void AutomaticallyObtainQuestions(BuildTreeStatsType &stats,
+ const std::vector<std::vector<int32> > &phone_sets_in,
+ const std::vector<int32> &all_pdf_classes_in,
+ int32 P,
+ std::vector<std::vector<int32> > *questions_out);
+
+/// This function clusters the phones (or some initially specified sets of phones)
+/// into sets of phones, using a k-means algorithm. Useful, for example, in building
+/// simple models for purposes of adaptation.
+
+void KMeansClusterPhones(BuildTreeStatsType &stats,
+ const std::vector<std::vector<int32> > &phone_sets_in,
+ const std::vector<int32> &all_pdf_classes_in,
+ int32 P,
+ int32 num_classes,
+ std::vector<std::vector<int32> > *sets_out);
+
+/// Reads the roots file (throws on error). Format is lines like:
+/// "shared split 1 2 3 4",
+/// "not-shared not-split 5",
+/// and so on. The numbers are indexes of phones.
+void ReadRootsFile(std::istream &is,
+ std::vector<std::vector<int32> > *phone_sets,
+ std::vector<bool> *is_shared_root,
+ std::vector<bool> *is_split_root);
+
+
+/// @}
+