summaryrefslogtreecommitdiff
path: root/kaldi_io/src/kaldi/util/edit-distance-inl.h
diff options
context:
space:
mode:
authorDeterminant <[email protected]>2015-08-14 11:51:42 +0800
committerDeterminant <[email protected]>2015-08-14 11:51:42 +0800
commit96a32415ab43377cf1575bd3f4f2980f58028209 (patch)
tree30a2d92d73e8f40ac87b79f6f56e227bfc4eea6e /kaldi_io/src/kaldi/util/edit-distance-inl.h
parentc177a7549bd90670af4b29fa813ddea32cfe0f78 (diff)
add implementation for kaldi io (by ymz)
Diffstat (limited to 'kaldi_io/src/kaldi/util/edit-distance-inl.h')
-rw-r--r--kaldi_io/src/kaldi/util/edit-distance-inl.h189
1 files changed, 189 insertions, 0 deletions
diff --git a/kaldi_io/src/kaldi/util/edit-distance-inl.h b/kaldi_io/src/kaldi/util/edit-distance-inl.h
new file mode 100644
index 0000000..ebbfb71
--- /dev/null
+++ b/kaldi_io/src/kaldi/util/edit-distance-inl.h
@@ -0,0 +1,189 @@
+// util/edit-distance-inl.h
+
+// Copyright 2009-2011 Microsoft Corporation; Haihua Xu; Yanmin Qian
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+// http://www.apache.org/licenses/LICENSE-2.0
+
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef KALDI_UTIL_EDIT_DISTANCE_INL_H_
+#define KALDI_UTIL_EDIT_DISTANCE_INL_H_
+#include "util/stl-utils.h"
+
+
+namespace kaldi {
+
+template<class T>
+int32 LevenshteinEditDistance(const std::vector<T> &a,
+ const std::vector<T> &b) {
+ // Algorithm:
+ // write A and B for the sequences, with elements a_0 ..
+ // let |A| = M and |B| = N be the lengths, and have
+ // elements a_0 ... a_{M-1} and b_0 ... b_{N-1}.
+ // We are computing the recursion
+ // E(m, n) = min( E(m-1, n-1) + (1-delta(a_{m-1}, b_{n-1})),
+ // E(m-1, n),
+ // E(m, n-1) ).
+ // where E(m, n) is defined for m = 0..M and n = 0..N and out-of-
+ // bounds quantities are considered to be infinity (i.e. the
+ // recursion does not visit them).
+
+ // We do this computation using a vector e of size N+1.
+ // The outer iterations range over m = 0..M.
+
+ int M = a.size(), N = b.size();
+ std::vector<int32> e(N+1);
+ std::vector<int32> e_tmp(N+1);
+ // initialize e.
+ for (size_t i = 0; i < e.size(); i++)
+ e[i] = i;
+ for (int32 m = 1; m <= M; m++) {
+ // computing E(m, .) from E(m-1, .)
+ // handle special case n = 0:
+ e_tmp[0] = e[0] + 1;
+
+ for (int32 n = 1; n <= N; n++) {
+ int32 term1 = e[n-1] + (a[m-1] == b[n-1] ? 0 : 1);
+ int32 term2 = e[n] + 1;
+ int32 term3 = e_tmp[n-1] + 1;
+ e_tmp[n] = std::min(term1, std::min(term2, term3));
+ }
+ e = e_tmp;
+ }
+ return e.back();
+}
+//
+struct error_stats{
+ int32 ins_num;
+ int32 del_num;
+ int32 sub_num;
+ int32 total_cost; // minimum total cost to the current alignment.
+};
+// Note that both hyp and ref should not contain noise word in
+// the following implementation.
+
+template<class T>
+int32 LevenshteinEditDistance(const std::vector<T> &ref,
+ const std::vector<T> &hyp,
+ int32 *ins, int32 *del, int32 *sub) {
+ // temp sequence to remember error type and stats.
+ std::vector<error_stats> e(ref.size()+1);
+ std::vector<error_stats> cur_e(ref.size()+1);
+ // initialize the first hypothesis aligned to the reference at each
+ // position:[hyp_index =0][ref_index]
+ for (size_t i =0; i < e.size(); i ++) {
+ e[i].ins_num = 0;
+ e[i].sub_num = 0;
+ e[i].del_num = i;
+ e[i].total_cost = i;
+ }
+
+ // for other alignments
+ for (size_t hyp_index = 1; hyp_index <= hyp.size(); hyp_index ++) {
+ cur_e[0] = e[0];
+ cur_e[0].ins_num ++;
+ cur_e[0].total_cost ++;
+ for (size_t ref_index = 1; ref_index <= ref.size(); ref_index ++) {
+
+ int32 ins_err = e[ref_index].total_cost + 1;
+ int32 del_err = cur_e[ref_index-1].total_cost + 1;
+ int32 sub_err = e[ref_index-1].total_cost;
+ if (hyp[hyp_index-1] != ref[ref_index-1])
+ sub_err ++;
+
+ if (sub_err < ins_err && sub_err < del_err) {
+ cur_e[ref_index] =e[ref_index-1];
+ if (hyp[hyp_index-1] != ref[ref_index-1])
+ cur_e[ref_index].sub_num ++; // substitution error should be increased
+ cur_e[ref_index].total_cost = sub_err;
+ }else if (del_err < ins_err ) {
+ cur_e[ref_index] = cur_e[ref_index-1];
+ cur_e[ref_index].total_cost = del_err;
+ cur_e[ref_index].del_num ++; // deletion number is increased.
+ }else{
+ cur_e[ref_index] = e[ref_index];
+ cur_e[ref_index].total_cost = ins_err;
+ cur_e[ref_index].ins_num ++; // insertion number is increased.
+ }
+ }
+ e = cur_e; // alternate for the next recursion.
+ }
+ size_t ref_index = e.size()-1;
+ *ins = e[ref_index].ins_num, *del = e[ref_index].del_num, *sub = e[ref_index].sub_num;
+ return e[ref_index].total_cost;
+}
+
+template<class T>
+int32 LevenshteinAlignment(const std::vector<T> &a,
+ const std::vector<T> &b,
+ T eps_symbol,
+ std::vector<std::pair<T, T> > *output) {
+ // Check inputs:
+ {
+ KALDI_ASSERT(output != NULL);
+ for (size_t i = 0; i < a.size(); i++) KALDI_ASSERT(a[i] != eps_symbol);
+ for (size_t i = 0; i < b.size(); i++) KALDI_ASSERT(b[i] != eps_symbol);
+ }
+ output->clear();
+ // This is very memory-inefficiently implemented using a vector of vectors.
+ size_t M = a.size(), N = b.size();
+ size_t m, n;
+ std::vector<std::vector<int32> > e(M+1);
+ for (m = 0; m <=M; m++) e[m].resize(N+1);
+ for (n = 0; n <= N; n++)
+ e[0][n] = n;
+ for (m = 1; m <= M; m++) {
+ e[m][0] = e[m-1][0] + 1;
+ for (n = 1; n <= N; n++) {
+ int32 sub_or_ok = e[m-1][n-1] + (a[m-1] == b[n-1] ? 0 : 1);
+ int32 del = e[m-1][n] + 1; // assumes a == ref, b == hyp.
+ int32 ins = e[m][n-1] + 1;
+ e[m][n] = std::min(sub_or_ok, std::min(del, ins));
+ }
+ }
+ // get time-reversed output first: trace back.
+ m = M; n = N;
+ while (m != 0 || n != 0) {
+ size_t last_m, last_n;
+ if (m == 0) { last_m = m; last_n = n-1; }
+ else if (n == 0) { last_m = m-1; last_n = n; }
+ else {
+ int32 sub_or_ok = e[m-1][n-1] + (a[m-1] == b[n-1] ? 0 : 1);
+ int32 del = e[m-1][n] + 1; // assumes a == ref, b == hyp.
+ int32 ins = e[m][n-1] + 1;
+ if (sub_or_ok <= std::min(del, ins)) { // choose sub_or_ok if all else equal.
+ last_m = m-1; last_n = n-1;
+ } else {
+ if (del <= ins) { // choose del over ins if equal.
+ last_m = m-1; last_n = n;
+ } else {
+ last_m = m; last_n = n-1;
+ }
+ }
+ }
+ T a_sym, b_sym;
+ a_sym = (last_m == m ? eps_symbol : a[last_m]);
+ b_sym = (last_n == n ? eps_symbol : b[last_n]);
+ output->push_back(std::make_pair(a_sym, b_sym));
+ m = last_m;
+ n = last_n;
+ }
+ ReverseVector(output);
+ return e[M][N];
+}
+
+
+} // end namespace kaldi
+
+#endif // KALDI_UTIL_EDIT_DISTANCE_INL_H_