summary | refs | log | tree | commit | diff
path: root/kaldi_io/src/tools/openfst/include/fst/icu.h
diff options
context:
space:
mode:
Diffstat (limited to 'kaldi_io/src/tools/openfst/include/fst/icu.h')
-rw-r--r-- kaldi_io/src/tools/openfst/include/fst/icu.h | 116
1 files changed, 116 insertions, 0 deletions
diff --git a/kaldi_io/src/tools/openfst/include/fst/icu.h b/kaldi_io/src/tools/openfst/include/fst/icu.h
new file mode 100644
index 0000000..3947716
--- /dev/null
+++ b/kaldi_io/src/tools/openfst/include/fst/icu.h
@@ -0,0 +1,116 @@
+// icu.h
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Copyright 2005-2010 Google, Inc.
+// Author: sorenj@google.com (Jeffrey Sorensen)
+// roubert@google.com (Fredrik Roubert)
+//
+// This library implements an unrestricted Thompson/Pike UTF-8 parser and
+// serializer. UTF-8 is a restricted subset of this byte stream encoding. See
+// http://en.wikipedia.org/wiki/UTF-8 for a good description of the encoding
+// details.
+
+#ifndef FST_LIB_ICU_H_
+#define FST_LIB_ICU_H_
+
+#include <iostream>
+#include <fstream>
+#include <sstream>
+
+namespace fst {
+
+// Decodes the bytes of `str` as a Thompson/Pike UTF-8 byte stream (sequences
+// of one to six bytes, code values of up to 31 bits) and appends one label per
+// decoded code value to `*labels`.
+//
+// Returns true on success. Returns false, after logging an error, when the
+// input contains a continuation byte where a lead byte is expected, a
+// 0xfe/0xff lead byte, a sequence cut short by the end of the string, or a
+// byte that is not a continuation byte inside a multi-byte sequence.
+//
+// `*labels` is neither cleared on entry nor rolled back on failure: labels
+// decoded before the offending byte stay appended.
+//
+// Overlong encodings, surrogates and values above U+10FFFF are not rejected,
+// so the accepted language is a superset of strict UTF-8.
+template <class Label>
+bool UTF8StringToLabels(const string &str, vector<Label> *labels) {
+  const char *data = str.data();
+  const size_t length = str.size();
+  // The index is a size_t so that it matches the type of `length`; an int
+  // index mixes signedness in the comparison and overflows on huge inputs.
+  for (size_t i = 0; i < length; /* no update */) {
+    const int c = data[i++] & 0xff;
+    if ((c & 0x80) == 0) {
+      // Single-byte (ASCII) character.
+      labels->push_back(c);
+      continue;
+    }
+    if ((c & 0xc0) == 0x80) {
+      LOG(ERROR) << "UTF8StringToLabels: continuation byte as lead byte";
+      return false;
+    }
+    if (c >= 0xfe) {
+      // 0xfe and 0xff are not lead bytes in any UTF-8 variant. Without this
+      // check they would be treated as 6-byte leads and alias 0xfc/0xfd.
+      LOG(ERROR) << "UTF8StringToLabels: invalid lead byte: " << c;
+      return false;
+    }
+    // Number of continuation bytes announced by the lead byte (1 to 5).
+    int count = (c >= 0xc0) + (c >= 0xe0) + (c >= 0xf0) + (c >= 0xf8) +
+                (c >= 0xfc);
+    // Keep only the payload bits of the lead byte: 6 - count of them.
+    int code = c & ((1 << (6 - count)) - 1);
+    while (count != 0) {
+      if (i == length) {
+        LOG(ERROR) << "UTF8StringToLabels: truncated utf-8 byte sequence";
+        return false;
+      }
+      const char cb = data[i++];
+      if ((cb & 0xc0) != 0x80) {
+        LOG(ERROR) << "UTF8StringToLabels: missing/invalid continuation byte";
+        return false;
+      }
+      // Each continuation byte contributes its low six bits.
+      code = (code << 6) | (cb & 0x3f);
+      count--;
+    }
+    if (code < 0) {
+      // Defensive only: at most 31 payload bits are accumulated above, so the
+      // value cannot become negative.
+      LOG(ERROR) << "UTF8StringToLabels: Invalid character found: " << c;
+      return false;
+    }
+    labels->push_back(code);
+  }
+  return true;
+}
+
+// Serializes `labels` into `*str` using the Thompson/Pike UTF-8 byte encoding:
+// each label is written as one to six bytes depending on its magnitude
+// (below 0x80, 0x800, 0x10000, 0x200000, 0x4000000, or anything larger up to
+// 0x7fffffff).
+//
+// Returns true on success, in which case `*str` is overwritten with the
+// encoded bytes. Returns false, after logging an error, if any label is
+// negative or does not fit in 31 bits; `*str` is left untouched in that case.
+//
+// No check is made for surrogates or values above U+10FFFF, so the output is
+// not guaranteed to be strict UTF-8.
+template <class Label>
+bool LabelsToUTF8String(const vector<Label> &labels, string *str) {
+  // Encode into a local buffer so that `*str` is only modified on success.
+  string out;
+  out.reserve(labels.size());  // Lower bound: at least one byte per label.
+  for (size_t i = 0; i < labels.size(); ++i) {
+    // Range-check at full width before narrowing; assigning a wider label
+    // straight to a 32-bit int would silently truncate it to another value.
+    const long long wide = static_cast<long long>(labels[i]);
+    if (wide < 0 || wide > 0x7fffffffLL) {
+      LOG(ERROR) << "LabelsToUTF8String: Invalid character found: " << wide;
+      return false;
+    }
+    const int32_t code = static_cast<int32_t>(wide);
+    if (code < 0x80) {
+      // 7 bits: single byte.
+      out += static_cast<char>(code);
+    } else if (code < 0x800) {
+      // 11 bits: 110xxxxx 10xxxxxx.
+      out += static_cast<char>((code >> 6) | 0xc0);
+      out += static_cast<char>((code & 0x3f) | 0x80);
+    } else if (code < 0x10000) {
+      // 16 bits: 1110xxxx followed by two continuation bytes.
+      out += static_cast<char>((code >> 12) | 0xe0);
+      out += static_cast<char>(((code >> 6) & 0x3f) | 0x80);
+      out += static_cast<char>((code & 0x3f) | 0x80);
+    } else if (code < 0x200000) {
+      // 21 bits: 11110xxx followed by three continuation bytes.
+      out += static_cast<char>((code >> 18) | 0xf0);
+      out += static_cast<char>(((code >> 12) & 0x3f) | 0x80);
+      out += static_cast<char>(((code >> 6) & 0x3f) | 0x80);
+      out += static_cast<char>((code & 0x3f) | 0x80);
+    } else if (code < 0x4000000) {
+      // 26 bits: 111110xx followed by four continuation bytes.
+      out += static_cast<char>((code >> 24) | 0xf8);
+      out += static_cast<char>(((code >> 18) & 0x3f) | 0x80);
+      out += static_cast<char>(((code >> 12) & 0x3f) | 0x80);
+      out += static_cast<char>(((code >> 6) & 0x3f) | 0x80);
+      out += static_cast<char>((code & 0x3f) | 0x80);
+    } else {
+      // 31 bits: 1111110x followed by five continuation bytes.
+      out += static_cast<char>((code >> 30) | 0xfc);
+      out += static_cast<char>(((code >> 24) & 0x3f) | 0x80);
+      out += static_cast<char>(((code >> 18) & 0x3f) | 0x80);
+      out += static_cast<char>(((code >> 12) & 0x3f) | 0x80);
+      out += static_cast<char>(((code >> 6) & 0x3f) | 0x80);
+      out += static_cast<char>((code & 0x3f) | 0x80);
+    }
+  }
+  *str = out;
+  return true;
+}
+
+} // namespace fst
+
+#endif // FST_LIB_ICU_H_