diff options
Diffstat (limited to 'kaldi_io/src/tools/openfst/include/fst/icu.h')
-rw-r--r-- | kaldi_io/src/tools/openfst/include/fst/icu.h | 116 |
1 files changed, 116 insertions, 0 deletions
diff --git a/kaldi_io/src/tools/openfst/include/fst/icu.h b/kaldi_io/src/tools/openfst/include/fst/icu.h new file mode 100644 index 0000000..3947716 --- /dev/null +++ b/kaldi_io/src/tools/openfst/include/fst/icu.h @@ -0,0 +1,116 @@ +// icu.h + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Copyright 2005-2010 Google, Inc. +// Author: [email protected] (Jeffrey Sorensen) +// [email protected] (Fredrik Roubert) +// +// This library implements an unrestricted Thompson/Pike UTF-8 parser and +// serializer. UTF-8 is a restricted subset of this byte stream encoding. See +// http://en.wikipedia.org/wiki/UTF-8 for a good description of the encoding +// details. 
#ifndef FST_LIB_ICU_H_
#define FST_LIB_ICU_H_

#include <stddef.h>
#include <stdint.h>

#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

// NOTE(review): LOG(ERROR) is used below but is not defined by any header
// included here; it is expected to already be in scope wherever this header
// is included -- TODO confirm which fst header supplies it.

namespace fst {

// Decodes the byte string `str` into a sequence of integer labels, one label
// per encoded character, and appends them to `*labels`.
//
// The accepted encoding is the extended 1- to 6-byte form: the lead byte
// selects how many continuation bytes (10xxxxxx) follow, and the payload bits
// are concatenated most-significant first.  Decoding is permissive: overlong
// encodings are not rejected, and the lead bytes 0xFE/0xFF are treated as
// 6-byte leads.
//
// Args:
//   str:    bytes to decode; may contain embedded NUL bytes.
//   labels: output vector; existing contents are kept and decoded labels are
//           appended.  Must not be null.
//
// Returns:
//   true on success.  false (after logging an error) if a continuation byte
//   appears where a lead byte is expected, if the input ends in the middle of
//   a sequence, or if an expected continuation byte is malformed.  On failure
//   `*labels` still holds any labels decoded before the error.
template <class Label>
bool UTF8StringToLabels(const std::string &str, std::vector<Label> *labels) {
  const char *data = str.data();
  const size_t length = str.size();
  // The index is a size_t so that it matches `length`; a signed int index
  // would overflow on very large inputs.
  size_t pos = 0;
  while (pos < length) {
    const int lead = data[pos++] & 0xff;
    if ((lead & 0x80) == 0) {
      // Plain 7-bit byte: the label is the byte itself.
      labels->push_back(lead);
      continue;
    }
    if ((lead & 0xc0) == 0x80) {
      LOG(ERROR) << "UTF8StringToLabels: continuation byte as lead byte";
      return false;
    }
    // Each threshold crossed by the lead byte adds one continuation byte, so
    // `remaining` ends up in the range 1..5 here.
    int remaining = (lead >= 0xc0) + (lead >= 0xe0) + (lead >= 0xf0) +
                    (lead >= 0xf8) + (lead >= 0xfc);
    // The lead byte carries (6 - remaining) payload bits in its low end.
    int code = lead & ((1 << (6 - remaining)) - 1);
    for (; remaining != 0; --remaining) {
      if (pos == length) {
        LOG(ERROR) << "UTF8StringToLabels: truncated utf-8 byte sequence";
        return false;
      }
      const char next = data[pos++];
      if ((next & 0xc0) != 0x80) {
        LOG(ERROR) << "UTF8StringToLabels: missing/invalid continuation byte";
        return false;
      }
      code = (code << 6) | (next & 0x3f);
    }
    if (code < 0) {
      // Defensive only: at most 31 payload bits are accumulated, so the sign
      // bit of a 32-bit int is never set.
      LOG(ERROR) << "UTF8StringToLabels: Invalid character found: " << lead;
      return false;
    }
    labels->push_back(code);
  }
  return true;
}

// Encodes the sequence `labels` as a byte string using the extended 1- to
// 6-byte form that UTF8StringToLabels decodes, and stores the result in
// `*str`.
//
// Args:
//   labels: labels to encode; each is converted to int32_t and must be
//           non-negative.
//   str:    output string; replaced with the encoded bytes on success and
//           left unmodified on failure.  Must not be null.
//
// Returns:
//   true on success; false (after logging an error) if any label is negative.
template <class Label>
bool LabelsToUTF8String(const std::vector<Label> &labels, std::string *str) {
  // Built in a local buffer so that `*str` is only touched on success.
  std::string bytes;
  bytes.reserve(labels.size());  // Lower bound: at least one byte per label.
  for (size_t i = 0; i < labels.size(); ++i) {
    const int32_t code = labels[i];
    if (code < 0) {
      LOG(ERROR) << "LabelsToUTF8String: Invalid character found: " << code;
      return false;
    }
    if (code < 0x80) {
      bytes.push_back(static_cast<char>(code));
      continue;
    }
    // Choose the shortest form whose payload can hold `code`: `trailing` is
    // the number of continuation bytes, `lead_mark` the lead byte's tag bits.
    int trailing = 5;
    int lead_mark = 0xfc;
    if (code < 0x800) {
      trailing = 1;
      lead_mark = 0xc0;
    } else if (code < 0x10000) {
      trailing = 2;
      lead_mark = 0xe0;
    } else if (code < 0x200000) {
      trailing = 3;
      lead_mark = 0xf0;
    } else if (code < 0x4000000) {
      trailing = 4;
      lead_mark = 0xf8;
    }
    bytes.push_back(static_cast<char>((code >> (6 * trailing)) | lead_mark));
    // Emit the remaining payload six bits at a time, most significant first.
    for (int shift = 6 * (trailing - 1); shift >= 0; shift -= 6) {
      bytes.push_back(static_cast<char>(((code >> shift) & 0x3f) | 0x80));
    }
  }
  str->swap(bytes);
  return true;
}

}  // namespace fst

#endif  // FST_LIB_ICU_H_