summaryrefslogtreecommitdiff
path: root/htk_io/src/KaldiLib/Tokenizer.cc
diff options
context:
space:
mode:
Diffstat (limited to 'htk_io/src/KaldiLib/Tokenizer.cc')
-rw-r--r--htk_io/src/KaldiLib/Tokenizer.cc53
1 files changed, 53 insertions, 0 deletions
diff --git a/htk_io/src/KaldiLib/Tokenizer.cc b/htk_io/src/KaldiLib/Tokenizer.cc
new file mode 100644
index 0000000..0c49050
--- /dev/null
+++ b/htk_io/src/KaldiLib/Tokenizer.cc
@@ -0,0 +1,53 @@
+#include "Tokenizer.h"
+#include "string.h"
+
+namespace TNet
+{
+ //****************************************************************************
+ //****************************************************************************
+ void
+ Tokenizer::
+ AddString(const char* pString)
+ {
+ // copy into string struct, which is more convenient
+ std::string aux_string(pString);
+ std::string aux_record;
+ std::string::size_type cur_pos = 0;
+ std::string::size_type old_pos = 0;
+ std::string::size_type search_start = 0;
+
+ // make sure we have enough space
+ aux_record.reserve(aux_string.length());
+
+ // find all of separators and make a list of tokens
+ while(old_pos < std::string::npos) {
+ // find the next separator
+ cur_pos = aux_string.find_first_of(mSeparator, search_start);
+
+ // if backslash is in front of separator, ignore this separator
+ if (cur_pos != 0 && cur_pos != std::string::npos &&
+ pString[cur_pos - 1] == '\\') {
+ search_start = cur_pos + 1;
+ continue;
+ }
+
+ // we don't want to have empty records
+ if (!(cur_pos == old_pos && mSkipEmpty)) {
+ // extract token
+ aux_record.insert(0, pString+old_pos, cur_pos==std::string::npos ? strlen(pString+old_pos) : cur_pos - old_pos);
+ // insert to list
+ this->push_back(aux_record);
+
+ // we don't need the contents of the token
+ aux_record.erase();
+ }
+
+ // update old position so that it points behind the separator
+ old_pos = cur_pos < std::string::npos ? cur_pos + 1 : cur_pos;
+ search_start = old_pos;
+ }
+ }
+
+
+} // namespace TNet
+