// util/text-utils.h
// Copyright 2009-2011 Saarland University; Microsoft Corporation
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#ifndef KALDI_UTIL_TEXT_UTILS_H_
#define KALDI_UTIL_TEXT_UTILS_H_
#include <algorithm>
#include <map>
#include <set>
#include <string>
#include <vector>
#include <errno.h>
#include "base/kaldi-common.h"
namespace kaldi {
/// Split a string using any of the single character delimiters.
/// If omit_empty_strings == true, the output will contain any
/// nonempty strings after splitting on any of the
/// characters in the delimiter. If omit_empty_strings == false,
/// the output will contain n+1 strings if there are n characters
/// in the set "delim" within the input string. In this case
/// the empty string is split to a single empty string.
void SplitStringToVector(const std::string &full, const char *delim,
bool omit_empty_strings,
std::vector<std::string> *out);
/// Joins the elements of a vector of strings into a single string using
/// "delim" as the delimiter. If omit_empty_strings == true, any empty strings
/// in the vector are skipped. A vector of empty strings results in an empty
/// string on the output.
void JoinVectorToString(const std::vector<std::string> &vec_in,
const char *delim, bool omit_empty_strings,
std::string *str_out);
/// Split a string (e.g. 1:2:3) into a vector of integers.
/// The delimiting char may be any character in "delim".
/// returns true on success, false on failure.
/// If omit_empty_strings == true, 1::2:3: will become
/// { 1, 2, 3 }. Otherwise it would be rejected.
/// Regardless of the value of omit_empty_strings,
/// the empty string is successfully parsed as an empty
/// vector of integers
template<class I>
bool SplitStringToIntegers(const std::string &full,
const char *delim,
bool omit_empty_strings, // typically false [but
// should probably be true
// if "delim" is spaces].
std::vector<I> *out) {
KALDI_ASSERT(out != NULL);
KALDI_ASSERT_IS_INTEGER_TYPE(I);
if ( *(full.c_str()) == '\0') {
out->clear();
return true;
}
std::vector<std::string> split;
SplitStringToVector(full, delim, omit_empty_strings, &split);
out->resize(split.size());
for (size_t i = 0; i < split.size(); i++) {
const char *this_str = split[i].c_str();
char *end = NULL;
long long int j = 0;
j = KALDI_STRTOLL(this_str, &end);
if (end == this_str || *end != '\0') {
out->clear();
return false;
} else {
I jI = static_cast<I>(j);
if (static_cast<long long int>(jI) != j) {
// output type cannot fit this integer.
out->clear();
return false;
}
(*out)[i] = jI;
}
}
return true;
}
// This is defined for F = float and double.
template<class F>
bool SplitStringToFloats(const std::string &full,
const char *delim,
bool omit_empty_strings, // typically false
std::vector<F> *out);
/// Converts a string into an integer via strtoll and returns false if there was
/// any kind of problem (i.e. the string was not an integer or contained extra
/// non-whitespace junk, or the integer was too large to fit into the type it is
/// being converted into). Only sets *out if everything was OK and it returns
/// true.
template<class Int>
bool ConvertStringToInteger(const std::string &str,
Int *out) {
KALDI_ASSERT_IS_INTEGER_TYPE(Int);
const char *this_str = str.c_str();
char *end = NULL;
errno = 0;
long long int i = KALDI_STRTOLL(this_str, &end);
if (end != this_str)
while (isspace(*end)) end++;
if (end == this_str || *end != '\0' || errno != 0)
return false;
Int iInt = static_cast<Int>(i);
if (static_cast<long long int>(iInt) != i || (i<0 && !std::numeric_limits<Int>::is_signed)) {
return false;
}
*out = iInt;
return true;
}
/// ConvertStringToReal converts a string into either float or double via strtod,
/// and returns false if there was any kind of problem (i.e. the string was not a
/// floating point number or contained extra non-whitespace junk.
/// Be careful- this function will successfully read inf's or nan's.
bool ConvertStringToReal(const std::string &str,
double *out);
bool ConvertStringToReal(const std::string &str,
float *out);
/// Removes the beginning and trailing whitespaces from a string
void Trim(std::string *str);
/// Removes leading and trailing white space from the string, then splits on the
/// first section of whitespace found (if present), putting the part before the
/// whitespace in "first" and the rest in "rest". If there is no such space,
/// everything that remains after removing leading and trailing whitespace goes
/// in "first".
void SplitStringOnFirstSpace(const std::string &line,
std::string *first,
std::string *rest);
/// Returns true if "token" is nonempty, and all characters are
/// printable and whitespace-free.
bool IsToken(const std::string &token);
/// Returns true if "line" is free of \n characters and unprintable
/// characters, and does not contain leading or trailing whitespace.
bool IsLine(const std::string &line);
} // namespace kaldi
#endif // KALDI_UTIL_TEXT_UTILS_H_