diff options
author | Determinant <[email protected]> | 2015-06-25 12:56:45 +0800 |
---|---|---|
committer | Determinant <[email protected]> | 2015-06-25 12:56:45 +0800 |
commit | a74183ddb4ab8383bfe214b3745eb8a0a99ee47a (patch) | |
tree | d5e69cf8c4c2db2e3a4722778352fc3c95953bb2 /htk_io/src/KaldiLib/StkMatch.cc | |
parent | b6301089cde20f4c825c7f5deaf179082aad63da (diff) |
let HTK I/O implementation be a single package
Diffstat (limited to 'htk_io/src/KaldiLib/StkMatch.cc')
-rw-r--r-- | htk_io/src/KaldiLib/StkMatch.cc | 582 |
1 files changed, 582 insertions, 0 deletions
diff --git a/htk_io/src/KaldiLib/StkMatch.cc b/htk_io/src/KaldiLib/StkMatch.cc new file mode 100644 index 0000000..4ff4b18 --- /dev/null +++ b/htk_io/src/KaldiLib/StkMatch.cc @@ -0,0 +1,582 @@ +/* + EPSHeader + + File: filmatch.c + Author: J. Kercheval + Created: Thu, 03/14/1991 22:22:01 +*/ + +/* + EPSRevision History + O. Glembek Thu, 03/11/2005 01:58:00 Added Mask extraction support (char % does this) + J. Kercheval Wed, 02/20/1991 22:29:01 Released to Public Domain + J. Kercheval Fri, 02/22/1991 15:29:01 fix '\' bugs (two :( of them) + J. Kercheval Sun, 03/10/1991 19:31:29 add error return to matche() + J. Kercheval Sun, 03/10/1991 20:11:11 add is_valid_pattern code + J. Kercheval Sun, 03/10/1991 20:37:11 beef up main() + J. Kercheval Tue, 03/12/1991 22:25:10 Released as V1.1 to Public Domain + J. Kercheval Thu, 03/14/1991 22:22:25 remove '\' for DOS file parsing + J. Kercheval Thu, 03/28/1991 20:58:27 include filmatch.h +*/ + +/* + Wildcard Pattern Matching +*/ + + +#include "StkMatch.h" +#include "Common.h" + +namespace TNet +{ + //#define TEST + static int matche_after_star (register const char *pattern, register const char *text, register char *s); + // following function is not defined or used. + // static int fast_match_after_star (register const char *pattern, register const char *text); + + /*---------------------------------------------------------------------------- + * + * Return true if PATTERN has any special wildcard characters + * + ----------------------------------------------------------------------------*/ + + bool is_pattern (const char *p) + { + while ( *p ) { + switch ( *p++ ) { + case '?': + case '*': + case '%': + case '[': + return true; + } + } + return false; + } + + + /*---------------------------------------------------------------------------- + * + * Return true if PATTERN has is a well formed regular expression according + * to the above syntax + * + * error_type is a return code based on the type of pattern error. Zero is + * returned in error_type if the pattern is a valid one. error_type return + * values are as follows: + * + * PATTERN_VALID - pattern is well formed + * PATTERN_RANGE - [..] construct has a no end range in a '-' pair (ie [a-]) + * PATTERN_CLOSE - [..] construct has no end bracket (ie [abc-g ) + * PATTERN_EMPTY - [..] construct is empty (ie []) + * + ----------------------------------------------------------------------------*/ + + bool is_valid_pattern (const char *p, int *error_type) + { + + /* init error_type */ + *error_type = PATTERN_VALID; + + /* loop through pattern to EOS */ + while ( *p ) + { + /* determine pattern type */ + switch ( *p ) + { + /* the [..] construct must be well formed */ + case '[': + { + p++; + + /* if the next character is ']' then bad pattern */ + if ( *p == ']' ) { + *error_type = PATTERN_EMPTY; + return false; + } + + /* if end of pattern here then bad pattern */ + if ( !*p ) + { + *error_type = PATTERN_CLOSE; + return false; + } + + /* loop to end of [..] construct */ + while ( *p != ']' ) + { + /* check for literal escape */ + if ( *p == '\\' ) + { + p++; + + /* if end of pattern here then bad pattern */ + if ( !*p++ ) { + *error_type = PATTERN_ESC; + return false; + } + } + else + p++; + + /* if end of pattern here then bad pattern */ + if ( !*p ) + { + *error_type = PATTERN_CLOSE; + return false; + } + + /* if this a range */ + if ( *p == '-' ) + { + /* we must have an end of range */ + if ( !*++p || *p == ']' ) + { + *error_type = PATTERN_RANGE; + return false; + } + else + { + + /* check for literal escape */ + if ( *p == '\\' ) + p++; + + /* if end of pattern here then bad pattern */ + if ( !*p++ ) + { + *error_type = PATTERN_ESC; + return false; + } + } + } + } + break; + } //case '[': + + + /* all other characters are valid pattern elements */ + case '*': + case '?': + case '%': + default: + p++; /* "normal" character */ + break; + } // switch ( *p ) + } // while ( *p ) + + return true; + } //bool is_valid_pattern (const char *p, int *error_type) + + + /*---------------------------------------------------------------------------- + * + * Match the pattern PATTERN against the string TEXT; + * + * returns MATCH_VALID if pattern matches, or an errorcode as follows + * otherwise: + * + * MATCH_PATTERN - bad pattern + * MATCH_RANGE - match failure on [..] construct + * MATCH_ABORT - premature end of text string + * MATCH_END - premature end of pattern string + * MATCH_VALID - valid match + * + * + * A match means the entire string TEXT is used up in matching. + * + * In the pattern string: + * `*' matches any sequence of characters (zero or more) + * `?' matches any character + * `%' matches any character and stores it in the s string + * [SET] matches any character in the specified set, + * [!SET] or [^SET] matches any character not in the specified set. + * \ is allowed within a set to escape a character like ']' or '-' + * + * A set is composed of characters or ranges; a range looks like + * character hyphen character (as in 0-9 or A-Z). [0-9a-zA-Z_] is the + * minimal set of characters allowed in the [..] pattern construct. + * Other characters are allowed (ie. 8 bit characters) if your system + * will support them. + * + * To suppress the special syntactic significance of any of `[]*?%!^-\', + * within a [..] construct and match the character exactly, precede it + * with a `\'. + * + ----------------------------------------------------------------------------*/ + + int matche ( register const char *p, register const char *t, register char *s ) + { + register char range_start, range_end; /* start and end in range */ + + bool invert; /* is this [..] or [!..] */ + bool member_match; /* have I matched the [..] construct? */ + bool loop; /* should I terminate? */ + + for ( ; *p; p++, t++ ) { + + /* if this is the end of the text then this is the end of the match */ + if (!*t) { + return ( *p == '*' && *++p == '\0' ) ? MATCH_VALID : MATCH_ABORT; + } + + /* determine and react to pattern type */ + switch ( *p ) { + + /* single any character match */ + case '?': + break; + + /* single any character match, with extraction*/ + case '%': { + *s++ = *t; + *s = '\0'; + break; + } + + /* multiple any character match */ + case '*': + return matche_after_star (p, t, s); + + /* [..] construct, single member/exclusion character match */ + case '[': { + /* move to beginning of range */ + p++; + + /* check if this is a member match or exclusion match */ + invert = false; + if ( *p == '!' || *p == '^') { + invert = true; + p++; + } + + /* if closing bracket here or at range start then we have a + malformed pattern */ + if ( *p == ']' ) { + return MATCH_PATTERN; + } + + member_match = false; + loop = true; + + while ( loop ) { + + /* if end of construct then loop is done */ + if (*p == ']') { + loop = false; + continue; + } + + /* matching a '!', '^', '-', '\' or a ']' */ + if ( *p == '\\' ) { + range_start = range_end = *++p; + } + else { + range_start = range_end = *p; + } + + /* if end of pattern then bad pattern (Missing ']') */ + if (!*p) + return MATCH_PATTERN; + + /* check for range bar */ + if (*++p == '-') { + + /* get the range end */ + range_end = *++p; + + /* if end of pattern or construct then bad pattern */ + if (range_end == '\0' || range_end == ']') + return MATCH_PATTERN; + + /* special character range end */ + if (range_end == '\\') { + range_end = *++p; + + /* if end of text then we have a bad pattern */ + if (!range_end) + return MATCH_PATTERN; + } + + /* move just beyond this range */ + p++; + } + + /* if the text character is in range then match found. + make sure the range letters have the proper + relationship to one another before comparison */ + if ( range_start < range_end ) { + if (*t >= range_start && *t <= range_end) { + member_match = true; + loop = false; + } + } + else { + if (*t >= range_end && *t <= range_start) { + member_match = true; + loop = false; + } + } + } + + /* if there was a match in an exclusion set then no match */ + /* if there was no match in a member set then no match */ + if ((invert && member_match) || + !(invert || member_match)) + return MATCH_RANGE; + + /* if this is not an exclusion then skip the rest of the [...] + construct that already matched. */ + if (member_match) { + while (*p != ']') { + + /* bad pattern (Missing ']') */ + if (!*p) + return MATCH_PATTERN; + + /* skip exact match */ + if (*p == '\\') { + p++; + + /* if end of text then we have a bad pattern */ + if (!*p) + return MATCH_PATTERN; + } + + /* move to next pattern char */ + p++; + } + } + + break; + } // case ']' + + /* must match this character exactly */ + default: + if (*p != *t) + return MATCH_LITERAL; + } + } + + //*s = '\0'; + /* if end of text not reached then the pattern fails */ + if ( *t ) + return MATCH_END; + else + return MATCH_VALID; + } + + + /*---------------------------------------------------------------------------- + * + * recursively call matche() with final segment of PATTERN and of TEXT. + * + ----------------------------------------------------------------------------*/ + + static int matche_after_star (register const char *p, register const char *t, register char *s) + { + register int match = 0; + register char nextp; + + /* pass over existing ? and * in pattern */ + while ( *p == '?' || *p == '%' || *p == '*' ) { + + /* take one char for each ? and + */ + if ( *p == '?') { + + /* if end of text then no match */ + if ( !*t++ ) { + return MATCH_ABORT; + } + } + + if ( *p == '%') { + *s++ = *t; + *s = '\0'; + /* if end of text then no match */ + if ( !*t++ ) { + return MATCH_ABORT; + } + } + + /* move to next char in pattern */ + p++; + } + + /* if end of pattern we have matched regardless of text left */ + if ( !*p ) { + return MATCH_VALID; + } + + /* get the next character to match which must be a literal or '[' */ + nextp = *p; + + /* Continue until we run out of text or definite result seen */ + do { + + /* a precondition for matching is that the next character + in the pattern match the next character in the text or that + the next pattern char is the beginning of a range. Increment + text pointer as we go here */ + if ( nextp == *t || nextp == '[' ) { + match = matche(p, t, s); + } + + /* if the end of text is reached then no match */ + if ( !*t++ ) match = MATCH_ABORT; + + } while ( match != MATCH_VALID && + match != MATCH_ABORT && + match != MATCH_PATTERN); + + /* return result */ + return match; + } + + + /*---------------------------------------------------------------------------- + * + * match() is a shell to matche() to return only bool values. + * + ----------------------------------------------------------------------------*/ + + bool match(const char *p, const char *t, char *s) + { + int error_type; + error_type = matche(p,t,s); + return (error_type != MATCH_VALID ) ? false : true; + } + + + //*************************************************************************** + //*************************************************************************** + bool + ProcessMask(const std::string & rString, + const std::string & rWildcard, + std::string & rSubstr) + { + char * substr; + int percent_count = 0; + int ret ; + size_t pos = 0; + + // let's find how many % to allocate enough space for the return substring + while ((pos = rWildcard.find('%', pos)) != rWildcard.npos) + { + percent_count++; + pos++; + } + + // allocate space for the substring + substr = new char[percent_count + 1]; + substr[percent_count] = 0; + substr[0] = '\0'; + + // optionally prepend '*/' to wildcard + std::string wildcard(rWildcard); + if(wildcard[0] != '*') { + wildcard = "*/" + wildcard; + } + + //optionally prepend '/' to string + std::string string1(rString); + if(string1[0] != '/') { + string1 = "/" + string1; + } + + // parse the string + if (0 != (ret = match(wildcard.c_str(), string1.c_str(), substr))) + { + rSubstr = substr; + } + delete[] substr; + return ret; + } // ProcessMask +} + + +#ifdef TEST + +/* +* This test main expects as first arg the pattern and as second arg +* the match string. Output is yaeh or nay on match. If nay on +* match then the error code is parsed and written. +*/ + +#include <stdio.h> + +int main(int argc, char *argv[]) +{ + int error; + int is_valid_error; + + char * tmp = argv[0]; + int i = 0; + for (; *tmp; tmp++) + if (*tmp=='%') i++; + + char s[i+1]; + + + if (argc != 3) { + printf("Usage: MATCH Pattern Text\n"); + } + else { + printf("Pattern: %s\n", argv[1]); + printf("Text : %s\n", argv[2]); + + if (!is_pattern(argv[1])) { + printf(" First Argument Is Not A Pattern\n"); + } + else { + match(argv[1],argv[2], s) ? printf("true") : printf("false"); + error = matche(argv[1],argv[2], s); + is_valid_pattern(argv[1],&is_valid_error); + + switch ( error ) { + case MATCH_VALID: + printf(" Match Successful"); + if (is_valid_error != PATTERN_VALID) + printf(" -- is_valid_pattern() is complaining\n"); + else + printf("\n"); + printf("%s\n", s); + + break; + case MATCH_RANGE: + printf(" Match Failed on [..]\n"); + break; + case MATCH_ABORT: + printf(" Match Failed on Early Text Termination\n"); + break; + case MATCH_END: + printf(" Match Failed on Early Pattern Termination\n"); + break; + case MATCH_PATTERN: + switch ( is_valid_error ) { + case PATTERN_VALID: + printf(" Internal Disagreement On Pattern\n"); + break; + case PATTERN_RANGE: + printf(" No End of Range in [..] Construct\n"); + break; + case PATTERN_CLOSE: + printf(" [..] Construct is Open\n"); + break; + case PATTERN_EMPTY: + printf(" [..] Construct is Empty\n"); + break; + default: + printf(" Internal Error in is_valid_pattern()\n"); + } + break; + default: + printf(" Internal Error in matche()\n"); + break; + } + } + + } + return(0); +} + +#endif |