summaryrefslogtreecommitdiff
path: root/htk_io/src/KaldiLib/StkMatch.cc
diff options
context:
space:
mode:
authorDeterminant <[email protected]>2015-06-25 12:56:45 +0800
committerDeterminant <[email protected]>2015-06-25 12:56:45 +0800
commita74183ddb4ab8383bfe214b3745eb8a0a99ee47a (patch)
treed5e69cf8c4c2db2e3a4722778352fc3c95953bb2 /htk_io/src/KaldiLib/StkMatch.cc
parentb6301089cde20f4c825c7f5deaf179082aad63da (diff)
let HTK I/O implementation be a single package
Diffstat (limited to 'htk_io/src/KaldiLib/StkMatch.cc')
-rw-r--r--htk_io/src/KaldiLib/StkMatch.cc582
1 files changed, 582 insertions, 0 deletions
diff --git a/htk_io/src/KaldiLib/StkMatch.cc b/htk_io/src/KaldiLib/StkMatch.cc
new file mode 100644
index 0000000..4ff4b18
--- /dev/null
+++ b/htk_io/src/KaldiLib/StkMatch.cc
@@ -0,0 +1,582 @@
+/*
+ EPSHeader
+
+ File: filmatch.c
+ Author: J. Kercheval
+ Created: Thu, 03/14/1991 22:22:01
+*/
+
+/*
+ EPSRevision History
+ O. Glembek Thu, 03/11/2005 01:58:00 Added Mask extraction support (char % does this)
+ J. Kercheval Wed, 02/20/1991 22:29:01 Released to Public Domain
+ J. Kercheval Fri, 02/22/1991 15:29:01 fix '\' bugs (two :( of them)
+ J. Kercheval Sun, 03/10/1991 19:31:29 add error return to matche()
+ J. Kercheval Sun, 03/10/1991 20:11:11 add is_valid_pattern code
+ J. Kercheval Sun, 03/10/1991 20:37:11 beef up main()
+ J. Kercheval Tue, 03/12/1991 22:25:10 Released as V1.1 to Public Domain
+ J. Kercheval Thu, 03/14/1991 22:22:25 remove '\' for DOS file parsing
+ J. Kercheval Thu, 03/28/1991 20:58:27 include filmatch.h
+*/
+
+/*
+ Wildcard Pattern Matching
+*/
+
+
+#include "StkMatch.h"
+#include "Common.h"
+
+namespace TNet
+{
+ //#define TEST
+ // Forward declaration: matche() hands the rest of the match to this helper when it meets a '*'.
+ static int matche_after_star (register const char *pattern, register const char *text, register char *s);
+ // following function is not defined or used.
+ // static int fast_match_after_star (register const char *pattern, register const char *text);
+
+ /*----------------------------------------------------------------------------
+ *
+ * Return true if PATTERN has any special wildcard characters
+ *
+ ----------------------------------------------------------------------------*/
+
+ bool is_pattern (const char *p)
+ {
+   /* walk the string up to its terminator and stop at the first
+      wildcard metacharacter ('?', '*', '%' or '[') */
+   for ( ; *p != '\0'; ++p ) {
+     const char current = *p;
+
+     if ( current == '?' || current == '*' ||
+          current == '%' || current == '[' ) {
+       return true;
+     }
+   }
+
+   /* reached the end without seeing any metacharacter */
+   return false;
+ }
+
+
+ /*----------------------------------------------------------------------------
+ *
+ * Return true if PATTERN is a well formed regular expression according
+ * to the above syntax
+ *
+ * error_type is a return code based on the type of pattern error. Zero is
+ * returned in error_type if the pattern is a valid one. error_type return
+ * values are as follows:
+ *
+ * PATTERN_VALID - pattern is well formed
+ * PATTERN_RANGE - [..] construct has no end of range in a '-' pair (ie [a-])
+ * PATTERN_CLOSE - [..] construct has no end bracket (ie [abc-g )
+ * PATTERN_EMPTY - [..] construct is empty (ie [])
+ *
+ ----------------------------------------------------------------------------*/
+
+ bool is_valid_pattern (const char *p, int *error_type)
+ {
+   /* Validates every [..] construct in the NUL-terminated pattern p.
+      Returns true and stores PATTERN_VALID in *error_type when the pattern
+      is well formed; otherwise returns false and stores one of
+      PATTERN_EMPTY, PATTERN_CLOSE, PATTERN_RANGE or PATTERN_ESC. */
+
+   /* init error_type */
+   *error_type = PATTERN_VALID;
+
+   /* loop through pattern to EOS */
+   while ( *p )
+   {
+     /* only the [..] construct can be malformed; every other character,
+        including '*', '?' and '%', is a valid pattern element */
+     if ( *p != '[' )
+     {
+       p++;
+       continue;
+     }
+
+     /* step over the opening bracket */
+     p++;
+
+     /* if the next character is ']' then bad pattern */
+     if ( *p == ']' )
+     {
+       *error_type = PATTERN_EMPTY;
+       return false;
+     }
+
+     /* if end of pattern here then bad pattern */
+     if ( !*p )
+     {
+       *error_type = PATTERN_CLOSE;
+       return false;
+     }
+
+     /* loop to end of [..] construct; the closing ']' itself is consumed
+        by the outer loop as an ordinary character */
+     while ( *p != ']' )
+     {
+       /* a preceding range may have ended exactly on the terminator
+          (e.g. "[a-b"); stop here instead of stepping past the end of
+          the string */
+       if ( !*p )
+       {
+         *error_type = PATTERN_CLOSE;
+         return false;
+       }
+
+       /* check for literal escape */
+       if ( *p == '\\' )
+       {
+         p++;
+
+         /* if end of pattern here then bad pattern */
+         if ( !*p++ )
+         {
+           *error_type = PATTERN_ESC;
+           return false;
+         }
+       }
+       else
+       {
+         p++;
+       }
+
+       /* if end of pattern here then bad pattern */
+       if ( !*p )
+       {
+         *error_type = PATTERN_CLOSE;
+         return false;
+       }
+
+       /* if this a range */
+       if ( *p == '-' )
+       {
+         /* we must have an end of range */
+         if ( !*++p || *p == ']' )
+         {
+           *error_type = PATTERN_RANGE;
+           return false;
+         }
+
+         /* check for literal escape */
+         if ( *p == '\\' )
+           p++;
+
+         /* if end of pattern here then bad pattern */
+         if ( !*p++ )
+         {
+           *error_type = PATTERN_ESC;
+           return false;
+         }
+       }
+     } // while ( *p != ']' )
+   } // while ( *p )
+
+   return true;
+ } //bool is_valid_pattern (const char *p, int *error_type)
+
+
+ /*----------------------------------------------------------------------------
+ *
+ * Match the pattern PATTERN against the string TEXT;
+ *
+ * returns MATCH_VALID if pattern matches, or an errorcode as follows
+ * otherwise:
+ *
+ * MATCH_PATTERN - bad pattern
+ * MATCH_LITERAL - match failure on a literal character
+ * MATCH_RANGE - match failure on [..] construct
+ * MATCH_ABORT - premature end of text string
+ * MATCH_END - premature end of pattern string
+ * MATCH_VALID - valid match
+ *
+ *
+ * A match means the entire string TEXT is used up in matching.
+ *
+ * In the pattern string:
+ * `*' matches any sequence of characters (zero or more)
+ * `?' matches any character
+ * `%' matches any character and stores it in the s string
+ * [SET] matches any character in the specified set,
+ * [!SET] or [^SET] matches any character not in the specified set.
+ * \ is allowed within a set to escape a character like ']' or '-'
+ *
+ * A set is composed of characters or ranges; a range looks like
+ * character hyphen character (as in 0-9 or A-Z). [0-9a-zA-Z_] is the
+ * minimal set of characters allowed in the [..] pattern construct.
+ * Other characters are allowed (ie. 8 bit characters) if your system
+ * will support them.
+ *
+ * To suppress the special syntactic significance of any of `[]*?%!^-\',
+ * within a [..] construct and match the character exactly, precede it
+ * with a `\'.
+ *
+ ----------------------------------------------------------------------------*/
+
+ int matche ( register const char *p, register const char *t, register char *s )
+ {
+   /* p is the pattern, t the text, and s an output buffer that receives one
+      text character per '%' in the pattern (re-terminated with '\0' after
+      every store). The caller must size s for every '%' plus the
+      terminator. Returns one of the MATCH_* codes. */
+   register char range_start, range_end; /* start and end in range */
+
+   bool invert; /* is this [..] or [!..] */
+   bool member_match; /* have I matched the [..] construct? */
+   bool loop; /* should I terminate? */
+
+   for ( ; *p; p++, t++ ) {
+
+     /* if this is the end of the text then this is the end of the match.
+        Any run of trailing '*' matches the empty remainder; accepting only
+        a single '*' here made "a**" fail against "a" although it matched
+        "ab". */
+     if (!*t) {
+       while ( *p == '*' )
+         p++;
+       return ( *p == '\0' ) ? MATCH_VALID : MATCH_ABORT;
+     }
+
+     /* determine and react to pattern type */
+     switch ( *p ) {
+
+       /* single any character match */
+       case '?':
+         break;
+
+       /* single any character match, with extraction */
+       case '%': {
+         *s++ = *t;
+         *s = '\0';
+         break;
+       }
+
+       /* multiple any character match */
+       case '*':
+         return matche_after_star (p, t, s);
+
+       /* [..] construct, single member/exclusion character match */
+       case '[': {
+         /* move to beginning of range */
+         p++;
+
+         /* check if this is a member match or exclusion match */
+         invert = false;
+         if ( *p == '!' || *p == '^') {
+           invert = true;
+           p++;
+         }
+
+         /* if closing bracket here or at range start then we have a
+            malformed pattern */
+         if ( *p == ']' ) {
+           return MATCH_PATTERN;
+         }
+
+         member_match = false;
+         loop = true;
+
+         while ( loop ) {
+
+           /* if end of construct then loop is done */
+           if (*p == ']') {
+             loop = false;
+             continue;
+           }
+
+           /* matching a '!', '^', '-', '\' or a ']' */
+           if ( *p == '\\' ) {
+             range_start = range_end = *++p;
+           }
+           else {
+             range_start = range_end = *p;
+           }
+
+           /* if end of pattern then bad pattern (Missing ']') */
+           if (!*p)
+             return MATCH_PATTERN;
+
+           /* check for range bar */
+           if (*++p == '-') {
+
+             /* get the range end */
+             range_end = *++p;
+
+             /* if end of pattern or construct then bad pattern */
+             if (range_end == '\0' || range_end == ']')
+               return MATCH_PATTERN;
+
+             /* special character range end */
+             if (range_end == '\\') {
+               range_end = *++p;
+
+               /* if end of pattern then we have a bad pattern */
+               if (!range_end)
+                 return MATCH_PATTERN;
+             }
+
+             /* move just beyond this range */
+             p++;
+           }
+
+           /* if the text character is in range then match found.
+              make sure the range letters have the proper
+              relationship to one another before comparison */
+           if ( range_start < range_end ) {
+             if (*t >= range_start && *t <= range_end) {
+               member_match = true;
+               loop = false;
+             }
+           }
+           else {
+             if (*t >= range_end && *t <= range_start) {
+               member_match = true;
+               loop = false;
+             }
+           }
+         }
+
+         /* if there was a match in an exclusion set then no match */
+         /* if there was no match in a member set then no match */
+         if ((invert && member_match) ||
+             !(invert || member_match))
+           return MATCH_RANGE;
+
+         /* if this is not an exclusion then skip the rest of the [...]
+            construct that already matched, leaving p on the closing ']'
+            so the for-loop increment steps past it. */
+         if (member_match) {
+           while (*p != ']') {
+
+             /* bad pattern (Missing ']') */
+             if (!*p)
+               return MATCH_PATTERN;
+
+             /* skip exact match */
+             if (*p == '\\') {
+               p++;
+
+               /* if end of pattern then we have a bad pattern */
+               if (!*p)
+                 return MATCH_PATTERN;
+             }
+
+             /* move to next pattern char */
+             p++;
+           }
+         }
+
+         break;
+       } // case '['
+
+       /* must match this character exactly */
+       default:
+         if (*p != *t)
+           return MATCH_LITERAL;
+     }
+   }
+
+   /* pattern is used up: if end of text not reached then the pattern fails */
+   if ( *t )
+     return MATCH_END;
+   else
+     return MATCH_VALID;
+ }
+
+
+ /*----------------------------------------------------------------------------
+ *
+ * recursively call matche() with final segment of PATTERN and of TEXT.
+ *
+ ----------------------------------------------------------------------------*/
+
+ static int matche_after_star (register const char *p, register const char *t, register char *s)
+ {
+ /* Called by matche() with p pointing at a '*'. Consumes the run of
+ wildcards, then tries matche() on the rest of the pattern at each
+ successive text position. Returns a MATCH_* code. */
+ register int match = 0;
+ register char nextp;
+
+ /* pass over existing ?, % and * in pattern */
+ while ( *p == '?' || *p == '%' || *p == '*' ) {
+
+ /* take one char for each ? and % ('*' consumes nothing here) */
+ if ( *p == '?') {
+
+ /* if end of text then no match */
+ if ( !*t++ ) {
+ return MATCH_ABORT;
+ }
+ }
+
+ if ( *p == '%') {
+ /* store the extracted character and keep s NUL-terminated; the store
+ happens before the end-of-text test below */
+ *s++ = *t;
+ *s = '\0';
+ /* if end of text then no match */
+ if ( !*t++ ) {
+ return MATCH_ABORT;
+ }
+ }
+
+ /* move to next char in pattern */
+ p++;
+ }
+
+ /* if end of pattern we have matched regardless of text left */
+ if ( !*p ) {
+ return MATCH_VALID;
+ }
+
+ /* get the next character to match which must be a literal or '[' */
+ nextp = *p;
+
+ /* Continue until we run out of text or definite result seen */
+ do {
+
+ /* a precondition for matching is that the next character
+ in the pattern match the next character in the text or that
+ the next pattern char is the beginning of a range. Increment
+ text pointer as we go here */
+ if ( nextp == *t || nextp == '[' ) {
+ match = matche(p, t, s);
+ }
+
+ /* if the end of text is reached then no match */
+ if ( !*t++ ) match = MATCH_ABORT;
+
+ /* any other code (or the initial 0) means: retry one text character
+ further on */
+ } while ( match != MATCH_VALID &&
+ match != MATCH_ABORT &&
+ match != MATCH_PATTERN);
+
+ /* return result */
+ return match;
+ }
+
+
+ /*----------------------------------------------------------------------------
+ *
+ * match() is a shell to matche() to return only bool values.
+ *
+ ----------------------------------------------------------------------------*/
+
+ bool match(const char *p, const char *t, char *s)
+ {
+   /* collapse matche()'s status code to a flag: only MATCH_VALID is success */
+   return matche(p, t, s) == MATCH_VALID;
+ }
+
+
+ //***************************************************************************
+ //***************************************************************************
+ bool
+ ProcessMask(const std::string & rString,
+             const std::string & rWildcard,
+             std::string & rSubstr)
+ {
+   // Matches rString against rWildcard and, on success, stores in rSubstr
+   // the characters matched by the '%' placeholders. Returns true on a
+   // match; rSubstr is left untouched otherwise.
+
+   // the matcher stores one character per '%', so counting them gives an
+   // upper bound on the extracted length
+   size_t percent_count = 0;
+   for (size_t pos = rWildcard.find('%');
+        pos != rWildcard.npos;
+        pos = rWildcard.find('%', pos + 1))
+   {
+     percent_count++;
+   }
+
+   // zero-filled, string-owned buffer (extracted chars + terminator); it is
+   // released on every exit path, including when a string operation below
+   // throws, which a raw new[]/delete[] pair would not guarantee
+   std::string substr(percent_count + 1, '\0');
+
+   // optionally prepend '*/' to wildcard (an empty wildcard gets it too)
+   std::string wildcard(rWildcard);
+   if (wildcard.empty() || wildcard[0] != '*') {
+     wildcard = "*/" + wildcard;
+   }
+
+   // optionally prepend '/' to string (an empty string gets it too)
+   std::string string1(rString);
+   if (string1.empty() || string1[0] != '/') {
+     string1 = "/" + string1;
+   }
+
+   // parse the string
+   const bool matched = match(wildcard.c_str(), string1.c_str(), &substr[0]);
+   if (matched)
+   {
+     // c_str() cuts the result at the first terminator written by the matcher
+     rSubstr = substr.c_str();
+   }
+   return matched;
+ } // ProcessMask
+}
+
+
+#ifdef TEST
+
+/*
+* This test main expects as first arg the pattern and as second arg
+* the match string. Output is yea or nay on match. If nay on
+* match then the error code is parsed and written.
+*/
+
+#include <stdio.h>
+
+int main(int argc, char *argv[])
+{
+  // Test driver: argv[1] is the pattern, argv[2] the text to match.
+  // Prints the match result and, on failure, a description of the error.
+  // Always returns 0.
+  if (argc != 3) {
+    printf("Usage: MATCH Pattern Text\n");
+    return(0);
+  }
+
+  const char *pattern = argv[1];
+  const char *text = argv[2];
+
+  printf("Pattern: %s\n", pattern);
+  printf("Text : %s\n", text);
+
+  // the matcher functions are defined in namespace TNet, so qualify them
+  if (!TNet::is_pattern(pattern)) {
+    printf(" First Argument Is Not A Pattern\n");
+    return(0);
+  }
+
+  // size the extraction buffer from the '%' count of the PATTERN (not the
+  // program name), and zero-fill it so it is printable even when the
+  // pattern contains no '%'
+  size_t percent_count = 0;
+  for (const char *c = pattern; *c; c++) {
+    if (*c == '%') percent_count++;
+  }
+  std::string extracted(percent_count + 1, '\0');
+  char *s = &extracted[0];
+
+  int is_valid_error;
+
+  TNet::match(pattern, text, s) ? printf("true") : printf("false");
+  const int error = TNet::matche(pattern, text, s);
+  TNet::is_valid_pattern(pattern, &is_valid_error);
+
+  switch ( error ) {
+    case MATCH_VALID:
+      printf(" Match Successful");
+      if (is_valid_error != PATTERN_VALID)
+        printf(" -- is_valid_pattern() is complaining\n");
+      else
+        printf("\n");
+      printf("%s\n", s);
+      break;
+    case MATCH_LITERAL:
+      printf(" Match Failed on Literal Mismatch\n");
+      break;
+    case MATCH_RANGE:
+      printf(" Match Failed on [..]\n");
+      break;
+    case MATCH_ABORT:
+      printf(" Match Failed on Early Text Termination\n");
+      break;
+    case MATCH_END:
+      printf(" Match Failed on Early Pattern Termination\n");
+      break;
+    case MATCH_PATTERN:
+      switch ( is_valid_error ) {
+        case PATTERN_VALID:
+          printf(" Internal Disagreement On Pattern\n");
+          break;
+        case PATTERN_ESC:
+          printf(" Literal Escape at End of Pattern\n");
+          break;
+        case PATTERN_RANGE:
+          printf(" No End of Range in [..] Construct\n");
+          break;
+        case PATTERN_CLOSE:
+          printf(" [..] Construct is Open\n");
+          break;
+        case PATTERN_EMPTY:
+          printf(" [..] Construct is Empty\n");
+          break;
+        default:
+          printf(" Internal Error in is_valid_pattern()\n");
+      }
+      break;
+    default:
+      printf(" Internal Error in matche()\n");
+      break;
+  }
+
+  return(0);
+}
+
+#endif