let HTK I/O implementation be a single package

author: Determinant <[email protected]> 2015-06-25 12:56:45 +0800
committer: Determinant <[email protected]> 2015-06-25 12:56:45 +0800
commit: a74183ddb4ab8383bfe214b3745eb8a0a99ee47a (patch)
tree: d5e69cf8c4c2db2e3a4722778352fc3c95953bb2 /htk_io/src/KaldiLib/StkMatch.cc
parent: b6301089cde20f4c825c7f5deaf179082aad63da (diff)
1 files changed, 582 insertions, 0 deletions
diff --git a/htk_io/src/KaldiLib/StkMatch.cc b/htk_io/src/KaldiLib/StkMatch.cc
new file mode 100644
index 0000000..4ff4b18
--- /dev/null
+++ b/htk_io/src/KaldiLib/StkMatch.cc
@@ -0,0 +1,582 @@
+/*
+ EPSHeader
+
+   File: filmatch.c
+   Author: J. Kercheval
+   Created: Thu, 03/14/1991  22:22:01
+*/
+
+/*
+ EPSRevision History
+   O. Glembek    Thu, 03/11/2005  01:58:00  Added Mask extraction support (char % does this)
+   J. Kercheval  Wed, 02/20/1991  22:29:01  Released to Public Domain
+   J. Kercheval  Fri, 02/22/1991  15:29:01  fix '\' bugs (two :( of them)
+   J. Kercheval  Sun, 03/10/1991  19:31:29  add error return to matche()
+   J. Kercheval  Sun, 03/10/1991  20:11:11  add is_valid_pattern code
+   J. Kercheval  Sun, 03/10/1991  20:37:11  beef up main()
+   J. Kercheval  Tue, 03/12/1991  22:25:10  Released as V1.1 to Public Domain
+   J. Kercheval  Thu, 03/14/1991  22:22:25  remove '\' for DOS file parsing
+   J. Kercheval  Thu, 03/28/1991  20:58:27  include filmatch.h
+*/
+
+/*
+   Wildcard Pattern Matching
+*/
+
+
+#include "StkMatch.h"
+#include "Common.h"
+
+namespace TNet
+{
+  //#define TEST
+  static int matche_after_star (register const char *pattern, register const char *text, register char *s);
+  // following function is not defined or used.
+  // static int fast_match_after_star (register const char *pattern, register const char *text);
+
+  /*----------------------------------------------------------------------------
+  *
+  * Return true if PATTERN has any special wildcard characters
+  *
+  ----------------------------------------------------------------------------*/
+
+  bool is_pattern (const char *p)
+  {
+      while ( *p ) {
+          switch ( *p++ ) {
+              case '?':
+              case '*':
+              case '%':
+              case '[':
+                return true;
+          }
+      }
+      return false;
+  }
+
+
+  /*----------------------------------------------------------------------------
+  *
+  * Return true if PATTERN has is a well formed regular expression according
+  * to the above syntax
+  *
+  * error_type is a return code based on the type of pattern error.  Zero is
+  * returned in error_type if the pattern is a valid one.  error_type return
+  * values are as follows:
+  *
+  *   PATTERN_VALID - pattern is well formed
+  *   PATTERN_RANGE - [..] construct has a no end range in a '-' pair (ie [a-])
+  *   PATTERN_CLOSE - [..] construct has no end bracket (ie [abc-g )
+  *   PATTERN_EMPTY - [..] construct is empty (ie [])
+  *
+  ----------------------------------------------------------------------------*/
+
+  bool is_valid_pattern (const char *p, int *error_type)
+  {
+
+  /* init error_type */
+  *error_type = PATTERN_VALID;
+
+    /* loop through pattern to EOS */
+    while ( *p )
+    {
+      /* determine pattern type */
+      switch ( *p )
+      {
+        /* the [..] construct must be well formed */
+        case '[':
+        {
+          p++;
+
+          /* if the next character is ']' then bad pattern */
+          if ( *p == ']' ) {
+            *error_type = PATTERN_EMPTY;
+            return false;
+          }
+
+          /* if end of pattern here then bad pattern */
+          if ( !*p )
+          {
+            *error_type = PATTERN_CLOSE;
+            return false;
+          }
+
+          /* loop to end of [..] construct */
+          while ( *p != ']' )
+          {
+            /* check for literal escape */
+            if ( *p == '\\' )
+            {
+                p++;
+
+                /* if end of pattern here then bad pattern */
+                if ( !*p++ ) {
+                    *error_type = PATTERN_ESC;
+                    return false;
+                }
+            }
+            else
+                p++;
+
+            /* if end of pattern here then bad pattern */
+            if ( !*p )
+            {
+              *error_type = PATTERN_CLOSE;
+              return false;
+            }
+
+            /* if this a range */
+            if ( *p == '-' )
+            {
+              /* we must have an end of range */
+              if ( !*++p || *p == ']' )
+              {
+                *error_type = PATTERN_RANGE;
+                return false;
+              }
+              else
+              {
+
+                /* check for literal escape */
+                if ( *p == '\\' )
+                    p++;
+
+                /* if end of pattern here then bad pattern */
+                if ( !*p++ )
+                {
+                    *error_type = PATTERN_ESC;
+                    return false;
+                }
+              }
+            }
+          }
+          break;
+        } //case '[':
+
+
+        /* all other characters are valid pattern elements */
+        case '*':
+        case '?':
+        case '%':
+        default:
+          p++;                              /* "normal" character */
+          break;
+      } // switch ( *p )
+    } // while ( *p )
+
+    return true;
+  } //bool is_valid_pattern (const char *p, int *error_type)
+
+
+  /*----------------------------------------------------------------------------
+  *
+  *  Match the pattern PATTERN against the string TEXT;
+  *
+  *  returns MATCH_VALID if pattern matches, or an errorcode as follows
+  *  otherwise:
+  *
+  *            MATCH_PATTERN  - bad pattern
+  *            MATCH_RANGE    - match failure on [..] construct
+  *            MATCH_ABORT    - premature end of text string
+  *            MATCH_END      - premature end of pattern string
+  *            MATCH_VALID    - valid match
+  *
+  *
+  *  A match means the entire string TEXT is used up in matching.
+  *
+  *  In the pattern string:
+  *       `*' matches any sequence of characters (zero or more)
+  *       `?' matches any character
+  *       `%' matches any character and stores it in the s string
+  *       [SET] matches any character in the specified set,
+  *       [!SET] or [^SET] matches any character not in the specified set.
+  *       \ is allowed within a set to escape a character like ']' or '-'
+  *
+  *  A set is composed of characters or ranges; a range looks like
+  *  character hyphen character (as in 0-9 or A-Z).  [0-9a-zA-Z_] is the
+  *  minimal set of characters allowed in the [..] pattern construct.
+  *  Other characters are allowed (ie. 8 bit characters) if your system
+  *  will support them.
+  *
+  *  To suppress the special syntactic significance of any of `[]*?%!^-\',
+  *  within a [..] construct and match the character exactly, precede it
+  *  with a `\'.
+  *
+  ----------------------------------------------------------------------------*/
+
+  int matche ( register const char *p, register const char *t, register char *s )
+  {
+      register char range_start, range_end;  /* start and end in range */
+
+      bool invert;             /* is this [..] or [!..] */
+      bool member_match;       /* have I matched the [..] construct? */
+      bool loop;               /* should I terminate? */
+
+      for ( ; *p; p++, t++ ) {
+
+          /* if this is the end of the text then this is the end of the match */
+          if (!*t) {
+              return ( *p == '*' && *++p == '\0' ) ? MATCH_VALID : MATCH_ABORT;
+          }
+
+          /* determine and react to pattern type */
+          switch ( *p ) {
+
+              /* single any character match */
+              case '?':
+                  break;
+
+              /* single any character match, with extraction*/
+              case '%': {
+                  *s++ = *t;
+                  *s   = '\0';
+                  break;
+              }
+
+              /* multiple any character match */
+              case '*':
+                  return matche_after_star (p, t, s);
+
+              /* [..] construct, single member/exclusion character match */
+              case '[': {
+                /* move to beginning of range */
+                p++;
+
+                /* check if this is a member match or exclusion match */
+                invert = false;
+                if ( *p == '!' || *p == '^') {
+                    invert = true;
+                    p++;
+                }
+
+                /* if closing bracket here or at range start then we have a
+                   malformed pattern */
+                if ( *p == ']' ) {
+                    return MATCH_PATTERN;
+                }
+
+                member_match = false;
+                loop = true;
+
+                while ( loop ) {
+
+                    /* if end of construct then loop is done */
+                    if (*p == ']') {
+                        loop = false;
+                        continue;
+                    }
+
+                    /* matching a '!', '^', '-', '\' or a ']' */
+                    if ( *p == '\\' ) {
+                        range_start = range_end = *++p;
+                    }
+                    else {
+                        range_start = range_end = *p;
+                    }
+
+                    /* if end of pattern then bad pattern (Missing ']') */
+                    if (!*p)
+                        return MATCH_PATTERN;
+
+                    /* check for range bar */
+                    if (*++p == '-') {
+
+                        /* get the range end */
+                        range_end = *++p;
+
+                        /* if end of pattern or construct then bad pattern */
+                        if (range_end == '\0' || range_end == ']')
+                            return MATCH_PATTERN;
+
+                        /* special character range end */
+                        if (range_end == '\\') {
+                            range_end = *++p;
+
+                            /* if end of text then we have a bad pattern */
+                            if (!range_end)
+                                return MATCH_PATTERN;
+                        }
+
+                        /* move just beyond this range */
+                        p++;
+                    }
+
+                    /* if the text character is in range then match found.
+                       make sure the range letters have the proper
+                       relationship to one another before comparison */
+                    if ( range_start < range_end  ) {
+                        if (*t >= range_start && *t <= range_end) {
+                            member_match = true;
+                            loop = false;
+                        }
+                    }
+                    else {
+                        if (*t >= range_end && *t <= range_start) {
+                            member_match = true;
+                            loop = false;
+                        }
+                    }
+                }
+
+                /* if there was a match in an exclusion set then no match */
+                /* if there was no match in a member set then no match */
+                if ((invert && member_match) ||
+                   !(invert || member_match))
+                    return MATCH_RANGE;
+
+                /* if this is not an exclusion then skip the rest of the [...]
+                    construct that already matched. */
+                if (member_match) {
+                    while (*p != ']') {
+
+                        /* bad pattern (Missing ']') */
+                        if (!*p)
+                            return MATCH_PATTERN;
+
+                        /* skip exact match */
+                        if (*p == '\\') {
+                            p++;
+
+                            /* if end of text then we have a bad pattern */
+                            if (!*p)
+                                return MATCH_PATTERN;
+                        }
+
+                        /* move to next pattern char */
+                        p++;
+                    }
+                }
+
+                break;
+            } // case ']'
+
+            /* must match this character exactly */
+            default:
+                if (*p != *t)
+                    return MATCH_LITERAL;
+        }
+      }
+
+      //*s = '\0';
+      /* if end of text not reached then the pattern fails */
+      if ( *t )
+          return MATCH_END;
+      else
+          return MATCH_VALID;
+  }
+
+
+  /*----------------------------------------------------------------------------
+  *
+  * recursively call matche() with final segment of PATTERN and of TEXT.
+  *
+  ----------------------------------------------------------------------------*/
+
+  static int matche_after_star (register const char *p, register const char *t, register char *s)
+  {
+      register int match = 0;
+      register char nextp;
+
+      /* pass over existing ? and * in pattern */
+      while ( *p == '?' || *p == '%' || *p == '*' ) {
+
+          /* take one char for each ? and + */
+          if ( *p == '?') {
+
+              /* if end of text then no match */
+              if ( !*t++ ) {
+                  return MATCH_ABORT;
+              }
+          }
+
+          if ( *p == '%') {
+              *s++ = *t;
+              *s   = '\0';
+              /* if end of text then no match */
+              if ( !*t++ ) {
+                  return MATCH_ABORT;
+              }
+          }
+
+          /* move to next char in pattern */
+          p++;
+      }
+
+      /* if end of pattern we have matched regardless of text left */
+      if ( !*p ) {
+          return MATCH_VALID;
+      }
+
+      /* get the next character to match which must be a literal or '[' */
+      nextp = *p;
+
+      /* Continue until we run out of text or definite result seen */
+      do {
+
+          /* a precondition for matching is that the next character
+             in the pattern match the next character in the text or that
+             the next pattern char is the beginning of a range.  Increment
+             text pointer as we go here */
+          if ( nextp == *t || nextp == '[' ) {
+              match = matche(p, t, s);
+          }
+
+          /* if the end of text is reached then no match */
+          if ( !*t++ ) match = MATCH_ABORT;
+
+      } while ( match != MATCH_VALID && 
+                match != MATCH_ABORT &&
+                match != MATCH_PATTERN);
+
+      /* return result */
+      return match;
+  }
+
+
+  /*----------------------------------------------------------------------------
+  *
+  * match() is a shell to matche() to return only bool values.
+  *
+  ----------------------------------------------------------------------------*/
+
+  bool match(const  char *p, const char *t, char *s)
+  {
+      int error_type;
+      error_type = matche(p,t,s);
+      return (error_type != MATCH_VALID ) ? false : true;
+  }
+
+
+  //***************************************************************************
+  //***************************************************************************
+  bool
+  ProcessMask(const std::string & rString,
+              const std::string & rWildcard,
+                    std::string & rSubstr)
+  {
+    char *  substr;
+    int     percent_count        = 0;
+    int     ret ;
+    size_t  pos                  = 0;
+
+    // let's find how many % to allocate enough space for the return substring
+    while ((pos = rWildcard.find('%', pos)) != rWildcard.npos)
+    {
+      percent_count++;
+      pos++;
+    }
+
+    // allocate space for the substring
+    substr = new char[percent_count + 1];
+    substr[percent_count] = 0;
+    substr[0]             = '\0';
+
+    // optionally prepend '*/' to wildcard
+    std::string wildcard(rWildcard);
+    if(wildcard[0] != '*') {
+      wildcard = "*/" + wildcard;
+    }
+
+    //optionally prepend '/' to string
+    std::string string1(rString);
+    if(string1[0] != '/') {
+      string1 = "/" + string1;
+    }
+
+    // parse the string
+    if (0 != (ret = match(wildcard.c_str(), string1.c_str(), substr)))
+    {
+      rSubstr = substr;
+    }
+    delete[] substr;
+    return ret;
+  } // ProcessMask
+}
+
+
+#ifdef TEST
+
+/*
+* This test main expects as first arg the pattern and as second arg
+* the match string.  Output is yaeh or nay on match.  If nay on
+* match then the error code is parsed and written.
+*/
+
+#include <stdio.h>
+
+int main(int argc, char *argv[])
+{
+    int error;
+    int is_valid_error;
+
+    char * tmp = argv[0];
+    int i = 0;
+    for (; *tmp; tmp++)
+      if (*tmp=='%') i++;
+
+    char s[i+1];
+
+
+    if (argc != 3) {
+        printf("Usage:  MATCH Pattern Text\n");
+    }
+    else {
+        printf("Pattern: %s\n", argv[1]);
+        printf("Text   : %s\n", argv[2]);
+        
+        if (!is_pattern(argv[1])) {
+            printf("    First Argument Is Not A Pattern\n");
+        }
+        else {
+            match(argv[1],argv[2], s) ? printf("true") : printf("false");
+            error = matche(argv[1],argv[2], s);
+            is_valid_pattern(argv[1],&is_valid_error);
+
+            switch ( error ) {
+                case MATCH_VALID:
+                    printf("    Match Successful");
+                    if (is_valid_error != PATTERN_VALID)
+                        printf(" -- is_valid_pattern() is complaining\n");
+                    else
+                        printf("\n");
+                    printf("%s\n", s);
+
+                    break;
+                case MATCH_RANGE:
+                    printf("    Match Failed on [..]\n");
+                    break;
+                case MATCH_ABORT:
+                    printf("    Match Failed on Early Text Termination\n");
+                    break;
+                case MATCH_END:
+                    printf("    Match Failed on Early Pattern Termination\n");
+                    break;
+                case MATCH_PATTERN:
+                    switch ( is_valid_error ) {
+                        case PATTERN_VALID:
+                            printf("    Internal Disagreement On Pattern\n");
+                            break;
+                        case PATTERN_RANGE:
+                            printf("    No End of Range in [..] Construct\n");
+                            break;
+                        case PATTERN_CLOSE:
+                            printf("    [..] Construct is Open\n");
+                            break;
+                        case PATTERN_EMPTY:
+                            printf("    [..] Construct is Empty\n");
+                            break;
+                        default:
+                            printf("    Internal Error in is_valid_pattern()\n");
+                    }
+                    break;
+                default:
+                    printf("    Internal Error in matche()\n");
+                    break;
+            }
+        }
+
+    }
+    return(0);
+}
+
+#endif
author	Determinant <[email protected]>	2015-06-25 12:56:45 +0800
committer	Determinant <[email protected]>	2015-06-25 12:56:45 +0800
commit	a74183ddb4ab8383bfe214b3745eb8a0a99ee47a (patch)
tree	d5e69cf8c4c2db2e3a4722778352fc3c95953bb2 /htk_io/src/KaldiLib/StkMatch.cc
parent	b6301089cde20f4c825c7f5deaf179082aad63da (diff)