[RFC/PATCH] Regex: support \A, \Z and \z anchors known from Perl

Bert Wesarg Tue, 06 Apr 2010 07:15:55 -0700

This is a proof-of-concept implementation of the mentioned anchors. For those
who don't know them (that included me until yesterday):


 \A - matches the very beginning of a file
 \z - matches the very end of a file
 \Z - matches the last end of line: just before a trailing newline,
        or the very end of the file if it does not end with a newline

Here is a web reference, from where I acquired my new knowledge:
    http://www.regular-expressions.info/anchors.html

If someone out there with more experience with these anchors could test this,
that would be very welcome.

As for the rationale: I tried to highlight a patch header, i.e. the part of the
patch that spans up to the first line starting with "diff" or "Index:". \A
would be nice to have for that, so here it is.

Uwe, I have Cc'ed you on purpose because of your matching pattern patch, about
which I have a question: Is it right that it's not possible to specify a
zero-width pattern? I would like to use \z as an end pattern.

Thanks.

Regards,
Bert

---

 source/regularExp.c |  169 +++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 116 insertions(+), 53 deletions(-)

diff --quilt old/source/regularExp.c new/source/regularExp.c
--- old/source/regularExp.c
+++ new/source/regularExp.c
@@ -134,78 +134,84 @@ static const char CVSID[] = "$Id: regula
 
 /* Zero width positional assertions. */
 
-#define BOL                2  /* Match position at beginning of line. */
-#define EOL                3  /* Match position at end of line. */
-#define BOWORD             4  /* Match "" representing word delimiter or BOL */
-#define EOWORD             5  /* Match "" representing word delimiter or EOL */
-#define NOT_BOUNDARY       6  /* Not word boundary (\B, opposite of < and >) */
+#define FBOL               2  /* Match position at beginning of file. */
+                              /* \A, very first beginning of line */
+#define LEOL               3  /* Match position at end of file. */
+                              /* \Z, very last end of line */
+#define AEOF               4  /* Match position at absolute end of file. */
+                              /* \z, before \0 */
+#define BOL                5  /* Match position at beginning of line. */
+#define EOL                6  /* Match position at end of line. */
+#define BOWORD             7  /* Match "" representing word delimiter or BOL */
+#define EOWORD             8  /* Match "" representing word delimiter or EOL */
+#define NOT_BOUNDARY       9  /* Not word boundary (\B, opposite of < and >) */
 
 /* Op codes with null terminated string operands. */
 
-#define EXACTLY            7  /* Match this string. */
-#define SIMILAR            8  /* Match this case insensitive string */
-#define ANY_OF             9  /* Match any character in the set. */
-#define ANY_BUT           10  /* Match any character not in the set. */
+#define EXACTLY           10  /* Match this string. */
+#define SIMILAR           11  /* Match this case insensitive string */
+#define ANY_OF            12  /* Match any character in the set. */
+#define ANY_BUT           13  /* Match any character not in the set. */
 
 /* Op codes to match any character. */
 
-#define ANY               11  /* Match any one character (implements '.') */
-#define EVERY             12  /* Same as ANY but matches newline. */
+#define ANY               14  /* Match any one character (implements '.') */
+#define EVERY             15  /* Same as ANY but matches newline. */
 
 /* Shortcut escapes, \d, \D, \l, \L, \s, \S, \w, \W, \y, \Y. */
 
-#define DIGIT             13  /* Match any digit, i.e. [0123456789] */
-#define NOT_DIGIT         14  /* Match any non-digit, i.e. [^0123456789] */
-#define LETTER            15  /* Match any letter character [a-zA-Z] */
-#define NOT_LETTER        16  /* Match any non-letter character [^a-zA-Z] */
-#define SPACE             17  /* Match any whitespace character EXCEPT \n */
-#define SPACE_NL          18  /* Match any whitespace character INCLUDING \n */
-#define NOT_SPACE         19  /* Match any non-whitespace character */
-#define NOT_SPACE_NL      20  /* Same as NOT_SPACE but matches newline. */
-#define WORD_CHAR         21  /* Match any word character [a-zA-Z0-9_] */
-#define NOT_WORD_CHAR     22  /* Match any non-word character [^a-zA-Z0-9_] */
-#define IS_DELIM          23  /* Match any character that's a word delimiter */
-#define NOT_DELIM         24  /* Match any character NOT a word delimiter */
+#define DIGIT             16  /* Match any digit, i.e. [0123456789] */
+#define NOT_DIGIT         17  /* Match any non-digit, i.e. [^0123456789] */
+#define LETTER            18  /* Match any letter character [a-zA-Z] */
+#define NOT_LETTER        19  /* Match any non-letter character [^a-zA-Z] */
+#define SPACE             20  /* Match any whitespace character EXCEPT \n */
+#define SPACE_NL          21  /* Match any whitespace character INCLUDING \n */
+#define NOT_SPACE         22  /* Match any non-whitespace character */
+#define NOT_SPACE_NL      23  /* Same as NOT_SPACE but matches newline. */
+#define WORD_CHAR         24  /* Match any word character [a-zA-Z0-9_] */
+#define NOT_WORD_CHAR     25  /* Match any non-word character [^a-zA-Z0-9_] */
+#define IS_DELIM          26  /* Match any character that's a word delimiter */
+#define NOT_DELIM         27  /* Match any character NOT a word delimiter */
 
 /* Quantifier nodes. (Only applied to SIMPLE nodes.  Quantifiers applied to non
    SIMPLE nodes or larger atoms are implemented using complex constructs.)*/
 
-#define STAR              25  /* Match this (simple) thing 0 or more times. */
-#define LAZY_STAR         26  /* Minimal matching STAR */
-#define QUESTION          27  /* Match this (simple) thing 0 or 1 times. */
-#define LAZY_QUESTION     28  /* Minimal matching QUESTION */
-#define PLUS              29  /* Match this (simple) thing 1 or more times. */
-#define LAZY_PLUS         30  /* Minimal matching PLUS */
-#define BRACE             31  /* Match this (simple) thing m to n times. */
-#define LAZY_BRACE        32  /* Minimal matching BRACE */
+#define STAR              28  /* Match this (simple) thing 0 or more times. */
+#define LAZY_STAR         29  /* Minimal matching STAR */
+#define QUESTION          30  /* Match this (simple) thing 0 or 1 times. */
+#define LAZY_QUESTION     31  /* Minimal matching QUESTION */
+#define PLUS              32  /* Match this (simple) thing 1 or more times. */
+#define LAZY_PLUS         33  /* Minimal matching PLUS */
+#define BRACE             34  /* Match this (simple) thing m to n times. */
+#define LAZY_BRACE        35  /* Minimal matching BRACE */
 
 /* Nodes used to build complex constructs. */
 
-#define NOTHING           33  /* Match empty string (always matches) */
-#define BRANCH            34  /* Match this alternative, or the next... */
-#define BACK              35  /* Always matches, NEXT ptr points backward. */
-#define INIT_COUNT        36  /* Initialize {m,n} counter to zero */
-#define INC_COUNT         37  /* Increment {m,n} counter by one */
-#define TEST_COUNT        38  /* Test {m,n} counter against operand */
+#define NOTHING           36  /* Match empty string (always matches) */
+#define BRANCH            37  /* Match this alternative, or the next... */
+#define BACK              38  /* Always matches, NEXT ptr points backward. */
+#define INIT_COUNT        39  /* Initialize {m,n} counter to zero */
+#define INC_COUNT         40  /* Increment {m,n} counter by one */
+#define TEST_COUNT        41  /* Test {m,n} counter against operand */
 
 /* Back Reference nodes. */
 
-#define BACK_REF          39  /* Match latest matched parenthesized text */
-#define BACK_REF_CI       40  /* Case insensitive version of BACK_REF */
-#define X_REGEX_BR        41  /* Cross-Regex Back-Ref for syntax highlighting 
*/
-#define X_REGEX_BR_CI     42  /* Case insensitive version of X_REGEX_BR_CI */
+#define BACK_REF          42  /* Match latest matched parenthesized text */
+#define BACK_REF_CI       43  /* Case insensitive version of BACK_REF */
+#define X_REGEX_BR        44  /* Cross-Regex Back-Ref for syntax highlighting 
*/
+#define X_REGEX_BR_CI     45  /* Case insensitive version of X_REGEX_BR_CI */
 
 /* Various nodes used to implement parenthetical constructs. */
 
-#define POS_AHEAD_OPEN    43  /* Begin positive look ahead */
-#define NEG_AHEAD_OPEN    44  /* Begin negative look ahead */
-#define LOOK_AHEAD_CLOSE  45  /* End positive or negative look ahead */
-
-#define POS_BEHIND_OPEN   46  /* Begin positive look behind */
-#define NEG_BEHIND_OPEN   47  /* Begin negative look behind */
-#define LOOK_BEHIND_CLOSE 48  /* Close look behind */
+#define POS_AHEAD_OPEN    46  /* Begin positive look ahead */
+#define NEG_AHEAD_OPEN    47  /* Begin negative look ahead */
+#define LOOK_AHEAD_CLOSE  48  /* End positive or negative look ahead */
+
+#define POS_BEHIND_OPEN   49  /* Begin positive look behind */
+#define NEG_BEHIND_OPEN   50  /* Begin negative look behind */
+#define LOOK_BEHIND_CLOSE 51  /* Close look behind */
 
-#define OPEN              49  /* Open for capturing parentheses. */
+#define OPEN              52  /* Open for capturing parentheses. */
 
                               /*  OPEN+1 is number 1, etc. */
 #define CLOSE       (OPEN + NSUBEXP)  /* Close for capturing parentheses. */
@@ -2262,12 +2268,12 @@ static unsigned char * shortcut_escape (
    int            emit) {
 
    register unsigned char *class   = NULL;
-   static   unsigned char *codes   = (unsigned char *) "ByYdDlLsSwW";
+   static   unsigned char *codes   = (unsigned char *) "AByYzZdDlLsSwW";
             unsigned char *ret_val = (unsigned char *) 1; /* Assume success. */
             unsigned char *valid_codes;
 
    if (emit == EMIT_CLASS_BYTES || emit == CHECK_CLASS_ESCAPE) {
-      valid_codes = codes + 3; /* \B, \y and \Y are not allowed in classes */
+      valid_codes = codes + 6; /* \A, \B, \y, \Y, \z and \Z are not allowed in 
classes */
    } else {
       valid_codes = codes;
    }
@@ -2354,6 +2360,16 @@ static unsigned char * shortcut_escape (
 
          break;
 
+      case 'A':
+
+         if (emit == EMIT_NODE) {
+            ret_val = emit_node (FBOL);
+         } else {
+            REG_FAIL ("internal error #7 `shortcut_escape\'");
+         }
+
+         break;
+
       case 'B':
 
          if (emit == EMIT_NODE) {
@@ -2364,6 +2380,26 @@ static unsigned char * shortcut_escape (
 
          break;
 
+      case 'z':
+
+         if (emit == EMIT_NODE) {
+            ret_val = emit_node (AEOF);
+         } else {
+            REG_FAIL ("internal error #7 `shortcut_escape\'");
+         }
+
+         break;
+
+      case 'Z':
+
+         if (emit == EMIT_NODE) {
+            ret_val = emit_node (LEOL);
+         } else {
+            REG_FAIL ("internal error #7 `shortcut_escape\'");
+         }
+
+         break;
+
       default:
          /* We get here if there isn't a case for every character in
             the string "codes" */
@@ -2371,7 +2407,7 @@ static unsigned char * shortcut_escape (
          REG_FAIL ("internal error #8 `shortcut_escape\'");
    }
 
-   if (emit == EMIT_NODE  &&  c != 'B') {
+   if (emit == EMIT_NODE  &&  !(c == 'A' || c == 'B' || c == 'z' || c == 'Z')) 
{
       *flag_param |= (HAS_WIDTH | SIMPLE);
    }
 
@@ -2642,6 +2678,8 @@ static int Recursion_Limit_Exceeded; /* 
 
 /* static regexp *Cross_Regex_Backref; */
 
+static int Prev_Is_FBOL;
+static int Succ_Is_AEOF;
 static int Prev_Is_BOL;
 static int Succ_Is_EOL;
 static int Prev_Is_Delim;
@@ -2736,7 +2774,9 @@ int ExecRE(regexp *prog, const char* str
    /* Remember the logical end of the string. */
    
    End_Of_String = (unsigned char *) match_to;
-   
+
+   /* \z needs the original succ_char */
+   Succ_Is_AEOF = ((succ_char == '\0') ? 1 : 0);
    if (end == NULL && reverse) {
       for (end = string; !AT_END_OF_STRING((unsigned char*)end); end++) ;
       succ_char = '\n';
@@ -2753,6 +2793,7 @@ int ExecRE(regexp *prog, const char* str
    Start_Of_String    = (unsigned char *) string;
    Look_Behind_To     = (unsigned char *) 
(look_behind_to?look_behind_to:string);
 
+   Prev_Is_FBOL       = ((prev_char == '\0') ? 1 : 0);
    Prev_Is_BOL        = ((prev_char == '\n') || (prev_char == '\0') ? 1 : 0);
    Succ_Is_EOL        = ((succ_char == '\n') || (succ_char == '\0') ? 1 : 0);
    Prev_Is_Delim      = (Current_Delimiters [(unsigned char)prev_char] ? 1 : 
0);
@@ -3130,6 +3171,28 @@ static int match (unsigned char *prog, i
 
             break;
 
+         case FBOL: /* `\A' (first beginning of line anchor) */
+            if ((Reg_Input == Start_Of_String) && Prev_Is_FBOL) {
+               break;
+            }
+
+            MATCH_RETURN (0);
+
+         case LEOL: /* `\Z' (last end of line anchor) */
+            if (((*Reg_Input == '\n') && AT_END_OF_STRING(Reg_Input + 1) && 
Succ_Is_AEOF) ||
+                (AT_END_OF_STRING(Reg_Input) && (*(Reg_Input - 1) != '\n'))) {
+               break;
+            }
+
+            MATCH_RETURN (0);
+
+         case AEOF: /* `\z' (absolute end of file anchor) */
+            if (AT_END_OF_STRING(Reg_Input) && Succ_Is_AEOF) {
+               break;
+            }
+
+            MATCH_RETURN (0);
+
          case BOL: /* `^' (beginning of line anchor) */
             if (Reg_Input == Start_Of_String) {
                if (Prev_Is_BOL) break;
-- 
NEdit Develop mailing list - [email protected]
http://www.nedit.org/mailman/listinfo/develop

[RFC/PATCH] Regex: support \A, \Z and \z anchors known from Perl

Reply via email to