This is a proof-of-concept implementation of the mentioned anchors. For those
who don't know them (that included me until yesterday):
\A - matches the very beginning of a file
\z - matches the very end of a file
\Z - matches the last end of line of a file: just before the final newline
if the file ends with one, otherwise at the very end of the file
Here is the web reference from which I acquired my new knowledge:
http://www.regular-expressions.info/anchors.html
If someone out there with more experience with these anchors could test this,
that would be very welcome.
As for the rationale: I tried to highlight a patch header, i.e. the part of the
patch that extends up to the first line starting with "diff" or "Index:". So
\A would be nice to have here. So here it is.
Uwe, I have Cc'd you on purpose because of your matching pattern patch, about
which I have a question: Is it right that it's not possible to specify a
zero-width pattern? I would like to use \z as an end pattern.
Thanks.
Regards,
Bert
---
source/regularExp.c | 169 +++++++++++++++++++++++++++++++++++-----------------
1 file changed, 116 insertions(+), 53 deletions(-)
diff --quilt old/source/regularExp.c new/source/regularExp.c
--- old/source/regularExp.c
+++ new/source/regularExp.c
@@ -134,78 +134,84 @@ static const char CVSID[] = "$Id: regula
/* Zero width positional assertions. */
-#define BOL 2 /* Match position at beginning of line. */
-#define EOL 3 /* Match position at end of line. */
-#define BOWORD 4 /* Match "" representing word delimiter or BOL */
-#define EOWORD 5 /* Match "" representing word delimiter or EOL */
-#define NOT_BOUNDARY 6 /* Not word boundary (\B, opposite of < and >) */
+#define FBOL 2 /* Match position at beginning of file. */
+ /* \A, very first beginning of line */
+#define LEOL 3 /* Match position at end of file. */
+ /* \Z, very last end of line */
+#define AEOF 4 /* Match position at absolute end of file. */
+ /* \z, before \0 */
+#define BOL 5 /* Match position at beginning of line. */
+#define EOL 6 /* Match position at end of line. */
+#define BOWORD 7 /* Match "" representing word delimiter or BOL */
+#define EOWORD 8 /* Match "" representing word delimiter or EOL */
+#define NOT_BOUNDARY 9 /* Not word boundary (\B, opposite of < and >) */
/* Op codes with null terminated string operands. */
-#define EXACTLY 7 /* Match this string. */
-#define SIMILAR 8 /* Match this case insensitive string */
-#define ANY_OF 9 /* Match any character in the set. */
-#define ANY_BUT 10 /* Match any character not in the set. */
+#define EXACTLY 10 /* Match this string. */
+#define SIMILAR 11 /* Match this case insensitive string */
+#define ANY_OF 12 /* Match any character in the set. */
+#define ANY_BUT 13 /* Match any character not in the set. */
/* Op codes to match any character. */
-#define ANY 11 /* Match any one character (implements '.') */
-#define EVERY 12 /* Same as ANY but matches newline. */
+#define ANY 14 /* Match any one character (implements '.') */
+#define EVERY 15 /* Same as ANY but matches newline. */
/* Shortcut escapes, \d, \D, \l, \L, \s, \S, \w, \W, \y, \Y. */
-#define DIGIT 13 /* Match any digit, i.e. [0123456789] */
-#define NOT_DIGIT 14 /* Match any non-digit, i.e. [^0123456789] */
-#define LETTER 15 /* Match any letter character [a-zA-Z] */
-#define NOT_LETTER 16 /* Match any non-letter character [^a-zA-Z] */
-#define SPACE 17 /* Match any whitespace character EXCEPT \n */
-#define SPACE_NL 18 /* Match any whitespace character INCLUDING \n */
-#define NOT_SPACE 19 /* Match any non-whitespace character */
-#define NOT_SPACE_NL 20 /* Same as NOT_SPACE but matches newline. */
-#define WORD_CHAR 21 /* Match any word character [a-zA-Z0-9_] */
-#define NOT_WORD_CHAR 22 /* Match any non-word character [^a-zA-Z0-9_] */
-#define IS_DELIM 23 /* Match any character that's a word delimiter */
-#define NOT_DELIM 24 /* Match any character NOT a word delimiter */
+#define DIGIT 16 /* Match any digit, i.e. [0123456789] */
+#define NOT_DIGIT 17 /* Match any non-digit, i.e. [^0123456789] */
+#define LETTER 18 /* Match any letter character [a-zA-Z] */
+#define NOT_LETTER 19 /* Match any non-letter character [^a-zA-Z] */
+#define SPACE 20 /* Match any whitespace character EXCEPT \n */
+#define SPACE_NL 21 /* Match any whitespace character INCLUDING \n */
+#define NOT_SPACE 22 /* Match any non-whitespace character */
+#define NOT_SPACE_NL 23 /* Same as NOT_SPACE but matches newline. */
+#define WORD_CHAR 24 /* Match any word character [a-zA-Z0-9_] */
+#define NOT_WORD_CHAR 25 /* Match any non-word character [^a-zA-Z0-9_] */
+#define IS_DELIM 26 /* Match any character that's a word delimiter */
+#define NOT_DELIM 27 /* Match any character NOT a word delimiter */
/* Quantifier nodes. (Only applied to SIMPLE nodes. Quantifiers applied to non
SIMPLE nodes or larger atoms are implemented using complex constructs.)*/
-#define STAR 25 /* Match this (simple) thing 0 or more times. */
-#define LAZY_STAR 26 /* Minimal matching STAR */
-#define QUESTION 27 /* Match this (simple) thing 0 or 1 times. */
-#define LAZY_QUESTION 28 /* Minimal matching QUESTION */
-#define PLUS 29 /* Match this (simple) thing 1 or more times. */
-#define LAZY_PLUS 30 /* Minimal matching PLUS */
-#define BRACE 31 /* Match this (simple) thing m to n times. */
-#define LAZY_BRACE 32 /* Minimal matching BRACE */
+#define STAR 28 /* Match this (simple) thing 0 or more times. */
+#define LAZY_STAR 29 /* Minimal matching STAR */
+#define QUESTION 30 /* Match this (simple) thing 0 or 1 times. */
+#define LAZY_QUESTION 31 /* Minimal matching QUESTION */
+#define PLUS 32 /* Match this (simple) thing 1 or more times. */
+#define LAZY_PLUS 33 /* Minimal matching PLUS */
+#define BRACE 34 /* Match this (simple) thing m to n times. */
+#define LAZY_BRACE 35 /* Minimal matching BRACE */
/* Nodes used to build complex constructs. */
-#define NOTHING 33 /* Match empty string (always matches) */
-#define BRANCH 34 /* Match this alternative, or the next... */
-#define BACK 35 /* Always matches, NEXT ptr points backward. */
-#define INIT_COUNT 36 /* Initialize {m,n} counter to zero */
-#define INC_COUNT 37 /* Increment {m,n} counter by one */
-#define TEST_COUNT 38 /* Test {m,n} counter against operand */
+#define NOTHING 36 /* Match empty string (always matches) */
+#define BRANCH 37 /* Match this alternative, or the next... */
+#define BACK 38 /* Always matches, NEXT ptr points backward. */
+#define INIT_COUNT 39 /* Initialize {m,n} counter to zero */
+#define INC_COUNT 40 /* Increment {m,n} counter by one */
+#define TEST_COUNT 41 /* Test {m,n} counter against operand */
/* Back Reference nodes. */
-#define BACK_REF 39 /* Match latest matched parenthesized text */
-#define BACK_REF_CI 40 /* Case insensitive version of BACK_REF */
-#define X_REGEX_BR 41 /* Cross-Regex Back-Ref for syntax highlighting
*/
-#define X_REGEX_BR_CI 42 /* Case insensitive version of X_REGEX_BR_CI */
+#define BACK_REF 42 /* Match latest matched parenthesized text */
+#define BACK_REF_CI 43 /* Case insensitive version of BACK_REF */
+#define X_REGEX_BR 44 /* Cross-Regex Back-Ref for syntax highlighting
*/
+#define X_REGEX_BR_CI 45 /* Case insensitive version of X_REGEX_BR */
/* Various nodes used to implement parenthetical constructs. */
-#define POS_AHEAD_OPEN 43 /* Begin positive look ahead */
-#define NEG_AHEAD_OPEN 44 /* Begin negative look ahead */
-#define LOOK_AHEAD_CLOSE 45 /* End positive or negative look ahead */
-
-#define POS_BEHIND_OPEN 46 /* Begin positive look behind */
-#define NEG_BEHIND_OPEN 47 /* Begin negative look behind */
-#define LOOK_BEHIND_CLOSE 48 /* Close look behind */
+#define POS_AHEAD_OPEN 46 /* Begin positive look ahead */
+#define NEG_AHEAD_OPEN 47 /* Begin negative look ahead */
+#define LOOK_AHEAD_CLOSE 48 /* End positive or negative look ahead */
+
+#define POS_BEHIND_OPEN 49 /* Begin positive look behind */
+#define NEG_BEHIND_OPEN 50 /* Begin negative look behind */
+#define LOOK_BEHIND_CLOSE 51 /* Close look behind */
-#define OPEN 49 /* Open for capturing parentheses. */
+#define OPEN 52 /* Open for capturing parentheses. */
/* OPEN+1 is number 1, etc. */
#define CLOSE (OPEN + NSUBEXP) /* Close for capturing parentheses. */
@@ -2262,12 +2268,12 @@ static unsigned char * shortcut_escape (
int emit) {
register unsigned char *class = NULL;
- static unsigned char *codes = (unsigned char *) "ByYdDlLsSwW";
+ static unsigned char *codes = (unsigned char *) "AByYzZdDlLsSwW";
unsigned char *ret_val = (unsigned char *) 1; /* Assume success. */
unsigned char *valid_codes;
if (emit == EMIT_CLASS_BYTES || emit == CHECK_CLASS_ESCAPE) {
- valid_codes = codes + 3; /* \B, \y and \Y are not allowed in classes */
+ valid_codes = codes + 6; /* \A, \B, \y, \Y, \z and \Z are not allowed in
classes */
} else {
valid_codes = codes;
}
@@ -2354,6 +2360,16 @@ static unsigned char * shortcut_escape (
break;
+ case 'A':
+
+ if (emit == EMIT_NODE) {
+ ret_val = emit_node (FBOL);
+ } else {
+ REG_FAIL ("internal error #7 `shortcut_escape\'");
+ }
+
+ break;
+
case 'B':
if (emit == EMIT_NODE) {
@@ -2364,6 +2380,26 @@ static unsigned char * shortcut_escape (
break;
+ case 'z':
+
+ if (emit == EMIT_NODE) {
+ ret_val = emit_node (AEOF);
+ } else {
+ REG_FAIL ("internal error #7 `shortcut_escape\'");
+ }
+
+ break;
+
+ case 'Z':
+
+ if (emit == EMIT_NODE) {
+ ret_val = emit_node (LEOL);
+ } else {
+ REG_FAIL ("internal error #7 `shortcut_escape\'");
+ }
+
+ break;
+
default:
/* We get here if there isn't a case for every character in
the string "codes" */
@@ -2371,7 +2407,7 @@ static unsigned char * shortcut_escape (
REG_FAIL ("internal error #8 `shortcut_escape\'");
}
- if (emit == EMIT_NODE && c != 'B') {
+ if (emit == EMIT_NODE && !(c == 'A' || c == 'B' || c == 'z' || c == 'Z'))
{
*flag_param |= (HAS_WIDTH | SIMPLE);
}
@@ -2642,6 +2678,8 @@ static int Recursion_Limit_Exceeded; /*
/* static regexp *Cross_Regex_Backref; */
+static int Prev_Is_FBOL;
+static int Succ_Is_AEOF;
static int Prev_Is_BOL;
static int Succ_Is_EOL;
static int Prev_Is_Delim;
@@ -2736,7 +2774,9 @@ int ExecRE(regexp *prog, const char* str
/* Remember the logical end of the string. */
End_Of_String = (unsigned char *) match_to;
-
+
+ /* \z needs the original succ_char */
+ Succ_Is_AEOF = ((succ_char == '\0') ? 1 : 0);
if (end == NULL && reverse) {
for (end = string; !AT_END_OF_STRING((unsigned char*)end); end++) ;
succ_char = '\n';
@@ -2753,6 +2793,7 @@ int ExecRE(regexp *prog, const char* str
Start_Of_String = (unsigned char *) string;
Look_Behind_To = (unsigned char *)
(look_behind_to?look_behind_to:string);
+ Prev_Is_FBOL = ((prev_char == '\0') ? 1 : 0);
Prev_Is_BOL = ((prev_char == '\n') || (prev_char == '\0') ? 1 : 0);
Succ_Is_EOL = ((succ_char == '\n') || (succ_char == '\0') ? 1 : 0);
Prev_Is_Delim = (Current_Delimiters [(unsigned char)prev_char] ? 1 :
0);
@@ -3130,6 +3171,28 @@ static int match (unsigned char *prog, i
break;
+ case FBOL: /* `\A' (first beginning of line anchor) */
+ if ((Reg_Input == Start_Of_String) && Prev_Is_FBOL) {
+ break;
+ }
+
+ MATCH_RETURN (0);
+
+ case LEOL: /* `\Z' (last end of line anchor) */
+ if (((*Reg_Input == '\n') && AT_END_OF_STRING(Reg_Input + 1) &&
Succ_Is_AEOF) ||
+ (AT_END_OF_STRING(Reg_Input) && (*(Reg_Input - 1) != '\n'))) {
+ break;
+ }
+
+ MATCH_RETURN (0);
+
+ case AEOF: /* `\z' (absolute end of file anchor) */
+ if (AT_END_OF_STRING(Reg_Input) && Succ_Is_AEOF) {
+ break;
+ }
+
+ MATCH_RETURN (0);
+
case BOL: /* `^' (beginning of line anchor) */
if (Reg_Input == Start_Of_String) {
if (Prev_Is_BOL) break;
--
NEdit Develop mailing list - [email protected]
http://www.nedit.org/mailman/listinfo/develop