mattwil Tue May 5 01:35:13 2009 UTC Modified files: /ZendEngine2 zend_highlight.c zend_language_scanner.l /php-src/ext/standard/tests/strings highlight_file.phpt Log: Implemented manual scanning for strings/comments, plus misc. fixes. For details, see http://news.php.net/php.internals/43808
http://cvs.php.net/viewvc.cgi/ZendEngine2/zend_highlight.c?r1=1.65&r2=1.66&diff_format=u Index: ZendEngine2/zend_highlight.c diff -u ZendEngine2/zend_highlight.c:1.65 ZendEngine2/zend_highlight.c:1.66 --- ZendEngine2/zend_highlight.c:1.65 Wed Dec 31 11:12:29 2008 +++ ZendEngine2/zend_highlight.c Tue May 5 01:35:13 2009 @@ -17,7 +17,7 @@ +----------------------------------------------------------------------+ */ -/* $Id: zend_highlight.c,v 1.65 2008/12/31 11:12:29 sebastian Exp $ */ +/* $Id: zend_highlight.c,v 1.66 2009/05/05 01:35:13 mattwil Exp $ */ #include "zend.h" #include <zend_language_parser.h> @@ -127,14 +127,8 @@ zend_printf("<span style=\"color: %s\">", last_color); } } - switch (token_type) { - case T_END_HEREDOC: - zend_html_puts(Z_STRVAL(token), Z_STRLEN(token) TSRMLS_CC); - break; - default: - zend_html_puts(LANG_SCNG(yy_text), LANG_SCNG(yy_leng) TSRMLS_CC); - break; - } + + zend_html_puts(LANG_SCNG(yy_text), LANG_SCNG(yy_leng) TSRMLS_CC); if (Z_TYPE(token) == IS_STRING || Z_TYPE(token) == IS_UNICODE) { @@ -156,19 +150,6 @@ Z_TYPE(token) = 0; } - /* handler for trailing comments, see bug #42767 */ - if (LANG_SCNG(yy_leng) && LANG_SCNG(yy_text) < LANG_SCNG(yy_limit)) { - if (last_color != syntax_highlighter_ini->highlight_comment) { - if (last_color != syntax_highlighter_ini->highlight_html) { - zend_printf("</span>"); - } - if (syntax_highlighter_ini->highlight_comment != syntax_highlighter_ini->highlight_html) { - zend_printf("<span style=\"color: %s\">", syntax_highlighter_ini->highlight_comment); - } - } - zend_html_puts(LANG_SCNG(yy_text), (LANG_SCNG(yy_limit) - LANG_SCNG(yy_text)) TSRMLS_CC); - } - if (last_color != syntax_highlighter_ini->highlight_html) { zend_printf("</span>\n"); } http://cvs.php.net/viewvc.cgi/ZendEngine2/zend_language_scanner.l?r1=1.205&r2=1.206&diff_format=u Index: ZendEngine2/zend_language_scanner.l diff -u ZendEngine2/zend_language_scanner.l:1.205 ZendEngine2/zend_language_scanner.l:1.206 --- 
ZendEngine2/zend_language_scanner.l:1.205 Thu Mar 26 20:01:38 2009 +++ ZendEngine2/zend_language_scanner.l Tue May 5 01:35:13 2009 @@ -21,7 +21,7 @@ +----------------------------------------------------------------------+ */ -/* $Id: zend_language_scanner.l,v 1.205 2009/03/26 20:01:38 felipe Exp $ */ +/* $Id: zend_language_scanner.l,v 1.206 2009/05/05 01:35:13 mattwil Exp $ */ #if 0 # define YYDEBUG(s, c) printf("state: %d char: %c\n", s, c) @@ -115,13 +115,19 @@ } \ } +/* To save initial string length after scanning to first variable, CG(doc_comment_len) can be reused */ +#define SET_DOUBLE_QUOTES_SCANNED_LENGTH(len) CG(doc_comment_len) = (len) +#define GET_DOUBLE_QUOTES_SCANNED_LENGTH() CG(doc_comment_len) + +#define IS_LABEL_START(c) (((c) >= 'a' && (c) <= 'z') || ((c) >= 'A' && (c) <= 'Z') || (c) == '_' || (c) >= 0x7F) + #define ZEND_IS_OCT(c) ((c)>='0' && (c)<='7') #define ZEND_IS_HEX(c) (((c)>='0' && (c)<='9') || ((c)>='a' && (c)<='f') || ((c)>='A' && (c)<='F')) BEGIN_EXTERN_C() static void _yy_push_state(int new_state TSRMLS_DC) - { +{ zend_stack_push(&SCNG(state_stack), (void *) &YYGETCONDITION(), sizeof(int)); YYSETCONDITION(new_state); } @@ -1324,63 +1330,8 @@ WHITESPACE [ \n\r\t]+ TABS_AND_SPACES [ \t]* TOKENS [;:,.\[\]()|^&+-/*=%!~$<>?...@] -ANY_CHAR [^\x00] +ANY_CHAR [^] NEWLINE ("\r"|"\n"|"\r\n") -NULL [\x00]{1} - -/* - * LITERAL_DOLLAR matches unescaped $ that aren't followed by a label character - * or a { and therefore will be taken literally. The case of literal $ before - * a variable or "${" is handled in a rule for each string type - */ -DOUBLE_QUOTES_LITERAL_DOLLAR ("$"+([^a-zA-Z_\x7f-\xff$"\\{]|("\\"{ANY_CHAR}))) -BACKQUOTE_LITERAL_DOLLAR ("$"+([^a-zA-Z_\x7f-\xff$`\\{]|("\\"{ANY_CHAR}))) -HEREDOC_LITERAL_DOLLAR ("$"+([^a-zA-Z_\x7f-\xff$\n\r\\{]|("\\"[^\n\r]))) - -/* - * Usually, HEREDOC_NEWLINE will just function like a simple NEWLINE, but some - * special cases need to be handled. 
HEREDOC_CHARS doesn't allow a line to - * match when { or $, and/or \ is at the end. (("{"*|"$"*)"\\"?) handles that, - * along with cases where { or $, and/or \ is the ONLY thing on a line - * - * The other case is when a line contains a label, followed by ONLY - * { or $, and/or \ Handled by ({LABEL}";"?((("{"+|"$"+)"\\"?)|"\\")) - */ -HEREDOC_NEWLINE ((({LABEL}";"?((("{"+|"$"+)"\\"?)|"\\"))|(("{"*|"$"*)"\\"?)){NEWLINE}) - -/* - * This pattern is just used in the next 2 for matching { or literal $, and/or - * \ escape sequence immediately at the beginning of a line or after a label - */ -HEREDOC_CURLY_OR_ESCAPE_OR_DOLLAR (("{"+[^$\n\r\\{])|("{"*"\\"[^\n\r])|{HEREDOC_LITERAL_DOLLAR}) - -/* - * These 2 label-related patterns allow HEREDOC_CHARS to continue "regular" - * matching after a newline that starts with either a non-label character or a - * label that isn't followed by a newline. Like HEREDOC_CHARS, they won't match - * a variable or "{$" Matching a newline, and possibly label, up TO a variable - * or "{$", is handled in the heredoc rules - * - * The HEREDOC_LABEL_NO_NEWLINE pattern (";"[^$\n\r\\{]) handles cases where ; - * follows a label. 
[^a-zA-Z0-9_\x7f-\xff;$\n\r\\{] is needed to prevent a label - * character or ; from matching on a possible (real) ending label - */ -HEREDOC_NON_LABEL ([^a-zA-Z_\x7f-\xff$\n\r\\{]|{HEREDOC_CURLY_OR_ESCAPE_OR_DOLLAR}) -HEREDOC_LABEL_NO_NEWLINE ({LABEL}([^a-zA-Z0-9_\x7f-\xff;$\n\r\\{]|(";"[^$\n\r\\{])|(";"?{HEREDOC_CURLY_OR_ESCAPE_OR_DOLLAR}))) - -/* - * CHARS matches everything up to a variable or "{$" - * {'s are matched as long as they aren't followed by a $ - * The case of { before "{$" is handled in a rule for each string type - * - * For heredocs, matching continues across/after newlines if/when it's known - * that the next line doesn't contain a possible ending label - */ -DOUBLE_QUOTES_CHARS ("{"*([^$"\\{]|("\\"{ANY_CHAR}))|{DOUBLE_QUOTES_LITERAL_DOLLAR}) -BACKQUOTE_CHARS ("{"*([^$`\\{]|("\\"{ANY_CHAR}))|{BACKQUOTE_LITERAL_DOLLAR}) -HEREDOC_CHARS ("{"*([^$\n\r\\{]|("\\"[^\n\r]))|{HEREDOC_LITERAL_DOLLAR}|({HEREDOC_NEWLINE}+({HEREDOC_NON_LABEL}|{HEREDOC_LABEL_NO_NEWLINE}))) - -NOWDOC_CHARS ([^\n\r]|{NEWLINE}+([^a-zA-Z_\x7f-\xff\n\r]|({LABEL}([^a-zA-Z0-9_\x7f-\xff;\n\r]|(";"[^\n\r]))))) /* compute yyleng before each rule */ <!*> := yyleng = YYCURSOR - SCNG(yy_text); @@ -2037,6 +1988,14 @@ } <INITIAL>"<script"{WHITESPACE}+"language"{WHITESPACE}*"="{WHITESPACE}*("php"|"\"php\""|"'php'"){WHITESPACE}*">" { + YYCTYPE *bracket = zend_memrchr(yytext, '<', yyleng - (sizeof("script language=php>") - 1)); + + if (bracket != SCNG(yy_text)) { + /* Handle previously scanned HTML, as possible <script> tags found are assumed to not be PHP's */ + YYCURSOR = bracket; + goto inline_html; + } + HANDLE_NEWLINES(yytext, yyleng); Z_STRVAL_P(zendlval) = yytext; /* no copying - intentional */ Z_STRLEN_P(zendlval) = yyleng; @@ -2107,29 +2066,48 @@ } <INITIAL>{ANY_CHAR} { + if (YYCURSOR > YYLIMIT) { + return 0; + } inline_char_handler: while (1) { YYCTYPE *ptr = memchr(YYCURSOR, '<', YYLIMIT - YYCURSOR); - if (ptr == NULL) { - YYCURSOR = YYLIMIT; - yyleng = YYCURSOR - SCNG(yy_text); - 
break; - - } else { - YYCURSOR = ptr + 1; + YYCURSOR = ptr ? ptr + 1 : YYLIMIT; - /* stop if it may be an opening tag (<?, <%, <script>). this condition is not optimal though */ - if (YYCURSOR < YYLIMIT && (*YYCURSOR == '?' || *YYCURSOR == '%' || *YYCURSOR == 's')) { - --YYCURSOR; - yyleng = YYCURSOR - SCNG(yy_text); - break; + if (YYCURSOR < YYLIMIT) { + switch (*YYCURSOR) { + case '?': + if (CG(short_tags) || !strncasecmp(YYCURSOR + 1, "php", 3)) { /* Assume [ \t\n\r] follows "php" */ + break; + } + continue; + case '%': + if (CG(asp_tags)) { + break; + } + continue; + case 's': + case 'S': + /* Probably NOT an opening PHP <script> tag, so don't end the HTML chunk yet + * If it is, the PHP <script> tag rule checks for any HTML scanned before it */ + YYCURSOR--; + yymore(); + default: + continue; } + + YYCURSOR--; } + + break; } +inline_html: + yyleng = YYCURSOR - SCNG(yy_text); + Z_STRVAL_P(zendlval) = (char *) estrndup(yytext, yyleng); Z_STRLEN_P(zendlval) = yyleng; Z_TYPE_P(zendlval) = IS_STRING; @@ -2192,7 +2170,6 @@ /* Invalid rule to return a more explicit parse error with proper line number */ yyless(0); yy_pop_state(TSRMLS_C); - ZVAL_EMPTY_TEXT(zendlval); /* Empty since it won't be used */ return T_ENCAPSED_AND_WHITESPACE; } @@ -2215,98 +2192,78 @@ <ST_IN_SCRIPTING>"#"|"//" { - BEGIN(ST_ONE_LINE_COMMENT); - yymore(); -} - -<ST_ONE_LINE_COMMENT>"?"|"%"|">" { - yymore(); -} + while (YYCURSOR < YYLIMIT) { + switch (*YYCURSOR++) { + case '\r': + if (*YYCURSOR == '\n') { + YYCURSOR++; + } + /* fall through */ + case '\n': + CG(zend_lineno)++; + break; + case '%': + if (!CG(asp_tags)) { + continue; + } + /* fall through */ + case '?': + if (*YYCURSOR == '>') { + YYCURSOR--; + break; + } + /* fall through */ + default: + continue; + } -<ST_ONE_LINE_COMMENT>[^\n\r?%>]*{ANY_CHAR} { - switch (yytext[yyleng-1]) { - case '?': case '%': case '>': - yyless(yyleng-1); - yymore(); - break; - case '\n': - CG(zend_lineno)++; - /* intentional fall through */ - default: - 
Z_STRVAL_P(zendlval) = yytext; /* no copying - intentional */ - Z_STRLEN_P(zendlval) = yyleng; - Z_TYPE_P(zendlval) = IS_STRING; - BEGIN(ST_IN_SCRIPTING); - return T_COMMENT; + break; } -} -<ST_ONE_LINE_COMMENT>{NEWLINE} { - Z_STRVAL_P(zendlval) = yytext; /* no copying - intentional */ - Z_STRLEN_P(zendlval) = yyleng; - Z_TYPE_P(zendlval) = IS_STRING; - BEGIN(ST_IN_SCRIPTING); - CG(zend_lineno)++; + yyleng = YYCURSOR - SCNG(yy_text); + return T_COMMENT; } -<ST_ONE_LINE_COMMENT>"?>"|"%>" { - if (CG(asp_tags) || yytext[yyleng-2] != '%') { /* asp comment? */ - Z_STRVAL_P(zendlval) = yytext; /* no copying - intentional */ - Z_STRLEN_P(zendlval) = yyleng-2; - Z_TYPE_P(zendlval) = IS_STRING; - yyless(yyleng - 2); - BEGIN(ST_IN_SCRIPTING); - return T_COMMENT; +<ST_IN_SCRIPTING>"/*"|"/**"{WHITESPACE} { + int doc_com; + + if (yyleng > 2) { + doc_com = 1; + RESET_DOC_COMMENT(); } else { - yymore(); + doc_com = 0; } -} - -<ST_IN_SCRIPTING>"/**"{WHITESPACE} { - RESET_DOC_COMMENT(); - BEGIN(ST_DOC_COMMENT); - yymore(); -} - -<ST_COMMENT,ST_DOC_COMMENT>{NULL} { - zend_error(E_COMPILE_WARNING, "Unterminated comment starting line %d", CG(zend_lineno)); - return 0; -} -<ST_IN_SCRIPTING>"/*" { - BEGIN(ST_COMMENT); - yymore(); -} + while (YYCURSOR < YYLIMIT) { + if (*YYCURSOR++ == '*' && *YYCURSOR == '/') { + break; + } + } + if (YYCURSOR < YYLIMIT) { + YYCURSOR++; + } else { + zend_error(E_COMPILE_WARNING, "Unterminated comment starting line %d", CG(zend_lineno)); + } -<ST_COMMENT,ST_DOC_COMMENT>[^*]+ { - yymore(); -} + yyleng = YYCURSOR - SCNG(yy_text); + HANDLE_NEWLINES(yytext, yyleng); -<ST_DOC_COMMENT>"*/" { - zval temp; + if (doc_com) { + zval tmp; - HANDLE_NEWLINES(yytext, yyleng); - if (!zend_copy_scanner_string(&temp, yytext, yyleng, IS_UNICODE, SCNG(output_conv) TSRMLS_CC)) { - return 0; + if (!zend_copy_scanner_string(&tmp, yytext, yyleng, IS_UNICODE, SCNG(output_conv) TSRMLS_CC)) { + return 0; + } + CG(doc_comment) = tmp.value.uni.val; + CG(doc_comment_len) = 
tmp.value.uni.len; + return T_DOC_COMMENT; } - CG(doc_comment) = temp.value.uni.val; - CG(doc_comment_len) = temp.value.uni.len; - BEGIN(ST_IN_SCRIPTING); - return T_DOC_COMMENT; -} -<ST_COMMENT>"*/" { - HANDLE_NEWLINES(yytext, yyleng); - BEGIN(ST_IN_SCRIPTING); return T_COMMENT; } -<ST_COMMENT,ST_DOC_COMMENT>"*" { - yymore(); -} - <ST_IN_SCRIPTING>("?>"|"</script"{WHITESPACE}*">"){NEWLINE}? { Z_STRVAL_P(zendlval) = yytext; /* no copying - intentional */ Z_STRLEN_P(zendlval) = yyleng; @@ -2330,40 +2287,85 @@ } -/* ("{"*|"$"*) handles { or $ at the end of a string (or the entire contents) - */ -<ST_IN_SCRIPTING>(["]{DOUBLE_QUOTES_CHARS}*("{"*|"$"*)["]) { - return zend_scan_unicode_escape_string(zendlval, yytext+1, yyleng-2, 0x22 /*'"'*/, T_CONSTANT_ENCAPSED_STRING TSRMLS_CC); -} +<ST_IN_SCRIPTING>b?['] { + int bprefix = (yytext[0] != '\'') ? 1 : 0; + while (1) { + if (YYCURSOR < YYLIMIT) { + if (*YYCURSOR == '\'') { + YYCURSOR++; + yyleng = YYCURSOR - SCNG(yy_text); -<ST_IN_SCRIPTING>(b["]{DOUBLE_QUOTES_CHARS}*("{"*|"$"*)["]) { - zend_scan_binary_escape_string(zendlval, yytext+2, yyleng-3, '"' TSRMLS_CC); - return T_CONSTANT_ENCAPSED_STRING; -} + break; + } else if (*YYCURSOR++ == '\\' && YYCURSOR < YYLIMIT) { + YYCURSOR++; + } + } else { + yyleng = YYLIMIT - SCNG(yy_text); + /* Unclosed single quotes; treat similar to double quotes, but without a separate token + * for ' (unrecognized by parser), instead of old flex fallback to "Unexpected character..." 
+ * rule, which continued in ST_IN_SCRIPTING state after the quote */ + return T_ENCAPSED_AND_WHITESPACE; + } + } -<ST_IN_SCRIPTING>([']([^'\\]|("\\"{ANY_CHAR}))*[']) { - return zend_scan_unicode_single_string(zendlval TSRMLS_CC); + if (bprefix) { + zend_scan_binary_single_string(zendlval, yytext+2, yyleng-3 TSRMLS_CC); + return T_CONSTANT_ENCAPSED_STRING; + } else { + return zend_scan_unicode_single_string(zendlval TSRMLS_CC); + } } -<ST_IN_SCRIPTING>("b'"([^'\\]|("\\"{ANY_CHAR}))*[']) { - zend_scan_binary_single_string(zendlval, yytext+2, yyleng-3 TSRMLS_CC); - return T_CONSTANT_ENCAPSED_STRING; -} +<ST_IN_SCRIPTING>b?["] { + int bprefix = (yytext[0] != '"') ? 1 : 0; + while (YYCURSOR < YYLIMIT) { + switch (*YYCURSOR++) { + case '"': + yyleng = YYCURSOR - SCNG(yy_text); -<ST_IN_SCRIPTING>["] { - BEGIN(ST_DOUBLE_QUOTES); - return '"'; -} + if (bprefix) { + zend_scan_binary_escape_string(zendlval, yytext+2, yyleng-3, '"' TSRMLS_CC); + return T_CONSTANT_ENCAPSED_STRING; + } else { + return zend_scan_unicode_escape_string(zendlval, yytext+1, yyleng-2, 0x22 /*'"'*/, T_CONSTANT_ENCAPSED_STRING TSRMLS_CC); + } + case '$': + if (IS_LABEL_START(*YYCURSOR) || *YYCURSOR == '{') { + break; + } + continue; + case '{': + if (*YYCURSOR == '$') { + break; + } + continue; + case '\\': + if (YYCURSOR < YYLIMIT) { + YYCURSOR++; + } + /* fall through */ + default: + continue; + } + + YYCURSOR--; + break; + } + + /* Remember how much was scanned to save rescanning */ + SET_DOUBLE_QUOTES_SCANNED_LENGTH(YYCURSOR - SCNG(yy_text) - yyleng); + + YYCURSOR = SCNG(yy_text) + yyleng; -<ST_IN_SCRIPTING>b["] { BEGIN(ST_DOUBLE_QUOTES); - return T_BINARY_DOUBLE; + return bprefix ? T_BINARY_DOUBLE : '"'; } + <ST_IN_SCRIPTING>b?"<<<"{TABS_AND_SPACES}({LABEL}|([']{LABEL}['])|(["]{LABEL}["])){NEWLINE} { char *s; int bprefix = (yytext[0] != '<') ? 
1 : 0; @@ -2398,7 +2400,7 @@ /* Check for ending label on the next line */ if (CG(heredoc_len) < YYLIMIT - YYCURSOR && !memcmp(YYCURSOR, s, CG(heredoc_len))) { - unsigned char *end = YYCURSOR + CG(heredoc_len); + YYCTYPE *end = YYCURSOR + CG(heredoc_len); if (*end == ';') { end++; @@ -2419,54 +2421,6 @@ } -/* Match everything up to and including a possible ending label, so if the label - * doesn't match, it's kept with the rest of the string - * - * {HEREDOC_NEWLINE}+ handles the case of more than one newline sequence that - * couldn't be matched with HEREDOC_CHARS, because of the following label - */ -<ST_HEREDOC>{HEREDOC_CHARS}*{HEREDOC_NEWLINE}+{LABEL}";"?[\n\r] { - char *end = yytext + yyleng - 1; - - if (end[-1] == ';') { - end--; - yyleng--; - } - - if (yyleng > CG(heredoc_len) && !memcmp(end - CG(heredoc_len), CG(heredoc), CG(heredoc_len))) { - int len = yyleng - CG(heredoc_len) - 2; /* 2 for newline before and after label */ - - /* May have matched fooLABEL; make sure there's a newline before it */ - if (yytext[len] != '\n') { - if (yytext[len] != '\r') { - yyless(yyleng - 1); - yymore(); - } - } else if (len > 0 && yytext[len - 1] == '\r') { - len--; /* Windows newline */ - } - - /* Go back before label, to match in ST_END_HEREDOC state. yytext will include - * newline before label, for zend_highlight/strip, tokenizer, etc. 
*/ - yyless(yyleng - CG(heredoc_len) - 1); /* 1 for newline after label */ - - CG(increment_lineno) = 1; /* For newline before label */ - BEGIN(ST_END_HEREDOC); - - if (CG(literal_type) == IS_UNICODE) { - return zend_scan_unicode_escape_string(zendlval, yytext, len, 0, T_ENCAPSED_AND_WHITESPACE TSRMLS_CC); - } else { - zend_scan_binary_escape_string(zendlval, yytext, len, 0 TSRMLS_CC); - return T_ENCAPSED_AND_WHITESPACE; - } - } else { - /* Go back to end of label, so the next match works correctly in case of - * a variable or another label at the beginning of the next line */ - yyless(yyleng - 1); - yymore(); - } -} - <ST_END_HEREDOC>{ANY_CHAR} { YYCURSOR += CG(heredoc_len) - 1; yyleng = CG(heredoc_len); @@ -2480,31 +2434,69 @@ } -/* Will only match when $ follows: "{$" */ -<ST_DOUBLE_QUOTES,ST_BACKQUOTE,ST_HEREDOC>"{" { +<ST_DOUBLE_QUOTES,ST_BACKQUOTE,ST_HEREDOC>"{$" { Z_LVAL_P(zendlval) = (long) '{'; yy_push_state(ST_IN_SCRIPTING TSRMLS_CC); + yyless(1); return T_CURLY_OPEN; } -<ST_DOUBLE_QUOTES>{DOUBLE_QUOTES_CHARS}+ { - if (CG(literal_type) == IS_UNICODE) { - return zend_scan_unicode_escape_string(zendlval, yytext, yyleng, 0x22 /*'"'*/, T_ENCAPSED_AND_WHITESPACE TSRMLS_CC); - } else { - zend_scan_binary_escape_string(zendlval, yytext, yyleng, '"' TSRMLS_CC); - return T_ENCAPSED_AND_WHITESPACE; - } +<ST_DOUBLE_QUOTES>["] { + BEGIN(ST_IN_SCRIPTING); + return '"'; } -/* "{"{2,}|"$"{2,} handles { before "{$" or literal $ before a variable or "${" - * (("{"+|"$"+)["]) handles { or $ at the end of a string - * - * Same for backquotes and heredocs, except the second case doesn't apply to - * heredocs. 
yyless(yyleng - 1) is used to correct taking one character too many - */ -<ST_DOUBLE_QUOTES>{DOUBLE_QUOTES_CHARS}*("{"{2,}|"$"{2,}|(("{"+|"$"+)["])) { - yyless(yyleng - 1); +<ST_BACKQUOTE>[`] { + BEGIN(ST_IN_SCRIPTING); + return '`'; +} + + +<ST_DOUBLE_QUOTES>{ANY_CHAR} { + if (GET_DOUBLE_QUOTES_SCANNED_LENGTH()) { + YYCURSOR += GET_DOUBLE_QUOTES_SCANNED_LENGTH() - 1; + SET_DOUBLE_QUOTES_SCANNED_LENGTH(0); + + goto double_quotes_scan_done; + } + + if (YYCURSOR > YYLIMIT) { + return 0; + } + if (yytext[0] == '\\' && YYCURSOR < YYLIMIT) { + YYCURSOR++; + } + + while (YYCURSOR < YYLIMIT) { + switch (*YYCURSOR++) { + case '"': + break; + case '$': + if (IS_LABEL_START(*YYCURSOR) || *YYCURSOR == '{') { + break; + } + continue; + case '{': + if (*YYCURSOR == '$') { + break; + } + continue; + case '\\': + if (YYCURSOR < YYLIMIT) { + YYCURSOR++; + } + /* fall through */ + default: + continue; + } + + YYCURSOR--; + break; + } + +double_quotes_scan_done: + yyleng = YYCURSOR - SCNG(yy_text); if (CG(literal_type) == IS_UNICODE) { return zend_scan_unicode_escape_string(zendlval, yytext, yyleng, 0x22 /*'"'*/, T_ENCAPSED_AND_WHITESPACE TSRMLS_CC); @@ -2515,17 +2507,42 @@ } -<ST_BACKQUOTE>{BACKQUOTE_CHARS}+ { - if (CG(literal_type) == IS_UNICODE) { - return zend_scan_unicode_escape_string(zendlval, yytext, yyleng, 0x60 /*'`'*/, T_ENCAPSED_AND_WHITESPACE TSRMLS_CC); - } else { - zend_scan_binary_escape_string(zendlval, yytext, yyleng, '`' TSRMLS_CC); - return T_ENCAPSED_AND_WHITESPACE; +<ST_BACKQUOTE>{ANY_CHAR} { + if (YYCURSOR > YYLIMIT) { + return 0; + } + if (yytext[0] == '\\' && YYCURSOR < YYLIMIT) { + YYCURSOR++; } -} -<ST_BACKQUOTE>{BACKQUOTE_CHARS}*("{"{2,}|"$"{2,}|(("{"+|"$"+)[`])) { - yyless(yyleng - 1); + while (YYCURSOR < YYLIMIT) { + switch (*YYCURSOR++) { + case '`': + break; + case '$': + if (IS_LABEL_START(*YYCURSOR) || *YYCURSOR == '{') { + break; + } + continue; + case '{': + if (*YYCURSOR == '$') { + break; + } + continue; + case '\\': + if (YYCURSOR < YYLIMIT) { 
+ YYCURSOR++; + } + /* fall through */ + default: + continue; + } + + YYCURSOR--; + break; + } + + yyleng = YYCURSOR - SCNG(yy_text); if (CG(literal_type) == IS_UNICODE) { return zend_scan_unicode_escape_string(zendlval, yytext, yyleng, 0x60 /*'`'*/, T_ENCAPSED_AND_WHITESPACE TSRMLS_CC); @@ -2536,90 +2553,144 @@ } -/* ({HEREDOC_NEWLINE}+({LABEL}";"?)?)? handles the possible case of newline - * sequences, possibly followed by a label, that couldn't be matched with - * HEREDOC_CHARS because of a following variable or "{$" - * - * This doesn't affect real ending labels, as they are followed by a newline, - * which will result in a longer match for the correct rule if present - */ -<ST_HEREDOC>{HEREDOC_CHARS}*({HEREDOC_NEWLINE}+({LABEL}";"?)?)? { - if (CG(literal_type) == IS_UNICODE) { - return zend_scan_unicode_escape_string(zendlval, yytext, yyleng, 0, T_ENCAPSED_AND_WHITESPACE TSRMLS_CC); - } else { - zend_scan_binary_escape_string(zendlval, yytext, yyleng, 0 TSRMLS_CC); - return T_ENCAPSED_AND_WHITESPACE; +<ST_HEREDOC>{ANY_CHAR} { + int newline = 0; + + if (YYCURSOR > YYLIMIT) { + return 0; } -} -<ST_HEREDOC>{HEREDOC_CHARS}*({HEREDOC_NEWLINE}+({LABEL}";"?)?)?("{"{2,}|"$"{2,}) { - yyless(yyleng - 1); + YYCURSOR--; + + while (YYCURSOR < YYLIMIT) { + switch (*YYCURSOR++) { + case '\r': + if (*YYCURSOR == '\n') { + YYCURSOR++; + } + /* fall through */ + case '\n': + /* Check for ending label on the next line */ + if (IS_LABEL_START(*YYCURSOR) && CG(heredoc_len) < YYLIMIT - YYCURSOR && !memcmp(YYCURSOR, CG(heredoc), CG(heredoc_len))) { + YYCTYPE *end = YYCURSOR + CG(heredoc_len); + + if (*end == ';') { + end++; + } + + if (*end == '\n' || *end == '\r') { + /* newline before label will be subtracted from returned text, but + * yyleng/yytext will include it, for zend_highlight/strip, tokenizer, etc. 
*/ + if (YYCURSOR[-2] == '\r' && YYCURSOR[-1] == '\n') { + newline = 2; /* Windows newline */ + } else { + newline = 1; + } + + CG(increment_lineno) = 1; /* For newline before label */ + BEGIN(ST_END_HEREDOC); + + goto heredoc_scan_done; + } + } + continue; + case '$': + if (IS_LABEL_START(*YYCURSOR) || *YYCURSOR == '{') { + break; + } + continue; + case '{': + if (*YYCURSOR == '$') { + break; + } + continue; + case '\\': + if (YYCURSOR < YYLIMIT && *YYCURSOR != '\n' && *YYCURSOR != '\r') { + YYCURSOR++; + } + /* fall through */ + default: + continue; + } + + YYCURSOR--; + break; + } + +heredoc_scan_done: + yyleng = YYCURSOR - SCNG(yy_text); if (CG(literal_type) == IS_UNICODE) { - return zend_scan_unicode_escape_string(zendlval, yytext, yyleng, 0, T_ENCAPSED_AND_WHITESPACE TSRMLS_CC); + return zend_scan_unicode_escape_string(zendlval, yytext, yyleng - newline, 0, T_ENCAPSED_AND_WHITESPACE TSRMLS_CC); } else { - zend_scan_binary_escape_string(zendlval, yytext, yyleng, 0 TSRMLS_CC); + zend_scan_binary_escape_string(zendlval, yytext, yyleng - newline, 0 TSRMLS_CC); return T_ENCAPSED_AND_WHITESPACE; } } -<ST_NOWDOC>({NOWDOC_CHARS}+{NEWLINE}+|{NEWLINE}+){LABEL}";"?[\n\r] { - char *end = yytext + yyleng - 1; +<ST_NOWDOC>{ANY_CHAR} { + int newline = 0; - if (end[-1] == ';') { - end--; - yyleng--; - } - - if (yyleng > CG(heredoc_len) && !memcmp(end - CG(heredoc_len), CG(heredoc), CG(heredoc_len))) { - int len = yyleng - CG(heredoc_len) - 2; /* 2 for newline before and after label */ - - /* May have matched fooLABEL; make sure there's a newline before it */ - if (yytext[len] != '\n') { - if (yytext[len] != '\r') { - yyless(yyleng - 1); - yymore(); - } - } else if (len > 0 && yytext[len - 1] == '\r') { - len--; /* Windows newline */ - } + if (YYCURSOR > YYLIMIT) { + return 0; + } - /* Go back before label, to match in ST_END_HEREDOC state. yytext will include - * newline before label, for zend_highlight/strip, tokenizer, etc. 
*/ - yyless(yyleng - CG(heredoc_len) - 1); /* 1 for newline after label */ + YYCURSOR--; - CG(increment_lineno) = 1; /* For newline before label */ - BEGIN(ST_END_HEREDOC); + while (YYCURSOR < YYLIMIT) { + switch (*YYCURSOR++) { + case '\r': + if (*YYCURSOR == '\n') { + YYCURSOR++; + } + /* fall through */ + case '\n': + /* Check for ending label on the next line */ + if (IS_LABEL_START(*YYCURSOR) && CG(heredoc_len) < YYLIMIT - YYCURSOR && !memcmp(YYCURSOR, CG(heredoc), CG(heredoc_len))) { + YYCTYPE *end = YYCURSOR + CG(heredoc_len); - if (!zend_copy_scanner_string(zendlval, yytext, len, CG(literal_type), SCNG(output_conv) TSRMLS_CC)) { - return 0; - } - HANDLE_NEWLINES(yytext, len); - return T_ENCAPSED_AND_WHITESPACE; - } else { - /* Go back to end of label, so the next match works correctly in case of - * another label at the beginning of the next line */ - yyless(yyleng - 1); - yymore(); - } -} + if (*end == ';') { + end++; + } + if (*end == '\n' || *end == '\r') { + /* newline before label will be subtracted from returned text, but + * yyleng/yytext will include it, for zend_highlight/strip, tokenizer, etc. 
*/ + if (YYCURSOR[-2] == '\r' && YYCURSOR[-1] == '\n') { + newline = 2; /* Windows newline */ + } else { + newline = 1; + } -<ST_DOUBLE_QUOTES>["] { - BEGIN(ST_IN_SCRIPTING); - return '"'; -} + CG(increment_lineno) = 1; /* For newline before label */ + BEGIN(ST_END_HEREDOC); + goto nowdoc_scan_done; + } + } + /* fall through */ + default: + continue; + } + } -<ST_BACKQUOTE>[`] { - BEGIN(ST_IN_SCRIPTING); - return '`'; +nowdoc_scan_done: + yyleng = YYCURSOR - SCNG(yy_text); + + if (!zend_copy_scanner_string(zendlval, yytext, yyleng - newline, CG(literal_type), SCNG(output_conv) TSRMLS_CC)) { + return 0; + } + HANDLE_NEWLINES(yytext, yyleng - newline); + return T_ENCAPSED_AND_WHITESPACE; } -<*>{NULL} { return 0; } /* EOF */ <ST_IN_SCRIPTING,ST_VAR_OFFSET>{ANY_CHAR} { + if (YYCURSOR > YYLIMIT) { + return 0; + } + zend_error(E_COMPILE_WARNING,"Unexpected character in input: '%c' (ASCII=%d) state=%d", yytext[0], yytext[0], YYSTATE); goto restart; } http://cvs.php.net/viewvc.cgi/php-src/ext/standard/tests/strings/highlight_file.phpt?r1=1.8&r2=1.9&diff_format=u Index: php-src/ext/standard/tests/strings/highlight_file.phpt diff -u php-src/ext/standard/tests/strings/highlight_file.phpt:1.8 php-src/ext/standard/tests/strings/highlight_file.phpt:1.9 --- php-src/ext/standard/tests/strings/highlight_file.phpt:1.8 Mon Mar 16 01:40:01 2009 +++ php-src/ext/standard/tests/strings/highlight_file.phpt Tue May 5 01:35:13 2009 @@ -49,7 +49,7 @@ </span> </code>bool(true) <code><span style="color: #000000"> -<span style="color: #0000BB"><?php </span><span style="color: #007700">echo </span><span style="color: #FF9900">"test ?></span> +<span style="color: #0000BB"><?php </span><span style="color: #007700">echo </span><span style="color: #DD0000">"test ?></span> </span> </code>bool(true) <code><span style="color: #000000">
-- PHP CVS Mailing List (http://www.php.net/) To unsubscribe, visit: http://www.php.net/unsub.php