Revision: 17483
Author: [email protected]
Date: Tue Nov 5 12:37:55 2013 UTC
Log: Experimental parser: parsing regex subexpressions
[email protected]
BUG=
Review URL: https://codereview.chromium.org/59603003
http://code.google.com/p/v8/source/detail?r=17483
Modified:
/branches/experimental/parser/src/lexer/lexer_py.re
/branches/experimental/parser/tools/lexer_generator/automata_test.py
/branches/experimental/parser/tools/lexer_generator/regex_lexer.py
/branches/experimental/parser/tools/lexer_generator/rule_lexer.py
/branches/experimental/parser/tools/lexer_generator/rule_parser.py
=======================================
--- /branches/experimental/parser/src/lexer/lexer_py.re Tue Nov 5 11:05:04 2013 UTC
+++ /branches/experimental/parser/src/lexer/lexer_py.re Tue Nov 5 12:37:55 2013 UTC
@@ -25,7 +25,7 @@
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-whitespace_char = [ \t\v\f\r\240:ws:];
+whitespace_char = [ \t\v\f\r:ws:]; # TODO put back \240
whitespace = whitespace_char+;
identifier_start = [$_a-zA-Z:lit:];
identifier_char = [$_a-zA-Z0-9:lit:];
@@ -33,7 +33,7 @@
line_terminator = [\n\r]+;
digit = [0-9];
hex_digit = [0-9a-fA-F];
-maybe_exponent = ("e" [-+]? digit+)?;
+maybe_exponent = ("e" [\-+]? digit+)?;
number = ("0x" hex_digit+) | (("." digit+ maybe_exponent) | (digit+ ("."
digit*)? maybe_exponent));
<Normal> "break" not_identifier_char {
PUSH_TOKEN_LOOKAHEAD(Token::BREAK); }
@@ -151,7 +151,7 @@
<Normal> "'" :=> SingleQuoteString
<Normal> identifier_start :=> Identifier
-<Normal> "\\u[0-9a-fA-F]{4}" { if (ValidIdentifierStart()) {
YYSETCONDITION(kConditionIdentifier); goto yyc_Identifier; }
send(Token::ILLEGAL); start_ = cursor_; goto yyc_Normal; }
+<Normal> /\\u[0-9a-fA-F]{4}/ { if (ValidIdentifierStart()) {
YYSETCONDITION(kConditionIdentifier); goto yyc_Identifier; }
send(Token::ILLEGAL); start_ = cursor_; goto yyc_Normal; }
<Normal> "\\" { PUSH_TOKEN(Token::ILLEGAL); }
<Normal> eof { PUSH_EOF_AND_RETURN();}
@@ -160,25 +160,25 @@
<DoubleQuoteString> "\\\\" { goto yyc_DoubleQuoteString; }
<DoubleQuoteString> "\\\"" { goto yyc_DoubleQuoteString; }
<DoubleQuoteString> "\"" { PUSH_TOKEN(Token::STRING);}
-<DoubleQuoteString> "\\\n\r?" { goto yyc_DoubleQuoteString; }
-<DoubleQuoteString> "\\\r\n?" { goto yyc_DoubleQuoteString; }
+<DoubleQuoteString> /\\\n\r?/ { goto yyc_DoubleQuoteString; }
+<DoubleQuoteString> /\\\r\n?/ { goto yyc_DoubleQuoteString; }
<DoubleQuoteString> "\n" => Normal {
PUSH_TOKEN_LOOKAHEAD(Token::ILLEGAL); }
<DoubleQuoteString> "\r" => Normal {
PUSH_TOKEN_LOOKAHEAD(Token::ILLEGAL); }
<DoubleQuoteString> eof { TERMINATE_ILLEGAL(); }
<DoubleQuoteString> any { goto yyc_DoubleQuoteString; }
-<SingleQuoteString> "\\\\" { goto yyc_SingleQuoteString; }
+<SingleQuoteString> "\\" { goto yyc_SingleQuoteString; }
<SingleQuoteString> "\\'" { goto yyc_SingleQuoteString; }
-<SingleQuoteString> "'" { PUSH_TOKEN(Token::STRING);}
-<SingleQuoteString> "\\\n\r?" { goto yyc_SingleQuoteString; }
-<SingleQuoteString> "\\\r\n?" { goto yyc_SingleQuoteString; }
+<SingleQuoteString> "'" { PUSH_TOKEN(Token::STRING); }
+<SingleQuoteString> /\\\n\r?/ { goto yyc_SingleQuoteString; }
+<SingleQuoteString> /\\\r\n?/ { goto yyc_SingleQuoteString; }
<SingleQuoteString> "\n" => Normal {
PUSH_TOKEN_LOOKAHEAD(Token::ILLEGAL); }
<SingleQuoteString> "\r" => Normal {
PUSH_TOKEN_LOOKAHEAD(Token::ILLEGAL); }
<SingleQuoteString> eof { TERMINATE_ILLEGAL(); }
<SingleQuoteString> any { goto yyc_SingleQuoteString; }
<Identifier> identifier_char+ { goto yyc_Identifier; }
-<Identifier> "\\u[0-9a-fA-F]{4}" { if (ValidIdentifierPart()) { goto
yyc_Identifier; } YYSETCONDITION(kConditionNormal); send(Token::ILLEGAL);
start_ = cursor_; goto yyc_Normal; }
+<Identifier> /\\u[0-9a-fA-F]{4}/ { if (ValidIdentifierPart()) { goto
yyc_Identifier; } YYSETCONDITION(kConditionNormal); send(Token::ILLEGAL);
start_ = cursor_; goto yyc_Normal; }
<Identifier> "\\" { PUSH_TOKEN(Token::ILLEGAL); }
<Identifier> any { PUSH_TOKEN_LOOKAHEAD(Token::IDENTIFIER); }
@@ -186,11 +186,11 @@
<SingleLineComment> eof { start_ = cursor_ - 1;
PUSH_TOKEN(Token::EOS); }
<SingleLineComment> any { goto yyc_SingleLineComment; }
-<MultiLineComment> "*//" { PUSH_LINE_TERMINATOR();}
+<MultiLineComment> "*/" { PUSH_LINE_TERMINATOR();}
<MultiLineComment> eof { start_ = cursor_ - 1;
PUSH_TOKEN(Token::EOS); }
<MultiLineComment> any { goto yyc_MultiLineComment; }
-<HtmlComment> eof { start_ = cursor_ - 1; PUSH_TOKEN(Token::EOS); }
<HtmlComment> "-->" { PUSH_LINE_TERMINATOR();}
<HtmlComment> line_terminator+ { PUSH_LINE_TERMINATOR();}
+<HtmlComment> eof { start_ = cursor_ - 1; PUSH_TOKEN(Token::EOS); }
<HtmlComment> any { goto yyc_HtmlComment; }
=======================================
--- /branches/experimental/parser/tools/lexer_generator/automata_test.py Mon Nov 4 15:04:49 2013 UTC
+++ /branches/experimental/parser/tools/lexer_generator/automata_test.py Tue Nov 5 12:37:55 2013 UTC
@@ -54,6 +54,7 @@
("a.?b", ["aab", "abb", "acb", "ab"], ["aaab", ""]),
("a.+b", ["aab", "abb", "acb"], ["aaac", "ab", ""]),
(".|.", ["a", "b"], ["aa", ""]),
+ ("//.", ["//a"], ["aa", ""]),
]
def test_matches(self):
@@ -84,9 +85,9 @@
dfa = dfa_from_nfa(nfa)
def verify(string, expected):
actions = list(dfa.collect_actions(string))
- assertEqual(len(expected), len(actions))
+ self.assertEqual(len(expected), len(actions))
for i, action in enumerate(actions):
- assertEqual(action[i], expected[i])
+ self.assertEqual(action[i], expected[i])
def verify_miss(string, expected):
verify(string, expected + [('MISS',)])
def verify_hit(string, expected):
=======================================
--- /branches/experimental/parser/tools/lexer_generator/regex_lexer.py Thu Oct 31 14:46:33 2013 UTC
+++ /branches/experimental/parser/tools/lexer_generator/regex_lexer.py Tue Nov 5 12:37:55 2013 UTC
@@ -85,15 +85,15 @@
t_class_RANGE = '-'
t_class_NOT = '\^'
- t_class_CHARACTER_CLASS = ':ws:|:lit:'
+ t_class_CHARACTER_CLASS = r':\w+:'
def t_class_ESCAPED_CLASS_LITERAL(self, t):
- r'\\\^|\\-|\\\[|\\\]\\:'
+ r'\\\^|\\-|\\\[|\\\]|\\\:|\\\w'
t.type = 'CLASS_LITERAL'
t.value = t.value[1:]
return t
- t_class_CLASS_LITERAL = r'[a-zA-Z0-9]' # fix this
+ t_class_CLASS_LITERAL = r'[\w $_:+]' # fix this
t_ANY_ignore = '\n'
=======================================
--- /branches/experimental/parser/tools/lexer_generator/rule_lexer.py Tue Nov 5 11:05:04 2013 UTC
+++ /branches/experimental/parser/tools/lexer_generator/rule_lexer.py Tue Nov 5 12:37:55 2013 UTC
@@ -31,7 +31,8 @@
tokens = (
'IDENTIFIER',
- 'STRING_REGEX',
+ 'STRING',
+ 'REGEX',
'CHARACTER_CLASS_REGEX',
'TRANSITION',
'TRANSITION_WITH_CODE',
@@ -65,7 +66,8 @@
pass
t_IDENTIFIER = r'[a-zA-Z0-9_]+'
- t_STRING_REGEX = r'"((\\("|\w|\\))|[^\\"])+"'
+ t_STRING = r'"((\\("|\w|\\))|[^\\"])+"'
+ t_REGEX = r'/[^\/]+/'
t_CHARACTER_CLASS_REGEX = r'\[([^\]]|\\\])+\]'
t_TRANSITION = r':=>'
t_TRANSITION_WITH_CODE = r'=>'
=======================================
--- /branches/experimental/parser/tools/lexer_generator/rule_parser.py Tue Nov 5 11:05:04 2013 UTC
+++ /branches/experimental/parser/tools/lexer_generator/rule_parser.py Tue Nov 5 12:37:55 2013 UTC
@@ -27,13 +27,17 @@
import ply.yacc as yacc
from rule_lexer import RuleLexer
+from regex_parser import RegexParser
class RuleParser:
tokens = RuleLexer.tokens
def __init__(self):
- self.aliases = {}
+ self.aliases = {
+ 'eof' : "eof rule",
+ 'any' : "any rule",
+ }
self.current_transition = None
self.rules = {}
@@ -80,28 +84,48 @@
p[0] = self.current_transition
def p_composite_regex(self, p):
- '''composite_regex : regex_part OR regex_part maybe_regex_parts
- | regex_part maybe_regex_parts'''
+ '''composite_regex : regex_parts OR regex_parts
+ | regex_parts'''
if p[len(p)-1]:
p[0] = p[1:]
else:
p[0] = p[1:-1]
- def p_maybe_regex_part(self, p):
- '''maybe_regex_parts : composite_regex
- | empty'''
- p[0] = p[1]
+ def p_regex_parts(self, p):
+ '''regex_parts : regex_part
+ | regex_part regex_parts'''
+ p[0] = p[1:]
def p_regex_part(self, p):
'''regex_part : LEFT_PARENTHESIS composite_regex RIGHT_PARENTHESIS
modifier
- | STRING_REGEX modifier
- | CHARACTER_CLASS_REGEX modifier
- | IDENTIFIER modifier'''
+ | regex_string_literal modifier
+ | regex_class modifier
+ | regex modifier
+ | regex_alias modifier'''
if p[len(p)-1]:
p[0] = p[1:]
else:
p[0] = p[1:-1]
+ def p_regex_string_literal(self, p):
+ 'regex_string_literal : STRING'
+ string = p[1][1:-1]
+ for c in "\+?|*[]()":
+ string = string.replace(c, "\\" + c)
+ p[0] = RegexParser.parse(string)
+
+ def p_regex(self, p):
+ 'regex : REGEX'
+ p[0] = RegexParser.parse(p[1][1:-1])
+
+ def p_regex_class(self, p):
+ 'regex_class : CHARACTER_CLASS_REGEX'
+ p[0] = RegexParser.parse(p[1])
+
+ def p_regex_alias(self, p):
+ 'regex_alias : IDENTIFIER'
+ p[0] = self.aliases[p[1]]
+
def p_modifier(self, p):
'''modifier : PLUS
| QUESTION_MARK
--
--
v8-dev mailing list
[email protected]
http://groups.google.com/group/v8-dev
---
You received this message because you are subscribed to the Google Groups "v8-dev" group.
To unsubscribe from this group and stop receiving emails from it, send an email
to [email protected].
For more options, visit https://groups.google.com/groups/opt_out.