Revision: 17483
Author:   [email protected]
Date:     Tue Nov  5 12:37:55 2013 UTC
Log:      Experimental parser: parsing regex subexpressions

[email protected]

BUG=

Review URL: https://codereview.chromium.org/59603003
http://code.google.com/p/v8/source/detail?r=17483

Modified:
 /branches/experimental/parser/src/lexer/lexer_py.re
 /branches/experimental/parser/tools/lexer_generator/automata_test.py
 /branches/experimental/parser/tools/lexer_generator/regex_lexer.py
 /branches/experimental/parser/tools/lexer_generator/rule_lexer.py
 /branches/experimental/parser/tools/lexer_generator/rule_parser.py

=======================================
--- /branches/experimental/parser/src/lexer/lexer_py.re Tue Nov 5 11:05:04 2013 UTC
+++ /branches/experimental/parser/src/lexer/lexer_py.re Tue Nov 5 12:37:55 2013 UTC
@@ -25,7 +25,7 @@
 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

-whitespace_char = [ \t\v\f\r\240:ws:];
+whitespace_char = [ \t\v\f\r:ws:]; # TODO put back \240
 whitespace = whitespace_char+;
 identifier_start = [$_a-zA-Z:lit:];
 identifier_char = [$_a-zA-Z0-9:lit:];
@@ -33,7 +33,7 @@
 line_terminator = [\n\r]+;
 digit = [0-9];
 hex_digit = [0-9a-fA-F];
-maybe_exponent = ("e" [-+]? digit+)?;
+maybe_exponent = ("e" [\-+]? digit+)?;
number = ("0x" hex_digit+) | (("." digit+ maybe_exponent) | (digit+ ("." digit*)? maybe_exponent));

<Normal> "break" not_identifier_char { PUSH_TOKEN_LOOKAHEAD(Token::BREAK); }
@@ -151,7 +151,7 @@
 <Normal> "'"           :=> SingleQuoteString

 <Normal> identifier_start     :=> Identifier
-<Normal> "\\u[0-9a-fA-F]{4}" { if (ValidIdentifierStart()) { YYSETCONDITION(kConditionIdentifier); goto yyc_Identifier; } send(Token::ILLEGAL); start_ = cursor_; goto yyc_Normal; }
+<Normal> /\\u[0-9a-fA-F]{4}/ { if (ValidIdentifierStart()) { YYSETCONDITION(kConditionIdentifier); goto yyc_Identifier; } send(Token::ILLEGAL); start_ = cursor_; goto yyc_Normal; }
 <Normal> "\\"                 { PUSH_TOKEN(Token::ILLEGAL); }

 <Normal> eof           { PUSH_EOF_AND_RETURN();}
@@ -160,25 +160,25 @@
 <DoubleQuoteString> "\\\\"  { goto yyc_DoubleQuoteString; }
 <DoubleQuoteString> "\\\""  { goto yyc_DoubleQuoteString; }
 <DoubleQuoteString> "\""     { PUSH_TOKEN(Token::STRING);}
-<DoubleQuoteString> "\\\n\r?" { goto yyc_DoubleQuoteString; }
-<DoubleQuoteString> "\\\r\n?" { goto yyc_DoubleQuoteString; }
+<DoubleQuoteString> /\\\n\r?/ { goto yyc_DoubleQuoteString; }
+<DoubleQuoteString> /\\\r\n?/ { goto yyc_DoubleQuoteString; }
 <DoubleQuoteString> "\n" => Normal { PUSH_TOKEN_LOOKAHEAD(Token::ILLEGAL); }
 <DoubleQuoteString> "\r" => Normal { PUSH_TOKEN_LOOKAHEAD(Token::ILLEGAL); }
 <DoubleQuoteString> eof     { TERMINATE_ILLEGAL(); }
 <DoubleQuoteString> any     { goto yyc_DoubleQuoteString; }

-<SingleQuoteString> "\\\\"  { goto yyc_SingleQuoteString; }
+<SingleQuoteString> "\\"  { goto yyc_SingleQuoteString; }
 <SingleQuoteString> "\\'"   { goto yyc_SingleQuoteString; }
-<SingleQuoteString> "'"     { PUSH_TOKEN(Token::STRING);}
-<SingleQuoteString> "\\\n\r?" { goto yyc_SingleQuoteString; }
-<SingleQuoteString> "\\\r\n?" { goto yyc_SingleQuoteString; }
+<SingleQuoteString> "'"     { PUSH_TOKEN(Token::STRING); }
+<SingleQuoteString> /\\\n\r?/ { goto yyc_SingleQuoteString; }
+<SingleQuoteString> /\\\r\n?/ { goto yyc_SingleQuoteString; }
 <SingleQuoteString> "\n" => Normal { PUSH_TOKEN_LOOKAHEAD(Token::ILLEGAL); }
 <SingleQuoteString> "\r" => Normal { PUSH_TOKEN_LOOKAHEAD(Token::ILLEGAL); }
 <SingleQuoteString> eof     { TERMINATE_ILLEGAL(); }
 <SingleQuoteString> any     { goto yyc_SingleQuoteString; }

 <Identifier> identifier_char+  { goto yyc_Identifier; }
-<Identifier> "\\u[0-9a-fA-F]{4}" { if (ValidIdentifierPart()) { goto yyc_Identifier; } YYSETCONDITION(kConditionNormal); send(Token::ILLEGAL); start_ = cursor_; goto yyc_Normal; }
+<Identifier> /\\u[0-9a-fA-F]{4}/ { if (ValidIdentifierPart()) { goto yyc_Identifier; } YYSETCONDITION(kConditionNormal); send(Token::ILLEGAL); start_ = cursor_; goto yyc_Normal; }
 <Identifier> "\\"              { PUSH_TOKEN(Token::ILLEGAL); }
 <Identifier> any               { PUSH_TOKEN_LOOKAHEAD(Token::IDENTIFIER); }

@@ -186,11 +186,11 @@
<SingleLineComment> eof { start_ = cursor_ - 1; PUSH_TOKEN(Token::EOS); }
 <SingleLineComment> any             { goto yyc_SingleLineComment; }

-<MultiLineComment> "*//"  { PUSH_LINE_TERMINATOR();}
+<MultiLineComment> "*/"  { PUSH_LINE_TERMINATOR();}
<MultiLineComment> eof { start_ = cursor_ - 1; PUSH_TOKEN(Token::EOS); }
 <MultiLineComment> any      { goto yyc_MultiLineComment; }

-<HtmlComment> eof        { start_ = cursor_ - 1; PUSH_TOKEN(Token::EOS); }
 <HtmlComment> "-->"      { PUSH_LINE_TERMINATOR();}
 <HtmlComment> line_terminator+ { PUSH_LINE_TERMINATOR();}
+<HtmlComment> eof        { start_ = cursor_ - 1; PUSH_TOKEN(Token::EOS); }
 <HtmlComment> any        { goto yyc_HtmlComment; }
=======================================
--- /branches/experimental/parser/tools/lexer_generator/automata_test.py Mon Nov 4 15:04:49 2013 UTC
+++ /branches/experimental/parser/tools/lexer_generator/automata_test.py Tue Nov 5 12:37:55 2013 UTC
@@ -54,6 +54,7 @@
       ("a.?b", ["aab", "abb", "acb", "ab"], ["aaab", ""]),
       ("a.+b", ["aab", "abb", "acb"], ["aaac", "ab", ""]),
       (".|.", ["a", "b"], ["aa", ""]),
+      ("//.", ["//a"], ["aa", ""]),
     ]

     def test_matches(self):
@@ -84,9 +85,9 @@
       dfa = dfa_from_nfa(nfa)
       def verify(string, expected):
         actions = list(dfa.collect_actions(string))
-        assertEqual(len(expected), len(actions))
+        self.assertEqual(len(expected), len(actions))
         for i, action in enumerate(actions):
-          assertEqual(action[i], expected[i])
+          self.assertEqual(action[i], expected[i])
       def verify_miss(string, expected):
         verify(string, expected + [('MISS',)])
       def verify_hit(string, expected):
=======================================
--- /branches/experimental/parser/tools/lexer_generator/regex_lexer.py Thu Oct 31 14:46:33 2013 UTC
+++ /branches/experimental/parser/tools/lexer_generator/regex_lexer.py Tue Nov 5 12:37:55 2013 UTC
@@ -85,15 +85,15 @@

   t_class_RANGE = '-'
   t_class_NOT = '\^'
-  t_class_CHARACTER_CLASS = ':ws:|:lit:'
+  t_class_CHARACTER_CLASS = r':\w+:'

   def t_class_ESCAPED_CLASS_LITERAL(self, t):
-    r'\\\^|\\-|\\\[|\\\]\\:'
+    r'\\\^|\\-|\\\[|\\\]|\\\:|\\\w'
     t.type = 'CLASS_LITERAL'
     t.value = t.value[1:]
     return t

-  t_class_CLASS_LITERAL = r'[a-zA-Z0-9]' # fix this
+  t_class_CLASS_LITERAL = r'[\w $_:+]' # fix this

   t_ANY_ignore  = '\n'

=======================================
--- /branches/experimental/parser/tools/lexer_generator/rule_lexer.py Tue Nov 5 11:05:04 2013 UTC
+++ /branches/experimental/parser/tools/lexer_generator/rule_lexer.py Tue Nov 5 12:37:55 2013 UTC
@@ -31,7 +31,8 @@

   tokens = (
     'IDENTIFIER',
-    'STRING_REGEX',
+    'STRING',
+    'REGEX',
     'CHARACTER_CLASS_REGEX',
     'TRANSITION',
     'TRANSITION_WITH_CODE',
@@ -65,7 +66,8 @@
     pass

   t_IDENTIFIER = r'[a-zA-Z0-9_]+'
-  t_STRING_REGEX = r'"((\\("|\w|\\))|[^\\"])+"'
+  t_STRING = r'"((\\("|\w|\\))|[^\\"])+"'
+  t_REGEX = r'/[^\/]+/'
   t_CHARACTER_CLASS_REGEX = r'\[([^\]]|\\\])+\]'
   t_TRANSITION = r':=>'
   t_TRANSITION_WITH_CODE = r'=>'
=======================================
--- /branches/experimental/parser/tools/lexer_generator/rule_parser.py Tue Nov 5 11:05:04 2013 UTC
+++ /branches/experimental/parser/tools/lexer_generator/rule_parser.py Tue Nov 5 12:37:55 2013 UTC
@@ -27,13 +27,17 @@

 import ply.yacc as yacc
 from rule_lexer import RuleLexer
+from regex_parser import RegexParser

 class RuleParser:

   tokens = RuleLexer.tokens

   def __init__(self):
-    self.aliases = {}
+    self.aliases = {
+      'eof' : "eof rule",
+      'any' : "any rule",
+    }
     self.current_transition = None
     self.rules = {}

@@ -80,28 +84,48 @@
     p[0] = self.current_transition

   def p_composite_regex(self, p):
-    '''composite_regex : regex_part OR regex_part maybe_regex_parts
-                       | regex_part maybe_regex_parts'''
+    '''composite_regex : regex_parts OR regex_parts
+                       | regex_parts'''
     if p[len(p)-1]:
       p[0] = p[1:]
     else:
       p[0] = p[1:-1]

-  def p_maybe_regex_part(self, p):
-    '''maybe_regex_parts : composite_regex
-                         | empty'''
-    p[0] = p[1]
+  def p_regex_parts(self, p):
+    '''regex_parts : regex_part
+                   | regex_part regex_parts'''
+    p[0] = p[1:]

   def p_regex_part(self, p):
'''regex_part : LEFT_PARENTHESIS composite_regex RIGHT_PARENTHESIS modifier
-                  | STRING_REGEX modifier
-                  | CHARACTER_CLASS_REGEX modifier
-                  | IDENTIFIER modifier'''
+                  | regex_string_literal modifier
+                  | regex_class modifier
+                  | regex modifier
+                  | regex_alias modifier'''
     if p[len(p)-1]:
       p[0] = p[1:]
     else:
       p[0] = p[1:-1]

+  def p_regex_string_literal(self, p):
+    'regex_string_literal : STRING'
+    string = p[1][1:-1]
+    for c in "\+?|*[]()":
+      string = string.replace(c, "\\" + c)
+    p[0] = RegexParser.parse(string)
+
+  def p_regex(self, p):
+    'regex : REGEX'
+    p[0] = RegexParser.parse(p[1][1:-1])
+
+  def p_regex_class(self, p):
+    'regex_class : CHARACTER_CLASS_REGEX'
+    p[0] = RegexParser.parse(p[1])
+
+  def p_regex_alias(self, p):
+    'regex_alias : IDENTIFIER'
+    p[0] = self.aliases[p[1]]
+
   def p_modifier(self, p):
     '''modifier : PLUS
                 | QUESTION_MARK

--
--
v8-dev mailing list
[email protected]
http://groups.google.com/group/v8-dev
--- You received this message because you are subscribed to the Google Groups "v8-dev" group.
To unsubscribe from this group and stop receiving emails from it, send an email 
to [email protected].
For more options, visit https://groups.google.com/groups/opt_out.

Reply via email to