Revision: 17628
Author:   [email protected]
Date:     Tue Nov 12 07:12:31 2013 UTC
Log:      Experimental parser: add catch all rule

[email protected]

BUG=

Review URL: https://codereview.chromium.org/69293005
http://code.google.com/p/v8/source/detail?r=17628

Modified:
 /branches/experimental/parser/src/lexer/lexer_py.re
 /branches/experimental/parser/tools/lexer_generator/automata_test.py
 /branches/experimental/parser/tools/lexer_generator/dfa.py
 /branches/experimental/parser/tools/lexer_generator/generator.py
 /branches/experimental/parser/tools/lexer_generator/nfa.py
 /branches/experimental/parser/tools/lexer_generator/regex_lexer.py
 /branches/experimental/parser/tools/lexer_generator/rule_lexer.py
 /branches/experimental/parser/tools/lexer_generator/rule_parser.py
 /branches/experimental/parser/tools/lexer_generator/rule_parser_test.py
 /branches/experimental/parser/tools/lexer_generator/transition_keys.py

=======================================
--- /branches/experimental/parser/src/lexer/lexer_py.re	Mon Nov 11 14:46:29 2013 UTC
+++ /branches/experimental/parser/src/lexer/lexer_py.re	Tue Nov 12 07:12:31 2013 UTC
@@ -163,24 +163,22 @@
 default_action  { PUSH_TOKEN(ILLEGAL); }

 <DoubleQuoteString>
-"\\"      <<continue>>
-"\\\""    <<continue>>
-"\""      { PUSH_TOKEN(STRING); }
 /\\\n\r?/ <<continue>>
 /\\\r\n?/ <<continue>>
-/\n|\r/    { PUSH_TOKEN(ILLEGAL); }
+/\\./     <<continue>>
+/\n|\r/   { PUSH_TOKEN(ILLEGAL); }
+"\""      { PUSH_TOKEN(STRING); }
 eof       <<terminate_illegal>>
-/./ <<continue>>
+catch_all <<continue>>

 <SingleQuoteString>
-"\\"      <<continue>>
-"\\'"     <<continue>>
-"'"       { PUSH_TOKEN(STRING); }
 /\\\n\r?/ <<continue>>
 /\\\r\n?/ <<continue>>
-/\n|\r/    { PUSH_TOKEN(ILLEGAL); }
+/\\./     <<continue>>
+/\n|\r/   { PUSH_TOKEN(ILLEGAL); }
+"'"       { PUSH_TOKEN(STRING); }
 eof       <<terminate_illegal>>
-/./ <<continue>>
+catch_all <<continue>>

 <Identifier>
 identifier_char    <<continue>>
@@ -194,18 +192,21 @@
 <SingleLineComment>
 line_terminator  { PUSH_LINE_TERMINATOR(); }
 eof              <<terminate>>
-/./ <<continue>>
+catch_all <<continue>>

 <MultiLineComment>
 "*/"             { SKIP(); }
+/\*./             <<continue>>
 # need to force action
 line_terminator+ { PUSH_LINE_TERMINATOR(); } <<continue>>
 eof              <<terminate>>
-/./ <<continue>>
+catch_all <<continue>>

 <HtmlComment>
 "-->"            { SKIP(); }
+/--./            <<continue>>
+/-./             <<continue>>
 # need to force action
 line_terminator+ { PUSH_LINE_TERMINATOR(); } <<continue>>
 eof              <<terminate>>
-/./ <<continue>>
+catch_all <<continue>>
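
For illustration only (not part of the commit), a rough Python model of the
rewritten <DoubleQuoteString> state using the re module: the single /\\./ rule
replaces the old per-character escape rules, and catch_all stands in for the
old /./ fallback. Rule order here mimics the precedence in the .re file.

import re

rules = [
    (re.compile(r'\\(?:\n\r?|\r\n?)'), 'continue'),  # escaped line terminators
    (re.compile(r'\\.', re.S), 'continue'),          # /\\./ : any other escape
    (re.compile(r'\n|\r'), 'ILLEGAL'),               # bare line terminator
    (re.compile(r'"'), 'STRING'),                    # closing quote
]

def scan_double_quoted(body):
    i = 0
    while i < len(body):
        for pattern, action in rules:
            m = pattern.match(body, i)
            if m:
                if action != 'continue':
                    return action
                i = m.end()
                break
        else:
            i += 1  # catch_all <<continue>>: anything no rule above matched
    return 'EOF'  # models eof <<terminate_illegal>>

print(scan_double_quoted('a\\"b"'))  # -> STRING (escaped quote is skipped)
print(scan_double_quoted('a\nb"'))   # -> ILLEGAL (bare newline in string)
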
=======================================
--- /branches/experimental/parser/tools/lexer_generator/automata_test.py	Thu Nov 7 12:55:33 2013 UTC
+++ /branches/experimental/parser/tools/lexer_generator/automata_test.py	Tue Nov 12 07:12:31 2013 UTC
@@ -104,13 +104,9 @@
       verify_hit("leftrightleftright", [l, r, l, r])
       verify_miss("leftrightleftrightx", [l, r, l, r])

-    def test_continue(self):
-      graph = NfaBuilder.cat_graphs([
-        NfaBuilder.cat_graphs([
-          RegexParser.parse("ab"),
-          NfaBuilder.add_continue(RegexParser.parse("cd"))]),
-        RegexParser.parse("ef")])
-      nfa = NfaBuilder().nfa(graph)
-      self.assertFalse(nfa.matches(""))
-      self.assertTrue(nfa.matches("abcdef"))
-      self.assertTrue(nfa.matches("abcdabcdef"))
+    def test_minimization(self):
+      for (regex, matches, not_matches) in self.__test_data:
+        (nfa, dfa) = build_automata(regex)
+        dfa.minimize()
+
+
=======================================
--- /branches/experimental/parser/tools/lexer_generator/dfa.py	Mon Nov 11 14:46:29 2013 UTC
+++ /branches/experimental/parser/tools/lexer_generator/dfa.py	Tue Nov 12 07:12:31 2013 UTC
@@ -156,6 +156,9 @@
     state_iterator = lambda x : [x]
     return self.generate_dot(self.__start, self.__terminal_set, iterator, state_iterator)

+  def minimize(self):
+    pass
+
   def to_code(self):
     code = '''
 char c = *cursor;
=======================================
--- /branches/experimental/parser/tools/lexer_generator/generator.py	Mon Nov 11 14:22:00 2013 UTC
+++ /branches/experimental/parser/tools/lexer_generator/generator.py	Tue Nov 12 07:12:31 2013 UTC
@@ -91,19 +91,18 @@
     assert 'default' in parser_state.rules
     def process(k, v):
       graphs = []
+      continues = 0
       for (graph, (precedence, code, transition)) in v['regex']:
         default_code = v['default_action']
         action = code if code else default_code
         if action:
           graph = NfaBuilder.add_action(graph, (precedence, action))
-        if not transition:
+        if not transition or transition == 'break':
           pass
         elif transition == 'continue':
           assert not k == 'default'
+          continues += 1
           graph = NfaBuilder.add_continue(graph)
-        elif transition == 'break':
-          assert code
-          graph = NfaBuilder.add_break(graph)
         elif (transition == 'terminate' or
               transition == 'terminate_illegal'):
           assert not code
@@ -114,6 +113,11 @@
           graph = NfaBuilder.join_subgraph(
             graph, transition, rule_map[transition], subgraph_modifier)
         graphs.append(graph)
+      if continues == len(graphs):
+        graphs.append(NfaBuilder.epsilon())
+      if v['catch_all']:
+        assert v['catch_all'] == 'continue'
+        graphs.append(NfaBuilder.add_continue(NfaBuilder.catch_all()))
       graph = NfaBuilder.or_graphs(graphs)
       rule_map[k] = graph
     # process first the subgraphs, then the default graph
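
A hedged sketch (not the committed code) of the fallthrough handling added to
process() above. The tuple encodings ('EPSILON',), ('CATCH_ALL',) and
('CONTINUE', graph) match what NfaBuilder.epsilon, catch_all and add_continue
return in this revision; ('LITERAL', '*/') is a hypothetical stand-in for a
parsed regex graph.

def assemble_alternatives(graphs, continues, catch_all):
    graphs = list(graphs)
    # If every explicit rule re-enters the state via <<continue>>, append an
    # epsilon alternative so the combined graph keeps a plain accepting path.
    if continues == len(graphs):
        graphs.append(('EPSILON',))
    # A catch_all rule may only continue; it becomes one more alternative
    # matching whatever no other rule in the state matched.
    if catch_all:
        assert catch_all == 'continue'
        graphs.append(('CONTINUE', ('CATCH_ALL',)))
    return graphs  # the real code then wraps these with NfaBuilder.or_graphs

print(assemble_alternatives([('CONTINUE', ('LITERAL', '*/'))], 1, 'continue'))
# -> [('CONTINUE', ('LITERAL', '*/')), ('EPSILON',), ('CONTINUE', ('CATCH_ALL',))]
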
=======================================
--- /branches/experimental/parser/tools/lexer_generator/nfa.py	Mon Nov 11 12:28:48 2013 UTC
+++ /branches/experimental/parser/tools/lexer_generator/nfa.py	Tue Nov 12 07:12:31 2013 UTC
@@ -106,10 +106,27 @@
   def key_matches(self, value):
     return self.__matches(lambda k, v : k.matches_key(v), value)

+  def replace_catch_all(self):
+    catch_all = TransitionKey.unique('catch_all')
+    if not catch_all in self.__transitions:
+      return
+    f = lambda acc, state: acc | state.__epsilon_closure
+    reachable_states = reduce(f, self.__transitions[catch_all], set())
+    f = lambda acc, state: acc | set(state.__transitions.keys())
+    keys = reduce(f, reachable_states, set())
+    keys.discard(TransitionKey.epsilon())
+    keys.discard(catch_all)
+    inverse_key = TransitionKey.inverse_key(keys)
+    if inverse_key:
+      self.__transitions[inverse_key] = self.__transitions[catch_all]
+    del self.__transitions[catch_all]
+
   @staticmethod
   def gather_transition_keys(state_set):
     f = lambda acc, state: acc | set(state.__transitions.keys())
-    return TransitionKey.disjoint_keys(reduce(f, state_set, set()))
+    keys = reduce(f, state_set, set())
+    keys.discard(TransitionKey.epsilon())
+    return TransitionKey.disjoint_keys(keys)

 class NfaBuilder:

@@ -202,6 +219,12 @@
   def __any(self, graph):
     return self.__key_state(TransitionKey.any())

+  def __epsilon(self, graph):
+    start = self.__new_state()
+    end = self.__new_state()
+    start.close(end)
+    return (start, [end])
+
   def __action(self, graph):
     (start, ends) = self.__process(graph[1])
     action = graph[2]
@@ -215,19 +238,11 @@
     state = self.__peek_state()
     if not state['start_node']:
       state['start_node'] = self.__new_state()
-    end = self.__new_state()
-    self.__patch_ends(ends, end)
-    ends = [end]
-    end.add_epsilon_transition(state['start_node'])
-    return (start, ends)
+    self.__patch_ends(ends, state['start_node'])
+    return (start, [])

-  def __break(self, graph):
-    (start, ends) = self.__process(graph[1])
-    self.__peek_state()['unpatched_ends'] += ends
-    new_end = self.__new_state()
-    for end in ends:
-      end.add_epsilon_transition(new_end)
-    return (start, [new_end])
+  def __catch_all(self, graph):
+    return self.__key_state(TransitionKey.unique('catch_all'))

   def __join(self, graph):
     (graph, name, subgraph, modifier) = graph[1:]
@@ -302,9 +317,13 @@
   def add_continue(graph):
     return ('CONTINUE', graph)

+  @staticmethod
+  def catch_all():
+    return ('CATCH_ALL',)
+
   @staticmethod
-  def add_break(graph):
-    return ('BREAK', graph)
+  def epsilon():
+    return ('EPSILON',)

   @staticmethod
   def join_subgraph(graph, name, subgraph, modifier):
@@ -413,6 +432,7 @@

   def compute_dfa(self):
     self.__compute_epsilon_closures()
+    self.__visit_all_edges(lambda node, state: node.replace_catch_all(), None)
     dfa_nodes = {}
     start_name = self.__to_dfa(set([self.__start]), dfa_nodes, self.__end)
     return (start_name, dfa_nodes)
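
Before the subset construction runs, every node rewrites its catch_all edge.
A toy model (plain dicts and strings in place of State and TransitionKey, not
the committed code) of what replace_catch_all() does:

def replace_catch_all(transitions, closure_keys, inverse_key):
    if 'catch_all' not in transitions:
        return transitions
    # Keys reachable through the epsilon closure, minus the special ones.
    keys = set(closure_keys) - {'epsilon', 'catch_all'}
    # Retarget catch_all to the inverse of the union of those keys;
    # inverse_key returns None when the other keys already cover everything.
    inverse = inverse_key(keys)
    if inverse is not None:
        transitions[inverse] = transitions['catch_all']
    del transitions['catch_all']
    return transitions

print(replace_catch_all(
    {'catch_all': 'loop'},
    {'quote', 'escape', 'epsilon'},
    lambda ks: 'inverse(%s)' % ','.join(sorted(ks))))
# -> {'inverse(escape,quote)': 'loop'}
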
=======================================
--- /branches/experimental/parser/tools/lexer_generator/regex_lexer.py	Fri Nov 8 10:33:44 2013 UTC
+++ /branches/experimental/parser/tools/lexer_generator/regex_lexer.py	Tue Nov 12 07:12:31 2013 UTC
@@ -107,7 +107,7 @@
     r'\\\d+'
     return t

-  __escaped_class_literals = build_escape_map("^[]-:")
+  __escaped_class_literals = build_escape_map("^[]-:\\")

   def t_class_ESCAPED_CLASS_LITERAL(self, t):
     r'\\.'
@@ -115,7 +115,7 @@
     t.value = RegexLexer.__escaped_class_literals[t.value]
     return t

-  t_class_CLASS_LITERAL = r'[\w $_+]'
+  t_class_CLASS_LITERAL = r'[\w $_+\']'

   def t_REPEAT_BEGIN(self, t):
     r'\{'
=======================================
--- /branches/experimental/parser/tools/lexer_generator/rule_lexer.py	Mon Nov 11 07:52:15 2013 UTC
+++ /branches/experimental/parser/tools/lexer_generator/rule_lexer.py	Tue Nov 12 07:12:31 2013 UTC
@@ -32,6 +32,8 @@
   tokens = (
     'DEFAULT',
     'DEFAULT_ACTION',
+    'CATCH_ALL',
+
     'IDENTIFIER',
     'STRING',
     'REGEX',
@@ -67,12 +69,13 @@
     r'\#.*[\n\r]+'
     pass

+  __special_identifiers = set(map(lambda s: s.lower(),
+    ['DEFAULT', 'DEFAULT_ACTION', 'CATCH_ALL',]))
+
   def t_IDENTIFIER(self, t):
     r'[a-zA-Z][a-zA-Z0-9_]*'
-    if t.value == 'default':
-      t.type = 'DEFAULT'
-    if t.value == 'default_action':
-      t.type = 'DEFAULT_ACTION'
+    if t.value in self.__special_identifiers:
+      t.type = t.value.upper()
     return t

   t_STRING = r'"((\\("|\w|\\))|[^\\"])+"'
=======================================
--- /branches/experimental/parser/tools/lexer_generator/rule_parser.py	Mon Nov 11 07:52:15 2013 UTC
+++ /branches/experimental/parser/tools/lexer_generator/rule_parser.py	Tue Nov 12 07:12:31 2013 UTC
@@ -86,6 +86,7 @@
     if not state.current_state in state.rules:
       state.rules[state.current_state] = {
         'default_action': None,
+        'catch_all' : None,
         'regex' : []
       }
     p[0] = state.current_state
@@ -98,9 +99,10 @@
     '''transition_rule : composite_regex code action
                        | composite_regex empty action
                        | composite_regex code empty
-                       | DEFAULT_ACTION code empty'''
-    transition = p[3] if p[3] else 'break'
-    if not transition in self.__keyword_transitions:
+                       | DEFAULT_ACTION code empty
+                       | CATCH_ALL empty action'''
+    transition = p[3]
+    if transition and not transition in self.__keyword_transitions:
       assert not transition == 'default'
       self.__state.transitions.add(transition)
     RuleParser.__rule_precedence_counter += 1
@@ -109,6 +111,9 @@
     if p[1] == 'default_action':
       assert not rules['default_action']
       rules['default_action'] = code
+    elif p[1] == 'catch_all':
+      assert not rules['catch_all']
+      rules['catch_all'] = transition
     else:
       rule = (p[1], (RuleParser.__rule_precedence_counter, code, transition))
       rules['regex'].append(rule)
=======================================
--- /branches/experimental/parser/tools/lexer_generator/rule_parser_test.py	Thu Nov 7 12:03:35 2013 UTC
+++ /branches/experimental/parser/tools/lexer_generator/rule_parser_test.py	Tue Nov 12 07:12:31 2013 UTC
@@ -53,19 +53,19 @@

      self.assertTrue(len(self.state.rules), 2)
      self.assertTrue('cond1' in self.state.rules)
-     self.assertEquals(len(self.state.rules['cond1']), 2)
+     self.assertEquals(len(self.state.rules['cond1']['regex']), 2)
      # self.assertTrue('regex2' in self.state.rules['cond1'])
      # self.assertEquals(self.state.rules['cond1']['regex2'],
      #                   ('condition', 'cond2'))

      self.assertTrue('cond2' in self.state.rules)
-     self.assertEquals(len(self.state.rules['cond2']), 2)
+     self.assertEquals(len(self.state.rules['cond2']['regex']), 2)
      # self.assertTrue('regex3' in self.state.rules['cond2'])
      # self.assertEquals(self.state.rules['cond2']['regex3'],
      #                   ('body', 'body'))

      self.assertTrue('cond3' in self.state.rules)
-     self.assertEquals(len(self.state.rules['cond3']), 2)
+     self.assertEquals(len(self.state.rules['cond3']['regex']), 2)
      # self.assertTrue('regex4' in self.state.rules['cond3'])
      # self.assertEquals(self.state.rules['cond3']['regex4'],
      #                   ('condition_and_body', 'cond4', 'body'))
@@ -79,7 +79,7 @@
      self.assertEquals(self.state.aliases['alias'],
                        RegexParser.parse("regex;with;semicolon"))
      self.assertTrue('cond1' in self.state.rules)
-     self.assertEquals(len(self.state.rules['cond1']), 2)
+     self.assertEquals(len(self.state.rules['cond1']['regex']), 2)

      # self.assertEquals(
      #     self.parse['cond1']['regex4{with{braces}'],
=======================================
--- /branches/experimental/parser/tools/lexer_generator/transition_keys.py	Mon Nov 11 14:22:00 2013 UTC
+++ /branches/experimental/parser/tools/lexer_generator/transition_keys.py	Tue Nov 12 07:12:31 2013 UTC
@@ -41,6 +41,12 @@

   __unique_key_counter = -1

+  @staticmethod
+  def alphabet_iter():
+    for k, (lower, upper) in TransitionKey.__class_bounds.items():
+      for i in range(lower, upper + 1):
+        yield i
+
   @staticmethod
   def __in_latin_1(char):
     bound = TransitionKey.__class_bounds["latin_1"]
@@ -86,6 +92,9 @@
     key.__cached_hash = None
     return key

+  def __is_unique(self):
+    return len(self.__ranges) == 1 and self.__is_unique_range(self.__ranges[0])
+
   @staticmethod
   def __cached_key(name, bounds_getter):
     if not name in TransitionKey.__cached_keys:
@@ -109,7 +118,7 @@
     return TransitionKey.__create([(char, char)])

   @staticmethod
-  def unique_key(name):
+  def unique(name):
     def get_bounds(name):
       bound = TransitionKey.__unique_key_counter
       TransitionKey.__unique_key_counter -= 1
@@ -242,20 +251,34 @@
     return ranges

   @staticmethod
-  def disjoint_keys(key_set):
-    key_set.discard(TransitionKey.epsilon())
+  def __disjoint_ranges_from_key_set(key_set):
     if not key_set:
       return []
     range_map = {}
     for x in key_set:
+      assert not x.__is_unique()
+      assert x != TransitionKey.epsilon
       for r in x.__ranges:
         if not r[0] in range_map:
           range_map[r[0]] = []
         range_map[r[0]].append(r[1])
     ranges = TransitionKey.__disjoint_keys(range_map)
     TransitionKey.__verify_ranges(ranges, False)
+    return ranges
+
+  @staticmethod
+  def disjoint_keys(key_set):
+    ranges = TransitionKey.__disjoint_ranges_from_key_set(key_set)
     return map(lambda x : TransitionKey.__create([x]), ranges)

+  @staticmethod
+  def inverse_key(key_set):
+    ranges = TransitionKey.__disjoint_ranges_from_key_set(key_set)
+    inverse = TransitionKey.__invert_ranges(ranges)
+    if not inverse:
+      return None
+    return TransitionKey.__create(inverse)
+
   @staticmethod
   def __key_from_ranges(invert, ranges):
     range_map = {}
@@ -296,7 +319,8 @@
     inverted = []
     last = None
     classes = set(TransitionKey.__class_bounds.values())
-    classes.remove(TransitionKey.__class_bounds["latin_1"])
+    latin_1 = TransitionKey.__class_bounds["latin_1"]
+    classes.remove(latin_1)
     for r in ranges:
       assert not TransitionKey.__is_unique_range(r)
       if TransitionKey.__is_class_range(r):
@@ -308,8 +332,10 @@
       elif last[1] + 1 < r[0]:
         inverted.append((last[1] + 1, r[0] - 1))
       last = r
-    upper_bound = TransitionKey.__class_bounds["latin_1"][1]
-    if last != None and last[1] < upper_bound:
+    upper_bound = latin_1[1]
+    if last == None:
+      inverted.append(latin_1)
+    elif last[1] < upper_bound:
       inverted.append((last[1] + 1, upper_bound))
     inverted += list(classes)
     return inverted
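
A minimal standalone sketch (not the committed code) of the latin-1 half of
the range inversion that inverse_key relies on; the committed __invert_ranges
additionally handles class ranges and re-appends the non-latin_1 character
classes.

LATIN_1 = (0, 255)  # assumed bounds, mirroring __class_bounds["latin_1"]

def invert_ranges(ranges, bounds=LATIN_1):
    # Return the ranges within bounds not covered by the given sorted,
    # disjoint ranges.
    inverted, last = [], None
    for lo, hi in sorted(ranges):
        if last is None:
            if lo > bounds[0]:
                inverted.append((bounds[0], lo - 1))
        elif last + 1 < lo:
            inverted.append((last + 1, lo - 1))
        last = hi
    if last is None:
        inverted.append(bounds)  # an empty key set inverts to everything
    elif last < bounds[1]:
        inverted.append((last + 1, bounds[1]))
    return inverted

print(invert_ranges([(34, 34), (92, 92)]))  # '"' and '\\'
# -> [(0, 33), (35, 91), (93, 255)]
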
