Revision: 18772
Author:   [email protected]
Date:     Thu Jan 23 11:39:04 2014 UTC
Log:      Experimental parser: better eos handling

[email protected]

BUG=

Review URL: https://codereview.chromium.org/144603003
http://code.google.com/p/v8/source/detail?r=18772

Modified:
 /branches/experimental/parser/src/lexer/lexer_py.re
 /branches/experimental/parser/tools/lexer_generator/code_generator.jinja
 /branches/experimental/parser/tools/lexer_generator/code_generator.py
 /branches/experimental/parser/tools/lexer_generator/code_generator_test.py
 /branches/experimental/parser/tools/lexer_generator/lexer_test.py
 /branches/experimental/parser/tools/lexer_generator/nfa_builder.py
 /branches/experimental/parser/tools/lexer_generator/rule_lexer.py
 /branches/experimental/parser/tools/lexer_generator/rule_parser.py
 /branches/experimental/parser/tools/lexer_generator/transition_keys.py

=======================================
--- /branches/experimental/parser/src/lexer/lexer_py.re Thu Jan 23 08:24:55 2014 UTC +++ /branches/experimental/parser/src/lexer/lexer_py.re Thu Jan 23 11:39:04 2014 UTC
@@ -51,7 +51,6 @@
   non_octal_whole_part /(\.[:digit:]*)?/ maybe_exponent );
 harmony_number = "0"[bBoO][:digit:]+;
 line_terminator_sequence = /[:line_terminator:]|(\r\n|\n\r)/;
-eos = [:eos:];

 # Rules.

@@ -197,7 +196,7 @@
 identifier_start <|token(IDENTIFIER)|Identifier>
/\\u[:hex_digit:]{4}/ <check_escaped_identifier_start|token(IDENTIFIER)| Identifier>

-eos             <|terminate|>
+eos             <terminate>
 default_action  <do_token_and_go_forward(ILLEGAL)>

 <<DoubleQuoteString>>
@@ -211,8 +210,8 @@
 "\\"                          <|token(ILLEGAL)|>
 line_terminator               <|token(ILLEGAL)|>
 "\""                          <|token(STRING)|>
-eos                           <|terminate_illegal|>
 catch_all                     <||continue>
+eos                           <terminate_illegal>

 <<SingleQuoteString>>
 # TODO subgraph for '\'
@@ -226,8 +225,8 @@
 "\\"                          <|token(ILLEGAL)|>
 line_terminator               <|token(ILLEGAL)|>
 "'"                           <|token(STRING)|>
-eos                           <|terminate_illegal|>
 catch_all                     <||continue>
+eos                           <terminate_illegal>

 <<Identifier>>
 identifier_char <|token(IDENTIFIER)|continue>
@@ -235,7 +234,7 @@

 <<SingleLineComment>>
 line_terminator  <|line_terminator|>
-eos              <|skip_and_terminate|>
+eos              <skip_and_terminate>
 catch_all        <||continue>

 <<MultiLineComment>>
@@ -243,5 +242,5 @@
 # TODO find a way to generate the below rule
 /\*+[^\/*]/      <||continue>
 line_terminator  <line_terminator_in_multiline_comment||continue>
-eos              <|terminate_illegal|>
 catch_all        <||continue>
+eos              <terminate_illegal>
=======================================
--- /branches/experimental/parser/tools/lexer_generator/code_generator.jinja Thu Jan 23 10:16:42 2014 UTC +++ /branches/experimental/parser/tools/lexer_generator/code_generator.jinja Thu Jan 23 11:39:04 2014 UTC
@@ -237,14 +237,22 @@
   {% endif -%}

   {%- macro do_transition(jump_id) -%}
-    {%- set transition_state_id = jump_table[jump_id][0] -%}
-    {%- set inline_transition = jump_table[jump_id][1] == 'inline' %}
     FORWARD();
-    {%- if inline_transition %}
-      {{ do_dfa_state(transition_state_id) }}
-    {% else %}
-      {{ jump(jump_id) }}
-    {% endif %}
+    if (cursor_ < buffer_end_) {
+      {%- set transition_state_id = jump_table[jump_id][0] -%}
+      {%- set inline_transition = jump_table[jump_id][1] == 'inline' %}
+      {%- if inline_transition %}
+        {{ do_dfa_state(transition_state_id) }}
+      {% else %}
+        {{ jump(jump_id) }}
+      {% endif %}
+    }
+    {% if 'eos' in state['unique_transitions'] -%}
+      {{ jump(state['unique_transitions']['eos']) }} // eos handler
+    {%- else -%}
+      BACKWARD(1);
+      {{ jump(state['jump_before_match']) }} // no eos handler
+    {%- endif %}
   {%- endmacro -%}

   {%- if state['switch_transitions'] -%}
@@ -283,9 +291,10 @@
       {% endfor -%}
     }
   {%- endif-%}
+
+  {{ write_label('before_match', node_number) }}

   {%- set match_action = state.match_action -%}
-
   {%- if match_action %}
     {{ dispatch_match_action(match_action[0], match_action[1]) }}
     CRASH();
@@ -317,8 +326,8 @@
 }

 #define READ_CURSOR() {                           \
-  if (cursor_ >= buffer_end_) primary_char = 0;   \
-  else primary_char = *(cursor_);                 \
+  ASSERT(cursor_ < buffer_end_);                  \
+  primary_char = *(cursor_);                      \
 }

 #ifdef DEBUG
=======================================
--- /branches/experimental/parser/tools/lexer_generator/code_generator.py Thu Jan 23 10:16:42 2014 UTC +++ /branches/experimental/parser/tools/lexer_generator/code_generator.py Thu Jan 23 11:39:04 2014 UTC
@@ -54,7 +54,7 @@
     self.__switching = switching
     self.__jump_table = []

-  __jump_labels = ['state_entry', 'after_entry_code']
+  __jump_labels = ['state_entry', 'after_entry_code', 'before_match']

   @staticmethod
   def __transform_state(encoding, state):
@@ -92,7 +92,6 @@
           raise Exception()
       if keys:
         transitions.append((keys, transition_id))
-    # eos_transitions is for a followup cl
     return {
       'node_number' : None,
       'original_node_number' : state.node_number(),
@@ -129,9 +128,9 @@
   def __set_inline(self, count, state):
     assert state['inline'] == None
     inline = False
-    # inline terminal states
     if state['must_not_inline']:
       inline = False
+    # inline terminal states
     elif not state['transitions']:
       inline = True
     # inline next to terminal states with 1 or 2 transitions
@@ -193,7 +192,7 @@
     bom = 'byte_order_mark'
     catch_all = 'non_primary_everything_else'
     all_classes = set(encoding.class_name_iter())
-    fast_classes = set(['eos', 'zero'])
+    fast_classes = set([])
     call_classes = all_classes - fast_classes - set([bom, catch_all])
     def remap_transition(class_name):
       if class_name in call_classes:
@@ -258,6 +257,17 @@
     for node_number in current_node['unique_transitions'].values():
       CodeGenerator.__reorder(node_number, id_map, dfa_states)

+  @staticmethod
+  def __mark_eos_states(dfa_states, eos_states):
+    for state_id in eos_states:
+      state = dfa_states[state_id]
+      state['is_eos_handler'] = True
+      state['must_not_inline'] = True
+      assert state['match_action']
+      assert not state['entry_action']
+      assert not state['transitions']
+      assert not state['unique_transitions']
+
   def __build_dfa_states(self):
     dfa_states = []
self.__dfa.visit_all_states(lambda state, acc: dfa_states.append(state))
@@ -267,13 +277,20 @@
     id_map = {x['original_node_number'] : x for x in dfa_states}
     dfa_states = []
     CodeGenerator.__reorder(self.__start_node_number, id_map, dfa_states)
+    # store states
+    eos_states = set([])
     def f((key, original_node_number)):
       return (key, id_map[original_node_number]['node_number'])
     for state in dfa_states:
       state['transitions'] = map(f, state['transitions'])
+      state['unique_transitions'] = {k : id_map[v]['node_number']
+          for k, v in state['unique_transitions'].items()}
+      if 'eos' in state['unique_transitions']:
+        eos_states.add(state['unique_transitions']['eos'])
     assert id_map[self.__start_node_number]['node_number'] == 0
     assert len(dfa_states) == self.__dfa.node_count()
-    # store states
+    # mark eos states
+    self.__mark_eos_states(dfa_states, eos_states)
     self.__dfa_states = dfa_states

   def __rewrite_gotos(self):
@@ -311,8 +328,6 @@
     self.__dfa_states.append(state)
     # mark inline as none, to be correctly handled below
     state['inline'] = None
-    # clear transitions, they shouldn't be used anymore
-    state['transitions'] = []
     # clear entry points
state['entry_points'] = {k : False for k in CodeGenerator.__jump_labels}
     return state['node_number']
@@ -350,6 +365,14 @@
         return (key, self.__register_jump(target_id, jump_type))
       for name in transition_names:
         state[name] = map(generate_jump, state[name])
+      if 'eos' in state['unique_transitions']:
+        # eos state is not inlined
+        eos_state_id = state['unique_transitions']['eos']
+        jump = self.__register_jump(eos_state_id, 'state_entry')
+        state['unique_transitions']['eos'] = jump
+      elif state['transitions']:
+        jump = self.__register_jump(state_id, 'before_match')
+        state['jump_before_match'] = jump
       # now rewrite all nodes created
       nodes_created = len(inline_mapping) - len(inline_mapping_in)
       assert len(self.__dfa_states) == (
=======================================
--- /branches/experimental/parser/tools/lexer_generator/code_generator_test.py Tue Jan 14 12:22:39 2014 UTC +++ /branches/experimental/parser/tools/lexer_generator/code_generator_test.py Thu Jan 23 11:39:04 2014 UTC
@@ -33,12 +33,11 @@

   def test_simple(self):
     rules = '''
-    eos = [:eos:];
     <<default>>
     "("           <|LBRACE|>
     ")"           <|RBRACE|>

     "foo"         <|FOO|>
-    eos           <|terminate|>
+    eos           <terminate>
     default_action <DEFAULT>'''
     CodeGenerator(RuleProcessor.parse(rules, 'latin1'))
=======================================
--- /branches/experimental/parser/tools/lexer_generator/lexer_test.py Tue Jan 21 15:59:38 2014 UTC +++ /branches/experimental/parser/tools/lexer_generator/lexer_test.py Thu Jan 23 11:39:04 2014 UTC
@@ -48,13 +48,12 @@

   def test_simple(self):
     rules = '''
-    eos = [:eos:];
     <<default>>
     "("           <|LBRACE|>
     ")"           <|RBRACE|>

     "foo"         <|FOO|>
-    eos           <|terminate|>'''
+    eos           <terminate>'''

     string = 'foo()'
     self.__verify_action_stream(rules, string,
@@ -62,12 +61,11 @@

   def test_maximal_matching(self):
     rules = '''
-    eos = [:eos:];
     <<default>>
     "<"           <|LT|>
     "<<"          <|SHL|>
     " "           <|SPACE|>
-    eos           <|terminate|>'''
+    eos           <terminate>'''

     string = '<< <'
     self.__verify_action_stream(rules, string,
@@ -75,7 +73,6 @@

   def test_consecutive_epsilon_transitions(self):
     rules = '''
-    eos = [:eos:];
     digit = [0-9];
     number = (digit+ ("." digit+)?);
     <<default>>
=======================================
--- /branches/experimental/parser/tools/lexer_generator/nfa_builder.py Thu Jan 23 09:11:01 2014 UTC +++ /branches/experimental/parser/tools/lexer_generator/nfa_builder.py Thu Jan 23 11:39:04 2014 UTC
@@ -251,6 +251,7 @@
     keys = reduce(f, reachable_states, set())
     keys.discard(TransitionKey.epsilon())
     keys.discard(catch_all)
+    keys.discard(TransitionKey.unique('eos'))
     inverse_key = TransitionKey.inverse_key(encoding, keys)
     if inverse_key:
       transitions[inverse_key] = transitions[catch_all]
=======================================
--- /branches/experimental/parser/tools/lexer_generator/rule_lexer.py Tue Jan 14 12:22:39 2014 UTC +++ /branches/experimental/parser/tools/lexer_generator/rule_lexer.py Thu Jan 23 11:39:04 2014 UTC
@@ -31,6 +31,7 @@

   tokens = (
     'DEFAULT_ACTION',
+    'EOS',
     'CATCH_ALL',

     'IDENTIFIER',
@@ -61,7 +62,7 @@
     pass

   __special_identifiers = set(map(lambda s: s.lower(),
-    ['DEFAULT_ACTION', 'CATCH_ALL']))
+    ['DEFAULT_ACTION', 'CATCH_ALL', 'EOS']))

   def t_IDENTIFIER(self, t):
     r'[a-zA-Z0-9_]+'
=======================================
--- /branches/experimental/parser/tools/lexer_generator/rule_parser.py Wed Jan 22 09:26:03 2014 UTC +++ /branches/experimental/parser/tools/lexer_generator/rule_parser.py Thu Jan 23 11:39:04 2014 UTC
@@ -99,6 +99,7 @@
   def p_transition_rule(self, p):
     '''transition_rule : composite_regex action
                        | DEFAULT_ACTION default_action
+                       | EOS eos
                        | CATCH_ALL action'''
     precedence = RuleParser.__rule_precedence_counter
     RuleParser.__rule_precedence_counter += 1
@@ -112,7 +113,7 @@
       assert self.__state.current_state == 'default'
       assert not rules['default_action']
       rules['default_action'] = action
-    elif p[1] == 'catch_all':
+    elif p[1] == 'eos' or p[1] == 'catch_all':
       assert p[1] not in rules['uniques']
       rules['uniques'][p[1]] = True
rules['regex'].append((NfaBuilder.unique_key(p[1]), precedence, action))
@@ -128,6 +129,10 @@
     'default_action : ACTION_OPEN identifier_action ACTION_CLOSE'
     p[0] = (None, p[2], None)

+  def p_eos(self, p):
+    'eos : ACTION_OPEN identifier_action ACTION_CLOSE'
+    p[0] = (None, p[2], None)
+
   def p_maybe_identifier_action(self, p):
     '''maybe_identifier_action : identifier_action
                          | empty'''
=======================================
--- /branches/experimental/parser/tools/lexer_generator/transition_keys.py Wed Jan 22 09:26:03 2014 UTC +++ /branches/experimental/parser/tools/lexer_generator/transition_keys.py Thu Jan 23 11:39:04 2014 UTC
@@ -475,8 +475,8 @@
   def __init__(self):
     super(Latin1Encoding, self).__init__(
       'latin1',
-      (1, 255),
-      ['eos', 'zero'])
+      (0, 255),
+      [])
     self.add_predefined_range(
       'whitespace', [(9, 9), (11, 12), (32, 32), (133, 133), (160, 160)])
     self.add_predefined_range(
@@ -492,8 +492,8 @@
   def __init__(self):
     super(Utf16Encoding, self).__init__(
       'utf16',
-      (1, 255),
-      ['eos', 'zero', 'byte_order_mark',
+      (0, 255),
+      ['byte_order_mark',
        'non_primary_whitespace',
        'non_primary_letter',
        'non_primary_identifier_part_not_letter',
@@ -522,8 +522,8 @@
   def __init__(self):
     super(Utf8Encoding, self).__init__(
       'utf8',
-      (1, 127),
-      ['eos', 'zero', 'byte_order_mark',
+      (0, 127),
+      ['byte_order_mark',
        'non_primary_whitespace',
        'non_primary_letter',
        'non_primary_identifier_part_not_letter',

--
--
v8-dev mailing list
[email protected]
http://groups.google.com/group/v8-dev
--- You received this message because you are subscribed to the Google Groups "v8-dev" group.
To unsubscribe from this group and stop receiving emails from it, send an email 
to [email protected].
For more options, visit https://groups.google.com/groups/opt_out.

Reply via email to