Revision: 18014
Author:   [email protected]
Date:     Fri Nov 22 13:16:25 2013 UTC
Log: Experimental parser: cleanup code requiring calls to the unicode cache

[email protected]

BUG=

Review URL: https://codereview.chromium.org/83243004
http://code.google.com/p/v8/source/detail?r=18014

Modified:
 /branches/experimental/parser/src/lexer/lexer_py.re
 /branches/experimental/parser/tools/lexer_generator/code_generator.jinja
 /branches/experimental/parser/tools/lexer_generator/code_generator.py
 /branches/experimental/parser/tools/lexer_generator/transition_keys.py

=======================================
--- /branches/experimental/parser/src/lexer/lexer_py.re Fri Nov 22 08:51:22 2013 UTC
+++ /branches/experimental/parser/src/lexer/lexer_py.re Fri Nov 22 13:16:25 2013 UTC
@@ -68,21 +68,19 @@
 "<!--"        <||SingleLineComment>

 "<!-"        <|{
-  cursor_ -= 2;
-  yych = *(cursor_);
+  BACKWARD(2);
   PUSH_TOKEN(Token::LT);
 }|>

 "<!"        <|{
-  cursor_ -= 1;
-  yych = *(cursor_);
+  BACKWARD(1);
   PUSH_TOKEN(Token::LT);
 }|>


 "-->" <{
   if (!just_seen_line_terminator_) {
-    yych = *(--cursor_);
+    BACKWARD(1);
     PUSH_TOKEN(Token::DEC);
   }
 }||SingleLineComment>
@@ -134,7 +132,7 @@
 ","           <|push_token(COMMA)|>

 line_terminator+                     <|push_line_terminator|>
-/[:whitespace::byte_order_mark:]+/   <|skip|>
+/[:whitespace:]+/   <|skip|>

 "\""           <set_marker(1)||DoubleQuoteString>
 "'"            <set_marker(1)||SingleQuoteString>
=======================================
--- /branches/experimental/parser/tools/lexer_generator/code_generator.jinja Fri Nov 22 12:59:16 2013 UTC
+++ /branches/experimental/parser/tools/lexer_generator/code_generator.jinja Fri Nov 22 13:16:25 2013 UTC
@@ -5,43 +5,32 @@
     {%- if not loop.first %} || {% endif -%}
     {%- if r[0] == 'PRIMARY_RANGE' -%}
       {%- if r[1][0] == r[1][1] -%}
-        yych == {{r[1][0]}}
+        primary_char == {{r[1][0]}}
       {%- elif r[1][0] == 0 -%}
-        yych <= {{r[1][1]}}
+        primary_char <= {{r[1][1]}}
       {%- elif r[1][1] == upper_bound and not encoding == 'utf16'-%}
-        yych >= {{r[1][0]}}
+        primary_char >= {{r[1][0]}}
       {%- else -%}
-        ({{r[1][0]}} <= yych && yych <= {{r[1][1]}})
+        ({{r[1][0]}} <= primary_char && primary_char <= {{r[1][1]}})
       {%- endif -%}
     {%- elif r[0] == 'CLASS' -%}
       {%- if r[1] == 'eos' -%}
-        (yych == 0 && cursor_ >= buffer_end_)
+        (primary_char == 0 && cursor_ >= buffer_end_)
       {%- elif r[1] == 'zero' -%}
-        (yych == 0 && cursor_ < buffer_end_)
-      {%- elif encoding == 'latin1' -%}
-        false /* {{r[1]}} */
-      {%- elif encoding == 'utf16' -%}
-        {%- if r[1] == 'byte_order_mark' -%}
-          (yych == 0xfffe || yych == 0xfeff)
-        {%- elif r[1] == 'non_latin_1_whitespace' -%}
-          (yych > {{upper_bound}} && unicode_cache_->IsWhiteSpace(yych))
-        {%- elif r[1] == 'non_latin_1_letter' -%}
-          (yych > {{upper_bound}} &&  unicode_cache_->IsLetter(yych))
-        {%- elif r[1] == 'non_latin_1_identifier_part_not_letter' -%}
-          (yych > {{upper_bound}} &&
-            unicode_cache_->IsIdentifierPartNotLetter(yych))
-        {%- elif r[1] == 'non_latin_1_line_terminator' -%}
-          (yych > {{upper_bound}} && unicode_cache_->IsLineTerminator(yych))
-        {%- elif r[1] == 'non_latin_1_everything_else' -%}
-          {# FIXME: Optimize this away #}
-          (yych > {{upper_bound}} &&
-           !unicode_cache_->IsWhiteSpace(yych) &&
-           !unicode_cache_->IsLetter(yych) &&
-           !unicode_cache_->IsIdentifierPartNotLetter(yych) &&
-           !unicode_cache_->IsLineTerminator(yych))
-        {%- else %}
-          uncompilable code for {{encoding}} {{r[0]}} {{r[1]}}
-        {%- endif -%}
+        (primary_char == 0 && cursor_ < buffer_end_)
+      {%- else %}
+        uncompilable code for {{encoding}} {{r[0]}} {{r[1]}}
+      {%- endif -%}
+    {# These classes require long_char and to be outside the primary range #}
+    {%- elif r[0] == 'LONG_CHAR_CLASS' and encoding in ['utf16', 'utf8'] -%}
+      {%- if r[1] == 'byte_order_mark' -%}
+        (long_char == 0xfffe || long_char == 0xfeff)
+      {%- elif r[1] == 'call' -%}
+        unicode_cache_->{{r[2]}}(long_char)
+      {%- elif r[1] == 'invert' -%}
+        !({{do_key(r[2])}})
+      {%- elif r[1] == 'catch_all' -%}
+        (true || long_char == 0) /* {{r[1]}} */
       {%- else -%}
         uncompilable code for {{encoding}} {{r[0]}} {{r[1]}}
       {%- endif -%}
@@ -58,7 +47,7 @@
   {% elif type == 'terminate' %}
     PUSH_EOS();
   {% elif type == 'terminate_illegal' %}
-    start_ = marker_; BACKWARD(); PUSH_TOKEN(Token::ILLEGAL);
+    start_ = marker_; BACKWARD(1); PUSH_TOKEN(Token::ILLEGAL);
   {% elif type == 'skip' %}
     SKIP();
   {% elif type == 'skip_and_terminate' %}
@@ -112,7 +101,7 @@
   {%- endif %}

   {%- if debug_print %}
-    fprintf(stderr, "char at hand is %c (%d)\n", yych, yych);
+    fprintf(stderr, "char at hand is %c (%d)\n", primary_char, primary_char);
   {% endif -%}

   {%- macro do_transition(transition_state_id) -%}
@@ -126,7 +115,7 @@
   {%- endmacro -%}

   {%- if state['switch_transitions'] -%}
-    switch(yych) {
+    switch(primary_char) {
     {%- for ranges, transition_state_id in state['switch_transitions'] %}
       {%- for r in ranges -%}
         {%- for key in range(r[0], r[1] + 1) -%}
@@ -139,7 +128,7 @@
   {%- endif -%}

   {%- for key, transition_state_id in state.transitions %}
-    if ({{do_key(key)}}) {
+    if ({{do_key(key)}}) { // normal if transition
       {{ do_transition(transition_state_id) }}
     }
   {% endfor -%}
@@ -149,6 +138,18 @@
       {{ do_transition(transition_state_id) }}
     }
   {% endfor -%}
+
+  {%- if state['long_char_transitions'] -%}
+    {# TODO macro this up for utf8 #}
+    if (primary_char > {{upper_bound}}) {
+      uint32_t long_char = primary_char;
+      {%- for key, transition_state_id in state['long_char_transitions'] %}
+        if ({{do_key(key)}}) { // long_char transition
+          {{ do_transition(transition_state_id) }}
+        }
+      {% endfor -%}
+    }
+  {%- endif-%}

   {%- set match_action = state.match_action -%}

@@ -189,14 +190,15 @@
   just_seen_line_terminator_ = true;  \
 }

-#define FORWARD() {                   \
-  if (++cursor_ >= buffer_end_) yych = 0; \
-  else yych = *(cursor_);                 \
+#define FORWARD() {                               \
+  if (++cursor_ >= buffer_end_) primary_char = 0; \
+  else primary_char = *(cursor_);                 \
 }

-#define BACKWARD() {                  \
-  if (--cursor_ >= buffer_end_) yych = 0; \
-  else yych = *(cursor_);                 \
+#define BACKWARD(n) {                             \
+  cursor_ -= n;                                   \
+  if (cursor_ >= buffer_end_) primary_char = 0;   \
+  else primary_char = *(cursor_);                 \
 }

 #define SKIP() {                      \
@@ -209,9 +211,9 @@
Token::Value ExperimentalScanner<{{char_type}}>::Next(int* beg_pos_to_return, int* end_pos_to_return) {
   // Setup environment.
-  {{char_type}} yych;
-  if (cursor_ >= buffer_end_) yych = 0;
-  else yych = *(cursor_);
+  {{char_type}} primary_char;
+  if (cursor_ >= buffer_end_) primary_char = 0;
+  else primary_char = *(cursor_);

 {# first node is start node #}
 {% for dfa_state in dfa_states -%}
=======================================
--- /branches/experimental/parser/tools/lexer_generator/code_generator.py Fri Nov 22 09:25:13 2013 UTC
+++ /branches/experimental/parser/tools/lexer_generator/code_generator.py Fri Nov 22 13:16:25 2013 UTC
@@ -129,6 +129,7 @@
       'transitions' : transitions,
       'switch_transitions' : [],
       'deferred_transitions' : [],
+      'long_char_transitions' : [],
       'disjoint_keys' : disjoint_keys,
       'inline' : None,
       'depth' : None,
@@ -198,6 +199,75 @@
     state['deferred_transitions'] = deferred_transitions
     return split_count + (0 if no_switch else 1)

+  __call_map = {
+    'non_primary_whitespace' : 'IsWhiteSpace',
+    'non_primary_letter' : 'IsLetter',
+    'non_primary_identifier_part_not_letter' : 'IsIdentifierPartNotLetter',
+    'non_primary_line_terminator' : 'IsLineTerminator',
+  }
+
+  def __rewrite_deferred_transitions(self, state):
+    assert not state['long_char_transitions']
+    transitions = state['deferred_transitions']
+    if not transitions:
+      return
+    encoding = self.__dfa.encoding()
+    bom = 'byte_order_mark'
+    catch_all = 'non_primary_everything_else'
+    all_classes = set(encoding.class_name_iter())
+    fast_classes = set(['eos', 'zero'])
+    call_classes = all_classes - fast_classes - set([bom, catch_all])
+    def remap_transition(class_name):
+      if class_name in call_classes:
+        return ('LONG_CHAR_CLASS', 'call', self.__call_map[class_name])
+      if class_name == bom:
+        return ('LONG_CHAR_CLASS', class_name)
+      raise Exception(class_name)
+    fast_transitions = []
+    long_class_transitions = []
+    long_class_map = {}
+    catchall_transition = None
+    # loop through and remove catch_all_transitions
+    for (classes, transition_node_id) in transitions:
+      ft = []
+      lct = []
+      has_catch_all = False
+      for (class_type, class_name) in classes:
+        if class_name in fast_classes:
+          ft.append((class_type, class_name))
+        else:
+          assert not class_name in long_class_map
+          long_class_map[class_name] = transition_node_id
+          if class_name == catch_all:
+            assert not has_catch_all
+            assert catchall_transition == None
+            has_catch_all = True
+          else:
+            lct.append(remap_transition(class_name))
+      if ft:
+        fast_transitions.append((ft, transition_node_id))
+      if has_catch_all:
+        catchall_transition = (lct, transition_node_id)
+      elif lct:
+        long_class_transitions.append((lct, transition_node_id))
+    # all transitions are fast
+    if not long_class_map:
+      return
+    if catchall_transition:
+      catchall_transitions = all_classes - fast_classes
+      for class_name in long_class_map.iterkeys():
+        catchall_transitions.remove(class_name)
+      assert not catchall_transitions, "class inversion not unimplemented"
+    # split deferred transitions
+    state['deferred_transitions'] = fast_transitions
+    if catchall_transition:
+      catchall_transition = [
+        ([('LONG_CHAR_CLASS', 'catch_all')], catchall_transition[1])]
+    else:
+      catchall_transition = []
+    state['long_char_transitions'] = (long_class_transitions +
+                                      catchall_transition) # must be last
+
   def __canonicalize_traversal(self):
     dfa_states = []
     self.__dfa.visit_all_states(lambda state, acc: dfa_states.append(state))
@@ -229,6 +299,9 @@
     switched = reduce(self.__split_transitions, dfa_states, 0)
     if self.__log:
       print "%s states use switch (instead of if)" % switched
+    # rewrite deferred transitions
+    for state in dfa_states:
+      self.__rewrite_deferred_transitions(state)

   def process(self):

=======================================
--- /branches/experimental/parser/tools/lexer_generator/transition_keys.py Fri Nov 22 09:25:13 2013 UTC
+++ /branches/experimental/parser/tools/lexer_generator/transition_keys.py Fri Nov 22 13:16:25 2013 UTC
@@ -57,7 +57,6 @@
     return self.__name

   def add_predefined_range(self, name, ranges):
-    # TODO verify disjointness
     self.__predefined_ranges[name] = ranges

   def lower_bound(self):
@@ -76,6 +75,9 @@
   def class_range_iter(self):
     return self.__class_ranges.iteritems()

+  def class_name_iter(self):
+    return self.__class_ranges.iterkeys()
+
   def class_value_iter(self):
     return self.__class_ranges.itervalues()

@@ -463,7 +465,7 @@
     super(Latin1Encoding, self).__init__(
       'latin1',
       (1, 255),
-      ['eos', 'zero', 'byte_order_mark'])
+      ['eos', 'zero'])
     self.add_predefined_range(
       'whitespace', [(9, 9), (11, 12), (32, 32), (133, 133), (160, 160)])
     self.add_predefined_range(
@@ -481,27 +483,28 @@
       'utf16',
       (1, 255),
       ['eos', 'zero', 'byte_order_mark',
-       'non_latin_1_whitespace',
-       'non_latin_1_letter',
-       'non_latin_1_identifier_part_not_letter',
-       'non_latin_1_line_terminator',
-       'non_latin_1_everything_else'])
+       'non_primary_whitespace',
+       'non_primary_letter',
+       'non_primary_identifier_part_not_letter',
+       'non_primary_line_terminator',
+       'non_primary_everything_else'])
     self.add_predefined_range(
       'whitespace',
       [(9, 9), (11, 12), (32, 32), (133, 133), (160, 160),
-       self.class_range('non_latin_1_whitespace')])
+       self.class_range('byte_order_mark'),
+       self.class_range('non_primary_whitespace')])
     self.add_predefined_range(
       'letter', [
         (65, 90), (97, 122), (170, 170), (181, 181),
         (186, 186), (192, 214), (216, 246), (248, 255),
-        self.class_range('non_latin_1_letter')])
+        self.class_range('non_primary_letter')])
     self.add_predefined_range(
       'line_terminator',
-      [(10, 10), (13, 13), self.class_range('non_latin_1_line_terminator')])
+      [(10, 10), (13, 13), self.class_range('non_primary_line_terminator')])
     self.add_predefined_range(
       'identifier_part_not_letter',
       [(48, 57), (95, 95),
-       self.class_range('non_latin_1_identifier_part_not_letter')])
+       self.class_range('non_primary_identifier_part_not_letter')])

 class Utf8Encoding(KeyEncoding):

@@ -510,20 +513,22 @@
       'utf8',
       (1, 127),
       ['eos', 'zero', 'byte_order_mark',
-       'non_ascii_whitespace',
-       'non_ascii_letter',
-       'non_ascii_identifier_part_not_letter',
-       'non_ascii_line_terminator',
-       'non_ascii_everything_else'])
+       'non_primary_whitespace',
+       'non_primary_letter',
+       'non_primary_identifier_part_not_letter',
+       'non_primary_line_terminator',
+       'non_primary_everything_else'])
     self.add_predefined_range(
       'whitespace',
-      [(9, 9), (11, 12), (32, 32), self.class_range('non_ascii_whitespace')])
+      [(9, 9), (11, 12), (32, 32),
+        self.class_range('byte_order_mark'),
+        self.class_range('non_primary_whitespace')])
     self.add_predefined_range(
-      'letter', [(65, 90), (97, 122), self.class_range('non_ascii_letter')])
+      'letter', [(65, 90), (97, 122), self.class_range('non_primary_letter')])
     self.add_predefined_range(
       'line_terminator',
-      [(10, 10), (13, 13), self.class_range('non_ascii_line_terminator')])
+      [(10, 10), (13, 13), self.class_range('non_primary_line_terminator')])
     self.add_predefined_range(
       'identifier_part_not_letter',
       [(48, 57), (95, 95),
-       self.class_range('non_ascii_identifier_part_not_letter')])
+       self.class_range('non_primary_identifier_part_not_letter')])

--
--
v8-dev mailing list
[email protected]
http://groups.google.com/group/v8-dev
---
You received this message because you are subscribed to the Google Groups "v8-dev" group.
To unsubscribe from this group and stop receiving emails from it, send an email 
to [email protected].
For more options, visit https://groups.google.com/groups/opt_out.

Reply via email to