Revision: 17945
Author:   [email protected]
Date:     Thu Nov 21 08:21:45 2013 UTC
Log:      Experimental parser: split classes into latin1 and non latin1

[email protected]

BUG=

Review URL: https://codereview.chromium.org/80263003
http://code.google.com/p/v8/source/detail?r=17945

Modified:
 /branches/experimental/parser/src/lexer/lexer_py.re
 /branches/experimental/parser/tools/lexer_generator/code_generator.jinja
 /branches/experimental/parser/tools/lexer_generator/transition_keys.py

=======================================
--- /branches/experimental/parser/src/lexer/lexer_py.re Wed Nov 20 16:10:09 2013 UTC
+++ /branches/experimental/parser/src/lexer/lexer_py.re Thu Nov 21 08:21:45 2013 UTC
@@ -25,11 +25,11 @@
 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

-whitespace_char = [ \t\v\f\r:whitespace:\240];
+whitespace_char = [:whitespace:];
 whitespace = whitespace_char+;
-identifier_start = [$_a-zA-Z:letter:];
-identifier_char = [0-9:identifier_part_not_letter::identifier_start:];
-line_terminator = [\n\r];
+identifier_start = [$_:letter:];
+identifier_char = [:identifier_start::identifier_part_not_letter:];
+line_terminator = [:line_terminator:];
 digit = [0-9];
 hex_digit = [0-9a-fA-F];
 single_escape_char = ['"\\bfnrtv];
@@ -38,8 +38,7 @@
   /0[xX][:hex_digit:]+/ | (
   /\.[:digit:]+/ maybe_exponent |
   /[:digit:]+(\.[:digit:]*)?/ maybe_exponent );
-# TODO this is incomplete/incorrect
-line_terminator_sequence = (/\n\r?/)|(/\r\n?/);
+line_terminator_sequence = /[:line_terminator:]|\r\n/;
 eos = [:eos:];

 # grammar is
=======================================
--- /branches/experimental/parser/tools/lexer_generator/code_generator.jinja Wed Nov 20 17:15:07 2013 UTC
+++ /branches/experimental/parser/tools/lexer_generator/code_generator.jinja Thu Nov 21 08:21:45 2013 UTC
@@ -5,29 +5,31 @@
     {%- if not loop.first %} || {% endif -%}
     {%- if r[0] == 'LATIN_1' -%}
       {%- if r[1][0] == r[1][1] -%}
-       yych == {{r[1][0]}}
+             yych == {{r[1][0]}}
       {%- elif r[1][0] == 0 -%}
-       yych <= {{r[1][1]}}
+             yych <= {{r[1][1]}}
       {%- elif r[1][1] == 255 -%}
-       yych >= {{r[1][0]}}
+             yych >= {{r[1][0]}}
       {%- else -%}
-       ({{r[1][0]}} <= yych && yych <= {{r[1][1]}})
+             ({{r[1][0]}} <= yych && yych <= {{r[1][1]}})
       {%- endif -%}
     {%- elif r[0] == 'CLASS' -%}
       {%- if r[1] == 'eos' -%}
-       (yych == 0 && cursor_ >= buffer_end_)
+             (yych == 0 && cursor_ >= buffer_end_)
       {%- elif r[1] == 'zero' -%}
-       (yych == 0 && cursor_ < buffer_end_)
-      {%- elif r[1] == 'whitespace' and encoding == 'utf16'-%}
+             (yych == 0 && cursor_ < buffer_end_)
+      {%- elif r[1] == 'non_latin_1_whitespace' and encoding == 'utf16'-%}
         {# FIXME: Add and use unicode_cache_->InNonAsciiWhitespace #}
- (yych != ' ' && yych != '\t' && yych != '\v' && yych != '\f' && yych != '\r' && yych != '\n' && unicode_cache_->IsWhiteSpace(yych))
-      {%- elif r[1] == 'letter' and encoding == 'utf16'-%}
+        (yych > 255 && unicode_cache_->IsWhiteSpace(yych))
+      {%- elif r[1] == 'non_latin_1_letter' and encoding == 'utf16'-%}
         {# FIXME: Add and use unicode_cache_->InNonAsciiLetter #}
- (!(yych >= 'a' && yych <= 'z') && !(yych >= 'A' && yych <= 'Z') && unicode_cache_->IsLetter(yych))
- {%- elif r[1] == 'identifier_part_not_letter' and encoding == 'utf16'-%}
-        unicode_cache_->IsIdentifierPartNotLetter(yych)
+        (yych > 255 &&  unicode_cache_->IsLetter(yych))
+ {%- elif r[1] == 'non_latin1_identifier_part_not_letter' and encoding == 'utf16'-%}
+        (yych > 255 &&  unicode_cache_->IsIdentifierPartNotLetter(yych))
+ {%- elif r[1] == 'non_latin1_line_terminator' and encoding == 'utf16'-%}
+        (yych > 255 &&  unicode_cache_->IsLineTerminator(yych))
       {%- else -%}
-       false /* {{r[1]}} */
+             false /* {{r[1]}} */
       {%- endif -%}
     {%- else -%}
       false
=======================================
--- /branches/experimental/parser/tools/lexer_generator/transition_keys.py Wed Nov 20 16:10:09 2013 UTC
+++ /branches/experimental/parser/tools/lexer_generator/transition_keys.py Thu Nov 21 08:21:45 2013 UTC
@@ -40,19 +40,36 @@
     'latin_1' : (1, 255),
     # These are not real ranges; they just need to be separate from any real
     # ranges.
-    'whitespace' : (256, 256),
-    'letter' : (257, 257),
-    'identifier_part_not_letter' : (258, 258),
-    'eos' : (259, 259),
-    'zero' : (260, 260),
+    'non_latin_1_whitespace' : (256, 256),
+    'non_latin_1_letter' : (257, 257),
+    'non_latin1_identifier_part_not_letter' : (258, 258),
+    'non_latin1_line_terminator' : (259, 259),
+    'eos' : (260, 260),
+    'zero' : (261, 261),
   }
-  __lower_bound = 1
+  __lower_bound = min(__class_bounds.values(), key=lambda item: item[0])[0]
   __upper_bound = max(__class_bounds.values(), key=lambda item: item[1])[1]

   __cached_keys = {}

   __unique_key_counter = -1

+  __predefined_ranges = {
+    'whitespace' : [
+        (9, 9), (11, 12), (32, 32), (133, 133), (160, 160),
+        __class_bounds['non_latin_1_whitespace']],
+    'letter' : [
+        (65, 90), (97, 122), (170, 170), (181, 181),
+        (186, 186), (192, 214), (216, 246), (248, 255),
+        __class_bounds['non_latin_1_letter']],
+    'line_terminator' : [
+        (10, 10), (13, 13),
+        __class_bounds['non_latin1_line_terminator']],
+    'identifier_part_not_letter' : [
+        (48, 57), (95, 95),
+        __class_bounds['non_latin1_identifier_part_not_letter']],
+  }
+
   @staticmethod
   def __in_latin_1(char):
     bound = TransitionKey.__class_bounds['latin_1']
@@ -140,8 +157,16 @@
         TransitionKey.__process_graph(x, ranges, key_map)
     elif key == 'CHARACTER_CLASS':
       class_name = graph[1]
-      if class_name in TransitionKey.__class_bounds.keys():
+      if class_name in TransitionKey.__class_bounds:
+        if class_name in key_map:
+          assert (key_map[class_name] ==
+              TransitionKey([TransitionKey.__class_bounds[class_name]]))
         ranges.append(TransitionKey.__class_bounds[class_name])
+      elif class_name in TransitionKey.__predefined_ranges:
+        if class_name in key_map:
+          assert (key_map[class_name] ==
+              TransitionKey(TransitionKey.__predefined_ranges[class_name]))
+        ranges += TransitionKey.__predefined_ranges[class_name]
       elif class_name in key_map:
         ranges += key_map[class_name].__ranges
       else:

--
--
v8-dev mailing list
[email protected]
http://groups.google.com/group/v8-dev
---
You received this message because you are subscribed to the Google Groups "v8-dev" group.
To unsubscribe from this group and stop receiving emails from it, send an email 
to [email protected].
For more options, visit https://groups.google.com/groups/opt_out.

Reply via email to