Reviewers: marja,

Message:
Committed patchset #1 manually as r18903 (presubmit successful).

Description:
Experimental parser: remove bom handling

[email protected]

BUG=

Committed: https://code.google.com/p/v8/source/detail?r=18903

Please review this at https://codereview.chromium.org/148283007/

SVN Base: https://v8.googlecode.com/svn/branches/experimental/parser

Affected files (+7, -14 lines):
  M src/scanner.h
  M tools/lexer_generator/code_generator.jinja
  M tools/lexer_generator/code_generator.py
  M tools/lexer_generator/transition_keys.py


Index: src/scanner.h
diff --git a/src/scanner.h b/src/scanner.h
index f6f797e22f1e08dce696b9141fb7d7440195a85d..f19689755c0c3bcf215e35d49b56f6a23d6487fb 100644
--- a/src/scanner.h
+++ b/src/scanner.h
@@ -139,8 +139,10 @@ class UnicodeCache {
   bool IsIdentifierPart(unibrow::uchar c) { return kIsIdentifierPart.get(c); }
   bool IsLineTerminator(unibrow::uchar c) { return kIsLineTerminator.get(c); }
   bool IsWhiteSpace(unibrow::uchar c) { return kIsWhiteSpace.get(c); }
+  bool IsByteOrderMark(unibrow::uchar c) { return c == 0xfffe || c == 0xfeff; }
   bool IsWhiteSpaceNotLineTerminator(unibrow::uchar c) {
-    return !kIsLineTerminator.get(c) && kIsWhiteSpace.get(c);
+    return (kIsWhiteSpace.get(c) && !kIsLineTerminator.get(c)) ||
+        IsByteOrderMark(c);
   }
   bool IsLetter(unibrow::uchar c) { return kIsLetter.get(c); }
   bool IsIdentifierPartNotLetter(unibrow::uchar c) {
Index: tools/lexer_generator/code_generator.jinja
diff --git a/tools/lexer_generator/code_generator.jinja b/tools/lexer_generator/code_generator.jinja
index ab1c05720a086b2c524931c40b79f1c190202fd2..a7b759a307b3d2b8a9671e39b8b2e602ef3ea255 100644
--- a/tools/lexer_generator/code_generator.jinja
+++ b/tools/lexer_generator/code_generator.jinja
@@ -47,9 +47,7 @@
       {%- endif -%}
     {# These classes require long_char and to be outside the primary range #}
     {%- elif r[0] == 'LONG_CHAR_CLASS' and encoding in ['utf16', 'utf8'] -%}
-      {%- if r[1] == 'byte_order_mark' -%}
-        (long_char == 0xfffe || long_char == 0xfeff)
-      {%- elif r[1] == 'call' -%}
+      {%- if r[1] == 'call' -%}
         unicode_cache_->{{r[2]}}(long_char)
       {%- elif r[1] == 'invert' -%}
         !({{do_key(r[2])}})
Index: tools/lexer_generator/code_generator.py
diff --git a/tools/lexer_generator/code_generator.py b/tools/lexer_generator/code_generator.py
index c98110fa2b8d84add2cf890956f4d35067b1da29..907d7e6c22c6f94e7a5ce4b6b39d70aa07a2a859 100644
--- a/tools/lexer_generator/code_generator.py
+++ b/tools/lexer_generator/code_generator.py
@@ -205,15 +205,12 @@ class CodeGenerator:
     if not transitions:
       return
     encoding = self.__dfa.encoding()
-    bom = 'byte_order_mark'
     catch_all = 'non_primary_everything_else'
     all_classes = set(encoding.class_name_iter())
-    call_classes = all_classes - set([bom, catch_all])
+    call_classes = all_classes - set([catch_all])
     def remap_transition(class_name):
       if class_name in call_classes:
         return ('LONG_CHAR_CLASS', 'call', self.__call_map[class_name])
-      if class_name == bom:
-        return ('LONG_CHAR_CLASS', class_name)
       raise Exception(class_name)
     long_class_transitions = []
     long_class_map = {}
Index: tools/lexer_generator/transition_keys.py
diff --git a/tools/lexer_generator/transition_keys.py b/tools/lexer_generator/transition_keys.py
index e3664d4ea7e1415555c73ed783cd8dd9c1506c5e..33019b99a09ba92c78244a84553debdc265ec1d4 100644
--- a/tools/lexer_generator/transition_keys.py
+++ b/tools/lexer_generator/transition_keys.py
@@ -493,8 +493,7 @@ class Utf16Encoding(KeyEncoding):
     super(Utf16Encoding, self).__init__(
       'utf16',
       (0, 255),
-      ['byte_order_mark',
-       'non_primary_whitespace',
+      ['non_primary_whitespace',
        'non_primary_letter',
        'non_primary_identifier_part_not_letter',
        'non_primary_line_terminator',
@@ -502,7 +501,6 @@ class Utf16Encoding(KeyEncoding):
     self.add_predefined_range(
       'whitespace',
       [(9, 9), (11, 12), (32, 32), (133, 133), (160, 160),
-       self.class_range('byte_order_mark'),
        self.class_range('non_primary_whitespace')])
     self.add_predefined_range(
       'letter', [
@@ -523,8 +521,7 @@ class Utf8Encoding(KeyEncoding):
     super(Utf8Encoding, self).__init__(
       'utf8',
       (0, 127),
-      ['byte_order_mark',
-       'non_primary_whitespace',
+      ['non_primary_whitespace',
        'non_primary_letter',
        'non_primary_identifier_part_not_letter',
        'non_primary_line_terminator',
@@ -532,7 +529,6 @@ class Utf8Encoding(KeyEncoding):
     self.add_predefined_range(
       'whitespace',
       [(9, 9), (11, 12), (32, 32),
-        self.class_range('byte_order_mark'),
         self.class_range('non_primary_whitespace')])
     self.add_predefined_range(
      'letter', [(65, 90), (97, 122), self.class_range('non_primary_letter')])


--
--
v8-dev mailing list
[email protected]
http://groups.google.com/group/v8-dev
---
You received this message because you are subscribed to the Google Groups "v8-dev" group.
To unsubscribe from this group and stop receiving emails from it, send an email 
to [email protected].
For more options, visit https://groups.google.com/groups/opt_out.

Reply via email to