Reviewers: dcarney,

Message:
Committed patchset #1 manually as r17947.

Description:
Experimental lexer generator: Fix byte order mark detection.

BUG=
[email protected]

Committed: https://code.google.com/p/v8/source/detail?r=17947

Please review this at https://codereview.chromium.org/80503002/

SVN Base: https://v8.googlecode.com/svn/branches/experimental/parser

Affected files (+11, -20 lines):
  M src/lexer/even-more-experimental-scanner.cc
  M src/lexer/lexer_py.re
  M tools/lexer_generator/code_generator.jinja
  M tools/lexer_generator/transition_keys.py


Index: src/lexer/even-more-experimental-scanner.cc
diff --git a/src/lexer/even-more-experimental-scanner.cc b/src/lexer/even-more-experimental-scanner.cc
index 96c753f7d372b46c7072c3bc005f90cb6cbd50df..ac26d1d16bb1c6cabe012865138c19871372355b 100644
--- a/src/lexer/even-more-experimental-scanner.cc
+++ b/src/lexer/even-more-experimental-scanner.cc
@@ -69,32 +69,19 @@ const byte* ReadFile(const char* name, Isolate* isolate,
   int file_size = ftell(file);
   rewind(file);

-  byte* file_contents = new byte[file_size];
+  *size = file_size * repeat;
+
+  byte* chars = new byte[*size];
   for (int i = 0; i < file_size;) {
-    int read =
-        static_cast<int>(fread(&file_contents[i], 1, file_size - i, file));
+    int read = static_cast<int>(fread(&chars[i], 1, file_size - i, file));
     i += read;
   }
   fclose(file);

-  // If the file contains the UTF16 little endian magic bytes, skip them.
-  // FIXME: what if we see big endian magic bytes? Do we do the right thing for
-  // big endian anyway?
-  byte* start = file_contents;
-  if (*start == 0xff && *(start + 1) == 0xfe) {
-    start += 2;
-    file_size -= 2;
+  for (int i = file_size; i < *size; i++) {
+    chars[i] = chars[i - file_size];
   }

-  *size = file_size * repeat;
-  byte* chars = new byte[*size];
-
-  for (int i = 0; i < *size; i++) {
-    chars[i] = start[i % file_size];
-  }
-
-  delete file_contents;
-
   return chars;
 }

Index: src/lexer/lexer_py.re
diff --git a/src/lexer/lexer_py.re b/src/lexer/lexer_py.re
index 2ce0db54899469526637f52b81bc1ec5b4de2b82..b9c75d1caccb69fff8e13cde2886282be15dbb87 100644
--- a/src/lexer/lexer_py.re
+++ b/src/lexer/lexer_py.re
@@ -26,7 +26,8 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 whitespace_char = [:whitespace:];
-whitespace = whitespace_char+;
+byte_order_mark_char = [:byte_order_mark:];
+whitespace = whitespace_char+|byte_order_mark_char+;
 identifier_start = [$_:letter:];
 identifier_char = [:identifier_start::identifier_part_not_letter:];
 line_terminator = [:line_terminator:];
Index: tools/lexer_generator/code_generator.jinja
diff --git a/tools/lexer_generator/code_generator.jinja b/tools/lexer_generator/code_generator.jinja
index 580c7896035f23ddaf7d61e385482d21fd21f15b..097de4f9fda067aace27de0a228e8c2dab222647 100644
--- a/tools/lexer_generator/code_generator.jinja
+++ b/tools/lexer_generator/code_generator.jinja
@@ -18,6 +18,8 @@
              (yych == 0 && cursor_ >= buffer_end_)
       {%- elif r[1] == 'zero' -%}
              (yych == 0 && cursor_ < buffer_end_)
+      {%- elif r[1] == 'byte_order_mark' and encoding == 'utf16'-%}
+        (yych == 0xfffe || yych == 0xfeff)
       {%- elif r[1] == 'non_latin_1_whitespace' and encoding == 'utf16'-%}
         {# FIXME: Add and use unicode_cache_->InNonAsciiWhitespace #}
         (yych > 255 && unicode_cache_->IsWhiteSpace(yych))
Index: tools/lexer_generator/transition_keys.py
diff --git a/tools/lexer_generator/transition_keys.py b/tools/lexer_generator/transition_keys.py
index ca99b74f01e24eab3ed0e9bb2b4e021c3c930284..d6bd0e8e8baebb02805c8d52b423446d20454f69 100644
--- a/tools/lexer_generator/transition_keys.py
+++ b/tools/lexer_generator/transition_keys.py
@@ -46,6 +46,7 @@ class TransitionKey:
     'non_latin1_line_terminator' : (259, 259),
     'eos' : (260, 260),
     'zero' : (261, 261),
+    'byte_order_mark' : (262, 262),
   }
   __lower_bound = min(__class_bounds.values(), key=lambda item: item[0])[0]
   __upper_bound = max(__class_bounds.values(), key=lambda item: item[1])[1]


--
--
v8-dev mailing list
[email protected]
http://groups.google.com/group/v8-dev
---
You received this message because you are subscribed to the Google Groups "v8-dev" group.
To unsubscribe from this group and stop receiving emails from it, send an email 
to [email protected].
For more options, visit https://groups.google.com/groups/opt_out.

Reply via email to