Revision: 17947
Author:   [email protected]
Date:     Thu Nov 21 08:55:13 2013 UTC
Log:      Experimental lexer generator: Fix byte order mark detection.

BUG=
[email protected]

Review URL: https://codereview.chromium.org/80503002
http://code.google.com/p/v8/source/detail?r=17947

Modified:
 /branches/experimental/parser/src/lexer/even-more-experimental-scanner.cc
 /branches/experimental/parser/src/lexer/lexer_py.re
 /branches/experimental/parser/tools/lexer_generator/code_generator.jinja
 /branches/experimental/parser/tools/lexer_generator/transition_keys.py

=======================================
--- /branches/experimental/parser/src/lexer/even-more-experimental-scanner.cc Wed Nov 20 14:53:10 2013 UTC
+++ /branches/experimental/parser/src/lexer/even-more-experimental-scanner.cc Thu Nov 21 08:55:13 2013 UTC
@@ -69,31 +69,18 @@
   int file_size = ftell(file);
   rewind(file);

-  byte* file_contents = new byte[file_size];
+  *size = file_size * repeat;
+
+  byte* chars = new byte[*size];
   for (int i = 0; i < file_size;) {
-    int read =
-        static_cast<int>(fread(&file_contents[i], 1, file_size - i, file));
+    int read = static_cast<int>(fread(&chars[i], 1, file_size - i, file));
     i += read;
   }
   fclose(file);

-  // If the file contains the UTF16 little endian magic bytes, skip them.
-  // FIXME: what if we see big endian magic bytes? Do we do the right thing for
-  // big endian anyway?
-  byte* start = file_contents;
-  if (*start == 0xff && *(start + 1) == 0xfe) {
-    start += 2;
-    file_size -= 2;
-  }
-
-  *size = file_size * repeat;
-  byte* chars = new byte[*size];
-
-  for (int i = 0; i < *size; i++) {
-    chars[i] = start[i % file_size];
+  for (int i = file_size; i < *size; i++) {
+    chars[i] = chars[i - file_size];
   }
-
-  delete file_contents;

   return chars;
 }
=======================================
--- /branches/experimental/parser/src/lexer/lexer_py.re Thu Nov 21 08:21:45 2013 UTC
+++ /branches/experimental/parser/src/lexer/lexer_py.re Thu Nov 21 08:55:13 2013 UTC
@@ -26,7 +26,8 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 whitespace_char = [:whitespace:];
-whitespace = whitespace_char+;
+byte_order_mark_char = [:byte_order_mark:];
+whitespace = whitespace_char+|byte_order_mark_char+;
 identifier_start = [$_:letter:];
 identifier_char = [:identifier_start::identifier_part_not_letter:];
 line_terminator = [:line_terminator:];
=======================================
--- /branches/experimental/parser/tools/lexer_generator/code_generator.jinja Thu Nov 21 08:21:45 2013 UTC
+++ /branches/experimental/parser/tools/lexer_generator/code_generator.jinja Thu Nov 21 08:55:13 2013 UTC
@@ -18,6 +18,8 @@
              (yych == 0 && cursor_ >= buffer_end_)
       {%- elif r[1] == 'zero' -%}
              (yych == 0 && cursor_ < buffer_end_)
+      {%- elif r[1] == 'byte_order_mark' and encoding == 'utf16'-%}
+        (yych == 0xfffe || yych == 0xfeff)
       {%- elif r[1] == 'non_latin_1_whitespace' and encoding == 'utf16'-%}
         {# FIXME: Add and use unicode_cache_->InNonAsciiWhitespace #}
         (yych > 255 && unicode_cache_->IsWhiteSpace(yych))
=======================================
--- /branches/experimental/parser/tools/lexer_generator/transition_keys.py Thu Nov 21 08:21:45 2013 UTC
+++ /branches/experimental/parser/tools/lexer_generator/transition_keys.py Thu Nov 21 08:55:13 2013 UTC
@@ -46,6 +46,7 @@
     'non_latin1_line_terminator' : (259, 259),
     'eos' : (260, 260),
     'zero' : (261, 261),
+    'byte_order_mark' : (262, 262),
   }
   __lower_bound = min(__class_bounds.values(), key=lambda item: item[0])[0]
   __upper_bound = max(__class_bounds.values(), key=lambda item: item[1])[1]

--
--
v8-dev mailing list
[email protected]
http://groups.google.com/group/v8-dev
---
You received this message because you are subscribed to the Google Groups "v8-dev" group.
To unsubscribe from this group and stop receiving emails from it, send an email 
to [email protected].
For more options, visit https://groups.google.com/groups/opt_out.

Reply via email to