Revision: 17947
Author: [email protected]
Date: Thu Nov 21 08:55:13 2013 UTC
Log: Experimental lexer generator: Fix byte order mark detection.
BUG=
[email protected]
Review URL: https://codereview.chromium.org/80503002
http://code.google.com/p/v8/source/detail?r=17947
Modified:
/branches/experimental/parser/src/lexer/even-more-experimental-scanner.cc
/branches/experimental/parser/src/lexer/lexer_py.re
/branches/experimental/parser/tools/lexer_generator/code_generator.jinja
/branches/experimental/parser/tools/lexer_generator/transition_keys.py
=======================================
--- /branches/experimental/parser/src/lexer/even-more-experimental-scanner.cc Wed Nov 20 14:53:10 2013 UTC
+++ /branches/experimental/parser/src/lexer/even-more-experimental-scanner.cc Thu Nov 21 08:55:13 2013 UTC
@@ -69,31 +69,18 @@
int file_size = ftell(file);
rewind(file);
- byte* file_contents = new byte[file_size];
+ *size = file_size * repeat;
+
+ byte* chars = new byte[*size];
for (int i = 0; i < file_size;) {
- int read =
- static_cast<int>(fread(&file_contents[i], 1, file_size - i, file));
+ int read = static_cast<int>(fread(&chars[i], 1, file_size - i, file));
i += read;
}
fclose(file);
- // If the file contains the UTF16 little endian magic bytes, skip them.
- // FIXME: what if we see big endian magic bytes? Do we do the right thing for
- // big endian anyway?
- byte* start = file_contents;
- if (*start == 0xff && *(start + 1) == 0xfe) {
- start += 2;
- file_size -= 2;
- }
-
- *size = file_size * repeat;
- byte* chars = new byte[*size];
-
- for (int i = 0; i < *size; i++) {
- chars[i] = start[i % file_size];
+ for (int i = file_size; i < *size; i++) {
+ chars[i] = chars[i - file_size];
}
-
- delete file_contents;
return chars;
}
=======================================
--- /branches/experimental/parser/src/lexer/lexer_py.re Thu Nov 21 08:21:45 2013 UTC
+++ /branches/experimental/parser/src/lexer/lexer_py.re Thu Nov 21 08:55:13 2013 UTC
@@ -26,7 +26,8 @@
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
whitespace_char = [:whitespace:];
-whitespace = whitespace_char+;
+byte_order_mark_char = [:byte_order_mark:];
+whitespace = whitespace_char+|byte_order_mark_char+;
identifier_start = [$_:letter:];
identifier_char = [:identifier_start::identifier_part_not_letter:];
line_terminator = [:line_terminator:];
=======================================
--- /branches/experimental/parser/tools/lexer_generator/code_generator.jinja Thu Nov 21 08:21:45 2013 UTC
+++ /branches/experimental/parser/tools/lexer_generator/code_generator.jinja Thu Nov 21 08:55:13 2013 UTC
@@ -18,6 +18,8 @@
(yych == 0 && cursor_ >= buffer_end_)
{%- elif r[1] == 'zero' -%}
(yych == 0 && cursor_ < buffer_end_)
+ {%- elif r[1] == 'byte_order_mark' and encoding == 'utf16'-%}
+ (yych == 0xfffe || yych == 0xfeff)
{%- elif r[1] == 'non_latin_1_whitespace' and encoding == 'utf16'-%}
{# FIXME: Add and use unicode_cache_->InNonAsciiWhitespace #}
(yych > 255 && unicode_cache_->IsWhiteSpace(yych))
=======================================
--- /branches/experimental/parser/tools/lexer_generator/transition_keys.py Thu Nov 21 08:21:45 2013 UTC
+++ /branches/experimental/parser/tools/lexer_generator/transition_keys.py Thu Nov 21 08:55:13 2013 UTC
@@ -46,6 +46,7 @@
'non_latin1_line_terminator' : (259, 259),
'eos' : (260, 260),
'zero' : (261, 261),
+ 'byte_order_mark' : (262, 262),
}
__lower_bound = min(__class_bounds.values(), key=lambda item: item[0])[0]
__upper_bound = max(__class_bounds.values(), key=lambda item: item[1])[1]
--
--
v8-dev mailing list
[email protected]
http://groups.google.com/group/v8-dev
---
You received this message because you are subscribed to the Google Groups "v8-dev" group.
To unsubscribe from this group and stop receiving emails from it, send an email
to [email protected].
For more options, visit https://groups.google.com/groups/opt_out.