Reviewers: dcarney,
Message:
Committed patchset #1 manually as r17947.
Description:
Experimental lexer generator: Fix byte order mark detection.
BUG=
[email protected]
Committed: https://code.google.com/p/v8/source/detail?r=17947
Please review this at https://codereview.chromium.org/80503002/
SVN Base: https://v8.googlecode.com/svn/branches/experimental/parser
Affected files (+11, -20 lines):
M src/lexer/even-more-experimental-scanner.cc
M src/lexer/lexer_py.re
M tools/lexer_generator/code_generator.jinja
M tools/lexer_generator/transition_keys.py
Index: src/lexer/even-more-experimental-scanner.cc
diff --git a/src/lexer/even-more-experimental-scanner.cc b/src/lexer/even-more-experimental-scanner.cc
index 96c753f7d372b46c7072c3bc005f90cb6cbd50df..ac26d1d16bb1c6cabe012865138c19871372355b 100644
--- a/src/lexer/even-more-experimental-scanner.cc
+++ b/src/lexer/even-more-experimental-scanner.cc
@@ -69,32 +69,19 @@ const byte* ReadFile(const char* name, Isolate* isolate,
int file_size = ftell(file);
rewind(file);
- byte* file_contents = new byte[file_size];
+ *size = file_size * repeat;
+
+ byte* chars = new byte[*size];
for (int i = 0; i < file_size;) {
- int read =
- static_cast<int>(fread(&file_contents[i], 1, file_size - i, file));
+ int read = static_cast<int>(fread(&chars[i], 1, file_size - i, file));
i += read;
}
fclose(file);
- // If the file contains the UTF16 little endian magic bytes, skip them.
- // FIXME: what if we see big endian magic bytes? Do we do the right thing for
- // big endian anyway?
- byte* start = file_contents;
- if (*start == 0xff && *(start + 1) == 0xfe) {
- start += 2;
- file_size -= 2;
+ for (int i = file_size; i < *size; i++) {
+ chars[i] = chars[i - file_size];
}
- *size = file_size * repeat;
- byte* chars = new byte[*size];
-
- for (int i = 0; i < *size; i++) {
- chars[i] = start[i % file_size];
- }
-
- delete file_contents;
-
return chars;
}
Index: src/lexer/lexer_py.re
diff --git a/src/lexer/lexer_py.re b/src/lexer/lexer_py.re
index 2ce0db54899469526637f52b81bc1ec5b4de2b82..b9c75d1caccb69fff8e13cde2886282be15dbb87 100644
--- a/src/lexer/lexer_py.re
+++ b/src/lexer/lexer_py.re
@@ -26,7 +26,8 @@
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
whitespace_char = [:whitespace:];
-whitespace = whitespace_char+;
+byte_order_mark_char = [:byte_order_mark:];
+whitespace = whitespace_char+|byte_order_mark_char+;
identifier_start = [$_:letter:];
identifier_char = [:identifier_start::identifier_part_not_letter:];
line_terminator = [:line_terminator:];
Index: tools/lexer_generator/code_generator.jinja
diff --git a/tools/lexer_generator/code_generator.jinja b/tools/lexer_generator/code_generator.jinja
index 580c7896035f23ddaf7d61e385482d21fd21f15b..097de4f9fda067aace27de0a228e8c2dab222647 100644
--- a/tools/lexer_generator/code_generator.jinja
+++ b/tools/lexer_generator/code_generator.jinja
@@ -18,6 +18,8 @@
(yych == 0 && cursor_ >= buffer_end_)
{%- elif r[1] == 'zero' -%}
(yych == 0 && cursor_ < buffer_end_)
+ {%- elif r[1] == 'byte_order_mark' and encoding == 'utf16'-%}
+ (yych == 0xfffe || yych == 0xfeff)
{%- elif r[1] == 'non_latin_1_whitespace' and encoding == 'utf16'-%}
{# FIXME: Add and use unicode_cache_->InNonAsciiWhitespace #}
(yych > 255 && unicode_cache_->IsWhiteSpace(yych))
Index: tools/lexer_generator/transition_keys.py
diff --git a/tools/lexer_generator/transition_keys.py b/tools/lexer_generator/transition_keys.py
index ca99b74f01e24eab3ed0e9bb2b4e021c3c930284..d6bd0e8e8baebb02805c8d52b423446d20454f69 100644
--- a/tools/lexer_generator/transition_keys.py
+++ b/tools/lexer_generator/transition_keys.py
@@ -46,6 +46,7 @@ class TransitionKey:
'non_latin1_line_terminator' : (259, 259),
'eos' : (260, 260),
'zero' : (261, 261),
+ 'byte_order_mark' : (262, 262),
}
__lower_bound = min(__class_bounds.values(), key=lambda item: item[0])[0]
__upper_bound = max(__class_bounds.values(), key=lambda item: item[1])[1]
--
--
v8-dev mailing list
[email protected]
http://groups.google.com/group/v8-dev
---
You received this message because you are subscribed to the Google Groups "v8-dev" group.
To unsubscribe from this group and stop receiving emails from it, send an email
to [email protected].
For more options, visit https://groups.google.com/groups/opt_out.