Revision: 19999
Author: [email protected]
Date: Mon Mar 17 14:04:22 2014 UTC
Log: Experimental parser: add utf8tolatin1 conversion
[email protected]
BUG=
Review URL: https://codereview.chromium.org/196943021
http://code.google.com/p/v8/source/detail?r=19999
Modified:
/branches/experimental/parser/src/lexer/lexer-shell.cc
/branches/experimental/parser/tools/lexer_generator/test/run_lexing_tests.py
=======================================
--- /branches/experimental/parser/src/lexer/lexer-shell.cc Fri Mar 14 20:36:21 2014 UTC
+++ /branches/experimental/parser/src/lexer/lexer-shell.cc Mon Mar 17 14:04:22 2014 UTC
@@ -53,7 +53,7 @@
UTF8,
UTF16,
UTF8TO16, // Convert stream via scanner input stream
- UTF8TO16_PRECONVERT // Convert stream during file read
+ UTF8TOLATIN1, // Convert stream via scanner input stream
};
@@ -81,12 +81,14 @@
static uint16_t* ConvertUtf8ToUtf16(const uint16_t* const data_in,
- unsigned* length) {
+ unsigned* length,
+ bool* is_one_byte) {
const unsigned file_size = *length;
const uint8_t* char_data = reinterpret_cast<const uint8_t*>(data_in);
const uint32_t kMaxUtf16Character = 0xffff;
// Get utf8 length.
unsigned utf16_chars = 0;
+ *is_one_byte = true;
{
unsigned position = 0;
while (position < file_size) {
@@ -94,6 +96,7 @@
if (c <= unibrow::Utf8::kMaxOneByteChar) {
position++;
} else {
+ *is_one_byte = false;
c = unibrow::Utf8::CalculateValue(char_data + position,
file_size - position,
&position);
@@ -128,6 +131,17 @@
*length = 2 * utf16_chars;
return data;
}
+
+
+static uint16_t* ConvertUtf16ToLatin1(const uint16_t* const data_in,
+ unsigned* length) {
+ const unsigned size = *length / 2 + *length % 2;
+ uint16_t* data = new uint16_t[size];
+ uint8_t* char_data = reinterpret_cast<uint8_t*>(data);
+ CopyChars(char_data, data_in, size);
+ *length = size;
+ return data;
+}
static uint16_t* Repeat(int repeat,
@@ -166,16 +180,32 @@
static uint16_t* ReadFile(const char* name,
const LexerShellSettings& settings,
- unsigned* length) {
+ unsigned* length,
+ Encoding* output_encoding) {
uint16_t* data = ReadFile(name, length);
CHECK_GE(*length, 0);
if (*length == 0) return data;
- if (settings.encoding == UTF8TO16_PRECONVERT) {
- uint16_t* new_data = ConvertUtf8ToUtf16(data, length);
+ *output_encoding = settings.encoding;
+
+ if (settings.encoding == UTF8TO16 ||
+ settings.encoding == UTF8TOLATIN1) {
+ bool is_one_byte;
+ uint16_t* new_data = ConvertUtf8ToUtf16(data, length, &is_one_byte);
+ if (settings.encoding == UTF8TOLATIN1 && is_one_byte) {
+ *output_encoding = LATIN1;
+ } else {
+ *output_encoding = UTF16;
+ }
delete data;
data = new_data;
}
+
+ if (settings.encoding == UTF8TOLATIN1 && *output_encoding == LATIN1) {
+ uint16_t* new_data = ConvertUtf16ToLatin1(data, length);
+ delete data;
+ data = new_data;
+ }
if (settings.repeat > 1) {
uint16_t* new_data = Repeat(settings.repeat, data, length);
@@ -265,16 +295,15 @@
static TimeDelta RunLexer(const uint16_t* source,
const uint8_t* source_end,
Isolate* isolate,
+ Encoding output_encoding,
const LexerShellSettings& settings) {
SmartPointer<Utf16CharacterStream> stream;
const uint8_t* one_byte_source = reinterpret_cast<const uint8_t*>(source);
int bytes = source_end - one_byte_source;
- switch (settings.encoding) {
- case UTF8TO16:
+ switch (output_encoding) {
case UTF8:
stream.Reset(new Utf8ToUtf16CharacterStream(one_byte_source, bytes));
break;
- case UTF8TO16_PRECONVERT:
case UTF16: {
CHECK_EQ(0, bytes % 2);
Handle<String> result = isolate->factory()->NewStringFromTwoByte(
@@ -290,6 +319,9 @@
new GenericStringUtf16CharacterStream(result, 0,
result->length()));
break;
}
+ case UTF8TO16:
+ case UTF8TOLATIN1:
+ CHECK(false);
}
Scanner scanner(isolate->unicode_cache());
scanner.SetHarmonyNumericLiterals(settings.harmony_numeric_literals);
@@ -340,14 +372,16 @@
TimeDelta time;
{
unsigned length_in_bytes;
- const uint16_t* buffer = ReadFile(fname, settings, &length_in_bytes);
+ Encoding output_encoding;
+ const uint16_t* buffer =
+ ReadFile(fname, settings, &length_in_bytes, &output_encoding);
const uint8_t* char_data = reinterpret_cast<const uint8_t*>(buffer);
const uint8_t* buffer_end = &char_data[length_in_bytes];
if (truncate_by > buffer_end - char_data) {
*can_truncate = false;
} else {
buffer_end -= truncate_by;
- time = RunLexer(buffer, buffer_end, isolate, settings);
+ time = RunLexer(buffer, buffer_end, isolate, output_encoding, settings);
}
delete[] buffer;
}
@@ -370,9 +404,15 @@
settings.encoding = UTF16;
} else if (strcmp(argv[i], "--utf8to16") == 0) {
#ifdef V8_USE_GENERATED_LEXER
- settings.encoding = UTF8TO16_PRECONVERT;
-#else
settings.encoding = UTF8TO16;
+#else
+ settings.encoding = UTF8;
+#endif
+ } else if (strcmp(argv[i], "--utf8tolatin1") == 0) {
+#ifdef V8_USE_GENERATED_LEXER
+ settings.encoding = UTF8TOLATIN1;
+#else
+ settings.encoding = UTF8;
#endif
} else if (strcmp(argv[i], "--print-tokens") == 0) {
settings.print_tokens = true;
=======================================
--- /branches/experimental/parser/tools/lexer_generator/test/run_lexing_tests.py Tue Mar 11 12:27:04 2014 UTC
+++ /branches/experimental/parser/tools/lexer_generator/test/run_lexing_tests.py Mon Mar 17 14:04:22 2014 UTC
@@ -105,6 +105,26 @@
data['buffer'].append(data['process'].stdout.read())
return ''.join(data['buffer'])
+ @staticmethod
+ def analyse_diff(left_data, right_data):
+ left = left_data.split("\n");
+ right = right_data.split("\n");
+ for i in range(min(len(left), len(right))):
+ if left[i] != right[i]:
+ message = "differ at token %d" % i
+ for j in range(i-4, i-1):
+ if j >= 0:
+ message += "\n\n%s\n%s" % (left[j], right[j])
+ message += "\n\n%s\n%s\n" % (left[i], right[i])
+ logging.info(message)
+ return
+ if len(right) > len(left):
+ logging.info("right longer")
+ return
+ if len(left) > len(right):
+ logging.info("left longer")
+ return
+
def compare_results(self, left, right):
f = left['file']
assert f == right['file']
@@ -117,7 +137,7 @@
left_data = self.buffer_contents(left)
right_data = self.buffer_contents(right)
if left_data != right_data:
- # TODO(dcarney): analyse differences
+ self.analyse_diff(left_data, right_data)
print "%s failed" % f
return
print "%s succeeded" % f
@@ -184,7 +204,8 @@
parser.add_argument('-f', '--single-file', default='')
parser.add_argument('-p', '--parallel-process-count', default=1,
type=int)
parser.add_argument('-e', '--encoding',
- choices=['latin1', 'utf8', 'utf8to16', 'utf16'], default='utf8')
+ choices=['latin1', 'utf8', 'utf16', 'utf8to16', 'utf8tolatin1'],
+ default='utf8')
parser.add_argument('--use-harmony', action='store_true')
parser.add_argument('-v', '--verbose', action='store_true')
args = parser.parse_args()
--
--
v8-dev mailing list
[email protected]
http://groups.google.com/group/v8-dev
---
You received this message because you are subscribed to the Google Groups "v8-dev" group.
To unsubscribe from this group and stop receiving emails from it, send an email
to [email protected].
For more options, visit https://groups.google.com/d/optout.