Revision: 19766
Author: [email protected]
Date: Mon Mar 10 16:13:09 2014 UTC
Log: Experimental parser: fix UTF8TO16 handling.
[email protected]
BUG=
Review URL: https://codereview.chromium.org/192643003
http://code.google.com/p/v8/source/detail?r=19766
Modified:
/branches/experimental/parser/src/lexer/lexer-shell.cc
/branches/experimental/parser/tools/lexer_generator/test/run_lexing_tests.py
=======================================
--- /branches/experimental/parser/src/lexer/lexer-shell.cc Wed Mar 5
15:21:46 2014 UTC
+++ /branches/experimental/parser/src/lexer/lexer-shell.cc Mon Mar 10
16:13:09 2014 UTC
@@ -47,49 +47,6 @@
using namespace v8::internal;
-static byte* ReadFile(const char* name, const byte** end, int repeat,
- bool convert_to_utf16) {
- FILE* file = fopen(name, "rb");
- if (file == NULL) return NULL;
-
- fseek(file, 0, SEEK_END);
- int file_size = ftell(file);
- rewind(file);
-
- int size = file_size * repeat;
-
- byte* chars = new byte[size];
- for (int i = 0; i < file_size;) {
- int read = static_cast<int>(fread(&chars[i], 1, file_size - i, file));
- i += read;
- }
- fclose(file);
-
- for (int i = file_size; i < size; i++) {
- chars[i] = chars[i - file_size];
- }
- *end = &chars[size];
-
- if (!convert_to_utf16) return chars;
-
- // Length of new_chars is not strictly accurate, but should be enough.
- uint16_t* new_chars = new uint16_t[size];
- {
- Utf8ToUtf16CharacterStream stream(chars, size);
- uint16_t* cursor = new_chars;
- // uc32 c;
- // The 32-bit char type is probably only so that we can have -1 as a
return
- // value. If the char is not -1, it should fit into 16 bits.
- CHECK(false);
- // while ((c = stream.Advance()) != -1) {
- // *cursor++ = c;
- // }
- *end = reinterpret_cast<byte*>(cursor);
- }
- delete[] chars;
- return reinterpret_cast<byte*>(new_chars);
-}
-
enum Encoding {
LATIN1,
@@ -118,6 +75,90 @@
harmony_modules(false),
harmony_scoping(false) {}
};
+
+
+static uint16_t* ReadFile(const char* name, const uint8_t** end,
+ const LexerShellSettings& settings) {
+ FILE* file = fopen(name, "rb");
+ CHECK(file != NULL);
+
+ fseek(file, 0, SEEK_END);
+ unsigned file_size = ftell(file);
+ rewind(file);
+
+ uint16_t* two_byte_data = new uint16_t[file_size / 2 + file_size % 2];
+
+ uint8_t* char_data = reinterpret_cast<uint8_t*>(two_byte_data);
+ for (unsigned i = 0; i < file_size;) {
+ i += fread(&char_data[i], 1, file_size - i, file);
+ }
+ fclose(file);
+
+ if (settings.encoding == UTF8TO16) {
+ const uint32_t kMaxUtf16Character = 0xffff;
+ // Get utf8 length.
+ unsigned utf16_chars = 0;
+ {
+ unsigned position = 0;
+ while (position < file_size) {
+ uint32_t c = char_data[position];
+ if (c <= unibrow::Utf8::kMaxOneByteChar) {
+ position++;
+ } else {
+ c = unibrow::Utf8::CalculateValue(char_data + position,
+ file_size - position,
+ &position);
+ }
+ if (c > kMaxUtf16Character) {
+ utf16_chars += 2;
+ } else {
+ utf16_chars += 1;
+ }
+ }
+ }
+ // Write new buffer out.
+ uint16_t* data = new uint16_t[utf16_chars];
+ unsigned position = 0;
+ unsigned i = 0;
+ while (position < file_size) {
+ uint32_t c = char_data[position];
+ if (c <= unibrow::Utf8::kMaxOneByteChar) {
+ position++;
+ } else {
+ c = unibrow::Utf8::CalculateValue(char_data + position,
+ file_size - position,
+ &position);
+ }
+ if (c > kMaxUtf16Character) {
+ data[i++] = unibrow::Utf16::LeadSurrogate(c);
+ data[i++] = unibrow::Utf16::TrailSurrogate(c);
+ } else {
+ data[i++] = static_cast<uc16>(c);
+ }
+ }
+ // Swap buffers.
+    delete[] two_byte_data;
+ file_size = utf16_chars * 2;
+ two_byte_data = data;
+ char_data = reinterpret_cast<uint8_t*>(two_byte_data);
+ }
+
+ // Duplicate buffer if necessary.
+ if (settings.repeat > 1) {
+ unsigned size = file_size * settings.repeat;
+ uint16_t* data = new uint16_t[size / 2 + size % 2];
+    char_data = reinterpret_cast<uint8_t*>(data);
+ for (int i = 0; i < settings.repeat; i++) {
+ memcpy(&char_data[i * file_size], two_byte_data, file_size);
+ }
+    delete[] two_byte_data;
+ file_size = size;
+ two_byte_data = data;
+ }
+
+ *end = &char_data[file_size];
+ return two_byte_data;
+}
struct TokenWithLocation {
@@ -193,29 +234,30 @@
}
-static TimeDelta RunLexer(const byte* source,
- const byte* source_end,
+static TimeDelta RunLexer(const uint16_t* source,
+ const uint8_t* source_end,
Isolate* isolate,
std::vector<TokenWithLocation>* tokens,
const LexerShellSettings& settings) {
SmartPointer<Utf16CharacterStream> stream;
+ const uint8_t* one_byte_source = reinterpret_cast<const
uint8_t*>(source);
+ int bytes = source_end - one_byte_source;
switch (settings.encoding) {
case UTF8:
- case UTF8TO16:
- stream.Reset(new Utf8ToUtf16CharacterStream(source, source_end -
source));
+ stream.Reset(new Utf8ToUtf16CharacterStream(one_byte_source, bytes));
break;
+ case UTF8TO16:
case UTF16: {
+ CHECK_EQ(0, bytes % 2);
Handle<String> result = isolate->factory()->NewStringFromTwoByte(
- Vector<const uint16_t>(
- reinterpret_cast<const uint16_t*>(source),
- (source_end - source) / 2));
+ Vector<const uint16_t>(source, bytes / 2));
stream.Reset(
new GenericStringUtf16CharacterStream(result, 0,
result->length()));
break;
}
case LATIN1: {
Handle<String> result = isolate->factory()->NewStringFromOneByte(
- Vector<const uint8_t>(source, source_end - source));
+ Vector<const uint8_t>(one_byte_source, bytes));
stream.Reset(
new GenericStringUtf16CharacterStream(result, 0,
result->length()));
break;
@@ -258,9 +300,9 @@
std::vector<TokenWithLocation> tokens;
TimeDelta time;
{
- const byte* buffer_end = 0;
- const byte* buffer = ReadFile(fname, &buffer_end, settings.repeat,
false);
- if (truncate_by > buffer_end - buffer) {
+ const uint8_t* buffer_end = 0;
+ const uint16_t* buffer = ReadFile(fname, &buffer_end, settings);
+ if (truncate_by > buffer_end - reinterpret_cast<const
uint8_t*>(buffer)) {
*can_truncate = false;
} else {
buffer_end -= truncate_by;
=======================================
---
/branches/experimental/parser/tools/lexer_generator/test/run_lexing_tests.py
Mon Feb 17 10:21:04 2014 UTC
+++
/branches/experimental/parser/tools/lexer_generator/test/run_lexing_tests.py
Mon Mar 10 16:13:09 2014 UTC
@@ -25,36 +25,125 @@
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+import argparse
import subprocess
import sys
+import time
+import logging
-def wait_processes(processes):
- for p in processes:
- if p[1].wait():
- print p[0], 'failed'
- else:
- print p[0], 'ok'
+class ProcessRunner:
+
+ def __init__(self, files, args):
+ self.files = files
+ self.process_map = {}
+ self.complete_processes = {}
+ self.left_path = args.left_path
+ self.right_path = args.right_path
+ self.max_process_count = args.parallel_process_count
+ self.args = ['--break-after-illegal']
+ if args.use_harmony:
+ self.args.append('--use-harmony')
+ self.args.append('--%s' % args.encoding)
+ if self.right_path:
+ self.args.append('--print-tokens')
+
+ def build_process_map(self):
+ process_map = self.process_map
+ for i, f in enumerate(self.files):
+ process_map[2 * i] = {
+ 'file': f, 'path' : self.left_path, 'type' : 'left' }
+ if self.right_path:
+ process_map[2 * i + 1] = {
+          'file': f, 'path' : self.right_path, 'type' : 'right' }
+
+ def wait_processes(self, running_processes):
+ complete_ids = []
+ while True:
+ for i in running_processes:
+ data = self.process_map[i]
+ response = data['process'].poll()
+ if response == None:
+ continue
+ self.complete_processes[i] = data
+ complete_ids.append(i)
+ if complete_ids:
+ break
+ time.sleep(0.001)
+ for i in complete_ids:
+ running_processes.remove(i)
+ del self.process_map[i]
+
+ def process_complete_processes(self):
+ complete_processes = self.complete_processes
+ complete_ids = []
+ for i, data in complete_processes.iteritems():
+ p = data['process']
+ if not self.right_path:
+ if p.returncode:
+ print "%s failed" % data['file']
+ else:
+ print "%s succeeded" % data['file']
+ complete_ids.append(i)
+ else:
+ # TODO(dcarney): perform compare
+ pass
+ # clear processed data
+ for i in complete_ids:
+ del complete_processes[i]
+
+ def run(self):
+ assert not self.process_map
+ self.build_process_map()
+ process_map = self.process_map
+ complete_processes = self.complete_processes
+ running_processes = set()
+ with open('/dev/null', 'w') as dev_null:
+ while True:
+ for id, data in process_map.iteritems():
+ if id in running_processes:
+ continue
+ if len(running_processes) == self.max_process_count:
+ break
+          out = subprocess.PIPE if self.right_path else dev_null
+ args = [data['path'], data['file']] + self.args
+ logging.info("running [%s]" % ' '.join(args))
+ data['process'] = subprocess.Popen(args,
+ stdout=out,
+ stderr=dev_null,
+ bufsize=16*1024)
+ running_processes.add(id)
+ if not running_processes:
+ break
+ self.wait_processes(running_processes)
+ self.process_complete_processes()
+ assert not running_processes
+ assert not self.process_map
+ assert not self.complete_processes
if __name__ == '__main__':
- if len(sys.argv) < 4:
- error_message = ('Usage:' + sys.argv[0] +
- 'LEXER_SHELL_PATH FILE_LIST_FILE
PARALLEL_PROCESS_COUNT ' +
- '[OTHER_ARGS]')
- print >> sys.stderr, error_message
- sys.exit(1)
- lexer_shell = sys.argv[1]
- file_file = sys.argv[2]
- process_count = int(sys.argv[3])
- with open(file_file, 'r') as f:
- test_files = [filename for filename in f.read().split('\n') if
filename]
- with open('/dev/null', 'w') as dev_null:
- processes = []
- for i, f in enumerate(test_files):
- lexer_shell_args = [lexer_shell, f, '--break-after-illegal'] +
sys.argv[4:]
- processes.append((f, subprocess.Popen(lexer_shell_args,
stdout=dev_null)))
- if i % process_count == process_count - 1:
- wait_processes(processes)
- processes = []
+ parser = argparse.ArgumentParser()
+ parser.add_argument('-l', '--left-path')
+ parser.add_argument('-r', '--right-path', default='')
+ parser.add_argument('-i', '--input-files-path', default='')
+ parser.add_argument('-f', '--single-file', default='')
+ parser.add_argument('-p', '--parallel-process-count', default=1,
type=int)
+ parser.add_argument('-e', '--encoding',
+ choices=['latin1', 'utf8', 'utf8to16', 'utf16'], default='utf8')
+ parser.add_argument('--use-harmony', action='store_true')
+ parser.add_argument('-v', '--verbose', action='store_true')
+ args = parser.parse_args()
- wait_processes(processes)
+ if args.verbose:
+ logging.basicConfig(level=logging.INFO)
+
+ files = []
+ if args.input_files_path:
+ with open(args.input_files_path, 'r') as f:
+ files = [filename for filename in f.read().split('\n') if filename]
+ if args.single_file:
+ files.append(args.single_file)
+ assert files
+
+ process_runner = ProcessRunner(files, args)
+ process_runner.run()
--
--
v8-dev mailing list
[email protected]
http://groups.google.com/group/v8-dev
---
You received this message because you are subscribed to the Google Groups "v8-dev" group.
To unsubscribe from this group and stop receiving emails from it, send an email
to [email protected].
For more options, visit https://groups.google.com/d/optout.