Revision: 19766
Author:   [email protected]
Date:     Mon Mar 10 16:13:09 2014 UTC
Log:      Experimental parser: fix UTF8TO16 handling.

[email protected]

BUG=

Review URL: https://codereview.chromium.org/192643003
http://code.google.com/p/v8/source/detail?r=19766

Modified:
 /branches/experimental/parser/src/lexer/lexer-shell.cc
/branches/experimental/parser/tools/lexer_generator/test/run_lexing_tests.py

=======================================
--- /branches/experimental/parser/src/lexer/lexer-shell.cc	Wed Mar 5 15:21:46 2014 UTC
+++ /branches/experimental/parser/src/lexer/lexer-shell.cc	Mon Mar 10 16:13:09 2014 UTC
@@ -47,49 +47,6 @@

 using namespace v8::internal;

-static byte* ReadFile(const char* name, const byte** end, int repeat,
-                      bool convert_to_utf16) {
-  FILE* file = fopen(name, "rb");
-  if (file == NULL) return NULL;
-
-  fseek(file, 0, SEEK_END);
-  int file_size = ftell(file);
-  rewind(file);
-
-  int size = file_size * repeat;
-
-  byte* chars = new byte[size];
-  for (int i = 0; i < file_size;) {
-    int read = static_cast<int>(fread(&chars[i], 1, file_size - i, file));
-    i += read;
-  }
-  fclose(file);
-
-  for (int i = file_size; i < size; i++) {
-    chars[i] = chars[i - file_size];
-  }
-  *end = &chars[size];
-
-  if (!convert_to_utf16) return chars;
-
-  // Length of new_chars is not strictly accurate, but should be enough.
-  uint16_t* new_chars = new uint16_t[size];
-  {
-    Utf8ToUtf16CharacterStream stream(chars, size);
-    uint16_t* cursor = new_chars;
-    // uc32 c;
-    // The 32-bit char type is probably only so that we can have -1 as a return
-    // value. If the char is not -1, it should fit into 16 bits.
-    CHECK(false);
-    // while ((c = stream.Advance()) != -1) {
-    //   *cursor++ = c;
-    // }
-    *end = reinterpret_cast<byte*>(cursor);
-  }
-  delete[] chars;
-  return reinterpret_cast<byte*>(new_chars);
-}
-

 enum Encoding {
   LATIN1,
@@ -118,6 +75,90 @@
         harmony_modules(false),
         harmony_scoping(false) {}
 };
+
+
+// Reads |name| into a freshly new[]-allocated buffer.  If the settings ask
+// for UTF8TO16, the content is decoded from UTF-8 into UTF-16 code units
+// (astral characters become surrogate pairs); the buffer is then duplicated
+// settings.repeat times.  On return, |*end| points one past the last valid
+// byte.  The caller owns the returned buffer and must delete[] it.
+static uint16_t* ReadFile(const char* name, const uint8_t** end,
+                          const LexerShellSettings& settings) {
+  FILE* file = fopen(name, "rb");
+  CHECK(file != NULL);
+
+  fseek(file, 0, SEEK_END);
+  unsigned file_size = ftell(file);
+  rewind(file);
+
+  // Allocate as uint16_t so the storage is suitably aligned for two-byte
+  // reads; round up so an odd byte count still fits.
+  uint16_t* two_byte_data = new uint16_t[file_size / 2 + file_size % 2];
+
+  uint8_t* char_data = reinterpret_cast<uint8_t*>(two_byte_data);
+  for (unsigned i = 0; i < file_size;) {
+    i += fread(&char_data[i], 1, file_size - i, file);
+  }
+  fclose(file);
+
+  if (settings.encoding == UTF8TO16) {
+    const uint32_t kMaxUtf16Character = 0xffff;
+    // First pass: count the UTF-16 code units the UTF-8 input decodes to.
+    // Characters above the BMP need a surrogate pair, i.e. two units.
+    unsigned utf16_chars = 0;
+    {
+      unsigned position = 0;
+      while (position < file_size) {
+        uint32_t c = char_data[position];
+        if (c <= unibrow::Utf8::kMaxOneByteChar) {
+          position++;
+        } else {
+          c = unibrow::Utf8::CalculateValue(char_data + position,
+                                            file_size - position,
+                                            &position);
+        }
+        utf16_chars += (c > kMaxUtf16Character) ? 2 : 1;
+      }
+    }
+    // Second pass: decode into an exactly sized UTF-16 buffer.
+    uint16_t* data = new uint16_t[utf16_chars];
+    unsigned position = 0;
+    unsigned i = 0;
+    while (position < file_size) {
+      uint32_t c = char_data[position];
+      if (c <= unibrow::Utf8::kMaxOneByteChar) {
+        position++;
+      } else {
+        c = unibrow::Utf8::CalculateValue(char_data + position,
+                                          file_size - position,
+                                          &position);
+      }
+      if (c > kMaxUtf16Character) {
+        data[i++] = unibrow::Utf16::LeadSurrogate(c);
+        data[i++] = unibrow::Utf16::TrailSurrogate(c);
+      } else {
+        data[i++] = static_cast<uc16>(c);
+      }
+    }
+    // Swap buffers.  delete[] (not delete) must match the new[] above.
+    delete[] two_byte_data;
+    file_size = utf16_chars * 2;
+    two_byte_data = data;
+    char_data = reinterpret_cast<uint8_t*>(two_byte_data);
+  }
+
+  // Duplicate buffer if necessary.
+  if (settings.repeat > 1) {
+    unsigned size = file_size * settings.repeat;
+    uint16_t* data = new uint16_t[size / 2 + size % 2];
+    // Point char_data at the NEW buffer before copying.  Aiming it at
+    // two_byte_data made the memcpy below a self-overwriting overflow of
+    // the old buffer while the returned buffer stayed uninitialized.
+    char_data = reinterpret_cast<uint8_t*>(data);
+    for (int i = 0; i < settings.repeat; i++) {
+      memcpy(&char_data[i * file_size], two_byte_data, file_size);
+    }
+    delete[] two_byte_data;
+    file_size = size;
+    two_byte_data = data;
+  }
+
+  *end = &char_data[file_size];
+  return two_byte_data;
+}


 struct TokenWithLocation {
@@ -193,29 +234,30 @@
 }


-static TimeDelta RunLexer(const byte* source,
-                          const byte* source_end,
+static TimeDelta RunLexer(const uint16_t* source,
+                          const uint8_t* source_end,
                           Isolate* isolate,
                           std::vector<TokenWithLocation>* tokens,
                           const LexerShellSettings& settings) {
   SmartPointer<Utf16CharacterStream> stream;
+ const uint8_t* one_byte_source = reinterpret_cast<const uint8_t*>(source);
+  int bytes = source_end - one_byte_source;
   switch (settings.encoding) {
     case UTF8:
-    case UTF8TO16:
- stream.Reset(new Utf8ToUtf16CharacterStream(source, source_end - source));
+      stream.Reset(new Utf8ToUtf16CharacterStream(one_byte_source, bytes));
       break;
+    case UTF8TO16:
     case UTF16: {
+      CHECK_EQ(0, bytes % 2);
       Handle<String> result = isolate->factory()->NewStringFromTwoByte(
-          Vector<const uint16_t>(
-              reinterpret_cast<const uint16_t*>(source),
-              (source_end - source) / 2));
+          Vector<const uint16_t>(source, bytes / 2));
       stream.Reset(
new GenericStringUtf16CharacterStream(result, 0, result->length()));
       break;
     }
     case LATIN1: {
       Handle<String> result = isolate->factory()->NewStringFromOneByte(
-          Vector<const uint8_t>(source, source_end - source));
+          Vector<const uint8_t>(one_byte_source, bytes));
       stream.Reset(
new GenericStringUtf16CharacterStream(result, 0, result->length()));
       break;
@@ -258,9 +300,9 @@
   std::vector<TokenWithLocation> tokens;
   TimeDelta time;
   {
-    const byte* buffer_end = 0;
-    const byte* buffer = ReadFile(fname, &buffer_end, settings.repeat, false);
-    if (truncate_by > buffer_end - buffer) {
+    const uint8_t* buffer_end = 0;
+    const uint16_t* buffer = ReadFile(fname, &buffer_end, settings);
+ if (truncate_by > buffer_end - reinterpret_cast<const uint8_t*>(buffer)) {
       *can_truncate = false;
     } else {
       buffer_end -= truncate_by;
=======================================
--- /branches/experimental/parser/tools/lexer_generator/test/run_lexing_tests.py	Mon Feb 17 10:21:04 2014 UTC
+++ /branches/experimental/parser/tools/lexer_generator/test/run_lexing_tests.py	Mon Mar 10 16:13:09 2014 UTC
@@ -25,36 +25,125 @@
 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

+import argparse
 import subprocess
 import sys
+import time
+import logging

-def wait_processes(processes):
-  for p in processes:
-    if p[1].wait():
-      print p[0], 'failed'
-    else:
-      print p[0], 'ok'
+class ProcessRunner:
+
+  def __init__(self, files, args):
+    self.files = files
+    self.process_map = {}
+    self.complete_processes = {}
+    self.left_path = args.left_path
+    self.right_path = args.right_path
+    self.max_process_count = args.parallel_process_count
+    self.args = ['--break-after-illegal']
+    if args.use_harmony:
+      self.args.append('--use-harmony')
+    self.args.append('--%s' % args.encoding)
+    if self.right_path:
+      self.args.append('--print-tokens')
+
+  def build_process_map(self):
+    process_map = self.process_map
+    for i, f in enumerate(self.files):
+      process_map[2 * i] = {
+        'file': f, 'path' : self.left_path, 'type' : 'left' }
+      if self.right_path:
+        process_map[2 * i + 1] = {
+          'file': f, 'path' : self.right, 'type' : 'right' }
+
+  def wait_processes(self, running_processes):
+    complete_ids = []
+    while True:
+      for i in running_processes:
+        data = self.process_map[i]
+        response = data['process'].poll()
+        if response == None:
+          continue
+        self.complete_processes[i] = data
+        complete_ids.append(i)
+      if complete_ids:
+        break
+      time.sleep(0.001)
+    for i in complete_ids:
+      running_processes.remove(i)
+      del self.process_map[i]
+
+  def process_complete_processes(self):
+    complete_processes = self.complete_processes
+    complete_ids = []
+    for i, data in complete_processes.iteritems():
+      p = data['process']
+      if not self.right_path:
+        if p.returncode:
+          print "%s failed" % data['file']
+        else:
+          print "%s succeeded" % data['file']
+        complete_ids.append(i)
+      else:
+        # TODO(dcarney): perform compare
+        pass
+    # clear processed data
+    for i in complete_ids:
+      del complete_processes[i]
+
+  def run(self):
+    assert not self.process_map
+    self.build_process_map()
+    process_map = self.process_map
+    complete_processes = self.complete_processes
+    running_processes = set()
+    with open('/dev/null', 'w') as dev_null:
+      while True:
+        for id, data in process_map.iteritems():
+          if id in running_processes:
+            continue
+          if len(running_processes) == self.max_process_count:
+            break
+          out = sys.PIPE if self.right_path else dev_null
+          args = [data['path'], data['file']] + self.args
+          logging.info("running [%s]" % ' '.join(args))
+          data['process'] = subprocess.Popen(args,
+                                             stdout=out,
+                                             stderr=dev_null,
+                                             bufsize=16*1024)
+          running_processes.add(id)
+        if not running_processes:
+          break
+        self.wait_processes(running_processes)
+        self.process_complete_processes()
+    assert not running_processes
+    assert not self.process_map
+    assert not self.complete_processes

 if __name__ == '__main__':
-  if len(sys.argv) < 4:
-    error_message = ('Usage:' + sys.argv[0] +
- 'LEXER_SHELL_PATH FILE_LIST_FILE PARALLEL_PROCESS_COUNT ' +
-                     '[OTHER_ARGS]')
-    print >> sys.stderr, error_message
-    sys.exit(1)
-  lexer_shell = sys.argv[1]
-  file_file = sys.argv[2]
-  process_count = int(sys.argv[3])
-  with open(file_file, 'r') as f:
- test_files = [filename for filename in f.read().split('\n') if filename]

-  with open('/dev/null', 'w') as dev_null:
-    processes = []
-    for i, f in enumerate(test_files):
-      lexer_shell_args = [lexer_shell, f, '--break-after-illegal'] + sys.argv[4:]
-      processes.append((f, subprocess.Popen(lexer_shell_args, stdout=dev_null)))
-      if i % process_count == process_count - 1:
-        wait_processes(processes)
-        processes = []
+  parser = argparse.ArgumentParser()
+  parser.add_argument('-l', '--left-path')
+  parser.add_argument('-r', '--right-path', default='')
+  parser.add_argument('-i', '--input-files-path', default='')
+  parser.add_argument('-f', '--single-file', default='')
+  parser.add_argument('-p', '--parallel-process-count', default=1, type=int)
+  parser.add_argument('-e', '--encoding',
+    choices=['latin1', 'utf8', 'utf8to16', 'utf16'], default='utf8')
+  parser.add_argument('--use-harmony', action='store_true')
+  parser.add_argument('-v', '--verbose', action='store_true')
+  args = parser.parse_args()

-    wait_processes(processes)
+  if args.verbose:
+    logging.basicConfig(level=logging.INFO)
+
+  files = []
+  if args.input_files_path:
+    with open(args.input_files_path, 'r') as f:
+      files = [filename for filename in f.read().split('\n') if filename]
+  if args.single_file:
+    files.append(args.single_file)
+  assert files
+
+  process_runner = ProcessRunner(files, args)
+  process_runner.run()

--
--
v8-dev mailing list
[email protected]
http://groups.google.com/group/v8-dev
--- You received this message because you are subscribed to the Google Groups "v8-dev" group.
To unsubscribe from this group and stop receiving emails from it, send an email 
to [email protected].
For more options, visit https://groups.google.com/d/optout.

Reply via email to