Revision: 24510
Author:   [email protected]
Date:     Fri Oct 10 07:13:46 2014 UTC
Log: Allow identifier code points from supplementary multilingual planes.

ES5.1 section 6 ("Source Text"):
"Throughout the rest of this document, the phrase "code unit" and the
word "character" will be used to refer to a 16-bit unsigned value
used to represent a single 16-bit unit of text."

This changed in ES6 draft section 10.1 ("Source Text"):
"The ECMAScript code is expressed using Unicode, version 5.1 or later.
ECMAScript source text is a sequence of code points. All Unicode code
point values from U+0000 to U+10FFFF, including surrogate code points,
may occur in source text where permitted by the ECMAScript grammars."

This patch updates the scanner and the identifier predicates to reflect this spec change.

BUG=v8:3617
LOG=Y
[email protected]

Review URL: https://codereview.chromium.org/640193002
https://code.google.com/p/v8/source/detail?r=24510

Added:
 /branches/bleeding_edge/src/char-predicates.cc
 /branches/bleeding_edge/test/intl/general/smp-identifier.js
 /branches/bleeding_edge/test/mjsunit/parse-surrogates.js
Modified:
 /branches/bleeding_edge/BUILD.gn
 /branches/bleeding_edge/src/char-predicates.h
 /branches/bleeding_edge/src/scanner.h
 /branches/bleeding_edge/test/unittests/unicode/unicode-predicates-unittest.cc
 /branches/bleeding_edge/tools/gyp/v8.gyp

=======================================
--- /dev/null
+++ /branches/bleeding_edge/src/char-predicates.cc Fri Oct 10 07:13:46 2014 UTC
@@ -0,0 +1,42 @@
+// Copyright 2011 the V8 project authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "src/char-predicates.h"
+
+#ifdef V8_I18N_SUPPORT
+#include "unicode/uchar.h"
+#include "unicode/urename.h"
+#endif  // V8_I18N_SUPPORT
+
+namespace v8 {
+namespace internal {
+
+// Returns whether |c|, which must be above U+FFFF, may start an identifier.
+bool SupplementaryPlanes::IsIDStart(uc32 c) {
+  DCHECK(c > 0xFFFF);
+#ifndef V8_I18N_SUPPORT
+  // No ICU available: reject everything.  This is incorrect, but it is the
+  // fallback we use.
+  return false;
+#else
+  // ICU's u_isIDStart does not exclude code points with the properties
+  // 'Pattern_Syntax' or 'Pattern_White_Space', so it is only usable here
+  // because code points in the supplementary planes do not have those
+  // properties.
+  return u_isIDStart(c);
+#endif  // V8_I18N_SUPPORT
+}
+
+
+// Returns whether |c|, which must be above U+FFFF, may continue an
+// identifier.
+bool SupplementaryPlanes::IsIDPart(uc32 c) {
+  DCHECK(c > 0xFFFF);
+#ifndef V8_I18N_SUPPORT
+  // No ICU available: reject everything.  This is incorrect, but it is the
+  // fallback we use.
+  return false;
+#else
+  // ICU's u_isIDPart does not exclude code points with the properties
+  // 'Pattern_Syntax' or 'Pattern_White_Space', so it is only usable here
+  // because code points in the supplementary planes do not have those
+  // properties.
+  return u_isIDPart(c);
+#endif  // V8_I18N_SUPPORT
+}
+}
+}  // namespace v8::internal
=======================================
--- /dev/null
+++ /branches/bleeding_edge/test/intl/general/smp-identifier.js Fri Oct 10 07:13:46 2014 UTC
@@ -0,0 +1,43 @@
+// Copyright 2014 the V8 project authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+function toSurrogatePair(c) {
+  return String.fromCharCode(((c - 0x10000) >>> 10) & 0x3FF | 0xD800) +
+         String.fromCharCode(c & 0x3FF | 0xDC00);
+}
+
+function testIdStart(c, is_id_start) {
+  var source = "var " + toSurrogatePair(c);
+  print(source);
+  if (is_id_start) {
+    assertDoesNotThrow(source);
+  } else {
+    assertThrows(source);
+  }
+}
+
+function testIdPart(c, is_id_start) {
+  var source = "var v" + toSurrogatePair(c);
+  print(source);
+  if (is_id_start) {
+    assertDoesNotThrow(source);
+  } else {
+    assertThrows(source);
+  }
+}
+
+[0x10403, 0x1043C, 0x16F9C, 0x10048, 0x1014D].forEach(function(c) {
+  testIdStart(c, true);
+  testIdPart(c, true);
+});
+
+[0x101FD, 0x11002, 0x104A9].forEach(function(c) {
+  testIdStart(c, false);
+  testIdPart(c, true);
+});
+
+[0x10111, 0x1F4A9].forEach(function(c) {
+  testIdStart(c, false);
+  testIdPart(c, false);
+});
=======================================
--- /dev/null
+++ /branches/bleeding_edge/test/mjsunit/parse-surrogates.js Fri Oct 10 07:13:46 2014 UTC
@@ -0,0 +1,7 @@
+// Copyright 2014 the V8 project authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// Test that the parser throws on unmatched surrogates.
+assertThrows("var \uD801\uABCD;", SyntaxError);
+assertThrows("'\\u000\uD801\uABCD'", SyntaxError);
=======================================
--- /branches/bleeding_edge/BUILD.gn    Wed Oct  8 14:55:03 2014 UTC
+++ /branches/bleeding_edge/BUILD.gn    Fri Oct 10 07:13:46 2014 UTC
@@ -453,6 +453,7 @@
     "src/bytecodes-irregexp.h",
     "src/cached-powers.cc",
     "src/cached-powers.h",
+    "src/char-predicates.cc",
     "src/char-predicates-inl.h",
     "src/char-predicates.h",
     "src/checks.cc",
=======================================
--- /branches/bleeding_edge/src/char-predicates.h Wed Oct 8 14:55:03 2014 UTC
+++ /branches/bleeding_edge/src/char-predicates.h Fri Oct 10 07:13:46 2014 UTC
@@ -22,13 +22,24 @@
 inline bool IsRegExpWord(uc32 c);
 inline bool IsRegExpNewline(uc32 c);

+
+// Identifier predicates for code points above U+FFFF.  Both functions are
+// defined out of line in char-predicates.cc, where they DCHECK that
+// c > 0xFFFF and use ICU when V8_I18N_SUPPORT is defined.
+struct SupplementaryPlanes {
+  static bool IsIDStart(uc32 c);
+  static bool IsIDPart(uc32 c);
+};
+
+
 // ES6 draft section 11.6
 // This includes '_', '$' and '\', and ID_Start according to
 // http://www.unicode.org/reports/tr31/, which consists of categories
 // 'Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Nl', but excluding properties
 // 'Pattern_Syntax' or 'Pattern_White_Space'.
+// For code points in the SMPs, we can resort to ICU (if available).
 struct IdentifierStart {
-  static inline bool Is(uc32 c) { return unibrow::ID_Start::Is(c); }
+  static inline bool Is(uc32 c) {
+    // Code points above U+FFFF are answered by SupplementaryPlanes, all
+    // others by unibrow::ID_Start.
+    return c > 0xFFFF ? SupplementaryPlanes::IsIDStart(c)
+                      : unibrow::ID_Start::Is(c);
+  }
 };


@@ -37,8 +48,10 @@
 // http://www.unicode.org/reports/tr31/, which consists of ID_Start,
 // the categories 'Mn', 'Mc', 'Nd', 'Pc', but excluding properties
 // 'Pattern_Syntax' or 'Pattern_White_Space'.
+// For code points in the SMPs, we can resort to ICU (if available).
 struct IdentifierPart {
   static inline bool Is(uc32 c) {
+    // Code points above U+FFFF are answered by SupplementaryPlanes.
+    if (c > 0xFFFF) return SupplementaryPlanes::IsIDPart(c);
     return unibrow::ID_Start::Is(c) || unibrow::ID_Continue::Is(c);
   }
 };
@@ -49,6 +62,7 @@
 // \u180e stops being one as of Unicode 6.3.0, but ES6 adheres to Unicode 5.1,
 // so it is also included.
 // Further included are \u0009, \u000b, \u0020, \u00a0, \u000c, and \ufeff.
+// There are no category 'Zs' code points in the SMPs.
 struct WhiteSpace {
   static inline bool Is(uc32 c) { return unibrow::WhiteSpace::Is(c); }
 };
=======================================
--- /branches/bleeding_edge/src/scanner.h       Wed Oct  8 14:55:03 2014 UTC
+++ /branches/bleeding_edge/src/scanner.h       Fri Oct 10 07:13:46 2014 UTC
@@ -212,9 +212,17 @@
       }
       ConvertToTwoByte();
     }
-    DCHECK(code_unit < 0x10000u);
-    *reinterpret_cast<uint16_t*>(&backing_store_[position_]) = code_unit;
-    position_ += kUC16Size;
+    if (code_unit <= unibrow::Utf16::kMaxNonSurrogateCharCode) {
+      *reinterpret_cast<uint16_t*>(&backing_store_[position_]) = code_unit;
+      position_ += kUC16Size;
+    } else {
+      *reinterpret_cast<uint16_t*>(&backing_store_[position_]) =
+          unibrow::Utf16::LeadSurrogate(code_unit);
+      position_ += kUC16Size;
+      *reinterpret_cast<uint16_t*>(&backing_store_[position_]) =
+          unibrow::Utf16::TrailSurrogate(code_unit);
+      position_ += kUC16Size;
+    }
   }

   bool is_one_byte() const { return is_one_byte_; }
@@ -519,9 +527,25 @@
   }

   // Low-level scanning support.
-  void Advance() { c0_ = source_->Advance(); }
+  // Reads the next code point into c0_.  A lead surrogate that is directly
+  // followed by a trail surrogate is merged with it into one code point;
+  // otherwise the lead surrogate stays in c0_ unchanged and the unit read
+  // after it is handed back to the source.
+  void Advance() {
+    c0_ = source_->Advance();
+    if (!unibrow::Utf16::IsLeadSurrogate(c0_)) return;
+    uc32 next = source_->Advance();
+    if (unibrow::Utf16::IsTrailSurrogate(next)) {
+      c0_ = unibrow::Utf16::CombineSurrogatePair(c0_, next);
+    } else {
+      source_->PushBack(next);
+    }
+  }
+
   void PushBack(uc32 ch) {
-    source_->PushBack(c0_);
+    // Hands the current code point (c0_) back to the source and makes |ch|
+    // the current one.  Advance() may have merged a surrogate pair into c0_,
+    // so whether one or two code units go back must be decided by looking at
+    // c0_, the value being returned, and not at |ch|.
+    if (c0_ > static_cast<uc32>(unibrow::Utf16::kMaxNonSurrogateCharCode)) {
+      // Trail first, then lead; presumably the source hands pushed-back
+      // units out in reverse order, so the lead is read again first.
+      source_->PushBack(unibrow::Utf16::TrailSurrogate(c0_));
+      source_->PushBack(unibrow::Utf16::LeadSurrogate(c0_));
+    } else {
+      source_->PushBack(c0_);
+    }
     c0_ = ch;
   }

=======================================
--- /branches/bleeding_edge/test/unittests/unicode/unicode-predicates-unittest.cc Wed Oct 8 14:55:03 2014 UTC
+++ /branches/bleeding_edge/test/unittests/unicode/unicode-predicates-unittest.cc Fri Oct 10 07:13:46 2014 UTC
@@ -85,6 +85,37 @@
   // \u2E2F has the Pattern_Syntax property, excluding it from ID_Start.
   EXPECT_FALSE(IdentifierPart::Is(0x2E2F));
 }
+
+
+#ifdef V8_I18N_SUPPORT
+// Exercises IdentifierStart and IdentifierPart on code points above U+FFFF,
+// one sample per general category named in the trailing comments.  The test
+// sits inside a V8_I18N_SUPPORT guard because, without ICU, both predicates
+// return false for every such code point and the EXPECT_TRUE checks would
+// fail.
+TEST(UnicodePredicatesTest, SupplementaryPlaneIdentifiers) {
+  // Both ID_Start and ID_Continue.
+  EXPECT_TRUE(IdentifierStart::Is(0x10403));  // Category Lu
+  EXPECT_TRUE(IdentifierPart::Is(0x10403));
+  EXPECT_TRUE(IdentifierStart::Is(0x1043C));  // Category Ll
+  EXPECT_TRUE(IdentifierPart::Is(0x1043C));
+  EXPECT_TRUE(IdentifierStart::Is(0x16F9C));  // Category Lm
+  EXPECT_TRUE(IdentifierPart::Is(0x16F9C));
+  EXPECT_TRUE(IdentifierStart::Is(0x10048));  // Category Lo
+  EXPECT_TRUE(IdentifierPart::Is(0x10048));
+  EXPECT_TRUE(IdentifierStart::Is(0x1014D));  // Category Nl
+  EXPECT_TRUE(IdentifierPart::Is(0x1014D));
+
+  // Only ID_Continue.
+  EXPECT_FALSE(IdentifierStart::Is(0x101FD));  // Category Mn
+  EXPECT_TRUE(IdentifierPart::Is(0x101FD));
+  EXPECT_FALSE(IdentifierStart::Is(0x11002));  // Category Mc
+  EXPECT_TRUE(IdentifierPart::Is(0x11002));
+  EXPECT_FALSE(IdentifierStart::Is(0x104A9));  // Category Nd
+  EXPECT_TRUE(IdentifierPart::Is(0x104A9));
+
+  // Neither.
+  EXPECT_FALSE(IdentifierStart::Is(0x10111));  // Category No
+  EXPECT_FALSE(IdentifierPart::Is(0x10111));
+  EXPECT_FALSE(IdentifierStart::Is(0x1F4A9));  // Category So
+  EXPECT_FALSE(IdentifierPart::Is(0x1F4A9));
+}
+#endif  // V8_I18N_SUPPORT

 }  // namespace internal
 }  // namespace v8
=======================================
--- /branches/bleeding_edge/tools/gyp/v8.gyp    Wed Oct  8 14:55:03 2014 UTC
+++ /branches/bleeding_edge/tools/gyp/v8.gyp    Fri Oct 10 07:13:46 2014 UTC
@@ -364,6 +364,7 @@
         '../../src/bytecodes-irregexp.h',
         '../../src/cached-powers.cc',
         '../../src/cached-powers.h',
+        '../../src/char-predicates.cc',
         '../../src/char-predicates-inl.h',
         '../../src/char-predicates.h',
         '../../src/checks.cc',

--
--
v8-dev mailing list
[email protected]
http://groups.google.com/group/v8-dev
--- You received this message because you are subscribed to the Google Groups "v8-dev" group.
To unsubscribe from this group and stop receiving emails from it, send an email 
to [email protected].
For more options, visit https://groups.google.com/d/optout.

Reply via email to