Reviewers: jochen, mathias

Description:
Allow identifier code points from supplementary multilingual planes.

ES5.1 section 6 ("Source Text"):
"Throughout the rest of this document, the phrase "code unit" and the
word "character" will be used to refer to a 16-bit unsigned value
used to represent a single 16-bit unit of text."

This changed in ES6 draft section 10.1 ("Source Text"):
"The ECMAScript code is expressed using Unicode, version 5.1 or later.
ECMAScript source text is a sequence of code points. All Unicode code
point values from U+0000 to U+10FFFF, including surrogate code points,
may occur in source text where permitted by the ECMAScript grammars."

This patch reflects that spec change.

BUG=v8:3617
LOG=Y

Please review this at https://codereview.chromium.org/640193002/

SVN Base: https://v8.googlecode.com/svn/branches/bleeding_edge

Affected files (+48, -1 lines):
  M BUILD.gn
  M src/char-predicates.h
  M test/unittests/unicode/unicode-predicates-unittest.cc
  M tools/gyp/v8.gyp


Index: BUILD.gn
diff --git a/BUILD.gn b/BUILD.gn
index 7de6830679d37f75efe24b1241a49789ebc9b5a0..2ff8b86c6a17029a80dd4acdb9fbb6a697c5e866 100644
--- a/BUILD.gn
+++ b/BUILD.gn
@@ -453,6 +453,7 @@ source_set("v8_base") {
     "src/bytecodes-irregexp.h",
     "src/cached-powers.cc",
     "src/cached-powers.h",
+    "src/char-predicates.cc",
     "src/char-predicates-inl.h",
     "src/char-predicates.h",
     "src/checks.cc",
Index: src/char-predicates.h
diff --git a/src/char-predicates.h b/src/char-predicates.h
index bfe7fe18bf6f048ee85d57d1fb38134ea2b0035c..5ecb07de992a6f62b1ad605ee97ce7f81c04a624 100644
--- a/src/char-predicates.h
+++ b/src/char-predicates.h
@@ -22,13 +22,24 @@ inline bool IsBinaryDigit(uc32 c);
 inline bool IsRegExpWord(uc32 c);
 inline bool IsRegExpNewline(uc32 c);

+
+struct SupplementaryPlanes {
+  static bool IsIDStart(uc32 c);
+  static bool IsIDPart(uc32 c);
+};
+
+
 // ES6 draft section 11.6
 // This includes '_', '$' and '\', and ID_Start according to
 // http://www.unicode.org/reports/tr31/, which consists of categories
 // 'Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Nl', but excluding properties
 // 'Pattern_Syntax' or 'Pattern_White_Space'.
+// For code points in the SMPs, we can resort to ICU (if available).
 struct IdentifierStart {
-  static inline bool Is(uc32 c) { return unibrow::ID_Start::Is(c); }
+  static inline bool Is(uc32 c) {
+    if (c > 0xFFFF) return SupplementaryPlanes::IsIDStart(c);
+    return unibrow::ID_Start::Is(c);
+  }
 };


@@ -37,8 +48,10 @@ struct IdentifierStart {
 // http://www.unicode.org/reports/tr31/, which consists of ID_Start,
 // the categories 'Mn', 'Mc', 'Nd', 'Pc', but excluding properties
 // 'Pattern_Syntax' or 'Pattern_White_Space'.
+// For code points in the SMPs, we can resort to ICU (if available).
 struct IdentifierPart {
   static inline bool Is(uc32 c) {
+    if (c > 0xFFFF) return SupplementaryPlanes::IsIDPart(c);
     return unibrow::ID_Start::Is(c) || unibrow::ID_Continue::Is(c);
   }
 };
@@ -49,6 +62,7 @@ struct IdentifierPart {
 // \u180e stops being one as of Unicode 6.3.0, but ES6 adheres to Unicode 5.1,
 // so it is also included.
 // Further included are \u0009, \u000b, \u0020, \u00a0, \u000c, and \ufeff.
+// There are no category 'Zs' code points in the SMPs.
 struct WhiteSpace {
   static inline bool Is(uc32 c) { return unibrow::WhiteSpace::Is(c); }
 };
Index: test/unittests/unicode/unicode-predicates-unittest.cc
diff --git a/test/unittests/unicode/unicode-predicates-unittest.cc b/test/unittests/unicode/unicode-predicates-unittest.cc
index 88008d55bd22e81d33781c39ffe71b070fffe4e9..71f20d28733f72ef790e60b9ada12004b1bd89a8 100644
--- a/test/unittests/unicode/unicode-predicates-unittest.cc
+++ b/test/unittests/unicode/unicode-predicates-unittest.cc
@@ -86,5 +86,36 @@ TEST(UnicodePredicatesTest, IdentifierPart) {
   EXPECT_FALSE(IdentifierPart::Is(0x2E2F));
 }

+
+#ifdef V8_I18N_SUPPORT
+TEST(UnicodePredicatesTest, SupplementaryPlaneIdentifiers) {
+  // Both ID_Start and ID_Continue.
+  EXPECT_TRUE(IdentifierStart::Is(0x10403));  // Category Lu
+  EXPECT_TRUE(IdentifierPart::Is(0x10403));
+  EXPECT_TRUE(IdentifierStart::Is(0x1043C));  // Category Ll
+  EXPECT_TRUE(IdentifierPart::Is(0x1043C));
+  EXPECT_TRUE(IdentifierStart::Is(0x16F9C));  // Category Lm
+  EXPECT_TRUE(IdentifierPart::Is(0x16F9C));
+  EXPECT_TRUE(IdentifierStart::Is(0x10048));  // Category Lo
+  EXPECT_TRUE(IdentifierPart::Is(0x10048));
+  EXPECT_TRUE(IdentifierStart::Is(0x1014D));  // Category Nl
+  EXPECT_TRUE(IdentifierPart::Is(0x1014D));
+
+  // Only ID_Continue.
+  EXPECT_FALSE(IdentifierStart::Is(0x101FD));  // Category Mn
+  EXPECT_TRUE(IdentifierPart::Is(0x101FD));
+  EXPECT_FALSE(IdentifierStart::Is(0x11002));  // Category Mc
+  EXPECT_TRUE(IdentifierPart::Is(0x11002));
+  EXPECT_FALSE(IdentifierStart::Is(0x104A9));  // Category Nd
+  EXPECT_TRUE(IdentifierPart::Is(0x104A9));
+
+  // Neither.
+  EXPECT_FALSE(IdentifierStart::Is(0x10111));  // Category No
+  EXPECT_FALSE(IdentifierPart::Is(0x10111));
+  EXPECT_FALSE(IdentifierStart::Is(0x1F4A9));  // Category So
+  EXPECT_FALSE(IdentifierPart::Is(0x1F4A9));
+}
+#endif  // V8_I18N_SUPPORT
+
 }  // namespace internal
 }  // namespace v8
Index: tools/gyp/v8.gyp
diff --git a/tools/gyp/v8.gyp b/tools/gyp/v8.gyp
index 763eed9a92efac451590c0fc81209b86dece67da..76c92a783f722692589291d7ed94561106a7e9e8 100644
--- a/tools/gyp/v8.gyp
+++ b/tools/gyp/v8.gyp
@@ -364,6 +364,7 @@
         '../../src/bytecodes-irregexp.h',
         '../../src/cached-powers.cc',
         '../../src/cached-powers.h',
+        '../../src/char-predicates.cc',
         '../../src/char-predicates-inl.h',
         '../../src/char-predicates.h',
         '../../src/checks.cc',


--
--
v8-dev mailing list
[email protected]
http://groups.google.com/group/v8-dev
---
You received this message because you are subscribed to the Google Groups "v8-dev" group.
To unsubscribe from this group and stop receiving emails from it, send an email to [email protected].
For more options, visit https://groups.google.com/d/optout.

Reply via email to