[v8-dev] [v8] r19196 committed - Fix inconsistencies wrt whitespaces....

codesite-noreply Fri, 07 Feb 2014 04:35:41 -0800

Revision: 19196
Author:   [email protected]
Date:     Fri Feb  7 12:34:45 2014 UTC
Log:      Fix inconsistencies wrt whitespaces.

\u0085 (NEL) is now considered a whitespace in accordance to http://www.unicode.org/Public/6.3.0/ucd/PropList.txt


[email protected]
BUG=v8:3109
LOG=Y

Review URL: https://codereview.chromium.org/146983007
http://code.google.com/p/v8/source/detail?r=19196

Added:
 /branches/bleeding_edge/test/mjsunit/whitespaces.js
Modified:
 /branches/bleeding_edge/src/arm/regexp-macro-assembler-arm.cc
 /branches/bleeding_edge/src/char-predicates.h
 /branches/bleeding_edge/src/ia32/regexp-macro-assembler-ia32.cc
 /branches/bleeding_edge/src/jsregexp.cc
 /branches/bleeding_edge/src/runtime.cc
 /branches/bleeding_edge/src/scanner.h
 /branches/bleeding_edge/src/x64/regexp-macro-assembler-x64.cc
 /branches/bleeding_edge/test/cctest/test-regexp.cc
 /branches/bleeding_edge/test/mjsunit/third_party/string-trim.js

=======================================
--- /dev/null
+++ /branches/bleeding_edge/test/mjsunit/whitespaces.js Fri Feb  7 12:34:45 2014 UTC
@@ -0,0 +1,134 @@
+// Copyright 2014 the V8 project authors. All rights reserved.
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+//       notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+//       copyright notice, this list of conditions and the following
+//       disclaimer in the documentation and/or other materials provided
+//       with the distribution.
+//     * Neither the name of Google Inc. nor the names of its
+//       contributors may be used to endorse or promote products derived
+//       from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+var whitespaces = [
+  // Whitespaces defined in ECMA-262 5.1, 7.2
+  0x0009,  // Tab              TAB
+  0x000B,  // Vertical Tab     VT
+  0x000C,  // Form Feed        FF
+  0x0020,  // Space            SP
+  0x00A0,  // No-break space   NBSP
+  0xFEFF,  // Byte Order Mark  BOM
+  // Unicode whitespaces
+  0x000A,  // Line Feed        LF
+  0x000D,  // Carriage Return  CR
+  0x0085,  // Next Line        NEL
+  0x1680,  // Ogham Space Mark
+  0x180E,  // Mongolian Vowel Separator
+  0x2000,  // EN QUAD
+  0x2001,  // EM QUAD
+  0x2002,  // EN SPACE
+  0x2003,  // EM SPACE
+  0x2004,  // THREE-PER-EM SPACE
+  0x2005,  // FOUR-PER-EM SPACE
+  0x2006,  // SIX-PER-EM SPACE
+  0x2007,  // FIGURE SPACE
+  0x2008,  // PUNCTUATION SPACE
+  0x2009,  // THIN SPACE
+  0x200A,  // HAIR SPACE
+  0x2028,  // LINE SEPARATOR
+  0x2029,  // PARAGRAPH SEPARATOR
+  0x202F,  // NARROW NO-BREAK SPACE
+  0x205F,  // MEDIUM MATHEMATICAL SPACE
+  0x3000,  // IDEOGRAPHIC SPACE
+];
+
+// Add single twobyte char to force twobyte representation.
+// Interestingly, snowman is not "white" space :)
+var twobyte = "\u2603";
+var onebyte = "\u007E";
+var twobytespace = "\u2000";
+var onebytespace = "\u0020";
+
+function is_whitespace(c) {
+  return whitespaces.indexOf(c.charCodeAt(0)) > -1;
+}
+
+function test_regexp(str) {
+  var pos_match = str.match(/\s/);
+  var neg_match = str.match(/\S/);
+  var test_char = str[0];
+  var postfix = str[1];
+  if (is_whitespace(test_char)) {
+    assertEquals(test_char, pos_match[0]);
+    assertEquals(postfix, neg_match[0]);
+  } else {
+    assertEquals(test_char, neg_match[0]);
+    assertNull(pos_match);
+  }
+}
+
+function test_trim(c, infix) {
+  var str = c + c + c + infix + c;
+  if (is_whitespace(c)) {
+    assertEquals(infix, str.trim());
+  } else {
+    assertEquals(str, str.trim());
+  }
+}
+
+function test_parseInt(c, postfix) {
+  // Skip if prefix is a digit.
+  if (c >= "0" && c <= 9) return;
+  var str = c + c + "123" + postfix;
+  if (is_whitespace(c)) {
+    assertEquals(123, parseInt(str));
+  } else {
+    assertEquals(NaN, parseInt(str));
+  }
+}
+
+function test_eval(c, content) {
+  if (!is_whitespace(c)) return;
+  var str = c + c + "'" + content + "'" + c + c;
+  assertEquals(content, eval(str));
+}
+
+function test_stringtonumber(c, postfix) {
+  // Skip if prefix is a digit.
+  if (c >= "0" && c <= 9) return;
+  var result = 1 + Number(c + "123" + c + postfix);
+  if (is_whitespace(c)) {
+    assertEquals(124, result);
+  } else {
+    assertEquals(NaN, result);
+  }
+}
+
+for (var i = 0; i < 0x10000; i++) {
+  c = String.fromCharCode(i);
+  test_regexp(c + onebyte);
+  test_regexp(c + twobyte);
+  test_trim(c, onebyte + "trim");
+  test_trim(c, twobyte + "trim");
+  test_parseInt(c, onebyte);
+  test_parseInt(c, twobyte);
+  test_eval(c, onebyte);
+  test_eval(c, twobyte);
+  test_stringtonumber(c, onebytespace);
+  test_stringtonumber(c, twobytespace);
+}
=======================================
--- /branches/bleeding_edge/src/arm/regexp-macro-assembler-arm.cc Thu Sep 12 10:37:42 2013 UTC
+++ /branches/bleeding_edge/src/arm/regexp-macro-assembler-arm.cc Fri Feb  7 12:34:45 2014 UTC
@@ -497,6 +497,8 @@
       __ b(ls, &success);
       // \u00a0 (NBSP).
       __ cmp(r0, Operand(0x00a0 - '\t'));
+      // \u0085 (NEL).
+      __ cmp(r0, Operand(0x0085 - '\t'), ne);
       BranchOrBacktrack(ne, on_no_match);
       __ bind(&success);
       return true;
=======================================
--- /branches/bleeding_edge/src/char-predicates.h Fri Jul 19 09:57:35 2013 UTC
+++ /branches/bleeding_edge/src/char-predicates.h Fri Feb  7 12:34:45 2014 UTC
@@ -66,6 +66,14 @@
   }
 };

+
+struct WhiteSpace {
+  static inline bool Is(uc32 c) {
+    return unibrow::WhiteSpace::Is(c) ||
+        c == 0xFEFF;  // BYTE ORDER MARK is a white space in ECMA-262 5.1, 7.2.
+  }
+};
+
 } }  // namespace v8::internal

 #endif  // V8_CHAR_PREDICATES_H_
=======================================
--- /branches/bleeding_edge/src/ia32/regexp-macro-assembler-ia32.cc Wed Sep 11 10:51:06 2013 UTC
+++ /branches/bleeding_edge/src/ia32/regexp-macro-assembler-ia32.cc Fri Feb  7 12:34:45 2014 UTC
@@ -526,6 +526,9 @@
       __ j(below_equal, &success, Label::kNear);
       // \u00a0 (NBSP).
       __ cmp(eax, 0x00a0 - '\t');
+      __ j(equal, &success, Label::kNear);
+      // \u0085 (NEL).
+      __ cmp(eax, 0x0085 - '\t');
       BranchOrBacktrack(not_equal, on_no_match);
       __ bind(&success);
       return true;
=======================================
--- /branches/bleeding_edge/src/jsregexp.cc     Mon Dec  9 07:41:20 2013 UTC
+++ /branches/bleeding_edge/src/jsregexp.cc     Fri Feb  7 12:34:45 2014 UTC
@@ -3597,9 +3597,10 @@


 // The '2' variant is has inclusive from and exclusive to.
-static const int kSpaceRanges[] = { '\t', '\r' + 1, ' ', ' ' + 1, 0x00A0,
-    0x00A1, 0x1680, 0x1681, 0x180E, 0x180F, 0x2000, 0x200B, 0x2028, 0x202A,
-    0x202F, 0x2030, 0x205F, 0x2060, 0x3000, 0x3001, 0xFEFF, 0xFF00, 0x10000 };
+static const int kSpaceRanges[] = { '\t', '\r' + 1, ' ', ' ' + 1,
+    0x0085, 0x0086, 0x00A0, 0x00A1, 0x1680, 0x1681, 0x180E, 0x180F,
+    0x2000, 0x200B, 0x2028, 0x202A, 0x202F, 0x2030, 0x205F, 0x2060,
+    0x3000, 0x3001, 0xFEFF, 0xFF00, 0x10000 };
 static const int kSpaceRangeCount = ARRAY_SIZE(kSpaceRanges);

 static const int kWordRanges[] = {
=======================================
--- /branches/bleeding_edge/src/runtime.cc      Fri Feb  7 01:08:50 2014 UTC
+++ /branches/bleeding_edge/src/runtime.cc      Fri Feb  7 12:34:45 2014 UTC
@@ -6105,8 +6105,10 @@
       // Fast check for a junk value. A valid string may start from a

       // whitespace, a sign ('+' or '-'), the decimal point, a decimal digit or
       // the 'I' character ('Infinity'). All of that have codes not greater than
-      // '9' except 'I' and &nbsp;.
-      if (data[start_pos] != 'I' && data[start_pos] != 0xa0) {
+      // '9' except 'I', NBSP and NEL.
+      if (data[start_pos] != 'I' &&
+          data[start_pos] != 0xa0 &&
+          data[start_pos] != 0x85) {
         return isolate->heap()->nan_value();
       }
     } else if (len - start_pos < 10 && AreDigits(data, start_pos, len)) {
@@ -6539,11 +6541,6 @@
   return ConvertCase(
       args, isolate, isolate->runtime_state()->to_upper_mapping());
 }
-
-
-static inline bool IsTrimWhiteSpace(unibrow::uchar c) {
-  return unibrow::WhiteSpace::Is(c) || c == 0x200b || c == 0xfeff;
-}


 RUNTIME_FUNCTION(MaybeObject*, Runtime_StringTrim) {
@@ -6558,15 +6555,17 @@
   int length = string->length();

   int left = 0;
+  UnicodeCache* unicode_cache = isolate->unicode_cache();
   if (trimLeft) {
-    while (left < length && IsTrimWhiteSpace(string->Get(left))) {
+    while (left < length && unicode_cache->IsWhiteSpace(string->Get(left))) {
       left++;
     }
   }

   int right = length;
   if (trimRight) {
-    while (right > left && IsTrimWhiteSpace(string->Get(right - 1))) {
+    while (right > left &&
+           unicode_cache->IsWhiteSpace(string->Get(right - 1))) {
       right--;
     }
   }
=======================================
--- /branches/bleeding_edge/src/scanner.h       Thu Oct 10 11:58:16 2013 UTC
+++ /branches/bleeding_edge/src/scanner.h       Fri Feb  7 12:34:45 2014 UTC
@@ -144,7 +144,7 @@
   unibrow::Predicate<IdentifierStart, 128> kIsIdentifierStart;
   unibrow::Predicate<IdentifierPart, 128> kIsIdentifierPart;
   unibrow::Predicate<unibrow::LineTerminator, 128> kIsLineTerminator;
-  unibrow::Predicate<unibrow::WhiteSpace, 128> kIsWhiteSpace;
+  unibrow::Predicate<WhiteSpace, 128> kIsWhiteSpace;
   StaticResource<Utf8Decoder> utf8_decoder_;

   DISALLOW_COPY_AND_ASSIGN(UnicodeCache);
=======================================
--- /branches/bleeding_edge/src/x64/regexp-macro-assembler-x64.cc Mon Jan 20 04:59:40 2014 UTC
+++ /branches/bleeding_edge/src/x64/regexp-macro-assembler-x64.cc Fri Feb  7 12:34:45 2014 UTC
@@ -552,6 +552,9 @@
       __ j(below_equal, &success, Label::kNear);
       // \u00a0 (NBSP).
       __ cmpl(rax, Immediate(0x00a0 - '\t'));
+      __ j(equal, &success, Label::kNear);
+      // \u0085 (NEL).
+      __ cmpl(rax, Immediate(0x0085 - '\t'));
       BranchOrBacktrack(not_equal, on_no_match);
       __ bind(&success);
       return true;
=======================================
--- /branches/bleeding_edge/test/cctest/test-regexp.cc Thu Feb  6 07:16:41 2014 UTC
+++ /branches/bleeding_edge/test/cctest/test-regexp.cc Fri Feb  7 12:34:45 2014 UTC
@@ -445,21 +445,7 @@


 static bool IsWhiteSpace(uc16 c) {
-  switch (c) {
-    case 0x09:
-    case 0x0A:
-    case 0x0B:
-    case 0x0C:
-    case 0x0d:
-    case 0x20:
-    case 0xA0:
-    case 0x2028:
-    case 0x2029:
-    case 0xFEFF:
-      return true;
-    default:
-      return unibrow::Space::Is(c);
-  }
+  return v8::internal::WhiteSpace::Is(c);
 }


=======================================
--- /branches/bleeding_edge/test/mjsunit/third_party/string-trim.js Tue Dec  7 11:01:02 2010 UTC
+++ /branches/bleeding_edge/test/mjsunit/third_party/string-trim.js Fri Feb  7 12:34:45 2014 UTC
@@ -66,7 +66,8 @@
   {s : '\u3000', t : 'IDEOGRAPHIC SPACE'},
   {s : '\u2028', t : 'LINE SEPARATOR'},
   {s : '\u2029', t : 'PARAGRAPH SEPARATOR'},
-  {s : '\u200B', t : 'ZERO WIDTH SPACE (category Cf)'}
+  // \u200B is not a whitespace character according to Unicode 6.3.0.
+  // {s : '\u200B', t : 'ZERO WIDTH SPACE (category Cf)'}
 ];

 for (var i = 0; i < whitespace.length; i++) {

--
--
v8-dev mailing list
[email protected]
http://groups.google.com/group/v8-dev

---
You received this message because you are subscribed to the Google Groups "v8-dev" group.

To unsubscribe from this group and stop receiving emails from it, send an email 
to [email protected].
For more options, visit https://groups.google.com/groups/opt_out.

[v8-dev] [v8] r19196 committed - Fix inconsistencies wrt whitespaces....

Reply via email to