Revision: 18683
Author: [email protected]
Date: Mon Jan 20 09:52:54 2014 UTC
Log: String:WriteUtf8: Add REPLACE_INVALID_UTF8 option
This patch makes String::WriteUtf8 replace invalid code points (i.e.
unmatched
surrogates) with the unicode replacement character when
REPLACE_INVALID_UTF8 is
set. This is done to avoid creating invalid UTF-8 output which can lead to
compatibility issues with software requiring valid UTF-8 inputs (e.g. the
WebSocket protocol requires valid UTF-8 and terminates connections when
invalid
UTF-8 is encountered).
[email protected]
BUG=
Review URL: https://codereview.chromium.org/121173009
Patch from Felix Geisendörfer <[email protected]>.
http://code.google.com/p/v8/source/detail?r=18683
Modified:
/branches/bleeding_edge/include/v8.h
/branches/bleeding_edge/src/api.cc
/branches/bleeding_edge/src/unicode-inl.h
/branches/bleeding_edge/src/unicode.h
/branches/bleeding_edge/test/cctest/test-api.cc
=======================================
--- /branches/bleeding_edge/include/v8.h Fri Jan 17 10:52:00 2014 UTC
+++ /branches/bleeding_edge/include/v8.h Mon Jan 20 09:52:54 2014 UTC
@@ -1628,7 +1628,11 @@
NO_OPTIONS = 0,
HINT_MANY_WRITES_EXPECTED = 1,
NO_NULL_TERMINATION = 2,
- PRESERVE_ASCII_NULL = 4
+ PRESERVE_ASCII_NULL = 4,
+ // Used by WriteUtf8 to replace orphan surrogate code units with the
+ // unicode replacement character. Needs to be set to guarantee valid
UTF-8
+ // output.
+ REPLACE_INVALID_UTF8 = 8
};
// 16-bit character codes.
=======================================
--- /branches/bleeding_edge/src/api.cc Thu Jan 16 08:17:40 2014 UTC
+++ /branches/bleeding_edge/src/api.cc Mon Jan 20 09:52:54 2014 UTC
@@ -4454,28 +4454,35 @@
class Utf8WriterVisitor {
public:
Utf8WriterVisitor(
- char* buffer, int capacity, bool skip_capacity_check)
- : early_termination_(false),
- last_character_(unibrow::Utf16::kNoPreviousCharacter),
- buffer_(buffer),
- start_(buffer),
- capacity_(capacity),
- skip_capacity_check_(capacity == -1 || skip_capacity_check),
- utf16_chars_read_(0) {
+ char* buffer,
+ int capacity,
+ bool skip_capacity_check,
+ bool replace_invalid_utf8)
+ : early_termination_(false),
+ last_character_(unibrow::Utf16::kNoPreviousCharacter),
+ buffer_(buffer),
+ start_(buffer),
+ capacity_(capacity),
+ skip_capacity_check_(capacity == -1 || skip_capacity_check),
+ replace_invalid_utf8_(replace_invalid_utf8),
+ utf16_chars_read_(0) {
}
static int WriteEndCharacter(uint16_t character,
int last_character,
int remaining,
- char* const buffer) {
+ char* const buffer,
+ bool replace_invalid_utf8) {
using namespace unibrow;
ASSERT(remaining > 0);
// We can't use a local buffer here because Encode needs to modify
// previous characters in the stream. We know, however, that
// exactly one character will be advanced.
- if (Utf16::IsTrailSurrogate(character) &&
- Utf16::IsLeadSurrogate(last_character)) {
- int written = Utf8::Encode(buffer, character, last_character);
+ if (Utf16::IsSurrogatePair(last_character, character)) {
+ int written = Utf8::Encode(buffer,
+ character,
+ last_character,
+ replace_invalid_utf8);
ASSERT(written == 1);
return written;
}
@@ -4484,7 +4491,8 @@
// Can't encode using last_character as gcc has array bounds issues.
int written = Utf8::Encode(temp_buffer,
character,
- Utf16::kNoPreviousCharacter);
+ Utf16::kNoPreviousCharacter,
+ replace_invalid_utf8);
// Won't fit.
if (written > remaining) return 0;
// Copy over the character from temp_buffer.
@@ -4494,6 +4502,16 @@
return written;
}
+ // Visit writes out a group of code units (chars) of a v8::String to the
+ // internal buffer_. This is done in two phases. The first phase
calculates a
+ // pesimistic estimate (writable_length) on how many code units can be
safely
+ // written without exceeding the buffer capacity and without writing the
last
+ // code unit (it could be a lead surrogate). The estimated number of code
+ // units is then written out in one go, and the reported byte usage is
used
+ // to correct the estimate. This is repeated until the estimate becomes
<= 0
+ // or all code units have been written out. The second phase writes out
code
+ // units until the buffer capacity is reached, would be exceeded by the
next
+ // unit, or all units have been written out.
template<typename Char>
void Visit(const Char* chars, const int length) {
using namespace unibrow;
@@ -4531,7 +4549,10 @@
} else {
for (; i < fast_length; i++) {
uint16_t character = *chars++;
- buffer += Utf8::Encode(buffer, character, last_character);
+ buffer += Utf8::Encode(buffer,
+ character,
+ last_character,
+ replace_invalid_utf8_);
last_character = character;
ASSERT(capacity_ == -1 || (buffer - start_) <= capacity_);
}
@@ -4551,10 +4572,17 @@
ASSERT(remaining_capacity >= 0);
for (; i < length && remaining_capacity > 0; i++) {
uint16_t character = *chars++;
+ // remaining_capacity is <= 3 bytes at this point, so we do not
write out
+ // an umatched lead surrogate.
+ if (replace_invalid_utf8_ && Utf16::IsLeadSurrogate(character)) {
+ early_termination_ = true;
+ break;
+ }
int written = WriteEndCharacter(character,
last_character,
remaining_capacity,
- buffer);
+ buffer,
+ replace_invalid_utf8_);
if (written == 0) {
early_termination_ = true;
break;
@@ -4602,6 +4630,7 @@
char* const start_;
int capacity_;
bool const skip_capacity_check_;
+ bool const replace_invalid_utf8_;
int utf16_chars_read_;
DISALLOW_IMPLICIT_CONSTRUCTORS(Utf8WriterVisitor);
};
@@ -4640,9 +4669,11 @@
}
const int string_length = str->length();
bool write_null = !(options & NO_NULL_TERMINATION);
+ bool replace_invalid_utf8 = (options & REPLACE_INVALID_UTF8);
+ int max16BitCodeUnitSize = unibrow::Utf8::kMax16BitCodeUnitSize;
// First check if we can just write the string without checking capacity.
- if (capacity == -1 || capacity / 3 >= string_length) {
- Utf8WriterVisitor writer(buffer, capacity, true);
+ if (capacity == -1 || capacity / max16BitCodeUnitSize >= string_length) {
+ Utf8WriterVisitor writer(buffer, capacity, true, replace_invalid_utf8);
const int kMaxRecursion = 100;
bool success = RecursivelySerializeToUtf8(*str, &writer,
kMaxRecursion);
if (success) return writer.CompleteWrite(write_null, nchars_ref);
@@ -4670,7 +4701,7 @@
}
// Recursive slow path can potentially be unreasonable slow. Flatten.
str = FlattenGetString(str);
- Utf8WriterVisitor writer(buffer, capacity, false);
+ Utf8WriterVisitor writer(buffer, capacity, false, replace_invalid_utf8);
i::String::VisitFlat(&writer, *str);
return writer.CompleteWrite(write_null, nchars_ref);
}
=======================================
--- /branches/bleeding_edge/src/unicode-inl.h Tue Apr 16 12:30:51 2013 UTC
+++ /branches/bleeding_edge/src/unicode-inl.h Mon Jan 20 09:52:54 2014 UTC
@@ -107,8 +107,14 @@
return 2;
}
-
-unsigned Utf8::Encode(char* str, uchar c, int previous) {
+// Encode encodes the UTF-16 code units c and previous into the given str
+// buffer, and combines surrogate code units into single code points. If
+// replace_invalid is set to true, orphan surrogate code units will be
replaced
+// with kBadChar.
+unsigned Utf8::Encode(char* str,
+ uchar c,
+ int previous,
+ bool replace_invalid) {
static const int kMask = ~(1 << 6);
if (c <= kMaxOneByteChar) {
str[0] = c;
@@ -118,12 +124,16 @@
str[1] = 0x80 | (c & kMask);
return 2;
} else if (c <= kMaxThreeByteChar) {
- if (Utf16::IsTrailSurrogate(c) &&
- Utf16::IsLeadSurrogate(previous)) {
+ if (Utf16::IsSurrogatePair(previous, c)) {
const int kUnmatchedSize = kSizeOfUnmatchedSurrogate;
return Encode(str - kUnmatchedSize,
Utf16::CombineSurrogatePair(previous, c),
- Utf16::kNoPreviousCharacter) - kUnmatchedSize;
+ Utf16::kNoPreviousCharacter,
+ replace_invalid) - kUnmatchedSize;
+ } else if (replace_invalid &&
+ (Utf16::IsLeadSurrogate(c) ||
+ Utf16::IsTrailSurrogate(c))) {
+ c = kBadChar;
}
str[0] = 0xE0 | (c >> 12);
str[1] = 0x80 | ((c >> 6) & kMask);
=======================================
--- /branches/bleeding_edge/src/unicode.h Thu Nov 7 09:08:34 2013 UTC
+++ /branches/bleeding_edge/src/unicode.h Mon Jan 20 09:52:54 2014 UTC
@@ -102,6 +102,9 @@
class Utf16 {
public:
+ static inline bool IsSurrogatePair(int lead, int trail) {
+ return IsLeadSurrogate(lead) && IsTrailSurrogate(trail);
+ }
static inline bool IsLeadSurrogate(int code) {
if (code == kNoPreviousCharacter) return false;
return (code & 0xfc00) == 0xd800;
@@ -146,11 +149,16 @@
public:
static inline uchar Length(uchar chr, int previous);
static inline unsigned EncodeOneByte(char* out, uint8_t c);
- static inline unsigned Encode(
- char* out, uchar c, int previous);
+ static inline unsigned Encode(char* out,
+ uchar c,
+ int previous,
+ bool replace_invalid = false);
static uchar CalculateValue(const byte* str,
unsigned length,
unsigned* cursor);
+
+ // The unicode replacement character, used to signal invalid unicode
+ // sequences (e.g. an orphan surrogate) when converting to a UTF-8
encoding.
static const uchar kBadChar = 0xFFFD;
static const unsigned kMaxEncodedSize = 4;
static const unsigned kMaxOneByteChar = 0x7f;
@@ -162,6 +170,9 @@
// that match are coded as a 4 byte UTF-8 sequence.
static const unsigned kBytesSavedByCombiningSurrogates = 2;
static const unsigned kSizeOfUnmatchedSurrogate = 3;
+ // The maximum size a single UTF-16 code unit may take up when encoded as
+ // UTF-8.
+ static const unsigned kMax16BitCodeUnitSize = 3;
static inline uchar ValueOf(const byte* str,
unsigned length,
unsigned* cursor);
=======================================
--- /branches/bleeding_edge/test/cctest/test-api.cc Fri Jan 17 10:34:43
2014 UTC
+++ /branches/bleeding_edge/test/cctest/test-api.cc Mon Jan 20 09:52:54
2014 UTC
@@ -7614,6 +7614,22 @@
v8::Handle<String> str2 = v8_str("abc\303\260\342\230\203");
v8::Handle<String> str3 = v8::String::NewFromUtf8(
context->GetIsolate(), "abc\0def", v8::String::kNormalString, 7);
+ // "ab" + lead surrogate + "cd" + trail surrogate + "ef"
+ uint16_t orphans[8] = { 0x61, 0x62, 0xd800, 0x63, 0x64, 0xdc00, 0x65,
0x66 };
+ v8::Handle<String> orphans_str = v8::String::NewFromTwoByte(
+ context->GetIsolate(), orphans, v8::String::kNormalString, 8);
+ // single lead surrogate
+ uint16_t lead[1] = { 0xd800 };
+ v8::Handle<String> lead_str = v8::String::NewFromTwoByte(
+ context->GetIsolate(), lead, v8::String::kNormalString, 1);
+ // single trail surrogate
+ uint16_t trail[1] = { 0xdc00 };
+ v8::Handle<String> trail_str = v8::String::NewFromTwoByte(
+ context->GetIsolate(), trail, v8::String::kNormalString, 1);
+ // surrogate pair
+ uint16_t pair[2] = { 0xd800, 0xdc00 };
+ v8::Handle<String> pair_str = v8::String::NewFromTwoByte(
+ context->GetIsolate(), pair, v8::String::kNormalString, 2);
const int kStride = 4; // Must match stride in for loops in JS below.
CompileRun(
"var left = '';"
@@ -7687,6 +7703,53 @@
CHECK_EQ(2, charlen);
CHECK_EQ(0, strncmp(utf8buf, "ab\1", 3));
+ // allow orphan surrogates by default
+ memset(utf8buf, 0x1, 1000);
+ len = orphans_str->WriteUtf8(utf8buf, sizeof(utf8buf), &charlen);
+ CHECK_EQ(13, len);
+ CHECK_EQ(8, charlen);
+ CHECK_EQ(0, strcmp(utf8buf, "ab\355\240\200cd\355\260\200ef"));
+
+ // replace orphan surrogates with unicode replacement character
+ memset(utf8buf, 0x1, 1000);
+ len = orphans_str->WriteUtf8(utf8buf,
+ sizeof(utf8buf),
+ &charlen,
+ String::REPLACE_INVALID_UTF8);
+ CHECK_EQ(13, len);
+ CHECK_EQ(8, charlen);
+ CHECK_EQ(0, strcmp(utf8buf, "ab\357\277\275cd\357\277\275ef"));
+
+ // replace single lead surrogate with unicode replacement character
+ memset(utf8buf, 0x1, 1000);
+ len = lead_str->WriteUtf8(utf8buf,
+ sizeof(utf8buf),
+ &charlen,
+ String::REPLACE_INVALID_UTF8);
+ CHECK_EQ(4, len);
+ CHECK_EQ(1, charlen);
+ CHECK_EQ(0, strcmp(utf8buf, "\357\277\275"));
+
+ // replace single trail surrogate with unicode replacement character
+ memset(utf8buf, 0x1, 1000);
+ len = trail_str->WriteUtf8(utf8buf,
+ sizeof(utf8buf),
+ &charlen,
+ String::REPLACE_INVALID_UTF8);
+ CHECK_EQ(4, len);
+ CHECK_EQ(1, charlen);
+ CHECK_EQ(0, strcmp(utf8buf, "\357\277\275"));
+
+ // do not replace / write anything if surrogate pair does not fit the
buffer
+ // space
+ memset(utf8buf, 0x1, 1000);
+ len = pair_str->WriteUtf8(utf8buf,
+ 3,
+ &charlen,
+ String::REPLACE_INVALID_UTF8);
+ CHECK_EQ(0, len);
+ CHECK_EQ(0, charlen);
+
memset(utf8buf, 0x1, sizeof(utf8buf));
len = GetUtf8Length(left_tree);
int utf8_expected =
--
--
v8-dev mailing list
[email protected]
http://groups.google.com/group/v8-dev
---
You received this message because you are subscribed to the Google Groups "v8-dev" group.
To unsubscribe from this group and stop receiving emails from it, send an email
to [email protected].
For more options, visit https://groups.google.com/groups/opt_out.