viktorluc-db commented on code in PR #47771:
URL: https://github.com/apache/spark/pull/47771#discussion_r1729639176
##########
common/unsafe/src/main/java/org/apache/spark/unsafe/UTF8StringBuilder.java:
##########
@@ -93,7 +93,38 @@ public void appendBytes(Object base, long offset, int
length) {
cursor += length;
}
+ public void appendByte(byte singleByte) {
+ grow(1);
+ buffer[cursor - Platform.BYTE_ARRAY_OFFSET] = singleByte;
+ cursor++;
+ }
+
public UTF8String build() {
return UTF8String.fromBytes(buffer, 0, totalSize());
}
+
+ public void appendCodePoint(int codePoint) {
+ if (codePoint <= 0x7F) {
+ appendByte((byte) codePoint);
+ }
+ else if (codePoint <= 0x7FF) {
+ appendByte((byte) (0xC0 | (codePoint >> 6)));
+ appendByte((byte) (0x80 | (codePoint & 0x3F)));
+ }
+ else if (codePoint <= 0xFFFF) {
+ appendByte((byte) (0xE0 | (codePoint >> 12)));
+ appendByte((byte) (0x80 | ((codePoint >> 6) & 0x3F)));
+ appendByte((byte) (0x80 | (codePoint & 0x3F)));
+ }
+ else if (codePoint <= 0x10FFFF) {
+ appendByte((byte) (0xF0 | (codePoint >> 18)));
+ appendByte((byte) (0x80 | ((codePoint >> 12) & 0x3F)));
+ appendByte((byte) (0x80 | ((codePoint >> 6) & 0x3F)));
+ appendByte((byte) (0x80 | (codePoint & 0x3F)));
+ }
+ else {
+ throw new IllegalArgumentException("Invalid Unicode codePoint: " +
codePoint);
+ }
Review Comment:
There are, but every alternative I found would require allocating at least
something on the heap, so I went with this approach for performance reasons.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]