uros-db commented on code in PR #46899:
URL: https://github.com/apache/spark/pull/46899#discussion_r1631021388
##########
common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java:
##########
@@ -270,6 +279,123 @@ public byte[] getBytes() {
}
}
+ /**
+ * Utility methods and constants for UTF-8 string validation.
+ */
+
+ private static boolean isValidContinuationByte(byte b) {
+ return (byte) 0x80 <= b && b <= (byte) 0xBF;
+ }
+
+ private static boolean isValidSecondByte(byte b, byte firstByte) {
+ return switch (firstByte) {
+ case (byte) 0xE0 -> (byte) 0xA0 <= b && b <= (byte) 0xBF;
+ case (byte) 0xED -> (byte) 0x80 <= b && b <= (byte) 0x9F;
+ case (byte) 0xF0 -> (byte) 0x90 <= b && b <= (byte) 0xBF;
+ case (byte) 0xF4 -> (byte) 0x80 <= b && b <= (byte) 0x8F;
+ default -> isValidContinuationByte(b);
+ };
+ }
+
+ private static final byte[] UNICODE_REPLACEMENT_CHARACTER =
+ new byte[] { (byte) 0xEF, (byte) 0xBF, (byte) 0xBD };
+
+ private static void appendReplacementCharacter(ArrayList<Byte> bytes) {
+ for (byte b : UTF8String.UNICODE_REPLACEMENT_CHARACTER) bytes.add(b);
+ }
+
+ /**
+ * Returns a validated version of the current UTF-8 string by replacing invalid UTF-8 sequences
+ * with the Unicode replacement character (U+FFFD), as per the rules defined in the Unicode
+ * standard. This behaviour is consistent with the behaviour of `UnicodeString` in ICU4C.
+ *
+ * @return A new UTF8String that is a valid UTF8 byte sequence.
+ */
+ public UTF8String makeValidUTF8() {
+ ArrayList<Byte> bytes = new ArrayList<>();
+ int byteIndex = 0;
+ byteIteration:
Review Comment:
forgot to remove this while prototyping
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]