Re: [PR] [WIP][SQL] Invalid UTF-8 byte sequence replacement [spark]

via GitHub Thu, 06 Jun 2024 23:32:30 -0700


mkaravel commented on code in PR #46899:
URL: https://github.com/apache/spark/pull/46899#discussion_r1630683951



##########
common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java:
##########
@@ -270,6 +279,123 @@ public byte[] getBytes() {
     }
   }
 
+  /**
+   * Utility methods and constants for UTF-8 string validation.
+   */
+
+  private static boolean isValidContinuationByte(byte b) {
+     return (byte) 0x80 <= b && b <= (byte) 0xBF;

Review Comment:
   ```suggestion
        return b >= (byte) 0x80 && b <= (byte) 0xBF;
   ```
   uber nit: I find it more readable this way because `b` is on the same side 
of both equalities.



##########
common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java:
##########
@@ -270,6 +279,123 @@ public byte[] getBytes() {
     }
   }
 
+  /**
+   * Utility methods and constants for UTF-8 string validation.
+   */
+
+  private static boolean isValidContinuationByte(byte b) {
+     return (byte) 0x80 <= b && b <= (byte) 0xBF;
+  }
+
+  private static boolean isValidSecondByte(byte b, byte firstByte) {
+    return switch (firstByte) {
+      case (byte) 0xE0 -> (byte) 0xA0 <= b && b <= (byte) 0xBF;
+      case (byte) 0xED -> (byte) 0x80 <= b && b <= (byte) 0x9F;
+      case (byte) 0xF0 -> (byte) 0x90 <= b && b <= (byte) 0xBF;
+      case (byte) 0xF4 -> (byte) 0x80 <= b && b <= (byte) 0x8F;
+      default -> isValidContinuationByte(b);
+    };
+  }
+
+  private static final byte[] UNICODE_REPLACEMENT_CHARACTER =
+    new byte[] { (byte) 0xEF, (byte) 0xBF, (byte) 0xBD };
+
+  private static void appendReplacementCharacter(ArrayList<Byte> bytes) {
+    for (byte b : UTF8String.UNICODE_REPLACEMENT_CHARACTER) bytes.add(b);
+  }
+
+  /**
+   * Returns a validated version of the current UTF-8 string by replacing 
invalid UTF-8 sequences
+   * with the Unicode replacement character (U+FFFD), as per the rules defined 
in the Unicode
+   * standard. This behaviour is consistent with the behaviour of 
`UnicodeString` in ICU4C.
+   *
+   * @return A new UTF8String that is a valid UTF8 byte sequence.

Review Comment:
   ```suggestion
      * @return A new UTF8String that is a valid UTF8 string.
   ```



##########
common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java:
##########
@@ -270,6 +279,123 @@ public byte[] getBytes() {
     }
   }
 
+  /**
+   * Utility methods and constants for UTF-8 string validation.
+   */
+
+  private static boolean isValidContinuationByte(byte b) {
+     return (byte) 0x80 <= b && b <= (byte) 0xBF;
+  }
+
+  private static boolean isValidSecondByte(byte b, byte firstByte) {
+    return switch (firstByte) {
+      case (byte) 0xE0 -> (byte) 0xA0 <= b && b <= (byte) 0xBF;
+      case (byte) 0xED -> (byte) 0x80 <= b && b <= (byte) 0x9F;
+      case (byte) 0xF0 -> (byte) 0x90 <= b && b <= (byte) 0xBF;
+      case (byte) 0xF4 -> (byte) 0x80 <= b && b <= (byte) 0x8F;
+      default -> isValidContinuationByte(b);
+    };
+  }
+
+  private static final byte[] UNICODE_REPLACEMENT_CHARACTER =
+    new byte[] { (byte) 0xEF, (byte) 0xBF, (byte) 0xBD };
+
+  private static void appendReplacementCharacter(ArrayList<Byte> bytes) {
+    for (byte b : UTF8String.UNICODE_REPLACEMENT_CHARACTER) bytes.add(b);
+  }
+
+  /**
+   * Returns a validated version of the current UTF-8 string by replacing 
invalid UTF-8 sequences
+   * with the Unicode replacement character (U+FFFD), as per the rules defined 
in the Unicode
+   * standard. This behaviour is consistent with the behaviour of 
`UnicodeString` in ICU4C.

Review Comment:
   Let's specify the part of the standard: Unicode standard 15, Section 3.9, 
Paragraph D86, Table 3-7.



##########
common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java:
##########
@@ -270,6 +279,123 @@ public byte[] getBytes() {
     }
   }
 
+  /**
+   * Utility methods and constants for UTF-8 string validation.
+   */
+
+  private static boolean isValidContinuationByte(byte b) {
+     return (byte) 0x80 <= b && b <= (byte) 0xBF;
+  }
+
+  private static boolean isValidSecondByte(byte b, byte firstByte) {
+    return switch (firstByte) {
+      case (byte) 0xE0 -> (byte) 0xA0 <= b && b <= (byte) 0xBF;
+      case (byte) 0xED -> (byte) 0x80 <= b && b <= (byte) 0x9F;
+      case (byte) 0xF0 -> (byte) 0x90 <= b && b <= (byte) 0xBF;
+      case (byte) 0xF4 -> (byte) 0x80 <= b && b <= (byte) 0x8F;

Review Comment:
   ```suggestion
         case (byte) 0xE0 -> b >= (byte) 0xA0 && b <= (byte) 0xBF;
         case (byte) 0xED -> b >= (byte) 0x80 && b <= (byte) 0x9F;
         case (byte) 0xF0 -> b >= (byte) 0x90 && b <= (byte) 0xBF;
         case (byte) 0xF4 -> b >= (byte) 0x80 && b <= (byte) 0x8F;
   ```
   uber nit: same comment here.



##########
common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java:
##########
@@ -270,6 +279,123 @@ public byte[] getBytes() {
     }
   }
 
+  /**
+   * Utility methods and constants for UTF-8 string validation.
+   */
+
+  private static boolean isValidContinuationByte(byte b) {
+     return (byte) 0x80 <= b && b <= (byte) 0xBF;
+  }
+
+  private static boolean isValidSecondByte(byte b, byte firstByte) {
+    return switch (firstByte) {
+      case (byte) 0xE0 -> (byte) 0xA0 <= b && b <= (byte) 0xBF;
+      case (byte) 0xED -> (byte) 0x80 <= b && b <= (byte) 0x9F;
+      case (byte) 0xF0 -> (byte) 0x90 <= b && b <= (byte) 0xBF;
+      case (byte) 0xF4 -> (byte) 0x80 <= b && b <= (byte) 0x8F;
+      default -> isValidContinuationByte(b);
+    };
+  }
+
+  private static final byte[] UNICODE_REPLACEMENT_CHARACTER =
+    new byte[] { (byte) 0xEF, (byte) 0xBF, (byte) 0xBD };
+
+  private static void appendReplacementCharacter(ArrayList<Byte> bytes) {
+    for (byte b : UTF8String.UNICODE_REPLACEMENT_CHARACTER) bytes.add(b);
+  }
+
+  /**
+   * Returns a validated version of the current UTF-8 string by replacing 
invalid UTF-8 sequences
+   * with the Unicode replacement character (U+FFFD), as per the rules defined 
in the Unicode
+   * standard. This behaviour is consistent with the behaviour of 
`UnicodeString` in ICU4C.
+   *
+   * @return A new UTF8String that is a valid UTF8 byte sequence.
+   */
+  public UTF8String makeValidUTF8() {
+    ArrayList<Byte> bytes = new ArrayList<>();
+    int byteIndex = 0;
+    byteIteration:

Review Comment:
   What is this useful for?



##########
common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java:
##########
@@ -270,6 +279,123 @@ public byte[] getBytes() {
     }
   }
 
+  /**
+   * Utility methods and constants for UTF-8 string validation.
+   */
+
+  private static boolean isValidContinuationByte(byte b) {
+     return (byte) 0x80 <= b && b <= (byte) 0xBF;
+  }
+
+  private static boolean isValidSecondByte(byte b, byte firstByte) {
+    return switch (firstByte) {
+      case (byte) 0xE0 -> (byte) 0xA0 <= b && b <= (byte) 0xBF;
+      case (byte) 0xED -> (byte) 0x80 <= b && b <= (byte) 0x9F;
+      case (byte) 0xF0 -> (byte) 0x90 <= b && b <= (byte) 0xBF;
+      case (byte) 0xF4 -> (byte) 0x80 <= b && b <= (byte) 0x8F;
+      default -> isValidContinuationByte(b);
+    };
+  }
+
+  private static final byte[] UNICODE_REPLACEMENT_CHARACTER =
+    new byte[] { (byte) 0xEF, (byte) 0xBF, (byte) 0xBD };
+
+  private static void appendReplacementCharacter(ArrayList<Byte> bytes) {
+    for (byte b : UTF8String.UNICODE_REPLACEMENT_CHARACTER) bytes.add(b);
+  }
+
+  /**
+   * Returns a validated version of the current UTF-8 string by replacing 
invalid UTF-8 sequences
+   * with the Unicode replacement character (U+FFFD), as per the rules defined 
in the Unicode
+   * standard. This behaviour is consistent with the behaviour of 
`UnicodeString` in ICU4C.
+   *
+   * @return A new UTF8String that is a valid UTF8 byte sequence.
+   */
+  public UTF8String makeValidUTF8() {
+    ArrayList<Byte> bytes = new ArrayList<>();
+    int byteIndex = 0;
+    byteIteration:
+    while (byteIndex < numBytes) {
+      // Read the first byte.
+      byte firstByte = getByte(byteIndex);
+      int expectedLen = bytesOfCodePointInUTF8[firstByte & 0xFF];
+      int codePointLen = Math.min(expectedLen, numBytes - byteIndex);
+      // 0B UTF-8 sequence (invalid first byte).
+      if (codePointLen == 0) {
+        appendReplacementCharacter(bytes);
+        byteIndex += 1;
+        continue;
+      }
+      // 1B UTF-8 sequence (ASCII or invalid).
+      if (codePointLen == 1) {
+        if (firstByte >= 0) bytes.add(firstByte);
+        else appendReplacementCharacter(bytes);
+        byteIndex += 1;
+        continue;
+      }

Review Comment:
   ```suggestion
         // 1B UTF-8 sequence (ASCII).
         if (codePointLen == 1) {
           bytes.add(firstByte);
           ++byteIndex;
           continue;
         }
   ```
   Looking at the definition of the `bytesOfCodePointInUTF8` array, it looks 
that we get a codepoint length of 1 for ASCII characters only.



##########
common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java:
##########
@@ -270,6 +279,123 @@ public byte[] getBytes() {
     }
   }
 
+  /**
+   * Utility methods and constants for UTF-8 string validation.
+   */
+
+  private static boolean isValidContinuationByte(byte b) {
+     return (byte) 0x80 <= b && b <= (byte) 0xBF;
+  }
+
+  private static boolean isValidSecondByte(byte b, byte firstByte) {
+    return switch (firstByte) {
+      case (byte) 0xE0 -> (byte) 0xA0 <= b && b <= (byte) 0xBF;
+      case (byte) 0xED -> (byte) 0x80 <= b && b <= (byte) 0x9F;
+      case (byte) 0xF0 -> (byte) 0x90 <= b && b <= (byte) 0xBF;
+      case (byte) 0xF4 -> (byte) 0x80 <= b && b <= (byte) 0x8F;
+      default -> isValidContinuationByte(b);
+    };
+  }
+
+  private static final byte[] UNICODE_REPLACEMENT_CHARACTER =
+    new byte[] { (byte) 0xEF, (byte) 0xBF, (byte) 0xBD };
+
+  private static void appendReplacementCharacter(ArrayList<Byte> bytes) {
+    for (byte b : UTF8String.UNICODE_REPLACEMENT_CHARACTER) bytes.add(b);
+  }
+
+  /**
+   * Returns a validated version of the current UTF-8 string by replacing 
invalid UTF-8 sequences
+   * with the Unicode replacement character (U+FFFD), as per the rules defined 
in the Unicode
+   * standard. This behaviour is consistent with the behaviour of 
`UnicodeString` in ICU4C.
+   *
+   * @return A new UTF8String that is a valid UTF8 byte sequence.
+   */
+  public UTF8String makeValidUTF8() {
+    ArrayList<Byte> bytes = new ArrayList<>();
+    int byteIndex = 0;
+    byteIteration:
+    while (byteIndex < numBytes) {
+      // Read the first byte.
+      byte firstByte = getByte(byteIndex);
+      int expectedLen = bytesOfCodePointInUTF8[firstByte & 0xFF];
+      int codePointLen = Math.min(expectedLen, numBytes - byteIndex);
+      // 0B UTF-8 sequence (invalid first byte).
+      if (codePointLen == 0) {
+        appendReplacementCharacter(bytes);
+        byteIndex += 1;
+        continue;
+      }
+      // 1B UTF-8 sequence (ASCII or invalid).
+      if (codePointLen == 1) {
+        if (firstByte >= 0) bytes.add(firstByte);
+        else appendReplacementCharacter(bytes);
+        byteIndex += 1;
+        continue;
+      }
+      // Read the second byte.
+      byte secondByte = getByte(byteIndex + 1);
+      if (!isValidSecondByte(secondByte, firstByte)) {
+        appendReplacementCharacter(bytes);
+        byteIndex += 1;
+        continue;
+      }
+      // Read remaining continuation bytes.
+      int continuationBytes = 2;
+      for (; continuationBytes < codePointLen; ++continuationBytes) {
+        byte nextByte = getByte(byteIndex + continuationBytes);
+        if (!isValidContinuationByte(nextByte)) {
+          break;
+        }
+      }
+      // Invalid UTF-8 sequence (not enough continuation bytes).
+      if (continuationBytes < expectedLen) {
+        appendReplacementCharacter(bytes);
+        byteIndex += continuationBytes;
+        continue;
+      }
+      // Valid UTF-8 sequence.
+      for (int i = 0; i < codePointLen; ++i) {
+        bytes.add(getByte(byteIndex + i));
+      }
+      byteIndex += codePointLen;
+    }
+    return UTF8String.fromBytes(bytes);
+  }
+
+  /**
+   * Checks if the current UTF8String is valid.
+   *
+   * @return If string represents a valid UTF8 byte sequence.
+   */
+  public boolean isValidUTF8() {

Review Comment:
   ```suggestion
     public boolean isValid() {
   ```
   Same here.



##########
common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java:
##########
@@ -270,6 +279,123 @@ public byte[] getBytes() {
     }
   }
 
+  /**
+   * Utility methods and constants for UTF-8 string validation.
+   */
+
+  private static boolean isValidContinuationByte(byte b) {
+     return (byte) 0x80 <= b && b <= (byte) 0xBF;
+  }
+
+  private static boolean isValidSecondByte(byte b, byte firstByte) {
+    return switch (firstByte) {
+      case (byte) 0xE0 -> (byte) 0xA0 <= b && b <= (byte) 0xBF;
+      case (byte) 0xED -> (byte) 0x80 <= b && b <= (byte) 0x9F;
+      case (byte) 0xF0 -> (byte) 0x90 <= b && b <= (byte) 0xBF;
+      case (byte) 0xF4 -> (byte) 0x80 <= b && b <= (byte) 0x8F;
+      default -> isValidContinuationByte(b);
+    };
+  }
+
+  private static final byte[] UNICODE_REPLACEMENT_CHARACTER =
+    new byte[] { (byte) 0xEF, (byte) 0xBF, (byte) 0xBD };
+
+  private static void appendReplacementCharacter(ArrayList<Byte> bytes) {
+    for (byte b : UTF8String.UNICODE_REPLACEMENT_CHARACTER) bytes.add(b);
+  }
+
+  /**
+   * Returns a validated version of the current UTF-8 string by replacing 
invalid UTF-8 sequences
+   * with the Unicode replacement character (U+FFFD), as per the rules defined 
in the Unicode
+   * standard. This behaviour is consistent with the behaviour of 
`UnicodeString` in ICU4C.
+   *
+   * @return A new UTF8String that is a valid UTF8 byte sequence.
+   */
+  public UTF8String makeValidUTF8() {
+    ArrayList<Byte> bytes = new ArrayList<>();
+    int byteIndex = 0;
+    byteIteration:
+    while (byteIndex < numBytes) {
+      // Read the first byte.
+      byte firstByte = getByte(byteIndex);
+      int expectedLen = bytesOfCodePointInUTF8[firstByte & 0xFF];
+      int codePointLen = Math.min(expectedLen, numBytes - byteIndex);
+      // 0B UTF-8 sequence (invalid first byte).
+      if (codePointLen == 0) {
+        appendReplacementCharacter(bytes);
+        byteIndex += 1;
+        continue;
+      }
+      // 1B UTF-8 sequence (ASCII or invalid).
+      if (codePointLen == 1) {
+        if (firstByte >= 0) bytes.add(firstByte);
+        else appendReplacementCharacter(bytes);
+        byteIndex += 1;
+        continue;
+      }
+      // Read the second byte.
+      byte secondByte = getByte(byteIndex + 1);
+      if (!isValidSecondByte(secondByte, firstByte)) {
+        appendReplacementCharacter(bytes);
+        byteIndex += 1;
+        continue;
+      }
+      // Read remaining continuation bytes.
+      int continuationBytes = 2;
+      for (; continuationBytes < codePointLen; ++continuationBytes) {
+        byte nextByte = getByte(byteIndex + continuationBytes);
+        if (!isValidContinuationByte(nextByte)) {
+          break;
+        }
+      }
+      // Invalid UTF-8 sequence (not enough continuation bytes).
+      if (continuationBytes < expectedLen) {
+        appendReplacementCharacter(bytes);
+        byteIndex += continuationBytes;
+        continue;
+      }
+      // Valid UTF-8 sequence.
+      for (int i = 0; i < codePointLen; ++i) {
+        bytes.add(getByte(byteIndex + i));
+      }
+      byteIndex += codePointLen;
+    }
+    return UTF8String.fromBytes(bytes);
+  }
+
+  /**
+   * Checks if the current UTF8String is valid.
+   *
+   * @return If string represents a valid UTF8 byte sequence.
+   */
+  public boolean isValidUTF8() {
+    return makeValidUTF8().equals(this);
+  }
+
+  /**
+   * Returns the current string if it is a valid UTF8 byte sequence, otherwise 
throws an exception.

Review Comment:
   ```suggestion
      * Returns the current string if it is a valid UTF8 string, otherwise 
throws an exception.
   ```



##########
common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java:
##########
@@ -270,6 +279,123 @@ public byte[] getBytes() {
     }
   }
 
+  /**
+   * Utility methods and constants for UTF-8 string validation.
+   */
+
+  private static boolean isValidContinuationByte(byte b) {
+     return (byte) 0x80 <= b && b <= (byte) 0xBF;
+  }
+
+  private static boolean isValidSecondByte(byte b, byte firstByte) {
+    return switch (firstByte) {
+      case (byte) 0xE0 -> (byte) 0xA0 <= b && b <= (byte) 0xBF;
+      case (byte) 0xED -> (byte) 0x80 <= b && b <= (byte) 0x9F;
+      case (byte) 0xF0 -> (byte) 0x90 <= b && b <= (byte) 0xBF;
+      case (byte) 0xF4 -> (byte) 0x80 <= b && b <= (byte) 0x8F;
+      default -> isValidContinuationByte(b);
+    };
+  }
+
+  private static final byte[] UNICODE_REPLACEMENT_CHARACTER =
+    new byte[] { (byte) 0xEF, (byte) 0xBF, (byte) 0xBD };
+
+  private static void appendReplacementCharacter(ArrayList<Byte> bytes) {
+    for (byte b : UTF8String.UNICODE_REPLACEMENT_CHARACTER) bytes.add(b);
+  }
+
+  /**
+   * Returns a validated version of the current UTF-8 string by replacing 
invalid UTF-8 sequences
+   * with the Unicode replacement character (U+FFFD), as per the rules defined 
in the Unicode
+   * standard. This behaviour is consistent with the behaviour of 
`UnicodeString` in ICU4C.
+   *
+   * @return A new UTF8String that is a valid UTF8 byte sequence.
+   */
+  public UTF8String makeValidUTF8() {
+    ArrayList<Byte> bytes = new ArrayList<>();
+    int byteIndex = 0;
+    byteIteration:
+    while (byteIndex < numBytes) {
+      // Read the first byte.
+      byte firstByte = getByte(byteIndex);
+      int expectedLen = bytesOfCodePointInUTF8[firstByte & 0xFF];
+      int codePointLen = Math.min(expectedLen, numBytes - byteIndex);
+      // 0B UTF-8 sequence (invalid first byte).
+      if (codePointLen == 0) {
+        appendReplacementCharacter(bytes);
+        byteIndex += 1;
+        continue;
+      }
+      // 1B UTF-8 sequence (ASCII or invalid).
+      if (codePointLen == 1) {
+        if (firstByte >= 0) bytes.add(firstByte);
+        else appendReplacementCharacter(bytes);
+        byteIndex += 1;
+        continue;
+      }
+      // Read the second byte.
+      byte secondByte = getByte(byteIndex + 1);
+      if (!isValidSecondByte(secondByte, firstByte)) {
+        appendReplacementCharacter(bytes);
+        byteIndex += 1;
+        continue;
+      }
+      // Read remaining continuation bytes.
+      int continuationBytes = 2;
+      for (; continuationBytes < codePointLen; ++continuationBytes) {
+        byte nextByte = getByte(byteIndex + continuationBytes);
+        if (!isValidContinuationByte(nextByte)) {
+          break;
+        }
+      }
+      // Invalid UTF-8 sequence (not enough continuation bytes).
+      if (continuationBytes < expectedLen) {
+        appendReplacementCharacter(bytes);
+        byteIndex += continuationBytes;
+        continue;
+      }
+      // Valid UTF-8 sequence.
+      for (int i = 0; i < codePointLen; ++i) {
+        bytes.add(getByte(byteIndex + i));
+      }
+      byteIndex += codePointLen;
+    }
+    return UTF8String.fromBytes(bytes);
+  }
+
+  /**
+   * Checks if the current UTF8String is valid.
+   *
+   * @return If string represents a valid UTF8 byte sequence.

Review Comment:
   ```suggestion
      * @return If string represents a valid UTF8 string.
   ```



##########
common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java:
##########
@@ -270,6 +279,123 @@ public byte[] getBytes() {
     }
   }
 
+  /**
+   * Utility methods and constants for UTF-8 string validation.
+   */
+
+  private static boolean isValidContinuationByte(byte b) {
+     return (byte) 0x80 <= b && b <= (byte) 0xBF;
+  }
+
+  private static boolean isValidSecondByte(byte b, byte firstByte) {
+    return switch (firstByte) {
+      case (byte) 0xE0 -> (byte) 0xA0 <= b && b <= (byte) 0xBF;
+      case (byte) 0xED -> (byte) 0x80 <= b && b <= (byte) 0x9F;
+      case (byte) 0xF0 -> (byte) 0x90 <= b && b <= (byte) 0xBF;
+      case (byte) 0xF4 -> (byte) 0x80 <= b && b <= (byte) 0x8F;
+      default -> isValidContinuationByte(b);
+    };
+  }
+
+  private static final byte[] UNICODE_REPLACEMENT_CHARACTER =
+    new byte[] { (byte) 0xEF, (byte) 0xBF, (byte) 0xBD };
+
+  private static void appendReplacementCharacter(ArrayList<Byte> bytes) {
+    for (byte b : UTF8String.UNICODE_REPLACEMENT_CHARACTER) bytes.add(b);
+  }
+
+  /**
+   * Returns a validated version of the current UTF-8 string by replacing 
invalid UTF-8 sequences
+   * with the Unicode replacement character (U+FFFD), as per the rules defined 
in the Unicode
+   * standard. This behaviour is consistent with the behaviour of 
`UnicodeString` in ICU4C.
+   *
+   * @return A new UTF8String that is a valid UTF8 byte sequence.
+   */
+  public UTF8String makeValidUTF8() {
+    ArrayList<Byte> bytes = new ArrayList<>();
+    int byteIndex = 0;
+    byteIteration:
+    while (byteIndex < numBytes) {
+      // Read the first byte.
+      byte firstByte = getByte(byteIndex);
+      int expectedLen = bytesOfCodePointInUTF8[firstByte & 0xFF];
+      int codePointLen = Math.min(expectedLen, numBytes - byteIndex);
+      // 0B UTF-8 sequence (invalid first byte).
+      if (codePointLen == 0) {
+        appendReplacementCharacter(bytes);
+        byteIndex += 1;
+        continue;
+      }
+      // 1B UTF-8 sequence (ASCII or invalid).
+      if (codePointLen == 1) {
+        if (firstByte >= 0) bytes.add(firstByte);
+        else appendReplacementCharacter(bytes);
+        byteIndex += 1;
+        continue;
+      }
+      // Read the second byte.
+      byte secondByte = getByte(byteIndex + 1);
+      if (!isValidSecondByte(secondByte, firstByte)) {
+        appendReplacementCharacter(bytes);
+        byteIndex += 1;
+        continue;
+      }
+      // Read remaining continuation bytes.
+      int continuationBytes = 2;
+      for (; continuationBytes < codePointLen; ++continuationBytes) {
+        byte nextByte = getByte(byteIndex + continuationBytes);
+        if (!isValidContinuationByte(nextByte)) {
+          break;
+        }
+      }
+      // Invalid UTF-8 sequence (not enough continuation bytes).
+      if (continuationBytes < expectedLen) {
+        appendReplacementCharacter(bytes);
+        byteIndex += continuationBytes;
+        continue;
+      }
+      // Valid UTF-8 sequence.
+      for (int i = 0; i < codePointLen; ++i) {
+        bytes.add(getByte(byteIndex + i));
+      }
+      byteIndex += codePointLen;
+    }
+    return UTF8String.fromBytes(bytes);
+  }
+
+  /**
+   * Checks if the current UTF8String is valid.
+   *
+   * @return If string represents a valid UTF8 byte sequence.
+   */
+  public boolean isValidUTF8() {
+    return makeValidUTF8().equals(this);
+  }
+
+  /**
+   * Returns the current string if it is a valid UTF8 byte sequence, otherwise 
throws an exception.
+   *
+   * @return The UTF8String itself if it is a valid UTF8 byte sequence.
+   */
+  public UTF8String validateUTF8() {
+    if (!isValidUTF8()) {
+      throw new IllegalArgumentException("Invalid UTF-8 string");
+    }
+    return this;
+  }
+
+  /**
+   * Returns the current string if it is a valid UTF8 byte sequence, otherwise 
returns null.
+   *
+   * @return The UTF8String itself if it is a valid UTF8 byte sequence, 
otherwise null.

Review Comment:
   ```suggestion
      * @return The UTF8String itself if it is a valid UTF8 string, otherwise 
null.
   ```
   I have made this kind of suggestion many times already. It is a matter of 
terminology for me (and the UNICODE standard follows similar terminology): a 
string is valid or invalid/illegal UTF8 string. Byte sequences refer to 
individual UTF8 characters and they can be valid/legal or invalid/illegal.



##########
common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java:
##########
@@ -270,6 +279,123 @@ public byte[] getBytes() {
     }
   }
 
+  /**
+   * Utility methods and constants for UTF-8 string validation.
+   */
+
+  private static boolean isValidContinuationByte(byte b) {
+     return (byte) 0x80 <= b && b <= (byte) 0xBF;
+  }
+
+  private static boolean isValidSecondByte(byte b, byte firstByte) {
+    return switch (firstByte) {
+      case (byte) 0xE0 -> (byte) 0xA0 <= b && b <= (byte) 0xBF;
+      case (byte) 0xED -> (byte) 0x80 <= b && b <= (byte) 0x9F;
+      case (byte) 0xF0 -> (byte) 0x90 <= b && b <= (byte) 0xBF;
+      case (byte) 0xF4 -> (byte) 0x80 <= b && b <= (byte) 0x8F;
+      default -> isValidContinuationByte(b);
+    };
+  }
+
+  private static final byte[] UNICODE_REPLACEMENT_CHARACTER =
+    new byte[] { (byte) 0xEF, (byte) 0xBF, (byte) 0xBD };
+
+  private static void appendReplacementCharacter(ArrayList<Byte> bytes) {
+    for (byte b : UTF8String.UNICODE_REPLACEMENT_CHARACTER) bytes.add(b);
+  }
+
+  /**
+   * Returns a validated version of the current UTF-8 string by replacing 
invalid UTF-8 sequences
+   * with the Unicode replacement character (U+FFFD), as per the rules defined 
in the Unicode
+   * standard. This behaviour is consistent with the behaviour of 
`UnicodeString` in ICU4C.
+   *
+   * @return A new UTF8String that is a valid UTF8 byte sequence.
+   */
+  public UTF8String makeValidUTF8() {
+    ArrayList<Byte> bytes = new ArrayList<>();
+    int byteIndex = 0;
+    byteIteration:
+    while (byteIndex < numBytes) {
+      // Read the first byte.
+      byte firstByte = getByte(byteIndex);
+      int expectedLen = bytesOfCodePointInUTF8[firstByte & 0xFF];
+      int codePointLen = Math.min(expectedLen, numBytes - byteIndex);
+      // 0B UTF-8 sequence (invalid first byte).
+      if (codePointLen == 0) {
+        appendReplacementCharacter(bytes);
+        byteIndex += 1;
+        continue;
+      }
+      // 1B UTF-8 sequence (ASCII or invalid).
+      if (codePointLen == 1) {
+        if (firstByte >= 0) bytes.add(firstByte);
+        else appendReplacementCharacter(bytes);
+        byteIndex += 1;
+        continue;
+      }
+      // Read the second byte.
+      byte secondByte = getByte(byteIndex + 1);
+      if (!isValidSecondByte(secondByte, firstByte)) {
+        appendReplacementCharacter(bytes);
+        byteIndex += 1;
+        continue;
+      }
+      // Read remaining continuation bytes.
+      int continuationBytes = 2;
+      for (; continuationBytes < codePointLen; ++continuationBytes) {
+        byte nextByte = getByte(byteIndex + continuationBytes);
+        if (!isValidContinuationByte(nextByte)) {
+          break;
+        }
+      }
+      // Invalid UTF-8 sequence (not enough continuation bytes).
+      if (continuationBytes < expectedLen) {
+        appendReplacementCharacter(bytes);
+        byteIndex += continuationBytes;
+        continue;
+      }
+      // Valid UTF-8 sequence.
+      for (int i = 0; i < codePointLen; ++i) {
+        bytes.add(getByte(byteIndex + i));
+      }
+      byteIndex += codePointLen;
+    }
+    return UTF8String.fromBytes(bytes);
+  }
+
+  /**
+   * Checks if the current UTF8String is valid.
+   *
+   * @return If string represents a valid UTF8 byte sequence.
+   */
+  public boolean isValidUTF8() {
+    return makeValidUTF8().equals(this);
+  }
+
+  /**
+   * Returns the current string if it is a valid UTF8 byte sequence, otherwise 
throws an exception.
+   *
+   * @return The UTF8String itself if it is a valid UTF8 byte sequence.
+   */
+  public UTF8String validateUTF8() {
+    if (!isValidUTF8()) {
+      throw new IllegalArgumentException("Invalid UTF-8 string");
+    }
+    return this;
+  }
+
+  /**
+   * Returns the current string if it is a valid UTF8 byte sequence, otherwise 
returns null.
+   *
+   * @return The UTF8String itself if it is a valid UTF8 byte sequence, 
otherwise null.
+   */
+  public UTF8String tryValidateUTF8() {
+    if (!isValidUTF8()) {
+      return null;
+    }

Review Comment:
   ```suggestion
       if (!isValidUTF8()) return null;
   ```
   One line?



##########
common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java:
##########
@@ -270,6 +279,123 @@ public byte[] getBytes() {
     }
   }
 
+  /**
+   * Utility methods and constants for UTF-8 string validation.
+   */
+
+  private static boolean isValidContinuationByte(byte b) {
+     return (byte) 0x80 <= b && b <= (byte) 0xBF;
+  }
+
+  private static boolean isValidSecondByte(byte b, byte firstByte) {
+    return switch (firstByte) {
+      case (byte) 0xE0 -> (byte) 0xA0 <= b && b <= (byte) 0xBF;
+      case (byte) 0xED -> (byte) 0x80 <= b && b <= (byte) 0x9F;
+      case (byte) 0xF0 -> (byte) 0x90 <= b && b <= (byte) 0xBF;
+      case (byte) 0xF4 -> (byte) 0x80 <= b && b <= (byte) 0x8F;
+      default -> isValidContinuationByte(b);
+    };
+  }
+
+  private static final byte[] UNICODE_REPLACEMENT_CHARACTER =
+    new byte[] { (byte) 0xEF, (byte) 0xBF, (byte) 0xBD };
+
+  private static void appendReplacementCharacter(ArrayList<Byte> bytes) {
+    for (byte b : UTF8String.UNICODE_REPLACEMENT_CHARACTER) bytes.add(b);
+  }
+
+  /**
+   * Returns a validated version of the current UTF-8 string by replacing 
invalid UTF-8 sequences
+   * with the Unicode replacement character (U+FFFD), as per the rules defined 
in the Unicode
+   * standard. This behaviour is consistent with the behaviour of 
`UnicodeString` in ICU4C.
+   *
+   * @return A new UTF8String that is a valid UTF8 byte sequence.
+   */
+  public UTF8String makeValidUTF8() {
+    ArrayList<Byte> bytes = new ArrayList<>();
+    int byteIndex = 0;
+    byteIteration:
+    while (byteIndex < numBytes) {
+      // Read the first byte.
+      byte firstByte = getByte(byteIndex);
+      int expectedLen = bytesOfCodePointInUTF8[firstByte & 0xFF];
+      int codePointLen = Math.min(expectedLen, numBytes - byteIndex);
+      // 0B UTF-8 sequence (invalid first byte).
+      if (codePointLen == 0) {
+        appendReplacementCharacter(bytes);
+        byteIndex += 1;
+        continue;
+      }
+      // 1B UTF-8 sequence (ASCII or invalid).
+      if (codePointLen == 1) {
+        if (firstByte >= 0) bytes.add(firstByte);
+        else appendReplacementCharacter(bytes);
+        byteIndex += 1;
+        continue;
+      }
+      // Read the second byte.
+      byte secondByte = getByte(byteIndex + 1);
+      if (!isValidSecondByte(secondByte, firstByte)) {
+        appendReplacementCharacter(bytes);
+        byteIndex += 1;
+        continue;
+      }
+      // Read remaining continuation bytes.
+      int continuationBytes = 2;
+      for (; continuationBytes < codePointLen; ++continuationBytes) {
+        byte nextByte = getByte(byteIndex + continuationBytes);
+        if (!isValidContinuationByte(nextByte)) {
+          break;
+        }
+      }
+      // Invalid UTF-8 sequence (not enough continuation bytes).
+      if (continuationBytes < expectedLen) {
+        appendReplacementCharacter(bytes);
+        byteIndex += continuationBytes;
+        continue;
+      }
+      // Valid UTF-8 sequence.
+      for (int i = 0; i < codePointLen; ++i) {
+        bytes.add(getByte(byteIndex + i));
+      }
+      byteIndex += codePointLen;
+    }
+    return UTF8String.fromBytes(bytes);
+  }
+
+  /**
+   * Checks if the current UTF8String is valid.
+   *
+   * @return If string represents a valid UTF8 byte sequence.
+   */
+  public boolean isValidUTF8() {
+    return makeValidUTF8().equals(this);
+  }
+
+  /**
+   * Returns the current string if it is a valid UTF8 byte sequence, otherwise 
throws an exception.
+   *
+   * @return The UTF8String itself if it is a valid UTF8 byte sequence.
+   */
+  public UTF8String validateUTF8() {
+    if (!isValidUTF8()) {
+      throw new IllegalArgumentException("Invalid UTF-8 string");
+    }
+    return this;
+  }
+
+  /**
+   * Returns the current string if it is a valid UTF8 byte sequence, otherwise 
returns null.

Review Comment:
   ```suggestion
      * Returns the current string if it is a valid UTF8 string, otherwise 
returns null.
   ```



##########
common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java:
##########
@@ -270,6 +279,123 @@ public byte[] getBytes() {
     }
   }
 
+  /**
+   * Utility methods and constants for UTF-8 string validation.
+   */
+
+  private static boolean isValidContinuationByte(byte b) {
+     return (byte) 0x80 <= b && b <= (byte) 0xBF;
+  }
+
+  private static boolean isValidSecondByte(byte b, byte firstByte) {
+    return switch (firstByte) {
+      case (byte) 0xE0 -> (byte) 0xA0 <= b && b <= (byte) 0xBF;
+      case (byte) 0xED -> (byte) 0x80 <= b && b <= (byte) 0x9F;
+      case (byte) 0xF0 -> (byte) 0x90 <= b && b <= (byte) 0xBF;
+      case (byte) 0xF4 -> (byte) 0x80 <= b && b <= (byte) 0x8F;
+      default -> isValidContinuationByte(b);
+    };
+  }
+
+  private static final byte[] UNICODE_REPLACEMENT_CHARACTER =
+    new byte[] { (byte) 0xEF, (byte) 0xBF, (byte) 0xBD };
+
+  private static void appendReplacementCharacter(ArrayList<Byte> bytes) {
+    for (byte b : UTF8String.UNICODE_REPLACEMENT_CHARACTER) bytes.add(b);
+  }
+
+  /**
+   * Returns a validated version of the current UTF-8 string by replacing 
invalid UTF-8 sequences
+   * with the Unicode replacement character (U+FFFD), as per the rules defined 
in the Unicode
+   * standard. This behaviour is consistent with the behaviour of 
`UnicodeString` in ICU4C.
+   *
+   * @return A new UTF8String that is a valid UTF8 byte sequence.
+   */
+  public UTF8String makeValidUTF8() {
+    ArrayList<Byte> bytes = new ArrayList<>();
+    int byteIndex = 0;
+    byteIteration:
+    while (byteIndex < numBytes) {
+      // Read the first byte.
+      byte firstByte = getByte(byteIndex);
+      int expectedLen = bytesOfCodePointInUTF8[firstByte & 0xFF];
+      int codePointLen = Math.min(expectedLen, numBytes - byteIndex);
+      // 0B UTF-8 sequence (invalid first byte).
+      if (codePointLen == 0) {
+        appendReplacementCharacter(bytes);
+        byteIndex += 1;
+        continue;
+      }
+      // 1B UTF-8 sequence (ASCII or invalid).
+      if (codePointLen == 1) {
+        if (firstByte >= 0) bytes.add(firstByte);
+        else appendReplacementCharacter(bytes);
+        byteIndex += 1;
+        continue;
+      }
+      // Read the second byte.
+      byte secondByte = getByte(byteIndex + 1);
+      if (!isValidSecondByte(secondByte, firstByte)) {
+        appendReplacementCharacter(bytes);
+        byteIndex += 1;
+        continue;
+      }
+      // Read remaining continuation bytes.
+      int continuationBytes = 2;
+      for (; continuationBytes < codePointLen; ++continuationBytes) {
+        byte nextByte = getByte(byteIndex + continuationBytes);
+        if (!isValidContinuationByte(nextByte)) {
+          break;
+        }
+      }
+      // Invalid UTF-8 sequence (not enough continuation bytes).
+      if (continuationBytes < expectedLen) {
+        appendReplacementCharacter(bytes);
+        byteIndex += continuationBytes;
+        continue;
+      }
+      // Valid UTF-8 sequence.
+      for (int i = 0; i < codePointLen; ++i) {
+        bytes.add(getByte(byteIndex + i));
+      }
+      byteIndex += codePointLen;
+    }
+    return UTF8String.fromBytes(bytes);
+  }
+
+  /**
+   * Checks if the current UTF8String is valid.
+   *
+   * @return If string represents a valid UTF8 byte sequence.
+   */
+  public boolean isValidUTF8() {
+    return makeValidUTF8().equals(this);
+  }
+
+  /**
+   * Returns the current string if it is a valid UTF8 byte sequence, otherwise 
throws an exception.
+   *
+   * @return The UTF8String itself if it is a valid UTF8 byte sequence.
+   */
+  public UTF8String validateUTF8() {
+    if (!isValidUTF8()) {
+      throw new IllegalArgumentException("Invalid UTF-8 string");

Review Comment:
   Why are we adding this? Where is this going to be useful?
   If the prospect is a SQL `validate_utf8` expression then we better implement 
it using `isValid()` and by throwing a proper error class.
   I don't think this method is useful, but I may be wrong...



##########
common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java:
##########
@@ -270,6 +279,123 @@ public byte[] getBytes() {
     }
   }
 
+  /**
+   * Utility methods and constants for UTF-8 string validation.
+   */
+
+  private static boolean isValidContinuationByte(byte b) {
+     return (byte) 0x80 <= b && b <= (byte) 0xBF;
+  }
+
+  private static boolean isValidSecondByte(byte b, byte firstByte) {
+    return switch (firstByte) {
+      case (byte) 0xE0 -> (byte) 0xA0 <= b && b <= (byte) 0xBF;
+      case (byte) 0xED -> (byte) 0x80 <= b && b <= (byte) 0x9F;
+      case (byte) 0xF0 -> (byte) 0x90 <= b && b <= (byte) 0xBF;
+      case (byte) 0xF4 -> (byte) 0x80 <= b && b <= (byte) 0x8F;
+      default -> isValidContinuationByte(b);
+    };
+  }
+
+  private static final byte[] UNICODE_REPLACEMENT_CHARACTER =
+    new byte[] { (byte) 0xEF, (byte) 0xBF, (byte) 0xBD };
+
+  private static void appendReplacementCharacter(ArrayList<Byte> bytes) {
+    for (byte b : UTF8String.UNICODE_REPLACEMENT_CHARACTER) bytes.add(b);
+  }
+
+  /**
+   * Returns a validated version of the current UTF-8 string by replacing 
invalid UTF-8 sequences
+   * with the Unicode replacement character (U+FFFD), as per the rules defined 
in the Unicode
+   * standard. This behaviour is consistent with the behaviour of 
`UnicodeString` in ICU4C.
+   *
+   * @return A new UTF8String that is a valid UTF8 byte sequence.
+   */
+  public UTF8String makeValidUTF8() {

Review Comment:
   ```suggestion
     public UTF8String makeValid() {
   ```
   We are inside the `UTF8String` class. I don't think the "UTF8" suffix is 
necessary.



##########
common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java:
##########
@@ -270,6 +279,123 @@ public byte[] getBytes() {
     }
   }
 
+  /**
+   * Utility methods and constants for UTF-8 string validation.
+   */
+
+  private static boolean isValidContinuationByte(byte b) {
+     return (byte) 0x80 <= b && b <= (byte) 0xBF;
+  }
+
+  private static boolean isValidSecondByte(byte b, byte firstByte) {
+    return switch (firstByte) {
+      case (byte) 0xE0 -> (byte) 0xA0 <= b && b <= (byte) 0xBF;
+      case (byte) 0xED -> (byte) 0x80 <= b && b <= (byte) 0x9F;
+      case (byte) 0xF0 -> (byte) 0x90 <= b && b <= (byte) 0xBF;
+      case (byte) 0xF4 -> (byte) 0x80 <= b && b <= (byte) 0x8F;
+      default -> isValidContinuationByte(b);
+    };
+  }
+
+  private static final byte[] UNICODE_REPLACEMENT_CHARACTER =
+    new byte[] { (byte) 0xEF, (byte) 0xBF, (byte) 0xBD };
+
+  private static void appendReplacementCharacter(ArrayList<Byte> bytes) {
+    for (byte b : UTF8String.UNICODE_REPLACEMENT_CHARACTER) bytes.add(b);
+  }
+
+  /**
+   * Returns a validated version of the current UTF-8 string by replacing 
invalid UTF-8 sequences
+   * with the Unicode replacement character (U+FFFD), as per the rules defined 
in the Unicode
+   * standard. This behaviour is consistent with the behaviour of 
`UnicodeString` in ICU4C.
+   *
+   * @return A new UTF8String that is a valid UTF8 byte sequence.
+   */
+  public UTF8String makeValidUTF8() {
+    ArrayList<Byte> bytes = new ArrayList<>();
+    int byteIndex = 0;
+    byteIteration:
+    while (byteIndex < numBytes) {
+      // Read the first byte.
+      byte firstByte = getByte(byteIndex);
+      int expectedLen = bytesOfCodePointInUTF8[firstByte & 0xFF];
+      int codePointLen = Math.min(expectedLen, numBytes - byteIndex);
+      // 0B UTF-8 sequence (invalid first byte).
+      if (codePointLen == 0) {
+        appendReplacementCharacter(bytes);
+        byteIndex += 1;
+        continue;
+      }
+      // 1B UTF-8 sequence (ASCII or invalid).
+      if (codePointLen == 1) {
+        if (firstByte >= 0) bytes.add(firstByte);
+        else appendReplacementCharacter(bytes);
+        byteIndex += 1;
+        continue;
+      }
+      // Read the second byte.
+      byte secondByte = getByte(byteIndex + 1);
+      if (!isValidSecondByte(secondByte, firstByte)) {
+        appendReplacementCharacter(bytes);
+        byteIndex += 1;
+        continue;
+      }
+      // Read remaining continuation bytes.
+      int continuationBytes = 2;
+      for (; continuationBytes < codePointLen; ++continuationBytes) {
+        byte nextByte = getByte(byteIndex + continuationBytes);
+        if (!isValidContinuationByte(nextByte)) {
+          break;
+        }
+      }
+      // Invalid UTF-8 sequence (not enough continuation bytes).
+      if (continuationBytes < expectedLen) {
+        appendReplacementCharacter(bytes);
+        byteIndex += continuationBytes;
+        continue;
+      }
+      // Valid UTF-8 sequence.
+      for (int i = 0; i < codePointLen; ++i) {
+        bytes.add(getByte(byteIndex + i));
+      }
+      byteIndex += codePointLen;
+    }
+    return UTF8String.fromBytes(bytes);
+  }
+
+  /**
+   * Checks if the current UTF8String is valid.
+   *
+   * @return If string represents a valid UTF8 byte sequence.
+   */
+  public boolean isValidUTF8() {
+    return makeValidUTF8().equals(this);
+  }
+
+  /**
+   * Returns the current string if it is a valid UTF8 byte sequence, otherwise 
throws an exception.
+   *
+   * @return The UTF8String itself if it is a valid UTF8 byte sequence.

Review Comment:
   ```suggestion
      * @return The UTF8String itself if it is a valid UTF8 string.
   ```



##########
common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java:
##########
@@ -270,6 +279,123 @@ public byte[] getBytes() {
     }
   }
 
+  /**
+   * Utility methods and constants for UTF-8 string validation.
+   */
+
+  private static boolean isValidContinuationByte(byte b) {
+     return (byte) 0x80 <= b && b <= (byte) 0xBF;
+  }
+
+  private static boolean isValidSecondByte(byte b, byte firstByte) {
+    return switch (firstByte) {
+      case (byte) 0xE0 -> (byte) 0xA0 <= b && b <= (byte) 0xBF;
+      case (byte) 0xED -> (byte) 0x80 <= b && b <= (byte) 0x9F;
+      case (byte) 0xF0 -> (byte) 0x90 <= b && b <= (byte) 0xBF;
+      case (byte) 0xF4 -> (byte) 0x80 <= b && b <= (byte) 0x8F;
+      default -> isValidContinuationByte(b);
+    };
+  }
+
+  private static final byte[] UNICODE_REPLACEMENT_CHARACTER =
+    new byte[] { (byte) 0xEF, (byte) 0xBF, (byte) 0xBD };
+
+  private static void appendReplacementCharacter(ArrayList<Byte> bytes) {
+    for (byte b : UTF8String.UNICODE_REPLACEMENT_CHARACTER) bytes.add(b);
+  }
+
+  /**
+   * Returns a validated version of the current UTF-8 string by replacing 
invalid UTF-8 sequences
+   * with the Unicode replacement character (U+FFFD), as per the rules defined 
in the Unicode
+   * standard. This behaviour is consistent with the behaviour of 
`UnicodeString` in ICU4C.
+   *
+   * @return A new UTF8String that is a valid UTF8 byte sequence.
+   */
+  public UTF8String makeValidUTF8() {
+    ArrayList<Byte> bytes = new ArrayList<>();
+    int byteIndex = 0;
+    byteIteration:
+    while (byteIndex < numBytes) {
+      // Read the first byte.
+      byte firstByte = getByte(byteIndex);
+      int expectedLen = bytesOfCodePointInUTF8[firstByte & 0xFF];
+      int codePointLen = Math.min(expectedLen, numBytes - byteIndex);
+      // 0B UTF-8 sequence (invalid first byte).
+      if (codePointLen == 0) {
+        appendReplacementCharacter(bytes);
+        byteIndex += 1;
+        continue;
+      }
+      // 1B UTF-8 sequence (ASCII or invalid).
+      if (codePointLen == 1) {
+        if (firstByte >= 0) bytes.add(firstByte);
+        else appendReplacementCharacter(bytes);
+        byteIndex += 1;
+        continue;
+      }
+      // Read the second byte.
+      byte secondByte = getByte(byteIndex + 1);
+      if (!isValidSecondByte(secondByte, firstByte)) {
+        appendReplacementCharacter(bytes);
+        byteIndex += 1;
+        continue;
+      }
+      // Read remaining continuation bytes.
+      int continuationBytes = 2;
+      for (; continuationBytes < codePointLen; ++continuationBytes) {
+        byte nextByte = getByte(byteIndex + continuationBytes);
+        if (!isValidContinuationByte(nextByte)) {
+          break;
+        }
+      }
+      // Invalid UTF-8 sequence (not enough continuation bytes).
+      if (continuationBytes < expectedLen) {
+        appendReplacementCharacter(bytes);
+        byteIndex += continuationBytes;
+        continue;
+      }
+      // Valid UTF-8 sequence.
+      for (int i = 0; i < codePointLen; ++i) {
+        bytes.add(getByte(byteIndex + i));
+      }
+      byteIndex += codePointLen;
+    }
+    return UTF8String.fromBytes(bytes);
+  }
+
+  /**
+   * Checks if the current UTF8String is valid.
+   *
+   * @return If string represents a valid UTF8 byte sequence.
+   */
+  public boolean isValidUTF8() {
+    return makeValidUTF8().equals(this);
+  }

Review Comment:
   This is not as efficient as it could be because we are constructing the 
valid version of the string and then we compare. It should be very easy to 
implement this directly:
   ```java
     public UTF8String isValid() {
       int byteIndex = 0;
       while (byteIndex < numBytes) {
         // Read the first byte.
         byte firstByte = getByte(byteIndex);
         int expectedLen = bytesOfCodePointInUTF8[firstByte & 0xFF];
         int codePointLen = Math.min(expectedLen, numBytes - byteIndex);
         // 0B UTF-8 sequence (invalid first byte).
         if (codePointLen == 0) return false;
         if (codePointLen == 1) {
           ++byteIndex;
           continue;
         }
         // Read the second byte.
         byte secondByte = getByte(byteIndex + 1);
         if (!isValidSecondByte(secondByte, firstByte)) return false;
         // Read remaining continuation bytes.
         int continuationBytes = 2;
         for (; continuationBytes < codePointLen; ++continuationBytes) {
           byte nextByte = getByte(byteIndex + continuationBytes);
           if (!isValidContinuationByte(nextByte)) return false;
         }
         // Invalid UTF-8 sequence (not enough continuation bytes).
         if (continuationBytes < expectedLen) return false;
         byteIndex += codePointLen;
       }
       return true;
     }
   ```



##########
common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java:
##########
@@ -270,6 +279,123 @@ public byte[] getBytes() {
     }
   }
 
+  /**
+   * Utility methods and constants for UTF-8 string validation.
+   */
+
+  private static boolean isValidContinuationByte(byte b) {
+     return (byte) 0x80 <= b && b <= (byte) 0xBF;
+  }
+
+  private static boolean isValidSecondByte(byte b, byte firstByte) {
+    return switch (firstByte) {
+      case (byte) 0xE0 -> (byte) 0xA0 <= b && b <= (byte) 0xBF;
+      case (byte) 0xED -> (byte) 0x80 <= b && b <= (byte) 0x9F;
+      case (byte) 0xF0 -> (byte) 0x90 <= b && b <= (byte) 0xBF;
+      case (byte) 0xF4 -> (byte) 0x80 <= b && b <= (byte) 0x8F;
+      default -> isValidContinuationByte(b);
+    };
+  }
+
+  private static final byte[] UNICODE_REPLACEMENT_CHARACTER =
+    new byte[] { (byte) 0xEF, (byte) 0xBF, (byte) 0xBD };
+
+  private static void appendReplacementCharacter(ArrayList<Byte> bytes) {
+    for (byte b : UTF8String.UNICODE_REPLACEMENT_CHARACTER) bytes.add(b);
+  }
+
+  /**
+   * Returns a validated version of the current UTF-8 string by replacing 
invalid UTF-8 sequences
+   * with the Unicode replacement character (U+FFFD), as per the rules defined 
in the Unicode
+   * standard. This behaviour is consistent with the behaviour of 
`UnicodeString` in ICU4C.
+   *
+   * @return A new UTF8String that is a valid UTF8 byte sequence.
+   */
+  public UTF8String makeValidUTF8() {
+    ArrayList<Byte> bytes = new ArrayList<>();
+    int byteIndex = 0;
+    byteIteration:
+    while (byteIndex < numBytes) {
+      // Read the first byte.
+      byte firstByte = getByte(byteIndex);
+      int expectedLen = bytesOfCodePointInUTF8[firstByte & 0xFF];
+      int codePointLen = Math.min(expectedLen, numBytes - byteIndex);
+      // 0B UTF-8 sequence (invalid first byte).
+      if (codePointLen == 0) {
+        appendReplacementCharacter(bytes);
+        byteIndex += 1;

Review Comment:
   ```suggestion
           ++byteIndex;
   ```
   Nit.
   Same below instead of `byteIndex += 1`.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Re: [PR] [WIP][SQL] Invalid UTF-8 byte sequence replacement [spark]

Reply via email to