mikemccand commented on code in PR #12320:
URL: https://github.com/apache/lucene/pull/12320#discussion_r1206892745
##########
lucene/core/src/java/org/apache/lucene/util/UnicodeUtil.java:
##########
@@ -477,38 +477,60 @@ public static int UTF8toUTF32(final BytesRef utf8, final
int[] ints) {
int utf8Upto = utf8.offset;
final byte[] bytes = utf8.bytes;
final int utf8Limit = utf8.offset + utf8.length;
+ UTF8CodePoint reuse = null;
while (utf8Upto < utf8Limit) {
- final int numBytes = utf8CodeLength[bytes[utf8Upto] & 0xFF];
- int v = 0;
- switch (numBytes) {
- case 1:
- ints[utf32Count++] = bytes[utf8Upto++];
- continue;
- case 2:
- // 5 useful bits
- v = bytes[utf8Upto++] & 31;
- break;
- case 3:
- // 4 useful bits
- v = bytes[utf8Upto++] & 15;
- break;
- case 4:
- // 3 useful bits
- v = bytes[utf8Upto++] & 7;
- break;
- default:
- throw new IllegalArgumentException("invalid utf8");
- }
+ reuse = codePointAt(bytes, utf8Upto, reuse);
+ ints[utf32Count++] = reuse.codePoint;
+ utf8Upto += reuse.codePointBytes;
+ }
- // TODO: this may read past utf8's limit.
- final int limit = utf8Upto + numBytes - 1;
- while (utf8Upto < limit) {
- v = v << 6 | bytes[utf8Upto++] & 63;
+ return utf32Count;
+ }
+
+ /**
+ * Computes the codepoint and codepoint length (in bytes) of the specified
{@code offset} in the
+ * provided {@code utf8} byte array, assuming UTF8 encoding. As with other
related methods in this
+ * class, this assumes valid UTF8 input and <strong>does not
perform</strong> full UTF8
+ * validation.
+ *
+ * @throws IllegalArgumentException If invalid codepoint header byte occurs
or the content is
Review Comment:
I think we may also throw `ArrayIndexOutOfBoundsException` on a `byte[]` that
is badly malformed (not UTF-8 at all)? The `utf8CodeLength` array is I think
length 248 (256 - 8). Also, it has a bunch of `v` in it, which I think are
invalid UTF-8 first bytes, which should throw the `IllegalArgumentException`.
Maybe either catch the AIOOBE and rethrow as IAE, or, soften the statement
to say "throws various exceptions on invalid UTF-8, or, if the provided pos is
NOT the start of a Unicode character". I don't think we want to promise we
will always detect invalid UTF-8 and throw a clean exception.
##########
lucene/core/src/java/org/apache/lucene/util/UnicodeUtil.java:
##########
@@ -477,38 +477,60 @@ public static int UTF8toUTF32(final BytesRef utf8, final
int[] ints) {
int utf8Upto = utf8.offset;
final byte[] bytes = utf8.bytes;
final int utf8Limit = utf8.offset + utf8.length;
+ UTF8CodePoint reuse = null;
while (utf8Upto < utf8Limit) {
- final int numBytes = utf8CodeLength[bytes[utf8Upto] & 0xFF];
- int v = 0;
- switch (numBytes) {
- case 1:
- ints[utf32Count++] = bytes[utf8Upto++];
- continue;
- case 2:
- // 5 useful bits
- v = bytes[utf8Upto++] & 31;
- break;
- case 3:
- // 4 useful bits
- v = bytes[utf8Upto++] & 15;
- break;
- case 4:
- // 3 useful bits
- v = bytes[utf8Upto++] & 7;
- break;
- default:
- throw new IllegalArgumentException("invalid utf8");
- }
+ reuse = codePointAt(bytes, utf8Upto, reuse);
+ ints[utf32Count++] = reuse.codePoint;
+ utf8Upto += reuse.codePointBytes;
+ }
- // TODO: this may read past utf8's limit.
- final int limit = utf8Upto + numBytes - 1;
- while (utf8Upto < limit) {
- v = v << 6 | bytes[utf8Upto++] & 63;
+ return utf32Count;
+ }
+
+ /**
+ * Computes the codepoint and codepoint length (in bytes) of the specified
{@code offset} in the
+ * provided {@code utf8} byte array, assuming UTF8 encoding. As with other
related methods in this
+ * class, this assumes valid UTF8 input and <strong>does not
perform</strong> full UTF8
+ * validation.
+ *
+ * @throws IllegalArgumentException If invalid codepoint header byte occurs
or the content is
+ * prematurely truncated.
+ */
+ public static UTF8CodePoint codePointAt(byte[] utf8, int pos, UTF8CodePoint
reuse) {
+ if (reuse == null) {
+ reuse = new UTF8CodePoint();
+ }
+
+ int leadByte = utf8[pos] & 0xFF;
+ int numBytes = utf8CodeLength[leadByte];
+ reuse.codePointBytes = numBytes;
+ int v;
+ switch (numBytes) {
+ case 1 -> {
+ reuse.codePoint = leadByte;
+ return reuse;
}
- ints[utf32Count++] = v;
+ case 2 -> v = leadByte & 31; // 5 useful bits
+ case 3 -> v = leadByte & 15; // 4 useful bits
+ case 4 -> v = leadByte & 7; // 3 useful bits
+ default -> throw new IllegalArgumentException("invalid utf8");
Review Comment:
Maybe include the `Arrays.toString(utf8)` and `pos` in the exception
message? Or perhaps just the fragment where the malformed UTF-8 started
(`utf8[pos:]` in Python syntax).
##########
lucene/core/src/java/org/apache/lucene/util/UnicodeUtil.java:
##########
@@ -477,38 +477,60 @@ public static int UTF8toUTF32(final BytesRef utf8, final
int[] ints) {
int utf8Upto = utf8.offset;
final byte[] bytes = utf8.bytes;
final int utf8Limit = utf8.offset + utf8.length;
+ UTF8CodePoint reuse = null;
while (utf8Upto < utf8Limit) {
- final int numBytes = utf8CodeLength[bytes[utf8Upto] & 0xFF];
- int v = 0;
- switch (numBytes) {
- case 1:
- ints[utf32Count++] = bytes[utf8Upto++];
- continue;
- case 2:
- // 5 useful bits
- v = bytes[utf8Upto++] & 31;
- break;
- case 3:
- // 4 useful bits
- v = bytes[utf8Upto++] & 15;
- break;
- case 4:
- // 3 useful bits
- v = bytes[utf8Upto++] & 7;
- break;
- default:
- throw new IllegalArgumentException("invalid utf8");
- }
+ reuse = codePointAt(bytes, utf8Upto, reuse);
+ ints[utf32Count++] = reuse.codePoint;
+ utf8Upto += reuse.codePointBytes;
+ }
- // TODO: this may read past utf8's limit.
- final int limit = utf8Upto + numBytes - 1;
- while (utf8Upto < limit) {
- v = v << 6 | bytes[utf8Upto++] & 63;
+ return utf32Count;
+ }
+
+ /**
+ * Computes the codepoint and codepoint length (in bytes) of the specified
{@code offset} in the
+ * provided {@code utf8} byte array, assuming UTF8 encoding. As with other
related methods in this
+ * class, this assumes valid UTF8 input and <strong>does not
perform</strong> full UTF8
+ * validation.
+ *
+ * @throws IllegalArgumentException If invalid codepoint header byte occurs
or the content is
+ * prematurely truncated.
+ */
+ public static UTF8CodePoint codePointAt(byte[] utf8, int pos, UTF8CodePoint
reuse) {
+ if (reuse == null) {
+ reuse = new UTF8CodePoint();
+ }
+
+ int leadByte = utf8[pos] & 0xFF;
+ int numBytes = utf8CodeLength[leadByte];
+ reuse.codePointBytes = numBytes;
+ int v;
+ switch (numBytes) {
+ case 1 -> {
+ reuse.codePoint = leadByte;
+ return reuse;
}
- ints[utf32Count++] = v;
+ case 2 -> v = leadByte & 31; // 5 useful bits
+ case 3 -> v = leadByte & 15; // 4 useful bits
+ case 4 -> v = leadByte & 7; // 3 useful bits
+ default -> throw new IllegalArgumentException("invalid utf8");
}
- return utf32Count;
+ // TODO: this may read past utf8's limit.
Review Comment:
Ahh yes, another `AIOOBE` case. I think it's fine if we throw whatever
exception happens to arise when you pass invalid UTF-8.
##########
lucene/core/src/java/org/apache/lucene/util/UnicodeUtil.java:
##########
@@ -477,38 +477,60 @@ public static int UTF8toUTF32(final BytesRef utf8, final
int[] ints) {
int utf8Upto = utf8.offset;
final byte[] bytes = utf8.bytes;
final int utf8Limit = utf8.offset + utf8.length;
+ UTF8CodePoint reuse = null;
while (utf8Upto < utf8Limit) {
- final int numBytes = utf8CodeLength[bytes[utf8Upto] & 0xFF];
- int v = 0;
- switch (numBytes) {
- case 1:
- ints[utf32Count++] = bytes[utf8Upto++];
- continue;
- case 2:
- // 5 useful bits
- v = bytes[utf8Upto++] & 31;
- break;
- case 3:
- // 4 useful bits
- v = bytes[utf8Upto++] & 15;
- break;
- case 4:
- // 3 useful bits
- v = bytes[utf8Upto++] & 7;
- break;
- default:
- throw new IllegalArgumentException("invalid utf8");
- }
+ reuse = codePointAt(bytes, utf8Upto, reuse);
+ ints[utf32Count++] = reuse.codePoint;
+ utf8Upto += reuse.codePointBytes;
+ }
- // TODO: this may read past utf8's limit.
- final int limit = utf8Upto + numBytes - 1;
- while (utf8Upto < limit) {
- v = v << 6 | bytes[utf8Upto++] & 63;
+ return utf32Count;
+ }
+
+ /**
+ * Computes the codepoint and codepoint length (in bytes) of the specified
{@code offset} in the
+ * provided {@code utf8} byte array, assuming UTF8 encoding. As with other
related methods in this
+ * class, this assumes valid UTF8 input and <strong>does not
perform</strong> full UTF8
+ * validation.
+ *
+ * @throws IllegalArgumentException If invalid codepoint header byte occurs
or the content is
+ * prematurely truncated.
+ */
+ public static UTF8CodePoint codePointAt(byte[] utf8, int pos, UTF8CodePoint
reuse) {
+ if (reuse == null) {
+ reuse = new UTF8CodePoint();
+ }
+
+ int leadByte = utf8[pos] & 0xFF;
+ int numBytes = utf8CodeLength[leadByte];
+ reuse.codePointBytes = numBytes;
+ int v;
+ switch (numBytes) {
+ case 1 -> {
+ reuse.codePoint = leadByte;
+ return reuse;
}
- ints[utf32Count++] = v;
+ case 2 -> v = leadByte & 31; // 5 useful bits
+ case 3 -> v = leadByte & 15; // 4 useful bits
+ case 4 -> v = leadByte & 7; // 3 useful bits
+ default -> throw new IllegalArgumentException("invalid utf8");
}
- return utf32Count;
+ // TODO: this may read past utf8's limit.
+ final int limit = pos + numBytes;
+ pos++;
+ while (pos < limit) {
+ v = v << 6 | utf8[pos++] & 63;
+ }
+ reuse.codePoint = v;
+
+ return reuse;
+ }
+
+ /** Holds a codepoint along with the number of bytes required to represent
it in UTF8 */
+ public static final class UTF8CodePoint {
+ public int codePoint;
+ public int codePointBytes;
Review Comment:
Maybe rename to `numBytes`? The `codePoint` prefix seems redundant.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]