This is an automated email from the ASF dual-hosted git repository.
garydgregory pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/commons-lang.git
The following commit(s) were added to refs/heads/master by this push:
new 6334440d5 Fold supplementary code points in
CharSequenceUtils.regionMatches (#1725)
6334440d5 is described below
commit 6334440d5a61fdee73c8c7dd947f7adb6fa31331
Author: alhuda <[email protected]>
AuthorDate: Wed Jun 24 17:22:42 2026 +0530
Fold supplementary code points in CharSequenceUtils.regionMatches (#1725)
* fold supplementary code points in CharSequenceUtils.regionMatches
* move supplementary case-fold cases to a non-String regionMatches test
String.regionMatches folds a supplementary code point at the code point
level only from Java 9 onward, so the shared String-parity rows in
TEST_DATA returned false on Java 8 and broke the build there. The green
non-String path that this change fixes is JDK independent, so move those
cases into a dedicated testRegionMatchesSupplementaryCaseFold and keep
TEST_DATA on the BMP inputs that match String on every supported JDK.
* Match running JDK String for surrogate-pair case folding in regionMatches
Gate the supplementary code point fold on SystemUtils.IS_JAVA_1_8 so the
non-String path reproduces String#regionMatches on the running JDK: Java 9+
folds the whole code point, Java 8 stays char-by-char. The test now checks
String, StringBuilder, StringBuffer and CharBuffer all agree with String
rather than asserting a fixed result that only holds on Java 9+.
* Probe String behaviour instead of JDK version for surrogate case folding
in regionMatches
---
.../apache/commons/lang3/CharSequenceUtils.java | 75 ++++++++++++++++++----
.../commons/lang3/CharSequenceUtilsTest.java | 28 ++++++++
2 files changed, 92 insertions(+), 11 deletions(-)
diff --git a/src/main/java/org/apache/commons/lang3/CharSequenceUtils.java
b/src/main/java/org/apache/commons/lang3/CharSequenceUtils.java
index 4fd9445b1..6009ba1d3 100644
--- a/src/main/java/org/apache/commons/lang3/CharSequenceUtils.java
+++ b/src/main/java/org/apache/commons/lang3/CharSequenceUtils.java
@@ -27,6 +27,17 @@ public class CharSequenceUtils {
private static final int NOT_FOUND = -1;
+ /**
+ * Whether the running JDK folds a supplementary code point split across a
surrogate pair when comparing case
+ * insensitively in {@link String#regionMatches(boolean, int, String, int,
int)}. JDKs up to and including Java 11
+ * compare surrogate by surrogate and never match such a pair; later JDKs
fold the whole code point. Probing what
+ * {@link String} actually does (rather than gating on a version constant)
keeps every {@link CharSequence} type in
+ * step with {@link String} on whatever JDK is running. DESERET CAPITAL
LETTER LONG I (U+10400) folds to its small
+ * form (U+10428).
+ */
+ private static final boolean STRING_FOLDS_SUPPLEMENTARY_CASE =
+ new String(Character.toChars(0x10400)).regionMatches(true, 0, new
String(Character.toChars(0x10428)), 0, 2);
+
static final int TO_STRING_LIMIT = 16;
private static boolean checkLaterThan1(final CharSequence cs, final
CharSequence searchChar, final int len2, final int start1) {
@@ -293,9 +304,6 @@ static boolean regionMatches(final CharSequence cs, final
boolean ignoreCase, fi
if (cs instanceof String && substring instanceof String) {
return ((String) cs).regionMatches(ignoreCase, thisStart, (String)
substring, start, length);
}
- int index1 = thisStart;
- int index2 = start;
- int tmpLen = length;
// Extract these first so we detect NPEs the same as the
java.lang.String version
final int srcLen = cs.length() - thisStart;
final int otherLen = substring.length() - start;
@@ -307,25 +315,70 @@ static boolean regionMatches(final CharSequence cs, final
boolean ignoreCase, fi
if (srcLen < length || otherLen < length) {
return false;
}
- while (tmpLen-- > 0) {
- final char c1 = cs.charAt(index1++);
- final char c2 = substring.charAt(index2++);
+ final int end1 = thisStart + length;
+ final int end2 = start + length;
+ int index1 = thisStart;
+ int index2 = start;
+ while (index1 < end1 && index2 < end2) {
+ final char c1 = cs.charAt(index1);
+ final char c2 = substring.charAt(index2);
if (c1 == c2) {
+ index1++;
+ index2++;
continue;
}
if (!ignoreCase) {
return false;
}
- // The real same check as in String#regionMatches(boolean, int,
String, int, int):
- final char u1 = Character.toUpperCase(c1);
- final char u2 = Character.toUpperCase(c2);
- if (u1 != u2 && Character.toLowerCase(u1) !=
Character.toLowerCase(u2)) {
- return false;
+ // The same case-insensitive check as
String#regionMatches(boolean, int, String, int, int).
+ if (!equalsIgnoreCase(c1, c2)) {
+ // Only fold a supplementary code point split across a
surrogate pair where String itself does, so
+ // every CharSequence type gives the same result that String
does on the running JDK (see STRING_FOLDS_SUPPLEMENTARY_CASE).
+ if (!STRING_FOLDS_SUPPLEMENTARY_CASE) {
+ return false;
+ }
+ int cp1 = c1;
+ if (Character.isHighSurrogate(c1)) {
+ if (index1 + 1 < end1 &&
Character.isLowSurrogate(cs.charAt(index1 + 1))) {
+ cp1 = Character.toCodePoint(c1, cs.charAt(index1 + 1));
+ index1++;
+ }
+ } else if (Character.isLowSurrogate(c1) && index1 > thisStart
&& Character.isHighSurrogate(cs.charAt(index1 - 1))) {
+ cp1 = Character.toCodePoint(cs.charAt(index1 - 1), c1);
+ }
+ int cp2 = c2;
+ if (Character.isHighSurrogate(c2)) {
+ if (index2 + 1 < end2 &&
Character.isLowSurrogate(substring.charAt(index2 + 1))) {
+ cp2 = Character.toCodePoint(c2,
substring.charAt(index2 + 1));
+ index2++;
+ }
+ } else if (Character.isLowSurrogate(c2) && index2 > start &&
Character.isHighSurrogate(substring.charAt(index2 - 1))) {
+ cp2 = Character.toCodePoint(substring.charAt(index2 - 1),
c2);
+ }
+ if (!equalsIgnoreCase(cp1, cp2)) {
+ return false;
+ }
}
+ index1++;
+ index2++;
}
return true;
}
+ /**
+ * Tests whether two code points are equal ignoring case, matching the
folding used by
+ * {@link String#regionMatches(boolean, int, String, int, int)}.
+ *
+ * @param cp1 the first code point.
+ * @param cp2 the second code point.
+ * @return whether the code points are equal ignoring case.
+ */
+ private static boolean equalsIgnoreCase(final int cp1, final int cp2) {
+ final int u1 = Character.toUpperCase(cp1);
+ final int u2 = Character.toUpperCase(cp2);
+ return u1 == u2 || Character.toLowerCase(u1) ==
Character.toLowerCase(u2);
+ }
+
/**
* Returns a new {@link CharSequence} that is a subsequence of this
* sequence starting with the {@code char} value at the specified index.
diff --git a/src/test/java/org/apache/commons/lang3/CharSequenceUtilsTest.java
b/src/test/java/org/apache/commons/lang3/CharSequenceUtilsTest.java
index 1ea19165b..48bbf6b60 100644
--- a/src/test/java/org/apache/commons/lang3/CharSequenceUtilsTest.java
+++ b/src/test/java/org/apache/commons/lang3/CharSequenceUtilsTest.java
@@ -28,6 +28,7 @@
import java.lang.reflect.Constructor;
import java.lang.reflect.Modifier;
+import java.nio.CharBuffer;
import java.util.Random;
import java.util.stream.IntStream;
import java.util.stream.Stream;
@@ -276,6 +277,33 @@ boolean invoke() {
}
}
+ private static void assertRegionMatchesParity(final String source, final
boolean ignoreCase, final int toffset, final String other,
+ final int ooffset, final int len) {
+ // String is the reference: whatever the running JDK does for String,
every CharSequence type must match.
+ final boolean expected = source.regionMatches(ignoreCase, toffset,
other, ooffset, len);
+ final CharSequence[] sources = {source, new StringBuilder(source), new
StringBuffer(source), CharBuffer.wrap(source)};
+ for (final CharSequence cs : sources) {
+ assertEquals(expected, CharSequenceUtils.regionMatches(cs,
ignoreCase, toffset, other, ooffset, len),
+ cs.getClass().getSimpleName() + " differs from String for
" + source + " vs " + other);
+ }
+ }
+
+ /**
+ * A supplementary code point split across a surrogate pair must fold the
same way for every {@link CharSequence}
+ * type that it does for {@link String} on the running JDK. {@link
String#regionMatches(boolean, int, String, int, int)}
+ * only folds such a code point on newer JDKs, so these rows are checked
against {@link String} itself rather than a
+ * fixed result: {@link String}, {@link StringBuilder}, {@link
StringBuffer} and {@link CharBuffer} all have to agree.
+ * Deseret CAPITAL LONG I (U+10400) folds to SMALL LONG I (U+10428).
+ */
+ @Test
+ void testRegionMatchesSupplementaryCaseFold() {
+ assertRegionMatchesParity("\uD801\uDC00", true, 0, "\uD801\uDC28", 0,
2);
+ assertRegionMatchesParity("\uD801\uDC00", false, 0, "\uD801\uDC28", 0,
2);
+ assertRegionMatchesParity("\uD801\uDC28", true, 0, "\uD801\uDC00", 0,
2);
+ assertRegionMatchesParity("x\uD801\uDC00", true, 1, "\uD801\uDC28", 0,
2);
+ assertRegionMatchesParity("\uD801\uDC00", true, 0, "\uD801\uDC29", 0,
2);
+ }
+
@Test
void testSubSequence() {
//