Hi,
Any opinions on the attached patch or someone tempted to sponsor it?
Regards,
Philipp
diff -r 84215fa115fc src/java.base/share/classes/java/util/regex/Grapheme.java
--- a/src/java.base/share/classes/java/util/regex/Grapheme.java Fri Mar 20 17:37:52 2020 -0700
+++ b/src/java.base/share/classes/java/util/regex/Grapheme.java Sat Mar 21 15:51:11 2020 +0100
@@ -30,21 +30,8 @@
final class Grapheme {
/**
- * Determines if there is an extended grapheme cluster boundary between two
- * continuing characters {@code cp1} and {@code cp2}.
- * <p>
- * See Unicode Standard Annex #29 Unicode Text Segmentation for the specification
- * for the extended grapheme cluster boundary rules
- * <p>
- * Note: this method does not take care of stateful breaking.
- */
- static boolean isBoundary(int cp1, int cp2) {
- return rules[getType(cp1)][getType(cp2)];
- }
-
- /**
- * Look for the next extended grapheme cluster boundary in a CharSequence. It assumes
- * the start of the char sequence is a boundary.
+ * Look for the next extended grapheme cluster boundary in a CharSequence.
+ * It assumes the start of the char sequence at offset {@code off} is a boundary.
* <p>
* See Unicode Standard Annex #29 Unicode Text Segmentation for the specification
* for the extended grapheme cluster boundary rules. The following implementation
@@ -54,20 +41,19 @@
* @param src the {@code CharSequence} to be scanned
* @param off offset to start looking for the next boundary in the src
* @param limit limit offset in the src (exclusive)
- * @return the next possible boundary
+ * @return the next grapheme boundary
*/
static int nextBoundary(CharSequence src, int off, int limit) {
Objects.checkFromToIndex(off, limit, src.length());
- int ch0 = Character.codePointAt(src, 0);
- int ret = Character.charCount(ch0);
- int ch1;
+ int ch0 = Character.codePointAt(src, off);
+ int ret = off + Character.charCount(ch0);
// indicates whether gb11 or gb12 is underway
int t0 = getGraphemeType(ch0);
int riCount = t0 == RI ? 1 : 0;
boolean gb11 = t0 == EXTENDED_PICTOGRAPHIC;
while (ret < limit) {
- ch1 = Character.codePointAt(src, ret);
+ int ch1 = Character.codePointAt(src, ret);
int t1 = getGraphemeType(ch1);
if (gb11 && t0 == ZWJ && t1 == EXTENDED_PICTOGRAPHIC) {
@@ -177,6 +163,7 @@
cp == 0xAA7B || cp == 0xAA7D;
}
+ @SuppressWarnings("fallthrough")
private static int getGraphemeType(int cp) {
if (cp < 0x007F) { // ASCII
if (cp < 32) { // Control characters
@@ -188,11 +175,7 @@
}
return OTHER;
}
- return getType(cp);
- }
- @SuppressWarnings("fallthrough")
- private static int getType(int cp) {
if (EmojiData.isExtendedPictographic(cp)) {
return EXTENDED_PICTOGRAPHIC;
}
diff -r 84215fa115fc src/java.base/share/classes/java/util/regex/Pattern.java
--- a/src/java.base/share/classes/java/util/regex/Pattern.java Fri Mar 20 17:37:52 2020 -0700
+++ b/src/java.base/share/classes/java/util/regex/Pattern.java Sat Mar 21 15:51:11 2020 +0100
@@ -4035,17 +4035,8 @@
if (i < matcher.to) {
int ch0 = Character.codePointAt(seq, i);
int n = Character.charCount(ch0);
- int j = i + n;
- // Fast check if it's necessary to call Normalizer;
- // testing Grapheme.isBoundary is enough for this case
- while (j < matcher.to) {
- int ch1 = Character.codePointAt(seq, j);
- if (Grapheme.isBoundary(ch0, ch1))
- break;
- ch0 = ch1;
- j += Character.charCount(ch1);
- }
- if (i + n == j) { // single, assume nfc cp
+ int j = Grapheme.nextBoundary(seq, i, matcher.to);
+ if (i + n == j) { // single cp grapheme, assume nfc
if (predicate.is(ch0))
return next.match(matcher, j, seq);
} else {
@@ -4109,13 +4100,12 @@
endIndex = matcher.getTextLength();
}
if (i == startIndex) {
- return next.match(matcher, i, seq);
- }
- if (i < endIndex) {
- if (Character.isSurrogatePair(seq.charAt(i-1), seq.charAt(i)) ||
- Grapheme.nextBoundary(seq,
- i - Character.charCount(Character.codePointBefore(seq, i)),
- i + Character.charCount(Character.codePointAt(seq, i))) > i) {
+ // continue with return below
+ } else if (i < endIndex) {
+ if (Character.isSurrogatePair(seq.charAt(i - 1), seq.charAt(i))) {
+ return false;
+ }
+ if (Grapheme.nextBoundary(seq, matcher.last, endIndex) > i) {
return false;
}
} else {
diff -r 84215fa115fc test/jdk/java/util/regex/GraphemeTimePerCharTest.java
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test/jdk/java/util/regex/GraphemeTimePerCharTest.java Sat Mar 21 15:51:11 2020 +0100
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2020, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.testng.annotations.*;
+import static org.testng.Assert.*;
+
+/**
+ * @test
+ * @bug 8241055
+ * @run testng GraphemeTimePerCharTest
+ * @summary Measures {@link java.util.regex.Grapheme#nextBoundary} performance
+ * depending on input character sequence size.
+ */
+public class GraphemeTimePerCharTest {
+
+ static final Pattern GRAPHEME_REGEX = Pattern.compile("\\X");
+
+ void matchPerformance(String value) {
+ Matcher charMatcher = GRAPHEME_REGEX.matcher(value);
+ while (charMatcher.find()) charMatcher.group();
+ }
+
+ @Test
+ public void testDependsLinearlyOnInputSize() throws Exception {
+ matchPerformance("warmup");
+
+ final int STEPS = 50; // number of measurements
+ final int LIMIT = 5000000; // maximum string size tested
+
+ double[][] strTimesByNChars = new double[STEPS][];
+ double[][] chrTimesByNChars = new double[STEPS][];
+ System.out.println(
+ "number of chars, time string [ms], time per char [ns]");
+ for (int i = 0; i < STEPS; i++) {
+ int chunkSize = (i + 1) * LIMIT / STEPS;
+ String value = "x".repeat(chunkSize);
+ long start = System.nanoTime();
+ matchPerformance(value);
+ long end = System.nanoTime();
+ long t = end - start;
+ strTimesByNChars[i] = new double[] { chunkSize, t };
+ double timePerChar = (double) t / (double) chunkSize;
+ chrTimesByNChars[i] = new double[] { chunkSize, timePerChar };
+ System.out.printf("%7d %6d %10.3f%n",
+ chunkSize, (t / 1000000), timePerChar);
+ }
+ double cStr = correlate(strTimesByNChars);
+ double cChr = correlate(chrTimesByNChars);
+ System.out.println("correlation factor between string size and "
+ + "total time (expected 1): " + cStr);
+ System.out.println("correlation factor between string size and "
+ + "time per character (expected 0 or less): " + cChr);
+ assertTrue(cChr < .5 && .5 < cStr);
+ }
+
+ double correlate(double[][] data) {
+ double sumx = 0, sumy = 0;
+ for (int i = 0; i < data.length; i++) {
+ sumx += data[i][0];
+ sumy += data[i][1];
+ }
+ double avgx = sumx / data.length;
+ double avgy = sumy / data.length;
+ double sxy = 0, sxx = 0, syy = 0;
+ for (int i = 0; i < data.length; i++) {
+ sxy += (data[i][0] - avgx) * (data[i][1] - avgy);
+ sxx += (data[i][0] - avgx) * (data[i][0] - avgx);
+ syy += (data[i][1] - avgy) * (data[i][1] - avgy);
+ }
+ return sxy / Math.sqrt(sxx * syy);
+ }
+
+}
diff -r 84215fa115fc test/jdk/java/util/regex/RegExTest.java
--- a/test/jdk/java/util/regex/RegExTest.java Fri Mar 20 17:37:52 2020 -0700
+++ b/test/jdk/java/util/regex/RegExTest.java Sat Mar 21 15:51:11 2020 +0100
@@ -4796,48 +4796,97 @@
}
private static void grapheme() throws Exception {
+ final int[] lineNumber = new int[1];
Stream.concat(Files.lines(UCDFiles.GRAPHEME_BREAK_TEST),
Files.lines(Paths.get(System.getProperty("test.src", "."), "GraphemeTestCases.txt")))
- .filter( ln -> ln.length() != 0 && !ln.startsWith("#") )
.forEach( ln -> {
- ln = ln.replaceAll("\\s+|\\([a-zA-Z]+\\)|\\[[a-zA-Z]]+\\]|#.*", "");
- // System.out.println(str);
- String[] strs = ln.split("\u00f7|\u00d7");
- StringBuilder src = new StringBuilder();
- ArrayList<String> graphemes = new ArrayList<>();
- StringBuilder buf = new StringBuilder();
- int offBk = 0;
- for (String str : strs) {
- if (str.length() == 0) // first empty str
- continue;
- int cp = Integer.parseInt(str, 16);
- src.appendCodePoint(cp);
- buf.appendCodePoint(cp);
- offBk += (str.length() + 1);
- if (ln.charAt(offBk) == '\u00f7') { // DIV
- graphemes.add(buf.toString());
- buf = new StringBuilder();
+ lineNumber[0]++;
+ if (ln.length() == 0 || ln.startsWith("#")) return;
+ ln = ln.replaceAll("\\s+|\\([a-zA-Z]+\\)|\\[[a-zA-Z]]+\\]|#.*", "");
+ // System.out.println(str);
+ String[] strs = ln.split("\u00f7|\u00d7");
+ StringBuilder src = new StringBuilder();
+ ArrayList<String> graphemes = new ArrayList<>();
+ StringBuilder buf = new StringBuilder();
+ int offBk = 0;
+ for (String str : strs) {
+ if (str.length() == 0) // first empty str
+ continue;
+ int cp = Integer.parseInt(str, 16);
+ src.appendCodePoint(cp);
+ buf.appendCodePoint(cp);
+ offBk += (str.length() + 1);
+ if (ln.charAt(offBk) == '\u00f7') { // DIV
+ graphemes.add(buf.toString());
+ buf = new StringBuilder();
+ }
}
- }
- Pattern p = Pattern.compile("\\X");
- Matcher m = p.matcher(src.toString());
- Scanner s = new Scanner(src.toString()).useDelimiter("\\b{g}");
- for (String g : graphemes) {
- // System.out.printf(" grapheme:=[%s]%n", g);
- // (1) test \\X directly
- if (!m.find() || !m.group().equals(g)) {
- System.out.println("Failed \\X [" + ln + "] : " + g);
+ Pattern p = Pattern.compile("\\X");
+ // (1) test \X directly
+ Matcher m = p.matcher(src.toString());
+ for (String g : graphemes) {
+ // System.out.printf(" grapheme:=[%s]%n", g);
+ String group = null;
+ if (!m.find() || !(group = m.group()).equals(g)) {
+ System.out.println("Failed pattern \\X [" + ln + "] : "
+ + "expected: " + g + " - actual: " + group
+ + "(line " + lineNumber[0] + ")");
+ failCount++;
+ }
+ }
+ if (m.find()) {
failCount++;
}
- // (2) test \\b{g} + \\X via Scanner
- boolean hasNext = s.hasNext(p);
- // if (!s.hasNext() || !s.next().equals(next)) {
- if (!s.hasNext(p) || !s.next(p).equals(g)) {
- System.out.println("Failed b{g} [" + ln + "] : " + g);
+ // test \b{g} without \X via Pattern
+ Pattern pbg = Pattern.compile("\\b{g}");
+ m = pbg.matcher(src.toString());
+ m.find();
+ int prev = m.end();
+ for (String g : graphemes) {
+ String group = null;
+ if (!m.find() || !(group = src.substring(prev, m.end())).equals(g)) {
+ System.out.println("Failed pattern \\b{g} [" + ln + "] : "
+ + "expected: " + g + " - actual: " + group
+ + "(line " + lineNumber[0] + ")");
+ failCount++;
+ }
+ if (!"".equals(m.group())) {
+ failCount++;
+ }
+ prev = m.end();
+ }
+ if (m.find()) {
failCount++;
}
- }
- });
+ // (2) test \b{g} + \X via Scanner
+ Scanner s = new Scanner(src.toString()).useDelimiter("\\b{g}");
+ for (String g : graphemes) {
+ String next = null;
+ if (!s.hasNext(p) || !(next = s.next(p)).equals(g)) {
+ System.out.println("Failed \\b{g} [" + ln + "] : "
+ + "expected: " + g + " - actual: " + next
+ + " (line " + lineNumber[0] + ")");
+ failCount++;
+ }
+ }
+ if (s.hasNext(p)) {
+ failCount++;
+ }
+ // test \b{g} without \X via Scanner
+ s = new Scanner(src.toString()).useDelimiter("\\b{g}");
+ for (String g : graphemes) {
+ String next = null;
+ if (!s.hasNext() || !(next = s.next()).equals(g)) {
+ System.out.println("Failed \\b{g} [" + ln + "] : "
+ + "expected: " + g + " - actual: " + next
+ + " (line " + lineNumber[0] + ")");
+ failCount++;
+ }
+ }
+ if (s.hasNext()) {
+ failCount++;
+ }
+ });
// some sanity checks
if (!Pattern.compile("\\X{10}").matcher("abcdefghij").matches() ||
!Pattern.compile("\\b{g}(?:\\X\\b{g}){5}\\b{g}").matcher("abcde").matches() ||