Hi,

Any opinions on the attached patch or someone tempted to sponsor it?

Regards,
Philipp
diff -r 84215fa115fc src/java.base/share/classes/java/util/regex/Grapheme.java
--- a/src/java.base/share/classes/java/util/regex/Grapheme.java	Fri Mar 20 17:37:52 2020 -0700
+++ b/src/java.base/share/classes/java/util/regex/Grapheme.java	Sat Mar 21 15:51:11 2020 +0100
@@ -30,21 +30,8 @@
 final class Grapheme {
 
     /**
-     * Determines if there is an extended  grapheme cluster boundary between two
-     * continuing characters {@code cp1} and {@code cp2}.
-     * <p>
-     * See Unicode Standard Annex #29 Unicode Text Segmentation for the specification
-     * for the extended grapheme cluster boundary rules
-     * <p>
-     * Note: this method does not take care of stateful breaking.
-     */
-    static boolean isBoundary(int cp1, int cp2) {
-        return rules[getType(cp1)][getType(cp2)];
-    }
-
-    /**
-     * Look for the next extended grapheme cluster boundary in a CharSequence. It assumes
-     * the start of the char sequence is a boundary.
+     * Look for the next extended grapheme cluster boundary in a CharSequence.
+     * It assumes the start of the char sequence at offset {@code off} is a boundary.
      * <p>
      * See Unicode Standard Annex #29 Unicode Text Segmentation for the specification
      * for the extended grapheme cluster boundary rules. The following implementation
@@ -54,20 +41,19 @@
      * @param src the {@code CharSequence} to be scanned
      * @param off offset to start looking for the next boundary in the src
      * @param limit limit offset in the src (exclusive)
-     * @return the next possible boundary
+     * @return the next grapheme boundary
      */
     static int nextBoundary(CharSequence src, int off, int limit) {
         Objects.checkFromToIndex(off, limit, src.length());
 
-        int ch0 = Character.codePointAt(src, 0);
-        int ret = Character.charCount(ch0);
-        int ch1;
+        int ch0 = Character.codePointAt(src, off);
+        int ret = off + Character.charCount(ch0);
         // indicates whether gb11 or gb12 is underway
         int t0 = getGraphemeType(ch0);
         int riCount = t0 == RI ? 1 : 0;
         boolean gb11 = t0 == EXTENDED_PICTOGRAPHIC;
         while (ret < limit) {
-            ch1 = Character.codePointAt(src, ret);
+            int ch1 = Character.codePointAt(src, ret);
             int t1 = getGraphemeType(ch1);
 
             if (gb11 && t0 == ZWJ && t1 == EXTENDED_PICTOGRAPHIC) {
@@ -177,6 +163,7 @@
                cp == 0xAA7B || cp == 0xAA7D;
     }
 
+    @SuppressWarnings("fallthrough")
     private static int getGraphemeType(int cp) {
         if (cp < 0x007F) { // ASCII
             if (cp < 32) { // Control characters
@@ -188,11 +175,7 @@
             }
             return OTHER;
         }
-        return getType(cp);
-    }
 
-    @SuppressWarnings("fallthrough")
-    private static int getType(int cp) {
         if (EmojiData.isExtendedPictographic(cp)) {
             return EXTENDED_PICTOGRAPHIC;
         }
diff -r 84215fa115fc src/java.base/share/classes/java/util/regex/Pattern.java
--- a/src/java.base/share/classes/java/util/regex/Pattern.java	Fri Mar 20 17:37:52 2020 -0700
+++ b/src/java.base/share/classes/java/util/regex/Pattern.java	Sat Mar 21 15:51:11 2020 +0100
@@ -4035,17 +4035,8 @@
             if (i < matcher.to) {
                 int ch0 = Character.codePointAt(seq, i);
                 int n = Character.charCount(ch0);
-                int j = i + n;
-                // Fast check if it's necessary to call Normalizer;
-                // testing Grapheme.isBoundary is enough for this case
-                while (j < matcher.to) {
-                    int ch1 = Character.codePointAt(seq, j);
-                    if (Grapheme.isBoundary(ch0, ch1))
-                        break;
-                    ch0 = ch1;
-                    j += Character.charCount(ch1);
-                }
-                if (i + n == j) {    // single, assume nfc cp
+                int j = Grapheme.nextBoundary(seq, i, matcher.to);
+                if (i + n == j) { // single cp grapheme, assume nfc
                     if (predicate.is(ch0))
                         return next.match(matcher, j, seq);
                 } else {
@@ -4109,13 +4100,12 @@
                 endIndex = matcher.getTextLength();
             }
             if (i == startIndex) {
-                return next.match(matcher, i, seq);
-            }
-            if (i < endIndex) {
-                if (Character.isSurrogatePair(seq.charAt(i-1), seq.charAt(i)) ||
-                    Grapheme.nextBoundary(seq,
-                        i - Character.charCount(Character.codePointBefore(seq, i)),
-                        i + Character.charCount(Character.codePointAt(seq, i))) > i) {
+                // continue with return below
+            } else if (i < endIndex) {
+                if (Character.isSurrogatePair(seq.charAt(i - 1), seq.charAt(i))) {
+                    return false;
+                }
+                if (Grapheme.nextBoundary(seq, matcher.last, endIndex) > i) {
                     return false;
                 }
             } else {
diff -r 84215fa115fc test/jdk/java/util/regex/GraphemeTimePerCharTest.java
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test/jdk/java/util/regex/GraphemeTimePerCharTest.java	Sat Mar 21 15:51:11 2020 +0100
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2020, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.testng.annotations.*;
+import static org.testng.Assert.*;
+
+/**
+ * @test
+ * @bug 8241055
+ * @run testng GraphemeTimePerCharTest
+ * @summary Measures {@link java.util.regex.Grapheme#nextBoundary} performance
+ * depending on input character sequence size.
+ */
+public class GraphemeTimePerCharTest {
+
+    static final Pattern GRAPHEME_REGEX = Pattern.compile("\\X");
+
+    void matchPerformance(String value) {
+        Matcher charMatcher = GRAPHEME_REGEX.matcher(value);
+        while (charMatcher.find()) charMatcher.group();
+    }
+
+    @Test
+    public void testDependsLinearlyOnInputSize() throws Exception {
+        matchPerformance("warmup");
+
+        final int STEPS = 50; // number of measurements
+        final int LIMIT = 5000000; // maximum string size tested
+
+        double[][] strTimesByNChars = new double[STEPS][];
+        double[][] chrTimesByNChars = new double[STEPS][];
+        System.out.println(
+                "number of chars, time string [ms], time per char [ns]");
+        for (int i = 0; i < STEPS; i++) {
+            int chunkSize = (i + 1) * LIMIT / STEPS;
+            String value = "x".repeat(chunkSize);
+            long start = System.nanoTime();
+            matchPerformance(value);
+            long end = System.nanoTime();
+            long t = end - start;
+            strTimesByNChars[i] = new double[] { chunkSize, t };
+            double timePerChar = (double) t / (double) chunkSize;
+            chrTimesByNChars[i] = new double[] { chunkSize, timePerChar };
+            System.out.printf("%7d %6d %10.3f%n",
+                    chunkSize, (t / 1000000), timePerChar);
+        }
+        double cStr = correlate(strTimesByNChars);
+        double cChr = correlate(chrTimesByNChars);
+        System.out.println("correlation factor between string size and "
+                + "total time (expected 1): " + cStr);
+        System.out.println("correlation factor between string size and "
+                + "time per character (expected 0 or less): " + cChr);
+        assertTrue(cChr < .5 && .5 < cStr);
+    }
+
+    double correlate(double[][] data) {
+        double sumx = 0, sumy = 0;
+        for (int i = 0; i < data.length; i++) {
+            sumx += data[i][0];
+            sumy += data[i][1];
+        }
+        double avgx = sumx / data.length;
+        double avgy = sumy / data.length;
+        double sxy = 0, sxx = 0, syy = 0;
+        for (int i = 0; i < data.length; i++) {
+            sxy += (data[i][0] - avgx) * (data[i][1] - avgy);
+            sxx += (data[i][0] - avgx) * (data[i][0] - avgx);
+            syy += (data[i][1] - avgy) * (data[i][1] - avgy);
+        }
+        return sxy / Math.sqrt(sxx * syy);
+    }
+
+}
diff -r 84215fa115fc test/jdk/java/util/regex/RegExTest.java
--- a/test/jdk/java/util/regex/RegExTest.java	Fri Mar 20 17:37:52 2020 -0700
+++ b/test/jdk/java/util/regex/RegExTest.java	Sat Mar 21 15:51:11 2020 +0100
@@ -4796,48 +4796,97 @@
     }
 
     private static void grapheme() throws Exception {
+        final int[] lineNumber = new int[1];
         Stream.concat(Files.lines(UCDFiles.GRAPHEME_BREAK_TEST),
                 Files.lines(Paths.get(System.getProperty("test.src", "."), "GraphemeTestCases.txt")))
-            .filter( ln -> ln.length() != 0 && !ln.startsWith("#") )
             .forEach( ln -> {
-                ln = ln.replaceAll("\\s+|\\([a-zA-Z]+\\)|\\[[a-zA-Z]]+\\]|#.*", "");
-                // System.out.println(str);
-                String[] strs = ln.split("\u00f7|\u00d7");
-                StringBuilder src = new StringBuilder();
-                ArrayList<String> graphemes = new ArrayList<>();
-                StringBuilder buf = new StringBuilder();
-                int offBk = 0;
-                for (String str : strs) {
-                    if (str.length() == 0)  // first empty str
-                        continue;
-                    int cp = Integer.parseInt(str, 16);
-                    src.appendCodePoint(cp);
-                    buf.appendCodePoint(cp);
-                    offBk += (str.length() + 1);
-                    if (ln.charAt(offBk) == '\u00f7') {    // DIV
-                        graphemes.add(buf.toString());
-                        buf = new StringBuilder();
+                    lineNumber[0]++;
+                    if (ln.length() == 0 || ln.startsWith("#")) return;
+                    ln = ln.replaceAll("\\s+|\\([a-zA-Z]+\\)|\\[[a-zA-Z]]+\\]|#.*", "");
+                    // System.out.println(str);
+                    String[] strs = ln.split("\u00f7|\u00d7");
+                    StringBuilder src = new StringBuilder();
+                    ArrayList<String> graphemes = new ArrayList<>();
+                    StringBuilder buf = new StringBuilder();
+                    int offBk = 0;
+                    for (String str : strs) {
+                        if (str.length() == 0)  // first empty str
+                            continue;
+                        int cp = Integer.parseInt(str, 16);
+                        src.appendCodePoint(cp);
+                        buf.appendCodePoint(cp);
+                        offBk += (str.length() + 1);
+                        if (ln.charAt(offBk) == '\u00f7') {    // DIV
+                            graphemes.add(buf.toString());
+                            buf = new StringBuilder();
+                        }
                     }
-                }
-                Pattern p = Pattern.compile("\\X");
-                Matcher m = p.matcher(src.toString());
-                Scanner s = new Scanner(src.toString()).useDelimiter("\\b{g}");
-                for (String g : graphemes) {
-                    // System.out.printf("     grapheme:=[%s]%n", g);
-                    // (1) test \\X directly
-                    if (!m.find() || !m.group().equals(g)) {
-                        System.out.println("Failed \\X [" + ln + "] : " + g);
+                    Pattern p = Pattern.compile("\\X");
+                    // (1) test \X directly
+                    Matcher m = p.matcher(src.toString());
+                    for (String g : graphemes) {
+                        // System.out.printf("     grapheme:=[%s]%n", g);
+                        String group = null;
+                        if (!m.find() || !(group = m.group()).equals(g)) {
+                            System.out.println("Failed pattern \\X [" + ln + "] : "
+                                    + "expected: " + g + " - actual: " + group
+                                    + "(line " + lineNumber[0] + ")");
+                            failCount++;
+                        }
+                    }
+                    if (m.find()) {
                         failCount++;
                     }
-                    // (2) test \\b{g} + \\X  via Scanner
-                    boolean hasNext = s.hasNext(p);
-                    // if (!s.hasNext() || !s.next().equals(next)) {
-                    if (!s.hasNext(p) || !s.next(p).equals(g)) {
-                        System.out.println("Failed b{g} [" + ln + "] : " + g);
+                    // test \b{g} without \X via Pattern
+                    Pattern pbg = Pattern.compile("\\b{g}");
+                    m = pbg.matcher(src.toString());
+                    m.find();
+                    int prev = m.end();
+                    for (String g : graphemes) {
+                        String group = null;
+                        if (!m.find() || !(group = src.substring(prev, m.end())).equals(g)) {
+                            System.out.println("Failed pattern \\b{g} [" + ln + "] : "
+                                    + "expected: " + g + " - actual: " + group
+                                    + "(line " + lineNumber[0] + ")");
+                            failCount++;
+                        }
+                        if (!"".equals(m.group())) {
+                            failCount++;
+                        }
+                        prev = m.end();
+                    }
+                    if (m.find()) {
                         failCount++;
                     }
-                }
-            });
+                    // (2) test \b{g} + \X  via Scanner
+                    Scanner s = new Scanner(src.toString()).useDelimiter("\\b{g}");
+                    for (String g : graphemes) {
+                        String next = null;
+                        if (!s.hasNext(p) || !(next = s.next(p)).equals(g)) {
+                            System.out.println("Failed \\b{g} [" + ln + "] : "
+                                    + "expected: " + g + " - actual: " + next
+                                    + " (line " + lineNumber[0] + ")");
+                            failCount++;
+                        }
+                    }
+                    if (s.hasNext(p)) {
+                        failCount++;
+                    }
+                    // test \b{g} without \X via Scanner
+                    s = new Scanner(src.toString()).useDelimiter("\\b{g}");
+                    for (String g : graphemes) {
+                        String next = null;
+                        if (!s.hasNext() || !(next = s.next()).equals(g)) {
+                            System.out.println("Failed \\b{g} [" + ln + "] : "
+                                    + "expected: " + g + " - actual: " + next
+                                    + " (line " + lineNumber[0] + ")");
+                            failCount++;
+                        }
+                    }
+                    if (s.hasNext()) {
+                        failCount++;
+                    }
+                });
         // some sanity checks
         if (!Pattern.compile("\\X{10}").matcher("abcdefghij").matches() ||
             !Pattern.compile("\\b{g}(?:\\X\\b{g}){5}\\b{g}").matcher("abcde").matches() ||

Reply via email to