Hi Naoto,

I agree, see attached patch.

Regards,
Philipp


On Thu, 2020-03-26 at 14:14 -0700, naoto.s...@oracle.com wrote:
> Hi Philipp,
> 
> I looked at the patch, and it looks good to me. As to the test case, 
> since it is measuring the performance, I would make it in a JMH test. 
> Maybe you would want to put it similar to 
> open/test/micro/org/openjdk/bench/java/util/regex/PatternBench.java
> 
> Minor suggestion: I would rather rename "getGraphemeType()" to just 
> "getType()", as it is simply retrieving the character type.
> 
> Naoto
> 
> On 3/21/20 7:58 AM, Philipp Kunz wrote:
> > Hi,
> > 
> > Any opinions on the attached patch or someone tempted to sponsor it?
> > 
> > Regards,
> > Philipp
diff -r 9a81c0a34bd0 src/java.base/share/classes/java/util/regex/Grapheme.java
--- a/src/java.base/share/classes/java/util/regex/Grapheme.java	Tue Apr 14 10:04:05 2020 +0800
+++ b/src/java.base/share/classes/java/util/regex/Grapheme.java	Tue Apr 14 18:25:57 2020 +0200
@@ -30,21 +30,8 @@
 final class Grapheme {
 
     /**
-     * Determines if there is an extended  grapheme cluster boundary between two
-     * continuing characters {@code cp1} and {@code cp2}.
-     * <p>
-     * See Unicode Standard Annex #29 Unicode Text Segmentation for the specification
-     * for the extended grapheme cluster boundary rules
-     * <p>
-     * Note: this method does not take care of stateful breaking.
-     */
-    static boolean isBoundary(int cp1, int cp2) {
-        return rules[getType(cp1)][getType(cp2)];
-    }
-
-    /**
-     * Look for the next extended grapheme cluster boundary in a CharSequence. It assumes
-     * the start of the char sequence is a boundary.
+     * Look for the next extended grapheme cluster boundary in a CharSequence.
+     * It assumes the start of the char sequence at offset {@code off} is a boundary.
      * <p>
      * See Unicode Standard Annex #29 Unicode Text Segmentation for the specification
      * for the extended grapheme cluster boundary rules. The following implementation
@@ -54,21 +41,20 @@
      * @param src the {@code CharSequence} to be scanned
      * @param off offset to start looking for the next boundary in the src
      * @param limit limit offset in the src (exclusive)
-     * @return the next possible boundary
+     * @return the next grapheme boundary
      */
     static int nextBoundary(CharSequence src, int off, int limit) {
         Objects.checkFromToIndex(off, limit, src.length());
 
-        int ch0 = Character.codePointAt(src, 0);
-        int ret = Character.charCount(ch0);
-        int ch1;
+        int ch0 = Character.codePointAt(src, off);
+        int ret = off + Character.charCount(ch0);
         // indicates whether gb11 or gb12 is underway
-        int t0 = getGraphemeType(ch0);
+        int t0 = getType(ch0);
         int riCount = t0 == RI ? 1 : 0;
         boolean gb11 = t0 == EXTENDED_PICTOGRAPHIC;
         while (ret < limit) {
-            ch1 = Character.codePointAt(src, ret);
-            int t1 = getGraphemeType(ch1);
+            int ch1 = Character.codePointAt(src, ret);
+            int t1 = getType(ch1);
 
             if (gb11 && t0 == ZWJ && t1 == EXTENDED_PICTOGRAPHIC) {
                 // continue for gb11
@@ -177,7 +163,8 @@
                cp == 0xAA7B || cp == 0xAA7D;
     }
 
-    private static int getGraphemeType(int cp) {
+    @SuppressWarnings("fallthrough")
+    private static int getType(int cp) {
         if (cp < 0x007F) { // ASCII
             if (cp < 32) { // Control characters
                 if (cp == 0x000D)
@@ -188,11 +175,7 @@
             }
             return OTHER;
         }
-        return getType(cp);
-    }
 
-    @SuppressWarnings("fallthrough")
-    private static int getType(int cp) {
         if (EmojiData.isExtendedPictographic(cp)) {
             return EXTENDED_PICTOGRAPHIC;
         }
diff -r 9a81c0a34bd0 src/java.base/share/classes/java/util/regex/Pattern.java
--- a/src/java.base/share/classes/java/util/regex/Pattern.java	Tue Apr 14 10:04:05 2020 +0800
+++ b/src/java.base/share/classes/java/util/regex/Pattern.java	Tue Apr 14 18:25:57 2020 +0200
@@ -4035,17 +4035,8 @@
             if (i < matcher.to) {
                 int ch0 = Character.codePointAt(seq, i);
                 int n = Character.charCount(ch0);
-                int j = i + n;
-                // Fast check if it's necessary to call Normalizer;
-                // testing Grapheme.isBoundary is enough for this case
-                while (j < matcher.to) {
-                    int ch1 = Character.codePointAt(seq, j);
-                    if (Grapheme.isBoundary(ch0, ch1))
-                        break;
-                    ch0 = ch1;
-                    j += Character.charCount(ch1);
-                }
-                if (i + n == j) {    // single, assume nfc cp
+                int j = Grapheme.nextBoundary(seq, i, matcher.to);
+                if (i + n == j) { // single cp grapheme, assume nfc
                     if (predicate.is(ch0))
                         return next.match(matcher, j, seq);
                 } else {
@@ -4109,13 +4100,12 @@
                 endIndex = matcher.getTextLength();
             }
             if (i == startIndex) {
-                return next.match(matcher, i, seq);
-            }
-            if (i < endIndex) {
-                if (Character.isSurrogatePair(seq.charAt(i-1), seq.charAt(i)) ||
-                    Grapheme.nextBoundary(seq,
-                        i - Character.charCount(Character.codePointBefore(seq, i)),
-                        i + Character.charCount(Character.codePointAt(seq, i))) > i) {
+                // continue with return below
+            } else if (i < endIndex) {
+                if (Character.isSurrogatePair(seq.charAt(i - 1), seq.charAt(i))) {
+                    return false;
+                }
+                if (Grapheme.nextBoundary(seq, matcher.last, endIndex) > i) {
                     return false;
                 }
             } else {
diff -r 9a81c0a34bd0 test/jdk/java/util/regex/RegExTest.java
--- a/test/jdk/java/util/regex/RegExTest.java	Tue Apr 14 10:04:05 2020 +0800
+++ b/test/jdk/java/util/regex/RegExTest.java	Tue Apr 14 18:25:57 2020 +0200
@@ -36,7 +36,7 @@
  * 8151481 4867170 7080302 6728861 6995635 6736245 4916384 6328855 6192895
  * 6345469 6988218 6693451 7006761 8140212 8143282 8158482 8176029 8184706
  * 8194667 8197462 8184692 8221431 8224789 8228352 8230829 8236034 8235812
- * 8216332 8214245 8237599
+ * 8216332 8214245 8237599 8241055
  *
  * @library /test/lib
  * @library /lib/testlibrary/java/lang
@@ -4797,48 +4797,97 @@
     }
 
     private static void grapheme() throws Exception {
+        final int[] lineNumber = new int[1];
         Stream.concat(Files.lines(UCDFiles.GRAPHEME_BREAK_TEST),
                 Files.lines(Paths.get(System.getProperty("test.src", "."), "GraphemeTestCases.txt")))
-            .filter( ln -> ln.length() != 0 && !ln.startsWith("#") )
             .forEach( ln -> {
-                ln = ln.replaceAll("\\s+|\\([a-zA-Z]+\\)|\\[[a-zA-Z]]+\\]|#.*", "");
-                // System.out.println(str);
-                String[] strs = ln.split("\u00f7|\u00d7");
-                StringBuilder src = new StringBuilder();
-                ArrayList<String> graphemes = new ArrayList<>();
-                StringBuilder buf = new StringBuilder();
-                int offBk = 0;
-                for (String str : strs) {
-                    if (str.length() == 0)  // first empty str
-                        continue;
-                    int cp = Integer.parseInt(str, 16);
-                    src.appendCodePoint(cp);
-                    buf.appendCodePoint(cp);
-                    offBk += (str.length() + 1);
-                    if (ln.charAt(offBk) == '\u00f7') {    // DIV
-                        graphemes.add(buf.toString());
-                        buf = new StringBuilder();
+                    lineNumber[0]++;
+                    if (ln.length() == 0 || ln.startsWith("#")) return;
+                    ln = ln.replaceAll("\\s+|\\([a-zA-Z]+\\)|\\[[a-zA-Z]]+\\]|#.*", "");
+                    // System.out.println(str);
+                    String[] strs = ln.split("\u00f7|\u00d7");
+                    StringBuilder src = new StringBuilder();
+                    ArrayList<String> graphemes = new ArrayList<>();
+                    StringBuilder buf = new StringBuilder();
+                    int offBk = 0;
+                    for (String str : strs) {
+                        if (str.length() == 0)  // first empty str
+                            continue;
+                        int cp = Integer.parseInt(str, 16);
+                        src.appendCodePoint(cp);
+                        buf.appendCodePoint(cp);
+                        offBk += (str.length() + 1);
+                        if (ln.charAt(offBk) == '\u00f7') {    // DIV
+                            graphemes.add(buf.toString());
+                            buf = new StringBuilder();
+                        }
                     }
-                }
-                Pattern p = Pattern.compile("\\X");
-                Matcher m = p.matcher(src.toString());
-                Scanner s = new Scanner(src.toString()).useDelimiter("\\b{g}");
-                for (String g : graphemes) {
-                    // System.out.printf("     grapheme:=[%s]%n", g);
-                    // (1) test \\X directly
-                    if (!m.find() || !m.group().equals(g)) {
-                        System.out.println("Failed \\X [" + ln + "] : " + g);
+                    Pattern p = Pattern.compile("\\X");
+                    // (1) test \X directly
+                    Matcher m = p.matcher(src.toString());
+                    for (String g : graphemes) {
+                        // System.out.printf("     grapheme:=[%s]%n", g);
+                        String group = null;
+                        if (!m.find() || !(group = m.group()).equals(g)) {
+                            System.out.println("Failed pattern \\X [" + ln + "] : "
+                                    + "expected: " + g + " - actual: " + group
+                                    + "(line " + lineNumber[0] + ")");
+                            failCount++;
+                        }
+                    }
+                    if (m.find()) {
                         failCount++;
                     }
-                    // (2) test \\b{g} + \\X  via Scanner
-                    boolean hasNext = s.hasNext(p);
-                    // if (!s.hasNext() || !s.next().equals(next)) {
-                    if (!s.hasNext(p) || !s.next(p).equals(g)) {
-                        System.out.println("Failed b{g} [" + ln + "] : " + g);
+                    // test \b{g} without \X via Pattern
+                    Pattern pbg = Pattern.compile("\\b{g}");
+                    m = pbg.matcher(src.toString());
+                    m.find();
+                    int prev = m.end();
+                    for (String g : graphemes) {
+                        String group = null;
+                        if (!m.find() || !(group = src.substring(prev, m.end())).equals(g)) {
+                            System.out.println("Failed pattern \\b{g} [" + ln + "] : "
+                                    + "expected: " + g + " - actual: " + group
+                                    + "(line " + lineNumber[0] + ")");
+                            failCount++;
+                        }
+                        if (!"".equals(m.group())) {
+                            failCount++;
+                        }
+                        prev = m.end();
+                    }
+                    if (m.find()) {
                         failCount++;
                     }
-                }
-            });
+                    // (2) test \b{g} + \X  via Scanner
+                    Scanner s = new Scanner(src.toString()).useDelimiter("\\b{g}");
+                    for (String g : graphemes) {
+                        String next = null;
+                        if (!s.hasNext(p) || !(next = s.next(p)).equals(g)) {
+                            System.out.println("Failed \\b{g} [" + ln + "] : "
+                                    + "expected: " + g + " - actual: " + next
+                                    + " (line " + lineNumber[0] + ")");
+                            failCount++;
+                        }
+                    }
+                    if (s.hasNext(p)) {
+                        failCount++;
+                    }
+                    // test \b{g} without \X via Scanner
+                    s = new Scanner(src.toString()).useDelimiter("\\b{g}");
+                    for (String g : graphemes) {
+                        String next = null;
+                        if (!s.hasNext() || !(next = s.next()).equals(g)) {
+                            System.out.println("Failed \\b{g} [" + ln + "] : "
+                                    + "expected: " + g + " - actual: " + next
+                                    + " (line " + lineNumber[0] + ")");
+                            failCount++;
+                        }
+                    }
+                    if (s.hasNext()) {
+                        failCount++;
+                    }
+                });
         // some sanity checks
         if (!Pattern.compile("\\X{10}").matcher("abcdefghij").matches() ||
             !Pattern.compile("\\b{g}(?:\\X\\b{g}){5}\\b{g}").matcher("abcde").matches() ||
diff -r 9a81c0a34bd0 test/micro/org/openjdk/bench/java/util/regex/PatternBench.java
--- a/test/micro/org/openjdk/bench/java/util/regex/PatternBench.java	Tue Apr 14 10:04:05 2020 +0800
+++ b/test/micro/org/openjdk/bench/java/util/regex/PatternBench.java	Tue Apr 14 18:25:57 2020 +0200
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2020, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -46,6 +46,7 @@
     public String flagsString;
 
     public Pattern graphemePattern;
+    public Pattern graphemeBoundaryPattern;
     public Pattern jmodPattern;
     public Pattern jmodCanonicalPattern;
 
@@ -57,7 +58,8 @@
     public void setup() {
         flagsString = "\ud83c\udde6\ud83c\uddec\ud83c\uddec\ud83c\udde6\ud83c\uddfa\ud83c\uddf8\ud83c\uddeb\ud83c\uddf7";
         fileTestString = "META-INF/providers/org.openjdk.foo_hotspot_nodes_PluginFactory_EndLockScopeNode";
-        graphemePattern = Pattern.compile("\\b{g}");
+        graphemePattern = Pattern.compile("\\X");
+        graphemeBoundaryPattern = Pattern.compile("\\b{g}");
 
         String jmodRegex = "^.*(?:(?:_the\\.[^/]*)|(?:_[^/]*\\.marker)|(?:[^/]*\\.diz)|(?:[^/]*\\.debuginfo)|(?:[^/]*\\.dSYM/.*)|(?:[^/]*\\.dSYM)|(?:[^/]*\\.pdb)|(?:[^/]*\\.map))$";
         jmodCanonicalPattern = Pattern.compile(jmodRegex, Pattern.CANON_EQ);
@@ -71,8 +73,15 @@
     @Benchmark
     @Warmup(iterations = 3)
     @Measurement(iterations = 3)
+    public long longStringGraphemeMatches() {
+        return graphemePattern.matcher(flagsString.repeat(3)).results().count();
+    }
+
+    @Benchmark
+    @Warmup(iterations = 3)
+    @Measurement(iterations = 3)
     public int splitFlags() {
-        return graphemePattern.split(flagsString).length;
+        return graphemeBoundaryPattern.split(flagsString).length;
     }
 
     @Benchmark

Reply via email to