This is an automated email from the ASF dual-hosted git repository.

mbudiu pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/calcite.git


The following commit(s) were added to refs/heads/main by this push:
     new e352e93305 [CALCITE-7153] Mixed wildcards of _ and % need to be 
simplified in LIKE operator
e352e93305 is described below

commit e352e933059ae4d895d8e943604ef40fb2a43e40
Author: xuzifu666 <[email protected]>
AuthorDate: Wed Sep 3 11:56:55 2025 +0800

    [CALCITE-7153] Mixed wildcards of _ and % need to be simplified in LIKE 
operator
---
 .../java/org/apache/calcite/rex/RexSimplify.java   | 63 +++++++++++++++++++++-
 .../org/apache/calcite/rex/RexProgramTest.java     | 26 ++++++++-
 2 files changed, 86 insertions(+), 3 deletions(-)

diff --git a/core/src/main/java/org/apache/calcite/rex/RexSimplify.java 
b/core/src/main/java/org/apache/calcite/rex/RexSimplify.java
index 410ce4fd12..c9c8081861 100644
--- a/core/src/main/java/org/apache/calcite/rex/RexSimplify.java
+++ b/core/src/main/java/org/apache/calcite/rex/RexSimplify.java
@@ -65,6 +65,7 @@
 import java.util.List;
 import java.util.Map;
 import java.util.Set;
+import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
 import static org.apache.calcite.linq4j.Nullness.castNonNull;
@@ -529,6 +530,66 @@ private RexNode simplifyLike(RexCall e, RexUnknownAs 
unknownAs) {
     return simplifyGenericNode(e);
   }
 
+  // string 'AA%%__%%AA' simplify to 'AA__%AA'
+  // string with even escapes 'AA\\\\%%__%%AA' simplify to 'AA\\__%AA'
+  // string with odd escapes 'AA\\\\\\%%__%%AA' simplify to 'AA\\\\\\%__%AA'
+  private String simplifyMixedWildcards(String str, char escape) {
+    Pattern pattern = Pattern.compile("[_%]+");
+    Matcher matcher = pattern.matcher(str);
+    StringBuilder builder = new StringBuilder();
+    int from = 0;
+    while (matcher.find()) {
+      int start = matcher.start();
+      String group = requireNonNull(matcher.group(0));
+      if (start > 0
+          && str.charAt(start - 1) == escape
+          && consecutiveSameCharCountBefore(str, start - 1, escape) % 2 == 1) {
+        builder.append(str.substring(from, start + 1));
+        builder.append(simplifyPercentAndUnderline(group.substring(1)));
+      } else {
+        builder.append(str.substring(from, start));
+        builder.append(simplifyPercentAndUnderline(group));
+      }
+      from = matcher.end();
+    }
+    if (from < str.length()) {
+      builder.append(str.substring(from));
+    }
+    return builder.toString();
+  }
+
+  // Tool method: count the number of consecutive identical characters before 
index
+  private int consecutiveSameCharCountBefore(String str, int index, char 
escape) {
+    int count = 0;
+    while (index >= 0) {
+      if (str.charAt(index) != escape) {
+        break;
+      }
+      count++;
+      index--;
+    }
+    return count;
+  }
+
+  // Tool method: simplified string mixed with '%' and '_'
+  private String simplifyPercentAndUnderline(String str) {
+    StringBuilder builder = new StringBuilder();
+    boolean containsPercent = false;
+    for (int index = 0; index < str.length(); index++) {
+      if (str.charAt(index) == '%') {
+        containsPercent = true;
+        continue;
+      }
+      if (str.charAt(index) == '_') {
+        builder.append('_');
+      }
+    }
+    if (containsPercent) {
+      builder.append('%');
+    }
+    return builder.toString();
+  }
+
   /**
    * Simplifies like string with escape.
    * A like '%%#%%A%%' escape '#' should simplify to A like '%#%%A%' escape 
'#'.
@@ -559,7 +620,7 @@ private String simplifyLikeString(String content, char 
escape, char wildcard) {
       escapeCount = 0;
       wildcardCount = 0;
     }
-    return builder.toString();
+    return simplifyMixedWildcards(builder.toString(), escape);
   }
 
   // e must be a comparison (=, >, >=, <, <=, !=)
diff --git a/core/src/test/java/org/apache/calcite/rex/RexProgramTest.java 
b/core/src/test/java/org/apache/calcite/rex/RexProgramTest.java
index 8c583c8a66..db9a427fea 100644
--- a/core/src/test/java/org/apache/calcite/rex/RexProgramTest.java
+++ b/core/src/test/java/org/apache/calcite/rex/RexProgramTest.java
@@ -3940,7 +3940,9 @@ private void checkSarg(String message, Sarg sarg,
    * <a 
href="https://issues.apache.org/jira/browse/CALCITE-4094";>[CALCITE-4094]
    * RexSimplify should simplify more always true OR expressions</a>,
    * <a 
href="https://issues.apache.org/jira/browse/CALCITE-7088";>[CALCITE-7088]
-   * Multiple consecutive '%' in the string matched by LIKE should simplify to 
a single '%'</a>.
+   * Multiple consecutive '%' in the string matched by LIKE should simplify to 
a single '%'</a>,
+   * <a 
href="https://issues.apache.org/jira/browse/CALCITE-7153";>[CALCITE-7153]
+   * Mixed wildcards of _ and % need to be simplified in LIKE operator</a>.
    * */
   @Test void testSimplifyLike() {
     final RexNode ref = input(tVarchar(true, 10), 0);
@@ -3963,12 +3965,32 @@ private void checkSarg(String message, Sarg sarg,
         "true");
     checkSimplifyUnchanged(like(ref, literal("%A")));
     checkSimplify(like(ref, literal("%%A")), "LIKE($0, '%A')");
-    checkSimplify(like(ref, literal("%%%_A%%B%%")), "LIKE($0, '%_A%B%')");
+    checkSimplify(like(ref, literal("%%%_A%%B%%")), "LIKE($0, '_%A%B%')");
     checkSimplify(like(ref, literal("%%A%%%")), "LIKE($0, '%A%')");
     checkSimplify(like(ref, literal("%%\\%%A\\%%%%%")), "LIKE($0, 
'%\\%%A\\%%')");
     checkSimplify(like(ref, literal("%%A"), literal("#")), "LIKE($0, '%A', 
'#')");
     checkSimplify(like(ref, literal("%%#%%A%%"), literal("#")),
         "LIKE($0, '%#%%A%', '#')");
+    checkSimplify(like(ref, literal("AA%__%BB%_CC")),
+        "LIKE($0, 'AA__%BB_%CC')");
+    checkSimplify(like(ref, literal("%%__%AA%_BB")),
+        "LIKE($0, '__%AA_%BB')");
+    checkSimplify(like(ref, literal("AA\\%%___%BB%%%___%CC")),
+        "LIKE($0, 'AA\\%___%BB___%CC')");
+    checkSimplify(like(ref, literal("AA#%%___%BB%%%___%CC")),
+        "LIKE($0, 'AA#___%BB___%CC')");
+    checkSimplify(like(ref, literal("AA#%%___%BB%%%___%CC"), literal("#")),
+        "LIKE($0, 'AA#%___%BB___%CC', '#')");
+    // odd number of '#', the next % would not be simplified
+    checkSimplify(like(ref, literal("AA###%%___%BB%%%___%CC"), literal("#")),
+        "LIKE($0, 'AA###%___%BB___%CC', '#')");
+    checkSimplify(like(ref, literal("AA\\\\\\%%___%BB%%%___%CC")),
+        "LIKE($0, 'AA\\\\\\%___%BB___%CC')");
+    // even number of '#', the next % would be simplified
+    checkSimplify(like(ref, literal("AA##%%___%BB%%%___%CC"), literal("#")),
+        "LIKE($0, 'AA##___%BB___%CC', '#')");
+    checkSimplify(like(ref, literal("AA\\\\\\\\%%___%BB%%%___%CC")),
+        "LIKE($0, 'AA\\\\\\\\___%BB___%CC')");
     checkSimplify(like(ref, literal("%%#%#%A%%"), literal("#")),
         "LIKE($0, '%#%#%A%', '#')");
     checkSimplify(like(ref, literal("###%%#%#%A%%##%%%"), literal("#")),

Reply via email to