This is an automated email from the ASF dual-hosted git repository.

dkuzmenko pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git


The following commit(s) were added to refs/heads/master by this push:
     new 2134e3dafaf HIVE-26047: Vectorized LIKE UDF optimization (Ryu Kobayashi, reviewed by Denys Kuzmenko)
2134e3dafaf is described below

commit 2134e3dafaf95d56ec8531a1185ac0170199b218
Author: Ryu Kobayashi <[email protected]>
AuthorDate: Fri Apr 19 21:37:33 2024 +0900

    HIVE-26047: Vectorized LIKE UDF optimization (Ryu Kobayashi, reviewed by Denys Kuzmenko)
    
    Closes #4998
---
 .../AbstractFilterStringColLikeStringScalar.java   |  14 +-
 .../FilterStringColLikeStringScalar.java           | 164 ++++++++++-----------
 .../expressions/TestVectorStringExpressions.java   |  68 +++++++++
 3 files changed, 153 insertions(+), 93 deletions(-)

diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/AbstractFilterStringColLikeStringScalar.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/AbstractFilterStringColLikeStringScalar.java
index 85c07b6dc51..542c6b38149 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/AbstractFilterStringColLikeStringScalar.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/AbstractFilterStringColLikeStringScalar.java
@@ -226,7 +226,7 @@ public abstract class AbstractFilterStringColLikeStringScalar extends VectorExpr
   protected static final class NoneChecker implements Checker {
     final byte [] byteSub;
 
-    NoneChecker(String pattern) {
+    public NoneChecker(String pattern) {
       byteSub = pattern.getBytes(StandardCharsets.UTF_8);
     }
 
@@ -250,7 +250,7 @@ public abstract class AbstractFilterStringColLikeStringScalar extends VectorExpr
   protected static final class BeginChecker implements Checker {
     final byte[] byteSub;
 
-    BeginChecker(String pattern) {
+    public BeginChecker(String pattern) {
       byteSub = pattern.getBytes(StandardCharsets.UTF_8);
     }
 
@@ -269,7 +269,7 @@ public abstract class AbstractFilterStringColLikeStringScalar extends VectorExpr
   protected static final class EndChecker implements Checker {
     final byte[] byteSub;
 
-    EndChecker(String pattern) {
+    public EndChecker(String pattern) {
       byteSub = pattern.getBytes(StandardCharsets.UTF_8);
     }
 
@@ -288,7 +288,7 @@ public abstract class AbstractFilterStringColLikeStringScalar extends VectorExpr
   protected static final class MiddleChecker implements Checker {
     final StringExpr.Finder finder;
 
-    MiddleChecker(String pattern) {
+    public MiddleChecker(String pattern) {
       finder = StringExpr.compile(pattern.getBytes(StandardCharsets.UTF_8));
     }
 
@@ -324,7 +324,7 @@ public abstract class AbstractFilterStringColLikeStringScalar extends VectorExpr
     final int beginLen;
     final int endLen;
 
-    ChainedChecker(String pattern) {
+    public ChainedChecker(String pattern) {
       final StringTokenizer tokens = new StringTokenizer(pattern, "%");
       final boolean leftAnchor = pattern.startsWith("%") == false;
       final boolean rightAnchor = pattern.endsWith("%") == false;
@@ -413,12 +413,12 @@ public abstract class AbstractFilterStringColLikeStringScalar extends VectorExpr
   /**
    * Matches each string to a pattern with Java regular expression package.
    */
-  protected static class ComplexChecker implements Checker {
+  protected static final class ComplexChecker implements Checker {
     Pattern compiledPattern;
     Matcher matcher;
     FastUTF8Decoder decoder;
 
-    ComplexChecker(String pattern) {
+    public ComplexChecker(String pattern) {
       compiledPattern = Pattern.compile(pattern);
       matcher = compiledPattern.matcher("");
       decoder = new FastUTF8Decoder();
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterStringColLikeStringScalar.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterStringColLikeStringScalar.java
index 46cc4300413..88f12a2a9fa 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterStringColLikeStringScalar.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterStringColLikeStringScalar.java
@@ -1,4 +1,4 @@
-/*
+/**
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
@@ -20,11 +20,10 @@ package org.apache.hadoop.hive.ql.exec.vector.expressions;
 
 import org.apache.hadoop.hive.ql.udf.UDFLike;
 
+import com.google.common.collect.ImmutableList;
+
 import java.nio.charset.StandardCharsets;
-import java.util.Arrays;
 import java.util.List;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
 
 /**
  * Evaluate LIKE filter on a batch for a vector of strings.
@@ -32,13 +31,16 @@ import java.util.regex.Pattern;
 public class FilterStringColLikeStringScalar extends AbstractFilterStringColLikeStringScalar {
   private static final long serialVersionUID = 1L;
 
-  private transient final static List<CheckerFactory> checkerFactories = Arrays.asList(
-      new BeginCheckerFactory(),
-      new EndCheckerFactory(),
-      new MiddleCheckerFactory(),
-      new NoneCheckerFactory(),
-      new ChainedCheckerFactory(),
-      new ComplexCheckerFactory());
+  private static final List<CheckerFactory> CHECKER_FACTORIES = ImmutableList.of(
+    pattern -> {
+      UDFLikePattern udfLike = UDFLikePattern.matcher(pattern);
+      try {
+        return udfLike.checker.getConstructor(String.class).newInstance(
+          udfLike.format(pattern));
+      } catch (Exception e) {
+        throw new IllegalArgumentException("unable to initialize Checker");
+      }
+    });
 
   public FilterStringColLikeStringScalar() {
     super();
@@ -51,93 +53,83 @@ public class FilterStringColLikeStringScalar extends AbstractFilterStringColLike
 
   @Override
   protected List<CheckerFactory> getCheckerFactories() {
-    return checkerFactories;
+    return CHECKER_FACTORIES;
   }
 
-  /**
-   * Accepts simple LIKE patterns like "abc%" and creates corresponding checkers.
-   */
-  private static class BeginCheckerFactory implements CheckerFactory {
-    private static final Pattern BEGIN_PATTERN = Pattern.compile("([^_%]+)%");
-
-    public Checker tryCreate(String pattern) {
-      Matcher matcher = BEGIN_PATTERN.matcher(pattern);
-      if (matcher.matches()) {
-        return new BeginChecker(matcher.group(1));
+  private enum UDFLikePattern {
+    // Accepts simple LIKE patterns like "abc%" and creates corresponding checkers.
+    BEGIN(BeginChecker.class) {
+      @Override
+      String format(String pattern) {
+        return pattern.substring(0, pattern.length() - 1);
       }
-      return null;
-    }
-  }
-
-  /**
-   * Accepts simple LIKE patterns like "%abc" and creates a corresponding checkers.
-   */
-  private static class EndCheckerFactory implements CheckerFactory {
-    private static final Pattern END_PATTERN = Pattern.compile("%([^_%]+)");
-
-    public Checker tryCreate(String pattern) {
-      Matcher matcher = END_PATTERN.matcher(pattern);
-      if (matcher.matches()) {
-        return new EndChecker(matcher.group(1));
+    },
+    // Accepts simple LIKE patterns like "%abc" and creates a corresponding checkers.
+    END(EndChecker.class) {
+      @Override
+      String format(String pattern) {
+        return pattern.substring(1);
       }
-      return null;
-    }
-  }
-
-  /**
-   * Accepts simple LIKE patterns like "%abc%" and creates a corresponding checkers.
-   */
-  private static class MiddleCheckerFactory implements CheckerFactory {
-    private static final Pattern MIDDLE_PATTERN = Pattern.compile("%([^_%]+)%");
-
-    public Checker tryCreate(String pattern) {
-      Matcher matcher = MIDDLE_PATTERN.matcher(pattern);
-      if (matcher.matches()) {
-        return new MiddleChecker(matcher.group(1));
+    },
+    // Accepts simple LIKE patterns like "%abc%" and creates a corresponding checkers.
+    MIDDLE(MiddleChecker.class) {
+      @Override
+      String format(String pattern) {
+        return pattern.substring(1, pattern.length() - 1);
       }
-      return null;
-    }
-  }
+    },
+    // Accepts any LIKE patterns and creates corresponding checkers.
+    COMPLEX(ComplexChecker.class) {
+      @Override
+      String format(String pattern) {
+        return "^" + UDFLike.likePatternToRegExp(pattern) + "$";
+      }
+    },
+    // Accepts chained LIKE patterns without escaping like "abc%def%ghi%" and
+    // creates corresponding checkers.
+    CHAINED(ChainedChecker.class),
+    // Accepts simple LIKE patterns like "abc" and creates corresponding checkers.
+    NONE(NoneChecker.class);
 
-  /**
-   * Accepts simple LIKE patterns like "abc" and creates corresponding checkers.
-   */
-  private static class NoneCheckerFactory implements CheckerFactory {
-    private static final Pattern NONE_PATTERN = Pattern.compile("[^%_]+");
+    Class<? extends Checker> checker;
 
-    public Checker tryCreate(String pattern) {
-      Matcher matcher = NONE_PATTERN.matcher(pattern);
-      if (matcher.matches()) {
-        return new NoneChecker(pattern);
-      }
-      return null;
+    UDFLikePattern(Class<? extends Checker> checker) {
+      this.checker = checker;
     }
-  }
 
-  /**
-   * Accepts chained LIKE patterns without escaping like "abc%def%ghi%" and creates corresponding
-   * checkers.
-   *
-   */
-  private static class ChainedCheckerFactory implements CheckerFactory {
-    private static final Pattern CHAIN_PATTERN = Pattern.compile("(%?[^%_\\\\]+%?)+");
-
-    public Checker tryCreate(String pattern) {
-      Matcher matcher = CHAIN_PATTERN.matcher(pattern);
-      if (matcher.matches()) {
-        return new ChainedChecker(pattern);
+    private static UDFLikePattern matcher(String pattern) {
+      UDFLikePattern lastType = NONE;
+      int length = pattern.length();
+      char lastChar = 0;
+
+      for (int i = 0; i < length; i++) {
+        char n = pattern.charAt(i);
+        if (n == '_' && lastChar != '\\') { // such as "a_bc"
+          return COMPLEX;
+        } else if (n == '%') {
+          if (i == 0) { // such as "%abc"
+            lastType = END;
+          } else if (i < length - 1) {
+            if (lastChar != '\\') { // such as "a%bc"
+              lastType = CHAINED;
+            }
+          } else {
+            if (lastChar != '\\') {
+              if (lastType == END) { // such as "%abc%"
+                lastType = MIDDLE;
+              } else if (lastType != CHAINED) {
+                lastType = BEGIN; // such as "abc%"
+              }
+            }
+          }
+        }
+        lastChar = n;
       }
-      return null;
+      return lastType;
     }
-  }
 
-  /**
-   * Accepts any LIKE patterns and creates corresponding checkers.
-   */
-  private static class ComplexCheckerFactory implements CheckerFactory {
-    public Checker tryCreate(String pattern) {
-      // anchor the pattern to the start:end of the whole string.
-      return new ComplexChecker("^" + UDFLike.likePatternToRegExp(pattern) + "$");
+    String format(String pattern) {
+      return pattern;
     }
   }
 }
diff --git a/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorStringExpressions.java b/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorStringExpressions.java
index 62c262e3409..22ac562c43a 100644
--- a/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorStringExpressions.java
+++ b/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorStringExpressions.java
@@ -4303,6 +4303,12 @@ public class TestVectorStringExpressions {
     Assert.assertEquals(FilterStringColLikeStringScalar.BeginChecker.class,
         expr.checker.getClass());
 
+    expr = new FilterStringColLikeStringScalar(0, "abc\\%def%".getBytes());
+    expr.transientInit(hiveConf);
+    expr.evaluate(vrb);
+    Assert.assertEquals(FilterStringColLikeStringScalar.BeginChecker.class,
+        expr.checker.getClass());
+
     // END pattern
     expr = new FilterStringColLikeStringScalar(0, "%abc".getBytes(StandardCharsets.UTF_8));
     expr.transientInit(hiveConf);
@@ -4310,6 +4316,12 @@ public class TestVectorStringExpressions {
     Assert.assertEquals(FilterStringColLikeStringScalar.EndChecker.class,
         expr.checker.getClass());
 
+    expr = new FilterStringColLikeStringScalar(0, "%abc\\%def".getBytes(StandardCharsets.UTF_8));
+    expr.transientInit(hiveConf);
+    expr.evaluate(vrb);
+    Assert.assertEquals(FilterStringColLikeStringScalar.EndChecker.class,
+        expr.checker.getClass());
+
     // MIDDLE pattern
     expr = new FilterStringColLikeStringScalar(0, "%abc%".getBytes());
     expr.transientInit(hiveConf);
@@ -4317,6 +4329,12 @@ public class TestVectorStringExpressions {
     Assert.assertEquals(FilterStringColLikeStringScalar.MiddleChecker.class,
         expr.checker.getClass());
 
+    expr = new FilterStringColLikeStringScalar(0, "%abc\\%def%".getBytes());
+    expr.transientInit(hiveConf);
+    expr.evaluate(vrb);
+    Assert.assertEquals(FilterStringColLikeStringScalar.MiddleChecker.class,
+        expr.checker.getClass());
+
     // CHAIN pattern
     expr = new FilterStringColLikeStringScalar(0, "%abc%de".getBytes());
     expr.transientInit(hiveConf);
@@ -4331,6 +4349,56 @@ public class TestVectorStringExpressions {
     Assert.assertEquals(FilterStringColLikeStringScalar.ComplexChecker.class,
         expr.checker.getClass());
 
+    expr = new FilterStringColLikeStringScalar(0, "abc_".getBytes());
+    expr.transientInit(hiveConf);
+    expr.evaluate(vrb);
+    Assert.assertEquals(FilterStringColLikeStringScalar.ComplexChecker.class,
+        expr.checker.getClass());
+
+    expr = new FilterStringColLikeStringScalar(0, "abc\\_def_".getBytes());
+    expr.transientInit(hiveConf);
+    expr.evaluate(vrb);
+    Assert.assertEquals(FilterStringColLikeStringScalar.ComplexChecker.class,
+        expr.checker.getClass());
+
+    expr = new FilterStringColLikeStringScalar(0, "_abc".getBytes(StandardCharsets.UTF_8));
+    expr.transientInit(hiveConf);
+    expr.evaluate(vrb);
+    Assert.assertEquals(FilterStringColLikeStringScalar.ComplexChecker.class,
+        expr.checker.getClass());
+
+    expr = new FilterStringColLikeStringScalar(0, "_abc\\_def".getBytes(StandardCharsets.UTF_8));
+    expr.transientInit(hiveConf);
+    expr.evaluate(vrb);
+    Assert.assertEquals(FilterStringColLikeStringScalar.ComplexChecker.class,
+        expr.checker.getClass());
+
+    expr = new FilterStringColLikeStringScalar(0, "_abc_".getBytes());
+    expr.transientInit(hiveConf);
+    expr.evaluate(vrb);
+    Assert.assertEquals(FilterStringColLikeStringScalar.ComplexChecker.class,
+        expr.checker.getClass());
+
+    expr = new FilterStringColLikeStringScalar(0, "_abc\\_def_".getBytes());
+    expr.transientInit(hiveConf);
+    expr.evaluate(vrb);
+    Assert.assertEquals(FilterStringColLikeStringScalar.ComplexChecker.class,
+        expr.checker.getClass());
+
+    expr = new FilterStringColLikeStringScalar(0, "_abc_de".getBytes());
+    expr.transientInit(hiveConf);
+    expr.evaluate(vrb);
+    Assert.assertEquals(FilterStringColLikeStringScalar.ComplexChecker.class,
+        expr.checker.getClass());
+
+
+    expr = new FilterStringColLikeStringScalar(0,
+        
"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa_b".getBytes());
+    expr.transientInit(hiveConf);
+    expr.evaluate(vrb);
+    Assert.assertEquals(FilterStringColLikeStringScalar.ComplexChecker.class,
+        expr.checker.getClass());
+
     // NONE pattern
     expr = new FilterStringColLikeStringScalar(0, "abc".getBytes());
     expr.transientInit(hiveConf);

Reply via email to