This is an automated email from the ASF dual-hosted git repository.
dkuzmenko pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git
The following commit(s) were added to refs/heads/master by this push:
new 2134e3dafaf HIVE-26047: Vectorized LIKE UDF optimization (Ryu
Kobayashi, reviewed by Denys Kuzmenko)
2134e3dafaf is described below
commit 2134e3dafaf95d56ec8531a1185ac0170199b218
Author: Ryu Kobayashi <[email protected]>
AuthorDate: Fri Apr 19 21:37:33 2024 +0900
HIVE-26047: Vectorized LIKE UDF optimization (Ryu Kobayashi, reviewed by
Denys Kuzmenko)
Closes #4998
---
.../AbstractFilterStringColLikeStringScalar.java | 14 +-
.../FilterStringColLikeStringScalar.java | 164 ++++++++++-----------
.../expressions/TestVectorStringExpressions.java | 68 +++++++++
3 files changed, 153 insertions(+), 93 deletions(-)
diff --git
a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/AbstractFilterStringColLikeStringScalar.java
b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/AbstractFilterStringColLikeStringScalar.java
index 85c07b6dc51..542c6b38149 100644
---
a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/AbstractFilterStringColLikeStringScalar.java
+++
b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/AbstractFilterStringColLikeStringScalar.java
@@ -226,7 +226,7 @@ public abstract class
AbstractFilterStringColLikeStringScalar extends VectorExpr
protected static final class NoneChecker implements Checker {
final byte [] byteSub;
- NoneChecker(String pattern) {
+ public NoneChecker(String pattern) {
byteSub = pattern.getBytes(StandardCharsets.UTF_8);
}
@@ -250,7 +250,7 @@ public abstract class
AbstractFilterStringColLikeStringScalar extends VectorExpr
protected static final class BeginChecker implements Checker {
final byte[] byteSub;
- BeginChecker(String pattern) {
+ public BeginChecker(String pattern) {
byteSub = pattern.getBytes(StandardCharsets.UTF_8);
}
@@ -269,7 +269,7 @@ public abstract class
AbstractFilterStringColLikeStringScalar extends VectorExpr
protected static final class EndChecker implements Checker {
final byte[] byteSub;
- EndChecker(String pattern) {
+ public EndChecker(String pattern) {
byteSub = pattern.getBytes(StandardCharsets.UTF_8);
}
@@ -288,7 +288,7 @@ public abstract class
AbstractFilterStringColLikeStringScalar extends VectorExpr
protected static final class MiddleChecker implements Checker {
final StringExpr.Finder finder;
- MiddleChecker(String pattern) {
+ public MiddleChecker(String pattern) {
finder = StringExpr.compile(pattern.getBytes(StandardCharsets.UTF_8));
}
@@ -324,7 +324,7 @@ public abstract class
AbstractFilterStringColLikeStringScalar extends VectorExpr
final int beginLen;
final int endLen;
- ChainedChecker(String pattern) {
+ public ChainedChecker(String pattern) {
final StringTokenizer tokens = new StringTokenizer(pattern, "%");
final boolean leftAnchor = pattern.startsWith("%") == false;
final boolean rightAnchor = pattern.endsWith("%") == false;
@@ -413,12 +413,12 @@ public abstract class
AbstractFilterStringColLikeStringScalar extends VectorExpr
/**
* Matches each string to a pattern with Java regular expression package.
*/
- protected static class ComplexChecker implements Checker {
+ protected static final class ComplexChecker implements Checker {
Pattern compiledPattern;
Matcher matcher;
FastUTF8Decoder decoder;
- ComplexChecker(String pattern) {
+ public ComplexChecker(String pattern) {
compiledPattern = Pattern.compile(pattern);
matcher = compiledPattern.matcher("");
decoder = new FastUTF8Decoder();
diff --git
a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterStringColLikeStringScalar.java
b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterStringColLikeStringScalar.java
index 46cc4300413..88f12a2a9fa 100644
---
a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterStringColLikeStringScalar.java
+++
b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterStringColLikeStringScalar.java
@@ -1,4 +1,4 @@
-/*
+/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
@@ -20,11 +20,10 @@ package org.apache.hadoop.hive.ql.exec.vector.expressions;
import org.apache.hadoop.hive.ql.udf.UDFLike;
+import com.google.common.collect.ImmutableList;
+
import java.nio.charset.StandardCharsets;
-import java.util.Arrays;
import java.util.List;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
/**
* Evaluate LIKE filter on a batch for a vector of strings.
@@ -32,13 +31,16 @@ import java.util.regex.Pattern;
public class FilterStringColLikeStringScalar extends
AbstractFilterStringColLikeStringScalar {
private static final long serialVersionUID = 1L;
- private transient final static List<CheckerFactory> checkerFactories =
Arrays.asList(
- new BeginCheckerFactory(),
- new EndCheckerFactory(),
- new MiddleCheckerFactory(),
- new NoneCheckerFactory(),
- new ChainedCheckerFactory(),
- new ComplexCheckerFactory());
+ private static final List<CheckerFactory> CHECKER_FACTORIES =
ImmutableList.of(
+ pattern -> { // classify the LIKE pattern, then build its Checker reflectively
+ UDFLikePattern udfLike = UDFLikePattern.matcher(pattern);
+ try {
+ return udfLike.checker.getConstructor(String.class).newInstance(
+ udfLike.format(pattern));
+ } catch (Exception e) { // chain the cause so the underlying failure is not lost
+ throw new IllegalArgumentException("unable to initialize Checker", e);
+ }
+ });
public FilterStringColLikeStringScalar() {
super();
@@ -51,93 +53,83 @@ public class FilterStringColLikeStringScalar extends
AbstractFilterStringColLike
@Override
protected List<CheckerFactory> getCheckerFactories() {
- return checkerFactories;
+ return CHECKER_FACTORIES;
}
- /**
- * Accepts simple LIKE patterns like "abc%" and creates corresponding
checkers.
- */
- private static class BeginCheckerFactory implements CheckerFactory {
- private static final Pattern BEGIN_PATTERN = Pattern.compile("([^_%]+)%");
-
- public Checker tryCreate(String pattern) {
- Matcher matcher = BEGIN_PATTERN.matcher(pattern);
- if (matcher.matches()) {
- return new BeginChecker(matcher.group(1));
+ private enum UDFLikePattern {
+ // Accepts simple LIKE patterns like "abc%" and creates corresponding
checkers.
+ BEGIN(BeginChecker.class) {
+ @Override
+ String format(String pattern) {
+ return pattern.substring(0, pattern.length() - 1);
}
- return null;
- }
- }
-
- /**
- * Accepts simple LIKE patterns like "%abc" and creates a corresponding
checkers.
- */
- private static class EndCheckerFactory implements CheckerFactory {
- private static final Pattern END_PATTERN = Pattern.compile("%([^_%]+)");
-
- public Checker tryCreate(String pattern) {
- Matcher matcher = END_PATTERN.matcher(pattern);
- if (matcher.matches()) {
- return new EndChecker(matcher.group(1));
+ },
+ // Accepts simple LIKE patterns like "%abc" and creates corresponding
checkers.
+ END(EndChecker.class) {
+ @Override
+ String format(String pattern) {
+ return pattern.substring(1);
}
- return null;
- }
- }
-
- /**
- * Accepts simple LIKE patterns like "%abc%" and creates a corresponding
checkers.
- */
- private static class MiddleCheckerFactory implements CheckerFactory {
- private static final Pattern MIDDLE_PATTERN =
Pattern.compile("%([^_%]+)%");
-
- public Checker tryCreate(String pattern) {
- Matcher matcher = MIDDLE_PATTERN.matcher(pattern);
- if (matcher.matches()) {
- return new MiddleChecker(matcher.group(1));
+ },
+ // Accepts simple LIKE patterns like "%abc%" and creates corresponding
checkers.
+ MIDDLE(MiddleChecker.class) {
+ @Override
+ String format(String pattern) {
+ return pattern.substring(1, pattern.length() - 1);
}
- return null;
- }
- }
+ },
+ // Accepts any LIKE patterns and creates corresponding checkers.
+ COMPLEX(ComplexChecker.class) {
+ @Override
+ String format(String pattern) {
+ return "^" + UDFLike.likePatternToRegExp(pattern) + "$";
+ }
+ },
+ // Accepts chained LIKE patterns without escaping like "abc%def%ghi%" and
+ // creates corresponding checkers.
+ CHAINED(ChainedChecker.class),
+ // Accepts simple LIKE patterns like "abc" and creates corresponding
checkers.
+ NONE(NoneChecker.class);
- /**
- * Accepts simple LIKE patterns like "abc" and creates corresponding
checkers.
- */
- private static class NoneCheckerFactory implements CheckerFactory {
- private static final Pattern NONE_PATTERN = Pattern.compile("[^%_]+");
+ Class<? extends Checker> checker;
- public Checker tryCreate(String pattern) {
- Matcher matcher = NONE_PATTERN.matcher(pattern);
- if (matcher.matches()) {
- return new NoneChecker(pattern);
- }
- return null;
+ UDFLikePattern(Class<? extends Checker> checker) {
+ this.checker = checker;
}
- }
- /**
- * Accepts chained LIKE patterns without escaping like "abc%def%ghi%" and
creates corresponding
- * checkers.
- *
- */
- private static class ChainedCheckerFactory implements CheckerFactory {
- private static final Pattern CHAIN_PATTERN =
Pattern.compile("(%?[^%_\\\\]+%?)+");
-
- public Checker tryCreate(String pattern) {
- Matcher matcher = CHAIN_PATTERN.matcher(pattern);
- if (matcher.matches()) {
- return new ChainedChecker(pattern);
+ private static UDFLikePattern matcher(String pattern) { // classify by position of unescaped '%' / '_'
+ UDFLikePattern lastType = NONE; // result when no wildcard is found
+ int length = pattern.length();
+ char lastChar = 0; // previous character; '\\' marks the current one as escaped
+
+ for (int i = 0; i < length; i++) {
+ char n = pattern.charAt(i);
+ if (n == '_' && lastChar != '\\') { // such as "a_bc"
+ return COMPLEX;
+ } else if (n == '%') {
+ if (i == 0) { // such as "%abc"
+ lastType = END;
+ } else if (i < length - 1) { // '%' strictly inside the pattern
+ if (lastChar != '\\') { // such as "a%bc"
+ lastType = CHAINED;
+ }
+ } else { // '%' is the final character
+ if (lastChar != '\\') {
+ if (lastType == END) { // such as "%abc%"
+ lastType = MIDDLE;
+ } else if (lastType != CHAINED) {
+ lastType = BEGIN; // such as "abc%"
+ }
+ }
+ }
+ }
+ lastChar = n;
+ }
- return null;
+ return lastType; // NOTE(review): escaped "\%" / "\_" keep their backslash in what format() passes on - confirm the simple checkers expect that
}
- }
- /**
- * Accepts any LIKE patterns and creates corresponding checkers.
- */
- private static class ComplexCheckerFactory implements CheckerFactory {
- public Checker tryCreate(String pattern) {
- // anchor the pattern to the start:end of the whole string.
- return new ComplexChecker("^" + UDFLike.likePatternToRegExp(pattern) +
"$");
+ String format(String pattern) {
+ return pattern;
}
}
}
diff --git
a/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorStringExpressions.java
b/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorStringExpressions.java
index 62c262e3409..22ac562c43a 100644
---
a/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorStringExpressions.java
+++
b/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorStringExpressions.java
@@ -4303,6 +4303,12 @@ public class TestVectorStringExpressions {
Assert.assertEquals(FilterStringColLikeStringScalar.BeginChecker.class,
expr.checker.getClass());
+ expr = new FilterStringColLikeStringScalar(0, "abc\\%def%".getBytes());
+ expr.transientInit(hiveConf);
+ expr.evaluate(vrb);
+ Assert.assertEquals(FilterStringColLikeStringScalar.BeginChecker.class,
+ expr.checker.getClass());
+
// END pattern
expr = new FilterStringColLikeStringScalar(0,
"%abc".getBytes(StandardCharsets.UTF_8));
expr.transientInit(hiveConf);
@@ -4310,6 +4316,12 @@ public class TestVectorStringExpressions {
Assert.assertEquals(FilterStringColLikeStringScalar.EndChecker.class,
expr.checker.getClass());
+ expr = new FilterStringColLikeStringScalar(0,
"%abc\\%def".getBytes(StandardCharsets.UTF_8));
+ expr.transientInit(hiveConf);
+ expr.evaluate(vrb);
+ Assert.assertEquals(FilterStringColLikeStringScalar.EndChecker.class,
+ expr.checker.getClass());
+
// MIDDLE pattern
expr = new FilterStringColLikeStringScalar(0, "%abc%".getBytes());
expr.transientInit(hiveConf);
@@ -4317,6 +4329,12 @@ public class TestVectorStringExpressions {
Assert.assertEquals(FilterStringColLikeStringScalar.MiddleChecker.class,
expr.checker.getClass());
+ expr = new FilterStringColLikeStringScalar(0, "%abc\\%def%".getBytes());
+ expr.transientInit(hiveConf);
+ expr.evaluate(vrb);
+ Assert.assertEquals(FilterStringColLikeStringScalar.MiddleChecker.class,
+ expr.checker.getClass());
+
// CHAIN pattern
expr = new FilterStringColLikeStringScalar(0, "%abc%de".getBytes());
expr.transientInit(hiveConf);
@@ -4331,6 +4349,56 @@ public class TestVectorStringExpressions {
Assert.assertEquals(FilterStringColLikeStringScalar.ComplexChecker.class,
expr.checker.getClass());
+ expr = new FilterStringColLikeStringScalar(0, "abc_".getBytes());
+ expr.transientInit(hiveConf);
+ expr.evaluate(vrb);
+ Assert.assertEquals(FilterStringColLikeStringScalar.ComplexChecker.class,
+ expr.checker.getClass());
+
+ expr = new FilterStringColLikeStringScalar(0, "abc\\_def_".getBytes());
+ expr.transientInit(hiveConf);
+ expr.evaluate(vrb);
+ Assert.assertEquals(FilterStringColLikeStringScalar.ComplexChecker.class,
+ expr.checker.getClass());
+
+ expr = new FilterStringColLikeStringScalar(0,
"_abc".getBytes(StandardCharsets.UTF_8));
+ expr.transientInit(hiveConf);
+ expr.evaluate(vrb);
+ Assert.assertEquals(FilterStringColLikeStringScalar.ComplexChecker.class,
+ expr.checker.getClass());
+
+ expr = new FilterStringColLikeStringScalar(0,
"_abc\\_def".getBytes(StandardCharsets.UTF_8));
+ expr.transientInit(hiveConf);
+ expr.evaluate(vrb);
+ Assert.assertEquals(FilterStringColLikeStringScalar.ComplexChecker.class,
+ expr.checker.getClass());
+
+ expr = new FilterStringColLikeStringScalar(0, "_abc_".getBytes());
+ expr.transientInit(hiveConf);
+ expr.evaluate(vrb);
+ Assert.assertEquals(FilterStringColLikeStringScalar.ComplexChecker.class,
+ expr.checker.getClass());
+
+ expr = new FilterStringColLikeStringScalar(0, "_abc\\_def_".getBytes());
+ expr.transientInit(hiveConf);
+ expr.evaluate(vrb);
+ Assert.assertEquals(FilterStringColLikeStringScalar.ComplexChecker.class,
+ expr.checker.getClass());
+
+ expr = new FilterStringColLikeStringScalar(0, "_abc_de".getBytes());
+ expr.transientInit(hiveConf);
+ expr.evaluate(vrb);
+ Assert.assertEquals(FilterStringColLikeStringScalar.ComplexChecker.class,
+ expr.checker.getClass());
+
+
+ expr = new FilterStringColLikeStringScalar(0,
+
"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa_b".getBytes());
+ expr.transientInit(hiveConf);
+ expr.evaluate(vrb);
+ Assert.assertEquals(FilterStringColLikeStringScalar.ComplexChecker.class,
+ expr.checker.getClass());
+
// NONE pattern
expr = new FilterStringColLikeStringScalar(0, "abc".getBytes());
expr.transientInit(hiveConf);