This is an automated email from the ASF dual-hosted git repository.
hashutosh pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git
The following commit(s) were added to refs/heads/master by this push:
new f7b1809 HIVE-19661 : switch Hive UDFs to use Re2J regex engine
(Rajkumar Singh via Ashutosh Chauhan)
f7b1809 is described below
commit f7b18096eea4e59668d038d21682c4d59548ed8f
Author: Rajkumar Singh <[email protected]>
AuthorDate: Sun May 27 15:52:00 2018 -0700
HIVE-19661 : switch Hive UDFs to use Re2J regex engine (Rajkumar Singh via
Ashutosh Chauhan)
Signed-off-by: Ashutosh Chauhan <[email protected]>
---
LICENSE | 30 +++++++++++
.../java/org/apache/hadoop/hive/conf/HiveConf.java | 1 +
pom.xml | 6 +++
ql/pom.xml | 6 +++
.../hive/ql/udf/generic/GenericUDFRegExp.java | 61 +++++++++++++++++-----
5 files changed, 92 insertions(+), 12 deletions(-)
diff --git a/LICENSE b/LICENSE
index 3e7dc6b..316afc6 100644
--- a/LICENSE
+++ b/LICENSE
@@ -404,4 +404,34 @@ products or services of Licensee, or any third party.
agrees to be bound by the terms and conditions of this License
Agreement.
+For google re2j (https://github.com/google/re2j/blob/master/LICENSE):
+
+Copyright (c) 2009 The Go Authors. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+ * Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above
+copyright notice, this list of conditions and the following disclaimer
+in the documentation and/or other materials provided with the
+distribution.
+ * Neither the name of Google Inc. nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
diff --git a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
index 391cdc1..fcdfef3 100644
--- a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
+++ b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
@@ -3888,6 +3888,7 @@ public class HiveConf extends Configuration {
"Time to wait to finish prewarming spark executors"),
HIVESTAGEIDREARRANGE("hive.stageid.rearrange", "none", new
StringSet("none", "idonly", "traverse", "execution"), ""),
HIVEEXPLAINDEPENDENCYAPPENDTASKTYPES("hive.explain.dependency.append.tasktype",
false, ""),
+ HIVEUSEGOOGLEREGEXENGINE("hive.use.googleregex.engine",false,"whether to
use google regex engine or not, default regex engine is java.util.regex"),
HIVECOUNTERGROUP("hive.counters.group.name", "HIVE",
"The name of counter group for internal Hive variables (CREATED_FILE,
FATAL_ERROR, etc.)"),
diff --git a/pom.xml b/pom.xml
index fa7a123..7649af1 100644
--- a/pom.xml
+++ b/pom.xml
@@ -218,6 +218,7 @@
<jsr305.version>3.0.0</jsr305.version>
<tephra.version>0.6.0</tephra.version>
<gson.version>2.2.4</gson.version>
+ <re2j.version>1.2</re2j.version>
<rs-api.version>2.0.1</rs-api.version>
</properties>
@@ -919,6 +920,11 @@
<artifactId>snappy-java</artifactId>
<version>${snappy.version}</version>
</dependency>
+ <dependency>
+ <groupId>com.google.re2j</groupId>
+ <artifactId>re2j</artifactId>
+ <version>${re2j.version}</version>
+ </dependency>
</dependencies>
</dependencyManagement>
diff --git a/ql/pom.xml b/ql/pom.xml
index 7c4d26f..d2fe8f5 100644
--- a/ql/pom.xml
+++ b/ql/pom.xml
@@ -756,6 +756,11 @@
<version>${guava.version}</version>
<scope>test</scope>
</dependency>
+ <dependency>
+ <groupId>com.google.re2j</groupId>
+ <artifactId>re2j</artifactId>
+ <version>${re2j.version}</version>
+ </dependency>
</dependencies>
<profiles>
@@ -978,6 +983,7 @@
<include>org.apache.orc:orc-shims</include>
<include>org.apache.orc:orc-tools</include>
<include>joda-time:joda-time</include>
+ <include>com.google.re2j:re2j</include>
</includes>
</artifactSet>
<relocations>
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFRegExp.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFRegExp.java
index d309c37..3bf3cfd 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFRegExp.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFRegExp.java
@@ -23,6 +23,9 @@ import static
org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveO
import java.util.regex.Matcher;
import java.util.regex.Pattern;
+import org.apache.hadoop.hive.conf.HiveConf;
+import org.apache.hadoop.hive.ql.exec.MapredContext;
+import org.apache.hadoop.hive.ql.session.SessionState;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.hive.ql.exec.Description;
@@ -36,7 +39,6 @@ import
org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorConverters.C
import
org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory;
import
org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.io.BooleanWritable;
-
/**
* UDF to extract a specific group identified by a java regex. Note that if a
* regexp has a backslash ('\'), then need to specify '\\' For example,
@@ -54,11 +56,28 @@ public class GenericUDFRegExp extends GenericUDF {
private final BooleanWritable output = new BooleanWritable();
private transient boolean isRegexConst;
private transient String regexConst;
- private transient Pattern patternConst;
private transient boolean warned;
+ private transient java.util.regex.Pattern patternConst;
+ private transient com.google.re2j.Pattern patternConstR2j;
+ private boolean useGoogleRegexEngine=false;
+
+  /**
+   * Picks the regex engine from the job configuration when this UDF is
+   * configured with a {@link MapredContext}.
+   *
+   * @param context the context handed to the UDF; ignored when {@code null}
+   */
+  @Override
+  public void configure(MapredContext context) {
+    if (context != null && context.getJobConf() != null) {
+      // Read the flag through HiveConf so that a job configuration without the
+      // key falls back to the declared default instead of dereferencing the
+      // null String returned by Configuration.get(), and so the property name
+      // is not duplicated as a literal.
+      this.useGoogleRegexEngine = HiveConf.getBoolVar(
+          context.getJobConf(), HiveConf.ConfVars.HIVEUSEGOOGLEREGEXENGINE);
+    }
+  }
@Override
public ObjectInspector initialize(ObjectInspector[] arguments) throws
UDFArgumentException {
+ SessionState ss = SessionState.get();
+ if (ss != null) {
+ this.useGoogleRegexEngine =
ss.getConf().getBoolVar(HiveConf.ConfVars.HIVEUSEGOOGLEREGEXENGINE);
+ }
+
checkArgsSize(arguments, 2, 2);
checkArgPrimitive(arguments, 0);
@@ -73,7 +92,12 @@ public class GenericUDFRegExp extends GenericUDF {
if (arguments[1] instanceof ConstantObjectInspector) {
regexConst = getConstantStringValue(arguments, 1);
if (regexConst != null) {
- patternConst = Pattern.compile(regexConst);
+ if(!useGoogleRegexEngine){
+ //if(!HiveConf.getVar(hiveConf,
HiveConf.ConfVars.HIVEUSEGOOGLEREGEXENGINE)){
+ patternConst = Pattern.compile(regexConst);
+ }else{
+ patternConstR2j = com.google.re2j.Pattern.compile(regexConst);
+ }
}
isRegexConst = true;
}
@@ -109,16 +133,29 @@ public class GenericUDFRegExp extends GenericUDF {
return output;
}
- Pattern p;
- if (isRegexConst) {
- p = patternConst;
- } else {
- p = Pattern.compile(regex);
- }
+ if(!useGoogleRegexEngine){
+ Pattern p;
+ if (isRegexConst) {
+ p = patternConst;
+ } else {
+ p = Pattern.compile(regex);
+ }
- Matcher m = p.matcher(s);
- output.set(m.find(0));
- return output;
+ Matcher m = p.matcher(s);
+ output.set(m.find(0));
+ return output;
+ }else{
+ com.google.re2j.Pattern patternR2j;
+ if (isRegexConst) {
+ patternR2j = patternConstR2j;
+ } else {
+ patternR2j = com.google.re2j.Pattern.compile(regex);
+ }
+
+ com.google.re2j.Matcher m = patternR2j.matcher(s);
+ output.set(m.find(0));
+ return output;
+ }
}
@Override