[
https://issues.apache.org/jira/browse/HIVE-2327?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=14548626#comment-14548626
]
Alexander Pivovarov commented on HIVE-2327:
-------------------------------------------
I checked old and new regexp UDF performance. New implementation is 1% faster.
table files generator
{code}
import java.io.File;
import java.io.FileNotFoundException;
import java.io.PrintWriter;
import java.math.BigInteger;
import java.security.SecureRandom;
/**
 * Generates the text files backing the {@code regexp_test} benchmark table.
 *
 * <p>Each output line is one random base-32 string. The files are written to
 * {@link #OUTPUT_DIR}, which is the directory later uploaded with
 * {@code hadoop fs -put} and used as the table location.
 */
public class RandomStringGenerator {
  /** Local directory the data files are written to; must match the path that is uploaded. */
  static final String OUTPUT_DIR = "/tmp/regexp_test";

  /** Number of data files to generate. */
  static final int FILE_COUNT = 10;

  /** Number of rows written to each data file. */
  static final int ROWS_PER_FILE = 1000000;

  private final SecureRandom random = new SecureRandom();

  /**
   * Returns a random 130-bit non-negative integer rendered in radix 32.
   *
   * @return a string of at most 26 characters drawn from {@code 0-9} and {@code a-v}
   */
  public String nextSessionId() {
    return new BigInteger(130, random).toString(32);
  }

  /**
   * Writes {@link #FILE_COUNT} files of {@link #ROWS_PER_FILE} random strings each
   * (10 million rows in total) into {@link #OUTPUT_DIR}.
   *
   * @param args ignored
   * @throws FileNotFoundException if an output file cannot be created or opened
   */
  public static void main(String[] args) throws FileNotFoundException {
    RandomStringGenerator g = new RandomStringGenerator();

    // PrintWriter does not create parent directories. The return value is ignored on
    // purpose: if the directory is still missing, opening the first file fails below.
    new File(OUTPUT_DIR).mkdirs();

    // Generate 10 files with 1 million rows each.
    for (int j = 0; j < FILE_COUNT; j++) {
      System.out.println("start file " + j);
      PrintWriter pw = new PrintWriter(OUTPUT_DIR + "/00000" + j + "_0");
      try {
        for (int i = 0; i < ROWS_PER_FILE; i++) {
          pw.println(g.nextSessionId());
        }
      } finally {
        pw.close();
      }
    }
    System.out.println("All Done");
  }
}
{code}
create table
{code}
hadoop fs -put -f /tmp/regexp_test /tmp
create table regexp_test (
a string
)
stored as textfile
location '/tmp/regexp_test';
{code}
test queries
{code}
--1
time bin/hive -e "select * from regexp_test where regexp(a, '.*abcd.*')"
--2
time bin/hive -e "select * from regexp_test where regexp(a, '.*efgh.*')"
--3
time bin/hive -e "select * from regexp_test where regexp(a, '.*ijkl.*')"
--4
time bin/hive -e "select a from regexp_test where regexp(a, '.*mnop.*')"
{code}
old regexp implementation
{code}
--1 233 rows
real 1m6.881s
user 1m10.582s
sys 0m1.652s
--2 247 rows
real 1m6.520s
user 1m10.082s
sys 0m1.534s
--3 224 rows
real 1m8.037s
user 1m11.718s
sys 0m1.608s
--4 232 rows
real 1m6.698s
user 1m10.378s
sys 0m1.499s
--AVG 67.034
{code}
new regexp implementation
{code}
--1 233 rows
real 1m6.762s
user 1m10.517s
sys 0m1.471s
--2 247 rows
real 1m6.362s
user 1m9.961s
sys 0m1.558s
--3 224 rows
real 1m5.854s
user 1m9.534s
sys 0m1.452s
--4 232 rows
real 1m6.435s
user 1m10.816s
sys 0m1.571s
--AVG 66.35325
{code}
delta = AVG1 - AVG2 = 67.034 - 66.35325 = 0.68075 sec (AVG1 = old implementation, AVG2 = new implementation)
new implementation is 1% faster (delta / max(AVG1, AVG2))
> Optimize REGEX UDFs with constant parameter information
> -------------------------------------------------------
>
> Key: HIVE-2327
> URL: https://issues.apache.org/jira/browse/HIVE-2327
> Project: Hive
> Issue Type: Improvement
> Components: UDF
> Reporter: Adam Kramer
> Assignee: Alexander Pivovarov
> Attachments: HIVE-2327.01.patch, HIVE-2327.2.patch
>
>
> There are a lot of UDFs which would show major performance differences if one
> assumes that some of their arguments are constant.
> Consider, for example, any UDF that takes a regular expression as input: This
> can be compiled once (fast) if it's a constant, or once per row (wicked slow)
> if it's not a constant.
> Or, consider any UDF that reads from a file and/or takes a filename as input;
> it would have to re-read the whole file if the filename changes.
--
This message was sent by Atlassian JIRA
(v6.3.4#6332)