HIVE-10485: Create md5 UDF (Alex Pivovarov via Jason Dere)
Project: http://git-wip-us.apache.org/repos/asf/hive/repo Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/9803344b Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/9803344b Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/9803344b Branch: refs/heads/llap Commit: 9803344bff3d8aecafae3e03261b48592a86bfb1 Parents: ec12a61 Author: Jason Dere <[email protected]> Authored: Mon May 4 23:11:47 2015 -0700 Committer: Jason Dere <[email protected]> Committed: Mon May 4 23:11:47 2015 -0700 ---------------------------------------------------------------------- .../hadoop/hive/ql/exec/FunctionRegistry.java | 6 +- .../org/apache/hadoop/hive/ql/udf/UDFMd5.java | 79 ++++++++++++++++++++ .../apache/hadoop/hive/ql/udf/TestUDFMd5.java | 57 ++++++++++++++ ql/src/test/queries/clientpositive/udf_md5.q | 13 ++++ .../results/clientpositive/show_functions.q.out | 2 + .../test/results/clientpositive/udf_md5.q.out | 61 +++++++++++++++ 6 files changed, 216 insertions(+), 2 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hive/blob/9803344b/ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java index bf2809c..02a604f 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java @@ -24,8 +24,8 @@ import java.util.Arrays; import java.util.Collections; import java.util.EnumMap; import java.util.HashSet; -import java.util.LinkedHashSet; import java.util.Iterator; +import java.util.LinkedHashSet; import java.util.List; import java.util.Set; import java.util.TreeSet; @@ -65,12 +65,13 @@ import org.apache.hadoop.hive.ql.udf.UDFLn; import org.apache.hadoop.hive.ql.udf.UDFLog; import org.apache.hadoop.hive.ql.udf.UDFLog10; import org.apache.hadoop.hive.ql.udf.UDFLog2; +import org.apache.hadoop.hive.ql.udf.UDFMd5; import org.apache.hadoop.hive.ql.udf.UDFMinute; import org.apache.hadoop.hive.ql.udf.UDFMonth; import org.apache.hadoop.hive.ql.udf.UDFOPBitAnd; -import org.apache.hadoop.hive.ql.udf.UDFOPBitShiftLeft; import org.apache.hadoop.hive.ql.udf.UDFOPBitNot; import org.apache.hadoop.hive.ql.udf.UDFOPBitOr; +import org.apache.hadoop.hive.ql.udf.UDFOPBitShiftLeft; import org.apache.hadoop.hive.ql.udf.UDFOPBitShiftRight; import org.apache.hadoop.hive.ql.udf.UDFOPBitShiftRightUnsigned; import org.apache.hadoop.hive.ql.udf.UDFOPBitXor; @@ -224,6 +225,7 @@ public final class FunctionRegistry { system.registerUDF("unhex", UDFUnhex.class, false); system.registerUDF("base64", UDFBase64.class, false); system.registerUDF("unbase64", UDFUnbase64.class, false); + system.registerUDF("md5", UDFMd5.class, false); system.registerGenericUDF("encode", GenericUDFEncode.class); system.registerGenericUDF("decode", GenericUDFDecode.class); http://git-wip-us.apache.org/repos/asf/hive/blob/9803344b/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFMd5.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFMd5.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFMd5.java new file mode 100644 index 0000000..62c16c2 --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFMd5.java @@ -0,0 +1,79 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.udf; + +import org.apache.commons.codec.digest.DigestUtils; +import org.apache.hadoop.hive.ql.exec.Description; +import org.apache.hadoop.hive.ql.exec.UDF; +import org.apache.hadoop.io.BytesWritable; +import org.apache.hadoop.io.Text; + +/** + * UDFMd5. + * + */ +@Description(name = "md5", + value = "_FUNC_(str or bin) - Calculates an MD5 128-bit checksum for the string or binary.", + extended = "The value is returned as a string of 32 hex digits, or NULL if the argument was NULL.\n" + + "Example:\n" + + " > SELECT _FUNC_('ABC');\n" + + " '902fbdd2b1df0c4f70b4a5d23525e932'\n" + + " > SELECT _FUNC_(binary('ABC'));\n" + + " '902fbdd2b1df0c4f70b4a5d23525e932'") +public class UDFMd5 extends UDF { + + private final Text result = new Text(); + + /** + * Convert String to md5 + */ + public Text evaluate(Text n) { + if (n == null) { + return null; + } + + String str = n.toString(); + String md5Hex = DigestUtils.md5Hex(str); + + result.set(md5Hex); + return result; + } + + /** + * Convert bytes to md5 + */ + public Text evaluate(BytesWritable b) { + if (b == null) { + return null; + } + + byte[] bytes = copyBytes(b); + String md5Hex = DigestUtils.md5Hex(bytes); + + result.set(md5Hex); + return result; + } + + protected byte[] copyBytes(BytesWritable b) { + int size = b.getLength(); + byte[] result = new byte[size]; + System.arraycopy(b.getBytes(), 0, result, 0, size); + return result; + } +} http://git-wip-us.apache.org/repos/asf/hive/blob/9803344b/ql/src/test/org/apache/hadoop/hive/ql/udf/TestUDFMd5.java ---------------------------------------------------------------------- diff --git a/ql/src/test/org/apache/hadoop/hive/ql/udf/TestUDFMd5.java b/ql/src/test/org/apache/hadoop/hive/ql/udf/TestUDFMd5.java new file mode 100644 index 0000000..715e987 --- /dev/null +++ b/ql/src/test/org/apache/hadoop/hive/ql/udf/TestUDFMd5.java @@ -0,0 +1,57 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.ql.udf; + +import junit.framework.TestCase; + +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.io.BytesWritable; +import org.apache.hadoop.io.Text; + +public class TestUDFMd5 extends TestCase { + + public void testMD5Str() throws HiveException { + UDFMd5 udf = new UDFMd5(); + + runAndVerifyStr("ABC", "902fbdd2b1df0c4f70b4a5d23525e932", udf); + runAndVerifyStr("", "d41d8cd98f00b204e9800998ecf8427e", udf); + // null + runAndVerifyStr(null, null, udf); + } + + public void testMD5Bin() throws HiveException { + UDFMd5 udf = new UDFMd5(); + + runAndVerifyBin(new byte[] { 65, 66, 67 }, "902fbdd2b1df0c4f70b4a5d23525e932", udf); + runAndVerifyBin(new byte[0], "d41d8cd98f00b204e9800998ecf8427e", udf); + // null + runAndVerifyBin(null, null, udf); + } + + private void runAndVerifyStr(String str, String expResult, UDFMd5 udf) throws HiveException { + Text t = str != null ? new Text(str) : null; + Text output = (Text) udf.evaluate(t); + assertEquals("md5() test ", expResult, output != null ? output.toString() : null); + } + + private void runAndVerifyBin(byte[] binV, String expResult, UDFMd5 udf) throws HiveException { + BytesWritable binWr = binV != null ? new BytesWritable(binV) : null; + Text output = (Text) udf.evaluate(binWr); + assertEquals("md5() test ", expResult, output != null ? output.toString() : null); + } +} http://git-wip-us.apache.org/repos/asf/hive/blob/9803344b/ql/src/test/queries/clientpositive/udf_md5.q ---------------------------------------------------------------------- diff --git a/ql/src/test/queries/clientpositive/udf_md5.q b/ql/src/test/queries/clientpositive/udf_md5.q new file mode 100644 index 0000000..c22417a --- /dev/null +++ b/ql/src/test/queries/clientpositive/udf_md5.q @@ -0,0 +1,13 @@ +DESCRIBE FUNCTION md5; +DESC FUNCTION EXTENDED md5; + +explain select md5('ABC'); + +select +md5('ABC'), +md5(''), +md5(binary('ABC')), +md5(binary('')), +md5(cast(null as string)), +md5(cast(null as binary)), +md5(null); http://git-wip-us.apache.org/repos/asf/hive/blob/9803344b/ql/src/test/results/clientpositive/show_functions.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/show_functions.q.out b/ql/src/test/results/clientpositive/show_functions.q.out index ffc32c8..a422760 100644 --- a/ql/src/test/results/clientpositive/show_functions.q.out +++ b/ql/src/test/results/clientpositive/show_functions.q.out @@ -123,6 +123,7 @@ map_keys map_values matchpath max +md5 min minute month @@ -327,6 +328,7 @@ map_keys map_values matchpath max +md5 min minute month http://git-wip-us.apache.org/repos/asf/hive/blob/9803344b/ql/src/test/results/clientpositive/udf_md5.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/udf_md5.q.out b/ql/src/test/results/clientpositive/udf_md5.q.out new file mode 100644 index 0000000..01744fe --- /dev/null +++ b/ql/src/test/results/clientpositive/udf_md5.q.out @@ -0,0 +1,61 @@ +PREHOOK: query: DESCRIBE FUNCTION md5 +PREHOOK: type: DESCFUNCTION +POSTHOOK: query: DESCRIBE FUNCTION md5 +POSTHOOK: type: DESCFUNCTION +md5(str or bin) - Calculates an MD5 128-bit checksum for the string or binary. +PREHOOK: query: DESC FUNCTION EXTENDED md5 +PREHOOK: type: DESCFUNCTION +POSTHOOK: query: DESC FUNCTION EXTENDED md5 +POSTHOOK: type: DESCFUNCTION +md5(str or bin) - Calculates an MD5 128-bit checksum for the string or binary. +The value is returned as a string of 32 hex digits, or NULL if the argument was NULL. +Example: + > SELECT md5('ABC'); + '902fbdd2b1df0c4f70b4a5d23525e932' + > SELECT md5(binary('ABC')); + '902fbdd2b1df0c4f70b4a5d23525e932' +PREHOOK: query: explain select md5('ABC') +PREHOOK: type: QUERY +POSTHOOK: query: explain select md5('ABC') +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + TableScan + alias: _dummy_table + Row Limit Per Split: 1 + Statistics: Num rows: 0 Data size: 1 Basic stats: PARTIAL Column stats: COMPLETE + Select Operator + expressions: '902fbdd2b1df0c4f70b4a5d23525e932' (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 0 Data size: 1 Basic stats: PARTIAL Column stats: COMPLETE + ListSink + +PREHOOK: query: select +md5('ABC'), +md5(''), +md5(binary('ABC')), +md5(binary('')), +md5(cast(null as string)), +md5(cast(null as binary)), +md5(null) +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +#### A masked pattern was here #### +POSTHOOK: query: select +md5('ABC'), +md5(''), +md5(binary('ABC')), +md5(binary('')), +md5(cast(null as string)), +md5(cast(null as binary)), +md5(null) +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +#### A masked pattern was here #### +902fbdd2b1df0c4f70b4a5d23525e932 d41d8cd98f00b204e9800998ecf8427e 902fbdd2b1df0c4f70b4a5d23525e932 d41d8cd98f00b204e9800998ecf8427e NULL NULL NULL
