Repository: incubator-hivemall Updated Branches: refs/heads/master 85f8e173a -> 389a9e331
Close #43: Added rownum() UDF Project: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/commit/389a9e33 Tree: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/tree/389a9e33 Diff: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/diff/389a9e33 Branch: refs/heads/master Commit: 389a9e331a1fdb2884d7b184b8ed63e510d9acc0 Parents: 85f8e17 Author: myui <yuin...@gmail.com> Authored: Fri Feb 10 20:49:06 2017 +0900 Committer: myui <yuin...@gmail.com> Committed: Fri Feb 10 20:49:06 2017 +0900 ---------------------------------------------------------------------- .github/PULL_REQUEST_TEMPLATE | 6 +- .../hivemall/tools/mapred/RowNumberUDF.java | 69 ++++++++++++++++++++ resources/ddl/define-all-as-permanent.hive | 3 + resources/ddl/define-all.hive | 3 + 4 files changed, 80 insertions(+), 1 deletion(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/389a9e33/.github/PULL_REQUEST_TEMPLATE ---------------------------------------------------------------------- diff --git a/.github/PULL_REQUEST_TEMPLATE b/.github/PULL_REQUEST_TEMPLATE index 04f3004..361d613 100644 --- a/.github/PULL_REQUEST_TEMPLATE +++ b/.github/PULL_REQUEST_TEMPLATE @@ -6,10 +6,14 @@ [Bug Fix | Improvement | Feature | Documentation | Hot Fix | Refactoring] -### What is the Jira issue? +## What is the Jira issue? (Put link here and add [HIVEMALL-*Jira number*] in PR title, e.g., [HIVEMALL-533]) ## How was this patch tested? (Please explain how this patch was tested. e.g., unit tests, integration tests, manual tests) + +## How to use this feature? + +(Please remove this section if not needed) http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/389a9e33/core/src/main/java/hivemall/tools/mapred/RowNumberUDF.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/hivemall/tools/mapred/RowNumberUDF.java b/core/src/main/java/hivemall/tools/mapred/RowNumberUDF.java new file mode 100644 index 0000000..59f64ba --- /dev/null +++ b/core/src/main/java/hivemall/tools/mapred/RowNumberUDF.java @@ -0,0 +1,69 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package hivemall.tools.mapred; + +import hivemall.utils.hadoop.HadoopUtils; + +import javax.annotation.Nonnull; + +import org.apache.hadoop.hive.ql.exec.Description; +import org.apache.hadoop.hive.ql.exec.UDF; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.udf.UDFType; +import org.apache.hadoop.io.LongWritable; + +@Description(name = "rownum", value = "_FUNC_() - Returns a generated row number in long", + extended = "returns sprintf(`%d%04d`,sequence,taskId) as long") +@UDFType(deterministic = false, stateful = true) +public final class RowNumberUDF extends UDF { + + private long sequence; + private int taskId; + @Nonnull + private final LongWritable result; + + public RowNumberUDF() { + this.sequence = 0L; + this.taskId = -1; + this.result = new LongWritable(Double.doubleToLongBits(Double.NaN)); + } + + @Nonnull + public LongWritable evaluate() throws HiveException { + if (taskId == -1) { + this.taskId = HadoopUtils.getTaskId() + 1; + if (taskId > 9999) { + throw new HiveException("TaskId out of range `" + taskId + + "`. rownum() supports 9999 tasks at max"); + } + } + sequence++; + + String rowid = String.format("%d%04d", sequence, taskId); + final long l; + try { + l = Long.parseLong(rowid); + } catch (NumberFormatException e) { + throw new HiveException("failed to parse `" + rowid + "` as long", e); + } + + result.set(l); + return result; + } +} http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/389a9e33/resources/ddl/define-all-as-permanent.hive ---------------------------------------------------------------------- diff --git a/resources/ddl/define-all-as-permanent.hive b/resources/ddl/define-all-as-permanent.hive index 33a15a1..f9f9fd8 100644 --- a/resources/ddl/define-all-as-permanent.hive +++ b/resources/ddl/define-all-as-permanent.hive @@ -477,6 +477,9 @@ CREATE FUNCTION jobid as 'hivemall.tools.mapred.JobIdUDF' USING JAR '${hivemall_ DROP FUNCTION IF EXISTS rowid; CREATE FUNCTION rowid as 'hivemall.tools.mapred.RowIdUDF' USING JAR '${hivemall_jar}'; +DROP FUNCTION IF EXISTS rownum; +CREATE FUNCTION rownum as 'hivemall.tools.mapred.RowNumberUDF' USING JAR '${hivemall_jar}'; + DROP FUNCTION IF EXISTS distcache_gets; CREATE FUNCTION distcache_gets as 'hivemall.tools.mapred.DistributedCacheLookupUDF' USING JAR '${hivemall_jar}'; http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/389a9e33/resources/ddl/define-all.hive ---------------------------------------------------------------------- diff --git a/resources/ddl/define-all.hive b/resources/ddl/define-all.hive index 9cc348f..6e0f911 100644 --- a/resources/ddl/define-all.hive +++ b/resources/ddl/define-all.hive @@ -473,6 +473,9 @@ create temporary function jobid as 'hivemall.tools.mapred.JobIdUDF'; drop temporary function if exists rowid; create temporary function rowid as 'hivemall.tools.mapred.RowIdUDF'; +drop temporary function if exists rownum; +create temporary function rownum as 'hivemall.tools.mapred.RowNumberUDF'; + drop temporary function if exists distcache_gets; create temporary function distcache_gets as 'hivemall.tools.mapred.DistributedCacheLookupUDF';