[HIVEMALL-145] Merge Brickhouse functions ## What changes were proposed in this pull request?
Merge [brickhouse](https://github.com/klout/brickhouse) functions. ## What type of PR is it? Feature ## What is the Jira issue? https://issues.apache.org/jira/browse/HIVEMALL-145 ## How was this patch tested? unit tests and manual tests ## How to use this feature? as described in [user guide](http://hivemall.incubator.apache.org/userguide/misc/generic_funcs.html). ## Checklist - [x] Did you apply source code formatter, i.e., `mvn formatter:format`, for your commit? - [x] Did you run system tests on Hive (or Spark)? - [x] Invite active/main Brickhouse developers as Hivemall PPMC members or committers. https://github.com/klout/brickhouse/issues/149 - [x] +1 from Klout members to merge Author: Makoto Yui <[email protected]> Closes #135 from myui/merge_brickhouse. Project: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/commit/49496032 Tree: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/tree/49496032 Diff: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/diff/49496032 Branch: refs/heads/master Commit: 49496032498182baa60e5f93a1f3717359909433 Parents: 7ea6bfd Author: Makoto Yui <[email protected]> Authored: Wed Jun 6 18:09:17 2018 +0900 Committer: Makoto Yui <[email protected]> Committed: Wed Jun 6 18:09:17 2018 +0900 ---------------------------------------------------------------------- NOTICE | 2 +- bin/update_func_md.sh | 47 ++ .../main/java/hivemall/HivemallVersionUDF.java | 2 +- .../java/hivemall/sketch/bloom/BloomAndUDF.java | 3 +- .../sketch/bloom/BloomContainsAnyUDF.java | 111 ++++ .../hivemall/sketch/bloom/BloomContainsUDF.java | 85 ++- .../hivemall/sketch/bloom/BloomFilterUDAF.java | 114 ++++ .../java/hivemall/sketch/bloom/BloomNotUDF.java | 3 +- .../java/hivemall/sketch/bloom/BloomOrUDF.java | 3 +- .../java/hivemall/sketch/bloom/BloomUDAF.java | 101 ---- .../smile/classification/DecisionTree.java | 6 +- .../smile/regression/RegressionTree.java | 10 +- 
.../hivemall/smile/tools/TreePredictUDF.java | 2 +- .../hivemall/statistics/MovingAverageUDTF.java | 84 --- .../java/hivemall/tools/GenerateSeriesUDTF.java | 190 +++++-- .../main/java/hivemall/tools/TryCastUDF.java | 4 +- .../hivemall/tools/array/ArrayAppendUDF.java | 11 +- .../hivemall/tools/array/ArrayConcatUDF.java | 6 +- .../hivemall/tools/array/ArrayElementAtUDF.java | 4 +- .../hivemall/tools/array/ArrayFlattenUDF.java | 39 +- .../hivemall/tools/array/ArrayIntersectUDF.java | 7 +- .../hivemall/tools/array/ArrayRemoveUDF.java | 4 +- .../hivemall/tools/array/ArraySliceUDF.java | 41 +- .../hivemall/tools/array/ArrayToStrUDF.java | 96 ++++ .../hivemall/tools/array/ArrayUnionUDF.java | 8 +- .../tools/array/ConditionalEmitUDTF.java | 19 +- .../hivemall/tools/array/FirstElementUDF.java | 4 +- .../hivemall/tools/array/LastElementUDF.java | 3 +- .../tools/array/SortAndUniqArrayUDF.java | 2 +- .../tools/array/SubarrayEndWithUDF.java | 2 +- .../tools/array/SubarrayStartWithUDF.java | 2 +- .../java/hivemall/tools/bits/BitsORUDF.java | 4 +- .../java/hivemall/tools/bits/ToBitsUDF.java | 2 +- .../java/hivemall/tools/bits/UnBitsUDF.java | 2 +- .../hivemall/tools/compress/DeflateUDF.java | 2 +- .../hivemall/tools/compress/InflateUDF.java | 4 +- .../hivemall/tools/datetime/SessionizeUDF.java | 99 ++++ .../java/hivemall/tools/json/FromJsonUDF.java | 40 +- .../java/hivemall/tools/json/ToJsonUDF.java | 76 ++- .../hivemall/tools/list/UDAFToOrderedList.java | 16 +- .../hivemall/tools/map/MapExcludeKeysUDF.java | 100 ++++ .../hivemall/tools/map/MapIncludeKeysUDF.java | 103 ++++ .../java/hivemall/tools/map/MapIndexUDF.java | 104 ++++ .../hivemall/tools/map/MapKeyValuesUDF.java | 97 ++++ .../java/hivemall/tools/map/MergeMapsUDAF.java | 172 ++++++ .../hivemall/tools/mapred/RowNumberUDF.java | 4 +- .../java/hivemall/tools/sanity/AssertUDF.java | 6 +- .../hivemall/tools/sanity/RaiseErrorUDF.java | 43 +- .../java/hivemall/tools/text/Base91UDF.java | 2 +- 
.../tools/text/NormalizeUnicodeUDF.java | 4 +- .../hivemall/tools/text/SingularizeUDF.java | 2 +- .../java/hivemall/tools/text/SplitWordsUDF.java | 2 +- .../java/hivemall/tools/text/Unbase91UDF.java | 4 +- .../java/hivemall/tools/text/WordNgramsUDF.java | 4 +- .../tools/timeseries/MovingAverageUDTF.java | 98 ++++ .../hivemall/tools/vector/VectorAddUDF.java | 3 +- .../hivemall/tools/vector/VectorDotUDF.java | 21 +- .../java/hivemall/utils/hadoop/HiveUtils.java | 2 +- .../java/hivemall/utils/math/MatrixUtils.java | 7 +- .../FieldAwareFactorizationMachineUDTFTest.java | 2 + .../ftvec/hashing/FeatureHashingUDFTest.java | 6 +- .../smile/classification/DecisionTreeTest.java | 10 +- .../smile/regression/RegressionTreeTest.java | 5 +- .../statistics/MovingAverageUDTFTest.java | 78 --- .../hivemall/tools/GenerateSeriesUDTFTest.java | 246 +++++++++ .../tools/array/ArrayAppendUDFTest.java | 9 +- .../tools/array/ArrayFlattenUDFTest.java | 4 +- .../hivemall/tools/array/ArraySliceUDFTest.java | 4 +- .../hivemall/tools/array/ArrayToStrUDFTest.java | 103 ++++ .../hivemall/tools/array/ArrayUnionUDFTest.java | 1 + .../tools/array/ConditionalEmitUDTFTest.java | 3 +- .../tools/datetime/SessionizeUDFTest.java | 76 +++ .../hivemall/tools/json/FromJsonUDFTest.java | 1 + .../java/hivemall/tools/json/ToJsonUDFTest.java | 6 +- .../hivemall/tools/map/MapKeyValuesUDFTest.java | 81 +++ .../tools/sanity/RaiseErrorUDFTest.java | 10 +- .../tools/timeseries/MovingAverageUDTFTest.java | 80 +++ .../hivemall/tools/vector/VectorAddUDFTest.java | 1 + .../hivemall/tools/vector/VectorDotUDFTest.java | 7 +- docs/gitbook/misc/funcs.md | 80 ++- docs/gitbook/misc/generic_funcs.md | 531 +++++++++++++++---- pom.xml | 12 + resources/ddl/define-all-as-permanent.hive | 99 +++- resources/ddl/define-all.hive | 99 +++- resources/ddl/define-all.spark | 95 +++- resources/ddl/define-udfs.td.hql | 30 +- spark/spark-2.2/pom.xml | 2 +- tools/hivemall-docs/pom.xml | 113 +++- .../java/hivemall/docs/FuncsListGenerator.java | 
249 --------- .../hivemall/docs/FuncsListGeneratorMojo.java | 264 +++++++++ ...rojectDependenciesComponentConfigurator.java | 100 ++++ 91 files changed, 3621 insertions(+), 824 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/49496032/NOTICE ---------------------------------------------------------------------- diff --git a/NOTICE b/NOTICE index 385a198..b4e5ace 100644 --- a/NOTICE +++ b/NOTICE @@ -10,4 +10,4 @@ the following organizations and individuals: - Copyright 2013-2015 National Institute of Advanced Industrial Science and Technology (AIST) - Copyright 2015-2016 Makoto Yui - Copyright 2015-2016 Treasure Data, Inc. - - Copyright 2012 Klout, Inc. \ No newline at end of file + - Copyright 2012 Klout, Inc. http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/49496032/bin/update_func_md.sh ---------------------------------------------------------------------- diff --git a/bin/update_func_md.sh b/bin/update_func_md.sh new file mode 100755 index 0000000..bb0afda --- /dev/null +++ b/bin/update_func_md.sh @@ -0,0 +1,47 @@ +#!/bin/sh +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# + +if [ "$HIVEMALL_HOME" = "" ]; then + if [ -e ../bin/${0##*/} ]; then + HIVEMALL_HOME=".." + elif [ -e ./bin/${0##*/} ]; then + HIVEMALL_HOME="." + else + echo "env HIVEMALL_HOME not defined" + exit 1 + fi +fi + +cd $HIVEMALL_HOME +HIVEMALL_HOME=`pwd` + +# Deploy to local Maven repos + +export MAVEN_OPTS=-XX:MaxPermSize=256m +mvn clean install -DskipTests=true -Dmaven.test.skip=true -pl '.,core,nlp,xgboost,tools/hivemall-docs' + +# Generate docs + +mvn org.apache.hivemall:hivemall-docs:generate-funcs-list -pl '.,core,nlp,xgboost,tools/hivemall-docs' -X + +# Run HTTP server on localhost:4000 + +cd ${HIVEMALL_HOME}/docs/gitbook +gitbook install && gitbook serve http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/49496032/core/src/main/java/hivemall/HivemallVersionUDF.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/hivemall/HivemallVersionUDF.java b/core/src/main/java/hivemall/HivemallVersionUDF.java index c42701f..92636d5 100644 --- a/core/src/main/java/hivemall/HivemallVersionUDF.java +++ b/core/src/main/java/hivemall/HivemallVersionUDF.java @@ -24,7 +24,7 @@ import org.apache.hadoop.hive.ql.udf.UDFType; import org.apache.hadoop.io.Text; @Description(name = "hivemall_version", value = "_FUNC_() - Returns the version of Hivemall", - extended = "Usage: SELECT hivemall_version();") + extended = "SELECT hivemall_version();") @UDFType(deterministic = true, stateful = false) public final class HivemallVersionUDF extends UDF { http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/49496032/core/src/main/java/hivemall/sketch/bloom/BloomAndUDF.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/hivemall/sketch/bloom/BloomAndUDF.java b/core/src/main/java/hivemall/sketch/bloom/BloomAndUDF.java index 87769da..f723acd 100644 --- a/core/src/main/java/hivemall/sketch/bloom/BloomAndUDF.java +++ 
b/core/src/main/java/hivemall/sketch/bloom/BloomAndUDF.java @@ -31,7 +31,8 @@ import org.apache.hadoop.util.bloom.DynamicBloomFilter; import org.apache.hadoop.util.bloom.Filter; @Description(name = "bloom_and", - value = "_FUNC_(string bloom1, string bloom2) - Returns the logical AND of two bloom filters") + value = "_FUNC_(string bloom1, string bloom2) - Returns the logical AND of two bloom filters", + extended = "SELECT bloom_and(bf1, bf2) FROM xxx;") @UDFType(deterministic = true, stateful = false) public final class BloomAndUDF extends UDF { http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/49496032/core/src/main/java/hivemall/sketch/bloom/BloomContainsAnyUDF.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/hivemall/sketch/bloom/BloomContainsAnyUDF.java b/core/src/main/java/hivemall/sketch/bloom/BloomContainsAnyUDF.java new file mode 100644 index 0000000..d8ff28b --- /dev/null +++ b/core/src/main/java/hivemall/sketch/bloom/BloomContainsAnyUDF.java @@ -0,0 +1,111 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package hivemall.sketch.bloom; + +import java.io.IOException; +import java.util.List; + +import javax.annotation.Nonnull; +import javax.annotation.Nullable; + +import org.apache.hadoop.hive.ql.exec.Description; +import org.apache.hadoop.hive.ql.exec.UDF; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.udf.UDFType; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.util.bloom.DynamicBloomFilter; +import org.apache.hadoop.util.bloom.Filter; +import org.apache.hadoop.util.bloom.Key; + +//@formatter:off +@Description(name = "bloom_contains_any", + value = "_FUNC_(string bloom, string key) or _FUNC_(string bloom, array<string> keys)" + + "- Returns true if the bloom filter contains any of the given key", + extended = "WITH data1 as (\n" + + " SELECT explode(array(1,2,3,4,5)) as id\n" + + "),\n" + + "data2 as (\n" + + " SELECT explode(array(1,3,5,6,8)) as id\n" + + "),\n" + + "bloom as (\n" + + " SELECT bloom(id) as bf\n" + + " FROM data1\n" + + ")\n" + + "SELECT \n" + + " l.* \n" + + "FROM \n" + + " data2 l\n" + + " CROSS JOIN bloom r\n" + + "WHERE\n" + + " bloom_contains_any(r.bf, array(l.id))") +//@formatter:on +@UDFType(deterministic = true, stateful = false) +public final class BloomContainsAnyUDF extends UDF { + + @Nonnull + private final Key key = new Key(); + + @Nullable + private Text prevBfStr; + @Nullable + private Filter prevBf; + + @Nullable + public Boolean evaluate(@Nullable Text bloomStr, @Nullable List<Text> keys) + throws HiveException { + if (bloomStr == null) { + return null; + } + if (keys == null) { + return Boolean.FALSE; + } + + final Filter bloom = getFilter(bloomStr); + + for (Text keyStr : keys) { + if (keyStr == null) { + continue; + } + key.set(keyStr.copyBytes(), 1.0d); + if (bloom.membershipTest(key)) { + return Boolean.TRUE; + } + } + + return Boolean.FALSE; + } + + @Nonnull + private Filter getFilter(@Nonnull final Text bloomStr) throws HiveException { + final Filter bloom; + if 
(prevBf != null && prevBfStr.equals(bloomStr)) { + bloom = prevBf; + } else { + try { + bloom = BloomFilterUtils.deserialize(bloomStr, new DynamicBloomFilter()); + } catch (IOException e) { + throw new HiveException(e); + } + this.prevBfStr = new Text(bloomStr); + this.prevBf = bloom; + } + return bloom; + } + +} http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/49496032/core/src/main/java/hivemall/sketch/bloom/BloomContainsUDF.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/hivemall/sketch/bloom/BloomContainsUDF.java b/core/src/main/java/hivemall/sketch/bloom/BloomContainsUDF.java index 2da65b3..f76b85d 100644 --- a/core/src/main/java/hivemall/sketch/bloom/BloomContainsUDF.java +++ b/core/src/main/java/hivemall/sketch/bloom/BloomContainsUDF.java @@ -19,6 +19,7 @@ package hivemall.sketch.bloom; import java.io.IOException; +import java.util.List; import javax.annotation.Nonnull; import javax.annotation.Nullable; @@ -32,8 +33,37 @@ import org.apache.hadoop.util.bloom.DynamicBloomFilter; import org.apache.hadoop.util.bloom.Filter; import org.apache.hadoop.util.bloom.Key; +//@formatter:off @Description(name = "bloom_contains", - value = "_FUNC_(string bloom, string key) - Returns true if the bloom filter contains the given key") + value = "_FUNC_(string bloom, string key) or _FUNC_(string bloom, array<string> keys)" + + " - Returns true if the bloom filter contains all the given key(s). 
Returns false if key is null.", + extended = "WITH satisfied_movies as (\n" + + " SELECT bloom(movieid) as movies\n" + + " FROM (\n" + + " SELECT movieid\n" + + " FROM ratings\n" + + " GROUP BY movieid\n" + + " HAVING avg(rating) >= 4.0\n" + + " ) t\n" + + ")\n" + + "SELECT\n" + + " l.rating,\n" + + " count(distinct l.userid) as cnt\n" + + "FROM\n" + + " ratings l \n" + + " CROSS JOIN satisfied_movies r\n" + + "WHERE\n" + + " bloom_contains(r.movies, l.movieid) -- includes false positive\n" + + "GROUP BY \n" + + " l.rating;\n" + + "\n" + + "l.rating cnt\n" + + "1 1296\n" + + "2 2770\n" + + "3 5008\n" + + "4 5824\n" + + "5 5925") +//@formatter:on @UDFType(deterministic = true, stateful = false) public final class BloomContainsUDF extends UDF { @@ -41,31 +71,64 @@ public final class BloomContainsUDF extends UDF { private final Key key = new Key(); @Nullable - private Text prevKey; + private Text prevBfStr; @Nullable - private Filter prevFilter; + private Filter prevBf; @Nullable public Boolean evaluate(@Nullable Text bloomStr, @Nullable Text keyStr) throws HiveException { - if (bloomStr == null || key == null) { + if (bloomStr == null) { return null; } + if (keyStr == null) { + return Boolean.FALSE; + } + + Filter bloom = getFilter(bloomStr); + key.set(keyStr.copyBytes(), 1.0d); + return Boolean.valueOf(bloom.membershipTest(key)); + } + + @Nullable + public Boolean evaluate(@Nullable Text bloomStr, @Nullable List<Text> keys) + throws HiveException { + if (bloomStr == null) { + return null; + } + if (keys == null) { + return Boolean.FALSE; + } + + final Filter bloom = getFilter(bloomStr); + for (Text keyStr : keys) { + if (keyStr == null) { + continue; + } + key.set(keyStr.copyBytes(), 1.0d); + if (bloom.membershipTest(key) == false) { + return Boolean.FALSE; + } + } + + return Boolean.TRUE; + } + + @Nonnull + private Filter getFilter(@Nonnull final Text bloomStr) throws HiveException { final Filter bloom; - if (prevFilter != null && prevKey.equals(keyStr)) { - bloom 
= prevFilter; + if (prevBf != null && prevBfStr.equals(bloomStr)) { + bloom = prevBf; } else { try { bloom = BloomFilterUtils.deserialize(bloomStr, new DynamicBloomFilter()); } catch (IOException e) { throw new HiveException(e); } - this.prevKey = keyStr; - this.prevFilter = bloom; - key.set(keyStr.getBytes(), 1.0d); + this.prevBfStr = new Text(bloomStr); + this.prevBf = bloom; } - - return Boolean.valueOf(bloom.membershipTest(key)); + return bloom; } } http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/49496032/core/src/main/java/hivemall/sketch/bloom/BloomFilterUDAF.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/hivemall/sketch/bloom/BloomFilterUDAF.java b/core/src/main/java/hivemall/sketch/bloom/BloomFilterUDAF.java new file mode 100644 index 0000000..a76a60c --- /dev/null +++ b/core/src/main/java/hivemall/sketch/bloom/BloomFilterUDAF.java @@ -0,0 +1,114 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package hivemall.sketch.bloom; + +import java.io.IOException; + +import javax.annotation.Nonnull; +import javax.annotation.Nullable; + +import org.apache.hadoop.hive.ql.exec.Description; +import org.apache.hadoop.hive.ql.exec.UDAF; +import org.apache.hadoop.hive.ql.exec.UDAFEvaluator; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.util.bloom.DynamicBloomFilter; +import org.apache.hadoop.util.bloom.Filter; +import org.apache.hadoop.util.bloom.Key; + +//@formatter:off +@Description(name = "bloom", + value = "_FUNC_(string key) - Constructs a BloomFilter by aggregating a set of keys", + extended = "CREATE TABLE satisfied_movies AS \n" + + " SELECT bloom(movieid) as movies\n" + + " FROM (\n" + + " SELECT movieid\n" + + " FROM ratings\n" + + " GROUP BY movieid\n" + + " HAVING avg(rating) >= 4.0\n" + + " ) t;") +//@formatter:on +@SuppressWarnings("deprecation") +public final class BloomFilterUDAF extends UDAF { + + public static class Evaluator implements UDAFEvaluator { + + private Filter filter; + private Key key; + + @Override + public void init() { + this.filter = BloomFilterUtils.newDynamicBloomFilter(); + this.key = new Key(); + } + + public boolean iterate(@Nullable Text keyStr) { + if (keyStr == null) { + return true; + } + if (filter == null) { + init(); + } + + key.set(keyStr.copyBytes(), 1.0d); + filter.add(key); + + return true; + } + + @Nonnull + public Text terminatePartial() throws HiveException { + try { + return BloomFilterUtils.serialize(filter, new Text()); + } catch (IOException e) { + throw new HiveException(e); + } + } + + public boolean merge(@Nonnull Text partial) throws HiveException { + final DynamicBloomFilter other; + try { + other = BloomFilterUtils.deserialize(partial, new DynamicBloomFilter()); + } catch (IOException e) { + throw new HiveException(e); + } + + if (filter == null) { + this.filter = other; + } else { + filter.or(other); + } + return true; + } + + 
@Nullable + public Text terminate() throws HiveException { + if (filter == null) { + return null; + } + + try { + return BloomFilterUtils.serialize(filter, new Text()); + } catch (IOException e) { + throw new HiveException(e); + } + } + + } +} http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/49496032/core/src/main/java/hivemall/sketch/bloom/BloomNotUDF.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/hivemall/sketch/bloom/BloomNotUDF.java b/core/src/main/java/hivemall/sketch/bloom/BloomNotUDF.java index cd385e3..1a074b0 100644 --- a/core/src/main/java/hivemall/sketch/bloom/BloomNotUDF.java +++ b/core/src/main/java/hivemall/sketch/bloom/BloomNotUDF.java @@ -31,7 +31,8 @@ import org.apache.hadoop.util.bloom.DynamicBloomFilter; import org.apache.hadoop.util.bloom.Filter; @Description(name = "bloom_not", - value = "_FUNC_(string bloom) - Returns the logical NOT of a bloom filters") + value = "_FUNC_(string bloom) - Returns the logical NOT of a bloom filters", + extended = "SELECT bloom_not(bf) FROM xxx;") @UDFType(deterministic = true, stateful = false) public final class BloomNotUDF extends UDF { http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/49496032/core/src/main/java/hivemall/sketch/bloom/BloomOrUDF.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/hivemall/sketch/bloom/BloomOrUDF.java b/core/src/main/java/hivemall/sketch/bloom/BloomOrUDF.java index 7d2980e..0fcb539 100644 --- a/core/src/main/java/hivemall/sketch/bloom/BloomOrUDF.java +++ b/core/src/main/java/hivemall/sketch/bloom/BloomOrUDF.java @@ -31,7 +31,8 @@ import org.apache.hadoop.util.bloom.DynamicBloomFilter; import org.apache.hadoop.util.bloom.Filter; @Description(name = "bloom_or", - value = "_FUNC_(string bloom1, string bloom2) - Returns the logical OR of two bloom filters") + value = "_FUNC_(string bloom1, string bloom2) - Returns the logical OR of two 
bloom filters", + extended = "SELECT bloom_or(bf1, bf2) FROM xxx;") @UDFType(deterministic = true, stateful = false) public final class BloomOrUDF extends UDF { http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/49496032/core/src/main/java/hivemall/sketch/bloom/BloomUDAF.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/hivemall/sketch/bloom/BloomUDAF.java b/core/src/main/java/hivemall/sketch/bloom/BloomUDAF.java deleted file mode 100644 index cb09b93..0000000 --- a/core/src/main/java/hivemall/sketch/bloom/BloomUDAF.java +++ /dev/null @@ -1,101 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package hivemall.sketch.bloom; - -import java.io.IOException; - -import javax.annotation.Nonnull; -import javax.annotation.Nullable; - -import org.apache.hadoop.hive.ql.exec.Description; -import org.apache.hadoop.hive.ql.exec.UDAF; -import org.apache.hadoop.hive.ql.exec.UDAFEvaluator; -import org.apache.hadoop.hive.ql.metadata.HiveException; -import org.apache.hadoop.io.Text; -import org.apache.hadoop.util.bloom.DynamicBloomFilter; -import org.apache.hadoop.util.bloom.Filter; -import org.apache.hadoop.util.bloom.Key; - -@Description(name = "bloom", - value = "_FUNC_(string key) - Constructs a BloomFilter by aggregating a set of keys") -@SuppressWarnings("deprecation") -public final class BloomUDAF extends UDAF { - - public static class Evaluator implements UDAFEvaluator { - - private Filter filter; - private Key key; - - @Override - public void init() { - this.filter = BloomFilterUtils.newDynamicBloomFilter(); - this.key = new Key(); - } - - public boolean iterate(@Nullable Text keyStr) { - if (keyStr == null) { - return true; - } - key.set(keyStr.getBytes(), 1.0d); - - filter.add(key); - - return true; - } - - @Nonnull - public Text terminatePartial() throws HiveException { - try { - return BloomFilterUtils.serialize(filter, new Text()); - } catch (IOException e) { - throw new HiveException(e); - } - } - - public boolean merge(@Nonnull Text partial) throws HiveException { - final DynamicBloomFilter other; - try { - other = BloomFilterUtils.deserialize(partial, new DynamicBloomFilter()); - } catch (IOException e) { - throw new HiveException(e); - } - - if (filter == null) { - this.filter = other; - } else { - filter.or(other); - } - return true; - } - - @Nullable - public Text terminate() throws HiveException { - if (filter == null) { - return null; - } - - try { - return BloomFilterUtils.serialize(filter, new Text()); - } catch (IOException e) { - throw new HiveException(e); - } - } - - } -} 
http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/49496032/core/src/main/java/hivemall/smile/classification/DecisionTree.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/hivemall/smile/classification/DecisionTree.java b/core/src/main/java/hivemall/smile/classification/DecisionTree.java index cc92ae8..a80a299 100644 --- a/core/src/main/java/hivemall/smile/classification/DecisionTree.java +++ b/core/src/main/java/hivemall/smile/classification/DecisionTree.java @@ -19,6 +19,7 @@ package hivemall.smile.classification; import static hivemall.smile.utils.SmileExtUtils.resolveFeatureName; import static hivemall.smile.utils.SmileExtUtils.resolveName; + import hivemall.annotations.VisibleForTesting; import hivemall.math.matrix.Matrix; import hivemall.math.matrix.ints.ColumnMajorIntMatrix; @@ -36,6 +37,8 @@ import hivemall.utils.lang.ObjectUtils; import hivemall.utils.lang.StringUtils; import hivemall.utils.lang.mutable.MutableInt; import hivemall.utils.sampling.IntReservoirSampler; +import smile.classification.Classifier; +import smile.math.Math; import java.io.Externalizable; import java.io.IOException; @@ -53,9 +56,6 @@ import org.apache.hadoop.hive.ql.metadata.HiveException; import org.roaringbitmap.IntConsumer; import org.roaringbitmap.RoaringBitmap; -import smile.classification.Classifier; -import smile.math.Math; - /** * Decision tree for classification. A decision tree can be learned by splitting the training set * into subsets based on an attribute value test. 
This process is repeated on each derived subset in http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/49496032/core/src/main/java/hivemall/smile/regression/RegressionTree.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/hivemall/smile/regression/RegressionTree.java b/core/src/main/java/hivemall/smile/regression/RegressionTree.java index b8a3cc7..61964ae 100755 --- a/core/src/main/java/hivemall/smile/regression/RegressionTree.java +++ b/core/src/main/java/hivemall/smile/regression/RegressionTree.java @@ -18,6 +18,7 @@ package hivemall.smile.regression; import static hivemall.smile.utils.SmileExtUtils.resolveFeatureName; + import hivemall.annotations.VisibleForTesting; import hivemall.math.matrix.Matrix; import hivemall.math.matrix.ints.ColumnMajorIntMatrix; @@ -36,6 +37,10 @@ import hivemall.utils.lang.ObjectUtils; import hivemall.utils.lang.StringUtils; import hivemall.utils.lang.mutable.MutableInt; import hivemall.utils.math.MathUtils; +import smile.math.Math; +import smile.regression.GradientTreeBoost; +import smile.regression.RandomForest; +import smile.regression.Regression; import java.io.Externalizable; import java.io.IOException; @@ -51,11 +56,6 @@ import javax.annotation.Nullable; import org.apache.hadoop.hive.ql.metadata.HiveException; -import smile.math.Math; -import smile.regression.GradientTreeBoost; -import smile.regression.RandomForest; -import smile.regression.Regression; - /** * Decision tree for regression. A decision tree can be learned by splitting the training set into * subsets based on an attribute value test. 
This process is repeated on each derived subset in a http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/49496032/core/src/main/java/hivemall/smile/tools/TreePredictUDF.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/hivemall/smile/tools/TreePredictUDF.java b/core/src/main/java/hivemall/smile/tools/TreePredictUDF.java index 9b775bf..511944c 100644 --- a/core/src/main/java/hivemall/smile/tools/TreePredictUDF.java +++ b/core/src/main/java/hivemall/smile/tools/TreePredictUDF.java @@ -158,7 +158,7 @@ public final class TreePredictUDF extends UDFWithOptions { Object arg2 = arguments[2].get(); if (arg2 == null) { - throw new HiveException("array<double> features was null"); + throw new HiveException("features was null"); } this.featuresProbe = parseFeatures(arg2, featuresProbe); http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/49496032/core/src/main/java/hivemall/statistics/MovingAverageUDTF.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/hivemall/statistics/MovingAverageUDTF.java b/core/src/main/java/hivemall/statistics/MovingAverageUDTF.java deleted file mode 100644 index 112c47f..0000000 --- a/core/src/main/java/hivemall/statistics/MovingAverageUDTF.java +++ /dev/null @@ -1,84 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package hivemall.statistics; - -import hivemall.utils.hadoop.HiveUtils; -import hivemall.utils.stats.MovingAverage; - -import java.util.Arrays; -import java.util.List; - -import org.apache.hadoop.hive.ql.exec.Description; -import org.apache.hadoop.hive.ql.exec.UDFArgumentException; -import org.apache.hadoop.hive.ql.metadata.HiveException; -import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF; -import org.apache.hadoop.hive.serde2.io.DoubleWritable; -import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; -import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; - -@Description(name = "moving_avg", value = "_FUNC_(NUMBER value, const int windowSize)" - + " - Returns moving average of a time series using a given window") -public final class MovingAverageUDTF extends GenericUDTF { - - private PrimitiveObjectInspector valueOI; - - private MovingAverage movingAvg; - - private Object[] forwardObjs; - private DoubleWritable result; - - @Override - public StructObjectInspector initialize(ObjectInspector[] argOIs) throws UDFArgumentException { - if (argOIs.length != 2) { - throw new UDFArgumentException( - "Two argument is expected for moving_avg(NUMBER value, const int windowSize): " - + argOIs.length); - } - this.valueOI = HiveUtils.asNumberOI(argOIs[0]); - - int 
windowSize = HiveUtils.getConstInt(argOIs[1]); - this.movingAvg = new MovingAverage(windowSize); - - this.result = new DoubleWritable(); - this.forwardObjs = new Object[] {result}; - - List<String> fieldNames = Arrays.asList("avg"); - List<ObjectInspector> fieldOIs = Arrays.<ObjectInspector>asList( - PrimitiveObjectInspectorFactory.writableDoubleObjectInspector); - - return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, fieldOIs); - } - - @Override - public void process(Object[] args) throws HiveException { - double x = HiveUtils.getDouble(args[0], valueOI); - - double avg = movingAvg.add(x); - result.set(avg); - - forward(forwardObjs); - } - - @Override - public void close() throws HiveException {} - -} http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/49496032/core/src/main/java/hivemall/tools/GenerateSeriesUDTF.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/hivemall/tools/GenerateSeriesUDTF.java b/core/src/main/java/hivemall/tools/GenerateSeriesUDTF.java index 2567ac7..ed9a7a2 100644 --- a/core/src/main/java/hivemall/tools/GenerateSeriesUDTF.java +++ b/core/src/main/java/hivemall/tools/GenerateSeriesUDTF.java @@ -21,6 +21,10 @@ package hivemall.tools; import hivemall.utils.hadoop.HiveUtils; import java.util.ArrayList; +import java.util.List; + +import javax.annotation.Nonnull; +import javax.annotation.Nullable; import org.apache.hadoop.hive.ql.exec.Description; import org.apache.hadoop.hive.ql.exec.UDFArgumentException; @@ -28,70 +32,176 @@ import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; +import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; import 
org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorUtils; +import org.apache.hadoop.io.IntWritable; +import org.apache.hadoop.io.LongWritable; +import org.apache.hadoop.io.Writable; +// @formatter:off @Description(name = "generate_series", value = "_FUNC_(const int|bigint start, const int|bigint end) - " - + "Generate a series of values, from start to end. A similar function to PostgreSQL's `generate_serics`. http://www.postgresql.org/docs/current/static/functions-srf.html", - extended = "select generate_series(1,9);\n\n" + "1\n" + "2\n" + "3\n" + "4\n" + "5\n" - + "6\n" + "7\n" + "8\n" + "9") + + "Generate a series of values, from start to end. " + + "A similar function to PostgreSQL's [generate_serics](http://www.postgresql.org/docs/current/static/functions-srf.html)", + extended = "SELECT generate_series(2,4);\n" + + "\n" + + " 2\n" + + " 3\n" + + " 4\n" + + "\n" + + "SELECT generate_series(5,1,-2);\n" + + "\n" + + " 5\n" + + " 3\n" + + " 1\n" + + "\n" + + "SELECT generate_series(4,3);\n" + + "\n" + + " (no return)\n" + + "\n" + + "SELECT date_add(current_date(),value),value from (SELECT generate_series(1,3)) t;\n" + + "\n" + + " 2018-04-21 1\n" + + " 2018-04-22 2\n" + + " 2018-04-23 3\n" + + "\n" + + "WITH input as (\n" + + " SELECT 1 as c1, 10 as c2, 3 as step\n" + + " UNION ALL\n" + + " SELECT 10, 2, -3\n" + + ")\n" + + "SELECT generate_series(c1, c2, step) as series\n" + + "FROM input;\n" + + "\n" + + " 1\n" + + " 4\n" + + " 7\n" + + " 10\n" + + " 10\n" + + " 7\n" + + " 4") +// @formatter:on public final class GenerateSeriesUDTF extends GenericUDTF { - private long start, end; - private boolean useBigInt; + private PrimitiveObjectInspector startOI, endOI; + @Nullable + private PrimitiveObjectInspector stepOI; + + @Nonnull + private final Writable[] row = new Writable[1]; + private boolean returnLong; @Override public 
StructObjectInspector initialize(ObjectInspector[] argOIs) throws UDFArgumentException { - if (argOIs.length != 2) { - throw new UDFArgumentException("Expected number of arguments is 2: " + argOIs.length); + if (argOIs.length != 2 && argOIs.length != 3) { + throw new UDFArgumentException( + "Expected number of arguments is 2 or 3: " + argOIs.length); + } + if (!HiveUtils.isIntegerOI(argOIs[0])) { + throw new UDFArgumentException( + "Expected Integer type for the first argument: " + argOIs[0].getTypeName()); + } + if (!HiveUtils.isIntegerOI(argOIs[1])) { + throw new UDFArgumentException( + "Expected Integer type for the second argument: " + argOIs[1].getTypeName()); } + this.startOI = HiveUtils.asIntegerOI(argOIs[0]); + this.endOI = HiveUtils.asIntegerOI(argOIs[1]); - ArrayList<String> fieldNames = new ArrayList<String>(1); + if (argOIs.length == 3) { + if (!HiveUtils.isIntegerOI(argOIs[2])) { + throw new UDFArgumentException( + "Expected Integer type for the third argument: " + argOIs[2].getTypeName()); + } + this.stepOI = HiveUtils.asIntegerOI(argOIs[2]); + } + + this.returnLong = HiveUtils.isBigIntOI(startOI) || HiveUtils.isBigIntOI(endOI); + + List<String> fieldNames = new ArrayList<>(1); fieldNames.add("value"); - ArrayList<ObjectInspector> fieldOIs = new ArrayList<ObjectInspector>(1); + List<ObjectInspector> fieldOIs = new ArrayList<>(1); + if (returnLong) { + fieldOIs.add(PrimitiveObjectInspectorFactory.writableLongObjectInspector); + } else { + fieldOIs.add(PrimitiveObjectInspectorFactory.writableIntObjectInspector); + } + return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, fieldOIs); + } - this.useBigInt = HiveUtils.isBigIntOI(argOIs[1]); - if (useBigInt) { - fieldOIs.add(PrimitiveObjectInspectorFactory.javaLongObjectInspector); + @Override + public void process(Object[] args) throws HiveException { + if (returnLong) { + generateLongSeries(args); } else { - fieldOIs.add(PrimitiveObjectInspectorFactory.javaIntObjectInspector); + 
generateIntSeries(args); } + } - this.start = HiveUtils.getAsConstLong(argOIs[0]); - this.end = HiveUtils.getAsConstLong(argOIs[1]); - if (start > end) { - throw new UDFArgumentException( - "start '" + start + "' must be less than or equals to end '" + end + "'"); + private void generateLongSeries(@Nonnull final Object[] args) throws HiveException { + final long start, end; + long step = 1L; + switch (args.length) { + case 3: + step = PrimitiveObjectInspectorUtils.getLong(args[2], stepOI); + if (step == 0) { + throw new UDFArgumentException("Step MUST NOT be zero"); + } + // fall through + case 2: + start = PrimitiveObjectInspectorUtils.getLong(args[0], startOI); + end = PrimitiveObjectInspectorUtils.getLong(args[1], endOI); + break; + default: + throw new UDFArgumentException("Expected number of arguments: " + args.length); } - return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, fieldOIs); + final LongWritable row0 = new LongWritable(); + row[0] = row0; + if (step > 0) { + for (long i = start; i <= end; i += step) { + row0.set(i); + forward(row); + } + } else { + for (long i = start; i >= end; i += step) { + row0.set(i); + forward(row); + } + } } - @Override - public void process(Object[] argOIs) throws HiveException { - final Object[] forwardObjs = new Object[1]; - if (useBigInt) { - if (start == end) { - forwardObjs[0] = start; - forward(forwardObjs); - } else { - for (long i = start; i <= end; i++) { - forwardObjs[0] = i; - forward(forwardObjs); + private void generateIntSeries(@Nonnull final Object[] args) throws HiveException { + final int start, end; + int step = 1; + switch (args.length) { + case 3: + step = PrimitiveObjectInspectorUtils.getInt(args[2], stepOI); + if (step == 0) { + throw new UDFArgumentException("Step MUST NOT be zero"); } + // fall through + case 2: + start = PrimitiveObjectInspectorUtils.getInt(args[0], startOI); + end = PrimitiveObjectInspectorUtils.getInt(args[1], endOI); + break; + default: + throw new 
UDFArgumentException("Expected number of arguments: " + args.length); + } + + final IntWritable row0 = new IntWritable(); + row[0] = row0; + if (step > 0) { + for (int i = start; i <= end; i += step) { + row0.set(i); + forward(row); } } else { - int starti = (int) start; - int endi = (int) end; - if (starti == endi) { - forwardObjs[0] = starti; - forward(forwardObjs); - } else { - for (int i = starti; i <= endi; i++) { - forwardObjs[0] = i; - forward(forwardObjs); - } + for (int i = start; i >= end; i += step) { + row0.set(i); + forward(row); } } } http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/49496032/core/src/main/java/hivemall/tools/TryCastUDF.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/hivemall/tools/TryCastUDF.java b/core/src/main/java/hivemall/tools/TryCastUDF.java index a0f3257..adb8328 100644 --- a/core/src/main/java/hivemall/tools/TryCastUDF.java +++ b/core/src/main/java/hivemall/tools/TryCastUDF.java @@ -35,8 +35,8 @@ import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils; @Description(name = "try_cast", value = "_FUNC_(ANY src, const string typeName)" + " - Explicitly cast a value as a type. 
Returns null if cast fails.", - extended = "Usage: select try_cast(array(1.0,2.0,3.0), 'array<string>')\n" - + " select try_cast(map('A',10,'B',20,'C',30), 'map<string,double>')") + extended = "SELECT try_cast(array(1.0,2.0,3.0), 'array<string>')\n" + + "SELECT try_cast(map('A',10,'B',20,'C',30), 'map<string,double>')") @UDFType(deterministic = true, stateful = false) public final class TryCastUDF extends GenericUDF { http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/49496032/core/src/main/java/hivemall/tools/array/ArrayAppendUDF.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/hivemall/tools/array/ArrayAppendUDF.java b/core/src/main/java/hivemall/tools/array/ArrayAppendUDF.java index 25d0f4c..b3768bb 100644 --- a/core/src/main/java/hivemall/tools/array/ArrayAppendUDF.java +++ b/core/src/main/java/hivemall/tools/array/ArrayAppendUDF.java @@ -21,6 +21,7 @@ package hivemall.tools.array; import hivemall.utils.hadoop.HiveUtils; import java.util.ArrayList; +import java.util.Arrays; import java.util.List; import javax.annotation.Nonnull; @@ -38,7 +39,9 @@ import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils; import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector; @Description(name = "array_append", - value = "_FUNC_(array<T> arr, T elem) - Append an element to the end of an array") + value = "_FUNC_(array<T> arr, T elem) - Append an element to the end of an array", + extended = "SELECT array_append(array(1,2),3);\n 1,2,3\n\n" + + "SELECT array_append(array('a','b'),'c');\n \"a\",\"b\",\"c\"") @UDFType(deterministic = true, stateful = false) public final class ArrayAppendUDF extends GenericUDF { @@ -71,6 +74,12 @@ public final class ArrayAppendUDF extends GenericUDF { Object arg0 = args[0].get(); if (arg0 == null) { + Object arg1 = args[1].get(); + if (arg1 != null) { + Object toAppend = returnWritables ? 
primInspector.getPrimitiveWritableObject(arg1) + : primInspector.getPrimitiveJavaObject(arg1); + return Arrays.asList(toAppend); + } return null; } http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/49496032/core/src/main/java/hivemall/tools/array/ArrayConcatUDF.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/hivemall/tools/array/ArrayConcatUDF.java b/core/src/main/java/hivemall/tools/array/ArrayConcatUDF.java index 62e3e36..7781564 100644 --- a/core/src/main/java/hivemall/tools/array/ArrayConcatUDF.java +++ b/core/src/main/java/hivemall/tools/array/ArrayConcatUDF.java @@ -38,7 +38,7 @@ import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils.Object @Description(name = "array_concat", value = "_FUNC_(array<ANY> x1, array<ANY> x2, ..) - Returns a concatenated array", - extended = "select array_concat(array(1),array(2,3));\n" + "> [1,2,3]") + extended = "SELECT array_concat(array(1),array(2,3));\n" + " [1,2,3]") @UDFType(deterministic = true, stateful = false) public class ArrayConcatUDF extends GenericUDF { /** @@ -95,11 +95,11 @@ public class ArrayConcatUDF extends GenericUDF { continue; } - final ListObjectInspector arrayOI = (ListObjectInspector) argumentOIs[i]; + final ListObjectInspector arrayOI = argumentOIs[i]; + final ObjectInspector elemOI = arrayOI.getListElementObjectInspector(); final int arraylength = arrayOI.getListLength(arrayObject); for (int j = 0; j < arraylength; j++) { Object rawObj = arrayOI.getListElement(arrayObject, j); - ObjectInspector elemOI = arrayOI.getListElementObjectInspector(); Object obj = ObjectInspectorUtils.copyToStandardObject(rawObj, elemOI, ObjectInspectorCopyOption.WRITABLE); ret.add(obj); http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/49496032/core/src/main/java/hivemall/tools/array/ArrayElementAtUDF.java ---------------------------------------------------------------------- diff --git 
a/core/src/main/java/hivemall/tools/array/ArrayElementAtUDF.java b/core/src/main/java/hivemall/tools/array/ArrayElementAtUDF.java index 631d92d..21be8d4 100644 --- a/core/src/main/java/hivemall/tools/array/ArrayElementAtUDF.java +++ b/core/src/main/java/hivemall/tools/array/ArrayElementAtUDF.java @@ -30,7 +30,9 @@ import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.primitive.IntObjectInspector; @Description(name = "element_at", - value = "_FUNC_(array<T> list, int pos) - Returns an element at the given position") + value = "_FUNC_(array<T> list, int pos) - Returns an element at the given position", + extended = "SELECT element_at(array(1,2,3,4),0);\n 1\n\n" + + "SELECT element_at(array(1,2,3,4),-2);\n 3") @UDFType(deterministic = true, stateful = false) public final class ArrayElementAtUDF extends GenericUDF { private ListObjectInspector listInspector; http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/49496032/core/src/main/java/hivemall/tools/array/ArrayFlattenUDF.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/hivemall/tools/array/ArrayFlattenUDF.java b/core/src/main/java/hivemall/tools/array/ArrayFlattenUDF.java index 906d594..39d0110 100644 --- a/core/src/main/java/hivemall/tools/array/ArrayFlattenUDF.java +++ b/core/src/main/java/hivemall/tools/array/ArrayFlattenUDF.java @@ -19,6 +19,7 @@ package hivemall.tools.array; import hivemall.utils.hadoop.HiveUtils; +import hivemall.utils.lang.StringUtils; import java.util.ArrayList; import java.util.List; @@ -33,16 +34,16 @@ import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils; +import 
org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils.ObjectInspectorCopyOption; @Description(name = "array_flatten", - value = "_FUNC_(array<array<ANY>>) - Returns an array with the elements flattened.") + value = "_FUNC_(array<array<ANY>>) - Returns an array with the elements flattened.", + extended = "SELECT array_flatten(array(array(1,2,3),array(4,5),array(6,7,8)));\n" + + " [1,2,3,4,5,6,7,8]") @UDFType(deterministic = true, stateful = false) public final class ArrayFlattenUDF extends GenericUDF { private ListObjectInspector listOI; - private ListObjectInspector nextedListOI; - private ObjectInspector elemOI; - private final List<Object> result = new ArrayList<>(); @Override @@ -58,11 +59,13 @@ public final class ArrayFlattenUDF extends GenericUDF { throw new UDFArgumentException( "array_flatten takes array of array for the argument: " + listOI.toString()); } - this.nextedListOI = HiveUtils.asListOI(listElemOI); - this.elemOI = nextedListOI.getListElementObjectInspector(); + + ListObjectInspector nestedListOI = HiveUtils.asListOI(listElemOI); + ObjectInspector elemOI = nestedListOI.getListElementObjectInspector(); return ObjectInspectorFactory.getStandardListObjectInspector( - ObjectInspectorUtils.getStandardObjectInspector(elemOI)); + ObjectInspectorUtils.getStandardObjectInspector(elemOI, + ObjectInspectorCopyOption.WRITABLE)); } @Override @@ -81,12 +84,17 @@ public final class ArrayFlattenUDF extends GenericUDF { continue; } - final int subarrayLength = nextedListOI.getListLength(subarray); + final ListObjectInspector subarrayOI = + HiveUtils.asListOI(listOI.getListElementObjectInspector()); + final ObjectInspector elemOI = subarrayOI.getListElementObjectInspector(); + final int subarrayLength = subarrayOI.getListLength(subarray); for (int j = 0; j < subarrayLength; j++) { - Object elem = nextedListOI.getListElement(subarray, j); - if (elem == null) { + Object rawElem = subarrayOI.getListElement(subarray, j); + if (rawElem == null) { continue; } 
+ Object elem = ObjectInspectorUtils.copyToStandardObject(rawElem, elemOI, + ObjectInspectorCopyOption.WRITABLE); result.add(elem); } } @@ -96,16 +104,7 @@ public final class ArrayFlattenUDF extends GenericUDF { @Override public String getDisplayString(String[] args) { - final StringBuffer buf = new StringBuffer(); - buf.append("array_flatten("); - for (int i = 0, len = args.length; i < len; i++) { - if (i != 0) { - buf.append(", "); - } - buf.append(args[i]); - } - buf.append(")"); - return buf.toString(); + return "array_flatten(" + StringUtils.join(args, ',') + ")"; } } http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/49496032/core/src/main/java/hivemall/tools/array/ArrayIntersectUDF.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/hivemall/tools/array/ArrayIntersectUDF.java b/core/src/main/java/hivemall/tools/array/ArrayIntersectUDF.java index 909176a..4b7f91c 100644 --- a/core/src/main/java/hivemall/tools/array/ArrayIntersectUDF.java +++ b/core/src/main/java/hivemall/tools/array/ArrayIntersectUDF.java @@ -41,7 +41,7 @@ import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils; @Description(name = "array_intersect", value = "_FUNC_(array<ANY> x1, array<ANY> x2, ..) 
- Returns an intersect of given arrays", - extended = "select array_intersect(array(1,3,4),array(2,3,4),array(3,5));\n" + "> [3]") + extended = "SELECT array_intersect(array(1,3,4),array(2,3,4),array(3,5));\n" + " [3]") @UDFType(deterministic = true, stateful = false) public final class ArrayIntersectUDF extends GenericUDF { @@ -88,7 +88,7 @@ public final class ArrayIntersectUDF extends GenericUDF { return Collections.emptyList(); } - Set<InspectableObject> checkSet = new HashSet<ArrayIntersectUDF.InspectableObject>(); + Set<InspectableObject> checkSet = new HashSet<InspectableObject>(); final ListObjectInspector arg0ListOI = argListOIs[0]; final ObjectInspector arg0ElemOI = arg0ListOI.getListElementObjectInspector(); final int arg0size = arg0ListOI.getListLength(arg0); @@ -106,8 +106,7 @@ public final class ArrayIntersectUDF extends GenericUDF { if (argI == null) { continue; } - final Set<InspectableObject> newSet = - new HashSet<ArrayIntersectUDF.InspectableObject>(); + final Set<InspectableObject> newSet = new HashSet<InspectableObject>(); final ListObjectInspector argIListOI = argListOIs[i]; final ObjectInspector argIElemOI = argIListOI.getListElementObjectInspector(); for (int j = 0, j_size = argIListOI.getListLength(argI); j < j_size; j++) { http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/49496032/core/src/main/java/hivemall/tools/array/ArrayRemoveUDF.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/hivemall/tools/array/ArrayRemoveUDF.java b/core/src/main/java/hivemall/tools/array/ArrayRemoveUDF.java index 523093b..207c398 100644 --- a/core/src/main/java/hivemall/tools/array/ArrayRemoveUDF.java +++ b/core/src/main/java/hivemall/tools/array/ArrayRemoveUDF.java @@ -29,8 +29,8 @@ import org.apache.hadoop.io.Text; @Description(name = "array_remove", value = "_FUNC_(array<int|text> original, int|text|array<int> target)" + " - Returns an array that the target is removed " + "from the original 
array", - extended = "select array_remove(array(1,null,3),array(null));\n" + "> [3]\n" + "\n" - + "select array_remove(array(\"aaa\",\"bbb\"),\"bbb\");\n" + "> [\"aaa\"]") + extended = "SELECT array_remove(array(1,null,3),array(null));\n" + " [3]\n" + "\n" + + "SELECT array_remove(array(\"aaa\",\"bbb\"),\"bbb\");\n" + " [\"aaa\"]") @UDFType(deterministic = true, stateful = false) public class ArrayRemoveUDF extends UDF { http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/49496032/core/src/main/java/hivemall/tools/array/ArraySliceUDF.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/hivemall/tools/array/ArraySliceUDF.java b/core/src/main/java/hivemall/tools/array/ArraySliceUDF.java index 2bc98b9..e842df6 100644 --- a/core/src/main/java/hivemall/tools/array/ArraySliceUDF.java +++ b/core/src/main/java/hivemall/tools/array/ArraySliceUDF.java @@ -39,9 +39,48 @@ import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorUtils; +// @formatter:off @Description(name = "array_slice", value = "_FUNC_(array<ANY> values, int offset [, int length]) - Slices the given array by the given offset and length parameters.", - extended = "select array_slice(array(1,2,3,4,5,6), 2,4);\n" + "> [3,4]") + extended = "SELECT \n" + + " array_slice(array(1,2,3,4,5,6), 2,4),\n" + + " array_slice(\n" + + " array(\"zero\", \"one\", \"two\", \"three\", \"four\", \"five\", \"six\", \"seven\", \"eight\", \"nine\", \"ten\"),\n" + + " 0, -- offset\n" + + " 2 -- length\n" + + " ),\n" + + " array_slice(\n" + + " array(\"zero\", \"one\", \"two\", \"three\", \"four\", \"five\", \"six\", \"seven\", \"eight\", \"nine\", \"ten\"),\n" + + " 6, -- offset\n" + + " 3 -- length\n" + + " ),\n" + + " array_slice(\n" + + " array(\"zero\", \"one\", \"two\", \"three\", 
\"four\", \"five\", \"six\", \"seven\", \"eight\", \"nine\", \"ten\"),\n" + + " 6, -- offset\n" + + " 10 -- length\n" + + " ),\n" + + " array_slice(\n" + + " array(\"zero\", \"one\", \"two\", \"three\", \"four\", \"five\", \"six\", \"seven\", \"eight\", \"nine\", \"ten\"),\n" + + " 6 -- offset\n" + + " ),\n" + + " array_slice(\n" + + " array(\"zero\", \"one\", \"two\", \"three\", \"four\", \"five\", \"six\", \"seven\", \"eight\", \"nine\", \"ten\"),\n" + + " -3 -- offset\n" + + " ),\n" + + " array_slice(\n" + + " array(\"zero\", \"one\", \"two\", \"three\", \"four\", \"five\", \"six\", \"seven\", \"eight\", \"nine\", \"ten\"),\n" + + " -3, -- offset\n" + + " 2 -- length\n" + + " );\n" + + "\n" + + " [3,4]\n" + + " [\"zero\",\"one\"] \n" + + " [\"six\",\"seven\",\"eight\"]\n" + + " [\"six\",\"seven\",\"eight\",\"nine\",\"ten\"]\n" + + " [\"six\",\"seven\",\"eight\",\"nine\",\"ten\"]\n" + + " [\"eight\",\"nine\",\"ten\"]\n" + + " [\"eight\",\"nine\"]") +// @formatter:on @UDFType(deterministic = true, stateful = false) public final class ArraySliceUDF extends GenericUDF { http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/49496032/core/src/main/java/hivemall/tools/array/ArrayToStrUDF.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/hivemall/tools/array/ArrayToStrUDF.java b/core/src/main/java/hivemall/tools/array/ArrayToStrUDF.java new file mode 100644 index 0000000..185abc7 --- /dev/null +++ b/core/src/main/java/hivemall/tools/array/ArrayToStrUDF.java @@ -0,0 +1,96 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package hivemall.tools.array; + +import hivemall.utils.hadoop.HiveUtils; + +import javax.annotation.Nullable; + +import org.apache.commons.lang3.StringUtils; +import org.apache.hadoop.hive.ql.exec.Description; +import org.apache.hadoop.hive.ql.exec.UDFArgumentException; +import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.udf.UDFType; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; +import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector; + +@Description(name = "array_to_str", + value = "_FUNC_(array arr [, string sep=',']) - Convert array to string using a sperator", + extended = "SELECT array_to_str(array(1,2,3),'-');\n" + "1-2-3") +@UDFType(deterministic = true, stateful = false) +public final class ArrayToStrUDF extends GenericUDF { + + private ListObjectInspector listOI; + @Nullable + private StringObjectInspector sepOI; + + @Override + public ObjectInspector initialize(ObjectInspector[] argOIs) throws UDFArgumentException { + if (argOIs.length != 1 && argOIs.length != 2) { + throw new UDFArgumentLengthException( + "array_to_str(array, string sep) expects one or two arguments: " + argOIs.length); + } + + this.listOI = HiveUtils.asListOI(argOIs[0]); + if 
(argOIs.length == 2) { + this.sepOI = HiveUtils.asStringOI(argOIs[1]); + } + + return PrimitiveObjectInspectorFactory.javaStringObjectInspector; + } + + @Override + public String evaluate(DeferredObject[] arguments) throws HiveException { + Object arg0 = arguments[0].get(); + if (arg0 == null) { + return null; + } + + String sep = ","; + if (arguments.length == 2) { + Object arg1 = arguments[1].get(); + if (arg1 != null) { + sep = sepOI.getPrimitiveJavaObject(arg1); + } + } + + final StringBuilder buf = new StringBuilder(); + final int len = listOI.getListLength(arg0); + for (int i = 0; i < len; i++) { + Object e = listOI.getListElement(arg0, i); + if (e != null) { + if (i != 0 && buf.length() > 0) { + buf.append(sep); + } + buf.append(e.toString()); + } + } + return buf.toString(); + } + + @Override + public String getDisplayString(String[] children) { + return "array_to_str(" + StringUtils.join(children, ',') + ")"; + } + +} http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/49496032/core/src/main/java/hivemall/tools/array/ArrayUnionUDF.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/hivemall/tools/array/ArrayUnionUDF.java b/core/src/main/java/hivemall/tools/array/ArrayUnionUDF.java index 921bbfa..91ae53f 100644 --- a/core/src/main/java/hivemall/tools/array/ArrayUnionUDF.java +++ b/core/src/main/java/hivemall/tools/array/ArrayUnionUDF.java @@ -46,7 +46,9 @@ import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils.Object * </pre> */ @Description(name = "array_union", - value = "_FUNC_(array1, array2, ...) - Returns the union of a set of arrays") + value = "_FUNC_(array1, array2, ...) 
- Returns the union of a set of arrays", + extended = "SELECT array_union(array(1,2),array(1,2));\n" + "[1,2]\n\n" + + "SELECT array_union(array(1,2),array(2,3),array(2,5));\n" + "[1,2,3,5]") @UDFType(deterministic = true, stateful = false) public final class ArrayUnionUDF extends GenericUDF { @@ -95,9 +97,9 @@ public final class ArrayUnionUDF extends GenericUDF { for (int j = 0, len = oi.getListLength(undeferred); j < len; ++j) { Object nonStd = oi.getListElement(undeferred, j); - Object copied = ObjectInspectorUtils.copyToStandardObject(nonStd, elemOI, + Object copyed = ObjectInspectorUtils.copyToStandardObject(nonStd, elemOI, ObjectInspectorCopyOption.WRITABLE); - objectSet.add(copied); + objectSet.add(copyed); } } http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/49496032/core/src/main/java/hivemall/tools/array/ConditionalEmitUDTF.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/hivemall/tools/array/ConditionalEmitUDTF.java b/core/src/main/java/hivemall/tools/array/ConditionalEmitUDTF.java index a73a06f..89862b9 100644 --- a/core/src/main/java/hivemall/tools/array/ConditionalEmitUDTF.java +++ b/core/src/main/java/hivemall/tools/array/ConditionalEmitUDTF.java @@ -55,9 +55,26 @@ import org.apache.hadoop.hive.serde2.objectinspector.primitive.BooleanObjectInsp * table_to_scan_once * </pre> */ +// @formatter:off @Description(name = "conditional_emit", value = "_FUNC_(array<boolean> conditions, array<primitive> features)" - + " - Emit features of a row according to various conditions") + + " - Emit features of a row according to various conditions", + extended = "WITH input as (\n" + + " select array(true, false, true) as conditions, array(\"one\", \"two\", \"three\") as features\n" + + " UNION ALL\n" + + " select array(true, true, false), array(\"four\", \"five\", \"six\")\n" + + ")\n" + + "SELECT\n" + + " conditional_emit(\n" + + " conditions, features\n" + + " )\n" + + "FROM \n" + + " 
input;\n" + + " one\n" + + " three\n" + + " four\n" + + " five") +// @formatter:on @UDFType(deterministic = true, stateful = false) public final class ConditionalEmitUDTF extends GenericUDTF { http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/49496032/core/src/main/java/hivemall/tools/array/FirstElementUDF.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/hivemall/tools/array/FirstElementUDF.java b/core/src/main/java/hivemall/tools/array/FirstElementUDF.java index 957e724..a708529 100644 --- a/core/src/main/java/hivemall/tools/array/FirstElementUDF.java +++ b/core/src/main/java/hivemall/tools/array/FirstElementUDF.java @@ -31,7 +31,9 @@ import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; /** * Return the first element in an array. */ -@Description(name = "first_element", value = "_FUNC_(x) - Returns the first element in an array ") +@Description(name = "first_element", value = "_FUNC_(x) - Returns the first element in an array", + extended = "SELECT first_element(array('a','b','c'));\n a\n\n" + + "SELECT first_element(array());\n NULL") @UDFType(deterministic = true, stateful = false) public class FirstElementUDF extends GenericUDF { http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/49496032/core/src/main/java/hivemall/tools/array/LastElementUDF.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/hivemall/tools/array/LastElementUDF.java b/core/src/main/java/hivemall/tools/array/LastElementUDF.java index 91050b8..f36a6aa 100644 --- a/core/src/main/java/hivemall/tools/array/LastElementUDF.java +++ b/core/src/main/java/hivemall/tools/array/LastElementUDF.java @@ -31,7 +31,8 @@ import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; /** * Return the last element in an array. 
*/ -@Description(name = "last_element", value = "_FUNC_(x) - Return the last element in an array") +@Description(name = "last_element", value = "_FUNC_(x) - Return the last element in an array", + extended = "SELECT last_element(array('a','b','c'));\n c") @UDFType(deterministic = true, stateful = false) public class LastElementUDF extends GenericUDF { http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/49496032/core/src/main/java/hivemall/tools/array/SortAndUniqArrayUDF.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/hivemall/tools/array/SortAndUniqArrayUDF.java b/core/src/main/java/hivemall/tools/array/SortAndUniqArrayUDF.java index e9c1cc5..d9f265e 100644 --- a/core/src/main/java/hivemall/tools/array/SortAndUniqArrayUDF.java +++ b/core/src/main/java/hivemall/tools/array/SortAndUniqArrayUDF.java @@ -31,7 +31,7 @@ import org.apache.hadoop.io.IntWritable; @Description(name = "sort_and_uniq_array", value = "_FUNC_(array<int>) - Takes array<int> and " + "returns a sorted array with duplicate elements eliminated", - extended = "select sort_and_uniq_array(array(3,1,1,-2,10));\n" + "> [-2,1,3,10]") + extended = "SELECT sort_and_uniq_array(array(3,1,1,-2,10));\n" + " [-2,1,3,10]") @UDFType(deterministic = true, stateful = false) public class SortAndUniqArrayUDF extends UDF { http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/49496032/core/src/main/java/hivemall/tools/array/SubarrayEndWithUDF.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/hivemall/tools/array/SubarrayEndWithUDF.java b/core/src/main/java/hivemall/tools/array/SubarrayEndWithUDF.java index 9018117..36f23b5 100644 --- a/core/src/main/java/hivemall/tools/array/SubarrayEndWithUDF.java +++ b/core/src/main/java/hivemall/tools/array/SubarrayEndWithUDF.java @@ -29,7 +29,7 @@ import org.apache.hadoop.io.Text; @Description(name = "subarray_endwith", value = 
"_FUNC_(array<int|text> original, int|text key)" + " - Returns an array that ends with the specified key", - extended = "select subarray_endwith(array(1,2,3,4), 3);\n" + "> [1,2,3]") + extended = "SELECT subarray_endwith(array(1,2,3,4), 3);\n" + " [1,2,3]") @UDFType(deterministic = true, stateful = false) public class SubarrayEndWithUDF extends UDF { http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/49496032/core/src/main/java/hivemall/tools/array/SubarrayStartWithUDF.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/hivemall/tools/array/SubarrayStartWithUDF.java b/core/src/main/java/hivemall/tools/array/SubarrayStartWithUDF.java index ae0d4fb..95ae6cf 100644 --- a/core/src/main/java/hivemall/tools/array/SubarrayStartWithUDF.java +++ b/core/src/main/java/hivemall/tools/array/SubarrayStartWithUDF.java @@ -29,7 +29,7 @@ import org.apache.hadoop.io.Text; @Description(name = "subarray_startwith", value = "_FUNC_(array<int|text> original, int|text key)" + " - Returns an array that starts with the specified key", - extended = "select subarray_startwith(array(1,2,3,4), 2);\n" + "> [2,3,4]") + extended = "SELECT subarray_startwith(array(1,2,3,4), 2);\n" + " [2,3,4]") @UDFType(deterministic = true, stateful = false) public class SubarrayStartWithUDF extends UDF { http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/49496032/core/src/main/java/hivemall/tools/bits/BitsORUDF.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/hivemall/tools/bits/BitsORUDF.java b/core/src/main/java/hivemall/tools/bits/BitsORUDF.java index d5497b6..40f7b5c 100644 --- a/core/src/main/java/hivemall/tools/bits/BitsORUDF.java +++ b/core/src/main/java/hivemall/tools/bits/BitsORUDF.java @@ -38,8 +38,8 @@ import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectIn @Description(name = "bits_or", value = "_FUNC_(array<long> b1, array<long> b2, ..) 
- Returns a logical OR given bitsets", - extended = "select unbits(bits_or(to_bits(array(1,4)),to_bits(array(2,3))));\n" - + "> [1,2,3,4]") + extended = "SELECT unbits(bits_or(to_bits(array(1,4)),to_bits(array(2,3))));\n" + + " [1,2,3,4]") public final class BitsORUDF extends GenericUDF { private ListObjectInspector[] _listOIs; http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/49496032/core/src/main/java/hivemall/tools/bits/ToBitsUDF.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/hivemall/tools/bits/ToBitsUDF.java b/core/src/main/java/hivemall/tools/bits/ToBitsUDF.java index 599d90a..c33bd31 100644 --- a/core/src/main/java/hivemall/tools/bits/ToBitsUDF.java +++ b/core/src/main/java/hivemall/tools/bits/ToBitsUDF.java @@ -40,7 +40,7 @@ import org.apache.hadoop.io.LongWritable; @Description(name = "to_bits", value = "_FUNC_(int[] indexes) - Returns an bitset representation if the given indexes in long[]", - extended = "select to_bits(array(1,2,3,128));\n" + "> [14,-9223372036854775808]") + extended = "SELECT to_bits(array(1,2,3,128));\n" + " [14,-9223372036854775808]") @UDFType(deterministic = true, stateful = false) public final class ToBitsUDF extends GenericUDF { http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/49496032/core/src/main/java/hivemall/tools/bits/UnBitsUDF.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/hivemall/tools/bits/UnBitsUDF.java b/core/src/main/java/hivemall/tools/bits/UnBitsUDF.java index e3f6bae..0ec3762 100644 --- a/core/src/main/java/hivemall/tools/bits/UnBitsUDF.java +++ b/core/src/main/java/hivemall/tools/bits/UnBitsUDF.java @@ -40,7 +40,7 @@ import org.apache.hadoop.io.LongWritable; @Description(name = "unbits", value = "_FUNC_(long[] bitset) - Returns an long array of the give bitset representation", - extended = "select unbits(to_bits(array(1,4,2,3)));\n" + "> [1,2,3,4]") + extended = 
"SELECT unbits(to_bits(array(1,4,2,3)));\n" + " [1,2,3,4]") @UDFType(deterministic = true, stateful = false) public final class UnBitsUDF extends GenericUDF { http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/49496032/core/src/main/java/hivemall/tools/compress/DeflateUDF.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/hivemall/tools/compress/DeflateUDF.java b/core/src/main/java/hivemall/tools/compress/DeflateUDF.java index 4c82387..410ffd6 100644 --- a/core/src/main/java/hivemall/tools/compress/DeflateUDF.java +++ b/core/src/main/java/hivemall/tools/compress/DeflateUDF.java @@ -41,7 +41,7 @@ import org.apache.hadoop.io.Text; @Description(name = "deflate", value = "_FUNC_(TEXT data [, const int compressionLevel]) - " + "Returns a compressed BINARY object by using Deflater. The compression level must be in range [-1,9]", - extended = "select base91(deflate('aaaaaaaaaaaaaaaabbbbccc'));\n" + "> AA+=kaIM|WTt!+wbGAA") + extended = "SELECT base91(deflate('aaaaaaaaaaaaaaaabbbbccc'));\n" + " AA+=kaIM|WTt!+wbGAA") @UDFType(deterministic = true, stateful = false) public final class DeflateUDF extends GenericUDF { http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/49496032/core/src/main/java/hivemall/tools/compress/InflateUDF.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/hivemall/tools/compress/InflateUDF.java b/core/src/main/java/hivemall/tools/compress/InflateUDF.java index e2c4cc2..883d432 100644 --- a/core/src/main/java/hivemall/tools/compress/InflateUDF.java +++ b/core/src/main/java/hivemall/tools/compress/InflateUDF.java @@ -40,8 +40,8 @@ import org.apache.hadoop.io.Text; @Description(name = "inflate", value = "_FUNC_(BINARY compressedData) - Returns a decompressed STRING by using Inflater", - extended = "select inflate(unbase91(base91(deflate('aaaaaaaaaaaaaaaabbbbccc'))));\n" - + "> aaaaaaaaaaaaaaaabbbbccc") + extended = 
"SELECT inflate(unbase91(base91(deflate('aaaaaaaaaaaaaaaabbbbccc'))));\n" + + " aaaaaaaaaaaaaaaabbbbccc") @UDFType(deterministic = true, stateful = false) public final class InflateUDF extends GenericUDF { http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/49496032/core/src/main/java/hivemall/tools/datetime/SessionizeUDF.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/hivemall/tools/datetime/SessionizeUDF.java b/core/src/main/java/hivemall/tools/datetime/SessionizeUDF.java new file mode 100644 index 0000000..8235842 --- /dev/null +++ b/core/src/main/java/hivemall/tools/datetime/SessionizeUDF.java @@ -0,0 +1,99 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package hivemall.tools.datetime; + +import java.util.UUID; + +import javax.annotation.Nonnull; +import javax.annotation.Nullable; + +import org.apache.hadoop.hive.ql.exec.Description; +import org.apache.hadoop.hive.ql.exec.UDF; +import org.apache.hadoop.hive.ql.udf.UDFType; +import org.apache.hadoop.io.LongWritable; +import org.apache.hadoop.io.Text; + +//@formatter:off +@Description(name = "sessionize", + value = "_FUNC_(long timeInSec, long thresholdInSec [, String subject])" + + "- Returns a UUID string of a session.", + extended = "SELECT \n" + + " sessionize(time, 3600, ip_addr) as session_id, \n" + + " time, ip_addr\n" + + "FROM (\n" + + " SELECT time, ipaddr \n" + + " FROM weblog \n" + + " DISTRIBUTE BY ip_addr, time SORT BY ip_addr, time DESC\n" + + ") t1") +//@formatter:on +@UDFType(deterministic = false, stateful = true) +public final class SessionizeUDF extends UDF { + + private long lastTime; + @Nullable + private Text lastSubject; + @Nonnull + private final Text sessionId = new Text(); + + @Nullable + public Text evaluate(@Nullable LongWritable time, @Nullable LongWritable threshold) { + if (time == null || threshold == null) { + return null; + } + + final long thisTime = time.get(); + final long diff = thisTime - lastTime; + if (diff < threshold.get()) { + this.lastTime = thisTime; + return sessionId; + } + + sessionId.set(UUID.randomUUID().toString()); + this.lastTime = time.get(); + return sessionId; + } + + @Nullable + public Text evaluate(@Nullable LongWritable time, @Nullable LongWritable threshold, + @Nullable Text subject) { + if (time == null || threshold == null || subject == null) { + return null; + } + + if (subject.equals(lastSubject)) { + final long thisTime = time.get(); + final long diff = thisTime - lastTime; + if (diff < threshold.get()) { + this.lastTime = thisTime; + return sessionId; + } + } else { + if (lastSubject == null) { + lastSubject = new Text(subject); + } else { + lastSubject.set(subject); + } + } + + 
sessionId.set(UUID.randomUUID().toString()); + this.lastTime = time.get(); + return sessionId; + } + +} http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/49496032/core/src/main/java/hivemall/tools/json/FromJsonUDF.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/hivemall/tools/json/FromJsonUDF.java b/core/src/main/java/hivemall/tools/json/FromJsonUDF.java index 36c29cc..97409dd 100644 --- a/core/src/main/java/hivemall/tools/json/FromJsonUDF.java +++ b/core/src/main/java/hivemall/tools/json/FromJsonUDF.java @@ -43,9 +43,47 @@ import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils; import org.apache.hadoop.io.Text; import org.apache.hive.hcatalog.data.HCatRecordObjectInspectorFactory; +// @formatter:off @Description(name = "from_json", value = "_FUNC_(string jsonString, const string returnTypes [, const array<string>|const string columnNames])" - + " - Return Hive object.") + + " - Return Hive object.", + extended = "SELECT\n" + + " from_json(\n" + + " '{ \"person\" : { \"name\" : \"makoto\" , \"age\" : 37 } }',\n" + + " 'struct<name:string,age:int>', \n" + + " array('person')\n" + + " ),\n" + + " from_json(\n" + + " '[0.1,1.1,2.2]',\n" + + " 'array<double>'\n" + + " ),\n" + + " from_json(to_json(\n" + + " ARRAY(\n" + + " NAMED_STRUCT(\"country\", \"japan\", \"city\", \"tokyo\"), \n" + + " NAMED_STRUCT(\"country\", \"japan\", \"city\", \"osaka\")\n" + + " )\n" + + " ),'array<struct<country:string,city:string>>'),\n" + + " from_json(to_json(\n" + + " ARRAY(\n" + + " NAMED_STRUCT(\"country\", \"japan\", \"city\", \"tokyo\"), \n" + + " NAMED_STRUCT(\"country\", \"japan\", \"city\", \"osaka\")\n" + + " ),\n" + + " array('city')\n" + + " ), 'array<struct<country:string,city:string>>'),\n" + + " from_json(to_json(\n" + + " ARRAY(\n" + + " NAMED_STRUCT(\"country\", \"japan\", \"city\", \"tokyo\"), \n" + + " NAMED_STRUCT(\"country\", \"japan\", \"city\", \"osaka\")\n" + + " )\n" + + " 
),'array<struct<city:string>>');\n" + + "```\n\n" + + "```\n" + + " {\"name\":\"makoto\",\"age\":37}\n" + + " [0.1,1.1,2.2]\n" + + " [{\"country\":\"japan\",\"city\":\"tokyo\"},{\"country\":\"japan\",\"city\":\"osaka\"}]\n" + + " [{\"country\":\"japan\",\"city\":\"tokyo\"},{\"country\":\"japan\",\"city\":\"osaka\"}]\n" + + " [{\"city\":\"tokyo\"},{\"city\":\"osaka\"}]") +//@formatter:on @UDFType(deterministic = true, stateful = false) public final class FromJsonUDF extends GenericUDF {
