Repository: incubator-hivemall Updated Branches: refs/heads/master 3960cf2cd -> fc9694d78
Close #125: [HIVEMALL-18] approx_distinct_count UDAF using HyperLogLog++ #125 Project: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/commit/fc9694d7 Tree: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/tree/fc9694d7 Diff: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/diff/fc9694d7 Branch: refs/heads/master Commit: fc9694d78c0f16331ae19690ae5cef58e631417e Parents: 3960cf2 Author: Makoto Yui <[email protected]> Authored: Tue Nov 21 21:49:24 2017 +0900 Committer: Makoto Yui <[email protected]> Committed: Tue Nov 21 21:49:30 2017 +0900 ---------------------------------------------------------------------- core/pom.xml | 7 + .../java/hivemall/evaluation/FMeasureUDAF.java | 4 +- .../sketch/hll/ApproxCountDistinctUDAF.java | 253 +++++++++++++++++++ .../java/hivemall/utils/hadoop/HiveUtils.java | 55 ++-- docs/gitbook/SUMMARY.md | 1 + docs/gitbook/misc/approx.md | 86 +++++++ docs/gitbook/misc/tokenizer.md | 4 +- resources/ddl/define-all-as-permanent.hive | 8 +- resources/ddl/define-all.hive | 7 + resources/ddl/define-all.spark | 8 + resources/ddl/define-udfs.td.hql | 6 +- 11 files changed, 410 insertions(+), 29 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/fc9694d7/core/pom.xml ---------------------------------------------------------------------- diff --git a/core/pom.xml b/core/pom.xml index 838677c..59af7e5 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -148,6 +148,12 @@ <version>[7.2.1,7.3)</version> <scope>compile</scope> </dependency> + <dependency> + <groupId>com.clearspring.analytics</groupId> + <artifactId>stream</artifactId> + <version>2.9.5</version> + <scope>compile</scope> + </dependency> <!-- test scope --> <dependency> @@ -219,6 +225,7 @@ <include>org.apache.commons:commons-math3</include> <include>org.roaringbitmap:RoaringBitmap</include> 
<include>it.unimi.dsi:fastutil</include> + <include>com.clearspring.analytics:stream</include> </includes> </artifactSet> <transformers> http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/fc9694d7/core/src/main/java/hivemall/evaluation/FMeasureUDAF.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/hivemall/evaluation/FMeasureUDAF.java b/core/src/main/java/hivemall/evaluation/FMeasureUDAF.java index e64dc12..22c0b7f 100644 --- a/core/src/main/java/hivemall/evaluation/FMeasureUDAF.java +++ b/core/src/main/java/hivemall/evaluation/FMeasureUDAF.java @@ -82,7 +82,7 @@ public final class FMeasureUDAF extends AbstractGenericUDAFResolver { "The second argument `array/int/boolean predicted` is invalid form: " + typeInfo[1]); } - if (typeInfo[0] != typeInfo[1]) { + if (!typeInfo[0].equals(typeInfo[1])) { throw new UDFArgumentTypeException(1, "The first argument `actual`'s type is " + typeInfo[0] + ", but the second argument `predicted`'s type is not match: " + typeInfo[1]); @@ -158,7 +158,7 @@ public final class FMeasureUDAF extends AbstractGenericUDAFResolver { // initialize input if (mode == Mode.PARTIAL1 || mode == Mode.COMPLETE) {// from original data - this.processOptions(parameters); + processOptions(parameters); this.actualOI = parameters[0]; this.predictedOI = parameters[1]; } else {// from partial aggregation http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/fc9694d7/core/src/main/java/hivemall/sketch/hll/ApproxCountDistinctUDAF.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/hivemall/sketch/hll/ApproxCountDistinctUDAF.java b/core/src/main/java/hivemall/sketch/hll/ApproxCountDistinctUDAF.java new file mode 100644 index 0000000..79b4ee9 --- /dev/null +++ b/core/src/main/java/hivemall/sketch/hll/ApproxCountDistinctUDAF.java @@ -0,0 +1,253 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more 
contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package hivemall.sketch.hll; + +import hivemall.UDAFEvaluatorWithOptions; +import hivemall.utils.hadoop.HiveUtils; +import hivemall.utils.lang.Preconditions; +import hivemall.utils.lang.Primitives; + +import java.io.IOException; + +import javax.annotation.Nonnegative; +import javax.annotation.Nonnull; +import javax.annotation.Nullable; + +import org.apache.commons.cli.CommandLine; +import org.apache.commons.cli.Options; +import org.apache.hadoop.hive.ql.exec.Description; +import org.apache.hadoop.hive.ql.exec.UDFArgumentException; +import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.parse.SemanticException; +import org.apache.hadoop.hive.ql.udf.generic.AbstractGenericUDAFResolver; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator.AbstractAggregationBuffer; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator.AggregationType; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils; +import 
org.apache.hadoop.hive.serde2.objectinspector.primitive.BinaryObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; +import org.apache.hadoop.io.LongWritable; + +import com.clearspring.analytics.stream.cardinality.CardinalityMergeException; +import com.clearspring.analytics.stream.cardinality.HyperLogLogPlus; + +@Description(name = "approx_count_distinct", value = "_FUNC_(expr x [, const string options])" + + " - Returns an approximation of count(DISTINCT x) using HyperLogLogPlus algorithm") +public final class ApproxCountDistinctUDAF extends AbstractGenericUDAFResolver { + + @Override + public GenericUDAFEvaluator getEvaluator(@Nonnull TypeInfo[] typeInfo) + throws SemanticException { + if (typeInfo.length != 1 && typeInfo.length != 2) { + throw new UDFArgumentTypeException(typeInfo.length - 1, + "_FUNC_ takes one or two arguments"); + } + if (typeInfo.length == 2 && !HiveUtils.isStringTypeInfo(typeInfo[1])) { + throw new UDFArgumentTypeException(1, + "The second argument type expected to be const string: " + typeInfo[1]); + } + + return new HLLEvaluator(); + } + + public static final class HLLEvaluator extends UDAFEvaluatorWithOptions { + + @Nullable + private int[] params; + + private ObjectInspector origInputOI; + private BinaryObjectInspector mergeInputOI; + + @Override + protected Options getOptions() { + Options opts = new Options(); + opts.addOption("p", true, + "The size of registers for the normal set. `p` MUST be in the range [4,sp] and 15 by the default"); + opts.addOption("sp", true, + "The size of registers for the sparse set. 
`sp` MUST be in the range [4,32] and 25 by the default"); + return opts; + } + + @Override + protected CommandLine processOptions(@Nonnull ObjectInspector[] argOIs) + throws UDFArgumentException { + CommandLine cl = null; + + int p = 15, sp = 25; + if (argOIs.length == 2) { + if (!HiveUtils.isConstString(argOIs[1])) { + throw new UDFArgumentException( + "The second argument type expected to be const string: " + argOIs[1]); + } + cl = parseOptions(HiveUtils.getConstString(argOIs[1])); + + p = Primitives.parseInt(cl.getOptionValue("p"), p); + sp = Primitives.parseInt(cl.getOptionValue("sp"), sp); + validateArguments(p, sp); + } + + this.params = new int[] {p, sp}; + + return cl; + } + + @Override + public ObjectInspector init(@Nonnull Mode mode, @Nonnull ObjectInspector[] parameters) + throws HiveException { + assert (parameters.length == 1 || parameters.length == 2) : parameters.length; + super.init(mode, parameters); + + // initialize input + if (mode == Mode.PARTIAL1 || mode == Mode.COMPLETE) {// from original data + processOptions(parameters); + this.origInputOI = parameters[0]; + } else {// from partial aggregation + this.mergeInputOI = HiveUtils.asBinaryOI(parameters[0]); + } + + // initialize output + final ObjectInspector outputOI; + if (mode == Mode.PARTIAL1 || mode == Mode.PARTIAL2) {// terminatePartial + outputOI = PrimitiveObjectInspectorFactory.javaByteArrayObjectInspector; + } else {// terminate + outputOI = PrimitiveObjectInspectorFactory.writableLongObjectInspector; + } + return outputOI; + } + + @Override + public HLLBuffer getNewAggregationBuffer() throws HiveException { + HLLBuffer buf = new HLLBuffer(); + if (params != null) { + buf.reset(params[0], params[1]); + } + return buf; + } + + @SuppressWarnings("deprecation") + @Override + public void reset(@Nonnull AggregationBuffer agg) throws HiveException { + HLLBuffer buf = (HLLBuffer) agg; + if (params != null) { + buf.reset(params[0], params[1]); + } else { + buf.hll = null; + } + } + + 
@SuppressWarnings("deprecation") + @Override + public void iterate(@Nonnull AggregationBuffer agg, @Nonnull Object[] parameters) + throws HiveException { + if (parameters[0] == null) { + return; + } + + HLLBuffer buf = (HLLBuffer) agg; + Object value = + ObjectInspectorUtils.copyToStandardJavaObject(parameters[0], origInputOI); + Preconditions.checkNotNull(buf.hll, HiveException.class); + buf.hll.offer(value); + } + + @SuppressWarnings("deprecation") + @Override + @Nullable + public byte[] terminatePartial(@Nonnull AggregationBuffer agg) throws HiveException { + HLLBuffer buf = (HLLBuffer) agg; + if (buf.hll == null) { + return null; + } + try { + return buf.hll.getBytes(); + } catch (IOException e) { + throw new HiveException(e); + } + } + + @SuppressWarnings("deprecation") + @Override + public void merge(@Nonnull AggregationBuffer agg, @Nullable Object partial) + throws HiveException { + if (partial == null) { + return; + } + + byte[] data = mergeInputOI.getPrimitiveJavaObject(partial); + final HyperLogLogPlus otherHLL; + try { + otherHLL = HyperLogLogPlus.Builder.build(data); + } catch (IOException e) { + throw new HiveException("Failed to build other HLL"); + } + + final HLLBuffer buf = (HLLBuffer) agg; + if (buf.hll == null) { + buf.hll = otherHLL; + } else { + try { + buf.hll.addAll(otherHLL); + } catch (CardinalityMergeException e) { + throw new HiveException("Failed to merge HLL"); + } + } + } + + @SuppressWarnings("deprecation") + @Override + public LongWritable terminate(@Nonnull AggregationBuffer agg) throws HiveException { + HLLBuffer buf = (HLLBuffer) agg; + + long cardinarity = (buf.hll == null) ? 
0L : buf.hll.cardinality(); + return new LongWritable(cardinarity); + } + + } + + private static void validateArguments(final int p, final int sp) throws UDFArgumentException { + if (p < 4 || p > sp) { + throw new UDFArgumentException("p must be between 4 and sp (inclusive)"); + } + if (sp > 32) { + throw new UDFArgumentException("sp values greater than 32 not supported"); + } + } + + @AggregationType(estimable = true) + static final class HLLBuffer extends AbstractAggregationBuffer { + + @Nullable + private HyperLogLogPlus hll; + + HLLBuffer() {} + + @Override + public int estimate() { + return (hll == null) ? 0 : hll.sizeof(); + } + + void reset(@Nonnegative int p, @Nonnegative int sp) { + this.hll = new HyperLogLogPlus(p, sp); + } + + } +} http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/fc9694d7/core/src/main/java/hivemall/utils/hadoop/HiveUtils.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/hivemall/utils/hadoop/HiveUtils.java b/core/src/main/java/hivemall/utils/hadoop/HiveUtils.java index b8b344c..e9a1efb 100644 --- a/core/src/main/java/hivemall/utils/hadoop/HiveUtils.java +++ b/core/src/main/java/hivemall/utils/hadoop/HiveUtils.java @@ -102,8 +102,8 @@ public final class HiveUtils { if (o instanceof LongWritable) { long l = ((LongWritable) o).get(); if (l > 0x7fffffffL) { - throw new IllegalArgumentException("feature index must be less than " - + Integer.MAX_VALUE + ", but was " + l); + throw new IllegalArgumentException( + "feature index must be less than " + Integer.MAX_VALUE + ", but was " + l); } return (int) l; } @@ -256,6 +256,11 @@ public final class HiveUtils { return BOOLEAN_TYPE_NAME.equals(typeName); } + public static boolean isBinaryOI(@Nonnull final ObjectInspector oi) { + String typeName = oi.getTypeName(); + return BINARY_TYPE_NAME.equals(typeName); + } + public static boolean isNumberOI(@Nonnull final ObjectInspector argOI) { if (argOI.getCategory() != 
Category.PRIMITIVE) { return false; @@ -486,8 +491,8 @@ public final class HiveUtils { } ConstantObjectInspector constOI = (ConstantObjectInspector) oi; if (constOI.getCategory() != Category.LIST) { - throw new UDFArgumentException("argument must be an array: " - + TypeInfoUtils.getTypeInfoFromObjectInspector(oi)); + throw new UDFArgumentException( + "argument must be an array: " + TypeInfoUtils.getTypeInfoFromObjectInspector(oi)); } final List<?> lst = (List<?>) constOI.getWritableConstantValue(); if (lst == null) { @@ -513,11 +518,12 @@ public final class HiveUtils { } ConstantObjectInspector constOI = (ConstantObjectInspector) oi; if (constOI.getCategory() != Category.LIST) { - throw new UDFArgumentException("argument must be an array: " - + TypeInfoUtils.getTypeInfoFromObjectInspector(oi)); + throw new UDFArgumentException( + "argument must be an array: " + TypeInfoUtils.getTypeInfoFromObjectInspector(oi)); } StandardConstantListObjectInspector listOI = (StandardConstantListObjectInspector) constOI; - PrimitiveObjectInspector elemOI = HiveUtils.asDoubleCompatibleOI(listOI.getListElementObjectInspector()); + PrimitiveObjectInspector elemOI = + HiveUtils.asDoubleCompatibleOI(listOI.getListElementObjectInspector()); final List<?> lst = listOI.getWritableConstantValue(); if (lst == null) { @@ -778,8 +784,8 @@ public final class HiveUtils { } final int length = listOI.getListLength(argObj); if (out.length != length) { - throw new UDFArgumentException("Dimension mismatched. Expected: " + out.length - + ", Actual: " + length); + throw new UDFArgumentException( + "Dimension mismatched. Expected: " + out.length + ", Actual: " + length); } for (int i = 0; i < length; i++) { Object o = listOI.getListElement(argObj, i); @@ -804,8 +810,8 @@ public final class HiveUtils { } final int length = listOI.getListLength(argObj); if (out.length != length) { - throw new UDFArgumentException("Dimension mismatched. 
Expected: " + out.length - + ", Actual: " + length); + throw new UDFArgumentException( + "Dimension mismatched. Expected: " + out.length + ", Actual: " + length); } for (int i = 0; i < length; i++) { Object o = listOI.getListElement(argObj, i); @@ -940,8 +946,8 @@ public final class HiveUtils { case STRING: break; default: - throw new UDFArgumentTypeException(0, "Unxpected type '" + argOI.getTypeName() - + "' is passed."); + throw new UDFArgumentTypeException(0, + "Unxpected type '" + argOI.getTypeName() + "' is passed."); } return oi; } @@ -967,8 +973,8 @@ public final class HiveUtils { case TIMESTAMP: break; default: - throw new UDFArgumentTypeException(0, "Unxpected type '" + argOI.getTypeName() - + "' is passed."); + throw new UDFArgumentTypeException(0, + "Unxpected type '" + argOI.getTypeName() + "' is passed."); } return oi; } @@ -988,15 +994,15 @@ public final class HiveUtils { case BYTE: break; default: - throw new UDFArgumentTypeException(0, "Unxpected type '" + argOI.getTypeName() - + "' is passed."); + throw new UDFArgumentTypeException(0, + "Unxpected type '" + argOI.getTypeName() + "' is passed."); } return oi; } @Nonnull - public static PrimitiveObjectInspector asDoubleCompatibleOI(@Nonnull final ObjectInspector argOI) - throws UDFArgumentTypeException { + public static PrimitiveObjectInspector asDoubleCompatibleOI( + @Nonnull final ObjectInspector argOI) throws UDFArgumentTypeException { if (argOI.getCategory() != Category.PRIMITIVE) { throw new UDFArgumentTypeException(0, "Only primitive type arguments are accepted but " + argOI.getTypeName() + " is passed."); @@ -1159,8 +1165,8 @@ public final class HiveUtils { @Nonnull public static LazyString lazyString(@Nonnull final String str, final byte escapeChar) { - LazyStringObjectInspector oi = LazyPrimitiveObjectInspectorFactory.getLazyStringObjectInspector( - false, escapeChar); + LazyStringObjectInspector oi = + LazyPrimitiveObjectInspectorFactory.getLazyStringObjectInspector(false, escapeChar); 
return lazyString(str, oi); } @@ -1177,15 +1183,16 @@ public final class HiveUtils { @Nonnull public static LazyInteger lazyInteger(@Nonnull final int v) { - LazyInteger lazy = new LazyInteger( - LazyPrimitiveObjectInspectorFactory.LAZY_INT_OBJECT_INSPECTOR); + LazyInteger lazy = + new LazyInteger(LazyPrimitiveObjectInspectorFactory.LAZY_INT_OBJECT_INSPECTOR); lazy.getWritableObject().set(v); return lazy; } @Nonnull public static LazyLong lazyLong(@Nonnull final long v) { - LazyLong lazy = new LazyLong(LazyPrimitiveObjectInspectorFactory.LAZY_LONG_OBJECT_INSPECTOR); + LazyLong lazy = + new LazyLong(LazyPrimitiveObjectInspectorFactory.LAZY_LONG_OBJECT_INSPECTOR); lazy.getWritableObject().set(v); return lazy; } http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/fc9694d7/docs/gitbook/SUMMARY.md ---------------------------------------------------------------------- diff --git a/docs/gitbook/SUMMARY.md b/docs/gitbook/SUMMARY.md index 8b76a7f..0d30ba0 100644 --- a/docs/gitbook/SUMMARY.md +++ b/docs/gitbook/SUMMARY.md @@ -50,6 +50,7 @@ * [List of generic Hivemall functions](misc/generic_funcs.md) * [Efficient Top-K query processing](misc/topk.md) * [Text Tokenizer](misc/tokenizer.md) +* [Approximate Aggregate Functions](misc/approx.md) ## Part III - Feature Engineering http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/fc9694d7/docs/gitbook/misc/approx.md ---------------------------------------------------------------------- diff --git a/docs/gitbook/misc/approx.md b/docs/gitbook/misc/approx.md new file mode 100644 index 0000000..2e365de --- /dev/null +++ b/docs/gitbook/misc/approx.md @@ -0,0 +1,86 @@ +<!-- + Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. 
The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. +--> + +<!-- toc --> + +# Approximate Counting using HyperLogLog + +`count(distinct value)` can often cause memory-exhausted errors when the input data and the cardinality of the value are large. + +[HyperLogLog](https://en.wikipedia.org/wiki/HyperLogLog) is an efficient algorithm for approximating the number of distinct elements in a [multiset](https://en.wikipedia.org/wiki/Multiset). +Hivemall implements [HyperLogLog++](https://en.wikipedia.org/wiki/HyperLogLog#HLL.2B.2B) in `approx_count_distinct`. + +## Usage + +`approx_count_distinct` is less accurate than COUNT(DISTINCT expression), but performs better on huge input. 
+ +```sql +select + count(distinct rowid) as actual, + approx_count_distinct(rowid) as default_p +from + train; +``` + +| actual | default_p | +|:------:|:---------:| +| 45840617 | 45567770 | + + +```sql +select + approx_count_distinct(rowid, '-p 4') as p4, + approx_count_distinct(rowid, '-p 6 -sp 6') as p6_sp6, + approx_count_distinct(rowid, '-p 14') as p14, + approx_count_distinct(rowid, '-p 15') as p15, + approx_count_distinct(rowid, '-p 16') as p16, + approx_count_distinct(rowid, '-p 24') as p24, + approx_count_distinct(rowid, '-p 25') as p25, + approx_count_distinct(rowid, '-p 15 -sp 15') as p15_sp15 +from + train; +``` + +| p4 | p6_sp6 | p14 | p15 | p16 | p24 | p25 | p15_sp15 | +|:--:|:------:|:---:|:---:|:---:|:---:|:---:|:--------:| +| 38033066 | 49332600 | 45051015 | 45567770 | 45614484 | 45831359 | 45832280 | 45567770 | + +> #### Note +> +> `p` controls the tradeoff between expected precision and memory consumption, and the default `p=15` generally works well. Find more information in [this paper](https://research.google.com/pubs/pub40671.html). + +## Function Signature + +The function signature and options of `approx_count_distinct` can be shown as follows: + +```sql +select + approx_count_distinct(rowid, '-help') +from + train; +``` + +``` +usage: HLLEvaluator [-help] [-p <arg>] [-sp <arg>] + -help Show function help + -p <arg> The size of registers for the normal set. `p` MUST be in the + range [4,sp] and 15 by the default + -sp <arg> The size of registers for the sparse set. 
`sp` MUST be in the + range [4,32] and 25 by the default +``` http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/fc9694d7/docs/gitbook/misc/tokenizer.md ---------------------------------------------------------------------- diff --git a/docs/gitbook/misc/tokenizer.md b/docs/gitbook/misc/tokenizer.md index b056874..b691230 100644 --- a/docs/gitbook/misc/tokenizer.md +++ b/docs/gitbook/misc/tokenizer.md @@ -16,7 +16,9 @@ specific language governing permissions and limitations under the License. --> - + +<!-- toc --> + # Tokenizer for English Texts Hivemall provides simple English text tokenizer UDF that has following syntax: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/fc9694d7/resources/ddl/define-all-as-permanent.hive ---------------------------------------------------------------------- diff --git a/resources/ddl/define-all-as-permanent.hive b/resources/ddl/define-all-as-permanent.hive index b0107af..fa307d5 100644 --- a/resources/ddl/define-all-as-permanent.hive +++ b/resources/ddl/define-all-as-permanent.hive @@ -735,6 +735,13 @@ CREATE FUNCTION guess_attribute_types as 'hivemall.smile.tools.GuessAttributesUD DROP FUNCTION IF EXISTS train_slim; CREATE FUNCTION train_slim as 'hivemall.recommend.SlimUDTF' USING JAR '${hivemall_jar}'; +----------------- +-- Data Sketch -- +----------------- + +DROP FUNCTION IF EXISTS approx_count_distinct; +CREATE FUNCTION approx_count_distinct as 'hivemall.sketch.hll.ApproxCountDistinctUDAF' USING JAR '${hivemall_jar}'; + ------------------------------ -- XGBoost related features -- ------------------------------ @@ -753,4 +760,3 @@ CREATE FUNCTION xgboost_predict AS 'hivemall.xgboost.tools.XGBoostPredictUDTF' U DROP FUNCTION xgboost_multiclass_predict; CREATE FUNCTION xgboost_multiclass_predict AS 'hivemall.xgboost.tools.XGBoostMulticlassPredictUDTF' USING JAR '${hivemall_jar}'; -======= http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/fc9694d7/resources/ddl/define-all.hive 
---------------------------------------------------------------------- diff --git a/resources/ddl/define-all.hive b/resources/ddl/define-all.hive index 4f91f79..13abe76 100644 --- a/resources/ddl/define-all.hive +++ b/resources/ddl/define-all.hive @@ -727,6 +727,13 @@ create temporary function guess_attribute_types as 'hivemall.smile.tools.GuessAt drop temporary function if exists train_slim; create temporary function train_slim as 'hivemall.recommend.SlimUDTF'; +----------------- +-- Data Sketch -- +----------------- + +drop temporary function if exists approx_count_distinct; +create temporary function approx_count_distinct as 'hivemall.sketch.hll.ApproxCountDistinctUDAF'; + -------------------------------------------------------------------------------------------------- -- macros available from hive 0.12.0 -- see https://issues.apache.org/jira/browse/HIVE-2655 http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/fc9694d7/resources/ddl/define-all.spark ---------------------------------------------------------------------- diff --git a/resources/ddl/define-all.spark b/resources/ddl/define-all.spark index 02f92ec..67e3765 100644 --- a/resources/ddl/define-all.spark +++ b/resources/ddl/define-all.spark @@ -712,3 +712,11 @@ sqlContext.sql("CREATE TEMPORARY FUNCTION train_gradient_tree_boosting_classifie sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS train_slim") sqlContext.sql("CREATE TEMPORARY FUNCTION train_slim AS 'hivemall.recommend.SlimUDTF'") + +/** + * Data Sketch + */ + +sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS approx_count_distinct") +sqlContext.sql("CREATE TEMPORARY FUNCTION approx_count_distinct AS 'hivemall.sketch.hll.ApproxCountDistinctUDAF'") + http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/fc9694d7/resources/ddl/define-udfs.td.hql ---------------------------------------------------------------------- diff --git a/resources/ddl/define-udfs.td.hql b/resources/ddl/define-udfs.td.hql index 28b77cb..5bcd366 100644 --- 
a/resources/ddl/define-udfs.td.hql +++ b/resources/ddl/define-udfs.td.hql @@ -156,7 +156,7 @@ create temporary function train_randomforest_regr as 'hivemall.smile.regression. create temporary function tree_predict as 'hivemall.smile.tools.TreePredictUDF'; create temporary function rf_ensemble as 'hivemall.smile.tools.RandomForestEnsembleUDAF'; create temporary function guess_attribute_types as 'hivemall.smile.tools.GuessAttributesUDF'; --- since Hivemall v0.5-rc.1 +-- since Hivemall v0.5 create temporary function changefinder as 'hivemall.anomaly.ChangeFinderUDF'; create temporary function sst as 'hivemall.anomaly.SingularSpectrumTransformUDF'; create temporary function train_lda as 'hivemall.topicmodel.LDAUDTF'; @@ -183,6 +183,7 @@ create temporary function singularize as 'hivemall.tools.text.SingularizeUDF'; create temporary function train_slim as 'hivemall.recommend.SlimUDTF'; create temporary function hitrate as 'hivemall.evaluation.HitRateUDAF'; create temporary function word_ngrams as 'hivemall.tools.text.WordNgramsUDF'; +create temporary function approx_count_distinct as 'hivemall.sketch.hll.ApproxCountDistinctUDAF'; -- NLP features create temporary function tokenize_ja as 'hivemall.nlp.tokenizer.KuromojiUDF'; @@ -195,3 +196,6 @@ create temporary function arow_regress as 'hivemall.regression.AROWRegressionUDT create temporary function addBias as 'hivemall.ftvec.AddBiasUDF'; create temporary function tree_predict_v1 as 'hivemall.smile.tools.TreePredictUDFv1'; create temporary function add_field_indicies as 'hivemall.ftvec.trans.AddFieldIndicesUDF'; + +-- alias for TD +create temporary function approx_distinct as 'hivemall.sketch.hll.ApproxCountDistinctUDAF';
