Repository: incubator-hivemall
Updated Branches:
  refs/heads/master 3960cf2cd -> fc9694d78


Close #125: [HIVEMALL-18] approx_distinct_count UDAF using HyperLogLog++ #125


Project: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/repo
Commit: 
http://git-wip-us.apache.org/repos/asf/incubator-hivemall/commit/fc9694d7
Tree: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/tree/fc9694d7
Diff: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/diff/fc9694d7

Branch: refs/heads/master
Commit: fc9694d78c0f16331ae19690ae5cef58e631417e
Parents: 3960cf2
Author: Makoto Yui <[email protected]>
Authored: Tue Nov 21 21:49:24 2017 +0900
Committer: Makoto Yui <[email protected]>
Committed: Tue Nov 21 21:49:30 2017 +0900

----------------------------------------------------------------------
 core/pom.xml                                    |   7 +
 .../java/hivemall/evaluation/FMeasureUDAF.java  |   4 +-
 .../sketch/hll/ApproxCountDistinctUDAF.java     | 253 +++++++++++++++++++
 .../java/hivemall/utils/hadoop/HiveUtils.java   |  55 ++--
 docs/gitbook/SUMMARY.md                         |   1 +
 docs/gitbook/misc/approx.md                     |  86 +++++++
 docs/gitbook/misc/tokenizer.md                  |   4 +-
 resources/ddl/define-all-as-permanent.hive      |   8 +-
 resources/ddl/define-all.hive                   |   7 +
 resources/ddl/define-all.spark                  |   8 +
 resources/ddl/define-udfs.td.hql                |   6 +-
 11 files changed, 410 insertions(+), 29 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/fc9694d7/core/pom.xml
----------------------------------------------------------------------
diff --git a/core/pom.xml b/core/pom.xml
index 838677c..59af7e5 100644
--- a/core/pom.xml
+++ b/core/pom.xml
@@ -148,6 +148,12 @@
                        <version>[7.2.1,7.3)</version>
                        <scope>compile</scope>
                </dependency>
+               <dependency>
+                 <groupId>com.clearspring.analytics</groupId>
+                 <artifactId>stream</artifactId>
+                 <version>2.9.5</version>
+                 <scope>compile</scope>
+               </dependency>
 
                <!-- test scope -->
                <dependency>
@@ -219,6 +225,7 @@
                                                                        
<include>org.apache.commons:commons-math3</include>
                                                                        
<include>org.roaringbitmap:RoaringBitmap</include>
                                                                        
<include>it.unimi.dsi:fastutil</include>
+                                                                       
<include>com.clearspring.analytics:stream</include>
                                                                </includes>
                                                        </artifactSet>
                                                        <transformers>

http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/fc9694d7/core/src/main/java/hivemall/evaluation/FMeasureUDAF.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/hivemall/evaluation/FMeasureUDAF.java 
b/core/src/main/java/hivemall/evaluation/FMeasureUDAF.java
index e64dc12..22c0b7f 100644
--- a/core/src/main/java/hivemall/evaluation/FMeasureUDAF.java
+++ b/core/src/main/java/hivemall/evaluation/FMeasureUDAF.java
@@ -82,7 +82,7 @@ public final class FMeasureUDAF extends 
AbstractGenericUDAFResolver {
                 "The second argument `array/int/boolean predicted` is invalid 
form: " + typeInfo[1]);
         }
 
-        if (typeInfo[0] != typeInfo[1]) {
+        if (!typeInfo[0].equals(typeInfo[1])) {
             throw new UDFArgumentTypeException(1, "The first argument 
`actual`'s type is "
                     + typeInfo[0] + ", but the second argument `predicted`'s 
type is not match: "
                     + typeInfo[1]);
@@ -158,7 +158,7 @@ public final class FMeasureUDAF extends 
AbstractGenericUDAFResolver {
 
             // initialize input
             if (mode == Mode.PARTIAL1 || mode == Mode.COMPLETE) {// from 
original data
-                this.processOptions(parameters);
+                processOptions(parameters);
                 this.actualOI = parameters[0];
                 this.predictedOI = parameters[1];
             } else {// from partial aggregation

http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/fc9694d7/core/src/main/java/hivemall/sketch/hll/ApproxCountDistinctUDAF.java
----------------------------------------------------------------------
diff --git 
a/core/src/main/java/hivemall/sketch/hll/ApproxCountDistinctUDAF.java 
b/core/src/main/java/hivemall/sketch/hll/ApproxCountDistinctUDAF.java
new file mode 100644
index 0000000..79b4ee9
--- /dev/null
+++ b/core/src/main/java/hivemall/sketch/hll/ApproxCountDistinctUDAF.java
@@ -0,0 +1,253 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package hivemall.sketch.hll;
+
+import hivemall.UDAFEvaluatorWithOptions;
+import hivemall.utils.hadoop.HiveUtils;
+import hivemall.utils.lang.Preconditions;
+import hivemall.utils.lang.Primitives;
+
+import java.io.IOException;
+
+import javax.annotation.Nonnegative;
+import javax.annotation.Nonnull;
+import javax.annotation.Nullable;
+
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.Options;
+import org.apache.hadoop.hive.ql.exec.Description;
+import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
+import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.ql.parse.SemanticException;
+import org.apache.hadoop.hive.ql.udf.generic.AbstractGenericUDAFResolver;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator;
+import 
org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator.AbstractAggregationBuffer;
+import 
org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator.AggregationType;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils;
+import 
org.apache.hadoop.hive.serde2.objectinspector.primitive.BinaryObjectInspector;
+import 
org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
+import org.apache.hadoop.io.LongWritable;
+
+import com.clearspring.analytics.stream.cardinality.CardinalityMergeException;
+import com.clearspring.analytics.stream.cardinality.HyperLogLogPlus;
+
+@Description(name = "approx_count_distinct", value = "_FUNC_(expr x [, const 
string options])"
+        + " - Returns an approximation of count(DISTINCT x) using 
HyperLogLogPlus algorithm")
+public final class ApproxCountDistinctUDAF extends AbstractGenericUDAFResolver 
{
+
+    @Override
+    public GenericUDAFEvaluator getEvaluator(@Nonnull TypeInfo[] typeInfo)
+            throws SemanticException {
+        if (typeInfo.length != 1 && typeInfo.length != 2) {
+            throw new UDFArgumentTypeException(typeInfo.length - 1,
+                "_FUNC_ takes one or two arguments");
+        }
+        if (typeInfo.length == 2 && !HiveUtils.isStringTypeInfo(typeInfo[1])) {
+            throw new UDFArgumentTypeException(1,
+                "The second argument type expected to be const string: " + 
typeInfo[1]);
+        }
+
+        return new HLLEvaluator();
+    }
+
+    public static final class HLLEvaluator extends UDAFEvaluatorWithOptions {
+
+        @Nullable
+        private int[] params;
+
+        private ObjectInspector origInputOI;
+        private BinaryObjectInspector mergeInputOI;
+
+        @Override
+        protected Options getOptions() {
+            Options opts = new Options();
+            opts.addOption("p", true,
+                "The size of registers for the normal set. `p` MUST be in the 
range [4,sp] and 15 by the default");
+            opts.addOption("sp", true,
+                "The size of registers for the sparse set. `sp` MUST be in the 
range [4,32] and 25 by the default");
+            return opts;
+        }
+
+        @Override
+        protected CommandLine processOptions(@Nonnull ObjectInspector[] argOIs)
+                throws UDFArgumentException {
+            CommandLine cl = null;
+
+            int p = 15, sp = 25;
+            if (argOIs.length == 2) {
+                if (!HiveUtils.isConstString(argOIs[1])) {
+                    throw new UDFArgumentException(
+                        "The second argument type expected to be const string: 
" + argOIs[1]);
+                }
+                cl = parseOptions(HiveUtils.getConstString(argOIs[1]));
+
+                p = Primitives.parseInt(cl.getOptionValue("p"), p);
+                sp = Primitives.parseInt(cl.getOptionValue("sp"), sp);
+                validateArguments(p, sp);
+            }
+
+            this.params = new int[] {p, sp};
+
+            return cl;
+        }
+
+        @Override
+        public ObjectInspector init(@Nonnull Mode mode, @Nonnull 
ObjectInspector[] parameters)
+                throws HiveException {
+            assert (parameters.length == 1 || parameters.length == 2) : 
parameters.length;
+            super.init(mode, parameters);
+
+            // initialize input
+            if (mode == Mode.PARTIAL1 || mode == Mode.COMPLETE) {// from 
original data
+                processOptions(parameters);
+                this.origInputOI = parameters[0];
+            } else {// from partial aggregation               
+                this.mergeInputOI = HiveUtils.asBinaryOI(parameters[0]);
+            }
+
+            // initialize output
+            final ObjectInspector outputOI;
+            if (mode == Mode.PARTIAL1 || mode == Mode.PARTIAL2) {// 
terminatePartial                
+                outputOI = 
PrimitiveObjectInspectorFactory.javaByteArrayObjectInspector;
+            } else {// terminate
+                outputOI = 
PrimitiveObjectInspectorFactory.writableLongObjectInspector;
+            }
+            return outputOI;
+        }
+
+        @Override
+        public HLLBuffer getNewAggregationBuffer() throws HiveException {
+            HLLBuffer buf = new HLLBuffer();
+            if (params != null) {
+                buf.reset(params[0], params[1]);
+            }
+            return buf;
+        }
+
+        @SuppressWarnings("deprecation")
+        @Override
+        public void reset(@Nonnull AggregationBuffer agg) throws HiveException 
{
+            HLLBuffer buf = (HLLBuffer) agg;
+            if (params != null) {
+                buf.reset(params[0], params[1]);
+            } else {
+                buf.hll = null;
+            }
+        }
+
+        @SuppressWarnings("deprecation")
+        @Override
+        public void iterate(@Nonnull AggregationBuffer agg, @Nonnull Object[] 
parameters)
+                throws HiveException {
+            if (parameters[0] == null) {
+                return;
+            }
+
+            HLLBuffer buf = (HLLBuffer) agg;
+            Object value =
+                    
ObjectInspectorUtils.copyToStandardJavaObject(parameters[0], origInputOI);
+            Preconditions.checkNotNull(buf.hll, HiveException.class);
+            buf.hll.offer(value);
+        }
+
+        @SuppressWarnings("deprecation")
+        @Override
+        @Nullable
+        public byte[] terminatePartial(@Nonnull AggregationBuffer agg) throws 
HiveException {
+            HLLBuffer buf = (HLLBuffer) agg;
+            if (buf.hll == null) {
+                return null;
+            }
+            try {
+                return buf.hll.getBytes();
+            } catch (IOException e) {
+                throw new HiveException(e);
+            }
+        }
+
+        @SuppressWarnings("deprecation")
+        @Override
+        public void merge(@Nonnull AggregationBuffer agg, @Nullable Object 
partial)
+                throws HiveException {
+            if (partial == null) {
+                return;
+            }
+
+            byte[] data = mergeInputOI.getPrimitiveJavaObject(partial);
+            final HyperLogLogPlus otherHLL;
+            try {
+                otherHLL = HyperLogLogPlus.Builder.build(data);
+            } catch (IOException e) {
+                throw new HiveException("Failed to build other HLL");
+            }
+
+            final HLLBuffer buf = (HLLBuffer) agg;
+            if (buf.hll == null) {
+                buf.hll = otherHLL;
+            } else {
+                try {
+                    buf.hll.addAll(otherHLL);
+                } catch (CardinalityMergeException e) {
+                    throw new HiveException("Failed to merge HLL");
+                }
+            }
+        }
+
+        @SuppressWarnings("deprecation")
+        @Override
+        public LongWritable terminate(@Nonnull AggregationBuffer agg) throws 
HiveException {
+            HLLBuffer buf = (HLLBuffer) agg;
+
+            long cardinarity = (buf.hll == null) ? 0L : buf.hll.cardinality();
+            return new LongWritable(cardinarity);
+        }
+
+    }
+
+    private static void validateArguments(final int p, final int sp) throws 
UDFArgumentException {
+        if (p < 4 || p > sp) {
+            throw new UDFArgumentException("p must be between 4 and sp 
(inclusive)");
+        }
+        if (sp > 32) {
+            throw new UDFArgumentException("sp values greater than 32 not 
supported");
+        }
+    }
+
+    @AggregationType(estimable = true)
+    static final class HLLBuffer extends AbstractAggregationBuffer {
+
+        @Nullable
+        private HyperLogLogPlus hll;
+
+        HLLBuffer() {}
+
+        @Override
+        public int estimate() {
+            return (hll == null) ? 0 : hll.sizeof();
+        }
+
+        void reset(@Nonnegative int p, @Nonnegative int sp) {
+            this.hll = new HyperLogLogPlus(p, sp);
+        }
+
+    }
+}

http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/fc9694d7/core/src/main/java/hivemall/utils/hadoop/HiveUtils.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/hivemall/utils/hadoop/HiveUtils.java 
b/core/src/main/java/hivemall/utils/hadoop/HiveUtils.java
index b8b344c..e9a1efb 100644
--- a/core/src/main/java/hivemall/utils/hadoop/HiveUtils.java
+++ b/core/src/main/java/hivemall/utils/hadoop/HiveUtils.java
@@ -102,8 +102,8 @@ public final class HiveUtils {
         if (o instanceof LongWritable) {
             long l = ((LongWritable) o).get();
             if (l > 0x7fffffffL) {
-                throw new IllegalArgumentException("feature index must be less 
than "
-                        + Integer.MAX_VALUE + ", but was " + l);
+                throw new IllegalArgumentException(
+                    "feature index must be less than " + Integer.MAX_VALUE + 
", but was " + l);
             }
             return (int) l;
         }
@@ -256,6 +256,11 @@ public final class HiveUtils {
         return BOOLEAN_TYPE_NAME.equals(typeName);
     }
 
+    public static boolean isBinaryOI(@Nonnull final ObjectInspector oi) {
+        String typeName = oi.getTypeName();
+        return BINARY_TYPE_NAME.equals(typeName);
+    }
+
     public static boolean isNumberOI(@Nonnull final ObjectInspector argOI) {
         if (argOI.getCategory() != Category.PRIMITIVE) {
             return false;
@@ -486,8 +491,8 @@ public final class HiveUtils {
         }
         ConstantObjectInspector constOI = (ConstantObjectInspector) oi;
         if (constOI.getCategory() != Category.LIST) {
-            throw new UDFArgumentException("argument must be an array: "
-                    + TypeInfoUtils.getTypeInfoFromObjectInspector(oi));
+            throw new UDFArgumentException(
+                "argument must be an array: " + 
TypeInfoUtils.getTypeInfoFromObjectInspector(oi));
         }
         final List<?> lst = (List<?>) constOI.getWritableConstantValue();
         if (lst == null) {
@@ -513,11 +518,12 @@ public final class HiveUtils {
         }
         ConstantObjectInspector constOI = (ConstantObjectInspector) oi;
         if (constOI.getCategory() != Category.LIST) {
-            throw new UDFArgumentException("argument must be an array: "
-                    + TypeInfoUtils.getTypeInfoFromObjectInspector(oi));
+            throw new UDFArgumentException(
+                "argument must be an array: " + 
TypeInfoUtils.getTypeInfoFromObjectInspector(oi));
         }
         StandardConstantListObjectInspector listOI = 
(StandardConstantListObjectInspector) constOI;
-        PrimitiveObjectInspector elemOI = 
HiveUtils.asDoubleCompatibleOI(listOI.getListElementObjectInspector());
+        PrimitiveObjectInspector elemOI =
+                
HiveUtils.asDoubleCompatibleOI(listOI.getListElementObjectInspector());
 
         final List<?> lst = listOI.getWritableConstantValue();
         if (lst == null) {
@@ -778,8 +784,8 @@ public final class HiveUtils {
         }
         final int length = listOI.getListLength(argObj);
         if (out.length != length) {
-            throw new UDFArgumentException("Dimension mismatched. Expected: " 
+ out.length
-                    + ", Actual: " + length);
+            throw new UDFArgumentException(
+                "Dimension mismatched. Expected: " + out.length + ", Actual: " 
+ length);
         }
         for (int i = 0; i < length; i++) {
             Object o = listOI.getListElement(argObj, i);
@@ -804,8 +810,8 @@ public final class HiveUtils {
         }
         final int length = listOI.getListLength(argObj);
         if (out.length != length) {
-            throw new UDFArgumentException("Dimension mismatched. Expected: " 
+ out.length
-                    + ", Actual: " + length);
+            throw new UDFArgumentException(
+                "Dimension mismatched. Expected: " + out.length + ", Actual: " 
+ length);
         }
         for (int i = 0; i < length; i++) {
             Object o = listOI.getListElement(argObj, i);
@@ -940,8 +946,8 @@ public final class HiveUtils {
             case STRING:
                 break;
             default:
-                throw new UDFArgumentTypeException(0, "Unxpected type '" + 
argOI.getTypeName()
-                        + "' is passed.");
+                throw new UDFArgumentTypeException(0,
+                    "Unxpected type '" + argOI.getTypeName() + "' is passed.");
         }
         return oi;
     }
@@ -967,8 +973,8 @@ public final class HiveUtils {
             case TIMESTAMP:
                 break;
             default:
-                throw new UDFArgumentTypeException(0, "Unxpected type '" + 
argOI.getTypeName()
-                        + "' is passed.");
+                throw new UDFArgumentTypeException(0,
+                    "Unxpected type '" + argOI.getTypeName() + "' is passed.");
         }
         return oi;
     }
@@ -988,15 +994,15 @@ public final class HiveUtils {
             case BYTE:
                 break;
             default:
-                throw new UDFArgumentTypeException(0, "Unxpected type '" + 
argOI.getTypeName()
-                        + "' is passed.");
+                throw new UDFArgumentTypeException(0,
+                    "Unxpected type '" + argOI.getTypeName() + "' is passed.");
         }
         return oi;
     }
 
     @Nonnull
-    public static PrimitiveObjectInspector asDoubleCompatibleOI(@Nonnull final 
ObjectInspector argOI)
-            throws UDFArgumentTypeException {
+    public static PrimitiveObjectInspector asDoubleCompatibleOI(
+            @Nonnull final ObjectInspector argOI) throws 
UDFArgumentTypeException {
         if (argOI.getCategory() != Category.PRIMITIVE) {
             throw new UDFArgumentTypeException(0, "Only primitive type 
arguments are accepted but "
                     + argOI.getTypeName() + " is passed.");
@@ -1159,8 +1165,8 @@ public final class HiveUtils {
 
     @Nonnull
     public static LazyString lazyString(@Nonnull final String str, final byte 
escapeChar) {
-        LazyStringObjectInspector oi = 
LazyPrimitiveObjectInspectorFactory.getLazyStringObjectInspector(
-            false, escapeChar);
+        LazyStringObjectInspector oi =
+                
LazyPrimitiveObjectInspectorFactory.getLazyStringObjectInspector(false, 
escapeChar);
         return lazyString(str, oi);
     }
 
@@ -1177,15 +1183,16 @@ public final class HiveUtils {
 
     @Nonnull
     public static LazyInteger lazyInteger(@Nonnull final int v) {
-        LazyInteger lazy = new LazyInteger(
-            LazyPrimitiveObjectInspectorFactory.LAZY_INT_OBJECT_INSPECTOR);
+        LazyInteger lazy =
+                new 
LazyInteger(LazyPrimitiveObjectInspectorFactory.LAZY_INT_OBJECT_INSPECTOR);
         lazy.getWritableObject().set(v);
         return lazy;
     }
 
     @Nonnull
     public static LazyLong lazyLong(@Nonnull final long v) {
-        LazyLong lazy = new 
LazyLong(LazyPrimitiveObjectInspectorFactory.LAZY_LONG_OBJECT_INSPECTOR);
+        LazyLong lazy =
+                new 
LazyLong(LazyPrimitiveObjectInspectorFactory.LAZY_LONG_OBJECT_INSPECTOR);
         lazy.getWritableObject().set(v);
         return lazy;
     }

http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/fc9694d7/docs/gitbook/SUMMARY.md
----------------------------------------------------------------------
diff --git a/docs/gitbook/SUMMARY.md b/docs/gitbook/SUMMARY.md
index 8b76a7f..0d30ba0 100644
--- a/docs/gitbook/SUMMARY.md
+++ b/docs/gitbook/SUMMARY.md
@@ -50,6 +50,7 @@
 * [List of generic Hivemall functions](misc/generic_funcs.md)
 * [Efficient Top-K query processing](misc/topk.md)
 * [Text Tokenizer](misc/tokenizer.md)
+* [Approximate Aggregate Functions](misc/approx.md)
 
 ## Part III - Feature Engineering
 

http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/fc9694d7/docs/gitbook/misc/approx.md
----------------------------------------------------------------------
diff --git a/docs/gitbook/misc/approx.md b/docs/gitbook/misc/approx.md
new file mode 100644
index 0000000..2e365de
--- /dev/null
+++ b/docs/gitbook/misc/approx.md
@@ -0,0 +1,86 @@
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied.  See the License for the
+  specific language governing permissions and limitations
+  under the License.
+-->
+
+<!-- toc -->
+
+# Approximate Counting using HyperLogLog
+
+`count(distinct value)` can often cause memory exhaustion errors when the input 
data and the cardinality of the value are large.
+
+[HyperLogLog](https://en.wikipedia.org/wiki/HyperLogLog) is an efficient 
algorithm for approximating the number of distinct elements in a 
[multiset](https://en.wikipedia.org/wiki/Multiset). 
+Hivemall implements 
[HyperLogLog++](https://en.wikipedia.org/wiki/HyperLogLog#HLL.2B.2B) in 
`approx_count_distinct`.
+
+## Usage
+
+`approx_count_distinct` is less accurate than COUNT(DISTINCT expression), but 
performs better on huge input.
+
+```sql
+select
+    count(distinct rowid) as actual,
+    approx_count_distinct(rowid) as default_p 
+from
+    train;
+```
+
+| actual | default_p |
+|:------:|:---------:|
+| 45840617 | 45567770 |
+
+
+```sql
+select
+    approx_count_distinct(rowid, '-p 4') as p4,
+    approx_count_distinct(rowid, '-p 6 -sp 6') as p6_sp6,
+    approx_count_distinct(rowid, '-p 14') as p14,
+    approx_count_distinct(rowid, '-p 15') as p15,
+    approx_count_distinct(rowid, '-p 16') as p16,
+    approx_count_distinct(rowid, '-p 24') as p24,
+    approx_count_distinct(rowid, '-p 25') as p25,
+    approx_count_distinct(rowid, '-p 15 -sp 15') as p15_sp15
+from
+    train;
+```
+
+| p4 | p6_sp6 | p14 | p15 | p16 | p24 | p25 | p15_sp15 |
+|:--:|:------:|:---:|:---:|:---:|:---:|:---:|:--------:|
+| 38033066 | 49332600 | 45051015 | 45567770 | 45614484 | 45831359 | 45832280 | 
45567770 |
+
+> #### Note
+>
+> `p` controls the expected precision and memory consumption tradeoff, and the 
default `p=15` generally works well. Find more information in [this 
paper](https://research.google.com/pubs/pub40671.html).
+
+## Function Signature
+
+The function signature and options of `approx_count_distinct` are as follows:
+
+```sql
+select 
+    approx_count_distinct(rowid, '-help')
+from
+    train;
+```
+
+```
+usage: HLLEvaluator [-help] [-p <arg>] [-sp <arg>]
+ -help       Show function help
+ -p <arg>    The size of registers for the normal set. `p` MUST be in the
+             range [4,sp] and 15 by the default
+ -sp <arg>   The size of registers for the sparse set. `sp` MUST be in the
+             range [4,32] and 25 by the default
+```

http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/fc9694d7/docs/gitbook/misc/tokenizer.md
----------------------------------------------------------------------
diff --git a/docs/gitbook/misc/tokenizer.md b/docs/gitbook/misc/tokenizer.md
index b056874..b691230 100644
--- a/docs/gitbook/misc/tokenizer.md
+++ b/docs/gitbook/misc/tokenizer.md
@@ -16,7 +16,9 @@
   specific language governing permissions and limitations
   under the License.
 -->
-        
+
+<!-- toc -->
+
 # Tokenizer for English Texts
 
 Hivemall provides simple English text tokenizer UDF that has following syntax:

http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/fc9694d7/resources/ddl/define-all-as-permanent.hive
----------------------------------------------------------------------
diff --git a/resources/ddl/define-all-as-permanent.hive 
b/resources/ddl/define-all-as-permanent.hive
index b0107af..fa307d5 100644
--- a/resources/ddl/define-all-as-permanent.hive
+++ b/resources/ddl/define-all-as-permanent.hive
@@ -735,6 +735,13 @@ CREATE FUNCTION guess_attribute_types as 
'hivemall.smile.tools.GuessAttributesUD
 DROP FUNCTION IF EXISTS train_slim;
 CREATE FUNCTION train_slim as 'hivemall.recommend.SlimUDTF' USING JAR 
'${hivemall_jar}';
 
+-----------------
+-- Data Sketch --
+-----------------
+
+DROP FUNCTION IF EXISTS approx_count_distinct;
+CREATE FUNCTION approx_count_distinct as 
'hivemall.sketch.hll.ApproxCountDistinctUDAF' USING JAR '${hivemall_jar}';
+
 ------------------------------
 -- XGBoost related features --
 ------------------------------
@@ -753,4 +760,3 @@ CREATE FUNCTION xgboost_predict AS 
'hivemall.xgboost.tools.XGBoostPredictUDTF' U
 
 DROP FUNCTION xgboost_multiclass_predict;
 CREATE FUNCTION xgboost_multiclass_predict AS 
'hivemall.xgboost.tools.XGBoostMulticlassPredictUDTF' USING JAR 
'${hivemall_jar}';
-=======

http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/fc9694d7/resources/ddl/define-all.hive
----------------------------------------------------------------------
diff --git a/resources/ddl/define-all.hive b/resources/ddl/define-all.hive
index 4f91f79..13abe76 100644
--- a/resources/ddl/define-all.hive
+++ b/resources/ddl/define-all.hive
@@ -727,6 +727,13 @@ create temporary function guess_attribute_types as 
'hivemall.smile.tools.GuessAt
 drop temporary function if exists train_slim;
 create temporary function train_slim as 'hivemall.recommend.SlimUDTF';
 
+-----------------
+-- Data Sketch --
+-----------------
+
+drop temporary function if exists approx_count_distinct;
+create temporary function approx_count_distinct as 
'hivemall.sketch.hll.ApproxCountDistinctUDAF';
+
 
--------------------------------------------------------------------------------------------------
 -- macros available from hive 0.12.0
 -- see https://issues.apache.org/jira/browse/HIVE-2655

http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/fc9694d7/resources/ddl/define-all.spark
----------------------------------------------------------------------
diff --git a/resources/ddl/define-all.spark b/resources/ddl/define-all.spark
index 02f92ec..67e3765 100644
--- a/resources/ddl/define-all.spark
+++ b/resources/ddl/define-all.spark
@@ -712,3 +712,11 @@ sqlContext.sql("CREATE TEMPORARY FUNCTION 
train_gradient_tree_boosting_classifie
 
 sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS train_slim")
 sqlContext.sql("CREATE TEMPORARY FUNCTION train_slim AS 
'hivemall.recommend.SlimUDTF'")
+
+/**
+ * Data Sketch
+ */
+
+sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS approx_count_distinct")
+sqlContext.sql("CREATE TEMPORARY FUNCTION approx_count_distinct AS 
'hivemall.sketch.hll.ApproxCountDistinctUDAF'")
+

http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/fc9694d7/resources/ddl/define-udfs.td.hql
----------------------------------------------------------------------
diff --git a/resources/ddl/define-udfs.td.hql b/resources/ddl/define-udfs.td.hql
index 28b77cb..5bcd366 100644
--- a/resources/ddl/define-udfs.td.hql
+++ b/resources/ddl/define-udfs.td.hql
@@ -156,7 +156,7 @@ create temporary function train_randomforest_regr as 
'hivemall.smile.regression.
 create temporary function tree_predict as 
'hivemall.smile.tools.TreePredictUDF';
 create temporary function rf_ensemble as 
'hivemall.smile.tools.RandomForestEnsembleUDAF';
 create temporary function guess_attribute_types as 
'hivemall.smile.tools.GuessAttributesUDF';
--- since Hivemall v0.5-rc.1
+-- since Hivemall v0.5
 create temporary function changefinder as 'hivemall.anomaly.ChangeFinderUDF';
 create temporary function sst as 
'hivemall.anomaly.SingularSpectrumTransformUDF';
 create temporary function train_lda as 'hivemall.topicmodel.LDAUDTF';
@@ -183,6 +183,7 @@ create temporary function singularize as 
'hivemall.tools.text.SingularizeUDF';
 create temporary function train_slim as 'hivemall.recommend.SlimUDTF';
 create temporary function hitrate as 'hivemall.evaluation.HitRateUDAF';
 create temporary function word_ngrams as 'hivemall.tools.text.WordNgramsUDF';
+create temporary function approx_count_distinct as 
'hivemall.sketch.hll.ApproxCountDistinctUDAF';
 
 -- NLP features
 create temporary function tokenize_ja as 'hivemall.nlp.tokenizer.KuromojiUDF';
@@ -195,3 +196,6 @@ create temporary function arow_regress as 
'hivemall.regression.AROWRegressionUDT
 create temporary function addBias as 'hivemall.ftvec.AddBiasUDF';
 create temporary function tree_predict_v1 as 
'hivemall.smile.tools.TreePredictUDFv1';
 create temporary function add_field_indicies as 
'hivemall.ftvec.trans.AddFieldIndicesUDF';
+
+-- alias for TD
+create temporary function approx_distinct as 
'hivemall.sketch.hll.ApproxCountDistinctUDAF';

Reply via email to