http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/49496032/core/src/test/java/hivemall/tools/datetime/SessionizeUDFTest.java ---------------------------------------------------------------------- diff --git a/core/src/test/java/hivemall/tools/datetime/SessionizeUDFTest.java b/core/src/test/java/hivemall/tools/datetime/SessionizeUDFTest.java new file mode 100644 index 0000000..2aca351 --- /dev/null +++ b/core/src/test/java/hivemall/tools/datetime/SessionizeUDFTest.java @@ -0,0 +1,76 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package hivemall.tools.datetime; + +import static hivemall.utils.hadoop.WritableUtils.val; + +import hivemall.TestUtils; + +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.io.Text; +import org.junit.Assert; +import org.junit.Test; + +public class SessionizeUDFTest { + + @Test + public void testTwoArgs() { + SessionizeUDF udf = new SessionizeUDF(); + + Text session1 = new Text(udf.evaluate(val(30L), val(10L))); + Assert.assertNotNull(session1); + + Text session2 = new Text(udf.evaluate(val(35L), val(10L))); + Assert.assertEquals(session1, session2); + + Text session3 = new Text(udf.evaluate(val(40L), val(10L))); + Assert.assertEquals(session2, session3); + + Text session4 = new Text(udf.evaluate(val(50L), val(10L))); + Assert.assertNotEquals(session3, session4); + } + + @Test + public void testThreeArgs() { + SessionizeUDF udf = new SessionizeUDF(); + + Text session1 = new Text(udf.evaluate(val(30L), val(10L), val("subject1"))); + Assert.assertNotNull(session1); + + Text session2 = new Text(udf.evaluate(val(35L), val(10L), val("subject1"))); + Assert.assertEquals(session1, session2); + + Text session3 = new Text(udf.evaluate(val(40L), val(10L), val("subject2"))); + Assert.assertNotEquals(session2, session3); + + Text session4 = new Text(udf.evaluate(val(45L), val(10L), val("subject2"))); + Assert.assertEquals(session3, session4); + } + + @Test + public void testSerialization() throws HiveException { + SessionizeUDF udf = new SessionizeUDF(); + + udf.evaluate(val((long) (System.currentTimeMillis() / 1000.0d)), val(30L)); + udf.evaluate(val((long) (System.currentTimeMillis() / 1000.0d)), val(30L)); + + byte[] serialized = TestUtils.serializeObjectByKryo(udf); + TestUtils.deserializeObjectByKryo(serialized, SessionizeUDF.class); + } +}
http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/49496032/core/src/test/java/hivemall/tools/json/FromJsonUDFTest.java ---------------------------------------------------------------------- diff --git a/core/src/test/java/hivemall/tools/json/FromJsonUDFTest.java b/core/src/test/java/hivemall/tools/json/FromJsonUDFTest.java index 8bb8db7..738a939 100644 --- a/core/src/test/java/hivemall/tools/json/FromJsonUDFTest.java +++ b/core/src/test/java/hivemall/tools/json/FromJsonUDFTest.java @@ -90,4 +90,5 @@ public class FromJsonUDFTest { HiveUtils.getConstStringObjectInspector("array<double>")}, new Object[] {"[0.1,1.1,2.2]"}); } + } http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/49496032/core/src/test/java/hivemall/tools/json/ToJsonUDFTest.java ---------------------------------------------------------------------- diff --git a/core/src/test/java/hivemall/tools/json/ToJsonUDFTest.java b/core/src/test/java/hivemall/tools/json/ToJsonUDFTest.java index 39bd64f..f7f698c 100644 --- a/core/src/test/java/hivemall/tools/json/ToJsonUDFTest.java +++ b/core/src/test/java/hivemall/tools/json/ToJsonUDFTest.java @@ -21,6 +21,9 @@ package hivemall.tools.json; import hivemall.TestUtils; import hivemall.utils.hadoop.WritableUtils; +import java.io.IOException; +import java.util.Arrays; + import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; import org.apache.hadoop.hive.ql.udf.generic.GenericUDF.DeferredObject; @@ -31,9 +34,6 @@ import org.apache.hadoop.io.Text; import org.junit.Assert; import org.junit.Test; -import java.io.IOException; -import java.util.Arrays; - public class ToJsonUDFTest { @Test http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/49496032/core/src/test/java/hivemall/tools/map/MapKeyValuesUDFTest.java ---------------------------------------------------------------------- diff --git a/core/src/test/java/hivemall/tools/map/MapKeyValuesUDFTest.java 
b/core/src/test/java/hivemall/tools/map/MapKeyValuesUDFTest.java new file mode 100644 index 0000000..2164dc1 --- /dev/null +++ b/core/src/test/java/hivemall/tools/map/MapKeyValuesUDFTest.java @@ -0,0 +1,81 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package hivemall.tools.map; + +import hivemall.TestUtils; + +import java.io.IOException; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import org.apache.hadoop.hive.ql.exec.UDFArgumentException; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; +import org.apache.hadoop.hive.serde2.io.DoubleWritable; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; +import org.junit.Assert; +import org.junit.Test; + +public class MapKeyValuesUDFTest { + + + @Test + public void testStringDouble() throws HiveException, IOException { + MapKeyValuesUDF udf = new MapKeyValuesUDF(); + + udf.initialize(new ObjectInspector[] {ObjectInspectorFactory.getStandardMapObjectInspector( + PrimitiveObjectInspectorFactory.javaStringObjectInspector, + PrimitiveObjectInspectorFactory.writableDoubleObjectInspector)}); + + Map<String, DoubleWritable> input = new HashMap<>(); + for (int i = 0; i < 10; i++) { + input.put("k" + i, new DoubleWritable(i)); + } + + GenericUDF.DeferredObject[] arguments = + new GenericUDF.DeferredObject[] {new GenericUDF.DeferredJavaObject(input)}; + + List<Object[]> actual = udf.evaluate(arguments); + + Assert.assertEquals(input.size(), actual.size()); + for (Object[] e : actual) { + Assert.assertEquals(2, e.length); + Object v = input.get(e[0]); + Assert.assertEquals(e[1], v); + } + + udf.close(); + } + + @Test + public void testSerialization() throws UDFArgumentException { + MapKeyValuesUDF udf = new MapKeyValuesUDF(); + + udf.initialize(new ObjectInspector[] {ObjectInspectorFactory.getStandardMapObjectInspector( + PrimitiveObjectInspectorFactory.javaStringObjectInspector, + PrimitiveObjectInspectorFactory.writableDoubleObjectInspector)}); + + byte[] serialized = 
TestUtils.serializeObjectByKryo(udf); + TestUtils.deserializeObjectByKryo(serialized, MapKeyValuesUDF.class); + } + +} http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/49496032/core/src/test/java/hivemall/tools/sanity/RaiseErrorUDFTest.java ---------------------------------------------------------------------- diff --git a/core/src/test/java/hivemall/tools/sanity/RaiseErrorUDFTest.java b/core/src/test/java/hivemall/tools/sanity/RaiseErrorUDFTest.java index 004ba26..a96ea57 100644 --- a/core/src/test/java/hivemall/tools/sanity/RaiseErrorUDFTest.java +++ b/core/src/test/java/hivemall/tools/sanity/RaiseErrorUDFTest.java @@ -18,15 +18,21 @@ */ package hivemall.tools.sanity; +import java.io.IOException; + import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDF.DeferredObject; import org.junit.Test; public class RaiseErrorUDFTest { @Test(expected = HiveException.class) - public void test() throws HiveException { + public void test() throws HiveException, IOException { RaiseErrorUDF udf = new RaiseErrorUDF(); - udf.evaluate(); + + udf.evaluate(new DeferredObject[] {}); + + udf.close(); } } http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/49496032/core/src/test/java/hivemall/tools/timeseries/MovingAverageUDTFTest.java ---------------------------------------------------------------------- diff --git a/core/src/test/java/hivemall/tools/timeseries/MovingAverageUDTFTest.java b/core/src/test/java/hivemall/tools/timeseries/MovingAverageUDTFTest.java new file mode 100644 index 0000000..815b567 --- /dev/null +++ b/core/src/test/java/hivemall/tools/timeseries/MovingAverageUDTFTest.java @@ -0,0 +1,80 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package hivemall.tools.timeseries; + +import hivemall.TestUtils; +import hivemall.tools.timeseries.MovingAverageUDTF; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.udf.generic.Collector; +import org.apache.hadoop.hive.serde2.io.DoubleWritable; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; +import org.junit.Assert; +import org.junit.Test; + +public class MovingAverageUDTFTest { + + @Test + public void test() throws HiveException { + MovingAverageUDTF udtf = new MovingAverageUDTF(); + + ObjectInspector argOI0 = PrimitiveObjectInspectorFactory.javaFloatObjectInspector; + ObjectInspector argOI1 = ObjectInspectorUtils.getConstantObjectInspector( + PrimitiveObjectInspectorFactory.javaIntObjectInspector, 3); + + final List<Double> results = new ArrayList<>(); + udtf.initialize(new ObjectInspector[] {argOI0, argOI1}); + udtf.setCollector(new Collector() { + @Override + public void collect(Object input) throws HiveException { + Object[] objs = (Object[]) input; + Assert.assertEquals(1, objs.length); + Assert.assertTrue(objs[0] instanceof DoubleWritable); + double x = 
((DoubleWritable) objs[0]).get(); + results.add(x); + } + }); + + udtf.process(new Object[] {1.f, null}); + udtf.process(new Object[] {2.f, null}); + udtf.process(new Object[] {3.f, null}); + udtf.process(new Object[] {4.f, null}); + udtf.process(new Object[] {5.f, null}); + udtf.process(new Object[] {6.f, null}); + udtf.process(new Object[] {7.f, null}); + + Assert.assertEquals(Arrays.asList(1.d, 1.5d, 2.d, 3.d, 4.d, 5.d, 6.d), results); + } + + @Test + public void testSerialization() throws HiveException { + TestUtils.testGenericUDTFSerialization(MovingAverageUDTF.class, + new ObjectInspector[] {PrimitiveObjectInspectorFactory.javaFloatObjectInspector, + ObjectInspectorUtils.getConstantObjectInspector( + PrimitiveObjectInspectorFactory.javaIntObjectInspector, 3)}, + new Object[][] {{1.f}, {2.f}, {3.f}, {4.f}, {5.f}}); + } + +} http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/49496032/core/src/test/java/hivemall/tools/vector/VectorAddUDFTest.java ---------------------------------------------------------------------- diff --git a/core/src/test/java/hivemall/tools/vector/VectorAddUDFTest.java b/core/src/test/java/hivemall/tools/vector/VectorAddUDFTest.java index 0aa90e7..fd70fcb 100644 --- a/core/src/test/java/hivemall/tools/vector/VectorAddUDFTest.java +++ b/core/src/test/java/hivemall/tools/vector/VectorAddUDFTest.java @@ -94,4 +94,5 @@ public class VectorAddUDFTest { PrimitiveObjectInspectorFactory.javaFloatObjectInspector)}, new Object[] {Arrays.asList(1.d, 2.d, 3.d), Arrays.asList(2.f, 3.f, 4.f)}); } + } http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/49496032/core/src/test/java/hivemall/tools/vector/VectorDotUDFTest.java ---------------------------------------------------------------------- diff --git a/core/src/test/java/hivemall/tools/vector/VectorDotUDFTest.java b/core/src/test/java/hivemall/tools/vector/VectorDotUDFTest.java index b13c447..eb5c08f 100644 --- a/core/src/test/java/hivemall/tools/vector/VectorDotUDFTest.java +++ 
b/core/src/test/java/hivemall/tools/vector/VectorDotUDFTest.java @@ -52,8 +52,8 @@ public class VectorDotUDFTest { new GenericUDF.DeferredJavaObject( WritableUtils.toWritableList(new float[] {2, 3, 4}))}; - List<Double> actual = udf.evaluate(args); - List<Double> expected = Arrays.asList(2.d, 6.d, 12.d); + Object actual = udf.evaluate(args); + Double expected = Double.valueOf(1.d * 2.d + 2.d * 3.d + 3.d * 4.d); Assert.assertEquals(expected, actual); @@ -74,7 +74,7 @@ public class VectorDotUDFTest { WritableUtils.toWritableList(new double[] {1, 2, 3})), new GenericUDF.DeferredJavaObject(WritableUtils.val(2.f))}; - List<Double> actual = udf.evaluate(args); + Object actual = udf.evaluate(args); List<Double> expected = Arrays.asList(2.d, 4.d, 6.d); Assert.assertEquals(expected, actual); @@ -92,4 +92,5 @@ public class VectorDotUDFTest { PrimitiveObjectInspectorFactory.javaFloatObjectInspector)}, new Object[] {Arrays.asList(1.d, 2.d, 3.d), Arrays.asList(2.f, 3.f, 4.f)}); } + } http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/49496032/docs/gitbook/misc/funcs.md ---------------------------------------------------------------------- diff --git a/docs/gitbook/misc/funcs.md b/docs/gitbook/misc/funcs.md index 00d7bba..3449419 100644 --- a/docs/gitbook/misc/funcs.md +++ b/docs/gitbook/misc/funcs.md @@ -393,6 +393,84 @@ This page describes a list of Hivemall functions. 
See also a [list of generic Hi - `approx_count_distinct(expr x [, const string options])` - Returns an approximation of count(DISTINCT x) using HyperLogLogPlus algorithm +- `bloom(string key)` - Constructs a BloomFilter by aggregating a set of keys + ```sql + CREATE TABLE satisfied_movies AS + SELECT bloom(movieid) as movies + FROM ( + SELECT movieid + FROM ratings + GROUP BY movieid + HAVING avg(rating) >= 4.0 + ) t; + ``` + +- `bloom_and(string bloom1, string bloom2)` - Returns the logical AND of two bloom filters + ```sql + SELECT bloom_and(bf1, bf2) FROM xxx; + ``` + +- `bloom_contains(string bloom, string key)` or `bloom_contains(string bloom, array<string> keys)` - Returns true if the bloom filter contains all the given key(s). Returns false if key is null. + ```sql + WITH satisfied_movies as ( + SELECT bloom(movieid) as movies + FROM ( + SELECT movieid + FROM ratings + GROUP BY movieid + HAVING avg(rating) >= 4.0 + ) t + ) + SELECT + l.rating, + count(distinct l.userid) as cnt + FROM + ratings l + CROSS JOIN satisfied_movies r + WHERE + bloom_contains(r.movies, l.movieid) -- includes false positive + GROUP BY + l.rating; + + l.rating cnt + 1 1296 + 2 2770 + 3 5008 + 4 5824 + 5 5925 + ``` + +- `bloom_contains_any(string bloom, string key)` or `bloom_contains_any(string bloom, array<string> keys)` - Returns true if the bloom filter contains any of the given key(s) + ```sql + WITH data1 as ( + SELECT explode(array(1,2,3,4,5)) as id + ), + data2 as ( + SELECT explode(array(1,3,5,6,8)) as id + ), + bloom as ( + SELECT bloom(id) as bf + FROM data1 + ) + SELECT + l.* + FROM + data2 l + CROSS JOIN bloom r + WHERE + bloom_contains_any(r.bf, array(l.id)) + ``` + +- `bloom_not(string bloom)` - Returns the logical NOT of a bloom filter + ```sql + SELECT bloom_not(bf) FROM xxx; + ``` + +- `bloom_or(string bloom1, string bloom2)` - Returns the logical OR of two bloom filters + ```sql + SELECT bloom_or(bf1, bf2) FROM xxx; + ``` + # Ensemble learning - `argmin_kld(float mean, float covar)` - Returns 
mean or covar that minimize a KL-distance among distributions @@ -446,7 +524,7 @@ This page describes a list of Hivemall functions. See also a [list of generic Hi - `hivemall_version()` - Returns the version of Hivemall ```sql - Usage: SELECT hivemall_version(); + SELECT hivemall_version(); ``` - `lr_datagen(options string)` - Generates a logistic regression dataset http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/49496032/docs/gitbook/misc/generic_funcs.md ---------------------------------------------------------------------- diff --git a/docs/gitbook/misc/generic_funcs.md b/docs/gitbook/misc/generic_funcs.md index d33ab21..343a64a 100644 --- a/docs/gitbook/misc/generic_funcs.md +++ b/docs/gitbook/misc/generic_funcs.md @@ -24,88 +24,183 @@ This page describes a list of useful Hivemall generic functions. See also a [lis # Array - `array_append(array<T> arr, T elem)` - Append an element to the end of an array + ```sql + SELECT array_append(array(1,2),3); + 1,2,3 + + SELECT array_append(array('a','b'),'c'); + "a","b","c" + ``` - `array_avg(array<number>)` - Returns an array<double> in which each element is the mean of a set of numbers - `array_concat(array<ANY> x1, array<ANY> x2, ..)` - Returns a concatenated array ```sql - select array_concat(array(1),array(2,3)); - > [1,2,3] + SELECT array_concat(array(1),array(2,3)); + [1,2,3] ``` - `array_flatten(array<array<ANY>>)` - Returns an array with the elements flattened. 
+ ```sql + SELECT array_flatten(array(array(1,2,3),array(4,5),array(6,7,8))); + [1,2,3,4,5,6,7,8] + ``` - `array_intersect(array<ANY> x1, array<ANY> x2, ..)` - Returns an intersect of given arrays ```sql - select array_intersect(array(1,3,4),array(2,3,4),array(3,5)); - > [3] + SELECT array_intersect(array(1,3,4),array(2,3,4),array(3,5)); + [3] ``` - `array_remove(array<int|text> original, int|text|array<int> target)` - Returns an array that the target is removed from the original array ```sql - select array_remove(array(1,null,3),array(null)); - > [3] + SELECT array_remove(array(1,null,3),array(null)); + [3] - select array_remove(array("aaa","bbb"),"bbb"); - > ["aaa"] + SELECT array_remove(array("aaa","bbb"),"bbb"); + ["aaa"] ``` - `array_slice(array<ANY> values, int offset [, int length])` - Slices the given array by the given offset and length parameters. ```sql - select array_slice(array(1,2,3,4,5,6), 2,4); - > [3,4] + SELECT + array_slice(array(1,2,3,4,5,6), 2,4), + array_slice( + array("zero", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten"), + 0, -- offset + 2 -- length + ), + array_slice( + array("zero", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten"), + 6, -- offset + 3 -- length + ), + array_slice( + array("zero", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten"), + 6, -- offset + 10 -- length + ), + array_slice( + array("zero", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten"), + 6 -- offset + ), + array_slice( + array("zero", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten"), + -3 -- offset + ), + array_slice( + array("zero", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten"), + -3, -- offset + 2 -- length + ); + + [3,4] + ["zero","one"] + ["six","seven","eight"] + ["six","seven","eight","nine","ten"] + ["six","seven","eight","nine","ten"] + ["eight","nine","ten"] + 
["eight","nine"] ``` - `array_sum(array<number>)` - Returns an array<double> in which each element is summed up +- `array_to_str(array arr [, string sep=','])` - Convert array to string using a separator + ```sql + SELECT array_to_str(array(1,2,3),'-'); + 1-2-3 + ``` + - `array_union(array1, array2, ...)` - Returns the union of a set of arrays + ```sql + SELECT array_union(array(1,2),array(1,2)); + [1,2] + + SELECT array_union(array(1,2),array(2,3),array(2,5)); + [1,2,3,5] + ``` - `conditional_emit(array<boolean> conditions, array<primitive> features)` - Emit features of a row according to various conditions + ```sql + WITH input as ( + select array(true, false, true) as conditions, array("one", "two", "three") as features + UNION ALL + select array(true, true, false), array("four", "five", "six") + ) + SELECT + conditional_emit( + conditions, features + ) + FROM + input; + one + three + four + five + ``` - `element_at(array<T> list, int pos)` - Returns an element at the given position + ```sql + SELECT element_at(array(1,2,3,4),0); + 1 + + SELECT element_at(array(1,2,3,4),-2); + 3 + ``` + +- `first_element(x)` - Returns the first element in an array + ```sql + SELECT first_element(array('a','b','c')); + a -- `first_element(x)` - Returns the first element in an array + SELECT first_element(array()); + NULL + ``` - `float_array(nDims)` - Returns an array<float> of nDims elements - `last_element(x)` - Return the last element in an array + ```sql + SELECT last_element(array('a','b','c')); + c + ``` - `select_k_best(array<number> array, const array<number> importance, const int k)` - Returns selected top-k elements as array<double> - `sort_and_uniq_array(array<int>)` - Takes array<int> and returns a sorted array with duplicate elements eliminated ```sql - select sort_and_uniq_array(array(3,1,1,-2,10)); - > [-2,1,3,10] + SELECT sort_and_uniq_array(array(3,1,1,-2,10)); + [-2,1,3,10] ``` - `subarray_endwith(array<int|text> original, int|text key)` - Returns an array that 
ends with the specified key ```sql - select subarray_endwith(array(1,2,3,4), 3); - > [1,2,3] + SELECT subarray_endwith(array(1,2,3,4), 3); + [1,2,3] ``` - `subarray_startwith(array<int|text> original, int|text key)` - Returns an array that starts with the specified key ```sql - select subarray_startwith(array(1,2,3,4), 2); - > [2,3,4] + SELECT subarray_startwith(array(1,2,3,4), 2); + [2,3,4] ``` - `to_string_array(array<ANY>)` - Returns an array of strings - `to_ordered_list(PRIMITIVE value [, PRIMITIVE key, const string options])` - Return list of values sorted by value itself or specific key ```sql - with t as ( - select 5 as key, 'apple' as value - union all - select 3 as key, 'banana' as value - union all - select 4 as key, 'candy' as value - union all - select 2 as key, 'donut' as value - union all - select 3 as key, 'egg' as value + WITH t as ( + SELECT 5 as key, 'apple' as value + UNION ALL + SELECT 3 as key, 'banana' as value + UNION ALL + SELECT 4 as key, 'candy' as value + UNION ALL + SELECT 2 as key, 'donut' as value + UNION ALL + SELECT 3 as key, 'egg' as value ) - select -- expected output + SELECT -- expected output to_ordered_list(value, key, '-reverse'), -- [apple, candy, (banana, egg | egg, banana), donut] (reverse order) to_ordered_list(value, key, '-k 2'), -- [apple, candy] (top-k) to_ordered_list(value, key, '-k 100'), -- [apple, candy, (banana, egg | egg, banana), donut] @@ -117,16 +212,230 @@ This page describes a list of useful Hivemall generic functions. 
See also a [lis to_ordered_list(value, '-k 2'), -- [egg, donut] (alphabetically) to_ordered_list(key, '-k -2 -reverse'), -- [5, 4] (top-2 keys) to_ordered_list(key) -- [2, 3, 3, 4, 5] (natural ordered keys) - from + FROM t ``` +# Bitset + +- `bits_collect(int|long x)` - Returns a bitset in array<long> + +- `bits_or(array<long> b1, array<long> b2, ..)` - Returns a logical OR of the given bitsets + ```sql + SELECT unbits(bits_or(to_bits(array(1,4)),to_bits(array(2,3)))); + [1,2,3,4] + ``` + +- `to_bits(int[] indexes)` - Returns a bitset representation of the given indexes in long[] + ```sql + SELECT to_bits(array(1,2,3,128)); + [14,-9223372036854775808] + ``` + +- `unbits(long[] bitset)` - Returns a long array of the given bitset representation + ```sql + SELECT unbits(to_bits(array(1,4,2,3))); + [1,2,3,4] + ``` + +# Compression + +- `deflate(TEXT data [, const int compressionLevel])` - Returns a compressed BINARY object by using Deflater. The compression level must be in range [-1,9] + ```sql + SELECT base91(deflate('aaaaaaaaaaaaaaaabbbbccc')); + AA+=kaIM|WTt!+wbGAA + ``` + +- `inflate(BINARY compressedData)` - Returns a decompressed STRING by using Inflater + ```sql + SELECT inflate(unbase91(base91(deflate('aaaaaaaaaaaaaaaabbbbccc')))); + aaaaaaaaaaaaaaaabbbbccc + ``` + +# Datetime + +- `sessionize(long timeInSec, long thresholdInSec [, String subject])` - Returns a UUID string of a session. + ```sql + SELECT + sessionize(time, 3600, ip_addr) as session_id, + time, ip_addr + FROM ( + SELECT time, ip_addr + FROM weblog + DISTRIBUTE BY ip_addr, time SORT BY ip_addr, time DESC + ) t1 + ``` + +# JSON + +- `from_json(string jsonString, const string returnTypes [, const array<string>|const string columnNames])` - Return Hive object. 
+ ```sql + SELECT + from_json( + '{ "person" : { "name" : "makoto" , "age" : 37 } }', + 'struct<name:string,age:int>', + array('person') + ), + from_json( + '[0.1,1.1,2.2]', + 'array<double>' + ), + from_json(to_json( + ARRAY( + NAMED_STRUCT("country", "japan", "city", "tokyo"), + NAMED_STRUCT("country", "japan", "city", "osaka") + ) + ),'array<struct<country:string,city:string>>'), + from_json(to_json( + ARRAY( + NAMED_STRUCT("country", "japan", "city", "tokyo"), + NAMED_STRUCT("country", "japan", "city", "osaka") + ), + array('city') + ), 'array<struct<country:string,city:string>>'), + from_json(to_json( + ARRAY( + NAMED_STRUCT("country", "japan", "city", "tokyo"), + NAMED_STRUCT("country", "japan", "city", "osaka") + ) + ),'array<struct<city:string>>'); + ``` + + ``` + {"name":"makoto","age":37} + [0.1,1.1,2.2] + [{"country":"japan","city":"tokyo"},{"country":"japan","city":"osaka"}] + [{"country":"japan","city":"tokyo"},{"country":"japan","city":"osaka"}] + [{"city":"tokyo"},{"city":"osaka"}] + ``` + +- `to_json(ANY object [, const array<string>|const string columnNames])` - Returns Json string + ```sql + SELECT + NAMED_STRUCT("Name", "John", "age", 31), + to_json( + NAMED_STRUCT("Name", "John", "age", 31) + ), + to_json( + NAMED_STRUCT("Name", "John", "age", 31), + array('Name', 'age') + ), + to_json( + NAMED_STRUCT("Name", "John", "age", 31), + array('name', 'age') + ), + to_json( + NAMED_STRUCT("Name", "John", "age", 31), + array('age') + ), + to_json( + NAMED_STRUCT("Name", "John", "age", 31), + array() + ), + to_json( + null, + array() + ), + to_json( + struct("123", "456", 789, array(314,007)), + array('ti','si','i','bi') + ), + to_json( + struct("123", "456", 789, array(314,007)), + 'ti,si,i,bi' + ), + to_json( + struct("123", "456", 789, array(314,007)) + ), + to_json( + NAMED_STRUCT("country", "japan", "city", "tokyo") + ), + to_json( + NAMED_STRUCT("country", "japan", "city", "tokyo"), + array('city') + ), + to_json( + ARRAY( + NAMED_STRUCT("country", 
"japan", "city", "tokyo"), + NAMED_STRUCT("country", "japan", "city", "osaka") + ) + ), + to_json( + ARRAY( + NAMED_STRUCT("country", "japan", "city", "tokyo"), + NAMED_STRUCT("country", "japan", "city", "osaka") + ), + array('city') + ); + ``` + + ``` + {"name":"John","age":31} + {"name":"John","age":31} + {"Name":"John","age":31} + {"name":"John","age":31} + {"age":31} + {} + NULL + {"ti":"123","si":"456","i":789,"bi":[314,7]} + {"ti":"123","si":"456","i":789,"bi":[314,7]} + {"col1":"123","col2":"456","col3":789,"col4":[314,7]} + {"country":"japan","city":"tokyo"} + {"city":"tokyo"} + [{"country":"japan","city":"tokyo"},{"country":"japan","city":"osaka"}] + [{"country":"japan","city":"tokyo"},{"country":"japan","city":"osaka"}] + ``` + # Map +- `map_exclude_keys(Map<K,V> map, array<K> filteringKeys)` - Returns the filtered entries of a map not having specified keys + ```sql + SELECT map_exclude_keys(map(1,'one',2,'two',3,'three'),array(2,3)); + {1:"one"} + ``` + - `map_get_sum(map<int,float> src, array<int> keys)` - Returns sum of values that are retrieved by keys +- `map_include_keys(Map<K,V> map, array<K> filteringKeys)` - Returns the filtered entries of a map having specified keys + ```sql + SELECT map_include_keys(map(1,'one',2,'two',3,'three'),array(2,3)); + {2:"two",3:"three"} + ``` + +- `map_index(a, n)` - Returns the n-th element of the given array + ```sql + WITH tmp as ( + SELECT "one" as key + UNION ALL + SELECT "two" as key + ) + SELECT map_index(map("one",1,"two",2),key) + FROM tmp; + + 1 + 2 + ``` + +- `map_key_values(map)` - Returns a array of key-value pairs. + ```sql + SELECT map_key_values(map("one",1,"two",2)); + + [{"key":"one","value":1},{"key":"two","value":2}] + ``` + - `map_tail_n(map SRC, int N)` - Returns the last N elements from a sorted array of SRC +- `merge_maps(x)` - Returns a map which contains the union of an aggregation of maps. Note that an existing value of a key can be replaced with the other duplicate key entry. 
+ ```sql + SELECT + merge_maps(m) + FROM ( + SELECT map('A',10,'B',20,'C',30) + UNION ALL + SELECT map('A',10,'B',20,'C',30) + ) t + ``` + - `to_map(key, value)` - Convert two aggregated columns into a key-value map - `to_ordered_map(key, value [, const int k|const boolean reverseOrder=false])` - Convert two aggregated columns into an ordered key-value map @@ -152,42 +461,6 @@ This page describes a list of useful Hivemall generic functions. See also a [lis from t ``` -# Bitset - -- `bits_collect(int|long x)` - Returns a bitset in array<long> - -- `bits_or(array<long> b1, array<long> b2, ..)` - Returns a logical OR given bitsets - ```sql - select unbits(bits_or(to_bits(array(1,4)),to_bits(array(2,3)))); - > [1,2,3,4] - ``` - -- `to_bits(int[] indexes)` - Returns an bitset representation if the given indexes in long[] - ```sql - select to_bits(array(1,2,3,128)); - > [14,-9223372036854775808] - ``` - -- `unbits(long[] bitset)` - Returns an long array of the give bitset representation - ```sql - select unbits(to_bits(array(1,4,2,3))); - > [1,2,3,4] - ``` - -# Compression - -- `deflate(TEXT data [, const int compressionLevel])` - Returns a compressed BINARY object by using Deflater. The compression level must be in range [-1,9] - ```sql - select base91(deflate('aaaaaaaaaaaaaaaabbbbccc')); - > AA+=kaIM|WTt!+wbGAA - ``` - -- `inflate(BINARY compressedData)` - Returns a decompressed STRING by using Inflater - ```sql - select inflate(unbase91(base91(deflate('aaaaaaaaaaaaaaaabbbbccc')))); - > aaaaaaaaaaaaaaaabbbbccc - ``` - # MapReduce - `distcache_gets(filepath, key, default_value [, parseKey])` - Returns map<key_type, value_type>|value_type @@ -198,9 +471,9 @@ This page describes a list of useful Hivemall generic functions. 
See also a [lis - `rowid()` - Returns a generated row id of a form {TASK_ID}-{SEQUENCE_NUMBER} -- `rownum()` - Returns a generated row number in long - ``` - returns sprintf(`%d%04d`,sequence,taskId) as long +- `rownum()` - Returns a generated row number ``sprintf(`%d%04d`,sequence,taskId)`` in long + ```sql + SELECT rownum() as rownum, xxx from ... ``` - `taskid()` - Returns the value of mapred.task.partition @@ -215,47 +488,91 @@ This page describes a list of useful Hivemall generic functions. See also a [lis - `transpose_and_dot(array<number> matrix0_row, array<number> matrix1_row)` - Returns dot(matrix0.T, matrix1) as array<array<double>>, shape = (matrix0.#cols, matrix1.#cols) +# Sanity Checks + +- `assert(boolean condition)` or `assert(boolean condition, string errMsg)` - Throws HiveException if condition is not met + ```sql + SELECT count(1) FROM stock_price WHERE assert(price > 0.0); + SELECT count(1) FROM stock_price WHERE assert(price > 0.0, 'price MUST be more than 0.0') + ``` + +- `raise_error()` or `raise_error(string msg)` - Throws an error + ```sql + SELECT product_id, price, raise_error('Found an invalid record') FROM xxx WHERE price < 0.0 + ``` + # Text processing - `base91(BINARY bin)` - Convert the argument from binary to a BASE91 string ```sql - select base91(deflate('aaaaaaaaaaaaaaaabbbbccc')); - > AA+=kaIM|WTt!+wbGAA + SELECT base91(deflate('aaaaaaaaaaaaaaaabbbbccc')); + AA+=kaIM|WTt!+wbGAA ``` - `is_stopword(string word)` - Returns whether English stopword or not - `normalize_unicode(string str [, string form])` - Transforms `str` with the specified normalization form. 
The `form` takes one of NFC (default), NFD, NFKC, or NFKD ```sql - select normalize_unicode('ï¾ï¾ï½¶ï½¸ï½¶ï¾ ','NFKC'); - > ãã³ã«ã¯ã«ã + SELECT normalize_unicode('ï¾ï¾ï½¶ï½¸ï½¶ï¾ ','NFKC'); + ãã³ã«ã¯ã«ã - select normalize_unicode('ã±ã§ã¦â ¢','NFKC'); - > (æ ª)ãã³ãã«III + SELECT normalize_unicode('ã±ã§ã¦â ¢','NFKC'); + (æ ª)ãã³ãã«III ``` - `singularize(string word)` - Returns singular form of a given English word ```sql - select singularize(lower("Apples")); + SELECT singularize(lower("Apples")); - > "apple" + "apple" ``` -- `split_words(string query [, string regex])` - Returns an array<text> containing split strings +- `split_words(string query [, string regex])` - Returns an array<text> containing splitted strings - `tokenize(string englishText [, boolean toLowerCase])` - Returns tokenized words in array<string> - `unbase91(string)` - Convert a BASE91 string to a binary ```sql - select inflate(unbase91(base91(deflate('aaaaaaaaaaaaaaaabbbbccc')))); - > aaaaaaaaaaaaaaaabbbbccc + SELECT inflate(unbase91(base91(deflate('aaaaaaaaaaaaaaaabbbbccc')))); + aaaaaaaaaaaaaaaabbbbccc ``` - `word_ngrams(array<string> words, int minSize, int maxSize])` - Returns list of n-grams for given words, where `minSize <= n <= maxSize` ```sql - select word_ngrams(tokenize('Machine learning is fun!', true), 1, 2); + SELECT word_ngrams(tokenize('Machine learning is fun!', true), 1, 2); - > ["machine","machine learning","learning","learning is","is","is fun","fun"] + ["machine","machine learning","learning","learning is","is","is fun","fun"] + ``` + +# Timeseries + +- `moving_avg(NUMBER value, const int windowSize)` - Returns moving average of a time series using a given window + ```sql + SELECT moving_avg(x, 3) FROM (SELECT explode(array(1.0,2.0,3.0,4.0,5.0,6.0,7.0)) as x) series; + 1.0 + 1.5 + 2.0 + 3.0 + 4.0 + 5.0 + 6.0 + ``` + +# Vector + +- `vector_add(array<NUMBER> x, array<NUMBER> y)` - Perform vector ADD operation. 
+ ```sql + SELECT vector_add(array(1.0,2.0,3.0), array(2, 3, 4)); + [3.0,5.0,7.0] + ``` + +- `vector_dot(array<NUMBER> x, array<NUMBER> y)` - Performs vector dot product. + ```sql + SELECT vector_dot(array(1.0,2.0,3.0),array(2.0,3.0,4.0)); + 20 + + SELECT vector_dot(array(1.0,2.0,3.0),2); + [2.0,4.0,6.0] ``` # Others @@ -264,25 +581,51 @@ This page describes a list of useful Hivemall generic functions. See also a [lis - `each_top_k(int K, Object group, double cmpKey, *)` - Returns top-K values (or tail-K values when k is less than 0) -- `generate_series(const int|bigint start, const int|bigint end)` - Generate a series of values, from start to end. A similar function to PostgreSQL's `generate_serics`. http://www.postgresql.org/docs/current/static/functions-srf.html +- `generate_series(const int|bigint start, const int|bigint end)` - Generate a series of values, from start to end. A similar function to PostgreSQL's [generate_series](http://www.postgresql.org/docs/current/static/functions-srf.html) ```sql - select generate_series(1,9); + SELECT generate_series(2,4); - 1 - 2 - 3 - 4 - 5 - 6 - 7 - 8 - 9 + 2 + 3 + 4 + + SELECT generate_series(5,1,-2); + + 5 + 3 + 1 + + SELECT generate_series(4,3); + + (no return) + + SELECT date_add(current_date(),value),value from (SELECT generate_series(1,3)) t; + + 2018-04-21 1 + 2018-04-22 2 + 2018-04-23 3 + + WITH input as ( + SELECT 1 as c1, 10 as c2, 3 as step + UNION ALL + SELECT 10, 2, -3 + ) + SELECT generate_series(c1, c2, step) as series + FROM input; + + 1 + 4 + 7 + 10 + 10 + 7 + 4 ``` - `try_cast(ANY src, const string typeName)` - Explicitly cast a value as a type. Returns null if cast fails. 
```sql - Usage: select try_cast(array(1.0,2.0,3.0), 'array<string>') - select try_cast(map('A',10,'B',20,'C',30), 'map<string,double>') + SELECT try_cast(array(1.0,2.0,3.0), 'array<string>') + SELECT try_cast(map('A',10,'B',20,'C',30), 'map<string,double>') ``` - `x_rank(KEY)` - Generates a pseudo sequence number starting from 1 for each key http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/49496032/pom.xml ---------------------------------------------------------------------- diff --git a/pom.xml b/pom.xml index 5a78bd1..aa682c6 100644 --- a/pom.xml +++ b/pom.xml @@ -196,6 +196,18 @@ </roles> <timezone>+9</timezone> </developer> + <developer> + <id>jbanks</id> + <name>Jerome Banks</name> + <email>jbanks[at]apache.org</email> + <url>https://github.com/jeromebanks/</url> + <organization>Jumpshot Inc.</organization> + <organizationUrl>https://www.jumpshot.com/</organizationUrl> + <roles> + <role>Committer</role> + </roles> + <timezone>-8</timezone> + </developer> <!-- Project mentors --> <developer> <id>rvs</id> http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/49496032/resources/ddl/define-all-as-permanent.hive ---------------------------------------------------------------------- diff --git a/resources/ddl/define-all-as-permanent.hive b/resources/ddl/define-all-as-permanent.hive index e7da8e3..5c257c5 100644 --- a/resources/ddl/define-all-as-permanent.hive +++ b/resources/ddl/define-all-as-permanent.hive @@ -435,6 +435,30 @@ CREATE FUNCTION array_intersect as 'hivemall.tools.array.ArrayIntersectUDF' USIN DROP FUNCTION IF EXISTS select_k_best; CREATE FUNCTION select_k_best as 'hivemall.tools.array.SelectKBestUDF' USING JAR '${hivemall_jar}'; +DROP FUNCTION IF EXISTS array_append; +CREATE FUNCTION array_append as 'hivemall.tools.array.ArrayAppendUDF' USING JAR '${hivemall_jar}'; + +DROP FUNCTION IF EXISTS element_at; +CREATE FUNCTION element_at as 'hivemall.tools.array.ArrayElementAtUDF' USING JAR '${hivemall_jar}'; + +DROP FUNCTION IF 
EXISTS array_union; +CREATE FUNCTION array_union as 'hivemall.tools.array.ArrayUnionUDF' USING JAR '${hivemall_jar}'; + +DROP FUNCTION IF EXISTS first_element; +CREATE FUNCTION first_element as 'hivemall.tools.array.FirstElementUDF' USING JAR '${hivemall_jar}'; + +DROP FUNCTION IF EXISTS last_element; +CREATE FUNCTION last_element as 'hivemall.tools.array.LastElementUDF' USING JAR '${hivemall_jar}'; + +DROP FUNCTION IF EXISTS array_flatten; +CREATE FUNCTION array_flatten as 'hivemall.tools.array.ArrayFlattenUDF' USING JAR '${hivemall_jar}'; + +DROP FUNCTION IF EXISTS array_to_str; +CREATE FUNCTION array_to_str as 'hivemall.tools.array.ArrayToStrUDF' USING JAR '${hivemall_jar}'; + +DROP FUNCTION IF EXISTS conditional_emit; +CREATE FUNCTION conditional_emit as 'hivemall.tools.array.ConditionalEmitUDTF' USING JAR '${hivemall_jar}'; + ----------------------------- -- bit operation functions -- ----------------------------- @@ -477,6 +501,18 @@ CREATE FUNCTION to_map as 'hivemall.tools.map.UDAFToMap' USING JAR '${hivemall_j DROP FUNCTION IF EXISTS to_ordered_map; CREATE FUNCTION to_ordered_map as 'hivemall.tools.map.UDAFToOrderedMap' USING JAR '${hivemall_jar}'; +DROP FUNCTION IF EXISTS map_include_keys; +CREATE FUNCTION map_include_keys as 'hivemall.tools.map.MapIncludeKeysUDF' USING JAR '${hivemall_jar}'; + +DROP FUNCTION IF EXISTS map_exclude_keys; +CREATE FUNCTION map_exclude_keys as 'hivemall.tools.map.MapExcludeKeysUDF' USING JAR '${hivemall_jar}'; + +DROP FUNCTION IF EXISTS map_index; +CREATE FUNCTION map_index as 'hivemall.tools.map.MapIndexUDF' USING JAR '${hivemall_jar}'; + +DROP FUNCTION IF EXISTS map_key_values; +CREATE FUNCTION map_key_values as 'hivemall.tools.map.MapKeyValuesUDF' USING JAR '${hivemall_jar}'; + --------------------- -- list functions -- --------------------- @@ -494,13 +530,19 @@ CREATE FUNCTION sigmoid as 'hivemall.tools.math.SigmoidGenericUDF' USING JAR '${ DROP FUNCTION IF EXISTS l2_norm; CREATE FUNCTION l2_norm as 
'hivemall.tools.math.L2NormUDAF' USING JAR '${hivemall_jar}'; ----------------------- --- Matrix functions -- ----------------------- +----------------------------- +-- Matrix/Vector functions -- +----------------------------- DROP FUNCTION IF EXISTS transpose_and_dot; CREATE FUNCTION transpose_and_dot as 'hivemall.tools.matrix.TransposeAndDotUDAF' USING JAR '${hivemall_jar}'; +DROP FUNCTION IF EXISTS vector_add; +CREATE FUNCTION vector_add as 'hivemall.tools.vector.VectorAddUDF' USING JAR '${hivemall_jar}'; + +DROP FUNCTION IF EXISTS vector_dot; +CREATE FUNCTION vector_dot as 'hivemall.tools.vector.VectorDotUDF' USING JAR '${hivemall_jar}'; + ---------------------- -- mapred functions -- ---------------------- @@ -524,6 +566,26 @@ DROP FUNCTION IF EXISTS jobconf_gets; CREATE FUNCTION jobconf_gets as 'hivemall.tools.mapred.JobConfGetsUDF' USING JAR '${hivemall_jar}'; -------------------- +-- JSON functions -- +-------------------- + +DROP FUNCTION IF EXISTS to_json; +CREATE FUNCTION to_json as 'hivemall.tools.json.ToJsonUDF' USING JAR '${hivemall_jar}'; + +DROP FUNCTION IF EXISTS from_json; +CREATE FUNCTION from_json as 'hivemall.tools.json.FromJsonUDF' USING JAR '${hivemall_jar}'; + +---------------------------- +-- Sanity Check functions -- +---------------------------- + +DROP FUNCTION IF EXISTS assert; +CREATE FUNCTION assert as 'hivemall.tools.sanity.AssertUDF' USING JAR '${hivemall_jar}'; + +DROP FUNCTION IF EXISTS raise_error; +CREATE FUNCTION raise_error as 'hivemall.tools.sanity.RaiseErrorUDF' USING JAR '${hivemall_jar}'; + +-------------------- -- misc functions -- -------------------- @@ -539,6 +601,15 @@ CREATE FUNCTION x_rank as 'hivemall.tools.RankSequenceUDF' USING JAR '${hivemall DROP FUNCTION IF EXISTS each_top_k; CREATE FUNCTION each_top_k as 'hivemall.tools.EachTopKUDTF' USING JAR '${hivemall_jar}'; +DROP FUNCTION IF EXISTS try_cast; +CREATE FUNCTION try_cast as 'hivemall.tools.TryCastUDF' USING JAR '${hivemall_jar}'; + +DROP FUNCTION IF EXISTS 
sessionize; +CREATE FUNCTION sessionize as 'hivemall.tools.datetime.SessionizeUDF' USING JAR '${hivemall_jar}'; + +DROP FUNCTION IF EXISTS moving_avg; +CREATE FUNCTION moving_avg as 'hivemall.tools.timeseries.MovingAverageUDTF' USING JAR '${hivemall_jar}'; + ------------------------------- -- Text processing functions -- ------------------------------- @@ -749,6 +820,28 @@ CREATE FUNCTION train_slim as 'hivemall.recommend.SlimUDTF' USING JAR '${hivemal DROP FUNCTION IF EXISTS approx_count_distinct; CREATE FUNCTION approx_count_distinct as 'hivemall.sketch.hll.ApproxCountDistinctUDAF' USING JAR '${hivemall_jar}'; +------------------ +-- Bloom Filter -- +------------------ + +DROP FUNCTION IF EXISTS bloom; +CREATE FUNCTION bloom as 'hivemall.sketch.bloom.BloomFilterUDAF' USING JAR '${hivemall_jar}'; + +DROP FUNCTION IF EXISTS bloom_and; +CREATE FUNCTION bloom_and as 'hivemall.sketch.bloom.BloomAndUDF' USING JAR '${hivemall_jar}'; + +DROP FUNCTION IF EXISTS bloom_contains; +CREATE FUNCTION bloom_contains as 'hivemall.sketch.bloom.BloomContainsUDF' USING JAR '${hivemall_jar}'; + +DROP FUNCTION IF EXISTS bloom_not; +CREATE FUNCTION bloom_not as 'hivemall.sketch.bloom.BloomNotUDF' USING JAR '${hivemall_jar}'; + +DROP FUNCTION IF EXISTS bloom_or; +CREATE FUNCTION bloom_or as 'hivemall.sketch.bloom.BloomOrUDF' USING JAR '${hivemall_jar}'; + +DROP FUNCTION IF EXISTS bloom_contains_any; +CREATE FUNCTION bloom_contains_any as 'hivemall.sketch.bloom.BloomContainsAnyUDF' USING JAR '${hivemall_jar}'; + ------------------------------ -- XGBoost related features -- ------------------------------ http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/49496032/resources/ddl/define-all.hive ---------------------------------------------------------------------- diff --git a/resources/ddl/define-all.hive b/resources/ddl/define-all.hive index 9228ce9..fbb3ed2 100644 --- a/resources/ddl/define-all.hive +++ b/resources/ddl/define-all.hive @@ -427,6 +427,30 @@ create temporary 
function array_intersect as 'hivemall.tools.array.ArrayIntersec drop temporary function if exists select_k_best; create temporary function select_k_best as 'hivemall.tools.array.SelectKBestUDF'; +drop temporary function if exists array_append; +create temporary function array_append as 'hivemall.tools.array.ArrayAppendUDF'; + +drop temporary function if exists element_at; +create temporary function element_at as 'hivemall.tools.array.ArrayElementAtUDF'; + +drop temporary function if exists array_union; +create temporary function array_union as 'hivemall.tools.array.ArrayUnionUDF'; + +drop temporary function if exists first_element; +create temporary function first_element as 'hivemall.tools.array.FirstElementUDF'; + +drop temporary function if exists last_element; +create temporary function last_element as 'hivemall.tools.array.LastElementUDF'; + +drop temporary function if exists array_flatten; +create temporary function array_flatten as 'hivemall.tools.array.ArrayFlattenUDF'; + +drop temporary function if exists array_to_str; +create temporary function array_to_str as 'hivemall.tools.array.ArrayToStrUDF'; + +drop temporary function if exists conditional_emit; +create temporary function conditional_emit as 'hivemall.tools.array.ConditionalEmitUDTF'; + ----------------------------- -- bit operation functions -- ----------------------------- @@ -469,6 +493,18 @@ create temporary function to_map as 'hivemall.tools.map.UDAFToMap'; drop temporary function if exists to_ordered_map; create temporary function to_ordered_map as 'hivemall.tools.map.UDAFToOrderedMap'; +drop temporary function if exists map_include_keys; +create temporary function map_include_keys as 'hivemall.tools.map.MapIncludeKeysUDF'; + +drop temporary function if exists map_exclude_keys; +create temporary function map_exclude_keys as 'hivemall.tools.map.MapExcludeKeysUDF'; + +drop temporary function if exists map_index; +create temporary function map_index as 'hivemall.tools.map.MapIndexUDF'; + +drop 
temporary function if exists map_key_values; +create temporary function map_key_values as 'hivemall.tools.map.MapKeyValuesUDF'; + --------------------- -- list functions -- --------------------- @@ -486,13 +522,19 @@ create temporary function sigmoid as 'hivemall.tools.math.SigmoidGenericUDF'; drop temporary function if exists l2_norm; create temporary function l2_norm as 'hivemall.tools.math.L2NormUDAF'; ----------------------- --- Matrix functions -- ----------------------- +----------------------------- +-- Matrix/Vector functions -- +----------------------------- drop temporary function if exists transpose_and_dot; create temporary function transpose_and_dot as 'hivemall.tools.matrix.TransposeAndDotUDAF'; +drop temporary function if exists vector_add; +create temporary function vector_add as 'hivemall.tools.vector.VectorAddUDF'; + +drop temporary function if exists vector_dot; +create temporary function vector_dot as 'hivemall.tools.vector.VectorDotUDF'; + ---------------------- -- mapred functions -- ---------------------- @@ -516,6 +558,26 @@ drop temporary function if exists jobconf_gets; create temporary function jobconf_gets as 'hivemall.tools.mapred.JobConfGetsUDF'; -------------------- +-- JSON functions -- +-------------------- + +drop temporary function if exists to_json; +create temporary function to_json as 'hivemall.tools.json.ToJsonUDF'; + +drop temporary function if exists from_json; +create temporary function from_json as 'hivemall.tools.json.FromJsonUDF'; + +---------------------------- +-- Sanity Check functions -- +---------------------------- + +drop temporary function if exists assert; +create temporary function assert as 'hivemall.tools.sanity.AssertUDF'; + +drop temporary function if exists raise_error; +create temporary function raise_error as 'hivemall.tools.sanity.RaiseErrorUDF'; + +-------------------- -- misc functions -- -------------------- @@ -531,6 +593,15 @@ create temporary function x_rank as 'hivemall.tools.RankSequenceUDF'; 
drop temporary function if exists each_top_k; create temporary function each_top_k as 'hivemall.tools.EachTopKUDTF'; +drop temporary function if exists try_cast; +create temporary function try_cast as 'hivemall.tools.TryCastUDF'; + +drop temporary function if exists sessionize; +create temporary function sessionize as 'hivemall.tools.datetime.SessionizeUDF'; + +drop temporary function if exists moving_avg; +create temporary function moving_avg as 'hivemall.tools.timeseries.MovingAverageUDTF'; + ------------------------------- -- Text processing functions -- ------------------------------- @@ -741,6 +812,28 @@ create temporary function train_slim as 'hivemall.recommend.SlimUDTF'; drop temporary function if exists approx_count_distinct; create temporary function approx_count_distinct as 'hivemall.sketch.hll.ApproxCountDistinctUDAF'; +------------------ +-- Bloom Filter -- +------------------ + +drop temporary function if exists bloom; +create temporary function bloom as 'hivemall.sketch.bloom.BloomFilterUDAF'; + +drop temporary function if exists bloom_and; +create temporary function bloom_and as 'hivemall.sketch.bloom.BloomAndUDF'; + +drop temporary function if exists bloom_contains; +create temporary function bloom_contains as 'hivemall.sketch.bloom.BloomContainsUDF'; + +drop temporary function if exists bloom_not; +create temporary function bloom_not as 'hivemall.sketch.bloom.BloomNotUDF'; + +drop temporary function if exists bloom_or; +create temporary function bloom_or as 'hivemall.sketch.bloom.BloomOrUDF'; + +drop temporary function if exists bloom_contains_any; +create temporary function bloom_contains_any as 'hivemall.sketch.bloom.BloomContainsAnyUDF'; + -------------------------------------------------------------------------------------------------- -- macros available from hive 0.12.0 -- see https://issues.apache.org/jira/browse/HIVE-2655 http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/49496032/resources/ddl/define-all.spark 
---------------------------------------------------------------------- diff --git a/resources/ddl/define-all.spark b/resources/ddl/define-all.spark index 3764ca2..e78a966 100644 --- a/resources/ddl/define-all.spark +++ b/resources/ddl/define-all.spark @@ -425,6 +425,30 @@ sqlContext.sql("CREATE TEMPORARY FUNCTION array_intersect AS 'hivemall.tools.arr sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS select_k_best") sqlContext.sql("CREATE TEMPORARY FUNCTION select_k_best AS 'hivemall.tools.array.SelectKBestUDF'") +sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS array_append") +sqlContext.sql("CREATE TEMPORARY FUNCTION array_append AS 'hivemall.tools.array.ArrayAppendUDF'") + +sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS element_at") +sqlContext.sql("CREATE TEMPORARY FUNCTION element_at AS 'hivemall.tools.array.ArrayElementAtUDF'") + +sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS array_union") +sqlContext.sql("CREATE TEMPORARY FUNCTION array_union AS 'hivemall.tools.array.ArrayUnionUDF'") + +sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS first_element") +sqlContext.sql("CREATE TEMPORARY FUNCTION first_element AS 'hivemall.tools.array.FirstElementUDF'") + +sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS last_element") +sqlContext.sql("CREATE TEMPORARY FUNCTION last_element AS 'hivemall.tools.array.LastElementUDF'") + +sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS array_flatten") +sqlContext.sql("CREATE TEMPORARY FUNCTION array_flatten AS 'hivemall.tools.array.ArrayFlattenUDF'") + +sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS array_to_str") +sqlContext.sql("CREATE TEMPORARY FUNCTION array_to_str AS 'hivemall.tools.array.ArrayToStrUDF'") + +sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS conditional_emit") +sqlContext.sql("CREATE TEMPORARY FUNCTION conditional_emit AS 'hivemall.tools.array.ConditionalEmitUDTF'") + /** * Bit operation functions */ @@ -467,6 +491,18 @@ sqlContext.sql("CREATE TEMPORARY FUNCTION to_map AS 
'hivemall.tools.map.UDAFToMa sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS to_ordered_map") sqlContext.sql("CREATE TEMPORARY FUNCTION to_ordered_map AS 'hivemall.tools.map.UDAFToOrderedMap'") +sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS map_include_keys") +sqlContext.sql("CREATE TEMPORARY FUNCTION map_include_keys AS 'hivemall.tools.map.MapIncludeKeysUDF'") + +sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS map_exclude_keys") +sqlContext.sql("CREATE TEMPORARY FUNCTION map_exclude_keys AS 'hivemall.tools.map.MapExcludeKeysUDF'") + +sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS map_index") +sqlContext.sql("CREATE TEMPORARY FUNCTION map_index AS 'hivemall.tools.map.MapIndexUDF'") + +sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS map_key_values") +sqlContext.sql("CREATE TEMPORARY FUNCTION map_key_values AS 'hivemall.tools.map.MapKeyValuesUDF'") + /** * List functions */ @@ -485,12 +521,18 @@ sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS l2_norm") sqlContext.sql("CREATE TEMPORARY FUNCTION l2_norm AS 'hivemall.tools.math.L2NormUDAF'") /** - * Matrix functions + * Matrix/Vector functions */ sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS transpose_and_dot") sqlContext.sql("CREATE TEMPORARY FUNCTION transpose_and_dot AS 'hivemall.tools.matrix.TransposeAndDotUDAF'") +sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS vector_add") +sqlContext.sql("CREATE TEMPORARY FUNCTION vector_add AS 'hivemall.tools.vector.VectorAddUDF'") + +sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS vector_dot") +sqlContext.sql("CREATE TEMPORARY FUNCTION vector_dot AS 'hivemall.tools.vector.VectorDotUDF'") + /** * MAPRED functions */ @@ -499,6 +541,26 @@ sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS rowid") sqlContext.sql("CREATE TEMPORARY FUNCTION rowid AS 'hivemall.tools.mapred.RowIdUDFWrapper'") /** + * JSON functions + */ + +sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS to_json") +sqlContext.sql("CREATE TEMPORARY FUNCTION to_json AS 
'hivemall.tools.json.ToJsonUDF'") + +sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS from_json") +sqlContext.sql("CREATE TEMPORARY FUNCTION from_json AS 'hivemall.tools.json.FromJsonUDF'") + +/** + * Sanity Check functions + */ + +sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS assert") +sqlContext.sql("CREATE TEMPORARY FUNCTION assert AS 'hivemall.tools.sanity.AssertUDF'") + +sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS raise_error") +sqlContext.sql("CREATE TEMPORARY FUNCTION raise_error AS 'hivemall.tools.sanity.RaiseErrorUDF'") + +/** * MISC functions */ @@ -514,6 +576,15 @@ sqlContext.sql("CREATE TEMPORARY FUNCTION x_rank AS 'hivemall.tools.RankSequence sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS each_top_k") sqlContext.sql("CREATE TEMPORARY FUNCTION each_top_k AS 'hivemall.tools.EachTopKUDTF'") +sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS try_cast") +sqlContext.sql("CREATE TEMPORARY FUNCTION try_cast AS 'hivemall.tools.TryCastUDF'") + +sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS sessionize") +sqlContext.sql("CREATE TEMPORARY FUNCTION sessionize AS 'hivemall.tools.datetime.SessionizeUDF'") + +sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS moving_avg") +sqlContext.sql("CREATE TEMPORARY FUNCTION moving_avg AS 'hivemall.tools.timeseries.MovingAverageUDTF'") + /** * Text processing functions */ @@ -726,3 +797,25 @@ sqlContext.sql("CREATE TEMPORARY FUNCTION train_slim AS 'hivemall.recommend.Slim sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS approx_count_distinct") sqlContext.sql("CREATE TEMPORARY FUNCTION approx_count_distinct AS 'hivemall.sketch.hll.ApproxCountDistinctUDAF'") + +/** + * Bloom Filter + */ + +sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS bloom") +sqlContext.sql("CREATE TEMPORARY FUNCTION bloom AS 'hivemall.sketch.bloom.BloomFilterUDAF'") + +sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS bloom_and") +sqlContext.sql("CREATE TEMPORARY FUNCTION bloom_and AS 'hivemall.sketch.bloom.BloomAndUDF'") + 
+sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS bloom_contains") +sqlContext.sql("CREATE TEMPORARY FUNCTION bloom_contains AS 'hivemall.sketch.bloom.BloomContainsUDF'") + +sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS bloom_not") +sqlContext.sql("CREATE TEMPORARY FUNCTION bloom_not AS 'hivemall.sketch.bloom.BloomNotUDF'") + +sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS bloom_or") +sqlContext.sql("CREATE TEMPORARY FUNCTION bloom_or AS 'hivemall.sketch.bloom.BloomOrUDF'") + +sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS bloom_contains_any") +sqlContext.sql("CREATE TEMPORARY FUNCTION bloom_contains_any AS 'hivemall.sketch.bloom.BloomContainsAnyUDF'") http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/49496032/resources/ddl/define-udfs.td.hql ---------------------------------------------------------------------- diff --git a/resources/ddl/define-udfs.td.hql b/resources/ddl/define-udfs.td.hql index b106eda..2352390 100644 --- a/resources/ddl/define-udfs.td.hql +++ b/resources/ddl/define-udfs.td.hql @@ -155,7 +155,7 @@ create temporary function train_randomforest_regr as 'hivemall.smile.regression. 
create temporary function tree_predict as 'hivemall.smile.tools.TreePredictUDF'; create temporary function rf_ensemble as 'hivemall.smile.tools.RandomForestEnsembleUDAF'; create temporary function guess_attribute_types as 'hivemall.smile.tools.GuessAttributesUDF'; --- since Hivemall v0.5 +-- since Hivemall v0.5.0 create temporary function changefinder as 'hivemall.anomaly.ChangeFinderUDF'; create temporary function sst as 'hivemall.anomaly.SingularSpectrumTransformUDF'; create temporary function train_lda as 'hivemall.topicmodel.LDAUDTF'; @@ -183,7 +183,35 @@ create temporary function train_slim as 'hivemall.recommend.SlimUDTF'; create temporary function hitrate as 'hivemall.evaluation.HitRateUDAF'; create temporary function word_ngrams as 'hivemall.tools.text.WordNgramsUDF'; create temporary function approx_count_distinct as 'hivemall.sketch.hll.ApproxCountDistinctUDAF'; +-- since Hivemall v0.5.2 create temporary function array_slice as 'hivemall.tools.array.ArraySliceUDF'; +create temporary function try_cast as 'hivemall.tools.TryCastUDF'; +create temporary function array_append as 'hivemall.tools.array.ArrayAppendUDF'; +create temporary function element_at as 'hivemall.tools.array.ArrayElementAtUDF'; +create temporary function array_union as 'hivemall.tools.array.ArrayUnionUDF'; +create temporary function first_element as 'hivemall.tools.array.FirstElementUDF'; +create temporary function last_element as 'hivemall.tools.array.LastElementUDF'; +create temporary function array_flatten as 'hivemall.tools.array.ArrayFlattenUDF'; +create temporary function map_include_keys as 'hivemall.tools.map.MapIncludeKeysUDF'; +create temporary function map_exclude_keys as 'hivemall.tools.map.MapExcludeKeysUDF'; +create temporary function array_to_str as 'hivemall.tools.array.ArrayToStrUDF'; +create temporary function map_index as 'hivemall.tools.map.MapIndexUDF'; +create temporary function map_key_values as 'hivemall.tools.map.MapKeyValuesUDF'; +create temporary function 
sessionize as 'hivemall.tools.datetime.SessionizeUDF'; +create temporary function to_json as 'hivemall.tools.json.ToJsonUDF'; +create temporary function from_json as 'hivemall.tools.json.FromJsonUDF'; +create temporary function assert as 'hivemall.tools.sanity.AssertUDF'; +create temporary function raise_error as 'hivemall.tools.sanity.RaiseErrorUDF'; +create temporary function moving_avg as 'hivemall.tools.timeseries.MovingAverageUDTF'; +create temporary function vector_add as 'hivemall.tools.vector.VectorAddUDF'; +create temporary function vector_dot as 'hivemall.tools.vector.VectorDotUDF'; +create temporary function bloom as 'hivemall.sketch.bloom.BloomFilterUDAF'; +create temporary function bloom_and as 'hivemall.sketch.bloom.BloomAndUDF'; +create temporary function bloom_contains as 'hivemall.sketch.bloom.BloomContainsUDF'; +create temporary function bloom_not as 'hivemall.sketch.bloom.BloomNotUDF'; +create temporary function bloom_or as 'hivemall.sketch.bloom.BloomOrUDF'; +create temporary function bloom_contains_any as 'hivemall.sketch.bloom.BloomContainsAnyUDF'; +create temporary function conditional_emit as 'hivemall.tools.array.ConditionalEmitUDTF'; -- NLP features create temporary function tokenize_ja as 'hivemall.nlp.tokenizer.KuromojiUDF'; http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/49496032/spark/spark-2.2/pom.xml ---------------------------------------------------------------------- diff --git a/spark/spark-2.2/pom.xml b/spark/spark-2.2/pom.xml index 100de59..5cb3609 100644 --- a/spark/spark-2.2/pom.xml +++ b/spark/spark-2.2/pom.xml @@ -141,7 +141,7 @@ <JAVA_HOME>${env.JAVA8_HOME}</JAVA_HOME> <PATH>${env.JAVA8_HOME}/bin:${env.PATH}</PATH> </environmentVariables> - </configuration> + </configuration> </plugin> </plugins> </build> http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/49496032/tools/hivemall-docs/pom.xml ---------------------------------------------------------------------- diff --git 
a/tools/hivemall-docs/pom.xml b/tools/hivemall-docs/pom.xml index 76994d7..99cd138 100644 --- a/tools/hivemall-docs/pom.xml +++ b/tools/hivemall-docs/pom.xml @@ -16,7 +16,9 @@ specific language governing permissions and limitations under the License. --> -<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> +<project xmlns="http://maven.apache.org/POM/4.0.0" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> <modelVersion>4.0.0</modelVersion> <parent> @@ -56,46 +58,94 @@ <scope>provided</scope> </dependency> - <!-- hivemall dependencies --> + <!-- compile scope --> + <dependency> + <groupId>org.apache.hive</groupId> + <artifactId>hive-exec</artifactId> + <scope>compile</scope> + </dependency> + <dependency> + <groupId>org.reflections</groupId> + <artifactId>reflections</artifactId> + <version>0.9.10</version> + <scope>compile</scope> + </dependency> + + <!-- runtime hivemall dependencies using reflection --> <dependency> <groupId>org.apache.hivemall</groupId> <artifactId>hivemall-core</artifactId> <version>${project.version}</version> - <scope>compile</scope> + <scope>runtime</scope> </dependency> <dependency> <groupId>org.apache.hivemall</groupId> <artifactId>hivemall-nlp</artifactId> <version>${project.version}</version> - <scope>compile</scope> + <scope>runtime</scope> </dependency> <dependency> <groupId>org.apache.hivemall</groupId> <artifactId>hivemall-xgboost</artifactId> <version>${project.version}</version> - <scope>compile</scope> + <scope>runtime</scope> </dependency> - <!-- compile scope --> - <dependency> - <groupId>org.apache.hive</groupId> - <artifactId>hive-exec</artifactId> - <scope>compile</scope> - </dependency> - <dependency> - <groupId>com.google.guava</groupId> - 
<artifactId>guava</artifactId> - <scope>compile</scope> - </dependency> - <dependency> - <groupId>org.reflections</groupId> - <artifactId>reflections</artifactId> - <version>0.9.10</version> - <scope>compile</scope> - </dependency> </dependencies> <build> + <pluginManagement> + <plugins> + <!--This plugin's configuration is used to store Eclipse m2e settings + only. It has no influence on the Maven build itself. --> + <plugin> + <groupId>org.eclipse.m2e</groupId> + <artifactId>lifecycle-mapping</artifactId> + <version>1.0.0</version> + <configuration> + <lifecycleMappingMetadata> + <pluginExecutions> + <pluginExecution> + <pluginExecutionFilter> + <groupId>org.codehaus.plexus</groupId> + <artifactId>plexus-component-metadata</artifactId> + <versionRange>[1.5.5,)</versionRange> + <goals> + <goal>generate-metadata</goal> + </goals> + </pluginExecutionFilter> + <action> + <execute> + <runOnIncremental>false</runOnIncremental> + </execute> + </action> + </pluginExecution> + <pluginExecution> + <pluginExecutionFilter> + <groupId> + org.apache.maven.plugins + </groupId> + <artifactId> + maven-plugin-plugin + </artifactId> + <versionRange> + [3.2,) + </versionRange> + <goals> + <goal>descriptor</goal> + </goals> + </pluginExecutionFilter> + <action> + <ignore></ignore> + </action> + </pluginExecution> + </pluginExecutions> + </lifecycleMappingMetadata> + </configuration> + </plugin> + </plugins> + </pluginManagement> + <plugins> <plugin> <groupId>org.apache.maven.plugins</groupId> @@ -106,6 +156,12 @@ <phase>process-classes</phase> </execution> <execution> + <id>mojo-descriptor</id> + <goals> + <goal>descriptor</goal> + </goals> + </execution> + <execution> <id>generate-helpmojo</id> <goals> <goal>helpmojo</goal> @@ -113,6 +169,19 @@ </execution> </executions> </plugin> + <plugin> + <groupId>org.codehaus.plexus</groupId> + <artifactId>plexus-component-metadata</artifactId> + <version>1.7.1</version> + <executions> + <execution> + <goals> + <goal>generate-metadata</goal> + 
</goals> + </execution> + </executions> + </plugin> </plugins> </build> + </project> http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/49496032/tools/hivemall-docs/src/main/java/hivemall/docs/FuncsListGenerator.java ---------------------------------------------------------------------- diff --git a/tools/hivemall-docs/src/main/java/hivemall/docs/FuncsListGenerator.java b/tools/hivemall-docs/src/main/java/hivemall/docs/FuncsListGenerator.java deleted file mode 100644 index 61fea68..0000000 --- a/tools/hivemall-docs/src/main/java/hivemall/docs/FuncsListGenerator.java +++ /dev/null @@ -1,249 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package hivemall.docs; - -import static hivemall.docs.utils.MarkdownUtils.asCodeBlock; -import static hivemall.docs.utils.MarkdownUtils.asInlineCode; -import static hivemall.docs.utils.MarkdownUtils.asListElement; -import static hivemall.docs.utils.MarkdownUtils.indent; -import static org.apache.commons.lang.StringEscapeUtils.escapeHtml; - -import hivemall.utils.lang.StringUtils; - -import java.io.BufferedReader; -import java.io.File; -import java.io.FileNotFoundException; -import java.io.FileReader; -import java.io.IOException; -import java.io.PrintWriter; -import java.util.Arrays; -import java.util.Collections; -import java.util.HashMap; -import java.util.LinkedHashMap; -import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.TreeSet; -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -import javax.annotation.Nonnull; - -import org.apache.hadoop.hive.ql.exec.Description; -import org.apache.maven.execution.MavenSession; -import org.apache.maven.plugin.AbstractMojo; -import org.apache.maven.plugin.MojoExecutionException; -import org.apache.maven.plugins.annotations.Mojo; -import org.apache.maven.plugins.annotations.Parameter; -import org.reflections.Reflections; - -/** - * Generate a list of UDFs for documentation. 
- * - * @link https://hivemall.incubator.apache.org/userguide/misc/generic_funcs.html - * @link https://hivemall.incubator.apache.org/userguide/misc/funcs.html - */ -@Mojo(name = "generate-funcs-list") -public class FuncsListGenerator extends AbstractMojo { - - @Parameter(defaultValue = "${basedir}", readonly = true) - private File basedir; - - @Parameter(defaultValue = "${session}", readonly = true) - private MavenSession session; - - @Parameter(defaultValue = "docs/gitbook/misc/generic_funcs.md") - private String pathToGenericFuncs; - - @Parameter(defaultValue = "docs/gitbook/misc/funcs.md") - private String pathToFuncs; - - private static final Map<String, List<String>> genericFuncsHeaders = new LinkedHashMap<>(); - static { - genericFuncsHeaders.put("# Array", - Arrays.asList("hivemall.tools.array", "hivemall.tools.list")); - genericFuncsHeaders.put("# Map", Collections.singletonList("hivemall.tools.map")); - genericFuncsHeaders.put("# Bitset", Collections.singletonList("hivemall.tools.bits")); - genericFuncsHeaders.put("# Compression", - Collections.singletonList("hivemall.tools.compress")); - genericFuncsHeaders.put("# MapReduce", Collections.singletonList("hivemall.tools.mapred")); - genericFuncsHeaders.put("# Math", Collections.singletonList("hivemall.tools.math")); - genericFuncsHeaders.put("# Matrix", Collections.singletonList("hivemall.tools.matrix")); - genericFuncsHeaders.put("# Text processing", - Collections.singletonList("hivemall.tools.text")); - genericFuncsHeaders.put("# Others", Collections.singletonList("hivemall.tools")); - } - - private static final Map<String, List<String>> funcsHeaders = new LinkedHashMap<>(); - static { - funcsHeaders.put("# Regression", Collections.singletonList("hivemall.regression")); - funcsHeaders.put("# Classification", Collections.<String>emptyList()); - funcsHeaders.put("## Binary classification", - Collections.singletonList("hivemall.classifier")); - funcsHeaders.put("## Multiclass classification", - 
Collections.singletonList("hivemall.classifier.multiclass")); - funcsHeaders.put("# Matrix factorization", Collections.singletonList("hivemall.mf")); - funcsHeaders.put("# Factorization machines", Collections.singletonList("hivemall.fm")); - funcsHeaders.put("# Recommendation", Collections.singletonList("hivemall.recommend")); - funcsHeaders.put("# Anomaly detection", Collections.singletonList("hivemall.anomaly")); - funcsHeaders.put("# Topic modeling", Collections.singletonList("hivemall.topicmodel")); - funcsHeaders.put("# Preprocessing", Collections.singletonList("hivemall.ftvec")); - funcsHeaders.put("## Data amplification", - Collections.singletonList("hivemall.ftvec.amplify")); - funcsHeaders.put("## Feature binning", Collections.singletonList("hivemall.ftvec.binning")); - funcsHeaders.put("## Feature format conversion", - Collections.singletonList("hivemall.ftvec.conv")); - funcsHeaders.put("## Feature hashing", Collections.singletonList("hivemall.ftvec.hashing")); - funcsHeaders.put("## Feature paring", Collections.singletonList("hivemall.ftvec.pairing")); - funcsHeaders.put("## Ranking", Collections.singletonList("hivemall.ftvec.ranking")); - funcsHeaders.put("## Feature scaling", Collections.singletonList("hivemall.ftvec.scaling")); - funcsHeaders.put("## Feature selection", - Collections.singletonList("hivemall.ftvec.selection")); - funcsHeaders.put("## Feature transformation and vectorization", - Collections.singletonList("hivemall.ftvec.trans")); - funcsHeaders.put("# Geospatial functions", - Collections.singletonList("hivemall.geospatial")); - funcsHeaders.put("# Distance measures", Collections.singletonList("hivemall.knn.distance")); - funcsHeaders.put("# Locality-sensitive hashing", - Collections.singletonList("hivemall.knn.lsh")); - funcsHeaders.put("# Similarity measures", - Collections.singletonList("hivemall.knn.similarity")); - funcsHeaders.put("# Evaluation", Collections.singletonList("hivemall.evaluation")); - funcsHeaders.put("# Sketching", 
Collections.singletonList("hivemall.sketch.hll")); - funcsHeaders.put("# Ensemble learning", Collections.singletonList("hivemall.ensemble")); - funcsHeaders.put("## Bagging", Collections.singletonList("hivemall.ensemble.bagging")); - funcsHeaders.put("# Decision trees and RandomForest", Arrays.asList( - "hivemall.smile.classification", "hivemall.smile.regression", "hivemall.smile.tools")); - funcsHeaders.put("# XGBoost", Arrays.asList("hivemall.xgboost.classification", - "hivemall.xgboost.regression", "hivemall.xgboost.tools")); - funcsHeaders.put("# Others", - Arrays.asList("hivemall", "hivemall.dataset", "hivemall.ftvec.text")); - } - - @Override - public void execute() throws MojoExecutionException { - if (!isReactorRootProject()) { - // output only once across the projects - return; - } - - generate(new File(basedir, pathToGenericFuncs), - "This page describes a list of useful Hivemall generic functions. See also a [list of machine-learning-related functions](./funcs.md).", - genericFuncsHeaders); - generate(new File(basedir, pathToFuncs), - "This page describes a list of Hivemall functions. 
See also a [list of generic Hivemall functions](./generic_funcs.md) for more general-purpose functions such as array and map UDFs.", - funcsHeaders); - } - - private boolean isReactorRootProject() { - return session.getExecutionRootDirectory().equalsIgnoreCase(basedir.toString()); - } - - private void generate(@Nonnull File outputFile, @Nonnull String preface, - @Nonnull Map<String, List<String>> headers) throws MojoExecutionException { - Reflections reflections = new Reflections("hivemall"); - Set<Class<?>> annotatedClasses = reflections.getTypesAnnotatedWith(Description.class); - - StringBuilder sb = new StringBuilder(); - Map<String, Set<String>> packages = new HashMap<>(); - - Pattern func = Pattern.compile("_FUNC_(\\(.*?\\))(.*)", Pattern.DOTALL); - - for (Class<?> annotatedClass : annotatedClasses) { - Deprecated deprecated = annotatedClass.getAnnotation(Deprecated.class); - if (deprecated != null) { - continue; - } - - Description description = annotatedClass.getAnnotation(Description.class); - - String value = description.value().replaceAll("\n", " "); - Matcher matcher = func.matcher(value); - if (matcher.find()) { - value = asInlineCode(description.name() + matcher.group(1)) - + escapeHtml(matcher.group(2)); - } - sb.append(asListElement(value)); - - StringBuilder sbExtended = new StringBuilder(); - if (!description.extended().isEmpty()) { - sbExtended.append(description.extended()); - sb.append("\n"); - } - - String extended = sbExtended.toString(); - if (extended.isEmpty()) { - sb.append("\n"); - } else { - if (extended.toLowerCase().contains("select")) { // extended description contains SQL statements - sb.append(indent(asCodeBlock(extended, "sql"))); - } else { - sb.append(indent(asCodeBlock(extended))); - } - } - - String packageName = annotatedClass.getPackage().getName(); - if (!packages.containsKey(packageName)) { - Set<String> set = new TreeSet<>(); - packages.put(packageName, set); - } - Set<String> List = packages.get(packageName); - 
List.add(sb.toString()); - - StringUtils.clear(sb); - } - - try (PrintWriter writer = new PrintWriter(outputFile)) { - // license header - writer.println("<!--"); - try { - File licenseFile = new File(basedir, "resources/license-header.txt"); - FileReader fileReader = new FileReader(licenseFile); - - try (BufferedReader bufferedReader = new BufferedReader(fileReader)) { - String line; - while ((line = bufferedReader.readLine()) != null) { - writer.println(indent(line)); - } - } - } catch (IOException e) { - throw new MojoExecutionException("Failed to read license file"); - } - writer.println("-->\n"); - - writer.println(preface); - - writer.println("\n<!-- toc -->\n"); - - for (Map.Entry<String, List<String>> e : headers.entrySet()) { - writer.println(e.getKey() + "\n"); - List<String> packageNames = e.getValue(); - for (String packageName : packageNames) { - for (String desc : packages.get(packageName)) { - writer.println(desc); - } - } - } - - writer.flush(); - } catch (FileNotFoundException e) { - throw new MojoExecutionException("Output file is not found"); - } - } -}
