Repository: incubator-hivemall Updated Branches: refs/heads/master 8f973c5b2 -> 36fb839d9
[HIVEMALL-193] Implement a tool for generating a list of Hivemall UDFs ## What changes were proposed in this pull request? Automatically generate a list of UDFs for: - https://hivemall.incubator.apache.org/userguide/misc/funcs.html - https://hivemall.incubator.apache.org/userguide/misc/generic_funcs.html Initial mock implementation: https://github.com/takuti/hivemalldoc ## What type of PR is it? Improvement, Documentation ## What is the Jira issue? https://issues.apache.org/jira/browse/HIVEMALL-193 ## How was this patch tested? See output: https://gist.github.com/takuti/312d3a11bf85fc4044399d7e97a06f13 ## How to use this feature? ``` $ mvn clean package -Dskiptests=true -Dmaven.test.skip=true $ mvn org.apache.hivemall:hivemall-docs:generate-funcs-list ``` ## Checklist - [x] Did you apply source code formatter, i.e., `mvn formatter:format`, for your commit? Author: Takuya Kitazawa <[email protected]> Closes #148 from takuti/HIVEMALL-193. Project: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/commit/36fb839d Tree: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/tree/36fb839d Diff: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/diff/36fb839d Branch: refs/heads/master Commit: 36fb839d935fcf05a4646bed9187e2af98b37232 Parents: 8f973c5 Author: Takuya Kitazawa <[email protected]> Authored: Wed Apr 25 16:46:01 2018 +0900 Committer: Makoto Yui <[email protected]> Committed: Wed Apr 25 16:46:01 2018 +0900 ---------------------------------------------------------------------- .../smile/tools/GuessAttributesUDF.java | 6 +- .../java/hivemall/tools/GenerateSeriesUDTF.java | 7 +- .../hivemall/tools/array/ArrayConcatUDF.java | 3 +- .../hivemall/tools/array/ArrayIntersectUDF.java | 3 +- .../hivemall/tools/array/ArrayRemoveUDF.java | 4 +- .../hivemall/tools/array/ArraySliceUDF.java | 3 +- .../tools/array/SortAndUniqArrayUDF.java | 3 +- .../tools/array/SubarrayEndWithUDF.java | 3 +- 
.../tools/array/SubarrayStartWithUDF.java | 3 +- .../java/hivemall/tools/bits/BitsORUDF.java | 4 +- .../java/hivemall/tools/bits/ToBitsUDF.java | 3 +- .../java/hivemall/tools/bits/UnBitsUDF.java | 3 +- .../hivemall/tools/compress/DeflateUDF.java | 8 +- .../hivemall/tools/compress/InflateUDF.java | 4 +- .../hivemall/tools/list/UDAFToOrderedList.java | 29 +- .../hivemall/tools/map/UDAFToOrderedMap.java | 24 +- .../java/hivemall/tools/text/Base91UDF.java | 3 +- .../tools/text/NormalizeUnicodeUDF.java | 4 +- .../hivemall/tools/text/SingularizeUDF.java | 3 +- .../java/hivemall/tools/text/Unbase91UDF.java | 4 +- .../java/hivemall/tools/text/WordNgramsUDF.java | 9 +- docs/gitbook/misc/funcs.md | 39 +- docs/gitbook/misc/generic_funcs.md | 408 +++++++++---------- .../hivemall/nlp/tokenizer/KuromojiUDF.java | 5 +- pom.xml | 1 + tools/hivemall-docs/pom.xml | 120 ++++++ .../java/hivemall/docs/FuncsListGenerator.java | 247 +++++++++++ .../java/hivemall/docs/utils/MarkdownUtils.java | 58 +++ tools/pom.xml | 40 ++ 29 files changed, 784 insertions(+), 267 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/36fb839d/core/src/main/java/hivemall/smile/tools/GuessAttributesUDF.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/hivemall/smile/tools/GuessAttributesUDF.java b/core/src/main/java/hivemall/smile/tools/GuessAttributesUDF.java index bc2794e..5d79a18 100644 --- a/core/src/main/java/hivemall/smile/tools/GuessAttributesUDF.java +++ b/core/src/main/java/hivemall/smile/tools/GuessAttributesUDF.java @@ -31,9 +31,9 @@ import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils; import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; -@Description(name = "guess_attribute_types", value = "_FUNC_(ANY, ...) 
- Returns attribute types" - + "\nselect guess_attribute_types(*) from train limit 1;" - + "\n> Q,Q,C,C,C,C,Q,C,C,C,Q,C,Q,Q,Q,Q,C,Q") +@Description(name = "guess_attribute_types", value = "_FUNC_(ANY, ...) - Returns attribute types", + extended = "select guess_attribute_types(*) from train limit 1;" + + "\n> Q,Q,C,C,C,C,Q,C,C,C,Q,C,Q,Q,Q,Q,C,Q") @UDFType(deterministic = true, stateful = false) public final class GuessAttributesUDF extends GenericUDF { http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/36fb839d/core/src/main/java/hivemall/tools/GenerateSeriesUDTF.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/hivemall/tools/GenerateSeriesUDTF.java b/core/src/main/java/hivemall/tools/GenerateSeriesUDTF.java index 47ac427..9d9b150 100644 --- a/core/src/main/java/hivemall/tools/GenerateSeriesUDTF.java +++ b/core/src/main/java/hivemall/tools/GenerateSeriesUDTF.java @@ -31,9 +31,12 @@ import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; -@Description(name = "generate_series", +@Description( + name = "generate_series", value = "_FUNC_(const int|bigint start, const int|bigint end) - " - + "Generate a series of values, from start to end") + + "Generate a series of values, from start to end. A similar function to PostgreSQL's `generate_series`. 
http://www.postgresql.org/docs/current/static/functions-srf.html", + extended = "select generate_series(1,9);\n\n" + "1\n" + "2\n" + "3\n" + "4\n" + "5\n" + + "6\n" + "7\n" + "8\n" + "9") public final class GenerateSeriesUDTF extends GenericUDTF { private long start, end; http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/36fb839d/core/src/main/java/hivemall/tools/array/ArrayConcatUDF.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/hivemall/tools/array/ArrayConcatUDF.java b/core/src/main/java/hivemall/tools/array/ArrayConcatUDF.java index baeca60..223d69a 100644 --- a/core/src/main/java/hivemall/tools/array/ArrayConcatUDF.java +++ b/core/src/main/java/hivemall/tools/array/ArrayConcatUDF.java @@ -37,7 +37,8 @@ import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils.ObjectInspectorCopyOption; @Description(name = "array_concat", - value = "_FUNC_(array<ANY> x1, array<ANY> x2, ..) - Returns a concatenated array") + value = "_FUNC_(array<ANY> x1, array<ANY> x2, ..) 
- Returns a concatenated array", + extended = "select array_concat(array(1),array(2,3));\n" + "> [1,2,3]") @UDFType(deterministic = true, stateful = false) public class ArrayConcatUDF extends GenericUDF { /** http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/36fb839d/core/src/main/java/hivemall/tools/array/ArrayIntersectUDF.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/hivemall/tools/array/ArrayIntersectUDF.java b/core/src/main/java/hivemall/tools/array/ArrayIntersectUDF.java index de89f35..dab67bf 100644 --- a/core/src/main/java/hivemall/tools/array/ArrayIntersectUDF.java +++ b/core/src/main/java/hivemall/tools/array/ArrayIntersectUDF.java @@ -40,7 +40,8 @@ import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils; @Description(name = "array_intersect", - value = "_FUNC_(array<ANY> x1, array<ANY> x2, ..) - Returns an intersect of given arrays") + value = "_FUNC_(array<ANY> x1, array<ANY> x2, ..) 
- Returns an intersect of given arrays", + extended = "select array_intersect(array(1,3,4),array(2,3,4),array(3,5));\n" + "> [3]") @UDFType(deterministic = true, stateful = false) public final class ArrayIntersectUDF extends GenericUDF { http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/36fb839d/core/src/main/java/hivemall/tools/array/ArrayRemoveUDF.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/hivemall/tools/array/ArrayRemoveUDF.java b/core/src/main/java/hivemall/tools/array/ArrayRemoveUDF.java index 15a2c3d..523093b 100644 --- a/core/src/main/java/hivemall/tools/array/ArrayRemoveUDF.java +++ b/core/src/main/java/hivemall/tools/array/ArrayRemoveUDF.java @@ -28,7 +28,9 @@ import org.apache.hadoop.io.Text; @Description(name = "array_remove", value = "_FUNC_(array<int|text> original, int|text|array<int> target)" - + " - Returns an array that the target is removed " + "from the original array") + + " - Returns an array that the target is removed " + "from the original array", + extended = "select array_remove(array(1,null,3),array(null));\n" + "> [3]\n" + "\n" + + "select array_remove(array(\"aaa\",\"bbb\"),\"bbb\");\n" + "> [\"aaa\"]") @UDFType(deterministic = true, stateful = false) public class ArrayRemoveUDF extends UDF { http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/36fb839d/core/src/main/java/hivemall/tools/array/ArraySliceUDF.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/hivemall/tools/array/ArraySliceUDF.java b/core/src/main/java/hivemall/tools/array/ArraySliceUDF.java index f4be2bc..5ac12ac 100644 --- a/core/src/main/java/hivemall/tools/array/ArraySliceUDF.java +++ b/core/src/main/java/hivemall/tools/array/ArraySliceUDF.java @@ -41,7 +41,8 @@ import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectIn @Description( name = "array_slice", - value = "_FUNC_(array<ANY> values, int offset [, 
int length]) - Slices the given array by the given offset and length parameters.") + value = "_FUNC_(array<ANY> values, int offset [, int length]) - Slices the given array by the given offset and length parameters.", + extended = "select array_slice(array(1,2,3,4,5,6), 2,4);\n" + "> [3,4]") @UDFType(deterministic = true, stateful = false) public final class ArraySliceUDF extends GenericUDF { http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/36fb839d/core/src/main/java/hivemall/tools/array/SortAndUniqArrayUDF.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/hivemall/tools/array/SortAndUniqArrayUDF.java b/core/src/main/java/hivemall/tools/array/SortAndUniqArrayUDF.java index 844bb9f..1c6162c 100644 --- a/core/src/main/java/hivemall/tools/array/SortAndUniqArrayUDF.java +++ b/core/src/main/java/hivemall/tools/array/SortAndUniqArrayUDF.java @@ -29,7 +29,8 @@ import org.apache.hadoop.hive.ql.udf.UDFType; import org.apache.hadoop.io.IntWritable; @Description(name = "sort_and_uniq_array", value = "_FUNC_(array<int>) - Takes array<int> and " - + "returns a sorted array with duplicate elements eliminated") + + "returns a sorted array with duplicate elements eliminated", + extended = "select sort_and_uniq_array(array(3,1,1,-2,10));\n" + "> [-2,1,3,10]") @UDFType(deterministic = true, stateful = false) public class SortAndUniqArrayUDF extends UDF { http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/36fb839d/core/src/main/java/hivemall/tools/array/SubarrayEndWithUDF.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/hivemall/tools/array/SubarrayEndWithUDF.java b/core/src/main/java/hivemall/tools/array/SubarrayEndWithUDF.java index ecc8a05..0f845ac 100644 --- a/core/src/main/java/hivemall/tools/array/SubarrayEndWithUDF.java +++ b/core/src/main/java/hivemall/tools/array/SubarrayEndWithUDF.java @@ -27,7 +27,8 @@ import 
org.apache.hadoop.io.IntWritable; import org.apache.hadoop.io.Text; @Description(name = "subarray_endwith", value = "_FUNC_(array<int|text> original, int|text key)" - + " - Returns an array that ends with the specified key") + + " - Returns an array that ends with the specified key", + extended = "select subarray_endwith(array(1,2,3,4), 3);\n" + "> [1,2,3]") @UDFType(deterministic = true, stateful = false) public class SubarrayEndWithUDF extends UDF { http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/36fb839d/core/src/main/java/hivemall/tools/array/SubarrayStartWithUDF.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/hivemall/tools/array/SubarrayStartWithUDF.java b/core/src/main/java/hivemall/tools/array/SubarrayStartWithUDF.java index 3f7266d..1903de3 100644 --- a/core/src/main/java/hivemall/tools/array/SubarrayStartWithUDF.java +++ b/core/src/main/java/hivemall/tools/array/SubarrayStartWithUDF.java @@ -27,7 +27,8 @@ import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.io.Text; @Description(name = "subarray_startwith", value = "_FUNC_(array<int|text> original, int|text key)" - + " - Returns an array that starts with the specified key") + + " - Returns an array that starts with the specified key", + extended = "select subarray_startwith(array(1,2,3,4), 2);\n" + "> [2,3,4]") @UDFType(deterministic = true, stateful = false) public class SubarrayStartWithUDF extends UDF { http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/36fb839d/core/src/main/java/hivemall/tools/bits/BitsORUDF.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/hivemall/tools/bits/BitsORUDF.java b/core/src/main/java/hivemall/tools/bits/BitsORUDF.java index 98a659c..b76f949 100644 --- a/core/src/main/java/hivemall/tools/bits/BitsORUDF.java +++ b/core/src/main/java/hivemall/tools/bits/BitsORUDF.java @@ -37,7 +37,9 @@ import 
org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectIn import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorUtils; @Description(name = "bits_or", - value = "_FUNC_(array<long> b1, array<long> b2, ..) - Returns a logical OR given bitsets") + value = "_FUNC_(array<long> b1, array<long> b2, ..) - Returns a logical OR given bitsets", + extended = "select unbits(bits_or(to_bits(array(1,4)),to_bits(array(2,3))));\n" + + "> [1,2,3,4]") public final class BitsORUDF extends GenericUDF { private ListObjectInspector[] _listOIs; http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/36fb839d/core/src/main/java/hivemall/tools/bits/ToBitsUDF.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/hivemall/tools/bits/ToBitsUDF.java b/core/src/main/java/hivemall/tools/bits/ToBitsUDF.java index ba9bcd9..f5790d9 100644 --- a/core/src/main/java/hivemall/tools/bits/ToBitsUDF.java +++ b/core/src/main/java/hivemall/tools/bits/ToBitsUDF.java @@ -40,7 +40,8 @@ import org.apache.hadoop.io.LongWritable; @Description( name = "to_bits", - value = "_FUNC_(int[] indexes) - Returns an bitset representation if the given indexes in long[]") + value = "_FUNC_(int[] indexes) - Returns an bitset representation if the given indexes in long[]", + extended = "select to_bits(array(1,2,3,128));\n" + "> [14,-9223372036854775808]") @UDFType(deterministic = true, stateful = false) public final class ToBitsUDF extends GenericUDF { http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/36fb839d/core/src/main/java/hivemall/tools/bits/UnBitsUDF.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/hivemall/tools/bits/UnBitsUDF.java b/core/src/main/java/hivemall/tools/bits/UnBitsUDF.java index 705d19d..7651009 100644 --- a/core/src/main/java/hivemall/tools/bits/UnBitsUDF.java +++ b/core/src/main/java/hivemall/tools/bits/UnBitsUDF.java @@ 
-39,7 +39,8 @@ import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectIn import org.apache.hadoop.io.LongWritable; @Description(name = "unbits", - value = "_FUNC_(long[] bitset) - Returns an long array of the give bitset representation") + value = "_FUNC_(long[] bitset) - Returns an long array of the give bitset representation", + extended = "select unbits(to_bits(array(1,4,2,3)));\n" + "> [1,2,3,4]") @UDFType(deterministic = true, stateful = false) public final class UnBitsUDF extends GenericUDF { http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/36fb839d/core/src/main/java/hivemall/tools/compress/DeflateUDF.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/hivemall/tools/compress/DeflateUDF.java b/core/src/main/java/hivemall/tools/compress/DeflateUDF.java index ca9062a..28bc370 100644 --- a/core/src/main/java/hivemall/tools/compress/DeflateUDF.java +++ b/core/src/main/java/hivemall/tools/compress/DeflateUDF.java @@ -39,9 +39,11 @@ import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspe import org.apache.hadoop.io.BytesWritable; import org.apache.hadoop.io.Text; -@Description(name = "deflate", value = "_FUNC_(TEXT data [, const int compressionLevel]) - " - + "Returns a compressed BINARY object by using Deflater.", - extended = "The compression level must be in range [-1,9]") +@Description( + name = "deflate", + value = "_FUNC_(TEXT data [, const int compressionLevel]) - " + + "Returns a compressed BINARY object by using Deflater. 
The compression level must be in range [-1,9]", + extended = "select base91(deflate('aaaaaaaaaaaaaaaabbbbccc'));\n" + "> AA+=kaIM|WTt!+wbGAA") @UDFType(deterministic = true, stateful = false) public final class DeflateUDF extends GenericUDF { http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/36fb839d/core/src/main/java/hivemall/tools/compress/InflateUDF.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/hivemall/tools/compress/InflateUDF.java b/core/src/main/java/hivemall/tools/compress/InflateUDF.java index 0021208..20079ae 100644 --- a/core/src/main/java/hivemall/tools/compress/InflateUDF.java +++ b/core/src/main/java/hivemall/tools/compress/InflateUDF.java @@ -39,7 +39,9 @@ import org.apache.hadoop.io.BytesWritable; import org.apache.hadoop.io.Text; @Description(name = "inflate", - value = "_FUNC_(BINARY compressedData) - Returns a decompressed STRING by using Inflater") + value = "_FUNC_(BINARY compressedData) - Returns a decompressed STRING by using Inflater", + extended = "select inflate(unbase91(base91(deflate('aaaaaaaaaaaaaaaabbbbccc'))));\n" + + "> aaaaaaaaaaaaaaaabbbbccc") @UDFType(deterministic = true, stateful = false) public final class InflateUDF extends GenericUDF { http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/36fb839d/core/src/main/java/hivemall/tools/list/UDAFToOrderedList.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/hivemall/tools/list/UDAFToOrderedList.java b/core/src/main/java/hivemall/tools/list/UDAFToOrderedList.java index f17d1f4..37b33c4 100644 --- a/core/src/main/java/hivemall/tools/list/UDAFToOrderedList.java +++ b/core/src/main/java/hivemall/tools/list/UDAFToOrderedList.java @@ -66,9 +66,34 @@ import org.apache.hadoop.io.IntWritable; /** * Return list of values sorted by value itself or specific key. 
*/ -@Description(name = "to_ordered_list", +@Description( + name = "to_ordered_list", value = "_FUNC_(PRIMITIVE value [, PRIMITIVE key, const string options])" - + " - Return list of values sorted by value itself or specific key") + + " - Return list of values sorted by value itself or specific key", + extended = "with t as (\n" + + " select 5 as key, 'apple' as value\n" + + " union all\n" + + " select 3 as key, 'banana' as value\n" + + " union all\n" + + " select 4 as key, 'candy' as value\n" + + " union all\n" + + " select 2 as key, 'donut' as value\n" + + " union all\n" + + " select 3 as key, 'egg' as value\n" + + ")\n" + + "select -- expected output\n" + + " to_ordered_list(value, key, '-reverse'), -- [apple, candy, (banana, egg | egg, banana), donut] (reverse order)\n" + + " to_ordered_list(value, key, '-k 2'), -- [apple, candy] (top-k)\n" + + " to_ordered_list(value, key, '-k 100'), -- [apple, candy, (banana, egg | egg, banana), donut]\n" + + " to_ordered_list(value, key, '-k 2 -reverse'), -- [donut, (banana | egg)] (reverse top-k = tail-k)\n" + + " to_ordered_list(value, key), -- [donut, (banana, egg | egg, banana), candy, apple] (natural order)\n" + + " to_ordered_list(value, key, '-k -2'), -- [donut, (banana | egg)] (tail-k)\n" + + " to_ordered_list(value, key, '-k -100'), -- [donut, (banana, egg | egg, banana), candy, apple]\n" + + " to_ordered_list(value, key, '-k -2 -reverse'), -- [apple, candy] (reverse tail-k = top-k)\n" + + " to_ordered_list(value, '-k 2'), -- [egg, donut] (alphabetically)\n" + + " to_ordered_list(key, '-k -2 -reverse'), -- [5, 4] (top-2 keys)\n" + + " to_ordered_list(key) -- [2, 3, 3, 4, 5] (natural ordered keys)\n" + + "from\n" + " t") public final class UDAFToOrderedList extends AbstractGenericUDAFResolver { @Override http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/36fb839d/core/src/main/java/hivemall/tools/map/UDAFToOrderedMap.java ---------------------------------------------------------------------- diff --git 
a/core/src/main/java/hivemall/tools/map/UDAFToOrderedMap.java b/core/src/main/java/hivemall/tools/map/UDAFToOrderedMap.java index 331874c..ba8ef82 100644 --- a/core/src/main/java/hivemall/tools/map/UDAFToOrderedMap.java +++ b/core/src/main/java/hivemall/tools/map/UDAFToOrderedMap.java @@ -53,9 +53,29 @@ import org.apache.hadoop.io.IntWritable; /** * Convert two aggregated columns into a sorted key-value map. */ -@Description(name = "to_ordered_map", +@Description( + name = "to_ordered_map", value = "_FUNC_(key, value [, const int k|const boolean reverseOrder=false]) " - + "- Convert two aggregated columns into an ordered key-value map") + + "- Convert two aggregated columns into an ordered key-value map", + extended = "with t as (\n" + + " select 10 as key, 'apple' as value\n" + + " union all\n" + + " select 3 as key, 'banana' as value\n" + + " union all\n" + + " select 4 as key, 'candy' as value\n" + + ")\n" + + "select\n" + + " to_ordered_map(key, value, true), -- {10:\"apple\",4:\"candy\",3:\"banana\"} (reverse)\n" + + " to_ordered_map(key, value, 1), -- {10:\"apple\"} (top-1)\n" + + " to_ordered_map(key, value, 2), -- {10:\"apple\",4:\"candy\"} (top-2)\n" + + " to_ordered_map(key, value, 3), -- {10:\"apple\",4:\"candy\",3:\"banana\"} (top-3)\n" + + " to_ordered_map(key, value, 100), -- {10:\"apple\",4:\"candy\",3:\"banana\"} (top-100)\n" + + " to_ordered_map(key, value), -- {3:\"banana\",4:\"candy\",10:\"apple\"} (natural)\n" + + " to_ordered_map(key, value, -1), -- {3:\"banana\"} (tail-1)\n" + + " to_ordered_map(key, value, -2), -- {3:\"banana\",4:\"candy\"} (tail-2)\n" + + " to_ordered_map(key, value, -3), -- {3:\"banana\",4:\"candy\",10:\"apple\"} (tail-3)\n" + + " to_ordered_map(key, value, -100) -- {3:\"banana\",4:\"candy\",10:\"apple\"} (tail-100)\n" + + "from t") public final class UDAFToOrderedMap extends UDAFToMap { @Override http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/36fb839d/core/src/main/java/hivemall/tools/text/Base91UDF.java 
---------------------------------------------------------------------- diff --git a/core/src/main/java/hivemall/tools/text/Base91UDF.java b/core/src/main/java/hivemall/tools/text/Base91UDF.java index 44d6b22..6f52599 100644 --- a/core/src/main/java/hivemall/tools/text/Base91UDF.java +++ b/core/src/main/java/hivemall/tools/text/Base91UDF.java @@ -39,7 +39,8 @@ import org.apache.hadoop.io.BytesWritable; import org.apache.hadoop.io.Text; @Description(name = "base91", - value = "_FUNC_(BINARY bin) - Convert the argument from binary to a BASE91 string") + value = "_FUNC_(BINARY bin) - Convert the argument from binary to a BASE91 string", + extended = "select base91(deflate('aaaaaaaaaaaaaaaabbbbccc'));\n" + "> AA+=kaIM|WTt!+wbGAA") @UDFType(deterministic = true, stateful = false) public final class Base91UDF extends GenericUDF { http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/36fb839d/core/src/main/java/hivemall/tools/text/NormalizeUnicodeUDF.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/hivemall/tools/text/NormalizeUnicodeUDF.java b/core/src/main/java/hivemall/tools/text/NormalizeUnicodeUDF.java index d34225d..0908e44 100644 --- a/core/src/main/java/hivemall/tools/text/NormalizeUnicodeUDF.java +++ b/core/src/main/java/hivemall/tools/text/NormalizeUnicodeUDF.java @@ -29,7 +29,9 @@ import org.apache.hadoop.hive.ql.udf.UDFType; @Description( name = "normalize_unicode", value = "_FUNC_(string str [, string form]) - Transforms `str` with the specified normalization form. 
" - + "The `form` takes one of NFC (default), NFD, NFKC, or NFKD") + + "The `form` takes one of NFC (default), NFD, NFKC, or NFKD", + extended = "select normalize_unicode('ï¾ï¾ï½¶ï½¸ï½¶ï¾ ','NFKC');\n" + "> ãã³ã«ã¯ã«ã\n" + "\n" + + "select normalize_unicode('ã±ã§ã¦â ¢','NFKC');\n" + "> (æ ª)ãã³ãã«III") @UDFType(deterministic = true, stateful = false) public final class NormalizeUnicodeUDF extends UDF { http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/36fb839d/core/src/main/java/hivemall/tools/text/SingularizeUDF.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/hivemall/tools/text/SingularizeUDF.java b/core/src/main/java/hivemall/tools/text/SingularizeUDF.java index 775c413..3d217e0 100644 --- a/core/src/main/java/hivemall/tools/text/SingularizeUDF.java +++ b/core/src/main/java/hivemall/tools/text/SingularizeUDF.java @@ -39,7 +39,8 @@ import org.apache.hadoop.hive.ql.udf.UDFType; // https://github.com/sundrio/sundrio/blob/95c2b11f7b842bdaa04f61e8e338aea60fb38f70/codegen/src/main/java/io/sundr/codegen/functions/Singularize.java // https://github.com/clips/pattern/blob/3eef00481a4555331cf9a099308910d977f6fc22/pattern/text/en/inflect.py#L445-L623 @Description(name = "singularize", - value = "_FUNC_(string word) - Returns singular form of a given English word") + value = "_FUNC_(string word) - Returns singular form of a given English word", + extended = "select singularize(lower(\"Apples\"));\n" + "\n" + "> \"apple\"") @UDFType(deterministic = true, stateful = false) public final class SingularizeUDF extends UDF { http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/36fb839d/core/src/main/java/hivemall/tools/text/Unbase91UDF.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/hivemall/tools/text/Unbase91UDF.java b/core/src/main/java/hivemall/tools/text/Unbase91UDF.java index 32b1098..a96b3bf 100644 --- 
a/core/src/main/java/hivemall/tools/text/Unbase91UDF.java +++ b/core/src/main/java/hivemall/tools/text/Unbase91UDF.java @@ -38,7 +38,9 @@ import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspe import org.apache.hadoop.io.BytesWritable; import org.apache.hadoop.io.Text; -@Description(name = "unbase91", value = "_FUNC_(string) - Convert a BASE91 string to a binary") +@Description(name = "unbase91", value = "_FUNC_(string) - Convert a BASE91 string to a binary", + extended = "select inflate(unbase91(base91(deflate('aaaaaaaaaaaaaaaabbbbccc'))));\n" + + "> aaaaaaaaaaaaaaaabbbbccc") @UDFType(deterministic = true, stateful = false) public final class Unbase91UDF extends GenericUDF { http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/36fb839d/core/src/main/java/hivemall/tools/text/WordNgramsUDF.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/hivemall/tools/text/WordNgramsUDF.java b/core/src/main/java/hivemall/tools/text/WordNgramsUDF.java index e4e5504..db3435a 100644 --- a/core/src/main/java/hivemall/tools/text/WordNgramsUDF.java +++ b/core/src/main/java/hivemall/tools/text/WordNgramsUDF.java @@ -34,8 +34,13 @@ import javax.annotation.Nullable; import java.util.ArrayList; import java.util.List; -@Description(name = "word_ngrams", value = "_FUNC_(array<string> words, int minSize, int maxSize])" - + " - Returns list of n-grams for given words, where `minSize <= n <= maxSize`") +@Description( + name = "word_ngrams", + value = "_FUNC_(array<string> words, int minSize, int maxSize])" + + " - Returns list of n-grams for given words, where `minSize <= n <= maxSize`", + extended = "select word_ngrams(tokenize('Machine learning is fun!', true), 1, 2);\n" + + "\n" + + "> [\"machine\",\"machine learning\",\"learning\",\"learning is\",\"is\",\"is fun\",\"fun\"]") @UDFType(deterministic = true, stateful = false) public final class WordNgramsUDF extends UDF { 
http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/36fb839d/docs/gitbook/misc/funcs.md ---------------------------------------------------------------------- diff --git a/docs/gitbook/misc/funcs.md b/docs/gitbook/misc/funcs.md index d3b1565..00d7bba 100644 --- a/docs/gitbook/misc/funcs.md +++ b/docs/gitbook/misc/funcs.md @@ -191,8 +191,6 @@ This page describes a list of Hivemall functions. See also a [list of generic Hi # Preprocessing -## Feature creation - - `add_bias(feature_vector in array<string>)` - Returns features with a bias in array<string> - `add_feature_index(ARRAY[DOUBLE]: dense feature vector)` - Returns a feature vector with feature indices @@ -217,13 +215,13 @@ This page describes a list of Hivemall functions. See also a [list of generic Hi - `build_bins(number weight, const int num_of_bins[, const boolean auto_shrink = false])` - Return quantiles representing bins: array<double> -- `feature_binning(array<features::string> features, const map<string, array<number>> quantiles_map)` / _FUNC(number weight, const array<number> quantiles) - Returns binned features as an array<features::string> / bin ID as int +- `feature_binning(array<features::string> features, const map<string, array<number>> quantiles_map)` / _FUNC_(number weight, const array<number> quantiles) - Returns binned features as an array<features::string> / bin ID as int ## Feature format conversion - `conv2dense(int feature, float weight, int nDims)` - Return a dense model in array<float> -- `quantify(boolean outout, col1, col2, ...)` - Returns an identified features +- `quantify(boolean output, col1, col2, ...)` - Returns an identified features - `to_dense_features(array<string> feature_vector, int dimensions)` - Returns a dense feature in array<float> @@ -245,7 +243,7 @@ This page describes a list of Hivemall functions. 
See also a [list of generic Hi - `feature_pairs(feature_vector in array<string>, [, const string options])` - Returns a relation <string i, string j, double xi, double xj> -- `polynomial_features(feature_vector in array<string>)` - Returns a feature vectorhaving polynominal feature space +- `polynomial_features(feature_vector in array<string>)` - Returns a feature vectorhaving polynomial feature space - `powered_features(feature_vector in array<string>, int degree [, boolean truncate])` - Returns a feature vector having a powered feature space @@ -275,13 +273,13 @@ This page describes a list of Hivemall functions. See also a [list of generic Hi ## Feature transformation and vectorization -- `add_field_indices(array<string> features)` - Returns arrays of string that field indices (<field>:<feature>)* are argumented +- `add_field_indices(array<string> features)` - Returns arrays of string that field indices (<field>:<feature>)* are augmented - `binarize_label(int/long positive, int/long negative, ...)` - Returns positive/negative records that are represented as (..., int label) where label is 0 or 1 - `categorical_features(array<string> featureNames, feature1, feature2, .. [, const string options])` - Returns a feature vector array<string> -- `ffm_features(const array<string> featureNames, feature1, feature2, .. [, const string options])` - Takes categroical variables and returns a feature vector array<string> in a libffm format <field>:<index>:<value> +- `ffm_features(const array<string> featureNames, feature1, feature2, .. [, const string options])` - Takes categorical variables and returns a feature vector array<string> in a libffm format <field>:<index>:<value> - `indexed_features(double v1, double v2, ...)` - Returns a list of features as array<string>: [1:v1, 2:v2, ..] @@ -296,7 +294,7 @@ This page describes a list of Hivemall functions. 
See also a [list of generic Hi # Geospatial functions - `haversine_distance(double lat1, double lon1, double lat2, double lon2, [const boolean mile=false])`::double - return distance between two locations in km [or miles] using `haversine` formula - ``` + ```sql Usage: select latlon_distance(lat1, lon1, lat2, lon2) from ... ``` @@ -310,10 +308,9 @@ This page describes a list of Hivemall functions. See also a [list of generic Hi Google Maps: https://www.google.com/maps/@${lat},${lon},${zoom}z ``` -- `tile(double lat, double lon, int zoom)`::bigint - Returns a tile number 2^2n where n is zoom level. - +- `tile(double lat, double lon, int zoom)`::bigint - Returns a tile number 2^2n where n is zoom level. _FUNC_(lat,lon,zoom) = xtile(lon,zoom) + ytile(lat,zoom) * 2^zoom ``` - _FUNC_(lat,lon,zoom) = xtile(lon,zoom) + ytile(lat,zoom) * 2^zoomrefer http://wiki.openstreetmap.org/wiki/Slippy_map_tilenames for detail + refer http://wiki.openstreetmap.org/wiki/Slippy_map_tilenames for detail ``` - `tilex2lon(int x, int zoom)`::double - Returns longitude of the given tile x and zoom level @@ -344,7 +341,7 @@ This page describes a list of Hivemall functions. See also a [list of generic Hi - `bbit_minhash(array<> features [, int numHashes])` - Returns a b-bits minhash value -- `minhash(ANY item, array<int|bigint|string> features [, constant string options])` - Returns n differnce k-depth signatures (i.e., clusteid) for each item <clusteid, item> +- `minhash(ANY item, array<int|bigint|string> features [, constant string options])` - Returns n different k-depth signatures (i.e., clusterid) for each item <clusterid, item> - `minhashes(array<> features [, int numHashes, int keyGroup [, boolean noWeight]])` - Returns minhash values @@ -398,8 +395,6 @@ This page describes a list of Hivemall functions. 
See also a [list of generic Hi # Ensemble learning -## Utils - - `argmin_kld(float mean, float covar)` - Returns mean or covar that minimize a KL-distance among distributions ``` The returned value is (1.0 / (sum(1.0 / covar))) * (sum(mean / covar) @@ -415,25 +410,25 @@ This page describes a list of Hivemall functions. See also a [list of generic Hi - `weight_voted_avg(expr)` - Returns an averaged value by considering sum of positive/negative weights -# Dicision trees and RandomForest +# Decision trees and RandomForest - `train_gradient_tree_boosting_classifier(array<double|string> features, int label [, string options])` - Returns a relation consists of <int iteration, int model_type, array<string> pred_models, double intercept, double shrinkage, array<double> var_importance, float oob_error_rate> -- `train_randomforest_classifier(array<double|string> features, int label [, const array<double> classWeights, const string options])` - Returns a relation consists of <int model_id, int model_type, string pred_model, array<double> var_importance, int oob_errors, int oob_tests, double weight> +- `train_randomforest_classifier(array<double|string> features, int label [, const string options, const array<double> classWeights])`- Returns a relation consists of <string model_id, double model_weight, string model, array<double> var_importance, int oob_errors, int oob_tests> - `train_randomforest_regression(array<double|string> features, double target [, string options])` - Returns a relation consists of <int model_id, int model_type, string pred_model, array<double> var_importance, int oob_errors, int oob_tests> - `guess_attribute_types(ANY, ...)` - Returns attribute types - ``` + ```sql select guess_attribute_types(*) from train limit 1; > Q,Q,C,C,C,C,Q,C,C,C,Q,C,Q,Q,Q,Q,C,Q ``` -- `rf_ensemble(int yhat [, array<double> proba [, double model_weight=1.0]])` - Returns emsebled prediction results in <int label, double probability, array<double> probabilities> +- 
`rf_ensemble(int yhat [, array<double> proba [, double model_weight=1.0]])` - Returns ensembled prediction results in <int label, double probability, array<double> probabilities> - `tree_export(string model, const string options, optional array<string> featureNames=null, optional array<string> classNames=null)` - exports a Decision Tree model as javascript/dot] -- `tree_predict(string modelId, string model, array<double|string> features [, const string options | const boolean classification=false])` - Returns a prediction result of a random forest in <int value, array<double> posteriori> for classification and <double> for regression +- `tree_predict(string modelId, string model, array<double|string> features [, const string options | const boolean classification=false])` - Returns a prediction result of a random forest in <int value, array<double> a posteriori> for classification and <double> for regression # XGBoost @@ -450,10 +445,14 @@ This page describes a list of Hivemall functions. See also a [list of generic Hi # Others - `hivemall_version()` - Returns the version of Hivemall + ```sql + Usage: SELECT hivemall_version(); + ``` - `lr_datagen(options string)` - Generates a logistic regression dataset ```sql WITH dual AS (SELECT 1) SELECT lr_datagen('-n_examples 1k -n_features 10') FROM dual; ``` -- `tf(string text)` - Return a term frequency in <string, float> \ No newline at end of file +- `tf(string text)` - Return a term frequency in <string, float> + http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/36fb839d/docs/gitbook/misc/generic_funcs.md ---------------------------------------------------------------------- diff --git a/docs/gitbook/misc/generic_funcs.md b/docs/gitbook/misc/generic_funcs.md index 3409f26..d33ab21 100644 --- a/docs/gitbook/misc/generic_funcs.md +++ b/docs/gitbook/misc/generic_funcs.md @@ -21,191 +21,174 @@ This page describes a list of useful Hivemall generic functions. 
See also a [lis <!-- toc --> -# Array functions +# Array -## Array UDFs +- `array_append(array<T> arr, T elem)` - Append an element to the end of an array + +- `array_avg(array<number>)` - Returns an array<double> in which each element is the mean of a set of numbers - `array_concat(array<ANY> x1, array<ANY> x2, ..)` - Returns a concatenated array + ```sql + select array_concat(array(1),array(2,3)); + > [1,2,3] + ``` - ```sql - select array_concat(array(1),array(2,3)); - > [1,2,3] - ``` +- `array_flatten(array<array<ANY>>)` - Returns an array with the elements flattened. - `array_intersect(array<ANY> x1, array<ANY> x2, ..)` - Returns an intersect of given arrays - - ```sql - select array_intersect(array(1,3,4),array(2,3,4),array(3,5)); - > [3] - ``` + ```sql + select array_intersect(array(1,3,4),array(2,3,4),array(3,5)); + > [3] + ``` - `array_remove(array<int|text> original, int|text|array<int> target)` - Returns an array that the target is removed from the original array + ```sql + select array_remove(array(1,null,3),array(null)); + > [3] - ```sql - select array_remove(array(1,null,3),array(null)); - > [3] + select array_remove(array("aaa","bbb"),"bbb"); + > ["aaa"] + ``` - select array_remove(array("aaa","bbb"),"bbb"); - > ["aaa"] - ``` +- `array_slice(array<ANY> values, int offset [, int length])` - Slices the given array by the given offset and length parameters. 
+ ```sql + select array_slice(array(1,2,3,4,5,6), 2,4); + > [3,4] + ``` -- `sort_and_uniq_array(array<int>)` - Takes an array of type INT and returns a sorted array in a natural order with duplicate elements eliminated +- `array_sum(array<number>)` - Returns an array<double> in which each element is summed up - ```sql - select sort_and_uniq_array(array(3,1,1,-2,10)); - > [-2,1,3,10] - ``` +- `array_union(array1, array2, ...)` - Returns the union of a set of arrays -- `subarray_endwith(array<int|text> original, int|text key)` - Returns an array that ends with the specified key +- `conditional_emit(array<boolean> conditions, array<primitive> features)` - Emit features of a row according to various conditions - ```sql - select subarray_endwith(array(1,2,3,4), 3); - > [1,2,3] - ``` +- `element_at(array<T> list, int pos)` - Returns an element at the given position -- `subarray_startwith(array<int|text> original, int|text key)` - Returns an array that starts with the specified key +- `first_element(x)` - Returns the first element in an array - ```sql - select subarray_startwith(array(1,2,3,4), 2); - > [2,3,4] - ``` +- `float_array(nDims)` - Returns an array<float> of nDims elements -- `subarray(array<int> orignal, int fromIndex, int toIndex)` - Returns a slice of the original array between the inclusive `fromIndex` and the exclusive `toIndex` +- `last_element(x)` - Return the last element in an array - ```sql - select subarray(array(1,2,3,4,5,6), 2,4); - > [3,4] - ``` +- `select_k_best(array<number> array, const array<number> importance, const int k)` - Returns selected top-k elements as array<double> -- `float_array(nDims)` - Returns an array<float> of nDims elements +- `sort_and_uniq_array(array<int>)` - Takes array<int> and returns a sorted array with duplicate elements eliminated + ```sql + select sort_and_uniq_array(array(3,1,1,-2,10)); + > [-2,1,3,10] + ``` -- `select_k_best(array<number> array, const array<number> importance, const int k)` - Returns selected top-k 
elements as array<double> +- `subarray_endwith(array<int|text> original, int|text key)` - Returns an array that ends with the specified key + ```sql + select subarray_endwith(array(1,2,3,4), 3); + > [1,2,3] + ``` + +- `subarray_startwith(array<int|text> original, int|text key)` - Returns an array that starts with the specified key + ```sql + select subarray_startwith(array(1,2,3,4), 2); + > [2,3,4] + ``` - `to_string_array(array<ANY>)` - Returns an array of strings -## Array UDAFs - -- `array_avg(array<NUMBER>)` - Returns an array<double> in which each element is the mean of a set of numbers - -- `array_sum(array<NUMBER>)` - Returns an array<double> in which each element is summed up - -## List UDAF - -- `to_ordered_list(PRIMITIVE value [, PRIMITIVE key, const string options])` or `to_ordered_list(value, key [, const string options])` - Return list of values sorted by value itself or specific key - - ```sql - with t as ( - select 5 as key, 'apple' as value - union all - select 3 as key, 'banana' as value - union all - select 4 as key, 'candy' as value - union all - select 2 as key, 'donut' as value - union all - select 3 as key, 'egg' as value - ) - select -- expected output - to_ordered_list(value, key, '-reverse'), -- [apple, candy, (banana, egg | egg, banana), donut] (reverse order) - to_ordered_list(value, key, '-k 2'), -- [apple, candy] (top-k) - to_ordered_list(value, key, '-k 100'), -- [apple, candy, (banana, egg | egg, banana), dunut] - to_ordered_list(value, key, '-k 2 -reverse'), -- [donut, (banana | egg)] (reverse top-k = tail-k) - to_ordered_list(value, key), -- [donut, (banana, egg | egg, banana), candy, apple] (natural order) - to_ordered_list(value, key, '-k -2'), -- [donut, (banana | egg)] (tail-k) - to_ordered_list(value, key, '-k -100'), -- [donut, (banana, egg | egg, banana), candy, apple] - to_ordered_list(value, key, '-k -2 -reverse'), -- [apple, candy] (reverse tail-k = top-k) - to_ordered_list(value, '-k 2'), -- [egg, donut] (alphabetically) 
- to_ordered_list(key, '-k -2 -reverse'), -- [5, 4] (top-2 keys) - to_ordered_list(key) -- [2, 3, 3, 4, 5] (natural ordered keys) - from - t - ; - ``` - -# Bitset functions - -## Bitset UDF +- `to_ordered_list(PRIMITIVE value [, PRIMITIVE key, const string options])` - Return list of values sorted by value itself or specific key + ```sql + with t as ( + select 5 as key, 'apple' as value + union all + select 3 as key, 'banana' as value + union all + select 4 as key, 'candy' as value + union all + select 2 as key, 'donut' as value + union all + select 3 as key, 'egg' as value + ) + select -- expected output + to_ordered_list(value, key, '-reverse'), -- [apple, candy, (banana, egg | egg, banana), donut] (reverse order) + to_ordered_list(value, key, '-k 2'), -- [apple, candy] (top-k) + to_ordered_list(value, key, '-k 100'), -- [apple, candy, (banana, egg | egg, banana), dunut] + to_ordered_list(value, key, '-k 2 -reverse'), -- [donut, (banana | egg)] (reverse top-k = tail-k) + to_ordered_list(value, key), -- [donut, (banana, egg | egg, banana), candy, apple] (natural order) + to_ordered_list(value, key, '-k -2'), -- [donut, (banana | egg)] (tail-k) + to_ordered_list(value, key, '-k -100'), -- [donut, (banana, egg | egg, banana), candy, apple] + to_ordered_list(value, key, '-k -2 -reverse'), -- [apple, candy] (reverse tail-k = top-k) + to_ordered_list(value, '-k 2'), -- [egg, donut] (alphabetically) + to_ordered_list(key, '-k -2 -reverse'), -- [5, 4] (top-2 keys) + to_ordered_list(key) -- [2, 3, 3, 4, 5] (natural ordered keys) + from + t + ``` + +# Map -- `to_bits(int[] indexes)` - Returns an bitset representation if the given indexes in long[] +- `map_get_sum(map<int,float> src, array<int> keys)` - Returns sum of values that are retrieved by keys - ```sql - select to_bits(array(1,2,3,128)); - >[14,-9223372036854775808] - ``` +- `map_tail_n(map SRC, int N)` - Returns the last N elements from a sorted array of SRC -- `unbits(long[] bitset)` - Returns an long array of the 
give bitset representation +- `to_map(key, value)` - Convert two aggregated columns into a key-value map - ```sql - select unbits(to_bits(array(1,4,2,3))); - > [1,2,3,4] - ``` +- `to_ordered_map(key, value [, const int k|const boolean reverseOrder=false])` - Convert two aggregated columns into an ordered key-value map + ```sql + with t as ( + select 10 as key, 'apple' as value + union all + select 3 as key, 'banana' as value + union all + select 4 as key, 'candy' as value + ) + select + to_ordered_map(key, value, true), -- {10:"apple",4:"candy",3:"banana"} (reverse) + to_ordered_map(key, value, 1), -- {10:"apple"} (top-1) + to_ordered_map(key, value, 2), -- {10:"apple",4:"candy"} (top-2) + to_ordered_map(key, value, 3), -- {10:"apple",4:"candy",3:"banana"} (top-3) + to_ordered_map(key, value, 100), -- {10:"apple",4:"candy",3:"banana"} (top-100) + to_ordered_map(key, value), -- {3:"banana",4:"candy",10:"apple"} (natural) + to_ordered_map(key, value, -1), -- {3:"banana"} (tail-1) + to_ordered_map(key, value, -2), -- {3:"banana",4:"candy"} (tail-2) + to_ordered_map(key, value, -3), -- {3:"banana",4:"candy",10:"apple"} (tail-3) + to_ordered_map(key, value, -100) -- {3:"banana",4:"candy",10:"apple"} (tail-100) + from t + ``` + +# Bitset + +- `bits_collect(int|long x)` - Returns a bitset in array<long> - `bits_or(array<long> b1, array<long> b2, ..)` - Returns a logical OR given bitsets + ```sql + select unbits(bits_or(to_bits(array(1,4)),to_bits(array(2,3)))); + > [1,2,3,4] + ``` - ```sql - select unbits(bits_or(to_bits(array(1,4)),to_bits(array(2,3)))); - > [1,2,3,4] - ``` - -## Bitset UDAF - -- `bits_collect(int|long x)` - Returns a bitset in array<long> +- `to_bits(int[] indexes)` - Returns a bitset representation of the given indexes in long[] + ```sql + select to_bits(array(1,2,3,128)); + > [14,-9223372036854775808] + ``` -# Compression functions +- `unbits(long[] bitset)` - Returns a long array of the given bitset representation + ```sql + select
unbits(to_bits(array(1,4,2,3))); + > [1,2,3,4] + ``` -- `deflate(TEXT data [, const int compressionLevel])` - Returns a compressed BINARY object by using Deflater. -The compression level must be in range [-1,9] +# Compression - ```sql - select base91(deflate('aaaaaaaaaaaaaaaabbbbccc')); - > AA+=kaIM|WTt!+wbGAA - ``` +- `deflate(TEXT data [, const int compressionLevel])` - Returns a compressed BINARY object by using Deflater. The compression level must be in range [-1,9] + ```sql + select base91(deflate('aaaaaaaaaaaaaaaabbbbccc')); + > AA+=kaIM|WTt!+wbGAA + ``` - `inflate(BINARY compressedData)` - Returns a decompressed STRING by using Inflater + ```sql + select inflate(unbase91(base91(deflate('aaaaaaaaaaaaaaaabbbbccc')))); + > aaaaaaaaaaaaaaaabbbbccc + ``` - ```sql - select inflate(unbase91(base91(deflate('aaaaaaaaaaaaaaaabbbbccc')))); - > aaaaaaaaaaaaaaaabbbbccc - ``` - -# Map functions - -## Map UDFs - -- `map_get_sum(map<int,float> src, array<int> keys)` - Returns sum of values that are retrieved by keys - -- `map_tail_n(map SRC, int N)` - Returns the last N elements from a sorted array of SRC - -## MAP UDAFs - -- `to_map(key, value)` - Convert two aggregated columns into a key-value map - -- `to_ordered_map(key, value [, const int k|const boolean reverseOrder=false])` - Convert two aggregated columns into an ordered key-value map - - ```sql - with t as ( - select 10 as key, 'apple' as value - union all - select 3 as key, 'banana' as value - union all - select 4 as key, 'candy' as value - ) - select - to_ordered_map(key, value, true), -- {10:"apple",4:"candy",3:"banana"} (reverse) - to_ordered_map(key, value, 1), -- {10:"apple"} (top-1) - to_ordered_map(key, value, 2), -- {10:"apple",4:"candy"} (top-2) - to_ordered_map(key, value, 3), -- {10:"apple",4:"candy",3:"banana"} (top-3) - to_ordered_map(key, value, 100), -- {10:"apple",4:"candy",3:"banana"} (top-100) - to_ordered_map(key, value), -- {3:"banana",4:"candy",10:"apple"} (natural) - to_ordered_map(key, 
value, -1), -- {3:"banana"} (tail-1) - to_ordered_map(key, value, -2), -- {3:"banana",4:"candy"} (tail-2) - to_ordered_map(key, value, -3), -- {3:"banana",4:"candy",10:"apple"} (tail-3) - to_ordered_map(key, value, -100) -- {3:"banana",4:"candy",10:"apple"} (tail-100) - from t - ; - ``` - -# MapReduce functions +# MapReduce - `distcache_gets(filepath, key, default_value [, parseKey])` - Returns map<key_type, value_type>|value_type @@ -216,100 +199,91 @@ The compression level must be in range [-1,9] - `rowid()` - Returns a generated row id of a form {TASK_ID}-{SEQUENCE_NUMBER} - `rownum()` - Returns a generated row number in long + ``` + returns sprintf(`%d%04d`,sequence,taskId) as long + ``` - `taskid()` - Returns the value of mapred.task.partition -# Math functions +# Math - `l2_norm(double xi)` - Return L2 norm of a vector which has the given values in each dimension -- `sigmoid(x)` - Returns `1.0 / (1.0 + exp(-x))` +- `sigmoid(x)` - Returns 1.0 / (1.0 + exp(-x)) -# Matrix functions +# Matrix - `transpose_and_dot(array<number> matrix0_row, array<number> matrix1_row)` - Returns dot(matrix0.T, matrix1) as array<array<double>>, shape = (matrix0.#cols, matrix1.#cols) -# Text processing functions +# Text processing -- `base91(binary)` - Convert the argument from binary to a BASE91 string +- `base91(BINARY bin)` - Convert the argument from binary to a BASE91 string + ```sql + select base91(deflate('aaaaaaaaaaaaaaaabbbbccc')); + > AA+=kaIM|WTt!+wbGAA + ``` - ```sql - select base91(deflate('aaaaaaaaaaaaaaaabbbbccc')); - > AA+=kaIM|WTt!+wbGAA - ``` - -- `unbase91(string)` - Convert a BASE91 string to a binary - - ```sql - select inflate(unbase91(base91(deflate('aaaaaaaaaaaaaaaabbbbccc')))); - > aaaaaaaaaaaaaaaabbbbccc - ``` +- `is_stopword(string word)` - Returns whether English stopword or not - `normalize_unicode(string str [, string form])` - Transforms `str` with the specified normalization form. 
The `form` takes one of NFC (default), NFD, NFKC, or NFKD + ```sql + select normalize_unicode('ﾊﾝｶｸｶﾅ','NFKC'); + > ハンカクカナ - ```sql - select normalize_unicode('ﾊﾝｶｸｶﾅ','NFKC'); - > ハンカクカナ - - select normalize_unicode('㈱㌧㌦Ⅲ','NFKC'); - > (株)トンドルIII - ``` - -- `split_words(string query [, string regex])` - Returns an array<text> containing splitted strings - -- `is_stopword(string word)` - Returns whether English stopword or not + select normalize_unicode('㈱㌧㌦Ⅲ','NFKC'); + > (株)トンドルIII + ``` - `singularize(string word)` - Returns singular form of a given English word + ```sql + select singularize(lower("Apples")); - ```sql - select singularize(lower("Apples")); - - > "apple" - ``` - -- `tokenize(string englishText [, boolean toLowerCase])` - Returns words in array<string> - -- `tokenize_ja(String line [, const string mode = "normal", const list<string> stopWords, const list<string> stopTags])` - returns tokenized Japanese string in array<string>. Refer [this article](../misc/tokenizer.html) for detail. - - ```sql - select tokenize_ja("kuromojiを使った分かち書きのテストです。第二引数にはnormal/search/extendedを指定できます。デフォルトではnormalモードです。"); + > "apple" + ``` - > ["kuromoji","使う","分かち書き","テスト","第","二","引数","normal","search","extended","指定","デフォルト","normal"," モード"] - ``` +- `split_words(string query [, string regex])` - Returns an array<text> containing split strings -- `tokenize_cn(String line [, const list<string> stopWords])` - returns tokenized Chinese string in array<string>. Refer [this article](../misc/tokenizer.html) for detail.
+- `tokenize(string englishText [, boolean toLowerCase])` - Returns tokenized words in array<string> -- `word_ngrams(array<string> words, int minSize, int maxSize)` - Returns list of n-grams where `minSize <= n <= maxSize` +- `unbase91(string)` - Convert a BASE91 string to a binary + ```sql + select inflate(unbase91(base91(deflate('aaaaaaaaaaaaaaaabbbbccc')))); + > aaaaaaaaaaaaaaaabbbbccc + ``` - ```sql - select word_ngrams(tokenize('Machine learning is fun!', true), 1, 2); +- `word_ngrams(array<string> words, int minSize, int maxSize)` - Returns list of n-grams for given words, where `minSize <= n <= maxSize` + ```sql + select word_ngrams(tokenize('Machine learning is fun!', true), 1, 2); - > ["machine","machine learning","learning","learning is","is","is fun","fun"] - ``` + > ["machine","machine learning","learning","learning is","is","is fun","fun"] + ``` -# Other functions +# Others - `convert_label(const int|const float)` - Convert from -1|1 to 0.0f|1.0f, or from 0.0f|1.0f to -1|1 -- `each_top_k(int K, Object group, double cmpKey, *)` - Returns top-K values (or tail-K values when k is less than 0). Refer [this article](../misc/topk.html) for detail. - -- `generate_series(const int|bigint start, const int|bigint end)` - Generate a series of values, from start to end - - ```sql - select generate_series(1,9); - - 1 - 2 - 3 - 4 - 5 - 6 - 7 - 8 - 9 - ``` - - A similar function to PostgreSQL's `generate_serics`. - http://www.postgresql.org/docs/current/static/functions-srf.html +- `each_top_k(int K, Object group, double cmpKey, *)` - Returns top-K values (or tail-K values when k is less than 0) + +- `generate_series(const int|bigint start, const int|bigint end)` - Generate a series of values, from start to end. A similar function to PostgreSQL's `generate_series`.
http://www.postgresql.org/docs/current/static/functions-srf.html + ```sql + select generate_series(1,9); + + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9 + ``` + +- `try_cast(ANY src, const string typeName)` - Explicitly cast a value as a type. Returns null if cast fails. + ```sql + Usage: select try_cast(array(1.0,2.0,3.0), 'array<string>') + select try_cast(map('A',10,'B',20,'C',30), 'map<string,double>') + ``` - `x_rank(KEY)` - Generates a pseudo sequence number starting from 1 for each key + http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/36fb839d/nlp/src/main/java/hivemall/nlp/tokenizer/KuromojiUDF.java ---------------------------------------------------------------------- diff --git a/nlp/src/main/java/hivemall/nlp/tokenizer/KuromojiUDF.java b/nlp/src/main/java/hivemall/nlp/tokenizer/KuromojiUDF.java index 89cf2c8..96d9e4b 100644 --- a/nlp/src/main/java/hivemall/nlp/tokenizer/KuromojiUDF.java +++ b/nlp/src/main/java/hivemall/nlp/tokenizer/KuromojiUDF.java @@ -62,7 +62,10 @@ import org.apache.lucene.analysis.util.CharArraySet; @Description( name = "tokenize_ja", value = "_FUNC_(String line [, const string mode = \"normal\", const array<string> stopWords, const array<string> stopTags, const array<string> userDict (or string userDictURL)])" - + " - returns tokenized strings in array<string>") + + " - returns tokenized strings in array<string>", + extended = "select tokenize_ja(\"kuromojiを使った分かち書きのテストです。第二引数にはnormal/search/extendedを指定できます。デフォルトではnormalモードです。\");\n" + + "\n" + + "> [\"kuromoji\",\"使う\",\"分かち書き\",\"テスト\",\"第\",\"二\",\"引数\",\"normal\",\"search\",\"extended\",\"指定\",\"デフォルト\",\"normal\",\" モード\"]\n") @UDFType(deterministic = true, stateful = false) public final class KuromojiUDF extends GenericUDF { private static final int CONNECT_TIMEOUT_MS = 10000; // 10 sec http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/36fb839d/pom.xml ---------------------------------------------------------------------- diff
--git a/pom.xml b/pom.xml index 1b5c8e6..35e3767 100644 --- a/pom.xml +++ b/pom.xml @@ -251,6 +251,7 @@ <module>mixserv</module> <module>spark</module> <module>dist</module> + <module>tools</module> </modules> <properties> http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/36fb839d/tools/hivemall-docs/pom.xml ---------------------------------------------------------------------- diff --git a/tools/hivemall-docs/pom.xml b/tools/hivemall-docs/pom.xml new file mode 100644 index 0000000..fc49b84 --- /dev/null +++ b/tools/hivemall-docs/pom.xml @@ -0,0 +1,120 @@ +<!-- + Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. 
+--> +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + + <parent> + <groupId>org.apache.hivemall</groupId> + <artifactId>hivemall-tools</artifactId> + <version>0.5.1-incubating-SNAPSHOT</version> + <relativePath>../pom.xml</relativePath> + </parent> + + <!-- mvn org.apache.hivemall:hivemall-docs:generate-funcs-list --> + <artifactId>hivemall-docs</artifactId> + <name>Hivemall Documentation Tool</name> + <packaging>maven-plugin</packaging> + + <properties> + <main.basedir>${project.parent.parent.basedir}</main.basedir> + </properties> + + <dependencies> + <!-- provided scope --> + <dependency> + <groupId>org.apache.maven</groupId> + <artifactId>maven-core</artifactId> + <version>3.5.2</version> + <scope>provided</scope> + </dependency> + <dependency> + <groupId>org.apache.maven</groupId> + <artifactId>maven-plugin-api</artifactId> + <version>3.5.2</version> + <scope>provided</scope> + </dependency> + <dependency> + <groupId>org.apache.maven.plugin-tools</groupId> + <artifactId>maven-plugin-annotations</artifactId> + <version>3.4</version> + <scope>provided</scope> + </dependency> + + <!-- hivemall dependencies --> + <dependency> + <groupId>org.apache.hivemall</groupId> + <artifactId>hivemall-core</artifactId> + <version>${project.version}</version> + <scope>compile</scope> + </dependency> + <dependency> + <groupId>org.apache.hivemall</groupId> + <artifactId>hivemall-nlp</artifactId> + <version>${project.version}</version> + <scope>compile</scope> + </dependency> + <dependency> + <groupId>org.apache.hivemall</groupId> + <artifactId>hivemall-xgboost</artifactId> + <version>${project.version}</version> + <scope>compile</scope> + </dependency> + + <!-- compile scope --> + <dependency> + <groupId>org.apache.hive</groupId> + <artifactId>hive-exec</artifactId> + 
<scope>compile</scope> + </dependency> + <dependency> + <groupId>com.google.guava</groupId> + <artifactId>guava</artifactId> + <version>${guava.version}</version> + <scope>compile</scope> + </dependency> + <dependency> + <groupId>org.reflections</groupId> + <artifactId>reflections</artifactId> + <version>0.9.10</version> + <scope>compile</scope> + </dependency> + </dependencies> + + <build> + <plugins> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-plugin-plugin</artifactId> + <version>3.4</version> + <executions> + <execution> + <id>default-descriptor</id> + <phase>process-classes</phase> + </execution> + <execution> + <id>generate-helpmojo</id> + <goals> + <goal>helpmojo</goal> + </goals> + </execution> + </executions> + </plugin> + </plugins> + </build> +</project> http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/36fb839d/tools/hivemall-docs/src/main/java/hivemall/docs/FuncsListGenerator.java ---------------------------------------------------------------------- diff --git a/tools/hivemall-docs/src/main/java/hivemall/docs/FuncsListGenerator.java b/tools/hivemall-docs/src/main/java/hivemall/docs/FuncsListGenerator.java new file mode 100644 index 0000000..187c752 --- /dev/null +++ b/tools/hivemall-docs/src/main/java/hivemall/docs/FuncsListGenerator.java @@ -0,0 +1,247 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package hivemall.docs; + +import hivemall.docs.utils.MarkdownUtils; +import hivemall.utils.lang.StringUtils; + +import org.apache.maven.plugin.AbstractMojo; +import org.apache.maven.execution.MavenSession; +import org.apache.maven.plugin.MojoExecutionException; +import org.apache.maven.plugins.annotations.Mojo; +import org.apache.hadoop.hive.ql.exec.Description; +import org.apache.maven.plugins.annotations.Parameter; +import org.reflections.Reflections; + +import javax.annotation.Nonnull; +import java.io.BufferedReader; +import java.io.File; +import java.io.FileNotFoundException; +import java.io.FileReader; +import java.io.IOException; +import java.io.PrintWriter; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.LinkedHashMap; +import java.util.TreeSet; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import static org.apache.commons.lang.StringEscapeUtils.escapeHtml; + +/** + * Generate a list of UDFs for documentation. 
+ * + * @link https://hivemall.incubator.apache.org/userguide/misc/generic_funcs.html + * @link https://hivemall.incubator.apache.org/userguide/misc/funcs.html + */ +@Mojo(name = "generate-funcs-list") +public class FuncsListGenerator extends AbstractMojo { + + @Parameter(defaultValue = "${basedir}", readonly = true) + private File basedir; + + @Parameter(defaultValue = "${session}", readonly = true) + private MavenSession session; + + @Parameter(defaultValue = "docs/gitbook/misc/generic_funcs.md") + private String pathToGenericFuncs; + + @Parameter(defaultValue = "docs/gitbook/misc/funcs.md") + private String pathToFuncs; + + private static final Map<String, List<String>> genericFuncsHeaders = new LinkedHashMap<>(); + static { + genericFuncsHeaders.put("# Array", + Arrays.asList("hivemall.tools.array", "hivemall.tools.list")); + genericFuncsHeaders.put("# Map", Collections.singletonList("hivemall.tools.map")); + genericFuncsHeaders.put("# Bitset", Collections.singletonList("hivemall.tools.bits")); + genericFuncsHeaders.put("# Compression", + Collections.singletonList("hivemall.tools.compress")); + genericFuncsHeaders.put("# MapReduce", Collections.singletonList("hivemall.tools.mapred")); + genericFuncsHeaders.put("# Math", Collections.singletonList("hivemall.tools.math")); + genericFuncsHeaders.put("# Matrix", Collections.singletonList("hivemall.tools.matrix")); + genericFuncsHeaders.put("# Text processing", + Collections.singletonList("hivemall.tools.text")); + genericFuncsHeaders.put("# Others", Collections.singletonList("hivemall.tools")); + } + + private static final Map<String, List<String>> funcsHeaders = new LinkedHashMap<>(); + static { + funcsHeaders.put("# Regression", Collections.singletonList("hivemall.regression")); + funcsHeaders.put("# Classification", Collections.<String>emptyList()); + funcsHeaders.put("## Binary classification", + Collections.singletonList("hivemall.classifier")); + funcsHeaders.put("## Multiclass classification", + 
Collections.singletonList("hivemall.classifier.multiclass")); + funcsHeaders.put("# Matrix factorization", Collections.singletonList("hivemall.mf")); + funcsHeaders.put("# Factorization machines", Collections.singletonList("hivemall.fm")); + funcsHeaders.put("# Recommendation", Collections.singletonList("hivemall.recommend")); + funcsHeaders.put("# Anomaly detection", Collections.singletonList("hivemall.anomaly")); + funcsHeaders.put("# Topic modeling", Collections.singletonList("hivemall.topicmodel")); + funcsHeaders.put("# Preprocessing", Collections.singletonList("hivemall.ftvec")); + funcsHeaders.put("## Data amplification", + Collections.singletonList("hivemall.ftvec.amplify")); + funcsHeaders.put("## Feature binning", Collections.singletonList("hivemall.ftvec.binning")); + funcsHeaders.put("## Feature format conversion", + Collections.singletonList("hivemall.ftvec.conv")); + funcsHeaders.put("## Feature hashing", Collections.singletonList("hivemall.ftvec.hashing")); + funcsHeaders.put("## Feature paring", Collections.singletonList("hivemall.ftvec.pairing")); + funcsHeaders.put("## Ranking", Collections.singletonList("hivemall.ftvec.ranking")); + funcsHeaders.put("## Feature scaling", Collections.singletonList("hivemall.ftvec.scaling")); + funcsHeaders.put("## Feature selection", + Collections.singletonList("hivemall.ftvec.selection")); + funcsHeaders.put("## Feature transformation and vectorization", + Collections.singletonList("hivemall.ftvec.trans")); + funcsHeaders.put("# Geospatial functions", Collections.singletonList("hivemall.geospatial")); + funcsHeaders.put("# Distance measures", Collections.singletonList("hivemall.knn.distance")); + funcsHeaders.put("# Locality-sensitive hashing", + Collections.singletonList("hivemall.knn.lsh")); + funcsHeaders.put("# Similarity measures", + Collections.singletonList("hivemall.knn.similarity")); + funcsHeaders.put("# Evaluation", Collections.singletonList("hivemall.evaluation")); + funcsHeaders.put("# Sketching", 
Collections.singletonList("hivemall.sketch.hll")); + funcsHeaders.put("# Ensemble learning", Collections.singletonList("hivemall.ensemble")); + funcsHeaders.put("## Bagging", Collections.singletonList("hivemall.ensemble.bagging")); + funcsHeaders.put("# Decision trees and RandomForest", Arrays.asList( + "hivemall.smile.classification", "hivemall.smile.regression", "hivemall.smile.tools")); + funcsHeaders.put("# XGBoost", Arrays.asList("hivemall.xgboost.classification", + "hivemall.xgboost.regression", "hivemall.xgboost.tools")); + funcsHeaders.put("# Others", + Arrays.asList("hivemall", "hivemall.dataset", "hivemall.ftvec.text")); + } + + @Override + public void execute() throws MojoExecutionException { + if (!isReactorRootProject()) { + // output only once across the projects + return; + } + + generate( + new File(basedir, pathToGenericFuncs), + "This page describes a list of useful Hivemall generic functions. See also a [list of machine-learning-related functions](./funcs.md).", + genericFuncsHeaders); + generate( + new File(basedir, pathToFuncs), + "This page describes a list of Hivemall functions. 
See also a [list of generic Hivemall functions](./generic_funcs.md) for more general-purpose functions such as array and map UDFs.", + funcsHeaders); + } + + private boolean isReactorRootProject() { + return session.getExecutionRootDirectory().equalsIgnoreCase(basedir.toString()); + } + + private void generate(@Nonnull File outputFile, @Nonnull String preface, + @Nonnull Map<String, List<String>> headers) throws MojoExecutionException { + Reflections reflections = new Reflections("hivemall"); + Set<Class<?>> annotatedClasses = reflections.getTypesAnnotatedWith(Description.class); + + StringBuilder sb = new StringBuilder(); + Map<String, Set<String>> packages = new HashMap<>(); + + Pattern func = Pattern.compile("_FUNC_(\\(.*?\\))(.*)", Pattern.DOTALL); + + for (Class<?> annotatedClass : annotatedClasses) { + Deprecated deprecated = annotatedClass.getAnnotation(Deprecated.class); + if (deprecated != null) { + continue; + } + + Description description = annotatedClass.getAnnotation(Description.class); + + String value = description.value().replaceAll("\n", " "); + Matcher matcher = func.matcher(value); + if (matcher.find()) { + value = MarkdownUtils.asInlineCode(description.name() + matcher.group(1)) + + escapeHtml(matcher.group(2)); + } + sb.append(MarkdownUtils.asListElement(value)); + + StringBuilder sbExtended = new StringBuilder(); + if (!description.extended().isEmpty()) { + sbExtended.append(description.extended()); + sb.append("\n"); + } + + String extended = sbExtended.toString(); + if (!extended.isEmpty()) { + if (extended.toLowerCase().contains("select")) { // extended description contains SQL statements + sb.append(MarkdownUtils.indent(MarkdownUtils.asCodeBlock(extended, "sql"))); + } else { + sb.append(MarkdownUtils.indent(MarkdownUtils.asCodeBlock(extended))); + } + } else { + sb.append("\n"); + } + + String packageName = annotatedClass.getPackage().getName(); + if (!packages.containsKey(packageName)) { + Set<String> set = new TreeSet<>(); + 
packages.put(packageName, set); + } + Set<String> List = packages.get(packageName); + List.add(sb.toString()); + + StringUtils.clear(sb); + } + + PrintWriter writer; + try { + writer = new PrintWriter(outputFile); + } catch (FileNotFoundException e) { + throw new MojoExecutionException("Output file is not found"); + } + + // license header + writer.println("<!--"); + try { + File licenseFile = new File(basedir, "resources/license-header.txt"); + FileReader fileReader = new FileReader(licenseFile); + BufferedReader bufferedReader = new BufferedReader(fileReader); + String line; + while ((line = bufferedReader.readLine()) != null) { + writer.println(MarkdownUtils.indent(line)); + } + } catch (IOException e) { + throw new MojoExecutionException("Failed to read license file"); + } + writer.println("-->\n"); + + writer.println(preface); + + writer.println("\n<!-- toc -->\n"); + + for (Map.Entry<String, List<String>> e : headers.entrySet()) { + writer.println(e.getKey() + "\n"); + List<String> packageNames = e.getValue(); + for (String packageName : packageNames) { + for (String desc : packages.get(packageName)) { + writer.println(desc); + } + } + } + + writer.close(); + } +} http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/36fb839d/tools/hivemall-docs/src/main/java/hivemall/docs/utils/MarkdownUtils.java ---------------------------------------------------------------------- diff --git a/tools/hivemall-docs/src/main/java/hivemall/docs/utils/MarkdownUtils.java b/tools/hivemall-docs/src/main/java/hivemall/docs/utils/MarkdownUtils.java new file mode 100644 index 0000000..677e6a7 --- /dev/null +++ b/tools/hivemall-docs/src/main/java/hivemall/docs/utils/MarkdownUtils.java @@ -0,0 +1,58 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

/**
 * Static helpers that wrap plain strings into Markdown constructs (declared in package
 * {@code hivemall.docs.utils}).
 *
 * <p>None of the methods escapes its argument; callers pass text that is already safe to embed.
 */
public class MarkdownUtils {

    /**
     * One indentation level: two spaces, i.e. the width of the {@code "- "} list marker, so
     * that indented text lines up with the content of a list element produced by
     * {@link #asListElement(String)} and is rendered as part of it.
     */
    private static final String TAB = "  ";

    /** A line break (group 1) directly followed by a non-empty line (group 2). */
    private static final java.util.regex.Pattern BREAK_BEFORE_TEXT =
            java.util.regex.Pattern.compile("(\\r\\n|\\r|\\n)(.+)");

    /** Replacement for {@link #BREAK_BEFORE_TEXT} that re-emits the break and indents the line. */
    private static final String INDENTED_LINE = "$1" + TAB + "$2";

    /**
     * Indents a possibly multi-line string by one level.
     *
     * @param s text to indent
     * @return {@code s} with the first line and every non-empty following line prefixed by the
     *         indentation; empty lines and line terminators are kept as they are, and an empty
     *         string is returned unchanged
     */
    public static String indent(final String s) {
        if (s.isEmpty()) {
            return s;
        }
        return TAB + BREAK_BEFORE_TEXT.matcher(s).replaceAll(INDENTED_LINE);
    }

    /** Wraps {@code s} in {@code **} (bold). */
    public static String asBold(final String s) {
        return "**" + s + "**";
    }

    /** Wraps {@code s} in backticks (inline code). */
    public static String asInlineCode(final String s) {
        return "`" + s + "`";
    }

    /** Prefixes {@code s} with the {@code "- "} list marker. */
    public static String asListElement(final String s) {
        return "- " + s;
    }

    /** Wraps {@code s} in a fenced code block without a language tag. */
    public static String asCodeBlock(final String s) {
        return asCodeBlock(s, "");
    }

    /**
     * Wraps {@code s} in a fenced code block.
     *
     * @param s content of the block
     * @param lang language tag placed after the opening fence; may be empty
     * @return the block, terminated by a newline after the closing fence
     */
    public static String asCodeBlock(final String s, final String lang) {
        return "```" + lang + "\n" + s + "\n```\n";
    }

    /**
     * Builds an ATX header line such as {@code "## Title\n"}.
     *
     * @param s header text
     * @param level number of leading {@code #} characters, at least 1
     * @return the header line including the trailing newline
     * @throws IllegalArgumentException if {@code level} is smaller than 1
     */
    public static String asHeader(final String s, int level) {
        if (level < 1) {
            throw new IllegalArgumentException("Header level must be at least 1: " + level);
        }
        final StringBuilder buf = new StringBuilder(level + s.length() + 2);
        for (int i = 0; i < level; i++) {
            buf.append('#');
        }
        return buf.append(' ').append(s).append('\n').toString();
    }
}
See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. +--> +<!-- Aggregator module (packaging "pom") for Hivemall tooling. Its only submodule is hivemall-docs, and main.basedir is set to the parent project's base directory. --> +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + + <parent> + <groupId>org.apache.hivemall</groupId> + <artifactId>hivemall</artifactId> + <version>0.5.1-incubating-SNAPSHOT</version> + <relativePath>../pom.xml</relativePath> + </parent> + + <artifactId>hivemall-tools</artifactId> + <name>Hivemall Tools</name> + <packaging>pom</packaging> + + <modules> + <module>hivemall-docs</module> + </modules> + + <properties> + <main.basedir>${project.parent.basedir}</main.basedir> + </properties> +</project>
