incubator-hivemall git commit: [HIVEMALL-193] Implement a tool for generating a list of Hivemall UDFs

myui Wed, 25 Apr 2018 00:46:43 -0700

Repository: incubator-hivemall
Updated Branches:
  refs/heads/master 8f973c5b2 -> 36fb839d9



[HIVEMALL-193] Implement a tool for generating a list of Hivemall UDFs

## What changes were proposed in this pull request?

Automatically generate a list of UDFs for:

- https://hivemall.incubator.apache.org/userguide/misc/funcs.html
- https://hivemall.incubator.apache.org/userguide/misc/generic_funcs.html

Initial mock implementation: https://github.com/takuti/hivemalldoc

## What type of PR is it?

Improvement, Documentation

## What is the Jira issue?

https://issues.apache.org/jira/browse/HIVEMALL-193

## How was this patch tested?

See output: https://gist.github.com/takuti/312d3a11bf85fc4044399d7e97a06f13

## How to use this feature?

```
$ mvn clean package -DskipTests=true -Dmaven.test.skip=true
$ mvn org.apache.hivemall:hivemall-docs:generate-funcs-list
```

## Checklist

- [x] Did you apply source code formatter, i.e., `mvn formatter:format`, for 
your commit?

Author: Takuya Kitazawa <[email protected]>

Closes #148 from takuti/HIVEMALL-193.


Project: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/repo
Commit: 
http://git-wip-us.apache.org/repos/asf/incubator-hivemall/commit/36fb839d
Tree: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/tree/36fb839d
Diff: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/diff/36fb839d

Branch: refs/heads/master
Commit: 36fb839d935fcf05a4646bed9187e2af98b37232
Parents: 8f973c5
Author: Takuya Kitazawa <[email protected]>
Authored: Wed Apr 25 16:46:01 2018 +0900
Committer: Makoto Yui <[email protected]>
Committed: Wed Apr 25 16:46:01 2018 +0900

----------------------------------------------------------------------
 .../smile/tools/GuessAttributesUDF.java         |   6 +-
 .../java/hivemall/tools/GenerateSeriesUDTF.java |   7 +-
 .../hivemall/tools/array/ArrayConcatUDF.java    |   3 +-
 .../hivemall/tools/array/ArrayIntersectUDF.java |   3 +-
 .../hivemall/tools/array/ArrayRemoveUDF.java    |   4 +-
 .../hivemall/tools/array/ArraySliceUDF.java     |   3 +-
 .../tools/array/SortAndUniqArrayUDF.java        |   3 +-
 .../tools/array/SubarrayEndWithUDF.java         |   3 +-
 .../tools/array/SubarrayStartWithUDF.java       |   3 +-
 .../java/hivemall/tools/bits/BitsORUDF.java     |   4 +-
 .../java/hivemall/tools/bits/ToBitsUDF.java     |   3 +-
 .../java/hivemall/tools/bits/UnBitsUDF.java     |   3 +-
 .../hivemall/tools/compress/DeflateUDF.java     |   8 +-
 .../hivemall/tools/compress/InflateUDF.java     |   4 +-
 .../hivemall/tools/list/UDAFToOrderedList.java  |  29 +-
 .../hivemall/tools/map/UDAFToOrderedMap.java    |  24 +-
 .../java/hivemall/tools/text/Base91UDF.java     |   3 +-
 .../tools/text/NormalizeUnicodeUDF.java         |   4 +-
 .../hivemall/tools/text/SingularizeUDF.java     |   3 +-
 .../java/hivemall/tools/text/Unbase91UDF.java   |   4 +-
 .../java/hivemall/tools/text/WordNgramsUDF.java |   9 +-
 docs/gitbook/misc/funcs.md                      |  39 +-
 docs/gitbook/misc/generic_funcs.md              | 408 +++++++++----------
 .../hivemall/nlp/tokenizer/KuromojiUDF.java     |   5 +-
 pom.xml                                         |   1 +
 tools/hivemall-docs/pom.xml                     | 120 ++++++
 .../java/hivemall/docs/FuncsListGenerator.java  | 247 +++++++++++
 .../java/hivemall/docs/utils/MarkdownUtils.java |  58 +++
 tools/pom.xml                                   |  40 ++
 29 files changed, 784 insertions(+), 267 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/36fb839d/core/src/main/java/hivemall/smile/tools/GuessAttributesUDF.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/hivemall/smile/tools/GuessAttributesUDF.java 
b/core/src/main/java/hivemall/smile/tools/GuessAttributesUDF.java
index bc2794e..5d79a18 100644
--- a/core/src/main/java/hivemall/smile/tools/GuessAttributesUDF.java
+++ b/core/src/main/java/hivemall/smile/tools/GuessAttributesUDF.java
@@ -31,9 +31,9 @@ import 
org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
 import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils;
 import 
org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
 
-@Description(name = "guess_attribute_types", value = "_FUNC_(ANY, ...) - 
Returns attribute types"
-        + "\nselect guess_attribute_types(*) from train limit 1;"
-        + "\n> Q,Q,C,C,C,C,Q,C,C,C,Q,C,Q,Q,Q,Q,C,Q")
+@Description(name = "guess_attribute_types", value = "_FUNC_(ANY, ...) - 
Returns attribute types",
+        extended = "select guess_attribute_types(*) from train limit 1;"
+                + "\n> Q,Q,C,C,C,C,Q,C,C,C,Q,C,Q,Q,Q,Q,C,Q")
 @UDFType(deterministic = true, stateful = false)
 public final class GuessAttributesUDF extends GenericUDF {
 

http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/36fb839d/core/src/main/java/hivemall/tools/GenerateSeriesUDTF.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/hivemall/tools/GenerateSeriesUDTF.java 
b/core/src/main/java/hivemall/tools/GenerateSeriesUDTF.java
index 47ac427..9d9b150 100644
--- a/core/src/main/java/hivemall/tools/GenerateSeriesUDTF.java
+++ b/core/src/main/java/hivemall/tools/GenerateSeriesUDTF.java
@@ -31,9 +31,12 @@ import 
org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
 import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
 import 
org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
 
-@Description(name = "generate_series",
+@Description(
+        name = "generate_series",
         value = "_FUNC_(const int|bigint start, const int|bigint end) - "
-                + "Generate a series of values, from start to end")
+                + "Generate a series of values, from start to end. A similar 
function to PostgreSQL's `generate_series`. 
http://www.postgresql.org/docs/current/static/functions-srf.html",
+        extended = "select generate_series(1,9);\n\n" + "1\n" + "2\n" + "3\n" 
+ "4\n" + "5\n"
+                + "6\n" + "7\n" + "8\n" + "9")
 public final class GenerateSeriesUDTF extends GenericUDTF {
 
     private long start, end;

http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/36fb839d/core/src/main/java/hivemall/tools/array/ArrayConcatUDF.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/hivemall/tools/array/ArrayConcatUDF.java 
b/core/src/main/java/hivemall/tools/array/ArrayConcatUDF.java
index baeca60..223d69a 100644
--- a/core/src/main/java/hivemall/tools/array/ArrayConcatUDF.java
+++ b/core/src/main/java/hivemall/tools/array/ArrayConcatUDF.java
@@ -37,7 +37,8 @@ import 
org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils;
 import 
org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils.ObjectInspectorCopyOption;
 
 @Description(name = "array_concat",
-        value = "_FUNC_(array<ANY> x1, array<ANY> x2, ..) - Returns a 
concatenated array")
+        value = "_FUNC_(array<ANY> x1, array<ANY> x2, ..) - Returns a 
concatenated array",
+        extended = "select array_concat(array(1),array(2,3));\n" + "> [1,2,3]")
 @UDFType(deterministic = true, stateful = false)
 public class ArrayConcatUDF extends GenericUDF {
     /**

http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/36fb839d/core/src/main/java/hivemall/tools/array/ArrayIntersectUDF.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/hivemall/tools/array/ArrayIntersectUDF.java 
b/core/src/main/java/hivemall/tools/array/ArrayIntersectUDF.java
index de89f35..dab67bf 100644
--- a/core/src/main/java/hivemall/tools/array/ArrayIntersectUDF.java
+++ b/core/src/main/java/hivemall/tools/array/ArrayIntersectUDF.java
@@ -40,7 +40,8 @@ import 
org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
 import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils;
 
 @Description(name = "array_intersect",
-        value = "_FUNC_(array<ANY> x1, array<ANY> x2, ..) - Returns an 
intersect of given arrays")
+        value = "_FUNC_(array<ANY> x1, array<ANY> x2, ..) - Returns an 
intersect of given arrays",
+        extended = "select 
array_intersect(array(1,3,4),array(2,3,4),array(3,5));\n" + "> [3]")
 @UDFType(deterministic = true, stateful = false)
 public final class ArrayIntersectUDF extends GenericUDF {
 

http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/36fb839d/core/src/main/java/hivemall/tools/array/ArrayRemoveUDF.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/hivemall/tools/array/ArrayRemoveUDF.java 
b/core/src/main/java/hivemall/tools/array/ArrayRemoveUDF.java
index 15a2c3d..523093b 100644
--- a/core/src/main/java/hivemall/tools/array/ArrayRemoveUDF.java
+++ b/core/src/main/java/hivemall/tools/array/ArrayRemoveUDF.java
@@ -28,7 +28,9 @@ import org.apache.hadoop.io.Text;
 
 @Description(name = "array_remove",
         value = "_FUNC_(array<int|text> original, int|text|array<int> target)"
-                + " - Returns an array that the target is removed " + "from 
the original array")
+                + " - Returns an array that the target is removed " + "from 
the original array",
+        extended = "select array_remove(array(1,null,3),array(null));\n" + "> 
[3]\n" + "\n"
+                + "select array_remove(array(\"aaa\",\"bbb\"),\"bbb\");\n" + 
"> [\"aaa\"]")
 @UDFType(deterministic = true, stateful = false)
 public class ArrayRemoveUDF extends UDF {
 

http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/36fb839d/core/src/main/java/hivemall/tools/array/ArraySliceUDF.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/hivemall/tools/array/ArraySliceUDF.java 
b/core/src/main/java/hivemall/tools/array/ArraySliceUDF.java
index f4be2bc..5ac12ac 100644
--- a/core/src/main/java/hivemall/tools/array/ArraySliceUDF.java
+++ b/core/src/main/java/hivemall/tools/array/ArraySliceUDF.java
@@ -41,7 +41,8 @@ import 
org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectIn
 
 @Description(
         name = "array_slice",
-        value = "_FUNC_(array<ANY> values, int offset [, int length]) - Slices 
the given array by the given offset and length parameters.")
+        value = "_FUNC_(array<ANY> values, int offset [, int length]) - Slices 
the given array by the given offset and length parameters.",
+        extended = "select array_slice(array(1,2,3,4,5,6), 2,4);\n" + "> 
[3,4]")
 @UDFType(deterministic = true, stateful = false)
 public final class ArraySliceUDF extends GenericUDF {
 

http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/36fb839d/core/src/main/java/hivemall/tools/array/SortAndUniqArrayUDF.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/hivemall/tools/array/SortAndUniqArrayUDF.java 
b/core/src/main/java/hivemall/tools/array/SortAndUniqArrayUDF.java
index 844bb9f..1c6162c 100644
--- a/core/src/main/java/hivemall/tools/array/SortAndUniqArrayUDF.java
+++ b/core/src/main/java/hivemall/tools/array/SortAndUniqArrayUDF.java
@@ -29,7 +29,8 @@ import org.apache.hadoop.hive.ql.udf.UDFType;
 import org.apache.hadoop.io.IntWritable;
 
 @Description(name = "sort_and_uniq_array", value = "_FUNC_(array<int>) - Takes 
array<int> and "
-        + "returns a sorted array with duplicate elements eliminated")
+        + "returns a sorted array with duplicate elements eliminated",
+        extended = "select sort_and_uniq_array(array(3,1,1,-2,10));\n" + "> 
[-2,1,3,10]")
 @UDFType(deterministic = true, stateful = false)
 public class SortAndUniqArrayUDF extends UDF {
 

http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/36fb839d/core/src/main/java/hivemall/tools/array/SubarrayEndWithUDF.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/hivemall/tools/array/SubarrayEndWithUDF.java 
b/core/src/main/java/hivemall/tools/array/SubarrayEndWithUDF.java
index ecc8a05..0f845ac 100644
--- a/core/src/main/java/hivemall/tools/array/SubarrayEndWithUDF.java
+++ b/core/src/main/java/hivemall/tools/array/SubarrayEndWithUDF.java
@@ -27,7 +27,8 @@ import org.apache.hadoop.io.IntWritable;
 import org.apache.hadoop.io.Text;
 
 @Description(name = "subarray_endwith", value = "_FUNC_(array<int|text> 
original, int|text key)"
-        + " - Returns an array that ends with the specified key")
+        + " - Returns an array that ends with the specified key",
+        extended = "select subarray_endwith(array(1,2,3,4), 3);\n" + "> 
[1,2,3]")
 @UDFType(deterministic = true, stateful = false)
 public class SubarrayEndWithUDF extends UDF {
 

http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/36fb839d/core/src/main/java/hivemall/tools/array/SubarrayStartWithUDF.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/hivemall/tools/array/SubarrayStartWithUDF.java 
b/core/src/main/java/hivemall/tools/array/SubarrayStartWithUDF.java
index 3f7266d..1903de3 100644
--- a/core/src/main/java/hivemall/tools/array/SubarrayStartWithUDF.java
+++ b/core/src/main/java/hivemall/tools/array/SubarrayStartWithUDF.java
@@ -27,7 +27,8 @@ import org.apache.hadoop.io.IntWritable;
 import org.apache.hadoop.io.Text;
 
 @Description(name = "subarray_startwith", value = "_FUNC_(array<int|text> 
original, int|text key)"
-        + " - Returns an array that starts with the specified key")
+        + " - Returns an array that starts with the specified key",
+        extended = "select subarray_startwith(array(1,2,3,4), 2);\n" + "> 
[2,3,4]")
 @UDFType(deterministic = true, stateful = false)
 public class SubarrayStartWithUDF extends UDF {
 

http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/36fb839d/core/src/main/java/hivemall/tools/bits/BitsORUDF.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/hivemall/tools/bits/BitsORUDF.java 
b/core/src/main/java/hivemall/tools/bits/BitsORUDF.java
index 98a659c..b76f949 100644
--- a/core/src/main/java/hivemall/tools/bits/BitsORUDF.java
+++ b/core/src/main/java/hivemall/tools/bits/BitsORUDF.java
@@ -37,7 +37,9 @@ import 
org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectIn
 import 
org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorUtils;
 
 @Description(name = "bits_or",
-        value = "_FUNC_(array<long> b1, array<long> b2, ..) - Returns a 
logical OR given bitsets")
+        value = "_FUNC_(array<long> b1, array<long> b2, ..) - Returns a 
logical OR given bitsets",
+        extended = "select 
unbits(bits_or(to_bits(array(1,4)),to_bits(array(2,3))));\n"
+                + "> [1,2,3,4]")
 public final class BitsORUDF extends GenericUDF {
 
     private ListObjectInspector[] _listOIs;

http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/36fb839d/core/src/main/java/hivemall/tools/bits/ToBitsUDF.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/hivemall/tools/bits/ToBitsUDF.java 
b/core/src/main/java/hivemall/tools/bits/ToBitsUDF.java
index ba9bcd9..f5790d9 100644
--- a/core/src/main/java/hivemall/tools/bits/ToBitsUDF.java
+++ b/core/src/main/java/hivemall/tools/bits/ToBitsUDF.java
@@ -40,7 +40,8 @@ import org.apache.hadoop.io.LongWritable;
 
 @Description(
         name = "to_bits",
-        value = "_FUNC_(int[] indexes) - Returns an bitset representation if 
the given indexes in long[]")
+        value = "_FUNC_(int[] indexes) - Returns an bitset representation if 
the given indexes in long[]",
+        extended = "select to_bits(array(1,2,3,128));\n" + "> 
[14,-9223372036854775808]")
 @UDFType(deterministic = true, stateful = false)
 public final class ToBitsUDF extends GenericUDF {
 

http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/36fb839d/core/src/main/java/hivemall/tools/bits/UnBitsUDF.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/hivemall/tools/bits/UnBitsUDF.java 
b/core/src/main/java/hivemall/tools/bits/UnBitsUDF.java
index 705d19d..7651009 100644
--- a/core/src/main/java/hivemall/tools/bits/UnBitsUDF.java
+++ b/core/src/main/java/hivemall/tools/bits/UnBitsUDF.java
@@ -39,7 +39,8 @@ import 
org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectIn
 import org.apache.hadoop.io.LongWritable;
 
 @Description(name = "unbits",
-        value = "_FUNC_(long[] bitset) - Returns an long array of the give 
bitset representation")
+        value = "_FUNC_(long[] bitset) - Returns an long array of the give 
bitset representation",
+        extended = "select unbits(to_bits(array(1,4,2,3)));\n" + "> [1,2,3,4]")
 @UDFType(deterministic = true, stateful = false)
 public final class UnBitsUDF extends GenericUDF {
 

http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/36fb839d/core/src/main/java/hivemall/tools/compress/DeflateUDF.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/hivemall/tools/compress/DeflateUDF.java 
b/core/src/main/java/hivemall/tools/compress/DeflateUDF.java
index ca9062a..28bc370 100644
--- a/core/src/main/java/hivemall/tools/compress/DeflateUDF.java
+++ b/core/src/main/java/hivemall/tools/compress/DeflateUDF.java
@@ -39,9 +39,11 @@ import 
org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspe
 import org.apache.hadoop.io.BytesWritable;
 import org.apache.hadoop.io.Text;
 
-@Description(name = "deflate", value = "_FUNC_(TEXT data [, const int 
compressionLevel]) - "
-        + "Returns a compressed BINARY object by using Deflater.",
-        extended = "The compression level must be in range [-1,9]")
+@Description(
+        name = "deflate",
+        value = "_FUNC_(TEXT data [, const int compressionLevel]) - "
+                + "Returns a compressed BINARY object by using Deflater. The 
compression level must be in range [-1,9]",
+        extended = "select base91(deflate('aaaaaaaaaaaaaaaabbbbccc'));\n" + "> 
AA+=kaIM|WTt!+wbGAA")
 @UDFType(deterministic = true, stateful = false)
 public final class DeflateUDF extends GenericUDF {
 

http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/36fb839d/core/src/main/java/hivemall/tools/compress/InflateUDF.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/hivemall/tools/compress/InflateUDF.java 
b/core/src/main/java/hivemall/tools/compress/InflateUDF.java
index 0021208..20079ae 100644
--- a/core/src/main/java/hivemall/tools/compress/InflateUDF.java
+++ b/core/src/main/java/hivemall/tools/compress/InflateUDF.java
@@ -39,7 +39,9 @@ import org.apache.hadoop.io.BytesWritable;
 import org.apache.hadoop.io.Text;
 
 @Description(name = "inflate",
-        value = "_FUNC_(BINARY compressedData) - Returns a decompressed STRING 
by using Inflater")
+        value = "_FUNC_(BINARY compressedData) - Returns a decompressed STRING 
by using Inflater",
+        extended = "select 
inflate(unbase91(base91(deflate('aaaaaaaaaaaaaaaabbbbccc'))));\n"
+                + "> aaaaaaaaaaaaaaaabbbbccc")
 @UDFType(deterministic = true, stateful = false)
 public final class InflateUDF extends GenericUDF {
 

http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/36fb839d/core/src/main/java/hivemall/tools/list/UDAFToOrderedList.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/hivemall/tools/list/UDAFToOrderedList.java 
b/core/src/main/java/hivemall/tools/list/UDAFToOrderedList.java
index f17d1f4..37b33c4 100644
--- a/core/src/main/java/hivemall/tools/list/UDAFToOrderedList.java
+++ b/core/src/main/java/hivemall/tools/list/UDAFToOrderedList.java
@@ -66,9 +66,34 @@ import org.apache.hadoop.io.IntWritable;
 /**
  * Return list of values sorted by value itself or specific key.
  */
-@Description(name = "to_ordered_list",
+@Description(
+        name = "to_ordered_list",
         value = "_FUNC_(PRIMITIVE value [, PRIMITIVE key, const string 
options])"
-                + " - Return list of values sorted by value itself or specific 
key")
+                + " - Return list of values sorted by value itself or specific 
key",
+        extended = "with t as (\n"
+                + "    select 5 as key, 'apple' as value\n"
+                + "    union all\n"
+                + "    select 3 as key, 'banana' as value\n"
+                + "    union all\n"
+                + "    select 4 as key, 'candy' as value\n"
+                + "    union all\n"
+                + "    select 2 as key, 'donut' as value\n"
+                + "    union all\n"
+                + "    select 3 as key, 'egg' as value\n"
+                + ")\n"
+                + "select                                             -- 
expected output\n"
+                + "    to_ordered_list(value, key, '-reverse'),       -- 
[apple, candy, (banana, egg | egg, banana), donut] (reverse order)\n"
+                + "    to_ordered_list(value, key, '-k 2'),           -- 
[apple, candy] (top-k)\n"
+                + "    to_ordered_list(value, key, '-k 100'),         -- 
[apple, candy, (banana, egg | egg, banana), donut]\n"
+                + "    to_ordered_list(value, key, '-k 2 -reverse'),  -- 
[donut, (banana | egg)] (reverse top-k = tail-k)\n"
+                + "    to_ordered_list(value, key),                   -- 
[donut, (banana, egg | egg, banana), candy, apple] (natural order)\n"
+                + "    to_ordered_list(value, key, '-k -2'),          -- 
[donut, (banana | egg)] (tail-k)\n"
+                + "    to_ordered_list(value, key, '-k -100'),        -- 
[donut, (banana, egg | egg, banana), candy, apple]\n"
+                + "    to_ordered_list(value, key, '-k -2 -reverse'), -- 
[apple, candy] (reverse tail-k = top-k)\n"
+                + "    to_ordered_list(value, '-k 2'),                -- [egg, 
donut] (alphabetically)\n"
+                + "    to_ordered_list(key, '-k -2 -reverse'),        -- [5, 
4] (top-2 keys)\n"
+                + "    to_ordered_list(key)                           -- [2, 
3, 3, 4, 5] (natural ordered keys)\n"
+                + "from\n" + "    t")
 public final class UDAFToOrderedList extends AbstractGenericUDAFResolver {
 
     @Override

http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/36fb839d/core/src/main/java/hivemall/tools/map/UDAFToOrderedMap.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/hivemall/tools/map/UDAFToOrderedMap.java 
b/core/src/main/java/hivemall/tools/map/UDAFToOrderedMap.java
index 331874c..ba8ef82 100644
--- a/core/src/main/java/hivemall/tools/map/UDAFToOrderedMap.java
+++ b/core/src/main/java/hivemall/tools/map/UDAFToOrderedMap.java
@@ -53,9 +53,29 @@ import org.apache.hadoop.io.IntWritable;
 /**
  * Convert two aggregated columns into a sorted key-value map.
  */
-@Description(name = "to_ordered_map",
+@Description(
+        name = "to_ordered_map",
         value = "_FUNC_(key, value [, const int k|const boolean 
reverseOrder=false]) "
-                + "- Convert two aggregated columns into an ordered key-value 
map")
+                + "- Convert two aggregated columns into an ordered key-value 
map",
+        extended = "with t as (\n"
+                + "    select 10 as key, 'apple' as value\n"
+                + "    union all\n"
+                + "    select 3 as key, 'banana' as value\n"
+                + "    union all\n"
+                + "    select 4 as key, 'candy' as value\n"
+                + ")\n"
+                + "select\n"
+                + "    to_ordered_map(key, value, true),   -- 
{10:\"apple\",4:\"candy\",3:\"banana\"} (reverse)\n"
+                + "    to_ordered_map(key, value, 1),      -- {10:\"apple\"} 
(top-1)\n"
+                + "    to_ordered_map(key, value, 2),      -- 
{10:\"apple\",4:\"candy\"} (top-2)\n"
+                + "    to_ordered_map(key, value, 3),      -- 
{10:\"apple\",4:\"candy\",3:\"banana\"} (top-3)\n"
+                + "    to_ordered_map(key, value, 100),    -- 
{10:\"apple\",4:\"candy\",3:\"banana\"} (top-100)\n"
+                + "    to_ordered_map(key, value),         -- 
{3:\"banana\",4:\"candy\",10:\"apple\"} (natural)\n"
+                + "    to_ordered_map(key, value, -1),     -- {3:\"banana\"} 
(tail-1)\n"
+                + "    to_ordered_map(key, value, -2),     -- 
{3:\"banana\",4:\"candy\"} (tail-2)\n"
+                + "    to_ordered_map(key, value, -3),     -- 
{3:\"banana\",4:\"candy\",10:\"apple\"} (tail-3)\n"
+                + "    to_ordered_map(key, value, -100)    -- 
{3:\"banana\",4:\"candy\",10:\"apple\"} (tail-100)\n"
+                + "from t")
 public final class UDAFToOrderedMap extends UDAFToMap {
 
     @Override

http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/36fb839d/core/src/main/java/hivemall/tools/text/Base91UDF.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/hivemall/tools/text/Base91UDF.java 
b/core/src/main/java/hivemall/tools/text/Base91UDF.java
index 44d6b22..6f52599 100644
--- a/core/src/main/java/hivemall/tools/text/Base91UDF.java
+++ b/core/src/main/java/hivemall/tools/text/Base91UDF.java
@@ -39,7 +39,8 @@ import org.apache.hadoop.io.BytesWritable;
 import org.apache.hadoop.io.Text;
 
 @Description(name = "base91",
-        value = "_FUNC_(BINARY bin) - Convert the argument from binary to a 
BASE91 string")
+        value = "_FUNC_(BINARY bin) - Convert the argument from binary to a 
BASE91 string",
+        extended = "select base91(deflate('aaaaaaaaaaaaaaaabbbbccc'));\n" + "> 
AA+=kaIM|WTt!+wbGAA")
 @UDFType(deterministic = true, stateful = false)
 public final class Base91UDF extends GenericUDF {
 

http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/36fb839d/core/src/main/java/hivemall/tools/text/NormalizeUnicodeUDF.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/hivemall/tools/text/NormalizeUnicodeUDF.java 
b/core/src/main/java/hivemall/tools/text/NormalizeUnicodeUDF.java
index d34225d..0908e44 100644
--- a/core/src/main/java/hivemall/tools/text/NormalizeUnicodeUDF.java
+++ b/core/src/main/java/hivemall/tools/text/NormalizeUnicodeUDF.java
@@ -29,7 +29,9 @@ import org.apache.hadoop.hive.ql.udf.UDFType;
 @Description(
         name = "normalize_unicode",
         value = "_FUNC_(string str [, string form]) - Transforms `str` with 
the specified normalization form. "
-                + "The `form` takes one of NFC (default), NFD, NFKC, or NFKD")
+                + "The `form` takes one of NFC (default), NFD, NFKC, or NFKD",
+        extended = "select normalize_unicode('ï¾ï¾ï½¶ï½¸ï½¶ï¾','NFKC');\n" 
+ "> ãã³ã«ã¯ã«ã\n" + "\n"
+                + "select normalize_unicode('ã±ã§ã¦â¢','NFKC');\n" + "> (æ 
ª)ãã³ãã«III")
 @UDFType(deterministic = true, stateful = false)
 public final class NormalizeUnicodeUDF extends UDF {
 

http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/36fb839d/core/src/main/java/hivemall/tools/text/SingularizeUDF.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/hivemall/tools/text/SingularizeUDF.java 
b/core/src/main/java/hivemall/tools/text/SingularizeUDF.java
index 775c413..3d217e0 100644
--- a/core/src/main/java/hivemall/tools/text/SingularizeUDF.java
+++ b/core/src/main/java/hivemall/tools/text/SingularizeUDF.java
@@ -39,7 +39,8 @@ import org.apache.hadoop.hive.ql.udf.UDFType;
 //  
https://github.com/sundrio/sundrio/blob/95c2b11f7b842bdaa04f61e8e338aea60fb38f70/codegen/src/main/java/io/sundr/codegen/functions/Singularize.java
 //  
https://github.com/clips/pattern/blob/3eef00481a4555331cf9a099308910d977f6fc22/pattern/text/en/inflect.py#L445-L623
 @Description(name = "singularize",
-        value = "_FUNC_(string word) - Returns singular form of a given 
English word")
+        value = "_FUNC_(string word) - Returns singular form of a given 
English word",
+        extended = "select singularize(lower(\"Apples\"));\n" + "\n" + "> 
\"apple\"")
 @UDFType(deterministic = true, stateful = false)
 public final class SingularizeUDF extends UDF {
 

http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/36fb839d/core/src/main/java/hivemall/tools/text/Unbase91UDF.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/hivemall/tools/text/Unbase91UDF.java 
b/core/src/main/java/hivemall/tools/text/Unbase91UDF.java
index 32b1098..a96b3bf 100644
--- a/core/src/main/java/hivemall/tools/text/Unbase91UDF.java
+++ b/core/src/main/java/hivemall/tools/text/Unbase91UDF.java
@@ -38,7 +38,9 @@ import 
org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspe
 import org.apache.hadoop.io.BytesWritable;
 import org.apache.hadoop.io.Text;
 
-@Description(name = "unbase91", value = "_FUNC_(string) - Convert a BASE91 
string to a binary")
+@Description(name = "unbase91", value = "_FUNC_(string) - Convert a BASE91 
string to a binary",
+        extended = "select 
inflate(unbase91(base91(deflate('aaaaaaaaaaaaaaaabbbbccc'))));\n"
+                + "> aaaaaaaaaaaaaaaabbbbccc")
 @UDFType(deterministic = true, stateful = false)
 public final class Unbase91UDF extends GenericUDF {
 

http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/36fb839d/core/src/main/java/hivemall/tools/text/WordNgramsUDF.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/hivemall/tools/text/WordNgramsUDF.java 
b/core/src/main/java/hivemall/tools/text/WordNgramsUDF.java
index e4e5504..db3435a 100644
--- a/core/src/main/java/hivemall/tools/text/WordNgramsUDF.java
+++ b/core/src/main/java/hivemall/tools/text/WordNgramsUDF.java
@@ -34,8 +34,13 @@ import javax.annotation.Nullable;
 import java.util.ArrayList;
 import java.util.List;
 
-@Description(name = "word_ngrams", value = "_FUNC_(array<string> words, int 
minSize, int maxSize])"
-        + " - Returns list of n-grams for given words, where `minSize <= n <= 
maxSize`")
+@Description(
+        name = "word_ngrams",
+        value = "_FUNC_(array<string> words, int minSize, int maxSize])"
+                + " - Returns list of n-grams for given words, where `minSize 
<= n <= maxSize`",
+        extended = "select word_ngrams(tokenize('Machine learning is fun!', 
true), 1, 2);\n"
+                + "\n"
+                + "> [\"machine\",\"machine learning\",\"learning\",\"learning 
is\",\"is\",\"is fun\",\"fun\"]")
 @UDFType(deterministic = true, stateful = false)
 public final class WordNgramsUDF extends UDF {
 

http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/36fb839d/docs/gitbook/misc/funcs.md
----------------------------------------------------------------------
diff --git a/docs/gitbook/misc/funcs.md b/docs/gitbook/misc/funcs.md
index d3b1565..00d7bba 100644
--- a/docs/gitbook/misc/funcs.md
+++ b/docs/gitbook/misc/funcs.md
@@ -191,8 +191,6 @@ This page describes a list of Hivemall functions. See also 
a [list of generic Hi
 
 # Preprocessing
 
-## Feature creation
-
 - `add_bias(feature_vector in array<string>)` - Returns features with a bias 
in array&lt;string&gt;
 
 - `add_feature_index(ARRAY[DOUBLE]: dense feature vector)` - Returns a feature 
vector with feature indices
@@ -217,13 +215,13 @@ This page describes a list of Hivemall functions. See 
also a [list of generic Hi
 
 - `build_bins(number weight, const int num_of_bins[, const boolean auto_shrink 
= false])` - Return quantiles representing bins: array&lt;double&gt;
 
-- `feature_binning(array<features::string> features, const map<string, 
array<number>> quantiles_map)` / _FUNC(number weight, const array&lt;number&gt; 
quantiles) - Returns binned features as an array&lt;features::string&gt; / bin 
ID as int
+- `feature_binning(array<features::string> features, const map<string, 
array<number>> quantiles_map)` / _FUNC_(number weight, const 
array&lt;number&gt; quantiles) - Returns binned features as an 
array&lt;features::string&gt; / bin ID as int
 
 ## Feature format conversion
 
 - `conv2dense(int feature, float weight, int nDims)` - Return a dense model in 
array&lt;float&gt;
 
-- `quantify(boolean outout, col1, col2, ...)` - Returns an identified features
+- `quantify(boolean output, col1, col2, ...)` - Returns an identified features
 
 - `to_dense_features(array<string> feature_vector, int dimensions)` - Returns 
a dense feature in array&lt;float&gt;
 
@@ -245,7 +243,7 @@ This page describes a list of Hivemall functions. See also 
a [list of generic Hi
 
 - `feature_pairs(feature_vector in array<string>, [, const string options])` - 
Returns a relation &lt;string i, string j, double xi, double xj&gt;
 
-- `polynomial_features(feature_vector in array<string>)` - Returns a feature 
vectorhaving polynominal feature space
+- `polynomial_features(feature_vector in array<string>)` - Returns a feature 
vectorhaving polynomial feature space
 
 - `powered_features(feature_vector in array<string>, int degree [, boolean 
truncate])` - Returns a feature vector having a powered feature space
 
@@ -275,13 +273,13 @@ This page describes a list of Hivemall functions. See 
also a [list of generic Hi
 
 ## Feature transformation and vectorization
 
-- `add_field_indices(array<string> features)` - Returns arrays of string that 
field indices (&lt;field&gt;:&lt;feature&gt;)* are argumented
+- `add_field_indices(array<string> features)` - Returns arrays of string that 
field indices (&lt;field&gt;:&lt;feature&gt;)* are augmented
 
 - `binarize_label(int/long positive, int/long negative, ...)` - Returns 
positive/negative records that are represented as (..., int label) where label 
is 0 or 1
 
 - `categorical_features(array<string> featureNames, feature1, feature2, .. [, 
const string options])` - Returns a feature vector array&lt;string&gt;
 
-- `ffm_features(const array<string> featureNames, feature1, feature2, .. [, 
const string options])` - Takes categroical variables and returns a feature 
vector array&lt;string&gt; in a libffm format 
&lt;field&gt;:&lt;index&gt;:&lt;value&gt;
+- `ffm_features(const array<string> featureNames, feature1, feature2, .. [, 
const string options])` - Takes categorical variables and returns a feature 
vector array&lt;string&gt; in a libffm format 
&lt;field&gt;:&lt;index&gt;:&lt;value&gt;
 
 - `indexed_features(double v1, double v2, ...)` - Returns a list of features 
as array&lt;string&gt;: [1:v1, 2:v2, ..]
 
@@ -296,7 +294,7 @@ This page describes a list of Hivemall functions. See also 
a [list of generic Hi
 # Geospatial functions
 
 - `haversine_distance(double lat1, double lon1, double lat2, double lon2, 
[const boolean mile=false])`::double - return distance between two locations in 
km [or miles] using `haversine` formula
-  ```
+  ```sql
   Usage: select latlon_distance(lat1, lon1, lat2, lon2) from ...
   ```
 
@@ -310,10 +308,9 @@ This page describes a list of Hivemall functions. See also 
a [list of generic Hi
   Google Maps: https://www.google.com/maps/@${lat},${lon},${zoom}z
   ```
 
-- `tile(double lat, double lon, int zoom)`::bigint - Returns a tile number 
2^2n where n is zoom level.
-
+- `tile(double lat, double lon, int zoom)`::bigint - Returns a tile number 
2^2n where n is zoom level. _FUNC_(lat,lon,zoom) = xtile(lon,zoom) + 
ytile(lat,zoom) * 2^zoom
   ```
-  _FUNC_(lat,lon,zoom) = xtile(lon,zoom) + ytile(lat,zoom) * 2^zoomrefer 
http://wiki.openstreetmap.org/wiki/Slippy_map_tilenames for detail
+  refer http://wiki.openstreetmap.org/wiki/Slippy_map_tilenames for detail
   ```
 
 - `tilex2lon(int x, int zoom)`::double - Returns longitude of the given tile x 
and zoom level
@@ -344,7 +341,7 @@ This page describes a list of Hivemall functions. See also 
a [list of generic Hi
 
 - `bbit_minhash(array<> features [, int numHashes])` - Returns a b-bits 
minhash value
 
-- `minhash(ANY item, array<int|bigint|string> features [, constant string 
options])` - Returns n differnce k-depth signatures (i.e., clusteid) for each 
item &lt;clusteid, item&gt;
+- `minhash(ANY item, array<int|bigint|string> features [, constant string 
options])` - Returns n different k-depth signatures (i.e., clusterid) for each 
item &lt;clusterid, item&gt;
 
 - `minhashes(array<> features [, int numHashes, int keyGroup [, boolean 
noWeight]])` - Returns minhash values
 
@@ -398,8 +395,6 @@ This page describes a list of Hivemall functions. See also 
a [list of generic Hi
 
 # Ensemble learning
 
-## Utils
-
 - `argmin_kld(float mean, float covar)` - Returns mean or covar that minimize 
a KL-distance among distributions
   ```
   The returned value is (1.0 / (sum(1.0 / covar))) * (sum(mean / covar)
@@ -415,25 +410,25 @@ This page describes a list of Hivemall functions. See 
also a [list of generic Hi
 
 - `weight_voted_avg(expr)` - Returns an averaged value by considering sum of 
positive/negative weights
 
-# Dicision trees and RandomForest
+# Decision trees and RandomForest
 
 - `train_gradient_tree_boosting_classifier(array<double|string> features, int 
label [, string options])` - Returns a relation consists of &lt;int iteration, 
int model_type, array&lt;string&gt; pred_models, double intercept, double 
shrinkage, array&lt;double&gt; var_importance, float oob_error_rate&gt;
 
-- `train_randomforest_classifier(array<double|string> features, int label [, 
const array<double> classWeights, const string options])` - Returns a relation 
consists of &lt;int model_id, int model_type, string pred_model, 
array&lt;double&gt; var_importance, int oob_errors, int oob_tests, double 
weight&gt;
+- `train_randomforest_classifier(array<double|string> features, int label [, 
const string options, const array<double> classWeights])`- Returns a relation 
consists of &lt;string model_id, double model_weight, string model, 
array&lt;double&gt; var_importance, int oob_errors, int oob_tests&gt;
 
 - `train_randomforest_regression(array<double|string> features, double target 
[, string options])` - Returns a relation consists of &lt;int model_id, int 
model_type, string pred_model, array&lt;double&gt; var_importance, int 
oob_errors, int oob_tests&gt;
 
 - `guess_attribute_types(ANY, ...)` - Returns attribute types
-  ```
+  ```sql
   select guess_attribute_types(*) from train limit 1;
   > Q,Q,C,C,C,C,Q,C,C,C,Q,C,Q,Q,Q,Q,C,Q
   ```
 
-- `rf_ensemble(int yhat [, array<double> proba [, double model_weight=1.0]])` 
- Returns emsebled prediction results in &lt;int label, double probability, 
array&lt;double&gt; probabilities&gt;
+- `rf_ensemble(int yhat [, array<double> proba [, double model_weight=1.0]])` 
- Returns ensembled prediction results in &lt;int label, double probability, 
array&lt;double&gt; probabilities&gt;
 
 - `tree_export(string model, const string options, optional array<string> 
featureNames=null, optional array<string> classNames=null)` - exports a 
Decision Tree model as javascript/dot]
 
-- `tree_predict(string modelId, string model, array<double|string> features [, 
const string options | const boolean classification=false])` - Returns a 
prediction result of a random forest in &lt;int value, array&lt;double&gt; 
posteriori&gt; for classification and &lt;double&gt; for regression
+- `tree_predict(string modelId, string model, array<double|string> features [, 
const string options | const boolean classification=false])` - Returns a 
prediction result of a random forest in &lt;int value, array&lt;double&gt; a 
posteriori&gt; for classification and &lt;double&gt; for regression
 
 # XGBoost
 
@@ -450,10 +445,14 @@ This page describes a list of Hivemall functions. See 
also a [list of generic Hi
 # Others
 
 - `hivemall_version()` - Returns the version of Hivemall
+  ```sql
+  Usage: SELECT hivemall_version();
+  ```
 
 - `lr_datagen(options string)` - Generates a logistic regression dataset
   ```sql
   WITH dual AS (SELECT 1) SELECT lr_datagen('-n_examples 1k -n_features 10') 
FROM dual;
   ```
 
-- `tf(string text)` - Return a term frequency in &lt;string, float&gt;
\ No newline at end of file
+- `tf(string text)` - Return a term frequency in &lt;string, float&gt;
+

http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/36fb839d/docs/gitbook/misc/generic_funcs.md
----------------------------------------------------------------------
diff --git a/docs/gitbook/misc/generic_funcs.md 
b/docs/gitbook/misc/generic_funcs.md
index 3409f26..d33ab21 100644
--- a/docs/gitbook/misc/generic_funcs.md
+++ b/docs/gitbook/misc/generic_funcs.md
@@ -21,191 +21,174 @@ This page describes a list of useful Hivemall generic 
functions. See also a [lis
 
 <!-- toc -->
 
-# Array functions
+# Array
 
-## Array UDFs
+- `array_append(array<T> arr, T elem)` - Append an element to the end of an 
array
+
+- `array_avg(array<number>)` - Returns an array&lt;double&gt; in which each 
element is the mean of a set of numbers
 
 - `array_concat(array<ANY> x1, array<ANY> x2, ..)` - Returns a concatenated 
array
+  ```sql
+  select array_concat(array(1),array(2,3));
+  > [1,2,3]
+  ```
 
-    ```sql
-    select array_concat(array(1),array(2,3));
-    > [1,2,3]
-    ```
+- `array_flatten(array<array<ANY>>)` - Returns an array with the elements 
flattened.
 
 - `array_intersect(array<ANY> x1, array<ANY> x2, ..)` - Returns an intersect 
of given arrays
-
-    ```sql
-    select array_intersect(array(1,3,4),array(2,3,4),array(3,5));
-    > [3]
-    ```
+  ```sql
+  select array_intersect(array(1,3,4),array(2,3,4),array(3,5));
+  > [3]
+  ```
 
 - `array_remove(array<int|text> original, int|text|array<int> target)` - 
Returns an array that the target is removed from the original array
+  ```sql
+  select array_remove(array(1,null,3),array(null));
+  > [3]
 
-    ```sql
-    select array_remove(array(1,null,3),array(null));
-    > [3]
+  select array_remove(array("aaa","bbb"),"bbb");
+  > ["aaa"]
+  ```
 
-    select array_remove(array("aaa","bbb"),"bbb");
-    > ["aaa"]
-    ```
+- `array_slice(array<ANY> values, int offset [, int length])` - Slices the 
given array by the given offset and length parameters.
+  ```sql
+  select array_slice(array(1,2,3,4,5,6), 2,4);
+  > [3,4]
+  ```
 
-- `sort_and_uniq_array(array<int>)` - Takes an array of type INT and returns a 
sorted array in a natural order with duplicate elements eliminated
+- `array_sum(array<number>)` - Returns an array&lt;double&gt; in which each 
element is summed up
 
-    ```sql
-    select sort_and_uniq_array(array(3,1,1,-2,10));
-    > [-2,1,3,10]
-    ```
+- `array_union(array1, array2, ...)` - Returns the union of a set of arrays
 
-- `subarray_endwith(array<int|text> original, int|text key)` - Returns an 
array that ends with the specified key
+- `conditional_emit(array<boolean> conditions, array<primitive> features)` - 
Emit features of a row according to various conditions
 
-    ```sql
-    select subarray_endwith(array(1,2,3,4), 3);
-    > [1,2,3]
-    ```
+- `element_at(array<T> list, int pos)` - Returns an element at the given 
position
 
-- `subarray_startwith(array<int|text> original, int|text key)` - Returns an 
array that starts with the specified key
+- `first_element(x)` - Returns the first element in an array 
 
-    ```sql
-    select subarray_startwith(array(1,2,3,4), 2);
-    > [2,3,4]
-    ```
+- `float_array(nDims)` - Returns an array&lt;float&gt; of nDims elements
 
-- `subarray(array<int> orignal, int fromIndex, int toIndex)` - Returns a slice 
of the original array between the inclusive `fromIndex` and the exclusive 
`toIndex`
+- `last_element(x)` - Return the last element in an array
 
-    ```sql
-    select subarray(array(1,2,3,4,5,6), 2,4);
-    > [3,4]
-    ```
+- `select_k_best(array<number> array, const array<number> importance, const 
int k)` - Returns selected top-k elements as array&lt;double&gt;
 
-- `float_array(nDims)` - Returns an array&lt;float&gt; of nDims elements
+- `sort_and_uniq_array(array<int>)` - Takes array&lt;int&gt; and returns a 
sorted array with duplicate elements eliminated
+  ```sql
+  select sort_and_uniq_array(array(3,1,1,-2,10));
+  > [-2,1,3,10]
+  ```
 
-- `select_k_best(array<number> array, const array<number> importance, const 
int k)` - Returns selected top-k elements as array&lt;double&gt;
+- `subarray_endwith(array<int|text> original, int|text key)` - Returns an 
array that ends with the specified key
+  ```sql
+  select subarray_endwith(array(1,2,3,4), 3);
+  > [1,2,3]
+  ```
+
+- `subarray_startwith(array<int|text> original, int|text key)` - Returns an 
array that starts with the specified key
+  ```sql
+  select subarray_startwith(array(1,2,3,4), 2);
+  > [2,3,4]
+  ```
 
 - `to_string_array(array<ANY>)` - Returns an array of strings
 
-## Array UDAFs
-
-- `array_avg(array<NUMBER>)` - Returns an array<double> in which each element 
is the mean of a set of numbers
-
-- `array_sum(array<NUMBER>)` - Returns an array<double> in which each element 
is summed up
-
-## List UDAF
-
-- `to_ordered_list(PRIMITIVE value [, PRIMITIVE key, const string options])` 
or `to_ordered_list(value, key [, const string options])` - Return list of 
values sorted by value itself or specific key
-
-    ```sql
-    with t as (
-        select 5 as key, 'apple' as value
-        union all
-        select 3 as key, 'banana' as value
-        union all
-        select 4 as key, 'candy' as value
-        union all
-        select 2 as key, 'donut' as value
-        union all
-        select 3 as key, 'egg' as value
-    )
-    select                                             -- expected output
-        to_ordered_list(value, key, '-reverse'),       -- [apple, candy, 
(banana, egg | egg, banana), donut] (reverse order)
-        to_ordered_list(value, key, '-k 2'),           -- [apple, candy] 
(top-k)
-        to_ordered_list(value, key, '-k 100'),         -- [apple, candy, 
(banana, egg | egg, banana), dunut]
-        to_ordered_list(value, key, '-k 2 -reverse'),  -- [donut, (banana | 
egg)] (reverse top-k = tail-k)
-        to_ordered_list(value, key),                   -- [donut, (banana, egg 
| egg, banana), candy, apple] (natural order)
-        to_ordered_list(value, key, '-k -2'),          -- [donut, (banana | 
egg)] (tail-k)
-        to_ordered_list(value, key, '-k -100'),        -- [donut, (banana, egg 
| egg, banana), candy, apple]
-        to_ordered_list(value, key, '-k -2 -reverse'), -- [apple, candy] 
(reverse tail-k = top-k)
-        to_ordered_list(value, '-k 2'),                -- [egg, donut] 
(alphabetically)
-        to_ordered_list(key, '-k -2 -reverse'),        -- [5, 4] (top-2 keys)
-        to_ordered_list(key)                           -- [2, 3, 3, 4, 5] 
(natural ordered keys)
-    from
-        t
-    ;
-    ```
-
-# Bitset functions
-
-## Bitset UDF
+- `to_ordered_list(PRIMITIVE value [, PRIMITIVE key, const string options])` - 
Return list of values sorted by value itself or specific key
+  ```sql
+  with t as (
+      select 5 as key, 'apple' as value
+      union all
+      select 3 as key, 'banana' as value
+      union all
+      select 4 as key, 'candy' as value
+      union all
+      select 2 as key, 'donut' as value
+      union all
+      select 3 as key, 'egg' as value
+  )
+  select                                             -- expected output
+      to_ordered_list(value, key, '-reverse'),       -- [apple, candy, 
(banana, egg | egg, banana), donut] (reverse order)
+      to_ordered_list(value, key, '-k 2'),           -- [apple, candy] (top-k)
+      to_ordered_list(value, key, '-k 100'),         -- [apple, candy, 
(banana, egg | egg, banana), dunut]
+      to_ordered_list(value, key, '-k 2 -reverse'),  -- [donut, (banana | 
egg)] (reverse top-k = tail-k)
+      to_ordered_list(value, key),                   -- [donut, (banana, egg | 
egg, banana), candy, apple] (natural order)
+      to_ordered_list(value, key, '-k -2'),          -- [donut, (banana | 
egg)] (tail-k)
+      to_ordered_list(value, key, '-k -100'),        -- [donut, (banana, egg | 
egg, banana), candy, apple]
+      to_ordered_list(value, key, '-k -2 -reverse'), -- [apple, candy] 
(reverse tail-k = top-k)
+      to_ordered_list(value, '-k 2'),                -- [egg, donut] 
(alphabetically)
+      to_ordered_list(key, '-k -2 -reverse'),        -- [5, 4] (top-2 keys)
+      to_ordered_list(key)                           -- [2, 3, 3, 4, 5] 
(natural ordered keys)
+  from
+      t
+  ```
+
+# Map
 
-- `to_bits(int[] indexes)` - Returns an bitset representation if the given 
indexes in long[]
+- `map_get_sum(map<int,float> src, array<int> keys)` - Returns sum of values 
that are retrieved by keys
 
-    ```sql
-    select to_bits(array(1,2,3,128));
-    >[14,-9223372036854775808]
-    ```
+- `map_tail_n(map SRC, int N)` - Returns the last N elements from a sorted 
array of SRC
 
-- `unbits(long[] bitset)` - Returns an long array of the give bitset 
representation
+- `to_map(key, value)` - Convert two aggregated columns into a key-value map
 
-    ```sql
-    select unbits(to_bits(array(1,4,2,3)));
-    > [1,2,3,4]
-    ```
+- `to_ordered_map(key, value [, const int k|const boolean 
reverseOrder=false])` - Convert two aggregated columns into an ordered 
key-value map
+  ```sql
+  with t as (
+      select 10 as key, 'apple' as value
+      union all
+      select 3 as key, 'banana' as value
+      union all
+      select 4 as key, 'candy' as value
+  )
+  select
+      to_ordered_map(key, value, true),   -- {10:"apple",4:"candy",3:"banana"} 
(reverse)
+      to_ordered_map(key, value, 1),      -- {10:"apple"} (top-1)
+      to_ordered_map(key, value, 2),      -- {10:"apple",4:"candy"} (top-2)
+      to_ordered_map(key, value, 3),      -- {10:"apple",4:"candy",3:"banana"} 
(top-3)
+      to_ordered_map(key, value, 100),    -- {10:"apple",4:"candy",3:"banana"} 
(top-100)
+      to_ordered_map(key, value),         -- {3:"banana",4:"candy",10:"apple"} 
(natural)
+      to_ordered_map(key, value, -1),     -- {3:"banana"} (tail-1)
+      to_ordered_map(key, value, -2),     -- {3:"banana",4:"candy"} (tail-2)
+      to_ordered_map(key, value, -3),     -- {3:"banana",4:"candy",10:"apple"} 
(tail-3)
+      to_ordered_map(key, value, -100)    -- {3:"banana",4:"candy",10:"apple"} 
(tail-100)
+  from t
+  ```
+
+# Bitset
+
+- `bits_collect(int|long x)` - Returns a bitset in array&lt;long&gt;
 
 - `bits_or(array<long> b1, array<long> b2, ..)` - Returns a logical OR given 
bitsets
+  ```sql
+  select unbits(bits_or(to_bits(array(1,4)),to_bits(array(2,3))));
+  > [1,2,3,4]
+  ```
 
-    ```sql
-    select unbits(bits_or(to_bits(array(1,4)),to_bits(array(2,3))));
-    > [1,2,3,4]
-    ```
-
-## Bitset UDAF
-
-- `bits_collect(int|long x)` - Returns a bitset in array<long>
+- `to_bits(int[] indexes)` - Returns an bitset representation if the given 
indexes in long[]
+  ```sql
+  select to_bits(array(1,2,3,128));
+  > [14,-9223372036854775808]
+  ```
 
-# Compression functions
+- `unbits(long[] bitset)` - Returns an long array of the give bitset 
representation
+  ```sql
+  select unbits(to_bits(array(1,4,2,3)));
+  > [1,2,3,4]
+  ```
 
-- `deflate(TEXT data [, const int compressionLevel])` - Returns a compressed 
BINARY object by using Deflater.
-The compression level must be in range [-1,9]
+# Compression
 
-    ```sql
-    select base91(deflate('aaaaaaaaaaaaaaaabbbbccc'));
-    > AA+=kaIM|WTt!+wbGAA
-    ```
+- `deflate(TEXT data [, const int compressionLevel])` - Returns a compressed 
BINARY object by using Deflater. The compression level must be in range [-1,9]
+  ```sql
+  select base91(deflate('aaaaaaaaaaaaaaaabbbbccc'));
+  > AA+=kaIM|WTt!+wbGAA
+  ```
 
 - `inflate(BINARY compressedData)` - Returns a decompressed STRING by using 
Inflater
+  ```sql
+  select inflate(unbase91(base91(deflate('aaaaaaaaaaaaaaaabbbbccc'))));
+  > aaaaaaaaaaaaaaaabbbbccc
+  ```
 
-    ```sql
-    select inflate(unbase91(base91(deflate('aaaaaaaaaaaaaaaabbbbccc'))));
-    > aaaaaaaaaaaaaaaabbbbccc
-    ```
-
-# Map functions
-
-## Map UDFs
-
-- `map_get_sum(map<int,float> src, array<int> keys)` - Returns sum of values 
that are retrieved by keys
-
-- `map_tail_n(map SRC, int N)` - Returns the last N elements from a sorted 
array of SRC
-
-## MAP UDAFs
-
-- `to_map(key, value)` - Convert two aggregated columns into a key-value map
-
-- `to_ordered_map(key, value [, const int k|const boolean 
reverseOrder=false])` - Convert two aggregated columns into an ordered 
key-value map
-
-    ```sql
-    with t as (
-        select 10 as key, 'apple' as value
-        union all
-        select 3 as key, 'banana' as value
-        union all
-        select 4 as key, 'candy' as value
-    )
-    select
-        to_ordered_map(key, value, true),   -- 
{10:"apple",4:"candy",3:"banana"} (reverse)
-        to_ordered_map(key, value, 1),      -- {10:"apple"} (top-1)
-        to_ordered_map(key, value, 2),      -- {10:"apple",4:"candy"} (top-2)
-        to_ordered_map(key, value, 3),      -- 
{10:"apple",4:"candy",3:"banana"} (top-3)
-        to_ordered_map(key, value, 100),    -- 
{10:"apple",4:"candy",3:"banana"} (top-100)
-        to_ordered_map(key, value),         -- 
{3:"banana",4:"candy",10:"apple"} (natural)
-        to_ordered_map(key, value, -1),     -- {3:"banana"} (tail-1)
-        to_ordered_map(key, value, -2),     -- {3:"banana",4:"candy"} (tail-2)
-        to_ordered_map(key, value, -3),     -- 
{3:"banana",4:"candy",10:"apple"} (tail-3)
-        to_ordered_map(key, value, -100)    -- 
{3:"banana",4:"candy",10:"apple"} (tail-100)
-    from t
-    ;
-    ```
-
-# MapReduce functions
+# MapReduce
 
 - `distcache_gets(filepath, key, default_value [, parseKey])` - Returns 
map&lt;key_type, value_type&gt;|value_type
 
@@ -216,100 +199,91 @@ The compression level must be in range [-1,9]
 - `rowid()` - Returns a generated row id of a form {TASK_ID}-{SEQUENCE_NUMBER}
 
 - `rownum()` - Returns a generated row number in long
+  ```
+  returns sprintf(`%d%04d`,sequence,taskId) as long
+  ```
 
 - `taskid()` - Returns the value of mapred.task.partition
 
-# Math functions
+# Math
 
 - `l2_norm(double xi)` - Return L2 norm of a vector which has the given values 
in each dimension
 
-- `sigmoid(x)` - Returns `1.0 / (1.0 + exp(-x))`
+- `sigmoid(x)` - Returns 1.0 / (1.0 + exp(-x))
 
-# Matrix functions
+# Matrix
 
 - `transpose_and_dot(array<number> matrix0_row, array<number> matrix1_row)` - 
Returns dot(matrix0.T, matrix1) as array&lt;array&lt;double&gt;&gt;, shape = 
(matrix0.#cols, matrix1.#cols)
 
-# Text processing functions
+# Text processing
 
-- `base91(binary)` - Convert the argument from binary to a BASE91 string
+- `base91(BINARY bin)` - Convert the argument from binary to a BASE91 string
+  ```sql
+  select base91(deflate('aaaaaaaaaaaaaaaabbbbccc'));
+  > AA+=kaIM|WTt!+wbGAA
+  ```
 
-    ```sql
-    select base91(deflate('aaaaaaaaaaaaaaaabbbbccc'));
-    > AA+=kaIM|WTt!+wbGAA
-    ```
-
-- `unbase91(string)` - Convert a BASE91 string to a binary
-
-    ```sql
-    select inflate(unbase91(base91(deflate('aaaaaaaaaaaaaaaabbbbccc'))));
-    > aaaaaaaaaaaaaaaabbbbccc
-    ```
+- `is_stopword(string word)` - Returns whether English stopword or not
 
 - `normalize_unicode(string str [, string form])` - Transforms `str` with the 
specified normalization form. The `form` takes one of NFC (default), NFD, NFKC, 
or NFKD
+  ```sql
+  select normalize_unicode('ﾊﾝｶｸｶﾅ','NFKC');
+  > ハンカクカナ
 
-    ```sql
-    select normalize_unicode('ï¾ï¾ï½¶ï½¸ï½¶ï¾','NFKC');
-    > ãã³ã«ã¯ã«ã
-
-    select normalize_unicode('ã±ã§ã¦â¢','NFKC');
-    > (æ ª)ãã³ãã«III
-    ```
-
-- `split_words(string query [, string regex])` - Returns an array<text> 
containing splitted strings
-
-- `is_stopword(string word)` - Returns whether English stopword or not
+  select normalize_unicode('ã±ã§ã¦â¢','NFKC');
+  > (æ ª)ãã³ãã«III
+  ```
 
 - `singularize(string word)` - Returns singular form of a given English word
+  ```sql
+  select singularize(lower("Apples"));
 
-    ```sql
-    select singularize(lower("Apples"));
-
-    > "apple"
-    ```
-
-- `tokenize(string englishText [, boolean toLowerCase])` - Returns words in 
array<string>
-
-- `tokenize_ja(String line [, const string mode = "normal", const list<string> 
stopWords, const list<string> stopTags])` - returns tokenized Japanese string 
in array<string>. Refer [this article](../misc/tokenizer.html) for detail.
-
-    ```sql
-    select 
tokenize_ja("kuromojiãä½¿ã£ãåãã¡æ¸ãã®ãã¹ãã§ããç¬¬äºå¼æ°ã«ã¯normal/search/extendedãæå®ã§ãã¾ããããã©ã«ãã§ã¯normalã¢ã¼ãã§ãã");
+  > "apple"
+  ```
 
-    > 
["kuromoji","使う","分かち書き","テスト","第","二","引数","normal","search","extended","指定","デフォルト","normal","
 モード"]
-    ```
+- `split_words(string query [, string regex])` - Returns an array&lt;text&gt; 
containing split strings
 
-- `tokenize_cn(String line [, const list<string> stopWords])` - returns 
tokenized Chinese string in array&lt;string&gt;. Refer [this 
article](../misc/tokenizer.html) for detail.
+- `tokenize(string englishText [, boolean toLowerCase])` - Returns tokenized 
words in array&lt;string&gt;
 
-- `word_ngrams(array<string> words, int minSize, int maxSize)` - Returns list 
of n-grams where `minSize <= n <= maxSize`
+- `unbase91(string)` - Convert a BASE91 string to a binary
+  ```sql
+  select inflate(unbase91(base91(deflate('aaaaaaaaaaaaaaaabbbbccc'))));
+  > aaaaaaaaaaaaaaaabbbbccc
+  ```
 
-    ```sql
-    select word_ngrams(tokenize('Machine learning is fun!', true), 1, 2);
+- `word_ngrams(array<string> words, int minSize, int maxSize])` - Returns list 
of n-grams for given words, where `minSize &lt;= n &lt;= maxSize`
+  ```sql
+  select word_ngrams(tokenize('Machine learning is fun!', true), 1, 2);
 
-    > ["machine","machine learning","learning","learning is","is","is 
fun","fun"]
-    ```
+  > ["machine","machine learning","learning","learning is","is","is fun","fun"]
+  ```
 
-# Other functions
+# Others
 
 - `convert_label(const int|const float)` - Convert from -1|1 to 0.0f|1.0f, or 
from 0.0f|1.0f to -1|1
 
-- `each_top_k(int K, Object group, double cmpKey, *)` - Returns top-K values 
(or tail-K values when k is less than 0). Refer [this 
article](../misc/topk.html) for detail.
-
-- `generate_series(const int|bigint start, const int|bigint end)` - Generate a 
series of values, from start to end
-
-    ```sql
-    select generate_series(1,9);
-
-    1
-    2
-    3
-    4
-    5
-    6
-    7
-    8
-    9
-    ```
-
-    A similar function to PostgreSQL's `generate_serics`.
-    http://www.postgresql.org/docs/current/static/functions-srf.html
+- `each_top_k(int K, Object group, double cmpKey, *)` - Returns top-K values 
(or tail-K values when k is less than 0)
+
+- `generate_series(const int|bigint start, const int|bigint end)` - Generate a 
series of values, from start to end. A similar function to PostgreSQL's 
`generate_serics`. 
http://www.postgresql.org/docs/current/static/functions-srf.html
+  ```sql
+  select generate_series(1,9);
+
+  1
+  2
+  3
+  4
+  5
+  6
+  7
+  8
+  9
+  ```
+
+- `try_cast(ANY src, const string typeName)` - Explicitly cast a value as a 
type. Returns null if cast fails.
+  ```sql
+  Usage: select try_cast(array(1.0,2.0,3.0), 'array<string>')
+       select try_cast(map('A',10,'B',20,'C',30), 'map<string,double>')
+  ```
 
 - `x_rank(KEY)` - Generates a pseudo sequence number starting from 1 for each 
key
+

http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/36fb839d/nlp/src/main/java/hivemall/nlp/tokenizer/KuromojiUDF.java
----------------------------------------------------------------------
diff --git a/nlp/src/main/java/hivemall/nlp/tokenizer/KuromojiUDF.java 
b/nlp/src/main/java/hivemall/nlp/tokenizer/KuromojiUDF.java
index 89cf2c8..96d9e4b 100644
--- a/nlp/src/main/java/hivemall/nlp/tokenizer/KuromojiUDF.java
+++ b/nlp/src/main/java/hivemall/nlp/tokenizer/KuromojiUDF.java
@@ -62,7 +62,10 @@ import org.apache.lucene.analysis.util.CharArraySet;
 @Description(
         name = "tokenize_ja",
         value = "_FUNC_(String line [, const string mode = \"normal\", const 
array<string> stopWords, const array<string> stopTags, const array<string> 
userDict (or string userDictURL)])"
-                + " - returns tokenized strings in array<string>")
+                + " - returns tokenized strings in array<string>",
+        extended = "select 
tokenize_ja(\"kuromojiãä½¿ã£ãåãã¡æ¸ãã®ãã¹ãã§ããç¬¬äºå¼æ°ã«ã¯normal/search/extendedãæå®ã§ãã¾ããããã©ã«ãã§ã¯normalã¢ã¼ãã§ãã\");\n"
+                + "\n"
+                + "> 
[\"kuromoji\",\"ä½¿ã\",\"åãã¡æ¸ã\",\"ãã¹ã\",\"ç¬¬\",\"äº\",\"å¼æ°\",\"normal\",\"search\",\"extended\",\"æå®\",\"ããã©ã«ã\",\"normal\",\"
 ã¢ã¼ã\"]\n")
 @UDFType(deterministic = true, stateful = false)
 public final class KuromojiUDF extends GenericUDF {
     private static final int CONNECT_TIMEOUT_MS = 10000; // 10 sec

http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/36fb839d/pom.xml
----------------------------------------------------------------------
diff --git a/pom.xml b/pom.xml
index 1b5c8e6..35e3767 100644
--- a/pom.xml
+++ b/pom.xml
@@ -251,6 +251,7 @@
                <module>mixserv</module>
                <module>spark</module>
                <module>dist</module>
+               <module>tools</module>
        </modules>
 
        <properties>

http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/36fb839d/tools/hivemall-docs/pom.xml
----------------------------------------------------------------------
diff --git a/tools/hivemall-docs/pom.xml b/tools/hivemall-docs/pom.xml
new file mode 100644
index 0000000..fc49b84
--- /dev/null
+++ b/tools/hivemall-docs/pom.xml
@@ -0,0 +1,120 @@
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied.  See the License for the
+  specific language governing permissions and limitations
+  under the License.
+-->
+<project xmlns="http://maven.apache.org/POM/4.0.0" 
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" 
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 
http://maven.apache.org/xsd/maven-4.0.0.xsd">
+       <modelVersion>4.0.0</modelVersion>
+
+       <parent>
+               <groupId>org.apache.hivemall</groupId>
+               <artifactId>hivemall-tools</artifactId>
+               <version>0.5.1-incubating-SNAPSHOT</version>
+               <relativePath>../pom.xml</relativePath>
+       </parent>
+
+       <!-- mvn org.apache.hivemall:hivemall-docs:generate-funcs-list -->
+       <artifactId>hivemall-docs</artifactId>
+       <name>Hivemall Documentation Tool</name>
+       <packaging>maven-plugin</packaging>
+
+       <properties>
+               <main.basedir>${project.parent.parent.basedir}</main.basedir>
+       </properties>
+
+       <dependencies>
+               <!-- provided scope -->
+               <dependency>
+                       <groupId>org.apache.maven</groupId>
+                       <artifactId>maven-core</artifactId>
+                       <version>3.5.2</version>
+                       <scope>provided</scope>
+               </dependency>
+               <dependency>
+                       <groupId>org.apache.maven</groupId>
+                       <artifactId>maven-plugin-api</artifactId>
+                       <version>3.5.2</version>
+                       <scope>provided</scope>
+               </dependency>
+               <dependency>
+                       <groupId>org.apache.maven.plugin-tools</groupId>
+                       <artifactId>maven-plugin-annotations</artifactId>
+                       <version>3.4</version>
+                       <scope>provided</scope>
+               </dependency>
+
+               <!-- hivemall dependencies -->
+               <dependency>
+                       <groupId>org.apache.hivemall</groupId>
+                       <artifactId>hivemall-core</artifactId>
+                       <version>${project.version}</version>
+                       <scope>compile</scope>
+               </dependency>
+               <dependency>
+                       <groupId>org.apache.hivemall</groupId>
+                       <artifactId>hivemall-nlp</artifactId>
+                       <version>${project.version}</version>
+                       <scope>compile</scope>
+               </dependency>
+               <dependency>
+                       <groupId>org.apache.hivemall</groupId>
+                       <artifactId>hivemall-xgboost</artifactId>
+                       <version>${project.version}</version>
+                       <scope>compile</scope>
+               </dependency>
+
+               <!-- compile scope -->
+               <dependency>
+                       <groupId>org.apache.hive</groupId>
+                       <artifactId>hive-exec</artifactId>
+                       <scope>compile</scope>
+               </dependency>
+               <dependency>
+                       <groupId>com.google.guava</groupId>
+                       <artifactId>guava</artifactId>
+                       <version>${guava.version}</version>
+                       <scope>compile</scope>
+               </dependency>
+               <dependency>
+                       <groupId>org.reflections</groupId>
+                       <artifactId>reflections</artifactId>
+                       <version>0.9.10</version>
+                       <scope>compile</scope>
+               </dependency>
+       </dependencies>
+
+       <build>
+               <plugins>
+                       <plugin>
+                               <groupId>org.apache.maven.plugins</groupId>
+                               <artifactId>maven-plugin-plugin</artifactId>
+                               <version>3.4</version>
+                               <executions>
+                                       <execution>
+                                               <id>default-descriptor</id>
+                                               <phase>process-classes</phase>
+                                       </execution>
+                                       <execution>
+                                               <id>generate-helpmojo</id>
+                                               <goals>
+                                                       <goal>helpmojo</goal>
+                                               </goals>
+                                       </execution>
+                               </executions>
+                       </plugin>
+               </plugins>
+       </build>
+</project>

http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/36fb839d/tools/hivemall-docs/src/main/java/hivemall/docs/FuncsListGenerator.java
----------------------------------------------------------------------
diff --git 
a/tools/hivemall-docs/src/main/java/hivemall/docs/FuncsListGenerator.java 
b/tools/hivemall-docs/src/main/java/hivemall/docs/FuncsListGenerator.java
new file mode 100644
index 0000000..187c752
--- /dev/null
+++ b/tools/hivemall-docs/src/main/java/hivemall/docs/FuncsListGenerator.java
@@ -0,0 +1,247 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package hivemall.docs;
+
+import hivemall.docs.utils.MarkdownUtils;
+import hivemall.utils.lang.StringUtils;
+
+import org.apache.maven.plugin.AbstractMojo;
+import org.apache.maven.execution.MavenSession;
+import org.apache.maven.plugin.MojoExecutionException;
+import org.apache.maven.plugins.annotations.Mojo;
+import org.apache.hadoop.hive.ql.exec.Description;
+import org.apache.maven.plugins.annotations.Parameter;
+import org.reflections.Reflections;
+
+import javax.annotation.Nonnull;
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.FileReader;
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.LinkedHashMap;
+import java.util.TreeSet;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import static org.apache.commons.lang.StringEscapeUtils.escapeHtml;
+
+/**
+ * Generate a list of UDFs for documentation.
+ *
+ * @link 
https://hivemall.incubator.apache.org/userguide/misc/generic_funcs.html
+ * @link https://hivemall.incubator.apache.org/userguide/misc/funcs.html
+ */
+@Mojo(name = "generate-funcs-list")
+public class FuncsListGenerator extends AbstractMojo {
+
+    @Parameter(defaultValue = "${basedir}", readonly = true)
+    private File basedir;
+
+    @Parameter(defaultValue = "${session}", readonly = true)
+    private MavenSession session;
+
+    @Parameter(defaultValue = "docs/gitbook/misc/generic_funcs.md")
+    private String pathToGenericFuncs;
+
+    @Parameter(defaultValue = "docs/gitbook/misc/funcs.md")
+    private String pathToFuncs;
+
+    private static final Map<String, List<String>> genericFuncsHeaders = new 
LinkedHashMap<>();
+    static {
+        genericFuncsHeaders.put("# Array",
+            Arrays.asList("hivemall.tools.array", "hivemall.tools.list"));
+        genericFuncsHeaders.put("# Map", 
Collections.singletonList("hivemall.tools.map"));
+        genericFuncsHeaders.put("# Bitset", 
Collections.singletonList("hivemall.tools.bits"));
+        genericFuncsHeaders.put("# Compression",
+            Collections.singletonList("hivemall.tools.compress"));
+        genericFuncsHeaders.put("# MapReduce", 
Collections.singletonList("hivemall.tools.mapred"));
+        genericFuncsHeaders.put("# Math", 
Collections.singletonList("hivemall.tools.math"));
+        genericFuncsHeaders.put("# Matrix", 
Collections.singletonList("hivemall.tools.matrix"));
+        genericFuncsHeaders.put("# Text processing",
+            Collections.singletonList("hivemall.tools.text"));
+        genericFuncsHeaders.put("# Others", 
Collections.singletonList("hivemall.tools"));
+    }
+
+    private static final Map<String, List<String>> funcsHeaders = new 
LinkedHashMap<>();
+    static {
+        funcsHeaders.put("# Regression", 
Collections.singletonList("hivemall.regression"));
+        funcsHeaders.put("# Classification", Collections.<String>emptyList());
+        funcsHeaders.put("## Binary classification",
+            Collections.singletonList("hivemall.classifier"));
+        funcsHeaders.put("## Multiclass classification",
+            Collections.singletonList("hivemall.classifier.multiclass"));
+        funcsHeaders.put("# Matrix factorization", 
Collections.singletonList("hivemall.mf"));
+        funcsHeaders.put("# Factorization machines", 
Collections.singletonList("hivemall.fm"));
+        funcsHeaders.put("# Recommendation", 
Collections.singletonList("hivemall.recommend"));
+        funcsHeaders.put("# Anomaly detection", 
Collections.singletonList("hivemall.anomaly"));
+        funcsHeaders.put("# Topic modeling", 
Collections.singletonList("hivemall.topicmodel"));
+        funcsHeaders.put("# Preprocessing", 
Collections.singletonList("hivemall.ftvec"));
+        funcsHeaders.put("## Data amplification",
+            Collections.singletonList("hivemall.ftvec.amplify"));
+        funcsHeaders.put("## Feature binning", 
Collections.singletonList("hivemall.ftvec.binning"));
+        funcsHeaders.put("## Feature format conversion",
+            Collections.singletonList("hivemall.ftvec.conv"));
+        funcsHeaders.put("## Feature hashing", 
Collections.singletonList("hivemall.ftvec.hashing"));
+        funcsHeaders.put("## Feature paring", 
Collections.singletonList("hivemall.ftvec.pairing"));
+        funcsHeaders.put("## Ranking", 
Collections.singletonList("hivemall.ftvec.ranking"));
+        funcsHeaders.put("## Feature scaling", 
Collections.singletonList("hivemall.ftvec.scaling"));
+        funcsHeaders.put("## Feature selection",
+            Collections.singletonList("hivemall.ftvec.selection"));
+        funcsHeaders.put("## Feature transformation and vectorization",
+            Collections.singletonList("hivemall.ftvec.trans"));
+        funcsHeaders.put("# Geospatial functions", 
Collections.singletonList("hivemall.geospatial"));
+        funcsHeaders.put("# Distance measures", 
Collections.singletonList("hivemall.knn.distance"));
+        funcsHeaders.put("# Locality-sensitive hashing",
+            Collections.singletonList("hivemall.knn.lsh"));
+        funcsHeaders.put("# Similarity measures",
+            Collections.singletonList("hivemall.knn.similarity"));
+        funcsHeaders.put("# Evaluation", 
Collections.singletonList("hivemall.evaluation"));
+        funcsHeaders.put("# Sketching", 
Collections.singletonList("hivemall.sketch.hll"));
+        funcsHeaders.put("# Ensemble learning", 
Collections.singletonList("hivemall.ensemble"));
+        funcsHeaders.put("## Bagging", 
Collections.singletonList("hivemall.ensemble.bagging"));
+        funcsHeaders.put("# Decision trees and RandomForest", Arrays.asList(
+            "hivemall.smile.classification", "hivemall.smile.regression", 
"hivemall.smile.tools"));
+        funcsHeaders.put("# XGBoost", 
Arrays.asList("hivemall.xgboost.classification",
+            "hivemall.xgboost.regression", "hivemall.xgboost.tools"));
+        funcsHeaders.put("# Others",
+            Arrays.asList("hivemall", "hivemall.dataset", 
"hivemall.ftvec.text"));
+    }
+
+    @Override
+    public void execute() throws MojoExecutionException {
+        if (!isReactorRootProject()) {
+            // output only once across the projects
+            return;
+        }
+
+        generate(
+            new File(basedir, pathToGenericFuncs),
+            "This page describes a list of useful Hivemall generic functions. 
See also a [list of machine-learning-related functions](./funcs.md).",
+            genericFuncsHeaders);
+        generate(
+            new File(basedir, pathToFuncs),
+            "This page describes a list of Hivemall functions. See also a 
[list of generic Hivemall functions](./generic_funcs.md) for more 
general-purpose functions such as array and map UDFs.",
+            funcsHeaders);
+    }
+
+    private boolean isReactorRootProject() {
+        return 
session.getExecutionRootDirectory().equalsIgnoreCase(basedir.toString());
+    }
+
+    private void generate(@Nonnull File outputFile, @Nonnull String preface,
+            @Nonnull Map<String, List<String>> headers) throws 
MojoExecutionException {
+        Reflections reflections = new Reflections("hivemall");
+        Set<Class<?>> annotatedClasses = 
reflections.getTypesAnnotatedWith(Description.class);
+
+        StringBuilder sb = new StringBuilder();
+        Map<String, Set<String>> packages = new HashMap<>();
+
+        Pattern func = Pattern.compile("_FUNC_(\\(.*?\\))(.*)", 
Pattern.DOTALL);
+
+        for (Class<?> annotatedClass : annotatedClasses) {
+            Deprecated deprecated = 
annotatedClass.getAnnotation(Deprecated.class);
+            if (deprecated != null) {
+                continue;
+            }
+
+            Description description = 
annotatedClass.getAnnotation(Description.class);
+
+            String value = description.value().replaceAll("\n", " ");
+            Matcher matcher = func.matcher(value);
+            if (matcher.find()) {
+                value = MarkdownUtils.asInlineCode(description.name() + 
matcher.group(1))
+                        + escapeHtml(matcher.group(2));
+            }
+            sb.append(MarkdownUtils.asListElement(value));
+
+            StringBuilder sbExtended = new StringBuilder();
+            if (!description.extended().isEmpty()) {
+                sbExtended.append(description.extended());
+                sb.append("\n");
+            }
+
+            String extended = sbExtended.toString();
+            if (!extended.isEmpty()) {
+                if (extended.toLowerCase().contains("select")) { // extended 
description contains SQL statements
+                    
sb.append(MarkdownUtils.indent(MarkdownUtils.asCodeBlock(extended, "sql")));
+                } else {
+                    
sb.append(MarkdownUtils.indent(MarkdownUtils.asCodeBlock(extended)));
+                }
+            } else {
+                sb.append("\n");
+            }
+
+            String packageName = annotatedClass.getPackage().getName();
+            if (!packages.containsKey(packageName)) {
+                Set<String> set = new TreeSet<>();
+                packages.put(packageName, set);
+            }
+            Set<String> List = packages.get(packageName);
+            List.add(sb.toString());
+
+            StringUtils.clear(sb);
+        }
+
+        PrintWriter writer;
+        try {
+            writer = new PrintWriter(outputFile);
+        } catch (FileNotFoundException e) {
+            throw new MojoExecutionException("Output file is not found");
+        }
+
+        // license header
+        writer.println("<!--");
+        try {
+            File licenseFile = new File(basedir, 
"resources/license-header.txt");
+            FileReader fileReader = new FileReader(licenseFile);
+            BufferedReader bufferedReader = new BufferedReader(fileReader);
+            String line;
+            while ((line = bufferedReader.readLine()) != null) {
+                writer.println(MarkdownUtils.indent(line));
+            }
+        } catch (IOException e) {
+            throw new MojoExecutionException("Failed to read license file");
+        }
+        writer.println("-->\n");
+
+        writer.println(preface);
+
+        writer.println("\n<!-- toc -->\n");
+
+        for (Map.Entry<String, List<String>> e : headers.entrySet()) {
+            writer.println(e.getKey() + "\n");
+            List<String> packageNames = e.getValue();
+            for (String packageName : packageNames) {
+                for (String desc : packages.get(packageName)) {
+                    writer.println(desc);
+                }
+            }
+        }
+
+        writer.close();
+    }
+}

http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/36fb839d/tools/hivemall-docs/src/main/java/hivemall/docs/utils/MarkdownUtils.java
----------------------------------------------------------------------
diff --git 
a/tools/hivemall-docs/src/main/java/hivemall/docs/utils/MarkdownUtils.java 
b/tools/hivemall-docs/src/main/java/hivemall/docs/utils/MarkdownUtils.java
new file mode 100644
index 0000000..677e6a7
--- /dev/null
+++ b/tools/hivemall-docs/src/main/java/hivemall/docs/utils/MarkdownUtils.java
@@ -0,0 +1,58 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package hivemall.docs.utils;
+
+public class MarkdownUtils {
+    private static final String TAB = "  ";
+
+    public static String indent(final String s) {
+        if (s.isEmpty()) {
+            return s;
+        }
+        return TAB + s.replaceAll("(\\r\\n|\\r|\\n)(.+)", "$1" + TAB + "$2");
+    }
+
+    public static String asBold(final String s) {
+        return "**" + s + "**";
+    }
+
+    public static String asInlineCode(final String s) {
+        return "`" + s + "`";
+    }
+
+    public static String asListElement(final String s) {
+        return "- " + s;
+    }
+
+    public static String asCodeBlock(final String s) {
+        return asCodeBlock(s, "");
+    }
+
+    public static String asCodeBlock(final String s, final String lang) {
+        return "```" + lang + "\n" + s + "\n```\n";
+    }
+
+    public static String asHeader(final String s, int level) {
+        char[] buf = new char[level];
+        for (int i = 0; i < level; i++) {
+            buf[i] = '#';
+        }
+        return new String(buf) + " " + s + "\n";
+    }
+}

http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/36fb839d/tools/pom.xml
----------------------------------------------------------------------
diff --git a/tools/pom.xml b/tools/pom.xml
new file mode 100644
index 0000000..6249896
--- /dev/null
+++ b/tools/pom.xml
@@ -0,0 +1,40 @@
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied.  See the License for the
+  specific language governing permissions and limitations
+  under the License.
+-->
+<project xmlns="http://maven.apache.org/POM/4.0.0" 
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" 
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 
http://maven.apache.org/xsd/maven-4.0.0.xsd">
+       <modelVersion>4.0.0</modelVersion>
+
+       <parent>
+               <groupId>org.apache.hivemall</groupId>
+               <artifactId>hivemall</artifactId>
+               <version>0.5.1-incubating-SNAPSHOT</version>
+               <relativePath>../pom.xml</relativePath>
+       </parent>
+
+       <artifactId>hivemall-tools</artifactId>
+       <name>Hivemall Tools</name>
+       <packaging>pom</packaging>
+
+       <modules>
+               <module>hivemall-docs</module>
+       </modules>
+
+       <properties>
+               <main.basedir>${project.parent.basedir}</main.basedir>
+       </properties>
+</project>

incubator-hivemall git commit: [HIVEMALL-193] Implement a tool for generating a list of Hivemall UDFs

Reply via email to