Repository: incubator-hivemall
Updated Branches:
  refs/heads/master e2666ec96 -> 2da3f381a (forced update)


[HIVEMALL-162] Support L1 normalization

Support `l1_normalize` in a similar manner to `l2_normalize`

Type of change: Feature

- https://issues.apache.org/jira/browse/HIVEMALL-59
- https://issues.apache.org/jira/browse/HIVEMALL-162

How this was tested: unit tests and a manual test on EMR

(Please remove this section if not needed; check `x` for YES, blank for NO)

- [x] Did you apply source code formatter, i.e., `mvn formatter:format`, for 
your commit?
- [x] Did you run system tests on Hive (or Spark)?

Author: Takuya Kitazawa <[email protected]>

Closes #126 from takuti/l1-normalize.


Project: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/repo
Commit: 
http://git-wip-us.apache.org/repos/asf/incubator-hivemall/commit/2da3f381
Tree: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/tree/2da3f381
Diff: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/diff/2da3f381

Branch: refs/heads/master
Commit: 2da3f381a073321f623208f2ef01627e0bea9945
Parents: 2fa6fb9
Author: Takuya Kitazawa <[email protected]>
Authored: Mon Dec 18 16:28:59 2017 +0900
Committer: Takuya Kitazawa <[email protected]>
Committed: Tue Dec 19 10:23:37 2017 +0900

----------------------------------------------------------------------
 .../ftvec/scaling/L1NormalizationUDF.java       | 80 ++++++++++++++++++++
 .../ftvec/scaling/L2NormalizationUDF.java       |  5 +-
 .../ftvec/scaling/L1NormalizationUDFTest.java   | 69 +++++++++++++++++
 .../ftvec/scaling/L2NormalizationUDFTest.java   |  3 +-
 docs/gitbook/ft_engineering/scaling.md          | 16 ++++
 resources/ddl/define-all-as-permanent.hive      |  3 +
 resources/ddl/define-all.hive                   |  3 +
 resources/ddl/define-all.spark                  |  3 +
 resources/ddl/define-udfs.td.hql                |  1 +
 9 files changed, 180 insertions(+), 3 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/2da3f381/core/src/main/java/hivemall/ftvec/scaling/L1NormalizationUDF.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/hivemall/ftvec/scaling/L1NormalizationUDF.java 
b/core/src/main/java/hivemall/ftvec/scaling/L1NormalizationUDF.java
new file mode 100644
index 0000000..45ef97d
--- /dev/null
+++ b/core/src/main/java/hivemall/ftvec/scaling/L1NormalizationUDF.java
@@ -0,0 +1,80 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package hivemall.ftvec.scaling;
+
+import org.apache.hadoop.hive.ql.exec.Description;
+import org.apache.hadoop.hive.ql.exec.UDF;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.ql.udf.UDFType;
+import org.apache.hadoop.io.Text;
+
+import java.util.Arrays;
+import java.util.List;
+
+@Description(name = "l1_normalize", value = "_FUNC_(ftvec string) - Returned a 
L1 normalized value")
+@UDFType(deterministic = true, stateful = false)
+public final class L1NormalizationUDF extends UDF {
+
+    public List<Text> evaluate(final List<Text> ftvecs) throws HiveException {
+        if (ftvecs == null) {
+            return null;
+        }
+        double absoluteSum = 0.d;
+        final int numFeatures = ftvecs.size();
+        final String[] features = new String[numFeatures];
+        final float[] weights = new float[numFeatures];
+        for (int i = 0; i < numFeatures; i++) {
+            Text ftvec = ftvecs.get(i);
+            if (ftvec == null) {
+                continue;
+            }
+            String s = ftvec.toString();
+            final String[] ft = s.split(":");
+            final int ftlen = ft.length;
+            if (ftlen == 1) {
+                features[i] = ft[0];
+                weights[i] = 1.f;
+                absoluteSum += 1.d;
+            } else if (ftlen == 2) {
+                features[i] = ft[0];
+                float v = Float.parseFloat(ft[1]);
+                weights[i] = v;
+                absoluteSum += Math.abs(v);
+            } else {
+                throw new HiveException("Invalid feature value representation: 
" + s);
+            }
+        }
+        final float norm = (float) absoluteSum;
+        final Text[] t = new Text[numFeatures];
+        if (norm == 0.f) {
+            for (int i = 0; i < numFeatures; i++) {
+                String f = features[i];
+                t[i] = new Text(f + ':' + 0.f);
+            }
+        } else {
+            for (int i = 0; i < numFeatures; i++) {
+                String f = features[i];
+                float v = weights[i] / norm;
+                t[i] = new Text(f + ':' + v);
+            }
+        }
+        return Arrays.asList(t);
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/2da3f381/core/src/main/java/hivemall/ftvec/scaling/L2NormalizationUDF.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/hivemall/ftvec/scaling/L2NormalizationUDF.java 
b/core/src/main/java/hivemall/ftvec/scaling/L2NormalizationUDF.java
index 8b05a36..9cf315c 100644
--- a/core/src/main/java/hivemall/ftvec/scaling/L2NormalizationUDF.java
+++ b/core/src/main/java/hivemall/ftvec/scaling/L2NormalizationUDF.java
@@ -23,6 +23,7 @@ import java.util.List;
 
 import org.apache.hadoop.hive.ql.exec.Description;
 import org.apache.hadoop.hive.ql.exec.UDF;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
 import org.apache.hadoop.hive.ql.udf.UDFType;
 import org.apache.hadoop.io.Text;
 
@@ -33,7 +34,7 @@ import org.apache.hadoop.io.Text;
 @UDFType(deterministic = true, stateful = false)
 public final class L2NormalizationUDF extends UDF {
 
-    public List<Text> evaluate(final List<Text> ftvecs) {
+    public List<Text> evaluate(final List<Text> ftvecs) throws HiveException {
         if (ftvecs == null) {
             return null;
         }
@@ -59,7 +60,7 @@ public final class L2NormalizationUDF extends UDF {
                 weights[i] = v;
                 squaredSum += (v * v);
             } else {
-                throw new IllegalArgumentException("Invalid feature value 
representation: " + s);
+                throw new HiveException("Invalid feature value representation: 
" + s);
             }
         }
         final float norm = (float) Math.sqrt(squaredSum);

http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/2da3f381/core/src/test/java/hivemall/ftvec/scaling/L1NormalizationUDFTest.java
----------------------------------------------------------------------
diff --git 
a/core/src/test/java/hivemall/ftvec/scaling/L1NormalizationUDFTest.java 
b/core/src/test/java/hivemall/ftvec/scaling/L1NormalizationUDFTest.java
new file mode 100644
index 0000000..7d997f7
--- /dev/null
+++ b/core/src/test/java/hivemall/ftvec/scaling/L1NormalizationUDFTest.java
@@ -0,0 +1,69 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package hivemall.ftvec.scaling;
+
+import hivemall.utils.hadoop.WritableUtils;
+import hivemall.utils.math.MathUtils;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.io.Text;
+import org.junit.Test;
+
+import java.util.Collections;
+import java.util.List;
+
+import static org.junit.Assert.assertEquals;
+
+public class L1NormalizationUDFTest {
+
+    @Test
+    public void test() throws HiveException {
+        L1NormalizationUDF udf = new L1NormalizationUDF();
+
+        assertEquals(null, udf.evaluate(null));
+
+        assertEquals(WritableUtils.val(new String[] {}),
+            udf.evaluate(WritableUtils.val(new String[] {})));
+
+        assertEquals(WritableUtils.val(new String[] {"aaa:1.0"}),
+            udf.evaluate(WritableUtils.val(new String[] {"aaa"})));
+
+        assertEquals(WritableUtils.val(new String[] {"aaa:1.0"}),
+            udf.evaluate(WritableUtils.val(new String[] {"aaa:1"})));
+
+        assertEquals(WritableUtils.val(new String[] {"aaa:1.0"}),
+            udf.evaluate(WritableUtils.val(new String[] {"aaa:1.0"})));
+
+        float[] normalized = MathUtils.l1normalize(new float[] {1.0f, 0.5f});
+        assertEquals(
+            WritableUtils.val(new String[] {"aaa:" + normalized[0], "bbb:" + 
normalized[1]}),
+            udf.evaluate(WritableUtils.val(new String[] {"aaa:1.0", 
"bbb:0.5"})));
+
+        normalized = MathUtils.l1normalize(new float[] {1.0f, -0.5f});
+        assertEquals(
+            WritableUtils.val(new String[] {"aaa:" + normalized[0], "bbb:" + 
normalized[1]}),
+            udf.evaluate(WritableUtils.val(new String[] {"aaa:1.0", 
"bbb:-0.5"})));
+
+        List<Text> expected = udf.evaluate(WritableUtils.val(new String[] 
{"bbb:-0.5", "aaa:1.0"}));
+        Collections.sort(expected);
+        List<Text> actual = udf.evaluate(WritableUtils.val(new String[] 
{"aaa:1.0", "bbb:-0.5"}));
+        Collections.sort(actual);
+        assertEquals(expected, actual);
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/2da3f381/core/src/test/java/hivemall/ftvec/scaling/L2NormalizationUDFTest.java
----------------------------------------------------------------------
diff --git 
a/core/src/test/java/hivemall/ftvec/scaling/L2NormalizationUDFTest.java 
b/core/src/test/java/hivemall/ftvec/scaling/L2NormalizationUDFTest.java
index bd80577..30e2aba 100644
--- a/core/src/test/java/hivemall/ftvec/scaling/L2NormalizationUDFTest.java
+++ b/core/src/test/java/hivemall/ftvec/scaling/L2NormalizationUDFTest.java
@@ -25,13 +25,14 @@ import hivemall.utils.math.MathUtils;
 import java.util.Collections;
 import java.util.List;
 
+import org.apache.hadoop.hive.ql.metadata.HiveException;
 import org.apache.hadoop.io.Text;
 import org.junit.Test;
 
 public class L2NormalizationUDFTest {
 
     @Test
-    public void test() {
+    public void test() throws HiveException {
         L2NormalizationUDF udf = new L2NormalizationUDF();
 
         assertEquals(null, udf.evaluate(null));

http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/2da3f381/docs/gitbook/ft_engineering/scaling.md
----------------------------------------------------------------------
diff --git a/docs/gitbook/ft_engineering/scaling.md 
b/docs/gitbook/ft_engineering/scaling.md
index b419254..ff3ccef 100644
--- a/docs/gitbook/ft_engineering/scaling.md
+++ b/docs/gitbook/ft_engineering/scaling.md
@@ -19,6 +19,22 @@
 
 <!-- toc -->
 
+# L1/L2 Normalization
+
+[L1](http://mathworld.wolfram.com/L1-Norm.html) and 
[L2](http://mathworld.wolfram.com/L2-Norm.html) normalization rescale each 
feature vector so that its L1 or L2 norm, respectively, equals one:
+
+```sql
+select l1_normalize(array('apple:1.0', 'banana:0.5'))
+```
+
+> ["apple:0.6666667","banana:0.33333334"]
+
+```sql
+select l2_normalize(array('apple:1.0', 'banana:0.5'))
+```
+
+> ["apple:0.8944272","banana:0.4472136"]
+
 # Min-Max Normalization
 http://en.wikipedia.org/wiki/Feature_scaling#Rescaling
 ```sql

http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/2da3f381/resources/ddl/define-all-as-permanent.hive
----------------------------------------------------------------------
diff --git a/resources/ddl/define-all-as-permanent.hive 
b/resources/ddl/define-all-as-permanent.hive
index fa307d5..ed9f22f 100644
--- a/resources/ddl/define-all-as-permanent.hive
+++ b/resources/ddl/define-all-as-permanent.hive
@@ -211,6 +211,9 @@ CREATE FUNCTION rescale as 
'hivemall.ftvec.scaling.RescaleUDF' USING JAR '${hive
 DROP FUNCTION IF EXISTS zscore;
 CREATE FUNCTION zscore as 'hivemall.ftvec.scaling.ZScoreUDF' USING JAR 
'${hivemall_jar}';
 
+DROP FUNCTION IF EXISTS l1_normalize;
+CREATE FUNCTION l1_normalize as 'hivemall.ftvec.scaling.L1NormalizationUDF' 
USING JAR '${hivemall_jar}';
+
 DROP FUNCTION IF EXISTS l2_normalize;
 CREATE FUNCTION l2_normalize as 'hivemall.ftvec.scaling.L2NormalizationUDF' 
USING JAR '${hivemall_jar}';
 

http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/2da3f381/resources/ddl/define-all.hive
----------------------------------------------------------------------
diff --git a/resources/ddl/define-all.hive b/resources/ddl/define-all.hive
index 13abe76..0267a6d 100644
--- a/resources/ddl/define-all.hive
+++ b/resources/ddl/define-all.hive
@@ -207,6 +207,9 @@ create temporary function rescale as 
'hivemall.ftvec.scaling.RescaleUDF';
 drop temporary function if exists zscore;
 create temporary function zscore as 'hivemall.ftvec.scaling.ZScoreUDF';
 
+drop temporary function if exists l1_normalize;
+create temporary function l1_normalize as 
'hivemall.ftvec.scaling.L1NormalizationUDF';
+
 drop temporary function if exists l2_normalize;
 create temporary function l2_normalize as 
'hivemall.ftvec.scaling.L2NormalizationUDF';
 

http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/2da3f381/resources/ddl/define-all.spark
----------------------------------------------------------------------
diff --git a/resources/ddl/define-all.spark b/resources/ddl/define-all.spark
index 67e3765..cf4a15c 100644
--- a/resources/ddl/define-all.spark
+++ b/resources/ddl/define-all.spark
@@ -212,6 +212,9 @@ sqlContext.sql("CREATE TEMPORARY FUNCTION rescale AS 
'hivemall.ftvec.scaling.Res
 sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS zscore")
 sqlContext.sql("CREATE TEMPORARY FUNCTION zscore AS 
'hivemall.ftvec.scaling.ZScoreUDF'")
 
+sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS l1_normalize")
+sqlContext.sql("CREATE TEMPORARY FUNCTION l1_normalize AS 
'hivemall.ftvec.scaling.L1NormalizationUDF'")
+
 sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS l2_normalize")
 sqlContext.sql("CREATE TEMPORARY FUNCTION l2_normalize AS 
'hivemall.ftvec.scaling.L2NormalizationUDF'")
 

http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/2da3f381/resources/ddl/define-udfs.td.hql
----------------------------------------------------------------------
diff --git a/resources/ddl/define-udfs.td.hql b/resources/ddl/define-udfs.td.hql
index 5bcd366..6a7b75b 100644
--- a/resources/ddl/define-udfs.td.hql
+++ b/resources/ddl/define-udfs.td.hql
@@ -49,6 +49,7 @@ create temporary function polynomial_features as 
'hivemall.ftvec.pairing.Polynom
 create temporary function powered_features as 
'hivemall.ftvec.pairing.PoweredFeaturesUDF';
 create temporary function rescale as 'hivemall.ftvec.scaling.RescaleUDF';
 create temporary function zscore as 'hivemall.ftvec.scaling.ZScoreUDF';
+create temporary function l1_normalize as 
'hivemall.ftvec.scaling.L1NormalizationUDF';
 create temporary function l2_normalize as 
'hivemall.ftvec.scaling.L2NormalizationUDF';
 create temporary function chi2 as 'hivemall.ftvec.selection.ChiSquareUDF';
 create temporary function snr as 
'hivemall.ftvec.selection.SignalNoiseRatioUDAF';

Reply via email to