Repository: incubator-hivemall Updated Branches: refs/heads/master e2666ec96 -> 2da3f381a (forced update)
[HIVEMALL-162] Support L1 normalization Support `l1_normalize` in a similar manner to `l2_normalize` Feature - https://issues.apache.org/jira/browse/HIVEMALL-59 - https://issues.apache.org/jira/browse/HIVEMALL-162 Unit test and manual test on EMR (Please remove this section if not needed; check `x` for YES, blank for NO) - [x] Did you apply source code formatter, i.e., `mvn formatter:format`, for your commit? - [x] Did you run system tests on Hive (or Spark)? Author: Takuya Kitazawa <[email protected]> Closes #126 from takuti/l1-normalize. Project: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/commit/2da3f381 Tree: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/tree/2da3f381 Diff: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/diff/2da3f381 Branch: refs/heads/master Commit: 2da3f381a073321f623208f2ef01627e0bea9945 Parents: 2fa6fb9 Author: Takuya Kitazawa <[email protected]> Authored: Mon Dec 18 16:28:59 2017 +0900 Committer: Takuya Kitazawa <[email protected]> Committed: Tue Dec 19 10:23:37 2017 +0900 ---------------------------------------------------------------------- .../ftvec/scaling/L1NormalizationUDF.java | 80 ++++++++++++++++++++ .../ftvec/scaling/L2NormalizationUDF.java | 5 +- .../ftvec/scaling/L1NormalizationUDFTest.java | 69 +++++++++++++++++ .../ftvec/scaling/L2NormalizationUDFTest.java | 3 +- docs/gitbook/ft_engineering/scaling.md | 16 ++++ resources/ddl/define-all-as-permanent.hive | 3 + resources/ddl/define-all.hive | 3 + resources/ddl/define-all.spark | 3 + resources/ddl/define-udfs.td.hql | 1 + 9 files changed, 180 insertions(+), 3 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/2da3f381/core/src/main/java/hivemall/ftvec/scaling/L1NormalizationUDF.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/hivemall/ftvec/scaling/L1NormalizationUDF.java b/core/src/main/java/hivemall/ftvec/scaling/L1NormalizationUDF.java new file mode 100644 index 0000000..45ef97d --- /dev/null +++ b/core/src/main/java/hivemall/ftvec/scaling/L1NormalizationUDF.java @@ -0,0 +1,80 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package hivemall.ftvec.scaling; + +import org.apache.hadoop.hive.ql.exec.Description; +import org.apache.hadoop.hive.ql.exec.UDF; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.udf.UDFType; +import org.apache.hadoop.io.Text; + +import java.util.Arrays; +import java.util.List; + +@Description(name = "l1_normalize", value = "_FUNC_(ftvec string) - Returned a L1 normalized value") +@UDFType(deterministic = true, stateful = false) +public final class L1NormalizationUDF extends UDF { + + public List<Text> evaluate(final List<Text> ftvecs) throws HiveException { + if (ftvecs == null) { + return null; + } + double absoluteSum = 0.d; + final int numFeatures = ftvecs.size(); + final String[] features = new String[numFeatures]; + final float[] weights = new float[numFeatures]; + for (int i = 0; i < numFeatures; i++) { + Text ftvec = ftvecs.get(i); + if (ftvec == null) { + continue; + } + String s = ftvec.toString(); + final String[] ft = s.split(":"); + final int ftlen = ft.length; + if (ftlen == 1) { + features[i] = ft[0]; + weights[i] = 1.f; + absoluteSum += 1.d; + } else if (ftlen == 2) { + features[i] = ft[0]; + float v = Float.parseFloat(ft[1]); + weights[i] = v; + absoluteSum += Math.abs(v); + } else { + throw new HiveException("Invalid feature value representation: " + s); + } + } + final float norm = (float) absoluteSum; + final Text[] t = new Text[numFeatures]; + if (norm == 0.f) { + for (int i = 0; i < numFeatures; i++) { + String f = features[i]; + t[i] = new Text(f + ':' + 0.f); + } + } else { + for (int i = 0; i < numFeatures; i++) { + String f = features[i]; + float v = weights[i] / norm; + t[i] = new Text(f + ':' + v); + } + } + return Arrays.asList(t); + } + +} http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/2da3f381/core/src/main/java/hivemall/ftvec/scaling/L2NormalizationUDF.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/hivemall/ftvec/scaling/L2NormalizationUDF.java b/core/src/main/java/hivemall/ftvec/scaling/L2NormalizationUDF.java index 8b05a36..9cf315c 100644 --- a/core/src/main/java/hivemall/ftvec/scaling/L2NormalizationUDF.java +++ b/core/src/main/java/hivemall/ftvec/scaling/L2NormalizationUDF.java @@ -23,6 +23,7 @@ import java.util.List; import org.apache.hadoop.hive.ql.exec.Description; import org.apache.hadoop.hive.ql.exec.UDF; +import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.udf.UDFType; import org.apache.hadoop.io.Text; @@ -33,7 +34,7 @@ import org.apache.hadoop.io.Text; @UDFType(deterministic = true, stateful = false) public final class L2NormalizationUDF extends UDF { - public List<Text> evaluate(final List<Text> ftvecs) { + public List<Text> evaluate(final List<Text> ftvecs) throws HiveException { if (ftvecs == null) { return null; } @@ -59,7 +60,7 @@ public final class L2NormalizationUDF extends UDF { weights[i] = v; squaredSum += (v * v); } else { - throw new IllegalArgumentException("Invalid feature value representation: " + s); + throw new HiveException("Invalid feature value representation: " + s); } } final float norm = (float) Math.sqrt(squaredSum); http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/2da3f381/core/src/test/java/hivemall/ftvec/scaling/L1NormalizationUDFTest.java ---------------------------------------------------------------------- diff --git a/core/src/test/java/hivemall/ftvec/scaling/L1NormalizationUDFTest.java b/core/src/test/java/hivemall/ftvec/scaling/L1NormalizationUDFTest.java new file mode 100644 index 0000000..7d997f7 --- /dev/null +++ b/core/src/test/java/hivemall/ftvec/scaling/L1NormalizationUDFTest.java @@ -0,0 +1,69 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package hivemall.ftvec.scaling; + +import hivemall.utils.hadoop.WritableUtils; +import hivemall.utils.math.MathUtils; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.io.Text; +import org.junit.Test; + +import java.util.Collections; +import java.util.List; + +import static org.junit.Assert.assertEquals; + +public class L1NormalizationUDFTest { + + @Test + public void test() throws HiveException { + L1NormalizationUDF udf = new L1NormalizationUDF(); + + assertEquals(null, udf.evaluate(null)); + + assertEquals(WritableUtils.val(new String[] {}), + udf.evaluate(WritableUtils.val(new String[] {}))); + + assertEquals(WritableUtils.val(new String[] {"aaa:1.0"}), + udf.evaluate(WritableUtils.val(new String[] {"aaa"}))); + + assertEquals(WritableUtils.val(new String[] {"aaa:1.0"}), + udf.evaluate(WritableUtils.val(new String[] {"aaa:1"}))); + + assertEquals(WritableUtils.val(new String[] {"aaa:1.0"}), + udf.evaluate(WritableUtils.val(new String[] {"aaa:1.0"}))); + + float[] normalized = MathUtils.l1normalize(new float[] {1.0f, 0.5f}); + assertEquals( + WritableUtils.val(new String[] {"aaa:" + normalized[0], "bbb:" + normalized[1]}), + udf.evaluate(WritableUtils.val(new String[] {"aaa:1.0", "bbb:0.5"}))); + + normalized = MathUtils.l1normalize(new float[] {1.0f, -0.5f}); + assertEquals( + WritableUtils.val(new String[] {"aaa:" + normalized[0], "bbb:" + normalized[1]}), + udf.evaluate(WritableUtils.val(new String[] {"aaa:1.0", "bbb:-0.5"}))); + + List<Text> expected = udf.evaluate(WritableUtils.val(new String[] {"bbb:-0.5", "aaa:1.0"})); + Collections.sort(expected); + List<Text> actual = udf.evaluate(WritableUtils.val(new String[] {"aaa:1.0", "bbb:-0.5"})); + Collections.sort(actual); + assertEquals(expected, actual); + } + +} http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/2da3f381/core/src/test/java/hivemall/ftvec/scaling/L2NormalizationUDFTest.java ---------------------------------------------------------------------- diff --git a/core/src/test/java/hivemall/ftvec/scaling/L2NormalizationUDFTest.java b/core/src/test/java/hivemall/ftvec/scaling/L2NormalizationUDFTest.java index bd80577..30e2aba 100644 --- a/core/src/test/java/hivemall/ftvec/scaling/L2NormalizationUDFTest.java +++ b/core/src/test/java/hivemall/ftvec/scaling/L2NormalizationUDFTest.java @@ -25,13 +25,14 @@ import hivemall.utils.math.MathUtils; import java.util.Collections; import java.util.List; +import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.io.Text; import org.junit.Test; public class L2NormalizationUDFTest { @Test - public void test() { + public void test() throws HiveException { L2NormalizationUDF udf = new L2NormalizationUDF(); assertEquals(null, udf.evaluate(null)); http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/2da3f381/docs/gitbook/ft_engineering/scaling.md ---------------------------------------------------------------------- diff --git a/docs/gitbook/ft_engineering/scaling.md b/docs/gitbook/ft_engineering/scaling.md index b419254..ff3ccef 100644 --- a/docs/gitbook/ft_engineering/scaling.md +++ b/docs/gitbook/ft_engineering/scaling.md @@ -19,6 +19,22 @@ <!-- toc --> +# L1/L2 Normalization + +[L1](http://mathworld.wolfram.com/L1-Norm.html) and [L2](http://mathworld.wolfram.com/L2-Norm.html) normalization ensures that each feature vector has unit length: + +```sql +select l1_normalize(array('apple:1.0', 'banana:0.5')) +``` + +> ["apple:0.6666667","banana:0.33333334"] + +```sql +select l2_normalize(array('apple:1.0', 'banana:0.5')) +``` + +> ["apple:0.8944272","banana:0.4472136"] + # Min-Max Normalization http://en.wikipedia.org/wiki/Feature_scaling#Rescaling ```sql http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/2da3f381/resources/ddl/define-all-as-permanent.hive ---------------------------------------------------------------------- diff --git a/resources/ddl/define-all-as-permanent.hive b/resources/ddl/define-all-as-permanent.hive index fa307d5..ed9f22f 100644 --- a/resources/ddl/define-all-as-permanent.hive +++ b/resources/ddl/define-all-as-permanent.hive @@ -211,6 +211,9 @@ CREATE FUNCTION rescale as 'hivemall.ftvec.scaling.RescaleUDF' USING JAR '${hive DROP FUNCTION IF EXISTS zscore; CREATE FUNCTION zscore as 'hivemall.ftvec.scaling.ZScoreUDF' USING JAR '${hivemall_jar}'; +DROP FUNCTION IF EXISTS l1_normalize; +CREATE FUNCTION l1_normalize as 'hivemall.ftvec.scaling.L1NormalizationUDF' USING JAR '${hivemall_jar}'; + DROP FUNCTION IF EXISTS l2_normalize; CREATE FUNCTION l2_normalize as 'hivemall.ftvec.scaling.L2NormalizationUDF' USING JAR '${hivemall_jar}'; http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/2da3f381/resources/ddl/define-all.hive ---------------------------------------------------------------------- diff --git a/resources/ddl/define-all.hive b/resources/ddl/define-all.hive index 13abe76..0267a6d 100644 --- a/resources/ddl/define-all.hive +++ b/resources/ddl/define-all.hive @@ -207,6 +207,9 @@ create temporary function rescale as 'hivemall.ftvec.scaling.RescaleUDF'; drop temporary function if exists zscore; create temporary function zscore as 'hivemall.ftvec.scaling.ZScoreUDF'; +drop temporary function if exists l1_normalize; +create temporary function l1_normalize as 'hivemall.ftvec.scaling.L1NormalizationUDF'; + drop temporary function if exists l2_normalize; create temporary function l2_normalize as 'hivemall.ftvec.scaling.L2NormalizationUDF'; http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/2da3f381/resources/ddl/define-all.spark ---------------------------------------------------------------------- diff --git a/resources/ddl/define-all.spark b/resources/ddl/define-all.spark index 67e3765..cf4a15c 100644 --- a/resources/ddl/define-all.spark +++ b/resources/ddl/define-all.spark @@ -212,6 +212,9 @@ sqlContext.sql("CREATE TEMPORARY FUNCTION rescale AS 'hivemall.ftvec.scaling.Res sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS zscore") sqlContext.sql("CREATE TEMPORARY FUNCTION zscore AS 'hivemall.ftvec.scaling.ZScoreUDF'") +sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS l1_normalize") +sqlContext.sql("CREATE TEMPORARY FUNCTION l1_normalize AS 'hivemall.ftvec.scaling.L1NormalizationUDF'") + sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS l2_normalize") sqlContext.sql("CREATE TEMPORARY FUNCTION l2_normalize AS 'hivemall.ftvec.scaling.L2NormalizationUDF'") http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/2da3f381/resources/ddl/define-udfs.td.hql ---------------------------------------------------------------------- diff --git a/resources/ddl/define-udfs.td.hql b/resources/ddl/define-udfs.td.hql index 5bcd366..6a7b75b 100644 --- a/resources/ddl/define-udfs.td.hql +++ b/resources/ddl/define-udfs.td.hql @@ -49,6 +49,7 @@ create temporary function polynomial_features as 'hivemall.ftvec.pairing.Polynom create temporary function powered_features as 'hivemall.ftvec.pairing.PoweredFeaturesUDF'; create temporary function rescale as 'hivemall.ftvec.scaling.RescaleUDF'; create temporary function zscore as 'hivemall.ftvec.scaling.ZScoreUDF'; +create temporary function l1_normalize as 'hivemall.ftvec.scaling.L1NormalizationUDF'; create temporary function l2_normalize as 'hivemall.ftvec.scaling.L2NormalizationUDF'; create temporary function chi2 as 'hivemall.ftvec.selection.ChiSquareUDF'; create temporary function snr as 'hivemall.ftvec.selection.SignalNoiseRatioUDAF';
