Repository: incubator-hivemall Updated Branches: refs/heads/master 518e232d8 -> fad2941fd
http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/fad2941f/core/src/test/java/hivemall/utils/lang/PreconditionsTest.java ---------------------------------------------------------------------- diff --git a/core/src/test/java/hivemall/utils/lang/PreconditionsTest.java b/core/src/test/java/hivemall/utils/lang/PreconditionsTest.java new file mode 100644 index 0000000..b0cfbd0 --- /dev/null +++ b/core/src/test/java/hivemall/utils/lang/PreconditionsTest.java @@ -0,0 +1,37 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package hivemall.utils.lang; + +import org.apache.hadoop.hive.ql.exec.UDFArgumentException; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.junit.Test; + +public class PreconditionsTest { + + @Test(expected = UDFArgumentException.class) + public void testCheckNotNullTClassOfE() throws UDFArgumentException { + Preconditions.checkNotNull(null, UDFArgumentException.class); + } + + @Test(expected = HiveException.class) + public void testCheckArgumentBooleanClassOfE() throws UDFArgumentException, HiveException { + Preconditions.checkArgument(false, HiveException.class); + } + +} http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/fad2941f/docs/gitbook/SUMMARY.md ---------------------------------------------------------------------- diff --git a/docs/gitbook/SUMMARY.md b/docs/gitbook/SUMMARY.md index c333c98..33bb46c 100644 --- a/docs/gitbook/SUMMARY.md +++ b/docs/gitbook/SUMMARY.md @@ -61,6 +61,8 @@ * [Vectorize Features](ft_engineering/vectorizer.md) * [Quantify non-number features](ft_engineering/quantify.md) +* [Feature selection](ft_engineering/feature_selection.md) + ## Part IV - Evaluation * [Statistical evaluation of a prediction model](eval/stat_eval.md) http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/fad2941f/docs/gitbook/ft_engineering/feature_selection.md ---------------------------------------------------------------------- diff --git a/docs/gitbook/ft_engineering/feature_selection.md b/docs/gitbook/ft_engineering/feature_selection.md new file mode 100644 index 0000000..5a2a92b --- /dev/null +++ b/docs/gitbook/ft_engineering/feature_selection.md @@ -0,0 +1,155 @@ +<!-- + Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. 
The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. +--> + +[Feature Selection](https://en.wikipedia.org/wiki/Feature_selection) is the process of selecting a subset of relevant features for use in model construction. + +It is a useful technique to 1) improve prediction results by omitting redundant features, 2) to shorten training time, and 3) to know important features for prediction. + +*Note: The feature is supported Hivemall from v0.5-rc.1 or later.* + +<!-- toc --> + +# Supported Feature Selection algorithms + +* Chi-square (Chi2) + * In statistics, the $$\chi^2$$ test is applied to test the independence of two even events. Chi-square statistics between every feature variable and the target variable can be applied to Feature Selection. Refer [this article](http://nlp.stanford.edu/IR-book/html/htmledition/feature-selectionchi2-feature-selection-1.html) for Mathematical details. +* Signal Noise Ratio (SNR) + * The Signal Noise Ratio (SNR) is a univariate feature ranking metric, which can be used as a feature selection criterion for binary classification problems. SNR is defined as $$|\mu_{1} - \mu_{2}| / (\sigma_{1} + \sigma_{2})$$, where $$\mu_{k}$$ is the mean value of the variable in classes $$k$$, and $$\sigma_{k}$$ is the standard deviations of the variable in classes $$k$$. Clearly, features with larger SNR are useful for classification. 
+ +# Usage + +## Feature Selection based on Chi-square test + +``` sql +CREATE TABLE input ( + X array<double>, -- features + Y array<int> -- binarized label +); + +set hivevar:k=2; + +WITH stats AS ( + SELECT + transpose_and_dot(Y, X) AS observed, -- array<array<double>>, shape = (n_classes, n_features) + array_sum(X) AS feature_count, -- n_features col vector, shape = (1, array<double>) + array_avg(Y) AS class_prob -- n_class col vector, shape = (1, array<double>) + FROM + input +), +test AS ( + SELECT + transpose_and_dot(class_prob, feature_count) AS expected -- array<array<double>>, shape = (n_class, n_features) + FROM + stats +), +chi2 AS ( + SELECT + chi2(r.observed, l.expected) AS v -- struct<array<double>, array<double>>, each shape = (1, n_features) + FROM + test l + CROSS JOIN stats r +) +SELECT + select_k_best(l.X, r.v.chi2, ${k}) as features -- top-k feature selection based on chi2 score +FROM + input l + CROSS JOIN chi2 r; +``` + +## Feature Selection based on Signal Noise Ratio (SNR) + +``` sql +CREATE TABLE input ( + X array<double>, -- features + Y array<int> -- binarized label +); + +set hivevar:k=2; + +WITH snr AS ( + SELECT snr(X, Y) AS snr -- aggregated SNR as array<double>, shape = (1, #features) + FROM input +) +SELECT + select_k_best(X, snr, ${k}) as features +FROM + input + CROSS JOIN snr; +``` + +# Function signatures + +### [UDAF] `transpose_and_dot(X::array<number>, Y::array<number>)::array<array<double>>` + +##### Input + +| `array<number>` X | `array<number>` Y | +| :-: | :-: | +| a row of matrix | a row of matrix | + +##### Output + +| `array<array<double>>` dot product | +| :-: | +| `dot(X.T, Y)` of shape = (X.#cols, Y.#cols) | + +### [UDF] `select_k_best(X::array<number>, importance_list::array<number>, k::int)::array<double>` + +##### Input + +| `array<number>` X | `array<number>` importance_list | `int` k | +| :-: | :-: | :-: | +| feature vector | importance of each feature | the number of features to be selected | + +##### Output 
| `array<double>` k-best features |
-*Note: The feature is supported Hivemall v0.4 or later.* +*Note: The feature is supported from Hivemall v0.4 or later.* ```sql desc train; http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/fad2941f/resources/ddl/define-all-as-permanent.hive ---------------------------------------------------------------------- diff --git a/resources/ddl/define-all-as-permanent.hive b/resources/ddl/define-all-as-permanent.hive index 1906de1..72835b1 100644 --- a/resources/ddl/define-all-as-permanent.hive +++ b/resources/ddl/define-all-as-permanent.hive @@ -199,6 +199,16 @@ CREATE FUNCTION zscore as 'hivemall.ftvec.scaling.ZScoreUDF' USING JAR '${hivema DROP FUNCTION IF EXISTS l2_normalize; CREATE FUNCTION l2_normalize as 'hivemall.ftvec.scaling.L2NormalizationUDF' USING JAR '${hivemall_jar}'; +--------------------------------- +-- Feature Selection functions -- +--------------------------------- + +DROP FUNCTION IF EXISTS chi2; +CREATE FUNCTION chi2 as 'hivemall.ftvec.selection.ChiSquareUDF' USING JAR '${hivemall_jar}'; + +DROP FUNCTION IF EXISTS snr; +CREATE FUNCTION snr as 'hivemall.ftvec.selection.SignalNoiseRatioUDAF' USING JAR '${hivemall_jar}'; + -------------------- -- misc functions -- -------------------- @@ -386,6 +396,9 @@ CREATE FUNCTION to_string_array as 'hivemall.tools.array.ToStringArrayUDF' USING DROP FUNCTION IF EXISTS array_intersect; CREATE FUNCTION array_intersect as 'hivemall.tools.array.ArrayIntersectUDF' USING JAR '${hivemall_jar}'; +DROP FUNCTION IF EXISTS select_k_best; +CREATE FUNCTION select_k_best as 'hivemall.tools.array.SelectKBestUDF' USING JAR '${hivemall_jar}'; + ----------------------------- -- bit operation functions -- ----------------------------- @@ -436,6 +449,13 @@ DROP FUNCTION IF EXISTS sigmoid; CREATE FUNCTION sigmoid as 'hivemall.tools.math.SigmoidGenericUDF' USING JAR '${hivemall_jar}'; ---------------------- +-- Matrix functions -- +---------------------- + +DROP FUNCTION IF EXISTS transpose_and_dot; +CREATE FUNCTION 
transpose_and_dot as 'hivemall.tools.matrix.TransposeAndDotUDAF' USING JAR '${hivemall_jar}'; + +---------------------- -- mapred functions -- ---------------------- http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/fad2941f/resources/ddl/define-all.hive ---------------------------------------------------------------------- diff --git a/resources/ddl/define-all.hive b/resources/ddl/define-all.hive index 37d262a..351303e 100644 --- a/resources/ddl/define-all.hive +++ b/resources/ddl/define-all.hive @@ -195,6 +195,16 @@ create temporary function zscore as 'hivemall.ftvec.scaling.ZScoreUDF'; drop temporary function l2_normalize; create temporary function l2_normalize as 'hivemall.ftvec.scaling.L2NormalizationUDF'; +--------------------------------- +-- Feature Selection functions -- +--------------------------------- + +drop temporary function chi2; +create temporary function chi2 as 'hivemall.ftvec.selection.ChiSquareUDF'; + +drop temporary function snr; +create temporary function snr as 'hivemall.ftvec.selection.SignalNoiseRatioUDAF'; + ----------------------------------- -- Feature engineering functions -- ----------------------------------- @@ -382,6 +392,9 @@ create temporary function to_string_array as 'hivemall.tools.array.ToStringArray drop temporary function array_intersect; create temporary function array_intersect as 'hivemall.tools.array.ArrayIntersectUDF'; +drop temporary function select_k_best; +create temporary function select_k_best as 'hivemall.tools.array.SelectKBestUDF'; + ----------------------------- -- bit operation functions -- ----------------------------- @@ -432,6 +445,13 @@ drop temporary function sigmoid; create temporary function sigmoid as 'hivemall.tools.math.SigmoidGenericUDF'; ---------------------- +-- Matrix functions -- +---------------------- + +drop temporary function transpose_and_dot; +create temporary function transpose_and_dot as 'hivemall.tools.matrix.TransposeAndDotUDAF'; + +---------------------- -- mapred 
functions -- ---------------------- http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/fad2941f/resources/ddl/define-all.spark ---------------------------------------------------------------------- diff --git a/resources/ddl/define-all.spark b/resources/ddl/define-all.spark index 5de6106..838f7bb 100644 --- a/resources/ddl/define-all.spark +++ b/resources/ddl/define-all.spark @@ -6,7 +6,7 @@ sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS hivemall_version") sqlContext.sql("CREATE TEMPORARY FUNCTION hivemall_version AS 'hivemall.HivemallVersionUDF'") /** - * binary classification + * Binary classification */ sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS train_perceptron") @@ -59,7 +59,7 @@ sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS train_multiclass_scw2") sqlContext.sql("CREATE TEMPORARY FUNCTION train_multiclass_scw2 AS 'hivemall.classifier.multiclass.MulticlassSoftConfidenceWeightedUDTF$SCW2'") /** - * similarity functions + * Similarity functions */ sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS cosine_sim") @@ -78,7 +78,7 @@ sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS distance2similarity") sqlContext.sql("CREATE TEMPORARY FUNCTION distance2similarity AS 'hivemall.knn.similarity.Distance2SimilarityUDF'") /** - * distance functions + * Distance functions */ sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS homming_distance") @@ -122,7 +122,7 @@ sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS bbit_minhash") sqlContext.sql("CREATE TEMPORARY FUNCTION bbit_minhash AS 'hivemall.knn.lsh.bBitMinHashUDF'") /** - * voting functions + * Voting functions */ sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS voted_avg") @@ -132,7 +132,7 @@ sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS weight_voted_avg") sqlContext.sql("CREATE TEMPORARY FUNCTION weight_voted_avg AS 'hivemall.ensemble.bagging.WeightVotedAvgUDAF'") /** - * misc functions + * MISC functions */ sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS max_label") @@ -145,7 +145,7 @@ 
sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS argmin_kld") sqlContext.sql("CREATE TEMPORARY FUNCTION argmin_kld AS 'hivemall.ensemble.ArgminKLDistanceUDAF'") /** - * hashing functions + * Feature hashing functions */ sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS mhash") @@ -161,7 +161,7 @@ sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS prefixed_hash_values") sqlContext.sql("CREATE TEMPORARY FUNCTION prefixed_hash_values AS 'hivemall.ftvec.hashing.ArrayPrefixedHashValuesUDF'") /** - * pairing functions + * Feature pairing functions */ sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS polynomial_features") @@ -171,7 +171,7 @@ sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS powered_features") sqlContext.sql("CREATE TEMPORARY FUNCTION powered_features AS 'hivemall.ftvec.pairing.PoweredFeaturesUDF'") /** - * scaling functions + * Feature scaling functions */ sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS rescale") @@ -184,7 +184,17 @@ sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS normalize") sqlContext.sql("CREATE TEMPORARY FUNCTION normalize AS 'hivemall.ftvec.scaling.L2NormalizationUDF'") /** - * misc functions + * Feature selection functions + */ + +sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS chi2") +sqlContext.sql("CREATE TEMPORARY FUNCTION chi2 AS 'hivemall.ftvec.selection.ChiSquareUDF'") + +sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS snr") +sqlContext.sql("CREATE TEMPORARY FUNCTION snr AS 'hivemall.ftvec.selection.SignalNoiseRatioUDAF'") + +/** + * MISC feature engineering functions */ sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS amplify") @@ -257,7 +267,7 @@ sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS tf") sqlContext.sql("CREATE TEMPORARY FUNCTION tf AS 'hivemall.ftvec.text.TermFrequencyUDAF'") /** - * fegression functions + * Regression functions */ sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS train_logregr") @@ -291,7 +301,7 @@ sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS train_arowe2_regr") 
sqlContext.sql("CREATE TEMPORARY FUNCTION train_arow_regr AS 'hivemall.regression.AROWRegressionUDTF$AROWe2'") /** - * array functions + * Array functions */ sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS float_array") @@ -321,8 +331,11 @@ sqlContext.sql("CREATE TEMPORARY FUNCTION subarray AS 'hivemall.tools.array.Suba sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS array_avg") sqlContext.sql("CREATE TEMPORARY FUNCTION array_avg AS 'hivemall.tools.array.ArrayAvgGenericUDAF'") +sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS select_k_best") +sqlContext.sql("CREATE TEMPORARY FUNCTION select_k_best AS 'hivemall.tools.array.SelectKBestUDF'") + /** - * compression functions + * Compression functions */ sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS inflate") @@ -332,7 +345,7 @@ sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS deflate") sqlContext.sql("CREATE TEMPORARY FUNCTION deflate AS 'hivemall.tools.compress.DeflateUDF'") /** - * map functions + * Map functions */ sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS map_get_sum") @@ -355,14 +368,21 @@ sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS sigmoid") sqlContext.sql("CREATE TEMPORARY FUNCTION sigmoid AS 'hivemall.tools.math.SigmoidGenericUDF'") /** - * mapred functions + * Matrix functions + */ + +sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS transpose_and_dot") +sqlContext.sql("CREATE TEMPORARY FUNCTION transpose_and_dot AS 'hivemall.tools.matrix.TransposeAndDotUDAF'") + +/** + * MAPRED functions */ sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS rowid") sqlContext.sql("CREATE TEMPORARY FUNCTION rowid AS 'hivemall.tools.mapred.RowIdUDFWrapper'") /** - * misc functions + * MISC functions */ sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS generate_series") http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/fad2941f/resources/ddl/define-udfs.td.hql ---------------------------------------------------------------------- diff --git a/resources/ddl/define-udfs.td.hql 
b/resources/ddl/define-udfs.td.hql index a0bea45..47dbd1d 100644 --- a/resources/ddl/define-udfs.td.hql +++ b/resources/ddl/define-udfs.td.hql @@ -50,6 +50,8 @@ create temporary function powered_features as 'hivemall.ftvec.pairing.PoweredFea create temporary function rescale as 'hivemall.ftvec.scaling.RescaleUDF'; create temporary function zscore as 'hivemall.ftvec.scaling.ZScoreUDF'; create temporary function l2_normalize as 'hivemall.ftvec.scaling.L2NormalizationUDF'; +create temporary function chi2 as 'hivemall.ftvec.selection.ChiSquareUDF'; +create temporary function snr as 'hivemall.ftvec.selection.SignalNoiseRatioUDAF'; create temporary function amplify as 'hivemall.ftvec.amplify.AmplifierUDTF'; create temporary function rand_amplify as 'hivemall.ftvec.amplify.RandomAmplifierUDTF'; create temporary function add_bias as 'hivemall.ftvec.AddBiasUDF'; @@ -101,6 +103,7 @@ create temporary function array_avg as 'hivemall.tools.array.ArrayAvgGenericUDAF create temporary function array_sum as 'hivemall.tools.array.ArraySumUDAF'; create temporary function to_string_array as 'hivemall.tools.array.ToStringArrayUDF'; create temporary function array_intersect as 'hivemall.tools.array.ArrayIntersectUDF'; +create temporary function select_k_best as 'hivemall.tools.array.SelectKBestUDF'; create temporary function bits_collect as 'hivemall.tools.bits.BitsCollectUDAF'; create temporary function to_bits as 'hivemall.tools.bits.ToBitsUDF'; create temporary function unbits as 'hivemall.tools.bits.UnBitsUDF'; @@ -112,6 +115,7 @@ create temporary function map_tail_n as 'hivemall.tools.map.MapTailNUDF'; create temporary function to_map as 'hivemall.tools.map.UDAFToMap'; create temporary function to_ordered_map as 'hivemall.tools.map.UDAFToOrderedMap'; create temporary function sigmoid as 'hivemall.tools.math.SigmoidGenericUDF'; +create temporary function transpose_and_dot as 'hivemall.tools.matrix.TransposeAndDotUDAF'; create temporary function taskid as 
'hivemall.tools.mapred.TaskIdUDF'; create temporary function jobid as 'hivemall.tools.mapred.JobIdUDF'; create temporary function rowid as 'hivemall.tools.mapred.RowIdUDF'; http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/fad2941f/spark/spark-1.6/src/main/scala/org/apache/spark/sql/hive/GroupedDataEx.scala ---------------------------------------------------------------------- diff --git a/spark/spark-1.6/src/main/scala/org/apache/spark/sql/hive/GroupedDataEx.scala b/spark/spark-1.6/src/main/scala/org/apache/spark/sql/hive/GroupedDataEx.scala index fd4da64..dd6db6c 100644 --- a/spark/spark-1.6/src/main/scala/org/apache/spark/sql/hive/GroupedDataEx.scala +++ b/spark/spark-1.6/src/main/scala/org/apache/spark/sql/hive/GroupedDataEx.scala @@ -271,9 +271,32 @@ final class GroupedDataEx protected[sql]( */ def onehot_encoding(features: String*): DataFrame = { val udaf = HiveUDAFFunction( - new HiveFunctionWrapper("hivemall.ftvec.trans.OnehotEncodingUDAF"), - features.map(df.col(_).expr), - isUDAFBridgeRequired = false) + new HiveFunctionWrapper("hivemall.ftvec.trans.OnehotEncodingUDAF"), + features.map(df.col(_).expr), + isUDAFBridgeRequired = false) + toDF(Seq(Alias(udaf, udaf.prettyString)())) + } + + /** + * @see hivemall.ftvec.selection.SignalNoiseRatioUDAF + */ + def snr(X: String, Y: String): DataFrame = { + val udaf = HiveUDAFFunction( + new HiveFunctionWrapper("hivemall.ftvec.selection.SignalNoiseRatioUDAF"), + Seq(X, Y).map(df.col(_).expr), + isUDAFBridgeRequired = false) + .toAggregateExpression() + toDF(Seq(Alias(udaf, udaf.prettyString)())) + } + + /** + * @see hivemall.tools.matrix.TransposeAndDotUDAF + */ + def transpose_and_dot(X: String, Y: String): DataFrame = { + val udaf = HiveUDAFFunction( + new HiveFunctionWrapper("hivemall.tools.matrix.TransposeAndDotUDAF"), + Seq(X, Y).map(df.col(_).expr), + isUDAFBridgeRequired = false) .toAggregateExpression() toDF(Seq(Alias(udaf, udaf.prettyString)())) } 
http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/fad2941f/spark/spark-1.6/src/main/scala/org/apache/spark/sql/hive/HivemallOps.scala ---------------------------------------------------------------------- diff --git a/spark/spark-1.6/src/main/scala/org/apache/spark/sql/hive/HivemallOps.scala b/spark/spark-1.6/src/main/scala/org/apache/spark/sql/hive/HivemallOps.scala index 4c750dd..8583e1c 100644 --- a/spark/spark-1.6/src/main/scala/org/apache/spark/sql/hive/HivemallOps.scala +++ b/spark/spark-1.6/src/main/scala/org/apache/spark/sql/hive/HivemallOps.scala @@ -1010,6 +1010,15 @@ object HivemallOps { } /** + * @see hivemall.ftvec.selection.ChiSquareUDF + * @group ftvec.selection + */ + def chi2(observed: Column, expected: Column): Column = { + HiveGenericUDF(new HiveFunctionWrapper( + "hivemall.ftvec.selection.ChiSquareUDF"), Seq(observed.expr, expected.expr)) + } + + /** * @see hivemall.ftvec.conv.ToDenseFeaturesUDF * @group ftvec.conv */ @@ -1082,6 +1091,15 @@ object HivemallOps { } /** + * @see hivemall.tools.array.SelectKBestUDF + * @group tools.array + */ + def select_k_best(X: Column, importanceList: Column, k: Column): Column = { + HiveGenericUDF(new HiveFunctionWrapper( + "hivemall.tools.array.SelectKBestUDF"), Seq(X.expr, importanceList.expr, k.expr)) + } + + /** * @see hivemall.tools.math.SigmoidUDF * @group misc */ http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/fad2941f/spark/spark-1.6/src/test/scala/org/apache/spark/sql/hive/HivemallOpsSuite.scala ---------------------------------------------------------------------- diff --git a/spark/spark-1.6/src/test/scala/org/apache/spark/sql/hive/HivemallOpsSuite.scala b/spark/spark-1.6/src/test/scala/org/apache/spark/sql/hive/HivemallOpsSuite.scala index 901056d..4c77f18 100644 --- a/spark/spark-1.6/src/test/scala/org/apache/spark/sql/hive/HivemallOpsSuite.scala +++ b/spark/spark-1.6/src/test/scala/org/apache/spark/sql/hive/HivemallOpsSuite.scala @@ -187,6 +187,35 @@ final class 
HivemallOpsSuite extends HivemallQueryTest { Row(Seq("1:1.0")))) } + test("ftvec.selection - chi2") { + import hiveContext.implicits._ + + // see also hivemall.ftvec.selection.ChiSquareUDFTest + val df = Seq( + Seq( + Seq(250.29999999999998, 170.90000000000003, 73.2, 12.199999999999996), + Seq(296.8, 138.50000000000003, 212.99999999999997, 66.3), + Seq(329.3999999999999, 148.7, 277.59999999999997, 101.29999999999998) + ) -> Seq( + Seq(292.1666753739119, 152.70000455081467, 187.93333893418327, 59.93333511948589), + Seq(292.1666753739119, 152.70000455081467, 187.93333893418327, 59.93333511948589), + Seq(292.1666753739119, 152.70000455081467, 187.93333893418327, 59.93333511948589))) + .toDF("arg0", "arg1") + + val result = df.select(chi2(df("arg0"), df("arg1"))).collect + assert(result.length == 1) + val chi2Val = result.head.getAs[Row](0).getAs[Seq[Double]](0) + val pVal = result.head.getAs[Row](0).getAs[Seq[Double]](1) + + (chi2Val, Seq(10.81782088, 3.59449902, 116.16984746, 67.24482759)) + .zipped + .foreach((actual, expected) => assert(actual ~== expected)) + + (pVal, Seq(4.47651499e-03, 1.65754167e-01, 5.94344354e-26, 2.50017968e-15)) + .zipped + .foreach((actual, expected) => assert(actual ~== expected)) + } + test("ftvec.conv - quantify") { import hiveContext.implicits._ val testDf = Seq((1, "aaa", true), (2, "bbb", false), (3, "aaa", false)).toDF @@ -336,6 +365,17 @@ final class HivemallOpsSuite extends HivemallQueryTest { checkAnswer(predicted, Seq(Row(0), Row(1))) } + test("tools.array - select_k_best") { + import hiveContext.implicits._ + + val data = Seq(Seq(0, 1, 3), Seq(2, 4, 1), Seq(5, 4, 9)) + val df = data.map(d => (d, Seq(3, 1, 2), 2)).toDF("features", "importance_list", "k") + + // if use checkAnswer here, fail for some reason, maybe type? 
but it's okay on spark-2.0 + assert(df.select(select_k_best(df("features"), df("importance_list"), df("k"))).collect === + data.map(s => Row(Seq(s(0).toDouble, s(2).toDouble)))) + } + test("misc - sigmoid") { import hiveContext.implicits._ /** @@ -534,14 +574,13 @@ final class HivemallOpsSuite extends HivemallQueryTest { assert(row4(0).getDouble(1) ~== 0.25) } - test("user-defined aggregators for ftvec.trans") { + ignore("user-defined aggregators for ftvec.trans") { import hiveContext.implicits._ val df0 = Seq((1, "cat", "mammal", 9), (1, "dog", "mammal", 10), (1, "human", "mammal", 10), (1, "seahawk", "bird", 101), (1, "wasp", "insect", 3), (1, "wasp", "insect", 9), (1, "cat", "mammal", 101), (1, "dog", "mammal", 1), (1, "human", "mammal", 9)) - .toDF("col0", "cat1", "cat2", "cat3") - + .toDF("col0", "cat1", "cat2", "cat3") val row00 = df0.groupby($"col0").onehot_encoding("cat1") val row01 = df0.groupby($"col0").onehot_encoding("cat1", "cat2", "cat3") @@ -560,4 +599,64 @@ final class HivemallOpsSuite extends HivemallQueryTest { assert(result012.keySet === Set(1, 3, 9, 10, 101)) assert(result012.values.toSet === Set(9, 10, 11, 12, 13)) } + + test("user-defined aggregators for ftvec.selection") { + import hiveContext.implicits._ + + // see also hivemall.ftvec.selection.SignalNoiseRatioUDAFTest + // binary class + // +-----------------+-------+ + // | features | class | + // +-----------------+-------+ + // | 5.1,3.5,1.4,0.2 | 0 | + // | 4.9,3.0,1.4,0.2 | 0 | + // | 4.7,3.2,1.3,0.2 | 0 | + // | 7.0,3.2,4.7,1.4 | 1 | + // | 6.4,3.2,4.5,1.5 | 1 | + // | 6.9,3.1,4.9,1.5 | 1 | + // +-----------------+-------+ + val df0 = Seq( + (1, Seq(5.1, 3.5, 1.4, 0.2), Seq(1, 0)), (1, Seq(4.9, 3.0, 1.4, 0.2), Seq(1, 0)), + (1, Seq(4.7, 3.2, 1.3, 0.2), Seq(1, 0)), (1, Seq(7.0, 3.2, 4.7, 1.4), Seq(0, 1)), + (1, Seq(6.4, 3.2, 4.5, 1.5), Seq(0, 1)), (1, Seq(6.9, 3.1, 4.9, 1.5), Seq(0, 1))) + .toDF("c0", "arg0", "arg1") + val row0 = df0.groupby($"c0").snr("arg0", "arg1").collect + 
(row0(0).getAs[Seq[Double]](1), Seq(4.38425236, 0.26390002, 15.83984511, 26.87005769)) + .zipped + .foreach((actual, expected) => assert(actual ~== expected)) + + // multiple class + // +-----------------+-------+ + // | features | class | + // +-----------------+-------+ + // | 5.1,3.5,1.4,0.2 | 0 | + // | 4.9,3.0,1.4,0.2 | 0 | + // | 7.0,3.2,4.7,1.4 | 1 | + // | 6.4,3.2,4.5,1.5 | 1 | + // | 6.3,3.3,6.0,2.5 | 2 | + // | 5.8,2.7,5.1,1.9 | 2 | + // +-----------------+-------+ + val df1 = Seq( + (1, Seq(5.1, 3.5, 1.4, 0.2), Seq(1, 0, 0)), (1, Seq(4.9, 3.0, 1.4, 0.2), Seq(1, 0, 0)), + (1, Seq(7.0, 3.2, 4.7, 1.4), Seq(0, 1, 0)), (1, Seq(6.4, 3.2, 4.5, 1.5), Seq(0, 1, 0)), + (1, Seq(6.3, 3.3, 6.0, 2.5), Seq(0, 0, 1)), (1, Seq(5.8, 2.7, 5.1, 1.9), Seq(0, 0, 1))) + .toDF("c0", "arg0", "arg1") + val row1 = df1.groupby($"c0").snr("arg0", "arg1").collect + (row1(0).getAs[Seq[Double]](1), Seq(8.43181818, 1.32121212, 42.94949495, 33.80952381)) + .zipped + .foreach((actual, expected) => assert(actual ~== expected)) + } + + test("user-defined aggregators for tools.matrix") { + import hiveContext.implicits._ + + // | 1 2 3 |T | 5 6 7 | + // | 3 4 5 | * | 7 8 9 | + val df0 = Seq((1, Seq(1, 2, 3), Seq(5, 6, 7)), (1, Seq(3, 4, 5), Seq(7, 8, 9))) + .toDF("c0", "arg0", "arg1") + + // if use checkAnswer here, fail for some reason, maybe type? 
but it's okay on spark-2.0 + assert(df0.groupby($"c0").transpose_and_dot("arg0", "arg1").collect() === + Seq(Row(1, Seq(Seq(26.0, 30.0, 34.0), Seq(38.0, 44.0, 50.0), Seq(50.0, 58.0, 66.0))))) + } } http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/fad2941f/spark/spark-2.0/src/main/scala/org/apache/spark/sql/hive/HivemallGroupedDataset.scala ---------------------------------------------------------------------- diff --git a/spark/spark-2.0/src/main/scala/org/apache/spark/sql/hive/HivemallGroupedDataset.scala b/spark/spark-2.0/src/main/scala/org/apache/spark/sql/hive/HivemallGroupedDataset.scala index 8ac7185..bdeff98 100644 --- a/spark/spark-2.0/src/main/scala/org/apache/spark/sql/hive/HivemallGroupedDataset.scala +++ b/spark/spark-2.0/src/main/scala/org/apache/spark/sql/hive/HivemallGroupedDataset.scala @@ -133,6 +133,19 @@ final class HivemallGroupedDataset(groupBy: RelationalGroupedDataset) { } /** + * @see hivemall.tools.matrix.TransposeAndDotUDAF + */ + def transpose_and_dot(X: String, Y: String): DataFrame = { + val udaf = HiveUDAFFunction( + "transpose_and_dot", + new HiveFunctionWrapper("hivemall.tools.matrix.TransposeAndDotUDAF"), + Seq(X, Y).map(df.col(_).expr), + isUDAFBridgeRequired = false) + .toAggregateExpression() + toDF(Seq(Alias(udaf, udaf.prettyName)())) + } + + /** * @see hivemall.ftvec.trans.OnehotEncodingUDAF * @group ftvec.trans */ @@ -147,6 +160,19 @@ final class HivemallGroupedDataset(groupBy: RelationalGroupedDataset) { } /** + * @see hivemall.ftvec.selection.SignalNoiseRatioUDAF + */ + def snr(X: String, Y: String): DataFrame = { + val udaf = HiveUDAFFunction( + "snr", + new HiveFunctionWrapper("hivemall.ftvec.selection.SignalNoiseRatioUDAF"), + Seq(X, Y).map(df.col(_).expr), + isUDAFBridgeRequired = false) + .toAggregateExpression() + toDF(Seq(Alias(udaf, udaf.prettyName)())) + } + + /** * @see hivemall.evaluation.MeanAbsoluteErrorUDAF * @group evaluation */ 
http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/fad2941f/spark/spark-2.0/src/main/scala/org/apache/spark/sql/hive/HivemallOps.scala ---------------------------------------------------------------------- diff --git a/spark/spark-2.0/src/main/scala/org/apache/spark/sql/hive/HivemallOps.scala b/spark/spark-2.0/src/main/scala/org/apache/spark/sql/hive/HivemallOps.scala index ba58039..9bde84f 100644 --- a/spark/spark-2.0/src/main/scala/org/apache/spark/sql/hive/HivemallOps.scala +++ b/spark/spark-2.0/src/main/scala/org/apache/spark/sql/hive/HivemallOps.scala @@ -1216,6 +1216,16 @@ object HivemallOps { } /** + * @see hivemall.ftvec.selection.ChiSquareUDF + * @group ftvec.selection + */ + def chi2(observed: Column, expected: Column): Column = withExpr { + HiveGenericUDF("chi2", + new HiveFunctionWrapper("hivemall.ftvec.selection.ChiSquareUDF"), + Seq(observed.expr, expected.expr)) + } + + /** * @see hivemall.ftvec.conv.ToDenseFeaturesUDF * @group ftvec.conv */ @@ -1295,6 +1305,16 @@ object HivemallOps { } /** + * @see hivemall.tools.array.SelectKBestUDF + * @group tools.array + */ + def select_k_best(X: Column, importanceList: Column, k: Column): Column = withExpr { + HiveGenericUDF("select_k_best", + new HiveFunctionWrapper("hivemall.tools.array.SelectKBestUDF"), + Seq(X.expr, importanceList.expr, k.expr)) + } + + /** * @see hivemall.tools.math.SigmoidUDF * @group misc */ http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/fad2941f/spark/spark-2.0/src/test/scala/org/apache/spark/sql/hive/HivemallOpsSuite.scala ---------------------------------------------------------------------- diff --git a/spark/spark-2.0/src/test/scala/org/apache/spark/sql/hive/HivemallOpsSuite.scala b/spark/spark-2.0/src/test/scala/org/apache/spark/sql/hive/HivemallOpsSuite.scala index a093e07..6f2f016 100644 --- a/spark/spark-2.0/src/test/scala/org/apache/spark/sql/hive/HivemallOpsSuite.scala +++ 
b/spark/spark-2.0/src/test/scala/org/apache/spark/sql/hive/HivemallOpsSuite.scala @@ -188,6 +188,35 @@ final class HivemallOpsWithFeatureSuite extends HivemallFeatureQueryTest { Row(Seq("1:1.0")))) } + test("ftvec.selection - chi2") { + import hiveContext.implicits._ + + // see also hivemall.ftvec.selection.ChiSquareUDFTest + val df = Seq( + Seq( + Seq(250.29999999999998, 170.90000000000003, 73.2, 12.199999999999996), + Seq(296.8, 138.50000000000003, 212.99999999999997, 66.3), + Seq(329.3999999999999, 148.7, 277.59999999999997, 101.29999999999998) + ) -> Seq( + Seq(292.1666753739119, 152.70000455081467, 187.93333893418327, 59.93333511948589), + Seq(292.1666753739119, 152.70000455081467, 187.93333893418327, 59.93333511948589), + Seq(292.1666753739119, 152.70000455081467, 187.93333893418327, 59.93333511948589))) + .toDF("arg0", "arg1") + + val result = df.select(chi2(df("arg0"), df("arg1"))).collect + assert(result.length == 1) + val chi2Val = result.head.getAs[Row](0).getAs[Seq[Double]](0) + val pVal = result.head.getAs[Row](0).getAs[Seq[Double]](1) + + (chi2Val, Seq(10.81782088, 3.59449902, 116.16984746, 67.24482759)) + .zipped + .foreach((actual, expected) => assert(actual ~== expected)) + + (pVal, Seq(4.47651499e-03, 1.65754167e-01, 5.94344354e-26, 2.50017968e-15)) + .zipped + .foreach((actual, expected) => assert(actual ~== expected)) + } + test("ftvec.conv - quantify") { import hiveContext.implicits._ val testDf = Seq((1, "aaa", true), (2, "bbb", false), (3, "aaa", false)).toDF @@ -361,6 +390,18 @@ final class HivemallOpsWithFeatureSuite extends HivemallFeatureQueryTest { checkAnswer(predicted, Seq(Row(0), Row(1))) } + test("tools.array - select_k_best") { + import hiveContext.implicits._ + import org.apache.spark.sql.functions._ + + val data = Seq(Seq(0, 1, 3), Seq(2, 4, 1), Seq(5, 4, 9)) + val df = data.map(d => (d, Seq(3, 1, 2))).toDF("features", "importance_list") + val k = 2 + + checkAnswer(df.select(select_k_best(df("features"), df("importance_list"), 
lit(k))), + data.map(s => Row(Seq(s(0).toDouble, s(2).toDouble)))) + } + test("misc - sigmoid") { import hiveContext.implicits._ /** @@ -661,6 +702,65 @@ final class HivemallOpsWithFeatureSuite extends HivemallFeatureQueryTest { assert(result012.keySet === Set(1, 3, 9, 10, 101)) assert(result012.values.toSet === Set(9, 10, 11, 12, 13)) } + + test("user-defined aggregators for ftvec.selection") { + import hiveContext.implicits._ + + // see also hivemall.ftvec.selection.SignalNoiseRatioUDAFTest + // binary class + // +-----------------+-------+ + // | features | class | + // +-----------------+-------+ + // | 5.1,3.5,1.4,0.2 | 0 | + // | 4.9,3.0,1.4,0.2 | 0 | + // | 4.7,3.2,1.3,0.2 | 0 | + // | 7.0,3.2,4.7,1.4 | 1 | + // | 6.4,3.2,4.5,1.5 | 1 | + // | 6.9,3.1,4.9,1.5 | 1 | + // +-----------------+-------+ + val df0 = Seq( + (1, Seq(5.1, 3.5, 1.4, 0.2), Seq(1, 0)), (1, Seq(4.9, 3.0, 1.4, 0.2), Seq(1, 0)), + (1, Seq(4.7, 3.2, 1.3, 0.2), Seq(1, 0)), (1, Seq(7.0, 3.2, 4.7, 1.4), Seq(0, 1)), + (1, Seq(6.4, 3.2, 4.5, 1.5), Seq(0, 1)), (1, Seq(6.9, 3.1, 4.9, 1.5), Seq(0, 1))) + .toDF("c0", "arg0", "arg1") + val row0 = df0.groupBy($"c0").snr("arg0", "arg1").collect + (row0(0).getAs[Seq[Double]](1), Seq(4.38425236, 0.26390002, 15.83984511, 26.87005769)) + .zipped + .foreach((actual, expected) => assert(actual ~== expected)) + + // multiple class + // +-----------------+-------+ + // | features | class | + // +-----------------+-------+ + // | 5.1,3.5,1.4,0.2 | 0 | + // | 4.9,3.0,1.4,0.2 | 0 | + // | 7.0,3.2,4.7,1.4 | 1 | + // | 6.4,3.2,4.5,1.5 | 1 | + // | 6.3,3.3,6.0,2.5 | 2 | + // | 5.8,2.7,5.1,1.9 | 2 | + // +-----------------+-------+ + val df1 = Seq( + (1, Seq(5.1, 3.5, 1.4, 0.2), Seq(1, 0, 0)), (1, Seq(4.9, 3.0, 1.4, 0.2), Seq(1, 0, 0)), + (1, Seq(7.0, 3.2, 4.7, 1.4), Seq(0, 1, 0)), (1, Seq(6.4, 3.2, 4.5, 1.5), Seq(0, 1, 0)), + (1, Seq(6.3, 3.3, 6.0, 2.5), Seq(0, 0, 1)), (1, Seq(5.8, 2.7, 5.1, 1.9), Seq(0, 0, 1))) + .toDF("c0", "arg0", "arg1") + val row1 = 
df1.groupBy($"c0").snr("arg0", "arg1").collect + (row1(0).getAs[Seq[Double]](1), Seq(8.43181818, 1.32121212, 42.94949495, 33.80952381)) + .zipped + .foreach((actual, expected) => assert(actual ~== expected)) + } + + test("user-defined aggregators for tools.matrix") { + import hiveContext.implicits._ + + // | 1 2 3 |T | 5 6 7 | + // | 3 4 5 | * | 7 8 9 | + val df0 = Seq((1, Seq(1, 2, 3), Seq(5, 6, 7)), (1, Seq(3, 4, 5), Seq(7, 8, 9))) + .toDF("c0", "arg0", "arg1") + + checkAnswer(df0.groupBy($"c0").transpose_and_dot("arg0", "arg1"), + Seq(Row(1, Seq(Seq(26.0, 30.0, 34.0), Seq(38.0, 44.0, 50.0), Seq(50.0, 58.0, 66.0))))) + } } final class HivemallOpsWithVectorSuite extends VectorQueryTest {
