[incubator-hivemall] Git Push Summary
Repository: incubator-hivemall Updated Branches: refs/heads/JIRA-22/pr-304 [deleted] 775ae4f79
[incubator-hivemall] Git Push Summary
Repository: incubator-hivemall Updated Branches: refs/heads/JIRA-22/pr-356 [deleted] bb3250448
[incubator-hivemall] Git Push Summary
Repository: incubator-hivemall Updated Branches: refs/heads/JIRA-22/pr-336 [deleted] f8d152cba
[incubator-hivemall] Git Push Summary
Repository: incubator-hivemall Updated Branches: refs/heads/JIRA-22/pr-385 [deleted] 4c8dcbfcd
[incubator-hivemall] Git Push Summary
Repository: incubator-hivemall Updated Tags: refs/tags/v0.5-alpha.1 [deleted] 2a66cf620
[incubator-hivemall] Git Push Summary
Repository: incubator-hivemall Updated Tags: refs/tags/v0.4.2-rc.2 [deleted] e1df0504d
[incubator-hivemall] Git Push Summary
Repository: incubator-hivemall Updated Branches: refs/heads/JIRA-22/pr-285 [deleted] 05766432c
[25/50] [abbrv] incubator-hivemall git commit: integrate chi2 and SNR into hivemall.spark
integrate chi2 and SNR into hivemall.spark Project: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/commit/a1f8f958 Tree: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/tree/a1f8f958 Diff: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/diff/a1f8f958 Branch: refs/heads/JIRA-22/pr-385 Commit: a1f8f958c99f3cde9e48b6d80d364004f6d98cc2 Parents: 22a608e Author: amaya Authored: Tue Sep 27 15:58:33 2016 +0900 Committer: amaya Committed: Tue Sep 27 15:58:33 2016 +0900 -- .../apache/spark/sql/hive/GroupedDataEx.scala | 24 .../org/apache/spark/sql/hive/HivemallOps.scala | 19 ++ .../spark/sql/hive/HivemallOpsSuite.scala | 63 ++- .../org/apache/spark/sql/hive/HivemallOps.scala | 20 ++ .../sql/hive/RelationalGroupedDatasetEx.scala | 26 .../spark/sql/hive/HivemallOpsSuite.scala | 65 +++- 6 files changed, 212 insertions(+), 5 deletions(-) -- http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/a1f8f958/spark/spark-1.6/src/main/scala/org/apache/spark/sql/hive/GroupedDataEx.scala -- diff --git a/spark/spark-1.6/src/main/scala/org/apache/spark/sql/hive/GroupedDataEx.scala b/spark/spark-1.6/src/main/scala/org/apache/spark/sql/hive/GroupedDataEx.scala index 37d5423..2482c62 100644 --- a/spark/spark-1.6/src/main/scala/org/apache/spark/sql/hive/GroupedDataEx.scala +++ b/spark/spark-1.6/src/main/scala/org/apache/spark/sql/hive/GroupedDataEx.scala @@ -264,4 +264,28 @@ final class GroupedDataEx protected[sql]( .toAggregateExpression() toDF((Alias(udaf, udaf.prettyString)() :: Nil).toSeq) } + + /** + * @see hivemall.ftvec.selection.SignalNoiseRatioUDAF + */ + def snr(X: String, Y: String): DataFrame = { +val udaf = HiveUDAFFunction( + new HiveFunctionWrapper("hivemall.ftvec.selection.SignalNoiseRatioUDAF"), + Seq(X, Y).map(df.col(_).expr), + isUDAFBridgeRequired = false) + .toAggregateExpression() +toDF(Seq(Alias(udaf, udaf.prettyString)())) + } + + /** + * @see 
hivemall.tools.matrix.TransposeAndDotUDAF + */ + def transpose_and_dot(X: String, Y: String): DataFrame = { +val udaf = HiveUDAFFunction( + new HiveFunctionWrapper("hivemall.tools.matrix.TransposeAndDotUDAF"), + Seq(X, Y).map(df.col(_).expr), + isUDAFBridgeRequired = false) + .toAggregateExpression() +toDF(Seq(Alias(udaf, udaf.prettyString)())) + } } http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/a1f8f958/spark/spark-1.6/src/main/scala/org/apache/spark/sql/hive/HivemallOps.scala -- diff --git a/spark/spark-1.6/src/main/scala/org/apache/spark/sql/hive/HivemallOps.scala b/spark/spark-1.6/src/main/scala/org/apache/spark/sql/hive/HivemallOps.scala index 133f1d5..5970b83 100644 --- a/spark/spark-1.6/src/main/scala/org/apache/spark/sql/hive/HivemallOps.scala +++ b/spark/spark-1.6/src/main/scala/org/apache/spark/sql/hive/HivemallOps.scala @@ -1006,6 +1006,15 @@ object HivemallOps { } /** +* @see hivemall.ftvec.selection.ChiSquareUDF +* @group ftvec.selection +*/ + def chi2(exprs: Column*): Column = { +HiveGenericUDF(new HiveFunctionWrapper( + "hivemall.ftvec.selection.ChiSquareUDF"), exprs.map(_.expr)) + } + + /** * @see hivemall.ftvec.conv.ToDenseFeaturesUDF * @group ftvec.conv */ @@ -1078,6 +1087,16 @@ object HivemallOps { } /** + * @see hivemall.tools.array.SelectKBestUDF + * @group tools.array + */ + @scala.annotation.varargs + def select_k_best(exprs: Column*): Column = { +HiveGenericUDF(new HiveFunctionWrapper( + "hivemall.tools.array.SelectKBestUDF"), exprs.map(_.expr)) + } + + /** * @see hivemall.tools.math.SigmoidUDF * @group misc */ http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/a1f8f958/spark/spark-1.6/src/test/scala/org/apache/spark/sql/hive/HivemallOpsSuite.scala -- diff --git a/spark/spark-1.6/src/test/scala/org/apache/spark/sql/hive/HivemallOpsSuite.scala b/spark/spark-1.6/src/test/scala/org/apache/spark/sql/hive/HivemallOpsSuite.scala index 4be1e5e..148e5a2 100644 --- 
a/spark/spark-1.6/src/test/scala/org/apache/spark/sql/hive/HivemallOpsSuite.scala +++ b/spark/spark-1.6/src/test/scala/org/apache/spark/sql/hive/HivemallOpsSuite.scala @@ -17,8 +17,6 @@ package org.apache.spark.sql.hive -import scala.collection.mutable.Seq - import org.apache.spark.sql.{Column, Row} import org.apache.spark.sql.hive.HivemallOps._ import org.apache.spark.sql.hive.HivemallUtils._ @@ -188,6 +186,22 @@ final class HivemallOpsSuite extends
[43/50] [abbrv] incubator-hivemall git commit: Update license header
Update license header Project: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/commit/798ec6a7 Tree: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/tree/798ec6a7 Diff: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/diff/798ec6a7 Branch: refs/heads/JIRA-22/pr-336 Commit: 798ec6a73ca37d474137fc82db1c22a92521307d Parents: ddd8dc2 Author: amaya Authored: Fri Nov 18 04:27:59 2016 +0900 Committer: amaya Committed: Fri Nov 18 04:27:59 2016 +0900 -- systemtest/pom.xml | 2 ++ 1 file changed, 2 insertions(+) -- http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/798ec6a7/systemtest/pom.xml -- diff --git a/systemtest/pom.xml b/systemtest/pom.xml index e7345af..0debee0 100644 --- a/systemtest/pom.xml +++ b/systemtest/pom.xml @@ -6,7 +6,9 @@ to you under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
[28/50] [abbrv] incubator-hivemall git commit: refine feature selection in spark integration
refine feature selection in spark integration Project: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/commit/1347de98 Tree: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/tree/1347de98 Diff: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/diff/1347de98 Branch: refs/heads/JIRA-22/pr-385 Commit: 1347de985ea6f8028c9d381f8827882ad39ad3a7 Parents: aa7d529 Author: amaya Authored: Wed Sep 28 14:22:05 2016 +0900 Committer: amaya Committed: Wed Sep 28 14:22:05 2016 +0900 -- .../org/apache/spark/sql/hive/HivemallOps.scala | 9 +- .../spark/sql/hive/HivemallOpsSuite.scala | 94 ++-- .../org/apache/spark/sql/hive/HivemallOps.scala | 8 +- .../spark/sql/hive/HivemallOpsSuite.scala | 89 -- 4 files changed, 138 insertions(+), 62 deletions(-) -- http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/1347de98/spark/spark-1.6/src/main/scala/org/apache/spark/sql/hive/HivemallOps.scala -- diff --git a/spark/spark-1.6/src/main/scala/org/apache/spark/sql/hive/HivemallOps.scala b/spark/spark-1.6/src/main/scala/org/apache/spark/sql/hive/HivemallOps.scala index 41a4065..255f697 100644 --- a/spark/spark-1.6/src/main/scala/org/apache/spark/sql/hive/HivemallOps.scala +++ b/spark/spark-1.6/src/main/scala/org/apache/spark/sql/hive/HivemallOps.scala @@ -1006,9 +1006,9 @@ object HivemallOps { * @see hivemall.ftvec.selection.ChiSquareUDF * @group ftvec.selection */ - def chi2(exprs: Column*): Column = { + def chi2(observed: Column, expected: Column): Column = { HiveGenericUDF(new HiveFunctionWrapper( - "hivemall.ftvec.selection.ChiSquareUDF"), exprs.map(_.expr)) + "hivemall.ftvec.selection.ChiSquareUDF"), Seq(observed.expr, expected.expr)) } /** @@ -1087,10 +1087,9 @@ object HivemallOps { * @see hivemall.tools.array.SelectKBestUDF * @group tools.array */ - @scala.annotation.varargs - def select_k_best(exprs: Column*): Column = { + def select_k_best(X: Column, importanceList: Column, k: 
Column): Column = { HiveGenericUDF(new HiveFunctionWrapper( - "hivemall.tools.array.SelectKBestUDF"), exprs.map(_.expr)) + "hivemall.tools.array.SelectKBestUDF"), Seq(X.expr, importanceList.expr, k.expr)) } /** http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/1347de98/spark/spark-1.6/src/test/scala/org/apache/spark/sql/hive/HivemallOpsSuite.scala -- diff --git a/spark/spark-1.6/src/test/scala/org/apache/spark/sql/hive/HivemallOpsSuite.scala b/spark/spark-1.6/src/test/scala/org/apache/spark/sql/hive/HivemallOpsSuite.scala index e118257..cce22ce 100644 --- a/spark/spark-1.6/src/test/scala/org/apache/spark/sql/hive/HivemallOpsSuite.scala +++ b/spark/spark-1.6/src/test/scala/org/apache/spark/sql/hive/HivemallOpsSuite.scala @@ -17,13 +17,14 @@ package org.apache.spark.sql.hive -import org.apache.spark.sql.{Column, Row} import org.apache.spark.sql.hive.HivemallOps._ import org.apache.spark.sql.hive.HivemallUtils._ import org.apache.spark.sql.types._ +import org.apache.spark.sql.{Column, Row} import org.apache.spark.test.HivemallQueryTest import org.apache.spark.test.TestDoubleWrapper._ import org.apache.spark.test.TestUtils._ +import org.scalatest.Matchers._ final class HivemallOpsSuite extends HivemallQueryTest { @@ -188,18 +189,32 @@ final class HivemallOpsSuite extends HivemallQueryTest { test("ftvec.selection - chi2") { import hiveContext.implicits._ - -val df = Seq(Seq( - Seq(250.28, 170.93, 73.2, 12.196), - Seq(296.8, 138.53, 212.97, 66.3), - Seq(329.3, 148.7, 277.57, 101.28)) -> Seq( - Seq(292.1666753739119, 152.7455081467, 187.9893418327, 59.9511948589), - Seq(292.1666753739119, 152.7455081467, 187.9893418327, 59.9511948589), - Seq(292.1666753739119, 152.7455081467, 187.9893418327, 59.9511948589))).toDF("arg0", "arg1") - -assert(df.select(chi2(df("arg0"), df("arg1"))).collect.toSet === - Set(Row(Row(Seq(10.817820878493995, 3.5944990176817315, 116.16984746363957, 67.24482558215503), -Seq(0.004476514990225833, 0.16575416718561453, 0d, 
2.55351295663786e-15) +implicit val doubleEquality = org.scalactic.TolerantNumerics.tolerantDoubleEquality(1e-5) + +// see also hivemall.ftvec.selection.ChiSquareUDFTest +val df = Seq( + Seq( +Seq(250.28, 170.93, 73.2, 12.196), +Seq(296.8, 138.53, 212.97, 66.3), +Seq(329.3999
[36/50] [abbrv] incubator-hivemall git commit: Mod README
Mod README Project: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/commit/ba912677 Tree: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/tree/ba912677 Diff: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/diff/ba912677 Branch: refs/heads/JIRA-22/pr-336 Commit: ba91267796cbfdee53aaef02af882aff591fb8f7 Parents: 43ca0c8 Author: amaya Authored: Thu Nov 17 14:15:03 2016 +0900 Committer: amaya Committed: Thu Nov 17 14:15:03 2016 +0900 -- systemtest/README.md | 12 ++-- 1 file changed, 6 insertions(+), 6 deletions(-) -- http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/ba912677/systemtest/README.md -- diff --git a/systemtest/README.md b/systemtest/README.md index 4fca0c3..2805165 100644 --- a/systemtest/README.md +++ b/systemtest/README.md @@ -157,7 +157,7 @@ public class QuickExample { public void test3() throws Exception { // test on HiveRunner once only // auto matching by files which name is `test3` in `case/` and `answer/` -team.set(HQ.autoMatchingByFileName("test3", ci)); // unordered test +team.set(HQ.autoMatchingByFileName("test3"), ci); // unordered test team.run(); // this call is required } @@ -165,7 +165,7 @@ public class QuickExample { public void test4() throws Exception { // test on HiveRunner once only predictor.expect(Throwable.class); // you can use systemtest w/ other rules -team.set(HQ.fromStatement("invalid queryyy")); // this query throws an exception +team.set(HQ.fromStatement("invalid queryyy"), "never used"); // this query throws an exception team.run(); // this call is required // thrown exception will be caught by `ExpectedException` rule } @@ -174,7 +174,7 @@ public class QuickExample { The above requires following files -* `systemtest/src/test/resources/hivemall/HogeTest/init/color.tsv` (`systemtest/src/test/resources/${path/to/package}/${className}/init/${fileName}`) +* 
`systemtest/src/test/resources/hivemall/QuickExample/init/color.tsv` (`systemtest/src/test/resources/${path/to/package}/${className}/init/${fileName}`) ```tsv blue 0 0 255 @@ -190,7 +190,7 @@ red 255 0 0 pink 255 192 203 ``` -* `systemtest/src/test/resources/hivemall/HogeTest/case/test3` (`systemtest/src/test/resources/${path/to/package}/${className}/case/${fileName}`) +* `systemtest/src/test/resources/hivemall/QuickExample/case/test3` (`systemtest/src/test/resources/${path/to/package}/${className}/case/${fileName}`) ```sql -- write your hive queries @@ -199,12 +199,12 @@ SELECT blue FROM color WHERE name = 'lavender';SELECT green FROM color WHERE nam SELECT name FROM color WHERE blue = 255 ``` -* `systemtest/src/test/resources/hivemall/HogeTest/answer/test3` (`systemtest/src/test/resources/${path/to/package}/${className}/answer/${fileName}`) +* `systemtest/src/test/resources/hivemall/QuickExample/answer/test3` (`systemtest/src/test/resources/${path/to/package}/${className}/answer/${fileName}`) tsv format is required ```tsv -230 +250 16569 azurebluemagenta ```
[40/50] [abbrv] incubator-hivemall git commit: Fix process of tdprop
Fix process of tdprop Project: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/commit/144cb504 Tree: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/tree/144cb504 Diff: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/diff/144cb504 Branch: refs/heads/JIRA-22/pr-336 Commit: 144cb504d674d2509620ce0d315694be0f664f42 Parents: 3550fd3 Author: amaya Authored: Fri Nov 18 01:58:31 2016 +0900 Committer: amaya Committed: Fri Nov 18 01:58:31 2016 +0900 -- .../systemtest/runner/TDSystemTestRunner.java | 16 ++-- 1 file changed, 10 insertions(+), 6 deletions(-) -- http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/144cb504/systemtest/src/main/java/hivemall/systemtest/runner/TDSystemTestRunner.java -- diff --git a/systemtest/src/main/java/hivemall/systemtest/runner/TDSystemTestRunner.java b/systemtest/src/main/java/hivemall/systemtest/runner/TDSystemTestRunner.java index 6d6c85b..87dd835 100644 --- a/systemtest/src/main/java/hivemall/systemtest/runner/TDSystemTestRunner.java +++ b/systemtest/src/main/java/hivemall/systemtest/runner/TDSystemTestRunner.java @@ -85,16 +85,20 @@ public class TDSystemTestRunner extends SystemTestRunner { fileUploadCommitRetryLimit = Integer.valueOf(props.getProperty("fileUploadCommitRetryLimit")); } -final Properties TDPorps = System.getProperties(); +boolean fromPropertiesFile = false; for (Map.Entry e : props.entrySet()) { -if (e.getKey().toString().startsWith("td.client.")) { -TDPorps.setProperty(e.getKey().toString(), e.getValue().toString()); +final String key = e.getKey().toString(); +if (key.startsWith("td.client.")) { +fromPropertiesFile = true; +System.setProperty(key, e.getValue().toString()); } } -System.setProperties(TDPorps); -client = System.getProperties().size() == TDPorps.size() ? 
TDClient.newClient() // use $HOME/.td/td.conf -: TDClient.newBuilder(false).build(); // use *.properties +if (fromPropertiesFile) { +client = TDClient.newBuilder(false).build(); // use *.properties +} else { +client = TDClient.newClient(); // use $HOME/.td/td.conf +} } @Override
[26/50] [abbrv] incubator-hivemall git commit: Merge 'master' into 'feature/feature_selection'
Merge 'master' into 'feature/feature_selection' Project: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/commit/aa7d5299 Tree: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/tree/aa7d5299 Diff: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/diff/aa7d5299 Branch: refs/heads/JIRA-22/pr-385 Commit: aa7d5299739349b49ef4f50cc2c1969f5cb8a78f Parents: a1f8f95 bc8b015 Author: amaya Authored: Tue Sep 27 16:02:02 2016 +0900 Committer: amaya Committed: Tue Sep 27 16:02:02 2016 +0900 -- README.md | 7 +- core/pom.xml| 2 +- .../hivemall/ensemble/ArgminKLDistanceUDAF.java | 1 + .../main/java/hivemall/ensemble/MaxRowUDAF.java | 21 +- .../hivemall/ensemble/MaxValueLabelUDAF.java| 1 + .../hivemall/ensemble/bagging/VotedAvgUDAF.java | 1 + .../ensemble/bagging/WeightVotedAvgUDAF.java| 1 + .../main/java/hivemall/evaluation/AUCUDAF.java | 37 +- .../evaluation/BinaryResponsesMeasures.java | 31 +- .../java/hivemall/evaluation/FMeasureUDAF.java | 1 + .../evaluation/GradedResponsesMeasures.java | 7 +- .../evaluation/LogarithmicLossUDAF.java | 1 + .../main/java/hivemall/evaluation/MAPUDAF.java | 55 +-- .../main/java/hivemall/evaluation/MRRUDAF.java | 55 +-- .../evaluation/MeanAbsoluteErrorUDAF.java | 1 + .../evaluation/MeanSquaredErrorUDAF.java| 1 + .../main/java/hivemall/evaluation/NDCGUDAF.java | 45 +-- .../java/hivemall/evaluation/PrecisionUDAF.java | 55 +-- .../main/java/hivemall/evaluation/R2UDAF.java | 1 + .../java/hivemall/evaluation/RecallUDAF.java| 55 +-- .../evaluation/RootMeanSquaredErrorUDAF.java| 1 + .../java/hivemall/fm/FMPredictGenericUDAF.java | 23 +- .../hivemall/ftvec/binning/BuildBinsUDAF.java | 45 ++- .../ftvec/binning/FeatureBinningUDF.java| 26 +- .../ftvec/binning/NumericHistogram.java | 28 +- .../ftvec/conv/ConvertToDenseModelUDAF.java | 1 + .../hivemall/ftvec/text/TermFrequencyUDAF.java | 1 + .../ftvec/trans/OnehotEncodingUDAF.java | 335 +++ 
.../smile/tools/RandomForestEnsembleUDAF.java | 1 + .../tools/array/ArrayAvgGenericUDAF.java| 27 +- .../java/hivemall/tools/array/ArraySumUDAF.java | 1 + .../hivemall/tools/bits/BitsCollectUDAF.java| 23 +- .../main/java/hivemall/tools/map/UDAFToMap.java | 23 +- .../hivemall/tools/map/UDAFToOrderedMap.java| 6 +- .../java/hivemall/utils/hadoop/HiveUtils.java | 9 + .../hivemall/utils/hadoop/WritableUtils.java| 15 + .../java/hivemall/utils/lang/Identifier.java| 38 ++- .../hive/ql/exec/MapredContextAccessor.java | 3 + .../ftvec/trans/TestBinarizeLabelUDTF.java | 7 +- mixserv/pom.xml | 2 +- nlp/pom.xml | 2 +- .../hivemall/nlp/tokenizer/KuromojiUDFTest.java | 31 +- pom.xml | 1 + resources/ddl/define-all-as-permanent.hive | 3 + resources/ddl/define-all.hive | 3 + resources/ddl/define-udfs.td.hql| 1 + .../org/apache/spark/sql/hive/HivemallOps.scala | 5 +- .../apache/spark/sql/hive/HiveUdfSuite.scala| 36 ++ .../spark/sql/hive/HivemallOpsSuite.scala | 47 ++- .../sql/catalyst/expressions/EachTopK.scala | 108 ++ .../org/apache/spark/sql/hive/HivemallOps.scala | 43 ++- .../apache/spark/sql/hive/HiveUdfSuite.scala| 43 ++- .../spark/sql/hive/HivemallOpsSuite.scala | 70 ++-- .../sql/hive/benchmark/MiscBenchmark.scala | 72 ++-- spark/spark-common/pom.xml | 2 +- xgboost/pom.xml | 2 +- 56 files changed, 1125 insertions(+), 338 deletions(-) -- http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/aa7d5299/core/src/main/java/hivemall/utils/hadoop/HiveUtils.java -- diff --cc core/src/main/java/hivemall/utils/hadoop/HiveUtils.java index 9272e60,91f1dfa..c752188 --- a/core/src/main/java/hivemall/utils/hadoop/HiveUtils.java +++ b/core/src/main/java/hivemall/utils/hadoop/HiveUtils.java @@@ -55,9 -55,9 +55,10 @@@ import org.apache.hadoop.hive.serde2.ob import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils; import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector; + 
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.primitive.BinaryObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.
[30/50] [abbrv] incubator-hivemall git commit: mod SNR for corner cases
mod SNR for corner cases Project: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/commit/4cfa4e5a Tree: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/tree/4cfa4e5a Diff: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/diff/4cfa4e5a Branch: refs/heads/JIRA-22/pr-385 Commit: 4cfa4e5ac15a6535b187c23616c205696a1cd13b Parents: 8e2842c Author: amaya Authored: Wed Sep 28 18:26:01 2016 +0900 Committer: amaya Committed: Wed Sep 28 18:29:28 2016 +0900 -- .../ftvec/selection/SignalNoiseRatioUDAF.java | 48 +-- .../selection/SignalNoiseRatioUDAFTest.java | 135 ++- 2 files changed, 167 insertions(+), 16 deletions(-) -- http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/4cfa4e5a/core/src/main/java/hivemall/ftvec/selection/SignalNoiseRatioUDAF.java -- diff --git a/core/src/main/java/hivemall/ftvec/selection/SignalNoiseRatioUDAF.java b/core/src/main/java/hivemall/ftvec/selection/SignalNoiseRatioUDAF.java index b7b9126..507aefa 100644 --- a/core/src/main/java/hivemall/ftvec/selection/SignalNoiseRatioUDAF.java +++ b/core/src/main/java/hivemall/ftvec/selection/SignalNoiseRatioUDAF.java @@ -21,7 +21,6 @@ package hivemall.ftvec.selection; import hivemall.utils.hadoop.HiveUtils; import hivemall.utils.hadoop.WritableUtils; import hivemall.utils.lang.Preconditions; -import org.apache.commons.math3.util.FastMath; import org.apache.hadoop.hive.ql.exec.Description; import org.apache.hadoop.hive.ql.exec.UDFArgumentException; import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException; @@ -193,7 +192,7 @@ public class SignalNoiseRatioUDAF extends AbstractGenericUDAFResolver { int clazz = -1; for (int i = 0; i < nClasses; i++) { -int label = PrimitiveObjectInspectorUtils.getInt(labels.get(i), labelOI); +final int label = PrimitiveObjectInspectorUtils.getInt(labels.get(i), labelOI); if (label == 1 && clazz == -1) { clazz = i; } else if (label == 1) { @@ -255,6 +254,12 @@ public 
class SignalNoiseRatioUDAF extends AbstractGenericUDAFResolver { for (int i = 0; i < nClasses; i++) { final long n = myAgg.ns[i]; final long m = PrimitiveObjectInspectorUtils.getLong(ns.get(i), nOI); + +// no need to merge class `i` +if (m == 0) { +continue; +} + final List means = meansOI.getList(meanss.get(i)); final List variances = variancesOI.getList(variancess.get(i)); @@ -266,10 +271,19 @@ public class SignalNoiseRatioUDAF extends AbstractGenericUDAFResolver { final double varianceN = myAgg.variancess[i][j]; final double varianceM = PrimitiveObjectInspectorUtils.getDouble( variances.get(j), varianceOI); -myAgg.meanss[i][j] = (n * meanN + m * meanM) / (double) (n + m); -myAgg.variancess[i][j] = (varianceN * (n - 1) + varianceM * (m - 1) + FastMath.pow( -meanN - meanM, 2) * n * m / (n + m)) -/ (n + m - 1); + +if (n == 0) { +// only assign `other` into `myAgg` +myAgg.meanss[i][j] = meanM; +myAgg.variancess[i][j] = varianceM; +} else { +// merge by Chan's method +// http://i.stanford.edu/pub/cstr/reports/cs/tr/79/773/CS-TR-79-773.pdf +myAgg.meanss[i][j] = (n * meanN + m * meanM) / (double) (n + m); +myAgg.variancess[i][j] = (varianceN * (n - 1) + varianceM * (m - 1) + Math.pow( +meanN - meanM, 2) * n * m / (n + m)) +/ (n + m - 1); +} } } } @@ -302,25 +316,33 @@ public class SignalNoiseRatioUDAF extends AbstractGenericUDAFResolver { // calc SNR between classes each feature final double[] result = new double[nFeatures]; -final double[] sds = new double[nClasses]; // memo +final double[] sds = new double[nClasses]; // for memorization for (int i = 0; i < nFeatures; i++) { -sds[0] = FastMath.sqrt(myAgg.variancess[0][i]); +sds[0] = Math.sqrt(myAgg.variancess[0][i]); for (int j = 1; j < nClasses; j++) { -sds[j] = FastMath.sqrt(myAgg.variancess[j][i]); -if (Double.isNaN(sds[j])) { +
[14/50] [abbrv] incubator-hivemall git commit: Revert some modifications
Revert some modifications Project: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/commit/3620eb89 Tree: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/tree/3620eb89 Diff: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/diff/3620eb89 Branch: refs/heads/JIRA-22/pr-285 Commit: 3620eb89993db22ce8aee924d3cc0df33a5f9618 Parents: f81948c Author: Takeshi YAMAMURO Authored: Wed Sep 21 01:52:22 2016 +0900 Committer: Takeshi YAMAMURO Committed: Wed Sep 21 01:55:59 2016 +0900 -- .../src/main/java/hivemall/LearnerBaseUDTF.java | 33 ++ .../hivemall/classifier/AROWClassifierUDTF.java | 2 +- .../hivemall/classifier/AdaGradRDAUDTF.java | 125 +++- .../classifier/BinaryOnlineClassifierUDTF.java | 10 + .../classifier/GeneralClassifierUDTF.java | 1 + .../classifier/PassiveAggressiveUDTF.java | 2 +- .../main/java/hivemall/model/DenseModel.java| 86 - .../main/java/hivemall/model/NewDenseModel.java | 293 + .../model/NewSpaceEfficientDenseModel.java | 317 +++ .../java/hivemall/model/NewSparseModel.java | 197 .../java/hivemall/model/PredictionModel.java| 3 + .../model/SpaceEfficientDenseModel.java | 92 +- .../main/java/hivemall/model/SparseModel.java | 19 +- .../model/SynchronizedModelWrapper.java | 6 + .../hivemall/regression/AROWRegressionUDTF.java | 2 +- .../java/hivemall/regression/AdaDeltaUDTF.java | 118 ++- .../java/hivemall/regression/AdaGradUDTF.java | 119 ++- .../regression/GeneralRegressionUDTF.java | 1 + .../java/hivemall/regression/LogressUDTF.java | 65 +++- .../PassiveAggressiveRegressionUDTF.java| 2 +- .../hivemall/regression/RegressionBaseUDTF.java | 12 +- .../NewSpaceEfficientNewDenseModelTest.java | 60 .../model/SpaceEfficientDenseModelTest.java | 60 .../java/hivemall/mix/server/MixServerTest.java | 14 +- .../hivemall/mix/server/MixServerSuite.scala| 4 +- 25 files changed, 1512 insertions(+), 131 deletions(-) -- 
http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/3620eb89/core/src/main/java/hivemall/LearnerBaseUDTF.java -- diff --git a/core/src/main/java/hivemall/LearnerBaseUDTF.java b/core/src/main/java/hivemall/LearnerBaseUDTF.java index 7fd5190..4cf3c7f 100644 --- a/core/src/main/java/hivemall/LearnerBaseUDTF.java +++ b/core/src/main/java/hivemall/LearnerBaseUDTF.java @@ -25,6 +25,9 @@ import hivemall.model.DenseModel; import hivemall.model.PredictionModel; import hivemall.model.SpaceEfficientDenseModel; import hivemall.model.SparseModel; +import hivemall.model.NewDenseModel; +import hivemall.model.NewSparseModel; +import hivemall.model.NewSpaceEfficientDenseModel; import hivemall.model.SynchronizedModelWrapper; import hivemall.model.WeightValue; import hivemall.model.WeightValue.WeightValueWithCovar; @@ -199,6 +202,36 @@ public abstract class LearnerBaseUDTF extends UDTFWithOptions { return model; } +protected PredictionModel createNewModel(String label) { +PredictionModel model; +final boolean useCovar = useCovariance(); +if (dense_model) { +if (disable_halffloat == false && model_dims > 16777216) { +logger.info("Build a space efficient dense model with " + model_dims ++ " initial dimensions" + (useCovar ? " w/ covariances" : "")); +model = new NewSpaceEfficientDenseModel(model_dims, useCovar); +} else { +logger.info("Build a dense model with initial with " + model_dims ++ " initial dimensions" + (useCovar ? 
" w/ covariances" : "")); +model = new NewDenseModel(model_dims, useCovar); +} +} else { +int initModelSize = getInitialModelSize(); +logger.info("Build a sparse model with initial with " + initModelSize ++ " initial dimensions"); +model = new NewSparseModel(initModelSize, useCovar); +} +if (mixConnectInfo != null) { +model.configureClock(); +model = new SynchronizedModelWrapper(model); +MixClient client = configureMixClient(mixConnectInfo, label, model); +model.configureMix(client, mixCancel); +this.mixClient = client; +} +assert (model != null); +return model; +} + // If a model implements a optimizer, it must override this protected Map getOptimzierOptions() { return null; http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/3620eb89/core/src/main/java/hivemall/classifier/AROWCl
[39/50] [abbrv] incubator-hivemall git commit: Mod assert methods
Mod assert methods Project: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/commit/3550fd30 Tree: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/tree/3550fd30 Diff: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/diff/3550fd30 Branch: refs/heads/JIRA-22/pr-336 Commit: 3550fd30af3a01f4217c075a3b814952b406aebe Parents: 1f3df54 Author: amaya Authored: Fri Nov 18 01:57:47 2016 +0900 Committer: amaya Committed: Fri Nov 18 01:57:47 2016 +0900 -- .../main/java/hivemall/systemtest/runner/SystemTestRunner.java | 6 ++ 1 file changed, 2 insertions(+), 4 deletions(-) -- http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/3550fd30/systemtest/src/main/java/hivemall/systemtest/runner/SystemTestRunner.java -- diff --git a/systemtest/src/main/java/hivemall/systemtest/runner/SystemTestRunner.java b/systemtest/src/main/java/hivemall/systemtest/runner/SystemTestRunner.java index 77091f2..f16da90 100644 --- a/systemtest/src/main/java/hivemall/systemtest/runner/SystemTestRunner.java +++ b/systemtest/src/main/java/hivemall/systemtest/runner/SystemTestRunner.java @@ -195,12 +195,10 @@ public abstract class SystemTestRunner extends ExternalResource { if (ordered) { // take order into consideration (like list) -Assert.assertThat(Arrays.asList(answer.split(IO.RD)), -Matchers.contains(result.toArray())); +Assert.assertThat(result, Matchers.contains(answer.split(IO.RD))); } else { // not take order into consideration (like multiset) -Assert.assertThat(Arrays.asList(answer.split(IO.RD)), -Matchers.containsInAnyOrder(result.toArray())); +Assert.assertThat(result, Matchers.containsInAnyOrder(answer.split(IO.RD))); } }
[23/50] [abbrv] incubator-hivemall git commit: add snr
add snr Project: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/commit/22a608ee Tree: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/tree/22a608ee Diff: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/diff/22a608ee Branch: refs/heads/JIRA-22/pr-385 Commit: 22a608ee1c7239b2953183b5341f80c58b1e7045 Parents: 5088ef3 Author: amaya Authored: Mon Sep 26 17:07:55 2016 +0900 Committer: amaya Committed: Mon Sep 26 17:15:22 2016 +0900 -- .../ftvec/selection/SignalNoiseRatioUDAF.java | 327 +++ .../selection/SignalNoiseRatioUDAFTest.java | 174 ++ resources/ddl/define-all-as-permanent.hive | 3 + resources/ddl/define-all.hive | 3 + resources/ddl/define-all.spark | 3 + resources/ddl/define-udfs.td.hql| 1 + 6 files changed, 511 insertions(+) -- http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/22a608ee/core/src/main/java/hivemall/ftvec/selection/SignalNoiseRatioUDAF.java -- diff --git a/core/src/main/java/hivemall/ftvec/selection/SignalNoiseRatioUDAF.java b/core/src/main/java/hivemall/ftvec/selection/SignalNoiseRatioUDAF.java new file mode 100644 index 000..b7b9126 --- /dev/null +++ b/core/src/main/java/hivemall/ftvec/selection/SignalNoiseRatioUDAF.java @@ -0,0 +1,327 @@ +/* + * Hivemall: Hive scalable Machine Learning Library + * + * Copyright (C) 2015 Makoto YUI + * Copyright (C) 2013-2015 National Institute of Advanced Industrial Science and Technology (AIST) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package hivemall.ftvec.selection; + +import hivemall.utils.hadoop.HiveUtils; +import hivemall.utils.hadoop.WritableUtils; +import hivemall.utils.lang.Preconditions; +import org.apache.commons.math3.util.FastMath; +import org.apache.hadoop.hive.ql.exec.Description; +import org.apache.hadoop.hive.ql.exec.UDFArgumentException; +import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException; +import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.parse.SemanticException; +import org.apache.hadoop.hive.ql.udf.generic.AbstractGenericUDAFResolver; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFParameterInfo; +import org.apache.hadoop.hive.serde2.io.DoubleWritable; +import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; +import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.StructField; +import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.DoubleObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.LongObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorUtils; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +@Description(name = "snr", value = "_FUNC_(array features, array one-hot class label)" ++ " - Returns SNR values of each feature as array") 
+public class SignalNoiseRatioUDAF extends AbstractGenericUDAFResolver { +@Override +public GenericUDAFEvaluator getEvaluator(GenericUDAFParameterInfo info) +throws SemanticException { +final ObjectInspector[] OIs = info.getParameterObjectInspectors(); + +if (OIs.length != 2) { +throw new UDFArgumentLengthException("Specify two arguments."); +} + +if (!HiveUtils.isNumberListOI(OIs[0])) { +throw new UDFArgumentTypeException(0, +"Only array type argument is acceptable but " + OIs[0].getTypeName() ++ " was passed as `features`"); +} + +if (!HiveUtils.isListOI(OIs[1]) +
[22/50] [abbrv] incubator-hivemall git commit: Implement initial SST-based change-point detector
Implement initial SST-based change-point detector Project: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/commit/3ebd771e Tree: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/tree/3ebd771e Diff: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/diff/3ebd771e Branch: refs/heads/JIRA-22/pr-356 Commit: 3ebd771ee4bebf14769b7c240f8b28b9d5d10e86 Parents: 89ec56e Author: Takuya Kitazawa Authored: Mon Sep 26 17:12:01 2016 +0900 Committer: Takuya Kitazawa Committed: Mon Sep 26 17:12:01 2016 +0900 -- .../java/hivemall/anomaly/SSTChangePoint.java | 118 +++ .../hivemall/anomaly/SSTChangePointUDF.java | 197 +++ .../hivemall/anomaly/SSTChangePointTest.java| 111 +++ 3 files changed, 426 insertions(+) -- http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/3ebd771e/core/src/main/java/hivemall/anomaly/SSTChangePoint.java -- diff --git a/core/src/main/java/hivemall/anomaly/SSTChangePoint.java b/core/src/main/java/hivemall/anomaly/SSTChangePoint.java new file mode 100644 index 000..e693bd4 --- /dev/null +++ b/core/src/main/java/hivemall/anomaly/SSTChangePoint.java @@ -0,0 +1,118 @@ +/* + * Hivemall: Hive scalable Machine Learning Library + * + * Copyright (C) 2015 Makoto YUI + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package hivemall.anomaly; + +import hivemall.anomaly.SSTChangePointUDF.SSTChangePointInterface; +import hivemall.anomaly.SSTChangePointUDF.Parameters; +import hivemall.utils.collections.DoubleRingBuffer; +import org.apache.commons.math3.linear.MatrixUtils; +import org.apache.commons.math3.linear.RealMatrix; +import org.apache.commons.math3.linear.SingularValueDecomposition; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorUtils; + +import java.util.Arrays; + +import javax.annotation.Nonnull; + +final class SSTChangePoint implements SSTChangePointInterface { + +@Nonnull +private final PrimitiveObjectInspector oi; + +@Nonnull +private final int window; +@Nonnull +private final int nPastWindow; +@Nonnull +private final int nCurrentWindow; +@Nonnull +private final int pastSize; +@Nonnull +private final int currentSize; +@Nonnull +private final int currentOffset; +@Nonnull +private final int r; + +@Nonnull +private final DoubleRingBuffer xRing; +@Nonnull +private final double[] xSeries; + +SSTChangePoint(@Nonnull Parameters params, @Nonnull PrimitiveObjectInspector oi) { +this.oi = oi; + +this.window = params.w; +this.nPastWindow = params.n; +this.nCurrentWindow = params.m; +this.pastSize = window + nPastWindow; +this.currentSize = window + nCurrentWindow; +this.currentOffset = params.g; +this.r = params.r; + +// (w + n) past samples for the n-past-windows +// (w + m) current samples for the m-current-windows, starting from offset g +// => need to hold past (w + n + g + w + m) samples from the latest sample +int holdSampleSize = pastSize + currentOffset + currentSize; + +this.xRing = new DoubleRingBuffer(holdSampleSize); +this.xSeries = new double[holdSampleSize]; +} + +@Override +public void update(@Nonnull final Object arg, @Nonnull final double[] outScores) +throws HiveException { 
+double x = PrimitiveObjectInspectorUtils.getDouble(arg, oi); +xRing.add(x).toArray(xSeries, true /* FIFO */); + +// need to wait until the buffer is filled +if (!xRing.isFull()) { +outScores[0] = 0.d; +} else { +outScores[0] = computeScore(); +} +} + +private double computeScore() { +// create past trajectory matrix and find its left singular vectors +RealMatrix H = MatrixUtils.createRealMatrix(window, nPastWindow); +for (int i = 0; i < nPastWindow; i++) { +H.setColumn(i, Arrays.copyOfRange(xSeries, i, i + window)); +} +
[21/50] [abbrv] incubator-hivemall git commit: add tests
add tests Project: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/commit/5088ef36 Tree: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/tree/5088ef36 Diff: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/diff/5088ef36 Branch: refs/heads/JIRA-22/pr-385 Commit: 5088ef36367df1cd51ae62f1c044933676975e2e Parents: a882c5f Author: amaya Authored: Wed Sep 21 16:22:09 2016 +0900 Committer: amaya Committed: Wed Sep 21 18:00:35 2016 +0900 -- .../tools/matrix/TransposeAndDotUDAF.java | 2 +- .../ftvec/selection/ChiSquareUDFTest.java | 80 .../tools/array/SelectKBeatUDFTest.java | 65 .../tools/matrix/TransposeAndDotUDAFTest.java | 58 ++ 4 files changed, 204 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/5088ef36/core/src/main/java/hivemall/tools/matrix/TransposeAndDotUDAF.java -- diff --git a/core/src/main/java/hivemall/tools/matrix/TransposeAndDotUDAF.java b/core/src/main/java/hivemall/tools/matrix/TransposeAndDotUDAF.java index 9d68f93..9df9305 100644 --- a/core/src/main/java/hivemall/tools/matrix/TransposeAndDotUDAF.java +++ b/core/src/main/java/hivemall/tools/matrix/TransposeAndDotUDAF.java @@ -70,7 +70,7 @@ public final class TransposeAndDotUDAF extends AbstractGenericUDAFResolver { return new TransposeAndDotUDAFEvaluator(); } -private static final class TransposeAndDotUDAFEvaluator extends GenericUDAFEvaluator { +static final class TransposeAndDotUDAFEvaluator extends GenericUDAFEvaluator { // PARTIAL1 and COMPLETE private ListObjectInspector matrix0RowOI; private PrimitiveObjectInspector matrix0ElOI; http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/5088ef36/core/src/test/java/hivemall/ftvec/selection/ChiSquareUDFTest.java -- diff --git a/core/src/test/java/hivemall/ftvec/selection/ChiSquareUDFTest.java b/core/src/test/java/hivemall/ftvec/selection/ChiSquareUDFTest.java new file mode 100644 index 000..38f7f57 --- 
/dev/null +++ b/core/src/test/java/hivemall/ftvec/selection/ChiSquareUDFTest.java @@ -0,0 +1,80 @@ +/* + * Hivemall: Hive scalable Machine Learning Library + * + * Copyright (C) 2016 Makoto YUI + * Copyright (C) 2013-2015 National Institute of Advanced Industrial Science and Technology (AIST) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package hivemall.ftvec.selection; + +import hivemall.utils.hadoop.WritableUtils; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; +import org.apache.hadoop.hive.serde2.io.DoubleWritable; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; +import org.junit.Assert; +import org.junit.Test; + +import java.util.ArrayList; +import java.util.List; + +public class ChiSquareUDFTest { + +@Test +public void test() throws Exception { +// this test is based on iris data set +final ChiSquareUDF chi2 = new ChiSquareUDF(); +final List> observed = new ArrayList>(); +final List> expected = new ArrayList>(); +final GenericUDF.DeferredObject[] dObjs = new GenericUDF.DeferredObject[] { +new GenericUDF.DeferredJavaObject(observed), +new GenericUDF.DeferredJavaObject(expected)}; + +final double[][] matrix0 = new double[][] { +{250.28, 170.93, 73.2, 12.196}, +{296.8, 138.53, 212.97, 66.3}, +{329.3, 148.7, 277.57, 101.28}}; +final 
double[][] matrix1 = new double[][] { +{292.1666753739119, 152.7455081467, 187.9893418327, 59.9511948589}, +{292.1666753739119, 152.7455081467, 187.9893418327, 59.9511948589}, +{292.1666753739119, 152.7455081467, 187.9893418327, 59.9511948589}};
[16/50] [abbrv] incubator-hivemall git commit: standardize to chi2
standardize to chi2 Project: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/commit/6dc23449 Tree: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/tree/6dc23449 Diff: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/diff/6dc23449 Branch: refs/heads/JIRA-22/pr-385 Commit: 6dc234490dc25f563b22e5659c378e6ebcf8dcdb Parents: 89c81aa Author: amaya Authored: Wed Sep 21 11:41:59 2016 +0900 Committer: amaya Committed: Wed Sep 21 13:35:23 2016 +0900 -- resources/ddl/define-all-as-permanent.hive | 4 ++-- resources/ddl/define-all.hive | 4 ++-- resources/ddl/define-all.spark | 4 ++-- resources/ddl/define-udfs.td.hql | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) -- http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/6dc23449/resources/ddl/define-all-as-permanent.hive -- diff --git a/resources/ddl/define-all-as-permanent.hive b/resources/ddl/define-all-as-permanent.hive index adf6a14..b515b24 100644 --- a/resources/ddl/define-all-as-permanent.hive +++ b/resources/ddl/define-all-as-permanent.hive @@ -206,8 +206,8 @@ CREATE FUNCTION l2_normalize as 'hivemall.ftvec.scaling.L2NormalizationUDF' USIN -- selection functions -- - -DROP FUNCTION IF EXISTS chi_square; -CREATE FUNCTION chi_square as 'hivemall.ftvec.selection.ChiSquareUDF' USING JAR '${hivemall_jar}'; +DROP FUNCTION IF EXISTS chi2; +CREATE FUNCTION chi2 as 'hivemall.ftvec.selection.ChiSquareUDF' USING JAR '${hivemall_jar}'; -- misc functions -- http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/6dc23449/resources/ddl/define-all.hive -- diff --git a/resources/ddl/define-all.hive b/resources/ddl/define-all.hive index 1586d2e..2124892 100644 --- a/resources/ddl/define-all.hive +++ b/resources/ddl/define-all.hive @@ -202,8 +202,8 @@ create temporary function l2_normalize as 'hivemall.ftvec.scaling.L2Normalizatio -- selection functions -- - -drop temporary function chi_square; -create temporary 
function chi_square as 'hivemall.ftvec.selection.ChiSquareUDF'; +drop temporary function chi2; +create temporary function chi2 as 'hivemall.ftvec.selection.ChiSquareUDF'; --- -- Feature engineering functions -- http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/6dc23449/resources/ddl/define-all.spark -- diff --git a/resources/ddl/define-all.spark b/resources/ddl/define-all.spark index 50d560b..47f0ce5 100644 --- a/resources/ddl/define-all.spark +++ b/resources/ddl/define-all.spark @@ -187,8 +187,8 @@ sqlContext.sql("CREATE TEMPORARY FUNCTION normalize AS 'hivemall.ftvec.scaling.L * selection functions */ -sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS chi_square") -sqlContext.sql("CREATE TEMPORARY FUNCTION chi_square AS 'hivemall.ftvec.selection.ChiSquareUDF'") +sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS chi2") +sqlContext.sql("CREATE TEMPORARY FUNCTION chi2 AS 'hivemall.ftvec.selection.ChiSquareUDF'") /** * misc functions http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/6dc23449/resources/ddl/define-udfs.td.hql -- diff --git a/resources/ddl/define-udfs.td.hql b/resources/ddl/define-udfs.td.hql index 601eead..fd7dc1d 100644 --- a/resources/ddl/define-udfs.td.hql +++ b/resources/ddl/define-udfs.td.hql @@ -50,7 +50,7 @@ create temporary function powered_features as 'hivemall.ftvec.pairing.PoweredFea create temporary function rescale as 'hivemall.ftvec.scaling.RescaleUDF'; create temporary function zscore as 'hivemall.ftvec.scaling.ZScoreUDF'; create temporary function l2_normalize as 'hivemall.ftvec.scaling.L2NormalizationUDF'; -create temporary function chi_square as 'hivemall.ftvec.selection.ChiSquareUDF'; +create temporary function chi2 as 'hivemall.ftvec.selection.ChiSquareUDF'; create temporary function amplify as 'hivemall.ftvec.amplify.AmplifierUDTF'; create temporary function rand_amplify as 'hivemall.ftvec.amplify.RandomAmplifierUDTF'; create temporary function add_bias as 'hivemall.ftvec.AddBiasUDF';
[42/50] [abbrv] incubator-hivemall git commit: Refine access modifiers/calls
Refine access modifiers/calls Project: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/commit/ddd8dc2d Tree: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/tree/ddd8dc2d Diff: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/diff/ddd8dc2d Branch: refs/heads/JIRA-22/pr-336 Commit: ddd8dc2dbf8222c9d9d84b038dbdcd9aef1f1a87 Parents: 7447dde Author: amaya Authored: Fri Nov 18 04:22:51 2016 +0900 Committer: amaya Committed: Fri Nov 18 04:22:51 2016 +0900 -- .../systemtest/runner/HiveSystemTestRunner.java | 4 +- .../systemtest/runner/SystemTestRunner.java | 40 +++- .../systemtest/runner/SystemTestTeam.java | 8 +--- .../systemtest/runner/TDSystemTestRunner.java | 24 ++-- 4 files changed, 36 insertions(+), 40 deletions(-) -- http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/ddd8dc2d/systemtest/src/main/java/hivemall/systemtest/runner/HiveSystemTestRunner.java -- diff --git a/systemtest/src/main/java/hivemall/systemtest/runner/HiveSystemTestRunner.java b/systemtest/src/main/java/hivemall/systemtest/runner/HiveSystemTestRunner.java index 25a2125..db1edc7 100644 --- a/systemtest/src/main/java/hivemall/systemtest/runner/HiveSystemTestRunner.java +++ b/systemtest/src/main/java/hivemall/systemtest/runner/HiveSystemTestRunner.java @@ -101,7 +101,7 @@ public class HiveSystemTestRunner extends SystemTestRunner { } @Override -protected void finRunner() { +void finRunner() { if (container != null) { container.tearDown(); } @@ -111,7 +111,7 @@ public class HiveSystemTestRunner extends SystemTestRunner { } @Override -protected List exec(@Nonnull final RawHQ hq) { +public List exec(@Nonnull final RawHQ hq) { logger.info("executing: `" + hq.query + "`"); return hShell.executeQuery(hq.query); http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/ddd8dc2d/systemtest/src/main/java/hivemall/systemtest/runner/SystemTestRunner.java -- diff --git 
a/systemtest/src/main/java/hivemall/systemtest/runner/SystemTestRunner.java b/systemtest/src/main/java/hivemall/systemtest/runner/SystemTestRunner.java index f16da90..e142174 100644 --- a/systemtest/src/main/java/hivemall/systemtest/runner/SystemTestRunner.java +++ b/systemtest/src/main/java/hivemall/systemtest/runner/SystemTestRunner.java @@ -45,7 +45,6 @@ import javax.annotation.Nullable; import java.io.FileInputStream; import java.io.InputStream; import java.util.ArrayList; -import java.util.Arrays; import java.util.HashSet; import java.util.List; import java.util.Map; @@ -55,9 +54,9 @@ import java.util.Set; public abstract class SystemTestRunner extends ExternalResource { static final Logger logger = LoggerFactory.getLogger(SystemTestRunner.class); @Nonnull -final List classInitHqs; +private final List classInitHqs; @Nonnull -final Set immutableTables; +private final Set immutableTables; @Nonnull final String dbName; @Nonnull @@ -98,7 +97,7 @@ public abstract class SystemTestRunner extends ExternalResource { @Override protected void after() { try { -resetDB(); // clean up database +cleanDB(); // clean up database } catch (Exception ex) { throw new QueryExecutionException("Failed to clean up temporary database. 
" + ex.getMessage()); @@ -111,16 +110,16 @@ public abstract class SystemTestRunner extends ExternalResource { abstract void finRunner(); -public void initBy(@Nonnull final HQBase hq) { +protected void initBy(@Nonnull final HQBase hq) { classInitHqs.add(hq); } -public void initBy(@Nonnull final List hqs) { +protected void initBy(@Nonnull final List hqs) { classInitHqs.addAll(hqs); } // fix to temporary database and user-defined init (should be called per Test class) -void prepareDB() throws Exception { +private void prepareDB() throws Exception { createDB(dbName); use(dbName); for (HQBase q : classInitHqs) { @@ -136,15 +135,21 @@ public abstract class SystemTestRunner extends ExternalResource { } // drop temporary database (should be called per Test class) -void resetDB() throws Exception { +private void cleanDB() throws Exception { dropDB(dbName); } -public final boolean isImmutableTable(final String tableName) { -return immutableTables.contains(tableName); +// drop temporary tables (should be called per Test method) +void resetDB() throws Exception { +
[37/50] [abbrv] incubator-hivemall git commit: Add exception
Add exception Project: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/commit/33eab26f Tree: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/tree/33eab26f Diff: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/diff/33eab26f Branch: refs/heads/JIRA-22/pr-336 Commit: 33eab26f383dbdbce00a209e742b611a63d953cf Parents: ba91267 Author: amaya Authored: Thu Nov 17 14:16:14 2016 +0900 Committer: amaya Committed: Thu Nov 17 14:16:14 2016 +0900 -- .../main/java/hivemall/systemtest/runner/HiveSystemTestRunner.java | 1 + 1 file changed, 1 insertion(+) -- http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/33eab26f/systemtest/src/main/java/hivemall/systemtest/runner/HiveSystemTestRunner.java -- diff --git a/systemtest/src/main/java/hivemall/systemtest/runner/HiveSystemTestRunner.java b/systemtest/src/main/java/hivemall/systemtest/runner/HiveSystemTestRunner.java index 6b41855..25a2125 100644 --- a/systemtest/src/main/java/hivemall/systemtest/runner/HiveSystemTestRunner.java +++ b/systemtest/src/main/java/hivemall/systemtest/runner/HiveSystemTestRunner.java @@ -132,6 +132,7 @@ public class HiveSystemTestRunner extends SystemTestRunner { hShell.insertInto(dbName, hq.tableName).addRowsFromTsv(hq.file).commit(); break; case MSGPACK: +throw new Exception("MessagePack is not supported in HiveSystemTestRunner"); case UNKNOWN: throw new Exception("Input csv or tsv"); }
[24/50] [abbrv] incubator-hivemall git commit: Rename SSTChangePoint -> SingularSpectrumTransform
Rename SSTChangePoint -> SingularSpectrumTransform Project: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/commit/bde06e09 Tree: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/tree/bde06e09 Diff: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/diff/bde06e09 Branch: refs/heads/JIRA-22/pr-356 Commit: bde06e0952445bf60a9aef4bca182c0afe87e250 Parents: 3ebd771 Author: Takuya Kitazawa Authored: Tue Sep 27 14:06:20 2016 +0900 Committer: Takuya Kitazawa Committed: Tue Sep 27 14:06:20 2016 +0900 -- .../java/hivemall/anomaly/SSTChangePoint.java | 118 --- .../hivemall/anomaly/SSTChangePointUDF.java | 197 --- .../anomaly/SingularSpectrumTransform.java | 118 +++ .../anomaly/SingularSpectrumTransformUDF.java | 197 +++ .../hivemall/anomaly/SSTChangePointTest.java| 111 --- .../anomaly/SingularSpectrumTransformTest.java | 111 +++ 6 files changed, 426 insertions(+), 426 deletions(-) -- http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/bde06e09/core/src/main/java/hivemall/anomaly/SSTChangePoint.java -- diff --git a/core/src/main/java/hivemall/anomaly/SSTChangePoint.java b/core/src/main/java/hivemall/anomaly/SSTChangePoint.java deleted file mode 100644 index e693bd4..000 --- a/core/src/main/java/hivemall/anomaly/SSTChangePoint.java +++ /dev/null @@ -1,118 +0,0 @@ -/* - * Hivemall: Hive scalable Machine Learning Library - * - * Copyright (C) 2015 Makoto YUI - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ -package hivemall.anomaly; - -import hivemall.anomaly.SSTChangePointUDF.SSTChangePointInterface; -import hivemall.anomaly.SSTChangePointUDF.Parameters; -import hivemall.utils.collections.DoubleRingBuffer; -import org.apache.commons.math3.linear.MatrixUtils; -import org.apache.commons.math3.linear.RealMatrix; -import org.apache.commons.math3.linear.SingularValueDecomposition; -import org.apache.hadoop.hive.ql.metadata.HiveException; -import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorUtils; - -import java.util.Arrays; - -import javax.annotation.Nonnull; - -final class SSTChangePoint implements SSTChangePointInterface { - -@Nonnull -private final PrimitiveObjectInspector oi; - -@Nonnull -private final int window; -@Nonnull -private final int nPastWindow; -@Nonnull -private final int nCurrentWindow; -@Nonnull -private final int pastSize; -@Nonnull -private final int currentSize; -@Nonnull -private final int currentOffset; -@Nonnull -private final int r; - -@Nonnull -private final DoubleRingBuffer xRing; -@Nonnull -private final double[] xSeries; - -SSTChangePoint(@Nonnull Parameters params, @Nonnull PrimitiveObjectInspector oi) { -this.oi = oi; - -this.window = params.w; -this.nPastWindow = params.n; -this.nCurrentWindow = params.m; -this.pastSize = window + nPastWindow; -this.currentSize = window + nCurrentWindow; -this.currentOffset = params.g; -this.r = params.r; - -// (w + n) past samples for the n-past-windows -// (w + m) current samples for the m-current-windows, starting from offset g -// => need to hold past (w + n + g + w + m) samples from the latest sample -int holdSampleSize = pastSize + currentOffset + currentSize; - -this.xRing = new DoubleRingBuffer(holdSampleSize); -this.xSeries = new double[holdSampleSize]; -} - -@Override 
-public void update(@Nonnull final Object arg, @Nonnull final double[] outScores) -throws HiveException { -double x = PrimitiveObjectInspectorUtils.getDouble(arg, oi); -xRing.add(x).toArray(xSeries, true /* FIFO */); - -// need to wait until the buffer is filled -if (!xRing.isFull()) { -outScores[0] = 0.d; -} else { -outScores[0] = computeScore(); -} -} - -private double computeScore() { -// create past trajectory matrix and find its left
[31/50] [abbrv] incubator-hivemall git commit: Support implicit-Krylov-approximation-based efficient SST
Support implicit-Krylov-approximation-based efficient SST Project: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/commit/998203d5 Tree: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/tree/998203d5 Diff: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/diff/998203d5 Branch: refs/heads/JIRA-22/pr-356 Commit: 998203d5e8623d6282c2b187df24e4da7d41c16b Parents: 2bfd127 Author: Takuya Kitazawa Authored: Wed Sep 28 19:49:48 2016 +0900 Committer: Takuya Kitazawa Committed: Wed Sep 28 19:49:48 2016 +0900 -- .../anomaly/SingularSpectrumTransform.java | 103 -- .../anomaly/SingularSpectrumTransformUDF.java | 27 +++ .../java/hivemall/utils/math/MatrixUtils.java | 203 +++ .../anomaly/SingularSpectrumTransformTest.java | 61 -- .../hivemall/utils/math/MatrixUtilsTest.java| 67 ++ 5 files changed, 434 insertions(+), 27 deletions(-) -- http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/998203d5/core/src/main/java/hivemall/anomaly/SingularSpectrumTransform.java -- diff --git a/core/src/main/java/hivemall/anomaly/SingularSpectrumTransform.java b/core/src/main/java/hivemall/anomaly/SingularSpectrumTransform.java index c964129..f9f6222 100644 --- a/core/src/main/java/hivemall/anomaly/SingularSpectrumTransform.java +++ b/core/src/main/java/hivemall/anomaly/SingularSpectrumTransform.java @@ -18,9 +18,11 @@ package hivemall.anomaly; import hivemall.anomaly.SingularSpectrumTransformUDF.SingularSpectrumTransformInterface; +import hivemall.anomaly.SingularSpectrumTransformUDF.ScoreFunction; import hivemall.anomaly.SingularSpectrumTransformUDF.Parameters; import hivemall.utils.collections.DoubleRingBuffer; -import org.apache.commons.math3.linear.MatrixUtils; +import hivemall.utils.math.MatrixUtils; +import org.apache.commons.math3.linear.Array2DRowRealMatrix; import org.apache.commons.math3.linear.RealMatrix; import org.apache.commons.math3.linear.SingularValueDecomposition; import 
org.apache.hadoop.hive.ql.metadata.HiveException; @@ -28,6 +30,8 @@ import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorUtils; import java.util.Arrays; +import java.util.TreeMap; +import java.util.Collections; import javax.annotation.Nonnull; @@ -37,6 +41,9 @@ final class SingularSpectrumTransform implements SingularSpectrumTransformInterf private final PrimitiveObjectInspector oi; @Nonnull +private final ScoreFunction scoreFunc; + +@Nonnull private final int window; @Nonnull private final int nPastWindow; @@ -50,15 +57,22 @@ final class SingularSpectrumTransform implements SingularSpectrumTransformInterf private final int currentOffset; @Nonnull private final int r; +@Nonnull +private final int k; @Nonnull private final DoubleRingBuffer xRing; @Nonnull private final double[] xSeries; +@Nonnull +private final double[] q; + SingularSpectrumTransform(@Nonnull Parameters params, @Nonnull PrimitiveObjectInspector oi) { this.oi = oi; +this.scoreFunc = params.scoreFunc; + this.window = params.w; this.nPastWindow = params.n; this.nCurrentWindow = params.m; @@ -66,6 +80,7 @@ final class SingularSpectrumTransform implements SingularSpectrumTransformInterf this.currentSize = window + nCurrentWindow; this.currentOffset = params.g; this.r = params.r; +this.k = params.k; // (w + n) past samples for the n-past-windows // (w + m) current samples for the m-current-windows, starting from offset g @@ -74,6 +89,18 @@ final class SingularSpectrumTransform implements SingularSpectrumTransformInterf this.xRing = new DoubleRingBuffer(holdSampleSize); this.xSeries = new double[holdSampleSize]; + +this.q = new double[window]; +double norm = 0.d; +for (int i = 0; i < window; i++) { +this.q[i] = Math.random(); +norm += q[i] * q[i]; +} +norm = Math.sqrt(norm); +// normalize +for (int i = 0; i < window; i++) { +this.q[i] = q[i] / norm; +} } @Override @@ -86,25 +113,39 @@ final 
class SingularSpectrumTransform implements SingularSpectrumTransformInterf if (!xRing.isFull()) { outScores[0] = 0.d; } else { -outScores[0] = computeScore(); +// create past trajectory matrix and find its left singular vectors +RealMatrix H = new Array2DRowRealMatrix(new double[window][nPastWindow]); +
[33/50] [abbrv] incubator-hivemall git commit: minor fix
minor fix Project: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/commit/8d9f0d4c Tree: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/tree/8d9f0d4c Diff: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/diff/8d9f0d4c Branch: refs/heads/JIRA-22/pr-385 Commit: 8d9f0d4c00758324029d342eb4b892e046ca4a49 Parents: 80be81e Author: amaya Authored: Thu Sep 29 11:02:14 2016 +0900 Committer: amaya Committed: Thu Sep 29 11:02:14 2016 +0900 -- .../test/scala/org/apache/spark/sql/hive/HivemallOpsSuite.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/8d9f0d4c/spark/spark-2.0/src/test/scala/org/apache/spark/sql/hive/HivemallOpsSuite.scala -- diff --git a/spark/spark-2.0/src/test/scala/org/apache/spark/sql/hive/HivemallOpsSuite.scala b/spark/spark-2.0/src/test/scala/org/apache/spark/sql/hive/HivemallOpsSuite.scala index 7b62b92..fe73a1b 100644 --- a/spark/spark-2.0/src/test/scala/org/apache/spark/sql/hive/HivemallOpsSuite.scala +++ b/spark/spark-2.0/src/test/scala/org/apache/spark/sql/hive/HivemallOpsSuite.scala @@ -743,8 +743,8 @@ final class HivemallOpsWithFeatureSuite extends HivemallFeatureQueryTest { val df0 = Seq((1, Seq(1, 2, 3), Seq(5, 6, 7)), (1, Seq(3, 4, 5), Seq(7, 8, 9))) .toDF("c0", "arg0", "arg1") -df0.groupby($"c0").transpose_and_dot("arg0", "arg1").collect() shouldEqual - Seq(Row(1, Seq(Seq(26.0, 30.0, 34.0), Seq(38.0, 44.0, 50.0), Seq(50.0, 58.0, 66.0 +checkAnswer(df0.groupby($"c0").transpose_and_dot("arg0", "arg1"), + Seq(Row(1, Seq(Seq(26.0, 30.0, 34.0), Seq(38.0, 44.0, 50.0), Seq(50.0, 58.0, 66.0) } }
[27/50] [abbrv] incubator-hivemall git commit: Add references for the original SST papers
Add references for the original SST papers Project: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/commit/2bfd1270 Tree: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/tree/2bfd1270 Diff: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/diff/2bfd1270 Branch: refs/heads/JIRA-22/pr-356 Commit: 2bfd1270b1e9b79185a41cbe2568f2ce968d4a71 Parents: bde06e0 Author: Takuya Kitazawa Authored: Wed Sep 28 11:16:56 2016 +0900 Committer: Takuya Kitazawa Committed: Wed Sep 28 11:22:46 2016 +0900 -- .../hivemall/anomaly/SingularSpectrumTransformUDF.java | 11 +++ 1 file changed, 11 insertions(+) -- http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/2bfd1270/core/src/main/java/hivemall/anomaly/SingularSpectrumTransformUDF.java -- diff --git a/core/src/main/java/hivemall/anomaly/SingularSpectrumTransformUDF.java b/core/src/main/java/hivemall/anomaly/SingularSpectrumTransformUDF.java index 2ec0a91..64b7d20 100644 --- a/core/src/main/java/hivemall/anomaly/SingularSpectrumTransformUDF.java +++ b/core/src/main/java/hivemall/anomaly/SingularSpectrumTransformUDF.java @@ -41,6 +41,17 @@ import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; +/** + * Change-point detection based on Singular Spectrum Transformation (SST). + * + * References: + * + * T. Ide and K. Inoue, + * "Knowledge Discovery from Heterogeneous Dynamic Systems using Change-Point Correlations", SDM'05. + * T. Ide and K. Tsuda, "Change-point detection using Krylov subspace learning", SDM'07. + * + */ + @Description( name = "sst", value = "_FUNC_(double|array x [, const string options])"
[29/50] [abbrv] incubator-hivemall git commit: refine tests
refine tests Project: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/commit/8e2842cf Tree: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/tree/8e2842cf Diff: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/diff/8e2842cf Branch: refs/heads/JIRA-22/pr-385 Commit: 8e2842cf8c272642feaa76bf95e8fa463b0322dc Parents: 1347de9 Author: amaya Authored: Wed Sep 28 14:24:19 2016 +0900 Committer: amaya Committed: Wed Sep 28 14:24:19 2016 +0900 -- .../ftvec/selection/ChiSquareUDFTest.java | 12 ++-- .../selection/SignalNoiseRatioUDAFTest.java | 71 2 files changed, 64 insertions(+), 19 deletions(-) -- http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/8e2842cf/core/src/test/java/hivemall/ftvec/selection/ChiSquareUDFTest.java -- diff --git a/core/src/test/java/hivemall/ftvec/selection/ChiSquareUDFTest.java b/core/src/test/java/hivemall/ftvec/selection/ChiSquareUDFTest.java index 38f7f57..d5880b8 100644 --- a/core/src/test/java/hivemall/ftvec/selection/ChiSquareUDFTest.java +++ b/core/src/test/java/hivemall/ftvec/selection/ChiSquareUDFTest.java @@ -69,12 +69,12 @@ public class ChiSquareUDFTest { result1[i] = Double.valueOf(((List) result[1]).get(i).toString()); } -final double[] answer0 = new double[] {10.817820878493995, 3.5944990176817315, -116.16984746363957, 67.24482558215503}; -final double[] answer1 = new double[] {0.004476514990225833, 0.16575416718561453, 0.d, -2.55351295663786e-15}; +// compare with results by scikit-learn +final double[] answer0 = new double[] {10.81782088, 3.59449902, 116.16984746, 67.24482759}; +final double[] answer1 = new double[] {4.47651499e-03, 1.65754167e-01, 5.94344354e-26, +2.50017968e-15}; -Assert.assertArrayEquals(answer0, result0, 0.d); -Assert.assertArrayEquals(answer1, result1, 0.d); +Assert.assertArrayEquals(answer0, result0, 1e-5); +Assert.assertArrayEquals(answer1, result1, 1e-5); } } 
http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/8e2842cf/core/src/test/java/hivemall/ftvec/selection/SignalNoiseRatioUDAFTest.java -- diff --git a/core/src/test/java/hivemall/ftvec/selection/SignalNoiseRatioUDAFTest.java b/core/src/test/java/hivemall/ftvec/selection/SignalNoiseRatioUDAFTest.java index 4655545..56a01d0 100644 --- a/core/src/test/java/hivemall/ftvec/selection/SignalNoiseRatioUDAFTest.java +++ b/core/src/test/java/hivemall/ftvec/selection/SignalNoiseRatioUDAFTest.java @@ -40,7 +40,8 @@ public class SignalNoiseRatioUDAFTest { public ExpectedException expectedException = ExpectedException.none(); @Test -public void test() throws Exception { +public void snrBinaryClass() throws Exception { +// this test is based on *subset* of iris data set final SignalNoiseRatioUDAF snr = new SignalNoiseRatioUDAF(); final ObjectInspector[] OIs = new ObjectInspector[] { ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.writableDoubleObjectInspector), @@ -51,20 +52,62 @@ public class SignalNoiseRatioUDAFTest { final SignalNoiseRatioUDAF.SignalNoiseRatioUDAFEvaluator.SignalNoiseRatioAggregationBuffer agg = (SignalNoiseRatioUDAF.SignalNoiseRatioUDAFEvaluator.SignalNoiseRatioAggregationBuffer) evaluator.getNewAggregationBuffer(); evaluator.reset(agg); -final double[][] featuress = new double[][] { {5.1, 3.5, 1.4, 0.2}, {4.9, 3.d, 1.4, 0.2}, +final double[][] features = new double[][] { {5.1, 3.5, 1.4, 0.2}, {4.9, 3.d, 1.4, 0.2}, +{4.7, 3.2, 1.3, 0.2}, {7.d, 3.2, 4.7, 1.4}, {6.4, 3.2, 4.5, 1.5}, +{6.9, 3.1, 4.9, 1.5}}; + +final int[][] labels = new int[][] { {1, 0}, {1, 0}, {1, 0}, {0, 1}, {0, 1}, {0, 1}}; + +for (int i = 0; i < features.length; i++) { +final List labelList = new ArrayList(); +for (int label : labels[i]) { +labelList.add(new IntWritable(label)); +} +evaluator.iterate(agg, new Object[] {WritableUtils.toWritableList(features[i]), +labelList}); +} + +@SuppressWarnings("unchecked") +final List resultObj = 
(ArrayList) evaluator.terminate(agg); +final int size = resultObj.size(); +final double[] result = new double[size]; +for (int i = 0; i < size; i++) { +result[i] = resultObj.get(i).get(); +} + +// compare with result by numpy +final double[] answer
[38/50] [abbrv] incubator-hivemall git commit: Make dir name static
Make dir name static Project: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/commit/1f3df54c Tree: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/tree/1f3df54c Diff: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/diff/1f3df54c Branch: refs/heads/JIRA-22/pr-336 Commit: 1f3df54c0183a61390f58b94f58c12e531754a09 Parents: 33eab26 Author: amaya Authored: Fri Nov 18 01:57:31 2016 +0900 Committer: amaya Committed: Fri Nov 18 01:57:31 2016 +0900 -- .../hivemall/systemtest/runner/SystemTestCommonInfo.java | 10 +++--- 1 file changed, 7 insertions(+), 3 deletions(-) -- http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/1f3df54c/systemtest/src/main/java/hivemall/systemtest/runner/SystemTestCommonInfo.java -- diff --git a/systemtest/src/main/java/hivemall/systemtest/runner/SystemTestCommonInfo.java b/systemtest/src/main/java/hivemall/systemtest/runner/SystemTestCommonInfo.java index 60292fa..82b433f 100644 --- a/systemtest/src/main/java/hivemall/systemtest/runner/SystemTestCommonInfo.java +++ b/systemtest/src/main/java/hivemall/systemtest/runner/SystemTestCommonInfo.java @@ -21,6 +21,10 @@ package hivemall.systemtest.runner; import javax.annotation.Nonnull; public class SystemTestCommonInfo { +private static final String CASE = "case"; +private static final String ANSWER = "answer"; +private static final String INIT = "init"; + @Nonnull public final String baseDir; @Nonnull @@ -34,9 +38,9 @@ public class SystemTestCommonInfo { public SystemTestCommonInfo(@Nonnull final Class clazz) { baseDir = clazz.getName().replace(".", "/"); -caseDir = baseDir + "/case/"; -answerDir = baseDir + "/answer/"; -initDir = baseDir + "/init/"; +caseDir = baseDir + "/" + CASE + "/"; +answerDir = baseDir + "/" + ANSWER + "/"; +initDir = baseDir + "/" + INIT + "/"; dbName = clazz.getName().replace(".", "_").toLowerCase(); } }
[35/50] [abbrv] incubator-hivemall git commit: Update license headers
Update license headers Project: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/commit/43ca0c86 Tree: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/tree/43ca0c86 Diff: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/diff/43ca0c86 Branch: refs/heads/JIRA-22/pr-336 Commit: 43ca0c86936f3ccc7f825db3c4f4ecaa48087917 Parents: faebaf9 Author: amaya Authored: Wed Nov 16 15:23:49 2016 +0900 Committer: amaya Committed: Wed Nov 16 15:23:49 2016 +0900 -- systemtest/README.md| 18 + systemtest/pom.xml | 17 +++- .../java/com/klarna/hiverunner/Extractor.java | 28 ++-- .../hivemall/systemtest/MsgpackConverter.java | 28 ++-- .../exception/QueryExecutionException.java | 28 ++-- .../systemtest/model/CreateTableHQ.java | 28 ++-- .../hivemall/systemtest/model/DropTableHQ.java | 28 ++-- .../main/java/hivemall/systemtest/model/HQ.java | 28 ++-- .../java/hivemall/systemtest/model/HQBase.java | 28 ++-- .../hivemall/systemtest/model/InsertHQ.java | 28 ++-- .../java/hivemall/systemtest/model/RawHQ.java | 28 ++-- .../java/hivemall/systemtest/model/TableHQ.java | 28 ++-- .../hivemall/systemtest/model/TableListHQ.java | 28 ++-- .../model/UploadFileAsNewTableHQ.java | 28 ++-- .../hivemall/systemtest/model/UploadFileHQ.java | 28 ++-- .../model/UploadFileToExistingHQ.java | 28 ++-- .../model/lazy/LazyMatchingResource.java| 28 ++-- .../systemtest/runner/HiveSystemTestRunner.java | 28 ++-- .../systemtest/runner/SystemTestCommonInfo.java | 28 ++-- .../systemtest/runner/SystemTestRunner.java | 28 ++-- .../systemtest/runner/SystemTestTeam.java | 28 ++-- .../systemtest/runner/TDSystemTestRunner.java | 28 ++-- .../main/java/hivemall/systemtest/utils/IO.java | 28 ++-- 23 files changed, 328 insertions(+), 295 deletions(-) -- http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/43ca0c86/systemtest/README.md -- diff --git a/systemtest/README.md b/systemtest/README.md index 9d1442a..4fca0c3 100644 
--- a/systemtest/README.md +++ b/systemtest/README.md @@ -1,3 +1,21 @@ + ## Usage ### Initialization http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/43ca0c86/systemtest/pom.xml -- diff --git a/systemtest/pom.xml b/systemtest/pom.xml index e59d2ce..e7345af 100644 --- a/systemtest/pom.xml +++ b/systemtest/pom.xml @@ -1,4 +1,19 @@ - + http://maven.apache.org/POM/4.0.0"; xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"; xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd";> http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/43ca0c86/systemtest/src/main/java/com/klarna/hiverunner/Extractor.java -- diff --git a/systemtest/src/main/java/com/klarna/hiverunner/Extractor.java b/systemtest/src/main/java/com/klarna/hiverunner/Extractor.java index 99720f0..f7f372f 100644 --- a/systemtest/src/main/java/com/klarna/hiverunner/Extractor.java +++ b/systemtest/src/main/java/com/klarna/hiverunner/Extractor.java @@ -1,20 +1,20 @@ /* - * Hivemall: Hive scalable Machine Learning Library + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at * - * Copyright (C) 2016 Makoto YUI - * Copyright (C) 2013-2015 National Institute of Advanced Industrial Science and Technology (AIST) + * http://www.apache.org/licenses/LICENSE-2.0 * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, eith
[20/50] [abbrv] incubator-hivemall git commit: mod chi2 function name
mod chi2 function name Project: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/commit/a882c5f9 Tree: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/tree/a882c5f9 Diff: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/diff/a882c5f9 Branch: refs/heads/JIRA-22/pr-385 Commit: a882c5f9f8067b911254dfc43d268de06a5490f9 Parents: b8cf396 Author: amaya Authored: Wed Sep 21 16:00:36 2016 +0900 Committer: amaya Committed: Wed Sep 21 16:23:47 2016 +0900 -- core/src/main/java/hivemall/ftvec/selection/ChiSquareUDF.java | 2 +- core/src/main/java/hivemall/utils/math/StatsUtils.java| 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/a882c5f9/core/src/main/java/hivemall/ftvec/selection/ChiSquareUDF.java -- diff --git a/core/src/main/java/hivemall/ftvec/selection/ChiSquareUDF.java b/core/src/main/java/hivemall/ftvec/selection/ChiSquareUDF.java index 70f0316..1583959 100644 --- a/core/src/main/java/hivemall/ftvec/selection/ChiSquareUDF.java +++ b/core/src/main/java/hivemall/ftvec/selection/ChiSquareUDF.java @@ -129,7 +129,7 @@ public class ChiSquareUDF extends GenericUDF { } } -final Map.Entry chi2 = StatsUtils.chiSquares(observed, expected); +final Map.Entry chi2 = StatsUtils.chiSquare(observed, expected); final Object[] result = new Object[2]; result[0] = WritableUtils.toWritableList(chi2.getKey()); http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/a882c5f9/core/src/main/java/hivemall/utils/math/StatsUtils.java -- diff --git a/core/src/main/java/hivemall/utils/math/StatsUtils.java b/core/src/main/java/hivemall/utils/math/StatsUtils.java index e255b84..14adbff 100644 --- a/core/src/main/java/hivemall/utils/math/StatsUtils.java +++ b/core/src/main/java/hivemall/utils/math/StatsUtils.java @@ -262,7 +262,7 @@ public final class StatsUtils { * @param expecteds means positive matrix * @return (chi2 value[], 
p value[]) */ -public static Map.Entry chiSquares(@Nonnull final double[][] observeds, +public static Map.Entry chiSquare(@Nonnull final double[][] observeds, @Nonnull final double[][] expecteds) { Preconditions.checkArgument(observeds.length == expecteds.length);
[32/50] [abbrv] incubator-hivemall git commit: minor fix
minor fix Project: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/commit/80be81ec Tree: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/tree/80be81ec Diff: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/diff/80be81ec Branch: refs/heads/JIRA-22/pr-385 Commit: 80be81ecf92cd4675dcdfaa5f456d84d484d6c44 Parents: 4cfa4e5 Author: amaya Authored: Wed Sep 28 20:01:08 2016 +0900 Committer: amaya Committed: Wed Sep 28 20:01:08 2016 +0900 -- .../main/java/hivemall/ftvec/selection/SignalNoiseRatioUDAF.java | 2 +- .../test/scala/org/apache/spark/sql/hive/HivemallOpsSuite.scala | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/80be81ec/core/src/main/java/hivemall/ftvec/selection/SignalNoiseRatioUDAF.java -- diff --git a/core/src/main/java/hivemall/ftvec/selection/SignalNoiseRatioUDAF.java b/core/src/main/java/hivemall/ftvec/selection/SignalNoiseRatioUDAF.java index 507aefa..96fdc5b 100644 --- a/core/src/main/java/hivemall/ftvec/selection/SignalNoiseRatioUDAF.java +++ b/core/src/main/java/hivemall/ftvec/selection/SignalNoiseRatioUDAF.java @@ -335,7 +335,7 @@ public class SignalNoiseRatioUDAF extends AbstractGenericUDAFResolver { final double snr = Math.abs(myAgg.meanss[j][i] - myAgg.meanss[k][i]) / (sds[j] + sds[k]); // if `NaN`(when diff between means and both sds are zero, IOW, all related values are equal), -// regard feature `i` as meaningless between class `j` and `k` and skip +// regard feature `i` as meaningless between class `j` and `k`, skip if (!Double.isNaN(snr)) { result[i] += snr; // accept `Infinity` } http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/80be81ec/spark/spark-2.0/src/test/scala/org/apache/spark/sql/hive/HivemallOpsSuite.scala -- diff --git a/spark/spark-2.0/src/test/scala/org/apache/spark/sql/hive/HivemallOpsSuite.scala 
b/spark/spark-2.0/src/test/scala/org/apache/spark/sql/hive/HivemallOpsSuite.scala index 2e18280..7b62b92 100644 --- a/spark/spark-2.0/src/test/scala/org/apache/spark/sql/hive/HivemallOpsSuite.scala +++ b/spark/spark-2.0/src/test/scala/org/apache/spark/sql/hive/HivemallOpsSuite.scala @@ -740,7 +740,8 @@ final class HivemallOpsWithFeatureSuite extends HivemallFeatureQueryTest { // | 1 2 3 |T| 5 6 7 | // | 3 4 5 | * | 7 8 9 | -val df0 = Seq((1, Seq(1, 2, 3), Seq(5, 6, 7)), (1, Seq(3, 4, 5), Seq(7, 8, 9))).toDF.as("c0", "arg0", "arg1") +val df0 = Seq((1, Seq(1, 2, 3), Seq(5, 6, 7)), (1, Seq(3, 4, 5), Seq(7, 8, 9))) + .toDF("c0", "arg0", "arg1") df0.groupby($"c0").transpose_and_dot("arg0", "arg1").collect() shouldEqual Seq(Row(1, Seq(Seq(26.0, 30.0, 34.0), Seq(38.0, 44.0, 50.0), Seq(50.0, 58.0, 66.0
[34/50] [abbrv] incubator-hivemall git commit: change method of testing for spark
change method of testing for spark Project: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/commit/ce4a4898 Tree: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/tree/ce4a4898 Diff: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/diff/ce4a4898 Branch: refs/heads/JIRA-22/pr-385 Commit: ce4a48980e33b9f16c74a62fcea6878f28b9c08b Parents: 8d9f0d4 Author: amaya Authored: Fri Sep 30 17:05:20 2016 +0900 Committer: amaya Committed: Fri Sep 30 17:05:20 2016 +0900 -- .../spark/sql/hive/HivemallOpsSuite.scala | 23 ++-- .../spark/sql/hive/HivemallOpsSuite.scala | 17 ++- 2 files changed, 18 insertions(+), 22 deletions(-) -- http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/ce4a4898/spark/spark-1.6/src/test/scala/org/apache/spark/sql/hive/HivemallOpsSuite.scala -- diff --git a/spark/spark-1.6/src/test/scala/org/apache/spark/sql/hive/HivemallOpsSuite.scala b/spark/spark-1.6/src/test/scala/org/apache/spark/sql/hive/HivemallOpsSuite.scala index cce22ce..c7016c0 100644 --- a/spark/spark-1.6/src/test/scala/org/apache/spark/sql/hive/HivemallOpsSuite.scala +++ b/spark/spark-1.6/src/test/scala/org/apache/spark/sql/hive/HivemallOpsSuite.scala @@ -24,7 +24,6 @@ import org.apache.spark.sql.{Column, Row} import org.apache.spark.test.HivemallQueryTest import org.apache.spark.test.TestDoubleWrapper._ import org.apache.spark.test.TestUtils._ -import org.scalatest.Matchers._ final class HivemallOpsSuite extends HivemallQueryTest { @@ -189,7 +188,6 @@ final class HivemallOpsSuite extends HivemallQueryTest { test("ftvec.selection - chi2") { import hiveContext.implicits._ -implicit val doubleEquality = org.scalactic.TolerantNumerics.tolerantDoubleEquality(1e-5) // see also hivemall.ftvec.selection.ChiSquareUDFTest val df = Seq( @@ -204,17 +202,17 @@ final class HivemallOpsSuite extends HivemallQueryTest { .toDF("arg0", "arg1") val result = df.select(chi2(df("arg0"), df("arg1"))).collect 
-result should have length 1 +assert(result.length == 1) val chi2Val = result.head.getAs[Row](0).getAs[Seq[Double]](0) val pVal = result.head.getAs[Row](0).getAs[Seq[Double]](1) (chi2Val, Seq(10.81782088, 3.59449902, 116.16984746, 67.24482759)) .zipped - .foreach((actual, expected) => actual shouldEqual expected) + .foreach((actual, expected) => assert(actual ~== expected)) (pVal, Seq(4.47651499e-03, 1.65754167e-01, 5.94344354e-26, 2.50017968e-15)) .zipped - .foreach((actual, expected) => actual shouldEqual expected) + .foreach((actual, expected) => assert(actual ~== expected)) } test("ftvec.conv - quantify") { @@ -370,8 +368,9 @@ final class HivemallOpsSuite extends HivemallQueryTest { val data = Seq(Seq(0, 1, 3), Seq(2, 4, 1), Seq(5, 4, 9)) val df = data.map(d => (d, Seq(3, 1, 2), 2)).toDF("features", "importance_list", "k") -df.select(select_k_best(df("features"), df("importance_list"), df("k"))).collect shouldEqual - data.map(s => Row(Seq(s(0).toDouble, s(2).toDouble))) +// if use checkAnswer here, fail for some reason, maybe type? 
but it's okay on spark-2.0 +assert(df.select(select_k_best(df("features"), df("importance_list"), df("k"))).collect === + data.map(s => Row(Seq(s(0).toDouble, s(2).toDouble } test("misc - sigmoid") { @@ -573,7 +572,6 @@ final class HivemallOpsSuite extends HivemallQueryTest { test("user-defined aggregators for ftvec.selection") { import hiveContext.implicits._ -implicit val doubleEquality = org.scalactic.TolerantNumerics.tolerantDoubleEquality(1e-5) // see also hivemall.ftvec.selection.SignalNoiseRatioUDAFTest // binary class @@ -595,7 +593,7 @@ final class HivemallOpsSuite extends HivemallQueryTest { val row0 = df0.groupby($"c0").snr("arg0", "arg1").collect (row0(0).getAs[Seq[Double]](1), Seq(4.38425236, 0.26390002, 15.83984511, 26.87005769)) .zipped - .foreach((actual, expected) => actual shouldEqual expected) + .foreach((actual, expected) => assert(actual ~== expected)) // multiple class // +-+---+ @@ -616,7 +614,7 @@ final class HivemallOpsSuite extends HivemallQueryTest { val row1 = df1.groupby($"c0").snr("arg0", "arg1").collect (row1(0).getAs[Seq[Double]](1), Seq(8.43181818, 1.32121212, 42.94949495, 33.80952381)) .zipped - .foreach((actual, expected) => actual shouldEqual expected) + .foreach((actual, expected) => assert(actual ~== expected)) } test("user-defined aggregators for tools.matrix") { @@ -627,7 +625,8 @@ final class HivemallOpsSuite extends HivemallQ
[41/50] [abbrv] incubator-hivemall git commit: Mod README
Mod README Project: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/commit/7447dde6 Tree: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/tree/7447dde6 Diff: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/diff/7447dde6 Branch: refs/heads/JIRA-22/pr-336 Commit: 7447dde61f3a9cb8e3ba5ab278a260d0a0615524 Parents: 144cb50 Author: amaya Authored: Fri Nov 18 03:23:46 2016 +0900 Committer: amaya Committed: Fri Nov 18 03:23:46 2016 +0900 -- systemtest/README.md | 9 + 1 file changed, 5 insertions(+), 4 deletions(-) -- http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/7447dde6/systemtest/README.md -- diff --git a/systemtest/README.md b/systemtest/README.md index 2805165..2b1167e 100644 --- a/systemtest/README.md +++ b/systemtest/README.md @@ -195,8 +195,9 @@ pink255 192 203 ```sql -- write your hive queries -- comments like this and multiple queries in one row are allowed -SELECT blue FROM color WHERE name = 'lavender';SELECT green FROM color WHERE name LIKE 'orange%' -SELECT name FROM color WHERE blue = 255 +SELECT blue FROM color WHERE name = 'lavender'; +SELECT green FROM color WHERE name LIKE 'orange%'; +SELECT name FROM color WHERE blue = 255; ``` * `systemtest/src/test/resources/hivemall/QuickExample/answer/test3` (`systemtest/src/test/resources/${path/to/package}/${className}/answer/${fileName}`) @@ -205,6 +206,6 @@ tsv format is required ```tsv 250 -16569 -azurebluemagenta +16569 +azure bluemagenta ```
[10/50] [abbrv] incubator-hivemall git commit: add license and format
add license and format Project: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/commit/ad81b3aa Tree: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/tree/ad81b3aa Diff: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/diff/ad81b3aa Branch: refs/heads/JIRA-22/pr-385 Commit: ad81b3aa5a0bbb7c248d127ba44608578c01ae00 Parents: 1ab9b09 Author: amaya Authored: Tue Sep 20 17:05:55 2016 +0900 Committer: amaya Committed: Tue Sep 20 18:37:51 2016 +0900 -- .../hivemall/ftvec/selection/ChiSquareUDF.java | 92 .../tools/array/ArrayTopKIndicesUDF.java| 29 -- .../tools/array/SubarrayByIndicesUDF.java | 36 ++-- .../tools/matrix/TransposeAndDotUDAF.java | 64 +- .../java/hivemall/utils/hadoop/HiveUtils.java | 10 ++- .../java/hivemall/utils/math/StatsUtils.java| 29 +++--- 6 files changed, 171 insertions(+), 89 deletions(-) -- http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/ad81b3aa/core/src/main/java/hivemall/ftvec/selection/ChiSquareUDF.java -- diff --git a/core/src/main/java/hivemall/ftvec/selection/ChiSquareUDF.java b/core/src/main/java/hivemall/ftvec/selection/ChiSquareUDF.java index 1954e33..e2b7494 100644 --- a/core/src/main/java/hivemall/ftvec/selection/ChiSquareUDF.java +++ b/core/src/main/java/hivemall/ftvec/selection/ChiSquareUDF.java @@ -1,3 +1,21 @@ +/* + * Hivemall: Hive scalable Machine Learning Library + * + * Copyright (C) 2016 Makoto YUI + * Copyright (C) 2013-2015 National Institute of Advanced Industrial Science and Technology (AIST) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ package hivemall.ftvec.selection; import hivemall.utils.hadoop.HiveUtils; @@ -10,24 +28,20 @@ import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException; import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; -import org.apache.hadoop.hive.serde2.io.DoubleWritable; import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.StructField; -import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; -import javax.annotation.Nonnull; import java.util.ArrayList; import java.util.Arrays; import java.util.List; import java.util.Map; @Description(name = "chi2", -value = "_FUNC_(array> observed, array> expected)" + -" - Returns chi2_val and p_val of each columns as , array>") +value = "_FUNC_(array> observed, array> expected)" ++ " - Returns chi2_val and p_val of each columns as , array>") public class ChiSquareUDF extends GenericUDF { private ListObjectInspector observedOI; private ListObjectInspector observedRowOI; @@ -42,31 +56,31 @@ public class ChiSquareUDF extends GenericUDF { throw new UDFArgumentLengthException("Specify 
two arguments."); } -if (!HiveUtils.isNumberListListOI(OIs[0])){ -throw new UDFArgumentTypeException(0, "Only array> type argument is acceptable but " -+ OIs[0].getTypeName() + " was passed as `observed`"); +if (!HiveUtils.isNumberListListOI(OIs[0])) { +throw new UDFArgumentTypeException(0, +"Only array> type argument is acceptable but " + OIs[0].getTypeName() ++ " was passed as `observed`"); } -if (!HiveUtils.isNumberListListOI(OIs[1])){ -throw new UDFArgumentTypeException(1, "Only array> type argument is acceptable but " -+ OIs[1].getTypeName() + " was passed as `expected`"); +if (!HiveUtils.isNumberListListOI(OIs[1])) { +throw new UDFArgumentTypeException(1, +"Only array> typ
[03/50] [abbrv] incubator-hivemall git commit: add transpose_and_dot
add transpose_and_dot Project: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/commit/6f9b4fa0 Tree: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/tree/6f9b4fa0 Diff: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/diff/6f9b4fa0 Branch: refs/heads/JIRA-22/pr-385 Commit: 6f9b4fa0acebf604882240ccd5507d9df45bab2d Parents: 56adf2d Author: amaya Authored: Fri Sep 16 15:52:54 2016 +0900 Committer: amaya Committed: Fri Sep 16 15:52:54 2016 +0900 -- .../tools/matrix/TransposeAndDotUDAF.java | 191 +++ 1 file changed, 191 insertions(+) -- http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/6f9b4fa0/core/src/main/java/hivemall/tools/matrix/TransposeAndDotUDAF.java -- diff --git a/core/src/main/java/hivemall/tools/matrix/TransposeAndDotUDAF.java b/core/src/main/java/hivemall/tools/matrix/TransposeAndDotUDAF.java new file mode 100644 index 000..4fa5ce4 --- /dev/null +++ b/core/src/main/java/hivemall/tools/matrix/TransposeAndDotUDAF.java @@ -0,0 +1,191 @@ +package hivemall.tools.matrix; + +import hivemall.utils.hadoop.HiveUtils; +import hivemall.utils.hadoop.WritableUtils; +import hivemall.utils.lang.Preconditions; +import org.apache.hadoop.hive.ql.exec.Description; +import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException; +import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.parse.SemanticException; +import org.apache.hadoop.hive.ql.udf.generic.AbstractGenericUDAFResolver; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFParameterInfo; +import org.apache.hadoop.hive.serde2.io.DoubleWritable; +import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import 
org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; +import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.DoubleObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +@Description(name = "transpose_and_dot", +value = "_FUNC_(array matrix0_row, array matrix1_row)" + +" - Returns dot(matrix0.T, matrix1) as array>, shape = (matrix0.#cols, matrix1.#cols)") +public final class TransposeAndDotUDAF extends AbstractGenericUDAFResolver { +@Override +public GenericUDAFEvaluator getEvaluator(GenericUDAFParameterInfo info) throws SemanticException { +ObjectInspector[] OIs = info.getParameterObjectInspectors(); + +if (OIs.length != 2) { +throw new UDFArgumentLengthException("Specify two arguments."); +} + +if (!HiveUtils.isNumberListOI(OIs[0])) { +throw new UDFArgumentTypeException(0, "Only array type argument is acceptable but " ++ OIs[0].getTypeName() + " was passed as `matrix0_row`"); +} + +if (!HiveUtils.isNumberListOI(OIs[1])) { +throw new UDFArgumentTypeException(1, "Only array type argument is acceptable but " ++ OIs[1].getTypeName() + " was passed as `matrix1_row`"); +} + +return new TransposeAndDotUDAFEvaluator(); +} + +private static final class TransposeAndDotUDAFEvaluator extends GenericUDAFEvaluator { +// PARTIAL1 and COMPLETE +private ListObjectInspector matrix0RowOI; +private PrimitiveObjectInspector matrix0ElOI; +private ListObjectInspector matrix1RowOI; +private PrimitiveObjectInspector matrix1ElOI; + +// PARTIAL2 and FINAL +private ListObjectInspector aggMatrixOI; +private ListObjectInspector aggMatrixRowOI; +private DoubleObjectInspector aggMatrixElOI; + +private double[] matrix0Row; +private double[] matrix1Row; + +@AggregationType(estimable = true) +static class TransposeAndDotAggregationBuffer extends 
AbstractAggregationBuffer { +double[][] aggMatrix; + +@Override +public int estimate() { +return aggMatrix != null +? aggMatrix.length * aggMatrix[0].length * 8 +: 0; +} + +public void init(int n, int m) { +aggMatrix = new double[n][m]; +} + +public void reset() { +if (aggMatrix != null) { +for (double[] row :
[04/50] [abbrv] incubator-hivemall git commit: add chi2 and chi2_test
add chi2 and chi2_test Project: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/commit/d3009be5 Tree: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/tree/d3009be5 Diff: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/diff/d3009be5 Branch: refs/heads/JIRA-22/pr-385 Commit: d3009be59bcf314b373038e3db8903a041396931 Parents: 6f9b4fa Author: amaya Authored: Fri Sep 16 16:00:58 2016 +0900 Committer: amaya Committed: Fri Sep 16 16:00:58 2016 +0900 -- .../ftvec/selection/ChiSquareTestUDF.java | 21 + .../hivemall/ftvec/selection/ChiSquareUDF.java | 21 + .../ftvec/selection/DissociationDegreeUDF.java | 88 .../java/hivemall/utils/math/StatsUtils.java| 49 +++ 4 files changed, 179 insertions(+) -- http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/d3009be5/core/src/main/java/hivemall/ftvec/selection/ChiSquareTestUDF.java -- diff --git a/core/src/main/java/hivemall/ftvec/selection/ChiSquareTestUDF.java b/core/src/main/java/hivemall/ftvec/selection/ChiSquareTestUDF.java new file mode 100644 index 000..d367085 --- /dev/null +++ b/core/src/main/java/hivemall/ftvec/selection/ChiSquareTestUDF.java @@ -0,0 +1,21 @@ +package hivemall.ftvec.selection; + +import hivemall.utils.math.StatsUtils; +import org.apache.hadoop.hive.ql.exec.Description; + +import javax.annotation.Nonnull; + +@Description(name = "chi2_test", +value = "_FUNC_(array expected, array observed) - Returns p-value as double") +public class ChiSquareTestUDF extends DissociationDegreeUDF { +@Override +double calcDissociation(@Nonnull final double[] expected,@Nonnull final double[] observed) { +return StatsUtils.chiSquareTest(expected, observed); +} + +@Override +@Nonnull +String getFuncName() { +return "chi2_test"; +} +} http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/d3009be5/core/src/main/java/hivemall/ftvec/selection/ChiSquareUDF.java -- diff --git 
a/core/src/main/java/hivemall/ftvec/selection/ChiSquareUDF.java b/core/src/main/java/hivemall/ftvec/selection/ChiSquareUDF.java new file mode 100644 index 000..937b1bd --- /dev/null +++ b/core/src/main/java/hivemall/ftvec/selection/ChiSquareUDF.java @@ -0,0 +1,21 @@ +package hivemall.ftvec.selection; + +import hivemall.utils.math.StatsUtils; +import org.apache.hadoop.hive.ql.exec.Description; + +import javax.annotation.Nonnull; + +@Description(name = "chi2", +value = "_FUNC_(array expected, array observed) - Returns chi2-value as double") +public class ChiSquareUDF extends DissociationDegreeUDF { +@Override +double calcDissociation(@Nonnull final double[] expected,@Nonnull final double[] observed) { +return StatsUtils.chiSquare(expected, observed); +} + +@Override +@Nonnull +String getFuncName() { +return "chi2"; +} +} http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/d3009be5/core/src/main/java/hivemall/ftvec/selection/DissociationDegreeUDF.java -- diff --git a/core/src/main/java/hivemall/ftvec/selection/DissociationDegreeUDF.java b/core/src/main/java/hivemall/ftvec/selection/DissociationDegreeUDF.java new file mode 100644 index 000..0acae82 --- /dev/null +++ b/core/src/main/java/hivemall/ftvec/selection/DissociationDegreeUDF.java @@ -0,0 +1,88 @@ +package hivemall.ftvec.selection; + +import hivemall.utils.hadoop.HiveUtils; +import hivemall.utils.lang.Preconditions; +import hivemall.utils.math.StatsUtils; +import org.apache.hadoop.hive.ql.exec.Description; +import org.apache.hadoop.hive.ql.exec.UDFArgumentException; +import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException; +import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; +import org.apache.hadoop.hive.serde2.io.DoubleWritable; +import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector; +import 
org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.DoubleObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; + +import javax.annotation.Nonnull; + +@Description(name = "", +value = "_FUNC_(array expected, array observed) - Returns dissociation degree as double") +public abstract class DissociationDegreeUDF extends GenericUDF { +private ListObjectInspector expectedOI; +private DoubleObjectInspector expectedElO
[47/50] [abbrv] incubator-hivemall git commit: Fix syntax errors in spark (#387)
Fix syntax errors in spark (#387) Project: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/commit/4c8dcbfc Tree: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/tree/4c8dcbfc Diff: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/diff/4c8dcbfc Branch: refs/heads/JIRA-22/pr-385 Commit: 4c8dcbfcdd9dd584fc97e28db39a12d12dfd7b48 Parents: 6549ef5 Author: Takeshi Yamamuro Authored: Thu Nov 24 03:13:25 2016 +0900 Committer: Makoto YUI Committed: Thu Nov 24 03:13:25 2016 +0900 -- .../apache/spark/sql/hive/GroupedDataEx.scala | 8 +-- .../org/apache/spark/sql/hive/HivemallOps.scala | 6 +-- .../spark/sql/hive/HivemallOpsSuite.scala | 7 ++- .../spark/sql/hive/HivemallGroupedDataset.scala | 51 ++-- .../spark/sql/hive/HivemallOpsSuite.scala | 13 ++--- 5 files changed, 41 insertions(+), 44 deletions(-) -- http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/4c8dcbfc/spark/spark-1.6/src/main/scala/org/apache/spark/sql/hive/GroupedDataEx.scala -- diff --git a/spark/spark-1.6/src/main/scala/org/apache/spark/sql/hive/GroupedDataEx.scala b/spark/spark-1.6/src/main/scala/org/apache/spark/sql/hive/GroupedDataEx.scala index 8f78a7f..dd6db6c 100644 --- a/spark/spark-1.6/src/main/scala/org/apache/spark/sql/hive/GroupedDataEx.scala +++ b/spark/spark-1.6/src/main/scala/org/apache/spark/sql/hive/GroupedDataEx.scala @@ -271,9 +271,11 @@ final class GroupedDataEx protected[sql]( */ def onehot_encoding(features: String*): DataFrame = { val udaf = HiveUDAFFunction( -new HiveFunctionWrapper("hivemall.ftvec.trans.OnehotEncodingUDAF"), -features.map(df.col(_).expr), -isUDAFBridgeRequired = false) + new HiveFunctionWrapper("hivemall.ftvec.trans.OnehotEncodingUDAF"), + features.map(df.col(_).expr), + isUDAFBridgeRequired = false) +toDF(Seq(Alias(udaf, udaf.prettyString)())) + } /** * @see hivemall.ftvec.selection.SignalNoiseRatioUDAF 
http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/4c8dcbfc/spark/spark-1.6/src/main/scala/org/apache/spark/sql/hive/HivemallOps.scala -- diff --git a/spark/spark-1.6/src/main/scala/org/apache/spark/sql/hive/HivemallOps.scala b/spark/spark-1.6/src/main/scala/org/apache/spark/sql/hive/HivemallOps.scala index 27cffc7..8583e1c 100644 --- a/spark/spark-1.6/src/main/scala/org/apache/spark/sql/hive/HivemallOps.scala +++ b/spark/spark-1.6/src/main/scala/org/apache/spark/sql/hive/HivemallOps.scala @@ -1010,9 +1010,9 @@ object HivemallOps { } /** -* @see hivemall.ftvec.selection.ChiSquareUDF -* @group ftvec.selection -*/ + * @see hivemall.ftvec.selection.ChiSquareUDF + * @group ftvec.selection + */ def chi2(observed: Column, expected: Column): Column = { HiveGenericUDF(new HiveFunctionWrapper( "hivemall.ftvec.selection.ChiSquareUDF"), Seq(observed.expr, expected.expr)) http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/4c8dcbfc/spark/spark-1.6/src/test/scala/org/apache/spark/sql/hive/HivemallOpsSuite.scala -- diff --git a/spark/spark-1.6/src/test/scala/org/apache/spark/sql/hive/HivemallOpsSuite.scala b/spark/spark-1.6/src/test/scala/org/apache/spark/sql/hive/HivemallOpsSuite.scala index c231105..4c77f18 100644 --- a/spark/spark-1.6/src/test/scala/org/apache/spark/sql/hive/HivemallOpsSuite.scala +++ b/spark/spark-1.6/src/test/scala/org/apache/spark/sql/hive/HivemallOpsSuite.scala @@ -22,7 +22,6 @@ import org.apache.spark.sql.{Column, Row} import org.apache.spark.sql.hive.HivemallOps._ import org.apache.spark.sql.hive.HivemallUtils._ import org.apache.spark.sql.types._ -import org.apache.spark.sql.{Column, Row} import org.apache.spark.test.HivemallQueryTest import org.apache.spark.test.TestDoubleWrapper._ import org.apache.spark.test.TestUtils._ @@ -575,14 +574,13 @@ final class HivemallOpsSuite extends HivemallQueryTest { assert(row4(0).getDouble(1) ~== 0.25) } - test("user-defined aggregators for ftvec.trans") { + ignore("user-defined aggregators for 
ftvec.trans") { import hiveContext.implicits._ val df0 = Seq((1, "cat", "mammal", 9), (1, "dog", "mammal", 10), (1, "human", "mammal", 10), (1, "seahawk", "bird", 101), (1, "wasp", "insect", 3), (1, "wasp", "insect", 9), (1, "cat", "mammal", 101), (1, "dog", "mammal", 1), (1, "human", "mammal", 9)) -.toDF("col0", "cat1", "cat2", "cat3") - + .toDF("col0", "cat1", "cat2", "cat3") val row00 = df0.groupby($"col0").onehot_encoding("cat1") val row01 = df0.groupby($"col0").onehot_encodin
[45/50] [abbrv] incubator-hivemall git commit: Updated license headers
Updated license headers Project: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/commit/e44a413e Tree: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/tree/e44a413e Diff: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/diff/e44a413e Branch: refs/heads/JIRA-22/pr-385 Commit: e44a413e5fd4270af53895fceec27ccff3d63a73 Parents: 67ba963 Author: myui Authored: Mon Nov 21 19:02:27 2016 +0900 Committer: myui Committed: Mon Nov 21 19:02:27 2016 +0900 -- .../hivemall/ftvec/selection/ChiSquareUDF.java | 77 ++-- .../ftvec/selection/SignalNoiseRatioUDAF.java | 39 +- .../hivemall/tools/array/SelectKBestUDF.java| 48 ++-- .../tools/matrix/TransposeAndDotUDAF.java | 38 +- .../ftvec/selection/ChiSquareUDFTest.java | 35 - .../selection/SignalNoiseRatioUDAFTest.java | 36 - .../tools/array/SelectKBeatUDFTest.java | 33 + .../tools/matrix/TransposeAndDotUDAFTest.java | 29 8 files changed, 171 insertions(+), 164 deletions(-) -- http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/e44a413e/core/src/main/java/hivemall/ftvec/selection/ChiSquareUDF.java -- diff --git a/core/src/main/java/hivemall/ftvec/selection/ChiSquareUDF.java b/core/src/main/java/hivemall/ftvec/selection/ChiSquareUDF.java index 1583959..91742bc 100644 --- a/core/src/main/java/hivemall/ftvec/selection/ChiSquareUDF.java +++ b/core/src/main/java/hivemall/ftvec/selection/ChiSquareUDF.java @@ -1,20 +1,20 @@ /* - * Hivemall: Hive scalable Machine Learning Library + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at * - * Copyright (C) 2016 Makoto YUI - * Copyright (C) 2013-2015 National Institute of Advanced Industrial Science and Technology (AIST) + * http://www.apache.org/licenses/LICENSE-2.0 * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
*/ package hivemall.ftvec.selection; @@ -22,11 +22,18 @@ import hivemall.utils.hadoop.HiveUtils; import hivemall.utils.hadoop.WritableUtils; import hivemall.utils.lang.Preconditions; import hivemall.utils.math.StatsUtils; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Map; + import org.apache.hadoop.hive.ql.exec.Description; import org.apache.hadoop.hive.ql.exec.UDFArgumentException; import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException; import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException; import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.udf.UDFType; import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; @@ -34,15 +41,12 @@ import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; -import java.util.Map; - @Description(name = "chi2", value = "_FUNC_(array> observed, array> expected)" + " - Returns chi2_val and p_val of each columns as , array>") -public class ChiSquareUDF extends GenericUDF { +@UDFType(deterministic = true, stateful = false) +public final class ChiSquareUDF extends GenericUDF { + private List
[01/50] [abbrv] incubator-hivemall git commit: add HiveUtils.isNumberListOI
Repository: incubator-hivemall Updated Branches: refs/heads/JIRA-22/pr-285 [created] 05766432c refs/heads/JIRA-22/pr-304 [created] 775ae4f79 refs/heads/JIRA-22/pr-336 [created] f8d152cba refs/heads/JIRA-22/pr-356 [created] bb3250448 refs/heads/JIRA-22/pr-385 [created] 4c8dcbfcd add HiveUtils.isNumberListOI Project: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/commit/2dc176a7 Tree: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/tree/2dc176a7 Diff: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/diff/2dc176a7 Branch: refs/heads/JIRA-22/pr-385 Commit: 2dc176a760b553214624e98f885a719ee196cc4e Parents: 5a7df55 Author: amaya Authored: Fri Sep 16 15:46:44 2016 +0900 Committer: amaya Committed: Fri Sep 16 15:46:44 2016 +0900 -- core/src/main/java/hivemall/utils/hadoop/HiveUtils.java | 7 +-- 1 file changed, 5 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/2dc176a7/core/src/main/java/hivemall/utils/hadoop/HiveUtils.java -- diff --git a/core/src/main/java/hivemall/utils/hadoop/HiveUtils.java b/core/src/main/java/hivemall/utils/hadoop/HiveUtils.java index 4628ce1..32b60d0 100644 --- a/core/src/main/java/hivemall/utils/hadoop/HiveUtils.java +++ b/core/src/main/java/hivemall/utils/hadoop/HiveUtils.java @@ -189,8 +189,7 @@ public final class HiveUtils { return BOOLEAN_TYPE_NAME.equals(typeName); } -public static boolean isNumberOI(@Nonnull final ObjectInspector argOI) -throws UDFArgumentTypeException { +public static boolean isNumberOI(@Nonnull final ObjectInspector argOI) { if (argOI.getCategory() != Category.PRIMITIVE) { return false; } @@ -231,6 +230,10 @@ public final class HiveUtils { return category == Category.LIST; } +public static boolean isNumberListOI(@Nonnull final ObjectInspector oi){ +return isListOI(oi) && isNumberOI(((ListObjectInspector)oi).getListElementObjectInspector()); +} + public static boolean 
isPrimitiveTypeInfo(@Nonnull TypeInfo typeInfo) { return typeInfo.getCategory() == ObjectInspector.Category.PRIMITIVE; }
[18/50] [abbrv] incubator-hivemall git commit: refine transpose_and_dot
refine transpose_and_dot Project: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/commit/abbf5492 Tree: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/tree/abbf5492 Diff: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/diff/abbf5492 Branch: refs/heads/JIRA-22/pr-385 Commit: abbf5492b95dd69e347580c59ac044a78627c547 Parents: a16a3fd Author: amaya Authored: Wed Sep 21 13:11:00 2016 +0900 Committer: amaya Committed: Wed Sep 21 13:40:54 2016 +0900 -- .../tools/matrix/TransposeAndDotUDAF.java | 32 +++- 1 file changed, 18 insertions(+), 14 deletions(-) -- http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/abbf5492/core/src/main/java/hivemall/tools/matrix/TransposeAndDotUDAF.java -- diff --git a/core/src/main/java/hivemall/tools/matrix/TransposeAndDotUDAF.java b/core/src/main/java/hivemall/tools/matrix/TransposeAndDotUDAF.java index 1e54004..9d68f93 100644 --- a/core/src/main/java/hivemall/tools/matrix/TransposeAndDotUDAF.java +++ b/core/src/main/java/hivemall/tools/matrix/TransposeAndDotUDAF.java @@ -127,33 +127,37 @@ public final class TransposeAndDotUDAF extends AbstractGenericUDAFResolver { @Override public AbstractAggregationBuffer getNewAggregationBuffer() throws HiveException { -TransposeAndDotAggregationBuffer myAgg = new TransposeAndDotAggregationBuffer(); +final TransposeAndDotAggregationBuffer myAgg = new TransposeAndDotAggregationBuffer(); reset(myAgg); return myAgg; } @Override public void reset(AggregationBuffer agg) throws HiveException { -TransposeAndDotAggregationBuffer myAgg = (TransposeAndDotAggregationBuffer) agg; +final TransposeAndDotAggregationBuffer myAgg = (TransposeAndDotAggregationBuffer) agg; myAgg.reset(); } @Override public void iterate(AggregationBuffer agg, Object[] parameters) throws HiveException { -TransposeAndDotAggregationBuffer myAgg = (TransposeAndDotAggregationBuffer) agg; +final Object matrix0RowObj = parameters[0]; 
+final Object matrix1RowObj = parameters[1]; +Preconditions.checkNotNull(matrix0RowObj); +Preconditions.checkNotNull(matrix1RowObj); + +final TransposeAndDotAggregationBuffer myAgg = (TransposeAndDotAggregationBuffer) agg; + +// init if (matrix0Row == null) { -matrix0Row = new double[matrix0RowOI.getListLength(parameters[0])]; +matrix0Row = new double[matrix0RowOI.getListLength(matrix0RowObj)]; } if (matrix1Row == null) { -matrix1Row = new double[matrix1RowOI.getListLength(parameters[1])]; +matrix1Row = new double[matrix1RowOI.getListLength(matrix1RowObj)]; } -HiveUtils.toDoubleArray(parameters[0], matrix0RowOI, matrix0ElOI, matrix0Row, false); -HiveUtils.toDoubleArray(parameters[1], matrix1RowOI, matrix1ElOI, matrix1Row, false); - -Preconditions.checkNotNull(matrix0Row); -Preconditions.checkNotNull(matrix1Row); +HiveUtils.toDoubleArray(matrix0RowObj, matrix0RowOI, matrix0ElOI, matrix0Row, false); +HiveUtils.toDoubleArray(matrix1RowObj, matrix1RowOI, matrix1ElOI, matrix1Row, false); if (myAgg.aggMatrix == null) { myAgg.init(matrix0Row.length, matrix1Row.length); @@ -172,9 +176,9 @@ public final class TransposeAndDotUDAF extends AbstractGenericUDAFResolver { return; } -TransposeAndDotAggregationBuffer myAgg = (TransposeAndDotAggregationBuffer) agg; +final TransposeAndDotAggregationBuffer myAgg = (TransposeAndDotAggregationBuffer) agg; -List matrix = aggMatrixOI.getList(other); +final List matrix = aggMatrixOI.getList(other); final int n = matrix.size(); final double[] row = new double[aggMatrixRowOI.getListLength(matrix.get(0))]; for (int i = 0; i < n; i++) { @@ -197,9 +201,9 @@ public final class TransposeAndDotUDAF extends AbstractGenericUDAFResolver { @Override public Object terminate(AggregationBuffer agg) throws HiveException { -TransposeAndDotAggregationBuffer myAgg = (TransposeAndDotAggregationBuffer) agg; +final TransposeAndDotAggregationBuffer myAgg = (TransposeAndDotAggregationBuffer) agg; -List> result = new ArrayList>(); +final List> result = new 
ArrayList>(); for (double[] row : myAgg.aggMatrix) {
[17/50] [abbrv] incubator-hivemall git commit: refine chi2
refine chi2 Project: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/commit/a16a3fde Tree: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/tree/a16a3fde Diff: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/diff/a16a3fde Branch: refs/heads/JIRA-22/pr-385 Commit: a16a3fde844ba381dee7eb1e9608ddc2dcfb96fc Parents: 6dc2344 Author: amaya Authored: Wed Sep 21 13:10:18 2016 +0900 Committer: amaya Committed: Wed Sep 21 13:35:33 2016 +0900 -- .../hivemall/ftvec/selection/ChiSquareUDF.java | 40 +++-- .../java/hivemall/utils/math/StatsUtils.java| 62 +++- 2 files changed, 58 insertions(+), 44 deletions(-) -- http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/a16a3fde/core/src/main/java/hivemall/ftvec/selection/ChiSquareUDF.java -- diff --git a/core/src/main/java/hivemall/ftvec/selection/ChiSquareUDF.java b/core/src/main/java/hivemall/ftvec/selection/ChiSquareUDF.java index e2b7494..951aeeb 100644 --- a/core/src/main/java/hivemall/ftvec/selection/ChiSquareUDF.java +++ b/core/src/main/java/hivemall/ftvec/selection/ChiSquareUDF.java @@ -50,6 +50,12 @@ public class ChiSquareUDF extends GenericUDF { private ListObjectInspector expectedRowOI; private PrimitiveObjectInspector expectedElOI; +private int nFeatures = -1; +private double[] observedRow = null; // to reuse +private double[] expectedRow = null; // to reuse +private double[][] observed = null; // shape = (#features, #classes) +private double[][] expected = null; // shape = (#features, #classes) + @Override public ObjectInspector initialize(ObjectInspector[] OIs) throws UDFArgumentException { if (OIs.length != 2) { @@ -75,12 +81,12 @@ public class ChiSquareUDF extends GenericUDF { expectedRowOI = HiveUtils.asListOI(expectedOI.getListElementObjectInspector()); expectedElOI = HiveUtils.asDoubleCompatibleOI(expectedRowOI.getListElementObjectInspector()); -List fieldOIs = new ArrayList(); +final List fieldOIs = new 
ArrayList(); fieldOIs.add(ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.writableDoubleObjectInspector)); fieldOIs.add(ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.writableDoubleObjectInspector)); return ObjectInspectorFactory.getStandardStructObjectInspector( -Arrays.asList("chi2_vals", "p_vals"), fieldOIs); +Arrays.asList("chi2", "pvalue"), fieldOIs); } @Override @@ -93,28 +99,28 @@ public class ChiSquareUDF extends GenericUDF { final int nClasses = observedObj.size(); Preconditions.checkArgument(nClasses == expectedObj.size()); // same #rows -int nFeatures = -1; -double[] observedRow = null; // to reuse -double[] expectedRow = null; // to reuse -double[][] observed = null; // shape = (#features, #classes) -double[][] expected = null; // shape = (#features, #classes) - // explode and transpose matrix for (int i = 0; i < nClasses; i++) { -if (i == 0) { +final Object observedObjRow = observedObj.get(i); +final Object expectedObjRow = observedObj.get(i); + +Preconditions.checkNotNull(observedObjRow); +Preconditions.checkNotNull(expectedObjRow); + +if (observedRow == null) { // init -observedRow = HiveUtils.asDoubleArray(observedObj.get(i), observedRowOI, -observedElOI, false); -expectedRow = HiveUtils.asDoubleArray(expectedObj.get(i), expectedRowOI, -expectedElOI, false); +observedRow = HiveUtils.asDoubleArray(observedObjRow, observedRowOI, observedElOI, +false); +expectedRow = HiveUtils.asDoubleArray(expectedObjRow, expectedRowOI, expectedElOI, +false); nFeatures = observedRow.length; observed = new double[nFeatures][nClasses]; expected = new double[nFeatures][nClasses]; } else { -HiveUtils.toDoubleArray(observedObj.get(i), observedRowOI, observedElOI, -observedRow, false); -HiveUtils.toDoubleArray(expectedObj.get(i), expectedRowOI, expectedElOI, -expectedRow, false); +HiveUtils.toDoubleArray(observedObjRow, observedRowOI, observedElOI, observedRow, +false); 
+HiveUtils.toDoubleArray(expectedObjRow, expectedRowOI, expectedElOI, expectedRow, +false);
[12/50] [abbrv] incubator-hivemall git commit: Add optimizer implementations
http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/f81948c5/core/src/main/java/hivemall/optimizer/LossFunctions.java -- diff --git a/core/src/main/java/hivemall/optimizer/LossFunctions.java b/core/src/main/java/hivemall/optimizer/LossFunctions.java new file mode 100644 index 000..d11be9b --- /dev/null +++ b/core/src/main/java/hivemall/optimizer/LossFunctions.java @@ -0,0 +1,467 @@ +/* + * Hivemall: Hive scalable Machine Learning Library + * + * Copyright (C) 2015 Makoto YUI + * Copyright (C) 2013-2015 National Institute of Advanced Industrial Science and Technology (AIST) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package hivemall.optimizer; + +import hivemall.utils.math.MathUtils; + +/** + * @link https://github.com/JohnLangford/vowpal_wabbit/wiki/Loss-functions + */ +public final class LossFunctions { + +public enum LossType { +SquaredLoss, LogLoss, HingeLoss, SquaredHingeLoss, QuantileLoss, EpsilonInsensitiveLoss +} + +public static LossFunction getLossFunction(String type) { +if ("SquaredLoss".equalsIgnoreCase(type)) { +return new SquaredLoss(); +} else if ("LogLoss".equalsIgnoreCase(type)) { +return new LogLoss(); +} else if ("HingeLoss".equalsIgnoreCase(type)) { +return new HingeLoss(); +} else if ("SquaredHingeLoss".equalsIgnoreCase(type)) { +return new SquaredHingeLoss(); +} else if ("QuantileLoss".equalsIgnoreCase(type)) { +return new QuantileLoss(); +} else if ("EpsilonInsensitiveLoss".equalsIgnoreCase(type)) { +return new EpsilonInsensitiveLoss(); +} +throw new IllegalArgumentException("Unsupported type: " + type); +} + +public static LossFunction getLossFunction(LossType type) { +switch (type) { +case SquaredLoss: +return new SquaredLoss(); +case LogLoss: +return new LogLoss(); +case HingeLoss: +return new HingeLoss(); +case SquaredHingeLoss: +return new SquaredHingeLoss(); +case QuantileLoss: +return new QuantileLoss(); +case EpsilonInsensitiveLoss: +return new EpsilonInsensitiveLoss(); +default: +throw new IllegalArgumentException("Unsupported type: " + type); +} +} + +public interface LossFunction { + +/** + * Evaluate the loss function. + * + * @param p The prediction, p = w^T x + * @param y The true value (aka target) + * @return The loss evaluated at `p` and `y`. + */ +public float loss(float p, float y); + +public double loss(double p, double y); + +/** + * Evaluate the derivative of the loss function with respect to the prediction `p`. + * + * @param p The prediction, p = w^T x + * @param y The true value (aka target) + * @return The derivative of the loss function w.r.t. `p`. 
+ */ +public float dloss(float p, float y); + +public boolean forBinaryClassification(); + +public boolean forRegression(); + +} + +public static abstract class BinaryLoss implements LossFunction { + +protected static void checkTarget(float y) { +if (!(y == 1.f || y == -1.f)) { +throw new IllegalArgumentException("target must be [+1,-1]: " + y); +} +} + +protected static void checkTarget(double y) { +if (!(y == 1.d || y == -1.d)) { +throw new IllegalArgumentException("target must be [+1,-1]: " + y); +} +} + +@Override +public boolean forBinaryClassification() { +return true; +} + +@Override +public boolean forRegression() { +return false; +} +} + +public static abstract class RegressionLoss implements LossFunction { + +@Override +public boolean forBinaryClassification() { +return false; +} + +@Override +public boolean forRegression() { +return true; +} + +} + +/** + * Squared loss for regression problems. + * + * If you're trying to minimize the mean error, use squared-loss. + */ +
[08/50] [abbrv] incubator-hivemall git commit: add array_top_k_indices
add array_top_k_indices Project: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/commit/e9d1a94f Tree: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/tree/e9d1a94f Diff: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/diff/e9d1a94f Branch: refs/heads/JIRA-22/pr-385 Commit: e9d1a94f29f31e2910a54add7c2625825d715318 Parents: 7b07e4a Author: amaya Authored: Tue Sep 20 16:55:57 2016 +0900 Committer: amaya Committed: Tue Sep 20 18:37:38 2016 +0900 -- .../tools/array/ArrayTopKIndicesUDF.java| 96 1 file changed, 96 insertions(+) -- http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/e9d1a94f/core/src/main/java/hivemall/tools/array/ArrayTopKIndicesUDF.java -- diff --git a/core/src/main/java/hivemall/tools/array/ArrayTopKIndicesUDF.java b/core/src/main/java/hivemall/tools/array/ArrayTopKIndicesUDF.java new file mode 100644 index 000..bf9fe15 --- /dev/null +++ b/core/src/main/java/hivemall/tools/array/ArrayTopKIndicesUDF.java @@ -0,0 +1,96 @@ +package hivemall.tools.array; + +import hivemall.utils.hadoop.HiveUtils; +import hivemall.utils.lang.Preconditions; +import org.apache.hadoop.hive.ql.exec.Description; +import org.apache.hadoop.hive.ql.exec.UDFArgumentException; +import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException; +import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; +import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; +import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; +import 
org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorUtils; +import org.apache.hadoop.io.IntWritable; + +import java.util.AbstractMap; +import java.util.ArrayList; +import java.util.Comparator; +import java.util.List; +import java.util.Map; + +@Description(name = "array_top_k_indices", +value = "_FUNC_(array array, const int k) - Returns indices array of top-k as array") +public class ArrayTopKIndicesUDF extends GenericUDF { +private ListObjectInspector arrayOI; +private PrimitiveObjectInspector elementOI; +private PrimitiveObjectInspector kOI; + +@Override +public ObjectInspector initialize(ObjectInspector[] OIs) throws UDFArgumentException { +if (OIs.length != 2) { +throw new UDFArgumentLengthException("Specify two or three arguments."); +} + +if (!HiveUtils.isNumberListOI(OIs[0])) { +throw new UDFArgumentTypeException(0, "Only array type argument is acceptable but " ++ OIs[0].getTypeName() + " was passed as `array`"); +} +if (!HiveUtils.isIntegerOI(OIs[1])) { +throw new UDFArgumentTypeException(1, "Only int type argument is acceptable but " ++ OIs[1].getTypeName() + " was passed as `k`"); +} + +arrayOI = HiveUtils.asListOI(OIs[0]); +elementOI = HiveUtils.asDoubleCompatibleOI(arrayOI.getListElementObjectInspector()); +kOI = HiveUtils.asIntegerOI(OIs[1]); + +return ObjectInspectorFactory.getStandardListObjectInspector( +PrimitiveObjectInspectorFactory.writableIntObjectInspector); +} + +@Override +public Object evaluate(GenericUDF.DeferredObject[] dObj) throws HiveException { +final double[] array = HiveUtils.asDoubleArray(dObj[0].get(), arrayOI, elementOI); +final int k = PrimitiveObjectInspectorUtils.getInt(dObj[1].get(), kOI); + +Preconditions.checkNotNull(array); +Preconditions.checkArgument(array.length >= k); + +List> list = new ArrayList>(); +for (int i = 0; i < array.length; i++) { +list.add(new AbstractMap.SimpleEntry(i, array[i])); +} +list.sort(new Comparator>() { +@Override +public int compare(Map.Entry o1, Map.Entry o2) { 
+return o1.getValue() > o2.getValue() ? -1 : 1; +} +}); + +List result = new ArrayList(); +for (int i = 0; i < k; i++) { +result.add(new IntWritable(list.get(i).getKey())); +} +return result; +} + +@Override +public String getDisplayString(String[] children) { +StringBuilder sb = new StringBuilder(); +sb.append("array_top_k_indices"); +sb.append("("); +if
[48/50] [abbrv] incubator-hivemall git commit: Merge branch 'sst-changepoint' of https://github.com/takuti/hivemall into JIRA-22/pr-356
Merge branch 'sst-changepoint' of https://github.com/takuti/hivemall into JIRA-22/pr-356 Project: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/commit/bb325044 Tree: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/tree/bb325044 Diff: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/diff/bb325044 Branch: refs/heads/JIRA-22/pr-356 Commit: bb32504482db55ed5d946fefb25b4b88d2c36209 Parents: 72d6a62 998203d Author: myui Authored: Fri Dec 2 15:33:01 2016 +0900 Committer: myui Committed: Fri Dec 2 15:33:01 2016 +0900 -- .../anomaly/SingularSpectrumTransform.java | 193 +++ .../anomaly/SingularSpectrumTransformUDF.java | 235 +++ .../java/hivemall/utils/math/MatrixUtils.java | 203 .../anomaly/SingularSpectrumTransformTest.java | 146 .../hivemall/utils/math/MatrixUtilsTest.java| 67 ++ 5 files changed, 844 insertions(+) -- http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/bb325044/core/src/main/java/hivemall/utils/math/MatrixUtils.java -- http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/bb325044/core/src/test/java/hivemall/utils/math/MatrixUtilsTest.java --
[07/50] [abbrv] incubator-hivemall git commit: change interface of chi2
change interface of chi2 Project: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/commit/7b07e4a6 Tree: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/tree/7b07e4a6 Diff: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/diff/7b07e4a6 Branch: refs/heads/JIRA-22/pr-385 Commit: 7b07e4a6e1f700ba0a6e5b68659a040a3d89aa2f Parents: d0e97e6 Author: amaya Authored: Tue Sep 20 12:03:44 2016 +0900 Committer: amaya Committed: Tue Sep 20 12:11:42 2016 +0900 -- .../ftvec/selection/ChiSquareTestUDF.java | 21 .../hivemall/ftvec/selection/ChiSquareUDF.java | 124 +-- .../ftvec/selection/DissociationDegreeUDF.java | 88 - .../java/hivemall/utils/math/StatsUtils.java| 49 ++-- 4 files changed, 155 insertions(+), 127 deletions(-) -- http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/7b07e4a6/core/src/main/java/hivemall/ftvec/selection/ChiSquareTestUDF.java -- diff --git a/core/src/main/java/hivemall/ftvec/selection/ChiSquareTestUDF.java b/core/src/main/java/hivemall/ftvec/selection/ChiSquareTestUDF.java deleted file mode 100644 index d367085..000 --- a/core/src/main/java/hivemall/ftvec/selection/ChiSquareTestUDF.java +++ /dev/null @@ -1,21 +0,0 @@ -package hivemall.ftvec.selection; - -import hivemall.utils.math.StatsUtils; -import org.apache.hadoop.hive.ql.exec.Description; - -import javax.annotation.Nonnull; - -@Description(name = "chi2_test", -value = "_FUNC_(array expected, array observed) - Returns p-value as double") -public class ChiSquareTestUDF extends DissociationDegreeUDF { -@Override -double calcDissociation(@Nonnull final double[] expected,@Nonnull final double[] observed) { -return StatsUtils.chiSquareTest(expected, observed); -} - -@Override -@Nonnull -String getFuncName() { -return "chi2_test"; -} -} http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/7b07e4a6/core/src/main/java/hivemall/ftvec/selection/ChiSquareUDF.java -- diff --git 
a/core/src/main/java/hivemall/ftvec/selection/ChiSquareUDF.java b/core/src/main/java/hivemall/ftvec/selection/ChiSquareUDF.java index 937b1bd..1954e33 100644 --- a/core/src/main/java/hivemall/ftvec/selection/ChiSquareUDF.java +++ b/core/src/main/java/hivemall/ftvec/selection/ChiSquareUDF.java @@ -1,21 +1,131 @@ package hivemall.ftvec.selection; +import hivemall.utils.hadoop.HiveUtils; +import hivemall.utils.hadoop.WritableUtils; +import hivemall.utils.lang.Preconditions; import hivemall.utils.math.StatsUtils; import org.apache.hadoop.hive.ql.exec.Description; +import org.apache.hadoop.hive.ql.exec.UDFArgumentException; +import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException; +import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; +import org.apache.hadoop.hive.serde2.io.DoubleWritable; +import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; +import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.StructField; +import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; import javax.annotation.Nonnull; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Map; @Description(name = "chi2", -value = "_FUNC_(array expected, array observed) - Returns chi2-value as double") -public class ChiSquareUDF extends DissociationDegreeUDF { +value = "_FUNC_(array> observed, array> expected)" + +" - Returns chi2_val and p_val of each columns as , array>") +public class ChiSquareUDF extends GenericUDF { +private ListObjectInspector observedOI; +private 
ListObjectInspector observedRowOI; +private PrimitiveObjectInspector observedElOI; +private ListObjectInspector expectedOI; +private ListObjectInspector expectedRowOI; +private PrimitiveObjectInspector expectedElOI; + @Override -double calcDissociation(@Nonnull final double[] expected,@Nonnull final double[] observed) { -return StatsUtils.chiSquare(expected, observed); +public ObjectInspector initialize(ObjectInspector[] OIs) throws UDFArgumentException { +if (OIs.length != 2) { +throw new UDFArgument
[13/50] [abbrv] incubator-hivemall git commit: Add optimizer implementations
Add optimizer implementations Project: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/commit/f81948c5 Tree: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/tree/f81948c5 Diff: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/diff/f81948c5 Branch: refs/heads/JIRA-22/pr-285 Commit: f81948c5c7b83155eb29369a59f1fc65bb607f91 Parents: 5a7df55 Author: Takeshi YAMAMURO Authored: Mon May 2 23:43:42 2016 +0900 Committer: Takeshi YAMAMURO Committed: Wed Sep 21 00:07:28 2016 +0900 -- .../src/main/java/hivemall/LearnerBaseUDTF.java | 22 + .../hivemall/classifier/AROWClassifierUDTF.java | 2 +- .../hivemall/classifier/AdaGradRDAUDTF.java | 123 + .../classifier/BinaryOnlineClassifierUDTF.java | 3 + .../classifier/GeneralClassifierUDTF.java | 121 + .../classifier/PassiveAggressiveUDTF.java | 2 +- .../main/java/hivemall/common/EtaEstimator.java | 160 --- .../java/hivemall/common/LossFunctions.java | 467 --- .../java/hivemall/fm/FMHyperParameters.java | 2 +- .../hivemall/fm/FactorizationMachineModel.java | 2 +- .../hivemall/fm/FactorizationMachineUDTF.java | 8 +- .../fm/FieldAwareFactorizationMachineModel.java | 1 + .../hivemall/mf/BPRMatrixFactorizationUDTF.java | 2 +- .../hivemall/mf/MatrixFactorizationSGDUDTF.java | 2 +- .../main/java/hivemall/model/DenseModel.java| 87 +--- .../main/java/hivemall/model/IWeightValue.java | 16 +- .../java/hivemall/model/PredictionModel.java| 5 +- .../model/SpaceEfficientDenseModel.java | 93 +--- .../main/java/hivemall/model/SparseModel.java | 20 +- .../model/SynchronizedModelWrapper.java | 16 +- .../main/java/hivemall/model/WeightValue.java | 162 ++- .../hivemall/model/WeightValueWithClock.java| 167 ++- .../optimizer/DenseOptimizerFactory.java| 215 + .../java/hivemall/optimizer/EtaEstimator.java | 191 .../java/hivemall/optimizer/LossFunctions.java | 467 +++ .../main/java/hivemall/optimizer/Optimizer.java | 246 ++ 
.../java/hivemall/optimizer/Regularization.java | 99 .../optimizer/SparseOptimizerFactory.java | 171 +++ .../hivemall/regression/AROWRegressionUDTF.java | 2 +- .../java/hivemall/regression/AdaDeltaUDTF.java | 117 + .../java/hivemall/regression/AdaGradUDTF.java | 118 + .../regression/GeneralRegressionUDTF.java | 125 + .../java/hivemall/regression/LogressUDTF.java | 63 +-- .../PassiveAggressiveRegressionUDTF.java| 2 +- .../hivemall/regression/RegressionBaseUDTF.java | 14 +- .../java/hivemall/optimizer/OptimizerTest.java | 172 +++ .../java/hivemall/mix/server/MixServerTest.java | 14 +- resources/ddl/define-all-as-permanent.hive | 13 +- resources/ddl/define-all.hive | 12 +- 39 files changed, 2301 insertions(+), 1223 deletions(-) -- http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/f81948c5/core/src/main/java/hivemall/LearnerBaseUDTF.java -- diff --git a/core/src/main/java/hivemall/LearnerBaseUDTF.java b/core/src/main/java/hivemall/LearnerBaseUDTF.java index 4518cce..7fd5190 100644 --- a/core/src/main/java/hivemall/LearnerBaseUDTF.java +++ b/core/src/main/java/hivemall/LearnerBaseUDTF.java @@ -28,6 +28,9 @@ import hivemall.model.SparseModel; import hivemall.model.SynchronizedModelWrapper; import hivemall.model.WeightValue; import hivemall.model.WeightValue.WeightValueWithCovar; +import hivemall.optimizer.DenseOptimizerFactory; +import hivemall.optimizer.Optimizer; +import hivemall.optimizer.SparseOptimizerFactory; import hivemall.utils.datetime.StopWatch; import hivemall.utils.hadoop.HadoopUtils; import hivemall.utils.hadoop.HiveUtils; @@ -38,6 +41,7 @@ import java.io.BufferedReader; import java.io.File; import java.io.IOException; import java.util.List; +import java.util.Map; import javax.annotation.Nonnull; import javax.annotation.Nullable; @@ -195,6 +199,24 @@ public abstract class LearnerBaseUDTF extends UDTFWithOptions { return model; } +// If a model implements a optimizer, it must override this +protected Map getOptimzierOptions() { +return null; +} 
+ +protected Optimizer createOptimizer() { +assert(!useCovariance()); +final Map options = getOptimzierOptions(); +if(options != null) { +if (dense_model) { +return DenseOptimizerFactory.create(model_dims, options); +} else { +return SparseOptimizerFactory.create(model_dims, options); +} +} +return null; +} + protected MixClient configureMixClient(Stri
[46/50] [abbrv] incubator-hivemall git commit: Add feature selection gitbook (#386)
Add feature selection gitbook (#386) Project: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/commit/6549ef51 Tree: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/tree/6549ef51 Diff: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/diff/6549ef51 Branch: refs/heads/JIRA-22/pr-385 Commit: 6549ef5104883a9529dfd9fc52b2b24843076fbb Parents: e44a413 Author: amaya Authored: Wed Nov 23 21:16:10 2016 +0900 Committer: Makoto YUI Committed: Wed Nov 23 21:16:10 2016 +0900 -- docs/gitbook/SUMMARY.md | 2 + .../gitbook/ft_engineering/feature_selection.md | 151 +++ 2 files changed, 153 insertions(+) -- http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/6549ef51/docs/gitbook/SUMMARY.md -- diff --git a/docs/gitbook/SUMMARY.md b/docs/gitbook/SUMMARY.md index c333c98..33bb46c 100644 --- a/docs/gitbook/SUMMARY.md +++ b/docs/gitbook/SUMMARY.md @@ -61,6 +61,8 @@ * [Vectorize Features](ft_engineering/vectorizer.md) * [Quantify non-number features](ft_engineering/quantify.md) +* [Feature selection](ft_engineering/feature_selection.md) + ## Part IV - Evaluation * [Statistical evaluation of a prediction model](eval/stat_eval.md) http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/6549ef51/docs/gitbook/ft_engineering/feature_selection.md -- diff --git a/docs/gitbook/ft_engineering/feature_selection.md b/docs/gitbook/ft_engineering/feature_selection.md new file mode 100644 index 000..8b522c6 --- /dev/null +++ b/docs/gitbook/ft_engineering/feature_selection.md @@ -0,0 +1,151 @@ + + +Feature selection is the process which selects a subset consisting of influential features from miscellaneous ones. +It is an important technique to **enhance results**, **shorten training time** and **make features human-understandable**. 
+ +## Selecting methods supported by Hivemall +* Chi-square (Chi2) +* For non-negative data only +* Signal Noise Ratio (SNR) +* ~~Minimum Redundancy Maximum Relevance (mRMR)~~ +* Contributions are welcome! + +## Usage +1. Create importance list for feature selection +* chi2/SNR +2. Filter features +* Select top-k features based on importance list + + +## Example - Chi2 +``` sql +CREATE TABLE input ( + X array, -- features + Y array -- binarized label +); + +WITH stats AS ( + SELECT +-- [UDAF] transpose_and_dot(Y::array, X::array)::array> +transpose_and_dot(Y, X) AS observed, -- array>, shape = (n_classes, n_features) +array_sum(X) AS feature_count, -- n_features col vector, shape = (1, array) +array_avg(Y) AS class_prob -- n_class col vector, shape = (1, array) + FROM +input +), +test AS ( + SELECT +transpose_and_dot(class_prob, feature_count) AS expected -- array>, shape = (n_class, n_features) + FROM +stats +), +chi2 AS ( + SELECT +-- [UDF] chi2(observed::array>, expected::array>)::struct, array> +chi2(observed, expected) AS chi2s -- struct, array>, each shape = (1, n_features) + FROM +test JOIN stats +) +SELECT + -- [UDF] select_k_best(X::array, importance_list::array, k::int)::array + select_k_best(X, chi2s.chi2, ${k}) -- top-k feature selection based on chi2 score +FROM + input JOIN chi2; +``` + + +## Example - SNR +``` sql +CREATE TABLE input ( + X array, -- features + Y array -- binarized label +); + +WITH snr AS ( + -- [UDAF] snr(features::array, labels::array)::array + SELECT snr(X, Y) AS snr FROM input -- aggregated SNR as array, shape = (1, #features) +) +SELECT select_k_best(X, snr, ${k}) FROM input JOIN snr; +``` + + +## UDF details +### Common + [UDAF] `transpose_and_dot(X::array, Y::array)::array>` +# Input + +| array X | array Y | +| :-: | :-: | +| a row of matrix | a row of matrix | +# Output + +| array> dotted | +| :-: | +| `dot(X.T, Y)`, shape = (X.#cols, Y.#cols) | + [UDF] `select_k_best(X::array, importance_list::array, k::int)::array` +# Input 
+ +| array X | array importance list | int k | +| :-: | :-: | :-: | +| array | the larger, the more important | top-? | +# Output + +| array> k-best elements | +| :-: | +| top-k elements from X based on indices of importance list | + + Note +- The current implementation expects that **_ALL `importance_list` and `k` values are equal_**. This may be confusing. + - Future workaround: add an option indicating that a common `importance_list` and `k` are used + + +### Chi2 + [UDF] `chi2(observed::array>, expected::array>)::struct, array>` +# Input + +both `observed` and `expected`, shape = (#classes, #features) + +| array observed | array expected | +| :-: | :-: | +| observed features | expected features, `dot(class_
[11/50] [abbrv] incubator-hivemall git commit: add ddl definitions
add ddl definitions Project: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/commit/be1ea37a Tree: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/tree/be1ea37a Diff: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/diff/be1ea37a Branch: refs/heads/JIRA-22/pr-385 Commit: be1ea37a0f5048cde4284107c04e109f0f526b42 Parents: ad81b3a Author: amaya Authored: Tue Sep 20 18:00:49 2016 +0900 Committer: amaya Committed: Tue Sep 20 18:38:01 2016 +0900 -- resources/ddl/define-all-as-permanent.hive | 20 resources/ddl/define-all.hive | 20 resources/ddl/define-all.spark | 20 resources/ddl/define-udfs.td.hql | 4 4 files changed, 64 insertions(+) -- http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/be1ea37a/resources/ddl/define-all-as-permanent.hive -- diff --git a/resources/ddl/define-all-as-permanent.hive b/resources/ddl/define-all-as-permanent.hive index bab5a29..52b73a0 100644 --- a/resources/ddl/define-all-as-permanent.hive +++ b/resources/ddl/define-all-as-permanent.hive @@ -202,6 +202,13 @@ CREATE FUNCTION zscore as 'hivemall.ftvec.scaling.ZScoreUDF' USING JAR '${hivema DROP FUNCTION IF EXISTS l2_normalize; CREATE FUNCTION l2_normalize as 'hivemall.ftvec.scaling.L2NormalizationUDF' USING JAR '${hivemall_jar}'; +- +-- selection functions -- +- + +DROP FUNCTION IF EXISTS chi_square; +CREATE FUNCTION chi_square as 'hivemall.ftvec.selection.ChiSquareUDF' USING JAR '${hivemall_jar}'; + -- misc functions -- @@ -364,6 +371,9 @@ CREATE FUNCTION subarray_endwith as 'hivemall.tools.array.SubarrayEndWithUDF' US DROP FUNCTION IF EXISTS subarray_startwith; CREATE FUNCTION subarray_startwith as 'hivemall.tools.array.SubarrayStartWithUDF' USING JAR '${hivemall_jar}'; +DROP FUNCTION IF EXISTS subarray_by_indices; +CREATE FUNCTION subarray_by_indices as 'hivemall.tools.array.SubarrayByIndicesUDF' USING JAR '${hivemall_jar}'; + DROP FUNCTION IF EXISTS array_concat; CREATE FUNCTION 
array_concat as 'hivemall.tools.array.ArrayConcatUDF' USING JAR '${hivemall_jar}'; @@ -380,6 +390,9 @@ CREATE FUNCTION array_avg as 'hivemall.tools.array.ArrayAvgGenericUDAF' USING JA DROP FUNCTION IF EXISTS array_sum; CREATE FUNCTION array_sum as 'hivemall.tools.array.ArraySumUDAF' USING JAR '${hivemall_jar}'; +DROP FUNCTION array_top_k_indices; +CREATE FUNCTION array_top_k_indices as 'hivemall.tools.array.ArrayTopKIndicesUDF' USING JAR '${hivemall_jar}'; + DROP FUNCTION IF EXISTS to_string_array; CREATE FUNCTION to_string_array as 'hivemall.tools.array.ToStringArrayUDF' USING JAR '${hivemall_jar}'; @@ -436,6 +449,13 @@ DROP FUNCTION IF EXISTS sigmoid; CREATE FUNCTION sigmoid as 'hivemall.tools.math.SigmoidGenericUDF' USING JAR '${hivemall_jar}'; -- +-- Matrix functions -- +-- + +DROP FUNCTION IF EXISTS transpose_and_dot; +CREATE FUNCTION transpose_and_dot as 'hivemall.tools.matrix.TransposeAndDotUDAF' USING JAR '${hivemall_jar}'; + +-- -- mapred functions -- -- http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/be1ea37a/resources/ddl/define-all.hive -- diff --git a/resources/ddl/define-all.hive b/resources/ddl/define-all.hive index 315b4d2..a70ae0f 100644 --- a/resources/ddl/define-all.hive +++ b/resources/ddl/define-all.hive @@ -198,6 +198,13 @@ create temporary function zscore as 'hivemall.ftvec.scaling.ZScoreUDF'; drop temporary function l2_normalize; create temporary function l2_normalize as 'hivemall.ftvec.scaling.L2NormalizationUDF'; +- +-- selection functions -- +- + +drop temporary function chi_square; +create temporary function chi_square as 'hivemall.ftvec.selection.ChiSquareUDF'; + --- -- Feature engineering functions -- --- @@ -360,6 +367,9 @@ create temporary function subarray_endwith as 'hivemall.tools.array.SubarrayEndW drop temporary function subarray_startwith; create temporary function subarray_startwith as 'hivemall.tools.array.SubarrayStartWithUDF'; +drop temporary function subarray_by_indices; +create temporary function 
subarray_by_indices as 'hivemall.tools.array.SubarrayByIndicesUDF'; + drop temporary function array_concat; create temporary function array_concat as 'hivemall.tools.array.ArrayConcatUDF'; @@ -376,6 +386,9 @@ create temporary function array_avg as 'hivemall.tools.array.
[44/50] [abbrv] incubator-hivemall git commit: Merge branch 'feature/feature_selection' of https://github.com/amaya382/hivemall into feature_selection
Merge branch 'feature/feature_selection' of https://github.com/amaya382/hivemall into feature_selection # Conflicts: # core/src/main/java/hivemall/utils/hadoop/HiveUtils.java # core/src/main/java/hivemall/utils/math/StatsUtils.java # spark/spark-1.6/src/main/scala/org/apache/spark/sql/hive/GroupedDataEx.scala # spark/spark-1.6/src/test/scala/org/apache/spark/sql/hive/HivemallOpsSuite.scala # spark/spark-2.0/src/main/scala/org/apache/spark/sql/hive/HivemallGroupedDataset.scala # spark/spark-2.0/src/test/scala/org/apache/spark/sql/hive/HivemallOpsSuite.scala Project: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/commit/67ba9631 Tree: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/tree/67ba9631 Diff: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/diff/67ba9631 Branch: refs/heads/JIRA-22/pr-385 Commit: 67ba9631af3c231b7abd145134d17237b6aca0a5 Parents: 69496fa ce4a489 Author: myui Authored: Mon Nov 21 18:19:45 2016 +0900 Committer: myui Committed: Mon Nov 21 18:19:45 2016 +0900 -- .../hivemall/ftvec/selection/ChiSquareUDF.java | 155 .../ftvec/selection/SignalNoiseRatioUDAF.java | 349 +++ .../hivemall/tools/array/SelectKBestUDF.java| 143 .../tools/matrix/TransposeAndDotUDAF.java | 213 +++ .../java/hivemall/utils/hadoop/HiveUtils.java | 22 +- .../java/hivemall/utils/math/StatsUtils.java| 91 + .../ftvec/selection/ChiSquareUDFTest.java | 80 + .../selection/SignalNoiseRatioUDAFTest.java | 348 ++ .../tools/array/SelectKBeatUDFTest.java | 65 .../tools/matrix/TransposeAndDotUDAFTest.java | 58 +++ resources/ddl/define-all-as-permanent.hive | 20 ++ resources/ddl/define-all.hive | 20 ++ resources/ddl/define-all.spark | 20 ++ resources/ddl/define-udfs.td.hql| 4 + .../apache/spark/sql/hive/GroupedDataEx.scala | 21 ++ .../org/apache/spark/sql/hive/HivemallOps.scala | 18 + .../spark/sql/hive/HivemallOpsSuite.scala | 100 ++ .../spark/sql/hive/HivemallGroupedDataset.scala | 25 ++ 
.../org/apache/spark/sql/hive/HivemallOps.scala | 20 ++ .../spark/sql/hive/HivemallOpsSuite.scala | 103 ++ 20 files changed, 1873 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/67ba9631/core/src/main/java/hivemall/utils/hadoop/HiveUtils.java -- diff --cc core/src/main/java/hivemall/utils/hadoop/HiveUtils.java index d8b1aef,c752188..8188b7a --- a/core/src/main/java/hivemall/utils/hadoop/HiveUtils.java +++ b/core/src/main/java/hivemall/utils/hadoop/HiveUtils.java @@@ -242,10 -240,16 +242,20 @@@ public final class HiveUtils return category == Category.LIST; } +public static boolean isMapOI(@Nonnull final ObjectInspector oi) { +return oi.getCategory() == Category.MAP; +} + + public static boolean isNumberListOI(@Nonnull final ObjectInspector oi) { + return isListOI(oi) + && isNumberOI(((ListObjectInspector) oi).getListElementObjectInspector()); + } + + public static boolean isNumberListListOI(@Nonnull final ObjectInspector oi) { + return isListOI(oi) + && isNumberListOI(((ListObjectInspector) oi).getListElementObjectInspector()); + } + public static boolean isPrimitiveTypeInfo(@Nonnull TypeInfo typeInfo) { return typeInfo.getCategory() == ObjectInspector.Category.PRIMITIVE; } http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/67ba9631/core/src/main/java/hivemall/utils/math/StatsUtils.java -- http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/67ba9631/spark/spark-1.6/src/main/scala/org/apache/spark/sql/hive/GroupedDataEx.scala -- diff --cc spark/spark-1.6/src/main/scala/org/apache/spark/sql/hive/GroupedDataEx.scala index fd4da64,2482c62..8f78a7f --- a/spark/spark-1.6/src/main/scala/org/apache/spark/sql/hive/GroupedDataEx.scala +++ b/spark/spark-1.6/src/main/scala/org/apache/spark/sql/hive/GroupedDataEx.scala @@@ -267,13 -266,25 +267,34 @@@ final class GroupedDataEx protected[sql } /** + * @see hivemall.ftvec.trans.OnehotEncodingUDAF + */ + def onehot_encoding(features: String*): DataFrame = { +val 
udaf = HiveUDAFFunction( +new HiveFunctionWrapper("hivemall.ftvec.trans.OnehotEncodingUDAF"), +features.map(df.col(_).expr), +isUDAFBridgeRequired = false) ++ ++ /** +* @see hivemall.ftvec.selection.
[19/50] [abbrv] incubator-hivemall git commit: fix chi2
fix chi2 Project: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/commit/b8cf3968 Tree: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/tree/b8cf3968 Diff: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/diff/b8cf3968 Branch: refs/heads/JIRA-22/pr-385 Commit: b8cf39684496f2511e59294041d443b9438394a9 Parents: abbf549 Author: amaya Authored: Wed Sep 21 15:02:12 2016 +0900 Committer: amaya Committed: Wed Sep 21 16:23:42 2016 +0900 -- core/src/main/java/hivemall/ftvec/selection/ChiSquareUDF.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/b8cf3968/core/src/main/java/hivemall/ftvec/selection/ChiSquareUDF.java -- diff --git a/core/src/main/java/hivemall/ftvec/selection/ChiSquareUDF.java b/core/src/main/java/hivemall/ftvec/selection/ChiSquareUDF.java index 951aeeb..70f0316 100644 --- a/core/src/main/java/hivemall/ftvec/selection/ChiSquareUDF.java +++ b/core/src/main/java/hivemall/ftvec/selection/ChiSquareUDF.java @@ -102,7 +102,7 @@ public class ChiSquareUDF extends GenericUDF { // explode and transpose matrix for (int i = 0; i < nClasses; i++) { final Object observedObjRow = observedObj.get(i); -final Object expectedObjRow = observedObj.get(i); +final Object expectedObjRow = expectedObj.get(i); Preconditions.checkNotNull(observedObjRow); Preconditions.checkNotNull(expectedObjRow);
[06/50] [abbrv] incubator-hivemall git commit: add HiveUtils.isNumberListListOI
add HiveUtils.isNumberListListOI Project: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/commit/d0e97e6f Tree: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/tree/d0e97e6f Diff: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/diff/d0e97e6f Branch: refs/heads/JIRA-22/pr-385 Commit: d0e97e6ff71b2072ec5235cc3ac169162d59da59 Parents: d8f1005 Author: amaya Authored: Tue Sep 20 12:02:28 2016 +0900 Committer: amaya Committed: Tue Sep 20 12:02:28 2016 +0900 -- core/src/main/java/hivemall/utils/hadoop/HiveUtils.java | 4 1 file changed, 4 insertions(+) -- http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/d0e97e6f/core/src/main/java/hivemall/utils/hadoop/HiveUtils.java -- diff --git a/core/src/main/java/hivemall/utils/hadoop/HiveUtils.java b/core/src/main/java/hivemall/utils/hadoop/HiveUtils.java index 7e8ea7b..dcbf534 100644 --- a/core/src/main/java/hivemall/utils/hadoop/HiveUtils.java +++ b/core/src/main/java/hivemall/utils/hadoop/HiveUtils.java @@ -235,6 +235,10 @@ public final class HiveUtils { return isListOI(oi) && isNumberOI(((ListObjectInspector)oi).getListElementObjectInspector()); } +public static boolean isNumberListListOI(@Nonnull final ObjectInspector oi) { +return isListOI(oi) && isNumberListOI(((ListObjectInspector)oi).getListElementObjectInspector()); +} + public static boolean isPrimitiveTypeInfo(@Nonnull TypeInfo typeInfo) { return typeInfo.getCategory() == ObjectInspector.Category.PRIMITIVE; }
[incubator-hivemall] Git Push Summary
Repository: incubator-hivemall Updated Tags: refs/tags/v0.4.2-rc.2 [created] e1df0504d
[05/50] [abbrv] incubator-hivemall git commit: mod number format
mod number format Project: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/commit/d8f1005b Tree: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/tree/d8f1005b Diff: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/diff/d8f1005b Branch: refs/heads/JIRA-22/pr-385 Commit: d8f1005bb9fbf769b117290582bed18d7607a94a Parents: d3009be Author: amaya Authored: Tue Sep 20 12:01:46 2016 +0900 Committer: amaya Committed: Tue Sep 20 12:01:46 2016 +0900 -- .../hivemall/tools/matrix/TransposeAndDotUDAF.java| 2 +- .../src/main/java/hivemall/utils/math/StatsUtils.java | 14 +++--- 2 files changed, 8 insertions(+), 8 deletions(-) -- http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/d8f1005b/core/src/main/java/hivemall/tools/matrix/TransposeAndDotUDAF.java -- diff --git a/core/src/main/java/hivemall/tools/matrix/TransposeAndDotUDAF.java b/core/src/main/java/hivemall/tools/matrix/TransposeAndDotUDAF.java index 4fa5ce4..3dcbb93 100644 --- a/core/src/main/java/hivemall/tools/matrix/TransposeAndDotUDAF.java +++ b/core/src/main/java/hivemall/tools/matrix/TransposeAndDotUDAF.java @@ -81,7 +81,7 @@ public final class TransposeAndDotUDAF extends AbstractGenericUDAFResolver { public void reset() { if (aggMatrix != null) { for (double[] row : aggMatrix) { -Arrays.fill(row, 0.0); +Arrays.fill(row, 0.d); } } } http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/d8f1005b/core/src/main/java/hivemall/utils/math/StatsUtils.java -- diff --git a/core/src/main/java/hivemall/utils/math/StatsUtils.java b/core/src/main/java/hivemall/utils/math/StatsUtils.java index ffccea3..7633419 100644 --- a/core/src/main/java/hivemall/utils/math/StatsUtils.java +++ b/core/src/main/java/hivemall/utils/math/StatsUtils.java @@ -198,22 +198,22 @@ public final class StatsUtils { public static double chiSquare(@Nonnull final double[] expected, @Nonnull final double[] observed) { 
Preconditions.checkArgument(expected.length == observed.length); -double sumExpected = 0.0D; -double sumObserved = 0.0D; +double sumExpected = 0.d; +double sumObserved = 0.d; for (int ratio = 0; ratio < observed.length; ++ratio) { sumExpected += expected[ratio]; sumObserved += observed[ratio]; } -double var15 = 1.0D; +double var15 = 1.d; boolean rescale = false; -if (Math.abs(sumExpected - sumObserved) > 1.0E-5D) { +if (Math.abs(sumExpected - sumObserved) > 1.e-5) { var15 = sumObserved / sumExpected; rescale = true; } -double sumSq = 0.0D; +double sumSq = 0.d; for (int i = 0; i < observed.length; ++i) { double dev; @@ -235,7 +235,7 @@ public final class StatsUtils { * @return p-value */ public static double chiSquareTest(@Nonnull final double[] expected,@Nonnull final double[] observed) { -ChiSquaredDistribution distribution = new ChiSquaredDistribution(null, (double)expected.length - 1.0D); -return 1.0D - distribution.cumulativeProbability(chiSquare(expected, observed)); +ChiSquaredDistribution distribution = new ChiSquaredDistribution(null, (double)expected.length - 1.d); +return 1.d - distribution.cumulativeProbability(chiSquare(expected, observed)); } }
[02/50] [abbrv] incubator-hivemall git commit: add HiveUtils.asDoubleOI
add HiveUtils.asDoubleOI Project: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/commit/56adf2d4 Tree: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/tree/56adf2d4 Diff: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/diff/56adf2d4 Branch: refs/heads/JIRA-22/pr-385 Commit: 56adf2d4e8b2591c31b846b8980016d3dafdbacc Parents: 2dc176a Author: amaya Authored: Fri Sep 16 15:48:33 2016 +0900 Committer: amaya Committed: Fri Sep 16 15:48:33 2016 +0900 -- core/src/main/java/hivemall/utils/hadoop/HiveUtils.java | 9 + 1 file changed, 9 insertions(+) -- http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/56adf2d4/core/src/main/java/hivemall/utils/hadoop/HiveUtils.java -- diff --git a/core/src/main/java/hivemall/utils/hadoop/HiveUtils.java b/core/src/main/java/hivemall/utils/hadoop/HiveUtils.java index 32b60d0..7e8ea7b 100644 --- a/core/src/main/java/hivemall/utils/hadoop/HiveUtils.java +++ b/core/src/main/java/hivemall/utils/hadoop/HiveUtils.java @@ -57,6 +57,7 @@ import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils; import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.primitive.BinaryObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.primitive.BooleanObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.DoubleObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.primitive.IntObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.primitive.LongObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorUtils; @@ -675,6 +676,14 @@ public final class HiveUtils { return (LongObjectInspector) argOI; } +public static DoubleObjectInspector asDoubleOI(@Nonnull final ObjectInspector argOI) +throws UDFArgumentException { +if 
(!DOUBLE_TYPE_NAME.equals(argOI.getTypeName())) { +throw new UDFArgumentException("Argument type must be DOUBLE: " + argOI.getTypeName()); +} +return (DoubleObjectInspector) argOI; +} + public static PrimitiveObjectInspector asIntCompatibleOI(@Nonnull final ObjectInspector argOI) throws UDFArgumentTypeException { if (argOI.getCategory() != Category.PRIMITIVE) {
[15/50] [abbrv] incubator-hivemall git commit: change to select_k_best
change to select_k_best Project: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/commit/89c81aac Tree: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/tree/89c81aac Diff: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/diff/89c81aac Branch: refs/heads/JIRA-22/pr-385 Commit: 89c81aacf5b13f6e125723cb5c70574c10ae Parents: be1ea37 Author: amaya Authored: Wed Sep 21 10:56:59 2016 +0900 Committer: amaya Committed: Wed Sep 21 13:35:16 2016 +0900 -- .../tools/array/ArrayTopKIndicesUDF.java| 115 --- .../hivemall/tools/array/SelectKBestUDF.java| 143 +++ .../tools/array/SubarrayByIndicesUDF.java | 111 -- resources/ddl/define-all-as-permanent.hive | 9 +- resources/ddl/define-all.hive | 9 +- resources/ddl/define-all.spark | 7 +- resources/ddl/define-udfs.td.hql| 3 +- 7 files changed, 152 insertions(+), 245 deletions(-) -- http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/89c81aac/core/src/main/java/hivemall/tools/array/ArrayTopKIndicesUDF.java -- diff --git a/core/src/main/java/hivemall/tools/array/ArrayTopKIndicesUDF.java b/core/src/main/java/hivemall/tools/array/ArrayTopKIndicesUDF.java deleted file mode 100644 index f895f9b..000 --- a/core/src/main/java/hivemall/tools/array/ArrayTopKIndicesUDF.java +++ /dev/null @@ -1,115 +0,0 @@ -/* - * Hivemall: Hive scalable Machine Learning Library - * - * Copyright (C) 2016 Makoto YUI - * Copyright (C) 2013-2015 National Institute of Advanced Industrial Science and Technology (AIST) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package hivemall.tools.array; - -import hivemall.utils.hadoop.HiveUtils; -import hivemall.utils.lang.Preconditions; -import org.apache.hadoop.hive.ql.exec.Description; -import org.apache.hadoop.hive.ql.exec.UDFArgumentException; -import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException; -import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException; -import org.apache.hadoop.hive.ql.metadata.HiveException; -import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; -import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; -import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; -import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorUtils; -import org.apache.hadoop.io.IntWritable; - -import java.util.AbstractMap; -import java.util.ArrayList; -import java.util.Comparator; -import java.util.List; -import java.util.Map; - -@Description( -name = "array_top_k_indices", -value = "_FUNC_(array array, const int k) - Returns indices array of top-k as array") -public class ArrayTopKIndicesUDF extends GenericUDF { -private ListObjectInspector arrayOI; -private PrimitiveObjectInspector elementOI; -private PrimitiveObjectInspector kOI; - -@Override -public ObjectInspector initialize(ObjectInspector[] OIs) throws UDFArgumentException { 
-if (OIs.length != 2) { -throw new UDFArgumentLengthException("Specify two or three arguments."); -} - -if (!HiveUtils.isNumberListOI(OIs[0])) { -throw new UDFArgumentTypeException(0, -"Only array type argument is acceptable but " + OIs[0].getTypeName() -+ " was passed as `array`"); -} -if (!HiveUtils.isIntegerOI(OIs[1])) { -throw new UDFArgumentTypeException(1, "Only int type argument is acceptable but " -+ OIs[1].getTypeName() + " was passed as `k`"); -} - -arrayOI = HiveUtils.asListOI(OIs[0]); -elementOI = HiveUtils.asDoubleCompatibleOI(arrayOI.getListElementObjectInspector()); -kOI = HiveUtils.asIntegerOI(OIs[1]); - -return ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.writab
[incubator-hivemall] Git Push Summary
Repository: incubator-hivemall Updated Tags: refs/tags/v0.5-alpha.1 [created] 2a66cf620
[49/50] [abbrv] incubator-hivemall git commit: Merge branch 'feature/systemtest' of https://github.com/amaya382/hivemall into JIRA-22/pr-336
Merge branch 'feature/systemtest' of https://github.com/amaya382/hivemall into JIRA-22/pr-336 Project: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/commit/f8d152cb Tree: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/tree/f8d152cb Diff: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/diff/f8d152cb Branch: refs/heads/JIRA-22/pr-336 Commit: f8d152cba17f3f96a897d9eef5bb70722f4bc7c7 Parents: 72d6a62 798ec6a Author: myui Authored: Fri Dec 2 15:33:12 2016 +0900 Committer: myui Committed: Fri Dec 2 15:33:12 2016 +0900 -- pom.xml | 1 + systemtest/README.md| 211 +++ systemtest/pom.xml | 105 ++ .../java/com/klarna/hiverunner/Extractor.java | 33 ++ .../hivemall/systemtest/MsgpackConverter.java | 114 ++ .../exception/QueryExecutionException.java | 27 ++ .../systemtest/model/CreateTableHQ.java | 49 +++ .../hivemall/systemtest/model/DropTableHQ.java | 27 ++ .../main/java/hivemall/systemtest/model/HQ.java | 161 .../java/hivemall/systemtest/model/HQBase.java | 22 ++ .../hivemall/systemtest/model/InsertHQ.java | 47 +++ .../java/hivemall/systemtest/model/RawHQ.java | 30 ++ .../java/hivemall/systemtest/model/TableHQ.java | 30 ++ .../hivemall/systemtest/model/TableListHQ.java | 23 ++ .../model/UploadFileAsNewTableHQ.java | 35 ++ .../hivemall/systemtest/model/UploadFileHQ.java | 57 +++ .../model/UploadFileToExistingHQ.java | 28 ++ .../model/lazy/LazyMatchingResource.java| 63 .../systemtest/runner/HiveSystemTestRunner.java | 142 .../systemtest/runner/SystemTestCommonInfo.java | 46 +++ .../systemtest/runner/SystemTestRunner.java | 337 + .../systemtest/runner/SystemTestTeam.java | 183 ++ .../systemtest/runner/TDSystemTestRunner.java | 363 +++ .../main/java/hivemall/systemtest/utils/IO.java | 83 + .../resources/hivemall/hiverunner.properties| 6 + .../src/test/resources/hivemall/td.properties | 13 + 26 files changed, 2236 insertions(+) -- 
http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/f8d152cb/pom.xml --
[50/50] [abbrv] incubator-hivemall git commit: Merge branch 'AddOptimizers' of https://github.com/maropu/hivemall into JIRA-22/pr-285
Merge branch 'AddOptimizers' of https://github.com/maropu/hivemall into JIRA-22/pr-285 Project: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/commit/05766432 Tree: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/tree/05766432 Diff: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/diff/05766432 Branch: refs/heads/JIRA-22/pr-285 Commit: 05766432c45f89627e423245e5aec3ced6d0c100 Parents: 775ae4f 3620eb8 Author: myui Authored: Fri Dec 2 15:35:05 2016 +0900 Committer: myui Committed: Fri Dec 2 15:35:05 2016 +0900 -- .../src/main/java/hivemall/LearnerBaseUDTF.java | 55 +++ .../hivemall/classifier/AROWClassifierUDTF.java | 2 +- .../hivemall/classifier/AdaGradRDAUDTF.java | 6 +- .../classifier/BinaryOnlineClassifierUDTF.java | 13 + .../classifier/GeneralClassifierUDTF.java | 122 + .../classifier/PassiveAggressiveUDTF.java | 2 +- .../main/java/hivemall/common/EtaEstimator.java | 160 --- .../java/hivemall/common/LossFunctions.java | 467 --- .../java/hivemall/fm/FMHyperParameters.java | 2 +- .../hivemall/fm/FactorizationMachineModel.java | 2 +- .../hivemall/fm/FactorizationMachineUDTF.java | 8 +- .../fm/FieldAwareFactorizationMachineModel.java | 1 + .../hivemall/mf/BPRMatrixFactorizationUDTF.java | 2 +- .../hivemall/mf/MatrixFactorizationSGDUDTF.java | 2 +- .../main/java/hivemall/model/DenseModel.java| 5 + .../main/java/hivemall/model/IWeightValue.java | 16 +- .../main/java/hivemall/model/NewDenseModel.java | 293 .../model/NewSpaceEfficientDenseModel.java | 317 + .../java/hivemall/model/NewSparseModel.java | 197 .../java/hivemall/model/PredictionModel.java| 2 + .../model/SpaceEfficientDenseModel.java | 5 + .../main/java/hivemall/model/SparseModel.java | 5 + .../model/SynchronizedModelWrapper.java | 10 + .../main/java/hivemall/model/WeightValue.java | 162 ++- .../hivemall/model/WeightValueWithClock.java| 167 ++- .../optimizer/DenseOptimizerFactory.java| 215 + 
.../java/hivemall/optimizer/EtaEstimator.java | 191 .../java/hivemall/optimizer/LossFunctions.java | 467 +++ .../main/java/hivemall/optimizer/Optimizer.java | 246 ++ .../java/hivemall/optimizer/Regularization.java | 99 .../optimizer/SparseOptimizerFactory.java | 171 +++ .../hivemall/regression/AROWRegressionUDTF.java | 2 +- .../java/hivemall/regression/AdaDeltaUDTF.java | 5 +- .../java/hivemall/regression/AdaGradUDTF.java | 5 +- .../regression/GeneralRegressionUDTF.java | 126 + .../java/hivemall/regression/LogressUDTF.java | 10 +- .../PassiveAggressiveRegressionUDTF.java| 2 +- .../hivemall/regression/RegressionBaseUDTF.java | 26 +- .../NewSpaceEfficientNewDenseModelTest.java | 60 +++ .../model/SpaceEfficientDenseModelTest.java | 60 --- .../java/hivemall/optimizer/OptimizerTest.java | 172 +++ .../java/hivemall/mix/server/MixServerTest.java | 18 +- resources/ddl/define-all-as-permanent.hive | 13 +- resources/ddl/define-all.hive | 12 +- .../hivemall/mix/server/MixServerSuite.scala| 6 +- 45 files changed, 3195 insertions(+), 734 deletions(-) -- http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/05766432/core/src/main/java/hivemall/LearnerBaseUDTF.java -- http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/05766432/core/src/main/java/hivemall/classifier/AROWClassifierUDTF.java -- http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/05766432/core/src/main/java/hivemall/classifier/AdaGradRDAUDTF.java -- http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/05766432/core/src/main/java/hivemall/classifier/BinaryOnlineClassifierUDTF.java -- http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/05766432/core/src/main/java/hivemall/classifier/PassiveAggressiveUDTF.java -- http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/05766432/core/src/main/java/hivemall/fm/FMHyperParameters.java -- 
http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/05766432/core/src/main/java/hivemall/fm/FactorizationMachineModel.java -- http://git-wip-us.apach
[09/50] [abbrv] incubator-hivemall git commit: add subarray_by_indices
add subarray_by_indices Project: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/commit/1ab9b097 Tree: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/tree/1ab9b097 Diff: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/diff/1ab9b097 Branch: refs/heads/JIRA-22/pr-385 Commit: 1ab9b0974ca4203c00175469b7b75d5b65209547 Parents: e9d1a94 Author: amaya Authored: Tue Sep 20 16:56:15 2016 +0900 Committer: amaya Committed: Tue Sep 20 18:37:46 2016 +0900 -- .../tools/array/SubarrayByIndicesUDF.java | 93 1 file changed, 93 insertions(+) -- http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/1ab9b097/core/src/main/java/hivemall/tools/array/SubarrayByIndicesUDF.java -- diff --git a/core/src/main/java/hivemall/tools/array/SubarrayByIndicesUDF.java b/core/src/main/java/hivemall/tools/array/SubarrayByIndicesUDF.java new file mode 100644 index 000..f476589 --- /dev/null +++ b/core/src/main/java/hivemall/tools/array/SubarrayByIndicesUDF.java @@ -0,0 +1,93 @@ +package hivemall.tools.array; + + +import hivemall.utils.hadoop.HiveUtils; +import hivemall.utils.lang.Preconditions; +import org.apache.hadoop.hive.ql.exec.Description; +import org.apache.hadoop.hive.ql.exec.UDFArgumentException; +import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException; +import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; +import org.apache.hadoop.hive.serde2.io.DoubleWritable; +import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; +import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector; +import 
org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorUtils; + +import java.util.ArrayList; +import java.util.List; + +@Description(name = "subarray_by_indices", +value = "_FUNC_(array input, array indices)" + +" - Returns subarray selected by given indices as array") +public class SubarrayByIndicesUDF extends GenericUDF { +private ListObjectInspector inputOI; +private PrimitiveObjectInspector elementOI; +private ListObjectInspector indicesOI; +private PrimitiveObjectInspector indexOI; + +@Override +public ObjectInspector initialize(ObjectInspector[] OIs) throws UDFArgumentException { +if (OIs.length != 2) { +throw new UDFArgumentLengthException("Specify two arguments."); +} + +if (!HiveUtils.isListOI(OIs[0])) { +throw new UDFArgumentTypeException(0, "Only array type argument is acceptable but " ++ OIs[0].getTypeName() + " was passed as `input`"); +} +if (!HiveUtils.isListOI(OIs[1]) +|| !HiveUtils.isIntegerOI(((ListObjectInspector) OIs[1]).getListElementObjectInspector())) { +throw new UDFArgumentTypeException(0, "Only array type argument is acceptable but " ++ OIs[0].getTypeName() + " was passed as `indices`"); +} + +inputOI = HiveUtils.asListOI(OIs[0]); +elementOI = HiveUtils.asDoubleCompatibleOI(inputOI.getListElementObjectInspector()); +indicesOI = HiveUtils.asListOI(OIs[1]); +indexOI = HiveUtils.asIntegerOI(indicesOI.getListElementObjectInspector()); + +return ObjectInspectorFactory.getStandardListObjectInspector( +PrimitiveObjectInspectorFactory.writableDoubleObjectInspector); +} + +@Override +public Object evaluate(GenericUDF.DeferredObject[] dObj) throws HiveException { +final double[] input = HiveUtils.asDoubleArray(dObj[0].get(), inputOI, elementOI); +final List indices = indicesOI.getList(dObj[1].get()); + +Preconditions.checkNotNull(input); +Preconditions.checkNotNull(indices); + +List result = new ArrayList(); +for (Object indexObj 
: indices) { +int index = PrimitiveObjectInspectorUtils.getInt(indexObj, indexOI); +if (index > input.length - 1) { +throw new ArrayIndexOutOfBoundsException(index); +} + +result.add(new DoubleWritable(input[index])); +} + +return result; +} + +@Override +public String getDisplayString(String[] children) { +StringBuilder sb = new StringBuilder(); +sb.append("subarray_by_indices"); +sb.append("("); +
[incubator-hivemall] Git Push Summary
Repository: incubator-hivemall Updated Tags: refs/tags/v0.5-alpha.1 [deleted] 2a66cf620
[incubator-hivemall] Git Push Summary
Repository: incubator-hivemall Updated Tags: refs/tags/v0.4.2-rc.2 [deleted] e1df0504d