Repository: incubator-hivemall Updated Branches: refs/heads/master e9c66f0a1 -> 89c7538a3
Close #99: [HIVEMALL-116][SPARK][DOC] Add documentation about SQL in Spark Project: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/commit/89c7538a Tree: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/tree/89c7538a Diff: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/diff/89c7538a Branch: refs/heads/master Commit: 89c7538a3338633fddb8c34500ab61b15f3f95de Parents: e9c66f0 Author: Takeshi Yamamuro <[email protected]> Authored: Wed Jul 12 09:14:28 2017 +0900 Committer: Takeshi Yamamuro <[email protected]> Committed: Wed Jul 12 09:14:28 2017 +0900 ---------------------------------------------------------------------- docs/gitbook/SUMMARY.md | 2 + docs/gitbook/spark/binaryclass/a9a_sql.md | 163 ++++++++++++++++++++++++ docs/gitbook/spark/regression/e2006_sql.md | 155 ++++++++++++++++++++++ 3 files changed, 320 insertions(+) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/89c7538a/docs/gitbook/SUMMARY.md ---------------------------------------------------------------------- diff --git a/docs/gitbook/SUMMARY.md b/docs/gitbook/SUMMARY.md index 539cea1..f228cfe 100644 --- a/docs/gitbook/SUMMARY.md +++ b/docs/gitbook/SUMMARY.md @@ -178,9 +178,11 @@ * [Binary Classification](spark/binaryclass/index.md) * [a9a Tutorial for DataFrame](spark/binaryclass/a9a_df.md) + * [a9a Tutorial for SQL](spark/binaryclass/a9a_sql.md) * [Regression](spark/binaryclass/index.md) * [E2006-tfidf regression Tutorial for DataFrame](spark/regression/e2006_df.md) + * [E2006-tfidf regression Tutorial for SQL](spark/regression/e2006_sql.md) * [Generic features](spark/misc/misc.md) * [Top-k Join processing](spark/misc/topk_join.md) http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/89c7538a/docs/gitbook/spark/binaryclass/a9a_sql.md ---------------------------------------------------------------------- diff --git a/docs/gitbook/spark/binaryclass/a9a_sql.md b/docs/gitbook/spark/binaryclass/a9a_sql.md new file mode 100644 index 0000000..06734d9 --- /dev/null +++ b/docs/gitbook/spark/binaryclass/a9a_sql.md @@ -0,0 +1,163 @@ +<!-- + Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. +--> + +a9a +=== +http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html#a9a + +Data preparation +================ + +```sh +$ wget http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/a9a +$ wget http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/a9a.t +``` + +```scala +scala> :paste +park.read.format("libsvm").load("a9a") + .select($"label", to_hivemall_features($"features").as("features")) + .createOrReplaceTempView("rawTrainTable") + +val (max, min) = sql("SELECT MAX(label), MIN(label) FROM rawTrainTable").collect.map { + case Row(max: Double, min: Double) => (max, min) +}.head + +// `label` must be [0.0, 1.0] +sql(s""" + CREATE OR REPLACE TEMPORARY VIEW trainTable AS + SELECT rescale(label, $min, $max) AS label, features + FROM rawTrainTable +""") + +scala> trainDf.printSchema +root + |-- label: float (nullable = true) + |-- features: vector (nullable = true) + +scala> :paste +spark.read.format("libsvm").load("a9a.t") + .select($"label", to_hivemall_features($"features").as("features")) + .createOrReplaceTempView("rawTestTable") + +sql(s""" + CREATE OR REPLACE TEMPORARY VIEW testTable AS + SELECT + rowid() AS rowid, + rescale(label, $min, $max) AS target, + features + FROM + rawTestTable +""") + +// Caches data to fix row IDs +sql("CACHE TABLE testTable") + +sql(""" + CREATE OR REPLACE TEMPORARY VIEW testTable_exploded AS + SELECT + rowid, + target, + extract_feature(ft) AS feature, + extract_weight(ft) AS value + FROM ( + SELECT + rowid, + target, + explode(features) AS ft + FROM + testTable + ) +""") + +scala> testDf.printSchema +root + |-- rowid: string (nullable = true) + |-- target: float (nullable = true) + |-- feature: string (nullable = true) + |-- value: double (nullable = true) +``` + +Tutorials +================ + +[Logistic Regression] +--- + +#Training + +```scala +scala> :paste +sql(""" + CREATE OR REPLACE TEMPORARY VIEW modelTable AS + SELECT + feature, AVG(weight) AS weight + FROM ( + SELECT + train_logistic_regr(add_bias(features), label) AS (feature, weight) + FROM + trainTable + ) + GROUP BY + feature +""") +``` + +#Test + +```scala +scala> :paste +sql(""" + CREATE OR REPLACE TEMPORARY VIEW predicted AS + SELECT + rowid, + CASE + WHEN sigmoid(sum(weight * value)) > 0.50 THEN 1.0 + ELSE 0.0 + END AS predicted + FROM + testTable_exploded t LEFT OUTER JOIN modelTable m + ON t.feature = m.feature + GROUP BY + rowid +""") +``` + +#Evaluation + +```scala +val num_test_instances = spark.table("testTable").count + +sql(s""" + SELECT + count(1) / $num_test_instances AS eval + FROM + predicted p INNER JOIN testTable t + ON p.rowid = t.rowid + WHERE + p.predicted = t.target +""") + ++------------------+ +| eval| ++------------------+ +|0.8327921286841418| ++------------------+ +``` + http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/89c7538a/docs/gitbook/spark/regression/e2006_sql.md ---------------------------------------------------------------------- diff --git a/docs/gitbook/spark/regression/e2006_sql.md b/docs/gitbook/spark/regression/e2006_sql.md new file mode 100644 index 0000000..48477d1 --- /dev/null +++ b/docs/gitbook/spark/regression/e2006_sql.md @@ -0,0 +1,155 @@ +<!-- + Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. +--> + +E2006 +=== +http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/regression.html#E2006-tfidf + +Data preparation +================ + +```sh +$ wget http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/regression/E2006.train.bz2 +$ wget http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/regression/E2006.test.bz2 +``` + +```scala +scala> :paste +spark.read.format("libsvm").load("E2006.train.bz2") + .select($"label", to_hivemall_features($"features").as("features")) + .createOrReplaceTempView("rawTrainTable") + +val (max, min) = sql("SELECT MAX(label), MIN(label) FROM rawTrainTable").collect.map { + case Row(max: Double, min: Double) => (max, min) +}.head + +// `label` must be [0.0, 1.0] +sql(s""" + CREATE OR REPLACE TEMPORARY VIEW trainTable AS + SELECT rescale(label, $min, $max) AS label, features + FROM rawTrainTable +""") + +scala> trainDf.printSchema +root + |-- label: float (nullable = true) + |-- features: vector (nullable = true) + +scala> :paste +spark.read.format("libsvm").load("E2006.test.bz2") + .select($"label", to_hivemall_features($"features").as("features")) + .createOrReplaceTempView("rawTestTable") + +sql(s""" + CREATE OR REPLACE TEMPORARY VIEW testTable AS + SELECT + rowid() AS rowid, + rescale(label, $min, $max) AS target, + features + FROM + rawTestTable +""") + +// Caches data to fix row IDs +sql("CACHE TABLE testTable") + +sql(""" + CREATE OR REPLACE TEMPORARY VIEW testTable_exploded AS + SELECT + rowid, + target, + extract_feature(ft) AS feature, + extract_weight(ft) AS value + FROM ( + SELECT + rowid, + target, + explode(features) AS ft + FROM + testTable +""") + +scala> df.printSchema +root + |-- rowid: string (nullable = true) + |-- target: float (nullable = true) + |-- feature: string (nullable = true) + |-- value: double (nullable = true) +``` + +Tutorials +================ + +[AROWe2] +--- + +#Training + +```scala +scala> :paste +sql(""" + CREATE OR REPLACE TEMPORARY VIEW modelTable AS + SELECT + feature, AVG(weight) AS weight + FROM ( + SELECT + train_arowe2_regr(add_bias(features), label) AS (feature, weight) + FROM + trainTable + ) + GROUP BY + feature +""") +``` + +#Test + +```scala +scala> :paste +sql(""" + CREATE OR REPLACE TEMPORARY VIEW predicted AS + SELECT + rowid, sum(weight * value) AS predicted + FROM + testTable_exploded t LEFT OUTER JOIN modelTable m + ON t.feature = m.feature + GROUP BY + rowid +""") +``` + +#Evaluation + +```scala +scala> :paste +sql(s""" + SELECT + AVG(target), AVG(predicted) + FROM + predicted p INNER JOIN testTable t + ON p.rowid = t.rowid +""") + ++------------------+------------------+ +| avg(target)| avg(predicted)| ++------------------+------------------+ +|0.5489154884487879|0.6030108853227014| ++------------------+------------------+ +``` +
