Remove spark tutorials for now (#383)
Project: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/commit/f9f9799d Tree: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/tree/f9f9799d Diff: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/diff/f9f9799d Branch: refs/heads/master Commit: f9f9799d7b3499624d3c231e71d5dab2eb97b447 Parents: da55b54 Author: Takeshi Yamamuro <[email protected]> Authored: Mon Nov 21 13:52:09 2016 +0900 Committer: Makoto YUI <[email protected]> Committed: Mon Nov 21 13:52:09 2016 +0900 ---------------------------------------------------------------------- .../tutorials/binary_classification.md | 71 ---------------- .../tutorials/multiclass_classification.md | 77 ------------------ spark/spark-1.6/tutorials/randomforest.md | 86 -------------------- spark/spark-1.6/tutorials/regression.md | 71 ---------------- .../tutorials/binary_classification.md | 71 ---------------- .../tutorials/multiclass_classification.md | 77 ------------------ spark/spark-2.0/tutorials/randomforest.md | 86 -------------------- spark/spark-2.0/tutorials/regression.md | 71 ---------------- 8 files changed, 610 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/f9f9799d/spark/spark-1.6/tutorials/binary_classification.md ---------------------------------------------------------------------- diff --git a/spark/spark-1.6/tutorials/binary_classification.md b/spark/spark-1.6/tutorials/binary_classification.md deleted file mode 100644 index 22b1c29..0000000 --- a/spark/spark-1.6/tutorials/binary_classification.md +++ /dev/null @@ -1,71 +0,0 @@ -This tutorial uses [9a binary classification](https://github.com/myui/hivemall/wiki#a9a-binary-classification) as a reference. - -Data preparation --------------------- -``` -// Fetch training data -# wget http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/a9a -# wget http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/a9a.t - -// Fetch a script to normalize the data -# wget https://raw.githubusercontent.com/maropu/hivemall-spark/master/scripts/misc/conv.awk -# awk -f conv.awk a9a | sed -e "s/+1/1/" | sed -e "s/-1/0/" > a9a.train -# awk -f conv.awk a9a.t | sed -e "s/+1/1/" | sed -e "s/-1/0/" > a9a.test - -// Fetch an initialization script for hivemall-spark -# wget https://raw.githubusercontent.com/maropu/hivemall-spark/master/scripts/ddl/define-dfs.sh - -// Invoke a spark-shell with hivemall-spark -# bin/spark-shell --packages maropu:hivemall-spark:0.0.6 --master=local-cluster[2,1,1024] --conf spark.executor.memory=1024 - -scala> :load define-dfs.sh - -scala> :paste - -// Load the training data as a RDD -val trainRdd = sc.textFile("a9a.train") - .map(HmLabeledPoint.parse) - -// Create the DataFrame that has exactly 2 partitions and -// amplify the data by 3 times. -val trainDf = sqlContext.createDataFrame(trainRdd) - .coalesce(2).part_amplify(3) - -// Load the test data as a RDD -val testRdd = sc.textFile("a9a.test") - .map(HmLabeledPoint.parse) - -// Transform into a DataFrame and transform features -// into a Spark Vector type. -val testDf = sqlContext.createDataFrame(testRdd) - .select($"label".as("target"), ft2vec($"features").as("features")) -``` - -Training (Logistic Regression) --------------------- -``` -// Make a model from the training data -val model = trainDf - .train_logregr(add_bias($"features"), $"label", "-total_steps 32561") - .groupby("feature").agg("weight" -> "avg") - .as("feature", "weight") - -val modelUdf = HivemallUtils - .funcModel(model) -``` - -Test --------------------- -``` -// Do prediction -val predict = testDf - .select($"target", sigmoid(modelUdf($"features")).as("prob")) - .select($"target", when($"prob" > 0.50, 1.0).otherwise(0.0).as("predict"), $"prob") - .cache -``` - -Evaluation --------------------- -``` -(predict.where($"target" === $"predict").count + 0.0) / predict.count -``` http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/f9f9799d/spark/spark-1.6/tutorials/multiclass_classification.md ---------------------------------------------------------------------- diff --git a/spark/spark-1.6/tutorials/multiclass_classification.md b/spark/spark-1.6/tutorials/multiclass_classification.md deleted file mode 100644 index 74cd3bc..0000000 --- a/spark/spark-1.6/tutorials/multiclass_classification.md +++ /dev/null @@ -1,77 +0,0 @@ -This tutorial uses [news20 multiclass classification](https://github.com/myui/hivemall/wiki#news20-multiclass-classification) as a reference. - -Data preparation --------------------- -``` -// Fetch training and test data -# wget http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multiclass/news20.scale.bz2 -# wget http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multiclass/news20.t.scale.bz2 - -// Fetch a script to normalize the data -# wget https://raw.githubusercontent.com/maropu/hivemall-spark/master/scripts/misc/conv.awk -# bunzip2 -c news20.scale.bz2 | awk -f conv.awk > news20.train -# bunzip2 -c news20.t.scale.bz2 | awk -f conv.awk > news20.test - -// Fetch an initialization script for hivemall-spark -# wget https://raw.githubusercontent.com/maropu/hivemall-spark/master/scripts/ddl/define-dfs.sh - -// Invoke a spark-shell with hivemall-spark -# bin/spark-shell --packages maropu:hivemall-spark:0.0.6 --master=local-cluster[2,1,1024] --conf spark.executor.memory=1024 - -scala> :load define-dfs.sh - -scala> :paste - -// Load the training data as a RDD -val trainRdd = sc.textFile("news20.train") - .map(HmLabeledPoint.parse) - -// Create the DataFrame that has exactly 2 partitions and -// amplify the data by 3 times. -val trainDf = sqlContext.createDataFrame(trainRdd) - .coalesce(2).part_amplify(3) - -// Load the test data as a RDD -val testRdd = sc.textFile("news20.test") - .map(HmLabeledPoint.parse) - -// Transform into a DataFrame and transform features -// into a Spark Vector type. -val testDf = sqlContext.createDataFrame(testRdd) - .select(rowid(), $"label".cast(IntegerType).as("target"), $"features") - .cache - -val testDf_exploded = testDf.explode_array($"features") - .select($"rowid", $"target", extract_feature($"feature"), extract_weight($"feature")) -``` - -Training (CW) --------------------- -``` -// Make a model from the training data -val model = trainDf - .train_multiclass_cw(add_bias($"features"), $"label".cast(IntegerType)) - .groupby("label", "feature").argmin_kld("weight", "conv") - .as("label", "feature", "weight") -``` - -Test --------------------- -``` -// Do prediction -val predict = testDf_exploded - .join(model, testDf_exploded("feature") === model("feature"), "LEFT_OUTER") - .select($"rowid", $"label".cast(StringType).as("label"), ($"weight" * $"value").as("value")) - .groupby("rowid", "label").sum("value") - .groupby("rowid").maxrow("SUM(value)", "label") - .as("rowid", "r") - .select($"rowid", $"r.col0".as("score"), $"r.col1".as("predict")) - .cache -``` - -Evaluation --------------------- -``` -val joinPredicate = (testDf("rowid") === predict("rowid")).and(testDf("target") === predict("predict")) -(testDf.join(predict, joinPredicate, "INNER").count + 0.0) / testDf.count -``` http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/f9f9799d/spark/spark-1.6/tutorials/randomforest.md ---------------------------------------------------------------------- diff --git a/spark/spark-1.6/tutorials/randomforest.md b/spark/spark-1.6/tutorials/randomforest.md deleted file mode 100644 index 350e19a..0000000 --- a/spark/spark-1.6/tutorials/randomforest.md +++ /dev/null @@ -1,86 +0,0 @@ -This tutorial uses [Kaggle Titanic binary classification](https://github.com/myui/hivemall/wiki/Kaggle-Titanic-binary-classification-using-Random-Forest) as a reference. - -Data preparation --------------------- -``` -// Fetch training and test data in Kaggle(https://www.kaggle.com/c/titanic/data), train.csv and test.csv - -// Fetch an initialization script for hivemall-spark -# wget https://raw.githubusercontent.com/maropu/hivemall-spark/master/scripts/ddl/define-dfs.sh - -// Invoke a spark-shell with spark-csv and hivemall-spark -# bin/spark-shell --packages com.databricks:spark-csv_2.10:1.4.0,maropu:hivemall-spark:0.0.6 - -scala> :load define-dfs.sh - -scala> :paste - -// Load the training data as a DataFrame -val trainCsvDf = sqlContext - .read - .format("com.databricks.spark.csv") - .option("header", "true") - .option("inferSchema", "true") - .load("train.csv") - .cache // Cached for second use - -val trainQuantifiedDf = trainCsvDf - .quantify(true.as("output") +: trainCsvDf.cols: _*) - // Rename output columns for readability - .as("passengerid", "survived", "pclass", "name", "sex", "age", "sibsp", "parch", "ticket", "fare", "cabin", "embarked") - .sort($"passengerid".asc) - -val trainDf = trainQuantifiedDf - .select( - $"passengerid", - array(trainQuantifiedDf.cols.drop(2): _*).as("features"), - $"survived" - ) - -// Load the test data as a DataFrame -val testCsvDf = sqlContext - .read - .format("com.databricks.spark.csv") - .option("header", "true") - .option("inferSchema", "true") - .load("test.csv") - -val testQuantifiedDf = testCsvDf - .select(Seq(1.as("train_first"), true.as("output"), $"PassengerId") ++ testCsvDf.cols.drop(1): _*) - .unionAll( - trainCsvDf.select(Seq(0.as("train_first"), false.as("output"), $"PassengerId") ++ trainCsvDf.cols.drop(2): _*) - ) - .sort($"train_first".asc, $"PassengerId".asc) - .quantify($"output" +: testCsvDf.cols: _*) - // Rename output columns for readability - .as("passengerid", "pclass", "name", "sex", "age", "sibsp", "parch", "ticket", "fare", "cabin", "embarked") - -val testDf = testQuantifiedDf - .select($"passengerid", array(testQuantifiedDf.cols.drop(1): _*).as("features")) -``` - -Training --------------------- -``` -// Make a model from the training data -val model = trainDf - .coalesce(4) - .train_randomforest_classifier($"features", $"survived", "-trees 400") -``` - -Test --------------------- -``` -// Do prediction -model - .coalesce(4) - .join(testDf) - .select( - testDf("passengerid"), - tree_predict(model("model_id"), model("model_type"), model("pred_model"), testDf("features"), true).as("predicted") - ) - .groupby($"passengerid").rf_ensemble("predicted") - .as("passengerid", "predicted") - .select($"passengerid", $"predicted.label") - .show -``` http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/f9f9799d/spark/spark-1.6/tutorials/regression.md ---------------------------------------------------------------------- diff --git a/spark/spark-1.6/tutorials/regression.md b/spark/spark-1.6/tutorials/regression.md deleted file mode 100644 index 5856373..0000000 --- a/spark/spark-1.6/tutorials/regression.md +++ /dev/null @@ -1,71 +0,0 @@ -This tutorial uses [E2006 tfidf regression](https://github.com/myui/hivemall/wiki#e2006-tfidf-regression) as a reference. - -Data preparation --------------------- -``` -// Fetch training and test data -# wget http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/regression/E2006.train.bz2 -# wget http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/regression/E2006.test.bz2 - -// Fetch a script to normalize the data -# wget https://raw.githubusercontent.com/maropu/hivemall-spark/master/scripts/misc/conv.awk -# bunzip2 -c E2006.train.bz2 | awk -f conv.awk > E2006.train.lp -# bunzip2 -c E2006.test.bz2 | awk -f conv.awk > E2006.test.lp - -// Fetch an initialization script for hivemall-spark -# wget https://raw.githubusercontent.com/maropu/hivemall-spark/master/scripts/ddl/define-dfs.sh - -// Invoke a spark-shell with hivemall-spark -# bin/spark-shell --packages maropu:hivemall-spark:0.0.6 --master=local-cluster[2,1,1024] --conf spark.executor.memory=1024 - -scala> :load define-dfs.sh - -scala> :paste - -// Load the training data as a RDD -val trainRdd = sc.textFile("E2006.train.lp") - .map(HmLabeledPoint.parse) - -// Create the DataFrame that has exactly 2 partitions and -// amplify the data by 3 times. -val trainDf = sqlContext.createDataFrame(trainRdd) - .coalesce(2).part_amplify(3) - -// Load the test data as a RDD -val testRdd = sc.textFile("E2006.test.lp") - .map(HmLabeledPoint.parse) - -// Transform into a DataFrame and transform features -// into a Spark Vector type. -val testDf = sqlContext.createDataFrame(testRdd) - .select($"label".as("target"), ft2vec($"features").as("features")) -``` - -Training (PA1) --------------------- -``` -// Make a model from the training data -val model = trainDf - .train_pa1_regr(add_bias($"features"), $"label") - .groupby("feature").agg("weight" -> "avg") - .as("feature", "weight") - -val modelUdf = HivemallUtils - .funcModel(model) -``` - -Test --------------------- -``` -// Do prediction -val predict = testDf - .select($"target", modelUdf($"features").as("predicted")) -``` - -Evaluation --------------------- -``` -predict - .groupBy().agg(Map("target"->"avg", "predicted"->"avg")) - .show() -``` http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/f9f9799d/spark/spark-2.0/tutorials/binary_classification.md ---------------------------------------------------------------------- diff --git a/spark/spark-2.0/tutorials/binary_classification.md b/spark/spark-2.0/tutorials/binary_classification.md deleted file mode 100644 index 22b1c29..0000000 --- a/spark/spark-2.0/tutorials/binary_classification.md +++ /dev/null @@ -1,71 +0,0 @@ -This tutorial uses [9a binary classification](https://github.com/myui/hivemall/wiki#a9a-binary-classification) as a reference. - -Data preparation --------------------- -``` -// Fetch training data -# wget http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/a9a -# wget http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/a9a.t - -// Fetch a script to normalize the data -# wget https://raw.githubusercontent.com/maropu/hivemall-spark/master/scripts/misc/conv.awk -# awk -f conv.awk a9a | sed -e "s/+1/1/" | sed -e "s/-1/0/" > a9a.train -# awk -f conv.awk a9a.t | sed -e "s/+1/1/" | sed -e "s/-1/0/" > a9a.test - -// Fetch an initialization script for hivemall-spark -# wget https://raw.githubusercontent.com/maropu/hivemall-spark/master/scripts/ddl/define-dfs.sh - -// Invoke a spark-shell with hivemall-spark -# bin/spark-shell --packages maropu:hivemall-spark:0.0.6 --master=local-cluster[2,1,1024] --conf spark.executor.memory=1024 - -scala> :load define-dfs.sh - -scala> :paste - -// Load the training data as a RDD -val trainRdd = sc.textFile("a9a.train") - .map(HmLabeledPoint.parse) - -// Create the DataFrame that has exactly 2 partitions and -// amplify the data by 3 times. -val trainDf = sqlContext.createDataFrame(trainRdd) - .coalesce(2).part_amplify(3) - -// Load the test data as a RDD -val testRdd = sc.textFile("a9a.test") - .map(HmLabeledPoint.parse) - -// Transform into a DataFrame and transform features -// into a Spark Vector type. -val testDf = sqlContext.createDataFrame(testRdd) - .select($"label".as("target"), ft2vec($"features").as("features")) -``` - -Training (Logistic Regression) --------------------- -``` -// Make a model from the training data -val model = trainDf - .train_logregr(add_bias($"features"), $"label", "-total_steps 32561") - .groupby("feature").agg("weight" -> "avg") - .as("feature", "weight") - -val modelUdf = HivemallUtils - .funcModel(model) -``` - -Test --------------------- -``` -// Do prediction -val predict = testDf - .select($"target", sigmoid(modelUdf($"features")).as("prob")) - .select($"target", when($"prob" > 0.50, 1.0).otherwise(0.0).as("predict"), $"prob") - .cache -``` - -Evaluation --------------------- -``` -(predict.where($"target" === $"predict").count + 0.0) / predict.count -``` http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/f9f9799d/spark/spark-2.0/tutorials/multiclass_classification.md ---------------------------------------------------------------------- diff --git a/spark/spark-2.0/tutorials/multiclass_classification.md b/spark/spark-2.0/tutorials/multiclass_classification.md deleted file mode 100644 index 74cd3bc..0000000 --- a/spark/spark-2.0/tutorials/multiclass_classification.md +++ /dev/null @@ -1,77 +0,0 @@ -This tutorial uses [news20 multiclass classification](https://github.com/myui/hivemall/wiki#news20-multiclass-classification) as a reference. - -Data preparation --------------------- -``` -// Fetch training and test data -# wget http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multiclass/news20.scale.bz2 -# wget http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multiclass/news20.t.scale.bz2 - -// Fetch a script to normalize the data -# wget https://raw.githubusercontent.com/maropu/hivemall-spark/master/scripts/misc/conv.awk -# bunzip2 -c news20.scale.bz2 | awk -f conv.awk > news20.train -# bunzip2 -c news20.t.scale.bz2 | awk -f conv.awk > news20.test - -// Fetch an initialization script for hivemall-spark -# wget https://raw.githubusercontent.com/maropu/hivemall-spark/master/scripts/ddl/define-dfs.sh - -// Invoke a spark-shell with hivemall-spark -# bin/spark-shell --packages maropu:hivemall-spark:0.0.6 --master=local-cluster[2,1,1024] --conf spark.executor.memory=1024 - -scala> :load define-dfs.sh - -scala> :paste - -// Load the training data as a RDD -val trainRdd = sc.textFile("news20.train") - .map(HmLabeledPoint.parse) - -// Create the DataFrame that has exactly 2 partitions and -// amplify the data by 3 times. -val trainDf = sqlContext.createDataFrame(trainRdd) - .coalesce(2).part_amplify(3) - -// Load the test data as a RDD -val testRdd = sc.textFile("news20.test") - .map(HmLabeledPoint.parse) - -// Transform into a DataFrame and transform features -// into a Spark Vector type. -val testDf = sqlContext.createDataFrame(testRdd) - .select(rowid(), $"label".cast(IntegerType).as("target"), $"features") - .cache - -val testDf_exploded = testDf.explode_array($"features") - .select($"rowid", $"target", extract_feature($"feature"), extract_weight($"feature")) -``` - -Training (CW) --------------------- -``` -// Make a model from the training data -val model = trainDf - .train_multiclass_cw(add_bias($"features"), $"label".cast(IntegerType)) - .groupby("label", "feature").argmin_kld("weight", "conv") - .as("label", "feature", "weight") -``` - -Test --------------------- -``` -// Do prediction -val predict = testDf_exploded - .join(model, testDf_exploded("feature") === model("feature"), "LEFT_OUTER") - .select($"rowid", $"label".cast(StringType).as("label"), ($"weight" * $"value").as("value")) - .groupby("rowid", "label").sum("value") - .groupby("rowid").maxrow("SUM(value)", "label") - .as("rowid", "r") - .select($"rowid", $"r.col0".as("score"), $"r.col1".as("predict")) - .cache -``` - -Evaluation --------------------- -``` -val joinPredicate = (testDf("rowid") === predict("rowid")).and(testDf("target") === predict("predict")) -(testDf.join(predict, joinPredicate, "INNER").count + 0.0) / testDf.count -``` http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/f9f9799d/spark/spark-2.0/tutorials/randomforest.md ---------------------------------------------------------------------- diff --git a/spark/spark-2.0/tutorials/randomforest.md b/spark/spark-2.0/tutorials/randomforest.md deleted file mode 100644 index 350e19a..0000000 --- a/spark/spark-2.0/tutorials/randomforest.md +++ /dev/null @@ -1,86 +0,0 @@ -This tutorial uses [Kaggle Titanic binary classification](https://github.com/myui/hivemall/wiki/Kaggle-Titanic-binary-classification-using-Random-Forest) as a reference. - -Data preparation --------------------- -``` -// Fetch training and test data in Kaggle(https://www.kaggle.com/c/titanic/data), train.csv and test.csv - -// Fetch an initialization script for hivemall-spark -# wget https://raw.githubusercontent.com/maropu/hivemall-spark/master/scripts/ddl/define-dfs.sh - -// Invoke a spark-shell with spark-csv and hivemall-spark -# bin/spark-shell --packages com.databricks:spark-csv_2.10:1.4.0,maropu:hivemall-spark:0.0.6 - -scala> :load define-dfs.sh - -scala> :paste - -// Load the training data as a DataFrame -val trainCsvDf = sqlContext - .read - .format("com.databricks.spark.csv") - .option("header", "true") - .option("inferSchema", "true") - .load("train.csv") - .cache // Cached for second use - -val trainQuantifiedDf = trainCsvDf - .quantify(true.as("output") +: trainCsvDf.cols: _*) - // Rename output columns for readability - .as("passengerid", "survived", "pclass", "name", "sex", "age", "sibsp", "parch", "ticket", "fare", "cabin", "embarked") - .sort($"passengerid".asc) - -val trainDf = trainQuantifiedDf - .select( - $"passengerid", - array(trainQuantifiedDf.cols.drop(2): _*).as("features"), - $"survived" - ) - -// Load the test data as a DataFrame -val testCsvDf = sqlContext - .read - .format("com.databricks.spark.csv") - .option("header", "true") - .option("inferSchema", "true") - .load("test.csv") - -val testQuantifiedDf = testCsvDf - .select(Seq(1.as("train_first"), true.as("output"), $"PassengerId") ++ testCsvDf.cols.drop(1): _*) - .unionAll( - trainCsvDf.select(Seq(0.as("train_first"), false.as("output"), $"PassengerId") ++ trainCsvDf.cols.drop(2): _*) - ) - .sort($"train_first".asc, $"PassengerId".asc) - .quantify($"output" +: testCsvDf.cols: _*) - // Rename output columns for readability - .as("passengerid", "pclass", "name", "sex", "age", "sibsp", "parch", "ticket", "fare", "cabin", "embarked") - -val testDf = testQuantifiedDf - .select($"passengerid", array(testQuantifiedDf.cols.drop(1): _*).as("features")) -``` - -Training --------------------- -``` -// Make a model from the training data -val model = trainDf - .coalesce(4) - .train_randomforest_classifier($"features", $"survived", "-trees 400") -``` - -Test --------------------- -``` -// Do prediction -model - .coalesce(4) - .join(testDf) - .select( - testDf("passengerid"), - tree_predict(model("model_id"), model("model_type"), model("pred_model"), testDf("features"), true).as("predicted") - ) - .groupby($"passengerid").rf_ensemble("predicted") - .as("passengerid", "predicted") - .select($"passengerid", $"predicted.label") - .show -``` http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/f9f9799d/spark/spark-2.0/tutorials/regression.md ---------------------------------------------------------------------- diff --git a/spark/spark-2.0/tutorials/regression.md b/spark/spark-2.0/tutorials/regression.md deleted file mode 100644 index 5856373..0000000 --- a/spark/spark-2.0/tutorials/regression.md +++ /dev/null @@ -1,71 +0,0 @@ -This tutorial uses [E2006 tfidf regression](https://github.com/myui/hivemall/wiki#e2006-tfidf-regression) as a reference. - -Data preparation --------------------- -``` -// Fetch training and test data -# wget http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/regression/E2006.train.bz2 -# wget http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/regression/E2006.test.bz2 - -// Fetch a script to normalize the data -# wget https://raw.githubusercontent.com/maropu/hivemall-spark/master/scripts/misc/conv.awk -# bunzip2 -c E2006.train.bz2 | awk -f conv.awk > E2006.train.lp -# bunzip2 -c E2006.test.bz2 | awk -f conv.awk > E2006.test.lp - -// Fetch an initialization script for hivemall-spark -# wget https://raw.githubusercontent.com/maropu/hivemall-spark/master/scripts/ddl/define-dfs.sh - -// Invoke a spark-shell with hivemall-spark -# bin/spark-shell --packages maropu:hivemall-spark:0.0.6 --master=local-cluster[2,1,1024] --conf spark.executor.memory=1024 - -scala> :load define-dfs.sh - -scala> :paste - -// Load the training data as a RDD -val trainRdd = sc.textFile("E2006.train.lp") - .map(HmLabeledPoint.parse) - -// Create the DataFrame that has exactly 2 partitions and -// amplify the data by 3 times. -val trainDf = sqlContext.createDataFrame(trainRdd) - .coalesce(2).part_amplify(3) - -// Load the test data as a RDD -val testRdd = sc.textFile("E2006.test.lp") - .map(HmLabeledPoint.parse) - -// Transform into a DataFrame and transform features -// into a Spark Vector type. -val testDf = sqlContext.createDataFrame(testRdd) - .select($"label".as("target"), ft2vec($"features").as("features")) -``` - -Training (PA1) --------------------- -``` -// Make a model from the training data -val model = trainDf - .train_pa1_regr(add_bias($"features"), $"label") - .groupby("feature").agg("weight" -> "avg") - .as("feature", "weight") - -val modelUdf = HivemallUtils - .funcModel(model) -``` - -Test --------------------- -``` -// Do prediction -val predict = testDf - .select($"target", modelUdf($"features").as("predicted")) -``` - -Evaluation --------------------- -``` -predict - .groupBy().agg(Map("target"->"avg", "predicted"->"avg")) - .show() -```
