incubator-hivemall git commit: Close #100: [HOTFIX] Update documents for DataFrame in Spark

yamamuro Wed, 12 Jul 2017 00:03:35 -0700

Repository: incubator-hivemall
Updated Branches:
  refs/heads/master 89c7538a3 -> 047f5fed4



Close #100: [HOTFIX] Update documents for DataFrame in Spark


Project: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/commit/047f5fed
Tree: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/tree/047f5fed
Diff: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/diff/047f5fed

Branch: refs/heads/master
Commit: 047f5fed48baf3bbf8882a79ee54283cb4fd9b19
Parents: 89c7538
Author: Takeshi Yamamuro <[email protected]>
Authored: Wed Jul 12 16:02:46 2017 +0900
Committer: Takeshi Yamamuro <[email protected]>
Committed: Wed Jul 12 16:02:46 2017 +0900

----------------------------------------------------------------------
 docs/gitbook/spark/binaryclass/a9a_df.md  | 13 +++++++++----
 docs/gitbook/spark/regression/e2006_df.md | 13 +++++++++----
 2 files changed, 18 insertions(+), 8 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/047f5fed/docs/gitbook/spark/binaryclass/a9a_df.md
----------------------------------------------------------------------
diff --git a/docs/gitbook/spark/binaryclass/a9a_df.md b/docs/gitbook/spark/binaryclass/a9a_df.md
index 74f2705..88229e3 100644
--- a/docs/gitbook/spark/binaryclass/a9a_df.md
+++ b/docs/gitbook/spark/binaryclass/a9a_df.md
@@ -31,10 +31,15 @@ $ wget http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/a9a.t
 
 ```scala
 scala> :paste
-val trainDf = spark.read.format("libsvm").load("a9a")
-  .select(
+val rawTrainDf = spark.read.format("libsvm").load("a9a")
+
+val (max, min) = rawTrainDf.select(max($"label"), min($"label")).collect.map {
+  case Row(max: Double, min: Double) => (max, min)
+}
+
+val trainDf = rawTrainDf.select(
     // `label` must be [0.0, 1.0]
-    rescale($"label", lit(-1.0f), lit(1.0f)).as("label"),
+    rescale($"label", lit(min), lit(max)).as("label"),
     $"features"
   )
 
@@ -45,7 +50,7 @@ root
 
 scala> :paste
 val testDf = spark.read.format("libsvm").load("a9a.t")
-  .select(rowid(), rescale($"label", lit(-1.0f), lit(1.0f)).as("label"), $"features")
+  .select(rowid(), rescale($"label", lit(min), lit(max)).as("label"), $"features")
   .explode_vector($"features")
   .select($"rowid", $"label".as("target"), $"feature", $"weight".as("value"))
   .cache

http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/047f5fed/docs/gitbook/spark/regression/e2006_df.md
----------------------------------------------------------------------
diff --git a/docs/gitbook/spark/regression/e2006_df.md b/docs/gitbook/spark/regression/e2006_df.md
index 5980e3e..d6ac138 100644
--- a/docs/gitbook/spark/regression/e2006_df.md
+++ b/docs/gitbook/spark/regression/e2006_df.md
@@ -31,10 +31,15 @@ $ wget http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/regression/E2006.t
 
 ```scala
 scala> :paste
-val trainDf = spark.read.format("libsvm").load("E2006.train.bz2")
-  .select(
+val rawTrainDf = spark.read.format("libsvm").load("E2006.train.bz2")
+
+val (max, min) = rawTrainDf.select(max($"label"), min($"label")).collect.map {
+  case Row(max: Double, min: Double) => (max, min)
+}
+
+val trainDf = rawTrainDf.select(
     // `label` must be [0.0, 1.0]
-    rescale($"label", lit(-7.899578f), lit(-0.51940954f)).as("label"),
+    rescale($"label", lit(min), lit(max)).as("label"),
     $"features"
   )
 
@@ -45,7 +50,7 @@ root
 
 scala> :paste
 val testDf = spark.read.format("libsvm").load("E2006.test.bz2")
-  .select(rowid(), rescale($"label", lit(-7.899578f), lit(-0.51940954f)).as("label"), $"features")
+  .select(rowid(), rescale($"label", lit(min), lit(max)).as("label"), $"features")
   .explode_vector($"features")
   .select($"rowid", $"label".as("target"), $"feature", $"weight".as("value"))
   .cache

incubator-hivemall git commit: Close #100: [HOTFIX] Update documents for DataFrame in Spark

Reply via email to