Repository: incubator-hivemall
Updated Branches:
  refs/heads/master 89c7538a3 -> 047f5fed4


Close #100: [HOTFIX] Update documents for DataFrame in Spark


Project: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/commit/047f5fed
Tree: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/tree/047f5fed
Diff: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/diff/047f5fed

Branch: refs/heads/master
Commit: 047f5fed48baf3bbf8882a79ee54283cb4fd9b19
Parents: 89c7538
Author: Takeshi Yamamuro <[email protected]>
Authored: Wed Jul 12 16:02:46 2017 +0900
Committer: Takeshi Yamamuro <[email protected]>
Committed: Wed Jul 12 16:02:46 2017 +0900

----------------------------------------------------------------------
 docs/gitbook/spark/binaryclass/a9a_df.md  | 13 +++++++++----
 docs/gitbook/spark/regression/e2006_df.md | 13 +++++++++----
 2 files changed, 18 insertions(+), 8 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/047f5fed/docs/gitbook/spark/binaryclass/a9a_df.md
----------------------------------------------------------------------
diff --git a/docs/gitbook/spark/binaryclass/a9a_df.md b/docs/gitbook/spark/binaryclass/a9a_df.md
index 74f2705..88229e3 100644
--- a/docs/gitbook/spark/binaryclass/a9a_df.md
+++ b/docs/gitbook/spark/binaryclass/a9a_df.md
@@ -31,10 +31,15 @@ $ wget http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/a9a.t
 
 ```scala
 scala> :paste
-val trainDf = spark.read.format("libsvm").load("a9a")
-  .select(
+val rawTrainDf = spark.read.format("libsvm").load("a9a")
+
+val (max, min) = rawTrainDf.select(max($"label"), min($"label")).collect.map {
+  case Row(max: Double, min: Double) => (max, min)
+}
+
+val trainDf = rawTrainDf.select(
     // `label` must be [0.0, 1.0]
-    rescale($"label", lit(-1.0f), lit(1.0f)).as("label"),
+    rescale($"label", lit(min), lit(max)).as("label"),
     $"features"
   )
 
@@ -45,7 +50,7 @@ root
 
 scala> :paste
 val testDf = spark.read.format("libsvm").load("a9a.t")
-  .select(rowid(), rescale($"label", lit(-1.0f), lit(1.0f)).as("label"), $"features")
+  .select(rowid(), rescale($"label", lit(min), lit(max)).as("label"), $"features")
   .explode_vector($"features")
   .select($"rowid", $"label".as("target"), $"feature", $"weight".as("value"))
   .cache

http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/047f5fed/docs/gitbook/spark/regression/e2006_df.md
----------------------------------------------------------------------
diff --git a/docs/gitbook/spark/regression/e2006_df.md b/docs/gitbook/spark/regression/e2006_df.md
index 5980e3e..d6ac138 100644
--- a/docs/gitbook/spark/regression/e2006_df.md
+++ b/docs/gitbook/spark/regression/e2006_df.md
@@ -31,10 +31,15 @@ $ wget http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/regression/E2006.t
 
 ```scala
 scala> :paste
-val trainDf = spark.read.format("libsvm").load("E2006.train.bz2")
-  .select(
+val rawTrainDf = spark.read.format("libsvm").load("E2006.train.bz2")
+
+val (max, min) = rawTrainDf.select(max($"label"), min($"label")).collect.map {
+  case Row(max: Double, min: Double) => (max, min)
+}
+
+val trainDf = rawTrainDf.select(
     // `label` must be [0.0, 1.0]
-    rescale($"label", lit(-7.899578f), lit(-0.51940954f)).as("label"),
+    rescale($"label", lit(min), lit(max).as("label"),
     $"features"
   )
 
@@ -45,7 +50,7 @@ root
 
 scala> :paste
 val testDf = spark.read.format("libsvm").load("E2006.test.bz2")
-  .select(rowid(), rescale($"label", lit(-7.899578f), lit(-0.51940954f)).as("label"), $"features")
+  .select(rowid(), rescale($"label", lit(min), lit(max)).as("label"), $"features")
   .explode_vector($"features")
   .select($"rowid", $"label".as("target"), $"feature", $"weight".as("value"))
   .cache
