Repository: spark
Updated Branches:
refs/heads/branch-1.6 98c614d66 -> 4a1bcb26d
[SPARK-11723][ML][DOC] Use LibSVM data source rather than MLUtils.loadLibSVMFile to load DataFrame
Use the LibSVM data source rather than MLUtils.loadLibSVMFile to load
DataFrames. This includes:
* Use the libsvm data source in all example code under examples/ml, and remove
unused imports.
* Use the libsvm data source in the ml-*** user guides that were omitted by
#8697.
* Fix a bug: the Java side should use
```sqlContext.read().format("libsvm").load(path)```, but the API doc and user
guides mistakenly use the Scala form
```sqlContext.read.format("libsvm").load(path)``` (see the sketch after this
list).
* Code cleanup.
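
For reference, a minimal sketch of the corrected Java usage (the class name
JavaLibSVMReadSketch and the standalone main() scaffolding are illustrative,
not part of this patch):

```
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.SQLContext;

public class JavaLibSVMReadSketch {
  public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("JavaLibSVMReadSketch");
    JavaSparkContext jsc = new JavaSparkContext(conf);
    SQLContext sqlContext = new SQLContext(jsc);

    // In Java, read() is a method call, so the parentheses are required.
    DataFrame data = sqlContext.read().format("libsvm")
      .load("data/mllib/sample_libsvm_data.txt");

    // The Scala equivalent drops the parentheses (read is a parameterless method):
    //   val data = sqlContext.read.format("libsvm")
    //     .load("data/mllib/sample_libsvm_data.txt")

    data.show();
    jsc.stop();
  }
}
```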
mengxr
Author: Yanbo Liang <[email protected]>
Closes #9690 from yanboliang/spark-11723.
(cherry picked from commit 99693fef0a30432d94556154b81872356d921c64)
Signed-off-by: Xiangrui Meng <[email protected]>
Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4a1bcb26
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4a1bcb26
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4a1bcb26
Branch: refs/heads/branch-1.6
Commit: 4a1bcb26dde66e85c4b146a97dde1b0ea48befdf
Parents: 98c614d
Author: Yanbo Liang <[email protected]>
Authored: Fri Nov 13 08:43:05 2015 -0800
Committer: Xiangrui Meng <[email protected]>
Committed: Fri Nov 13 08:43:13 2015 -0800
----------------------------------------------------------------------
docs/ml-ensembles.md | 10 ++---
docs/ml-features.md | 8 ++--
docs/ml-guide.md | 10 +----
docs/ml-linear-methods.md | 4 +-
.../JavaDecisionTreeClassificationExample.java | 8 +---
.../ml/JavaDecisionTreeRegressionExample.java | 9 ++---
...vaMultilayerPerceptronClassifierExample.java | 6 +--
.../spark/examples/ml/JavaOneVsRestExample.java | 23 +++++------
.../ml/JavaTrainValidationSplitExample.java | 6 +--
.../ml/decision_tree_classification_example.py | 5 +--
.../ml/decision_tree_regression_example.py | 5 +--
.../main/python/ml/gradient_boosted_trees.py | 5 +--
.../src/main/python/ml/logistic_regression.py | 5 +--
.../ml/multilayer_perceptron_classification.py | 5 +--
.../src/main/python/ml/random_forest_example.py | 4 +-
.../ml/DecisionTreeClassificationExample.scala | 6 +--
.../spark/examples/ml/DecisionTreeExample.scala | 42 +++++++-------------
.../ml/DecisionTreeRegressionExample.scala | 7 ++--
.../apache/spark/examples/ml/GBTExample.scala | 2 +-
.../examples/ml/LinearRegressionExample.scala | 2 +-
.../examples/ml/LogisticRegressionExample.scala | 4 +-
.../MultilayerPerceptronClassifierExample.scala | 8 ++--
.../spark/examples/ml/OneVsRestExample.scala | 17 ++++----
.../spark/examples/ml/RandomForestExample.scala | 2 +-
.../ml/TrainValidationSplitExample.scala | 4 +-
.../spark/ml/source/libsvm/LibSVMRelation.scala | 2 +-
26 files changed, 79 insertions(+), 130 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/spark/blob/4a1bcb26/docs/ml-ensembles.md
----------------------------------------------------------------------
diff --git a/docs/ml-ensembles.md b/docs/ml-ensembles.md
index 58f566c..ce15f5e 100644
--- a/docs/ml-ensembles.md
+++ b/docs/ml-ensembles.md
@@ -195,7 +195,7 @@ import org.apache.spark.ml.feature.*;
import org.apache.spark.sql.DataFrame;
// Load and parse the data file, converting it to a DataFrame.
-DataFrame data = sqlContext.read.format("libsvm")
+DataFrame data = sqlContext.read().format("libsvm")
.load("data/mllib/sample_libsvm_data.txt");
// Index labels, adding metadata to the label column.
@@ -384,7 +384,7 @@ import org.apache.spark.ml.regression.RandomForestRegressor;
import org.apache.spark.sql.DataFrame;
// Load and parse the data file, converting it to a DataFrame.
-DataFrame data = sqlContext.read.format("libsvm")
+DataFrame data = sqlContext.read().format("libsvm")
.load("data/mllib/sample_libsvm_data.txt");
// Automatically identify categorical features, and index them.
@@ -640,7 +640,7 @@ import org.apache.spark.ml.feature.*;
import org.apache.spark.sql.DataFrame;
// Load and parse the data file, converting it to a DataFrame.
-DataFrame data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt");
+DataFrame data = sqlContext.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt");
// Index labels, adding metadata to the label column.
// Fit on whole dataset to include all labels in index.
@@ -830,7 +830,7 @@ import org.apache.spark.ml.regression.GBTRegressor;
import org.apache.spark.sql.DataFrame;
// Load and parse the data file, converting it to a DataFrame.
-DataFrame data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt");
+DataFrame data = sqlContext.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt");
// Automatically identify categorical features, and index them.
// Set maxCategories so features with > 4 distinct values are treated as continuous.
@@ -1000,7 +1000,7 @@ SparkConf conf = new SparkConf().setAppName("JavaOneVsRestExample");
JavaSparkContext jsc = new JavaSparkContext(conf);
SQLContext jsql = new SQLContext(jsc);
-DataFrame dataFrame = sqlContext.read.format("libsvm")
+DataFrame dataFrame = sqlContext.read().format("libsvm")
.load("data/mllib/sample_multiclass_classification_data.txt");
DataFrame[] splits = dataFrame.randomSplit(new double[] {0.7, 0.3}, 12345);
http://git-wip-us.apache.org/repos/asf/spark/blob/4a1bcb26/docs/ml-features.md
----------------------------------------------------------------------
diff --git a/docs/ml-features.md b/docs/ml-features.md
index 142afac..cd1838d 100644
--- a/docs/ml-features.md
+++ b/docs/ml-features.md
@@ -1109,7 +1109,7 @@ import org.apache.spark.ml.feature.VectorIndexer;
import org.apache.spark.ml.feature.VectorIndexerModel;
import org.apache.spark.sql.DataFrame;
-DataFrame data = sqlContext.read.format("libsvm")
+DataFrame data = sqlContext.read().format("libsvm")
.load("data/mllib/sample_libsvm_data.txt");
VectorIndexer indexer = new VectorIndexer()
.setInputCol("features")
@@ -1187,7 +1187,7 @@ for more details on the API.
import org.apache.spark.ml.feature.Normalizer;
import org.apache.spark.sql.DataFrame;
-DataFrame dataFrame = sqlContext.read.format("libsvm")
+DataFrame dataFrame = sqlContext.read().format("libsvm")
.load("data/mllib/sample_libsvm_data.txt");
// Normalize each Vector using $L^1$ norm.
@@ -1273,7 +1273,7 @@ import org.apache.spark.ml.feature.StandardScaler;
import org.apache.spark.ml.feature.StandardScalerModel;
import org.apache.spark.sql.DataFrame;
-DataFrame dataFrame = sqlContext.read.format("libsvm")
+DataFrame dataFrame = sqlContext.read().format("libsvm")
.load("data/mllib/sample_libsvm_data.txt");
StandardScaler scaler = new StandardScaler()
.setInputCol("features")
@@ -1366,7 +1366,7 @@ import org.apache.spark.ml.feature.MinMaxScaler;
import org.apache.spark.ml.feature.MinMaxScalerModel;
import org.apache.spark.sql.DataFrame;
-DataFrame dataFrame = sqlContext.read.format("libsvm")
+DataFrame dataFrame = sqlContext.read().format("libsvm")
.load("data/mllib/sample_libsvm_data.txt");
MinMaxScaler scaler = new MinMaxScaler()
.setInputCol("features")
http://git-wip-us.apache.org/repos/asf/spark/blob/4a1bcb26/docs/ml-guide.md
----------------------------------------------------------------------
diff --git a/docs/ml-guide.md b/docs/ml-guide.md
index c293e71..be18a05 100644
--- a/docs/ml-guide.md
+++ b/docs/ml-guide.md
@@ -867,10 +867,9 @@ The `ParamMap` which produces the best evaluation metric is selected as the best
import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.regression.LinearRegression
import org.apache.spark.ml.tuning.{ParamGridBuilder, TrainValidationSplit}
-import org.apache.spark.mllib.util.MLUtils
// Prepare training and test data.
-val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF()
+val data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
val Array(training, test) = data.randomSplit(Array(0.9, 0.1), seed = 12345)
val lr = new LinearRegression()
@@ -911,14 +910,9 @@ import org.apache.spark.ml.evaluation.RegressionEvaluator;
import org.apache.spark.ml.param.ParamMap;
import org.apache.spark.ml.regression.LinearRegression;
import org.apache.spark.ml.tuning.*;
-import org.apache.spark.mllib.regression.LabeledPoint;
-import org.apache.spark.mllib.util.MLUtils;
-import org.apache.spark.rdd.RDD;
import org.apache.spark.sql.DataFrame;
-DataFrame data = sqlContext.createDataFrame(
- MLUtils.loadLibSVMFile(jsc.sc(), "data/mllib/sample_libsvm_data.txt"),
- LabeledPoint.class);
+DataFrame data = jsql.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt");
// Prepare training and test data.
DataFrame[] splits = data.randomSplit(new double[] {0.9, 0.1}, 12345);
http://git-wip-us.apache.org/repos/asf/spark/blob/4a1bcb26/docs/ml-linear-methods.md
----------------------------------------------------------------------
diff --git a/docs/ml-linear-methods.md b/docs/ml-linear-methods.md
index 16e2ee7..85edfd3 100644
--- a/docs/ml-linear-methods.md
+++ b/docs/ml-linear-methods.md
@@ -95,7 +95,7 @@ public class LogisticRegressionWithElasticNetExample {
String path = "data/mllib/sample_libsvm_data.txt";
// Load training data
- DataFrame training = sqlContext.read.format("libsvm").load(path);
+ DataFrame training = sqlContext.read().format("libsvm").load(path);
LogisticRegression lr = new LogisticRegression()
.setMaxIter(10)
@@ -292,7 +292,7 @@ public class LinearRegressionWithElasticNetExample {
String path = "data/mllib/sample_libsvm_data.txt";
// Load training data
- DataFrame training = sqlContext.read.format("libsvm").load(path);
+ DataFrame training = sqlContext.read().format("libsvm").load(path);
LinearRegression lr = new LinearRegression()
.setMaxIter(10)
http://git-wip-us.apache.org/repos/asf/spark/blob/4a1bcb26/examples/src/main/java/org/apache/spark/examples/ml/JavaDecisionTreeClassificationExample.java
----------------------------------------------------------------------
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaDecisionTreeClassificationExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaDecisionTreeClassificationExample.java
index 51c1730..482225e 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaDecisionTreeClassificationExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaDecisionTreeClassificationExample.java
@@ -26,9 +26,6 @@ import org.apache.spark.ml.classification.DecisionTreeClassifier;
import org.apache.spark.ml.classification.DecisionTreeClassificationModel;
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator;
import org.apache.spark.ml.feature.*;
-import org.apache.spark.mllib.regression.LabeledPoint;
-import org.apache.spark.mllib.util.MLUtils;
-import org.apache.spark.rdd.RDD;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.SQLContext;
// $example off$
@@ -40,9 +37,8 @@ public class JavaDecisionTreeClassificationExample {
SQLContext sqlContext = new SQLContext(jsc);
// $example on$
- // Load and parse the data file, converting it to a DataFrame.
- RDD<LabeledPoint> rdd = MLUtils.loadLibSVMFile(jsc.sc(), "data/mllib/sample_libsvm_data.txt");
- DataFrame data = sqlContext.createDataFrame(rdd, LabeledPoint.class);
+ // Load the data stored in LIBSVM format as a DataFrame.
+ DataFrame data = sqlContext.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt");
// Index labels, adding metadata to the label column.
// Fit on whole dataset to include all labels in index.
http://git-wip-us.apache.org/repos/asf/spark/blob/4a1bcb26/examples/src/main/java/org/apache/spark/examples/ml/JavaDecisionTreeRegressionExample.java
----------------------------------------------------------------------
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaDecisionTreeRegressionExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaDecisionTreeRegressionExample.java
index a4098a4..c7f1868 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaDecisionTreeRegressionExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaDecisionTreeRegressionExample.java
@@ -27,9 +27,6 @@ import org.apache.spark.ml.feature.VectorIndexer;
import org.apache.spark.ml.feature.VectorIndexerModel;
import org.apache.spark.ml.regression.DecisionTreeRegressionModel;
import org.apache.spark.ml.regression.DecisionTreeRegressor;
-import org.apache.spark.mllib.regression.LabeledPoint;
-import org.apache.spark.mllib.util.MLUtils;
-import org.apache.spark.rdd.RDD;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.SQLContext;
// $example off$
@@ -40,9 +37,9 @@ public class JavaDecisionTreeRegressionExample {
JavaSparkContext jsc = new JavaSparkContext(conf);
SQLContext sqlContext = new SQLContext(jsc);
// $example on$
- // Load and parse the data file, converting it to a DataFrame.
- RDD<LabeledPoint> rdd = MLUtils.loadLibSVMFile(jsc.sc(), "data/mllib/sample_libsvm_data.txt");
- DataFrame data = sqlContext.createDataFrame(rdd, LabeledPoint.class);
+ // Load the data stored in LIBSVM format as a DataFrame.
+ DataFrame data = sqlContext.read().format("libsvm")
+ .load("data/mllib/sample_libsvm_data.txt");
// Automatically identify categorical features, and index them.
// Set maxCategories so features with > 4 distinct values are treated as continuous.
http://git-wip-us.apache.org/repos/asf/spark/blob/4a1bcb26/examples/src/main/java/org/apache/spark/examples/ml/JavaMultilayerPerceptronClassifierExample.java
----------------------------------------------------------------------
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaMultilayerPerceptronClassifierExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaMultilayerPerceptronClassifierExample.java
index f48e133..84369f6 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaMultilayerPerceptronClassifierExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaMultilayerPerceptronClassifierExample.java
@@ -21,12 +21,9 @@ package org.apache.spark.examples.ml;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SQLContext;
-import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.ml.classification.MultilayerPerceptronClassificationModel;
import org.apache.spark.ml.classification.MultilayerPerceptronClassifier;
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator;
-import org.apache.spark.mllib.regression.LabeledPoint;
-import org.apache.spark.mllib.util.MLUtils;
import org.apache.spark.sql.DataFrame;
// $example off$
@@ -43,8 +40,7 @@ public class JavaMultilayerPerceptronClassifierExample {
// $example on$
// Load training data
String path = "data/mllib/sample_multiclass_classification_data.txt";
- JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(jsc.sc(), path).toJavaRDD();
- DataFrame dataFrame = jsql.createDataFrame(data, LabeledPoint.class);
+ DataFrame dataFrame = jsql.read().format("libsvm").load(path);
// Split the data into train and test
DataFrame[] splits = dataFrame.randomSplit(new double[]{0.6, 0.4}, 1234L);
DataFrame train = splits[0];
http://git-wip-us.apache.org/repos/asf/spark/blob/4a1bcb26/examples/src/main/java/org/apache/spark/examples/ml/JavaOneVsRestExample.java
----------------------------------------------------------------------
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaOneVsRestExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaOneVsRestExample.java
index e7f2f6f..f0d92a5 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaOneVsRestExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaOneVsRestExample.java
@@ -27,9 +27,7 @@ import org.apache.spark.ml.classification.OneVsRestModel;
import org.apache.spark.ml.util.MetadataUtils;
import org.apache.spark.mllib.evaluation.MulticlassMetrics;
import org.apache.spark.mllib.linalg.Matrix;
-import org.apache.spark.mllib.regression.LabeledPoint;
-import org.apache.spark.mllib.util.MLUtils;
-import org.apache.spark.rdd.RDD;
+import org.apache.spark.mllib.linalg.Vector;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.types.StructField;
@@ -80,31 +78,30 @@ public class JavaOneVsRestExample {
OneVsRest ovr = new OneVsRest().setClassifier(classifier);
String input = params.input;
- RDD<LabeledPoint> inputData = MLUtils.loadLibSVMFile(jsc.sc(), input);
- RDD<LabeledPoint> train;
- RDD<LabeledPoint> test;
+ DataFrame inputData = jsql.read().format("libsvm").load(input);
+ DataFrame train;
+ DataFrame test;
// compute the train/test split: if testInput is not provided use part of input
String testInput = params.testInput;
if (testInput != null) {
train = inputData;
// compute the number of features in the training set.
- int numFeatures = inputData.first().features().size();
- test = MLUtils.loadLibSVMFile(jsc.sc(), testInput, numFeatures);
+ int numFeatures = inputData.first().<Vector>getAs(1).size();
+ test = jsql.read().format("libsvm").option("numFeatures",
+ String.valueOf(numFeatures)).load(testInput);
} else {
double f = params.fracTest;
- RDD<LabeledPoint>[] tmp = inputData.randomSplit(new double[]{1 - f, f}, 12345);
+ DataFrame[] tmp = inputData.randomSplit(new double[]{1 - f, f}, 12345);
train = tmp[0];
test = tmp[1];
}
// train the multiclass model
- DataFrame trainingDataFrame = jsql.createDataFrame(train, LabeledPoint.class);
- OneVsRestModel ovrModel = ovr.fit(trainingDataFrame.cache());
+ OneVsRestModel ovrModel = ovr.fit(train.cache());
// score the model on test data
- DataFrame testDataFrame = jsql.createDataFrame(test, LabeledPoint.class);
- DataFrame predictions = ovrModel.transform(testDataFrame.cache())
+ DataFrame predictions = ovrModel.transform(test.cache())
.select("prediction", "label");
// obtain metrics
http://git-wip-us.apache.org/repos/asf/spark/blob/4a1bcb26/examples/src/main/java/org/apache/spark/examples/ml/JavaTrainValidationSplitExample.java
----------------------------------------------------------------------
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaTrainValidationSplitExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaTrainValidationSplitExample.java
index 23f834a..d433905 100644
--- a/examples/src/main/java/org/apache/spark/examples/ml/JavaTrainValidationSplitExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaTrainValidationSplitExample.java
@@ -23,8 +23,6 @@ import org.apache.spark.ml.evaluation.RegressionEvaluator;
import org.apache.spark.ml.param.ParamMap;
import org.apache.spark.ml.regression.LinearRegression;
import org.apache.spark.ml.tuning.*;
-import org.apache.spark.mllib.regression.LabeledPoint;
-import org.apache.spark.mllib.util.MLUtils;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.SQLContext;
@@ -46,9 +44,7 @@ public class JavaTrainValidationSplitExample {
JavaSparkContext jsc = new JavaSparkContext(conf);
SQLContext jsql = new SQLContext(jsc);
- DataFrame data = jsql.createDataFrame(
- MLUtils.loadLibSVMFile(jsc.sc(), "data/mllib/sample_libsvm_data.txt"),
- LabeledPoint.class);
+ DataFrame data = jsql.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt");
// Prepare training and test data.
DataFrame[] splits = data.randomSplit(new double [] {0.9, 0.1}, 12345);
http://git-wip-us.apache.org/repos/asf/spark/blob/4a1bcb26/examples/src/main/python/ml/decision_tree_classification_example.py
----------------------------------------------------------------------
diff --git a/examples/src/main/python/ml/decision_tree_classification_example.py b/examples/src/main/python/ml/decision_tree_classification_example.py
index 0af9205..8cda56d 100644
--- a/examples/src/main/python/ml/decision_tree_classification_example.py
+++ b/examples/src/main/python/ml/decision_tree_classification_example.py
@@ -28,7 +28,6 @@ from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.feature import StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
-from pyspark.mllib.util import MLUtils
# $example off$
if __name__ == "__main__":
@@ -36,8 +35,8 @@ if __name__ == "__main__":
sqlContext = SQLContext(sc)
# $example on$
- # Load and parse the data file, converting it to a DataFrame.
- data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF()
+ # Load the data stored in LIBSVM format as a DataFrame.
+ data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
# Index labels, adding metadata to the label column.
# Fit on whole dataset to include all labels in index.
http://git-wip-us.apache.org/repos/asf/spark/blob/4a1bcb26/examples/src/main/python/ml/decision_tree_regression_example.py
----------------------------------------------------------------------
diff --git a/examples/src/main/python/ml/decision_tree_regression_example.py b/examples/src/main/python/ml/decision_tree_regression_example.py
index 3857aed..439e398 100644
--- a/examples/src/main/python/ml/decision_tree_regression_example.py
+++ b/examples/src/main/python/ml/decision_tree_regression_example.py
@@ -28,7 +28,6 @@ from pyspark.ml import Pipeline
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator
-from pyspark.mllib.util import MLUtils
# $example off$
if __name__ == "__main__":
@@ -36,8 +35,8 @@ if __name__ == "__main__":
sqlContext = SQLContext(sc)
# $example on$
- # Load and parse the data file, converting it to a DataFrame.
- data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF()
+ # Load the data stored in LIBSVM format as a DataFrame.
+ data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
# Automatically identify categorical features, and index them.
# We specify maxCategories so features with > 4 distinct values are treated as continuous.
http://git-wip-us.apache.org/repos/asf/spark/blob/4a1bcb26/examples/src/main/python/ml/gradient_boosted_trees.py
----------------------------------------------------------------------
diff --git a/examples/src/main/python/ml/gradient_boosted_trees.py b/examples/src/main/python/ml/gradient_boosted_trees.py
index 6446f0f..c3bf8aa 100644
--- a/examples/src/main/python/ml/gradient_boosted_trees.py
+++ b/examples/src/main/python/ml/gradient_boosted_trees.py
@@ -24,7 +24,6 @@ from pyspark.ml.classification import GBTClassifier
from pyspark.ml.feature import StringIndexer
from pyspark.ml.regression import GBTRegressor
from pyspark.mllib.evaluation import BinaryClassificationMetrics, RegressionMetrics
-from pyspark.mllib.util import MLUtils
from pyspark.sql import Row, SQLContext
"""
@@ -70,8 +69,8 @@ if __name__ == "__main__":
sc = SparkContext(appName="PythonGBTExample")
sqlContext = SQLContext(sc)
- # Load and parse the data file into a dataframe.
- df = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF()
+ # Load the data stored in LIBSVM format as a DataFrame.
+ df = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
# Map labels into an indexed column of labels in [0, numLabels)
stringIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel")
http://git-wip-us.apache.org/repos/asf/spark/blob/4a1bcb26/examples/src/main/python/ml/logistic_regression.py
----------------------------------------------------------------------
diff --git a/examples/src/main/python/ml/logistic_regression.py b/examples/src/main/python/ml/logistic_regression.py
index 55afe1b..4cd027f 100644
--- a/examples/src/main/python/ml/logistic_regression.py
+++ b/examples/src/main/python/ml/logistic_regression.py
@@ -23,7 +23,6 @@ from pyspark import SparkContext
from pyspark.ml.classification import LogisticRegression
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.ml.feature import StringIndexer
-from pyspark.mllib.util import MLUtils
from pyspark.sql import SQLContext
"""
@@ -41,8 +40,8 @@ if __name__ == "__main__":
sc = SparkContext(appName="PythonLogisticRegressionExample")
sqlContext = SQLContext(sc)
- # Load and parse the data file into a dataframe.
- df = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF()
+ # Load the data stored in LIBSVM format as a DataFrame.
+ df = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
# Map labels into an indexed column of labels in [0, numLabels)
stringIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel")
http://git-wip-us.apache.org/repos/asf/spark/blob/4a1bcb26/examples/src/main/python/ml/multilayer_perceptron_classification.py
----------------------------------------------------------------------
diff --git a/examples/src/main/python/ml/multilayer_perceptron_classification.py b/examples/src/main/python/ml/multilayer_perceptron_classification.py
index d8ef9f3..f84588f 100644
--- a/examples/src/main/python/ml/multilayer_perceptron_classification.py
+++ b/examples/src/main/python/ml/multilayer_perceptron_classification.py
@@ -22,7 +22,6 @@ from pyspark.sql import SQLContext
# $example on$
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
-from pyspark.mllib.util import MLUtils
# $example off$
if __name__ == "__main__":
@@ -32,8 +31,8 @@ if __name__ == "__main__":
# $example on$
# Load training data
- data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_multiclass_classification_data.txt")\
- .toDF()
+ data = sqlContext.read.format("libsvm")\
+ .load("data/mllib/sample_multiclass_classification_data.txt")
# Split the data into train and test
splits = data.randomSplit([0.6, 0.4], 1234)
train = splits[0]
http://git-wip-us.apache.org/repos/asf/spark/blob/4a1bcb26/examples/src/main/python/ml/random_forest_example.py
----------------------------------------------------------------------
diff --git a/examples/src/main/python/ml/random_forest_example.py b/examples/src/main/python/ml/random_forest_example.py
index c7730e1..dc6a778 100644
--- a/examples/src/main/python/ml/random_forest_example.py
+++ b/examples/src/main/python/ml/random_forest_example.py
@@ -74,8 +74,8 @@ if __name__ == "__main__":
sc = SparkContext(appName="PythonRandomForestExample")
sqlContext = SQLContext(sc)
- # Load and parse the data file into a dataframe.
- df = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF()
+ # Load the data stored in LIBSVM format as a DataFrame.
+ df = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
# Map labels into an indexed column of labels in [0, numLabels)
stringIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel")
http://git-wip-us.apache.org/repos/asf/spark/blob/4a1bcb26/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeClassificationExample.scala
----------------------------------------------------------------------
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeClassificationExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeClassificationExample.scala
index a24a344..ff8a0a9 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeClassificationExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeClassificationExample.scala
@@ -26,7 +26,6 @@ import org.apache.spark.ml.classification.DecisionTreeClassifier
import org.apache.spark.ml.classification.DecisionTreeClassificationModel
import org.apache.spark.ml.feature.{StringIndexer, IndexToString, VectorIndexer}
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
-import org.apache.spark.mllib.util.MLUtils
// $example off$
object DecisionTreeClassificationExample {
@@ -34,10 +33,9 @@ object DecisionTreeClassificationExample {
val conf = new SparkConf().setAppName("DecisionTreeClassificationExample")
val sc = new SparkContext(conf)
val sqlContext = new SQLContext(sc)
- import sqlContext.implicits._
// $example on$
- // Load and parse the data file, converting it to a DataFrame.
- val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF()
+ // Load the data stored in LIBSVM format as a DataFrame.
+ val data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
// Index labels, adding metadata to the label column.
// Fit on whole dataset to include all labels in index.
http://git-wip-us.apache.org/repos/asf/spark/blob/4a1bcb26/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeExample.scala
----------------------------------------------------------------------
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeExample.scala
index f28671f..c4e98df 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeExample.scala
@@ -32,10 +32,7 @@ import org.apache.spark.ml.regression.{DecisionTreeRegressionModel, DecisionTree
import org.apache.spark.ml.util.MetadataUtils
import org.apache.spark.mllib.evaluation.{RegressionMetrics, MulticlassMetrics}
import org.apache.spark.mllib.linalg.Vector
-import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.util.MLUtils
-import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.types.StringType
import org.apache.spark.sql.{SQLContext, DataFrame}
@@ -138,15 +135,18 @@ object DecisionTreeExample {
/** Load a dataset from the given path, using the given format */
private[ml] def loadData(
- sc: SparkContext,
+ sqlContext: SQLContext,
path: String,
format: String,
- expectedNumFeatures: Option[Int] = None): RDD[LabeledPoint] = {
+ expectedNumFeatures: Option[Int] = None): DataFrame = {
+ import sqlContext.implicits._
+
format match {
- case "dense" => MLUtils.loadLabeledPoints(sc, path)
+ case "dense" => MLUtils.loadLabeledPoints(sqlContext.sparkContext,
path).toDF()
case "libsvm" => expectedNumFeatures match {
- case Some(numFeatures) => MLUtils.loadLibSVMFile(sc, path, numFeatures)
- case None => MLUtils.loadLibSVMFile(sc, path)
+ case Some(numFeatures) => sqlContext.read.option("numFeatures", numFeatures.toString)
+ .format("libsvm").load(path)
+ case None => sqlContext.read.format("libsvm").load(path)
}
case _ => throw new IllegalArgumentException(s"Bad data format: $format")
}
@@ -169,36 +169,22 @@ object DecisionTreeExample {
algo: String,
fracTest: Double): (DataFrame, DataFrame) = {
val sqlContext = new SQLContext(sc)
- import sqlContext.implicits._
// Load training data
- val origExamples: RDD[LabeledPoint] = loadData(sc, input, dataFormat)
+ val origExamples: DataFrame = loadData(sqlContext, input, dataFormat)
// Load or create test set
- val splits: Array[RDD[LabeledPoint]] = if (testInput != "") {
+ val dataframes: Array[DataFrame] = if (testInput != "") {
// Load testInput.
- val numFeatures = origExamples.take(1)(0).features.size
- val origTestExamples: RDD[LabeledPoint] =
- loadData(sc, testInput, dataFormat, Some(numFeatures))
+ val numFeatures = origExamples.first().getAs[Vector](1).size
+ val origTestExamples: DataFrame =
+ loadData(sqlContext, testInput, dataFormat, Some(numFeatures))
Array(origExamples, origTestExamples)
} else {
// Split input into training, test.
origExamples.randomSplit(Array(1.0 - fracTest, fracTest), seed = 12345)
}
- // For classification, convert labels to Strings since we will index them later with
- // StringIndexer.
- def labelsToStrings(data: DataFrame): DataFrame = {
- algo.toLowerCase match {
- case "classification" =>
- data.withColumn("labelString", data("label").cast(StringType))
- case "regression" =>
- data
- case _ =>
- throw new IllegalArgumentException("Algo ${params.algo} not supported.")
- }
- }
- val dataframes = splits.map(_.toDF()).map(labelsToStrings)
val training = dataframes(0).cache()
val test = dataframes(1).cache()
@@ -230,7 +216,7 @@ object DecisionTreeExample {
val labelColName = if (algo == "classification") "indexedLabel" else "label"
if (algo == "classification") {
val labelIndexer = new StringIndexer()
- .setInputCol("labelString")
+ .setInputCol("label")
.setOutputCol(labelColName)
stages += labelIndexer
}
http://git-wip-us.apache.org/repos/asf/spark/blob/4a1bcb26/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeRegressionExample.scala
----------------------------------------------------------------------
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeRegressionExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeRegressionExample.scala
index 64cd986..fc40272 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeRegressionExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeRegressionExample.scala
@@ -25,17 +25,16 @@ import org.apache.spark.ml.regression.DecisionTreeRegressor
import org.apache.spark.ml.regression.DecisionTreeRegressionModel
import org.apache.spark.ml.feature.VectorIndexer
import org.apache.spark.ml.evaluation.RegressionEvaluator
-import org.apache.spark.mllib.util.MLUtils
// $example off$
object DecisionTreeRegressionExample {
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setAppName("DecisionTreeRegressionExample")
val sc = new SparkContext(conf)
val sqlContext = new SQLContext(sc)
- import sqlContext.implicits._
+
// $example on$
- // Load and parse the data file, converting it to a DataFrame.
- val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF()
+ // Load the data stored in LIBSVM format as a DataFrame.
+ val data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
// Automatically identify categorical features, and index them.
// Here, we treat features with > 4 distinct values as continuous.
http://git-wip-us.apache.org/repos/asf/spark/blob/4a1bcb26/examples/src/main/scala/org/apache/spark/examples/ml/GBTExample.scala
----------------------------------------------------------------------
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/GBTExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/GBTExample.scala
index f4a15f8..6b0be0f 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/GBTExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/GBTExample.scala
@@ -153,7 +153,7 @@ object GBTExample {
val labelColName = if (algo == "classification") "indexedLabel" else "label"
if (algo == "classification") {
val labelIndexer = new StringIndexer()
- .setInputCol("labelString")
+ .setInputCol("label")
.setOutputCol(labelColName)
stages += labelIndexer
}
http://git-wip-us.apache.org/repos/asf/spark/blob/4a1bcb26/examples/src/main/scala/org/apache/spark/examples/ml/LinearRegressionExample.scala
----------------------------------------------------------------------
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/LinearRegressionExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/LinearRegressionExample.scala
index b73299f..50998c9 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/LinearRegressionExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/LinearRegressionExample.scala
@@ -131,7 +131,7 @@ object LinearRegressionExample {
println(s"Training time: $elapsedTime seconds")
// Print the weights and intercept for linear regression.
- println(s"Weights: ${lirModel.weights} Intercept: ${lirModel.intercept}")
+ println(s"Weights: ${lirModel.coefficients} Intercept:
${lirModel.intercept}")
println("Training data results:")
DecisionTreeExample.evaluateRegressionModel(lirModel, training, "label")
http://git-wip-us.apache.org/repos/asf/spark/blob/4a1bcb26/examples/src/main/scala/org/apache/spark/examples/ml/LogisticRegressionExample.scala
----------------------------------------------------------------------
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/LogisticRegressionExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/LogisticRegressionExample.scala
index 8e3760d..a380c90 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/LogisticRegressionExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/LogisticRegressionExample.scala
@@ -125,7 +125,7 @@ object LogisticRegressionExample {
val stages = new mutable.ArrayBuffer[PipelineStage]()
val labelIndexer = new StringIndexer()
- .setInputCol("labelString")
+ .setInputCol("label")
.setOutputCol("indexedLabel")
stages += labelIndexer
@@ -149,7 +149,7 @@ object LogisticRegressionExample {
val lorModel = pipelineModel.stages.last.asInstanceOf[LogisticRegressionModel]
// Print the weights and intercept for logistic regression.
- println(s"Weights: ${lorModel.weights} Intercept: ${lorModel.intercept}")
+ println(s"Weights: ${lorModel.coefficients} Intercept:
${lorModel.intercept}")
println("Training data results:")
DecisionTreeExample.evaluateClassificationModel(pipelineModel, training, "indexedLabel")
http://git-wip-us.apache.org/repos/asf/spark/blob/4a1bcb26/examples/src/main/scala/org/apache/spark/examples/ml/MultilayerPerceptronClassifierExample.scala
----------------------------------------------------------------------
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/MultilayerPerceptronClassifierExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/MultilayerPerceptronClassifierExample.scala
index 99d5f35..146b83c 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/MultilayerPerceptronClassifierExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/MultilayerPerceptronClassifierExample.scala
@@ -23,7 +23,6 @@ import org.apache.spark.sql.SQLContext
// $example on$
import org.apache.spark.ml.classification.MultilayerPerceptronClassifier
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
-import org.apache.spark.mllib.util.MLUtils
// $example off$
/**
@@ -35,12 +34,11 @@ object MultilayerPerceptronClassifierExample {
val conf = new SparkConf().setAppName("MultilayerPerceptronClassifierExample")
val sc = new SparkContext(conf)
val sqlContext = new SQLContext(sc)
- import sqlContext.implicits._
// $example on$
- // Load training data
- val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_multiclass_classification_data.txt")
- .toDF()
+ // Load the data stored in LIBSVM format as a DataFrame.
+ val data = sqlContext.read.format("libsvm")
+ .load("data/mllib/sample_multiclass_classification_data.txt")
// Split the data into train and test
val splits = data.randomSplit(Array(0.6, 0.4), seed = 1234L)
val train = splits(0)
http://git-wip-us.apache.org/repos/asf/spark/blob/4a1bcb26/examples/src/main/scala/org/apache/spark/examples/ml/OneVsRestExample.scala
----------------------------------------------------------------------
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/OneVsRestExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/OneVsRestExample.scala
index bab31f5..8e4f1b0 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/OneVsRestExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/OneVsRestExample.scala
@@ -27,9 +27,8 @@ import org.apache.spark.examples.mllib.AbstractParams
import org.apache.spark.ml.classification.{OneVsRest, LogisticRegression}
import org.apache.spark.ml.util.MetadataUtils
import org.apache.spark.mllib.evaluation.MulticlassMetrics
-import org.apache.spark.mllib.regression.LabeledPoint
-import org.apache.spark.mllib.util.MLUtils
-import org.apache.spark.rdd.RDD
+import org.apache.spark.mllib.linalg.Vector
+import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.SQLContext
/**
@@ -111,24 +110,24 @@ object OneVsRestExample {
private def run(params: Params) {
val conf = new SparkConf().setAppName(s"OneVsRestExample with $params")
val sc = new SparkContext(conf)
- val inputData = MLUtils.loadLibSVMFile(sc, params.input)
val sqlContext = new SQLContext(sc)
- import sqlContext.implicits._
+ val inputData = sqlContext.read.format("libsvm").load(params.input)
// compute the train/test split: if testInput is not provided use part of input.
val data = params.testInput match {
case Some(t) => {
// compute the number of features in the training set.
- val numFeatures = inputData.first().features.size
- val testData = MLUtils.loadLibSVMFile(sc, t, numFeatures)
- Array[RDD[LabeledPoint]](inputData, testData)
+ val numFeatures = inputData.first().getAs[Vector](1).size
+ val testData = sqlContext.read.option("numFeatures",
numFeatures.toString)
+ .format("libsvm").load(t)
+ Array[DataFrame](inputData, testData)
}
case None => {
val f = params.fracTest
inputData.randomSplit(Array(1 - f, f), seed = 12345)
}
}
- val Array(train, test) = data.map(_.toDF().cache())
+ val Array(train, test) = data.map(_.cache())
// instantiate the base classifier
val classifier = new LogisticRegression()
http://git-wip-us.apache.org/repos/asf/spark/blob/4a1bcb26/examples/src/main/scala/org/apache/spark/examples/ml/RandomForestExample.scala
----------------------------------------------------------------------
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/RandomForestExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/RandomForestExample.scala
index 109178f..7a00d99 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/RandomForestExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/RandomForestExample.scala
@@ -159,7 +159,7 @@ object RandomForestExample {
val labelColName = if (algo == "classification") "indexedLabel" else "label"
if (algo == "classification") {
val labelIndexer = new StringIndexer()
- .setInputCol("labelString")
+ .setInputCol("label")
.setOutputCol(labelColName)
stages += labelIndexer
}
http://git-wip-us.apache.org/repos/asf/spark/blob/4a1bcb26/examples/src/main/scala/org/apache/spark/examples/ml/TrainValidationSplitExample.scala
----------------------------------------------------------------------
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/TrainValidationSplitExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/TrainValidationSplitExample.scala
index 1abdf21..cd1b0e9 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/TrainValidationSplitExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/TrainValidationSplitExample.scala
@@ -20,7 +20,6 @@ package org.apache.spark.examples.ml
import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.regression.LinearRegression
import org.apache.spark.ml.tuning.{ParamGridBuilder, TrainValidationSplit}
-import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkConf, SparkContext}
@@ -39,10 +38,9 @@ object TrainValidationSplitExample {
val conf = new SparkConf().setAppName("TrainValidationSplitExample")
val sc = new SparkContext(conf)
val sqlContext = new SQLContext(sc)
- import sqlContext.implicits._
// Prepare training and test data.
- val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF()
+ val data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
val Array(training, test) = data.randomSplit(Array(0.9, 0.1), seed = 12345)
val lr = new LinearRegression()
http://git-wip-us.apache.org/repos/asf/spark/blob/4a1bcb26/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala b/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala
index 1f62777..11b9815 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala
@@ -82,7 +82,7 @@ private[libsvm] class LibSVMRelation(val path: String, val numFeatures: Int, val
* .load("data/mllib/sample_libsvm_data.txt")
*
* // Java
- * DataFrame df = sqlContext.read.format("libsvm")
+ * DataFrame df = sqlContext.read().format("libsvm")
* .option("numFeatures, "780")
* .load("data/mllib/sample_libsvm_data.txt");
* }}}