Repository: spark Updated Branches: refs/heads/master 262833397 -> a539b724c
[SPARK-16260][ML][EXAMPLE] PySpark ML Example Improvements and Cleanup ## What changes were proposed in this pull request? 1). Remove unused import in Scala example; 2). Move the SparkSession import outside the example-off region; 3). Change parameter settings to match the Scala examples; 4). Change comments to be consistent; 5). Make sure that Scala and Python use the same data set; I did one pass and fixed the above issues. There are missing examples in Python, which might be added later. TODO: For some examples, there are comments on how to run the examples, but many are missing. We can add them later. ## How was this patch tested? (Please explain how this patch was tested. E.g. unit tests, integration tests, manual tests) Manually tested them. Author: [email protected] <[email protected]> Closes #14021 from wangmiao1981/ann. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a539b724 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a539b724 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a539b724 Branch: refs/heads/master Commit: a539b724c1d407083cb87abfa06d8bf213501057 Parents: 2628333 Author: [email protected] <[email protected]> Authored: Sun Jul 3 23:23:02 2016 -0700 Committer: Yanbo Liang <[email protected]> Committed: Sun Jul 3 23:23:02 2016 -0700 ---------------------------------------------------------------------- examples/src/main/python/ml/elementwise_product_example.py | 2 ++ examples/src/main/python/ml/polynomial_expansion_example.py | 2 +- examples/src/main/python/ml/quantile_discretizer_example.py | 2 +- examples/src/main/python/ml/random_forest_classifier_example.py | 2 +- examples/src/main/python/ml/simple_text_classification_pipeline.py | 2 +- .../main/scala/org/apache/spark/examples/ml/DataFrameExample.scala | 1 - .../org/apache/spark/examples/ml/GaussianMixtureExample.scala | 2 +- .../scala/org/apache/spark/examples/ml/NaiveBayesExample.scala | 2 +- 8 files changed, 8 insertions(+), 7 
deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/spark/blob/a539b724/examples/src/main/python/ml/elementwise_product_example.py ---------------------------------------------------------------------- diff --git a/examples/src/main/python/ml/elementwise_product_example.py b/examples/src/main/python/ml/elementwise_product_example.py index 598deae..5900539 100644 --- a/examples/src/main/python/ml/elementwise_product_example.py +++ b/examples/src/main/python/ml/elementwise_product_example.py @@ -30,10 +30,12 @@ if __name__ == "__main__": .getOrCreate() # $example on$ + # Create some vector data; also works for sparse vectors data = [(Vectors.dense([1.0, 2.0, 3.0]),), (Vectors.dense([4.0, 5.0, 6.0]),)] df = spark.createDataFrame(data, ["vector"]) transformer = ElementwiseProduct(scalingVec=Vectors.dense([0.0, 1.0, 2.0]), inputCol="vector", outputCol="transformedVector") + # Batch transform the vectors to create new column: transformer.transform(df).show() # $example off$ http://git-wip-us.apache.org/repos/asf/spark/blob/a539b724/examples/src/main/python/ml/polynomial_expansion_example.py ---------------------------------------------------------------------- diff --git a/examples/src/main/python/ml/polynomial_expansion_example.py b/examples/src/main/python/ml/polynomial_expansion_example.py index 9475e33..b46c1ba 100644 --- a/examples/src/main/python/ml/polynomial_expansion_example.py +++ b/examples/src/main/python/ml/polynomial_expansion_example.py @@ -35,7 +35,7 @@ if __name__ == "__main__": (Vectors.dense([0.0, 0.0]),), (Vectors.dense([0.6, -1.1]),)], ["features"]) - px = PolynomialExpansion(degree=2, inputCol="features", outputCol="polyFeatures") + px = PolynomialExpansion(degree=3, inputCol="features", outputCol="polyFeatures") polyDF = px.transform(df) for expanded in polyDF.select("polyFeatures").take(3): print(expanded) 
http://git-wip-us.apache.org/repos/asf/spark/blob/a539b724/examples/src/main/python/ml/quantile_discretizer_example.py ---------------------------------------------------------------------- diff --git a/examples/src/main/python/ml/quantile_discretizer_example.py b/examples/src/main/python/ml/quantile_discretizer_example.py index 5444cac..6f422f8 100644 --- a/examples/src/main/python/ml/quantile_discretizer_example.py +++ b/examples/src/main/python/ml/quantile_discretizer_example.py @@ -24,7 +24,7 @@ from pyspark.sql import SparkSession if __name__ == "__main__": - spark = SparkSession.builder.appName("PythonQuantileDiscretizerExample").getOrCreate() + spark = SparkSession.builder.appName("QuantileDiscretizerExample").getOrCreate() # $example on$ data = [(0, 18.0,), (1, 19.0,), (2, 8.0,), (3, 5.0,), (4, 2.2,)] http://git-wip-us.apache.org/repos/asf/spark/blob/a539b724/examples/src/main/python/ml/random_forest_classifier_example.py ---------------------------------------------------------------------- diff --git a/examples/src/main/python/ml/random_forest_classifier_example.py b/examples/src/main/python/ml/random_forest_classifier_example.py index a7fc765..eb9ded9 100644 --- a/examples/src/main/python/ml/random_forest_classifier_example.py +++ b/examples/src/main/python/ml/random_forest_classifier_example.py @@ -50,7 +50,7 @@ if __name__ == "__main__": (trainingData, testData) = data.randomSplit([0.7, 0.3]) # Train a RandomForest model. 
- rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures") + rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures", numTrees=10) # Chain indexers and forest in a Pipeline pipeline = Pipeline(stages=[labelIndexer, featureIndexer, rf]) http://git-wip-us.apache.org/repos/asf/spark/blob/a539b724/examples/src/main/python/ml/simple_text_classification_pipeline.py ---------------------------------------------------------------------- diff --git a/examples/src/main/python/ml/simple_text_classification_pipeline.py b/examples/src/main/python/ml/simple_text_classification_pipeline.py index 886f43c..b528b59 100644 --- a/examples/src/main/python/ml/simple_text_classification_pipeline.py +++ b/examples/src/main/python/ml/simple_text_classification_pipeline.py @@ -48,7 +48,7 @@ if __name__ == "__main__": # Configure an ML pipeline, which consists of tree stages: tokenizer, hashingTF, and lr. tokenizer = Tokenizer(inputCol="text", outputCol="words") - hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features") + hashingTF = HashingTF(numFeatures=1000, inputCol=tokenizer.getOutputCol(), outputCol="features") lr = LogisticRegression(maxIter=10, regParam=0.001) pipeline = Pipeline(stages=[tokenizer, hashingTF, lr]) http://git-wip-us.apache.org/repos/asf/spark/blob/a539b724/examples/src/main/scala/org/apache/spark/examples/ml/DataFrameExample.scala ---------------------------------------------------------------------- diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/DataFrameExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/DataFrameExample.scala index 11faa61..38c1c1c 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/DataFrameExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/DataFrameExample.scala @@ -20,7 +20,6 @@ package org.apache.spark.examples.ml import java.io.File -import com.google.common.io.Files import scopt.OptionParser 
import org.apache.spark.examples.mllib.AbstractParams http://git-wip-us.apache.org/repos/asf/spark/blob/a539b724/examples/src/main/scala/org/apache/spark/examples/ml/GaussianMixtureExample.scala ---------------------------------------------------------------------- diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/GaussianMixtureExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/GaussianMixtureExample.scala index c484ee5..2c2bf42 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/GaussianMixtureExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/GaussianMixtureExample.scala @@ -21,8 +21,8 @@ package org.apache.spark.examples.ml // $example on$ import org.apache.spark.ml.clustering.GaussianMixture -import org.apache.spark.sql.SparkSession // $example off$ +import org.apache.spark.sql.SparkSession /** * An example demonstrating Gaussian Mixture Model (GMM). http://git-wip-us.apache.org/repos/asf/spark/blob/a539b724/examples/src/main/scala/org/apache/spark/examples/ml/NaiveBayesExample.scala ---------------------------------------------------------------------- diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/NaiveBayesExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/NaiveBayesExample.scala index a59ba18..7089a4b 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/NaiveBayesExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/NaiveBayesExample.scala @@ -35,7 +35,7 @@ object NaiveBayesExample { val data = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt") // Split the data into training and test sets (30% held out for testing) - val Array(trainingData, testData) = data.randomSplit(Array(0.7, 0.3)) + val Array(trainingData, testData) = data.randomSplit(Array(0.7, 0.3), seed = 1234L) // Train a NaiveBayes model. 
val model = new NaiveBayes() --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
