Github user MLnick commented on a diff in the pull request:
https://github.com/apache/spark/pull/12920#discussion_r62414456
--- Diff:
examples/src/main/scala/org/apache/spark/examples/ml/OneVsRestExample.scala ---
@@ -18,171 +18,71 @@
// scalastyle:off println
package org.apache.spark.examples.ml
-import java.util.concurrent.TimeUnit.{NANOSECONDS => NANO}
-
-import scopt.OptionParser
-
// $example on$
-import org.apache.spark.examples.mllib.AbstractParams
import org.apache.spark.ml.classification.{LogisticRegression, OneVsRest}
-import org.apache.spark.ml.util.MetadataUtils
import org.apache.spark.mllib.evaluation.MulticlassMetrics
-import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.sql.DataFrame
// $example off$
import org.apache.spark.sql.SparkSession
/**
* An example runner for Multiclass to Binary Reduction with One Vs Rest.
- * The example uses Logistic Regression as the base classifier. All
parameters that
- * can be specified on the base classifier can be passed in to the runner
options.
+ * The example uses Logistic Regression as the base classifier.
* Run with
* {{{
- * ./bin/run-example ml.OneVsRestExample [options]
+ * ./bin/run-example ml.OneVsRestExample
* }}}
- * For local mode, run
- * {{{
- * ./bin/spark-submit --class
org.apache.spark.examples.ml.OneVsRestExample --driver-memory 1g
- * [examples JAR path] [options]
- * }}}
- * If you use it as a template to create your own app, please use
`spark-submit` to submit your app.
*/
-object OneVsRestExample {
-
- case class Params private[ml] (
- input: String = null,
- testInput: Option[String] = None,
- maxIter: Int = 100,
- tol: Double = 1E-6,
- fitIntercept: Boolean = true,
- regParam: Option[Double] = None,
- elasticNetParam: Option[Double] = None,
- fracTest: Double = 0.2) extends AbstractParams[Params]
+object OneVsRestExample {
def main(args: Array[String]) {
- val defaultParams = Params()
-
- val parser = new OptionParser[Params]("OneVsRest Example") {
- head("OneVsRest Example: multiclass to binary reduction using
OneVsRest")
- opt[String]("input")
- .text("input path to labeled examples. This path must be
specified")
- .required()
- .action((x, c) => c.copy(input = x))
- opt[Double]("fracTest")
- .text(s"fraction of data to hold out for testing. If given option
testInput, " +
- s"this option is ignored. default: ${defaultParams.fracTest}")
- .action((x, c) => c.copy(fracTest = x))
- opt[String]("testInput")
- .text("input path to test dataset. If given, option fracTest is
ignored")
- .action((x, c) => c.copy(testInput = Some(x)))
- opt[Int]("maxIter")
- .text(s"maximum number of iterations for Logistic Regression." +
- s" default: ${defaultParams.maxIter}")
- .action((x, c) => c.copy(maxIter = x))
- opt[Double]("tol")
- .text(s"the convergence tolerance of iterations for Logistic
Regression." +
- s" default: ${defaultParams.tol}")
- .action((x, c) => c.copy(tol = x))
- opt[Boolean]("fitIntercept")
- .text(s"fit intercept for Logistic Regression." +
- s" default: ${defaultParams.fitIntercept}")
- .action((x, c) => c.copy(fitIntercept = x))
- opt[Double]("regParam")
- .text(s"the regularization parameter for Logistic Regression.")
- .action((x, c) => c.copy(regParam = Some(x)))
- opt[Double]("elasticNetParam")
- .text(s"the ElasticNet mixing parameter for Logistic Regression.")
- .action((x, c) => c.copy(elasticNetParam = Some(x)))
- checkConfig { params =>
- if (params.fracTest < 0 || params.fracTest >= 1) {
- failure(s"fracTest ${params.fracTest} value incorrect; should be
in [0,1).")
- } else {
- success
- }
- }
- }
- parser.parse(args, defaultParams).map { params =>
- run(params)
- }.getOrElse {
- sys.exit(1)
- }
- }
-
- private def run(params: Params) {
val spark = SparkSession
.builder
- .appName(s"OneVsRestExample with $params")
+ .appName(s"OneVsRestExample")
.getOrCreate()
+ import spark.implicits._
+
// $example on$
- val inputData = spark.read.format("libsvm").load(params.input)
- // compute the train/test split: if testInput is not provided use part
of input.
- val data = params.testInput match {
- case Some(t) =>
- // compute the number of features in the training set.
- val numFeatures = inputData.first().getAs[Vector](1).size
- val testData = spark.read.option("numFeatures",
numFeatures.toString)
- .format("libsvm").load(t)
- Array[DataFrame](inputData, testData)
- case None =>
- val f = params.fracTest
- inputData.randomSplit(Array(1 - f, f), seed = 12345)
- }
- val Array(train, test) = data.map(_.cache())
+ // load data file.
+ val inputData: DataFrame = spark.read.format("libsvm")
+ .load("data/mllib/sample_multiclass_classification_data.txt")
+
+ // generate the train/test split.
+ val Array(train, test) = inputData.randomSplit(Array(0.8, 0.2))
// instantiate the base classifier
val classifier = new LogisticRegression()
- .setMaxIter(params.maxIter)
- .setTol(params.tol)
- .setFitIntercept(params.fitIntercept)
-
- // Set regParam, elasticNetParam if specified in params
- params.regParam.foreach(classifier.setRegParam)
- params.elasticNetParam.foreach(classifier.setElasticNetParam)
+ .setMaxIter(10)
+ .setTol(1E-6)
+ .setFitIntercept(true)
// instantiate the One Vs Rest Classifier.
-
val ovr = new OneVsRest()
ovr.setClassifier(classifier)
--- End diff --
minor, but perhaps use `val ovr = new OneVsRest().setClassifier(...)` to
match the Java example.
---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at [email protected] or file a JIRA ticket
with INFRA.
---
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]