Github user mengxr commented on a diff in the pull request:

    https://github.com/apache/spark/pull/5967#discussion_r29956523
  
    --- Diff: mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala ---
    @@ -44,45 +53,151 @@ private[classification] trait LogisticRegressionParams extends ProbabilisticClas
     @AlphaComponent
     class LogisticRegression
       extends ProbabilisticClassifier[Vector, LogisticRegression, LogisticRegressionModel]
    -  with LogisticRegressionParams {
    +  with LogisticRegressionParams with Logging {
     
    -  /** @group setParam */
    +  /**
    +   * Set the regularization parameter.
    +   * Default is 0.0.
    +   * @group setParam
    +   */
       def setRegParam(value: Double): this.type = set(regParam, value)
    +  setDefault(regParam -> 0.0)
     
    -  /** @group setParam */
    +  /**
    +   * Set the ElasticNet mixing parameter.
    +   * For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.
    +   * For 0 < alpha < 1, the penalty is a combination of L1 and L2.
    +   * Default is 0.0, which is an L2 penalty.
    +   * @group setParam
    +   */
    +  def setElasticNetParam(value: Double): this.type = set(elasticNetParam, value)
    +  setDefault(elasticNetParam -> 0.0)
    +
    +  /**
    +   * Set the maximum number of iterations.
    +   * Default is 100.
    +   * @group setParam
    +   */
       def setMaxIter(value: Int): this.type = set(maxIter, value)
    +  setDefault(maxIter -> 100)
    +
    +  /**
    +   * Set the convergence tolerance of iterations.
    +   * A smaller value will lead to higher accuracy at the cost of more iterations.
    +   * Default is 1E-6.
    +   * @group setParam
    +   */
    +  def setTol(value: Double): this.type = set(tol, value)
    +  setDefault(tol -> 1E-6)
     
       /** @group setParam */
       def setFitIntercept(value: Boolean): this.type = set(fitIntercept, value)
    +  setDefault(fitIntercept -> true)
     
       /** @group setParam */
       def setThreshold(value: Double): this.type = set(threshold, value)
    +  setDefault(threshold -> 0.5)
     
       override protected def train(dataset: DataFrame): LogisticRegressionModel = {
         // Extract columns from data.  If dataset is persisted, do not persist oldDataset.
    -    val oldDataset = extractLabeledPoints(dataset)
    +    val instances = extractLabeledPoints(dataset).map {
    +      case LabeledPoint(label: Double, features: Vector) => (label, features)
    +    }
         val handlePersistence = dataset.rdd.getStorageLevel == StorageLevel.NONE
    -    if (handlePersistence) {
    -      oldDataset.persist(StorageLevel.MEMORY_AND_DISK)
    +    if (handlePersistence) instances.persist(StorageLevel.MEMORY_AND_DISK)
    +
    +    val (summarizer, labelSummarizer) = instances.treeAggregate(
    +      (new MultivariateOnlineSummarizer, new MultiClassSummarizer))( {
    +        case ((summarizer: MultivariateOnlineSummarizer, labelSummarizer: MultiClassSummarizer),
    --- End diff ---
    
    I think we need to update `MultivariateOnlineSummarizer`'s constructor to allow users to set flags to indicate what to compute. Let's do this later :)
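    
    For context, something along these lines is what I have in mind. This is only a rough sketch: the class name, flag names, and defaults below are made up for illustration and are not the existing `MultivariateOnlineSummarizer` API.
    
    import org.apache.spark.mllib.linalg.Vector
    
    // Hypothetical sketch: constructor flags select which statistics to track.
    // Class and parameter names are illustrative, not an existing API.
    class SelectiveSummarizer(
        computeMean: Boolean = true,
        computeVariance: Boolean = true,
        computeNumNonzeros: Boolean = true) extends Serializable {
    
      private var dim = 0
      private var count = 0L
      private var sum: Array[Double] = _    // allocated only if mean/variance requested
      private var sumSq: Array[Double] = _  // allocated only if variance requested
      private var nnz: Array[Long] = _      // allocated only if nonzero counts requested
    
      /** Add a sample, updating only the accumulators whose flags are enabled. */
      def add(sample: Vector): this.type = {
        if (dim == 0) {
          dim = sample.size
          if (computeMean || computeVariance) sum = new Array[Double](dim)
          if (computeVariance) sumSq = new Array[Double](dim)
          if (computeNumNonzeros) nnz = new Array[Long](dim)
        }
        require(sample.size == dim, s"Dimension mismatch: $dim vs. ${sample.size}")
        val values = sample.toArray
        var i = 0
        while (i < dim) {
          val v = values(i)
          if (sum != null) sum(i) += v
          if (sumSq != null) sumSq(i) += v * v
          if (nnz != null && v != 0.0) nnz(i) += 1
          i += 1
        }
        count += 1
        this
      }
    }
    
    A matching `merge()` for use with `treeAggregate` would follow the same flag-guarded pattern, so a caller like this `train()` path could request only mean and variance and skip the rest of the bookkeeping.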

