GitHub user jkbradley commented on a diff in the pull request:

    https://github.com/apache/spark/pull/2451#discussion_r17809784
  
    --- Diff: mllib/src/main/scala/org/apache/spark/mllib/optimization/MultiModelGradientDescent.scala ---
    @@ -0,0 +1,256 @@
    +/*
    + * Licensed to the Apache Software Foundation (ASF) under one or more
    + * contributor license agreements.  See the NOTICE file distributed with
    + * this work for additional information regarding copyright ownership.
    + * The ASF licenses this file to You under the Apache License, Version 2.0
    + * (the "License"); you may not use this file except in compliance with
    + * the License.  You may obtain a copy of the License at
    + *
    + *    http://www.apache.org/licenses/LICENSE-2.0
    + *
    + * Unless required by applicable law or agreed to in writing, software
    + * distributed under the License is distributed on an "AS IS" BASIS,
    + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    + * See the License for the specific language governing permissions and
    + * limitations under the License.
    + */
    +
    +package org.apache.spark.mllib.optimization
    +
    +import scala.collection.mutable.ArrayBuffer
    +
    +import breeze.linalg.{DenseVector => BDV}
    +
    +import org.apache.spark.annotation.{Experimental, DeveloperApi}
    +import org.apache.spark.Logging
    +import org.apache.spark.rdd.RDD
    +import org.apache.spark.mllib.linalg._
    +import org.apache.spark.mllib.rdd.RDDFunctions._
    +
    +class MultiModelGradientDescent private[mllib] (
    +    private var gradient: MultiModelGradient,
    +    private var updater: Array[MultiModelUpdater]) extends Optimizer[Matrix] with Logging {
    +
    +  private var stepSize: Array[Double] = Array(1.0, 0.1)
    +  private var numIterations: Array[Int] = Array(100)
    +  private var regParam: Array[Double] = Array(0.0, 0.1, 1.0)
    +  private var miniBatchFraction: Double = 1.0
    +
    +  /**
    +   * Set the initial step size of SGD for the first step. Default (1.0, 0.1).
    +   * In subsequent steps, the step size will decrease with stepSize/sqrt(t)
    +   */
    +  def setStepSize(step: Array[Double]): this.type = {
    +    this.stepSize = step
    +    this
    +  }
    +
    +  /**
    +   * :: Experimental ::
    +   * Set fraction of data to be used for each SGD iteration.
    +   * Default 1.0 (corresponding to deterministic/classical gradient descent)
    +   */
    +  @Experimental
    +  def setMiniBatchFraction(fraction: Double): this.type = {
    +    this.miniBatchFraction = fraction
    +    this
    +  }
    +
    +  /**
    +   * Set the number of iterations for SGD. Default 100.
    +   */
    +  def setNumIterations(iters: Array[Int]): this.type = {
    +    this.numIterations = iters
    +    this
    +  }
    +
    +  /**
    +   * Set the regularization parameter. Default (0.0, 0.1, 1.0).
    +   */
    +  def setRegParam(regParam: Array[Double]): this.type = {
    +    this.regParam = regParam
    +    this
    +  }
    +
    +  /**
    +   * Set the gradient function (of the loss function of one single data example)
    +   * to be used for SGD.
    +   */
    +  def setGradient(gradient: MultiModelGradient): this.type = {
    +    this.gradient = gradient
    +    this
    +  }
    +
    +
    +  /**
    +   * Set the updater function to actually perform a gradient step in a given direction.
    +   * The updater is responsible for performing the update from the regularization term as well,
    +   * and therefore determines what kind of regularization is used, if any.
    +   */
    +  def setUpdater(updater: Array[MultiModelUpdater]): this.type = {
    +    this.updater = updater
    +    this
    +  }
    +
    +  /**
    +   * :: DeveloperApi ::
    +   * Runs gradient descent on the given training data.
    +   * @param data training data
    +   * @param initialWeights initial weights
    +   * @return solution vector
    +   */
    +  @DeveloperApi
    +  def optimize(data: RDD[(Double, Vector)], initialWeights: Vector): Matrix = {
    +    val (weights, _) = MultiModelGradientDescent.runMiniBatchMMSGD(
    +      data,
    +      gradient,
    +      updater,
    +      stepSize,
    +      numIterations,
    +      regParam,
    +      miniBatchFraction,
    +      initialWeights)
    +    weights
    +  }
    +
    +}
    +
    +/**
    + * :: DeveloperApi ::
    + * Top-level method to run gradient descent.
    + */
    +@DeveloperApi
    +object MultiModelGradientDescent extends Logging {
    +  /**
    +   * Run stochastic gradient descent (SGD) in parallel using mini batches.
    +   * In each iteration, we sample a subset (fraction miniBatchFraction) of the total data
    +   * in order to compute a gradient estimate.
    +   * Sampling and averaging the subgradients over this subset are performed using one standard
    +   * Spark map-reduce in each iteration.
    +   *
    +   * @param data - Input data for SGD. RDD of the set of data examples, each of
    +   *               the form (label, [feature values]).
    +   * @param gradient - Gradient object (used to compute the gradient of the loss function of
    +   *                   one single data example)
    +   * @param updater - Updater function to actually perform a gradient step in a given direction.
    +   * @param stepSize - initial step size for the first step
    +   * @param numIterations - number of iterations that SGD should be run.
    +   * @param regParam - regularization parameter
    +   * @param miniBatchFraction - fraction of the input data set that should be used for
    +   *                            one iteration of SGD. Default value 1.0.
    +   *
    +   * @return A tuple containing two elements. The first element is a column matrix containing
    +   *         weights for every feature, and the second element is an array containing the
    +   *         stochastic loss computed for every iteration.
    +   */
    +  def runMiniBatchMMSGD(
    --- End diff --
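    The scaladoc above compresses the whole procedure into a few sentences. As
    a reading aid, here is the same mini-batch pattern for a single model: a
    hypothetical sketch, not code from the PR, using a plain squared-loss
    gradient and a Breeze vector in place of the MultiModelGradient /
    MultiModelUpdater machinery.
    
        import breeze.linalg.DenseVector
        import org.apache.spark.rdd.RDD
    
        // Hypothetical single-model sketch of the loop described above; the
        // PR's version runs many (stepSize, regParam, updater) combinations
        // at once.
        def miniBatchSGD(
            data: RDD[(Double, DenseVector[Double])],
            numIterations: Int,
            stepSize: Double,
            miniBatchFraction: Double,
            initialWeights: DenseVector[Double]): DenseVector[Double] = {
          val weights = initialWeights.copy
          for (i <- 1 to numIterations) {
            // One standard Spark map-reduce per iteration: sample a fraction
            // of the data, then sum per-example subgradients and example
            // counts. (Assumes the sample is non-empty.)
            val (gradSum, count) = data
              .sample(false, miniBatchFraction, 42 + i)
              .map { case (label, features) =>
                val err = (weights dot features) - label // squared-loss residual
                (features * err, 1L)
              }
              .reduce { case ((g1, c1), (g2, c2)) => (g1 + g2, c1 + c2) }
            // Average the subgradients and decay the step as stepSize / sqrt(t),
            // matching the class doc above.
            weights -= (gradSum / count.toDouble) * (stepSize / math.sqrt(i))
          }
          weights
        }
    
    The multi-model variant in the diff presumably produces one such column of
    weights per hyperparameter combination, which is why the setters take
    arrays and optimize returns a Matrix rather than a Vector.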
    
    Are we trying to keep things Java-friendly? (The default param values won't be.)
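    
    For reference, Scala default argument values compile to synthetic
    name$default$N methods, so a Java caller has to spell out every argument.
    One common workaround is a plain overload that bakes the default in; a
    hypothetical sketch (names and signature invented for illustration, not
    taken from the PR):
    
        import org.apache.spark.mllib.linalg.Vector
        import org.apache.spark.rdd.RDD
    
        object JavaFriendlySketch {
          // Full signature. Array[Double] erases to double[], so the array
          // hyperparameters themselves are usable from Java.
          def runMiniBatchMMSGD(
              data: RDD[(Double, Vector)],
              stepSize: Array[Double],
              regParam: Array[Double],
              miniBatchFraction: Double): Unit = {
            // ... optimizer body would go here ...
          }
    
          // Instead of declaring `miniBatchFraction: Double = 1.0` (unusable
          // from Java), an explicit overload supplies the default.
          def runMiniBatchMMSGD(
              data: RDD[(Double, Vector)],
              stepSize: Array[Double],
              regParam: Array[Double]): Unit =
            runMiniBatchMMSGD(data, stepSize, regParam, miniBatchFraction = 1.0)
        }
    
    The builder-style setters on the class itself are already Java-friendly,
    since a Java caller can simply skip a setter to keep the default.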

