Repository: spark
Updated Branches:
  refs/heads/master e9471d341 -> 6e9c3ff1e


[SPARK-8314][MLlib] improvement in performance of MLUtils.appendBias

The MLUtils.appendBias method is heavily used to create intercepts for linear
models.
It uses Breeze's vector concatenation, which is very slow compared to a plain
System.arraycopy. This change reimplements the method using
System.arraycopy.

I saw the following performance improvements after the change.
Benchmark on the MNIST dataset, run 50 times:
MLUtils.appendBias (SparseVector Before): 47320 ms
MLUtils.appendBias (SparseVector After): 1935 ms
MLUtils.appendBias (DenseVector Before): 5340 ms
MLUtils.appendBias (DenseVector After): 4080 ms
This is a speedup of more than 24x for SparseVectors (47320 ms / 1935 ms) and
about 1.3x for DenseVectors (5340 ms / 4080 ms).

Author: Roger Menezes <rmene...@netflix.com>

Closes #6768 from rogermenezes/improve-append-bias and squashes the following 
commits:

4e42f75 [Roger Menezes] address feedback
e999d79 [Roger Menezes] first commit


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/6e9c3ff1
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/6e9c3ff1
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/6e9c3ff1

Branch: refs/heads/master
Commit: 6e9c3ff1ecaf12a0126d83f27f5a4153ae420a34
Parents: e9471d3
Author: Roger Menezes <rmene...@netflix.com>
Authored: Fri Jun 12 18:29:58 2015 -0700
Committer: DB Tsai <d...@netflix.com>
Committed: Fri Jun 12 18:29:58 2015 -0700

----------------------------------------------------------------------
 .../examples/ml/LogisticRegressionExample.scala |  4 +--
 .../org/apache/spark/mllib/linalg/BLAS.scala    |  4 +--
 .../org/apache/spark/mllib/util/MLUtils.scala   | 26 ++++++++++++++++----
 3 files changed, 25 insertions(+), 9 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/6e9c3ff1/examples/src/main/scala/org/apache/spark/examples/ml/LogisticRegressionExample.scala
----------------------------------------------------------------------
diff --git 
a/examples/src/main/scala/org/apache/spark/examples/ml/LogisticRegressionExample.scala
 
b/examples/src/main/scala/org/apache/spark/examples/ml/LogisticRegressionExample.scala
index b12f833..3cf193f 100644
--- 
a/examples/src/main/scala/org/apache/spark/examples/ml/LogisticRegressionExample.scala
+++ 
b/examples/src/main/scala/org/apache/spark/examples/ml/LogisticRegressionExample.scala
@@ -145,9 +145,9 @@ object LogisticRegressionExample {
     val elapsedTime = (System.nanoTime() - startTime) / 1e9
     println(s"Training time: $elapsedTime seconds")
 
-    val lirModel = 
pipelineModel.stages.last.asInstanceOf[LogisticRegressionModel]
+    val lorModel = 
pipelineModel.stages.last.asInstanceOf[LogisticRegressionModel]
     // Print the weights and intercept for logistic regression.
-    println(s"Weights: ${lirModel.weights} Intercept: ${lirModel.intercept}")
+    println(s"Weights: ${lorModel.weights} Intercept: ${lorModel.intercept}")
 
     println("Training data results:")
     DecisionTreeExample.evaluateClassificationModel(pipelineModel, training, 
"indexedLabel")

http://git-wip-us.apache.org/repos/asf/spark/blob/6e9c3ff1/mllib/src/main/scala/org/apache/spark/mllib/linalg/BLAS.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/BLAS.scala 
b/mllib/src/main/scala/org/apache/spark/mllib/linalg/BLAS.scala
index 557119f..3523f18 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/BLAS.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/BLAS.scala
@@ -213,9 +213,9 @@ private[spark] object BLAS extends Serializable with 
Logging {
   def scal(a: Double, x: Vector): Unit = {
     x match {
       case sx: SparseVector =>
-        f2jBLAS.dscal(sx.values.size, a, sx.values, 1)
+        f2jBLAS.dscal(sx.values.length, a, sx.values, 1)
       case dx: DenseVector =>
-        f2jBLAS.dscal(dx.values.size, a, dx.values, 1)
+        f2jBLAS.dscal(dx.values.length, a, dx.values, 1)
       case _ =>
         throw new IllegalArgumentException(s"scal doesn't support vector type 
${x.getClass}.")
     }

http://git-wip-us.apache.org/repos/asf/spark/blob/6e9c3ff1/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala 
b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala
index 52d6468..7c5cfa7 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala
@@ -270,12 +270,28 @@ object MLUtils {
    * Returns a new vector with `1.0` (bias) appended to the input vector.
    */
   def appendBias(vector: Vector): Vector = {
-    val vector1 = vector.toBreeze match {
-      case dv: BDV[Double] => BDV.vertcat(dv, new BDV[Double](Array(1.0)))
-      case sv: BSV[Double] => BSV.vertcat(sv, new BSV[Double](Array(0), 
Array(1.0), 1))
-      case v: Any => throw new IllegalArgumentException("Do not support vector 
type " + v.getClass)
+    vector match {
+      case dv: DenseVector =>
+        val inputValues = dv.values
+        val inputLength = inputValues.length
+        val outputValues = Array.ofDim[Double](inputLength + 1)
+        System.arraycopy(inputValues, 0, outputValues, 0, inputLength)
+        outputValues(inputLength) = 1.0
+        Vectors.dense(outputValues)
+      case sv: SparseVector =>
+        val inputValues = sv.values
+        val inputIndices = sv.indices
+        val inputValuesLength = inputValues.length
+        val dim = sv.size
+        val outputValues = Array.ofDim[Double](inputValuesLength + 1)
+        val outputIndices = Array.ofDim[Int](inputValuesLength + 1)
+        System.arraycopy(inputValues, 0, outputValues, 0, inputValuesLength)
+        System.arraycopy(inputIndices, 0, outputIndices, 0, inputValuesLength)
+        outputValues(inputValuesLength) = 1.0
+        outputIndices(inputValuesLength) = dim
+        Vectors.sparse(dim + 1, outputIndices, outputValues)
+      case _ => throw new IllegalArgumentException(s"Do not support vector 
type ${vector.getClass}")
     }
-    Vectors.fromBreeze(vector1)
   }
 
   /**


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

Reply via email to