This is an automated email from the ASF dual-hosted git repository.

ruifengz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new 4df9fa24f56 [SPARK-45521][ML] Avoid re-computation of nnz in 
`VectorAssembler`
4df9fa24f56 is described below

commit 4df9fa24f56161f7aab08611fa32efc1a89a0ab2
Author: Ruifeng Zheng <[email protected]>
AuthorDate: Fri Oct 13 08:37:04 2023 +0800

    [SPARK-45521][ML] Avoid re-computation of nnz in `VectorAssembler`
    
    ### What changes were proposed in this pull request?
    1, add a new private `compressed` method with a given `nnz`, since we 
sometimes already know it;
    2, minor change `Array.range(0, length)` -> `Iterator.range(0, length)` to 
avoid array creation;
    
    ### Why are the changes needed?
    in `VectorAssembler`, the `nnz` is already known before vector 
construction, so the scan to compute nnz can be skipped;
    
    ### Does this PR introduce _any_ user-facing change?
    no
    
    ### How was this patch tested?
    ci
    
    ### Was this patch authored or co-authored using generative AI tooling?
    no
    
    Closes #43353 from zhengruifeng/ml_vec_opt.
    
    Authored-by: Ruifeng Zheng <[email protected]>
    Signed-off-by: Ruifeng Zheng <[email protected]>
---
 .../src/main/scala/org/apache/spark/ml/linalg/Vectors.scala       | 5 +++--
 .../main/scala/org/apache/spark/ml/feature/VectorAssembler.scala  | 8 +++++---
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git 
a/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Vectors.scala 
b/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Vectors.scala
index 985f67fc3c3..827ca3f8b9d 100644
--- a/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Vectors.scala
+++ b/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Vectors.scala
@@ -184,8 +184,9 @@ sealed trait Vector extends Serializable {
    * Returns a vector in either dense or sparse format, whichever uses less 
storage.
    */
   @Since("2.0.0")
-  def compressed: Vector = {
-    val nnz = numNonzeros
+  def compressed: Vector = compressed(numNonzeros)
+
+  private[ml] def compressed(nnz: Int): Vector = {
     // A dense vector needs 8 * size + 8 bytes, while a sparse vector needs 12 
* nnz + 20 bytes.
     if (1.5 * (nnz + 1.0) < size) {
       toSparseWithSize(nnz)
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala 
b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala
index 7bc5e56aaeb..761352e34a3 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala
@@ -279,8 +279,8 @@ object VectorAssembler extends 
DefaultParamsReadable[VectorAssembler] {
         featureIndex += vec.size
       case null =>
         if (keepInvalid) {
-          val length: Int = lengths(inputColumnIndex)
-          Array.range(0, length).foreach { i =>
+          val length = lengths(inputColumnIndex)
+          Iterator.range(0, length).foreach { i =>
             indices += featureIndex + i
             values += Double.NaN
           }
@@ -295,6 +295,8 @@ object VectorAssembler extends 
DefaultParamsReadable[VectorAssembler] {
       case o =>
         throw new SparkException(s"$o of type ${o.getClass.getName} is not 
supported.")
     }
-    Vectors.sparse(featureIndex, indices.result(), values.result()).compressed
+
+    val (idxArray, valArray) = (indices.result(), values.result())
+    Vectors.sparse(featureIndex, idxArray, 
valArray).compressed(idxArray.length)
   }
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to