This is an automated email from the ASF dual-hosted git repository.
ruifengz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 4df9fa24f56 [SPARK-45521][ML] Avoid re-computation of nnz in
`VectorAssembler`
4df9fa24f56 is described below
commit 4df9fa24f56161f7aab08611fa32efc1a89a0ab2
Author: Ruifeng Zheng <[email protected]>
AuthorDate: Fri Oct 13 08:37:04 2023 +0800
[SPARK-45521][ML] Avoid re-computation of nnz in `VectorAssembler`
### What changes were proposed in this pull request?
1, add a new private `compressed` method that takes a given `nnz`, since the
value is sometimes already known;
2, minor change `Array.range(0, length)` -> `Iterator.range(0, length)` to
avoid array creation;
### Why are the changes needed?
in `VectorAssembler`, the `nnz` is already known before vector
construction, so the scan to compute nnz can be skipped;
### Does this PR introduce _any_ user-facing change?
no
### How was this patch tested?
ci
### Was this patch authored or co-authored using generative AI tooling?
no
Closes #43353 from zhengruifeng/ml_vec_opt.
Authored-by: Ruifeng Zheng <[email protected]>
Signed-off-by: Ruifeng Zheng <[email protected]>
---
.../src/main/scala/org/apache/spark/ml/linalg/Vectors.scala | 5 +++--
.../main/scala/org/apache/spark/ml/feature/VectorAssembler.scala | 8 +++++---
2 files changed, 8 insertions(+), 5 deletions(-)
diff --git
a/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Vectors.scala
b/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Vectors.scala
index 985f67fc3c3..827ca3f8b9d 100644
--- a/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Vectors.scala
+++ b/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Vectors.scala
@@ -184,8 +184,9 @@ sealed trait Vector extends Serializable {
* Returns a vector in either dense or sparse format, whichever uses less
storage.
*/
@Since("2.0.0")
- def compressed: Vector = {
- val nnz = numNonzeros
+ def compressed: Vector = compressed(numNonzeros)
+
+ private[ml] def compressed(nnz: Int): Vector = {
// A dense vector needs 8 * size + 8 bytes, while a sparse vector needs 12
* nnz + 20 bytes.
if (1.5 * (nnz + 1.0) < size) {
toSparseWithSize(nnz)
diff --git
a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala
b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala
index 7bc5e56aaeb..761352e34a3 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorAssembler.scala
@@ -279,8 +279,8 @@ object VectorAssembler extends
DefaultParamsReadable[VectorAssembler] {
featureIndex += vec.size
case null =>
if (keepInvalid) {
- val length: Int = lengths(inputColumnIndex)
- Array.range(0, length).foreach { i =>
+ val length = lengths(inputColumnIndex)
+ Iterator.range(0, length).foreach { i =>
indices += featureIndex + i
values += Double.NaN
}
@@ -295,6 +295,8 @@ object VectorAssembler extends
DefaultParamsReadable[VectorAssembler] {
case o =>
throw new SparkException(s"$o of type ${o.getClass.getName} is not
supported.")
}
- Vectors.sparse(featureIndex, indices.result(), values.result()).compressed
+
+ val (idxArray, valArray) = (indices.result(), values.result())
+ Vectors.sparse(featureIndex, idxArray,
valArray).compressed(idxArray.length)
}
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]