Github user dbtsai commented on a diff in the pull request:
https://github.com/apache/spark/pull/9843#discussion_r46011586
--- Diff: mllib/src/main/scala/org/apache/spark/mllib/feature/IDF.scala ---
@@ -211,14 +213,16 @@ private object IDFModel {
val n = v.size
v match {
case SparseVector(size, indices, values) =>
- val nnz = indices.size
- val newValues = new Array[Double](nnz)
+ val newElements = new ArrayBuffer[(Int, Double)]
var k = 0
- while (k < nnz) {
- newValues(k) = values(k) * idf(indices(k))
+ while (k < indices.size) {
+ val newValue = values(k) * idf(indices(k))
--- End diff --
Since `idf` can be sparse, lookup will not be constant time operation. You
need to make it 4 cases. Also, `indices.size` is not good since in each tight
iteration, the `size` method will be called. The following code should work,
but I just wrote it here without testing so may have some bugs. You need to add
the tests for four different scenarios.
```scala
(idf, v) match {
case (didf: DenseVector, dv: DenseVector) =>
val didfValues = didf.values
val dvValues = dv.values
val newValues = new Array[Double](n)
var j = 0
while (j < n) {
newValues(j) = dvValues(j) * didfValues(j)
j += 1
}
Vectors.dense(newValues)
case (didf: DenseVector, sv: SparseVector) =>
val didfValues = didf.values
val svIndices = sv.indices
val svValues = sv.values
val svNnz = svIndices.length
val newValues = new Array[Double](svNnz)
var k = 0
while (k < svNnz) {
newValues(k) = svValues(k) * didfValues(svIndices(k))
k += 1
}
Vectors.sparse(n, svIndices, newValues)
case (sidf: SparseVector, dv: DenseVector) =>
val dvValues = dv.values
val sidfIndices = sidf.indices
val sidfValues = sidf.values
val sidfNnz = sidfIndices.length
val newValues = new Array[Double](sidfNnz)
var k = 0
while (k < sidfNnz) {
newValues(k) = sidfValues(k) * dvValues(sidfIndices(k))
k += 1
}
Vectors.sparse(n, sidfIndices, newValues)
case (sidf: SparseVector, sv: SparseVector) =>
val (largeIndices, largeValues, largeNnz, smallIndices,
smallValues, smallNnz) = {
val sidfIndices = sidf.indices
val sidfValues = sidf.values
val sidfNnz = sidfIndices.length
val svIndices = sv.indices
val svValues = sv.values
val svNnz = svIndices.length
if (sidfNnz > svNnz) {
(sidfIndices, sidfValues, sidfNnz, svIndices, svValues, svNnz)
} else {
(svIndices, svValues, svNnz, sidfIndices, sidfValues, sidfNnz)
}
}
val newIndices = new ArrayBuffer[Int](smallNnz)
val newValues = new ArrayBuffer[Double](smallNnz)
var j = 0
var k = 0
while (j < smallNnz && k < largeNnz) {
val smallIndex = smallIndices(j)
val largeIndex = largeIndices(k)
if (smallIndex == largeIndex) {
newIndices.append(smallIndex)
newValues.append(smallValues(j) * largeValues(k))
j += 1
k += 1
} else if (smallIndex > largeIndex) {
k += 1
} else {
j += 1
}
}
Vectors.sparse(n, newIndices.toArray, newValues.toArray)
case (idfOther, vOther) =>
throw new UnsupportedOperationException(
s"Only sparse and dense vectors are supported but got
${idfOther.getClass} " +
s"for idf vector, and ${vOther.getClass} for term frequence
vector.")
}
```
---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at [email protected] or file a JIRA ticket
with INFRA.
---
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]