Github user dbtsai commented on a diff in the pull request:

    https://github.com/apache/spark/pull/9843#discussion_r46011586
  
    --- Diff: mllib/src/main/scala/org/apache/spark/mllib/feature/IDF.scala ---
    @@ -211,14 +213,16 @@ private object IDFModel {
         val n = v.size
         v match {
           case SparseVector(size, indices, values) =>
    -        val nnz = indices.size
    -        val newValues = new Array[Double](nnz)
    +        val newElements = new ArrayBuffer[(Int, Double)]
             var k = 0
    -        while (k < nnz) {
    -          newValues(k) = values(k) * idf(indices(k))
    +        while (k < indices.size) {
    +          val newValue = values(k) * idf(indices(k))
    --- End diff --
    
    Since `idf` can be sparse, lookup will not be constant time operation. You 
need to make it 4 cases. Also, `indices.size` is not good since in each tight 
iteration, the `size` method will be called. The following code should work, 
but I just wrote it here without testing so may have some bugs. You need to add 
the tests for four different scenarios. 
    
    ```scala
        (idf, v) match {
          case (didf: DenseVector, dv: DenseVector) =>
            val didfValues = didf.values
            val dvValues = dv.values
    
            val newValues = new Array[Double](n)
            var j = 0
            while (j < n) {
              newValues(j) = dvValues(j) * didfValues(j)
              j += 1
            }
            Vectors.dense(newValues)
    
          case (didf: DenseVector, sv: SparseVector) =>
            val didfValues = didf.values
            val svIndices = sv.indices
            val svValues = sv.values
            val svNnz = svIndices.length
    
            val newValues = new Array[Double](svNnz)
            var k = 0
            while (k < svNnz) {
              newValues(k) = svValues(k) * didfValues(svIndices(k))
              k += 1
            }
            Vectors.sparse(n, svIndices, newValues)
    
          case (sidf: SparseVector, dv: DenseVector) =>
            val dvValues = dv.values
            val sidfIndices = sidf.indices
            val sidfValues = sidf.values
            val sidfNnz = sidfIndices.length
    
            val newValues = new Array[Double](sidfNnz)
            var k = 0
            while (k < sidfNnz) {
              newValues(k) = sidfValues(k) * dvValues(sidfIndices(k))
              k += 1
            }
            Vectors.sparse(n, sidfIndices, newValues)
    
          case (sidf: SparseVector, sv: SparseVector) =>
            val (largeIndices, largeValues, largeNnz, smallIndices, 
smallValues, smallNnz) = {
              val sidfIndices = sidf.indices
              val sidfValues = sidf.values
              val sidfNnz = sidfIndices.length
    
              val svIndices = sv.indices
              val svValues = sv.values
              val svNnz = svIndices.length
              if (sidfNnz > svNnz) {
                (sidfIndices, sidfValues, sidfNnz, svIndices, svValues, svNnz)
              } else {
                (svIndices, svValues, svNnz, sidfIndices, sidfValues, sidfNnz)
              }
            }
    
            val newIndices = new ArrayBuffer[Int](smallNnz)
            val newValues = new ArrayBuffer[Double](smallNnz)
    
            var j = 0
            var k = 0
            while (j < smallNnz && k < largeNnz) {
              val smallIndex = smallIndices(j)
              val largeIndex = largeIndices(k)
              
              if (smallIndex == largeIndex) {
                newIndices.append(smallIndex)
                newValues.append(smallValues(j) * largeValues(k))
                j += 1
                k += 1
              } else if (smallIndex > largeIndex) {
                k += 1
              } else {
                j += 1
              }
            }
            Vectors.sparse(n, newIndices.toArray, newValues.toArray)
    
          case (idfOther, vOther) =>
            throw new UnsupportedOperationException(
              s"Only sparse and dense vectors are supported but got 
${idfOther.getClass} " +
              s"for idf vector, and ${vOther.getClass} for term frequence 
vector.")
        }
    ```


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at [email protected] or file a JIRA ticket
with INFRA.
---

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to