srowen commented on a change in pull request #24963: [SPARK-28159][ML] Make the transform natively in ml framework to avoid extra conversion
URL: https://github.com/apache/spark/pull/24963#discussion_r298630098
##########
File path:
mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala
##########
@@ -75,40 +77,61 @@ class ChiSqSelectorModel @Since("1.3.0") (
private def compress(features: Vector): Vector = {
features match {
case SparseVector(size, indices, values) =>
- val newSize = filterIndices.length
- val newValues = new ArrayBuilder.ofDouble
- val newIndices = new ArrayBuilder.ofInt
- var i = 0
- var j = 0
- var indicesIdx = 0
- var filterIndicesIdx = 0
- while (i < indices.length && j < filterIndices.length) {
- indicesIdx = indices(i)
- filterIndicesIdx = filterIndices(j)
- if (indicesIdx == filterIndicesIdx) {
- newIndices += j
- newValues += values(i)
- j += 1
- i += 1
- } else {
- if (indicesIdx > filterIndicesIdx) {
- j += 1
- } else {
- i += 1
- }
- }
- }
- // TODO: Sparse representation might be ineffective if (newSize ~=
newValues.size)
- Vectors.sparse(newSize, newIndices.result(), newValues.result())
+ val (newIndices, newValues) = compressSparse(indices, values)
+ Vectors.sparse(filterIndices.length, newIndices, newValues)
case DenseVector(values) =>
- val values = features.toArray
- Vectors.dense(filterIndices.map(i => values(i)))
+ Vectors.dense(compressDense(values))
+ case other =>
+ throw new UnsupportedOperationException(
+ s"Only sparse and dense vectors are supported but got
${other.getClass}.")
+ }
+ }
+
+ private[spark] def compress(features: NewVector): NewVector = {
Review comment:
Likewise here, I don't think we want to handle .ml vectors in .mllib. I
think the idea is to make this .mllib method more generic, perhaps just
operating on indices and values?
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org
With regards,
Apache Git Services
---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org