Hello Spark fans,

I am trying to use the IDF model available in the spark mllib to create an
tf-idf representation of a n RDD[Vectors]. Below i have attached my MWE

I get the following error

"java.lang.IndexOutOfBoundsException: 7 not in [-4,4)
at breeze.linalg.DenseVector.apply$mcI$sp(DenseVector.scala:70)
at breeze.linalg.DenseVector.apply(DenseVector.scala:69)
at
org.apache.spark.mllib.feature.IDF$DocumentFrequencyAggregator.add(IDF.scala:81)
"

Any ideas?

Regards,
Shivani

import org.apache.spark.mllib.feature.VectorTransformer

import com.box.analytics.ml.dms.vector.{SparkSparseVector,SparkDenseVector}

import org.apache.spark.mllib.linalg.{DenseVector => SDV, SparseVector =>
SSV}

import org.apache.spark.mllib.linalg.{Vector => SparkVector}

import org.apache.spark.mllib.linalg.distributed.{IndexedRow,
IndexedRowMatrix}

import org.apache.spark.mllib.feature._


    val doc1s = new IndexedRow(1L, new SSV(4, Array(1, 3, 5, 7),Array(1.0,
1.0, 0.0, 5.0)))

    val doc2s = new IndexedRow(2L, new SSV(4, Array(1, 2, 4, 13),
Array(0.0, 1.0, 2.0, 0.0)))

    val doc3s = new IndexedRow(3L, new SSV(4, Array(10, 14, 20,
21),Array(2.0, 0.0, 2.0, 1.0)))

    val doc4s = new IndexedRow(4L, new SSV(4, Array(3, 7, 13,
20),Array(2.0, 0.0, 2.0, 1.0)))

 val indata = sc.parallelize(List(doc1s,doc2s,doc3s,doc4s)).map(e=>e.vector)

(new IDF()).fit(indata).idf

-- 
Software Engineer
Analytics Engineering Team@ Box
Mountain View, CA

Reply via email to