Hi All,
PFB code.
import org.apache.spark.ml.feature.{HashingTF, IDF}
import org.apache.spark.ml.linalg.SparseVector
import org.apache.spark.mllib.linalg.distributed.RowMatrix
import org.apache.spark.sql.SparkSession
import org.apache.spark.{SparkConf, SparkContext}
/**
* Created by satyajit on 12/7/16.
*/
object DIMSUMusingtf extends App {
val conf = new SparkConf()
.setMaster("local[1]")
.setAppName("testColsim")
val sc = new SparkContext(conf)
val spark = SparkSession
.builder
.appName("testColSim").getOrCreate()
import org.apache.spark.ml.feature.Tokenizer
val sentenceData = spark.createDataFrame(Seq(
(0, "Hi I heard about Spark"),
(0, "I wish Java could use case classes"),
(1, "Logistic regression models are neat")
)).toDF("label", "sentence")
val tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words")
val wordsData = tokenizer.transform(sentenceData)
val hashingTF = new HashingTF()
.setInputCol("words").setOutputCol("rawFeatures").setNumFeatures(20)
val featurizedData = hashingTF.transform(wordsData)
val idf = new IDF().setInputCol("rawFeatures").setOutputCol("features")
val idfModel = idf.fit(featurizedData)
val rescaledData = idfModel.transform(featurizedData)
rescaledData.show()
rescaledData.select("features", "label").take(3).foreach(println)
val check = rescaledData.select("features")
val row = check.rdd.map(row => row.getAs[SparseVector]("features"))
val mat = new RowMatrix(row) //i am basically trying to use
Dense.vector as a direct input to
rowMatrix, but i get an error that RowMatrix Cannot resolve constructor
row.foreach(println)
}
Any help would be appreciated.
Regards,
Satyajit.