This is what I have done. Is there a better way of doing this?
// Read the input as header-less CSV.
val df = spark.read.option("header", "false").csv("data")

// Feature stages: "_c2" -> "words" -> "tf" -> "tf-idf".
val tk = new Tokenizer().setInputCol("_c2").setOutputCol("words")
val tf = new HashingTF().setInputCol("words").setOutputCol("tf")
val idf = new IDF().setInputCol("tf").setOutputCol("tf-idf")

// Tokenise, then hash the tokens into term-frequency vectors.
val tokenised = tk.transform(df)
val df1 = tf.transform(tokenised)

// Fit the IDF model on the term frequencies and apply it to the same data.
val idfModel = idf.fit(df1)
val idfs = idfModel.transform(df1)

// Look up and print the nearest neighbour of a single document.
val queryUri = "http://dbpedia.org/resource/Barack_Obama"
println(nearestNeighbour(queryUri, idfs))
/**
 * Finds the document most similar to the one identified by `uri`.
 *
 * Similarity is the dot product of the "tf-idf" vectors, as computed by
 * `dorProduct`. Only candidates with a strictly positive score are
 * considered, and the row whose "_c0" equals `uri` is excluded.
 *
 * NOTE(review): the score is a raw dot product, not `cosineSimilarity`, so
 * longer documents are favoured - confirm this is the intended metric.
 *
 * @param uri value of column "_c0" identifying the query document
 * @param ds  data set with columns "_c0", "_c1" and a vector column "tf-idf"
 * @return the "_c1" value of the best-scoring other document
 * @throws NoSuchElementException if no row matches `uri`, or if no other
 *                                row has a positive score
 */
def nearestNeighbour(uri: String, ds: DataFrame): String = {
  // Column expressions instead of string-built SQL: a quote inside the URI
  // can no longer break (or alter) the predicate.
  val tfIdfSrc = ds.filter(ds("_c0") === uri)
    .take(1)
    .headOption
    .map(_.getAs[Vector]("tf-idf"))
    .getOrElse(throw new NoSuchElementException(s"No document found for '$uri'"))

  // Score every other document on the executors and ship back only
  // (score, "_c1"). Mutating driver-side vars from inside `foreach` does not
  // work: the closure runs against a serialized copy of those variables.
  val scored = ds.filter(ds("_c0") =!= uri).rdd
    .map { r =>
      (dorProduct(tfIdfSrc, r.getAs[Vector]("tf-idf")), r.getAs[String]("_c1"))
    }
    .filter { case (score, _) => score > 0.0 } // same threshold as before; also drops NaN

  scored.top(1)(Ordering.by[(Double, String), Double](_._1))
    .headOption
    .map { case (_, neighbour) => neighbour }
    .getOrElse(throw new NoSuchElementException(s"No neighbour with a positive score for '$uri'"))
}
/**
 * Cosine similarity of two vectors: dot(a, b) / (|a| * |b|).
 *
 * Only the active (explicitly stored) entries are visited, so the cost
 * follows the number of stored values rather than the full dimension.
 *
 * @param vectorA first vector
 * @param vectorB second vector; must have the same size as `vectorA`
 * @return the cosine of the angle between the vectors, or 0.0 when either
 *         vector has zero length (the original returned NaN in that case)
 * @throws IllegalArgumentException if the vector sizes differ
 */
def cosineSimilarity(vectorA: Vector, vectorB: Vector): Double = {
  require(vectorA.size == vectorB.size,
    s"Vector sizes differ: ${vectorA.size} vs ${vectorB.size}")

  var dotProduct = 0.0
  var normA = 0.0
  // Entries that are not active are zero and contribute nothing to either sum.
  vectorA.foreachActive { (i, a) =>
    dotProduct += a * vectorB(i)
    normA += a * a
  }

  var normB = 0.0
  vectorB.foreachActive { (_, b) => normB += b * b }

  val denominator = math.sqrt(normA) * math.sqrt(normB)
  // Guard against 0/0 for all-zero vectors.
  if (denominator == 0.0) 0.0 else dotProduct / denominator
}
/**
 * Dot product of two vectors.
 *
 * The name (sic) is kept as-is because `nearestNeighbour` calls it.
 * Only the active (explicitly stored) entries of `vectorA` are visited;
 * all other entries are zero and contribute nothing to the sum.
 *
 * @param vectorA first vector
 * @param vectorB second vector; must have the same size as `vectorA`
 * @return the sum over i of vectorA(i) * vectorB(i)
 * @throws IllegalArgumentException if the vector sizes differ
 */
def dorProduct(vectorA: Vector, vectorB: Vector): Double = {
  require(vectorA.size == vectorB.size,
    s"Vector sizes differ: ${vectorA.size} vs ${vectorB.size}")

  var dp = 0.0
  vectorA.foreachActive { (i, a) => dp += a * vectorB(i) }
  dp
}
On Sun, Nov 13, 2016 at 7:04 PM, Meeraj Kunnumpurath <
[email protected]> wrote:
> Hello,
>
> I have a dataset containing TF-IDF vectors for a corpus of documents. How
> do I perform a nearest neighbour search on the dataset, using cosine
> similarity?
>
> val df = spark.read.option("header", "false").csv("data")
>
> val tk = new Tokenizer().setInputCol("_c2").setOutputCol("words")
>
> val tf = new HashingTF().setInputCol("words").setOutputCol("tf")
>
> val idf = new IDF().setInputCol("tf").setOutputCol("tf-idf")
>
> val df1 = tf.transform(tk.transform(df))
>
> idf.fit(df1).transform(df1).select("tf-idf").show(10)
> Thank you
>
> --
> *Meeraj Kunnumpurath*
>
>
> *Director and Executive Principal, Service Symphony Ltd, 00 44 7702 693597*
>
> *00 971 50 409 [email protected] <[email protected]>*
>
--
*Meeraj Kunnumpurath*
*Director and Executive Principal, Service Symphony Ltd, 00 44 7702 693597*
*00 971 50 409 [email protected] <[email protected]>*