Github user WeichenXu123 commented on a diff in the pull request:
https://github.com/apache/spark/pull/20964#discussion_r178783980
--- Diff:
mllib/src/test/scala/org/apache/spark/ml/feature/MinHashLSHSuite.scala ---
@@ -167,4 +166,20 @@ class MinHashLSHSuite extends SparkFunSuite with
MLlibTestSparkContext with Defa
assert(precision == 1.0)
assert(recall >= 0.7)
}
+
+ test("MinHashLSHModel.transform should work with Structured Streaming") {
+ val localSpark = spark
+ import localSpark.implicits._
+
+ val model = new MinHashLSHModel("mh", randCoefficients = Array((1, 0)))
+ model.set(model.inputCol, "keys")
+ testTransformer[Tuple1[Vector]](dataset.toDF(), model, "keys",
model.getOutputCol) {
+ case Row(_: Vector, output: Seq[_]) =>
+ assert(output.length === model.randCoefficients.length)
+ // no AND-amplification yet: SPARK-18450, so each hash output is
of length 1
+ output.foreach {
+ case hashOutput: Vector => assert(hashOutput.size === 1)
+ }
+ }
--- End diff --
Why not have an "expected" column here to compare against?
---
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]