GitHub user holdenk commented on a diff in the pull request:
https://github.com/apache/spark/pull/19659#discussion_r149479801
--- Diff: mllib/src/test/scala/org/apache/spark/ml/feature/NGramSuite.scala ---
@@ -76,11 +76,32 @@ class NGramSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultRe
testNGram(nGram, dataset)
}
+ test("NGramLength in [2, 4] yields length 2, 3, 4 n-grams") {
+ val nGram = new NGram()
+ .setInputCol("inputTokens")
+ .setOutputCol("nGrams")
+ .setN(2)
+ .setMaxN(4)
+ val dataset = Seq(NGramTestData(
+ Array("a", "b", "c", "d", "e", "f", "g"),
+ Array(
+ "a b", "a b c", "a b c d",
+ "b c", "b c d", "b c d e",
+ "c d", "c d e", "c d e f",
+ "d e", "d e f", "d e f g",
+ "e f", "e f g",
+ "f g"
+ )
+ )).toDF()
+ testNGram(nGram, dataset)
+ }
+
test("read/write") {
val t = new NGram()
.setInputCol("myInputCol")
.setOutputCol("myOutputCol")
.setN(3)
+ .setMaxN(5)
testDefaultReadWrite(t)
--- End diff --
It would be good to make sure read/write still makes sense even when maxN isn't set, perhaps?
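
For concreteness, here is a minimal sketch of such a test in the style of the suite above. The test name is hypothetical, and it assumes maxN simply falls back to its default when unset (not something the PR confirms):

    test("read/write with maxN unset") {
      val t = new NGram()
        .setInputCol("myInputCol")
        .setOutputCol("myOutputCol")
        .setN(3)
      // maxN deliberately left unset, so save/load should round-trip
      // the default value rather than an explicitly set one
      testDefaultReadWrite(t)
    }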