[GitHub] spark pull request #20367: [SPARK-23166][ML] Add maxDF Parameter to CountVec...
Github user ymazari commented on a diff in the pull request: https://github.com/apache/spark/pull/20367#discussion_r164275764

--- Diff: mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala ---
@@ -155,24 +182,48 @@ class CountVectorizer @Since("1.5.0") (@Since("1.5.0") override val uid: String)
     transformSchema(dataset.schema, logging = true)
     val vocSize = $(vocabSize)
     val input = dataset.select($(inputCol)).rdd.map(_.getAs[Seq[String]](0))
+    val countingRequired = $(minDF) < 1.0 || $(maxDF) < 1.0
+    val maybeInputSize = if (countingRequired) {
+      Some(input.cache().count())
+    } else {
+      None
+    }
     val minDf = if ($(minDF) >= 1.0) {
       $(minDF)
     } else {
-      $(minDF) * input.cache().count()
+      $(minDF) * maybeInputSize.get
     }
-    val wordCounts: RDD[(String, Long)] = input.flatMap { case (tokens) =>
+    val maxDf = if ($(maxDF) >= 1.0) {
+      $(maxDF)
+    } else {
+      $(maxDF) * maybeInputSize.get
+    }
+    require(maxDf >= minDf, "maxDF must be >= minDF.")
+    val allWordCounts = input.flatMap { case (tokens) =>
       val wc = new OpenHashMap[String, Long]
       tokens.foreach { w =>
         wc.changeValue(w, 1L, _ + 1L)
       }
       wc.map { case (word, count) => (word, (count, 1)) }
     }.reduceByKey { case ((wc1, df1), (wc2, df2)) =>
       (wc1 + wc2, df1 + df2)
-    }.filter { case (word, (wc, df)) =>
-      df >= minDf
-    }.map { case (word, (count, dfCount)) =>
-      (word, count)
-    }.cache()
+    }
+
+    val filteringRequired = isSet(minDF) || isSet(maxDF)
--- End diff --

Making a variable here for the sake of clarity.
[GitHub] spark pull request #20367: [SPARK-23166][ML] Add maxDF Parameter to CountVec...
Github user ymazari commented on a diff in the pull request: https://github.com/apache/spark/pull/20367#discussion_r164275714

--- Diff: mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala ---
@@ -155,24 +182,47 @@ class CountVectorizer @Since("1.5.0") (@Since("1.5.0") override val uid: String)
     transformSchema(dataset.schema, logging = true)
     val vocSize = $(vocabSize)
     val input = dataset.select($(inputCol)).rdd.map(_.getAs[Seq[String]](0))
+    val filteringRequired = isSet(minDF) || isSet(maxDF)
+    val maybeInputSize = if (filteringRequired) {
+      Some(input.cache().count())
+    } else {
+      None
+    }
     val minDf = if ($(minDF) >= 1.0) {
       $(minDF)
     } else {
-      $(minDF) * input.cache().count()
+      $(minDF) * maybeInputSize.getOrElse(1L)
     }
-    val wordCounts: RDD[(String, Long)] = input.flatMap { case (tokens) =>
+    val maxDf = if ($(maxDF) >= 1.0) {
+      $(maxDF)
+    } else {
+      $(maxDF) * maybeInputSize.getOrElse(1L)
+    }
+    require(maxDf >= minDf, "maxDF must be >= minDF.")
+    val allWordCounts = input.flatMap { case (tokens) =>
       val wc = new OpenHashMap[String, Long]
       tokens.foreach { w =>
         wc.changeValue(w, 1L, _ + 1L)
       }
       wc.map { case (word, count) => (word, (count, 1)) }
     }.reduceByKey { case ((wc1, df1), (wc2, df2)) =>
       (wc1 + wc2, df1 + df2)
-    }.filter { case (word, (wc, df)) =>
-      df >= minDf
-    }.map { case (word, (count, dfCount)) =>
-      (word, count)
-    }.cache()
+    }
+
+    val maybeFilteredWordCounts = if (filteringRequired) {
--- End diff --

Done.
[GitHub] spark pull request #20367: [SPARK-23166][ML] Add maxDF Parameter to CountVec...
Github user ymazari commented on a diff in the pull request: https://github.com/apache/spark/pull/20367#discussion_r164275722

--- Diff: mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala ---
@@ -155,24 +182,47 @@ class CountVectorizer @Since("1.5.0") (@Since("1.5.0") override val uid: String)
     transformSchema(dataset.schema, logging = true)
     val vocSize = $(vocabSize)
     val input = dataset.select($(inputCol)).rdd.map(_.getAs[Seq[String]](0))
+    val filteringRequired = isSet(minDF) || isSet(maxDF)
+    val maybeInputSize = if (filteringRequired) {
+      Some(input.cache().count())
+    } else {
+      None
+    }
     val minDf = if ($(minDF) >= 1.0) {
       $(minDF)
     } else {
-      $(minDF) * input.cache().count()
+      $(minDF) * maybeInputSize.getOrElse(1L)
    }
-    val wordCounts: RDD[(String, Long)] = input.flatMap { case (tokens) =>
+    val maxDf = if ($(maxDF) >= 1.0) {
+      $(maxDF)
+    } else {
+      $(maxDF) * maybeInputSize.getOrElse(1L)
+    }
+    require(maxDf >= minDf, "maxDF must be >= minDF.")
+    val allWordCounts = input.flatMap { case (tokens) =>
       val wc = new OpenHashMap[String, Long]
       tokens.foreach { w =>
         wc.changeValue(w, 1L, _ + 1L)
       }
       wc.map { case (word, count) => (word, (count, 1)) }
     }.reduceByKey { case ((wc1, df1), (wc2, df2)) =>
       (wc1 + wc2, df1 + df2)
-    }.filter { case (word, (wc, df)) =>
-      df >= minDf
-    }.map { case (word, (count, dfCount)) =>
-      (word, count)
-    }.cache()
+    }
+
+    val maybeFilteredWordCounts = if (filteringRequired) {
+      allWordCounts.filter { case (word, (wc, df)) => (df >= minDf) && (df <= maxDf) }
+    } else {
+      allWordCounts
+    }
+
+    val wordCounts = maybeFilteredWordCounts
+      .map { case (word, (count, dfCount)) => (word, count) }
--- End diff --

Changed.
[GitHub] spark pull request #20367: [SPARK-23166][ML] Add maxDF Parameter to CountVec...
Github user ymazari commented on a diff in the pull request: https://github.com/apache/spark/pull/20367#discussion_r164275721

--- Diff: mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala ---
@@ -155,24 +182,47 @@ class CountVectorizer @Since("1.5.0") (@Since("1.5.0") override val uid: String)
     transformSchema(dataset.schema, logging = true)
     val vocSize = $(vocabSize)
    val input = dataset.select($(inputCol)).rdd.map(_.getAs[Seq[String]](0))
+    val filteringRequired = isSet(minDF) || isSet(maxDF)
+    val maybeInputSize = if (filteringRequired) {
+      Some(input.cache().count())
+    } else {
+      None
+    }
     val minDf = if ($(minDF) >= 1.0) {
       $(minDF)
     } else {
-      $(minDF) * input.cache().count()
+      $(minDF) * maybeInputSize.getOrElse(1L)
     }
-    val wordCounts: RDD[(String, Long)] = input.flatMap { case (tokens) =>
+    val maxDf = if ($(maxDF) >= 1.0) {
+      $(maxDF)
+    } else {
+      $(maxDF) * maybeInputSize.getOrElse(1L)
+    }
+    require(maxDf >= minDf, "maxDF must be >= minDF.")
+    val allWordCounts = input.flatMap { case (tokens) =>
       val wc = new OpenHashMap[String, Long]
       tokens.foreach { w =>
         wc.changeValue(w, 1L, _ + 1L)
       }
       wc.map { case (word, count) => (word, (count, 1)) }
     }.reduceByKey { case ((wc1, df1), (wc2, df2)) =>
       (wc1 + wc2, df1 + df2)
-    }.filter { case (word, (wc, df)) =>
-      df >= minDf
-    }.map { case (word, (count, dfCount)) =>
-      (word, count)
-    }.cache()
+    }
+
+    val maybeFilteredWordCounts = if (filteringRequired) {
+      allWordCounts.filter { case (word, (wc, df)) => (df >= minDf) && (df <= maxDf) }
--- End diff --

Changed.
[GitHub] spark pull request #20367: [SPARK-23166][ML] Add maxDF Parameter to CountVec...
Github user ymazari commented on a diff in the pull request: https://github.com/apache/spark/pull/20367#discussion_r164275712

--- Diff: mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala ---
@@ -155,24 +182,47 @@ class CountVectorizer @Since("1.5.0") (@Since("1.5.0") override val uid: String)
     transformSchema(dataset.schema, logging = true)
     val vocSize = $(vocabSize)
     val input = dataset.select($(inputCol)).rdd.map(_.getAs[Seq[String]](0))
+    val filteringRequired = isSet(minDF) || isSet(maxDF)
+    val maybeInputSize = if (filteringRequired) {
+      Some(input.cache().count())
+    } else {
+      None
+    }
     val minDf = if ($(minDF) >= 1.0) {
       $(minDF)
     } else {
-      $(minDF) * input.cache().count()
+      $(minDF) * maybeInputSize.getOrElse(1L)
     }
-    val wordCounts: RDD[(String, Long)] = input.flatMap { case (tokens) =>
+    val maxDf = if ($(maxDF) >= 1.0) {
+      $(maxDF)
+    } else {
+      $(maxDF) * maybeInputSize.getOrElse(1L)
--- End diff --

Changed.
[GitHub] spark pull request #20367: [SPARK-23166][ML] Add maxDF Parameter to CountVec...
Github user ymazari commented on a diff in the pull request: https://github.com/apache/spark/pull/20367#discussion_r164275697

--- Diff: mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala ---
@@ -155,24 +182,47 @@ class CountVectorizer @Since("1.5.0") (@Since("1.5.0") override val uid: String)
     transformSchema(dataset.schema, logging = true)
     val vocSize = $(vocabSize)
     val input = dataset.select($(inputCol)).rdd.map(_.getAs[Seq[String]](0))
+    val filteringRequired = isSet(minDF) || isSet(maxDF)
--- End diff --

Changed.
[GitHub] spark pull request #20367: [SPARK-23166][ML] Add maxDF Parameter to CountVec...
Github user ymazari commented on a diff in the pull request: https://github.com/apache/spark/pull/20367#discussion_r164275706

--- Diff: mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala ---
@@ -155,24 +182,47 @@ class CountVectorizer @Since("1.5.0") (@Since("1.5.0") override val uid: String)
     transformSchema(dataset.schema, logging = true)
     val vocSize = $(vocabSize)
     val input = dataset.select($(inputCol)).rdd.map(_.getAs[Seq[String]](0))
+    val filteringRequired = isSet(minDF) || isSet(maxDF)
+    val maybeInputSize = if (filteringRequired) {
+      Some(input.cache().count())
+    } else {
+      None
+    }
     val minDf = if ($(minDF) >= 1.0) {
       $(minDF)
     } else {
-      $(minDF) * input.cache().count()
+      $(minDF) * maybeInputSize.getOrElse(1L)
--- End diff --

Right. I guess I was overdoing it here.
[GitHub] spark pull request #20367: [SPARK-23166][ML] Add maxDF Parameter to CountVec...
Github user ymazari commented on a diff in the pull request: https://github.com/apache/spark/pull/20367#discussion_r164260027

--- Diff: mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala ---
@@ -160,6 +187,11 @@ class CountVectorizer @Since("1.5.0") (@Since("1.5.0") override val uid: String)
     } else {
       $(minDF) * input.cache().count()
     }
+    val maxDf = if ($(maxDF) >= 1.0) {
+      $(maxDF)
+    } else {
+      $(maxDF) * input.cache().count()
+    }
--- End diff --

Good points.
- I added a check that maxDF >= minDF.
- I changed the code so that counting (and caching) is done only once.
- I refactored the code so that "filter()" is only invoked if minDF or maxDF is set.
- I added un-persisting of the input after the counting is done.
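Outside the diff context, the "count (and cache) only once, then unpersist" restructuring boils down to the pattern sketched below. The toy RDD, threshold values, and object name are illustrative only, and the exact placement of the unpersist call in the PR is not shown in the quoted diffs.

```scala
import org.apache.spark.sql.SparkSession

// Stand-alone sketch of the "count (and cache) only once, then unpersist" pattern.
object CountOnceSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("CountOnceSketch").getOrCreate()
    // Toy tokenized corpus standing in for the CountVectorizer input.
    val input = spark.sparkContext.parallelize(Seq(Seq("a", "b"), Seq("a"), Seq("a", "c")))

    val minDFParam = 0.5  // fractional thresholds need the corpus size
    val maxDFParam = 0.9
    val countingRequired = minDFParam < 1.0 || maxDFParam < 1.0
    // Cache and count the input at most once, instead of once per threshold.
    val maybeInputSize = if (countingRequired) Some(input.cache().count()) else None

    val minDf = if (minDFParam >= 1.0) minDFParam else minDFParam * maybeInputSize.get
    val maxDf = if (maxDFParam >= 1.0) maxDFParam else maxDFParam * maybeInputSize.get
    require(maxDf >= minDf, "maxDF must be >= minDF.")

    // ... document-frequency counting and filtering would happen here ...

    // Release the cached input once it is no longer needed.
    if (countingRequired) input.unpersist()
    spark.stop()
  }
}
```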
[GitHub] spark issue #20367: [SPARK-23166][ML] Add maxDF Parameter to CountVectorizer
Github user ymazari commented on the issue: https://github.com/apache/spark/pull/20367

@srowen It seems that this PR needs Admin approval. Could you please help get it to the next step? Thanks.
[GitHub] spark pull request #20367: [SPARK-23166][ML] Add maxDF Parameter to CountVec...
Github user ymazari commented on a diff in the pull request: https://github.com/apache/spark/pull/20367#discussion_r163465302

--- Diff: mllib/src/test/scala/org/apache/spark/ml/feature/CountVectorizerSuite.scala ---
@@ -119,6 +119,41 @@ class CountVectorizerSuite extends SparkFunSuite with MLlibTestSparkContext
     }
   }
+
+  test("CountVectorizer maxDF") {
+    val df = Seq(
+      (0, split("a b c d"), Vectors.sparse(3, Seq((0, 1.0), (1, 1.0), (2, 1.0)))),
+      (1, split("a b c"), Vectors.sparse(3, Seq((0, 1.0), (1, 1.0)))),
+      (2, split("a b"), Vectors.sparse(3, Seq((0, 1.0)))),
+      (3, split("a"), Vectors.sparse(3, Seq()))
+    ).toDF("id", "words", "expected")
+
+    // maxDF: ignore terms with count more than 3
+    val cvModel = new CountVectorizer()
+      .setInputCol("words")
+      .setOutputCol("features")
+      .setMaxDF(3)
+      .fit(df)
+    assert(cvModel.vocabulary === Array("b", "c", "d"))
+
+    cvModel.transform(df).select("features", "expected").collect().foreach {
+      case Row(features: Vector, expected: Vector) =>
+        assert(features ~== expected absTol 1e-14)
+    }
+
+    // maxDF: ignore terms with freq > 0.75
+    val cvModel2 = new CountVectorizer()
+      .setInputCol("words")
+      .setOutputCol("features")
+      .setMaxDF(0.75)
+      .fit(df)
+    assert(cvModel2.vocabulary === Array("b", "c", "d"))
+
+    cvModel2.transform(df).select("features", "expected").collect().foreach {
+      case Row(features: Vector, expected: Vector) =>
+        assert(features ~== expected absTol 1e-14)
--- End diff --

Done.
[GitHub] spark pull request #20367: [SPARK-23166][ML] Add maxDF Parameter to CountVec...
Github user ymazari commented on a diff in the pull request: https://github.com/apache/spark/pull/20367#discussion_r163359719

--- Diff: mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala ---
@@ -113,7 +132,11 @@ private[feature] trait CountVectorizerParams extends Params with HasInputCol wit
   /** @group getParam */
   def getBinary: Boolean = $(binary)

-  setDefault(vocabSize -> (1 << 18), minDF -> 1.0, minTF -> 1.0, binary -> false)
+  setDefault(vocabSize -> (1 << 18),
+    minDF -> 1.0,
+    maxDF -> Long.MaxValue,
--- End diff --

> by @mgaido91: what about avoiding to set a default value and apply the filter only if it was set?

That'd be easy to do, but would make the code a couple lines longer. I will do it if you think it's really better.
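The alternative raised in the quoted question — set no default and apply the filter only when the parameter was explicitly set — amounts to the pattern below. This is a plain-Scala sketch: the document-frequency map and the Option standing in for isSet(maxDF) are illustrative, not the actual ML Param API.

```scala
// Plain-Scala sketch: apply the document-frequency cap only when it was explicitly set.
object OptionalFilterSketch {
  def main(args: Array[String]): Unit = {
    val docFreqs = Map("a" -> 4L, "b" -> 3L, "c" -> 2L, "d" -> 1L)
    // None plays the role of "maxDF was never set", so no sentinel default is needed.
    val maybeMaxDf: Option[Long] = Some(3L)

    val kept = maybeMaxDf match {
      case Some(maxDf) => docFreqs.filter { case (_, df) => df <= maxDf }
      case None        => docFreqs  // parameter not set: skip filtering entirely
    }
    println(kept)  // "a" (df = 4) is dropped; b, c, d are kept
  }
}
```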
[GitHub] spark pull request #20367: [SPARK-23166][ML] Add maxDF Parameter to CountVec...
Github user ymazari commented on a diff in the pull request: https://github.com/apache/spark/pull/20367#discussion_r163358962

--- Diff: mllib/src/test/scala/org/apache/spark/ml/feature/CountVectorizerSuite.scala ---
@@ -119,6 +119,41 @@ class CountVectorizerSuite extends SparkFunSuite with MLlibTestSparkContext
     }
   }
+
+  test("CountVectorizer maxDF") {
+    val df = Seq(
+      (0, split("a b c d"), Vectors.sparse(3, Seq((0, 1.0), (1, 1.0), (2, 1.0)))),
+      (1, split("a b c"), Vectors.sparse(3, Seq((0, 1.0), (1, 1.0)))),
+      (2, split("a b"), Vectors.sparse(3, Seq((0, 1.0)))),
+      (3, split("a"), Vectors.sparse(3, Seq()))
+    ).toDF("id", "words", "expected")
+
+    // maxDF: ignore terms with count more than 3
+    val cvModel = new CountVectorizer()
+      .setInputCol("words")
+      .setOutputCol("features")
+      .setMaxDF(3)
+      .fit(df)
+    assert(cvModel.vocabulary === Array("b", "c", "d"))
+
+    cvModel.transform(df).select("features", "expected").collect().foreach {
+      case Row(features: Vector, expected: Vector) =>
+        assert(features ~== expected absTol 1e-14)
+    }
+
+    // maxDF: ignore terms with freq > 0.75
+    val cvModel2 = new CountVectorizer()
+      .setInputCol("words")
+      .setOutputCol("features")
+      .setMaxDF(0.75)
+      .fit(df)
+    assert(cvModel2.vocabulary === Array("b", "c", "d"))
+
+    cvModel2.transform(df).select("features", "expected").collect().foreach {
+      case Row(features: Vector, expected: Vector) =>
+        assert(features ~== expected absTol 1e-14)
--- End diff --

> by @mgaido91: may you please also add a UT to check that setting both maxDF and minDF works as expected?

Yes, I will.
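A combined minDF/maxDF test could reuse the same four-document corpus, in the style of the maxDF test quoted above. The thresholds, expected vocabulary, and expected vectors below are worked out by hand as a sketch (assuming the suite's existing split and ~== helpers); it is not necessarily the test that was ultimately committed.

```scala
  test("CountVectorizer using both minDF and maxDF") {
    // Document frequencies in the corpus: a = 4, b = 3, c = 2, d = 1.
    // With minDF = 2 and maxDF = 3, only "b" and "c" should remain in the vocabulary.
    val df = Seq(
      (0, split("a b c d"), Vectors.sparse(2, Seq((0, 1.0), (1, 1.0)))),
      (1, split("a b c"), Vectors.sparse(2, Seq((0, 1.0), (1, 1.0)))),
      (2, split("a b"), Vectors.sparse(2, Seq((0, 1.0)))),
      (3, split("a"), Vectors.sparse(2, Seq()))
    ).toDF("id", "words", "expected")

    val cvModel = new CountVectorizer()
      .setInputCol("words")
      .setOutputCol("features")
      .setMinDF(2)
      .setMaxDF(3)
      .fit(df)
    assert(cvModel.vocabulary === Array("b", "c"))

    cvModel.transform(df).select("features", "expected").collect().foreach {
      case Row(features: Vector, expected: Vector) =>
        assert(features ~== expected absTol 1e-14)
    }
  }
```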
[GitHub] spark pull request #20367: [SPARK-23166][ML] Add maxDF Parameter to CountVec...
Github user ymazari commented on a diff in the pull request: https://github.com/apache/spark/pull/20367#discussion_r163358747

--- Diff: mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala ---
@@ -169,7 +201,7 @@ class CountVectorizer @Since("1.5.0") (@Since("1.5.0") override val uid: String)
     }.reduceByKey { case ((wc1, df1), (wc2, df2)) =>
       (wc1 + wc2, df1 + df2)
     }.filter { case (word, (wc, df)) =>
-      df >= minDf
+      (df >= minDf) && (df <= maxDf)
--- End diff --

> from @mgaido91: nit: the parenthesis are not needed

Right. I added them for the purpose of clarity.
[GitHub] spark issue #20367: [SPARK-23166][ML] Add maxDF Parameter to CountVectorizer
Github user ymazari commented on the issue: https://github.com/apache/spark/pull/20367

@mgaido91 I closed the previous PR and opened this one. I am answering your comments here.
[GitHub] spark pull request #20366: [SPARK-23166] [ML] Add maxDF Parameter to CountVe...
Github user ymazari commented on a diff in the pull request: https://github.com/apache/spark/pull/20366#discussion_r163355218

--- Diff: mllib/src/test/scala/org/apache/spark/ml/feature/CountVectorizerSuite.scala ---
@@ -119,6 +119,41 @@ class CountVectorizerSuite extends SparkFunSuite with MLlibTestSparkContext
     }
   }
+
+  test("CountVectorizer maxDF") {
+    val df = Seq(
+      (0, split("a b c d"), Vectors.sparse(3, Seq((0, 1.0), (1, 1.0), (2, 1.0)))),
+      (1, split("a b c"), Vectors.sparse(3, Seq((0, 1.0), (1, 1.0)))),
+      (2, split("a b"), Vectors.sparse(3, Seq((0, 1.0)))),
+      (3, split("a"), Vectors.sparse(3, Seq()))
+    ).toDF("id", "words", "expected")
+
+    // maxDF: ignore terms with count more than 3
+    val cvModel = new CountVectorizer()
+      .setInputCol("words")
+      .setOutputCol("features")
+      .setMaxDF(3)
+      .fit(df)
+    assert(cvModel.vocabulary === Array("b", "c", "d"))
+
+    cvModel.transform(df).select("features", "expected").collect().foreach {
+      case Row(features: Vector, expected: Vector) =>
+        assert(features ~== expected absTol 1e-14)
+    }
+
+    // maxDF: ignore terms with freq > 0.75
+    val cvModel2 = new CountVectorizer()
+      .setInputCol("words")
+      .setOutputCol("features")
+      .setMaxDF(0.75)
+      .fit(df)
+    assert(cvModel2.vocabulary === Array("b", "c", "d"))
+
+    cvModel2.transform(df).select("features", "expected").collect().foreach {
+      case Row(features: Vector, expected: Vector) =>
+        assert(features ~== expected absTol 1e-14)
+    }
--- End diff --

I will.
[GitHub] spark pull request #20366: [SPARK-23166] [ML] Add maxDF Parameter to CountVe...
Github user ymazari commented on a diff in the pull request: https://github.com/apache/spark/pull/20366#discussion_r163355088

--- Diff: mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala ---
@@ -169,7 +197,7 @@ class CountVectorizer @Since("1.5.0") (@Since("1.5.0") override val uid: String)
     }.reduceByKey { case ((wc1, df1), (wc2, df2)) =>
       (wc1 + wc2, df1 + df2)
     }.filter { case (word, (wc, df)) =>
-      df >= minDf
+      (df >= minDf) && (df <= maxDf)
--- End diff --

Right. I just added them for clarity.
[GitHub] spark issue #20367: [SPARK-23166][ML] Add maxDF Parameter to CountVectorizer
Github user ymazari commented on the issue: https://github.com/apache/spark/pull/20367

@hhbyyh, @MLnick, @jkbradley could you please help review this? Thanks.
[GitHub] spark pull request #20367: [SPARK-23166][ML] Add maxDF Parameter to CountVec...
GitHub user ymazari opened a pull request: https://github.com/apache/spark/pull/20367

[SPARK-23166][ML] Add maxDF Parameter to CountVectorizer

## What changes were proposed in this pull request?

Currently, the CountVectorizer has a minDF parameter. It might be useful to also have a maxDF parameter. It will be used as a threshold for filtering out all the terms that occur very frequently in a text corpus, because they are not very informative or could even be stop-words. This is analogous to scikit-learn's CountVectorizer max_df.

## How was this patch tested?

Unit tests.

You can merge this pull request into a Git repository by running:

    $ git pull https://github.com/ymazari/spark SPARK-23166

Alternatively you can review and apply these changes as the patch at:
https://github.com/apache/spark/pull/20367.patch

To close this pull request, make a commit to your master/trunk branch with (at least) the following in the commit message:

    This closes #20367

commit f9eb543ea2c9f9c1d4c96db112a4e14a9c1beb11
Author: Yacine Mazari
Date: 2018-01-23T19:11:52Z

    [SPARK-23166][ML] Add maxDF Parameter to CountVectorizer
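A minimal end-to-end usage sketch of the proposed parameter is shown below. The corpus mirrors the one used in the PR's unit test; the local SparkSession boilerplate and the object name are illustrative and not part of the PR.

```scala
import org.apache.spark.ml.feature.CountVectorizer
import org.apache.spark.sql.SparkSession

object MaxDFExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("MaxDFExample").getOrCreate()
    import spark.implicits._

    // "a" appears in all four documents, so a fractional maxDF of 0.75 filters it out.
    val docs = Seq(
      (0, Seq("a", "b", "c", "d")),
      (1, Seq("a", "b", "c")),
      (2, Seq("a", "b")),
      (3, Seq("a"))
    ).toDF("id", "words")

    val model = new CountVectorizer()
      .setInputCol("words")
      .setOutputCol("features")
      .setMaxDF(0.75)  // ignore terms that appear in more than 75% of the documents
      .fit(docs)

    println(model.vocabulary.mkString(", "))  // expected: b, c, d
    model.transform(docs).show(truncate = false)
    spark.stop()
  }
}
```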
[GitHub] spark pull request #20366: [SPARK-23166] [ML] Add maxDF Parameter to CountVe...
Github user ymazari closed the pull request at: https://github.com/apache/spark/pull/20366
[GitHub] spark pull request #20366: [SPARK-23166] [ML] Add maxDF Parameter to CountVe...
GitHub user ymazari opened a pull request: https://github.com/apache/spark/pull/20366

[SPARK-23166] [ML] Add maxDF Parameter to CountVectorizer

## What changes were proposed in this pull request?

(Please fill in changes proposed in this fix)

## How was this patch tested?

(Please explain how this patch was tested. E.g. unit tests, integration tests, manual tests)
(If this patch involves UI changes, please attach a screenshot; otherwise, remove this)

Please review http://spark.apache.org/contributing.html before opening a pull request.

You can merge this pull request into a Git repository by running:

    $ git pull https://github.com/ymazari/spark SPARK-23166

Alternatively you can review and apply these changes as the patch at:
https://github.com/apache/spark/pull/20366.patch

To close this pull request, make a commit to your master/trunk branch with (at least) the following in the commit message:

    This closes #20366

commit 568ea65ff32dce179097098b5d2934df20cac17c
Author: Yacine Mazari
Date: 2018-01-22T13:22:58Z

    [SPARK-23166] [ML] Add maxDF Parameter to CountVectorizer