Repository: spark
Updated Branches:
  refs/heads/master 95c03cbd2 -> a33655348


[SPARK-23615][ML][PYSPARK] Add maxDF Parameter to Python CountVectorizer

## What changes were proposed in this pull request?

The maxDF parameter filters out terms that appear in too many documents. It
was recently added to the Scala CountVectorizer; this change adds it to the
Python CountVectorizer as well.

## How was this patch tested?

Added a unit test, test_count_vectorizer_with_maxDF, to python/pyspark/ml/tests.py.

Author: Huaxin Gao <[email protected]>

Closes #20777 from huaxingao/spark-23615.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a3365534
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a3365534
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a3365534

Branch: refs/heads/master
Commit: a33655348c4066d9c1d8ad2055aadfbc892ba7fd
Parents: 95c03cb
Author: Huaxin Gao <[email protected]>
Authored: Fri Mar 23 15:58:48 2018 -0700
Committer: Bryan Cutler <[email protected]>
Committed: Fri Mar 23 15:58:48 2018 -0700

----------------------------------------------------------------------
 .../spark/ml/feature/CountVectorizer.scala      | 20 +++++-----
 python/pyspark/ml/feature.py                    | 40 +++++++++++++++-----
 python/pyspark/ml/tests.py                      | 25 ++++++++++++
 3 files changed, 67 insertions(+), 18 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/a3365534/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala
----------------------------------------------------------------------
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala 
b/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala
index 60a4f91..9e0ed43 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala
@@ -70,19 +70,21 @@ private[feature] trait CountVectorizerParams extends Params 
with HasInputCol wit
   def getMinDF: Double = $(minDF)
 
   /**
-   * Specifies the maximum number of different documents a term must appear in 
to be included
-   * in the vocabulary.
-   * If this is an integer greater than or equal to 1, this specifies the 
number of documents
-   * the term must appear in; if this is a double in [0,1), then this 
specifies the fraction of
-   * documents.
+   * Specifies the maximum number of different documents a term could appear 
in to be included
+   * in the vocabulary. A term that appears more than the threshold will be 
ignored. If this is an
+   * integer greater than or equal to 1, this specifies the maximum number of 
documents the term
+   * could appear in; if this is a double in [0,1), then this specifies the 
maximum fraction of
+   * documents the term could appear in.
    *
-   * Default: (2^64^) - 1
+   * Default: (2^63^) - 1
    * @group param
    */
   val maxDF: DoubleParam = new DoubleParam(this, "maxDF", "Specifies the 
maximum number of" +
-    " different documents a term must appear in to be included in the 
vocabulary." +
-    " If this is an integer >= 1, this specifies the number of documents the 
term must" +
-    " appear in; if this is a double in [0,1), then this specifies the 
fraction of documents.",
+    " different documents a term could appear in to be included in the 
vocabulary." +
+    " A term that appears more than the threshold will be ignored. If this is 
an integer >= 1," +
+    " this specifies the maximum number of documents the term could appear 
in;" +
+    " if this is a double in [0,1), then this specifies the maximum fraction 
of" +
+    " documents the term could appear in.",
     ParamValidators.gtEq(0.0))
 
   /** @group getParam */

http://git-wip-us.apache.org/repos/asf/spark/blob/a3365534/python/pyspark/ml/feature.py
----------------------------------------------------------------------
diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py
index a1ceb7f..fcb0dfc 100755
--- a/python/pyspark/ml/feature.py
+++ b/python/pyspark/ml/feature.py
@@ -422,6 +422,14 @@ class _CountVectorizerParams(JavaParams, HasInputCol, 
HasOutputCol):
         " If this is an integer >= 1, this specifies the number of documents 
the term must" +
         " appear in; if this is a double in [0,1), then this specifies the 
fraction of documents." +
         " Default 1.0", typeConverter=TypeConverters.toFloat)
+    maxDF = Param(
+        Params._dummy(), "maxDF", "Specifies the maximum number of" +
+        " different documents a term could appear in to be included in the 
vocabulary." +
+        " A term that appears more than the threshold will be ignored. If this 
is an" +
+        " integer >= 1, this specifies the maximum number of documents the 
term could appear in;" +
+        " if this is a double in [0,1), then this specifies the maximum" +
+        " fraction of documents the term could appear in." +
+        " Default (2^63) - 1", typeConverter=TypeConverters.toFloat)
     vocabSize = Param(
         Params._dummy(), "vocabSize", "max size of the vocabulary. Default 1 
<< 18.",
         typeConverter=TypeConverters.toInt)
@@ -433,7 +441,7 @@ class _CountVectorizerParams(JavaParams, HasInputCol, 
HasOutputCol):
 
     def __init__(self, *args):
         super(_CountVectorizerParams, self).__init__(*args)
-        self._setDefault(minTF=1.0, minDF=1.0, vocabSize=1 << 18, binary=False)
+        self._setDefault(minTF=1.0, minDF=1.0, maxDF=2 ** 63 - 1, vocabSize=1 
<< 18, binary=False)
 
     @since("1.6.0")
     def getMinTF(self):
@@ -449,6 +457,13 @@ class _CountVectorizerParams(JavaParams, HasInputCol, 
HasOutputCol):
         """
         return self.getOrDefault(self.minDF)
 
+    @since("2.4.0")
+    def getMaxDF(self):
+        """
+        Gets the value of maxDF or its default value.
+        """
+        return self.getOrDefault(self.maxDF)
+
     @since("1.6.0")
     def getVocabSize(self):
         """
@@ -513,11 +528,11 @@ class CountVectorizer(JavaEstimator, 
_CountVectorizerParams, JavaMLReadable, Jav
     """
 
     @keyword_only
-    def __init__(self, minTF=1.0, minDF=1.0, vocabSize=1 << 18, binary=False, 
inputCol=None,
-                 outputCol=None):
+    def __init__(self, minTF=1.0, minDF=1.0, maxDF=2 ** 63 - 1, vocabSize=1 << 
18, binary=False,
+                 inputCol=None, outputCol=None):
         """
-        __init__(self, minTF=1.0, minDF=1.0, vocabSize=1 << 18, binary=False, 
inputCol=None,\
-                 outputCol=None)
+        __init__(self, minTF=1.0, minDF=1.0, maxDF=2 ** 63 - 1, vocabSize=1 << 
18, binary=False,\
+                 inputCol=None,outputCol=None)
         """
         super(CountVectorizer, self).__init__()
         self._java_obj = 
self._new_java_obj("org.apache.spark.ml.feature.CountVectorizer",
@@ -527,11 +542,11 @@ class CountVectorizer(JavaEstimator, 
_CountVectorizerParams, JavaMLReadable, Jav
 
     @keyword_only
     @since("1.6.0")
-    def setParams(self, minTF=1.0, minDF=1.0, vocabSize=1 << 18, binary=False, 
inputCol=None,
-                  outputCol=None):
+    def setParams(self, minTF=1.0, minDF=1.0, maxDF=2 ** 63 - 1, vocabSize=1 
<< 18, binary=False,
+                  inputCol=None, outputCol=None):
         """
-        setParams(self, minTF=1.0, minDF=1.0, vocabSize=1 << 18, binary=False, 
inputCol=None,\
-                  outputCol=None)
+        setParams(self, minTF=1.0, minDF=1.0, maxDF=2 ** 63 - 1, vocabSize=1 
<< 18, binary=False,\
+                  inputCol=None, outputCol=None)
         Set the params for the CountVectorizer
         """
         kwargs = self._input_kwargs
@@ -551,6 +566,13 @@ class CountVectorizer(JavaEstimator, 
_CountVectorizerParams, JavaMLReadable, Jav
         """
         return self._set(minDF=value)
 
+    @since("2.4.0")
+    def setMaxDF(self, value):
+        """
+        Sets the value of :py:attr:`maxDF`.
+        """
+        return self._set(maxDF=value)
+
     @since("1.6.0")
     def setVocabSize(self, value):
         """

http://git-wip-us.apache.org/repos/asf/spark/blob/a3365534/python/pyspark/ml/tests.py
----------------------------------------------------------------------
diff --git a/python/pyspark/ml/tests.py b/python/pyspark/ml/tests.py
index 0801199..cf1ffa1 100755
--- a/python/pyspark/ml/tests.py
+++ b/python/pyspark/ml/tests.py
@@ -697,6 +697,31 @@ class FeatureTests(SparkSessionTestCase):
             feature, expected = r
             self.assertEqual(feature, expected)
 
+    def test_count_vectorizer_with_maxDF(self):
+        dataset = self.spark.createDataFrame([
+            (0, "a b c d".split(' '), SparseVector(3, {0: 1.0, 1: 1.0, 2: 
1.0}),),
+            (1, "a b c".split(' '), SparseVector(3, {0: 1.0, 1: 1.0}),),
+            (2, "a b".split(' '), SparseVector(3, {0: 1.0}),),
+            (3, "a".split(' '), SparseVector(3,  {}),)], ["id", "words", 
"expected"])
+        cv = CountVectorizer(inputCol="words", outputCol="features")
+        model1 = cv.setMaxDF(3).fit(dataset)
+        self.assertEqual(model1.vocabulary, ['b', 'c', 'd'])
+
+        transformedList1 = model1.transform(dataset).select("features", 
"expected").collect()
+
+        for r in transformedList1:
+            feature, expected = r
+            self.assertEqual(feature, expected)
+
+        model2 = cv.setMaxDF(0.75).fit(dataset)
+        self.assertEqual(model2.vocabulary, ['b', 'c', 'd'])
+
+        transformedList2 = model2.transform(dataset).select("features", 
"expected").collect()
+
+        for r in transformedList2:
+            feature, expected = r
+            self.assertEqual(feature, expected)
+
     def test_count_vectorizer_from_vocab(self):
         model = CountVectorizerModel.from_vocabulary(["a", "b", "c"], 
inputCol="words",
                                                      outputCol="features", 
minTF=2)


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to