Github user BryanCutler commented on a diff in the pull request:
https://github.com/apache/spark/pull/20777#discussion_r173335895
--- Diff: python/pyspark/ml/feature.py ---
@@ -408,35 +408,86 @@ class CountVectorizer(JavaEstimator, HasInputCol, HasOutputCol, JavaMLReadable,
"""
Extracts a vocabulary from document collections and generates a
:py:attr:`CountVectorizerModel`.
- >>> df = spark.createDataFrame(
+ >>> df1 = spark.createDataFrame(
... [(0, ["a", "b", "c"]), (1, ["a", "b", "b", "c", "a"])],
... ["label", "raw"])
- >>> cv = CountVectorizer(inputCol="raw", outputCol="vectors")
- >>> model = cv.fit(df)
- >>> model.transform(df).show(truncate=False)
+ >>> cv1 = CountVectorizer(inputCol="raw", outputCol="vectors")
+ >>> model1 = cv1.fit(df1)
+ >>> model1.transform(df1).show(truncate=False)
+-----+---------------+-------------------------+
|label|raw |vectors |
+-----+---------------+-------------------------+
|0 |[a, b, c] |(3,[0,1,2],[1.0,1.0,1.0])|
|1 |[a, b, b, c, a]|(3,[0,1,2],[2.0,2.0,1.0])|
+-----+---------------+-------------------------+
...
- >>> sorted(model.vocabulary) == ['a', 'b', 'c']
+ >>> sorted(model1.vocabulary) == ['a', 'b', 'c']
True
>>> countVectorizerPath = temp_path + "/count-vectorizer"
- >>> cv.save(countVectorizerPath)
+ >>> cv1.save(countVectorizerPath)
>>> loadedCv = CountVectorizer.load(countVectorizerPath)
- >>> loadedCv.getMinDF() == cv.getMinDF()
+ >>> loadedCv.getMinDF() == cv1.getMinDF()
True
- >>> loadedCv.getMinTF() == cv.getMinTF()
+ >>> loadedCv.getMinTF() == cv1.getMinTF()
True
- >>> loadedCv.getVocabSize() == cv.getVocabSize()
+ >>> loadedCv.getVocabSize() == cv1.getVocabSize()
True
>>> modelPath = temp_path + "/count-vectorizer-model"
- >>> model.save(modelPath)
+ >>> model1.save(modelPath)
>>> loadedModel = CountVectorizerModel.load(modelPath)
- >>> loadedModel.vocabulary == model.vocabulary
+ >>> loadedModel.vocabulary == model1.vocabulary
True
+ >>> df2 = spark.createDataFrame(
+ ... [(0, ["a", "b", "c", "d"]), (1, ["a", "b", "c",]),(2, ["a", "b"]),(3, ["a"]),],
+ ... ["label", "raw"])
+ >>> cv2 = CountVectorizer(inputCol="raw", outputCol="vectors", maxDF=3)
+ >>> model2 = cv2.fit(df2)
+ >>> model2.transform(df2).show(truncate=False)
+ +-----+------------+-------------------------+
+ |label|raw |vectors |
+ +-----+------------+-------------------------+
+ |0 |[a, b, c, d]|(3,[0,1,2],[1.0,1.0,1.0])|
+ |1 |[a, b, c] |(3,[0,1],[1.0,1.0]) |
+ |2 |[a, b] |(3,[0],[1.0]) |
+ |3 |[a] |(3,[],[]) |
+ +-----+------------+-------------------------+
+ ...
+ >>> cv3 = CountVectorizer(inputCol="raw", outputCol="vectors", maxDF=0.75)
+ >>> model3 = cv3.fit(df2)
+ >>> model3.transform(df2).show(truncate=False)
+ +-----+------------+-------------------------+
+ |label|raw |vectors |
+ +-----+------------+-------------------------+
+ |0 |[a, b, c, d]|(3,[0,1,2],[1.0,1.0,1.0])|
+ |1 |[a, b, c] |(3,[0,1],[1.0,1.0]) |
+ |2 |[a, b] |(3,[0],[1.0]) |
+ |3 |[a] |(3,[],[]) |
+ +-----+------------+-------------------------+
+ ...
+ >>> cv4 = CountVectorizer(inputCol="raw", outputCol="vectors", minDF=2, maxDF=3)
+ >>> model4 = cv4.fit(df2)
+ >>> model4.transform(df2).show(truncate=False)
+ +-----+------------+-------------------+
+ |label|raw |vectors |
+ +-----+------------+-------------------+
+ |0 |[a, b, c, d]|(2,[0,1],[1.0,1.0])|
+ |1 |[a, b, c] |(2,[0,1],[1.0,1.0])|
+ |2 |[a, b] |(2,[0],[1.0]) |
+ |3 |[a] |(2,[],[]) |
+ +-----+------------+-------------------+
+ ...
+ >>> cv5 = CountVectorizer(inputCol="raw", outputCol="vectors", minDF=0.5, maxDF=0.75)
+ >>> model5 = cv5.fit(df2)
+ >>> model5.transform(df2).show(truncate=False)
+ +-----+------------+-------------------+
+ |label|raw |vectors |
+ +-----+------------+-------------------+
+ |0 |[a, b, c, d]|(2,[0,1],[1.0,1.0])|
+ |1 |[a, b, c] |(2,[0,1],[1.0,1.0])|
+ |2 |[a, b] |(2,[0],[1.0]) |
+ |3 |[a] |(2,[],[]) |
+ +-----+------------+-------------------+
+ ...
--- End diff --
I think this is too much to put as a doctest. Instead, can you just add a
unit test in ml/tests.py? I think you just need two transforms: one with an
integer value of `maxDF` > 1 and one with a fractional value. Also, I don't
think your test data actually exercises the `maxDF` filtering.
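For reference, such a unit test might look roughly like the sketch below. This is only
illustrative: it assumes the `SparkSessionTestCase` base class used elsewhere in
ml/tests.py, a hypothetical test name, and that `maxDF` is an inclusive upper bound on
document frequency (an integer count, or a fraction of the number of documents), as on
the Scala side of this PR.

```python
from pyspark.ml.feature import CountVectorizer


class CountVectorizerTests(SparkSessionTestCase):

    def test_count_vectorizer_with_maxDF(self):
        # Document frequencies: "a" -> 4, "b" -> 3, "c" -> 2, "d" -> 1
        dataset = self.spark.createDataFrame([
            (0, ["a", "b", "c", "d"]),
            (1, ["a", "b", "c"]),
            (2, ["a", "b"]),
            (3, ["a"])], ["id", "words"])

        # Integer maxDF: keep terms appearing in at most 3 documents,
        # so "a" (df=4) should be dropped from the vocabulary.
        cv_int = CountVectorizer(inputCol="words", outputCol="features", maxDF=3)
        self.assertEqual(sorted(cv_int.fit(dataset).vocabulary), ["b", "c", "d"])

        # Fractional maxDF: 0.75 * 4 documents = 3, so the result is the same.
        cv_frac = CountVectorizer(inputCol="words", outputCol="features", maxDF=0.75)
        self.assertEqual(sorted(cv_frac.fit(dataset).vocabulary), ["b", "c", "d"])
```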
---