Repository: spark
Updated Branches:
  refs/heads/branch-1.6 93c9a63ea -> 58dfba66e


[SPARK-14665][ML][PYTHON] Fixed bug with StopWordsRemover default stopwords

Previously, the default stop words of PySpark's StopWordsRemover were a Java object. They are now converted to a Python list.

Added a unit test that failed before the fix.

Author: Joseph K. Bradley <[email protected]>

Closes #12422 from jkbradley/pyspark-stopwords.

(cherry picked from commit d6ae7d4637d23c57c4eeab79d1177216f380ec9c)
Signed-off-by: Joseph K. Bradley <[email protected]>

Conflicts:
        python/pyspark/ml/feature.py
        python/pyspark/ml/tests.py


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/58dfba66
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/58dfba66
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/58dfba66

Branch: refs/heads/branch-1.6
Commit: 58dfba66ee9b87cdbce6bc2b01324025c7514669
Parents: 93c9a63
Author: Joseph K. Bradley <[email protected]>
Authored: Fri Apr 15 11:50:21 2016 -0700
Committer: Joseph K. Bradley <[email protected]>
Committed: Fri Apr 15 11:58:08 2016 -0700

----------------------------------------------------------------------
 python/pyspark/ml/feature.py | 2 +-
 python/pyspark/ml/tests.py   | 5 +++++
 2 files changed, 6 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/58dfba66/python/pyspark/ml/feature.py
----------------------------------------------------------------------
diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py
index ea874dc..bc63f6d 100644
--- a/python/pyspark/ml/feature.py
+++ b/python/pyspark/ml/feature.py
@@ -1402,7 +1402,7 @@ class StopWordsRemover(JavaTransformer, HasInputCol, 
HasOutputCol):
         self.caseSensitive = Param(self, "caseSensitive", "whether to do a 
case " +
                                    "sensitive comparison over the stop words")
         stopWordsObj = _jvm().org.apache.spark.ml.feature.StopWords
-        defaultStopWords = stopWordsObj.English()
+        defaultStopWords = list(stopWordsObj.English())
         self._setDefault(stopWords=defaultStopWords)
         kwargs = self.__init__._input_kwargs
         self.setParams(**kwargs)

http://git-wip-us.apache.org/repos/asf/spark/blob/58dfba66/python/pyspark/ml/tests.py
----------------------------------------------------------------------
diff --git a/python/pyspark/ml/tests.py b/python/pyspark/ml/tests.py
index 674dbe9..b2bd569 100644
--- a/python/pyspark/ml/tests.py
+++ b/python/pyspark/ml/tests.py
@@ -20,6 +20,9 @@ Unit tests for Spark ML Python APIs.
 """
 
 import sys
+if sys.version > '3':
+    basestring = str
+
 try:
     import xmlrunner
 except ImportError:
@@ -283,6 +286,8 @@ class FeatureTests(PySparkTestCase):
         self.assertEqual(stopWordRemover.getInputCol(), "input")
         transformedDF = stopWordRemover.transform(dataset)
         self.assertEqual(transformedDF.head().output, ["panda"])
+        self.assertEqual(type(stopWordRemover.getStopWords()), list)
+        self.assertTrue(isinstance(stopWordRemover.getStopWords()[0], 
basestring))
         # Custom
         stopwords = ["panda"]
         stopWordRemover.setStopWords(stopwords)


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to