Repository: spark Updated Branches: refs/heads/branch-1.6 93c9a63ea -> 58dfba66e
[SPARK-14665][ML][PYTHON] Fixed bug with StopWordsRemover default stopwords. The default stopwords were a Java object; they are now converted to a Python list. Tested with a unit test which failed before the fix. Author: Joseph K. Bradley <[email protected]> Closes #12422 from jkbradley/pyspark-stopwords. (cherry picked from commit d6ae7d4637d23c57c4eeab79d1177216f380ec9c) Signed-off-by: Joseph K. Bradley <[email protected]> Conflicts: python/pyspark/ml/feature.py python/pyspark/ml/tests.py Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/58dfba66 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/58dfba66 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/58dfba66 Branch: refs/heads/branch-1.6 Commit: 58dfba66ee9b87cdbce6bc2b01324025c7514669 Parents: 93c9a63 Author: Joseph K. Bradley <[email protected]> Authored: Fri Apr 15 11:50:21 2016 -0700 Committer: Joseph K. Bradley <[email protected]> Committed: Fri Apr 15 11:58:08 2016 -0700 ---------------------------------------------------------------------- python/pyspark/ml/feature.py | 2 +- python/pyspark/ml/tests.py | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/spark/blob/58dfba66/python/pyspark/ml/feature.py ---------------------------------------------------------------------- diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index ea874dc..bc63f6d 100644 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -1402,7 +1402,7 @@ class StopWordsRemover(JavaTransformer, HasInputCol, HasOutputCol): self.caseSensitive = Param(self, "caseSensitive", "whether to do a case " + "sensitive comparison over the stop words") stopWordsObj = _jvm().org.apache.spark.ml.feature.StopWords - defaultStopWords = stopWordsObj.English() + defaultStopWords = list(stopWordsObj.English()) self._setDefault(stopWords=defaultStopWords) 
kwargs = self.__init__._input_kwargs self.setParams(**kwargs) http://git-wip-us.apache.org/repos/asf/spark/blob/58dfba66/python/pyspark/ml/tests.py ---------------------------------------------------------------------- diff --git a/python/pyspark/ml/tests.py b/python/pyspark/ml/tests.py index 674dbe9..b2bd569 100644 --- a/python/pyspark/ml/tests.py +++ b/python/pyspark/ml/tests.py @@ -20,6 +20,9 @@ Unit tests for Spark ML Python APIs. """ import sys +if sys.version > '3': + basestring = str + try: import xmlrunner except ImportError: @@ -283,6 +286,8 @@ class FeatureTests(PySparkTestCase): self.assertEqual(stopWordRemover.getInputCol(), "input") transformedDF = stopWordRemover.transform(dataset) self.assertEqual(transformedDF.head().output, ["panda"]) + self.assertEqual(type(stopWordRemover.getStopWords()), list) + self.assertTrue(isinstance(stopWordRemover.getStopWords()[0], basestring)) # Custom stopwords = ["panda"] stopWordRemover.setStopWords(stopwords) --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
