Repository: spark Updated Branches: refs/heads/master d2a9b66f6 -> 876566501
[SPARK-8169] [ML] Add StopWordsRemover as a transformer jira: https://issues.apache.org/jira/browse/SPARK-8169 stop words: http://en.wikipedia.org/wiki/Stop_words StopWordsRemover takes a string array column and outputs a string array column with all defined stop words removed. The transformer should also come with a standard set of stop words as default. Currently I use a minimal stop words set, since in some [cases](http://nlp.stanford.edu/IR-book/html/htmledition/dropping-common-terms-stop-words-1.html) a small set of stop words is preferred. ASCII chars have been tested, yet I cannot check it in due to the style check. Further thoughts: 1. Maybe I should use OpenHashSet. Is it recommended? 2. Currently I leave nulls in the input array untouched, i.e. Array(null, null) => Array(null, null). 3. If the current stop words set looks too limited, are there any suggestions for a replacement? We could use something similar to the one in [scikit-learn](https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/feature_extraction/stop_words.py). 
Author: Yuhao Yang <hhb...@gmail.com> Closes #6742 from hhbyyh/stopwords and squashes the following commits: fa959d8 [Yuhao Yang] separating udf f190217 [Yuhao Yang] replace default list and other small fix 04403ab [Yuhao Yang] Merge remote-tracking branch 'upstream/master' into stopwords b3aa957 [Yuhao Yang] add stopWordsRemover Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/87656650 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/87656650 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/87656650 Branch: refs/heads/master Commit: 8765665015ef47a23e00f7d01d4d280c31bb236d Parents: d2a9b66 Author: Yuhao Yang <hhb...@gmail.com> Authored: Sat Aug 1 02:31:28 2015 -0700 Committer: Xiangrui Meng <m...@databricks.com> Committed: Sat Aug 1 02:31:28 2015 -0700 ---------------------------------------------------------------------- .../spark/ml/feature/StopWordsRemover.scala | 155 +++++++++++++++++++ .../ml/feature/StopWordsRemoverSuite.scala | 80 ++++++++++ 2 files changed, 235 insertions(+) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/spark/blob/87656650/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala ---------------------------------------------------------------------- diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala new file mode 100644 index 0000000..3cc4142 --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala @@ -0,0 +1,155 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ml.feature + +import org.apache.spark.annotation.Experimental +import org.apache.spark.ml.Transformer +import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol} +import org.apache.spark.ml.param.{ParamMap, BooleanParam, Param} +import org.apache.spark.ml.util.Identifiable +import org.apache.spark.sql.DataFrame +import org.apache.spark.sql.types.{StringType, StructField, ArrayType, StructType} +import org.apache.spark.sql.functions.{col, udf} + +/** + * Built-in stop words list; `EnglishStopWords` is the default value of + * `StopWordsRemover.stopWords`. + */ +private object StopWords { + + /** + * Use the same default stopwords list as scikit-learn. 
+ * The original list can be found at the "Glasgow Information Retrieval Group" + * [[http://ir.dcs.gla.ac.uk/resources/linguistic_utils/stop_words]] + * NOTE(review): entries such as `amoungst` and `fify` look misspelled; presumably they are + * inherited from the upstream list -- confirm before correcting them. + */ + val EnglishStopWords = Array( "a", "about", "above", "across", "after", "afterwards", "again", + "against", "all", "almost", "alone", "along", "already", "also", "although", "always", + "am", "among", "amongst", "amoungst", "amount", "an", "and", "another", + "any", "anyhow", "anyone", "anything", "anyway", "anywhere", "are", + "around", "as", "at", "back", "be", "became", "because", "become", + "becomes", "becoming", "been", "before", "beforehand", "behind", "being", + "below", "beside", "besides", "between", "beyond", "bill", "both", + "bottom", "but", "by", "call", "can", "cannot", "cant", "co", "con", + "could", "couldnt", "cry", "de", "describe", "detail", "do", "done", + "down", "due", "during", "each", "eg", "eight", "either", "eleven", "else", + "elsewhere", "empty", "enough", "etc", "even", "ever", "every", "everyone", + "everything", "everywhere", "except", "few", "fifteen", "fify", "fill", + "find", "fire", "first", "five", "for", "former", "formerly", "forty", + "found", "four", "from", "front", "full", "further", "get", "give", "go", + "had", "has", "hasnt", "have", "he", "hence", "her", "here", "hereafter", + "hereby", "herein", "hereupon", "hers", "herself", "him", "himself", "his", + "how", "however", "hundred", "i", "ie", "if", "in", "inc", "indeed", + "interest", "into", "is", "it", "its", "itself", "keep", "last", "latter", + "latterly", "least", "less", "ltd", "made", "many", "may", "me", + "meanwhile", "might", "mill", "mine", "more", "moreover", "most", "mostly", + "move", "much", "must", "my", "myself", "name", "namely", "neither", + "never", "nevertheless", "next", "nine", "no", "nobody", "none", "noone", + "nor", "not", "nothing", "now", "nowhere", "of", "off", "often", "on", + "once", "one", "only", "onto", "or", "other", "others", "otherwise", "our", + "ours", "ourselves", "out", 
"over", "own", "part", "per", "perhaps", + "please", "put", "rather", "re", "same", "see", "seem", "seemed", + "seeming", "seems", "serious", "several", "she", "should", "show", "side", + "since", "sincere", "six", "sixty", "so", "some", "somehow", "someone", + "something", "sometime", "sometimes", "somewhere", "still", "such", + "system", "take", "ten", "than", "that", "the", "their", "them", + "themselves", "then", "thence", "there", "thereafter", "thereby", + "therefore", "therein", "thereupon", "these", "they", "thick", "thin", + "third", "this", "those", "though", "three", "through", "throughout", + "thru", "thus", "to", "together", "too", "top", "toward", "towards", + "twelve", "twenty", "two", "un", "under", "until", "up", "upon", "us", + "very", "via", "was", "we", "well", "were", "what", "whatever", "when", + "whence", "whenever", "where", "whereafter", "whereas", "whereby", + "wherein", "whereupon", "wherever", "whether", "which", "while", "whither", + "who", "whoever", "whole", "whom", "whose", "why", "will", "with", + "within", "without", "would", "yet", "you", "your", "yours", "yourself", "yourselves") +} + +/** + * :: Experimental :: + * A feature transformer that filters out stop words from input. + * Note: null values from input array are preserved unless adding null to stopWords explicitly. 
+ * @see [[http://en.wikipedia.org/wiki/Stop_words]] + */ +@Experimental +class StopWordsRemover(override val uid: String) + extends Transformer with HasInputCol with HasOutputCol { + + def this() = this(Identifiable.randomUID("stopWords")) + + /** @group setParam */ + def setInputCol(value: String): this.type = set(inputCol, value) + + /** @group setParam */ + def setOutputCol(value: String): this.type = set(outputCol, value) + + /** + * the stop words set to be filtered out + * Default: `StopWords.EnglishStopWords` + * @group param + */ + val stopWords: Param[Array[String]] = new Param(this, "stopWords", "stop words") + + /** @group setParam */ + def setStopWords(value: Array[String]): this.type = set(stopWords, value) + + /** @group getParam */ + def getStopWords: Array[String] = $(stopWords) + + /** + * whether to do a case sensitive comparison over the stop words + * Default: false + * @group param + */ + val caseSensitive: BooleanParam = new BooleanParam(this, "caseSensitive", + "whether to do case-sensitive comparison during filtering") + + /** @group setParam */ + def setCaseSensitive(value: Boolean): this.type = set(caseSensitive, value) + + /** @group getParam */ + def getCaseSensitive: Boolean = $(caseSensitive) + + setDefault(stopWords -> StopWords.EnglishStopWords, caseSensitive -> false) + + /** + * Appends the output column: the input array with every term found in `stopWords` filtered out. + * When `caseSensitive` is false, stop words and terms are both lower-cased before comparison. + * A null term is dropped only if null is itself among the stop words. + * NOTE(review): `toLowerCase` is called without an explicit Locale -- presumably it follows the + * JVM default locale; confirm that is intended. The `terms` array itself is not null-checked. + */ + override def transform(dataset: DataFrame): DataFrame = { + val outputSchema = transformSchema(dataset.schema) + val t = if ($(caseSensitive)) { + val stopWordsSet = $(stopWords).toSet + udf { terms: Seq[String] => + terms.filter(s => !stopWordsSet.contains(s)) + } + } else { + val toLower = (s: String) => if (s != null) s.toLowerCase else s + val lowerStopWords = $(stopWords).map(toLower(_)).toSet + udf { terms: Seq[String] => + terms.filter(s => !lowerStopWords.contains(toLower(s))) + } + } + + val metadata = outputSchema($(outputCol)).metadata + dataset.select(col("*"), t(col($(inputCol))).as($(outputCol), metadata)) + } + + /** + * Requires the input column to be ArrayType(StringType) and appends the output column, + * which reuses the input column's data type and nullability. + */ + override def transformSchema(schema: StructType): StructType = { + val inputType = 
schema($(inputCol)).dataType + require(inputType.sameType(ArrayType(StringType)), + s"Input type must be ArrayType(StringType) but got $inputType.") + val outputFields = schema.fields :+ + StructField($(outputCol), inputType, schema($(inputCol)).nullable) + StructType(outputFields) + } + + override def copy(extra: ParamMap): StopWordsRemover = defaultCopy(extra) +} http://git-wip-us.apache.org/repos/asf/spark/blob/87656650/mllib/src/test/scala/org/apache/spark/ml/feature/StopWordsRemoverSuite.scala ---------------------------------------------------------------------- diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/StopWordsRemoverSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/StopWordsRemoverSuite.scala new file mode 100644 index 0000000..f01306f --- /dev/null +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/StopWordsRemoverSuite.scala @@ -0,0 +1,80 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.ml.feature + +import org.apache.spark.SparkFunSuite +import org.apache.spark.mllib.util.MLlibTestSparkContext +import org.apache.spark.sql.{DataFrame, Row} + +object StopWordsRemoverSuite extends SparkFunSuite { + /** + * Transforms `dataset` with `t` and asserts that the `filtered` column equals the `expected` + * column in every collected row. + */ + def testStopWordsRemover(t: StopWordsRemover, dataset: DataFrame): Unit = { + t.transform(dataset) + .select("filtered", "expected") + .collect() + .foreach { case Row(tokens, wantedTokens) => + assert(tokens === wantedTokens) + } + } +} + +/** + * Covers the default parameters, case-sensitive matching and a user-extended stop words list. + */ +class StopWordsRemoverSuite extends SparkFunSuite with MLlibTestSparkContext { + import StopWordsRemoverSuite._ + + test("StopWordsRemover default") { + val remover = new StopWordsRemover() + .setInputCol("raw") + .setOutputCol("filtered") + val dataSet = sqlContext.createDataFrame(Seq( + (Seq("test", "test"), Seq("test", "test")), + (Seq("a", "b", "c", "d"), Seq("b", "c", "d")), + (Seq("a", "the", "an"), Seq()), + (Seq("A", "The", "AN"), Seq()), + (Seq(null), Seq(null)), + (Seq(), Seq()) + )).toDF("raw", "expected") + + testStopWordsRemover(remover, dataSet) + } + + test("StopWordsRemover case sensitive") { + val remover = new StopWordsRemover() + .setInputCol("raw") + .setOutputCol("filtered") + .setCaseSensitive(true) + val dataSet = sqlContext.createDataFrame(Seq( + (Seq("A"), Seq("A")), + (Seq("The", "the"), Seq("The")) + )).toDF("raw", "expected") + + testStopWordsRemover(remover, dataSet) + } + + test("StopWordsRemover with additional words") { + val stopWords = StopWords.EnglishStopWords ++ Array("python", "scala") + val remover = new StopWordsRemover() + .setInputCol("raw") + .setOutputCol("filtered") + .setStopWords(stopWords) + val dataSet = sqlContext.createDataFrame(Seq( + (Seq("python", "scala", "a"), Seq()), + (Seq("Python", "Scala", "swift"), Seq("swift")) + )).toDF("raw", "expected") + + testStopWordsRemover(remover, dataSet) + } +} --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For 
additional commands, e-mail: commits-h...@spark.apache.org