spark git commit: [SPARK-11069][ML] Add RegexTokenizer option to convert to lowercase
Repository: spark Updated Branches: refs/heads/branch-1.6 08253874a -> 34e824d90 [SPARK-11069][ML] Add RegexTokenizer option to convert to lowercase jira: https://issues.apache.org/jira/browse/SPARK-11069 quotes from jira: Tokenizer converts strings to lowercase automatically, but RegexTokenizer does not. It would be nice to add an option to RegexTokenizer to convert to lowercase. Proposal: call the Boolean Param "toLowercase" set default to false (so behavior does not change) Actually sklearn converts to lowercase before tokenizing too Author: Yuhao Yang Closes #9092 from hhbyyh/tokenLower. (cherry picked from commit 61f9c8711c79f35d67b0456155866da316b131d9) Signed-off-by: Joseph K. Bradley Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/34e824d9 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/34e824d9 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/34e824d9 Branch: refs/heads/branch-1.6 Commit: 34e824d906b90783013021029e0e483ff20c78d5 Parents: 0825387 Author: Yuhao Yang Authored: Mon Nov 9 16:55:23 2015 -0800 Committer: Joseph K. Bradley Committed: Mon Nov 9 16:57:19 2015 -0800 -- .../org/apache/spark/ml/feature/Tokenizer.scala | 19 +++-- .../spark/ml/feature/JavaTokenizerSuite.java| 1 + .../spark/ml/feature/TokenizerSuite.scala | 22 +++- 3 files changed, 35 insertions(+), 7 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/34e824d9/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala index 248288c..1b82b40 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala @@ -100,10 +100,25 @@ class RegexTokenizer(override val uid: String) /** @group getParam */ def getPattern: String = $(pattern) - setDefault(minTokenLength -> 1, gaps -> true, pattern -> "\\s+") + /** + * Indicates whether to convert all characters to lowercase before tokenizing. + * Default: true + * @group param + */ + final val toLowercase: BooleanParam = new BooleanParam(this, "toLowercase", +"whether to convert all characters to lowercase before tokenizing.") + + /** @group setParam */ + def setToLowercase(value: Boolean): this.type = set(toLowercase, value) + + /** @group getParam */ + def getToLowercase: Boolean = $(toLowercase) + + setDefault(minTokenLength -> 1, gaps -> true, pattern -> "\\s+", toLowercase -> true) - override protected def createTransformFunc: String => Seq[String] = { str => + override protected def createTransformFunc: String => Seq[String] = { originStr => val re = $(pattern).r +val str = if ($(toLowercase)) originStr.toLowerCase() else originStr val tokens = if ($(gaps)) re.split(str).toSeq else re.findAllIn(str).toSeq val minLength = $(minTokenLength) tokens.filter(_.length >= minLength) http://git-wip-us.apache.org/repos/asf/spark/blob/34e824d9/mllib/src/test/java/org/apache/spark/ml/feature/JavaTokenizerSuite.java -- diff --git a/mllib/src/test/java/org/apache/spark/ml/feature/JavaTokenizerSuite.java b/mllib/src/test/java/org/apache/spark/ml/feature/JavaTokenizerSuite.java index 02309ce..c407d98 100644 --- a/mllib/src/test/java/org/apache/spark/ml/feature/JavaTokenizerSuite.java +++ b/mllib/src/test/java/org/apache/spark/ml/feature/JavaTokenizerSuite.java @@ -53,6 +53,7 @@ public class JavaTokenizerSuite { .setOutputCol("tokens") .setPattern("\\s") .setGaps(true) + .setToLowercase(false) .setMinTokenLength(3); http://git-wip-us.apache.org/repos/asf/spark/blob/34e824d9/mllib/src/test/scala/org/apache/spark/ml/feature/TokenizerSuite.scala -- diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/TokenizerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/TokenizerSuite.scala index e5fd21c..a02992a 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/TokenizerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/TokenizerSuite.scala @@ -48,13 +48,13 @@ class RegexTokenizerSuite extends SparkFunSuite with MLlibTestSparkContext { .setInputCol("rawText") .setOutputCol("tokens") val dataset0 = sqlContext.createDataFrame(Seq( - TokenizerTestData("Test for tokenization.", Array("Test", "for", "tokenization", ".")), - TokenizerTestData("Te,st. punct", Array("Te", ",", "st", ".", "punct")) + TokenizerTe
spark git commit: [SPARK-11069][ML] Add RegexTokenizer option to convert to lowercase
Repository: spark Updated Branches: refs/heads/master 7dc9d8dba -> 61f9c8711 [SPARK-11069][ML] Add RegexTokenizer option to convert to lowercase jira: https://issues.apache.org/jira/browse/SPARK-11069 quotes from jira: Tokenizer converts strings to lowercase automatically, but RegexTokenizer does not. It would be nice to add an option to RegexTokenizer to convert to lowercase. Proposal: call the Boolean Param "toLowercase" set default to false (so behavior does not change) Actually sklearn converts to lowercase before tokenizing too Author: Yuhao Yang Closes #9092 from hhbyyh/tokenLower. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/61f9c871 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/61f9c871 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/61f9c871 Branch: refs/heads/master Commit: 61f9c8711c79f35d67b0456155866da316b131d9 Parents: 7dc9d8d Author: Yuhao Yang Authored: Mon Nov 9 16:55:23 2015 -0800 Committer: Joseph K. Bradley Committed: Mon Nov 9 16:55:23 2015 -0800 -- .../org/apache/spark/ml/feature/Tokenizer.scala | 19 +++-- .../spark/ml/feature/JavaTokenizerSuite.java| 1 + .../spark/ml/feature/TokenizerSuite.scala | 22 +++- 3 files changed, 35 insertions(+), 7 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/61f9c871/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala index 248288c..1b82b40 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala @@ -100,10 +100,25 @@ class RegexTokenizer(override val uid: String) /** @group getParam */ def getPattern: String = $(pattern) - setDefault(minTokenLength -> 1, gaps -> true, pattern -> "\\s+") + /** + * Indicates whether to convert all characters to lowercase before tokenizing. + * Default: true + * @group param + */ + final val toLowercase: BooleanParam = new BooleanParam(this, "toLowercase", +"whether to convert all characters to lowercase before tokenizing.") + + /** @group setParam */ + def setToLowercase(value: Boolean): this.type = set(toLowercase, value) + + /** @group getParam */ + def getToLowercase: Boolean = $(toLowercase) + + setDefault(minTokenLength -> 1, gaps -> true, pattern -> "\\s+", toLowercase -> true) - override protected def createTransformFunc: String => Seq[String] = { str => + override protected def createTransformFunc: String => Seq[String] = { originStr => val re = $(pattern).r +val str = if ($(toLowercase)) originStr.toLowerCase() else originStr val tokens = if ($(gaps)) re.split(str).toSeq else re.findAllIn(str).toSeq val minLength = $(minTokenLength) tokens.filter(_.length >= minLength) http://git-wip-us.apache.org/repos/asf/spark/blob/61f9c871/mllib/src/test/java/org/apache/spark/ml/feature/JavaTokenizerSuite.java -- diff --git a/mllib/src/test/java/org/apache/spark/ml/feature/JavaTokenizerSuite.java b/mllib/src/test/java/org/apache/spark/ml/feature/JavaTokenizerSuite.java index 02309ce..c407d98 100644 --- a/mllib/src/test/java/org/apache/spark/ml/feature/JavaTokenizerSuite.java +++ b/mllib/src/test/java/org/apache/spark/ml/feature/JavaTokenizerSuite.java @@ -53,6 +53,7 @@ public class JavaTokenizerSuite { .setOutputCol("tokens") .setPattern("\\s") .setGaps(true) + .setToLowercase(false) .setMinTokenLength(3); http://git-wip-us.apache.org/repos/asf/spark/blob/61f9c871/mllib/src/test/scala/org/apache/spark/ml/feature/TokenizerSuite.scala -- diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/TokenizerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/TokenizerSuite.scala index e5fd21c..a02992a 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/TokenizerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/TokenizerSuite.scala @@ -48,13 +48,13 @@ class RegexTokenizerSuite extends SparkFunSuite with MLlibTestSparkContext { .setInputCol("rawText") .setOutputCol("tokens") val dataset0 = sqlContext.createDataFrame(Seq( - TokenizerTestData("Test for tokenization.", Array("Test", "for", "tokenization", ".")), - TokenizerTestData("Te,st. punct", Array("Te", ",", "st", ".", "punct")) + TokenizerTestData("Test for tokenization.", Array("test", "for", "tokenization", ".")), + TokenizerTestData("Te,st. p