spark git commit: [SPARK-11069][ML] Add RegexTokenizer option to convert to lowercase

2015-11-09 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/branch-1.6 08253874a -> 34e824d90


[SPARK-11069][ML] Add RegexTokenizer option to convert to lowercase

JIRA: https://issues.apache.org/jira/browse/SPARK-11069
Quoting the JIRA:
Tokenizer converts strings to lowercase automatically, but RegexTokenizer does
not. It would be nice to add an option to RegexTokenizer to convert to
lowercase. Proposal:
- call the Boolean Param "toLowercase"
- set default to false (so behavior does not change)

Note: sklearn also converts to lowercase before tokenizing. (As merged, the
default is true, matching Tokenizer; see the setDefault change in the diff below.)

Author: Yuhao Yang 

Closes #9092 from hhbyyh/tokenLower.

(cherry picked from commit 61f9c8711c79f35d67b0456155866da316b131d9)
Signed-off-by: Joseph K. Bradley 
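
For context, here is a minimal usage sketch of the new option (not part of
the commit; "df" is a hypothetical DataFrame with a string column "rawText"):

    import org.apache.spark.ml.feature.RegexTokenizer

    val tokenizer = new RegexTokenizer()
      .setInputCol("rawText")
      .setOutputCol("tokens")
      .setPattern("\\W+")      // split on runs of non-word characters
      .setToLowercase(true)    // the new Param; true is the default

    val tokenized = tokenizer.transform(df)
    // "Test for tokenization." is tokenized to Seq("test", "for", "tokenization")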


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/34e824d9
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/34e824d9
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/34e824d9

Branch: refs/heads/branch-1.6
Commit: 34e824d906b90783013021029e0e483ff20c78d5
Parents: 0825387
Author: Yuhao Yang 
Authored: Mon Nov 9 16:55:23 2015 -0800
Committer: Joseph K. Bradley 
Committed: Mon Nov 9 16:57:19 2015 -0800

--
 .../org/apache/spark/ml/feature/Tokenizer.scala | 19 +++--
 .../spark/ml/feature/JavaTokenizerSuite.java    |  1 +
 .../spark/ml/feature/TokenizerSuite.scala       | 22 +++-
 3 files changed, 35 insertions(+), 7 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/34e824d9/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala
index 248288c..1b82b40 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala
@@ -100,10 +100,25 @@ class RegexTokenizer(override val uid: String)
   /** @group getParam */
   def getPattern: String = $(pattern)
 
-  setDefault(minTokenLength -> 1, gaps -> true, pattern -> "\\s+")
+  /**
+   * Indicates whether to convert all characters to lowercase before tokenizing.
+   * Default: true
+   * @group param
+   */
+  final val toLowercase: BooleanParam = new BooleanParam(this, "toLowercase",
+    "whether to convert all characters to lowercase before tokenizing.")
+
+  /** @group setParam */
+  def setToLowercase(value: Boolean): this.type = set(toLowercase, value)
+
+  /** @group getParam */
+  def getToLowercase: Boolean = $(toLowercase)
+
+  setDefault(minTokenLength -> 1, gaps -> true, pattern -> "\\s+", toLowercase -> true)
 
-  override protected def createTransformFunc: String => Seq[String] = { str =>
+  override protected def createTransformFunc: String => Seq[String] = { originStr =>
     val re = $(pattern).r
+    val str = if ($(toLowercase)) originStr.toLowerCase() else originStr
     val tokens = if ($(gaps)) re.split(str).toSeq else re.findAllIn(str).toSeq
     val minLength = $(minTokenLength)
     tokens.filter(_.length >= minLength)
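
To see what the patched createTransformFunc computes, the same logic can be
written as a standalone function (a sketch with the Params inlined as plain
arguments; the function and argument names are illustrative only):

    def tokenize(
        input: String,
        pattern: String = "\\s+",
        gaps: Boolean = true,
        minTokenLength: Int = 1,
        toLowercase: Boolean = true): Seq[String] = {
      val re = pattern.r
      // Lowercasing happens before matching, so the pattern sees lowercase input.
      val str = if (toLowercase) input.toLowerCase() else input
      // gaps = true treats the pattern as a separator between tokens;
      // gaps = false treats each match of the pattern as a token.
      val tokens = if (gaps) re.split(str).toSeq else re.findAllIn(str).toSeq
      tokens.filter(_.length >= minTokenLength)
    }

    tokenize("Te,st. punct", pattern = "\\w+|\\p{Punct}", gaps = false)
    // => Seq("te", ",", "st", ".", "punct")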

http://git-wip-us.apache.org/repos/asf/spark/blob/34e824d9/mllib/src/test/java/org/apache/spark/ml/feature/JavaTokenizerSuite.java
--
diff --git a/mllib/src/test/java/org/apache/spark/ml/feature/JavaTokenizerSuite.java b/mllib/src/test/java/org/apache/spark/ml/feature/JavaTokenizerSuite.java
index 02309ce..c407d98 100644
--- a/mllib/src/test/java/org/apache/spark/ml/feature/JavaTokenizerSuite.java
+++ b/mllib/src/test/java/org/apache/spark/ml/feature/JavaTokenizerSuite.java
@@ -53,6 +53,7 @@ public class JavaTokenizerSuite {
       .setOutputCol("tokens")
       .setPattern("\\s")
       .setGaps(true)
+      .setToLowercase(false)
       .setMinTokenLength(3);
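
(The explicit setToLowercase(false) pins the pre-patch behavior for this
test; without it, the new default of true would lowercase the input before
matching.)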
 
 

http://git-wip-us.apache.org/repos/asf/spark/blob/34e824d9/mllib/src/test/scala/org/apache/spark/ml/feature/TokenizerSuite.scala
--
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/TokenizerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/TokenizerSuite.scala
index e5fd21c..a02992a 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/feature/TokenizerSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/TokenizerSuite.scala
@@ -48,13 +48,13 @@ class RegexTokenizerSuite extends SparkFunSuite with MLlibTestSparkContext {
       .setInputCol("rawText")
       .setOutputCol("tokens")
     val dataset0 = sqlContext.createDataFrame(Seq(
-      TokenizerTestData("Test for tokenization.", Array("Test", "for", "tokenization", ".")),
-      TokenizerTestData("Te,st. punct", Array("Te", ",", "st", ".", "punct"))
+      TokenizerTestData("Test for tokenization.", Array("test", "for", "tokenization", ".")),
+      TokenizerTestData("Te,st. punct", Array("te", ",", "st", ".", "punct"))

spark git commit: [SPARK-11069][ML] Add RegexTokenizer option to convert to lowercase

2015-11-09 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/master 7dc9d8dba -> 61f9c8711


[SPARK-11069][ML] Add RegexTokenizer option to convert to lowercase

JIRA: https://issues.apache.org/jira/browse/SPARK-11069
Quoting the JIRA:
Tokenizer converts strings to lowercase automatically, but RegexTokenizer does
not. It would be nice to add an option to RegexTokenizer to convert to
lowercase. Proposal:
- call the Boolean Param "toLowercase"
- set default to false (so behavior does not change)

Note: sklearn also converts to lowercase before tokenizing. (As merged, the
default is true, matching Tokenizer; see the setDefault change in the diff below.)

Author: Yuhao Yang 

Closes #9092 from hhbyyh/tokenLower.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/61f9c871
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/61f9c871
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/61f9c871

Branch: refs/heads/master
Commit: 61f9c8711c79f35d67b0456155866da316b131d9
Parents: 7dc9d8d
Author: Yuhao Yang 
Authored: Mon Nov 9 16:55:23 2015 -0800
Committer: Joseph K. Bradley 
Committed: Mon Nov 9 16:55:23 2015 -0800

--
 .../org/apache/spark/ml/feature/Tokenizer.scala | 19 +++--
 .../spark/ml/feature/JavaTokenizerSuite.java    |  1 +
 .../spark/ml/feature/TokenizerSuite.scala       | 22 +++-
 3 files changed, 35 insertions(+), 7 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/61f9c871/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala
index 248288c..1b82b40 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Tokenizer.scala
@@ -100,10 +100,25 @@ class RegexTokenizer(override val uid: String)
   /** @group getParam */
   def getPattern: String = $(pattern)
 
-  setDefault(minTokenLength -> 1, gaps -> true, pattern -> "\\s+")
+  /**
+   * Indicates whether to convert all characters to lowercase before tokenizing.
+   * Default: true
+   * @group param
+   */
+  final val toLowercase: BooleanParam = new BooleanParam(this, "toLowercase",
+    "whether to convert all characters to lowercase before tokenizing.")
+
+  /** @group setParam */
+  def setToLowercase(value: Boolean): this.type = set(toLowercase, value)
+
+  /** @group getParam */
+  def getToLowercase: Boolean = $(toLowercase)
+
+  setDefault(minTokenLength -> 1, gaps -> true, pattern -> "\\s+", toLowercase -> true)
 
-  override protected def createTransformFunc: String => Seq[String] = { str =>
+  override protected def createTransformFunc: String => Seq[String] = { originStr =>
     val re = $(pattern).r
+    val str = if ($(toLowercase)) originStr.toLowerCase() else originStr
     val tokens = if ($(gaps)) re.split(str).toSeq else re.findAllIn(str).toSeq
     val minLength = $(minTokenLength)
     tokens.filter(_.length >= minLength)

http://git-wip-us.apache.org/repos/asf/spark/blob/61f9c871/mllib/src/test/java/org/apache/spark/ml/feature/JavaTokenizerSuite.java
--
diff --git a/mllib/src/test/java/org/apache/spark/ml/feature/JavaTokenizerSuite.java b/mllib/src/test/java/org/apache/spark/ml/feature/JavaTokenizerSuite.java
index 02309ce..c407d98 100644
--- a/mllib/src/test/java/org/apache/spark/ml/feature/JavaTokenizerSuite.java
+++ b/mllib/src/test/java/org/apache/spark/ml/feature/JavaTokenizerSuite.java
@@ -53,6 +53,7 @@ public class JavaTokenizerSuite {
       .setOutputCol("tokens")
       .setPattern("\\s")
       .setGaps(true)
+      .setToLowercase(false)
       .setMinTokenLength(3);
 
 

http://git-wip-us.apache.org/repos/asf/spark/blob/61f9c871/mllib/src/test/scala/org/apache/spark/ml/feature/TokenizerSuite.scala
--
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/TokenizerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/TokenizerSuite.scala
index e5fd21c..a02992a 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/feature/TokenizerSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/TokenizerSuite.scala
@@ -48,13 +48,13 @@ class RegexTokenizerSuite extends SparkFunSuite with MLlibTestSparkContext {
       .setInputCol("rawText")
       .setOutputCol("tokens")
     val dataset0 = sqlContext.createDataFrame(Seq(
-      TokenizerTestData("Test for tokenization.", Array("Test", "for", "tokenization", ".")),
-      TokenizerTestData("Te,st. punct", Array("Te", ",", "st", ".", "punct"))
+      TokenizerTestData("Test for tokenization.", Array("test", "for", "tokenization", ".")),
+      TokenizerTestData("Te,st. punct", Array("te", ",", "st", ".", "punct"))