Repository: incubator-predictionio-template-text-classifier
Updated Branches:
  refs/heads/master 7bff41178 -> b24325a39
Changed tokenizer to use Apache Lucene StandardAnalyzer for non-western languages

Project: http://git-wip-us.apache.org/repos/asf/incubator-predictionio-template-text-classifier/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-predictionio-template-text-classifier/commit/2bcbdae6
Tree: http://git-wip-us.apache.org/repos/asf/incubator-predictionio-template-text-classifier/tree/2bcbdae6
Diff: http://git-wip-us.apache.org/repos/asf/incubator-predictionio-template-text-classifier/diff/2bcbdae6

Branch: refs/heads/master
Commit: 2bcbdae63326873d996da9b4e1aa9afd952ecd67
Parents: 3d609f8
Author: Sebastiaan de Man <[email protected]>
Authored: Sun Oct 30 21:31:48 2016 +0100
Committer: Sebastiaan de Man <[email protected]>
Committed: Sun Oct 30 21:31:48 2016 +0100

----------------------------------------------------------------------
 src/main/scala/Preparator.scala | 27 ++++++++++++++++++++++++++++-
 1 file changed, 26 insertions(+), 1 deletion(-)
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/incubator-predictionio-template-text-classifier/blob/2bcbdae6/src/main/scala/Preparator.scala
----------------------------------------------------------------------
diff --git a/src/main/scala/Preparator.scala b/src/main/scala/Preparator.scala
index c990944..fd043d6 100644
--- a/src/main/scala/Preparator.scala
+++ b/src/main/scala/Preparator.scala
@@ -11,6 +11,14 @@ import org.apache.spark.mllib.linalg.Vectors
 import org.apache.spark.mllib.regression.LabeledPoint
 import org.apache.spark.rdd.RDD
 
+import org.apache.lucene.analysis.standard.StandardAnalyzer
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute
+import org.apache.lucene.util.Version
+
+import java.io.StringReader
+
+import scala.collection.mutable
+
 /** Define Preparator parameters. Recall that for our data
  * representation we are only required to input the n-gram window
  * components.
@@ -62,9 +70,26 @@ class TFHasher(
 
   private val hasher = new HashingTF(numFeatures = numFeatures)
 
+  /** Use Lucene's StandardAnalyzer to tokenize text. */
+  def tokenize(content: String): Seq[String] = {
+    val tReader = new StringReader(content)
+    val analyzer = new StandardAnalyzer(Version.LATEST)
+    val tStream = analyzer.tokenStream("contents", tReader)
+    val term = tStream.addAttribute(classOf[CharTermAttribute])
+    tStream.reset()
+
+    val result = mutable.ArrayBuffer.empty[String]
+    while (tStream.incrementToken()) {
+      result += term.toString
+    }
+    tStream.end()
+    tStream.close()
+    result
+  }
+
   /** Hashing function: Text -> term frequency vector. */
   def hashTF(text: String): Vector = {
-    val newList : Array[String] = text.split(" ")
+    val newList : Array[String] = tokenize(text)
     .sliding(nGram)
     .map(_.mkString)
     .toArray
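Note on the change: text.split(" ") only works for languages that delimit words with spaces, so a CJK sentence would come through as a single opaque token. StandardAnalyzer tokenizes by the Unicode UAX#29 word-break rules instead, lowercasing, dropping punctuation, and emitting one token per ideographic character, which gives hashTF's sliding n-gram window real units to work with. The standalone sketch below shows the behaviour; it assumes Lucene 4.10.x on the classpath (to match the StandardAnalyzer(Version.LATEST) constructor used in the diff), the TokenizeDemo object name is hypothetical, and the expected outputs in the comments are illustrative.

    import java.io.StringReader

    import org.apache.lucene.analysis.standard.StandardAnalyzer
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute
    import org.apache.lucene.util.Version

    import scala.collection.mutable

    object TokenizeDemo {

      /** Same tokenization as the Preparator's new tokenize method:
        * run the text through StandardAnalyzer and collect the terms. */
      def tokenize(content: String): Seq[String] = {
        val analyzer = new StandardAnalyzer(Version.LATEST)
        val tStream = analyzer.tokenStream("contents", new StringReader(content))
        val term = tStream.addAttribute(classOf[CharTermAttribute])
        val result = mutable.ArrayBuffer.empty[String]
        tStream.reset()
        while (tStream.incrementToken()) {
          result += term.toString
        }
        // Honor the TokenStream contract, then release the analyzer.
        tStream.end()
        tStream.close()
        analyzer.close()
        result
      }

      def main(args: Array[String]): Unit = {
        // Western text is split on whitespace/punctuation and lowercased.
        println(tokenize("Hello, World!"))
        // expected: ArrayBuffer(hello, world)

        // Chinese has no spaces, so text.split(" ") returned the whole
        // sentence as one token; the UAX#29 rules emit one token per
        // ideographic character instead.
        println(tokenize("我爱机器学习"))
        // expected: ArrayBuffer(我, 爱, 机, 器, 学, 习)
      }
    }

One design note: the committed method builds a fresh StandardAnalyzer on every call, which is simple but allocation-heavy. Lucene analyzers are designed to be reused, so hoisting the analyzer into a field of TFHasher and closing it once would be the usual pattern.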
