Merge branch 'STOP_WORDS' of https://github.com/nlauchande/incubator-predictionio-template-text-classifier
Project: http://git-wip-us.apache.org/repos/asf/incubator-predictionio-template-text-classifier/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-predictionio-template-text-classifier/commit/3d2baf55
Tree: http://git-wip-us.apache.org/repos/asf/incubator-predictionio-template-text-classifier/tree/3d2baf55
Diff: http://git-wip-us.apache.org/repos/asf/incubator-predictionio-template-text-classifier/diff/3d2baf55

Branch: refs/heads/master
Commit: 3d2baf5544491d7c0cc33cf83c6e32374e5d70d7
Parents: 5c45ef9 1a31614
Author: Donald Szeto <[email protected]>
Authored: Thu May 4 10:46:57 2017 -0700
Committer: Donald Szeto <[email protected]>
Committed: Thu May 4 10:46:57 2017 -0700

----------------------------------------------------------------------
 src/main/scala/Preparator.scala | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-predictionio-template-text-classifier/blob/3d2baf55/src/main/scala/Preparator.scala
----------------------------------------------------------------------
diff --cc src/main/scala/Preparator.scala
index 9441257,1681acc..8a5cb5c
--- a/src/main/scala/Preparator.scala
+++ b/src/main/scala/Preparator.scala
@@@ -70,28 -63,10 +71,29 @@@ class TFHasher
  
    private val hasher = new HashingTF(numFeatures = numFeatures)
  
 +  /** Use Lucene StandardAnalyzer to tokenize text **/
 +  def tokenize(content: String): Seq[String] = {
 +    val tReader = new StringReader(content)
 +    val analyzer = new StandardAnalyzer(Version.LATEST)
 +    val tStream = analyzer.tokenStream("contents", tReader)
 +    val term = tStream.addAttribute(classOf[CharTermAttribute])
 +    tStream.reset()
 +
 +    val result = mutable.ArrayBuffer.empty[String]
 +    while (tStream.incrementToken()) {
 +      val termValue = term.toString
-
++
 +      result += term.toString
-
++
 +    }
 +    result
 +  }
 +
    /** Hashing function: Text -> term frequency vector. */
    def hashTF(text: String): Vector = {
-     val newList : Array[String] = text.split(" ")
+     val newList : Array[String] = tokenize(text)
+       .filterNot(stopWords.contains(_))
      .sliding(nGram)
      .map(_.mkString)
      .toArray
@@@ -104,7 -79,7 +106,7 @@@ class TFIDFModel
    val hasher: TFHasher,
    val idf: IDFModel
  ) extends Serializable {
--
++
    /** trasform text to tf-idf vector. */
    def transform(text: String): Vector = {
      // Map(n-gram -> document tf)
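
For reference, a self-contained sketch of the merged TFHasher follows. The class shape and constructor parameters (numFeatures, nGram, stopWords) are assumptions, since the diff above shows only the changed hunks; the sketch also drops the unused termValue binding and adds the TokenStream end()/close() calls that Lucene expects but the merged code omits. It targets Lucene 4.x (StandardAnalyzer(Version.LATEST), as in the commit) and Spark MLlib's HashingTF.

import java.io.StringReader

import scala.collection.mutable

import org.apache.lucene.analysis.standard.StandardAnalyzer
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute
import org.apache.lucene.util.Version
import org.apache.spark.mllib.feature.HashingTF
import org.apache.spark.mllib.linalg.Vector

class TFHasher(
  numFeatures: Int,
  nGram: Int,
  stopWords: Set[String]
) extends Serializable {

  private val hasher = new HashingTF(numFeatures = numFeatures)

  /** Use Lucene StandardAnalyzer to tokenize text, releasing the
    * TokenStream and Analyzer when done. */
  def tokenize(content: String): Seq[String] = {
    val analyzer = new StandardAnalyzer(Version.LATEST)
    val tStream = analyzer.tokenStream("contents", new StringReader(content))
    val term = tStream.addAttribute(classOf[CharTermAttribute])
    val result = mutable.ArrayBuffer.empty[String]
    try {
      tStream.reset()
      while (tStream.incrementToken()) {
        // Copy the term out: CharTermAttribute is reused for each token.
        result += term.toString
      }
      tStream.end()
    } finally {
      tStream.close()
      analyzer.close()
    }
    result
  }

  /** Hashing function: text -> term frequency vector over word n-grams,
    * with stop words removed before the n-grams are formed. */
  def hashTF(text: String): Vector = {
    val newList: Array[String] = tokenize(text)
      .filterNot(stopWords.contains(_))
      .sliding(nGram)
      .map(_.mkString)
      .toArray
    hasher.transform(newList)
  }
}

With nGram = 2, for example, hashTF tokenizes and filters the input, then hashes the concatenated token bigrams (mkString joins each sliding window with no separator) into a numFeatures-dimensional term-frequency vector.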
