Filter out stop words from vectorization. As per the discussion in https://github.com/apache/incubator-predictionio-template-text-classifier/pull/8, we implement a filter for stop words: the stop words are passed to the constructor of TFHasher and removed from the text during vectorization of words.
Project: http://git-wip-us.apache.org/repos/asf/incubator-predictionio-template-text-classifier/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-predictionio-template-text-classifier/commit/1a316143 Tree: http://git-wip-us.apache.org/repos/asf/incubator-predictionio-template-text-classifier/tree/1a316143 Diff: http://git-wip-us.apache.org/repos/asf/incubator-predictionio-template-text-classifier/diff/1a316143 Branch: refs/heads/master Commit: 1a316143f169bc7804604d0914b380381dfb9fa1 Parents: 7bff411 Author: Natu Lauchande <[email protected]> Authored: Mon Dec 5 17:36:04 2016 +0200 Committer: Natu Lauchande <[email protected]> Committed: Tue Dec 6 04:04:47 2016 +0200 ---------------------------------------------------------------------- src/main/scala/Preparator.scala | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-predictionio-template-text-classifier/blob/1a316143/src/main/scala/Preparator.scala ---------------------------------------------------------------------- diff --git a/src/main/scala/Preparator.scala b/src/main/scala/Preparator.scala index c8b35d0..1681acc 100644 --- a/src/main/scala/Preparator.scala +++ b/src/main/scala/Preparator.scala @@ -26,7 +26,7 @@ class Preparator(pp: PreparatorParams) def prepare(sc: SparkContext, td: TrainingData): PreparedData = { - val tfHasher = new TFHasher(pp.numFeatures, pp.nGram) + val tfHasher = new TFHasher(pp.numFeatures, pp.nGram, td.stopWords) // Convert trainingdata's observation text into TF vector // and then fit a IDF model @@ -57,7 +57,8 @@ class Preparator(pp: PreparatorParams) class TFHasher( val numFeatures: Int, - val nGram: Int + val nGram: Int, + val stopWords:Set[String] ) extends Serializable { private val hasher = new HashingTF(numFeatures = numFeatures) @@ -65,6 +66,7 @@ class TFHasher( /** Hashing function: Text -> term frequency vector. 
*/ def hashTF(text: String): Vector = { val newList : Array[String] = text.split(" ") + .filterNot(stopWords.contains(_)) .sliding(nGram) .map(_.mkString) .toArray
