Author: sujen
Date: Thu Feb 11 05:32:19 2016
New Revision: 1729763

URL: http://svn.apache.org/viewvc?rev=1729763&view=rev
Log:
NUTCH-2209 Improved Tokenization for Similarity Scoring plugin, this closes #87

Added:
    nutch/trunk/src/plugin/scoring-similarity/build-ivy.xml
    nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/CosineSimilarity.java
    nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/DocVector.java
    nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/Model.java
    nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/package-info.java
    nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/
    nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/LuceneAnalyzerUtil.java
    nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/LuceneTokenizer.java
    nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/package-info.java
Removed:
    nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/CosineSimilarity.java
    nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/DocumentVector.java
    nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/ScoringFilterModel.java
    nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/CosineSimilarityModel.java
    nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/DocumentVector.java
Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/conf/nutch-default.xml
    nutch/trunk/ivy/ivy.xml
    nutch/trunk/src/plugin/scoring-similarity/build.xml
    nutch/trunk/src/plugin/scoring-similarity/plugin.xml
    nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/SimilarityScoringFilter.java

Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1729763&r1=1729762&r2=1729763&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Thu Feb 11 05:32:19 2016
@@ -1,5 +1,7 @@
 Nutch Change Log
 
+* NUTCH-2209 Improved Tokenization for Similarity Scoring plugin (Sujen)
+
 * NUTCH-2211 Added filterchecker and normalizerchecker to bin/nutch script (markus)
 
 * NUTCH-2197 Add Solr 5 cloud indexer support (Jurian Broertjes via markus)

Modified: nutch/trunk/conf/nutch-default.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1729763&r1=1729762&r2=1729763&view=diff
==============================================================================
--- nutch/trunk/conf/nutch-default.xml (original)
+++ nutch/trunk/conf/nutch-default.xml Thu Feb 11 05:32:19 2016
@@ -1390,7 +1390,16 @@ For more detailed information on the wor
 visit https://wiki.apache.org/nutch/SimilarityScoringFilter-->
 
 <property>
-    <name>scoring.similarity.model.path</name>
+    <name>scoring.similarity.model</name>
+    <value>cosine</value>
+    <description>The type of similarity metric to use. Eg - cosine (which is, currently, the only available model).
+      Please make sure to set the model specific properties for the scoring to function properly. 
+      Description of these properties can be found on the wiki.
+    </description>
+</property>
+
+<property>
+    <name>cosine.goldstandard.file</name>
     <value>goldstandard.txt</value>
     <description>Path to the gold standard file which contains all the relevant text and terms, 
       pertaining to the domain.

Modified: nutch/trunk/ivy/ivy.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/ivy/ivy.xml?rev=1729763&r1=1729762&r2=1729763&view=diff
==============================================================================
--- nutch/trunk/ivy/ivy.xml (original)
+++ nutch/trunk/ivy/ivy.xml Thu Feb 11 05:32:19 2016
@@ -87,7 +87,8 @@
         <dependency org="com.fasterxml.jackson.core" name="jackson-databind" rev="2.5.1"  conf="*->default"/> 
         <dependency org="com.fasterxml.jackson.dataformat" name="jackson-dataformat-cbor" rev="2.5.1" conf="*->default"/>
         <dependency org="com.fasterxml.jackson.jaxrs" name="jackson-jaxrs-json-provider" rev="2.5.1" conf="*->default"/>       
-              
+        
+        <dependency org="org.apache.lucene" name="lucene-analyzers-common" rev="4.10.2" conf="*->default"></dependency>
                <!-- WARC artifacts needed  -->
                <dependency org="org.netpreserve.commons" name="webarchive-commons" rev="1.1.5" conf="*->default">
                        <exclude module="hadoop-core"/>

Added: nutch/trunk/src/plugin/scoring-similarity/build-ivy.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/scoring-similarity/build-ivy.xml?rev=1729763&view=auto
==============================================================================
--- nutch/trunk/src/plugin/scoring-similarity/build-ivy.xml (added)
+++ nutch/trunk/src/plugin/scoring-similarity/build-ivy.xml Thu Feb 11 05:32:19 2016
@@ -0,0 +1,54 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="scoring-similarity" default="deps-jar" xmlns:ivy="antlib:org.apache.ivy.ant">
+
+    <property name="ivy.install.version" value="2.1.0" />
+    <condition property="ivy.home" value="${env.IVY_HOME}">
+      <isset property="env.IVY_HOME" />
+    </condition>
+    <property name="ivy.home" value="${user.home}/.ant" />
+    <property name="ivy.checksums" value="" />
+    <property name="ivy.jar.dir" value="${ivy.home}/lib" />
+    <property name="ivy.jar.file" value="${ivy.jar.dir}/ivy.jar" />
+
+    <target name="download-ivy" unless="offline">
+
+        <mkdir dir="${ivy.jar.dir}"/>
+        <!-- download Ivy from web site so that it can be used even without any special installation -->
+        <get src="http://repo2.maven.org/maven2/org/apache/ivy/ivy/${ivy.install.version}/ivy-${ivy.install.version}.jar"
+             dest="${ivy.jar.file}" usetimestamp="true"/>
+    </target>
+
+    <target name="init-ivy" depends="download-ivy">
+      <!-- try to load ivy here from ivy home, in case the user has not already dropped
+              it into ant's lib dir (note that the latter copy will always take precedence).
+              We will not fail as long as local lib dir exists (it may be empty) and
+              ivy is in at least one of ant's lib dir or the local lib dir. -->
+        <path id="ivy.lib.path">
+            <fileset dir="${ivy.jar.dir}" includes="*.jar"/>
+
+        </path>
+        <taskdef resource="org/apache/ivy/ant/antlib.xml"
+                 uri="antlib:org.apache.ivy.ant" classpathref="ivy.lib.path"/>
+    </target>
+
+  <target name="deps-jar" depends="init-ivy">
+    <ivy:retrieve pattern="lib/[artifact]-[revision].[ext]"/>
+  </target>
+
+</project>

Modified: nutch/trunk/src/plugin/scoring-similarity/build.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/scoring-similarity/build.xml?rev=1729763&r1=1729762&r2=1729763&view=diff
==============================================================================
--- nutch/trunk/src/plugin/scoring-similarity/build.xml (original)
+++ nutch/trunk/src/plugin/scoring-similarity/build.xml Thu Feb 11 05:32:19 2016
@@ -18,7 +18,15 @@
 <project name="scoring-similarity" default="jar-core">
 
   <import file="../build-plugin.xml"/>
-
+         <target name="deps-jar">
+           <ant target="jar" inheritall="false" dir="../indexer-elastic" />
+         </target>
+         <!-- Add compilation dependencies to classpath -->
+         <path id="plugin.deps">
+           <fileset dir="${nutch.root}/build">
+             <include name="**/indexer-elastic/*.jar" />
+           </fileset>
+         </path>
   <!-- Deploy Unit test dependencies -->
   <target name="deps-test">
     <ant target="deploy" inheritall="false" dir="../nutch-extensionpoints"/>

Modified: nutch/trunk/src/plugin/scoring-similarity/plugin.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/scoring-similarity/plugin.xml?rev=1729763&r1=1729762&r2=1729763&view=diff
==============================================================================
--- nutch/trunk/src/plugin/scoring-similarity/plugin.xml (original)
+++ nutch/trunk/src/plugin/scoring-similarity/plugin.xml Thu Feb 11 05:32:19 2016
@@ -26,8 +26,13 @@
       <library name="scoring-similarity.jar">
          <export name="*"/>
       </library>
+      <library name="lucene-core-4.10.2.jar"/>
    </runtime>
 
+   <requires>
+      <import plugin="nutch-extensionpoints"/>
+   </requires>
+   
    <extension id="org.apache.nutch.scoring.similarity"
               name="SimilarityScoring"
               point="org.apache.nutch.scoring.ScoringFilter">

Modified: nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/SimilarityScoringFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/SimilarityScoringFilter.java?rev=1729763&r1=1729762&r2=1729763&view=diff
==============================================================================
--- nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/SimilarityScoringFilter.java (original)
+++ nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/SimilarityScoringFilter.java Thu Feb 11 05:32:19 2016
@@ -17,6 +17,7 @@
 package org.apache.nutch.scoring.similarity;
 
 import java.util.Collection;
+import java.util.List;
 import java.util.Map.Entry;
 
 import org.apache.hadoop.conf.Configuration;
@@ -28,13 +29,12 @@ import org.apache.nutch.parse.ParseData;
 import org.apache.nutch.protocol.Content;
 import org.apache.nutch.scoring.AbstractScoringFilter;
 import org.apache.nutch.scoring.ScoringFilterException;
-import org.apache.nutch.scoring.similarity.cosine.CosineSimilarityModel;
+import org.apache.nutch.scoring.similarity.cosine.CosineSimilarity;
 
 public class SimilarityScoringFilter extends AbstractScoringFilter {
 
   private Configuration conf;
   private SimilarityModel similarityModel;
-
   @Override
   public Configuration getConf() {
     return conf;
@@ -43,7 +43,11 @@ public class SimilarityScoringFilter ext
   @Override
   public void setConf(Configuration conf) {
     this.conf = conf;
-    similarityModel = new CosineSimilarityModel();
+    switch(conf.get("scoring.similarity.model","cosine")){
+    case "cosine":
+      similarityModel = (SimilarityModel) new CosineSimilarity();
+      break;
+    }
     similarityModel.setConf(conf);
   }
 

Added: nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/CosineSimilarity.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/CosineSimilarity.java?rev=1729763&view=auto
==============================================================================
--- nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/CosineSimilarity.java (added)
+++ nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/CosineSimilarity.java Thu Feb 11 05:32:19 2016
@@ -0,0 +1,81 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.scoring.similarity.cosine;
+
+import java.io.IOException;
+import java.util.Collection;
+import java.util.Map.Entry;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.util.StringUtils;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.scoring.similarity.SimilarityModel;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public class CosineSimilarity implements SimilarityModel{
+
+  private Configuration conf; 
+  private final static Logger LOG = LoggerFactory
+      .getLogger(CosineSimilarity.class);
+
+  @Override
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+  }
+
+  @Override
+  public float setURLScoreAfterParsing(Text url, Content content, Parse parse) {
+    float score = 1;
+
+    try {
+      if(!Model.isModelCreated){
+        Model.createModel(conf);
+      }
+      String metatags = parse.getData().getParseMeta().get("metatag.keyword");
+      String metaDescription = parse.getData().getParseMeta().get("metatag.description");
+      DocVector docVector = Model.createDocVector(parse.getText()+metaDescription+metatags);
+      if(docVector!=null){
+        score = Model.computeCosineSimilarity(docVector);
+        LOG.info("Setting score of {} to {}",url, score);
+      }
+      else {
+        throw new Exception("Could not create DocVector from parsed text");
+      }
+    } catch (Exception e) {
+      LOG.error("Error creating Cosine Model, setting scores of urls to 1 : {}", StringUtils.stringifyException(e));
+    }
+    return score;
+  }
+
+  @Override
+  public CrawlDatum distributeScoreToOutlinks(Text fromUrl, ParseData parseData,
+      Collection<Entry<Text, CrawlDatum>> targets, CrawlDatum adjust,
+      int allCount) {
+    float score = Float.parseFloat(parseData.getContentMeta().get(Nutch.SCORE_KEY));
+    for (Entry<Text, CrawlDatum> target : targets) {
+      target.getValue().setScore(score);
+    }
+    return adjust;
+  }
+
+}

Added: nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/DocVector.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/DocVector.java?rev=1729763&view=auto
==============================================================================
--- nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/DocVector.java (added)
+++ nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/DocVector.java Thu Feb 11 05:32:19 2016
@@ -0,0 +1,57 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.scoring.similarity.cosine;
+
+import java.util.HashMap;
+import java.util.Map;
+
+public class DocVector {
+
+  public HashMap<Integer, Long> termVector;
+  public HashMap<String, Integer> termFreqVector;
+
+  public DocVector() {
+    termFreqVector = new HashMap<>();
+  }
+
+  public void setTermFreqVector(HashMap<String, Integer> termFreqVector) {
+    this.termFreqVector = termFreqVector;
+  }
+  
+  public void setVectorEntry(int pos, long freq) {
+    termVector.put(pos, freq);
+  }
+  
+  public float dotProduct(DocVector docVector) {
+    float product = 0;
+    for(Map.Entry<String, Integer> entry : termFreqVector.entrySet()) {
+      if(docVector.termFreqVector.containsKey(entry.getKey())) {
+        product += docVector.termFreqVector.get(entry.getKey())*entry.getValue();
+      }
+    }
+    return product;
+  }
+  
+  public float getL2Norm() {
+    float sum = 0;
+    for(Map.Entry<String, Integer> entry : termFreqVector.entrySet()) {
+      sum += entry.getValue()*entry.getValue();
+    }
+    return (float) Math.sqrt(sum);
+  }
+
+}

Added: nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/Model.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/Model.java?rev=1729763&view=auto
==============================================================================
--- nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/Model.java (added)
+++ nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/Model.java Thu Feb 11 05:32:19 2016
@@ -0,0 +1,150 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.scoring.similarity.cosine;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.List;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.util.StringUtils;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.nutch.scoring.similarity.util.LuceneAnalyzerUtil.StemFilterType;
+import org.apache.nutch.scoring.similarity.util.LuceneTokenizer;
+import org.apache.nutch.scoring.similarity.util.LuceneTokenizer.TokenizerType;
+import org.apache.tika.Tika;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * This class creates a model used to store Document vector representation of the corpus. 
+ *
+ */
+public class Model {
+
+  //Currently only one file, but in future could accept a corpus hence an ArrayList
+  public static ArrayList<DocVector> docVectors = new ArrayList<>(); 
+  private static final Logger LOG = LoggerFactory.getLogger(Model.class);
+  public static boolean isModelCreated = false;
+  private static List<String> stopWords;
+
+  public static synchronized void createModel(Configuration conf) throws IOException {
+    if(isModelCreated) {
+      LOG.info("Model exists, skipping model creation");
+      return;
+    }
+    LOG.info("Creating Cosine model");
+    try {
+      //If user has specified a stopword file other than the template
+      if(!conf.get("scoring.similarity.stopword.file").equals("stopwords.txt.template")) {
+        stopWords = new ArrayList<String>();
+        String stopWord;
+        BufferedReader br = new BufferedReader(conf.getConfResourceAsReader((conf.get("scoring.similarity.stopword.file"))));
+        while ((stopWord = br.readLine()) != null) {
+          stopWords.add(stopWord);
+        }
+        LOG.info("Loaded custom stopwords from {}",conf.get("scoring.similarity.stopword.file"));
+      }
+      // TODO : Allow for corpus of documents to be provided as gold standard. 
+      String line;
+      StringBuilder sb = new StringBuilder();
+      BufferedReader br = new BufferedReader(conf.getConfResourceAsReader((conf.get("cosine.goldstandard.file"))));
+      while ((line = br.readLine()) != null) {
+        sb.append(line);
+      }
+      DocVector goldStandard = createDocVector(sb.toString());
+      if(goldStandard!=null)
+        docVectors.add(goldStandard);
+      else {
+        throw new Exception("Could not create DocVector for goldstandard");
+      }
+    } catch (Exception e) {
+      LOG.warn("Failed to add {} to model : {}",conf.get("cosine.goldstandard.file","goldstandard.txt.template"), 
+          StringUtils.stringifyException(e));
+    }
+    if(docVectors.size()>0) {
+      LOG.info("Cosine model creation complete");
+      isModelCreated = true;
+    }
+    else
+      LOG.info("Cosine model creation failed");
+  }
+
+  /**
+   * Used to create a DocVector from given String text. Used during the parse stage of the crawl 
+   * cycle to create a DocVector of the currently parsed page from the parseText attribute value
+   * @param content
+   */
+  public static DocVector createDocVector(String content) {
+    LuceneTokenizer tokenizer;
+    if(stopWords!=null) {
+      tokenizer = new LuceneTokenizer(content, TokenizerType.CLASSIC, stopWords, true, 
+          StemFilterType.PORTERSTEM_FILTER);
+    }
+    else {
+      tokenizer = new LuceneTokenizer(content, TokenizerType.CLASSIC, true, 
+          StemFilterType.PORTERSTEM_FILTER);
+    }
+    TokenStream tStream = tokenizer.getTokenStream();
+    HashMap<String, Integer> termVector = new HashMap<>();
+    try {
+      CharTermAttribute charTermAttribute = tStream.addAttribute(CharTermAttribute.class);
+      tStream.reset();
+      while(tStream.incrementToken()) {
+        String term = charTermAttribute.toString();
+        if(termVector.containsKey(term)) {
+          int count = termVector.get(term);
+          count++;
+          termVector.put(term, count);
+        }
+        else {
+          termVector.put(term, 1);
+        }
+      }
+      DocVector docVector = new DocVector();
+      docVector.setTermFreqVector(termVector);
+      return docVector;
+    } catch (IOException e) {
+      LOG.error("Error creating DocVector : {}",StringUtils.stringifyException(e));
+    }
+    return null;
+  }
+
+  public static float computeCosineSimilarity(DocVector docVector) {
+    float scores[] = new float[docVectors.size()];
+    int i=0;
+    float maxScore = 0;
+    for(DocVector corpusDoc : docVectors) {
+      float numerator = docVector.dotProduct(corpusDoc);
+      float denominator = docVector.getL2Norm()*corpusDoc.getL2Norm();
+      float currentScore = numerator/denominator;
+      scores[i++] = currentScore;
+      maxScore = (currentScore>maxScore)? currentScore : maxScore;
+    }
+    // Returning the max score amongst all documents in the corpus
+    return maxScore;
+  }
+}
\ No newline at end of file

Added: nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/package-info.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/package-info.java?rev=1729763&view=auto
==============================================================================
--- nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/package-info.java (added)
+++ nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/package-info.java Thu Feb 11 05:32:19 2016
@@ -0,0 +1,7 @@
+/**
+ * 
+ */
+/** Implements the cosine similarity metric for scoring relevant documents 
+ *
+ */
+package org.apache.nutch.scoring.similarity.cosine;
\ No newline at end of file

Added: nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/LuceneAnalyzerUtil.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/LuceneAnalyzerUtil.java?rev=1729763&view=auto
==============================================================================
--- nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/LuceneAnalyzerUtil.java (added)
+++ nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/LuceneAnalyzerUtil.java Thu Feb 11 05:32:19 2016
@@ -0,0 +1,93 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.scoring.similarity.util;
+
+import java.io.Reader;
+import java.util.List;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.LowerCaseFilter;
+import org.apache.lucene.analysis.core.StopFilter;
+import org.apache.lucene.analysis.en.EnglishMinimalStemFilter;
+import org.apache.lucene.analysis.en.PorterStemFilter;
+import org.apache.lucene.analysis.standard.ClassicTokenizer;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.analysis.util.CharArraySet;
+
+/**
+ * Creates a custom analyzer based on user provided inputs
+ *
+ */
+public class LuceneAnalyzerUtil extends Analyzer{ 
+  
+  public static enum StemFilterType { PORTERSTEM_FILTER, ENGLISHMINIMALSTEM_FILTER, NONE }
+  
+  private static StemFilterType stemFilterType;
+  private static CharArraySet stopSet;
+  
+  
+  /**
+   * Creates an analyzer instance based on Lucene default stopword set if @param useStopFilter is set to true
+   */
+  public LuceneAnalyzerUtil(StemFilterType stemFilterType, boolean useStopFilter) {
+    LuceneAnalyzerUtil.stemFilterType = stemFilterType;
+    if(useStopFilter) {
+      stopSet = StandardAnalyzer.STOP_WORDS_SET;
+    }
+    else {
+      stopSet = null;
+    }
+  }
+  
+  /**
+   * Creates an analyzer instance based on user provided stop words. If @param addToDefault is set to true, then 
+   * user provided stop words will be added to the Lucene default stopset.
+   */
+  public LuceneAnalyzerUtil(StemFilterType stemFilterType, List<String> stopWords, boolean addToDefault) {
+    LuceneAnalyzerUtil.stemFilterType = stemFilterType;
+    if(addToDefault) {
+      stopSet.addAll(stopWords);
+    }
+    else {
+      stopSet = StopFilter.makeStopSet(stopWords);
+    }
+  }
+    
+  @Override
+  protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+    Tokenizer source = new ClassicTokenizer(reader);
+    TokenStream filter = new LowerCaseFilter(source);
+    if(stopSet != null) {
+      filter = new StopFilter(filter, stopSet);
+    }
+    
+    switch(stemFilterType){
+    case PORTERSTEM_FILTER:
+      filter = new PorterStemFilter(filter);
+      break;
+    case ENGLISHMINIMALSTEM_FILTER:
+      filter = new EnglishMinimalStemFilter(filter);
+      break;
+    default:
+      break;        
+    }
+    return new TokenStreamComponents(source, filter);
+  }
+
+}

Added: nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/LuceneTokenizer.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/LuceneTokenizer.java?rev=1729763&view=auto
==============================================================================
--- nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/LuceneTokenizer.java (added)
+++ nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/LuceneTokenizer.java Thu Feb 11 05:32:19 2016
@@ -0,0 +1,132 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.scoring.similarity.util;
+
+import java.io.StringReader;
+import java.util.List;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.core.LowerCaseFilter;
+import org.apache.lucene.analysis.core.StopFilter;
+import org.apache.lucene.analysis.en.EnglishMinimalStemFilter;
+import org.apache.lucene.analysis.en.PorterStemFilter;
+import org.apache.lucene.analysis.standard.ClassicTokenizer;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.nutch.scoring.similarity.util.LuceneAnalyzerUtil.StemFilterType;
+
+public class LuceneTokenizer {
+
+  private TokenStream tokenStream; 
+  private TokenizerType tokenizer;
+  private StemFilterType stemFilterType;
+  private CharArraySet stopSet = null;
+  
+  public static enum TokenizerType {CLASSIC, STANDARD}
+  
+  /**
+   * Creates a tokenizer based on param values
+   * @param content - The text to tokenize
+   * @param tokenizer - the type of tokenizer to use CLASSIC or DEFAULT 
+   * @param useStopFilter - if set to true the token stream will be filtered using default Lucene stopset 
+   * @param stemFilterType - Type of stemming to perform 
+   */
+  public LuceneTokenizer(String content, TokenizerType tokenizer, boolean useStopFilter, StemFilterType stemFilterType) {
+    this.tokenizer = tokenizer;
+    this.stemFilterType = stemFilterType;
+    if(useStopFilter) {
+      stopSet = StandardAnalyzer.STOP_WORDS_SET;
+    }
+    tokenStream = createTokenStream(content);
+  }
+  
+  /**
+   * Creates a tokenizer based on param values
+   * @param content - The text to tokenize
+   * @param tokenizer - the type of tokenizer to use CLASSIC or DEFAULT 
+   * @param stopSet - Provide a set of user defined stop words
+   * @param addToDefault - If set to true, the stopSet words will be added to 
the Lucene default stop set.
+   * If false, then only the user provided words will be used as the stop set
+   * @param stemFilterType
+   */
+  public LuceneTokenizer(String content, TokenizerType tokenizer, List<String> 
stopWords, boolean addToDefault, StemFilterType stemFilterType) {
+    this.tokenizer = tokenizer;
+    this.stemFilterType = stemFilterType;
+    if(addToDefault) {
+      CharArraySet stopSet = 
CharArraySet.copy(StandardAnalyzer.STOP_WORDS_SET);;
+      for(String word : stopWords){
+        stopSet.add(word);
+      }
+      this.stopSet = stopSet;
+    }
+    else {
+      stopSet = new CharArraySet(stopWords, true);
+    }
+    tokenStream = createTokenStream(content);
+  }
+  
+  /**
+   * Returns the tokenStream created by the Tokenizer
+   * @return
+   */
+  public TokenStream getTokenStream() {
+    return tokenStream;
+  }
+  
+  private TokenStream createTokenStream(String content) {
+    tokenStream = generateTokenStreamFromText(content, tokenizer);
+    tokenStream = new LowerCaseFilter(tokenStream);
+    if(stopSet != null) {
+      tokenStream = applyStopFilter(stopSet);
+    }
+    tokenStream = applyStemmer(stemFilterType);
+    return tokenStream;
+  }
+  
+  private TokenStream generateTokenStreamFromText(String content, 
TokenizerType tokenizer){
+    switch(tokenizer){
+    case CLASSIC:
+      tokenStream = new ClassicTokenizer(new StringReader(content));
+      break;
+      
+    case STANDARD:
+      tokenStream = new StandardTokenizer(new StringReader(content));
+    }
+    return tokenStream;
+  }
+  
+  private TokenStream applyStopFilter(CharArraySet stopWords) {
+    tokenStream = new StopFilter(tokenStream, stopWords); 
+    return tokenStream;
+  }
+  
+  private TokenStream applyStemmer(StemFilterType stemFilterType) {
+    switch(stemFilterType){
+    case ENGLISHMINIMALSTEM_FILTER:
+      tokenStream = new EnglishMinimalStemFilter(tokenStream);
+      break;
+    case PORTERSTEM_FILTER:
+      tokenStream = new PorterStemFilter(tokenStream);
+      break;
+     default:
+       break;
+    }
+
+    return tokenStream; 
+  }
+}

Added: 
nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/package-info.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/package-info.java?rev=1729763&view=auto
==============================================================================
--- 
nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/package-info.java
 (added)
+++ 
nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/package-info.java
 Thu Feb 11 05:32:19 2016
@@ -0,0 +1,24 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/**
+ * 
+ */
+/**
+ * Utility classes that wrap Lucene analyzers and tokenizers
+ * (LuceneAnalyzerUtil, LuceneTokenizer) for the similarity scoring plugin.
+ */
+package org.apache.nutch.scoring.similarity.util;
\ No newline at end of file


Reply via email to