Author: sujen
Date: Thu Feb 11 05:32:19 2016
New Revision: 1729763
URL: http://svn.apache.org/viewvc?rev=1729763&view=rev
Log:
NUTCH-2209 Improved Tokenization for Similarity Scoring plugin, this closes #87
Added:
nutch/trunk/src/plugin/scoring-similarity/build-ivy.xml
nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/CosineSimilarity.java
nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/DocVector.java
nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/Model.java
nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/package-info.java
nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/
nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/LuceneAnalyzerUtil.java
nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/LuceneTokenizer.java
nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/package-info.java
Removed:
nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/CosineSimilarity.java
nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/DocumentVector.java
nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/ScoringFilterModel.java
nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/CosineSimilarityModel.java
nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/DocumentVector.java
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/conf/nutch-default.xml
nutch/trunk/ivy/ivy.xml
nutch/trunk/src/plugin/scoring-similarity/build.xml
nutch/trunk/src/plugin/scoring-similarity/plugin.xml
nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/SimilarityScoringFilter.java
Modified: nutch/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1729763&r1=1729762&r2=1729763&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Thu Feb 11 05:32:19 2016
@@ -1,5 +1,7 @@
Nutch Change Log
+* NUTCH-2209 Improved Tokenization for Similarity Scoring plugin (Sujen)
+
* NUTCH-2211 Added filterchecker and normalizerchecker to bin/nutch script
(markus)
* NUTCH-2197 Add Solr 5 cloud indexer support (Jurian Broertjes via markus)
Modified: nutch/trunk/conf/nutch-default.xml
URL:
http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1729763&r1=1729762&r2=1729763&view=diff
==============================================================================
--- nutch/trunk/conf/nutch-default.xml (original)
+++ nutch/trunk/conf/nutch-default.xml Thu Feb 11 05:32:19 2016
@@ -1390,7 +1390,16 @@ For more detailed information on the wor
visit https://wiki.apache.org/nutch/SimilarityScoringFilter-->
<property>
- <name>scoring.similarity.model.path</name>
+ <name>scoring.similarity.model</name>
+ <value>cosine</value>
+ <description>The type of similarity metric to use, e.g. cosine (currently
+ the only available model).
+ Please make sure to set the model-specific properties so that the scoring
+ functions properly.
+ A description of these properties can be found on the wiki.
+ </description>
+</property>
+
+<property>
+ <name>cosine.goldstandard.file</name>
<value>goldstandard.txt</value>
<description>Path to the gold standard file which contains all the
relevant text and terms,
pertaining to the domain.
Modified: nutch/trunk/ivy/ivy.xml
URL:
http://svn.apache.org/viewvc/nutch/trunk/ivy/ivy.xml?rev=1729763&r1=1729762&r2=1729763&view=diff
==============================================================================
--- nutch/trunk/ivy/ivy.xml (original)
+++ nutch/trunk/ivy/ivy.xml Thu Feb 11 05:32:19 2016
@@ -87,7 +87,8 @@
<dependency org="com.fasterxml.jackson.core" name="jackson-databind"
rev="2.5.1" conf="*->default"/>
<dependency org="com.fasterxml.jackson.dataformat"
name="jackson-dataformat-cbor" rev="2.5.1" conf="*->default"/>
<dependency org="com.fasterxml.jackson.jaxrs"
name="jackson-jaxrs-json-provider" rev="2.5.1" conf="*->default"/>
-
+
+ <dependency org="org.apache.lucene" name="lucene-analyzers-common"
rev="4.10.2" conf="*->default"></dependency>
<!-- WARC artifacts needed -->
<dependency org="org.netpreserve.commons"
name="webarchive-commons" rev="1.1.5" conf="*->default">
<exclude module="hadoop-core"/>
Added: nutch/trunk/src/plugin/scoring-similarity/build-ivy.xml
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/scoring-similarity/build-ivy.xml?rev=1729763&view=auto
==============================================================================
--- nutch/trunk/src/plugin/scoring-similarity/build-ivy.xml (added)
+++ nutch/trunk/src/plugin/scoring-similarity/build-ivy.xml Thu Feb 11 05:32:19
2016
@@ -0,0 +1,54 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="scoring-similarity" default="deps-jar" xmlns:ivy="antlib:org.apache.ivy.ant">
+
+  <!-- Ivy bootstrap settings; ivy.home falls back to ${user.home}/.ant when
+       the IVY_HOME environment variable is not set. -->
+  <property name="ivy.install.version" value="2.1.0" />
+  <condition property="ivy.home" value="${env.IVY_HOME}">
+    <isset property="env.IVY_HOME" />
+  </condition>
+  <property name="ivy.home" value="${user.home}/.ant" />
+  <property name="ivy.checksums" value="" />
+  <property name="ivy.jar.dir" value="${ivy.home}/lib" />
+  <property name="ivy.jar.file" value="${ivy.jar.dir}/ivy.jar" />
+
+  <target name="download-ivy" unless="offline">
+
+    <mkdir dir="${ivy.jar.dir}"/>
+    <!-- download Ivy from Maven Central so that it can be used even without
+         any special installation; fetched over HTTPS because the jar is
+         executed as part of the build -->
+    <get src="https://repo1.maven.org/maven2/org/apache/ivy/ivy/${ivy.install.version}/ivy-${ivy.install.version}.jar"
+         dest="${ivy.jar.file}" usetimestamp="true"/>
+  </target>
+
+  <target name="init-ivy" depends="download-ivy">
+    <!-- try to load ivy here from ivy home, in case the user has not already dropped
+         it into ant's lib dir (note that the latter copy will always take precedence).
+         We will not fail as long as local lib dir exists (it may be empty) and
+         ivy is in at least one of ant's lib dir or the local lib dir. -->
+    <path id="ivy.lib.path">
+      <fileset dir="${ivy.jar.dir}" includes="*.jar"/>
+
+    </path>
+    <taskdef resource="org/apache/ivy/ant/antlib.xml"
+             uri="antlib:org.apache.ivy.ant" classpathref="ivy.lib.path"/>
+  </target>
+
+  <!-- Retrieves the plugin's dependencies into lib/ -->
+  <target name="deps-jar" depends="init-ivy">
+    <ivy:retrieve pattern="lib/[artifact]-[revision].[ext]"/>
+  </target>
+
+</project>
Modified: nutch/trunk/src/plugin/scoring-similarity/build.xml
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/scoring-similarity/build.xml?rev=1729763&r1=1729762&r2=1729763&view=diff
==============================================================================
--- nutch/trunk/src/plugin/scoring-similarity/build.xml (original)
+++ nutch/trunk/src/plugin/scoring-similarity/build.xml Thu Feb 11 05:32:19 2016
@@ -18,7 +18,15 @@
<project name="scoring-similarity" default="jar-core">
<import file="../build-plugin.xml"/>
-
+ <target name="deps-jar">
+ <ant target="jar" inheritall="false" dir="../indexer-elastic" />
+ </target>
+ <!-- Add compilation dependencies to classpath -->
+ <path id="plugin.deps">
+ <fileset dir="${nutch.root}/build">
+ <include name="**/indexer-elastic/*.jar" />
+ </fileset>
+ </path>
<!-- Deploy Unit test dependencies -->
<target name="deps-test">
<ant target="deploy" inheritall="false" dir="../nutch-extensionpoints"/>
Modified: nutch/trunk/src/plugin/scoring-similarity/plugin.xml
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/scoring-similarity/plugin.xml?rev=1729763&r1=1729762&r2=1729763&view=diff
==============================================================================
--- nutch/trunk/src/plugin/scoring-similarity/plugin.xml (original)
+++ nutch/trunk/src/plugin/scoring-similarity/plugin.xml Thu Feb 11 05:32:19
2016
@@ -26,8 +26,13 @@
<library name="scoring-similarity.jar">
<export name="*"/>
</library>
+ <library name="lucene-core-4.10.2.jar"/>
</runtime>
+ <requires>
+ <import plugin="nutch-extensionpoints"/>
+ </requires>
+
<extension id="org.apache.nutch.scoring.similarity"
name="SimilarityScoring"
point="org.apache.nutch.scoring.ScoringFilter">
Modified:
nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/SimilarityScoringFilter.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/SimilarityScoringFilter.java?rev=1729763&r1=1729762&r2=1729763&view=diff
==============================================================================
---
nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/SimilarityScoringFilter.java
(original)
+++
nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/SimilarityScoringFilter.java
Thu Feb 11 05:32:19 2016
@@ -17,6 +17,7 @@
package org.apache.nutch.scoring.similarity;
import java.util.Collection;
+import java.util.List;
import java.util.Map.Entry;
import org.apache.hadoop.conf.Configuration;
@@ -28,13 +29,12 @@ import org.apache.nutch.parse.ParseData;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.scoring.AbstractScoringFilter;
import org.apache.nutch.scoring.ScoringFilterException;
-import org.apache.nutch.scoring.similarity.cosine.CosineSimilarityModel;
+import org.apache.nutch.scoring.similarity.cosine.CosineSimilarity;
public class SimilarityScoringFilter extends AbstractScoringFilter {
private Configuration conf;
private SimilarityModel similarityModel;
-
@Override
public Configuration getConf() {
return conf;
@@ -43,7 +43,11 @@ public class SimilarityScoringFilter ext
@Override
public void setConf(Configuration conf) {
this.conf = conf;
- similarityModel = new CosineSimilarityModel();
+    // Select the similarity model named by scoring.similarity.model
+    // (defaults to "cosine"). An unknown name is rejected explicitly;
+    // otherwise similarityModel would stay null and the setConf call
+    // below would fail with an unexplained NullPointerException.
+    String modelName = conf.get("scoring.similarity.model", "cosine");
+    switch (modelName) {
+    case "cosine":
+      similarityModel = new CosineSimilarity();
+      break;
+    default:
+      throw new IllegalArgumentException(
+          "Unsupported value for scoring.similarity.model: " + modelName);
+    }
similarityModel.setConf(conf);
}
Added:
nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/CosineSimilarity.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/CosineSimilarity.java?rev=1729763&view=auto
==============================================================================
---
nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/CosineSimilarity.java
(added)
+++
nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/CosineSimilarity.java
Thu Feb 11 05:32:19 2016
@@ -0,0 +1,81 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.scoring.similarity.cosine;
+
+import java.io.IOException;
+import java.util.Collection;
+import java.util.Map.Entry;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.util.StringUtils;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.scoring.similarity.SimilarityModel;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public class CosineSimilarity implements SimilarityModel{
+
+ private Configuration conf;
+ private final static Logger LOG = LoggerFactory
+ .getLogger(CosineSimilarity.class);
+
+ @Override
+ public void setConf(Configuration conf) {
+ this.conf = conf;
+ }
+
+ @Override
+ public float setURLScoreAfterParsing(Text url, Content content, Parse parse)
{
+ float score = 1;
+
+ try {
+ if(!Model.isModelCreated){
+ Model.createModel(conf);
+ }
+ String metatags = parse.getData().getParseMeta().get("metatag.keyword");
+ String metaDescription =
parse.getData().getParseMeta().get("metatag.description");
+ DocVector docVector =
Model.createDocVector(parse.getText()+metaDescription+metatags);
+ if(docVector!=null){
+ score = Model.computeCosineSimilarity(docVector);
+ LOG.info("Setting score of {} to {}",url, score);
+ }
+ else {
+ throw new Exception("Could not create DocVector from parsed text");
+ }
+ } catch (Exception e) {
+ LOG.error("Error creating Cosine Model, setting scores of urls to 1 :
{}", StringUtils.stringifyException(e));
+ }
+ return score;
+ }
+
+ @Override
+ public CrawlDatum distributeScoreToOutlinks(Text fromUrl, ParseData
parseData,
+ Collection<Entry<Text, CrawlDatum>> targets, CrawlDatum adjust,
+ int allCount) {
+ float score =
Float.parseFloat(parseData.getContentMeta().get(Nutch.SCORE_KEY));
+ for (Entry<Text, CrawlDatum> target : targets) {
+ target.getValue().setScore(score);
+ }
+ return adjust;
+ }
+
+}
Added:
nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/DocVector.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/DocVector.java?rev=1729763&view=auto
==============================================================================
---
nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/DocVector.java
(added)
+++
nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/DocVector.java
Thu Feb 11 05:32:19 2016
@@ -0,0 +1,57 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.scoring.similarity.cosine;
+
+import java.util.HashMap;
+import java.util.Map;
+
+public class DocVector {
+
+ public HashMap<Integer, Long> termVector;
+ public HashMap<String, Integer> termFreqVector;
+
+ public DocVector() {
+ termFreqVector = new HashMap<>();
+ }
+
+ public void setTermFreqVector(HashMap<String, Integer> termFreqVector) {
+ this.termFreqVector = termFreqVector;
+ }
+
+ public void setVectorEntry(int pos, long freq) {
+ termVector.put(pos, freq);
+ }
+
+ public float dotProduct(DocVector docVector) {
+ float product = 0;
+ for(Map.Entry<String, Integer> entry : termFreqVector.entrySet()) {
+ if(docVector.termFreqVector.containsKey(entry.getKey())) {
+ product +=
docVector.termFreqVector.get(entry.getKey())*entry.getValue();
+ }
+ }
+ return product;
+ }
+
+ public float getL2Norm() {
+ float sum = 0;
+ for(Map.Entry<String, Integer> entry : termFreqVector.entrySet()) {
+ sum += entry.getValue()*entry.getValue();
+ }
+ return (float) Math.sqrt(sum);
+ }
+
+}
Added:
nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/Model.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/Model.java?rev=1729763&view=auto
==============================================================================
---
nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/Model.java
(added)
+++
nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/Model.java
Thu Feb 11 05:32:19 2016
@@ -0,0 +1,150 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.scoring.similarity.cosine;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.List;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.util.StringUtils;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import
org.apache.nutch.scoring.similarity.util.LuceneAnalyzerUtil.StemFilterType;
+import org.apache.nutch.scoring.similarity.util.LuceneTokenizer;
+import org.apache.nutch.scoring.similarity.util.LuceneTokenizer.TokenizerType;
+import org.apache.tika.Tika;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * This class creates a model used to store Document vector representation of
the corpus.
+ *
+ */
+public class Model {
+
+ //Currently only one file, but in future could accept a corpus hence an
ArrayList
+ public static ArrayList<DocVector> docVectors = new ArrayList<>();
+ private static final Logger LOG = LoggerFactory.getLogger(Model.class);
+ public static boolean isModelCreated = false;
+ private static List<String> stopWords;
+
+ public static synchronized void createModel(Configuration conf) throws
IOException {
+ if(isModelCreated) {
+ LOG.info("Model exists, skipping model creation");
+ return;
+ }
+ LOG.info("Creating Cosine model");
+ try {
+ //If user has specified a stopword file other than the template
+
if(!conf.get("scoring.similarity.stopword.file").equals("stopwords.txt.template"))
{
+ stopWords = new ArrayList<String>();
+ String stopWord;
+ BufferedReader br = new
BufferedReader(conf.getConfResourceAsReader((conf.get("scoring.similarity.stopword.file"))));
+ while ((stopWord = br.readLine()) != null) {
+ stopWords.add(stopWord);
+ }
+ LOG.info("Loaded custom stopwords from
{}",conf.get("scoring.similarity.stopword.file"));
+ }
+ // TODO : Allow for corpus of documents to be provided as gold standard.
+ String line;
+ StringBuilder sb = new StringBuilder();
+ BufferedReader br = new
BufferedReader(conf.getConfResourceAsReader((conf.get("cosine.goldstandard.file"))));
+ while ((line = br.readLine()) != null) {
+ sb.append(line);
+ }
+ DocVector goldStandard = createDocVector(sb.toString());
+ if(goldStandard!=null)
+ docVectors.add(goldStandard);
+ else {
+ throw new Exception("Could not create DocVector for goldstandard");
+ }
+ } catch (Exception e) {
+ LOG.warn("Failed to add {} to model :
{}",conf.get("cosine.goldstandard.file","goldstandard.txt.template"),
+ StringUtils.stringifyException(e));
+ }
+ if(docVectors.size()>0) {
+ LOG.info("Cosine model creation complete");
+ isModelCreated = true;
+ }
+ else
+ LOG.info("Cosine model creation failed");
+ }
+
+ /**
+ * Used to create a DocVector from given String text. Used during the parse
stage of the crawl
+ * cycle to create a DocVector of the currently parsed page from the
parseText attribute value
+ * @param content
+ */
+ public static DocVector createDocVector(String content) {
+ LuceneTokenizer tokenizer;
+ if(stopWords!=null) {
+ tokenizer = new LuceneTokenizer(content, TokenizerType.CLASSIC,
stopWords, true,
+ StemFilterType.PORTERSTEM_FILTER);
+ }
+ else {
+ tokenizer = new LuceneTokenizer(content, TokenizerType.CLASSIC, true,
+ StemFilterType.PORTERSTEM_FILTER);
+ }
+ TokenStream tStream = tokenizer.getTokenStream();
+ HashMap<String, Integer> termVector = new HashMap<>();
+ try {
+ CharTermAttribute charTermAttribute =
tStream.addAttribute(CharTermAttribute.class);
+ tStream.reset();
+ while(tStream.incrementToken()) {
+ String term = charTermAttribute.toString();
+ if(termVector.containsKey(term)) {
+ int count = termVector.get(term);
+ count++;
+ termVector.put(term, count);
+ }
+ else {
+ termVector.put(term, 1);
+ }
+ }
+ DocVector docVector = new DocVector();
+ docVector.setTermFreqVector(termVector);
+ return docVector;
+ } catch (IOException e) {
+ LOG.error("Error creating DocVector :
{}",StringUtils.stringifyException(e));
+ }
+ return null;
+ }
+
+ public static float computeCosineSimilarity(DocVector docVector) {
+ float scores[] = new float[docVectors.size()];
+ int i=0;
+ float maxScore = 0;
+ for(DocVector corpusDoc : docVectors) {
+ float numerator = docVector.dotProduct(corpusDoc);
+ float denominator = docVector.getL2Norm()*corpusDoc.getL2Norm();
+ float currentScore = numerator/denominator;
+ scores[i++] = currentScore;
+ maxScore = (currentScore>maxScore)? currentScore : maxScore;
+ }
+ // Returning the max score amongst all documents in the corpus
+ return maxScore;
+ }
+}
\ No newline at end of file
Added:
nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/package-info.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/package-info.java?rev=1729763&view=auto
==============================================================================
---
nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/package-info.java
(added)
+++
nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/package-info.java
Thu Feb 11 05:32:19 2016
@@ -0,0 +1,7 @@
+/**
+ * Implements the cosine similarity metric for scoring relevant documents.
+ */
+package org.apache.nutch.scoring.similarity.cosine;
\ No newline at end of file
Added:
nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/LuceneAnalyzerUtil.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/LuceneAnalyzerUtil.java?rev=1729763&view=auto
==============================================================================
---
nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/LuceneAnalyzerUtil.java
(added)
+++
nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/LuceneAnalyzerUtil.java
Thu Feb 11 05:32:19 2016
@@ -0,0 +1,93 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.scoring.similarity.util;
+
+import java.io.Reader;
+import java.util.List;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.LowerCaseFilter;
+import org.apache.lucene.analysis.core.StopFilter;
+import org.apache.lucene.analysis.en.EnglishMinimalStemFilter;
+import org.apache.lucene.analysis.en.PorterStemFilter;
+import org.apache.lucene.analysis.standard.ClassicTokenizer;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.analysis.util.CharArraySet;
+
+/**
+ * Creates a custom analyzer based on user provided inputs. The analysis chain
+ * is: ClassicTokenizer, lower-casing, optional stop word filtering and
+ * optional stemming.
+ */
+public class LuceneAnalyzerUtil extends Analyzer {
+
+  public static enum StemFilterType { PORTERSTEM_FILTER, ENGLISHMINIMALSTEM_FILTER, NONE }
+
+  // Per-instance settings. These used to be static, so every newly created
+  // analyzer silently reconfigured all existing ones.
+  private final StemFilterType stemFilterType;
+  private final CharArraySet stopSet;
+
+  /**
+   * Creates an analyzer instance that uses the Lucene default stopword set.
+   *
+   * @param stemFilterType type of stemming to perform; {@code null} is
+   *        treated as {@link StemFilterType#NONE}
+   * @param useStopFilter if true, tokens are filtered with the Lucene default
+   *        stopword set; if false, no stop filtering is performed
+   */
+  public LuceneAnalyzerUtil(StemFilterType stemFilterType, boolean useStopFilter) {
+    this.stemFilterType = (stemFilterType != null) ? stemFilterType : StemFilterType.NONE;
+    this.stopSet = useStopFilter ? StandardAnalyzer.STOP_WORDS_SET : null;
+  }
+
+  /**
+   * Creates an analyzer instance based on user provided stop words.
+   *
+   * @param stemFilterType type of stemming to perform; {@code null} is
+   *        treated as {@link StemFilterType#NONE}
+   * @param stopWords user provided stop words, matched ignoring case
+   * @param addToDefault if true, the user provided stop words are used in
+   *        addition to the Lucene default stopset; if false, only the user
+   *        provided words are used
+   */
+  public LuceneAnalyzerUtil(StemFilterType stemFilterType, List<String> stopWords, boolean addToDefault) {
+    this.stemFilterType = (stemFilterType != null) ? stemFilterType : StemFilterType.NONE;
+    if (addToDefault) {
+      // The default set is unmodifiable, so extend a private copy of it.
+      CharArraySet merged = new CharArraySet(StandardAnalyzer.STOP_WORDS_SET, true);
+      for (String word : stopWords) {
+        merged.add(word);
+      }
+      this.stopSet = merged;
+    } else {
+      // ignoreCase: tokens are lower-cased before the stop filter sees them
+      this.stopSet = new CharArraySet(stopWords, true);
+    }
+  }
+
+  @Override
+  protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+    Tokenizer source = new ClassicTokenizer(reader);
+    TokenStream filter = new LowerCaseFilter(source);
+    if (stopSet != null) {
+      filter = new StopFilter(filter, stopSet);
+    }
+
+    switch (stemFilterType) {
+    case PORTERSTEM_FILTER:
+      filter = new PorterStemFilter(filter);
+      break;
+    case ENGLISHMINIMALSTEM_FILTER:
+      filter = new EnglishMinimalStemFilter(filter);
+      break;
+    default:
+      // NONE: no stemming
+      break;
+    }
+    return new TokenStreamComponents(source, filter);
+  }
+
+}
Added:
nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/LuceneTokenizer.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/LuceneTokenizer.java?rev=1729763&view=auto
==============================================================================
---
nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/LuceneTokenizer.java
(added)
+++
nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/LuceneTokenizer.java
Thu Feb 11 05:32:19 2016
@@ -0,0 +1,132 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.scoring.similarity.util;
+
+import java.io.StringReader;
+import java.util.List;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.core.LowerCaseFilter;
+import org.apache.lucene.analysis.core.StopFilter;
+import org.apache.lucene.analysis.en.EnglishMinimalStemFilter;
+import org.apache.lucene.analysis.en.PorterStemFilter;
+import org.apache.lucene.analysis.standard.ClassicTokenizer;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.apache.lucene.analysis.util.CharArraySet;
+import
org.apache.nutch.scoring.similarity.util.LuceneAnalyzerUtil.StemFilterType;
+
+/**
+ * Builds a Lucene {@link TokenStream} over a piece of text: tokenizing,
+ * lower-casing, optional stop word filtering and optional stemming.
+ */
+public class LuceneTokenizer {
+
+  public static enum TokenizerType {CLASSIC, STANDARD}
+
+  private TokenStream tokenStream;
+  private TokenizerType tokenizer;
+  private StemFilterType stemFilterType;
+  private CharArraySet stopSet = null;
+
+  /**
+   * Creates a tokenizer based on param values
+   * @param content - The text to tokenize
+   * @param tokenizer - the type of tokenizer to use CLASSIC or STANDARD
+   * @param useStopFilter - if set to true the token stream will be filtered
+   * using default Lucene stopset
+   * @param stemFilterType - Type of stemming to perform
+   */
+  public LuceneTokenizer(String content, TokenizerType tokenizer, boolean useStopFilter, StemFilterType stemFilterType) {
+    this.tokenizer = tokenizer;
+    this.stemFilterType = stemFilterType;
+    if (useStopFilter) {
+      this.stopSet = StandardAnalyzer.STOP_WORDS_SET;
+    }
+    this.tokenStream = createTokenStream(content);
+  }
+
+  /**
+   * Creates a tokenizer based on param values
+   * @param content - The text to tokenize
+   * @param tokenizer - the type of tokenizer to use CLASSIC or STANDARD
+   * @param stopWords - Provide a set of user defined stop words, matched
+   * ignoring case
+   * @param addToDefault - If set to true, the stopWords will be added to
+   * the Lucene default stop set.
+   * If false, then only the user provided words will be used as the stop set
+   * @param stemFilterType - Type of stemming to perform
+   */
+  public LuceneTokenizer(String content, TokenizerType tokenizer, List<String> stopWords, boolean addToDefault, StemFilterType stemFilterType) {
+    this.tokenizer = tokenizer;
+    this.stemFilterType = stemFilterType;
+    if (addToDefault) {
+      // Case-insensitive copy of the defaults: tokens are lower-cased before
+      // the stop filter runs, so a user entry such as "Nutch" would never
+      // match in a case-sensitive set.
+      CharArraySet merged = new CharArraySet(StandardAnalyzer.STOP_WORDS_SET, true);
+      for (String word : stopWords) {
+        merged.add(word);
+      }
+      this.stopSet = merged;
+    } else {
+      this.stopSet = new CharArraySet(stopWords, true);
+    }
+    this.tokenStream = createTokenStream(content);
+  }
+
+  /**
+   * Returns the tokenStream created by the Tokenizer
+   * @return the token stream; the caller is responsible for resetting,
+   * consuming and closing it
+   */
+  public TokenStream getTokenStream() {
+    return tokenStream;
+  }
+
+  /** Assembles the chain: tokenizer, lower-casing, stop filter, stemmer. */
+  private TokenStream createTokenStream(String content) {
+    TokenStream stream = new LowerCaseFilter(
+        generateTokenStreamFromText(content, tokenizer));
+    if (stopSet != null) {
+      stream = applyStopFilter(stream, stopSet);
+    }
+    return applyStemmer(stream, stemFilterType);
+  }
+
+  /** Creates the raw tokenizer of the requested type over the text. */
+  private TokenStream generateTokenStreamFromText(String content, TokenizerType tokenizer) {
+    switch (tokenizer) {
+    case STANDARD:
+      return new StandardTokenizer(new StringReader(content));
+    case CLASSIC:
+    default:
+      return new ClassicTokenizer(new StringReader(content));
+    }
+  }
+
+  /** Wraps the stream in a filter that drops the given stop words. */
+  private TokenStream applyStopFilter(TokenStream stream, CharArraySet stopWords) {
+    return new StopFilter(stream, stopWords);
+  }
+
+  /** Wraps the stream in the requested stemmer; null or NONE leaves it as is. */
+  private TokenStream applyStemmer(TokenStream stream, StemFilterType stemFilterType) {
+    if (stemFilterType == null) {
+      return stream;
+    }
+    switch (stemFilterType) {
+    case ENGLISHMINIMALSTEM_FILTER:
+      return new EnglishMinimalStemFilter(stream);
+    case PORTERSTEM_FILTER:
+      return new PorterStemFilter(stream);
+    default:
+      return stream;
+    }
+  }
+}
Added:
nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/package-info.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/package-info.java?rev=1729763&view=auto
==============================================================================
---
nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/package-info.java
(added)
+++
nutch/trunk/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/package-info.java
Thu Feb 11 05:32:19 2016
@@ -0,0 +1,24 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/**
+ * Utility package for Lucene functions (tokenizing, stop word filtering and
+ * stemming).
+ */
+package org.apache.nutch.scoring.similarity.util;
\ No newline at end of file