Repository: nutch Updated Branches: refs/heads/master a9b2491a3 -> b62f43fda
Fix for NUTCH-2245 contributed by Bhavya Sanghavi Project: http://git-wip-us.apache.org/repos/asf/nutch/repo Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/2c426808 Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/2c426808 Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/2c426808 Branch: refs/heads/master Commit: 2c42680823079faf87705df4d0698dcf8b43ef66 Parents: a9b2491 Author: Bhavya Sanghavi <[email protected]> Authored: Wed Mar 23 22:24:40 2016 -0700 Committer: Sujen Shah <[email protected]> Committed: Sun Apr 3 23:31:37 2016 -0700 ---------------------------------------------------------------------- conf/nutch-default.xml | 6 +++ .../similarity/cosine/CosineSimilarity.java | 3 +- .../nutch/scoring/similarity/cosine/Model.java | 22 +++++++--- .../similarity/util/LuceneTokenizer.java | 44 +++++++++++++++----- 4 files changed, 59 insertions(+), 16 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/nutch/blob/2c426808/conf/nutch-default.xml ---------------------------------------------------------------------- diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml index 93503f3..fe031d5 100644 --- a/conf/nutch-default.xml +++ b/conf/nutch-default.xml @@ -1442,6 +1442,12 @@ visit https://wiki.apache.org/nutch/SimilarityScoringFilter--> </description> </property> + <property> + <name>scoring.similarity.ngrams</name> + <value>1</value> + <description>Specifies the 'n' in ngrams</description> +</property> + <property> <name>cosine.goldstandard.file</name> <value>goldstandard.txt</value> http://git-wip-us.apache.org/repos/asf/nutch/blob/2c426808/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/CosineSimilarity.java ---------------------------------------------------------------------- diff --git a/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/CosineSimilarity.java b/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/CosineSimilarity.java index d41f5e2..81b1eba 100644 --- a/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/CosineSimilarity.java +++ b/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/CosineSimilarity.java @@ -53,7 +53,8 @@ public class CosineSimilarity implements SimilarityModel{ } String metatags = parse.getData().getParseMeta().get("metatag.keyword"); String metaDescription = parse.getData().getParseMeta().get("metatag.description"); - DocVector docVector = Model.createDocVector(parse.getText()+metaDescription+metatags); + int ngram = conf.getInt("scoring.similarity.ngrams", 1); + DocVector docVector = Model.createDocVector(parse.getText()+metaDescription+metatags, ngram); if(docVector!=null){ score = Model.computeCosineSimilarity(docVector); LOG.info("Setting score of {} to {}",url, score); http://git-wip-us.apache.org/repos/asf/nutch/blob/2c426808/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/Model.java ---------------------------------------------------------------------- diff --git a/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/Model.java b/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/Model.java index ba0006a..371f241 100644 --- a/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/Model.java +++ b/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/Model.java @@ -68,6 +68,11 @@ public class Model { } LOG.info("Loaded custom stopwords from {}",conf.get("scoring.similarity.stopword.file")); } + + //Check if user has specified n for ngram cosine model + int ngram = conf.getInt("scoring.similarity.ngrams", 1); + LOG.info("Value of ngram: {}",ngram); + // TODO : Allow for corpus of documents to be provided as gold standard. String line; StringBuilder sb = new StringBuilder(); @@ -75,7 +80,7 @@ public class Model { while ((line = br.readLine()) != null) { sb.append(line); } - DocVector goldStandard = createDocVector(sb.toString()); + DocVector goldStandard = createDocVector(sb.toString(), ngram); if(goldStandard!=null) docVectors.add(goldStandard); else { @@ -97,15 +102,21 @@ public class Model { * Used to create a DocVector from given String text. Used during the parse stage of the crawl * cycle to create a DocVector of the currently parsed page from the parseText attribute value * @param content + * @param ngram */ - public static DocVector createDocVector(String content) { + public static DocVector createDocVector(String content, int ngram) { LuceneTokenizer tokenizer; - if(stopWords!=null) { - tokenizer = new LuceneTokenizer(content, TokenizerType.CLASSIC, stopWords, true, + + if(ngram > 1){ + LOG.info("Using Ngram Cosine Model, user specified ngram value : {}", ngram); + tokenizer = new LuceneTokenizer(content, TokenizerType.STANDARD, StemFilterType.PORTERSTEM_FILTER, ngram); + } + else if(stopWords!=null) { + tokenizer = new LuceneTokenizer(content, TokenizerType.STANDARD, stopWords, true, StemFilterType.PORTERSTEM_FILTER); } else { - tokenizer = new LuceneTokenizer(content, TokenizerType.CLASSIC, true, + tokenizer = new LuceneTokenizer(content, TokenizerType.STANDARD, true, StemFilterType.PORTERSTEM_FILTER); } TokenStream tStream = tokenizer.getTokenStream(); @@ -115,6 +126,7 @@ public class Model { tStream.reset(); while(tStream.incrementToken()) { String term = charTermAttribute.toString(); + LOG.debug(term); if(termVector.containsKey(term)) { int count = termVector.get(term); count++; http://git-wip-us.apache.org/repos/asf/nutch/blob/2c426808/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/LuceneTokenizer.java ---------------------------------------------------------------------- diff --git a/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/LuceneTokenizer.java b/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/LuceneTokenizer.java index 3ce0fee..c95033a 100644 --- a/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/LuceneTokenizer.java +++ b/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/LuceneTokenizer.java @@ -27,6 +27,7 @@ import org.apache.lucene.analysis.en.PorterStemFilter; import org.apache.lucene.analysis.standard.ClassicTokenizer; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.analysis.standard.StandardTokenizer; +import org.apache.lucene.analysis.shingle.ShingleFilter; import org.apache.lucene.analysis.util.CharArraySet; import org.apache.nutch.scoring.similarity.util.LuceneAnalyzerUtil.StemFilterType; @@ -36,9 +37,9 @@ public class LuceneTokenizer { private TokenizerType tokenizer; private StemFilterType stemFilterType; private CharArraySet stopSet = null; - + public static enum TokenizerType {CLASSIC, STANDARD} - + /** * Creates a tokenizer based on param values * @param content - The text to tokenize @@ -54,7 +55,7 @@ public class LuceneTokenizer { } tokenStream = createTokenStream(content); } - + /** * Creates a tokenizer based on param values * @param content - The text to tokenize @@ -79,7 +80,7 @@ public class LuceneTokenizer { } tokenStream = createTokenStream(content); } - + /** * Returns the tokenStream created by the Tokenizer * @return @@ -88,6 +89,19 @@ public class LuceneTokenizer { return tokenStream; } + /** + * Creates a tokenizer for the ngram model based on param values + * @param content - The text to tokenize + * @param tokenizer - the type of tokenizer to use CLASSIC or DEFAULT + * @param stemFilterType - Type of stemming to perform + * @param ngram - Value of ngram for tokenizing + */ + public LuceneTokenizer(String content, TokenizerType tokenizer, StemFilterType stemFilterType, int ngram) { + this.tokenizer = tokenizer; + this.stemFilterType = stemFilterType; + tokenStream = createNGramTokenStream(content,ngram); + } + private TokenStream createTokenStream(String content) { tokenStream = generateTokenStreamFromText(content, tokenizer); tokenStream = new LowerCaseFilter(tokenStream); @@ -97,24 +111,34 @@ public class LuceneTokenizer { tokenStream = applyStemmer(stemFilterType); return tokenStream; } - + private TokenStream generateTokenStreamFromText(String content, TokenizerType tokenizer){ switch(tokenizer){ case CLASSIC: tokenStream = new ClassicTokenizer(new StringReader(content)); break; - + case STANDARD: tokenStream = new StandardTokenizer(new StringReader(content)); } return tokenStream; } - + + private TokenStream createNGramTokenStream(String content, int ngram) { + tokenStream = new StandardTokenizer(new StringReader(content)); + tokenStream = new LowerCaseFilter(tokenStream); + tokenStream = applyStemmer(stemFilterType); + ShingleFilter shingleFilter = new ShingleFilter(tokenStream, ngram, ngram); + shingleFilter.setOutputUnigrams(false); + tokenStream = (TokenStream)shingleFilter; + return tokenStream; + } + private TokenStream applyStopFilter(CharArraySet stopWords) { tokenStream = new StopFilter(tokenStream, stopWords); return tokenStream; } - + private TokenStream applyStemmer(StemFilterType stemFilterType) { switch(stemFilterType){ case ENGLISHMINIMALSTEM_FILTER: @@ -123,8 +147,8 @@ public class LuceneTokenizer { case PORTERSTEM_FILTER: tokenStream = new PorterStemFilter(tokenStream); break; - default: - break; + default: + break; } return tokenStream;
