Repository: nutch
Updated Branches:
  refs/heads/master 956538984 -> da252eb7b
NUTCH-2263 mingram and maxgram support for Unigram Cosine Similarity Model is provided.

Project: http://git-wip-us.apache.org/repos/asf/nutch/repo
Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/da252eb7
Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/da252eb7
Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/da252eb7

Branch: refs/heads/master
Commit: da252eb7b3d2d7b7021480db3bec1d82e6fa564d
Parents: 9565389
Author: Furkan KAMACI <[email protected]>
Authored: Thu May 19 04:13:04 2016 +0300
Committer: Furkan KAMACI <[email protected]>
Committed: Thu May 19 04:13:04 2016 +0300

----------------------------------------------------------------------
 conf/nutch-default.xml                          |  6 ++-
 .../similarity/cosine/CosineSimilarity.java     |  6 ++-
 .../nutch/scoring/similarity/cosine/Model.java  | 50 +++++++++++++++-----
 .../similarity/util/LuceneTokenizer.java        | 11 +++--
 4 files changed, 53 insertions(+), 20 deletions(-)
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/nutch/blob/da252eb7/conf/nutch-default.xml
----------------------------------------------------------------------
diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index 641809f..51b3fd9 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml
@@ -1444,8 +1444,10 @@ visit https://wiki.apache.org/nutch/SimilarityScoringFilter-->
 <property>
   <name>scoring.similarity.ngrams</name>
-  <value>1</value>
-  <description>Specifies the 'n' in ngrams</description>
+  <value>1,1</value>
+  <description>Specifies the min 'n' and max 'n' in ngrams as comma-separated.
+    If one value is specified as 'n', it will be used for both the min 'n' and max 'n' in ngrams.
+  </description>
 </property>
 <property>

http://git-wip-us.apache.org/repos/asf/nutch/blob/da252eb7/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/CosineSimilarity.java
----------------------------------------------------------------------
diff --git a/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/CosineSimilarity.java b/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/CosineSimilarity.java
index 81b1eba..9853b34 100644
--- a/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/CosineSimilarity.java
+++ b/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/CosineSimilarity.java
@@ -53,8 +53,10 @@ public class CosineSimilarity implements SimilarityModel{
     }
     String metatags = parse.getData().getParseMeta().get("metatag.keyword");
     String metaDescription = parse.getData().getParseMeta().get("metatag.description");
-    int ngram = conf.getInt("scoring.similarity.ngrams", 1);
-    DocVector docVector = Model.createDocVector(parse.getText()+metaDescription+metatags, ngram);
+    int[] ngramArr = Model.retrieveNgrams(conf);
+    int mingram = ngramArr[0];
+    int maxgram = ngramArr[1];
+    DocVector docVector = Model.createDocVector(parse.getText()+metaDescription+metatags, mingram, maxgram);
     if(docVector!=null){
       score = Model.computeCosineSimilarity(docVector);
       LOG.info("Setting score of {} to {}",url, score);

http://git-wip-us.apache.org/repos/asf/nutch/blob/da252eb7/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/Model.java
----------------------------------------------------------------------
diff --git a/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/Model.java b/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/Model.java
index 371f241..d8180f2 100644
--- a/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/Model.java
+++ b/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/Model.java
@@ -69,9 +69,10 @@ public class Model {
         LOG.info("Loaded custom stopwords from {}",conf.get("scoring.similarity.stopword.file"));
       }
-      //Check if user has specified n for ngram cosine model
-      int ngram = conf.getInt("scoring.similarity.ngrams", 1);
-      LOG.info("Value of ngram: {}",ngram);
+      int[] ngramArr = retrieveNgrams(conf);
+      int mingram = ngramArr[0];
+      int maxgram = ngramArr[1];
+      LOG.info("Value of mingram: {} maxgram: {}", mingram, maxgram);
       // TODO : Allow for corpus of documents to be provided as gold standard.
       String line;
@@ -80,7 +81,7 @@ public class Model {
       while ((line = br.readLine()) != null) {
         sb.append(line);
       }
-      DocVector goldStandard = createDocVector(sb.toString(), ngram);
+      DocVector goldStandard = createDocVector(sb.toString(), mingram, maxgram);
       if(goldStandard!=null)
         docVectors.add(goldStandard);
       else {
@@ -101,15 +102,20 @@ public class Model {
   /**
    * Used to create a DocVector from given String text. Used during the parse stage of the crawl
    * cycle to create a DocVector of the currently parsed page from the parseText attribute value
-   * @param content
-   * @param ngram
+   * @param content The text to tokenize
+   * @param mingram Value of mingram for tokenizing
+   * @param maxgram Value of maxgram for tokenizing
    */
-  public static DocVector createDocVector(String content, int ngram) {
+  public static DocVector createDocVector(String content, int mingram, int maxgram) {
     LuceneTokenizer tokenizer;
-
-    if(ngram > 1){
-      LOG.info("Using Ngram Cosine Model, user specified ngram value : {}", ngram);
-      tokenizer = new LuceneTokenizer(content, TokenizerType.STANDARD, StemFilterType.PORTERSTEM_FILTER, ngram);
+
+    if(mingram > 1 && maxgram > 1){
+      LOG.info("Using Ngram Cosine Model, user specified mingram value : {} maxgram value : {}", mingram, maxgram);
+      tokenizer = new LuceneTokenizer(content, TokenizerType.STANDARD, StemFilterType.PORTERSTEM_FILTER, mingram, maxgram);
+    } else if (mingram > 1) {
+      maxgram = mingram;
+      LOG.info("Using Ngram Cosine Model, user specified mingram value : {} maxgram value : {}", mingram, maxgram);
+      tokenizer = new LuceneTokenizer(content, TokenizerType.STANDARD, StemFilterType.PORTERSTEM_FILTER, mingram, maxgram);
     } else if(stopWords!=null) {
       tokenizer = new LuceneTokenizer(content, TokenizerType.STANDARD, stopWords, true,
@@ -159,4 +165,26 @@ public class Model {
     // Returning the max score amongst all documents in the corpus
     return maxScore;
   }
+
+  /**
+   * Retrieves mingram and maxgram from configuration
+   * @param conf Configuration to retrieve mingram and maxgram
+   * @return ngram array as mingram at first index and maxgram at second index
+   */
+  public static int[] retrieveNgrams(Configuration conf){
+    int[] ngramArr = new int[2];
+    //Check if user has specified mingram or ngram for ngram cosine model
+    String[] ngramStr = conf.getStrings("scoring.similarity.ngrams", "1,1");
+    //mingram
+    ngramArr[0] = Integer.parseInt(ngramStr[0]);
+    int maxgram;
+    if (ngramStr.length > 1) {
+      //maxgram
+      ngramArr[1] = Integer.parseInt(ngramStr[1]);
+    } else {
+      //maxgram
+      ngramArr[1] = ngramArr[0];
+    }
+    return ngramArr;
+  }
 }
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/da252eb7/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/LuceneTokenizer.java
----------------------------------------------------------------------
diff --git a/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/LuceneTokenizer.java b/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/LuceneTokenizer.java
index c95033a..6f6d4d4 100644
--- a/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/LuceneTokenizer.java
+++ b/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/LuceneTokenizer.java
@@ -94,12 +94,13 @@ public class LuceneTokenizer {
    * @param content - The text to tokenize
    * @param tokenizer - the type of tokenizer to use CLASSIC or DEFAULT
    * @param stemFilterType - Type of stemming to perform
-   * @param ngram - Value of ngram for tokenizing
+   * @param mingram - Value of mingram for tokenizing
+   * @param maxgram - Value of maxgram for tokenizing
    */
-  public LuceneTokenizer(String content, TokenizerType tokenizer, StemFilterType stemFilterType, int ngram) {
+  public LuceneTokenizer(String content, TokenizerType tokenizer, StemFilterType stemFilterType, int mingram, int maxgram) {
     this.tokenizer = tokenizer;
     this.stemFilterType = stemFilterType;
-    tokenStream = createNGramTokenStream(content,ngram);
+    tokenStream = createNGramTokenStream(content, mingram, maxgram);
   }
   private TokenStream createTokenStream(String content) {
@@ -124,11 +125,11 @@ public class LuceneTokenizer {
     return tokenStream;
   }
-  private TokenStream createNGramTokenStream(String content, int ngram) {
+  private TokenStream createNGramTokenStream(String content, int mingram, int maxgram) {
     tokenStream = new StandardTokenizer(new StringReader(content));
     tokenStream = new LowerCaseFilter(tokenStream);
     tokenStream = applyStemmer(stemFilterType);
-    ShingleFilter shingleFilter = new ShingleFilter(tokenStream, ngram, ngram);
+    ShingleFilter shingleFilter = new ShingleFilter(tokenStream, mingram, maxgram);
     shingleFilter.setOutputUnigrams(false);
     tokenStream = (TokenStream)shingleFilter;
     return tokenStream;
