OK. I did this for Nutch 0.8 (I had to edit the listed code somewhat to
account for the changes between 0.7.2 and 0.8 - mostly because a
Configuration object now has to be passed around).
It partially works.
If the page I'm trying to index contains the word "interviews" and I
type in the search engine "interview", the stemming takes place and the
page with the word "interviews" is returned.
However, if I type in the word "interviews" no page is returned. (The
page with the word interviews on it should be returned).
Any ideas??
Matt
Dima Mazmanov wrote:
> Hi,
>
> I've gotten a couple of questions off-list about stemming,
> so I thought I'd just post my changes here. Sorry that
> some of the changes are in the main code and not in a plugin; it
> seemed more efficient to put them in the main analyzer. It
> would be nice if later releases could add support for plugging
> in a custom stemmer/analyzer.
>
> The first change I made is in NutchDocumentAnalyzer.java.
>
> Import the following classes at the top of the file:
> import org.apache.lucene.analysis.LowerCaseTokenizer;
> import org.apache.lucene.analysis.LowerCaseFilter;
> import org.apache.lucene.analysis.PorterStemFilter;
>
> Change tokenStream to:
>
> public TokenStream tokenStream(String field, Reader reader) {
> TokenStream ts = CommonGrams.getFilter(new NutchDocumentTokenizer(reader),
> field);
> if (field.equals("content") || field.equals("title")) {
> ts = new LowerCaseFilter(ts);
> return new PorterStemFilter(ts);
> } else {
> return ts;
> }
> }
>
> The second change is in CommonGrams.java.
> Import the following classes near the top:
>
> import org.apache.lucene.analysis.LowerCaseTokenizer;
> import org.apache.lucene.analysis.LowerCaseFilter;
> import org.apache.lucene.analysis.PorterStemFilter;
>
> In optimizePhrase, after this line:
>
> TokenStream ts = getFilter(new ArrayTokens(phrase), field);
>
> Add:
>
> ts = new PorterStemFilter(new LowerCaseFilter(ts));
>
> And the rest is a new QueryFilter plugin that I'm calling query-stemmer.
> Here's the full source for the Java file. You can copy the build.xml
> and plugin.xml from query-basic, and alter the names for query-stemmer.
>
> /* Copyright (c) 2003 The Nutch Organization. All rights reserved. */
> /* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
>
> package org.apache.nutch.searcher.stemmer;
>
> import org.apache.lucene.search.BooleanQuery;
> import org.apache.lucene.search.PhraseQuery;
> import org.apache.lucene.search.TermQuery;
> import org.apache.lucene.analysis.TokenFilter;
> import org.apache.lucene.analysis.TokenStream;
> import org.apache.lucene.analysis.Token;
> import org.apache.lucene.analysis.LowerCaseTokenizer;
> import org.apache.lucene.analysis.LowerCaseFilter;
> import org.apache.lucene.analysis.PorterStemFilter;
>
> import org.apache.nutch.analysis.NutchDocumentAnalyzer;
> import org.apache.nutch.analysis.CommonGrams;
>
> import org.apache.nutch.searcher.QueryFilter;
> import org.apache.nutch.searcher.Query;
> import org.apache.nutch.searcher.Query.*;
>
> import java.io.IOException;
> import java.util.HashSet;
> import java.io.StringReader;
>
> /** The default query filter. Query terms in the default query field are
> * expanded to search the url, anchor and content document fields.*/
> public class StemmerQueryFilter implements QueryFilter {
>
> private static float URL_BOOST = 4.0f;
> private static float ANCHOR_BOOST = 2.0f;
>
> private static int SLOP = Integer.MAX_VALUE;
> private static float PHRASE_BOOST = 1.0f;
>
> private static final String[] FIELDS = {"url", "anchor", "content",
> "title"};
> private static final float[] FIELD_BOOSTS = {URL_BOOST, ANCHOR_BOOST,
> 1.0f, 2.0f};
>
> /** Set the boost factor for url matches, relative to content and anchor
> * matches */
> public static void setUrlBoost(float boost) { URL_BOOST = boost; }
>
> /** Set the boost factor for title/anchor matches, relative to url and
> * content matches. */
> public static void setAnchorBoost(float boost) { ANCHOR_BOOST = boost; }
>
> /** Set the boost factor for sloppy phrase matches relative to unordered
> term
> * matches. */
> public static void setPhraseBoost(float boost) { PHRASE_BOOST = boost; }
>
> /** Set the maximum number of terms permitted between matching terms in a
> * sloppy phrase match. */
> public static void setSlop(int slop) { SLOP = slop; }
>
> public BooleanQuery filter(Query input, BooleanQuery output) {
> addTerms(input, output);
> addSloppyPhrases(input, output);
> return output;
> }
>
> private static void addTerms(Query input, BooleanQuery output) {
> Clause[] clauses = input.getClauses();
> for (int i = 0; i < clauses.length; i++) {
> Clause c = clauses[i];
>
> if (!c.getField().equals(Clause.DEFAULT_FIELD))
> continue; // skip non-default fields
>
> BooleanQuery out = new BooleanQuery();
> for (int f = 0; f < FIELDS.length; f++) {
>
> Clause o = c;
> String[] opt;
>
> // TODO: I'm a little nervous about stemming for all default fields.
> // Should keep an eye on this.
> if (c.isPhrase()) { // optimize phrase
> clauses
> opt = CommonGrams.optimizePhrase(c.getPhrase(), FIELDS[f]);
> } else {
> System.out.println("o.getTerm = " + o.getTerm().toString());
> opt = getStemmedWords(o.getTerm().toString());
> }
> if (opt.length==1) {
> o = new Clause(new Term(opt[0]), c.isRequired(),
> c.isProhibited());
> } else {
> o = new Clause(new Phrase(opt), c.isRequired(),
> c.isProhibited());
> }
>
> out.add(o.isPhrase()
> ? exactPhrase(o.getPhrase(), FIELDS[f], FIELD_BOOSTS[f])
> : termQuery(FIELDS[f], o.getTerm(), FIELD_BOOSTS[f]),
> false, false);
> }
> output.add(out, c.isRequired(), c.isProhibited());
> }
> System.out.println("query = " + output.toString());
> }
>
> private static String[] getStemmedWords(String value) {
> StringReader sr = new StringReader(value);
> TokenStream ts = new PorterStemFilter(new LowerCaseTokenizer(sr));
>
> String stemmedValue = "";
> try {
> Token token = ts.next();
> int count = 0;
> while (token != null) {
> System.out.println("token = " + token.termText());
> System.out.println("type = " + token.type());
>
> if (count == 0)
> stemmedValue = token.termText();
> else
> stemmedValue = stemmedValue + " " + token.termText();
>
> token = ts.next();
> count++;
> }
> } catch (Exception e) {
> stemmedValue = value;
> }
>
> if (stemmedValue.equals("")) {
> stemmedValue = value;
> }
>
> String[] stemmedValues = stemmedValue.split("\\s+");
>
> for (int j=0; j<stemmedValues.length; j++) {
> System.out.println("stemmedValues = " + stemmedValues[j]);
> }
> return stemmedValues;
> }
>
>
> private static void addSloppyPhrases(Query input, BooleanQuery output) {
> Clause[] clauses = input.getClauses();
> for (int f = 0; f < FIELDS.length; f++) {
>
> PhraseQuery sloppyPhrase = new PhraseQuery();
> sloppyPhrase.setBoost(FIELD_BOOSTS[f] * PHRASE_BOOST);
> sloppyPhrase.setSlop("anchor".equals(FIELDS[f])
> ? NutchDocumentAnalyzer.INTER_ANCHOR_GAP
> : SLOP);
> int sloppyTerms = 0;
>
> for (int i = 0; i < clauses.length; i++) {
> Clause c = clauses[i];
>
> if (!c.getField().equals(Clause.DEFAULT_FIELD))
> continue; // skip non-default fields
>
> if (c.isPhrase()) // skip exact phrases
> continue;
>
> if (c.isProhibited()) // skip prohibited terms
> continue;
>
> sloppyPhrase.add(luceneTerm(FIELDS[f], c.getTerm()));
> sloppyTerms++;
> }
>
> if (sloppyTerms > 1)
> output.add(sloppyPhrase, false, false);
> }
> }
>
>
> private static org.apache.lucene.search.Query
> termQuery(String field, Term term, float boost) {
> TermQuery result = new TermQuery(luceneTerm(field, term));
> result.setBoost(boost);
> return result;
> }
>
> /** Utility to construct a Lucene exact phrase query for a Nutch phrase.
> */
> private static org.apache.lucene.search.Query
> exactPhrase(Phrase nutchPhrase,
> String field, float boost) {
> Term[] terms = nutchPhrase.getTerms();
> PhraseQuery exactPhrase = new PhraseQuery();
> for (int i = 0; i < terms.length; i++) {
> exactPhrase.add(luceneTerm(field, terms[i]));
> }
> exactPhrase.setBoost(boost);
> return exactPhrase;
> }
>
> /** Utility to construct a Lucene Term given a Nutch query term and field.
> */
> private static org.apache.lucene.index.Term luceneTerm(String field,
> Term term) {
> return new org.apache.lucene.index.Term(field, term.toString());
> }
> }
>
>
>
>
>
-------------------------------------------------------------------------
Take Surveys. Earn Cash. Influence the Future of IT
Join SourceForge.net's Techsay panel and you'll get the chance to share your
opinions on IT & business topics through brief surveys -- and earn cash
http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV
_______________________________________________
Nutch-general mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/nutch-general