[Nutch-cvs] [Nutch Wiki] Update of "Stemming" by MatthewHolt

Apache Wiki Fri, 28 Jul 2006 07:56:15 -0700

Dear Wiki user,

You have subscribed to a wiki page or wiki category on "Nutch Wiki" for change 
notification.


The following page has been changed by MatthewHolt:
http://wiki.apache.org/nutch/Stemming

New page:
[[TableOfContents]]

The following steps implement stemming in Nutch. Howie Wang is credited with 
working out the process for version 0.7.2.
I updated the process for version 0.8; the updated steps can be found below. - Matthew Holt

"I've gotten a couple of questions offlist about stemming
so I thought I'd just post here with my changes. Sorry that
some of the changes are in the main code and not in a plugin. It
seemed that it's more efficient to put in the main analyzer. It
would be nice if later releases could add support for plugging
in a custom stemmer/analyzer."

= Version 0.7.2 =
The first change I made is in NutchDocumentAnalyzer.java.

Import the following classes at the top of the file:
{{{
import org.apache.lucene.analysis.LowerCaseTokenizer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.PorterStemFilter;
}}}

Change tokenStream to:
{{{
/**
 * Builds the token stream for a document field. Every field goes through
 * the Nutch tokenizer and the CommonGrams filter; the "content" and "title"
 * fields are additionally lower-cased and Porter-stemmed.
 */
public TokenStream tokenStream(String field, Reader reader) {
   TokenStream tokens =
      CommonGrams.getFilter(new NutchDocumentTokenizer(reader), field);

   boolean stemmedField = field.equals("content") || field.equals("title");
   if (!stemmedField) {
      return tokens;
   }

   // Lower-case first, then stem the lower-cased tokens.
   return new PorterStemFilter(new LowerCaseFilter(tokens));
}
}}}

The second change is in CommonGrams.java.
Import the following classes near the top:

{{{
import org.apache.lucene.analysis.LowerCaseTokenizer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.PorterStemFilter;
}}}

In optimizePhrase, after this line:
{{{
   TokenStream ts = getFilter(new ArrayTokens(phrase), field);
}}}
Add:
{{{
   // Lower-case the phrase tokens, then apply the Porter stemmer to them.
   ts = new PorterStemFilter(new LowerCaseFilter(ts));
}}}
And the rest is a new QueryFilter plugin that I'm calling query-stemmer.
Here's the full source for the Java file. You can copy the build.xml
and plugin.xml from query-basic, and alter the names for query-stemmer.

{{{
/* Copyright (c) 2003 The Nutch Organization.  All rights reserved.   */
/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */

package org.apache.nutch.searcher.stemmer;

import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.LowerCaseTokenizer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.PorterStemFilter;

import org.apache.nutch.analysis.NutchDocumentAnalyzer;
import org.apache.nutch.analysis.CommonGrams;

import org.apache.nutch.searcher.QueryFilter;
import org.apache.nutch.searcher.Query;
import org.apache.nutch.searcher.Query.*;

import java.io.IOException;
import java.util.HashSet;
import java.io.StringReader;

/**
 * Query filter that stems terms in the default query field and expands them
 * to search the url, anchor, content and title document fields.
 */
public class StemmerQueryFilter implements QueryFilter {

  private static float URL_BOOST = 4.0f;
  private static float ANCHOR_BOOST = 2.0f;

  private static int SLOP = Integer.MAX_VALUE;
  private static float PHRASE_BOOST = 1.0f;

  private static final String[] FIELDS = {"url", "anchor", "content", "title"};

  /** Per-field boosts, parallel to FIELDS. Queries read these values. */
  private static final float[] FIELD_BOOSTS =
    {URL_BOOST, ANCHOR_BOOST, 1.0f, 2.0f};

  /** Set the boost factor for url matches, relative to content and anchor
   * matches */
  public static void setUrlBoost(float boost) {
    URL_BOOST = boost;
    // FIELD_BOOSTS was filled once at class initialisation, so the new
    // value must be written into the array for it to affect queries.
    FIELD_BOOSTS[0] = boost;
  }

  /** Set the boost factor for anchor matches, relative to url and
   * content matches. */
  public static void setAnchorBoost(float boost) {
    ANCHOR_BOOST = boost;
    FIELD_BOOSTS[1] = boost;  // see setUrlBoost
  }

  /** Set the boost factor for sloppy phrase matches relative to unordered
   * term matches. */
  public static void setPhraseBoost(float boost) { PHRASE_BOOST = boost; }

  /** Set the maximum number of terms permitted between matching terms in a
   * sloppy phrase match. */
  public static void setSlop(int slop) { SLOP = slop; }

  /**
   * Adds the stemmed per-field term clauses and the sloppy phrase clauses
   * for <code>input</code> to <code>output</code>, and returns
   * <code>output</code>.
   */
  public BooleanQuery filter(Query input, BooleanQuery output) {
    addTerms(input, output);
    addSloppyPhrases(input, output);
    return output;
  }

  /**
   * For each default-field clause of <code>input</code>, adds one boolean
   * sub-query to <code>output</code> that matches the clause in any of
   * FIELDS. Phrase clauses are passed through
   * CommonGrams.optimizePhrase; single terms are stemmed with
   * getStemmedWords.
   */
  private static void addTerms(Query input, BooleanQuery output) {
    Clause[] clauses = input.getClauses();
    for (int i = 0; i < clauses.length; i++) {
      Clause c = clauses[i];

      if (!c.getField().equals(Clause.DEFAULT_FIELD))
        continue;                                 // skip non-default fields

      BooleanQuery out = new BooleanQuery();
      for (int f = 0; f < FIELDS.length; f++) {

        String[] opt;

        // TODO: I'm a little nervous about stemming for all default fields.
        //       Should keep an eye on this.
        if (c.isPhrase()) {                       // optimize phrase clauses
          opt = CommonGrams.optimizePhrase(c.getPhrase(), FIELDS[f]);
        } else {
          opt = getStemmedWords(c.getTerm().toString());
        }

        // A single resulting word stays a term clause; anything else is
        // searched as an exact phrase.
        Clause o;
        if (opt.length == 1) {
          o = new Clause(new Term(opt[0]), c.isRequired(), c.isProhibited());
        } else {
          o = new Clause(new Phrase(opt), c.isRequired(), c.isProhibited());
        }

        out.add(o.isPhrase()
                ? exactPhrase(o.getPhrase(), FIELDS[f], FIELD_BOOSTS[f])
                : termQuery(FIELDS[f], o.getTerm(), FIELD_BOOSTS[f]),
                false, false);
      }
      output.add(out, c.isRequired(), c.isProhibited());
    }
  }

  /**
   * Runs <code>value</code> through a LowerCaseTokenizer followed by a
   * PorterStemFilter and returns the resulting token texts.
   *
   * Falls back to <code>value</code> itself (split on whitespace) when
   * analysis throws or yields no tokens, so the query term is never lost.
   */
  private static String[] getStemmedWords(String value) {
    StringReader sr = new StringReader(value);
    TokenStream ts = new PorterStemFilter(new LowerCaseTokenizer(sr));

    String stemmedValue = "";
    try {
      boolean first = true;
      for (Token token = ts.next(); token != null; token = ts.next()) {
        if (first) {
          stemmedValue = token.termText();
          first = false;
        } else {
          stemmedValue = stemmedValue + " " + token.termText();
        }
      }
    } catch (Exception e) {
      // Deliberate best effort: search for the unstemmed term instead of
      // failing the whole query.
      stemmedValue = value;
    }

    if (stemmedValue.equals("")) {
      stemmedValue = value;
    }

    return stemmedValue.split("\\s+");
  }

  /**
   * For each field, adds an optional sloppy phrase query made of all
   * non-phrase, non-prohibited default-field terms, provided there is more
   * than one such term. The anchor field uses
   * NutchDocumentAnalyzer.INTER_ANCHOR_GAP as slop; other fields use SLOP.
   */
  private static void addSloppyPhrases(Query input, BooleanQuery output) {
    Clause[] clauses = input.getClauses();
    for (int f = 0; f < FIELDS.length; f++) {

      PhraseQuery sloppyPhrase = new PhraseQuery();
      sloppyPhrase.setBoost(FIELD_BOOSTS[f] * PHRASE_BOOST);
      sloppyPhrase.setSlop("anchor".equals(FIELDS[f])
                           ? NutchDocumentAnalyzer.INTER_ANCHOR_GAP
                           : SLOP);
      int sloppyTerms = 0;

      for (int i = 0; i < clauses.length; i++) {
        Clause c = clauses[i];

        if (!c.getField().equals(Clause.DEFAULT_FIELD))
          continue;                               // skip non-default fields

        if (c.isPhrase())                         // skip exact phrases
          continue;

        if (c.isProhibited())                     // skip prohibited terms
          continue;

        // NOTE(review): the term is added unstemmed here, unlike addTerms.
        // Presumably it will not match fields that are stemmed at index
        // time; confirm whether this is intended.
        sloppyPhrase.add(luceneTerm(FIELDS[f], c.getTerm()));
        sloppyTerms++;
      }

      if (sloppyTerms > 1)
        output.add(sloppyPhrase, false, false);
    }
  }

  /** Utility to construct a boosted Lucene term query for a Nutch term. */
  private static org.apache.lucene.search.Query
        termQuery(String field, Term term, float boost) {
    TermQuery result = new TermQuery(luceneTerm(field, term));
    result.setBoost(boost);
    return result;
  }

  /** Utility to construct a Lucene exact phrase query for a Nutch phrase. */
  private static org.apache.lucene.search.Query
       exactPhrase(Phrase nutchPhrase,
                   String field, float boost) {
    Term[] terms = nutchPhrase.getTerms();
    PhraseQuery exactPhrase = new PhraseQuery();
    for (int i = 0; i < terms.length; i++) {
      exactPhrase.add(luceneTerm(field, terms[i]));
    }
    exactPhrase.setBoost(boost);
    return exactPhrase;
  }

  /** Utility to construct a Lucene Term given a Nutch query term and field. */
  private static org.apache.lucene.index.Term luceneTerm(String field,
                                                         Term term) {
    return new org.apache.lucene.index.Term(field, term.toString());
  }
}
}}}

= Version 0.8 =
The first change I made is in NutchDocumentAnalyzer.java.

Import the following classes at the top of the file:
{{{
import org.apache.lucene.analysis.LowerCaseTokenizer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.PorterStemFilter;
}}}

Change tokenStream at the bottom of the file to:
{{{
/**
 * Builds the token stream for a document field. The "anchor" field is
 * analyzed with ANCHOR_ANALYZER and every other field with
 * CONTENT_ANALYZER; the "content" and "title" fields are additionally
 * lower-cased and Porter-stemmed.
 */
public TokenStream tokenStream(String field, Reader reader) {
    Analyzer analyzer;
    if ("anchor".equals(field)) {
        analyzer = ANCHOR_ANALYZER;
    }
    else {
        analyzer = CONTENT_ANALYZER;
    }

    // Build the stream for every field; previously the anchor branch fell
    // off the end of the method without returning anything.
    TokenStream ts = analyzer.tokenStream(field, reader);
    if ("content".equals(field) || "title".equals(field)) {
        ts = new LowerCaseFilter(ts);
        return new PorterStemFilter(ts);
    }
    return ts;
}
}}}

The second change is in CommonGrams.java.
Import the following classes near the top:

{{{
import org.apache.lucene.analysis.LowerCaseTokenizer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.PorterStemFilter;
}}}

In optimizePhrase, after this line:
{{{
   TokenStream ts = getFilter(new ArrayTokens(phrase), field);
}}}
Add:
{{{
   // Lower-case the phrase tokens, then apply the Porter stemmer to them.
   ts = new PorterStemFilter(new LowerCaseFilter(ts));
}}}

And the rest is a new QueryFilter plugin that I'm calling query-stemmer.
Here's the full source for the Java file. You can copy the build.xml
and plugin.xml from query-basic, and alter the names for query-stemmer.

{{{
/* Copyright (c) 2003 The Nutch Organization.  All rights reserved.   */
/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */

package org.apache.nutch.searcher.stemmer;

import org.apache.hadoop.conf.Configuration;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.LowerCaseTokenizer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.PorterStemFilter;

import org.apache.nutch.analysis.NutchDocumentAnalyzer;
import org.apache.nutch.analysis.CommonGrams;

import org.apache.nutch.searcher.QueryFilter;
import org.apache.nutch.searcher.Query;
import org.apache.nutch.searcher.Query.*;

import java.io.IOException;
import java.util.HashSet;
import java.io.StringReader;

/**
 * The default query filter. Query terms in the default query field are expanded
 * to search the url, anchor and content document fields.
 */
public class StemmerQueryFilter implements QueryFilter {
        private static int SLOP = Integer.MAX_VALUE;

        private float PHRASE_BOOST = 1.0f;

        private static final String[] FIELDS = { "url", "anchor", "content",
                        "title" , "host" };

        private final float[] FIELD_BOOSTS = { 4.0f, 2.0f, 1.0f, 1.5f, 2.0f };

        private Configuration conf;

        /**
         * Set the boost factor for url matches, relative to content and anchor
         * matches
         */
        public void setUrlBoost(float boost) {
                FIELD_BOOSTS[0] = boost;
        }

        /**
         * Set the boost factor for title/anchor matches, relative to url and
         * content matches.
         */
        public void setAnchorBoost(float boost) {
                FIELD_BOOSTS[1] = boost;
        }

        /**
         * Set the boost factor for sloppy phrase matches relative to unordered 
term
         * matches.
         */
        public void setPhraseBoost(float boost) {
                PHRASE_BOOST = boost;
        }

        /**
         * Set the maximum number of terms permitted between matching terms in a
         * sloppy phrase match.
         */
        public static void setSlop(int slop) {
                SLOP = slop;
        }

        public BooleanQuery filter(Query input, BooleanQuery output) {
                addTerms(input, output);
                addSloppyPhrases(input, output);
                return output;
        }

        private void addTerms(Query input, BooleanQuery output) {
                Clause[] clauses = input.getClauses();
                for (int i = 0; i < clauses.length; i++) {
                        Clause c = clauses[i];

                        if (!c.getField().equals(Clause.DEFAULT_FIELD))
                                continue; // skip non-default fields

                        BooleanQuery out = new BooleanQuery();
                        for (int f = 0; f < FIELDS.length; f++) {

                                Clause o = c;
                                String[] opt;

                                // TODO: I'm a little nervous about stemming 
for all default
                                // fields.
                                // Should keep an eye on this.
                                if (c.isPhrase()) { // optimize phrase clauses
                                        opt = new 
CommonGrams(getConf()).optimizePhrase(c
                                                        .getPhrase(), 
FIELDS[f]);
                                } else {
                                        System.out.println("o.getTerm = " + 
o.getTerm().toString());
                                        opt = 
getStemmedWords(o.getTerm().toString());
                                }
                                if (opt.length == 1) {
                                        o = new Clause(new Term(opt[0]), 
c.isRequired(), c
                                                        .isProhibited(), 
getConf());
                                } else {
                                        o = new Clause(new Phrase(opt), 
c.isRequired(), c
                                                        .isProhibited(), 
getConf());
                                }

                                out.add(o.isPhrase() ? 
exactPhrase(o.getPhrase(), FIELDS[f],
                                                FIELD_BOOSTS[f]) : 
termQuery(FIELDS[f], o.getTerm(),
                                                FIELD_BOOSTS[f]), 
BooleanClause.Occur.SHOULD);
                        }
                        output.add(out, (c.isProhibited() ? 
BooleanClause.Occur.MUST_NOT
                                        : (c.isRequired() ? 
BooleanClause.Occur.MUST
                                                        : 
BooleanClause.Occur.SHOULD)));
                }
                System.out.println("query = " + output.toString());
        }

        private static String[] getStemmedWords(String value) {
                StringReader sr = new StringReader(value);
                TokenStream ts = new PorterStemFilter(new 
LowerCaseTokenizer(sr));

                String stemmedValue = "";
                try {
                        Token token = ts.next();
                        int count = 0;
                        while (token != null) {
                                System.out.println("token = " + 
token.termText());
                                System.out.println("type = " + token.type());

                                if (count == 0)
                                        stemmedValue = token.termText();
                                else
                                        stemmedValue = stemmedValue + " " + 
token.termText();

                                token = ts.next();
                                count++;
                        }
                } catch (Exception e) {
                        stemmedValue = value;
                }

                if (stemmedValue.equals("")) {
                        stemmedValue = value;
                }

                String[] stemmedValues = stemmedValue.split("\\s+");

                for (int j = 0; j < stemmedValues.length; j++) {
                        System.out.println("stemmedValues = " + 
stemmedValues[j]);
                }
                return stemmedValues;
        }

        private void addSloppyPhrases(Query input, BooleanQuery output) {
                Clause[] clauses = input.getClauses();
                for (int f = 0; f < FIELDS.length; f++) {

                        PhraseQuery sloppyPhrase = new PhraseQuery();
                        sloppyPhrase.setBoost(FIELD_BOOSTS[f] * PHRASE_BOOST);
                        sloppyPhrase
                                        .setSlop("anchor".equals(FIELDS[f]) ? 
NutchDocumentAnalyzer.INTER_ANCHOR_GAP
                                                        : SLOP);
                        int sloppyTerms = 0;

                        for (int i = 0; i < clauses.length; i++) {
                                Clause c = clauses[i];

                                if (!c.getField().equals(Clause.DEFAULT_FIELD))
                                        continue; // skip non-default fields

                                if (c.isPhrase()) // skip exact phrases
                                        continue;

                                if (c.isProhibited()) // skip prohibited terms
                                        continue;

                                sloppyPhrase.add(luceneTerm(FIELDS[f], 
c.getTerm()));
                                sloppyTerms++;
                        }

                        if (sloppyTerms > 1)
                                output.add(sloppyPhrase, 
BooleanClause.Occur.SHOULD);
                }
        }

        private static org.apache.lucene.search.Query termQuery(String field,
                        Term term, float boost) {
                TermQuery result = new TermQuery(luceneTerm(field, term));
                result.setBoost(boost);
                return result;
        }

        /**
         * Utility to construct a Lucene exact phrase query for a Nutch phrase.
         */
        private static org.apache.lucene.search.Query exactPhrase(
                        Phrase nutchPhrase, String field, float boost) {
                Term[] terms = nutchPhrase.getTerms();
                PhraseQuery exactPhrase = new PhraseQuery();
                for (int i = 0; i < terms.length; i++) {
                        exactPhrase.add(luceneTerm(field, terms[i]));
                }
                exactPhrase.setBoost(boost);
                return exactPhrase;
        }

        /**
         * Utility to construct a Lucene Term given a Nutch query term and 
field.
         */
        private static org.apache.lucene.index.Term luceneTerm(String field,
                        Term term) {
                return new org.apache.lucene.index.Term(field, term.toString());
        }

        public void setConf(Configuration conf) {
                this.conf = conf;
                this.FIELD_BOOSTS[0] = conf.getFloat("query.url.boost", 4.0f);
                this.FIELD_BOOSTS[1] = conf.getFloat("query.anchor.boost", 
2.0f);
                this.FIELD_BOOSTS[2] = conf.getFloat("query.content.boost", 
1.0f);
                this.FIELD_BOOSTS[3] = conf.getFloat("query.title.boost", 1.5f);
                this.FIELD_BOOSTS[4] = conf.getFloat("query.host.boost", 2.0f);
                this.PHRASE_BOOST = conf.getFloat("query.phrase.boost", 1.0f);
        }

        public Configuration getConf() {
                return this.conf;
        }
}
}}}

-------------------------------------------------------------------------
Take Surveys. Earn Cash. Influence the Future of IT
Join SourceForge.net's Techsay panel and you'll get the chance to share your
opinions on IT & business topics through brief surveys -- and earn cash
http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV
_______________________________________________
Nutch-cvs mailing list
Nutch-cvs@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/nutch-cvs

[Nutch-cvs] [Nutch Wiki] Update of "Stemming" by MatthewHolt

Reply via email to