fulltext FullTextIndexer.java PorterStemmer.java WordStemmer.java

bradford 21 Apr 2002 19:34:02 -0000

bradford    02/04/21 12:34:02


  Added:       java/src/org/apache/xindice/core/fulltext
                        FullTextIndexer.java PorterStemmer.java
                        WordStemmer.java
  Log:
  Adding full-text indexing code
  
  Revision  Changes    Path
  1.1                  
xml-xindice/java/src/org/apache/xindice/core/fulltext/FullTextIndexer.java
  
  Index: FullTextIndexer.java
  ===================================================================
  package org.apache.xindice.core.fulltext;
  
  import org.apache.xindice.core.*;
  import org.apache.xindice.core.Collection;
  import org.apache.xindice.core.data.*;
  import org.apache.xindice.core.filer.*;
  import org.apache.xindice.core.indexer.*;
  import org.apache.xindice.core.query.*;
  import org.apache.xindice.util.*;
  import org.apache.xindice.xml.*;
  import org.apache.xindice.xml.dom.*;
  import org.apache.xindice.server.*;
  
  import java.io.*;
  import java.util.*;
  
  import org.w3c.dom.*;
  
  /**
   * FullTextIndexer is a full text search implementation of the
   * Indexer interface.
   */
  
  public final class FullTextIndexer extends BTree implements Indexer {
     private static final IndexMatch[] EmptyMatches = new IndexMatch[0];
     private static final Value EmptyValue = new Value(new byte[0]);
  
     private static final String DEFAULT_STEMMER = 
"org.apache.xindice.fulltext.PorterStemmer";
     private static final String DEFAULT_STOPWORDS = "stopwords.txt";
  
     private static final byte MATCHES = 20;
     private static final long MATCH_INFO = -1000;
  
     private static final String NAME = "name";
     private static final String PATTERN = "pattern";
     private static final String PAGESIZE = "pagesize";
     private static final String MAXKEYSIZE = "maxkeysize";
     private static final String STEMMER = "stemmer";
     private static final String STOPWORDS = "stopwords";
     private static final String ROLLCASE = "rollcase";
  
     private Configuration config;
     private Collection collection;
     private SymbolTable symbols;
  
     private String name;
     private String pattern;
     private WordStemmer stemmer;
     private Set stopWords;
     private boolean rollCase = true;
     private boolean wildcard = false;
  
     private FileHeader fileHeader;
  
     public FullTextIndexer() {
        super();
        fileHeader = getFileHeader();
        fileHeader.setPageSize(1024);
     }
  
     public void setConfig(Configuration config) {
        this.config = config;
        try {
           name = config.getAttribute(NAME);
  
           pattern = config.getAttribute(PATTERN);
           wildcard = pattern.indexOf('*') != -1;
  
           rollCase = config.getBooleanAttribute(ROLLCASE, rollCase);
  
           String stemClass = config.getAttribute(STEMMER, DEFAULT_STEMMER);
           if ( stemClass != null && stemClass.length() > 0 ) {
              try {
                 stemmer = (WordStemmer)Class.forName(stemClass).newInstance();
              }
              catch ( Exception e ) {
                 org.apache.xindice.Debug.println("Stemmer '"+stemClass+"' Not 
Found");
              }
           }
  
           String stopName = config.getAttribute(STOPWORDS, DEFAULT_STOPWORDS);
           if ( stopName != null && stopName.length() > 0 ) {
              try {
                 stopWords = new HashSet();
                 File stopFile = new File(stopName);
                 FileInputStream fis = new FileInputStream(stopFile);
                 BufferedInputStream bis = new BufferedInputStream(fis, 4096);
                 DataInputStream dis = new DataInputStream(bis);
                 while ( dis.available() > 0 ) {
                    String word = dis.readLine();
                    if ( word != null && word.trim().length() > 0 )
                       stopWords.add(word);
                 }
                 fis.close();
              }
              catch ( Exception e ) {
                 org.apache.xindice.Debug.println("Stop Word list 
'"+stopName+"' Not Found");
              }
           }
  
           fileHeader.setPageSize(config.getIntAttribute(PAGESIZE, 
fileHeader.getPageSize()));
           fileHeader.setMaxKeySize(config.getShortAttribute(MAXKEYSIZE, 
fileHeader.getMaxKeySize()));
  
           setLocation(name);
        }
        catch ( Exception e ) {
           org.apache.xindice.Debug.printStackTrace(e);
        }
     }
  
     public Configuration getConfig() {
        return config;
     }
  
     public String getName() {
        return name;
     }
  
     public void setLocation(String location) {
        setFile(new File(collection.getCollectionRoot(), location+".idx"));
     }
  
     public void setCollection(Collection collection) {
        try {
           this.collection = collection;
           symbols = collection.getSymbols();
        }
        catch ( Exception e ) {
           org.apache.xindice.Debug.printStackTrace(e);
        }
     }
  
     public String getIndexStyle() {
        return STYLE_FULLTEXT;
     }
  
     public String getPattern() {
        return pattern;
     }
  
     private Value getCombinedValue(Key key, int pos, int len, short elemID, 
short attrID) {
        Value result;
        try {
           int l = key.getLength();
           byte[] b = new byte[l+13];
  
           // Write the key
           System.arraycopy(key.getData(), 0, b, 0, l);
           b[l] = 0;
  
           // Write the pos
           b[l+1] = (byte)((pos >>> 24) & 0xFF);
           b[l+2] = (byte)((pos >>> 16) & 0xFF);
           b[l+3] = (byte)((pos >>> 8)  & 0xFF);
           b[l+4] = (byte)((pos >>> 0)  & 0xFF);
  
           // Write the len
           b[l+5] = (byte)((len >>> 24) & 0xFF);
           b[l+6] = (byte)((len >>> 16) & 0xFF);
           b[l+7] = (byte)((len >>> 8)  & 0xFF);
           b[l+8] = (byte)((len >>> 0)  & 0xFF);
  
           // Write the elemID
           b[l+9] = (byte)((elemID >>> 8)  & 0xFF);
           b[l+10] = (byte)((elemID >>> 0) & 0xFF);
  
           // Write the attrID
           b[l+11] = (byte)((attrID >>> 8) & 0xFF);
           b[l+12] = (byte)((attrID >>> 0) & 0xFF);
  
           result = new Value(b);
        }
        catch ( Exception e ) {
           result = null; // This will never happen
        }
        return result;
     }
  
     private IndexMatch getIndexMatch(Value v) {
        byte[] b = v.getData();
        int l = b.length-13;
        Key key = new Key(b, 0, b.length-13);
  
        int pos = ((b[l+1] << 24) | (b[l+2] << 16) | (b[l+3] << 8) | b[l+4]);
        int len = ((b[l+5] << 24) | (b[l+6] << 16) | (b[l+7] << 8) | b[l+8]);
        short elemID = (short)((b[l+9] << 8) | b[l+10]);
        short attrID = (short)((b[l+11] << 8) | b[l+12]);
  
        return new IndexMatch(key, pos, len, elemID, attrID);
     }
  
     public synchronized void remove(String value, Key key, int pos, int len, 
short elemID, short attrID) throws DBException {
        StringTokenizer st = new StringTokenizer(value);
        while ( st.hasMoreTokens() ) {
           String s = st.nextToken();
  
           if ( stemmer != null )
              s = stemmer.normalizeCase(s);
  
           if ( stopWords != null && stopWords.contains(s) )
                 continue;
  
           if ( stemmer != null )
              s = stemmer.stemWord(s);
  
           Value v = new Value(s);
           try {
              BTreeRootInfo root = findBTreeRoot(v);
              Value cv = getCombinedValue(key, pos, len, elemID, attrID);
              removeValue(root, cv);
           }
           catch ( DBException d ) {
              throw d;
           }
           catch ( Exception e ) {
              org.apache.xindice.Debug.printStackTrace(e);
           }
        }
     }
  
     public synchronized void add(String value, Key key, int pos, int len, 
short elemID, short attrID) throws DBException {
        StringTokenizer st = new StringTokenizer(value);
        while ( st.hasMoreTokens() ) {
           String s = st.nextToken();
  
           if ( stemmer != null )
              s = stemmer.normalizeCase(s);
  
           if ( stopWords != null && stopWords.contains(s) )
                 continue;
  
           if ( stemmer != null )
              s = stemmer.stemWord(s);
  
           Value v = new Value(s);
           try {
              BTreeRootInfo root;
  
              try {
                 root = findBTreeRoot(v);
              }
              catch ( BTreeNotFoundException e ) {
                 root = createBTreeRoot(v);
              }
  
              Value cv = getCombinedValue(key, pos, len, elemID, attrID);
              addValue(root, cv, MATCH_INFO);
           }
           catch ( DBException d ) {
              throw d;
           }
           catch ( IOException e ) {
              throw new BTreeCorruptException("Corruption detected on add");
           }
           catch ( Exception e ) {
              org.apache.xindice.Debug.printStackTrace(e);
           }
        }
     }
  
     public synchronized void flush() throws DBException {
        super.flush();
     }
  
     public synchronized IndexMatch[] queryMatches(final IndexQuery query) 
throws DBException {
        // Pre-process the value-set for stop words and stemming
        Value[] vals = query.getValues();
        for ( int i = 0; i < vals.length; i++ ) {
           String s = vals[i].toString();
  
           if ( stemmer != null )
              s = stemmer.normalizeCase(s);
  
           if ( stopWords != null && stopWords.contains(s) )
                 continue;
  
           if ( stemmer != null )
              s = stemmer.stemWord(s);
  
           vals[i] = new Value(s);
        }
  
        // Now issue the query
        final List results = new ArrayList();
  
        try {
           query(query, new BTreeCallback() {
              public boolean indexInfo(Value value, long pos) {
                 try {
                    if ( pos == MATCH_INFO ) {
                       IndexMatch match = getIndexMatch(value);
                       if ( !wildcard )
                          results.add(match);
                       else {
                          IndexPattern pt = new IndexPattern(symbols, 
match.getElement(), match.getAttribute());
                          if ( pt.getMatchLevel(query.getPattern()) > 0 )
                             results.add(match);
                       }
                    }
                    else {
                       BTreeRootInfo root = new BTreeRootInfo(value, pos);
                       query(root, null, this);
                    }
                    return true;
                 }
                 catch ( Exception e ) {
                    org.apache.xindice.Debug.printStackTrace(e);
                 }
                 return true;
              }
           });
        }
        catch ( IOException e ) {
           throw new BTreeCorruptException("Corruption detected on query");
        }
        catch ( Exception e ) {
           org.apache.xindice.Debug.printStackTrace(e);
        }
  
        return (IndexMatch[])results.toArray(EmptyMatches);
     }
  }
  
  
  
  
  
  
  1.1                  
xml-xindice/java/src/org/apache/xindice/core/fulltext/PorterStemmer.java
  
  Index: PorterStemmer.java
  ===================================================================
  package org.apache.xindice.core.fulltext;
  
  /*
  
     Porter stemmer in Java. The original paper is in
  
         Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
         no. 3, pp 130-137,
  
     See also <a 
href="http://www.muscat.com/~martin/stem.html";>http://www.muscat.com/~martin/stem.html</a>
  
     History:
  
     Release 1
  
     Bug 1 (reported by Gonzalo Parra 16/10/99) fixed as marked below.
     The words 'aed', 'eed', 'oed' leave k at 'a' for step 3, and b[k-1]
     is then out outside the bounds of b.
  
     Release 2
  
     Similarly,
  
     Bug 2 (reported by Steve Dyrdahl 22/2/00) fixed as marked below.
     'ion' by itself leaves j = -1 in the test for 'ion' in step 5, and
     b[j] is then outside the bounds of b.
  
     Release 3
  
     Considerably revised 4/9/00 in the light of many helpful suggestions
     from Brian Goetz of Quiotix Corporation ([EMAIL PROTECTED]). The earlier
     version of the program can still be got from [EMAIL PROTECTED] should
     anyone require it.
  
     Release 4
  
  */
  
  import java.io.*;
  
  public class PorterStemmer implements WordStemmer {
     public String normalizeCase(String word) {
        return word.toLowerCase();
     }
     
     public String stemWord(String word) {
        Stemmer s = new Stemmer();
        s.add(word.toCharArray(), word.length());
        s.stem();
        return s.toString();
     }
  
     
     /**
       * Stemmer, implementing the Porter Stemming Algorithm
       *
       * The Stemmer class transforms a word into its root form.  The input
       * word can be provided a character at time (by calling add()), or at once
       * by calling one of the various stem(something) methods.
       */
     
     private class Stemmer
     {  private char[] b;
        private int i,     /* offset into b */
                    i_end, /* offset to end of stemmed word */
                    j, k;
        private static final int INC = 50;
                          /* unit of size whereby b is increased */
        public Stemmer()
        {  b = new char[INC];
           i = 0;
           i_end = 0;
        }
     
        /**
         * Add a character to the word being stemmed.  When you are finished
         * adding characters, you can call stem(void) to stem the word.
         */
     
        public void add(char ch)
        {  if (i == b.length)
           {  char[] new_b = new char[i+INC];
              for (int c = 0; c < i; c++) new_b[c] = b[c];
              b = new_b;
           }
           b[i++] = ch;
        }
     
     
        /** Adds wLen characters to the word being stemmed contained in a 
portion
         * of a char[] array. This is like repeated calls of add(char ch), but
         * faster.
         */
     
        public void add(char[] w, int wLen)
        {  if (i+wLen >= b.length)
           {  char[] new_b = new char[i+wLen+INC];
              for (int c = 0; c < i; c++) new_b[c] = b[c];
              b = new_b;
           }
           for (int c = 0; c < wLen; c++) b[i++] = w[c];
        }
     
        /**
         * After a word has been stemmed, it can be retrieved by toString(),
         * or a reference to the internal buffer can be retrieved by 
getResultBuffer
         * and getResultLength (which is generally more efficient.)
         */
        public String toString() { return new String(b,0,i_end); }
     
        /**
         * Returns the length of the word resulting from the stemming process.
         */
        public int getResultLength() { return i_end; }
     
        /**
         * Returns a reference to a character buffer containing the results of
         * the stemming process.  You also need to consult getResultLength()
         * to determine the length of the result.
         */
        public char[] getResultBuffer() { return b; }
     
        /* cons(i) is true <=> b[i] is a consonant. */
     
        private final boolean cons(int i)
        {  switch (b[i])
           {  case 'a': case 'e': case 'i': case 'o': case 'u': return false;
              case 'y': return (i==0) ? true : !cons(i-1);
              default: return true;
           }
        }
     
        /* m() measures the number of consonant sequences between 0 and j. if c 
is
           a consonant sequence and v a vowel sequence, and <..> indicates 
arbitrary
           presence,
     
              <c><v>       gives 0
              <c>vc<v>     gives 1
              <c>vcvc<v>   gives 2
              <c>vcvcvc<v> gives 3
              ....
        */
     
        private final int m()
        {  int n = 0;
           int i = 0;
           while(true)
           {  if (i > j) return n;
              if (! cons(i)) break; i++;
           }
           i++;
           while(true)
           {  while(true)
              {  if (i > j) return n;
                    if (cons(i)) break;
                    i++;
              }
              i++;
              n++;
              while(true)
              {  if (i > j) return n;
                 if (! cons(i)) break;
                 i++;
              }
              i++;
            }
        }
     
        /* vowelinstem() is true <=> 0,...j contains a vowel */
     
        private final boolean vowelinstem()
        {  int i; for (i = 0; i <= j; i++) if (! cons(i)) return true;
           return false;
        }
     
        /* doublec(j) is true <=> j,(j-1) contain a double consonant. */
     
        private final boolean doublec(int j)
        {  if (j < 1) return false;
           if (b[j] != b[j-1]) return false;
           return cons(j);
        }
     
        /* cvc(i) is true <=> i-2,i-1,i has the form consonant - vowel - 
consonant
           and also if the second c is not w,x or y. this is used when trying to
           restore an e at the end of a short word. e.g.
     
              cav(e), lov(e), hop(e), crim(e), but
              snow, box, tray.
     
        */
     
        private final boolean cvc(int i)
        {  if (i < 2 || !cons(i) || cons(i-1) || !cons(i-2)) return false;
           {  int ch = b[i];
              if (ch == 'w' || ch == 'x' || ch == 'y') return false;
           }
           return true;
        }
     
        private final boolean ends(String s)
        {  int l = s.length();
           int o = k-l+1;
           if (o < 0) return false;
           for (int i = 0; i < l; i++) if (b[o+i] != s.charAt(i)) return false;
           j = k-l;
           return true;
        }
     
        /* setto(s) sets (j+1),...k to the characters in the string s, 
readjusting
           k. */
     
        private final void setto(String s)
        {  int l = s.length();
           int o = j+1;
           for (int i = 0; i < l; i++) b[o+i] = s.charAt(i);
           k = j+l;
        }
     
        /* r(s) is used further down. */
     
        private final void r(String s) { if (m() > 0) setto(s); }
     
        /* step1() gets rid of plurals and -ed or -ing. e.g.
     
               caresses  ->  caress
               ponies    ->  poni
               ties      ->  ti
               caress    ->  caress
               cats      ->  cat
     
               feed      ->  feed
               agreed    ->  agree
               disabled  ->  disable
     
               matting   ->  mat
               mating    ->  mate
               meeting   ->  meet
               milling   ->  mill
               messing   ->  mess
     
               meetings  ->  meet
     
        */
     
        private final void step1()
        {  if (b[k] == 's')
           {  if (ends("sses")) k -= 2; else
              if (ends("ies")) setto("i"); else
              if (b[k-1] != 's') k--;
           }
           if (ends("eed")) { if (m() > 0) k--; } else
           if ((ends("ed") || ends("ing")) && vowelinstem())
           {  k = j;
              if (ends("at")) setto("ate"); else
              if (ends("bl")) setto("ble"); else
              if (ends("iz")) setto("ize"); else
              if (doublec(k))
              {  k--;
                 {  int ch = b[k];
                    if (ch == 'l' || ch == 's' || ch == 'z') k++;
                 }
              }
              else if (m() == 1 && cvc(k)) setto("e");
          }
        }
     
        /* step2() turns terminal y to i when there is another vowel in the 
stem. */
     
        private final void step2() { if (ends("y") && vowelinstem()) b[k] = 
'i'; }
     
        /* step3() maps double suffices to single ones. so -ization ( = -ize 
plus
           -ation) maps to -ize etc. note that the string before the suffix 
must give
           m() > 0. */
     
        private final void step3() { if (k == 0) return; /* For Bug 1 */ switch 
(b[k-1])
        {
            case 'a': if (ends("ational")) { r("ate"); break; }
                      if (ends("tional")) { r("tion"); break; }
                      break;
            case 'c': if (ends("enci")) { r("ence"); break; }
                      if (ends("anci")) { r("ance"); break; }
                      break;
            case 'e': if (ends("izer")) { r("ize"); break; }
                      break;
            case 'l': if (ends("bli")) { r("ble"); break; }
                      if (ends("alli")) { r("al"); break; }
                      if (ends("entli")) { r("ent"); break; }
                      if (ends("eli")) { r("e"); break; }
                      if (ends("ousli")) { r("ous"); break; }
                      break;
            case 'o': if (ends("ization")) { r("ize"); break; }
                      if (ends("ation")) { r("ate"); break; }
                      if (ends("ator")) { r("ate"); break; }
                      break;
            case 's': if (ends("alism")) { r("al"); break; }
                      if (ends("iveness")) { r("ive"); break; }
                      if (ends("fulness")) { r("ful"); break; }
                      if (ends("ousness")) { r("ous"); break; }
                      break;
            case 't': if (ends("aliti")) { r("al"); break; }
                      if (ends("iviti")) { r("ive"); break; }
                      if (ends("biliti")) { r("ble"); break; }
                      break;
            case 'g': if (ends("logi")) { r("log"); break; }
        } }
     
        /* step4() deals with -ic-, -full, -ness etc. similar strategy to 
step3. */
     
        private final void step4() { switch (b[k])
        {
            case 'e': if (ends("icate")) { r("ic"); break; }
                      if (ends("ative")) { r(""); break; }
                      if (ends("alize")) { r("al"); break; }
                      break;
            case 'i': if (ends("iciti")) { r("ic"); break; }
                      break;
            case 'l': if (ends("ical")) { r("ic"); break; }
                      if (ends("ful")) { r(""); break; }
                      break;
            case 's': if (ends("ness")) { r(""); break; }
                      break;
        } }
     
        /* step5() takes off -ant, -ence etc., in context <c>vcvc<v>. */
     
        private final void step5()
        {   if (k == 0) return; /* for Bug 1 */ switch (b[k-1])
            {  case 'a': if (ends("al")) break; return;
               case 'c': if (ends("ance")) break;
                         if (ends("ence")) break; return;
               case 'e': if (ends("er")) break; return;
               case 'i': if (ends("ic")) break; return;
               case 'l': if (ends("able")) break;
                         if (ends("ible")) break; return;
               case 'n': if (ends("ant")) break;
                         if (ends("ement")) break;
                         if (ends("ment")) break;
                         /* element etc. not stripped before the m */
                         if (ends("ent")) break; return;
               case 'o': if (ends("ion") && j >= 0 && (b[j] == 's' || b[j] == 
't')) break;
                                         /* j >= 0 fixes Bug 2 */
                         if (ends("ou")) break; return;
                         /* takes care of -ous */
               case 's': if (ends("ism")) break; return;
               case 't': if (ends("ate")) break;
                         if (ends("iti")) break; return;
               case 'u': if (ends("ous")) break; return;
               case 'v': if (ends("ive")) break; return;
               case 'z': if (ends("ize")) break; return;
               default: return;
            }
            if (m() > 1) k = j;
        }
     
        /* step6() removes a final -e if m() > 1. */
     
        private final void step6()
        {  j = k;
           if (b[k] == 'e')
           {  int a = m();
              if (a > 1 || a == 1 && !cvc(k-1)) k--;
           }
           if (b[k] == 'l' && doublec(k) && m() > 1) k--;
        }
     
        /** Stem the word placed into the Stemmer buffer through calls to add().
         * Returns true if the stemming process resulted in a word different
         * from the input.  You can retrieve the result with
         * getResultLength()/getResultBuffer() or toString().
         */
        public void stem()
        {  k = i - 1;
           if (k > 1) { step1(); step2(); step3(); step4(); step5(); step6(); }
           i_end = k+1; i = 0;
        }
     }
  }
  
  
  
  1.1                  
xml-xindice/java/src/org/apache/xindice/core/fulltext/WordStemmer.java
  
  Index: WordStemmer.java
  ===================================================================
  package org.apache.xindice.core.fulltext;
  
  /**
   * WordStemmer is an interface for defining stemmers.  Stemmers are typically
   * used for western languages that support the concept of prefix, suffix, and
   * tense.  By default, dbXML uses a Porter Stemmer, which is specifically for
   * the English language, but it should be fairly trivial to develop a Stemmer
   * for non-English languages.
   */
  
  public interface WordStemmer {
     /**
      * normalizeCase normalizes the case of the specific word.  Case
      * normalization is language-specific, as is stemming, so it made
      * sense to tie the two functions into one interface.  By default
      * dbXML normalizes to lower-case.
      *
      * @param word The word to normalize
      * @return The normalized word
      */
     String normalizeCase(String word);
     
     /**
      * stemWord stems the specified word.
      *
      * @param word The word to stem
      * @return The stemmed word
      */
     String stemWord(String word);
  }

cvs commit: xml-xindice/java/src/org/apache/xindice/core/fulltext FullTextIndexer.java PorterStemmer.java WordStemmer.java

Reply via email to