bradford 02/04/21 12:34:02
Added: java/src/org/apache/xindice/core/fulltext
FullTextIndexer.java PorterStemmer.java
WordStemmer.java
Log:
Adding full-text indexing code
Revision Changes Path
1.1
xml-xindice/java/src/org/apache/xindice/core/fulltext/FullTextIndexer.java
Index: FullTextIndexer.java
===================================================================
package org.apache.xindice.core.fulltext;
import org.apache.xindice.core.*;
import org.apache.xindice.core.Collection;
import org.apache.xindice.core.data.*;
import org.apache.xindice.core.filer.*;
import org.apache.xindice.core.indexer.*;
import org.apache.xindice.core.query.*;
import org.apache.xindice.util.*;
import org.apache.xindice.xml.*;
import org.apache.xindice.xml.dom.*;
import org.apache.xindice.server.*;
import java.io.*;
import java.util.*;
import org.w3c.dom.*;
/**
* FullTextIndexer is a full text search implementation of the
* Indexer interface.
*/
public final class FullTextIndexer extends BTree implements Indexer {
private static final IndexMatch[] EmptyMatches = new IndexMatch[0];
private static final Value EmptyValue = new Value(new byte[0]);
private static final String DEFAULT_STEMMER =
"org.apache.xindice.fulltext.PorterStemmer";
private static final String DEFAULT_STOPWORDS = "stopwords.txt";
private static final byte MATCHES = 20;
private static final long MATCH_INFO = -1000;
private static final String NAME = "name";
private static final String PATTERN = "pattern";
private static final String PAGESIZE = "pagesize";
private static final String MAXKEYSIZE = "maxkeysize";
private static final String STEMMER = "stemmer";
private static final String STOPWORDS = "stopwords";
private static final String ROLLCASE = "rollcase";
private Configuration config;
private Collection collection;
private SymbolTable symbols;
private String name;
private String pattern;
private WordStemmer stemmer;
private Set stopWords;
private boolean rollCase = true;
private boolean wildcard = false;
private FileHeader fileHeader;
public FullTextIndexer() {
super();
fileHeader = getFileHeader();
fileHeader.setPageSize(1024);
}
public void setConfig(Configuration config) {
this.config = config;
try {
name = config.getAttribute(NAME);
pattern = config.getAttribute(PATTERN);
wildcard = pattern.indexOf('*') != -1;
rollCase = config.getBooleanAttribute(ROLLCASE, rollCase);
String stemClass = config.getAttribute(STEMMER, DEFAULT_STEMMER);
if ( stemClass != null && stemClass.length() > 0 ) {
try {
stemmer = (WordStemmer)Class.forName(stemClass).newInstance();
}
catch ( Exception e ) {
org.apache.xindice.Debug.println("Stemmer '"+stemClass+"' Not
Found");
}
}
String stopName = config.getAttribute(STOPWORDS, DEFAULT_STOPWORDS);
if ( stopName != null && stopName.length() > 0 ) {
try {
stopWords = new HashSet();
File stopFile = new File(stopName);
FileInputStream fis = new FileInputStream(stopFile);
BufferedInputStream bis = new BufferedInputStream(fis, 4096);
DataInputStream dis = new DataInputStream(bis);
while ( dis.available() > 0 ) {
String word = dis.readLine();
if ( word != null && word.trim().length() > 0 )
stopWords.add(word);
}
fis.close();
}
catch ( Exception e ) {
org.apache.xindice.Debug.println("Stop Word list
'"+stopName+"' Not Found");
}
}
fileHeader.setPageSize(config.getIntAttribute(PAGESIZE,
fileHeader.getPageSize()));
fileHeader.setMaxKeySize(config.getShortAttribute(MAXKEYSIZE,
fileHeader.getMaxKeySize()));
setLocation(name);
}
catch ( Exception e ) {
org.apache.xindice.Debug.printStackTrace(e);
}
}
public Configuration getConfig() {
return config;
}
public String getName() {
return name;
}
public void setLocation(String location) {
setFile(new File(collection.getCollectionRoot(), location+".idx"));
}
public void setCollection(Collection collection) {
try {
this.collection = collection;
symbols = collection.getSymbols();
}
catch ( Exception e ) {
org.apache.xindice.Debug.printStackTrace(e);
}
}
public String getIndexStyle() {
return STYLE_FULLTEXT;
}
public String getPattern() {
return pattern;
}
private Value getCombinedValue(Key key, int pos, int len, short elemID,
short attrID) {
Value result;
try {
int l = key.getLength();
byte[] b = new byte[l+13];
// Write the key
System.arraycopy(key.getData(), 0, b, 0, l);
b[l] = 0;
// Write the pos
b[l+1] = (byte)((pos >>> 24) & 0xFF);
b[l+2] = (byte)((pos >>> 16) & 0xFF);
b[l+3] = (byte)((pos >>> 8) & 0xFF);
b[l+4] = (byte)((pos >>> 0) & 0xFF);
// Write the len
b[l+5] = (byte)((len >>> 24) & 0xFF);
b[l+6] = (byte)((len >>> 16) & 0xFF);
b[l+7] = (byte)((len >>> 8) & 0xFF);
b[l+8] = (byte)((len >>> 0) & 0xFF);
// Write the elemID
b[l+9] = (byte)((elemID >>> 8) & 0xFF);
b[l+10] = (byte)((elemID >>> 0) & 0xFF);
// Write the attrID
b[l+11] = (byte)((attrID >>> 8) & 0xFF);
b[l+12] = (byte)((attrID >>> 0) & 0xFF);
result = new Value(b);
}
catch ( Exception e ) {
result = null; // This will never happen
}
return result;
}
private IndexMatch getIndexMatch(Value v) {
byte[] b = v.getData();
int l = b.length-13;
Key key = new Key(b, 0, b.length-13);
int pos = ((b[l+1] << 24) | (b[l+2] << 16) | (b[l+3] << 8) | b[l+4]);
int len = ((b[l+5] << 24) | (b[l+6] << 16) | (b[l+7] << 8) | b[l+8]);
short elemID = (short)((b[l+9] << 8) | b[l+10]);
short attrID = (short)((b[l+11] << 8) | b[l+12]);
return new IndexMatch(key, pos, len, elemID, attrID);
}
public synchronized void remove(String value, Key key, int pos, int len,
short elemID, short attrID) throws DBException {
StringTokenizer st = new StringTokenizer(value);
while ( st.hasMoreTokens() ) {
String s = st.nextToken();
if ( stemmer != null )
s = stemmer.normalizeCase(s);
if ( stopWords != null && stopWords.contains(s) )
continue;
if ( stemmer != null )
s = stemmer.stemWord(s);
Value v = new Value(s);
try {
BTreeRootInfo root = findBTreeRoot(v);
Value cv = getCombinedValue(key, pos, len, elemID, attrID);
removeValue(root, cv);
}
catch ( DBException d ) {
throw d;
}
catch ( Exception e ) {
org.apache.xindice.Debug.printStackTrace(e);
}
}
}
public synchronized void add(String value, Key key, int pos, int len,
short elemID, short attrID) throws DBException {
StringTokenizer st = new StringTokenizer(value);
while ( st.hasMoreTokens() ) {
String s = st.nextToken();
if ( stemmer != null )
s = stemmer.normalizeCase(s);
if ( stopWords != null && stopWords.contains(s) )
continue;
if ( stemmer != null )
s = stemmer.stemWord(s);
Value v = new Value(s);
try {
BTreeRootInfo root;
try {
root = findBTreeRoot(v);
}
catch ( BTreeNotFoundException e ) {
root = createBTreeRoot(v);
}
Value cv = getCombinedValue(key, pos, len, elemID, attrID);
addValue(root, cv, MATCH_INFO);
}
catch ( DBException d ) {
throw d;
}
catch ( IOException e ) {
throw new BTreeCorruptException("Corruption detected on add");
}
catch ( Exception e ) {
org.apache.xindice.Debug.printStackTrace(e);
}
}
}
public synchronized void flush() throws DBException {
super.flush();
}
public synchronized IndexMatch[] queryMatches(final IndexQuery query)
throws DBException {
// Pre-process the value-set for stop words and stemming
Value[] vals = query.getValues();
for ( int i = 0; i < vals.length; i++ ) {
String s = vals[i].toString();
if ( stemmer != null )
s = stemmer.normalizeCase(s);
if ( stopWords != null && stopWords.contains(s) )
continue;
if ( stemmer != null )
s = stemmer.stemWord(s);
vals[i] = new Value(s);
}
// Now issue the query
final List results = new ArrayList();
try {
query(query, new BTreeCallback() {
public boolean indexInfo(Value value, long pos) {
try {
if ( pos == MATCH_INFO ) {
IndexMatch match = getIndexMatch(value);
if ( !wildcard )
results.add(match);
else {
IndexPattern pt = new IndexPattern(symbols,
match.getElement(), match.getAttribute());
if ( pt.getMatchLevel(query.getPattern()) > 0 )
results.add(match);
}
}
else {
BTreeRootInfo root = new BTreeRootInfo(value, pos);
query(root, null, this);
}
return true;
}
catch ( Exception e ) {
org.apache.xindice.Debug.printStackTrace(e);
}
return true;
}
});
}
catch ( IOException e ) {
throw new BTreeCorruptException("Corruption detected on query");
}
catch ( Exception e ) {
org.apache.xindice.Debug.printStackTrace(e);
}
return (IndexMatch[])results.toArray(EmptyMatches);
}
}
1.1
xml-xindice/java/src/org/apache/xindice/core/fulltext/PorterStemmer.java
Index: PorterStemmer.java
===================================================================
package org.apache.xindice.core.fulltext;
/*
Porter stemmer in Java. The original paper is in
Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
no. 3, pp 130-137,
See also <a
href="http://www.muscat.com/~martin/stem.html">http://www.muscat.com/~martin/stem.html</a>
History:
Release 1
Bug 1 (reported by Gonzalo Parra 16/10/99) fixed as marked below.
The words 'aed', 'eed', 'oed' leave k at 'a' for step 3, and b[k-1]
is then out outside the bounds of b.
Release 2
Similarly,
Bug 2 (reported by Steve Dyrdahl 22/2/00) fixed as marked below.
'ion' by itself leaves j = -1 in the test for 'ion' in step 5, and
b[j] is then outside the bounds of b.
Release 3
Considerably revised 4/9/00 in the light of many helpful suggestions
from Brian Goetz of Quiotix Corporation ([EMAIL PROTECTED]). The earlier
version of the program can still be got from [EMAIL PROTECTED] should
anyone require it.
Release 4
*/
import java.io.*;
public class PorterStemmer implements WordStemmer {
public String normalizeCase(String word) {
return word.toLowerCase();
}
public String stemWord(String word) {
Stemmer s = new Stemmer();
s.add(word.toCharArray(), word.length());
s.stem();
return s.toString();
}
/**
* Stemmer, implementing the Porter Stemming Algorithm
*
* The Stemmer class transforms a word into its root form. The input
* word can be provided a character at time (by calling add()), or at once
* by calling one of the various stem(something) methods.
*/
private class Stemmer
{ private char[] b;
private int i, /* offset into b */
i_end, /* offset to end of stemmed word */
j, k;
private static final int INC = 50;
/* unit of size whereby b is increased */
public Stemmer()
{ b = new char[INC];
i = 0;
i_end = 0;
}
/**
* Add a character to the word being stemmed. When you are finished
* adding characters, you can call stem(void) to stem the word.
*/
public void add(char ch)
{ if (i == b.length)
{ char[] new_b = new char[i+INC];
for (int c = 0; c < i; c++) new_b[c] = b[c];
b = new_b;
}
b[i++] = ch;
}
/** Adds wLen characters to the word being stemmed contained in a
portion
* of a char[] array. This is like repeated calls of add(char ch), but
* faster.
*/
public void add(char[] w, int wLen)
{ if (i+wLen >= b.length)
{ char[] new_b = new char[i+wLen+INC];
for (int c = 0; c < i; c++) new_b[c] = b[c];
b = new_b;
}
for (int c = 0; c < wLen; c++) b[i++] = w[c];
}
/**
* After a word has been stemmed, it can be retrieved by toString(),
* or a reference to the internal buffer can be retrieved by
getResultBuffer
* and getResultLength (which is generally more efficient.)
*/
public String toString() { return new String(b,0,i_end); }
/**
* Returns the length of the word resulting from the stemming process.
*/
public int getResultLength() { return i_end; }
/**
* Returns a reference to a character buffer containing the results of
* the stemming process. You also need to consult getResultLength()
* to determine the length of the result.
*/
public char[] getResultBuffer() { return b; }
/* cons(i) is true <=> b[i] is a consonant. */
private final boolean cons(int i)
{ switch (b[i])
{ case 'a': case 'e': case 'i': case 'o': case 'u': return false;
case 'y': return (i==0) ? true : !cons(i-1);
default: return true;
}
}
/* m() measures the number of consonant sequences between 0 and j. if c
is
a consonant sequence and v a vowel sequence, and <..> indicates
arbitrary
presence,
<c><v> gives 0
<c>vc<v> gives 1
<c>vcvc<v> gives 2
<c>vcvcvc<v> gives 3
....
*/
private final int m()
{ int n = 0;
int i = 0;
while(true)
{ if (i > j) return n;
if (! cons(i)) break; i++;
}
i++;
while(true)
{ while(true)
{ if (i > j) return n;
if (cons(i)) break;
i++;
}
i++;
n++;
while(true)
{ if (i > j) return n;
if (! cons(i)) break;
i++;
}
i++;
}
}
/* vowelinstem() is true <=> 0,...j contains a vowel */
private final boolean vowelinstem()
{ int i; for (i = 0; i <= j; i++) if (! cons(i)) return true;
return false;
}
/* doublec(j) is true <=> j,(j-1) contain a double consonant. */
private final boolean doublec(int j)
{ if (j < 1) return false;
if (b[j] != b[j-1]) return false;
return cons(j);
}
/* cvc(i) is true <=> i-2,i-1,i has the form consonant - vowel -
consonant
and also if the second c is not w,x or y. this is used when trying to
restore an e at the end of a short word. e.g.
cav(e), lov(e), hop(e), crim(e), but
snow, box, tray.
*/
private final boolean cvc(int i)
{ if (i < 2 || !cons(i) || cons(i-1) || !cons(i-2)) return false;
{ int ch = b[i];
if (ch == 'w' || ch == 'x' || ch == 'y') return false;
}
return true;
}
private final boolean ends(String s)
{ int l = s.length();
int o = k-l+1;
if (o < 0) return false;
for (int i = 0; i < l; i++) if (b[o+i] != s.charAt(i)) return false;
j = k-l;
return true;
}
/* setto(s) sets (j+1),...k to the characters in the string s,
readjusting
k. */
private final void setto(String s)
{ int l = s.length();
int o = j+1;
for (int i = 0; i < l; i++) b[o+i] = s.charAt(i);
k = j+l;
}
/* r(s) is used further down. */
private final void r(String s) { if (m() > 0) setto(s); }
/* step1() gets rid of plurals and -ed or -ing. e.g.
caresses -> caress
ponies -> poni
ties -> ti
caress -> caress
cats -> cat
feed -> feed
agreed -> agree
disabled -> disable
matting -> mat
mating -> mate
meeting -> meet
milling -> mill
messing -> mess
meetings -> meet
*/
private final void step1()
{ if (b[k] == 's')
{ if (ends("sses")) k -= 2; else
if (ends("ies")) setto("i"); else
if (b[k-1] != 's') k--;
}
if (ends("eed")) { if (m() > 0) k--; } else
if ((ends("ed") || ends("ing")) && vowelinstem())
{ k = j;
if (ends("at")) setto("ate"); else
if (ends("bl")) setto("ble"); else
if (ends("iz")) setto("ize"); else
if (doublec(k))
{ k--;
{ int ch = b[k];
if (ch == 'l' || ch == 's' || ch == 'z') k++;
}
}
else if (m() == 1 && cvc(k)) setto("e");
}
}
/* step2() turns terminal y to i when there is another vowel in the
stem. */
private final void step2() { if (ends("y") && vowelinstem()) b[k] =
'i'; }
/* step3() maps double suffices to single ones. so -ization ( = -ize
plus
-ation) maps to -ize etc. note that the string before the suffix
must give
m() > 0. */
private final void step3() { if (k == 0) return; /* For Bug 1 */ switch
(b[k-1])
{
case 'a': if (ends("ational")) { r("ate"); break; }
if (ends("tional")) { r("tion"); break; }
break;
case 'c': if (ends("enci")) { r("ence"); break; }
if (ends("anci")) { r("ance"); break; }
break;
case 'e': if (ends("izer")) { r("ize"); break; }
break;
case 'l': if (ends("bli")) { r("ble"); break; }
if (ends("alli")) { r("al"); break; }
if (ends("entli")) { r("ent"); break; }
if (ends("eli")) { r("e"); break; }
if (ends("ousli")) { r("ous"); break; }
break;
case 'o': if (ends("ization")) { r("ize"); break; }
if (ends("ation")) { r("ate"); break; }
if (ends("ator")) { r("ate"); break; }
break;
case 's': if (ends("alism")) { r("al"); break; }
if (ends("iveness")) { r("ive"); break; }
if (ends("fulness")) { r("ful"); break; }
if (ends("ousness")) { r("ous"); break; }
break;
case 't': if (ends("aliti")) { r("al"); break; }
if (ends("iviti")) { r("ive"); break; }
if (ends("biliti")) { r("ble"); break; }
break;
case 'g': if (ends("logi")) { r("log"); break; }
} }
/* step4() deals with -ic-, -full, -ness etc. similar strategy to
step3. */
private final void step4() { switch (b[k])
{
case 'e': if (ends("icate")) { r("ic"); break; }
if (ends("ative")) { r(""); break; }
if (ends("alize")) { r("al"); break; }
break;
case 'i': if (ends("iciti")) { r("ic"); break; }
break;
case 'l': if (ends("ical")) { r("ic"); break; }
if (ends("ful")) { r(""); break; }
break;
case 's': if (ends("ness")) { r(""); break; }
break;
} }
/* step5() takes off -ant, -ence etc., in context <c>vcvc<v>. */
private final void step5()
{ if (k == 0) return; /* for Bug 1 */ switch (b[k-1])
{ case 'a': if (ends("al")) break; return;
case 'c': if (ends("ance")) break;
if (ends("ence")) break; return;
case 'e': if (ends("er")) break; return;
case 'i': if (ends("ic")) break; return;
case 'l': if (ends("able")) break;
if (ends("ible")) break; return;
case 'n': if (ends("ant")) break;
if (ends("ement")) break;
if (ends("ment")) break;
/* element etc. not stripped before the m */
if (ends("ent")) break; return;
case 'o': if (ends("ion") && j >= 0 && (b[j] == 's' || b[j] ==
't')) break;
/* j >= 0 fixes Bug 2 */
if (ends("ou")) break; return;
/* takes care of -ous */
case 's': if (ends("ism")) break; return;
case 't': if (ends("ate")) break;
if (ends("iti")) break; return;
case 'u': if (ends("ous")) break; return;
case 'v': if (ends("ive")) break; return;
case 'z': if (ends("ize")) break; return;
default: return;
}
if (m() > 1) k = j;
}
/* step6() removes a final -e if m() > 1. */
private final void step6()
{ j = k;
if (b[k] == 'e')
{ int a = m();
if (a > 1 || a == 1 && !cvc(k-1)) k--;
}
if (b[k] == 'l' && doublec(k) && m() > 1) k--;
}
/** Stem the word placed into the Stemmer buffer through calls to add().
* Returns true if the stemming process resulted in a word different
* from the input. You can retrieve the result with
* getResultLength()/getResultBuffer() or toString().
*/
public void stem()
{ k = i - 1;
if (k > 1) { step1(); step2(); step3(); step4(); step5(); step6(); }
i_end = k+1; i = 0;
}
}
}
1.1
xml-xindice/java/src/org/apache/xindice/core/fulltext/WordStemmer.java
Index: WordStemmer.java
===================================================================
package org.apache.xindice.core.fulltext;
/**
* WordStemmer is an interface for defining stemmers. Stemmers are typically
* used for western languages that support the concept of prefix, suffix, and
* tense. By default, dbXML uses a Porter Stemmer, which is specifically for
* the English language, but it should be fairly trivial to develop a Stemmer
* for non-English languages.
*/
public interface WordStemmer {
/**
* normalizeCase normalizes the case of the specific word. Case
* normalization is language-specific, as is stemming, so it made
* sense to tie the two functions into one interface. By default
* dbXML normalizes to lower-case.
*
* @param word The word to normalize
* @return The normalized word
*/
String normalizeCase(String word);
/**
* stemWord stems the specified word.
*
* @param word The word to stem
* @return The stemmed word
*/
String stemWord(String word);
}