goller 2004/10/01 02:56:49 Modified: src/java/org/apache/lucene/search Tag: lucene_1_4_2_dev FuzzyQuery.java SloppyPhraseScorer.java PhraseScorer.java FuzzyTermEnum.java PhrasePrefixQuery.java PhraseQuery.java PhrasePositions.java ExactPhraseScorer.java Log: Extensions to FuzzyQuery, PhraseQuery, and PhrasePrefixQuery. (Backport) Revision Changes Path No revision No revision 1.4.2.1 +70 -4 jakarta-lucene/src/java/org/apache/lucene/search/FuzzyQuery.java Index: FuzzyQuery.java =================================================================== RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/search/FuzzyQuery.java,v retrieving revision 1.4 retrieving revision 1.4.2.1 diff -u -r1.4 -r1.4.2.1 --- FuzzyQuery.java 29 Mar 2004 22:48:03 -0000 1.4 +++ FuzzyQuery.java 1 Oct 2004 09:56:49 -0000 1.4.2.1 @@ -20,17 +20,83 @@ import org.apache.lucene.index.Term; import java.io.IOException; -/** Implements the fuzzy search query */ +/** Implements the fuzzy search query. The similiarity measurement + * is based on the Levenshtein (edit distance) algorithm. + */ public final class FuzzyQuery extends MultiTermQuery { - public FuzzyQuery(Term term) { + + public final static float defaultMinSimilarity = 0.5f; + private float minimumSimilarity; + private int prefixLength; + + /** + * Create a new FuzzyQuery that will match terms with a similarity + * of at least <code>minimumSimilarity</code> to <code>term</code>. + * If a <code>prefixLength</code> > 0 is specified, a common prefix + * of that length is also required. + * + * @param term the term to search for + * @param minimumSimilarity a value between 0 and 1 to set the required similarity + * between the query term and the matching terms. For example, for a + * <code>minimumSimilarity</code> of <code>0.5</code> a term of the same length + * as the query term is considered similar to the query term if the edit distance + * between both terms is less than <code>length(term)*0.5</code> + * @param prefixLength length of common (non-fuzzy) prefix + * @throws IllegalArgumentException if minimumSimilarity is > 1 or < 0 + * or if prefixLength < 0 or > <code>term.text().length()</code>. + */ + public FuzzyQuery(Term term, float minimumSimilarity, int prefixLength) throws IllegalArgumentException { super(term); + + if (minimumSimilarity > 1.0f) + throw new IllegalArgumentException("minimumSimilarity > 1"); + else if (minimumSimilarity < 0.0f) + throw new IllegalArgumentException("minimumSimilarity < 0"); + this.minimumSimilarity = minimumSimilarity; + + if(prefixLength < 0) + throw new IllegalArgumentException("prefixLength < 0"); + else if(prefixLength >= term.text().length()) + throw new IllegalArgumentException("prefixLength >= term.text().length()"); + this.prefixLength = prefixLength; + } + + /** + * Calls [EMAIL PROTECTED] #FuzzyQuery(Term, float) FuzzyQuery(term, minimumSimilarity, 0)}. + */ + public FuzzyQuery(Term term, float minimumSimilarity) throws IllegalArgumentException { + this(term, minimumSimilarity, 0); + } + + /** + * Calls [EMAIL PROTECTED] #FuzzyQuery(Term, float) FuzzyQuery(term, 0.5f, 0)}. + */ + public FuzzyQuery(Term term) { + this(term, defaultMinSimilarity, 0); + } + + /** + * Returns the minimum similarity that is required for this query to match. + * @return float value between 0.0 and 1.0 + */ + public float getMinSimilarity() { + return minimumSimilarity; } + /** + * Returns the prefix length, i.e. the number of characters at the start + * of a term that must be identical (not fuzzy) to the query term if the query + * is to match that term. + */ + public int getPrefixLength() { + return prefixLength; + } + protected FilteredTermEnum getEnum(IndexReader reader) throws IOException { - return new FuzzyTermEnum(reader, getTerm()); + return new FuzzyTermEnum(reader, getTerm(), minimumSimilarity, prefixLength); } public String toString(String field) { - return super.toString(field) + '~'; + return super.toString(field) + '~' + Float.toString(minimumSimilarity); } } 1.6.2.1 +3 -3 jakarta-lucene/src/java/org/apache/lucene/search/SloppyPhraseScorer.java Index: SloppyPhraseScorer.java =================================================================== RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/search/SloppyPhraseScorer.java,v retrieving revision 1.6 retrieving revision 1.6.2.1 diff -u -r1.6 -r1.6.2.1 --- SloppyPhraseScorer.java 29 Mar 2004 22:48:04 -0000 1.6 +++ SloppyPhraseScorer.java 1 Oct 2004 09:56:49 -0000 1.6.2.1 @@ -23,9 +23,9 @@ final class SloppyPhraseScorer extends PhraseScorer { private int slop; - SloppyPhraseScorer(Weight weight, TermPositions[] tps, Similarity similarity, - int slop, byte[] norms) throws IOException { - super(weight, tps, similarity, norms); + SloppyPhraseScorer(Weight weight, TermPositions[] tps, int[] positions, Similarity similarity, + int slop, byte[] norms) { + super(weight, tps, positions, similarity, norms); this.slop = slop; } 1.14.2.1 +4 -3 jakarta-lucene/src/java/org/apache/lucene/search/PhraseScorer.java Index: PhraseScorer.java =================================================================== RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/search/PhraseScorer.java,v retrieving revision 1.14 retrieving revision 1.14.2.1 diff -u -r1.14 -r1.14.2.1 --- PhraseScorer.java 11 May 2004 17:18:04 -0000 1.14 +++ PhraseScorer.java 1 Oct 2004 09:56:49 -0000 1.14.2.1 @@ -32,8 +32,9 @@ private float freq; - PhraseScorer(Weight weight, TermPositions[] tps, Similarity similarity, - byte[] norms) throws IOException { + + PhraseScorer(Weight weight, TermPositions[] tps, int[] positions, Similarity similarity, + byte[] norms) { super(similarity); this.norms = norms; this.weight = weight; @@ -41,7 +42,7 @@ // convert tps to a list for (int i = 0; i < tps.length; i++) { - PhrasePositions pp = new PhrasePositions(tps[i], i); + PhrasePositions pp = new PhrasePositions(tps[i], positions[i]); if (last != null) { // add next to end of list last.next = pp; } else 1.6.2.1 +55 -9 jakarta-lucene/src/java/org/apache/lucene/search/FuzzyTermEnum.java Index: FuzzyTermEnum.java =================================================================== RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/search/FuzzyTermEnum.java,v retrieving revision 1.6 retrieving revision 1.6.2.1 diff -u -r1.6 -r1.6.2.1 --- FuzzyTermEnum.java 11 May 2004 17:23:21 -0000 1.6 +++ FuzzyTermEnum.java 1 Oct 2004 09:56:49 -0000 1.6.2.1 @@ -26,21 +26,69 @@ the enumeration is greater than all that precede it. */ public final class FuzzyTermEnum extends FilteredTermEnum { double distance; - boolean fieldMatch = false; boolean endEnum = false; Term searchTerm = null; String field = ""; String text = ""; int textlen; + String prefix = ""; + int prefixLength = 0; + float minimumSimilarity; + double scale_factor; + + /** + * Empty prefix and minSimilarity of 0.5f are used. + * + * @param reader + * @param term + * @throws IOException + * @see #FuzzyTermEnum(IndexReader, Term, float, int) + */ public FuzzyTermEnum(IndexReader reader, Term term) throws IOException { + this(reader, term, FuzzyQuery.defaultMinSimilarity, 0); + } + + /** + * This is the standard FuzzyTermEnum with an empty prefix. + * + * @param reader + * @param term + * @param minSimilarity + * @throws IOException + * @see #FuzzyTermEnum(IndexReader, Term, float, int) + */ + public FuzzyTermEnum(IndexReader reader, Term term, float minSimilarity) throws IOException { + this(reader, term, minSimilarity, 0); + } + + /** + * Constructor for enumeration of all terms from specified <code>reader</code> which share a prefix of + * length <code>prefixLength</code> with <code>term</code> and which have a fuzzy similarity > + * <code>minSimilarity</code>. + * + * @param reader Delivers terms. + * @param term Pattern term. + * @param minSimilarity Minimum required similarity for terms from the reader. Default value is 0.5f. + * @param prefixLength Length of required common prefix. Default value is 0. + * @throws IOException + */ + public FuzzyTermEnum(IndexReader reader, Term term, float minSimilarity, int prefixLength) throws IOException { super(); + minimumSimilarity = minSimilarity; + scale_factor = 1.0f / (1.0f - minimumSimilarity); searchTerm = term; field = searchTerm.field(); text = searchTerm.text(); textlen = text.length(); - setEnum(reader.terms(new Term(searchTerm.field(), ""))); + if(prefixLength > 0 && prefixLength < textlen){ + this.prefixLength = prefixLength; + prefix = text.substring(0, prefixLength); + text = text.substring(prefixLength); + textlen = text.length(); + } + setEnum(reader.terms(new Term(searchTerm.field(), prefix))); } /** @@ -48,19 +96,20 @@ calculate the distance between the given term and the comparing term. */ protected final boolean termCompare(Term term) { - if (field == term.field()) { - String target = term.text(); + String termText = term.text(); + if (field == term.field() && termText.startsWith(prefix)) { + String target = termText.substring(prefixLength); int targetlen = target.length(); int dist = editDistance(text, target, textlen, targetlen); distance = 1 - ((double)dist / (double)Math.min(textlen, targetlen)); - return (distance > FUZZY_THRESHOLD); + return (distance > minimumSimilarity); } endEnum = true; return false; } protected final float difference() { - return (float)((distance - FUZZY_THRESHOLD) * SCALE_FACTOR); + return (float)((distance - minimumSimilarity) * scale_factor); } public final boolean endEnum() { @@ -70,9 +119,6 @@ /****************************** * Compute Levenshtein distance ******************************/ - - public static final double FUZZY_THRESHOLD = 0.5; - public static final double SCALE_FACTOR = 1.0f / (1.0f - FUZZY_THRESHOLD); /** Finds and returns the smallest of three integers 1.12.2.1 +42 -12 jakarta-lucene/src/java/org/apache/lucene/search/PhrasePrefixQuery.java Index: PhrasePrefixQuery.java =================================================================== RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/search/PhrasePrefixQuery.java,v retrieving revision 1.12 retrieving revision 1.12.2.1 diff -u -r1.12 -r1.12.2.1 --- PhrasePrefixQuery.java 29 Mar 2004 22:48:03 -0000 1.12 +++ PhrasePrefixQuery.java 1 Oct 2004 09:56:49 -0000 1.12.2.1 @@ -19,6 +19,7 @@ import java.io.IOException; import java.util.ArrayList; import java.util.Iterator; +import java.util.Vector; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.MultipleTermPositions; @@ -40,42 +41,69 @@ public class PhrasePrefixQuery extends Query { private String field; private ArrayList termArrays = new ArrayList(); + private Vector positions = new Vector(); private int slop = 0; - /* Sets the phrase slop for this query. + /** Sets the phrase slop for this query. * @see PhraseQuery#setSlop(int) */ public void setSlop(int s) { slop = s; } - /* Sets the phrase slop for this query. + /** Sets the phrase slop for this query. * @see PhraseQuery#getSlop() */ public int getSlop() { return slop; } - /* Add a single term at the next position in the phrase. + /** Add a single term at the next position in the phrase. * @see PhraseQuery#add(Term) */ public void add(Term term) { add(new Term[]{term}); } - /* Add multiple terms at the next position in the phrase. Any of the terms + /** Add multiple terms at the next position in the phrase. Any of the terms * may match. * * @see PhraseQuery#add(Term) */ public void add(Term[] terms) { + int position = 0; + if (positions.size() > 0) + position = ((Integer) positions.lastElement()).intValue() + 1; + + add(terms, position); + } + + /** + * Allows to specify the relative position of terms within the phrase. + * + * @see PhraseQuery#add(Term, int) + * @param terms + * @param position + */ + public void add(Term[] terms, int position) { if (termArrays.size() == 0) field = terms[0].field(); - - for (int i=0; i<terms.length; i++) { + + for (int i = 0; i < terms.length; i++) { if (terms[i].field() != field) { - throw new IllegalArgumentException - ("All phrase terms must be in the same field (" + field + "): " - + terms[i]); + throw new IllegalArgumentException( + "All phrase terms must be in the same field (" + field + "): " + + terms[i]); } } termArrays.add(terms); + positions.addElement(new Integer(position)); + } + + /** + * Returns the relative positions of terms in this phrase. + */ + public int[] getPositions() { + int[] result = new int[positions.size()]; + for (int i = 0; i < positions.size(); i++) + result[i] = ((Integer) positions.elementAt(i)).intValue(); + return result; } private class PhrasePrefixWeight implements Weight { @@ -131,10 +159,10 @@ } if (slop == 0) - return new ExactPhraseScorer(this, tps, getSimilarity(searcher), + return new ExactPhraseScorer(this, tps, getPositions(), getSimilarity(searcher), reader.norms(field)); else - return new SloppyPhraseScorer(this, tps, getSimilarity(searcher), + return new SloppyPhraseScorer(this, tps, getPositions(), getSimilarity(searcher), slop, reader.norms(field)); } @@ -222,7 +250,9 @@ Iterator i = termArrays.iterator(); while (i.hasNext()) { Term[] terms = (Term[])i.next(); - buffer.append(terms[0].text() + (terms.length > 0 ? "*" : "")); + buffer.append(terms[0].text() + (terms.length > 1 ? "*" : "")); + if (i.hasNext()) + buffer.append(" "); } buffer.append("\""); 1.15.2.1 +45 -12 jakarta-lucene/src/java/org/apache/lucene/search/PhraseQuery.java Index: PhraseQuery.java =================================================================== RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/search/PhraseQuery.java,v retrieving revision 1.15 retrieving revision 1.15.2.1 diff -u -r1.15 -r1.15.2.1 --- PhraseQuery.java 29 Mar 2004 22:48:03 -0000 1.15 +++ PhraseQuery.java 1 Oct 2004 09:56:49 -0000 1.15.2.1 @@ -29,6 +29,7 @@ public class PhraseQuery extends Query { private String field; private Vector terms = new Vector(); + private Vector positions = new Vector(); private int slop = 0; /** Constructs an empty phrase query. */ @@ -52,21 +53,51 @@ /** Returns the slop. See setSlop(). */ public int getSlop() { return slop; } - /** Adds a term to the end of the query phrase. */ + /** + * Adds a term to the end of the query phrase. + * The relative position of the term is the one immediately after the last term added. + */ public void add(Term term) { - if (terms.size() == 0) - field = term.field(); - else if (term.field() != field) - throw new IllegalArgumentException - ("All phrase terms must be in the same field: " + term); - - terms.addElement(term); + int position = 0; + if(positions.size() > 0) + position = ((Integer) positions.lastElement()).intValue() + 1; + + add(term, position); + } + + /** + * Adds a term to the end of the query phrase. + * The relative position of the term within the phrase is specified explicitly. + * This allows e.g. phrases with more than one term at the same position + * or phrases with gaps (e.g. in connection with stopwords). + * + * @param term + * @param position + */ + public void add(Term term, int position) { + if (terms.size() == 0) + field = term.field(); + else if (term.field() != field) + throw new IllegalArgumentException("All phrase terms must be in the same field: " + term); + + terms.addElement(term); + positions.addElement(new Integer(position)); } /** Returns the set of terms in this phrase. */ public Term[] getTerms() { return (Term[])terms.toArray(new Term[0]); } + + /** + * Returns the relative positions of terms in this phrase. + */ + public int[] getPositions() { + int[] result = new int[positions.size()]; + for(int i = 0; i < positions.size(); i++) + result[i] = ((Integer) positions.elementAt(i)).intValue(); + return result; + } private class PhraseWeight implements Weight { private Searcher searcher; @@ -109,11 +140,11 @@ } if (slop == 0) // optimize exact case - return new ExactPhraseScorer(this, tps, getSimilarity(searcher), + return new ExactPhraseScorer(this, tps, getPositions(), getSimilarity(searcher), reader.norms(field)); else return - new SloppyPhraseScorer(this, tps, getSimilarity(searcher), slop, + new SloppyPhraseScorer(this, tps, getPositions(), getSimilarity(searcher), slop, reader.norms(field)); } @@ -244,14 +275,16 @@ PhraseQuery other = (PhraseQuery)o; return (this.getBoost() == other.getBoost()) && (this.slop == other.slop) - && this.terms.equals(other.terms); + && this.terms.equals(other.terms) + && this.positions.equals(other.positions); } /** Returns a hash code value for this object.*/ public int hashCode() { return Float.floatToIntBits(getBoost()) ^ Float.floatToIntBits(slop) - ^ terms.hashCode(); + ^ terms.hashCode() + ^ positions.hashCode(); } } 1.3.2.1 +1 -1 jakarta-lucene/src/java/org/apache/lucene/search/PhrasePositions.java Index: PhrasePositions.java =================================================================== RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/search/PhrasePositions.java,v retrieving revision 1.3 retrieving revision 1.3.2.1 diff -u -r1.3 -r1.3.2.1 --- PhrasePositions.java 29 Mar 2004 22:48:03 -0000 1.3 +++ PhrasePositions.java 1 Oct 2004 09:56:49 -0000 1.3.2.1 @@ -27,7 +27,7 @@ TermPositions tp; // stream of positions PhrasePositions next; // used to make lists - PhrasePositions(TermPositions t, int o) throws IOException { + PhrasePositions(TermPositions t, int o) { tp = t; offset = o; } 1.6.2.1 +2 -2 jakarta-lucene/src/java/org/apache/lucene/search/ExactPhraseScorer.java Index: ExactPhraseScorer.java =================================================================== RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/search/ExactPhraseScorer.java,v retrieving revision 1.6 retrieving revision 1.6.2.1 diff -u -r1.6 -r1.6.2.1 --- ExactPhraseScorer.java 11 May 2004 17:18:03 -0000 1.6 +++ ExactPhraseScorer.java 1 Oct 2004 09:56:49 -0000 1.6.2.1 @@ -21,9 +21,9 @@ final class ExactPhraseScorer extends PhraseScorer { - ExactPhraseScorer(Weight weight, TermPositions[] tps, Similarity similarity, + ExactPhraseScorer(Weight weight, TermPositions[] tps, int[] positions, Similarity similarity, byte[] norms) throws IOException { - super(weight, tps, similarity, norms); + super(weight, tps, positions, similarity, norms); } protected final float phraseFreq() throws IOException {
--------------------------------------------------------------------- To unsubscribe, e-mail: [EMAIL PROTECTED] For additional commands, e-mail: [EMAIL PROTECTED]