Hi
I've written a PrefixQuery and it's not hard to do -I can post it too.
Problem is that it is not integrated into the query parser (.jj) so odds
are no one will use it, and the general sentiment on this list (and lucene-dev)
is that prefix queries are evil because it's an expensive operation as the query
code has to traverse all terms to "expand" the query. I would prefer
a more user oriented view i.e. just allow it as sometimes it's what you need and
the only alternative I can think of, doing a fuzzy query, isn't quite right.
wow - great!
I've been looking for sample code for quite a long time. I'd like to test the performance on our data to see if it's really that slow.
2 files attached, SubstringQuery (which you'll use) and SubstringTermEnum ( used by the former to be
consistent w/ other Query code).
I find this kind of query useful to have and think that the query parser should allow it in spite of the perception
of this being slow, however I think the debate is the "user centric view" (say mine, allow substring queries)
vs the "protect the engines performance" view which says not to allow expensive queries.
It would be great if you could post a URL where I can find your extension.
thank you
--------------------------------------------------------------------- To unsubscribe, e-mail: [EMAIL PROTECTED] For additional commands, e-mail: [EMAIL PROTECTED]
package com.tropo.lucene;
/* ==================================================================== * The Apache Software License, Version 1.1 * * Copyright (c) 2001 The Apache Software Foundation. All rights * reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the * distribution. * * 3. The end-user documentation included with the redistribution, * if any, must include the following acknowledgment: * "This product includes software developed by the * Apache Software Foundation (http://www.apache.org/)." * Alternately, this acknowledgment may appear in the software itself, * if and wherever such third-party acknowledgments normally appear. * * 4. The names "Apache" and "Apache Software Foundation" and * "Apache Lucene" must not be used to endorse or promote products * derived from this software without prior written permission. For * written permission, please contact [EMAIL PROTECTED] * * 5. Products derived from this software may not be called "Apache", * "Apache Lucene", nor may "Apache" appear in their name, without * prior written permission of the Apache Software Foundation. * * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * ==================================================================== * * This software consists of voluntary contributions made by many * individuals on behalf of the Apache Software Foundation. For more * information on the Apache Software Foundation, please see * <http://www.apache.org/>. */ import org.apache.lucene.search.*; import java.io.IOException; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; /** Subclass of FilteredTermEnum for enumerating all terms that are similiar to the specified filter term. <p>Term enumerations are always ordered by Term.compareTo(). Each term in the enumeration is greater than all that precede it. */ public final class SubstringTermEnum extends FilteredTermEnum { int del_len; boolean endEnum = false; Term searchTerm = null; String field = ""; String text = ""; int textlen; public SubstringTermEnum(IndexReader reader, Term term) throws IOException { super(reader, term); searchTerm = term; field = searchTerm.field(); text = searchTerm.text(); textlen = text.length(); setEnum(reader.terms(new Term(searchTerm.field(), ""))); } /** The termCompare method in SubstringTermEnum uses the difference in lengths to calculate the distance between the given term and the comparing term assuming that the term passed to the ctr is a substring of the current term. 
*/ protected final boolean termCompare(Term term) { if (field == term.field()) { String target = term.text(); boolean res = target.indexOf( text) >= 0; del_len = Math.abs( text.length() - target.length()); return res; } endEnum = true; return false; } protected final float difference() { // if lengths differ by more than 10 then clamp the difference at 0.05 // if it's an exact match then we return 1.0 here return (float) Math.max( ((10 - del_len)/10.0f), 0.05); } public final boolean endEnum() { return endEnum; } public void close() throws IOException { super.close(); searchTerm = null; field = null; text = null; } }
package com.tropo.lucene; /* ==================================================================== * The Apache Software License, Version 1.1 * * Copyright (c) 2001 The Apache Software Foundation. All rights * reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the * distribution. * * 3. The end-user documentation included with the redistribution, * if any, must include the following acknowledgment: * "This product includes software developed by the * Apache Software Foundation (http://www.apache.org/)." * Alternately, this acknowledgment may appear in the software itself, * if and wherever such third-party acknowledgments normally appear. * * 4. The names "Apache" and "Apache Software Foundation" and * "Apache Lucene" must not be used to endorse or promote products * derived from this software without prior written permission. For * written permission, please contact [EMAIL PROTECTED] * * 5. Products derived from this software may not be called "Apache", * "Apache Lucene", nor may "Apache" appear in their name, without * prior written permission of the Apache Software Foundation. * * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * ==================================================================== * * This software consists of voluntary contributions made by many * individuals on behalf of the Apache Software Foundation. For more * information on the Apache Software Foundation, please see * <http://www.apache.org/>. */ import java.io.IOException; import org.apache.lucene.search.*; import org.apache.lucene.index.*; import org.apache.lucene.analysis.*; import org.apache.lucene.document.*; /** A Query that matches documents containing terms with a specified substring. 
*/ public final class SubstringQuery extends MultiTermQuery { public SubstringQuery(Term term) { super(term); } protected FilteredTermEnum getEnum(IndexReader reader) throws IOException { return new SubstringTermEnum(reader, getTerm()); } public String toString(String field) { return '*' + super.toString(field) + '*'; } /** * */ /* public static void main( String[] args) throws Throwable { String name = "index"; String srch = "valid"; for ( int i = 0; i< args.length; i++) { if ( args[i].equals( "-i")) name = args[ ++i]; else if ( args[i].equals( "-w")) srch = args[ ++i]; } java.io.PrintStream o = System.out; o.println( "Opening "+ name); if ( true ) { final IndexReader r = IndexReader.open( Rammer.convertMaybe( name)); //final TermEnum te = r.terms(); FilteredTermEnum te = new SubstringTermEnum( r, new Term( "contents", srch)); while ( te.next()) { Term term = te.term(); o.println( "" + term); } o.println( "te: " +te); Query q = new SubstringQuery( new Term( "contents", srch)); o.println( "q=" + q.toString( "contents")); o.println( "q=" + q); o.println( "q=" + q.rewrite( r)); } else { final IndexSearcher searcher = new IndexSearcher( name); final Analyzer analyzer = IndexBase.getAnalyzer(); final Query q = new SubstringQuery( new Term( DFields.CONTENTS, srch)); o.println( "q=" +q); Hits hits = searcher.search( q); int len = hits.length(); o.println( "len: " +len); for ( int i = 0; i < len; i++) { Document d = hits.doc( i); } } } */ }
--------------------------------------------------------------------- To unsubscribe, e-mail: [EMAIL PROTECTED] For additional commands, e-mail: [EMAIL PROTECTED]
