dspencer 2005/01/11 12:58:11 Modified: contributions/WordNet build.xml contributions/WordNet/src/java/org/apache/lucene/wordnet SynLookup.java Syns2Index.java Added: contributions/WordNet/src/java/org/apache/lucene/wordnet SynExpand.java package.html Log: make sure code works with WordNet2.0 (no problem) and add Query expansion, and comments Revision Changes Path 1.4 +21 -0 jakarta-lucene-sandbox/contributions/WordNet/build.xml Index: build.xml =================================================================== RCS file: /home/cvs/jakarta-lucene-sandbox/contributions/WordNet/build.xml,v retrieving revision 1.3 retrieving revision 1.4 diff -u -r1.3 -r1.4 --- build.xml 23 Feb 2004 15:23:25 -0000 1.3 +++ build.xml 11 Jan 2005 20:58:11 -0000 1.4 @@ -29,6 +29,7 @@ </java> </target> + <target name="synonym" description="Find synonyms for word"> <fail unless="synindex.exists"> Index does not exist. @@ -46,6 +47,26 @@ <arg file="${synindex.dir}"/> <arg value="${word}"/> + </java> + </target> + + <target name="expand" description="Perform synonym expansion on a query"> + <fail unless="synindex.exists"> + Index does not exist. + </fail> + + <fail unless="query"> + Must specify 'query' property. 
+ </fail> + + <java classname="org.apache.lucene.wordnet.SynExpand"> + <classpath> + <path refid="compile.classpath"/> + <pathelement location="${build.classes.dir}"/> + </classpath> + + <arg file="${synindex.dir}"/> + <arg value="${query}"/> </java> </target> 1.2 +108 -39 jakarta-lucene-sandbox/contributions/WordNet/src/java/org/apache/lucene/wordnet/SynLookup.java Index: SynLookup.java =================================================================== RCS file: /home/cvs/jakarta-lucene-sandbox/contributions/WordNet/src/java/org/apache/lucene/wordnet/SynLookup.java,v retrieving revision 1.1 retrieving revision 1.2 diff -u -r1.1 -r1.2 --- SynLookup.java 26 Jan 2004 17:29:35 -0000 1.1 +++ SynLookup.java 11 Jan 2005 20:58:11 -0000 1.2 @@ -1,45 +1,114 @@ package org.apache.lucene.wordnet; -import org.apache.lucene.store.FSDirectory; -import org.apache.lucene.search.IndexSearcher; -import org.apache.lucene.search.TermQuery; -import org.apache.lucene.search.Hits; -import org.apache.lucene.index.Term; -import org.apache.lucene.document.Document; -import java.io.IOException; +import org.apache.lucene.store.*; +import org.apache.lucene.search.*; +import org.apache.lucene.index.*; +import org.apache.lucene.document.*; +import org.apache.lucene.analysis.*; +import java.io.*; +import java.util.*; + +/** + * Test program to look up synonyms. 
+ */ public class SynLookup { - public static void main(String[] args) throws IOException { - if (args.length != 2) { - System.out.println( - "java org.apache.lucene.wordnet.SynLookup <index path> <word>"); - } - - FSDirectory directory = FSDirectory.getDirectory(args[0], false); - IndexSearcher searcher = new IndexSearcher(directory); - - String word = args[1]; - Hits hits = searcher.search( - new TermQuery(new Term("word", word))); - - if (hits.length() == 0) { - System.out.println("No synonyms found for " + word); - } else { - System.out.println("Synonyms found for \"" + word + "\":"); - } - - for (int i = 0; i < hits.length(); i++) { - Document doc = hits.doc(i); - - String[] values = doc.getValues("syn"); - - for (int j = 0; j < values.length; j++) { - System.out.println(values[j]); - } - } - - searcher.close(); - directory.close(); - } + public static void main(String[] args) throws IOException { + if (args.length != 2) { + System.out.println( + "java org.apache.lucene.wordnet.SynLookup <index path> <word>"); + } + + FSDirectory directory = FSDirectory.getDirectory(args[0], false); + IndexSearcher searcher = new IndexSearcher(directory); + + String word = args[1]; + Hits hits = searcher.search( + new TermQuery(new Term(Syns2Index.F_WORD, word))); + + if (hits.length() == 0) { + System.out.println("No synonyms found for " + word); + } else { + System.out.println("Synonyms found for \"" + word + "\":"); + } + + for (int i = 0; i < hits.length(); i++) { + Document doc = hits.doc(i); + + String[] values = doc.getValues(Syns2Index.F_SYN); + + for (int j = 0; j < values.length; j++) { + System.out.println(values[j]); + } + } + + searcher.close(); + directory.close(); + } + + + /** + * Perform synonym expansion on a query. 
+ * + * @param query + * @param syns + * @param a + * @param field + * @param boost + */ + public static Query expand( String query, + Searcher syns, + Analyzer a, + String field, + float boost) + throws IOException + { + Set already = new HashSet(); // avoid dups + List top = new LinkedList(); // needs to be separately listed.. + + // [1] Parse query into separate words so that when we expand we can avoid dups + TokenStream ts = a.tokenStream( field, new StringReader( query)); + org.apache.lucene.analysis.Token t; + while ( (t = ts.next()) != null) + { + String word = t.termText(); + if ( already.add( word)) + top.add( word); + } + BooleanQuery tmp = new BooleanQuery(); + + // [2] form query + Iterator it = top.iterator(); + while ( it.hasNext()) + { + // [2a] add to level words in + String word = (String) it.next(); + TermQuery tq = new TermQuery( new Term( field, word)); + tmp.add( tq, BooleanClause.Occur.SHOULD); + + // [2b] add in unique synonums + Hits hits = syns.search( new TermQuery( new Term(Syns2Index.F_WORD, word))); + for (int i = 0; i < hits.length(); i++) + { + Document doc = hits.doc(i); + String[] values = doc.getValues( Syns2Index.F_SYN); + for ( int j = 0; j < values.length; j++) + { + String syn = values[ j]; + if ( already.add( syn)) + { + tq = new TermQuery( new Term( field, syn)); + if ( boost > 0) // else keep normal 1.0 + tq.setBoost( boost); + tmp.add( tq, BooleanClause.Occur.SHOULD); + } + } + } + } + + + return tmp; + } + } 1.6 +10 -8 jakarta-lucene-sandbox/contributions/WordNet/src/java/org/apache/lucene/wordnet/Syns2Index.java Index: Syns2Index.java =================================================================== RCS file: /home/cvs/jakarta-lucene-sandbox/contributions/WordNet/src/java/org/apache/lucene/wordnet/Syns2Index.java,v retrieving revision 1.5 retrieving revision 1.6 diff -u -r1.5 -r1.6 --- Syns2Index.java 11 Jan 2005 20:13:39 -0000 1.5 +++ Syns2Index.java 11 Jan 2005 20:58:11 -0000 1.6 @@ -23,8 +23,10 @@ import 
java.util.TreeMap; /** - * Convert the prolog file wn_s.pl from the <a href="http://www.cogsci.princeton.edu/~wn/obtain.shtml">WordNet prolog download</a> - * into a Lucene index suitable for looking up synonyms and performing query expansion. + * Convert the prolog file wn_s.pl from the <a href="http://www.cogsci.princeton.edu/2.0/WNprolog-2.0.tar.gz">WordNet prolog download</a> + * into a Lucene index suitable for looking up synonyms and performing query expansion ({@link SynExpand#expand SynExpand.expand(...)}). + * + * This has been tested with WordNet 2.0. * * The index has fields named "word" ({@link #F_WORD}) * and "syn" ({@link #F_SYN}). @@ -40,8 +42,7 @@ * related meanings we don't do that here. * </p> * - * This can take 8 minutes to execute and build an index on a "fast" system and the index takes up almost 3 MB. - * If you boost the minMergeDocuments and mergeFactor of the index writer than you can get this down to under 4 minutes. + * This can take 4 minutes to execute and build an index on a "fast" system and the index takes up almost 3 MB. * * @author Dave Spencer, dave@searchmorph.com * @see <a href="http://www.cogsci.princeton.edu/~wn/">WordNet home page</a> @@ -76,7 +77,7 @@ private static final Analyzer ana = new StandardAnalyzer(); /** - * Takes optional arg of prolog file name. + * Takes arg of prolog file name and index directory. */ public static void main(String[] args) throws Throwable @@ -228,9 +229,10 @@ // override the specific index if it already exists IndexWriter writer = new IndexWriter(indexDir, ana, true); - writer.setUseCompoundFile(true); - writer.mergeFactor *= 2; - writer.minMergeDocs *= 2; + writer.setUseCompoundFile(true); // why? 
+ // blindly up these parameters for speed + writer.setMergeFactor( writer.getMergeFactor() * 2); + writer.setMaxBufferedDocs( writer.getMaxBufferedDocs() * 2); Iterator i1 = word2Nums.keySet().iterator(); while (i1.hasNext()) // for each word { 1.1 jakarta-lucene-sandbox/contributions/WordNet/src/java/org/apache/lucene/wordnet/SynExpand.java Index: SynExpand.java =================================================================== package org.apache.lucene.wordnet; import org.apache.lucene.store.*; import org.apache.lucene.search.*; import org.apache.lucene.index.*; import org.apache.lucene.document.*; import org.apache.lucene.analysis.*; import org.apache.lucene.analysis.standard.*; import java.io.*; import java.util.*; /** * Expand a query by looking up synonyms for every term. * You need to invoke {@link Syns2Index} first to build the synonym index. * * @see Syns2Index */ public final class SynExpand { /** * Test driver for synonym expansion. * Uses boost factor of 0.9 for illustrative purposes. 
* * If you pass in the query "big dog" then it prints out: * * <code><pre> * Query: big adult^0.9 bad^0.9 bighearted^0.9 boastful^0.9 boastfully^0.9 bounteous^0.9 bountiful^0.9 braggy^0.9 crowing^0.9 freehanded^0.9 giving^0.9 grown^0.9 grownup^0.9 handsome^0.9 large^0.9 liberal^0.9 magnanimous^0.9 momentous^0.9 openhanded^0.9 prominent^0.9 swelled^0.9 vainglorious^0.9 vauntingly^0.9 * dog andiron^0.9 blackguard^0.9 bounder^0.9 cad^0.9 chase^0.9 click^0.9 detent^0.9 dogtooth^0.9 firedog^0.9 frank^0.9 frankfurter^0.9 frump^0.9 heel^0.9 hotdog^0.9 hound^0.9 pawl^0.9 tag^0.9 tail^0.9 track^0.9 trail^0.9 weenie^0.9 wiener^0.9 wienerwurst^0.9 * </pre></code> */ public static void main(String[] args) throws IOException { if (args.length != 2) { System.out.println( "java org.apache.lucene.wordnet.SynExpand <index path> <query>"); } FSDirectory directory = FSDirectory.getDirectory(args[0], false); IndexSearcher searcher = new IndexSearcher(directory); String query = args[1]; String field = "contents"; Query q = expand( query, searcher, new StandardAnalyzer(), field, 0.9f); System.out.println( "Query: " + q.toString( field)); searcher.close(); directory.close(); } /** * Perform synonym expansion on a query. * * @param query user's query that is assumed to not have any "special" query syntax, thus it should be just normal words, so "big dog" makes sense, but a query like "title:foo^1.2" doesn't as this should presumably be passed directly to the default query parser. * * @param syns a searcher opened on the Lucene index you previously created with {@link Syns2Index}. The searcher is not closed or otherwise altered. 
* * @param a optional analyzer used to parse the user's query else {@link StandardAnalyzer} is used * * @param field optional field name to search in or null if you want the default of "contents" * * @param boost optional boost applied to synonyms else no boost is applied * * @return the expanded Query */ public static Query expand( String query, Searcher syns, Analyzer a, String field, float boost) throws IOException { Set already = new HashSet(); // avoid dups List top = new LinkedList(); // needs to be separately listed.. if ( field == null) field = "contents"; if ( a == null) a = new StandardAnalyzer(); // [1] Parse query into separate words so that when we expand we can avoid dups TokenStream ts = a.tokenStream( field, new StringReader( query)); org.apache.lucene.analysis.Token t; while ( (t = ts.next()) != null) { String word = t.termText(); if ( already.add( word)) top.add( word); } BooleanQuery tmp = new BooleanQuery(); // [2] form query Iterator it = top.iterator(); while ( it.hasNext()) { // [2a] add to level words in String word = (String) it.next(); TermQuery tq = new TermQuery( new Term( field, word)); tmp.add( tq, BooleanClause.Occur.SHOULD); // [2b] add in unique synonums Hits hits = syns.search( new TermQuery( new Term(Syns2Index.F_WORD, word))); for (int i = 0; i < hits.length(); i++) { Document doc = hits.doc(i); String[] values = doc.getValues( Syns2Index.F_SYN); for ( int j = 0; j < values.length; j++) { String syn = values[ j]; if ( already.add( syn)) // avoid dups of top level words and synonyms { tq = new TermQuery( new Term( field, syn)); if ( boost > 0) // else keep normal 1.0 tq.setBoost( boost); tmp.add( tq, BooleanClause.Occur.SHOULD); } } } } return tmp; } } 1.1 jakarta-lucene-sandbox/contributions/WordNet/src/java/org/apache/lucene/wordnet/package.html Index: package.html =================================================================== <html> <head> <title>WordNet Lucene Synonyms Integration</title> </head> <body> This 
package uses synonyms defined by <a href="http://www.cogsci.princeton.edu/~wn/">WordNet</a> to build a Lucene index storing them, which in turn can be used for query expansion. You normally run {@link org.apache.lucene.wordnet.Syns2Index} once to build the query index/"database", and then call {@link org.apache.lucene.wordnet.SynExpand#expand SynExpand.expand(...)} to expand a query. <p> <h3> Instructions </h3> <ol> <li> Download the <a href="http://www.cogsci.princeton.edu/2.0/WNprolog-2.0.tar.gz">WordNet prolog database</a> , gunzip, untar etc. <li> Invoke Syns2Index as appropriate to build a synonym index. It'll take 2 arguments, the path to wn_s.pl from that WordNet download, and the index name. <li> Update your UI so that as appropriate you call SynExpand.expand(...) to expand user queries with synonyms. </ol> </body> </html>
--------------------------------------------------------------------- To unsubscribe, e-mail: [EMAIL PROTECTED] For additional commands, e-mail: [EMAIL PROTECTED]