dspencer    2005/01/11 12:58:11

  Modified:    contributions/WordNet build.xml
               contributions/WordNet/src/java/org/apache/lucene/wordnet
                        SynLookup.java Syns2Index.java
  Added:       contributions/WordNet/src/java/org/apache/lucene/wordnet
                        SynExpand.java package.html
  Log:
  make sure code works with WordNet2.0 (no problem) and add Query expansion, 
and comments
  
  Revision  Changes    Path
  1.4       +21 -0     jakarta-lucene-sandbox/contributions/WordNet/build.xml
  
  Index: build.xml
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene-sandbox/contributions/WordNet/build.xml,v
  retrieving revision 1.3
  retrieving revision 1.4
  diff -u -r1.3 -r1.4
  --- build.xml 23 Feb 2004 15:23:25 -0000      1.3
  +++ build.xml 11 Jan 2005 20:58:11 -0000      1.4
  @@ -29,6 +29,7 @@
       </java>
     </target>
   
  +
     <target name="synonym" description="Find synonyms for word">
       <fail unless="synindex.exists">
         Index does not exist.
  @@ -46,6 +47,26 @@
   
         <arg file="${synindex.dir}"/>
         <arg value="${word}"/>
  +    </java>
  +  </target>
  +
  +  <target name="expand" description="Perform synonym expansion on a query">
  +    <fail unless="synindex.exists">
  +      Index does not exist.
  +    </fail>
  +
  +    <fail unless="query">
  +      Must specify 'query' property.
  +    </fail>
  +    
  +    <java classname="org.apache.lucene.wordnet.SynExpand">
  +      <classpath>
  +        <path refid="compile.classpath"/>
  +        <pathelement location="${build.classes.dir}"/>
  +      </classpath>
  +
  +      <arg file="${synindex.dir}"/>
  +      <arg value="${query}"/>
       </java>
     </target>
   
  
  
  
  1.2       +108 -39   
jakarta-lucene-sandbox/contributions/WordNet/src/java/org/apache/lucene/wordnet/SynLookup.java
  
  Index: SynLookup.java
  ===================================================================
  RCS file: 
/home/cvs/jakarta-lucene-sandbox/contributions/WordNet/src/java/org/apache/lucene/wordnet/SynLookup.java,v
  retrieving revision 1.1
  retrieving revision 1.2
  diff -u -r1.1 -r1.2
  --- SynLookup.java    26 Jan 2004 17:29:35 -0000      1.1
  +++ SynLookup.java    11 Jan 2005 20:58:11 -0000      1.2
  @@ -1,45 +1,114 @@
   package org.apache.lucene.wordnet;
   
  -import org.apache.lucene.store.FSDirectory;
  -import org.apache.lucene.search.IndexSearcher;
  -import org.apache.lucene.search.TermQuery;
  -import org.apache.lucene.search.Hits;
  -import org.apache.lucene.index.Term;
  -import org.apache.lucene.document.Document;
  -import java.io.IOException;
  +import org.apache.lucene.store.*;
  +import org.apache.lucene.search.*;
  +import org.apache.lucene.index.*;
  +import org.apache.lucene.document.*;
  +import org.apache.lucene.analysis.*;
  +import java.io.*;
  +import java.util.*;
   
  +
  +/**
  + * Test program to look up synonyms.
  + */
   public class SynLookup {
   
  -  public static void main(String[] args) throws IOException {
  -    if (args.length != 2) {
  -      System.out.println(
  -    "java org.apache.lucene.wordnet.SynLookup <index path> <word>");
  -    }
  -
  -    FSDirectory directory = FSDirectory.getDirectory(args[0], false);
  -    IndexSearcher searcher = new IndexSearcher(directory);
  -
  -    String word = args[1];
  -    Hits hits = searcher.search(
  -      new TermQuery(new Term("word", word)));
  -
  -    if (hits.length() == 0) {
  -      System.out.println("No synonyms found for " + word);
  -    } else {
  -      System.out.println("Synonyms found for \"" + word + "\":");
  -    }
  -
  -    for (int i = 0; i < hits.length(); i++) {
  -      Document doc = hits.doc(i);
  -
  -      String[] values = doc.getValues("syn");
  -
  -      for (int j = 0; j < values.length; j++) {
  -        System.out.println(values[j]);
  -      }
  -    }
  -
  -    searcher.close();
  -    directory.close();
  -  }
  +     public static void main(String[] args) throws IOException {
  +             if (args.length != 2) {
  +                     System.out.println(
  +                                                        "java 
org.apache.lucene.wordnet.SynLookup <index path> <word>");
  +             }
  +
  +             FSDirectory directory = FSDirectory.getDirectory(args[0], 
false);
  +             IndexSearcher searcher = new IndexSearcher(directory);
  +
  +             String word = args[1];
  +             Hits hits = searcher.search(
  +                                                                     new 
TermQuery(new Term(Syns2Index.F_WORD, word)));
  +
  +             if (hits.length() == 0) {
  +                     System.out.println("No synonyms found for " + word);
  +             } else {
  +                     System.out.println("Synonyms found for \"" + word + 
"\":");
  +             }
  +
  +             for (int i = 0; i < hits.length(); i++) {
  +                     Document doc = hits.doc(i);
  +
  +                     String[] values = doc.getValues(Syns2Index.F_SYN);
  +
  +                     for (int j = 0; j < values.length; j++) {
  +                             System.out.println(values[j]);
  +                     }
  +             }
  +
  +             searcher.close();
  +             directory.close();
  +     }
  +
  +
  +     /**
  +      * Perform synonym expansion on a query.
  +      *
  +      * @param query
  +      * @param syns
  +      * @param a
  +      * @param field
  +      * @param boost
  +      */ 
  +     public static Query expand( String query,
  +                                                             Searcher syns,
  +                                                             Analyzer a,
  +                                                             String field,
  +                                                             float boost)
  +             throws IOException
  +     {
  +             Set already = new HashSet(); // avoid dups              
  +             List top = new LinkedList(); // needs to be separately listed..
  +
  +             // [1] Parse query into separate words so that when we expand 
we can avoid dups
  +             TokenStream ts = a.tokenStream( field, new StringReader( 
query));
  +             org.apache.lucene.analysis.Token t;
  +             while ( (t = ts.next()) != null)
  +             {
  +                     String word = t.termText();
  +                     if ( already.add( word))
  +                             top.add( word);
  +             }
  +             BooleanQuery tmp = new BooleanQuery();
  +             
  +             // [2] form query
  +             Iterator it = top.iterator();
  +             while ( it.hasNext())
  +             {
  +                     // [2a] add to level words in
  +                     String word = (String) it.next();
  +                     TermQuery tq = new TermQuery( new Term( field, word));
  +                     tmp.add( tq, BooleanClause.Occur.SHOULD);
  +
  +                     // [2b] add in unique synonums
  +                     Hits hits = syns.search( new TermQuery( new 
Term(Syns2Index.F_WORD, word)));
  +                     for (int i = 0; i < hits.length(); i++)
  +                     {
  +                             Document doc = hits.doc(i);
  +                             String[] values = doc.getValues( 
Syns2Index.F_SYN);
  +                             for ( int j = 0; j < values.length; j++)
  +                             {
  +                                     String syn = values[ j];
  +                                     if ( already.add( syn))
  +                                     {
  +                                             tq = new TermQuery( new Term( 
field, syn));
  +                                             if ( boost > 0) // else keep 
normal 1.0
  +                                                     tq.setBoost( boost);
  +                                             tmp.add( tq, 
BooleanClause.Occur.SHOULD); 
  +                                     }
  +                             }
  +                     }
  +             }
  +
  +
  +             return tmp;
  +     }
  +                                                             
   }
  
  
  
  1.6       +10 -8     
jakarta-lucene-sandbox/contributions/WordNet/src/java/org/apache/lucene/wordnet/Syns2Index.java
  
  Index: Syns2Index.java
  ===================================================================
  RCS file: 
/home/cvs/jakarta-lucene-sandbox/contributions/WordNet/src/java/org/apache/lucene/wordnet/Syns2Index.java,v
  retrieving revision 1.5
  retrieving revision 1.6
  diff -u -r1.5 -r1.6
  --- Syns2Index.java   11 Jan 2005 20:13:39 -0000      1.5
  +++ Syns2Index.java   11 Jan 2005 20:58:11 -0000      1.6
  @@ -23,8 +23,10 @@
   import java.util.TreeMap;
   
   /**
  - * Convert the prolog file wn_s.pl from the <a 
href="http://www.cogsci.princeton.edu/~wn/obtain.shtml">WordNet prolog 
download</a>
  - * into a Lucene index suitable for looking up synonyms and performing query 
expansion.
  + * Convert the prolog file wn_s.pl from the <a 
href="http://www.cogsci.princeton.edu/2.0/WNprolog-2.0.tar.gz">WordNet prolog 
download</a>
  + * into a Lucene index suitable for looking up synonyms and performing query 
expansion ({@link SynExpand#expand SynExpand.expand(...)}).
  + *
  + * This has been tested with WordNet 2.0.
    *
    * The index has fields named "word" ({@link #F_WORD})
    * and "syn" ({@link #F_SYN}).
  @@ -40,8 +42,7 @@
    * related meanings we don't do that here.
    * </p>
    *
  - * This can take 8 minutes to execute and build an index on a "fast" system 
and the index takes up almost 3 MB.
  - * If you boost the minMergeDocuments and mergeFactor of the index writer 
than you can get this down to under 4 minutes.
  + * This can take 4 minutes to execute and build an index on a "fast" system 
and the index takes up almost 3 MB.
    *
    * @author Dave Spencer, dave&#064;searchmorph.com
    * @see <a href="http://www.cogsci.princeton.edu/~wn/">WordNet home page</a>
  @@ -76,7 +77,7 @@
       private static final Analyzer ana = new StandardAnalyzer();
   
       /**
  -     * Takes optional arg of prolog file name.
  +     * Takes arg of prolog file name and index directory.
        */
       public static void main(String[] args)
           throws Throwable
  @@ -228,9 +229,10 @@
   
           // override the specific index if it already exists
           IndexWriter writer = new IndexWriter(indexDir, ana, true);
  -        writer.setUseCompoundFile(true);
  -             writer.mergeFactor *= 2;
  -             writer.minMergeDocs *= 2;
  +        writer.setUseCompoundFile(true); // why?
  +             // blindly up these parameters for speed
  +             writer.setMergeFactor( writer.getMergeFactor() * 2);
  +             writer.setMaxBufferedDocs( writer.getMaxBufferedDocs() * 2);
           Iterator i1 = word2Nums.keySet().iterator();
           while (i1.hasNext()) // for each word
           {
  
  
  
  1.1                  
jakarta-lucene-sandbox/contributions/WordNet/src/java/org/apache/lucene/wordnet/SynExpand.java
  
  Index: SynExpand.java
  ===================================================================
  package org.apache.lucene.wordnet;
  
  import org.apache.lucene.store.*;
  import org.apache.lucene.search.*;
  import org.apache.lucene.index.*;
  import org.apache.lucene.document.*;
  import org.apache.lucene.analysis.*;
  import org.apache.lucene.analysis.standard.*;
  import java.io.*;
  import java.util.*;
  
  
  /**
   * Expand a query by looking up synonyms for every term.
   * You need to invoke [EMAIL PROTECTED] Syns2Index} first to build the 
synonym index.
   *
   * @see Syns2Index
   */
  public final class SynExpand {
  
        /**
         * Test driver for synonym expansion.
         * Uses boost factor of 0.9 for illustrative purposes.
         *
         * If you pass in the query "big dog" then it prints out:
         *
         * <code><pre>
         * Query: big adult^0.9 bad^0.9 bighearted^0.9 boastful^0.9 
boastfully^0.9 bounteous^0.9 bountiful^0.9 braggy^0.9 crowing^0.9 
freehanded^0.9 giving^0.9 grown^0.9 grownup^0.9 handsome^0.9 large^0.9 
liberal^0.9 magnanimous^0.9 momentous^0.9 openhanded^0.9 prominent^0.9 
swelled^0.9 vainglorious^0.9 vauntingly^0.9
         * dog andiron^0.9 blackguard^0.9 bounder^0.9 cad^0.9 chase^0.9 
click^0.9 detent^0.9 dogtooth^0.9 firedog^0.9 frank^0.9 frankfurter^0.9 
frump^0.9 heel^0.9 hotdog^0.9 hound^0.9 pawl^0.9 tag^0.9 tail^0.9 track^0.9 
trail^0.9 weenie^0.9 wiener^0.9 wienerwurst^0.9
         * </pre></code>
         */
        public static void main(String[] args) throws IOException
        {
                if (args.length != 2)
                {
                        System.out.println(
                                                           "java 
org.apache.lucene.wordnet.SynExpand <index path> <query>");
                }
  
                FSDirectory directory = FSDirectory.getDirectory(args[0], 
false);
                IndexSearcher searcher = new IndexSearcher(directory);
  
                String query = args[1];
                String field = "contents";
  
                Query q = expand( query, searcher, new StandardAnalyzer(), 
field, 0.9f);
                System.out.println( "Query: " + q.toString( field));
  
  
  
                searcher.close();
                directory.close();
        }
  
  
        /**
         * Perform synonym expansion on a query.
         *
         * @param query users query that is assumed to not have any "special" 
query syntax, thus it should be just normal words, so "big dog" makes sense, 
but a query like "title:foo^1.2" doesn't as this should presumably be passed 
directly to the default query parser.
         *
         * @param syns a opened to the Lucene index you previously created with 
[EMAIL PROTECTED] Syns2Index}. The searcher is not closed or otherwise altered.
         *
         * @param a optional analyzer used to parse the users query else [EMAIL 
PROTECTED] StandardAnalzyer} is used
         *
         * @param field optional field name to search in or null if you want 
the default of "contents"
         *
         * @param boost optional boost applied to synonyms else no boost is 
applied
         *
         * @return the expanded Query
         */ 
        public static Query expand( String query,
                                                                Searcher syns,
                                                                Analyzer a,
                                                                String field,
                                                                float boost)
                throws IOException
        {
                Set already = new HashSet(); // avoid dups 
                List top = new LinkedList(); // needs to be separately listed..
                if ( field == null) field = "contents";
                if ( a == null) a = new StandardAnalyzer();
  
                // [1] Parse query into separate words so that when we expand 
we can avoid dups
                TokenStream ts = a.tokenStream( field, new StringReader( 
query));
                org.apache.lucene.analysis.Token t;
                while ( (t = ts.next()) != null)
                {
                        String word = t.termText();
                        if ( already.add( word))
                                top.add( word);
                }
                BooleanQuery tmp = new BooleanQuery();
                
                // [2] form query
                Iterator it = top.iterator();
                while ( it.hasNext())
                {
                        // [2a] add to level words in
                        String word = (String) it.next();
                        TermQuery tq = new TermQuery( new Term( field, word));
                        tmp.add( tq, BooleanClause.Occur.SHOULD);
  
                        // [2b] add in unique synonums
                        Hits hits = syns.search( new TermQuery( new 
Term(Syns2Index.F_WORD, word)));
                        for (int i = 0; i < hits.length(); i++)
                        {
                                Document doc = hits.doc(i);
                                String[] values = doc.getValues( 
Syns2Index.F_SYN);
                                for ( int j = 0; j < values.length; j++)
                                {
                                        String syn = values[ j];
                                        if ( already.add( syn)) // avoid dups 
of top level words and synonyms
                                        {
                                                tq = new TermQuery( new Term( 
field, syn));
                                                if ( boost > 0) // else keep 
normal 1.0
                                                        tq.setBoost( boost);
                                                tmp.add( tq, 
BooleanClause.Occur.SHOULD); 
                                        }
                                }
                        }
                }
  
  
                return tmp;
        }
                                                                
  }
  
  
  
  1.1                  
jakarta-lucene-sandbox/contributions/WordNet/src/java/org/apache/lucene/wordnet/package.html
  
  Index: package.html
  ===================================================================
  <html>
      <head>
  <title>WordNet Lucene Synonyms Integration</title>
  </head>
  <body>
  
      This package uses synonyms defined by <a 
href="http://www.cogsci.princeton.edu/~wn/">WordNet</a> to build a
      Lucene index storing them, which in turn can be used for query expansion.
  
      You normally run {@link org.apache.lucene.wordnet.Syns2Index} 
once to build the query index/"database", and then call
      {@link org.apache.lucene.wordnet.SynExpand#expand 
SynExpand.expand(...)} to expand a query.
  
      <p>
  
        <h3> Instructions </h3>
        <ol>
            <li> Download the <a 
href="http://www.cogsci.princeton.edu/2.0/WNprolog-2.0.tar.gz">WordNet prolog 
database</a> , gunzip, untar etc.
        <li> Invoke Syns2Index as appropriate to build a synonym index.
            It'll take 2 arguments, the path to wn_s.pl from that WordNet 
download, and the index name.
     
         <li> Update your UI so that as appropriate you call 
SynExpand.expand(...) to expand user queries with synonyms.
         </ol>
  
  </body>
      </html>
  
  

---------------------------------------------------------------------
To unsubscribe, e-mail: [EMAIL PROTECTED]
For additional commands, e-mail: [EMAIL PROTECTED]

Reply via email to