search TestSetNorm.java

cutting Mon, 15 Dec 2003 16:07:24 -0800

cutting     2003/12/15 15:04:42

  Modified:    .        CHANGES.txt
               src/java/org/apache/lucene/index FilterIndexReader.java
                        IndexReader.java SegmentReader.java
                        SegmentsReader.java
  Added:       src/test/org/apache/lucene/search TestSetNorm.java
  Log:
  Add new method IndexReader.setNorm(), to permit altering boosts after an index is 
created.
  
  Revision  Changes    Path
  1.62      +7 -1      jakarta-lucene/CHANGES.txt
  
  Index: CHANGES.txt
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene/CHANGES.txt,v
  retrieving revision 1.61
  retrieving revision 1.62
  diff -u -r1.61 -r1.62
  --- CHANGES.txt       5 Dec 2003 14:30:12 -0000       1.61
  +++ CHANGES.txt       15 Dec 2003 23:04:42 -0000      1.62
  @@ -7,6 +7,12 @@
    1. Added catch of BooleanQuery$TooManyClauses in QueryParser to
       throw ParseException instead. (Erik Hatcher)
   
  + 2. Fixed a NullPointerException in Query.explain(). (Doug Cutting)
  +
  + 3. Added a new method IndexReader.setNorm(), that permits one to
  +    alter the boosting of fields after an index is created.
  +
  +
   1.3 RC3
   
    1. Added minMergeDocs in IndexWriter.  This can be raised to speed
  
  
  
  1.5       +3 -0      
jakarta-lucene/src/java/org/apache/lucene/index/FilterIndexReader.java
  
  Index: FilterIndexReader.java
  ===================================================================
  RCS file: 
/home/cvs/jakarta-lucene/src/java/org/apache/lucene/index/FilterIndexReader.java,v
  retrieving revision 1.4
  retrieving revision 1.5
  diff -u -r1.4 -r1.5
  --- FilterIndexReader.java    20 Nov 2003 19:10:41 -0000      1.4
  +++ FilterIndexReader.java    15 Dec 2003 23:04:42 -0000      1.5
  @@ -128,6 +128,9 @@
     public void undeleteAll() throws IOException { in.undeleteAll(); }
   
     public byte[] norms(String f) throws IOException { return in.norms(f); }
  +  public void setNorm(int d, String f, byte b) throws IOException {
  +    in.setNorm(d,f,b);
  +  }
   
     public TermEnum terms() throws IOException { return in.terms(); }
     public TermEnum terms(Term t) throws IOException { return in.terms(t); }
  
  
  
  1.24      +26 -1     jakarta-lucene/src/java/org/apache/lucene/index/IndexReader.java
  
  Index: IndexReader.java
  ===================================================================
  RCS file: 
/home/cvs/jakarta-lucene/src/java/org/apache/lucene/index/IndexReader.java,v
  retrieving revision 1.23
  retrieving revision 1.24
  diff -u -r1.23 -r1.24
  --- IndexReader.java  20 Nov 2003 19:10:41 -0000      1.23
  +++ IndexReader.java  15 Dec 2003 23:04:42 -0000      1.24
  @@ -63,6 +63,7 @@
   import org.apache.lucene.store.Lock;
   import org.apache.lucene.document.Document;
   import org.apache.lucene.document.Field;          // for javadoc
  +import org.apache.lucene.search.Similarity;
   
   /** IndexReader is an abstract class, providing an interface for accessing an
     index.  Search of an index is done entirely through this abstract interface,
  @@ -270,6 +271,30 @@
      * @see Field#setBoost(float)
      */
     public abstract byte[] norms(String field) throws IOException;
  +
  +  /** Expert: Resets the normalization factor for the named field of the named
  +   * document.  The norm represents the product of the field's {@link
  +   * Field#setBoost(float) boost} and its {@link Similarity#lengthNorm(String,
  +   * int) length normalization}.  Thus, to preserve the length normalization
  +   * values when resetting this, one should base the new value upon the old.
  +   *
  +   * @see #norms(String)
  +   * @see Similarity#decodeNorm(byte)
  +   */
  +  public abstract void setNorm(int doc, String field, byte value)
  +    throws IOException;
  +
  +  /** Expert: Resets the normalization factor for the named field of the named
  +   * document.
  +   *
  +   * @see #norms(String)
  +   * @see Similarity#decodeNorm(byte)
  +   */
  +  public void setNorm(int doc, String field, float value)
  +    throws IOException {
  +    setNorm(doc, field, Similarity.encodeNorm(value));
  +  }
  +
   
     /** Returns an enumeration of all the terms in the index.
       The enumeration is ordered by Term.compareTo().  Each term
  
  
  
  1.17      +79 -29    
jakarta-lucene/src/java/org/apache/lucene/index/SegmentReader.java
  
  Index: SegmentReader.java
  ===================================================================
  RCS file: 
/home/cvs/jakarta-lucene/src/java/org/apache/lucene/index/SegmentReader.java,v
  retrieving revision 1.16
  retrieving revision 1.17
  diff -u -r1.16 -r1.17
  --- SegmentReader.java        20 Nov 2003 19:10:41 -0000      1.16
  +++ SegmentReader.java        15 Dec 2003 23:04:42 -0000      1.17
  @@ -64,6 +64,7 @@
   
   import org.apache.lucene.document.Document;
   import org.apache.lucene.store.InputStream;
  +import org.apache.lucene.store.OutputStream;
   import org.apache.lucene.store.Lock;
   import org.apache.lucene.store.Directory;
   import org.apache.lucene.util.BitVector;
  @@ -84,6 +85,7 @@
   
     BitVector deletedDocs = null;
     private boolean deletedDocsDirty = false;
  +  private boolean normsDirty = false;
   
     InputStream freqStream;
     InputStream proxStream;
  @@ -91,10 +93,25 @@
     // Compound File Reader when based on a compound file segment
     CompoundFileReader cfsReader;
   
  -  private static class Norm {
  +  private class Norm {
       public Norm(InputStream in) { this.in = in; }
  -    public InputStream in;
  -    public byte[] bytes;
  +
  +    private InputStream in;
  +    private byte[] bytes;
  +    private boolean dirty;
  +
  +    private void reWrite(String name) throws IOException {
  +      // NOTE: norms are re-written in regular directory, not cfs
  +      OutputStream out = directory().createFile(segment + ".tmp");
  +      try {
  +        out.writeBytes(bytes, maxDoc());
  +      } finally {
  +        out.close();
  +      }
  +      String fileName = segment + ".f" + fieldInfos.fieldNumber(name);
  +      directory().renameFile(segment + ".tmp",  fileName);
  +      this.dirty = false;
  +    }
     }
     private Hashtable norms = new Hashtable();
   
  @@ -135,13 +152,29 @@
     }
   
     protected final synchronized void doClose() throws IOException {
  -    if (deletedDocsDirty) {
  +    if (deletedDocsDirty || normsDirty) {
         synchronized (directory()) {             // in- & inter-process sync
           new Lock.With(directory().makeLock(IndexWriter.COMMIT_LOCK_NAME),
             IndexWriter.COMMIT_LOCK_TIMEOUT) {
             public Object doBody() throws IOException {
  -            deletedDocs.write(directory(), segment + ".tmp");
  -            directory().renameFile(segment + ".tmp", segment + ".del");
  +
  +            if (deletedDocsDirty) {               // re-write deleted 
  +              deletedDocs.write(directory(), segment + ".tmp");
  +              directory().renameFile(segment + ".tmp", segment + ".del");
  +            }
  +
  +            if (normsDirty) {               // re-write norms 
  +              Enumeration keys  = norms.keys();
  +              Enumeration values  = norms.elements();
  +              while (values.hasMoreElements()) {
  +                String field = (String)keys.nextElement();
  +                Norm norm = (Norm)values.nextElement();
  +                if (norm.dirty) {
  +                  norm.reWrite(field);
  +                }
  +              }
  +            }
  +
               if(segmentInfos != null)
                 segmentInfos.write(directory());
               else
  @@ -151,6 +184,7 @@
           }.run();
         }
         deletedDocsDirty = false;
  +      normsDirty = false;
       }
   
       fieldsReader.close();
  @@ -189,7 +223,7 @@
       deletedDocs.set(docNum);
     }
   
  -  public void undeleteAll() throws IOException {
  +  public synchronized void undeleteAll() throws IOException {
       synchronized (directory()) {               // in- & inter-process sync
         new Lock.With(directory().makeLock(IndexWriter.COMMIT_LOCK_NAME),
                       IndexWriter.COMMIT_LOCK_TIMEOUT) {
  @@ -299,44 +333,60 @@
         return fieldSet;
       }
   
  -  public final byte[] norms(String field) throws IOException {
  +  public synchronized byte[] norms(String field) throws IOException {
       Norm norm = (Norm)norms.get(field);
  -    if (norm == null)
  +    if (norm == null)                             // not an indexed field
         return null;
  -    if (norm.bytes == null) {
  +    if (norm.bytes == null) {                     // value not yet read
         byte[] bytes = new byte[maxDoc()];
         norms(field, bytes, 0);
  -      norm.bytes = bytes;
  +      norm.bytes = bytes;                         // cache it
       }
       return norm.bytes;
     }
   
  -  final void norms(String field, byte[] bytes, int offset) throws IOException {
  -    InputStream normStream = normStream(field);
  -    if (normStream == null)
  +  public synchronized void setNorm(int doc, String field, byte value)
  +    throws IOException {
  +    Norm norm = (Norm)norms.get(field);
  +    if (norm == null)                             // not an indexed field
  +      return;
  +    norm.dirty = true;                            // mark it dirty
  +    normsDirty = true;
  +
  +    norms(field)[doc] = value;                    // set the value
  +  }
  +
  +  /** Read norms into a pre-allocated array. */
  +  synchronized void norms(String field, byte[] bytes, int offset)
  +    throws IOException {
  +
  +    Norm norm = (Norm)norms.get(field);
  +    if (norm == null)
         return;                                          // use zeros in array
  -    try {
  +
  +    if (norm.bytes != null) {                     // can copy from cache
  +      System.arraycopy(norm.bytes, 0, bytes, offset, maxDoc());
  +      return;
  +    }
  +
  +    InputStream normStream = (InputStream)norm.in.clone();
  +    try {                                         // read from disk
  +      normStream.seek(0);
         normStream.readBytes(bytes, offset, maxDoc());
       } finally {
         normStream.close();
       }
     }
   
  -  final InputStream normStream(String field) throws IOException {
  -    Norm norm = (Norm)norms.get(field);
  -    if (norm == null)
  -      return null;
  -    InputStream result = (InputStream)norm.in.clone();
  -    result.seek(0);
  -    return result;
  -  }
  -
  -  private final void openNorms(Directory useDir) throws IOException {
  +  private final void openNorms(Directory cfsDir) throws IOException {
       for (int i = 0; i < fieldInfos.size(); i++) {
         FieldInfo fi = fieldInfos.fieldInfo(i);
  -      if (fi.isIndexed)
  -        norms.put(fi.name,
  -                  new Norm(useDir.openFile(segment + ".f" + fi.number)));
  +      if (fi.isIndexed) {
  +        String fileName = segment + ".f" + fi.number;
  +        // look first for re-written file, then in compound format
  +        Directory d = directory().fileExists(fileName) ? directory() : cfsDir;
  +        norms.put(fi.name, new Norm(d.openFile(fileName)));
  +      }
       }
     }
   
  
  
  
  1.17      +8 -1      
jakarta-lucene/src/java/org/apache/lucene/index/SegmentsReader.java
  
  Index: SegmentsReader.java
  ===================================================================
  RCS file: 
/home/cvs/jakarta-lucene/src/java/org/apache/lucene/index/SegmentsReader.java,v
  retrieving revision 1.16
  retrieving revision 1.17
  diff -u -r1.16 -r1.17
  --- SegmentsReader.java       20 Nov 2003 19:10:41 -0000      1.16
  +++ SegmentsReader.java       15 Dec 2003 23:04:42 -0000      1.17
  @@ -165,6 +165,13 @@
       return bytes;
     }
   
  +  public synchronized void setNorm(int n, String field, byte value)
  +    throws IOException {
  +    normsCache.remove(field);                     // clear cache
  +    int i = readerIndex(n);                    // find segment num
  +    readers[i].setNorm(n-starts[i], field, value); // dispatch
  +  }
  +
     public final TermEnum terms() throws IOException {
       return new SegmentsTermEnum(readers, starts, null);
     }
  
  
  
  1.1                  
jakarta-lucene/src/test/org/apache/lucene/search/TestSetNorm.java
  
  Index: TestSetNorm.java
  ===================================================================
  package org.apache.lucene.search;
  
  /* ====================================================================
   * The Apache Software License, Version 1.1
   *
   * Copyright (c) 2001 The Apache Software Foundation.  All rights
   * reserved.
   *
   * Redistribution and use in source and binary forms, with or without
   * modification, are permitted provided that the following conditions
   * are met:
   *
   * 1. Redistributions of source code must retain the above copyright
   *    notice, this list of conditions and the following disclaimer.
   *
   * 2. Redistributions in binary form must reproduce the above copyright
   *    notice, this list of conditions and the following disclaimer in
   *    the documentation and/or other materials provided with the
   *    distribution.
   *
   * 3. The end-user documentation included with the redistribution,
   *    if any, must include the following acknowledgment:
   *       "This product includes software developed by the
   *        Apache Software Foundation (http://www.apache.org/)."
   *    Alternately, this acknowledgment may appear in the software itself,
   *    if and wherever such third-party acknowledgments normally appear.
   *
   * 4. The names "Apache" and "Apache Software Foundation" and
   *    "Apache Lucene" must not be used to endorse or promote products
   *    derived from this software without prior written permission. For
   *    written permission, please contact [EMAIL PROTECTED]
   *
   * 5. Products derived from this software may not be called "Apache",
   *    "Apache Lucene", nor may "Apache" appear in their name, without
   *    prior written permission of the Apache Software Foundation.
   *
   * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
   * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
   * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
   * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
   * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
   * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
   * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
   * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
   * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   * SUCH DAMAGE.
   * ====================================================================
   *
   * This software consists of voluntary contributions made by many
   * individuals on behalf of the Apache Software Foundation.  For more
   * information on the Apache Software Foundation, please see
   * <http://www.apache.org/>.
   */
  
  import junit.framework.TestCase;
  
  import org.apache.lucene.index.Term;
  import org.apache.lucene.index.IndexWriter;
  import org.apache.lucene.index.IndexReader;
  import org.apache.lucene.search.IndexSearcher;
  import org.apache.lucene.store.RAMDirectory;
  import org.apache.lucene.analysis.SimpleAnalyzer;
  import org.apache.lucene.document.Document;
  import org.apache.lucene.document.Field;
  
  /** Unit test for {@link IndexReader#setNorm(int, String, float)}.
   *
   * <p>Indexes four identical documents, overwrites the norm of the single
   * indexed field of each one through an IndexReader, and then checks that a
   * term query scores the documents in the order implied by the new norms.
   *
   * @author Doug Cutting
   * @version $Revision: 1.1 $
   */
  public class TestSetNorm extends TestCase {
    public TestSetNorm(String name) {
      super(name);
    }

    /** Verifies that norms changed via setNorm() are visible to a searcher
     * opened after the modifying reader has been closed.
     *
     * @throws Exception if indexing, norm modification or searching fails
     */
    public void testSetNorm() throws Exception {
      RAMDirectory store = new RAMDirectory();
      IndexWriter writer = new IndexWriter(store, new SimpleAnalyzer(), true);

      // add the same document four times, so that only the norms can make
      // the scores of the four hits differ
      Field f1 = Field.Text("field", "word");
      Document d1 = new Document();
      d1.add(f1);
      writer.addDocument(d1);
      writer.addDocument(d1);
      writer.addDocument(d1);
      writer.addDocument(d1);
      writer.close();

      // reset the norm of each instance of this document to a strictly
      // increasing value
      IndexReader reader = IndexReader.open(store);
      reader.setNorm(0, "field", 1.0f);
      reader.setNorm(1, "field", 2.0f);
      reader.setNorm(2, "field", 4.0f);
      reader.setNorm(3, "field", 16.0f);
      reader.close();                     // must close before searching

      // check that searches are ordered by the new norms
      final float[] scores = new float[4];

      IndexSearcher searcher = new IndexSearcher(store);
      try {
        searcher.search
          (new TermQuery(new Term("field", "word")),
           new HitCollector() {
             public final void collect(int doc, float score) {
               scores[doc] = score;       // doc ids are 0..3 here
             }
           });
      } finally {
        searcher.close();                 // release the underlying reader
      }

      // every document must score strictly higher than its predecessor; the
      // initial 0.0f also catches a document that was never collected
      float lastScore = 0.0f;

      for (int i = 0; i < 4; i++) {
        assertTrue("score of doc " + i + " (" + scores[i]
                   + ") should exceed " + lastScore,
                   scores[i] > lastScore);
        lastScore = scores[i];
      }
    }
  }


---------------------------------------------------------------------
To unsubscribe, e-mail: [EMAIL PROTECTED]
For additional commands, e-mail: [EMAIL PROTECTED]

cvs commit: jakarta-lucene/src/test/org/apache/lucene/search TestSetNorm.java

Reply via email to