cutting 2003/12/15 15:04:42
Modified: . CHANGES.txt
src/java/org/apache/lucene/index FilterIndexReader.java
IndexReader.java SegmentReader.java
SegmentsReader.java
Added: src/test/org/apache/lucene/search TestSetNorm.java
Log:
Add new method IndexReader.setNorm(), to permit altering boosts after an index is
created.
Revision Changes Path
1.62 +7 -1 jakarta-lucene/CHANGES.txt
Index: CHANGES.txt
===================================================================
RCS file: /home/cvs/jakarta-lucene/CHANGES.txt,v
retrieving revision 1.61
retrieving revision 1.62
diff -u -r1.61 -r1.62
--- CHANGES.txt 5 Dec 2003 14:30:12 -0000 1.61
+++ CHANGES.txt 15 Dec 2003 23:04:42 -0000 1.62
@@ -7,6 +7,12 @@
1. Added catch of BooleanQuery$TooManyClauses in QueryParser to
throw ParseException instead. (Erik Hatcher)
+ 2. Fixed a NullPointerException in Query.explain(). (Doug Cutting)
+
+ 3. Added a new method IndexReader.setNorm(), that permits one to
+ alter the boosting of fields after an index is created.
+
+
1.3 RC3
1. Added minMergeDocs in IndexWriter. This can be raised to speed
1.5 +3 -0
jakarta-lucene/src/java/org/apache/lucene/index/FilterIndexReader.java
Index: FilterIndexReader.java
===================================================================
RCS file:
/home/cvs/jakarta-lucene/src/java/org/apache/lucene/index/FilterIndexReader.java,v
retrieving revision 1.4
retrieving revision 1.5
diff -u -r1.4 -r1.5
--- FilterIndexReader.java 20 Nov 2003 19:10:41 -0000 1.4
+++ FilterIndexReader.java 15 Dec 2003 23:04:42 -0000 1.5
@@ -128,6 +128,9 @@
public void undeleteAll() throws IOException { in.undeleteAll(); }
public byte[] norms(String f) throws IOException { return in.norms(f); }
+ public void setNorm(int d, String f, byte b) throws IOException {
+ in.setNorm(d,f,b);
+ }
public TermEnum terms() throws IOException { return in.terms(); }
public TermEnum terms(Term t) throws IOException { return in.terms(t); }
1.24 +26 -1 jakarta-lucene/src/java/org/apache/lucene/index/IndexReader.java
Index: IndexReader.java
===================================================================
RCS file:
/home/cvs/jakarta-lucene/src/java/org/apache/lucene/index/IndexReader.java,v
retrieving revision 1.23
retrieving revision 1.24
diff -u -r1.23 -r1.24
--- IndexReader.java 20 Nov 2003 19:10:41 -0000 1.23
+++ IndexReader.java 15 Dec 2003 23:04:42 -0000 1.24
@@ -63,6 +63,7 @@
import org.apache.lucene.store.Lock;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field; // for javadoc
+import org.apache.lucene.search.Similarity;
/** IndexReader is an abstract class, providing an interface for accessing an
index. Search of an index is done entirely through this abstract interface,
@@ -270,6 +271,30 @@
* @see Field#setBoost(float)
*/
public abstract byte[] norms(String field) throws IOException;
+
+ /** Expert: Resets the normalization factor for the named field of the named
+ * document. The norm represents the product of the field's {@link
+ * Field#setBoost(float) boost} and its {@link Similarity#lengthNorm(String,
+ * int) length normalization}. Thus, to preserve the length normalization
+ * values when resetting this, one should base the new value upon the old.
+ *
+ * @see #norms(String)
+ * @see Similarity#decodeNorm(byte)
+ */
+ public abstract void setNorm(int doc, String field, byte value)
+ throws IOException;
+
+ /** Expert: Resets the normalization factor for the named field of the named
+ * document.
+ *
+ * @see #norms(String)
+ * @see Similarity#decodeNorm(byte)
+ */
+ public void setNorm(int doc, String field, float value)
+ throws IOException {
+ setNorm(doc, field, Similarity.encodeNorm(value));
+ }
+
/** Returns an enumeration of all the terms in the index.
The enumeration is ordered by Term.compareTo(). Each term
1.17 +79 -29
jakarta-lucene/src/java/org/apache/lucene/index/SegmentReader.java
Index: SegmentReader.java
===================================================================
RCS file:
/home/cvs/jakarta-lucene/src/java/org/apache/lucene/index/SegmentReader.java,v
retrieving revision 1.16
retrieving revision 1.17
diff -u -r1.16 -r1.17
--- SegmentReader.java 20 Nov 2003 19:10:41 -0000 1.16
+++ SegmentReader.java 15 Dec 2003 23:04:42 -0000 1.17
@@ -64,6 +64,7 @@
import org.apache.lucene.document.Document;
import org.apache.lucene.store.InputStream;
+import org.apache.lucene.store.OutputStream;
import org.apache.lucene.store.Lock;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BitVector;
@@ -84,6 +85,7 @@
BitVector deletedDocs = null;
private boolean deletedDocsDirty = false;
+ private boolean normsDirty = false;
InputStream freqStream;
InputStream proxStream;
@@ -91,10 +93,25 @@
// Compound File Reader when based on a compound file segment
CompoundFileReader cfsReader;
- private static class Norm {
+ private class Norm {
public Norm(InputStream in) { this.in = in; }
- public InputStream in;
- public byte[] bytes;
+
+ private InputStream in;
+ private byte[] bytes;
+ private boolean dirty;
+
+ private void reWrite(String name) throws IOException {
+ // NOTE: norms are re-written in regular directory, not cfs
+ OutputStream out = directory().createFile(segment + ".tmp");
+ try {
+ out.writeBytes(bytes, maxDoc());
+ } finally {
+ out.close();
+ }
+ String fileName = segment + ".f" + fieldInfos.fieldNumber(name);
+ directory().renameFile(segment + ".tmp", fileName);
+ this.dirty = false;
+ }
}
private Hashtable norms = new Hashtable();
@@ -135,13 +152,29 @@
}
protected final synchronized void doClose() throws IOException {
- if (deletedDocsDirty) {
+ if (deletedDocsDirty || normsDirty) {
synchronized (directory()) { // in- & inter-process sync
new Lock.With(directory().makeLock(IndexWriter.COMMIT_LOCK_NAME),
IndexWriter.COMMIT_LOCK_TIMEOUT) {
public Object doBody() throws IOException {
- deletedDocs.write(directory(), segment + ".tmp");
- directory().renameFile(segment + ".tmp", segment + ".del");
+
+ if (deletedDocsDirty) { // re-write deleted
+ deletedDocs.write(directory(), segment + ".tmp");
+ directory().renameFile(segment + ".tmp", segment + ".del");
+ }
+
+ if (normsDirty) { // re-write norms
+ Enumeration keys = norms.keys();
+ Enumeration values = norms.elements();
+ while (values.hasMoreElements()) {
+ String field = (String)keys.nextElement();
+ Norm norm = (Norm)values.nextElement();
+ if (norm.dirty) {
+ norm.reWrite(field);
+ }
+ }
+ }
+
if(segmentInfos != null)
segmentInfos.write(directory());
else
@@ -151,6 +184,7 @@
}.run();
}
deletedDocsDirty = false;
+ normsDirty = false;
}
fieldsReader.close();
@@ -189,7 +223,7 @@
deletedDocs.set(docNum);
}
- public void undeleteAll() throws IOException {
+ public synchronized void undeleteAll() throws IOException {
synchronized (directory()) { // in- & inter-process sync
new Lock.With(directory().makeLock(IndexWriter.COMMIT_LOCK_NAME),
IndexWriter.COMMIT_LOCK_TIMEOUT) {
@@ -299,44 +333,60 @@
return fieldSet;
}
- public final byte[] norms(String field) throws IOException {
+ public synchronized byte[] norms(String field) throws IOException {
Norm norm = (Norm)norms.get(field);
- if (norm == null)
+ if (norm == null) // not an indexed field
return null;
- if (norm.bytes == null) {
+ if (norm.bytes == null) { // value not yet read
byte[] bytes = new byte[maxDoc()];
norms(field, bytes, 0);
- norm.bytes = bytes;
+ norm.bytes = bytes; // cache it
}
return norm.bytes;
}
- final void norms(String field, byte[] bytes, int offset) throws IOException {
- InputStream normStream = normStream(field);
- if (normStream == null)
+ public synchronized void setNorm(int doc, String field, byte value)
+ throws IOException {
+ Norm norm = (Norm)norms.get(field);
+ if (norm == null) // not an indexed field
+ return;
+ norm.dirty = true; // mark it dirty
+ normsDirty = true;
+
+ norms(field)[doc] = value; // set the value
+ }
+
+ /** Read norms into a pre-allocated array. */
+ synchronized void norms(String field, byte[] bytes, int offset)
+ throws IOException {
+
+ Norm norm = (Norm)norms.get(field);
+ if (norm == null)
return; // use zeros in array
- try {
+
+ if (norm.bytes != null) { // can copy from cache
+ System.arraycopy(norm.bytes, 0, bytes, offset, maxDoc());
+ return;
+ }
+
+ InputStream normStream = (InputStream)norm.in.clone();
+ try { // read from disk
+ normStream.seek(0);
normStream.readBytes(bytes, offset, maxDoc());
} finally {
normStream.close();
}
}
- final InputStream normStream(String field) throws IOException {
- Norm norm = (Norm)norms.get(field);
- if (norm == null)
- return null;
- InputStream result = (InputStream)norm.in.clone();
- result.seek(0);
- return result;
- }
-
- private final void openNorms(Directory useDir) throws IOException {
+ private final void openNorms(Directory cfsDir) throws IOException {
for (int i = 0; i < fieldInfos.size(); i++) {
FieldInfo fi = fieldInfos.fieldInfo(i);
- if (fi.isIndexed)
- norms.put(fi.name,
- new Norm(useDir.openFile(segment + ".f" + fi.number)));
+ if (fi.isIndexed) {
+ String fileName = segment + ".f" + fi.number;
+ // look first for re-written file, then in compound format
+ Directory d = directory().fileExists(fileName) ? directory() : cfsDir;
+ norms.put(fi.name, new Norm(d.openFile(fileName)));
+ }
}
}
1.17 +8 -1
jakarta-lucene/src/java/org/apache/lucene/index/SegmentsReader.java
Index: SegmentsReader.java
===================================================================
RCS file:
/home/cvs/jakarta-lucene/src/java/org/apache/lucene/index/SegmentsReader.java,v
retrieving revision 1.16
retrieving revision 1.17
diff -u -r1.16 -r1.17
--- SegmentsReader.java 20 Nov 2003 19:10:41 -0000 1.16
+++ SegmentsReader.java 15 Dec 2003 23:04:42 -0000 1.17
@@ -165,6 +165,13 @@
return bytes;
}
+ public synchronized void setNorm(int n, String field, byte value)
+ throws IOException {
+ normsCache.remove(field); // clear cache
+ int i = readerIndex(n); // find segment num
+ readers[i].setNorm(n-starts[i], field, value); // dispatch
+ }
+
public final TermEnum terms() throws IOException {
return new SegmentsTermEnum(readers, starts, null);
}
1.1
jakarta-lucene/src/test/org/apache/lucene/search/TestSetNorm.java
Index: TestSetNorm.java
===================================================================
package org.apache.lucene.search;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
import junit.framework.TestCase;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.analysis.SimpleAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
/** Unit test for {@link IndexReader#setNorm(int, String, float)}.
 *
 * <p>Indexes the same one-field document four times, overwrites the norm of
 * each copy with a strictly increasing value, and then checks that a term
 * query scores the copies in that same increasing order.
 *
 * @author Doug Cutting
 * @version $Revision: 1.1 $
 */
public class TestSetNorm extends TestCase {

  public TestSetNorm(String name) {
    super(name);
  }

  /** Verifies that norms changed through an IndexReader are seen by a
   * searcher opened afterwards on the same directory.
   *
   * @throws Exception if indexing, norm rewriting or searching fails
   */
  public void testSetNorm() throws Exception {
    RAMDirectory store = new RAMDirectory();
    IndexWriter writer = new IndexWriter(store, new SimpleAnalyzer(), true);

    // add the same document four times
    Field f1 = Field.Text("field", "word");
    Document d1 = new Document();
    d1.add(f1);
    writer.addDocument(d1);
    writer.addDocument(d1);
    writer.addDocument(d1);
    writer.addDocument(d1);
    writer.close();

    // reset the boost of each instance of this document
    IndexReader reader = IndexReader.open(store);
    reader.setNorm(0, "field", 1.0f);
    reader.setNorm(1, "field", 2.0f);
    reader.setNorm(2, "field", 4.0f);
    reader.setNorm(3, "field", 16.0f);
    // the searcher below is opened only after this close, so it reads the
    // index as the reader left it
    reader.close();

    // check that searches are ordered by this boost
    final float[] scores = new float[4];
    IndexSearcher searcher = new IndexSearcher(store);
    try {
      searcher.search
        (new TermQuery(new Term("field", "word")),
         new HitCollector() {
           public final void collect(int doc, float score) {
             scores[doc] = score;               // record score by doc number
           }
         });
    } finally {
      searcher.close();                         // release the searcher's reader
    }

    // every document must outscore the one before it; starting from zero
    // also proves that each of the four documents was actually hit
    float lastScore = 0.0f;
    for (int i = 0; i < 4; i++) {
      assertTrue("score of doc " + i + " (" + scores[i]
                 + ") should be greater than " + lastScore,
                 scores[i] > lastScore);
      lastScore = scores[i];
    }
  }
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [EMAIL PROTECTED]
For additional commands, e-mail: [EMAIL PROTECTED]