Re: Revival of Dmitry's Term Vector patches

Damian Gajda Tue, 09 Dec 2003 12:45:44 -0800

Hello Otis,

Here is a patch with documentation from Dmitry.


I used
cvs diff -uN

Hope it is OK now.

-- 
Damian

Index: src/java/org/apache/lucene/document/Field.java
===================================================================
RCS file: /home/cvspublic/jakarta-lucene/src/java/org/apache/lucene/document/Field.java,v
retrieving revision 1.11
diff -u -r1.11 Field.java
--- src/java/org/apache/lucene/document/Field.java	20 Mar 2003 18:28:13 -0000	1.11
+++ src/java/org/apache/lucene/document/Field.java	9 Dec 2003 19:39:05 -0000
@@ -162,6 +162,8 @@
     is used.  Exactly one of stringValue() and readerValue() must be set. */
   public Reader readerValue()	{ return readerValue; }
 
+  /** Create a field by specifying all parameters.
+   */
   public Field(String name, String string,
 	       boolean store, boolean index, boolean token) {
     if (name == null)
Index: src/java/org/apache/lucene/index/FieldInfos.java
===================================================================
RCS file: /home/cvspublic/jakarta-lucene/src/java/org/apache/lucene/index/FieldInfos.java,v
retrieving revision 1.4
diff -u -r1.4 FieldInfos.java
--- src/java/org/apache/lucene/index/FieldInfos.java	21 Oct 2003 17:59:16 -0000	1.4
+++ src/java/org/apache/lucene/index/FieldInfos.java	9 Dec 2003 19:39:05 -0000
@@ -68,6 +68,12 @@
 import org.apache.lucene.store.OutputStream;
 import org.apache.lucene.store.InputStream;
 
+/** Access to the Field Info file that describes document fields and whether or
+ *  not they are indexed. Each segment has a separate Field Info file. Objects
+ *  of this class are thread-safe for multiple readers, but only one thread can
+ *  be adding documents at a time, with no other reader or writer threads
+ *  accessing this object.
+ */
 final class FieldInfos {
   private Vector byNumber = new Vector();
   private Hashtable byName = new Hashtable();
@@ -94,6 +100,10 @@
     }
   }
 
+  /** Adds field information for each field name in the
+   *  <code>names</code> collection, using the given
+   *  <code>isIndexed</code> flag for every one of them.
+   */
   final void add(Collection names, boolean isIndexed) {
     Iterator i = names.iterator();
     while (i.hasNext()) {
@@ -101,6 +111,10 @@
     }
   }
 
+  /** If the field is not yet known, adds it. If it is known, checks
+	*  to make sure that the isIndexed flag is the same as was given
+	*  previously for this field. If not - throws IllegalStateException.
+	*/
   final void add(String name, boolean isIndexed) {
     FieldInfo fi = fieldInfo(name);
     if (fi == null)
Index: src/java/org/apache/lucene/index/SegmentMergeInfo.java
===================================================================
RCS file: /home/cvspublic/jakarta-lucene/src/java/org/apache/lucene/index/SegmentMergeInfo.java,v
retrieving revision 1.2
diff -u -r1.2 SegmentMergeInfo.java
--- src/java/org/apache/lucene/index/SegmentMergeInfo.java	21 Oct 2003 17:59:16 -0000	1.2
+++ src/java/org/apache/lucene/index/SegmentMergeInfo.java	9 Dec 2003 19:39:06 -0000
@@ -57,14 +57,38 @@
 import java.io.IOException;
 import org.apache.lucene.util.BitVector;
 
+/** Data container to work with SegmentMergeQueue. Represents a single segment
+ *  to be merged. Maintains the segment reader, TermEnum, and TermPositions
+ *  for this segment.
+ */
 final class SegmentMergeInfo {
+  /** The current term of this segment, or null if none. */
   Term term;
+
+  /** Index of the 0th document from this segment in the merged document numbering. */
   int base;
+
+  /** This segment's term enum. Do not use directly. */
   TermEnum termEnum;
+
+  /** This segment's reader. Do not use directly. */
   IndexReader reader;
+
+  /** Postings for the current term. */
   TermPositions postings;
+
+
+  /** Maps around deleted docs. Contains a slot for each document in the
+   *  reader. Slots corresponding to deleted docs have the value of -1. The
+   *  rest have their new document numbers that start at 0. This value
+   *  added to <code>base</code> is the document number in the merged numbering.
+   */
   int[] docMap = null;				  // maps around deleted docs
 
+  /** Create a new merge info. Base <code>b</code> is a starting
+   *  number for documents from this segment in the merged document
+   *  numbering.
+   */
   SegmentMergeInfo(int b, TermEnum te, IndexReader r)
     throws IOException {
     base = b;
@@ -87,6 +111,12 @@
     }
   }
 
+
+  /** Shift to the next term on this segment's TermEnum. The new
+   *  term becomes the current term for this segment, affecting the
+   *  ordering of the SegmentMergeQueue. If no more terms remain
+   *  in this segment, returns false and resets the current term to null.
+   */
   final boolean next() throws IOException {
     if (termEnum.next()) {
       term = termEnum.term();
Index: src/java/org/apache/lucene/index/SegmentMergeQueue.java
===================================================================
RCS file: /home/cvspublic/jakarta-lucene/src/java/org/apache/lucene/index/SegmentMergeQueue.java,v
retrieving revision 1.1.1.1
diff -u -r1.1.1.1 SegmentMergeQueue.java
--- src/java/org/apache/lucene/index/SegmentMergeQueue.java	18 Sep 2001 16:29:53 -0000	1.1.1.1
+++ src/java/org/apache/lucene/index/SegmentMergeQueue.java	9 Dec 2003 19:39:06 -0000
@@ -57,6 +57,10 @@
 import java.io.IOException;
 import org.apache.lucene.util.PriorityQueue;
 
+/** Priority queue of SegmentMergeInfo objects. The queue sorts the
+ *  info objects by their current term, and if the terms are equal,
+ *  by their base offset.
+ */
 final class SegmentMergeQueue extends PriorityQueue {
   SegmentMergeQueue(int size) {
     initialize(size);
Index: src/java/org/apache/lucene/index/SegmentMerger.java
===================================================================
RCS file: /home/cvspublic/jakarta-lucene/src/java/org/apache/lucene/index/SegmentMerger.java,v
retrieving revision 1.6
diff -u -r1.6 SegmentMerger.java
--- src/java/org/apache/lucene/index/SegmentMerger.java	31 Oct 2003 09:28:44 -0000	1.6
+++ src/java/org/apache/lucene/index/SegmentMerger.java	9 Dec 2003 19:39:07 -0000
@@ -77,20 +77,33 @@
     "fnm", "frq", "prx", "fdx", "fdt", "tii", "tis"
   };
   
+  /** Create a segment merger that will merge a number of segments (specified
+   *  as SegmentReaders added to this object with calls to <code>add</code>) into a
+   *  single segment with the specified <code>name</code>.
+   */
   SegmentMerger(Directory dir, String name, boolean compoundFile) {
     directory = dir;
     segment = name;
     useCompoundFile = compoundFile;
   }
 
+  /** Add segment reader to be merged.
+   *
+   */
   final void add(IndexReader reader) {
     readers.addElement(reader);
   }
 
+  /** Return one of the segment readers being merged.
+   *
+   */
   final IndexReader segmentReader(int i) {
     return (IndexReader)readers.elementAt(i);
   }
 
+  /** Start the merge. All segment readers to be merged must have been added
+   *  prior to this call.
+   */
   final int merge() throws IOException {
     int value;
     try {
@@ -148,6 +161,9 @@
   }
   
   
+  /** Merge the field information from the segment readers.
+   *  Called from <code>merge</code>.
+   */
   private final int mergeFields() throws IOException {
     fieldInfos = new FieldInfos();		  // merge field names
     int docCount = 0;
@@ -181,6 +197,9 @@
   private TermInfosWriter termInfosWriter = null;
   private SegmentMergeQueue queue = null;
 
+  /** Merge the term index, frequency and proximity information
+   *  from specified segment readers. Called from <code>merge</code>.
+   */
   private final void mergeTerms() throws IOException {
     try {
       freqOutput = directory.createFile(segment + ".frq");
@@ -198,7 +217,11 @@
     }
   }
 
+  /** Merge the term index information. Called from <code>mergeTerms</code>.
+   */
   private final void mergeTermInfos() throws IOException {
+	// Create and populate a priority queue of segments to be merged.
+	// Segments are sorted by their top term and the base doc number in the merged segment.
     queue = new SegmentMergeQueue(readers.size());
     int base = 0;
     for (int i = 0; i < readers.size(); i++) {
@@ -220,13 +243,19 @@
       Term term = match[0].term;
       SegmentMergeInfo top = (SegmentMergeInfo)queue.top();
       
+      // pop off the queue and put into match[] all segments
+      // that have the same term at the top
       while (top != null && term.compareTo(top.term) == 0) {
         match[matchSize++] = (SegmentMergeInfo)queue.pop();
         top = (SegmentMergeInfo)queue.top();
       }
 
+      // perform the merge for all segments that are positioned on
+      // the same term
       mergeTermInfo(match, matchSize);		  // add new TermInfo
       
+      // advance the matched segments to the next term and, if one exists, put
+      // the segment back onto the queue (priority queue takes care of sorting them)
       while (matchSize > 0) {
         SegmentMergeInfo smi = match[--matchSize];
         if (smi.next())
@@ -239,6 +268,14 @@
 
   private final TermInfo termInfo = new TermInfo(); // minimize consing
 
+
+  /** Merge one term found in one or more segments. The array <code>smis</code>
+   *  contains segments that are positioned at the same term. <code>n</code>
+   *  is the number of cells in the array actually occupied.
+   *
+   * @param smis array of segments
+   * @param n number of cells in the array actually occupied
+   */
   private final void mergeTermInfo(SegmentMergeInfo[] smis, int n)
        throws IOException {
     long freqPointer = freqOutput.getFilePointer();
@@ -253,6 +290,14 @@
     }
   }
 
+  /** Process postings from multiple segments all positioned on the
+   *  same term. Writes out merged entries into the freqOutput and
+   *  proxOutput streams.
+   *
+   * @param smis array of segments
+   * @param n number of cells in the array actually occupied
+   * @return number of documents across all segments where this term was found
+   */
   private final int appendPostings(SegmentMergeInfo[] smis, int n)
        throws IOException {
     int lastDoc = 0;
@@ -295,6 +340,10 @@
     }
     return df;
   }
+
+  /** Merge field normalization factors for the specified segment readers.
+   *  Called from <code>merge</code>.
+   */
   private final void mergeNorms() throws IOException {
     for (int i = 0; i < fieldInfos.size(); i++) {
       FieldInfo fi = fieldInfos.fieldInfo(i);

---------------------------------------------------------------------
To unsubscribe, e-mail: [EMAIL PROTECTED]
For additional commands, e-mail: [EMAIL PROTECTED]

Re: Revival of Dmitry's Term Vector patches

Reply via email to