cutting     2003/12/22 13:40:18

  Modified:    src/java/org/apache/lucene/index DocumentWriter.java
  Log:
  Distinguish between positions and length when indexing a field.  The
  length is now defined as the total number of tokens, not the final
  position.  Length is used for score normalization
  (Similarity.lengthNorm()) and for controlling memory usage
  (IndexWriter.maxFieldLength).  In both cases the total number of
  tokens is more reasonable than the final position.  Position is used
  in phrase searching (see PhraseQuery and Token.setPositionIncrement()).
  
  Revision  Changes    Path
  1.7       +10 -5     jakarta-lucene/src/java/org/apache/lucene/index/DocumentWriter.java
  
  Index: DocumentWriter.java
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/index/DocumentWriter.java,v
  retrieving revision 1.6
  retrieving revision 1.7
  diff -u -r1.6 -r1.7
  --- DocumentWriter.java       20 Sep 2003 17:42:40 -0000      1.6
  +++ DocumentWriter.java       22 Dec 2003 21:40:18 -0000      1.7
  @@ -103,7 +103,8 @@
   
       // invert doc into postingTable
       postingTable.clear();                      // clear postingTable
  -    fieldLengths = new int[fieldInfos.size()];         // init fieldLengths
  +    fieldLengths = new int[fieldInfos.size()];    // init fieldLengths
  +    fieldPositions = new int[fieldInfos.size()];  // init fieldPositions
   
       fieldBoosts = new float[fieldInfos.size()];        // init fieldBoosts
       Arrays.fill(fieldBoosts, doc.getBoost());
  @@ -138,6 +139,7 @@
     // Used to buffer a document before it is written to the index.
     private final Hashtable postingTable = new Hashtable();
     private int[] fieldLengths;
  +  private int[] fieldPositions;
     private float[] fieldBoosts;
   
     // Tokenizes the fields of a document into Postings.
  @@ -149,11 +151,13 @@
         String fieldName = field.name();
         int fieldNumber = fieldInfos.fieldNumber(fieldName);
   
  -      int position = fieldLengths[fieldNumber];        // position in field
  +      int length = fieldLengths[fieldNumber];     // length of field
  +      int position = fieldPositions[fieldNumber]; // position in field
   
         if (field.isIndexed()) {
           if (!field.isTokenized()) {            // un-tokenized field
             addPosition(fieldName, field.stringValue(), position++);
  +          length++;
           } else {
             Reader reader;                       // find or make Reader
             if (field.readerValue() != null)
  @@ -170,14 +174,15 @@
               for (Token t = stream.next(); t != null; t = stream.next()) {
                 position += (t.getPositionIncrement() - 1);
                 addPosition(fieldName, t.termText(), position++);
  -              if (position > maxFieldLength) break;
  +              if (++length > maxFieldLength) break;
               }
             } finally {
               stream.close();
             }
           }
   
  -        fieldLengths[fieldNumber] = position;          // save field length
  +        fieldLengths[fieldNumber] = length;    // save field length
  +        fieldPositions[fieldNumber] = position;        // save field position
           fieldBoosts[fieldNumber] *= field.getBoost();
         }
       }
  @@ -321,7 +326,7 @@
         if (field.isIndexed()) {
           int n = fieldInfos.fieldNumber(field.name());
           float norm =
  -          fieldBoosts[n] * similarity.lengthNorm(field.name(), fieldLengths[n]);
  +          fieldBoosts[n] * similarity.lengthNorm(field.name(),fieldLengths[n]);
           OutputStream norms = directory.createFile(segment + ".f" + n);
           try {
             norms.writeByte(similarity.encodeNorm(norm));
  
  
  

---------------------------------------------------------------------
To unsubscribe, e-mail: [EMAIL PROTECTED]
For additional commands, e-mail: [EMAIL PROTECTED]

Reply via email to