cutting     2003/12/22 13:40:18

  Modified:    src/java/org/apache/lucene/index DocumentWriter.java
  Log:
  Distinguish between positions and length when indexing a field.  The
  length is now defined as the total number of tokens, not the final
  position.  Length is used for score normalization
  (Similarity.lengthNorm()) and for controlling memory usage
  (IndexWriter.maxFieldLength).  In both cases the total number of
  tokens is more reasonable than the final position.  Position is used
  in phrase searching (see PhraseQuery and Token.setPositionIncrement()).

  Revision  Changes    Path
  1.7       +10 -5     jakarta-lucene/src/java/org/apache/lucene/index/DocumentWriter.java

  Index: DocumentWriter.java
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/index/DocumentWriter.java,v
  retrieving revision 1.6
  retrieving revision 1.7
  diff -u -r1.6 -r1.7
  --- DocumentWriter.java   20 Sep 2003 17:42:40 -0000   1.6
  +++ DocumentWriter.java   22 Dec 2003 21:40:18 -0000   1.7
  @@ -103,7 +103,8 @@
   
       // invert doc into postingTable
       postingTable.clear();                           // clear postingTable
  -    fieldLengths = new int[fieldInfos.size()];	  // init fieldLengths
  +    fieldLengths = new int[fieldInfos.size()];    // init fieldLengths
  +    fieldPositions = new int[fieldInfos.size()];  // init fieldPositions
       fieldBoosts = new float[fieldInfos.size()];     // init fieldBoosts
       Arrays.fill(fieldBoosts, doc.getBoost());
   
  @@ -138,6 +139,7 @@
     // Used to buffer a document before it is written to the index.
     private final Hashtable postingTable = new Hashtable();
     private int[] fieldLengths;
  +  private int[] fieldPositions;
     private float[] fieldBoosts;
   
     // Tokenizes the fields of a document into Postings.
  @@ -149,11 +151,13 @@
         String fieldName = field.name();
         int fieldNumber = fieldInfos.fieldNumber(fieldName);
   
  -      int position = fieldLengths[fieldNumber];	  // position in field
  +      int length = fieldLengths[fieldNumber];     // length of field
  +      int position = fieldPositions[fieldNumber]; // position in field
   
         if (field.isIndexed()) {
           if (!field.isTokenized()) {		  // un-tokenized field
             addPosition(fieldName, field.stringValue(), position++);
  +          length++;
           } else {
             Reader reader;			  // find or make Reader
             if (field.readerValue() != null)
  @@ -170,14 +174,15 @@
               for (Token t = stream.next(); t != null; t = stream.next()) {
                 position += (t.getPositionIncrement() - 1);
                 addPosition(fieldName, t.termText(), position++);
  -              if (position > maxFieldLength) break;
  +              if (++length > maxFieldLength) break;
               }
             } finally {
               stream.close();
             }
           }
   
  -        fieldLengths[fieldNumber] = position;	  // save field length
  +        fieldLengths[fieldNumber] = length;	  // save field length
  +        fieldPositions[fieldNumber] = position;	  // save field position
           fieldBoosts[fieldNumber] *= field.getBoost();
         }
       }
  @@ -321,7 +326,7 @@
       if (field.isIndexed()) {
         int n = fieldInfos.fieldNumber(field.name());
         float norm =
  -        fieldBoosts[n] * similarity.lengthNorm(field.name(), fieldLengths[n]);
  +        fieldBoosts[n] * similarity.lengthNorm(field.name(),fieldLengths[n]);
         OutputStream norms = directory.createFile(segment + ".f" + n);
         try {
           norms.writeByte(similarity.encodeNorm(norm));
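
  To illustrate the distinction the log describes, here is a minimal,
  self-contained sketch (hypothetical data, not part of the patch; the
  counting mirrors the patched loop in invertDocument()).  When an analyzer
  drops a stop word and reports the gap through Token.setPositionIncrement(),
  the final position overshoots the number of tokens actually indexed, so the
  two values can no longer share one counter:

  // Hypothetical sketch -- not part of the patch.
  public class PositionVsLength {
    public static void main(String[] args) {
      // Field text "the quick brown fox", analyzed with a filter that drops
      // "the"; the surviving tokens keep their original word positions by
      // carrying a position increment > 1 across the gap.
      String[] terms      = { "quick", "brown", "fox" };
      int[]    increments = { 2, 1, 1 };   // "quick" skips over the dropped "the"

      int position = 0;                    // last position, used by PhraseQuery
      int length = 0;                      // token count, used by lengthNorm()
                                           // and the maxFieldLength check
      for (int i = 0; i < terms.length; i++) {
        position += (increments[i] - 1);   // honor the increment, as in
        position++;                        // invertDocument(); positions 1, 2, 3
        length++;                          // one more token actually indexed
      }
      System.out.println("position=" + position + " length=" + length);
      // Prints: position=4 length=3.  The old code stored the final position
      // as the field length, overstating the token count for both score
      // normalization and truncation at maxFieldLength.
    }
  }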