Greets,
I've been experimenting with using the UTF-8 bytecount as the VInt
count at the top of Lucene's string format, as was discussed back in
the "Lucene does NOT use UTF-8" thread. Changes were made to
IndexInput and IndexOutput as per some of Robert Engels's
suggestions. Here's the implementation of writeString, which chooses
the memory hit of buffering over the performance hit of pre-scanning:
/**
 * Writes a string as a VInt count of UTF-8 bytes followed by the UTF-8
 * encoded bytes themselves (rather than the legacy char count / modified
 * UTF-8 form).
 *
 * @param s the string to write
 * @throws IOException if the underlying stream cannot be written
 */
public void writeString(String s) throws IOException {
  // Encode up front so the VInt prefix is the byte count, not the char
  // count.  Use a local: getBytes() always allocates a fresh array, so
  // stashing it in a field only pins the last string's bytes in memory.
  byte[] utf8Bytes = s.getBytes("UTF-8");
  int length = utf8Bytes.length;
  writeVInt(length);
  writeBytes(utf8Bytes, length);
}
That, in conjunction with a similar implementation of readString and
the UTF-8-clean implementation of readChars I submitted a while back,
executes 2-3% slower than the current implementation against my
index-1000-wikipedia-articles benchmarker.
I also had a hack at TermBuffer, TermInfosWriter, and StringHelper,
in an attempt to convert the prefix length for the Term Dictionary
from chars to UTF-8 bytes. (Note that I've left off changing
TermVectors for now.)
A major change has to be inflicted on TermBuffer in order for it to
work with a bytecount: the char[] "text" buffer must be swapped out
for a byte[] "utf8Bytes" buffer. With that change in place,
TermBuffer's 'read' method can utilize readBytes instead of readChars:
public final void read(IndexInput input, FieldInfos fieldInfos)
throws IOException {
this.term = null; // invalidate cache
int start = input.readVInt();
int length = input.readVInt();
int totalLength = start + length;
setutf8Length(totalLength);
input.readBytes(this.utf8Bytes, start, length);
this.field = fieldInfos.fieldName(input.readVInt());
}
I'd thought this might speed up scanning through a SegmentTermEnum,
since the intermediate step of converting UTF-8 bytes to chars was
deferred until toTerm gets called. However, a ScanEnum benchmarker I
cooked up which calls next() a whole bunch indicated a wash.
Concerns were raised before about whether it would be necessary to
convert all strings to UTF-8 to calculate prefix length for the Term
Dictionary. Yes, it's necessary to convert them -- though the same
copy can be used as a byte buffer which obviates the need to call
writeChars.
private final void writeTerm(Term term)
throws IOException {
byte[] bytes = term.text().getBytes("UTF-8");
int totalLength = bytes.length;
int start = StringHelper.bytesDifference(lastBytes, bytes);
int length = totalLength - start;
output.writeVInt(start); // write shared
prefix length
output.writeVInt(length); // write delta length
for (int i = start ; i < totalLength; i++) {
output.writeByte(bytes[i]); // write delta UTF-8
bytes
}
output.writeVInt(fieldInfos.fieldNumber(term.field)); // write
field num
lastTerm = term;
lastBytes = bytes;
}
Unfortunately, once the changes to TermBuffer, TermInfosWriter, and
StringHelper are applied, execution speed at index-time suffers a
slowdown of about 20%. Perhaps this can be blamed on all the calls
to getBytes("UTF-8") in TermInfosWriter? Maybe alternative
implementations using ByteBuffer, CharsetDecoder, and CharsetEncoder
are possible that can mitigate the problem?
A patch for all 5 files against repository revision 329640 is below;
to apply it, cd to the trunk directory first.
Marvin Humphrey
Rectangular Research
http://www.rectangular.com/
diff -u -r -X../excludefile src_old/java/org/apache/lucene/index/
TermBuffer.java src/java/org/apache/lucene/index/TermBuffer.java
--- src_old/java/org/apache/lucene/index/TermBuffer.java
2005-10-30 14:30:23.000000000 -0800
+++ src/java/org/apache/lucene/index/TermBuffer.java 2005-10-30
13:27:47.000000000 -0800
@@ -20,40 +20,38 @@
import org.apache.lucene.store.IndexInput;
final class TermBuffer implements Cloneable {
- private static final char[] NO_CHARS = new char[0];
+ private static final byte[] NO_BYTES = new byte[0];
private String field;
- private char[] text = NO_CHARS;
- private int textLength;
+ private byte[] utf8Bytes = NO_BYTES;
+ private int utf8Length;
private Term term; // cached
public final int compareTo(TermBuffer other) {
if (field == other.field) // fields are interned
- return compareChars(text, textLength, other.text,
other.textLength);
+ return compareUTF8(utf8Bytes, utf8Length,
+ other.utf8Bytes, other.utf8Length);
else
return field.compareTo(other.field);
}
- private static final int compareChars(char[] v1, int len1,
- char[] v2, int len2) {
+ private static final int compareUTF8(byte[] v1, int len1,
+ byte[] v2, int len2) {
int end = Math.min(len1, len2);
for (int k = 0; k < end; k++) {
- char c1 = v1[k];
- char c2 = v2[k];
- if (c1 != c2) {
- return c1 - c2;
- }
+ if (v1[k] != v2[k])
+ return v1[k] - v2[k];
}
return len1 - len2;
}
- private final void setTextLength(int newLength) {
- if (text.length < newLength) {
- char[] newText = new char[newLength];
- System.arraycopy(text, 0, newText, 0, textLength);
- text = newText;
+ private final void setutf8Length(int newLength) {
+ if (utf8Bytes.length < newLength) {
+ byte[] newBytes = new byte[newLength];
+ System.arraycopy(utf8Bytes, 0, newBytes, 0, utf8Length);
+ utf8Bytes = newBytes;
}
- textLength = newLength;
+ utf8Length = newLength;
}
public final void read(IndexInput input, FieldInfos fieldInfos)
@@ -62,28 +60,30 @@
int start = input.readVInt();
int length = input.readVInt();
int totalLength = start + length;
- setTextLength(totalLength);
- input.readChars(this.text, start, length);
+ setutf8Length(totalLength);
+ input.readBytes(this.utf8Bytes, start, length);
this.field = fieldInfos.fieldName(input.readVInt());
}
- public final void set(Term term) {
- if (term == null) {
+ public final void set(Term t) {
+ if (t == null) {
reset();
return;
}
- // copy text into the buffer
- setTextLength(term.text().length());
- term.text().getChars(0, term.text().length(), text, 0);
+ // convert chars into UTF-8 bytes, store in buffer
+ try {
+ utf8Bytes = t.text().getBytes("UTF-8");
+ } catch (java.io.UnsupportedEncodingException e) { }
+ setutf8Length(utf8Bytes.length);
- this.field = term.field();
- this.term = term;
+ this.field = t.field();
+ this.term = t;
}
public final void set(TermBuffer other) {
- setTextLength(other.textLength);
- System.arraycopy(other.text, 0, text, 0, textLength);
+ setutf8Length(other.utf8Length);
+ System.arraycopy(other.utf8Bytes, 0, utf8Bytes, 0, utf8Length);
this.field = other.field;
this.term = other.term;
@@ -91,7 +91,7 @@
public void reset() {
this.field = null;
- this.textLength = 0;
+ this.utf8Length = 0;
this.term = null;
}
@@ -100,7 +100,9 @@
return null;
if (term == null)
- term = new Term(field, new String(text, 0, textLength), false);
+ try {
+ term = new Term(field, new String(utf8Bytes, 0, utf8Length,
"UTF-8"), false);
+ } catch (java.io.UnsupportedEncodingException e) { }
return term;
}
@@ -111,8 +113,8 @@
clone = (TermBuffer)super.clone();
} catch (CloneNotSupportedException e) {}
- clone.text = new char[text.length];
- System.arraycopy(text, 0, clone.text, 0, textLength);
+ clone.utf8Bytes = new byte[utf8Bytes.length];
+ System.arraycopy(utf8Bytes, 0, clone.utf8Bytes, 0, utf8Length);
return clone;
}
diff -u -r -X../excludefile src_old/java/org/apache/lucene/index/
TermInfosWriter.java src/java/org/apache/lucene/index/
TermInfosWriter.java
--- src_old/java/org/apache/lucene/index/TermInfosWriter.java
2005-10-30 14:30:23.000000000 -0800
+++ src/java/org/apache/lucene/index/TermInfosWriter.java
2005-10-30 13:47:43.000000000 -0800
@@ -33,6 +33,8 @@
private IndexOutput output;
private Term lastTerm = new Term("", "");
private TermInfo lastTi = new TermInfo();
+ private byte[] NO_BYTES = new byte[0];
+ private byte[] lastBytes = NO_BYTES;
private long size = 0;
// TODO: the default values for these two parameters should be
settable from
@@ -91,8 +93,10 @@
TermInfo pointers must be positive and greater than all
previous.*/
final void add(Term term, TermInfo ti)
throws IOException {
- if (!isIndex && term.compareTo(lastTerm) <= 0)
- throw new IOException("term out of order");
+ if (!isIndex && term.compareTo(lastTerm) <= 0) {
+ throw new IOException("term out of order" + lastTerm.field +
+ "--" + lastTerm.text + "--" + term.field + "--" + term.text);
+ }
if (ti.freqPointer < lastTi.freqPointer)
throw new IOException("freqPointer out of order");
if (ti.proxPointer < lastTi.proxPointer)
@@ -121,16 +125,22 @@
private final void writeTerm(Term term)
throws IOException {
- int start = StringHelper.stringDifference(lastTerm.text,
term.text);
- int length = term.text.length() - start;
+ byte[] bytes = term.text().getBytes("UTF-8");
+ int totalLength = bytes.length;
+
+ int start = StringHelper.bytesDifference(lastBytes, bytes);
+ int length = totalLength - start;
output.writeVInt(start); // write shared prefix length
output.writeVInt(length); // write delta length
- output.writeChars(term.text, start, length); // write delta chars
+ for (int i = start ; i < totalLength; i++) {
+ output.writeByte(bytes[i]); // write delta UTF-8 bytes
+ }
output.writeVInt(fieldInfos.fieldNumber(term.field)); // write field num
lastTerm = term;
+ lastBytes = bytes;
}
diff -u -r -X../excludefile src_old/java/org/apache/lucene/store/
IndexInput.java src/java/org/apache/lucene/store/IndexInput.java
--- src_old/java/org/apache/lucene/store/IndexInput.java
2005-10-30 14:30:25.000000000 -0800
+++ src/java/org/apache/lucene/store/IndexInput.java 2005-10-30
13:27:47.000000000 -0800
@@ -23,7 +23,7 @@
* @see Directory
*/
public abstract class IndexInput implements Cloneable {
- private char[] chars; // used by readString()
+ private byte[] utf8Bytes; // used by readString()
/** Reads and returns a single byte.
* @see IndexOutput#writeByte(byte)
@@ -87,32 +87,66 @@
*/
public String readString() throws IOException {
int length = readVInt();
- if (chars == null || length > chars.length)
- chars = new char[length];
- readChars(chars, 0, length);
- return new String(chars, 0, length);
- }
-
- /** Reads UTF-8 encoded characters into an array.
+ if (utf8Bytes == null || length > utf8Bytes.length)
+ utf8Bytes = new byte[length];
+ readBytes(utf8Bytes, 0, length);
+ return new String(utf8Bytes, 0, length, "UTF-8");
+ }
+
+ static final byte[] TRAILING_BYTES_FOR_UTF8 = {
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+ 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+ 3,3,3,3,3,3,3,3
+ };
+
+ /** Reads UTF-8 encoded characters into an array.
* @param buffer the array to read characters into
* @param start the offset in the array to start storing characters
* @param length the number of characters to read
* @see IndexOutput#writeChars(String,int,int)
*/
- public void readChars(char[] buffer, int start, int length)
+ public void readChars(char[] buffer, int start, int length)
throws IOException {
- final int end = start + length;
+ final int end = start + length;
for (int i = start; i < end; i++) {
- byte b = readByte();
- if ((b & 0x80) == 0)
- buffer[i] = (char)(b & 0x7F);
- else if ((b & 0xE0) != 0xE0) {
- buffer[i] = (char)(((b & 0x1F) << 6)
- | (readByte() & 0x3F));
- } else
- buffer[i] = (char)(((b & 0x0F) << 12)
- | ((readByte() & 0x3F) << 6)
- | (readByte() & 0x3F));
+ byte b = readByte();
+ switch (TRAILING_BYTES_FOR_UTF8[b & 0xFF]) {
+ case 0:
+ buffer[i] = (char)(b & 0x7F);
+ break;
+ case 1:
+ buffer[i] = (char)(((b & 0x1F) << 6)
+ | (readByte() & 0x3F));
+ break;
+ case 2:
+ buffer[i] = (char)(((b & 0x0F) << 12)
+ | ((readByte() & 0x3F) << 6)
+ | (readByte() & 0x3F));
+ break;
+ case 3:
+ int utf32 = (((b & 0x0F) << 18)
+ | ((readByte() & 0x3F) << 12)
+ | ((readByte() & 0x3F) << 6)
+ | (readByte() & 0x3F));
+ buffer[i] = (char)((utf32 >> 10) + 0xD7C0);
+ i++;
+ buffer[i] = (char)((utf32 & 0x03FF) + 0xDC00);
+ break;
+ }
}
}
@@ -148,7 +182,7 @@
clone = (IndexInput)super.clone();
} catch (CloneNotSupportedException e) {}
- clone.chars = null;
+ clone.utf8Bytes = null;
return clone;
}
diff -u -r -X../excludefile src_old/java/org/apache/lucene/store/
IndexOutput.java src/java/org/apache/lucene/store/IndexOutput.java
--- src_old/java/org/apache/lucene/store/IndexOutput.java
2005-10-30 14:30:25.000000000 -0800
+++ src/java/org/apache/lucene/store/IndexOutput.java 2005-10-30
13:27:47.000000000 -0800
@@ -25,6 +25,8 @@
*/
public abstract class IndexOutput {
+ private byte[] utf8Bytes; // used by writeString()
+
/** Writes a single byte.
* @see IndexInput#readByte()
*/
@@ -85,9 +87,11 @@
* @see IndexInput#readString()
*/
public void writeString(String s) throws IOException {
- int length = s.length();
+ utf8Bytes = s.getBytes("UTF-8");
+
+ int length = utf8Bytes.length;
writeVInt(length);
- writeChars(s, 0, length);
+ writeBytes(utf8Bytes, length);
}
/** Writes a sequence of UTF-8 encoded characters from a string.
diff -u -r -X../excludefile src_old/java/org/apache/lucene/util/
StringHelper.java src/java/org/apache/lucene/util/StringHelper.java
--- src_old/java/org/apache/lucene/util/StringHelper.java
2005-10-30 14:30:25.000000000 -0800
+++ src/java/org/apache/lucene/util/StringHelper.java 2005-10-30
13:48:07.000000000 -0800
@@ -44,7 +44,26 @@
return len;
}
-
+ /**
+ * Compares two byte[] arrays, element by element, and returns the
+ * number of elements common to both arrays.
+ *
+ * @param bytes1 The first byte[] to compare
+ * @param bytes2 The second byte[] to compare
+ * @return The number of common elements.
+ */
+ public static final int bytesDifference(byte[] bytes1, byte[]
bytes2) {
+ int len1 = bytes1.length;
+ int len2 = bytes2.length;
+ int len = len1 < len2 ? len1 : len2;
+ for (int i = 0; i < len; i++) {
+ if (bytes1[i] != bytes2[i]) {
+ return i;
+ }
+ }
+ return len;
+ }
+
private StringHelper() {
}
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [EMAIL PROTECTED]
For additional commands, e-mail: [EMAIL PROTECTED]