hi all,
As promised last week, I would like to contribute a patch that removes all synchronization from the TermVectorsReader class. This patch dramatically improves overall performance when one IndexReader is shared by several threads. A new JUnit test is included to demonstrate the difference.
calculated time without patch: threadcount: 1 average term vector time: 385 threadcount: 2 average term vector time: 1013 threadcount: 4 average term vector time: 2879 threadcount: 6 average term vector time: 11311 threadcount: 8 average term vector time: 29323 threadcount: 10 average term vector time: 35797
time result using the patch: threadcount: 1 average term vector time: 379 threadcount: 2 average term vector time: 736 threadcount: 4 average term vector time: 877 threadcount: 6 average term vector time: 947 threadcount: 8 average term vector time: 2407 threadcount: 10 average term vector time: 2648
If the Lucene developers think this patch is useful and has a chance of being committed, I can create a new Bugzilla issue and attach the files there.
best regards Bernhard
Index: SegmentReader.java =================================================================== RCS file: /home/cvspublic/jakarta-lucene/src/java/org/apache/lucene/index/SegmentReader.java,v retrieving revision 1.25 diff -u -r1.25 SegmentReader.java --- SegmentReader.java 11 Aug 2004 17:37:52 -0000 1.25 +++ SegmentReader.java 15 Aug 2004 14:12:34 -0000 @@ -42,8 +42,7 @@ private FieldsReader fieldsReader; TermInfosReader tis; - TermVectorsReader termVectorsReader; - + BitVector deletedDocs = null; private boolean deletedDocsDirty = false; private boolean normsDirty = false; @@ -51,6 +50,8 @@ InputStream freqStream; InputStream proxStream; + + private ThreadLocal termVectorsLocal = null; // Compound File Reader when based on a compound file segment CompoundFileReader cfsReader = null; @@ -128,7 +129,17 @@ openNorms(cfsDir); if (fieldInfos.hasVectors()) { // open term vector files only as needed - termVectorsReader = new TermVectorsReader(cfsDir, segment, fieldInfos); + final Directory dir = cfsDir; + termVectorsLocal = new ThreadLocal() { + protected synchronized Object initialValue() { + try { + return new TermVectorsReader(dir, segment, fieldInfos); + } catch (IOException ioe) { + ioe.printStackTrace(); + return null; + } + } + }; } } @@ -164,8 +175,11 @@ proxStream.close(); closeNorms(); - if (termVectorsReader != null) termVectorsReader.close(); - + TermVectorsReader termVectorsReader = (TermVectorsReader)termVectorsLocal.get(); + if (termVectorsReader != null) { + termVectorsReader.close(); + } + if (cfsReader != null) cfsReader.close(); } @@ -408,6 +422,11 @@ FieldInfo fi = fieldInfos.fieldInfo(field); if (fi == null || !fi.storeTermVector) return null; + TermVectorsReader termVectorsReader = (TermVectorsReader)termVectorsLocal.get(); + if (termVectorsReader == null) { + return null; + } + return termVectorsReader.get(docNumber, field); } @@ -419,9 +438,10 @@ * If no such fields existed, the method returns null. 
*/ public TermFreqVector[] getTermFreqVectors(int docNumber) { - if (termVectorsReader == null) - return null; - + TermVectorsReader termVectorsReader = (TermVectorsReader)termVectorsLocal.get(); + if (termVectorsReader == null) { + return null; + } return termVectorsReader.get(docNumber); } }
Index: TermVectorsReader.java =================================================================== RCS file: /home/cvspublic/jakarta-lucene/src/java/org/apache/lucene/index/TermVectorsReader.java,v retrieving revision 1.2 diff -u -r1.2 TermVectorsReader.java --- TermVectorsReader.java 6 Aug 2004 20:50:00 -0000 1.2 +++ TermVectorsReader.java 15 Aug 2004 14:12:00 -0000 @@ -41,7 +41,7 @@ } - synchronized void close() throws IOException { + void close() throws IOException { // why don't we trap the exception and at least make sure that // all streams that we can close are closed? if (tvx != null) tvx.close(); @@ -63,7 +63,7 @@ * @param field The field within the document to retrieve * @return The TermFreqVector for the document and field or null */ - synchronized TermFreqVector get(int docNum, String field) { + TermFreqVector get(int docNum, String field) { // Check if no term vectors are available for this segment at all int fieldNumber = fieldInfos.fieldNumber(field); TermFreqVector result = null; @@ -116,7 +116,7 @@ /** Return all term vectors stored for this document or null if the could not be read in. */ - synchronized TermFreqVector[] get(int docNum) { + TermFreqVector[] get(int docNum) { TermFreqVector[] result = null; // Check if no term vectors are available for this segment at all if (tvx != null) {
/* * Created on 15.08.2004 * */ package org.apache.lucene.search;
import java.io.IOException; import junit.framework.TestCase; import org.apache.lucene.analysis.SimpleAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.TermFreqVector; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.store.RAMDirectory; import org.apache.lucene.util.English; /** * @author Bernhard Messer * @version $rcs = ' $Id: Exp $ ' ; */ public class TestMultiThreadTermVectors extends TestCase { private IndexReader reader; //private RAMDirectory directory = new RAMDirectory(); public TestMultiThreadTermVectors(String s) { super(s); } public void setUp() throws Exception { //FSDirectory directory = FSDirectory.getDirectory("/tmp/termv", false); FSDirectory directory = FSDirectory.getDirectory("/tmp/termv", true); IndexWriter writer = new IndexWriter(directory, new SimpleAnalyzer(), true); //writer.setUseCompoundFile(true); //writer.infoStream = System.out; for (int i = 0; i < 1000; i++) { Document doc = new Document(); //doc.add(Field.Text("field", English.intToEnglish(i), true)); Field fld = new Field("field", English.intToEnglish(i), true, true, false, true); doc.add(fld); writer.addDocument(doc); } writer.close(); reader = IndexReader.open(directory); } public void test() { assertTrue(reader != null); testTermPositionVectors(1); testTermPositionVectors(2); testTermPositionVectors(4); testTermPositionVectors(6); testTermPositionVectors(8); testTermPositionVectors(10); /** close the opened reader */ try { reader.close(); } catch (IOException ioe) { fail(ioe.getMessage()); } } public void testTermPositionVectors(int threadCount) { MultiThreadTermVectorsReader[] mtr = new MultiThreadTermVectorsReader[threadCount]; for (int i = 0; i < threadCount; i++) { mtr[i] = new MultiThreadTermVectorsReader(); mtr[i].init(reader); } /** run until all 
threads finished */ int threadsAlive = mtr.length; while (threadsAlive > 0) { try { //System.out.println("Threads alive"); Thread.sleep(10); threadsAlive = mtr.length; for (int i = 0; i < mtr.length; i++) { if (mtr[i].isAlive() == true) { break; } threadsAlive--; } } catch (InterruptedException ie) {} } long totalTime = 0L; for (int i = 0; i < mtr.length; i++) { totalTime += mtr[i].timeElapsed; mtr[i] = null; } System.out.println("threadcount: " + mtr.length + " average term vector time: " + totalTime/mtr.length); } public static void main(String[] args) { TestMultiThreadTermVectors t = new TestMultiThreadTermVectors(""); try { t.setUp(); t.test(); } catch (Exception e) { e.printStackTrace(); } } } class MultiThreadTermVectorsReader implements Runnable { private IndexReader reader = null; private Thread t = null; private final int runsToDo = 100; long timeElapsed = 0; public void init(IndexReader reader) { this.reader = reader; timeElapsed = 0; t=new Thread(this); t.start(); } public boolean isAlive() { if (t == null) return false; return t.isAlive(); } public void run() { try { // run the test 100 times for (int i = 0; i < runsToDo; i++) testTermVectors(); } catch (Exception e) { e.printStackTrace(); } return; } private void testTermVectors() throws Exception { // check: int numDocs = reader.numDocs(); long start = 0L; for (int docId = 0; docId < numDocs; docId++) { start = System.currentTimeMillis(); TermFreqVector [] vectors = reader.getTermFreqVectors(docId); timeElapsed += System.currentTimeMillis()-start; // verify vectors result verifyVectors(vectors, docId); start = System.currentTimeMillis(); TermFreqVector vector = reader.getTermFreqVector(docId, "field"); timeElapsed += System.currentTimeMillis()-start; vectors = new TermFreqVector[1]; vectors[0] = vector; verifyVectors(vectors, docId); } } private void verifyVectors(TermFreqVector[] vectors, int num) { StringBuffer temp = new StringBuffer(); String[] terms = null; for (int i = 0; i < vectors.length; 
i++) { terms = vectors[i].getTerms(); for (int z = 0; z < terms.length; z++) { temp.append(terms[z]); } } if (!English.intToEnglish(num).trim().equals(temp.toString().trim())) System.out.println("worng term result"); } }
--------------------------------------------------------------------- To unsubscribe, e-mail: [EMAIL PROTECTED] For additional commands, e-mail: [EMAIL PROTECTED]