Hi all,
We use Tomcat and Slide in one of our major applications, and we came
across a known problem with asynchronous indexing: the files to index are
all held in memory! So we implemented a small patch to work around this.
Basically, we no longer pass any file content as an argument (there is no
more InputStream field in the IndexJob class); instead, we use the
corresponding Uri, from which the content is retrieved when indexing.
This way, there is only ever one file to index in memory at a time.
As we had made some other changes to the same classes to customize Slide
to our needs, I had to do some merges (gasp). I know it compiles
successfully, but I have hardly had any time to test it.
Anyway, I hope it'll come in handy to someone.
Cheers,
--
Jimmy Monin
EADS DSS SAS
--- stores/org/apache/slide/index/lucene/Index.java.orig 2006-02-14
12:00:06.000000000 +0100
+++ stores/org/apache/slide/index/lucene/Index.java 2007-01-03
16:06:16.260018988 +0100
@@ -50,12 +50,14 @@
import org.apache.slide.common.PropertyName;
import org.apache.slide.common.Uri;
import org.apache.slide.content.NodeProperty;
+import org.apache.slide.content.NodeRevisionContent;
import org.apache.slide.content.NodeRevisionDescriptor;
import org.apache.slide.content.NodeRevisionNumber;
import org.apache.slide.extractor.ContentExtractor;
import org.apache.slide.extractor.ExtractorException;
import org.apache.slide.extractor.ExtractorManager;
import org.apache.slide.search.IndexException;
+import org.apache.slide.store.Store;
import org.apache.slide.util.logger.Logger;
/**
@@ -238,20 +240,19 @@
* @param descriptor
* properties to be indexed
*/
- private Document createLuceneDocument(String uri,
- NodeRevisionDescriptor descriptor) {
-
- Document doc = new Document();
-
- doc.add(unstoredString(Index.KEY_FIELD_NAME,
configuration.generateKey(
- uri, descriptor.getRevisionNumber())));
- doc.add(storedString(Index.URI_FIELD_NAME, uri));
+ private Document createLuceneDocument(Uri uri, NodeRevisionDescriptor
descriptor) {
+
+ Document doc = new Document();
+
+ doc.add(unstoredString(Index.KEY_FIELD_NAME,
+ configuration.generateKey(uri.toString(),
descriptor.getRevisionNumber())));
+ doc.add(storedString(Index.URI_FIELD_NAME, uri.toString()));
doc.add(storedString(Index.REVISION_FIELD_NAME, descriptor
.getRevisionNumber().toString()));
// scopes
- StringTokenizer tokenizer = new StringTokenizer(uri, "/");
- StringBuffer buffer = new StringBuffer(uri.length());
+ StringTokenizer tokenizer = new StringTokenizer(uri.toString(),
"/");
+ StringBuffer buffer = new StringBuffer(uri.toString().length());
doc.add(unstoredString(Index.SCOPE_FIELD_NAME, "/"));
int depth = 0;
for (; tokenizer.hasMoreTokens();) {
@@ -327,20 +328,20 @@
/**
* Creates a lucene document for content indexing.
*/
- private Document createLuceneDocument(String uri,
- NodeRevisionDescriptor descriptor, InputStream content)
+ private Document createLuceneDocument(Uri uri,
+ NodeRevisionDescriptor descriptor, boolean isIndexContent)
throws IndexException, ExtractorException {
Document doc = new Document();
doc.add(unstoredString(Index.KEY_FIELD_NAME,
configuration.generateKey(
- uri, descriptor.getRevisionNumber())));
- doc.add(storedString(Index.URI_FIELD_NAME, uri));
+ uri.toString(),
descriptor.getRevisionNumber())));
+ doc.add(storedString(Index.URI_FIELD_NAME, uri.toString()));
doc.add(storedString(Index.REVISION_FIELD_NAME, descriptor
.getRevisionNumber().toString()));
// scopes
- StringTokenizer tokenizer = new StringTokenizer(uri, "/");
- StringBuffer buffer = new StringBuffer(uri.length());
+ StringTokenizer tokenizer = new StringTokenizer(uri.toString(),
"/");
+ StringBuffer buffer = new StringBuffer(uri.toString().length());
doc.add(unstoredString(Index.SCOPE_FIELD_NAME, "/"));
int depth = 0;
for (; tokenizer.hasMoreTokens();) {
@@ -352,12 +353,35 @@
.intToIndexString(depth)));
List extractors =
ExtractorManager.getInstance().getContentExtractors(
- configuration.getNamespaceName(), uri,
descriptor);
+ configuration.getNamespaceName(), uri.toString(), descriptor);
+
+ // We get the content from the URI
+
+ InputStream streamContent = null;
+ try {
+ Store store = uri.getStore();
+
+ NodeRevisionContent content = store.retrieveRevisionContent(uri,
descriptor);
+ streamContent = content.streamContent();
for (Iterator i = extractors.iterator(); i.hasNext();) {
ContentExtractor extractor = (ContentExtractor)
i.next();
doc.add(textField(Index.CONTENT_FIELD_NAME, extractor
- .extract(content)));
+ .extract(streamContent)));
+ }
+ } catch (Exception e) {
+ logger.log("Error retrieving content for indexing"
+ + e.getMessage());
+ e.printStackTrace();
+ } finally {
+ if (streamContent != null) {
+ try {
+ // Close the stream if open (remove associated lock)
+ streamContent.close();
+ } catch (Exception e) {
+ // do nothing
+ }
+ }
}
return doc;
@@ -423,14 +447,13 @@
for (Iterator i = addJobs.iterator();
i.hasNext();) {
IndexJob job = (IndexJob) i.next();
Document doc;
- if (job.content != null) {
+ if (job.isIndexContent) {
if
(logger.isEnabled(LOG_CHANNEL, Logger.DEBUG)) {
logger.log("index
content: " + job.key,
LOG_CHANNEL, Logger.DEBUG);
}
try {
- doc =
createLuceneDocument(job.uri, job.descriptor,
-
job.content);
+ doc = createLuceneDocument(job.uri, job.descriptor, true);
writer.addDocument(doc);
} catch (ExtractorException e) {
logger.log("Error while
extracting content: "
@@ -477,12 +500,16 @@
return new IndexJob(uri, descriptor);
}
- /**
- * Creates an IndexJob for <em>content indexing</em>.
- */
- public IndexJob createIndexJob(Uri uri, NodeRevisionDescriptor
descriptor,
- InputStream content) {
- return new IndexJob(uri, descriptor, content);
+ /**
+ * Creates an IndexJob for <em>content indexing</em>.
+ */
+ public IndexJob createIndexJob(Uri uri,
+ NodeRevisionDescriptor descriptor, InputStream content) {
+ return new IndexJob(uri, descriptor, true);
+ }
+ public IndexJob createIndexJob(Uri uri,
+ NodeRevisionDescriptor descriptor, boolean isIndexContent) {
+ return new IndexJob(uri, descriptor, isIndexContent);
}
/**
@@ -494,39 +521,32 @@
public class IndexJob {
protected String key;
-
- protected String uri;
+ protected Uri uri;
protected NodeRevisionDescriptor descriptor;
-
- protected InputStream content;
+ protected boolean isIndexContent;
protected String getKey() {
return key;
}
protected IndexJob(Uri uri, NodeRevisionNumber number) {
- this.uri = uri.toString();
+ this.uri = uri;
this.descriptor = null;
- this.content = null;
- this.key = configuration.generateKey(this.uri, number);
+ this.key = configuration.generateKey(this.uri.toString(), number);
}
protected IndexJob(Uri uri, NodeRevisionDescriptor descriptor) {
- this.uri = uri.toString();
+ this.uri = uri;
this.descriptor = descriptor;
- this.content = null;
- this.key = configuration.generateKey(this.uri,
descriptor
- .getRevisionNumber());
+ this.key = configuration.generateKey(this.uri.toString(),
descriptor.getRevisionNumber());
+ this.isIndexContent = false;
}
-
- protected IndexJob(Uri uri, NodeRevisionDescriptor descriptor,
- InputStream content) {
- this.uri = uri.toString();
+ protected IndexJob(Uri uri, NodeRevisionDescriptor descriptor, boolean
isIndexContent) {
+ this.uri = uri;
this.descriptor = descriptor;
- this.content = content;
- this.key = configuration.generateKey(this.uri,
descriptor
- .getRevisionNumber());
+ this.key = configuration.generateKey(this.uri.toString(),
descriptor.getRevisionNumber());
+ this.isIndexContent = isIndexContent;
}
public boolean equals(Object obj) {
--- stores/org/apache/slide/index/lucene/AbstractLuceneIndexer.java.orig
2006-02-14 12:00:06.000000000 +0100
+++ stores/org/apache/slide/index/lucene/AbstractLuceneIndexer.java
2007-01-03 15:55:26.582627930 +0100
@@ -328,8 +328,11 @@
void addIndexJob(Uri uri, NodeRevisionDescriptor descriptor) {
this.indexJobs.add(this.idx.createIndexJob(uri,
descriptor));
}
-
- void addIndexJob(Uri uri, NodeRevisionDescriptor descriptor,
+ void addIndexJob(Uri uri, NodeRevisionDescriptor descriptor,
+ boolean isIndexContent) {
+ this.indexJobs.add(this.idx.createIndexJob(uri, descriptor,
isIndexContent));
+ }
+ void addIndexJob(Uri uri, NodeRevisionDescriptor descriptor,
InputStream content) {
this.indexJobs.add(this.idx
.createIndexJob(uri, descriptor,
content));
@@ -346,25 +349,21 @@
void addUpdateJob(Uri uri, NodeRevisionDescriptor descriptor) {
Index.IndexJob job = this.idx.createIndexJob(uri,
descriptor);
- this.removeJobs.remove(job); // remove before add
because
-
// different jobs for same uri have
-
// the same key
- this.removeJobs.add(job);
- this.indexJobs.remove(job);
- this.indexJobs.add(job);
- }
-
- void addUpdateJob(Uri uri, NodeRevisionDescriptor descriptor,
- InputStream content) {
- Index.IndexJob job = this.idx.createIndexJob(uri,
descriptor,
- content);
- this.removeJobs.remove(job); // remove before add
because
-
// different jobs for same uri have
-
// the same key
- this.removeJobs.add(job);
- this.indexJobs.remove(job);
- this.indexJobs.add(job);
- }
+ this.removeJobs.remove(job); // remove before add
because different jobs for same uri have the same key
+ this.removeJobs.add(job);
+ this.indexJobs.remove(job);
+ this.indexJobs.add(job);
+ }
+ void addUpdateJob(Uri uri,
+ NodeRevisionDescriptor descriptor,
+ boolean isIndexContent)
+ {
+ Index.IndexJob job = this.idx.createIndexJob(uri, descriptor,
isIndexContent);
+ this.removeJobs.remove(job); // remove before add because different
jobs for same uri have the same key
+ this.removeJobs.add(job);
+ this.indexJobs.remove(job);
+ this.indexJobs.add(job);
+ }
public void begin() throws XAException {
}
--- stores/org/apache/slide/index/lucene/LuceneContentIndexer.java.orig
2006-02-14 12:00:06.000000000 +0100
+++ stores/org/apache/slide/index/lucene/LuceneContentIndexer.java
2007-01-03 15:26:22.000000000 +0100
@@ -22,7 +22,8 @@
*/
package org.apache.slide.index.lucene;
-import java.io.ByteArrayInputStream;
+import java.io.File;
+import java.io.IOException;
import java.util.Hashtable;
import javax.transaction.xa.XAException;
@@ -109,9 +110,8 @@
uri.getNamespace().getName(),
uri.toString(),
revisionDescriptor)) {
TransactionalIndexResource indexResource =
getCurrentTxn();
- indexResource.addIndexJob(uri,
revisionDescriptor,
- new
ByteArrayInputStream(revisionContent
-
.getContentBytes()));
+ indexResource.addIndexJob(uri, revisionDescriptor, true);
+
}
}
}
@@ -131,9 +131,8 @@
uri.getNamespace().getName(),
uri.toString(),
revisionDescriptor)) {
TransactionalIndexResource indexResource =
getCurrentTxn();
- indexResource.addUpdateJob(uri,
revisionDescriptor,
- new
ByteArrayInputStream(revisionContent
-
.getContentBytes()));
+ indexResource.addUpdateJob(uri, revisionDescriptor, true);
+
}
}
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [EMAIL PROTECTED]
For additional commands, e-mail: [EMAIL PROTECTED]