Author: chetanm
Date: Thu Feb 25 10:37:50 2016
New Revision: 1732278
URL: http://svn.apache.org/viewvc?rev=1732278&view=rev
Log:
OAK-4060 - Allow use of pre-extracted text cache for incremental indexing
Exposed a config option around "Always use PreExtracted Text Provider". If
enabled, the provider would also be consulted for incremental indexing.
Otherwise it would only be used for reindexing.
Also added some logging to simplify debugging in this area.
Modified:
jackrabbit/oak/trunk/oak-core/src/main/java/org/apache/jackrabbit/oak/plugins/blob/datastore/DataStoreTextWriter.java
jackrabbit/oak/trunk/oak-lucene/pom.xml
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/ExtractedTextCache.java
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexProviderService.java
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/TextExtractionStatsMBean.java
jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/ExtractedTextCacheTest.java
jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexProviderServiceTest.java
Modified:
jackrabbit/oak/trunk/oak-core/src/main/java/org/apache/jackrabbit/oak/plugins/blob/datastore/DataStoreTextWriter.java
URL:
http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-core/src/main/java/org/apache/jackrabbit/oak/plugins/blob/datastore/DataStoreTextWriter.java?rev=1732278&r1=1732277&r2=1732278&view=diff
==============================================================================
---
jackrabbit/oak/trunk/oak-core/src/main/java/org/apache/jackrabbit/oak/plugins/blob/datastore/DataStoreTextWriter.java
(original)
+++
jackrabbit/oak/trunk/oak-core/src/main/java/org/apache/jackrabbit/oak/plugins/blob/datastore/DataStoreTextWriter.java
Thu Feb 25 10:37:50 2016
@@ -89,6 +89,7 @@ public class DataStoreTextWriter impleme
String blobId = blob.getContentIdentity();
if (blobId == null) {
log.debug("No id found for blob at path {}", propertyPath);
+ return null;
}
blobId = stripLength(blobId);
@@ -105,6 +106,10 @@ public class DataStoreTextWriter impleme
}
}
+ if (log.isDebugEnabled()){
+ String extractionResult = result != null ?
result.getExtractionResult().toString() : null;
+ log.debug("Extraction result for [{}] at path [{}] is [{}]",
blobId, propertyPath, extractionResult);
+ }
return result;
}
Modified: jackrabbit/oak/trunk/oak-lucene/pom.xml
URL:
http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-lucene/pom.xml?rev=1732278&r1=1732277&r2=1732278&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-lucene/pom.xml (original)
+++ jackrabbit/oak/trunk/oak-lucene/pom.xml Thu Feb 25 10:37:50 2016
@@ -325,5 +325,11 @@
<version>1.3</version>
<scope>test</scope>
</dependency>
+ <dependency>
+ <groupId>org.mockito</groupId>
+ <artifactId>mockito-core</artifactId>
+ <version>1.10.19</version>
+ <scope>test</scope>
+ </dependency>
</dependencies>
</project>
Modified:
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/ExtractedTextCache.java
URL:
http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/ExtractedTextCache.java?rev=1732278&r1=1732277&r2=1732278&view=diff
==============================================================================
---
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/ExtractedTextCache.java
(original)
+++
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/ExtractedTextCache.java
Thu Feb 25 10:37:50 2016
@@ -49,8 +49,13 @@ class ExtractedTextCache {
private int preFetchedCount;
private final Cache<String, String> cache;
private final CacheStats cacheStats;
+ private final boolean alwaysUsePreExtractedCache;
- public ExtractedTextCache(long maxWeight, long expiryTimeInSecs) {
+ public ExtractedTextCache(long maxWeight, long expiryTimeInSecs){
+ this(maxWeight, expiryTimeInSecs, false);
+ }
+
+ public ExtractedTextCache(long maxWeight, long expiryTimeInSecs, boolean
alwaysUsePreExtractedCache) {
if (maxWeight > 0) {
cache = CacheBuilder.newBuilder()
.weigher(EmpiricalWeigher.INSTANCE)
@@ -64,6 +69,7 @@ class ExtractedTextCache {
cache = null;
cacheStats = null;
}
+ this.alwaysUsePreExtractedCache = alwaysUsePreExtractedCache;
}
/**
@@ -77,8 +83,9 @@ class ExtractedTextCache {
//Consult the PreExtractedTextProvider only in reindex mode and not in
//incremental indexing mode. As that would only contain older entries
//That also avoid loading on various state (See DataStoreTextWriter)
- if (reindexMode && extractedTextProvider != null){
- String propertyPath = concat(nodePath, propertyName);
+ String propertyPath = concat(nodePath, propertyName);
+ log.trace("Looking for extracted text for [{}] with blobId [{}]",
propertyPath, blob.getContentIdentity());
+ if ((reindexMode || alwaysUsePreExtractedCache) &&
extractedTextProvider != null){
try {
ExtractedText text =
extractedTextProvider.getText(propertyPath, blob);
if (text != null) {
@@ -154,6 +161,11 @@ class ExtractedTextCache {
public String getBytesRead() {
return IOUtils.humanReadableByteCount(totalBytesRead);
}
+
+ @Override
+ public boolean isAlwaysUsePreExtractedCache() {
+ return alwaysUsePreExtractedCache;
+ }
};
}
@@ -176,6 +188,10 @@ class ExtractedTextCache {
}
}
+ boolean isAlwaysUsePreExtractedCache() {
+ return alwaysUsePreExtractedCache;
+ }
+
//Taken from DocumentNodeStore and cache packages as they are private
private static class EmpiricalWeigher implements Weigher<String, String> {
public static final EmpiricalWeigher INSTANCE = new EmpiricalWeigher();
Modified:
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexProviderService.java
URL:
http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexProviderService.java?rev=1732278&r1=1732277&r2=1732278&view=diff
==============================================================================
---
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexProviderService.java
(original)
+++
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexProviderService.java
Thu Feb 25 10:37:50 2016
@@ -171,6 +171,15 @@ public class LuceneIndexProviderService
)
private static final String PROP_EXTRACTED_TEXT_CACHE_EXPIRY =
"extractedTextCacheExpiryInSecs";
+ private static final boolean PROP_PRE_EXTRACTED_TEXT_ALWAYS_USE_DEFAULT =
false;
+ @Property(
+ boolValue = PROP_PRE_EXTRACTED_TEXT_ALWAYS_USE_DEFAULT,
+ label = "Always use pre-extracted text cache",
+ description = "By default pre extracted text cache would only be
used for reindex case. If this setting " +
+ "is enabled then it would also be used in normal
incremental indexing"
+ )
+ private static final String PROP_PRE_EXTRACTED_TEXT_ALWAYS_USE =
"alwaysUsePreExtractedCache";
+
private static final int PROP_BOOLEAN_CLAUSE_LIMIT_DEFAULT = 1024;
@Property(
intValue = PROP_BOOLEAN_CLAUSE_LIMIT_DEFAULT,
@@ -439,8 +448,10 @@ public class LuceneIndexProviderService
PROP_EXTRACTED_TEXT_CACHE_SIZE_DEFAULT);
int cacheExpiryInSecs =
PropertiesUtil.toInteger(config.get(PROP_EXTRACTED_TEXT_CACHE_EXPIRY),
PROP_EXTRACTED_TEXT_CACHE_EXPIRY_DEFAULT);
+ boolean alwaysUsePreExtractedCache =
PropertiesUtil.toBoolean(config.get(PROP_PRE_EXTRACTED_TEXT_ALWAYS_USE),
+ PROP_PRE_EXTRACTED_TEXT_ALWAYS_USE_DEFAULT);
- extractedTextCache = new ExtractedTextCache(cacheSizeInMB * ONE_MB,
cacheExpiryInSecs);
+ extractedTextCache = new ExtractedTextCache(cacheSizeInMB * ONE_MB,
cacheExpiryInSecs, alwaysUsePreExtractedCache);
if (extractedTextProvider != null){
registerExtractedTextProvider(extractedTextProvider);
}
@@ -457,7 +468,10 @@ public class LuceneIndexProviderService
private void registerExtractedTextProvider(PreExtractedTextProvider
provider){
if (extractedTextCache != null){
if (provider != null){
- log.info("Registering PreExtractedTextProvider {} with
extracted text cache", provider);
+ String usage =
extractedTextCache.isAlwaysUsePreExtractedCache() ?
+ "always" : "only during reindexing phase";
+ log.info("Registering PreExtractedTextProvider {} with
extracted text cache. " +
+ "It would be used {}", provider, usage);
} else {
log.info("Unregistering PreExtractedTextProvider with
extracted text cache");
}
Modified:
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/TextExtractionStatsMBean.java
URL:
http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/TextExtractionStatsMBean.java?rev=1732278&r1=1732277&r2=1732278&view=diff
==============================================================================
---
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/TextExtractionStatsMBean.java
(original)
+++
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/TextExtractionStatsMBean.java
Thu Feb 25 10:37:50 2016
@@ -27,6 +27,8 @@ public interface TextExtractionStatsMBea
boolean isPreExtractedTextProviderConfigured();
+ boolean isAlwaysUsePreExtractedCache();
+
int getTextExtractionCount();
long getTotalTime();
Modified:
jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/ExtractedTextCacheTest.java
URL:
http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/ExtractedTextCacheTest.java?rev=1732278&r1=1732277&r2=1732278&view=diff
==============================================================================
---
jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/ExtractedTextCacheTest.java
(original)
+++
jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/ExtractedTextCacheTest.java
Thu Feb 25 10:37:50 2016
@@ -22,12 +22,19 @@ package org.apache.jackrabbit.oak.plugin
import org.apache.commons.io.FileUtils;
import org.apache.jackrabbit.oak.api.Blob;
import org.apache.jackrabbit.oak.plugins.index.fulltext.ExtractedText;
+import
org.apache.jackrabbit.oak.plugins.index.fulltext.ExtractedText.ExtractionResult;
+import
org.apache.jackrabbit.oak.plugins.index.fulltext.PreExtractedTextProvider;
import org.apache.jackrabbit.oak.plugins.memory.ArrayBasedBlob;
import org.junit.Test;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertNull;
+import static org.mockito.Matchers.any;
+import static org.mockito.Matchers.anyString;
+import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.verifyZeroInteractions;
+import static org.mockito.Mockito.when;
public class ExtractedTextCacheTest {
@@ -46,7 +53,7 @@ public class ExtractedTextCacheTest {
String text = cache.get("/a", "foo", b, false);
assertNull(text);
- cache.put(b, new ExtractedText(ExtractedText.ExtractionResult.SUCCESS,
"test hello"));
+ cache.put(b, new ExtractedText(ExtractionResult.SUCCESS, "test
hello"));
text = cache.get("/a", "foo", b, false);
assertEquals("test hello", text);
@@ -60,7 +67,7 @@ public class ExtractedTextCacheTest {
String text = cache.get("/a", "foo", b, false);
assertNull(text);
- cache.put(b, new ExtractedText(ExtractedText.ExtractionResult.SUCCESS,
"test hello"));
+ cache.put(b, new ExtractedText(ExtractionResult.SUCCESS, "test
hello"));
text = cache.get("/a", "foo", b, false);
assertNull(text);
@@ -74,12 +81,59 @@ public class ExtractedTextCacheTest {
String text = cache.get("/a", "foo", b, false);
assertNull(text);
- cache.put(b, new ExtractedText(ExtractedText.ExtractionResult.ERROR,
"test hello"));
+ cache.put(b, new ExtractedText(ExtractionResult.ERROR, "test hello"));
text = cache.get("/a", "foo", b, false);
assertNull(text);
}
+ @Test
+ public void preExtractionNoReindexNoProvider() throws Exception{
+ ExtractedTextCache cache = new ExtractedTextCache(10 *
FileUtils.ONE_MB, 100);
+
+ Blob b = new IdBlob("hello", "a");
+ String text = cache.get("/a", "foo", b, true);
+ assertNull(text);
+ }
+
+ @Test
+ public void preExtractionNoReindex() throws Exception{
+ ExtractedTextCache cache = new ExtractedTextCache(10 *
FileUtils.ONE_MB, 100);
+ PreExtractedTextProvider provider =
mock(PreExtractedTextProvider.class);
+
+ cache.setExtractedTextProvider(provider);
+ Blob b = new IdBlob("hello", "a");
+ String text = cache.get("/a", "foo", b, false);
+ assertNull(text);
+
+ verifyZeroInteractions(provider);
+ }
+
+ @Test
+ public void preExtractionReindex() throws Exception{
+ ExtractedTextCache cache = new ExtractedTextCache(10 *
FileUtils.ONE_MB, 100);
+ PreExtractedTextProvider provider =
mock(PreExtractedTextProvider.class);
+
+ cache.setExtractedTextProvider(provider);
+ when(provider.getText(anyString(), any(Blob.class)))
+ .thenReturn(new ExtractedText(ExtractionResult.SUCCESS,
"bar"));
+ Blob b = new IdBlob("hello", "a");
+ String text = cache.get("/a", "foo", b, true);
+ assertEquals("bar", text);
+ }
+
+ @Test
+ public void preExtractionAlwaysUse() throws Exception{
+ ExtractedTextCache cache = new ExtractedTextCache(10 *
FileUtils.ONE_MB, 100, true);
+ PreExtractedTextProvider provider =
mock(PreExtractedTextProvider.class);
+
+ cache.setExtractedTextProvider(provider);
+ when(provider.getText(anyString(), any(Blob.class)))
+ .thenReturn(new ExtractedText(ExtractionResult.SUCCESS,
"bar"));
+ Blob b = new IdBlob("hello", "a");
+ String text = cache.get("/a", "foo", b, false);
+ assertEquals("bar", text);
+ }
private static class IdBlob extends ArrayBasedBlob {
final String id;
Modified:
jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexProviderServiceTest.java
URL:
http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexProviderServiceTest.java?rev=1732278&r1=1732277&r2=1732278&view=diff
==============================================================================
---
jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexProviderServiceTest.java
(original)
+++
jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexProviderServiceTest.java
Thu Feb 25 10:37:50 2016
@@ -41,6 +41,7 @@ import org.junit.Test;
import org.junit.rules.TemporaryFolder;
import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertNull;
import static org.junit.Assert.assertTrue;
@@ -158,6 +159,7 @@ public class LuceneIndexProviderServiceT
LuceneIndexEditorProvider editorProvider =
(LuceneIndexEditorProvider)
context.getService(IndexEditorProvider.class);
assertNull(editorProvider.getExtractedTextCache().getExtractedTextProvider());
+
assertFalse(editorProvider.getExtractedTextCache().isAlwaysUsePreExtractedCache());
//Mock OSGi does not support components
//context.registerService(PreExtractedTextProvider.class, new
DummyProvider());
@@ -176,6 +178,16 @@ public class LuceneIndexProviderServiceT
}
@Test
+ public void alwaysUsePreExtractedCache() throws Exception{
+ Map<String,Object> config = getDefaultConfig();
+ config.put("alwaysUsePreExtractedCache", "true");
+ MockOsgi.activate(service, context.bundleContext(), config);
+ LuceneIndexEditorProvider editorProvider =
+ (LuceneIndexEditorProvider)
context.getService(IndexEditorProvider.class);
+
assertTrue(editorProvider.getExtractedTextCache().isAlwaysUsePreExtractedCache());
+ }
+
+ @Test
public void booleanQuerySize() throws Exception{
Map<String,Object> config = getDefaultConfig();
config.put("booleanClauseLimit", 4000);