Author: chetanm
Date: Thu Feb 25 10:37:50 2016
New Revision: 1732278

URL: http://svn.apache.org/viewvc?rev=1732278&view=rev
Log:
OAK-4060 - Allow use of pre-extracted text cache for incremental indexing

Exposed a config option, "Always use pre-extracted text cache". If it is
enabled, the PreExtractedTextProvider is also consulted during incremental
indexing. Otherwise it is only used during reindexing.

Also added some logging to simplify debugging in this area.

Modified:
    
jackrabbit/oak/trunk/oak-core/src/main/java/org/apache/jackrabbit/oak/plugins/blob/datastore/DataStoreTextWriter.java
    jackrabbit/oak/trunk/oak-lucene/pom.xml
    
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/ExtractedTextCache.java
    
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexProviderService.java
    
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/TextExtractionStatsMBean.java
    
jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/ExtractedTextCacheTest.java
    
jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexProviderServiceTest.java

Modified: 
jackrabbit/oak/trunk/oak-core/src/main/java/org/apache/jackrabbit/oak/plugins/blob/datastore/DataStoreTextWriter.java
URL: 
http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-core/src/main/java/org/apache/jackrabbit/oak/plugins/blob/datastore/DataStoreTextWriter.java?rev=1732278&r1=1732277&r2=1732278&view=diff
==============================================================================
--- 
jackrabbit/oak/trunk/oak-core/src/main/java/org/apache/jackrabbit/oak/plugins/blob/datastore/DataStoreTextWriter.java
 (original)
+++ 
jackrabbit/oak/trunk/oak-core/src/main/java/org/apache/jackrabbit/oak/plugins/blob/datastore/DataStoreTextWriter.java
 Thu Feb 25 10:37:50 2016
@@ -89,6 +89,7 @@ public class DataStoreTextWriter impleme
         String blobId = blob.getContentIdentity();
         if (blobId == null) {
             log.debug("No id found for blob at path {}", propertyPath);
+            return null;
         }
 
         blobId = stripLength(blobId);
@@ -105,6 +106,10 @@ public class DataStoreTextWriter impleme
             }
         }
 
+        if (log.isDebugEnabled()){
+            String extractionResult = result != null ? 
result.getExtractionResult().toString() : null;
+            log.debug("Extraction result for [{}] at path [{}] is [{}]", 
blobId, propertyPath, extractionResult);
+        }
         return result;
     }
 

Modified: jackrabbit/oak/trunk/oak-lucene/pom.xml
URL: 
http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-lucene/pom.xml?rev=1732278&r1=1732277&r2=1732278&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-lucene/pom.xml (original)
+++ jackrabbit/oak/trunk/oak-lucene/pom.xml Thu Feb 25 10:37:50 2016
@@ -325,5 +325,11 @@
       <version>1.3</version>
       <scope>test</scope>
     </dependency>
+    <dependency>
+      <groupId>org.mockito</groupId>
+      <artifactId>mockito-core</artifactId>
+      <version>1.10.19</version>
+      <scope>test</scope>
+    </dependency>
   </dependencies>
 </project>

Modified: 
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/ExtractedTextCache.java
URL: 
http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/ExtractedTextCache.java?rev=1732278&r1=1732277&r2=1732278&view=diff
==============================================================================
--- 
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/ExtractedTextCache.java
 (original)
+++ 
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/ExtractedTextCache.java
 Thu Feb 25 10:37:50 2016
@@ -49,8 +49,13 @@ class ExtractedTextCache {
     private int preFetchedCount;
     private final Cache<String, String> cache;
     private final CacheStats cacheStats;
+    private final boolean alwaysUsePreExtractedCache;
 
-    public ExtractedTextCache(long maxWeight, long expiryTimeInSecs) {
+    public ExtractedTextCache(long maxWeight, long expiryTimeInSecs){
+        this(maxWeight, expiryTimeInSecs, false);
+    }
+
+    public ExtractedTextCache(long maxWeight, long expiryTimeInSecs, boolean 
alwaysUsePreExtractedCache) {
         if (maxWeight > 0) {
             cache = CacheBuilder.newBuilder()
                     .weigher(EmpiricalWeigher.INSTANCE)
@@ -64,6 +69,7 @@ class ExtractedTextCache {
             cache = null;
             cacheStats = null;
         }
+        this.alwaysUsePreExtractedCache = alwaysUsePreExtractedCache;
     }
 
     /**
@@ -77,8 +83,9 @@ class ExtractedTextCache {
         //Consult the PreExtractedTextProvider only in reindex mode and not in
         //incremental indexing mode. As that would only contain older entries
         //That also avoid loading on various state (See DataStoreTextWriter)
-        if (reindexMode && extractedTextProvider != null){
-            String propertyPath = concat(nodePath, propertyName);
+        String propertyPath = concat(nodePath, propertyName);
+        log.trace("Looking for extracted text for [{}] with blobId [{}]", 
propertyPath, blob.getContentIdentity());
+        if ((reindexMode || alwaysUsePreExtractedCache) && 
extractedTextProvider != null){
             try {
                 ExtractedText text = 
extractedTextProvider.getText(propertyPath, blob);
                 if (text != null) {
@@ -154,6 +161,11 @@ class ExtractedTextCache {
             public String getBytesRead() {
                 return IOUtils.humanReadableByteCount(totalBytesRead);
             }
+
+            @Override
+            public boolean isAlwaysUsePreExtractedCache() {
+                return alwaysUsePreExtractedCache;
+            }
         };
     }
 
@@ -176,6 +188,10 @@ class ExtractedTextCache {
         }
     }
 
+    boolean isAlwaysUsePreExtractedCache() {
+        return alwaysUsePreExtractedCache;
+    }
+
     //Taken from DocumentNodeStore and cache packages as they are private
     private static class EmpiricalWeigher implements Weigher<String, String> {
         public static final EmpiricalWeigher INSTANCE = new EmpiricalWeigher();

Modified: 
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexProviderService.java
URL: 
http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexProviderService.java?rev=1732278&r1=1732277&r2=1732278&view=diff
==============================================================================
--- 
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexProviderService.java
 (original)
+++ 
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexProviderService.java
 Thu Feb 25 10:37:50 2016
@@ -171,6 +171,15 @@ public class LuceneIndexProviderService
     )
     private static final String PROP_EXTRACTED_TEXT_CACHE_EXPIRY = 
"extractedTextCacheExpiryInSecs";
 
+    private static final boolean PROP_PRE_EXTRACTED_TEXT_ALWAYS_USE_DEFAULT = 
false;
+    @Property(
+            boolValue = PROP_PRE_EXTRACTED_TEXT_ALWAYS_USE_DEFAULT,
+            label = "Always use pre-extracted text cache",
+            description = "By default pre extracted text cache would only be 
used for reindex case. If this setting " +
+                    "is enabled then it would also be used in normal 
incremental indexing"
+    )
+    private static final String PROP_PRE_EXTRACTED_TEXT_ALWAYS_USE = 
"alwaysUsePreExtractedCache";
+
     private static final int PROP_BOOLEAN_CLAUSE_LIMIT_DEFAULT = 1024;
     @Property(
             intValue = PROP_BOOLEAN_CLAUSE_LIMIT_DEFAULT,
@@ -439,8 +448,10 @@ public class LuceneIndexProviderService
                 PROP_EXTRACTED_TEXT_CACHE_SIZE_DEFAULT);
         int cacheExpiryInSecs = 
PropertiesUtil.toInteger(config.get(PROP_EXTRACTED_TEXT_CACHE_EXPIRY),
                 PROP_EXTRACTED_TEXT_CACHE_EXPIRY_DEFAULT);
+        boolean alwaysUsePreExtractedCache = 
PropertiesUtil.toBoolean(config.get(PROP_PRE_EXTRACTED_TEXT_ALWAYS_USE),
+                PROP_PRE_EXTRACTED_TEXT_ALWAYS_USE_DEFAULT);
 
-        extractedTextCache = new ExtractedTextCache(cacheSizeInMB * ONE_MB, 
cacheExpiryInSecs);
+        extractedTextCache = new ExtractedTextCache(cacheSizeInMB * ONE_MB, 
cacheExpiryInSecs, alwaysUsePreExtractedCache);
         if (extractedTextProvider != null){
             registerExtractedTextProvider(extractedTextProvider);
         }
@@ -457,7 +468,10 @@ public class LuceneIndexProviderService
     private void registerExtractedTextProvider(PreExtractedTextProvider 
provider){
         if (extractedTextCache != null){
             if (provider != null){
-                log.info("Registering PreExtractedTextProvider {} with 
extracted text cache", provider);
+                String usage = 
extractedTextCache.isAlwaysUsePreExtractedCache() ?
+                        "always" : "only during reindexing phase";
+                log.info("Registering PreExtractedTextProvider {} with 
extracted text cache. " +
+                        "It would be used {}",  provider, usage);
             } else {
                 log.info("Unregistering PreExtractedTextProvider with 
extracted text cache");
             }

Modified: 
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/TextExtractionStatsMBean.java
URL: 
http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/TextExtractionStatsMBean.java?rev=1732278&r1=1732277&r2=1732278&view=diff
==============================================================================
--- 
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/TextExtractionStatsMBean.java
 (original)
+++ 
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/TextExtractionStatsMBean.java
 Thu Feb 25 10:37:50 2016
@@ -27,6 +27,8 @@ public interface TextExtractionStatsMBea
 
     boolean isPreExtractedTextProviderConfigured();
 
+    boolean isAlwaysUsePreExtractedCache();
+
     int getTextExtractionCount();
 
     long getTotalTime();

Modified: 
jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/ExtractedTextCacheTest.java
URL: 
http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/ExtractedTextCacheTest.java?rev=1732278&r1=1732277&r2=1732278&view=diff
==============================================================================
--- 
jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/ExtractedTextCacheTest.java
 (original)
+++ 
jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/ExtractedTextCacheTest.java
 Thu Feb 25 10:37:50 2016
@@ -22,12 +22,19 @@ package org.apache.jackrabbit.oak.plugin
 import org.apache.commons.io.FileUtils;
 import org.apache.jackrabbit.oak.api.Blob;
 import org.apache.jackrabbit.oak.plugins.index.fulltext.ExtractedText;
+import 
org.apache.jackrabbit.oak.plugins.index.fulltext.ExtractedText.ExtractionResult;
+import 
org.apache.jackrabbit.oak.plugins.index.fulltext.PreExtractedTextProvider;
 import org.apache.jackrabbit.oak.plugins.memory.ArrayBasedBlob;
 import org.junit.Test;
 
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertNotNull;
 import static org.junit.Assert.assertNull;
+import static org.mockito.Matchers.any;
+import static org.mockito.Matchers.anyString;
+import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.verifyZeroInteractions;
+import static org.mockito.Mockito.when;
 
 public class ExtractedTextCacheTest {
 
@@ -46,7 +53,7 @@ public class ExtractedTextCacheTest {
         String text = cache.get("/a", "foo", b, false);
         assertNull(text);
 
-        cache.put(b, new ExtractedText(ExtractedText.ExtractionResult.SUCCESS, 
"test hello"));
+        cache.put(b, new ExtractedText(ExtractionResult.SUCCESS, "test 
hello"));
 
         text = cache.get("/a", "foo", b, false);
         assertEquals("test hello", text);
@@ -60,7 +67,7 @@ public class ExtractedTextCacheTest {
         String text = cache.get("/a", "foo", b, false);
         assertNull(text);
 
-        cache.put(b, new ExtractedText(ExtractedText.ExtractionResult.SUCCESS, 
"test hello"));
+        cache.put(b, new ExtractedText(ExtractionResult.SUCCESS, "test 
hello"));
 
         text = cache.get("/a", "foo", b, false);
         assertNull(text);
@@ -74,12 +81,59 @@ public class ExtractedTextCacheTest {
         String text = cache.get("/a", "foo", b, false);
         assertNull(text);
 
-        cache.put(b, new ExtractedText(ExtractedText.ExtractionResult.ERROR, 
"test hello"));
+        cache.put(b, new ExtractedText(ExtractionResult.ERROR, "test hello"));
 
         text = cache.get("/a", "foo", b, false);
         assertNull(text);
     }
 
+    @Test
+    public void preExtractionNoReindexNoProvider() throws Exception{
+        ExtractedTextCache cache = new ExtractedTextCache(10 * 
FileUtils.ONE_MB, 100);
+
+        Blob b = new IdBlob("hello", "a");
+        String text = cache.get("/a", "foo", b, true);
+        assertNull(text);
+    }
+
+    @Test
+    public void preExtractionNoReindex() throws Exception{
+        ExtractedTextCache cache = new ExtractedTextCache(10 * 
FileUtils.ONE_MB, 100);
+        PreExtractedTextProvider provider = 
mock(PreExtractedTextProvider.class);
+
+        cache.setExtractedTextProvider(provider);
+        Blob b = new IdBlob("hello", "a");
+        String text = cache.get("/a", "foo", b, false);
+        assertNull(text);
+
+        verifyZeroInteractions(provider);
+    }
+
+    @Test
+    public void preExtractionReindex() throws Exception{
+        ExtractedTextCache cache = new ExtractedTextCache(10 * 
FileUtils.ONE_MB, 100);
+        PreExtractedTextProvider provider = 
mock(PreExtractedTextProvider.class);
+
+        cache.setExtractedTextProvider(provider);
+        when(provider.getText(anyString(), any(Blob.class)))
+                .thenReturn(new ExtractedText(ExtractionResult.SUCCESS, 
"bar"));
+        Blob b = new IdBlob("hello", "a");
+        String text = cache.get("/a", "foo", b, true);
+        assertEquals("bar", text);
+    }
+
+    @Test
+    public void preExtractionAlwaysUse() throws Exception{
+        ExtractedTextCache cache = new ExtractedTextCache(10 * 
FileUtils.ONE_MB, 100, true);
+        PreExtractedTextProvider provider = 
mock(PreExtractedTextProvider.class);
+
+        cache.setExtractedTextProvider(provider);
+        when(provider.getText(anyString(), any(Blob.class)))
+                .thenReturn(new ExtractedText(ExtractionResult.SUCCESS, 
"bar"));
+        Blob b = new IdBlob("hello", "a");
+        String text = cache.get("/a", "foo", b, false);
+        assertEquals("bar", text);
+    }
 
     private static class IdBlob extends ArrayBasedBlob {
         final String id;

Modified: 
jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexProviderServiceTest.java
URL: 
http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexProviderServiceTest.java?rev=1732278&r1=1732277&r2=1732278&view=diff
==============================================================================
--- 
jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexProviderServiceTest.java
 (original)
+++ 
jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexProviderServiceTest.java
 Thu Feb 25 10:37:50 2016
@@ -41,6 +41,7 @@ import org.junit.Test;
 import org.junit.rules.TemporaryFolder;
 
 import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
 import static org.junit.Assert.assertNotNull;
 import static org.junit.Assert.assertNull;
 import static org.junit.Assert.assertTrue;
@@ -158,6 +159,7 @@ public class LuceneIndexProviderServiceT
         LuceneIndexEditorProvider editorProvider =
                 (LuceneIndexEditorProvider) 
context.getService(IndexEditorProvider.class);
         
assertNull(editorProvider.getExtractedTextCache().getExtractedTextProvider());
+        
assertFalse(editorProvider.getExtractedTextCache().isAlwaysUsePreExtractedCache());
 
         //Mock OSGi does not support components
         //context.registerService(PreExtractedTextProvider.class, new 
DummyProvider());
@@ -176,6 +178,16 @@ public class LuceneIndexProviderServiceT
     }
 
     @Test
+    public void alwaysUsePreExtractedCache() throws Exception{
+        Map<String,Object> config = getDefaultConfig();
+        config.put("alwaysUsePreExtractedCache", "true");
+        MockOsgi.activate(service, context.bundleContext(), config);
+        LuceneIndexEditorProvider editorProvider =
+                (LuceneIndexEditorProvider) 
context.getService(IndexEditorProvider.class);
+        
assertTrue(editorProvider.getExtractedTextCache().isAlwaysUsePreExtractedCache());
+    }
+
+    @Test
     public void booleanQuerySize() throws Exception{
         Map<String,Object> config = getDefaultConfig();
         config.put("booleanClauseLimit", 4000);


Reply via email to