Author: chetanm
Date: Tue May 19 05:43:22 2015
New Revision: 1680177

URL: http://svn.apache.org/r1680177
Log:
OAK-2865 - Log stats around time spent in extracting text from binaries

Merging 1679144

Modified:
    jackrabbit/oak/branches/1.2/   (props changed)
    
jackrabbit/oak/branches/1.2/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexEditor.java
    
jackrabbit/oak/branches/1.2/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexEditorContext.java

Propchange: jackrabbit/oak/branches/1.2/
------------------------------------------------------------------------------
--- svn:mergeinfo (original)
+++ svn:mergeinfo Tue May 19 05:43:22 2015
@@ -1,3 +1,3 @@
 /jackrabbit/oak/branches/1.0:1665962
-/jackrabbit/oak/trunk:1672350,1672468,1672537,1672603,1672642,1672644,1672834-1672835,1673351,1673410,1673414,1673436,1673644,1673662-1673664,1673669,1673695,1674046,1674065,1674075,1674107,1674228,1674880,1675055,1675332,1675354,1675357,1675593,1676198,1676237,1676407,1676458,1676539,1676670,1676725,1677579,1677581,1677609,1677611,1677939,1677991,1678173,1678323,1678758,1678938,1678954,1679165,1679191,1679235
+/jackrabbit/oak/trunk:1672350,1672468,1672537,1672603,1672642,1672644,1672834-1672835,1673351,1673410,1673414,1673436,1673644,1673662-1673664,1673669,1673695,1674046,1674065,1674075,1674107,1674228,1674880,1675055,1675332,1675354,1675357,1675593,1676198,1676237,1676407,1676458,1676539,1676670,1676725,1677579,1677581,1677609,1677611,1677939,1677991,1678173,1678323,1678758,1678938,1678954,1679144,1679165,1679191,1679235
 /jackrabbit/trunk:1345480

Modified: 
jackrabbit/oak/branches/1.2/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexEditor.java
URL: 
http://svn.apache.org/viewvc/jackrabbit/oak/branches/1.2/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexEditor.java?rev=1680177&r1=1680176&r2=1680177&view=diff
==============================================================================
--- 
jackrabbit/oak/branches/1.2/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexEditor.java
 (original)
+++ 
jackrabbit/oak/branches/1.2/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexEditor.java
 Tue May 19 05:43:22 2015
@@ -28,7 +28,6 @@ import static org.apache.jackrabbit.oak.
 import static 
org.apache.jackrabbit.oak.plugins.index.lucene.util.ConfigUtil.getPrimaryTypeName;
 
 import java.io.IOException;
-import java.io.InputStream;
 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.List;
@@ -39,6 +38,7 @@ import com.google.common.collect.Iterabl
 import com.google.common.collect.Lists;
 
 import com.google.common.collect.Sets;
+import com.google.common.io.CountingInputStream;
 import org.apache.jackrabbit.JcrConstants;
 import org.apache.jackrabbit.oak.api.Blob;
 import org.apache.jackrabbit.oak.api.CommitFailedException;
@@ -800,11 +800,14 @@ public class LuceneIndexEditor implement
 
     private String parseStringValue(Blob v, Metadata metadata, String path) {
         WriteOutContentHandler handler = new 
WriteOutContentHandler(context.getDefinition().getMaxExtractLength());
+        long start = System.currentTimeMillis();
+        long size = 0;
         try {
-            InputStream stream = v.getNewStream();
+            CountingInputStream stream = new 
CountingInputStream(v.getNewStream());
             try {
                 context.getParser().parse(stream, handler, metadata, new 
ParseContext());
             } finally {
+                size = stream.getCount();
                 stream.close();
             }
         } catch (LinkageError e) {
@@ -824,7 +827,9 @@ public class LuceneIndexEditor implement
                 return "TextExtractionError";
             }
         }
-        return handler.toString();
+        String result = handler.toString();
+        context.recordTextExtractionStats(System.currentTimeMillis() - start, 
size);
+        return result;
     }
 
 }

Modified: 
jackrabbit/oak/branches/1.2/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexEditorContext.java
URL: 
http://svn.apache.org/viewvc/jackrabbit/oak/branches/1.2/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexEditorContext.java?rev=1680177&r1=1680176&r2=1680177&view=diff
==============================================================================
--- 
jackrabbit/oak/branches/1.2/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexEditorContext.java
 (original)
+++ 
jackrabbit/oak/branches/1.2/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexEditorContext.java
 Tue May 19 05:43:22 2015
@@ -16,6 +16,7 @@
  */
 package org.apache.jackrabbit.oak.plugins.index.lucene;
 
+import static org.apache.jackrabbit.oak.commons.IOUtils.humanReadableByteCount;
 import static 
org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexConstants.INDEX_DATA_CHILD_NAME;
 import static 
org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexConstants.PERSISTENCE_PATH;
 import static 
org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexConstants.VERSION;
@@ -27,6 +28,7 @@ import java.io.InputStream;
 import java.net.URL;
 import java.util.Calendar;
 import java.util.Set;
+import java.util.concurrent.TimeUnit;
 
 import org.apache.commons.io.IOUtils;
 import org.apache.jackrabbit.oak.api.CommitFailedException;
@@ -120,6 +122,8 @@ public class LuceneIndexEditorContext {
 
     private Directory directory;
 
+    private final TextExtractionStats textExtractionStats = new 
TextExtractionStats();
+
     /**
      * The media types supported by the parser used.
      */
@@ -180,6 +184,8 @@ public class LuceneIndexEditorContext {
             status.setProperty("lastUpdated", 
ISO8601.format(Calendar.getInstance()), Type.DATE);
             status.setProperty("indexedNodes",indexedNodes);
             PERF_LOGGER.end(start, -1, "Closed IndexWriter for directory {}", 
definition);
+
+            textExtractionStats.log(reindex);
         }
     }
 
@@ -249,6 +255,10 @@ public class LuceneIndexEditorContext {
         return definition;
     }
 
+    public void recordTextExtractionStats(long timeInMillis, long size) {
+        textExtractionStats.addStats(timeInMillis, size);
+    }
+
     private static Parser initializeTikaParser(IndexDefinition definition) {
         ClassLoader current = Thread.currentThread().getContextClassLoader();
         try {
@@ -295,4 +305,50 @@ public class LuceneIndexEditorContext {
         }
         return TikaConfig.getDefaultConfig();
     }
+
+    static class TextExtractionStats {
+        /**
+         * Log stats only if time spent is more than 2 min
+         */
+        private static final long LOGGING_THRESHOLD = 
TimeUnit.MINUTES.toMillis(2);
+        private int count;
+        private long totalSize;
+        private long totalTime;
+
+        public void addStats(long timeInMillis, long size) {
+            count++;
+            totalSize += size;
+            totalTime += timeInMillis;
+        }
+
+        public void log(boolean reindex) {
+            if (log.isDebugEnabled()) {
+                log.debug("Text extraction stats {}", this);
+            } else if (anyParsingDone() && (reindex || isTakingLotsOfTime())) {
+                log.info("Text extraction stats {}", this);
+            }
+        }
+
+        private boolean isTakingLotsOfTime() {
+            return totalTime > LOGGING_THRESHOLD;
+        }
+
+        private boolean anyParsingDone() {
+            return count > 0;
+        }
+
+        @Override
+        public String toString() {
+            return String.format(" %d (%s, %s)", count,
+                    timeInWords(totalTime), humanReadableByteCount(totalSize));
+        }
+
+        private static String timeInWords(long millis) {
+            return String.format("%d min, %d sec",
+                    TimeUnit.MILLISECONDS.toMinutes(millis),
+                    TimeUnit.MILLISECONDS.toSeconds(millis) -
+                            
TimeUnit.MINUTES.toSeconds(TimeUnit.MILLISECONDS.toMinutes(millis))
+            );
+        }
+    }
 }


Reply via email to