External Parser now have consumer for ignored lines,  Fix TIKA-2002

Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/9b5dc7fa
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/9b5dc7fa
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/9b5dc7fa

Branch: refs/heads/master
Commit: 9b5dc7fae4456b12b75ec21d050b9439e6527c47
Parents: e48d191
Author: Thamme Gowda <[email protected]>
Authored: Sat Jun 11 19:04:08 2016 -0700
Committer: Thamme Gowda <[email protected]>
Committed: Sat Jun 11 19:04:08 2016 -0700

----------------------------------------------------------------------
 .../tika/parser/external/ExternalParser.java    | 85 +++++++++++++++++---
 1 file changed, 76 insertions(+), 9 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/9b5dc7fa/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParser.java
----------------------------------------------------------------------
diff --git a/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParser.java b/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParser.java
index ab50f01..0ec8eb6 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParser.java
@@ -51,6 +51,29 @@ import static java.nio.charset.StandardCharsets.UTF_8;
  *  text content and metadata from a given document.
  */
 public class ExternalParser extends AbstractParser {
+
+    /**
+     * Consumer contract
+     * @since Apache Tika 1.14
+     */
+    public interface LineConsumer {
+        /**
+         * Consume a line
+         * @param line a line of string
+         */
+        void consume(String line);
+
+        /**
+         * A null consumer
+         */
+        LineConsumer NULL = new LineConsumer() {
+            @Override
+            public void consume(String line) {
+               //ignore
+            }
+        };
+    }
+
     private static final long serialVersionUID = -1079128990650687037L;
     
     /**
@@ -83,6 +106,11 @@ public class ExternalParser extends AbstractParser {
      */
     private String[] command = new String[] { "cat" };
 
+    /**
+     * A consumer for ignored Lines
+     */
+    private LineConsumer ignoredLineConsumer = LineConsumer.NULL;
+
     public Set<MediaType> getSupportedTypes(ParseContext context) {
         return getSupportedTypes();
     }
@@ -110,8 +138,23 @@ public class ExternalParser extends AbstractParser {
     public void setCommand(String... command) {
         this.command = command;
     }
-    
-    
+
+    /**
+     * Gets lines consumer
+     * @return consumer instance
+     */
+    public LineConsumer getIgnoredLineConsumer() {
+        return ignoredLineConsumer;
+    }
+
+    /**
+     * Set a consumer for the lines ignored by the parse functions
+     * @param ignoredLineConsumer consumer instance
+     */
+    public void setIgnoredLineConsumer(LineConsumer ignoredLineConsumer) {
+        this.ignoredLineConsumer = ignoredLineConsumer;
+    }
+
     public Map<Pattern,String> getMetadataExtractionPatterns() {
        return metadataPatterns;
     }
@@ -283,14 +326,27 @@ public class ExternalParser extends AbstractParser {
         catch(InterruptedException ignore){}        
     }
 
+
     /**
      * Starts a thread that reads and discards the contents of the
      * standard stream of the given process. Potential exceptions
      * are ignored, and the stream is closed once fully processed.
-     *
-     * @param process process
+     * Note: calling this starts a new thread and blocks the current(caller) thread until the new thread dies
+     * @param stream stream to be ignored
      */
-    private void ignoreStream(final InputStream stream) {
+    private static void ignoreStream(final InputStream stream) {
+        ignoreStream(stream, true);
+    }
+
+    /**
+     * Starts a thread that reads and discards the contents of the
+     * standard stream of the given process. Potential exceptions
+     * are ignored, and the stream is closed once fully processed.
+     * @param stream stream to sent to black hole (a k a null)
+     * @param waitForDeath when {@code true} the caller thread will be blocked till the death of new thread.
+     * @return The thread that is created and started
+     */
+    private static Thread ignoreStream(final InputStream stream, boolean waitForDeath) {
         Thread t = new Thread() {
             public void run() {
                 try {
@@ -302,10 +358,12 @@ public class ExternalParser extends AbstractParser {
             }
         };
         t.start();
-        try{
-          t.join();
+        if (waitForDeath) {
+            try {
+                t.join();
+            } catch (InterruptedException ignore) {}
         }
-        catch(InterruptedException ignore){}
+        return t;
     }
     
     private void extractMetadata(final InputStream stream, final Metadata metadata) {
@@ -316,9 +374,11 @@ public class ExternalParser extends AbstractParser {
              try {
                 String line;
                 while ( (line = reader.readLine()) != null ) {
+                    boolean consumed = false;
                    for(Pattern p : metadataPatterns.keySet()) {
                       Matcher m = p.matcher(line);
                       if(m.find()) {
+                          consumed = true;
                         if (metadataPatterns.get(p) != null && 
                                         !metadataPatterns.get(p).equals("")){
                                    metadata.add( metadataPatterns.get(p), m.group(1) );
@@ -328,6 +388,9 @@ public class ExternalParser extends AbstractParser {
                         }
                       }
                    }
+                    if (!consumed) {
+                        ignoredLineConsumer.consume(line);
+                    }
                 }
              } catch (IOException e) {
                  // Ignore
@@ -363,7 +426,11 @@ public class ExternalParser extends AbstractParser {
        
        try {
           Process process= Runtime.getRuntime().exec(checkCmd);
-          int result = process.waitFor(); 
+          Thread stdErrSuckerThread = ignoreStream(process.getErrorStream(), false);
+          Thread stdOutSuckerThread = ignoreStream(process.getInputStream(), false);
+          stdErrSuckerThread.join();
+          stdOutSuckerThread.join();
+          int result = process.waitFor();
           for(int err : errorValue) {
              if(result == err) return false;
           }

Reply via email to