This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-4256
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 9092e6c450e80749086e901b2b758726fa92110f
Author: tallison <[email protected]>
AuthorDate: Fri May 17 10:16:42 2024 -0400

    TIKA-4256 -- allow inlining of ocr'd content
---
 .../tika/extractor/ParentContentHandler.java       | 36 +++++++++
 .../apache/tika/parser/RecursiveParserWrapper.java | 94 +++++++++++++++-------
 .../apache/tika/parser/ocr/TesseractOCRConfig.java | 10 +++
 .../apache/tika/parser/ocr/TesseractOCRParser.java | 36 ++++++++-
 .../tika/parser/ocr/TesseractOCRParserTest.java    | 17 ++++
 5 files changed, 162 insertions(+), 31 deletions(-)

diff --git a/tika-core/src/main/java/org/apache/tika/extractor/ParentContentHandler.java b/tika-core/src/main/java/org/apache/tika/extractor/ParentContentHandler.java
new file mode 100644
index 000000000..83220f0d1
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/extractor/ParentContentHandler.java
@@ -0,0 +1,36 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.extractor;
+
+import org.xml.sax.ContentHandler;
+
+/**
+ * Simple pointer class to allow parsers to pass on the parent contenthandler through
+ * to the embedded document's parse
+ */
+public class ParentContentHandler {
+
+    private final ContentHandler contentHandler;
+
+    public ParentContentHandler(ContentHandler contentHandler) {
+        this.contentHandler = contentHandler;
+    }
+
+    public ContentHandler getContentHandler() {
+        return contentHandler;
+    }
+}
diff --git a/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java b/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
index 3cb78d520..629b289ae 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
@@ -19,7 +19,9 @@ package org.apache.tika.parser;
 import java.io.IOException;
 import java.io.InputStream;
 import java.util.Set;
+import java.util.concurrent.atomic.AtomicInteger;
 
+import org.apache.commons.io.input.CloseShieldInputStream;
 import org.xml.sax.Attributes;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
@@ -29,6 +31,7 @@ import org.apache.tika.exception.EncryptedDocumentException;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.exception.WriteLimitReachedException;
 import org.apache.tika.exception.ZeroByteFileException;
+import org.apache.tika.extractor.ParentContentHandler;
 import org.apache.tika.io.FilenameUtils;
 import org.apache.tika.io.TemporaryResources;
 import org.apache.tika.io.TikaInputStream;
@@ -82,6 +85,7 @@ public class RecursiveParserWrapper extends ParserDecorator {
 
     private final boolean catchEmbeddedExceptions;
 
+    private final boolean inlineContent = false;
     /**
      * Initialize the wrapper with {@link #catchEmbeddedExceptions} set
      * to <code>true</code> as default.
@@ -158,7 +162,7 @@ public class RecursiveParserWrapper extends ParserDecorator {
         try {
             TikaInputStream tis = TikaInputStream.get(stream, tmp, metadata);
             RecursivelySecureContentHandler secureContentHandler =
-                    new RecursivelySecureContentHandler(localHandler, tis, 
writeLimit,
+                    new RecursivelySecureContentHandler(localHandler, tis, new 
SecureHandlerCounter(writeLimit),
                             throwOnWriteLimitReached, context);
             context.set(RecursivelySecureContentHandler.class, 
secureContentHandler);
             getWrappedParser().parse(tis, secureContentHandler, metadata, 
context);
@@ -179,6 +183,7 @@ public class RecursiveParserWrapper extends ParserDecorator {
             metadata.set(TikaCoreProperties.PARSE_TIME_MILLIS, 
Long.toString(elapsedMillis));
             
parserState.recursiveParserWrapperHandler.endDocument(localHandler, metadata);
             parserState.recursiveParserWrapperHandler.endDocument();
+            context.set(RecursivelySecureContentHandler.class, null);
         }
     }
 
@@ -250,12 +255,21 @@ public class RecursiveParserWrapper extends ParserDecorator {
                     new EmbeddedParserDecorator(getWrappedParser(), 
objectLocation,
                             idPath, parserState));
             long started = System.currentTimeMillis();
-            RecursivelySecureContentHandler secureContentHandler =
-                    context.get(RecursivelySecureContentHandler.class);
             //store the handler that was used before this parse
             //so that you can return it back to its state at the end of this parse
-            ContentHandler preContextHandler = secureContentHandler.handler;
-            secureContentHandler.updateContentHandler(localHandler);
+            RecursivelySecureContentHandler preParseHandler = 
context.get(RecursivelySecureContentHandler.class);
+
+            ParentContentHandler preParseParentHandler = 
context.get(ParentContentHandler.class);
+            context.set(ParentContentHandler.class, new 
ParentContentHandler(preParseHandler));
+            TemporaryResources tmp = null;
+            TikaInputStream tis = TikaInputStream.cast(stream);
+            if (tis == null) {
+                tmp = new TemporaryResources();
+                tis = TikaInputStream.get(CloseShieldInputStream.wrap(stream), 
tmp, metadata);
+            }
+            ContentHandler secureContentHandler =
+                    new RecursivelySecureContentHandler(localHandler, tis, 
preParseHandler.handlerCounter,
+                    preParseHandler.throwOnWriteLimitReached, context);
 
             try {
                 super.parse(stream, secureContentHandler, metadata, context);
@@ -286,11 +300,15 @@ public class RecursiveParserWrapper extends ParserDecorator {
                 }
             } finally {
                 context.set(Parser.class, preContextParser);
-                secureContentHandler.updateContentHandler(preContextHandler);
+                context.set(RecursivelySecureContentHandler.class, 
preParseHandler);
+                context.set(ParentContentHandler.class, preParseParentHandler);
                 long elapsedMillis = System.currentTimeMillis() - started;
                 metadata.set(TikaCoreProperties.PARSE_TIME_MILLIS, 
Long.toString(elapsedMillis));
                 parserState.recursiveParserWrapperHandler
                         .endEmbeddedDocument(localHandler, metadata);
+                if (tmp != null) {
+                    tis.close();
+                }
             }
         }
     }
@@ -308,35 +326,51 @@ public class RecursiveParserWrapper extends ParserDecorator {
         }
     }
 
-    static class RecursivelySecureContentHandler extends SecureContentHandler {
-        private ContentHandler handler;
-
-        //total allowable chars across all handlers
+    static class SecureHandlerCounter {
         private final int totalWriteLimit;
+        private boolean writeLimitReached = false;
+        //total chars written to all handlers
+        private int totalChars = 0;
+
+        private SecureHandlerCounter(int totalWriteLimit) {
+            this.totalWriteLimit = totalWriteLimit;
+        }
+        /**
+         * Given the requested length, how many characters are actually available
+         * @param length
+         * @return
+         */
+        int getAvailable(int length) {
+            return Math.min(totalWriteLimit - totalChars, length);
+        }
+        void addChars(int numChars) {
+            totalChars += numChars;
+        }
+
+    }
+
+    //
+    static class RecursivelySecureContentHandler extends SecureContentHandler {
+        private static AtomicInteger COUNTER = new AtomicInteger();
+        private final ContentHandler handler;
+        private final SecureHandlerCounter handlerCounter;
 
         private final boolean throwOnWriteLimitReached;
 
         private final ParseContext parseContext;
 
-        private boolean writeLimitReached = false;
+        private final int id = COUNTER.getAndIncrement();
 
-        //total chars written to all handlers
-        private int totalChars = 0;
         public RecursivelySecureContentHandler(ContentHandler handler, 
TikaInputStream stream,
-                                               int totalWriteLimit,
+                                               SecureHandlerCounter 
handlerCounter,
                                                boolean 
throwOnWriteLimitReached, ParseContext parseContext) {
             super(handler, stream);
             this.handler = handler;
-            this.totalWriteLimit = totalWriteLimit;
+            this.handlerCounter = handlerCounter;
             this.throwOnWriteLimitReached = throwOnWriteLimitReached;
             this.parseContext = parseContext;
         }
 
-        public void updateContentHandler(ContentHandler handler) {
-            setContentHandler(handler);
-            this.handler = handler;
-        }
-
         /**
          * Bypass the SecureContentHandler...
          * <p>
@@ -364,17 +398,17 @@ public class RecursiveParserWrapper extends ParserDecorator {
 
         @Override
         public void characters(char[] ch, int start, int length) throws 
SAXException {
-            if (writeLimitReached) {
+            if (handlerCounter.writeLimitReached) {
                 return;
             }
 
-            if (totalWriteLimit < 0) {
+            if (handlerCounter.totalWriteLimit < 0) {
                 super.characters(ch, start, length);
                 return;
             }
-            int availableLength = Math.min(totalWriteLimit - totalChars, 
length);
+            int availableLength = handlerCounter.getAvailable(length);
             super.characters(ch, start, availableLength);
-            totalChars += availableLength;
+            handlerCounter.addChars(availableLength);
             if (availableLength < length) {
                 handleWriteLimitReached();
             }
@@ -382,27 +416,27 @@ public class RecursiveParserWrapper extends ParserDecorator {
 
         @Override
         public void ignorableWhitespace(char[] ch, int start, int length) 
throws SAXException {
-            if (writeLimitReached) {
+            if (handlerCounter.writeLimitReached) {
                 return;
             }
 
-            if (totalWriteLimit < 0) {
+            if (handlerCounter.totalWriteLimit < 0) {
                 super.ignorableWhitespace(ch, start, length);
                 return;
             }
-            int availableLength = Math.min(totalWriteLimit - totalChars, 
length);
+            int availableLength = handlerCounter.getAvailable(length);
             super.ignorableWhitespace(ch, start, availableLength);
-            totalChars += availableLength;
+            handlerCounter.addChars(availableLength);
             if (availableLength < length) {
                 handleWriteLimitReached();
             }
         }
 
         private void handleWriteLimitReached() throws 
WriteLimitReachedException {
-            writeLimitReached = true;
+            handlerCounter.writeLimitReached = true;
 
             if (throwOnWriteLimitReached) {
-                throw new WriteLimitReachedException(totalWriteLimit);
+                throw new 
WriteLimitReachedException(handlerCounter.totalWriteLimit);
             } else {
                 ParseRecord parseRecord = parseContext.get(ParseRecord.class);
                 if (parseRecord != null) {
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
index 18359735f..ec021643c 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
@@ -96,6 +96,7 @@ public class TesseractOCRConfig implements Serializable {
     // See addOtherTesseractConfig.
     private Map<String, String> otherTesseractConfig = new HashMap<>();
     private Set<String> userConfigured = new HashSet<>();
+    private boolean inlineContent = false;
 
     /**
      * This takes a language string, parses it and then bins individual langs into
@@ -477,6 +478,15 @@ public class TesseractOCRConfig implements Serializable {
         return this.applyRotation;
     }
 
+    public void setInlineContent(boolean inlineContent) {
+        this.inlineContent = inlineContent;
+        userConfigured.add("inlineContent");
+    }
+
+    public boolean isInlineContent() {
+        return inlineContent;
+    }
+
     /**
      * Sets whether or not a rotation value should be calculated and passed to ImageMagick.
      *
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
index a28ae8951..8012a00f9 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
@@ -65,14 +65,19 @@ import org.apache.tika.config.Param;
 import org.apache.tika.config.TikaTaskTimeout;
 import org.apache.tika.exception.TikaConfigException;
 import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.ParentContentHandler;
 import org.apache.tika.io.TemporaryResources;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.Property;
+import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.AbstractExternalProcessParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.external.ExternalParser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.EmbeddedContentHandler;
+import org.apache.tika.sax.TeeContentHandler;
 import org.apache.tika.sax.XHTMLContentHandler;
 import org.apache.tika.utils.StringUtils;
 import org.apache.tika.utils.XMLReaderUtils;
@@ -265,13 +270,33 @@ public class TesseractOCRParser extends AbstractExternalProcessParser implements
             //this is the text output file name specified on the tesseract
             //commandline.  The actual output file name will have a suffix added.
             File tmpOCROutputFile = tmp.createTemporaryFile();
-            XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, 
metadata);
+            ContentHandler baseHandler = 
getContentHandler(config.isInlineContent(), handler, metadata, parseContext);
+            XHTMLContentHandler xhtml = new XHTMLContentHandler(baseHandler, 
metadata);
             xhtml.startDocument();
             parse(tikaStream, tmpOCROutputFile, xhtml, metadata, parseContext, 
config);
             xhtml.endDocument();
         }
     }
 
+    private ContentHandler getContentHandler(boolean isInlineContent,
+                                             ContentHandler handler, Metadata 
metadata, ParseContext parseContext) {
+        if (! isInlineContent) {
+            return handler;
+        }
+        //check for inlining of the parent content handler
+        //if there's no parent, skip
+        ParentContentHandler parentContentHandler = 
parseContext.get(ParentContentHandler.class);
+        if (parentContentHandler == null) {
+            return handler;
+        }
+        String embeddedType = 
metadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE);
+        if (! 
TikaCoreProperties.EmbeddedResourceType.INLINE.name().equals(embeddedType)) {
+            return handler;
+        }
+        //check for literally the same or wrapped parent and handler?
+        return new TeeContentHandler(new EmbeddedContentHandler(new 
BodyContentHandler(parentContentHandler.getContentHandler())), handler);
+    }
+
     private void parse(TikaInputStream tikaInputStream, File tmpOCROutputFile,
                        ContentHandler xhtml,
                        Metadata metadata, ParseContext parseContext, 
TesseractOCRConfig config)
@@ -824,6 +849,15 @@ public class TesseractOCRParser extends AbstractExternalProcessParser implements
     public boolean isApplyRotation() {
         return defaultConfig.isApplyRotation();
     }
+
+    @Field
+    public void setInlineContent(boolean inlineContent) {
+        defaultConfig.setInlineContent(inlineContent);
+    }
+
+    public boolean isInlineContent() {
+        return defaultConfig.isInlineContent();
+    }
     /**
      * If set to <code>true</code> and if tesseract is found, this will load the
      * langs that result from --list-langs. At parse time, the
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
index 6ce19e3dd..bf056fe7f 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
@@ -248,6 +248,23 @@ public class TesseractOCRParserTest extends TikaTest {
         assertEquals("72 dots per inch", m.get("Exif IFD0:Y Resolution"));
     }
 
+    @Test
+    public void testInlining() throws Exception {
+        TesseractOCRConfig config = new TesseractOCRConfig();
+        config.setInlineContent(true);
+        ParseContext context = new ParseContext();
+        context.set(TesseractOCRConfig.class, config);
+        List<Metadata> metadataList = getRecursiveMetadata("testOCR.pptx", 
context);
+        debug(metadataList);
+        //0 is main doc, 1 is embedded image, 2 is thumbnail
+        assertEquals(3, metadataList.size());
+        assertContains("This is some text", 
metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT));
+        assertNotContained("This is some text", 
metadataList.get(1).get(TikaCoreProperties.TIKA_CONTENT));
+        assertNotContained("This is some text", 
metadataList.get(2).get(TikaCoreProperties.TIKA_CONTENT));
+
+        assertContains("Happy New Year 2003", 
metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT));
+        assertContains("Happy New Year 2003", 
metadataList.get(1).get(TikaCoreProperties.TIKA_CONTENT));
+    }
     //TODO: add unit tests for jp2/jpx/ppm TIKA-2174
 
 }

Reply via email to