This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 7a03331f8 TIKA-4256 -- allow inlining of ocr'd content in the
RecursiveParserWrapper (#1762)
7a03331f8 is described below
commit 7a03331f87e44548b30970b66d24a81823bc68ab
Author: Tim Allison <[email protected]>
AuthorDate: Mon May 20 09:03:52 2024 -0400
TIKA-4256 -- allow inlining of ocr'd content in the RecursiveParserWrapper
(#1762)
* TIKA-4256 -- allow inlining of ocr'd content
---
.../tika/extractor/ParentContentHandler.java | 36 +++++++++
.../apache/tika/parser/RecursiveParserWrapper.java | 94 +++++++++++++++-------
.../apache/tika/parser/ocr/TesseractOCRConfig.java | 10 +++
.../apache/tika/parser/ocr/TesseractOCRParser.java | 36 ++++++++-
.../tika/parser/ocr/TesseractOCRParserTest.java | 18 +++++
5 files changed, 163 insertions(+), 31 deletions(-)
diff --git
a/tika-core/src/main/java/org/apache/tika/extractor/ParentContentHandler.java
b/tika-core/src/main/java/org/apache/tika/extractor/ParentContentHandler.java
new file mode 100644
index 000000000..83220f0d1
--- /dev/null
+++
b/tika-core/src/main/java/org/apache/tika/extractor/ParentContentHandler.java
@@ -0,0 +1,36 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.extractor;
+
+import org.xml.sax.ContentHandler;
+
+/**
+ * Simple pointer class to allow parsers to pass on the parent contenthandler through
+ * to the embedded document's parse
+ */
+public class ParentContentHandler {
+
+ private final ContentHandler contentHandler;
+
+ public ParentContentHandler(ContentHandler contentHandler) {
+ this.contentHandler = contentHandler;
+ }
+
+ public ContentHandler getContentHandler() {
+ return contentHandler;
+ }
+}
diff --git
a/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
b/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
index 3cb78d520..629b289ae 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
@@ -19,7 +19,9 @@ package org.apache.tika.parser;
import java.io.IOException;
import java.io.InputStream;
import java.util.Set;
+import java.util.concurrent.atomic.AtomicInteger;
+import org.apache.commons.io.input.CloseShieldInputStream;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
@@ -29,6 +31,7 @@ import org.apache.tika.exception.EncryptedDocumentException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.exception.WriteLimitReachedException;
import org.apache.tika.exception.ZeroByteFileException;
+import org.apache.tika.extractor.ParentContentHandler;
import org.apache.tika.io.FilenameUtils;
import org.apache.tika.io.TemporaryResources;
import org.apache.tika.io.TikaInputStream;
@@ -82,6 +85,7 @@ public class RecursiveParserWrapper extends ParserDecorator {
private final boolean catchEmbeddedExceptions;
+ private final boolean inlineContent = false;
/**
* Initialize the wrapper with {@link #catchEmbeddedExceptions} set
* to <code>true</code> as default.
@@ -158,7 +162,7 @@ public class RecursiveParserWrapper extends ParserDecorator
{
try {
TikaInputStream tis = TikaInputStream.get(stream, tmp, metadata);
RecursivelySecureContentHandler secureContentHandler =
- new RecursivelySecureContentHandler(localHandler, tis,
writeLimit,
+ new RecursivelySecureContentHandler(localHandler, tis, new
SecureHandlerCounter(writeLimit),
throwOnWriteLimitReached, context);
context.set(RecursivelySecureContentHandler.class,
secureContentHandler);
getWrappedParser().parse(tis, secureContentHandler, metadata,
context);
@@ -179,6 +183,7 @@ public class RecursiveParserWrapper extends ParserDecorator
{
metadata.set(TikaCoreProperties.PARSE_TIME_MILLIS,
Long.toString(elapsedMillis));
parserState.recursiveParserWrapperHandler.endDocument(localHandler, metadata);
parserState.recursiveParserWrapperHandler.endDocument();
+ context.set(RecursivelySecureContentHandler.class, null);
}
}
@@ -250,12 +255,21 @@ public class RecursiveParserWrapper extends
ParserDecorator {
new EmbeddedParserDecorator(getWrappedParser(),
objectLocation,
idPath, parserState));
long started = System.currentTimeMillis();
- RecursivelySecureContentHandler secureContentHandler =
- context.get(RecursivelySecureContentHandler.class);
//store the handler that was used before this parse
//so that you can return it back to its state at the end of this parse
- ContentHandler preContextHandler = secureContentHandler.handler;
- secureContentHandler.updateContentHandler(localHandler);
+ RecursivelySecureContentHandler preParseHandler =
context.get(RecursivelySecureContentHandler.class);
+
+ ParentContentHandler preParseParentHandler =
context.get(ParentContentHandler.class);
+ context.set(ParentContentHandler.class, new
ParentContentHandler(preParseHandler));
+ TemporaryResources tmp = null;
+ TikaInputStream tis = TikaInputStream.cast(stream);
+ if (tis == null) {
+ tmp = new TemporaryResources();
+ tis = TikaInputStream.get(CloseShieldInputStream.wrap(stream),
tmp, metadata);
+ }
+ ContentHandler secureContentHandler =
+ new RecursivelySecureContentHandler(localHandler, tis,
preParseHandler.handlerCounter,
+ preParseHandler.throwOnWriteLimitReached, context);
try {
super.parse(stream, secureContentHandler, metadata, context);
@@ -286,11 +300,15 @@ public class RecursiveParserWrapper extends
ParserDecorator {
}
} finally {
context.set(Parser.class, preContextParser);
- secureContentHandler.updateContentHandler(preContextHandler);
+ context.set(RecursivelySecureContentHandler.class,
preParseHandler);
+ context.set(ParentContentHandler.class, preParseParentHandler);
long elapsedMillis = System.currentTimeMillis() - started;
metadata.set(TikaCoreProperties.PARSE_TIME_MILLIS,
Long.toString(elapsedMillis));
parserState.recursiveParserWrapperHandler
.endEmbeddedDocument(localHandler, metadata);
+ if (tmp != null) {
+ tis.close();
+ }
}
}
}
@@ -308,35 +326,51 @@ public class RecursiveParserWrapper extends
ParserDecorator {
}
}
- static class RecursivelySecureContentHandler extends SecureContentHandler {
- private ContentHandler handler;
-
- //total allowable chars across all handlers
+ static class SecureHandlerCounter {
private final int totalWriteLimit;
+ private boolean writeLimitReached = false;
+ //total chars written to all handlers
+ private int totalChars = 0;
+
+ private SecureHandlerCounter(int totalWriteLimit) {
+ this.totalWriteLimit = totalWriteLimit;
+ }
+ /**
+ * Given the requested length, how many characters are actually available
+ * @param length
+ * @return
+ */
+ int getAvailable(int length) {
+ return Math.min(totalWriteLimit - totalChars, length);
+ }
+ void addChars(int numChars) {
+ totalChars += numChars;
+ }
+
+ }
+
+ //
+ static class RecursivelySecureContentHandler extends SecureContentHandler {
+ private static AtomicInteger COUNTER = new AtomicInteger();
+ private final ContentHandler handler;
+ private final SecureHandlerCounter handlerCounter;
private final boolean throwOnWriteLimitReached;
private final ParseContext parseContext;
- private boolean writeLimitReached = false;
+ private final int id = COUNTER.getAndIncrement();
- //total chars written to all handlers
- private int totalChars = 0;
public RecursivelySecureContentHandler(ContentHandler handler,
TikaInputStream stream,
- int totalWriteLimit,
+ SecureHandlerCounter
handlerCounter,
boolean
throwOnWriteLimitReached, ParseContext parseContext) {
super(handler, stream);
this.handler = handler;
- this.totalWriteLimit = totalWriteLimit;
+ this.handlerCounter = handlerCounter;
this.throwOnWriteLimitReached = throwOnWriteLimitReached;
this.parseContext = parseContext;
}
- public void updateContentHandler(ContentHandler handler) {
- setContentHandler(handler);
- this.handler = handler;
- }
-
/**
* Bypass the SecureContentHandler...
* <p>
@@ -364,17 +398,17 @@ public class RecursiveParserWrapper extends
ParserDecorator {
@Override
public void characters(char[] ch, int start, int length) throws
SAXException {
- if (writeLimitReached) {
+ if (handlerCounter.writeLimitReached) {
return;
}
- if (totalWriteLimit < 0) {
+ if (handlerCounter.totalWriteLimit < 0) {
super.characters(ch, start, length);
return;
}
- int availableLength = Math.min(totalWriteLimit - totalChars,
length);
+ int availableLength = handlerCounter.getAvailable(length);
super.characters(ch, start, availableLength);
- totalChars += availableLength;
+ handlerCounter.addChars(availableLength);
if (availableLength < length) {
handleWriteLimitReached();
}
@@ -382,27 +416,27 @@ public class RecursiveParserWrapper extends
ParserDecorator {
@Override
public void ignorableWhitespace(char[] ch, int start, int length)
throws SAXException {
- if (writeLimitReached) {
+ if (handlerCounter.writeLimitReached) {
return;
}
- if (totalWriteLimit < 0) {
+ if (handlerCounter.totalWriteLimit < 0) {
super.ignorableWhitespace(ch, start, length);
return;
}
- int availableLength = Math.min(totalWriteLimit - totalChars,
length);
+ int availableLength = handlerCounter.getAvailable(length);
super.ignorableWhitespace(ch, start, availableLength);
- totalChars += availableLength;
+ handlerCounter.addChars(availableLength);
if (availableLength < length) {
handleWriteLimitReached();
}
}
private void handleWriteLimitReached() throws
WriteLimitReachedException {
- writeLimitReached = true;
+ handlerCounter.writeLimitReached = true;
if (throwOnWriteLimitReached) {
- throw new WriteLimitReachedException(totalWriteLimit);
+ throw new
WriteLimitReachedException(handlerCounter.totalWriteLimit);
} else {
ParseRecord parseRecord = parseContext.get(ParseRecord.class);
if (parseRecord != null) {
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
index 18359735f..ec021643c 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
@@ -96,6 +96,7 @@ public class TesseractOCRConfig implements Serializable {
// See addOtherTesseractConfig.
private Map<String, String> otherTesseractConfig = new HashMap<>();
private Set<String> userConfigured = new HashSet<>();
+ private boolean inlineContent = false;
/**
* This takes a language string, parses it and then bins individual langs into
@@ -477,6 +478,15 @@ public class TesseractOCRConfig implements Serializable {
return this.applyRotation;
}
+ public void setInlineContent(boolean inlineContent) {
+ this.inlineContent = inlineContent;
+ userConfigured.add("inlineContent");
+ }
+
+ public boolean isInlineContent() {
+ return inlineContent;
+ }
+
/**
* Sets whether or not a rotation value should be calculated and passed to ImageMagick.
*
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
index a28ae8951..8012a00f9 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
@@ -65,14 +65,19 @@ import org.apache.tika.config.Param;
import org.apache.tika.config.TikaTaskTimeout;
import org.apache.tika.exception.TikaConfigException;
import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.ParentContentHandler;
import org.apache.tika.io.TemporaryResources;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Property;
+import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractExternalProcessParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.external.ExternalParser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.EmbeddedContentHandler;
+import org.apache.tika.sax.TeeContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
import org.apache.tika.utils.StringUtils;
import org.apache.tika.utils.XMLReaderUtils;
@@ -265,13 +270,33 @@ public class TesseractOCRParser extends
AbstractExternalProcessParser implements
//this is the text output file name specified on the tesseract
//commandline. The actual output file name will have a suffix added.
File tmpOCROutputFile = tmp.createTemporaryFile();
- XHTMLContentHandler xhtml = new XHTMLContentHandler(handler,
metadata);
+ ContentHandler baseHandler =
getContentHandler(config.isInlineContent(), handler, metadata, parseContext);
+ XHTMLContentHandler xhtml = new XHTMLContentHandler(baseHandler,
metadata);
xhtml.startDocument();
parse(tikaStream, tmpOCROutputFile, xhtml, metadata, parseContext,
config);
xhtml.endDocument();
}
}
+ private ContentHandler getContentHandler(boolean isInlineContent,
+ ContentHandler handler, Metadata
metadata, ParseContext parseContext) {
+ if (! isInlineContent) {
+ return handler;
+ }
+ //check for inlining of the parent content handler
+ //if there's no parent, skip
+ ParentContentHandler parentContentHandler =
parseContext.get(ParentContentHandler.class);
+ if (parentContentHandler == null) {
+ return handler;
+ }
+ String embeddedType =
metadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE);
+ if (!
TikaCoreProperties.EmbeddedResourceType.INLINE.name().equals(embeddedType)) {
+ return handler;
+ }
+ //check for literally the same or wrapped parent and handler?
+ return new TeeContentHandler(new EmbeddedContentHandler(new
BodyContentHandler(parentContentHandler.getContentHandler())), handler);
+ }
+
private void parse(TikaInputStream tikaInputStream, File tmpOCROutputFile,
ContentHandler xhtml,
Metadata metadata, ParseContext parseContext,
TesseractOCRConfig config)
@@ -824,6 +849,15 @@ public class TesseractOCRParser extends
AbstractExternalProcessParser implements
public boolean isApplyRotation() {
return defaultConfig.isApplyRotation();
}
+
+ @Field
+ public void setInlineContent(boolean inlineContent) {
+ defaultConfig.setInlineContent(inlineContent);
+ }
+
+ public boolean isInlineContent() {
+ return defaultConfig.isInlineContent();
+ }
/**
* If set to <code>true</code> and if tesseract is found, this will load the
* langs that result from --list-langs. At parse time, the
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
index 6ce19e3dd..764930672 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
@@ -248,6 +248,24 @@ public class TesseractOCRParserTest extends TikaTest {
assertEquals("72 dots per inch", m.get("Exif IFD0:Y Resolution"));
}
+ @Test
+ public void testInlining() throws Exception {
+ assumeTrue(canRun(), "can run OCR");
+ TesseractOCRConfig config = new TesseractOCRConfig();
+ config.setInlineContent(true);
+ ParseContext context = new ParseContext();
+ context.set(TesseractOCRConfig.class, config);
+ List<Metadata> metadataList = getRecursiveMetadata("testOCR.pptx",
context);
+ debug(metadataList);
+ //0 is main doc, 1 is embedded image, 2 is thumbnail
+ assertEquals(3, metadataList.size());
+ assertContains("This is some text",
metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT));
+ assertNotContained("This is some text",
metadataList.get(1).get(TikaCoreProperties.TIKA_CONTENT));
+ assertNotContained("This is some text",
metadataList.get(2).get(TikaCoreProperties.TIKA_CONTENT));
+
+ assertContains("Happy New Year 2003",
metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT));
+ assertContains("Happy New Year 2003",
metadataList.get(1).get(TikaCoreProperties.TIKA_CONTENT));
+ }
//TODO: add unit tests for jp2/jpx/ppm TIKA-2174
}