This is an automated email from the ASF dual-hosted git repository.
exceptionfactory pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/nifi.git
The following commit(s) were added to refs/heads/main by this push:
new 931ed06729 NIFI-15098 Disabled OCR for ExtractMediaMetadata Processor (#10426)
931ed06729 is described below
commit 931ed06729800b9d8727184812a156d53d8e5ea5
Author: Lars Francke <[email protected]>
AuthorDate: Fri Oct 17 05:42:29 2025 +0200
NIFI-15098 Disabled OCR for ExtractMediaMetadata Processor (#10426)
Signed-off-by: David Handermann <[email protected]>
---
.../apache/nifi/processors/media/ExtractMediaMetadata.java | 12 +++++++++++-
1 file changed, 11 insertions(+), 1 deletion(-)
diff --git a/nifi-extension-bundles/nifi-media-bundle/nifi-media-processors/src/main/java/org/apache/nifi/processors/media/ExtractMediaMetadata.java b/nifi-extension-bundles/nifi-media-bundle/nifi-media-processors/src/main/java/org/apache/nifi/processors/media/ExtractMediaMetadata.java
index 7bed9edc0e..7453c94663 100644
--- a/nifi-extension-bundles/nifi-media-bundle/nifi-media-processors/src/main/java/org/apache/nifi/processors/media/ExtractMediaMetadata.java
+++ b/nifi-extension-bundles/nifi-media-bundle/nifi-media-processors/src/main/java/org/apache/nifi/processors/media/ExtractMediaMetadata.java
@@ -48,6 +48,8 @@ import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.ocr.TesseractOCRConfig;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
@@ -196,8 +198,16 @@ public class ExtractMediaMetadata extends AbstractProcessor {
Integer maxAttribLen) throws IOException, TikaException, SAXException {
final Metadata metadata = new Metadata();
final TikaInputStream tikaInputStream = TikaInputStream.get(sourceStream);
+
+ // Configure ParseContext to disable OCR - metadata extraction does not require OCR
+ // https://issues.apache.org/jira/browse/NIFI-15098
+ final TesseractOCRConfig ocrConfig = new TesseractOCRConfig();
+ ocrConfig.setSkipOcr(true);
+ final ParseContext parseContext = new ParseContext();
+ parseContext.set(TesseractOCRConfig.class, ocrConfig);
+
try {
- autoDetectParser.parse(tikaInputStream, new DefaultHandler(), metadata);
+ autoDetectParser.parse(tikaInputStream, new DefaultHandler(), metadata, parseContext);
} finally {
tikaInputStream.close();
}