This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4202 in repository https://gitbox.apache.org/repos/asf/tika.git
commit 83cc605a5c8f2fef735ff4f1a8f9aa676821273b Author: tallison <talli...@apache.org> AuthorDate: Fri Feb 23 13:30:23 2024 -0500 TIKA-4202 -- add ocr page count to PDFs --- tika-core/src/main/java/org/apache/tika/metadata/PDF.java | 2 ++ .../java/org/apache/tika/parser/pdf/OCRPageCounter.java | 14 ++++++++++++++ .../main/java/org/apache/tika/parser/pdf/PDFParser.java | 6 ++++++ .../java/org/apache/tika/parser/pdf/PDFParserTest.java | 2 +- 4 files changed, 23 insertions(+), 1 deletion(-) diff --git a/tika-core/src/main/java/org/apache/tika/metadata/PDF.java b/tika-core/src/main/java/org/apache/tika/metadata/PDF.java index c2baca0e8..a6c753fcd 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/PDF.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/PDF.java @@ -209,4 +209,6 @@ public interface PDF { Property.composite( Property.externalInteger(PDF_PREFIX + "incrementalUpdateCount"), new Property[]{ TikaCoreProperties.VERSION_COUNT }); + Property OCR_PAGE_COUNT = Property.externalInteger(PDF_PREFIX + "ocrPageCount"); + } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/OCRPageCounter.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/OCRPageCounter.java new file mode 100644 index 000000000..d3dcc9155 --- /dev/null +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/OCRPageCounter.java @@ -0,0 +1,14 @@ +package org.apache.tika.parser.pdf; + +public class OCRPageCounter { + + private int count; + + public void increment() { + count++; + } + + public int getCount() { + return count; + } +} diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java index c93571daf..f21b65d4e 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java @@ -16,6 +16,8 @@ */ package org.apache.tika.parser.pdf; +import static org.apache.tika.metadata.PDF.OCR_PAGE_COUNT; + import java.io.IOException; import java.io.InputStream; import java.nio.file.Path; @@ -158,6 +160,8 @@ public class PDFParser implements Parser, RenderingParser, Initializable { PDFRenderingState incomingRenderingState = context.get(PDFRenderingState.class); TikaInputStream tstream = null; boolean shouldClose = false; + OCRPageCounter prevOCRCounter = context.get(OCRPageCounter.class); + context.set(OCRPageCounter.class, new OCRPageCounter()); try { if (shouldSpool(localConfig)) { if (stream instanceof TikaInputStream) { @@ -220,6 +224,8 @@ public class PDFParser implements Parser, RenderingParser, Initializable { metadata.set(PDF.IS_ENCRYPTED, "true"); throw new EncryptedDocumentException(e); } finally { + metadata.set(OCR_PAGE_COUNT, context.get(OCRPageCounter.class).getCount()); + context.set(OCRPageCounter.class, prevOCRCounter); //reset the incrementalUpdateRecord even if null context.set(IncrementalUpdateRecord.class, incomingIncrementalUpdateRecord); PDFRenderingState currState = context.get(PDFRenderingState.class); diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java index 6e9167f37..0269a58ef 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java @@ -238,7 +238,7 @@ public class PDFParserTest extends TikaTest { assertEquals("application/pdf", metadata.get(Metadata.CONTENT_TYPE)); assertEquals("true", metadata.get("pdf:encrypted")); //pdf:encrypted, X-Parsed-By and Content-Type - assertEquals(4, metadata.names().length, "very little metadata should be parsed"); + assertEquals(5, metadata.names().length, "very little metadata should be parsed"); assertEquals(0, handler.toString().length()); }