This is an automated email from the ASF dual-hosted git repository. tballison pushed a commit to branch TIKA-4756 in repository https://gitbox.apache.org/repos/asf/tika.git
commit 4afc6b5009c2f33da19938ebf0b1bde1c2d9b69f Author: tballison <[email protected]> AuthorDate: Thu Jun 11 06:43:22 2026 +0200 TIKA-4756 -- add HAS_SIGNATURE_FIELDS --- .../src/main/java/org/apache/tika/metadata/PDF.java | 6 ++++++ .../main/java/org/apache/tika/parser/pdf/PDFParser.java | 14 ++++++++++---- .../java/org/apache/tika/parser/pdf/PDFParserTest.java | 16 +++++++++++++--- 3 files changed, 29 insertions(+), 7 deletions(-) diff --git a/tika-core/src/main/java/org/apache/tika/metadata/PDF.java b/tika-core/src/main/java/org/apache/tika/metadata/PDF.java index f852189365..51451e71df 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/PDF.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/PDF.java @@ -151,6 +151,12 @@ public interface PDF { */ Property HAS_ACROFORM_FIELDS = Property.internalBoolean(PDF_PREFIX + "hasAcroFormFields"); + /** + * Has at least one AcroForm signature field (/FT /Sig), whether or not it has been signed. + * For documents that have been actually signed, see {@link TikaCoreProperties#HAS_SIGNATURE}. + */ + Property HAS_SIGNATURE_FIELDS = Property.internalBoolean(PDF_PREFIX + "hasSignatureFields"); + Property HAS_MARKED_CONTENT = Property.internalBoolean(PDF_PREFIX + "hasMarkedContent"); /** diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java index f4e734f532..25aa853e54 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java @@ -53,6 +53,7 @@ import org.apache.pdfbox.pdmodel.fixup.PDDocumentFixup; import org.apache.pdfbox.pdmodel.fixup.processor.AcroFormDefaultsProcessor; import org.apache.pdfbox.pdmodel.interactive.digitalsignature.PDSignature; import org.apache.pdfbox.pdmodel.interactive.form.PDAcroForm; +import org.apache.pdfbox.pdmodel.interactive.form.PDSignatureField; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; @@ -398,13 +399,19 @@ public class PDFParser implements Parser, RenderingParser { } private void extractSignatures(PDDocument pdfDocument, Metadata metadata) { + List<PDSignatureField> sigFields = pdfDocument.getSignatureFields(); + if (sigFields.isEmpty()) { + return; + } + metadata.set(PDF.HAS_SIGNATURE_FIELDS, true); + boolean hasSignature = false; - for (PDSignature signature : pdfDocument.getSignatureDictionaries()) { + for (PDSignatureField sigField : sigFields) { + PDSignature signature = sigField.getSignature(); if (signature == null) { continue; } PDMetadataExtractor.addNotNull(signature.getName(), metadata, TikaCoreProperties.SIGNATURE_NAME); - Calendar date = signature.getSignDate(); if (date != null) { metadata.add(TikaCoreProperties.SIGNATURE_DATE, date); @@ -414,11 +421,10 @@ public class PDFParser implements Parser, RenderingParser { PDMetadataExtractor.addNotNull(signature.getLocation(), metadata, TikaCoreProperties.SIGNATURE_LOCATION); PDMetadataExtractor.addNotNull(signature.getReason(), metadata, TikaCoreProperties.SIGNATURE_REASON); hasSignature = true; - } if (hasSignature) { - metadata.set(TikaCoreProperties.HAS_SIGNATURE, hasSignature); + metadata.set(TikaCoreProperties.HAS_SIGNATURE, true); } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java index 947a45dbdd..987d2c7083 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java @@ -621,17 +621,27 @@ public class PDFParserTest extends TikaTest { //TIKA-1226 @Test public void testSignatureInAcroForm() throws Exception { - //The current test doc does not contain any content in the signature area. - //This just tests that a RuntimeException is not thrown. - //TODO: find a better test file for this issue. XMLResult result = getXML("testPDF_acroform3.pdf"); Metadata m = result.metadata; assertEquals("true", m.get(PDF.HAS_XMP)); assertEquals("true", m.get(PDF.HAS_ACROFORM_FIELDS)); assertEquals("false", m.get(PDF.HAS_XFA)); + assertEquals("true", m.get(PDF.HAS_SIGNATURE_FIELDS)); + assertNull(m.get(TikaCoreProperties.HAS_SIGNATURE)); assertContains("<li>aTextField: TIKA-1226</li>", result.xml); } + //TIKA-4756 + @Test + public void testUnsignedSignatureField() throws Exception { + // PDF has an AcroForm with /SigFlags 1 and a /Sig type field, but no actual signature value. + // Should detect the signature field but not report hasSignature. + Metadata m = getXML("testPDF_sigflags.pdf").metadata; + assertEquals("true", m.get(PDF.HAS_ACROFORM_FIELDS)); + assertEquals("true", m.get(PDF.HAS_SIGNATURE_FIELDS)); + assertNull(m.get(TikaCoreProperties.HAS_SIGNATURE)); + } + @Test public void testSingleCloseDoc() throws Exception { //TIKA-1341
