This is an automated email from the ASF dual-hosted git repository.
tilman pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/branch_1x by this push:
new 4150317 TIKA-3246: call tailored fixup when getting AcroForm the
first time to avoid the creation of appearances which aren't needed in tika
(newly needed in PDFBox 2.0.22)
4150317 is described below
commit 4150317e6667576acb8d6f808a5686ff22d2ee7d
Author: Tilman Hausherr <[email protected]>
AuthorDate: Sun Dec 20 05:56:12 2020 +0100
TIKA-3246: call tailored fixup when getting AcroForm the first time to
avoid the creation of appearances which aren't needed in tika (newly needed in
PDFBox 2.0.22)
---
.../apache/tika/parser/pdf/AbstractPDF2XHTML.java | 9 +++---
.../java/org/apache/tika/parser/pdf/PDFParser.java | 37 ++++++++++++++++++----
2 files changed, 35 insertions(+), 11 deletions(-)
diff --git
a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
index 8b25304..43526ef 100644
---
a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
+++
b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
@@ -100,7 +100,6 @@ import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.ocr.TesseractOCRConfig;
import org.apache.tika.parser.ocr.TesseractOCRParser;
-import org.apache.tika.parser.sas.SAS7BDATParser;
import org.apache.tika.sax.EmbeddedContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.ContentHandler;
@@ -234,8 +233,8 @@ class AbstractPDF2XHTML extends PDFTextStripper {
}
//now try the xfa
- if (pdfDocument.getDocumentCatalog().getAcroForm() != null &&
- pdfDocument.getDocumentCatalog().getAcroForm().getXFA() != null) {
+ if (pdfDocument.getDocumentCatalog().getAcroForm(null) != null &&
+ pdfDocument.getDocumentCatalog().getAcroForm(null).getXFA() !=
null) {
Metadata xfaMetadata = new Metadata();
xfaMetadata.set(Metadata.CONTENT_TYPE, XFA_MEDIA_TYPE.toString());
@@ -244,7 +243,7 @@ class AbstractPDF2XHTML extends PDFTextStripper {
supportedTypes.contains(XFA_MEDIA_TYPE)) {
byte[] bytes = null;
try {
- bytes =
pdfDocument.getDocumentCatalog().getAcroForm().getXFA().getBytes();
+ bytes =
pdfDocument.getDocumentCatalog().getAcroForm(null).getXFA().getBytes();
} catch (IOException e) {
EmbeddedDocumentUtil.recordEmbeddedStreamException(e,
parentMetadata);
}
@@ -744,7 +743,7 @@ class AbstractPDF2XHTML extends PDFTextStripper {
if (catalog == null)
return;
- PDAcroForm form = catalog.getAcroForm();
+ PDAcroForm form = catalog.getAcroForm(null);
if (form == null)
return;
diff --git
a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
index f66f086..f5c09ae 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
@@ -39,6 +39,10 @@ import org.apache.pdfbox.pdmodel.PDDocumentInformation;
import
org.apache.pdfbox.pdmodel.documentinterchange.logicalstructure.PDStructureTreeRoot;
import org.apache.pdfbox.pdmodel.encryption.AccessPermission;
import org.apache.pdfbox.pdmodel.encryption.InvalidPasswordException;
+import org.apache.pdfbox.pdmodel.fixup.AbstractFixup;
+import org.apache.pdfbox.pdmodel.fixup.PDDocumentFixup;
+import org.apache.pdfbox.pdmodel.fixup.processor.AcroFormDefaultsProcessor;
+import org.apache.pdfbox.pdmodel.interactive.form.PDAcroForm;
import org.apache.tika.config.Field;
import org.apache.tika.config.Initializable;
import org.apache.tika.config.InitializableProblemHandler;
@@ -264,9 +268,14 @@ public class PDFParser extends AbstractParser implements
Initializable {
if (document.getDocumentCatalog().getLanguage() != null) {
metadata.set(TikaCoreProperties.LANGUAGE,
document.getDocumentCatalog().getLanguage());
}
- if (document.getDocumentCatalog().getAcroForm() != null &&
- document.getDocumentCatalog().getAcroForm().getFields() != null &&
- document.getDocumentCatalog().getAcroForm().getFields().size() >
0) {
+
+ // TIKA-3246: Do this for the first call of getAcroForm();
+ // subsequent calls should use the same fixup or null to avoid a
default fixup.
+ // Do not call without parameters (would mean default fixup, which is
slower because
+ // it creates annotation appearances)
+ PDDocumentFixup fixup = new TikaAcroFormFixup(document);
+ PDAcroForm acroForm = document.getDocumentCatalog().getAcroForm(fixup);
+ if (acroForm != null && acroForm.getFields() != null &&
!acroForm.getFields().isEmpty()) {
metadata.set(PDF.HAS_ACROFORM_FIELDS, "true");
}
PDMetadataExtractor.extract(document.getDocumentCatalog().getMetadata(),
metadata, context);
@@ -361,8 +370,8 @@ public class PDFParser extends AbstractParser implements
Initializable {
private boolean hasXFA(PDDocument pdDocument) {
return pdDocument.getDocumentCatalog() != null &&
- pdDocument.getDocumentCatalog().getAcroForm() != null &&
- pdDocument.getDocumentCatalog().getAcroForm().hasXFA();
+ pdDocument.getDocumentCatalog().getAcroForm(null) != null &&
+ pdDocument.getDocumentCatalog().getAcroForm(null).hasXFA();
}
private boolean shouldHandleXFAOnly(boolean hasXFA, PDFParserConfig
config) {
@@ -376,7 +385,7 @@ public class PDFParser extends AbstractParser implements
Initializable {
XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
xhtml.startDocument();
try (InputStream is = new ByteArrayInputStream(
-
pdDocument.getDocumentCatalog().getAcroForm().getXFA().getBytes())) {
+
pdDocument.getDocumentCatalog().getAcroForm(null).getXFA().getBytes())) {
ex.extract(is, xhtml, metadata, context);
} catch (XMLStreamException e) {
throw new TikaException("XML error in XFA", e);
@@ -657,4 +666,20 @@ public class PDFParser extends AbstractParser implements
Initializable {
HAS_WARNED = true;
}
}
+
+ /**
+ * Copied from AcroFormDefaultFixup minus generation of appearances and
handling of orphan
+ * widgets, which we don't need.
+ */
+ class TikaAcroFormFixup extends AbstractFixup
+ {
+ TikaAcroFormFixup(PDDocument document) {
+ super(document);
+ }
+
+ @Override
+ public void apply() {
+ new AcroFormDefaultsProcessor(document).process();
+ }
+ }
}