This is an automated email from the ASF dual-hosted git repository.
tilman pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 3a4c529 TIKA-3246: call tailored fixup when getting AcroForm the
first time to avoid the creation of appearances which aren't needed in tika
(newly needed in PDFBox 2.0.22)
3a4c529 is described below
commit 3a4c529a201c9c3d9b56cbdf8c2f8b702d74768e
Author: THausherr <[email protected]>
AuthorDate: Sun Dec 20 05:55:16 2020 +0100
TIKA-3246: call tailored fixup when getting AcroForm the first time to
avoid the creation of appearances which aren't needed in tika (newly needed in
PDFBox 2.0.22)
---
.../apache/tika/parser/pdf/AbstractPDF2XHTML.java | 8 ++---
.../java/org/apache/tika/parser/pdf/PDFParser.java | 36 ++++++++++++++++++----
2 files changed, 34 insertions(+), 10 deletions(-)
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
index 2ba928a..27415d3 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
@@ -233,8 +233,8 @@ class AbstractPDF2XHTML extends PDFTextStripper {
}
//now try the xfa
- if (pdfDocument.getDocumentCatalog().getAcroForm() != null &&
- pdfDocument.getDocumentCatalog().getAcroForm().getXFA() != null) {
+ if (pdfDocument.getDocumentCatalog().getAcroForm(null) != null &&
+ pdfDocument.getDocumentCatalog().getAcroForm(null).getXFA() !=
null) {
Metadata xfaMetadata = new Metadata();
xfaMetadata.set(Metadata.CONTENT_TYPE, XFA_MEDIA_TYPE.toString());
@@ -243,7 +243,7 @@ class AbstractPDF2XHTML extends PDFTextStripper {
supportedTypes.contains(XFA_MEDIA_TYPE)) {
byte[] bytes = null;
try {
- bytes =
pdfDocument.getDocumentCatalog().getAcroForm().getXFA().getBytes();
+ bytes =
pdfDocument.getDocumentCatalog().getAcroForm(null).getXFA().getBytes();
} catch (IOException e) {
EmbeddedDocumentUtil.recordEmbeddedStreamException(e,
parentMetadata);
}
@@ -743,7 +743,7 @@ class AbstractPDF2XHTML extends PDFTextStripper {
if (catalog == null)
return;
- PDAcroForm form = catalog.getAcroForm();
+ PDAcroForm form = catalog.getAcroForm(null);
if (form == null)
return;
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
index dd18464..9ca5ce9 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
@@ -39,6 +39,10 @@ import org.apache.pdfbox.pdmodel.PDDocumentInformation;
import
org.apache.pdfbox.pdmodel.documentinterchange.logicalstructure.PDStructureTreeRoot;
import org.apache.pdfbox.pdmodel.encryption.AccessPermission;
import org.apache.pdfbox.pdmodel.encryption.InvalidPasswordException;
+import org.apache.pdfbox.pdmodel.fixup.AbstractFixup;
+import org.apache.pdfbox.pdmodel.fixup.PDDocumentFixup;
+import org.apache.pdfbox.pdmodel.fixup.processor.AcroFormDefaultsProcessor;
+import org.apache.pdfbox.pdmodel.interactive.form.PDAcroForm;
import org.apache.tika.config.Field;
import org.apache.tika.config.Initializable;
import org.apache.tika.config.InitializableProblemHandler;
@@ -260,9 +264,13 @@ public class PDFParser extends AbstractParser implements
Initializable {
if (document.getDocumentCatalog().getLanguage() != null) {
metadata.set(TikaCoreProperties.LANGUAGE,
document.getDocumentCatalog().getLanguage());
}
- if (document.getDocumentCatalog().getAcroForm() != null &&
- document.getDocumentCatalog().getAcroForm().getFields() != null &&
- document.getDocumentCatalog().getAcroForm().getFields().size() >
0) {
+ // TIKA-3246: Do this for the first call of getAcroForm(),
+ // subsequent calls should use the same fixup or null to avoid a default fixup.
+ // Do not call without parameters (would mean default fixup which is slower because
+ // it creates annotation appearances)
+ PDDocumentFixup fixup = new TikaAcroFormFixup(document);
+ PDAcroForm acroForm = document.getDocumentCatalog().getAcroForm(fixup);
+ if (acroForm != null && acroForm.getFields() != null &&
!acroForm.getFields().isEmpty()) {
metadata.set(PDF.HAS_ACROFORM_FIELDS, "true");
}
PDMetadataExtractor.extract(document.getDocumentCatalog().getMetadata(),
metadata, context);
@@ -353,8 +361,8 @@ public class PDFParser extends AbstractParser implements
Initializable {
private boolean hasXFA(PDDocument pdDocument) {
return pdDocument.getDocumentCatalog() != null &&
- pdDocument.getDocumentCatalog().getAcroForm() != null &&
- pdDocument.getDocumentCatalog().getAcroForm().hasXFA();
+ pdDocument.getDocumentCatalog().getAcroForm(null) != null &&
+ pdDocument.getDocumentCatalog().getAcroForm(null).hasXFA();
}
private boolean shouldHandleXFAOnly(boolean hasXFA, PDFParserConfig
config) {
@@ -368,7 +376,7 @@ public class PDFParser extends AbstractParser implements
Initializable {
XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
xhtml.startDocument();
try (InputStream is = new ByteArrayInputStream(
-
pdDocument.getDocumentCatalog().getAcroForm().getXFA().getBytes())) {
+
pdDocument.getDocumentCatalog().getAcroForm(null).getXFA().getBytes())) {
ex.extract(is, xhtml, metadata, context);
} catch (XMLStreamException e) {
throw new TikaException("XML error in XFA", e);
@@ -639,4 +647,20 @@ public class PDFParser extends AbstractParser implements
Initializable {
HAS_WARNED = true;
}
}
+
+ /**
+ * Copied from AcroformDefaultFixup minus generation of appearances and handling of orphan
+ * widgets, which we don't need.
+ */
+ class TikaAcroFormFixup extends AbstractFixup
+ {
+ TikaAcroFormFixup(PDDocument document) {
+ super(document);
+ }
+
+ @Override
+ public void apply() {
+ new AcroFormDefaultsProcessor(document).process();
+ }
+ }
}