This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch branch_3x
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/branch_3x by this push:
new 339a1ae89 TIKA-4465 -- extract javascript from name tree (#2305)
339a1ae89 is described below
commit 339a1ae8958b63bd0d53c5dbe36d89a2a51ef193
Author: Tim Allison <[email protected]>
AuthorDate: Mon Aug 18 15:53:31 2025 -0400
TIKA-4465 -- extract javascript from name tree (#2305)
(cherry picked from commit b466c4920c3ab0ae5bb9fd203749c8585c52126f)
---
.../main/java/org/apache/tika/metadata/PDF.java | 6 +
.../apache/tika/parser/pdf/AbstractPDF2XHTML.java | 386 +++++++++++++--------
.../org/apache/tika/parser/pdf/PDFParserTest.java | 92 ++++-
3 files changed, 328 insertions(+), 156 deletions(-)
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/PDF.java
b/tika-core/src/main/java/org/apache/tika/metadata/PDF.java
index b15c10383..f85218936 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/PDF.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/PDF.java
@@ -215,4 +215,10 @@ public interface PDF {
*/
Property OCR_PAGE_COUNT = Property.externalInteger(PDF_PREFIX +
"ocrPageCount");
+ /**
+ * Name under which a JavaScript script is registered in the PDF names tree.
+ * JavaScript stored in an action has no such name, so this property is not
+ * populated for scripts extracted from actions.
+ */
+ Property JS_NAME = Property.internalText(PDF_PREFIX + "jsName");
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
index c3e6bb7e8..132efa3f9 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
@@ -28,6 +28,7 @@ import java.io.InputStream;
import java.io.OutputStream;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;
+import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
@@ -52,8 +53,10 @@ import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.cos.COSDictionary;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.cos.COSObject;
+import org.apache.pdfbox.cos.COSStream;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
+import org.apache.pdfbox.pdmodel.PDJavascriptNameTreeNode;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageTree;
import org.apache.pdfbox.pdmodel.common.COSObjectable;
@@ -147,7 +150,7 @@ class AbstractPDF2XHTML extends PDFTextStripper {
//These can be unbounded. We need to limit the number we store.
private final static int MAX_ANNOTATION_TYPES = 100;
private static final String THREE_D = "3D";
- private static final COSName THREE_DD = COSName.getPDFName("3DD");
+ private static final COSName ON_INSTANTIATE =
COSName.getPDFName("OnInstantiate");
private static final String NULL_STRING = "null";
private static final MediaType XFA_MEDIA_TYPE =
MediaType.application("vnd.adobe.xdp+xml");
private static final MediaType XMP_MEDIA_TYPE =
MediaType.application("rdf+xml");
@@ -263,9 +266,9 @@ class AbstractPDF2XHTML extends PDFTextStripper {
if (supportedTypes.contains(XMP_MEDIA_TYPE)) {
//try the main metadata
if (pdfDocument.getDocumentCatalog().getMetadata() != null) {
- try (InputStream is =
pdfDocument.getDocumentCatalog().getMetadata()
- .exportXMPMetadata()) {
- extractXMPAsEmbeddedFile(is,
XMP_DOCUMENT_CATALOG_LOCATION);
+ try (TikaInputStream tis = TikaInputStream.get(
+
pdfDocument.getDocumentCatalog().getMetadata().exportXMPMetadata())) {
+ extractXMPAsEmbeddedFile(tis,
XMP_DOCUMENT_CATALOG_LOCATION);
} catch (IOException e) {
EmbeddedDocumentUtil.recordEmbeddedStreamException(e,
parentMetadata);
}
@@ -274,8 +277,8 @@ class AbstractPDF2XHTML extends PDFTextStripper {
int pageNumber = 1;
for (PDPage page : pdfDocument.getPages()) {
if (page.getMetadata() != null) {
- try (InputStream is =
page.getMetadata().exportXMPMetadata()) {
- extractXMPAsEmbeddedFile(is, XMP_PAGE_LOCATION_PREFIX
+ pageNumber);
+ try (TikaInputStream tis =
TikaInputStream.get(page.getMetadata().exportXMPMetadata())) {
+ extractXMPAsEmbeddedFile(tis, XMP_PAGE_LOCATION_PREFIX
+ pageNumber);
} catch (IOException e) {
EmbeddedDocumentUtil.recordEmbeddedStreamException(e,
parentMetadata);
}
@@ -301,17 +304,17 @@ class AbstractPDF2XHTML extends PDFTextStripper {
EmbeddedDocumentUtil.recordEmbeddedStreamException(e,
parentMetadata);
}
if (bytes != null) {
- try (InputStream is =
UnsynchronizedByteArrayInputStream.builder().setByteArray(bytes).get()) {
- parseMetadata(is, xfaMetadata);
+ try (TikaInputStream tis = TikaInputStream.get(bytes)) {
+ parseMetadata(tis, xfaMetadata);
}
}
}
}
}
- private void extractXMPAsEmbeddedFile(InputStream is, String location)
+ private void extractXMPAsEmbeddedFile(TikaInputStream tis, String location)
throws IOException, SAXException {
- if (is == null) {
+ if (tis == null) {
return;
}
Metadata xmpMetadata = new Metadata();
@@ -320,19 +323,15 @@ class AbstractPDF2XHTML extends PDFTextStripper {
TikaCoreProperties.EmbeddedResourceType.METADATA.toString());
xmpMetadata.set(PDF.XMP_LOCATION, location);
if (embeddedDocumentExtractor.shouldParseEmbedded(xmpMetadata)) {
- try {
- parseMetadata(is, xmpMetadata);
- } finally {
- IOUtils.closeQuietly(is);
- }
+ parseMetadata(tis, xmpMetadata);
}
}
- private void parseMetadata(InputStream stream, Metadata embeddedMetadata)
+ private void parseMetadata(TikaInputStream tis, Metadata embeddedMetadata)
throws IOException, SAXException {
try {
- embeddedDocumentExtractor.parseEmbedded(stream, new
EmbeddedContentHandler(xhtml),
+ embeddedDocumentExtractor.parseEmbedded(tis, new
EmbeddedContentHandler(xhtml),
embeddedMetadata, true);
} catch (IOException e) {
handleCatchableIOE(e);
@@ -557,10 +556,10 @@ class AbstractPDF2XHTML extends PDFTextStripper {
try (TemporaryResources tmp = new TemporaryResources()) {
try (RenderResult renderResult = renderCurrentPage(pdPage,
context, tmp)) {
Metadata renderMetadata = renderResult.getMetadata();
- try (InputStream is = renderResult.getInputStream()) {
+ try (TikaInputStream tis = renderResult.getInputStream()) {
renderMetadata.set(TikaCoreProperties.CONTENT_TYPE_PARSER_OVERRIDE,
ocrImageMediaType.toString());
- ocrParser.parse(is, new EmbeddedContentHandler(new
BodyContentHandler(xhtml)),
+ ocrParser.parse(tis, new EmbeddedContentHandler(new
BodyContentHandler(xhtml)),
renderMetadata, context);
}
}
@@ -704,92 +703,7 @@ class AbstractPDF2XHTML extends PDFTextStripper {
try {
for (PDAnnotation annotation : page.getAnnotations()) {
- String annotationName = annotation.getAnnotationName();
- if (annotationTypes.size() < MAX_ANNOTATION_TYPES) {
- if (annotationName != null) {
- annotationTypes.add(annotationName);
- } else {
- annotationTypes.add(NULL_STRING);
- }
- }
- String annotationSubtype = annotation.getSubtype();
- if (annotationSubtypes.size() < MAX_ANNOTATION_TYPES) {
- if (annotationSubtype != null) {
- annotationSubtypes.add(annotationSubtype);
- } else {
- annotationSubtypes.add(NULL_STRING);
- }
- }
- if (annotation instanceof PDAnnotationFileAttachment) {
- PDAnnotationFileAttachment fann =
(PDAnnotationFileAttachment) annotation;
- String subtype = "annotationFileAttachment";
- AttributesImpl attributes = new AttributesImpl();
- attributes.addAttribute("", "source", "source", "CDATA",
subtype);
- processDocOnAction("", subtype, fann.getFile(),
attributes);
- } else if (annotation instanceof PDAnnotationWidget) {
- handleWidget((PDAnnotationWidget) annotation);
- } else {
- if (annotationSubtype == null) {
- annotationSubtype = "unknown";
- } else if (annotationSubtype.equals(THREE_D) ||
- annotation.getCOSObject().containsKey(THREE_DD)) {
- //To make this stricter, we could get the 3DD stream
object and see if the
- //subtype is U3D or PRC or model/ (prefix for model
mime type)
- metadata.set(PDF.HAS_3D, true);
- num3DAnnotations++;
- }
- for (COSDictionary fileSpec :
findFileSpecs(annotation.getCOSObject())) {
- AttributesImpl attributes = new AttributesImpl();
- attributes.addAttribute("", "source", "source",
"CDATA", annotationSubtype);
- processDocOnAction("", annotationSubtype,
createFileSpecification(fileSpec),
- attributes);
- }
- }
- // TODO: remove once PDFBOX-1143 is fixed:
- if (config.isExtractAnnotationText()) {
- PDActionURI uri = getActionURI(annotation);
- if (uri != null) {
- String link = uri.getURI();
- if (link != null && !link.isBlank()) {
- xhtml.startElement("div", "class", "annotation");
- xhtml.startElement("a", "href", link);
- xhtml.characters(link);
- xhtml.endElement("a");
- xhtml.endElement("div");
- }
- }
-
- if (annotation instanceof PDAnnotationMarkup) {
- PDAnnotationMarkup annotationMarkup =
(PDAnnotationMarkup) annotation;
- String title = annotationMarkup.getTitlePopup();
- String subject = annotationMarkup.getSubject();
- String contents = annotationMarkup.getContents();
- // TODO: maybe also annotationMarkup.getRichContents()?
- if (title != null || subject != null || contents !=
null) {
- xhtml.startElement("div", "class", "annotation");
-
- if (title != null) {
- xhtml.startElement("div", "class",
"annotationTitle");
- xhtml.characters(title);
- xhtml.endElement("div");
- }
-
- if (subject != null) {
- xhtml.startElement("div", "class",
"annotationSubject");
- xhtml.characters(subject);
- xhtml.endElement("div");
- }
-
- if (contents != null) {
- xhtml.startElement("div", "class",
"annotationContents");
- xhtml.characters(contents);
- xhtml.endElement("div");
- }
-
- xhtml.endElement("div");
- }
- }
- }
+ processPageAnnotation(annotation);
}
if (config.getOcrStrategy() ==
PDFParserConfig.OCR_STRATEGY.OCR_AND_TEXT_EXTRACTION) {
doOCROnCurrentPage(page, OCR_AND_TEXT_EXTRACTION);
@@ -839,6 +753,124 @@ class AbstractPDF2XHTML extends PDFTextStripper {
}
}
+ /**
+ * Processes a single page annotation: records its name and subtype, hands file
+ * attachments, widgets, 3D content and file specifications to the matching
+ * handlers and, if config.isExtractAnnotationText() is set, writes the
+ * annotation's URI link and markup text (title, subject, contents) to the XHTML.
+ *
+ * @param annotation the page annotation to process
+ */
+ private void processPageAnnotation(PDAnnotation annotation) throws
TikaException, IOException, SAXException {
+
+ //record the annotation name (NULL_STRING when missing); nothing is added once
+ //MAX_ANNOTATION_TYPES entries are stored
+ String annotationName = annotation.getAnnotationName();
+ if (annotationTypes.size() < MAX_ANNOTATION_TYPES) {
+ if (annotationName != null) {
+ annotationTypes.add(annotationName);
+ } else {
+ annotationTypes.add(NULL_STRING);
+ }
+ }
+ //same bookkeeping for the annotation subtype
+ String annotationSubtype = annotation.getSubtype();
+ if (annotationSubtypes.size() < MAX_ANNOTATION_TYPES) {
+ if (annotationSubtype != null) {
+ annotationSubtypes.add(annotationSubtype);
+ } else {
+ annotationSubtypes.add(NULL_STRING);
+ }
+ }
+ //dispatch on the annotation class: file attachment, widget, or anything else
+ if (annotation instanceof PDAnnotationFileAttachment) {
+ PDAnnotationFileAttachment fann = (PDAnnotationFileAttachment)
annotation;
+ String subtype = "annotationFileAttachment";
+ AttributesImpl attributes = new AttributesImpl();
+ attributes.addAttribute("", "source", "source", "CDATA", subtype);
+ processDocOnAction("", subtype, fann.getFile(), attributes);
+ } else if (annotation instanceof PDAnnotationWidget) {
+ handleWidget((PDAnnotationWidget) annotation);
+ } else {
+ //NOTE(review): an annotation without a subtype never reaches the 3D check in
+ //the else-if below, even if it carries a 3DD entry -- confirm this is intended
+ if (annotationSubtype == null) {
+ annotationSubtype = "unknown";
+ } else if (annotationSubtype.equals(THREE_D) ||
+ annotation.getCOSObject().containsKey(COSName.THREE_DD)) {
+ //To make this stricter, we could get the 3DD stream object and see if the
+ //subtype is U3D or PRC or model/ (prefix for model mime type)
+ extractOnInstantiate(annotation);
+ //the additional actions (AA) of a 3D annotation are processed as well
+ COSDictionary additionalActions =
annotation.getCOSObject().getCOSDictionary(COSName.AA);
+ if (additionalActions != null) {
+ handlePDAnnotationAdditionalActions(new
PDAnnotationAdditionalActions(additionalActions));
+ }
+ metadata.set(PDF.HAS_3D, true);
+ num3DAnnotations++;
+ }
+ //every file specification found in the annotation dictionary is processed,
+ //with the annotation subtype as its "source" attribute
+ for (COSDictionary fileSpec :
findFileSpecs(annotation.getCOSObject())) {
+ AttributesImpl attributes = new AttributesImpl();
+ attributes.addAttribute("", "source", "source", "CDATA",
annotationSubtype);
+ processDocOnAction("", annotationSubtype,
createFileSpecification(fileSpec),
+ attributes);
+ }
+ }
+ //everything below only writes annotation text to the XHTML
+ if (! config.isExtractAnnotationText()) {
+ return;
+ }
+ // TODO: remove once PDFBOX-1143 is fixed:
+ PDActionURI uri = getActionURI(annotation);
+ if (uri != null) {
+ String link = uri.getURI();
+ //blank links are not written
+ if (link != null && !link.isBlank()) {
+ xhtml.startElement("div", "class", "annotation");
+ xhtml.startElement("a", "href", link);
+ xhtml.characters(link);
+ xhtml.endElement("a");
+ xhtml.endElement("div");
+ }
+ }
+
+ if (annotation instanceof PDAnnotationMarkup) {
+ PDAnnotationMarkup annotationMarkup = (PDAnnotationMarkup)
annotation;
+ String title = annotationMarkup.getTitlePopup();
+ String subject = annotationMarkup.getSubject();
+ String contents = annotationMarkup.getContents();
+ // TODO: maybe also annotationMarkup.getRichContents()?
+ //the wrapping div is only written if at least one of the three values exists
+ if (title != null || subject != null || contents != null) {
+ xhtml.startElement("div", "class", "annotation");
+
+ if (title != null) {
+ xhtml.startElement("div", "class", "annotationTitle");
+ xhtml.characters(title);
+ xhtml.endElement("div");
+ }
+
+ if (subject != null) {
+ xhtml.startElement("div", "class", "annotationSubject");
+ xhtml.characters(subject);
+ xhtml.endElement("div");
+ }
+
+ if (contents != null) {
+ xhtml.startElement("div", "class", "annotationContents");
+ xhtml.characters(contents);
+ xhtml.endElement("div");
+ }
+
+ xhtml.endElement("div");
+ }
+ }
+ }
+
+ /**
+  * Extracts the OnInstantiate script stream from the annotation's 3DD dictionary.
+  * If the dictionary or the stream is missing, nothing happens. Otherwise the stream
+  * is offered to the embedded document extractor with javascript metadata (trigger
+  * "3DD_ON_INSTANTIATE") and an empty marker div is written to the XHTML.
+  *
+  * @param annotation annotation whose COS dictionary is inspected
+  * @throws IOException propagated from reading the stream or from the extractor
+  * @throws SAXException propagated from the extractor or the XHTML handler
+  */
+ private void extractOnInstantiate(PDAnnotation annotation) throws IOException, SAXException {
+     COSDictionary threeDDict = annotation.getCOSObject().getCOSDictionary(COSName.THREE_DD);
+     COSStream onInstantiate =
+             (threeDDict == null) ? null : threeDDict.getCOSStream(ON_INSTANTIATE);
+     if (onInstantiate == null) {
+         //no 3DD dictionary or no OnInstantiate stream -- nothing to extract
+         return;
+     }
+     Metadata scriptMetadata = getJavascriptMetadata("3DD_ON_INSTANTIATE", null, null);
+     if (embeddedDocumentExtractor.shouldParseEmbedded(scriptMetadata)) {
+         try (TikaInputStream tis = TikaInputStream.get(onInstantiate.createInputStream())) {
+             embeddedDocumentExtractor.parseEmbedded(tis, xhtml, scriptMetadata, true);
+         }
+     }
+     //the marker div is written whether or not the script itself was parsed
+     AttributesImpl divAttributes = new AttributesImpl();
+     addNonNullAttribute("class", "javascript", divAttributes);
+     addNonNullAttribute("type", "3dd_on_instantiate", divAttributes);
+     xhtml.startElement("div", divAttributes);
+     xhtml.endElement("div");
+ }
+
private List<COSDictionary> findFileSpecs(COSDictionary cosDict) {
Set<COSName> types = new HashSet<>();
types.add(COSName.FILESPEC);
@@ -880,36 +912,30 @@ class AbstractPDF2XHTML extends PDFTextStripper {
return;
}
handleDestinationOrAction(widget.getAction(),
ActionTrigger.ANNOTATION_WIDGET);
- PDAnnotationAdditionalActions annotationActions = widget.getActions();
- if (annotationActions != null) {
- handleDestinationOrAction(annotationActions.getBl(),
- ActionTrigger.ANNOTATION_LOSE_INPUT_FOCUS);
- handleDestinationOrAction(annotationActions.getD(),
- ActionTrigger.ANNOTATION_MOUSE_CLICK);
- handleDestinationOrAction(annotationActions.getE(),
- ActionTrigger.ANNOTATION_CURSOR_ENTERS);
- handleDestinationOrAction(annotationActions.getFo(),
- ActionTrigger.ANNOTATION_RECEIVES_FOCUS);
- handleDestinationOrAction(annotationActions.getPC(),
- ActionTrigger.ANNOTATION_PAGE_CLOSED);
- handleDestinationOrAction(annotationActions.getPI(),
- ActionTrigger.ANNOTATION_PAGE_NO_LONGER_VISIBLE);
- handleDestinationOrAction(annotationActions.getPO(),
- ActionTrigger.ANNOTATION_PAGE_OPENED);
- handleDestinationOrAction(annotationActions.getPV(),
- ActionTrigger.ANNOTATION_PAGE_VISIBLE);
- handleDestinationOrAction(annotationActions.getU(),
- ActionTrigger.ANNOTATION_MOUSE_RELEASED);
- handleDestinationOrAction(annotationActions.getX(),
- ActionTrigger.ANNOTATION_CURSOR_EXIT);
- }
+ handlePDAnnotationAdditionalActions(widget.getActions());
+ }
+ /**
+  * Passes each additional action of an annotation to handleDestinationOrAction,
+  * paired with the ActionTrigger that corresponds to it. Does nothing when
+  * annotationActions is null.
+  *
+  * @param annotationActions the annotation's additional actions, may be null
+  */
+ private void handlePDAnnotationAdditionalActions(PDAnnotationAdditionalActions annotationActions)
+         throws TikaException, IOException, SAXException {
+     if (annotationActions != null) {
+         handleDestinationOrAction(annotationActions.getBl(),
+                 ActionTrigger.ANNOTATION_LOSE_INPUT_FOCUS);
+         handleDestinationOrAction(annotationActions.getD(),
+                 ActionTrigger.ANNOTATION_MOUSE_CLICK);
+         handleDestinationOrAction(annotationActions.getE(),
+                 ActionTrigger.ANNOTATION_CURSOR_ENTERS);
+         handleDestinationOrAction(annotationActions.getFo(),
+                 ActionTrigger.ANNOTATION_RECEIVES_FOCUS);
+         handleDestinationOrAction(annotationActions.getPC(),
+                 ActionTrigger.ANNOTATION_PAGE_CLOSED);
+         handleDestinationOrAction(annotationActions.getPI(),
+                 ActionTrigger.ANNOTATION_PAGE_NO_LONGER_VISIBLE);
+         handleDestinationOrAction(annotationActions.getPO(),
+                 ActionTrigger.ANNOTATION_PAGE_OPENED);
+         handleDestinationOrAction(annotationActions.getPV(),
+                 ActionTrigger.ANNOTATION_PAGE_VISIBLE);
+         handleDestinationOrAction(annotationActions.getU(),
+                 ActionTrigger.ANNOTATION_MOUSE_RELEASED);
+         handleDestinationOrAction(annotationActions.getX(),
+                 ActionTrigger.ANNOTATION_CURSOR_EXIT);
+     }
+ }
@Override
protected void startDocument(PDDocument pdf) throws IOException {
try {
xhtml.startDocument();
+ extractJavaScriptFromNameTreeNode(pdf);
try {
handleDestinationOrAction(pdf.getDocumentCatalog().getOpenAction(),
ActionTrigger.DOCUMENT_OPEN);
@@ -922,6 +948,57 @@ class AbstractPDF2XHTML extends PDFTextStripper {
}
}
+ /**
+  * Extracts the JavaScript entries stored in the document's names tree: first the
+  * names on the root node, then, recursively, the names on its kids. Does nothing
+  * unless config.isExtractActions() is set, or when the document has no catalog,
+  * no names dictionary or no JavaScript name tree.
+  * <p>
+  * An IOException raised while walking the tree is recorded on the parent metadata
+  * (instead of being silently dropped) and does not abort the parse.
+  *
+  * @param pdf the document whose names tree is inspected
+  * @throws SAXException propagated from writing to the XHTML handler
+  */
+ private void extractJavaScriptFromNameTreeNode(PDDocument pdf) throws SAXException {
+     if (! config.isExtractActions()) {
+         return;
+     }
+     PDDocumentCatalog catalog = pdf.getDocumentCatalog();
+     if (catalog == null || catalog.getNames() == null) {
+         return;
+     }
+     PDJavascriptNameTreeNode root = catalog.getNames().getJavaScript();
+     if (root == null) {
+         return;
+     }
+     try {
+         addJavaScript(root.getNames());
+         //the root is depth 0, so its kids start at depth 1
+         processJavascriptNameTreeNodeKids(root.getKids(), 1);
+     } catch (IOException e) {
+         //best effort: keep parsing, but leave a trace of the failure
+         EmbeddedDocumentUtil.recordEmbeddedStreamException(e, parentMetadata);
+     }
+ }
+
+ /**
+  * Processes every named JavaScript action of one name tree node. Each non-blank
+  * script is passed to processJavaScriptAction with trigger "NAMES_TREE" and its
+  * name from the tree. Entries with a missing or blank script are skipped; the
+  * remaining entries are still processed.
+  *
+  * @param pdActionJavaScriptMap script name to action; may be null, e.g. for a
+  *                              node that has only kids and no names of its own
+  * @throws IOException propagated from processJavaScriptAction
+  * @throws SAXException propagated from processJavaScriptAction
+  */
+ private void addJavaScript(Map<String, PDActionJavaScript> pdActionJavaScriptMap)
+         throws IOException, SAXException {
+     if (pdActionJavaScriptMap == null) {
+         return;
+     }
+     for (Map.Entry<String, PDActionJavaScript> e : pdActionJavaScriptMap.entrySet()) {
+         PDActionJavaScript jsAction = e.getValue();
+         if (jsAction == null || StringUtils.isBlank(jsAction.getAction())) {
+             //skip only this entry; a blank script must not hide the ones after it
+             continue;
+         }
+         AttributesImpl attributes = new AttributesImpl();
+         addNonNullAttribute("trigger", "namesTree", attributes);
+         addNonNullAttribute("type", jsAction.getClass().getSimpleName(), attributes);
+
+         processJavaScriptAction("NAMES_TREE", e.getKey(), jsAction, attributes);
+     }
+ }
+
+ /**
+  * Recursively processes the child nodes of a JavaScript name tree node: the names
+  * of each kid are handed to addJavaScript, then the kid's own kids are visited one
+  * level deeper.
+  *
+  * @param kids  child nodes to visit, may be null
+  * @param depth depth of these kids; recursion stops once it exceeds MAX_RECURSION_DEPTH
+  */
+ private void processJavascriptNameTreeNodeKids(List<PDNameTreeNode<PDActionJavaScript>> kids,
+                                                int depth) throws IOException, SAXException {
+     //no kids, or hit max recursion:
+     //return silently for now...maybe throw Exception?
+     if (kids == null || depth > MAX_RECURSION_DEPTH) {
+         return;
+     }
+     final int childDepth = depth + 1;
+     for (PDNameTreeNode<PDActionJavaScript> kid : kids) {
+         addJavaScript(kid.getNames());
+         processJavascriptNameTreeNodeKids(kid.getKids(), childDepth);
+     }
+ }
+
private void handleDestinationOrAction(PDDestinationOrAction action,
ActionTrigger actionTrigger)
throws IOException, SAXException, TikaException {
@@ -956,25 +1033,7 @@ class AbstractPDF2XHTML extends PDFTextStripper {
PDActionRemoteGoTo remoteGoTo = (PDActionRemoteGoTo) action;
processDocOnAction("", "", remoteGoTo.getFile(), attributes);
} else if (action instanceof PDActionJavaScript) {
- PDActionJavaScript jsAction = (PDActionJavaScript) action;
- Metadata m = new Metadata();
- m.set(Metadata.CONTENT_TYPE, "application/javascript");
- m.set(Metadata.CONTENT_ENCODING,
StandardCharsets.UTF_8.toString());
- m.set(PDF.ACTION_TRIGGER, actionTrigger.toString());
- m.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
- TikaCoreProperties.EmbeddedResourceType.MACRO.name());
- String js = jsAction.getAction();
- js = (js == null) ? "" : js;
- if (embeddedDocumentExtractor.shouldParseEmbedded(m)) {
- try (InputStream is =
TikaInputStream.get(js.getBytes(StandardCharsets.UTF_8))) {
- embeddedDocumentExtractor.parseEmbedded(is, xhtml, m,
true);
- }
- }
- addNonNullAttribute("class", "javascript", attributes);
- addNonNullAttribute("type", jsAction.getType(), attributes);
- addNonNullAttribute("subtype", jsAction.getSubType(), attributes);
- xhtml.startElement("div", attributes);
- xhtml.endElement("div");
+ processJavaScriptAction(actionTrigger.name(), null,
(PDActionJavaScript) action, attributes);
/*} else if (action instanceof PDActionSubmitForm) {
PDActionSubmitForm submitForm = (PDActionSubmitForm) action;
//these are typically urls, not actual file specification
@@ -986,6 +1045,37 @@ class AbstractPDF2XHTML extends PDFTextStripper {
}
}
+ /**
+  * Hands the script of a JavaScript action to the embedded document extractor (as
+  * UTF-8 bytes, with metadata built by getJavascriptMetadata) and writes an empty
+  * marker div to the XHTML. A null script is treated as the empty string.
+  *
+  * @param trigger      value stored as the action trigger in the script's metadata
+  * @param jsActionName name of the script, may be null
+  * @param jsAction     the action holding the script
+  * @param attrs        attributes for the marker div; class, type and subtype are added here
+  */
+ private void processJavaScriptAction(String trigger, String jsActionName,
+                                      PDActionJavaScript jsAction, AttributesImpl attrs)
+         throws IOException, SAXException {
+     Metadata scriptMetadata =
+             getJavascriptMetadata(trigger, jsActionName, StandardCharsets.UTF_8);
+     String script = jsAction.getAction();
+     if (script == null) {
+         script = "";
+     }
+     if (embeddedDocumentExtractor.shouldParseEmbedded(scriptMetadata)) {
+         byte[] scriptBytes = script.getBytes(StandardCharsets.UTF_8);
+         try (TikaInputStream tis = TikaInputStream.get(scriptBytes)) {
+             embeddedDocumentExtractor.parseEmbedded(tis, xhtml, scriptMetadata, true);
+         }
+     }
+     //the marker div is written whether or not the script itself was parsed
+     addNonNullAttribute("class", "javascript", attrs);
+     addNonNullAttribute("type", jsAction.getType(), attrs);
+     addNonNullAttribute("subtype", jsAction.getSubType(), attrs);
+     xhtml.startElement("div", attrs);
+     xhtml.endElement("div");
+ }
+
+ /**
+  * Builds the metadata for an extracted script: content type
+  * "application/javascript", the given action trigger and the MACRO embedded
+  * resource type. The script name and the content encoding are optional.
+  *
+  * @param trigger      value for PDF.ACTION_TRIGGER
+  * @param jsActionName value for PDF.JS_NAME; left unset when blank
+  * @param charset      value for the content encoding; left unset when null
+  * @return a new Metadata object
+  */
+ private Metadata getJavascriptMetadata(String trigger, String jsActionName, Charset charset) {
+     Metadata jsMetadata = new Metadata();
+     jsMetadata.set(Metadata.CONTENT_TYPE, "application/javascript");
+     jsMetadata.set(PDF.ACTION_TRIGGER, trigger);
+     jsMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
+             TikaCoreProperties.EmbeddedResourceType.MACRO.name());
+     boolean hasName = !StringUtils.isBlank(jsActionName);
+     if (hasName) {
+         jsMetadata.set(PDF.JS_NAME, jsActionName);
+     }
+     if (charset != null) {
+         jsMetadata.set(Metadata.CONTENT_ENCODING, charset.toString());
+     }
+     return jsMetadata;
+ }
+
@Override
protected void endDocument(PDDocument pdf) throws IOException {
try {
@@ -1105,7 +1195,7 @@ class AbstractPDF2XHTML extends PDFTextStripper {
updateMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
TikaCoreProperties.EmbeddedResourceType.VERSION.toString());
if (embeddedDocumentExtractor.shouldParseEmbedded(updateMetadata))
{
- try (InputStream tis = TikaInputStream.get(update)) {
+ try (TikaInputStream tis = TikaInputStream.get(update)) {
context.set(IsIncrementalUpdate.class,
IsIncrementalUpdate.IS_INCREMENTAL_UPDATE);
embeddedDocumentExtractor.parseEmbedded(tis, xhtml,
updateMetadata, false);
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index 78f54c4f1..57feed96d 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -25,13 +25,18 @@ import static org.junit.jupiter.api.Assertions.fail;
import static org.junit.jupiter.api.Assumptions.assumeTrue;
import java.io.InputStream;
+import java.util.HashSet;
import java.util.List;
import java.util.Locale;
+import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.BeforeAll;
+import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import org.apache.tika.TikaTest;
@@ -54,6 +59,7 @@ import org.apache.tika.parser.ocr.TesseractOCRParser;
import org.apache.tika.parser.xml.XMLProfiler;
import org.apache.tika.sax.BasicContentHandlerFactory;
import org.apache.tika.sax.RecursiveParserWrapperHandler;
+import org.apache.tika.utils.StringUtils;
public class PDFParserTest extends TikaTest {
public static final MediaType TYPE_TEXT = MediaType.TEXT_PLAIN;
@@ -250,7 +256,7 @@ public class PDFParserTest extends TikaTest {
@Test
public void testEmbeddedDocsWithOCROnly() throws Exception {
- assumeTrue(canRunOCR(), "can run OCR");
+ assumeTrue(canRunOCR(), "can't run OCR");
//test default is "auto"
assertEquals(PDFParserConfig.OCR_STRATEGY.AUTO, new
PDFParserConfig().getOcrStrategy());
testStrategy(null);
@@ -366,7 +372,7 @@ public class PDFParserTest extends TikaTest {
@Test
public void testJBIG2OCROnly() throws Exception {
- assumeTrue(canRunOCR(), "can run OCR");
+ assumeTrue(canRunOCR(), "can't run OCR");
PDFParserConfig config = new PDFParserConfig();
config.setOcrStrategy(PDFParserConfig.OCR_STRATEGY.OCR_ONLY);
ParseContext context = new ParseContext();
@@ -378,7 +384,7 @@ public class PDFParserTest extends TikaTest {
@Test
public void testJPEG2000() throws Exception {
- assumeTrue(canRunOCR(), "can run OCR");
+ assumeTrue(canRunOCR(), "can't run OCR");
PDFParserConfig config = new PDFParserConfig();
config.setOcrStrategy(PDFParserConfig.OCR_STRATEGY.OCR_ONLY);
ParseContext context = new ParseContext();
@@ -390,7 +396,7 @@ public class PDFParserTest extends TikaTest {
@Test
public void testOCRAutoMode() throws Exception {
- assumeTrue(canRunOCR(), "can run OCR");
+ assumeTrue(canRunOCR(), "can't run OCR");
//default
assertContains("Happy New Year", getXML("testOCR.pdf").xml);
@@ -409,7 +415,7 @@ public class PDFParserTest extends TikaTest {
@Test
public void testOCRNoText() throws Exception {
- assumeTrue(canRunOCR(), "can run OCR");
+ assumeTrue(canRunOCR(), "can't run OCR");
PDFParserConfig config = new PDFParserConfig();
config.setOcrRenderingStrategy(PDFParserConfig.OCR_RENDERING_STRATEGY.ALL);
config.setOcrStrategy(PDFParserConfig.OCR_STRATEGY.OCR_ONLY);
@@ -432,7 +438,7 @@ public class PDFParserTest extends TikaTest {
//TIKA-2970 -- make sure that configurations set on the
TesseractOCRParser
//make it through to when the TesseractOCRParser is called via
//the PDFParser
- assumeTrue(canRunOCR(), "can run OCR");
+ assumeTrue(canRunOCR(), "can't run OCR");
//via the config, tesseract should skip this file because it is too
large
try (InputStream is = getResourceAsStream(
@@ -457,8 +463,8 @@ public class PDFParserTest extends TikaTest {
public void testMuPDFInOCR() throws Exception {
//TODO -- need to add "rendered by" to confirm that mutool was
actually called
//and that there wasn't some backoff to PDFBox the PDFParser
- assumeTrue(canRunOCR(), "can run OCR");
- assumeTrue(hasMuPDF(), "has mupdf");
+ assumeTrue(canRunOCR(), "can't run OCR");
+ assumeTrue(hasMuPDF(), "does not have mupdf");
try (InputStream is = getResourceAsStream(
"/configs/tika-rendering-mupdf-config.xml")) {
assertNotNull(is);
@@ -507,4 +513,74 @@ public class PDFParserTest extends TikaTest {
assertEquals(TikaCoreProperties.EmbeddedResourceType.VERSION.toString(),
metadataList.get(2).get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE));
}
+
+ @Test
+ public void testJavascriptInNamesTreeOne() throws Exception {
+     PDFParserConfig config = new PDFParserConfig();
+     config.setExtractActions(true);
+     ParseContext pc = new ParseContext();
+     pc.set(PDFParserConfig.class, config);
+     List<Metadata> metadataList = getRecursiveMetadata("testPDFPackage.pdf", pc, true);
+     assertEquals(4, metadataList.size());
+
+     //the primary document must contain a marker div for the names-tree javascript
+     String mainXhtml = metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT);
+     Matcher divMatcher = Pattern.compile("<div ([^>]{0,1000})>").matcher(mainXhtml);
+     boolean sawNamesTreeDiv = false;
+     while (divMatcher.find()) {
+         String divAttrs = divMatcher.group(1);
+         if (!divAttrs.contains("trigger=\"namesTree\"")) {
+             continue;
+         }
+         assertContains("type=\"PDActionJavaScript\"", divAttrs);
+         assertContains("class=\"javascript\"", divAttrs);
+         assertContains("subtype=\"JavaScript\"", divAttrs);
+         sawNamesTreeDiv = true;
+     }
+     if (!sawNamesTreeDiv) {
+         fail("failed to find js div in main document");
+     }
+
+     //the script itself must be extracted as the first embedded document
+     Metadata scriptMetadata = metadataList.get(1);
+     assertEquals("MACRO", scriptMetadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE));
+     assertEquals("NAMES_TREE", scriptMetadata.get(PDF.ACTION_TRIGGER));
+     String scriptName = scriptMetadata.get(PDF.JS_NAME);
+     assertTrue(scriptName.startsWith("ADBE::FileAttachmentsCompatibility"));
+     assertContains("app.viewerVersion", scriptMetadata.get(TikaCoreProperties.TIKA_CONTENT));
+ }
+
+ @Test
+ public void testJavascriptInNamesTreeTwo() throws Exception {
+     PDFParserConfig config = new PDFParserConfig();
+     config.setExtractActions(true);
+     ParseContext pc = new ParseContext();
+     pc.set(PDFParserConfig.class, config);
+
+     //collect every non-blank script name reported for the file and its embedded documents
+     Set<String> jsNames = new HashSet<>();
+     for (Metadata m : getRecursiveMetadata("testPDF_XFA_govdocs1_258578.pdf", pc, true)) {
+         String name = m.get(PDF.JS_NAME);
+         if (StringUtils.isBlank(name)) {
+             continue;
+         }
+         jsNames.add(name);
+     }
+
+     Set<String> expected = Set.of("!ADBE::0200_VersChkCode_XFACheck",
+             "!ADBE::0100_VersChkVars", "!ADBE::0100_VersChkStrings");
+     assertEquals(expected, jsNames);
+ }
+
+ @Test
+ @Disabled("until we can sort the license of the test file")
+ public void testJavascriptOnInstantiate() throws Exception {
+     // test file:
+     // https://pdfa.org/wp-content/uploads/2021/12/Make-Buy-BOM-to-EBOM-Alignment-Example.pdf
+     PDFParserConfig config = new PDFParserConfig();
+     config.setExtractActions(true);
+     ParseContext pc = new ParseContext();
+     pc.set(PDFParserConfig.class, config);
+     List<Metadata> metadataList =
+             getRecursiveMetadata("Make-Buy-BOM-to-EBOM-Alignment-Example.pdf", pc, true);
+     assertEquals(6, metadataList.size());
+
+     //the OnInstantiate script of the 3D annotation
+     Metadata onInstantiate = metadataList.get(4);
+     String onInstantiateContent = onInstantiate.get(TikaCoreProperties.TIKA_CONTENT);
+     assertContains("scene.cameras.getByIndex", onInstantiateContent);
+     assertEquals("MACRO", onInstantiate.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE));
+     assertEquals("3DD_ON_INSTANTIATE", onInstantiate.get(PDF.ACTION_TRIGGER));
+
+     //test that the additional actions on the 3d object are processed
+     String additionalActionContent = metadataList.get(5).get(TikaCoreProperties.TIKA_CONTENT);
+     assertContains("this.notify3DAnnotPageOpen()", additionalActionContent);
+ }
}