(tika) branch main updated: TIKA-4465 -- extract javascript from name tree (#2305)

tallison Mon, 18 Aug 2025 12:53:42 -0700

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git



The following commit(s) were added to refs/heads/main by this push:
     new b466c4920 TIKA-4465 -- extract javascript from name tree (#2305)
b466c4920 is described below

commit b466c4920c3ab0ae5bb9fd203749c8585c52126f
Author: Tim Allison <[email protected]>
AuthorDate: Mon Aug 18 15:53:31 2025 -0400

    TIKA-4465 -- extract javascript from name tree (#2305)
    
    * TIKA-4465 -- extract javascript from name tree
---
 .../main/java/org/apache/tika/metadata/PDF.java    |   6 +
 .../apache/tika/parser/pdf/AbstractPDF2XHTML.java  | 352 +++++++++++++--------
 .../org/apache/tika/parser/pdf/PDFParserTest.java  |  92 +++++-
 3 files changed, 313 insertions(+), 137 deletions(-)

diff --git a/tika-core/src/main/java/org/apache/tika/metadata/PDF.java 
b/tika-core/src/main/java/org/apache/tika/metadata/PDF.java
index b15c10383..f85218936 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/PDF.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/PDF.java
@@ -215,4 +215,10 @@ public interface PDF {
      */
     Property OCR_PAGE_COUNT = Property.externalInteger(PDF_PREFIX + 
"ocrPageCount");
 
+    /**
+     * When javascript is stored in the names tree, there's a name associated 
with that script.
+     * This is that name. When javascript is stored in an action, there is no 
name, and this
+     * metadata will not be populated.
+     */
+    Property JS_NAME = Property.internalText(PDF_PREFIX + "jsName");
 }
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
index de47f2394..132efa3f9 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
@@ -28,6 +28,7 @@ import java.io.InputStream;
 import java.io.OutputStream;
 import java.lang.reflect.InvocationTargetException;
 import java.lang.reflect.Method;
+import java.nio.charset.Charset;
 import java.nio.charset.StandardCharsets;
 import java.nio.file.Files;
 import java.nio.file.Path;
@@ -52,8 +53,10 @@ import org.apache.pdfbox.cos.COSBase;
 import org.apache.pdfbox.cos.COSDictionary;
 import org.apache.pdfbox.cos.COSName;
 import org.apache.pdfbox.cos.COSObject;
+import org.apache.pdfbox.cos.COSStream;
 import org.apache.pdfbox.pdmodel.PDDocument;
 import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
+import org.apache.pdfbox.pdmodel.PDJavascriptNameTreeNode;
 import org.apache.pdfbox.pdmodel.PDPage;
 import org.apache.pdfbox.pdmodel.PDPageTree;
 import org.apache.pdfbox.pdmodel.common.COSObjectable;
@@ -147,7 +150,7 @@ class AbstractPDF2XHTML extends PDFTextStripper {
     //These can be unbounded.  We need to limit the number we store.
     private final static int MAX_ANNOTATION_TYPES = 100;
     private static final String THREE_D = "3D";
-    private static final COSName THREE_DD = COSName.getPDFName("3DD");
+    private static final COSName ON_INSTANTIATE = 
COSName.getPDFName("OnInstantiate");
     private static final String NULL_STRING = "null";
     private static final MediaType XFA_MEDIA_TYPE = 
MediaType.application("vnd.adobe.xdp+xml");
     private static final MediaType XMP_MEDIA_TYPE = 
MediaType.application("rdf+xml");
@@ -700,92 +703,7 @@ class AbstractPDF2XHTML extends PDFTextStripper {
 
         try {
             for (PDAnnotation annotation : page.getAnnotations()) {
-                String annotationName = annotation.getAnnotationName();
-                if (annotationTypes.size() < MAX_ANNOTATION_TYPES) {
-                    if (annotationName != null) {
-                        annotationTypes.add(annotationName);
-                    } else {
-                        annotationTypes.add(NULL_STRING);
-                    }
-                }
-                String annotationSubtype = annotation.getSubtype();
-                if (annotationSubtypes.size() < MAX_ANNOTATION_TYPES) {
-                    if (annotationSubtype != null) {
-                        annotationSubtypes.add(annotationSubtype);
-                    } else {
-                        annotationSubtypes.add(NULL_STRING);
-                    }
-                }
-                if (annotation instanceof PDAnnotationFileAttachment) {
-                    PDAnnotationFileAttachment fann = 
(PDAnnotationFileAttachment) annotation;
-                    String subtype = "annotationFileAttachment";
-                    AttributesImpl attributes = new AttributesImpl();
-                    attributes.addAttribute("", "source", "source", "CDATA", 
subtype);
-                    processDocOnAction("", subtype, fann.getFile(), 
attributes);
-                } else if (annotation instanceof PDAnnotationWidget) {
-                    handleWidget((PDAnnotationWidget) annotation);
-                } else {
-                    if (annotationSubtype == null) {
-                        annotationSubtype = "unknown";
-                    } else if (annotationSubtype.equals(THREE_D) ||
-                            annotation.getCOSObject().containsKey(THREE_DD)) {
-                        //To make this stricter, we could get the 3DD stream 
object and see if the
-                        //subtype is U3D or PRC or model/ (prefix for model 
mime type)
-                        metadata.set(PDF.HAS_3D, true);
-                        num3DAnnotations++;
-                    }
-                    for (COSDictionary fileSpec : 
findFileSpecs(annotation.getCOSObject())) {
-                        AttributesImpl attributes = new AttributesImpl();
-                        attributes.addAttribute("", "source", "source", 
"CDATA", annotationSubtype);
-                        processDocOnAction("", annotationSubtype, 
createFileSpecification(fileSpec),
-                                attributes);
-                    }
-                }
-                // TODO: remove once PDFBOX-1143 is fixed:
-                if (config.isExtractAnnotationText()) {
-                    PDActionURI uri = getActionURI(annotation);
-                    if (uri != null) {
-                        String link = uri.getURI();
-                        if (link != null && !link.isBlank()) {
-                            xhtml.startElement("div", "class", "annotation");
-                            xhtml.startElement("a", "href", link);
-                            xhtml.characters(link);
-                            xhtml.endElement("a");
-                            xhtml.endElement("div");
-                        }
-                    }
-
-                    if (annotation instanceof PDAnnotationMarkup) {
-                        PDAnnotationMarkup annotationMarkup = 
(PDAnnotationMarkup) annotation;
-                        String title = annotationMarkup.getTitlePopup();
-                        String subject = annotationMarkup.getSubject();
-                        String contents = annotationMarkup.getContents();
-                        // TODO: maybe also annotationMarkup.getRichContents()?
-                        if (title != null || subject != null || contents != 
null) {
-                            xhtml.startElement("div", "class", "annotation");
-
-                            if (title != null) {
-                                xhtml.startElement("div", "class", 
"annotationTitle");
-                                xhtml.characters(title);
-                                xhtml.endElement("div");
-                            }
-
-                            if (subject != null) {
-                                xhtml.startElement("div", "class", 
"annotationSubject");
-                                xhtml.characters(subject);
-                                xhtml.endElement("div");
-                            }
-
-                            if (contents != null) {
-                                xhtml.startElement("div", "class", 
"annotationContents");
-                                xhtml.characters(contents);
-                                xhtml.endElement("div");
-                            }
-
-                            xhtml.endElement("div");
-                        }
-                    }
-                }
+                processPageAnnotation(annotation);
             }
             if (config.getOcrStrategy() == 
PDFParserConfig.OCR_STRATEGY.OCR_AND_TEXT_EXTRACTION) {
                 doOCROnCurrentPage(page, OCR_AND_TEXT_EXTRACTION);
@@ -835,6 +753,124 @@ class AbstractPDF2XHTML extends PDFTextStripper {
         }
     }
 
+    private void processPageAnnotation(PDAnnotation annotation) throws 
TikaException, IOException, SAXException {
+
+        String annotationName = annotation.getAnnotationName();
+        if (annotationTypes.size() < MAX_ANNOTATION_TYPES) {
+            if (annotationName != null) {
+                annotationTypes.add(annotationName);
+            } else {
+                annotationTypes.add(NULL_STRING);
+            }
+        }
+        String annotationSubtype = annotation.getSubtype();
+        if (annotationSubtypes.size() < MAX_ANNOTATION_TYPES) {
+            if (annotationSubtype != null) {
+                annotationSubtypes.add(annotationSubtype);
+            } else {
+                annotationSubtypes.add(NULL_STRING);
+            }
+        }
+        if (annotation instanceof PDAnnotationFileAttachment) {
+            PDAnnotationFileAttachment fann = (PDAnnotationFileAttachment) 
annotation;
+            String subtype = "annotationFileAttachment";
+            AttributesImpl attributes = new AttributesImpl();
+            attributes.addAttribute("", "source", "source", "CDATA", subtype);
+            processDocOnAction("", subtype, fann.getFile(), attributes);
+        } else if (annotation instanceof PDAnnotationWidget) {
+            handleWidget((PDAnnotationWidget) annotation);
+        } else {
+            if (annotationSubtype == null) {
+                annotationSubtype = "unknown";
+            } else if (annotationSubtype.equals(THREE_D) ||
+                    annotation.getCOSObject().containsKey(COSName.THREE_DD)) {
+                //To make this stricter, we could get the 3DD stream object 
and see if the
+                //subtype is U3D or PRC or model/ (prefix for model mime type)
+                extractOnInstantiate(annotation);
+                COSDictionary additionalActions = 
annotation.getCOSObject().getCOSDictionary(COSName.AA);
+                if (additionalActions != null) {
+                    handlePDAnnotationAdditionalActions(new 
PDAnnotationAdditionalActions(additionalActions));
+                }
+                metadata.set(PDF.HAS_3D, true);
+                num3DAnnotations++;
+            }
+            for (COSDictionary fileSpec : 
findFileSpecs(annotation.getCOSObject())) {
+                AttributesImpl attributes = new AttributesImpl();
+                attributes.addAttribute("", "source", "source", "CDATA", 
annotationSubtype);
+                processDocOnAction("", annotationSubtype, 
createFileSpecification(fileSpec),
+                        attributes);
+            }
+        }
+        if (! config.isExtractAnnotationText()) {
+            return;
+        }
+        // TODO: remove once PDFBOX-1143 is fixed:
+        PDActionURI uri = getActionURI(annotation);
+        if (uri != null) {
+            String link = uri.getURI();
+            if (link != null && !link.isBlank()) {
+                xhtml.startElement("div", "class", "annotation");
+                xhtml.startElement("a", "href", link);
+                xhtml.characters(link);
+                xhtml.endElement("a");
+                xhtml.endElement("div");
+            }
+        }
+
+        if (annotation instanceof PDAnnotationMarkup) {
+            PDAnnotationMarkup annotationMarkup = (PDAnnotationMarkup) 
annotation;
+            String title = annotationMarkup.getTitlePopup();
+            String subject = annotationMarkup.getSubject();
+            String contents = annotationMarkup.getContents();
+            // TODO: maybe also annotationMarkup.getRichContents()?
+            if (title != null || subject != null || contents != null) {
+                xhtml.startElement("div", "class", "annotation");
+
+                if (title != null) {
+                    xhtml.startElement("div", "class", "annotationTitle");
+                    xhtml.characters(title);
+                    xhtml.endElement("div");
+                }
+
+                if (subject != null) {
+                    xhtml.startElement("div", "class", "annotationSubject");
+                    xhtml.characters(subject);
+                    xhtml.endElement("div");
+                }
+
+                if (contents != null) {
+                    xhtml.startElement("div", "class", "annotationContents");
+                    xhtml.characters(contents);
+                    xhtml.endElement("div");
+                }
+
+                xhtml.endElement("div");
+            }
+        }
+    }
+
+    private void extractOnInstantiate(PDAnnotation annotation) throws 
IOException, SAXException {
+        COSDictionary threeDD = 
annotation.getCOSObject().getCOSDictionary(COSName.THREE_DD);
+        if (threeDD == null) {
+            return;
+        }
+        COSStream stream = threeDD.getCOSStream(ON_INSTANTIATE);
+        if (stream == null) {
+            return;
+        }
+        Metadata m = getJavascriptMetadata("3DD_ON_INSTANTIATE", null, null);
+        if (embeddedDocumentExtractor.shouldParseEmbedded(m)) {
+            try (TikaInputStream tis = 
TikaInputStream.get(stream.createInputStream())) {
+                embeddedDocumentExtractor.parseEmbedded(tis, xhtml, m, true);
+            }
+        }
+        AttributesImpl attrs = new AttributesImpl();
+        addNonNullAttribute("class", "javascript", attrs);
+        addNonNullAttribute("type", "3dd_on_instantiate", attrs);
+        xhtml.startElement("div", attrs);
+        xhtml.endElement("div");
+    }
+
     private List<COSDictionary> findFileSpecs(COSDictionary cosDict) {
         Set<COSName> types = new HashSet<>();
         types.add(COSName.FILESPEC);
@@ -876,36 +912,30 @@ class AbstractPDF2XHTML extends PDFTextStripper {
             return;
         }
         handleDestinationOrAction(widget.getAction(), 
ActionTrigger.ANNOTATION_WIDGET);
-        PDAnnotationAdditionalActions annotationActions = widget.getActions();
-        if (annotationActions != null) {
-            handleDestinationOrAction(annotationActions.getBl(),
-                    ActionTrigger.ANNOTATION_LOSE_INPUT_FOCUS);
-            handleDestinationOrAction(annotationActions.getD(),
-                    ActionTrigger.ANNOTATION_MOUSE_CLICK);
-            handleDestinationOrAction(annotationActions.getE(),
-                    ActionTrigger.ANNOTATION_CURSOR_ENTERS);
-            handleDestinationOrAction(annotationActions.getFo(),
-                    ActionTrigger.ANNOTATION_RECEIVES_FOCUS);
-            handleDestinationOrAction(annotationActions.getPC(),
-                    ActionTrigger.ANNOTATION_PAGE_CLOSED);
-            handleDestinationOrAction(annotationActions.getPI(),
-                    ActionTrigger.ANNOTATION_PAGE_NO_LONGER_VISIBLE);
-            handleDestinationOrAction(annotationActions.getPO(),
-                    ActionTrigger.ANNOTATION_PAGE_OPENED);
-            handleDestinationOrAction(annotationActions.getPV(),
-                    ActionTrigger.ANNOTATION_PAGE_VISIBLE);
-            handleDestinationOrAction(annotationActions.getU(),
-                    ActionTrigger.ANNOTATION_MOUSE_RELEASED);
-            handleDestinationOrAction(annotationActions.getX(),
-                    ActionTrigger.ANNOTATION_CURSOR_EXIT);
-        }
+        handlePDAnnotationAdditionalActions(widget.getActions());
+    }
 
+    private void 
handlePDAnnotationAdditionalActions(PDAnnotationAdditionalActions 
annotationActions) throws TikaException, IOException, SAXException {
+        if (annotationActions == null) {
+            return;
+        }
+        handleDestinationOrAction(annotationActions.getBl(), 
ActionTrigger.ANNOTATION_LOSE_INPUT_FOCUS);
+        handleDestinationOrAction(annotationActions.getD(), 
ActionTrigger.ANNOTATION_MOUSE_CLICK);
+        handleDestinationOrAction(annotationActions.getE(), 
ActionTrigger.ANNOTATION_CURSOR_ENTERS);
+        handleDestinationOrAction(annotationActions.getFo(), 
ActionTrigger.ANNOTATION_RECEIVES_FOCUS);
+        handleDestinationOrAction(annotationActions.getPC(), 
ActionTrigger.ANNOTATION_PAGE_CLOSED);
+        handleDestinationOrAction(annotationActions.getPI(), 
ActionTrigger.ANNOTATION_PAGE_NO_LONGER_VISIBLE);
+        handleDestinationOrAction(annotationActions.getPO(), 
ActionTrigger.ANNOTATION_PAGE_OPENED);
+        handleDestinationOrAction(annotationActions.getPV(), 
ActionTrigger.ANNOTATION_PAGE_VISIBLE);
+        handleDestinationOrAction(annotationActions.getU(), 
ActionTrigger.ANNOTATION_MOUSE_RELEASED);
+        handleDestinationOrAction(annotationActions.getX(), 
ActionTrigger.ANNOTATION_CURSOR_EXIT);
     }
 
     @Override
     protected void startDocument(PDDocument pdf) throws IOException {
         try {
             xhtml.startDocument();
+            extractJavaScriptFromNameTreeNode(pdf);
             try {
                 
handleDestinationOrAction(pdf.getDocumentCatalog().getOpenAction(),
                         ActionTrigger.DOCUMENT_OPEN);
@@ -918,6 +948,57 @@ class AbstractPDF2XHTML extends PDFTextStripper {
         }
     }
 
+    private void extractJavaScriptFromNameTreeNode(PDDocument pdf) throws 
SAXException {
+        if (! config.isExtractActions()) {
+            return;
+        }
+        if (pdf.getDocumentCatalog() == null || 
pdf.getDocumentCatalog().getNames() == null
+                || pdf.getDocumentCatalog().getNames().getJavaScript() == 
null) {
+            return;
+        }
+        try {
+            PDJavascriptNameTreeNode pdjntn = 
pdf.getDocumentCatalog().getNames().getJavaScript();
+            addJavaScript(pdjntn.getNames());
+            int depth = 0;
+            processJavascriptNameTreeNodeKids(pdjntn.getKids(), depth + 1);
+        } catch (IOException e) {
+            //swallow
+        }
+    }
+
+    private void addJavaScript(Map<String, PDActionJavaScript> 
pdActionJavaScriptMap) throws IOException, SAXException {
+        for (Map.Entry<String, PDActionJavaScript> e : 
pdActionJavaScriptMap.entrySet()) {
+            String action = e.getValue().getAction();
+            if (StringUtils.isBlank(action)) {
+                return;
+            }
+            AttributesImpl attributes = new AttributesImpl();
+
+            addNonNullAttribute("trigger", "namesTree", attributes);
+            addNonNullAttribute("type", 
e.getValue().getClass().getSimpleName(), attributes);
+
+            processJavaScriptAction("NAMES_TREE", e.getKey(), e.getValue(), 
attributes);
+        }
+
+    }
+
+    private void 
processJavascriptNameTreeNodeKids(List<PDNameTreeNode<PDActionJavaScript>> 
kids, int depth) throws IOException, SAXException {
+
+        if (kids == null) {
+            return;
+        }
+
+        if (depth > MAX_RECURSION_DEPTH) {
+            //hit max recursion
+            //return silently for now...maybe throw Exception?
+            return;
+        }
+        for (PDNameTreeNode<PDActionJavaScript> pdntn: kids) {
+            addJavaScript(pdntn.getNames());
+            processJavascriptNameTreeNodeKids(pdntn.getKids(), depth + 1);
+        };
+    }
+
     private void handleDestinationOrAction(PDDestinationOrAction action,
                                            ActionTrigger actionTrigger)
             throws IOException, SAXException, TikaException {
@@ -952,25 +1033,7 @@ class AbstractPDF2XHTML extends PDFTextStripper {
             PDActionRemoteGoTo remoteGoTo = (PDActionRemoteGoTo) action;
             processDocOnAction("", "", remoteGoTo.getFile(), attributes);
         } else if (action instanceof PDActionJavaScript) {
-            PDActionJavaScript jsAction = (PDActionJavaScript) action;
-            Metadata m = new Metadata();
-            m.set(Metadata.CONTENT_TYPE, "application/javascript");
-            m.set(Metadata.CONTENT_ENCODING, 
StandardCharsets.UTF_8.toString());
-            m.set(PDF.ACTION_TRIGGER, actionTrigger.toString());
-            m.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
-                    TikaCoreProperties.EmbeddedResourceType.MACRO.name());
-            String js = jsAction.getAction();
-            js = (js == null) ? "" : js;
-            if (embeddedDocumentExtractor.shouldParseEmbedded(m)) {
-                try (TikaInputStream tis = 
TikaInputStream.get(js.getBytes(StandardCharsets.UTF_8))) {
-                    embeddedDocumentExtractor.parseEmbedded(tis, xhtml, m, 
true);
-                }
-            }
-            addNonNullAttribute("class", "javascript", attributes);
-            addNonNullAttribute("type", jsAction.getType(), attributes);
-            addNonNullAttribute("subtype", jsAction.getSubType(), attributes);
-            xhtml.startElement("div", attributes);
-            xhtml.endElement("div");
+            processJavaScriptAction(actionTrigger.name(), null, 
(PDActionJavaScript) action, attributes);
         /*} else if (action instanceof PDActionSubmitForm) {
             PDActionSubmitForm submitForm = (PDActionSubmitForm) action;
             //these are typically urls, not actual file specification
@@ -982,6 +1045,37 @@ class AbstractPDF2XHTML extends PDFTextStripper {
         }
     }
 
+    private void processJavaScriptAction(String trigger, String jsActionName, 
PDActionJavaScript jsAction, AttributesImpl attrs) throws IOException, 
SAXException {
+        Metadata m = getJavascriptMetadata(trigger, jsActionName, 
StandardCharsets.UTF_8);
+        String js = jsAction.getAction();
+        js = (js == null) ? "" : js;
+        if (embeddedDocumentExtractor.shouldParseEmbedded(m)) {
+            try (TikaInputStream tis = 
TikaInputStream.get(js.getBytes(StandardCharsets.UTF_8))) {
+                embeddedDocumentExtractor.parseEmbedded(tis, xhtml, m, true);
+            }
+        };
+        addNonNullAttribute("class", "javascript", attrs);
+        addNonNullAttribute("type", jsAction.getType(), attrs);
+        addNonNullAttribute("subtype", jsAction.getSubType(), attrs);
+        xhtml.startElement("div", attrs);
+        xhtml.endElement("div");
+    }
+
+    private Metadata getJavascriptMetadata(String trigger, String 
jsActionName, Charset charset) {
+        Metadata m = new Metadata();
+        m.set(Metadata.CONTENT_TYPE, "application/javascript");
+        m.set(PDF.ACTION_TRIGGER, trigger);
+        m.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
+                TikaCoreProperties.EmbeddedResourceType.MACRO.name());
+        if (! StringUtils.isBlank(jsActionName)) {
+            m.set(PDF.JS_NAME, jsActionName);
+        }
+        if (charset != null) {
+            m.set(Metadata.CONTENT_ENCODING, charset.toString());
+        }
+        return m;
+    }
+
     @Override
     protected void endDocument(PDDocument pdf) throws IOException {
         try {
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index 49b0042cb..d4d9df116 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -25,13 +25,18 @@ import static org.junit.jupiter.api.Assertions.fail;
 import static org.junit.jupiter.api.Assumptions.assumeTrue;
 
 import java.io.InputStream;
+import java.util.HashSet;
 import java.util.List;
 import java.util.Locale;
+import java.util.Set;
 import java.util.logging.Level;
 import java.util.logging.Logger;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
 
 import org.junit.jupiter.api.AfterAll;
 import org.junit.jupiter.api.BeforeAll;
+import org.junit.jupiter.api.Disabled;
 import org.junit.jupiter.api.Test;
 
 import org.apache.tika.TikaTest;
@@ -55,6 +60,7 @@ import org.apache.tika.parser.ocr.TesseractOCRParser;
 import org.apache.tika.parser.xml.XMLProfiler;
 import org.apache.tika.sax.BasicContentHandlerFactory;
 import org.apache.tika.sax.RecursiveParserWrapperHandler;
+import org.apache.tika.utils.StringUtils;
 
 public class PDFParserTest extends TikaTest {
     public static final MediaType TYPE_TEXT = MediaType.TEXT_PLAIN;
@@ -251,7 +257,7 @@ public class PDFParserTest extends TikaTest {
 
     @Test
     public void testEmbeddedDocsWithOCROnly() throws Exception {
-        assumeTrue(canRunOCR(), "can run OCR");
+        assumeTrue(canRunOCR(), "can't run OCR");
         //test default is "auto"
         assertEquals(PDFParserConfig.OCR_STRATEGY.AUTO, new 
PDFParserConfig().getOcrStrategy());
         testStrategy(null);
@@ -367,7 +373,7 @@ public class PDFParserTest extends TikaTest {
 
     @Test
     public void testJBIG2OCROnly() throws Exception {
-        assumeTrue(canRunOCR(), "can run OCR");
+        assumeTrue(canRunOCR(), "can't run OCR");
         PDFParserConfig config = new PDFParserConfig();
         config.setOcrStrategy(PDFParserConfig.OCR_STRATEGY.OCR_ONLY);
         ParseContext context = new ParseContext();
@@ -379,7 +385,7 @@ public class PDFParserTest extends TikaTest {
 
     @Test
     public void testJPEG2000() throws Exception {
-        assumeTrue(canRunOCR(), "can run OCR");
+        assumeTrue(canRunOCR(), "can't run OCR");
         PDFParserConfig config = new PDFParserConfig();
         config.setOcrStrategy(PDFParserConfig.OCR_STRATEGY.OCR_ONLY);
         ParseContext context = new ParseContext();
@@ -391,7 +397,7 @@ public class PDFParserTest extends TikaTest {
 
     @Test
     public void testOCRAutoMode() throws Exception {
-        assumeTrue(canRunOCR(), "can run OCR");
+        assumeTrue(canRunOCR(), "can't run OCR");
 
         //default
         assertContains("Happy New Year", getXML("testOCR.pdf").xml);
@@ -410,7 +416,7 @@ public class PDFParserTest extends TikaTest {
 
     @Test
     public void testOCRNoText() throws Exception {
-        assumeTrue(canRunOCR(), "can run OCR");
+        assumeTrue(canRunOCR(), "can't run OCR");
         PDFParserConfig config = new PDFParserConfig();
         
config.setOcrRenderingStrategy(PDFParserConfig.OCR_RENDERING_STRATEGY.ALL);
         config.setOcrStrategy(PDFParserConfig.OCR_STRATEGY.OCR_ONLY);
@@ -433,7 +439,7 @@ public class PDFParserTest extends TikaTest {
         //TIKA-2970 -- make sure that configurations set on the 
TesseractOCRParser
         //make it through to when the TesseractOCRParser is called via
         //the PDFParser
-        assumeTrue(canRunOCR(), "can run OCR");
+        assumeTrue(canRunOCR(), "can't run OCR");
 
         //via the config, tesseract should skip this file because it is too 
large
         try (InputStream is = getResourceAsStream(
@@ -458,8 +464,8 @@ public class PDFParserTest extends TikaTest {
     public void testMuPDFInOCR() throws Exception {
         //TODO -- need to add "rendered by" to confirm that mutool was 
actually called
         //and that there wasn't some backoff to PDFBox the PDFParser
-        assumeTrue(canRunOCR(), "can run OCR");
-        assumeTrue(hasMuPDF(), "has mupdf");
+        assumeTrue(canRunOCR(), "can't run OCR");
+        assumeTrue(hasMuPDF(), "does not have mupdf");
         try (InputStream is = getResourceAsStream(
                 "/configs/tika-rendering-mupdf-config.xml")) {
             assertNotNull(is);
@@ -508,4 +514,74 @@ public class PDFParserTest extends TikaTest {
         
assertEquals(TikaCoreProperties.EmbeddedResourceType.VERSION.toString(),
                 
metadataList.get(2).get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE));
     }
+
+    @Test
+    public void testJavascriptInNamesTreeOne() throws Exception {
+        PDFParserConfig config = new PDFParserConfig();
+        config.setExtractActions(true);
+        ParseContext pc = new ParseContext();
+        pc.set(PDFParserConfig.class, config);
+        List<Metadata> metadataList = 
getRecursiveMetadata("testPDFPackage.pdf", pc, true);
+        assertEquals(4, metadataList.size());
+        //look for markup in primary document
+        Metadata m = metadataList.get(0);
+        String xhtml = m.get(TikaCoreProperties.TIKA_CONTENT);
+        Matcher matcher = Pattern.compile("<div 
([^>]{0,1000})>").matcher(xhtml);
+        boolean found = false;
+        while (matcher.find()) {
+            String div = matcher.group(1);
+            if (div.contains("trigger=\"namesTree\"")) {
+                assertContains("type=\"PDActionJavaScript\"", div);
+                assertContains("class=\"javascript\"", div);
+                assertContains("subtype=\"JavaScript\"", div);
+                found = true;
+            }
+        }
+        if (! found) {
+            fail("failed to find js div in main document");
+        }
+        //now test js extraction
+        Metadata js = metadataList.get(1);
+        assertEquals("MACRO", 
js.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE));
+        assertEquals("NAMES_TREE", js.get(PDF.ACTION_TRIGGER));
+        
assertTrue(js.get(PDF.JS_NAME).startsWith("ADBE::FileAttachmentsCompatibility"));
+        assertContains("app.viewerVersion", 
js.get(TikaCoreProperties.TIKA_CONTENT));
+    }
+
+    @Test
+    public void testJavascriptInNamesTreeTwo() throws Exception {
+        Set<String> expected = Set.of("!ADBE::0200_VersChkCode_XFACheck", 
"!ADBE::0100_VersChkVars", "!ADBE::0100_VersChkStrings");
+        PDFParserConfig config = new PDFParserConfig();
+        config.setExtractActions(true);
+        ParseContext pc = new ParseContext();
+        pc.set(PDFParserConfig.class, config);
+        List<Metadata> metadataList = 
getRecursiveMetadata("testPDF_XFA_govdocs1_258578.pdf", pc, true);
+        Set<String> jsNames = new HashSet<>();
+        for (Metadata m : metadataList) {
+            String n = m.get(PDF.JS_NAME);
+            if (!StringUtils.isBlank(n)) {
+                jsNames.add(n);
+            }
+        }
+        assertEquals(expected, jsNames);
+    }
+
+    @Test
+    @Disabled("until we can sort the license of the test file")
+    public void testJavascriptOnInstantiate() throws Exception {
+        // test file: 
https://pdfa.org/wp-content/uploads/2021/12/Make-Buy-BOM-to-EBOM-Alignment-Example.pdf
+        PDFParserConfig config = new PDFParserConfig();
+        config.setExtractActions(true);
+        ParseContext pc = new ParseContext();
+        pc.set(PDFParserConfig.class, config);
+        List<Metadata> metadataList = 
getRecursiveMetadata("Make-Buy-BOM-to-EBOM-Alignment-Example.pdf", pc, true);
+        assertEquals(6, metadataList.size());
+        Metadata onInstantiate = metadataList.get(4);
+        assertContains("scene.cameras.getByIndex", 
onInstantiate.get(TikaCoreProperties.TIKA_CONTENT));
+        assertEquals("MACRO", 
onInstantiate.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE));
+        assertEquals("3DD_ON_INSTANTIATE", 
onInstantiate.get(PDF.ACTION_TRIGGER));
+
+        //test that the additional actions on the 3d object are processed
+        assertContains("this.notify3DAnnotPageOpen()", 
metadataList.get(5).get(TikaCoreProperties.TIKA_CONTENT));
+    }
 }

(tika) branch main updated: TIKA-4465 -- extract javascript from name tree (#2305)

Reply via email to