This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/branch_1x by this push:
     new b77e8ba6c TIKA-3832 -- prevent infinite cycle in bookmark extraction
b77e8ba6c is described below

commit b77e8ba6c9cb3effd0c5fe785ac54b25a5fa9118
Author: tallison <[email protected]>
AuthorDate: Fri Aug 5 10:35:33 2022 -0400

    TIKA-3832 -- prevent infinite cycle in bookmark extraction
---
 .../apache/tika/parser/pdf/AbstractPDF2XHTML.java  | 26 ++++++++++++++++++----
 1 file changed, 22 insertions(+), 4 deletions(-)

diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
index 1a4f76866..539e201b0 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
@@ -52,6 +52,7 @@ import org.apache.pdfbox.pdmodel.PDDocumentNameDictionary;
 import org.apache.pdfbox.pdmodel.PDEmbeddedFilesNameTreeNode;
 import org.apache.pdfbox.pdmodel.PDPage;
 import org.apache.pdfbox.pdmodel.PDPageTree;
+import org.apache.pdfbox.pdmodel.common.COSObjectable;
 import org.apache.pdfbox.pdmodel.common.PDDestinationOrAction;
 import org.apache.pdfbox.pdmodel.common.PDNameTreeNode;
 import org.apache.pdfbox.pdmodel.common.filespecification.PDComplexFileSpecification;
@@ -145,6 +146,8 @@ class AbstractPDF2XHTML extends PDFTextStripper {
      */
     private final static int MAX_RECURSION_DEPTH = 100;
 
+    private final static int MAX_BOOKMARK_ITEMS = 10000;
+
     private final static TesseractOCRConfig DEFAULT_TESSERACT_CONFIG = new TesseractOCRConfig();
 
     private static final MediaType XFA_MEDIA_TYPE = MediaType.application("vnd.adobe.xdp+xml");
@@ -729,23 +732,38 @@ class AbstractPDF2XHTML extends PDFTextStripper {
     void extractBookmarkText() throws SAXException, IOException, TikaException {
         PDDocumentOutline outline = document.getDocumentCatalog().getDocumentOutline();
         if (outline != null) {
-            extractBookmarkText(outline);
+            Set<COSObjectable> seen = new HashSet<>();
+            extractBookmarkText(outline, seen, 0);
         }
     }
 
-    void extractBookmarkText(PDOutlineNode bookmark) throws SAXException, IOException, TikaException {
+    void extractBookmarkText(PDOutlineNode bookmark, Set<COSObjectable> seen, int itemCount)
+            throws SAXException, IOException, TikaException {
         PDOutlineItem current = bookmark.getFirstChild();
-
         if (current != null) {
+            if (seen.contains(current)) {
+                return;
+            }
+            if (itemCount > MAX_BOOKMARK_ITEMS) {
+                return;
+            }
             xhtml.startElement("ul");
             while (current != null) {
+                if (seen.contains(current)) {
+                    break;
+                }
+                if (itemCount > MAX_BOOKMARK_ITEMS) {
+                    break;
+                }
+                seen.add(current);
                 xhtml.startElement("li");
                 xhtml.characters(current.getTitle());
                 xhtml.endElement("li");
                 handleDestinationOrAction(current.getAction(), ActionTrigger.BOOKMARK);
                 // Recurse:
-                extractBookmarkText(current);
+                extractBookmarkText(current, seen, itemCount + 1);
                 current = current.getNextSibling();
+                itemCount++;
             }
             xhtml.endElement("ul");
         }

Reply via email to