This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/branch_1x by this push:
new b77e8ba6c TIKA-3832 -- prevent infinite cycle in bookmark extraction
b77e8ba6c is described below
commit b77e8ba6c9cb3effd0c5fe785ac54b25a5fa9118
Author: tallison <[email protected]>
AuthorDate: Fri Aug 5 10:35:33 2022 -0400
TIKA-3832 -- prevent infinite cycle in bookmark extraction
---
.../apache/tika/parser/pdf/AbstractPDF2XHTML.java | 26 ++++++++++++++++++----
1 file changed, 22 insertions(+), 4 deletions(-)
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
index 1a4f76866..539e201b0 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
@@ -52,6 +52,7 @@ import org.apache.pdfbox.pdmodel.PDDocumentNameDictionary;
import org.apache.pdfbox.pdmodel.PDEmbeddedFilesNameTreeNode;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageTree;
+import org.apache.pdfbox.pdmodel.common.COSObjectable;
import org.apache.pdfbox.pdmodel.common.PDDestinationOrAction;
import org.apache.pdfbox.pdmodel.common.PDNameTreeNode;
import
org.apache.pdfbox.pdmodel.common.filespecification.PDComplexFileSpecification;
@@ -145,6 +146,8 @@ class AbstractPDF2XHTML extends PDFTextStripper {
*/
private final static int MAX_RECURSION_DEPTH = 100;
+ private final static int MAX_BOOKMARK_ITEMS = 10000;
+
private final static TesseractOCRConfig DEFAULT_TESSERACT_CONFIG = new
TesseractOCRConfig();
private static final MediaType XFA_MEDIA_TYPE =
MediaType.application("vnd.adobe.xdp+xml");
@@ -729,23 +732,38 @@ class AbstractPDF2XHTML extends PDFTextStripper {
void extractBookmarkText() throws SAXException, IOException, TikaException
{
PDDocumentOutline outline =
document.getDocumentCatalog().getDocumentOutline();
if (outline != null) {
- extractBookmarkText(outline);
+ Set<COSObjectable> seen = new HashSet<>();
+ extractBookmarkText(outline, seen, 0);
}
}
- void extractBookmarkText(PDOutlineNode bookmark) throws SAXException,
IOException, TikaException {
+ void extractBookmarkText(PDOutlineNode bookmark, Set<COSObjectable> seen,
int itemCount)
+ throws SAXException, IOException, TikaException {
PDOutlineItem current = bookmark.getFirstChild();
-
if (current != null) {
+ if (seen.contains(current)) {
+ return;
+ }
+ if (itemCount > MAX_BOOKMARK_ITEMS) {
+ return;
+ }
xhtml.startElement("ul");
while (current != null) {
+ if (seen.contains(current)) {
+ break;
+ }
+ if (itemCount > MAX_BOOKMARK_ITEMS) {
+ break;
+ }
+ seen.add(current);
xhtml.startElement("li");
xhtml.characters(current.getTitle());
xhtml.endElement("li");
handleDestinationOrAction(current.getAction(),
ActionTrigger.BOOKMARK);
// Recurse:
- extractBookmarkText(current);
+ extractBookmarkText(current, seen, itemCount + 1);
current = current.getNextSibling();
+ itemCount++;
}
xhtml.endElement("ul");
}