This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 7fc25dd TIKA-3413 -- avoid stackoverflow in bookmark handling
7fc25dd is described below
commit 7fc25dd9b050c38222626a9925e32b1a8d2b0067
Author: tballison <[email protected]>
AuthorDate: Fri May 21 12:48:07 2021 -0400
TIKA-3413 -- avoid stackoverflow in bookmark handling
---
.../apache/tika/parser/pdf/AbstractPDF2XHTML.java | 18 ++++++++++++----
.../org/apache/tika/parser/pdf/PDFParserTest.java | 25 ----------------------
2 files changed, 14 insertions(+), 29 deletions(-)
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
index 3f47272..79a4160 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
@@ -55,6 +55,7 @@ import org.apache.pdfbox.pdmodel.PDDocumentNameDictionary;
import org.apache.pdfbox.pdmodel.PDEmbeddedFilesNameTreeNode;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageTree;
+import org.apache.pdfbox.pdmodel.common.COSObjectable;
import org.apache.pdfbox.pdmodel.common.PDDestinationOrAction;
import org.apache.pdfbox.pdmodel.common.PDNameTreeNode;
import
org.apache.pdfbox.pdmodel.common.filespecification.PDComplexFileSpecification;
@@ -123,6 +124,7 @@ class AbstractPDF2XHTML extends PDFTextStripper {
* the embedded document tree.
*/
private final static int MAX_RECURSION_DEPTH = 100;
+ private final static int MAX_BOOKMARK_ITEMS = 10000;
private static final MediaType XFA_MEDIA_TYPE =
MediaType.application("vnd.adobe.xdp+xml");
private static final MediaType XMP_MEDIA_TYPE =
MediaType.application("rdf+xml");
final List<IOException> exceptions = new ArrayList<>();
@@ -797,24 +799,32 @@ class AbstractPDF2XHTML extends PDFTextStripper {
void extractBookmarkText() throws SAXException, IOException, TikaException
{
PDDocumentOutline outline =
document.getDocumentCatalog().getDocumentOutline();
if (outline != null) {
- extractBookmarkText(outline);
+ Set<COSObjectable> seen = new HashSet<>();
+ extractBookmarkText(outline, seen, 0);
}
}
- void extractBookmarkText(PDOutlineNode bookmark)
+ void extractBookmarkText(PDOutlineNode bookmark, Set<COSObjectable> seen, int itemCount)
throws SAXException, IOException, TikaException {
PDOutlineItem current = bookmark.getFirstChild();
-
+ if (itemCount > MAX_BOOKMARK_ITEMS) {
+ return;
+ }
if (current != null) {
+ if (seen.contains(current)) {
+ return;
+ }
xhtml.startElement("ul");
while (current != null) {
+ seen.add(current);
xhtml.startElement("li");
xhtml.characters(current.getTitle());
xhtml.endElement("li");
handleDestinationOrAction(current.getAction(),
ActionTrigger.BOOKMARK);
// Recurse:
- extractBookmarkText(current);
+ extractBookmarkText(current, seen, itemCount + 1);
current = current.getNextSibling();
+ itemCount++;
}
xhtml.endElement("ul");
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index 183ecbf..b90a405 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -23,12 +23,7 @@ import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertNull;
import static org.junit.Assert.assertTrue;
-import java.awt.image.BufferedImage;
import java.io.InputStream;
-import java.io.OutputStream;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.nio.file.Paths;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
@@ -41,10 +36,7 @@ import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.io.IOUtils;
-import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.rendering.ImageType;
-import org.apache.pdfbox.rendering.PDFRenderer;
-import org.apache.pdfbox.tools.imageio.ImageIOUtil;
import org.junit.AfterClass;
import org.junit.BeforeClass;
import org.junit.Ignore;
@@ -1391,21 +1383,4 @@ public class PDFParserTest extends TikaTest {
return metadata;
}*/
- @Test
- public void oneOff() throws Exception {
- Path p = Paths.get("/home/tallison/Intellij/tika-main/tika-parsers/tika-parsers-standard" +
- "/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/test" +
- "-documents/testPDF_XFA_govdocs1_258578.pdf");
- p = Paths.get("/home/tallison/Downloads/tiger.pdf");
- PDDocument pdDocument = PDDocument.load(p.toFile());
- PDFRenderer renderer = new NoTextPDFRenderer(pdDocument);
- Path target = Paths.get("/home/tallison/Desktop/tiger-no-text.png");
- BufferedImage image = renderer.renderImageWithDPI(0, 300);
- try (OutputStream os = Files.newOutputStream(target)) {
- //TODO: get output format from TesseractConfig
- ImageIOUtil.writeImage(image, "png", os, 300);
- }
- }
-
-
}