[
https://issues.apache.org/jira/browse/TIKA-3224?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17226366#comment-17226366
]
Tim Allison commented on TIKA-3224:
-----------------------------------
I manually extracted the PDF file from the docx, and I'm getting the same
behavior with plain PDFBox, run via `java -jar pdfbox.jar ExtractText`.
{noformat}
Exception in thread "main" java.lang.StackOverflowError
at
java.base/java.util.regex.Pattern$BmpCharProperty.match(Pattern.java:3950)
at java.base/java.util.regex.Pattern$GroupHead.match(Pattern.java:4791)
at java.base/java.util.regex.Pattern$BranchConn.match(Pattern.java:4700)
at java.base/java.util.regex.Pattern$GroupTail.match(Pattern.java:4850)
at
java.base/java.util.regex.Pattern$BmpCharProperty.match(Pattern.java:3951)
at java.base/java.util.regex.Pattern$GroupHead.match(Pattern.java:4791)
at java.base/java.util.regex.Pattern$Branch.match(Pattern.java:4736)
at java.base/java.util.regex.Pattern$Branch.match(Pattern.java:4734)
at java.base/java.util.regex.Pattern$Branch.match(Pattern.java:4734)
at java.base/java.util.regex.Pattern$BranchConn.match(Pattern.java:4700)
at java.base/java.util.regex.Pattern$GroupTail.match(Pattern.java:4850)
at
java.base/java.util.regex.Pattern$BmpCharPropertyGreedy.match(Pattern.java:4331)
at java.base/java.util.regex.Pattern$GroupHead.match(Pattern.java:4791)
at java.base/java.util.regex.Pattern$Branch.match(Pattern.java:4736)
at java.base/java.util.regex.Pattern$BranchConn.match(Pattern.java:4700)
at java.base/java.util.regex.Pattern$GroupTail.match(Pattern.java:4850)
at
java.base/java.util.regex.Pattern$BmpCharProperty.match(Pattern.java:3951)
at
java.base/java.util.regex.Pattern$BmpCharPropertyGreedy.match(Pattern.java:4331)
at java.base/java.util.regex.Pattern$GroupHead.match(Pattern.java:4791)
at java.base/java.util.regex.Pattern$Branch.match(Pattern.java:4736)
at
java.base/java.util.regex.Pattern$BmpCharProperty.match(Pattern.java:3951)
at java.base/java.util.regex.Pattern$Start.match(Pattern.java:3606)
at java.base/java.util.regex.Matcher.search(Matcher.java:1729)
at java.base/java.util.regex.Matcher.find(Matcher.java:773)
at java.base/java.util.Formatter.parse(Formatter.java:2702)
at java.base/java.util.Formatter.format(Formatter.java:2655)
at java.base/java.util.Formatter.format(Formatter.java:2609)
at java.base/java.lang.String.format(String.java:2897)
at
java.logging/java.util.logging.SimpleFormatter.format(SimpleFormatter.java:178)
at
java.logging/java.util.logging.StreamHandler.publish(StreamHandler.java:199)
at
java.logging/java.util.logging.ConsoleHandler.publish(ConsoleHandler.java:95)
at java.logging/java.util.logging.Logger.log(Logger.java:979)
at java.logging/java.util.logging.Logger.doLog(Logger.java:1006)
at java.logging/java.util.logging.Logger.logp(Logger.java:1172)
at org.apache.commons.logging.impl.Jdk14Logger.log(Jdk14Logger.java:87)
at
org.apache.commons.logging.impl.Jdk14Logger.warn(Jdk14Logger.java:260)
at org.apache.pdfbox.pdmodel.PDPageTree.getKids(PDPageTree.java:159)
at org.apache.pdfbox.pdmodel.PDPageTree.access$200(PDPageTree.java:41)
at
org.apache.pdfbox.pdmodel.PDPageTree$PageIterator.enqueueKids(PDPageTree.java:183)
at
org.apache.pdfbox.pdmodel.PDPageTree$PageIterator.enqueueKids(PDPageTree.java:186)
at
org.apache.pdfbox.pdmodel.PDPageTree$PageIterator.enqueueKids(PDPageTree.java:186)
at
org.apache.pdfbox.pdmodel.PDPageTree$PageIterator.enqueueKids(PDPageTree.java:186)
at
org.apache.pdfbox.pdmodel.PDPageTree$PageIterator.enqueueKids(PDPageTree.java:186)
at
org.apache.pdfbox.pdmodel.PDPageTree$PageIterator.enqueueKids(PDPageTree.java:186)
at
org.apache.pdfbox.pdmodel.PDPageTree$PageIterator.enqueueKids(PDPageTree.java:186)
at
org.apache.pdfbox.pdmodel.PDPageTree$PageIterator.enqueueKids(PDPageTree.java:186)
at
org.apache.pdfbox.pdmodel.PDPageTree$PageIterator.enqueueKids(PDPageTree.java:186)
at
org.apache.pdfbox.pdmodel.PDPageTree$PageIterator.enqueueKids(PDPageTree.java:186)
at
org.apache.pdfbox.pdmodel.PDPageTree$PageIterator.enqueueKids(PDPageTree.java:186)
at
org.apache.pdfbox.pdmodel.PDPageTree$PageIterator.enqueueKids(PDPageTree.java:186)
at
org.apache.pdfbox.pdmodel.PDPageTree$PageIterator.enqueueKids(PDPageTree.java:186)
at
org.apache.pdfbox.pdmodel.PDPageTree$PageIterator.enqueueKids(PDPageTree.java:186)
at
org.apache.pdfbox.pdmodel.PDPageTree$PageIterator.enqueueKids(PDPageTree.java:186)
at
org.apache.pdfbox.pdmodel.PDPageTree$PageIterator.enqueueKids(PDPageTree.java:186)
at
org.apache.pdfbox.pdmodel.PDPageTree$PageIterator.enqueueKids(PDPageTree.java:186)
at
org.apache.pdfbox.pdmodel.PDPageTree$PageIterator.enqueueKids(PDPageTree.java:186)
{noformat}
I'm not able to open the PDF with Foxit or Apple's Preview, and I get a huge
number of errors from pdftotext and `mutool clean -s`.
So, this looks like a StackOverflowError caused by a bad/crafted PDF.
> Stackoverflow with Embedded PDF in DOCX document
> ------------------------------------------------
>
> Key: TIKA-3224
> URL: https://issues.apache.org/jira/browse/TIKA-3224
> Project: Tika
> Issue Type: Bug
> Components: parser
> Affects Versions: 1.24.1
> Reporter: David Pilato
> Priority: Major
> Attachments: issue-stackoverflow.docx, oleObject1_cleaned.pdf
>
>
> This issue has been reported by a user on
> [discuss.elastic.co|https://discuss.elastic.co/t/stackoverflow-on-elasticsearch-file-indexation-with-ingest-attachment/253455/4].
> I can reproduce the problem using the latest version of Tika (1.24.1) in
> the FSCrawler project.
> When running the data extraction, we see:
> {code:java}
> java.lang.StackOverflowError: null
> at
> java.util.regex.Pattern$BmpCharPredicate.lambda$union$2(Pattern.java:5692)
> ~[?:?]
> at java.util.regex.Pattern$BmpCharProperty.match(Pattern.java:4019)
> ~[?:?]
> at java.util.regex.Pattern$GroupHead.match(Pattern.java:4855) ~[?:?]
> at java.util.regex.Pattern$BranchConn.match(Pattern.java:4763) ~[?:?]
> at java.util.regex.Pattern$GroupTail.match(Pattern.java:4886) ~[?:?]
> at java.util.regex.Pattern$BmpCharProperty.match(Pattern.java:4020)
> ~[?:?]
> at java.util.regex.Pattern$GroupHead.match(Pattern.java:4855) ~[?:?]
> at java.util.regex.Pattern$Branch.match(Pattern.java:4800) ~[?:?]
> at java.util.regex.Pattern$Branch.match(Pattern.java:4798) ~[?:?]
> at java.util.regex.Pattern$Branch.match(Pattern.java:4798) ~[?:?]
> at java.util.regex.Pattern$BranchConn.match(Pattern.java:4763) ~[?:?]
> at java.util.regex.Pattern$GroupTail.match(Pattern.java:4886) ~[?:?]
> at
> java.util.regex.Pattern$BmpCharPropertyGreedy.match(Pattern.java:4394) ~[?:?]
> at java.util.regex.Pattern$GroupHead.match(Pattern.java:4855) ~[?:?]
> at java.util.regex.Pattern$Branch.match(Pattern.java:4800) ~[?:?]
> at java.util.regex.Pattern$BranchConn.match(Pattern.java:4763) ~[?:?]
> at java.util.regex.Pattern$GroupTail.match(Pattern.java:4886) ~[?:?]
> at java.util.regex.Pattern$BmpCharProperty.match(Pattern.java:4020)
> ~[?:?]
> at
> java.util.regex.Pattern$BmpCharPropertyGreedy.match(Pattern.java:4394) ~[?:?]
> at java.util.regex.Pattern$GroupHead.match(Pattern.java:4855) ~[?:?]
> at java.util.regex.Pattern$Branch.match(Pattern.java:4800) ~[?:?]
> at java.util.regex.Pattern$BmpCharProperty.match(Pattern.java:4020)
> ~[?:?]
> at java.util.regex.Pattern$Start.match(Pattern.java:3673) ~[?:?]
> at java.util.regex.Matcher.search(Matcher.java:1729) ~[?:?]
> at java.util.regex.Matcher.find(Matcher.java:773) ~[?:?]
> at java.util.Formatter.parse(Formatter.java:2702) ~[?:?]
> at java.util.Formatter.format(Formatter.java:2655) ~[?:?]
> at java.util.Formatter.format(Formatter.java:2609) ~[?:?]
> at java.lang.String.format(String.java:3292) ~[?:?]
> at java.util.logging.SimpleFormatter.format(SimpleFormatter.java:176)
> ~[?:?]
> at java.util.logging.StreamHandler.publish(StreamHandler.java:199)
> ~[?:?]
> at java.util.logging.ConsoleHandler.publish(ConsoleHandler.java:95)
> ~[?:?]
> at java.util.logging.Logger.log(Logger.java:979) ~[?:?]
> at java.util.logging.Logger.doLog(Logger.java:1006) ~[?:?]
> at java.util.logging.Logger.logp(Logger.java:1172) ~[?:?]
> at org.apache.commons.logging.impl.Jdk14Logger.log(Jdk14Logger.java:87)
> ~[?:?]
> at
> org.apache.commons.logging.impl.Jdk14Logger.warn(Jdk14Logger.java:260) ~[?:?]
> at org.apache.pdfbox.pdmodel.PDPageTree.getKids(PDPageTree.java:159)
> ~[?:?]
> at org.apache.pdfbox.pdmodel.PDPageTree.access$200(PDPageTree.java:41)
> ~[?:?]
> at
> org.apache.pdfbox.pdmodel.PDPageTree$PageIterator.enqueueKids(PDPageTree.java:183)
> ~[?:?]
> at
> org.apache.pdfbox.pdmodel.PDPageTree$PageIterator.enqueueKids(PDPageTree.java:186)
> ~[?:?]
> at
> org.apache.pdfbox.pdmodel.PDPageTree$PageIterator.enqueueKids(PDPageTree.java:186)
> ~[?:?]
> at
> org.apache.pdfbox.pdmodel.PDPageTree$PageIterator.enqueueKids(PDPageTree.java:186)
> ~[?:?]
> at
> org.apache.pdfbox.pdmodel.PDPageTree$PageIterator.enqueueKids(PDPageTree.java:186)
> ~[?:?]
> at
> org.apache.pdfbox.pdmodel.PDPageTree$PageIterator.enqueueKids(PDPageTree.java:186)
> ~[?:?]
> at
> org.apache.pdfbox.pdmodel.PDPageTree$PageIterator.enqueueKids(PDPageTree.java:186)
> ~[?:?]
> at
> org.apache.pdfbox.pdmodel.PDPageTree$PageIterator.enqueueKids(PDPageTree.java:186)
> ~[?:?]
> at
> org.apache.pdfbox.pdmodel.PDPageTree$PageIterator.enqueueKids(PDPageTree.java:186)
> ~[?:?]
> at
> org.apache.pdfbox.pdmodel.PDPageTree$PageIterator.enqueueKids(PDPageTree.java:186)
> ~[?:?]
> {code}
> It sounds like this is related to the PDFBox project, but I thought it could
> be useful to report it here as well.
--
This message was sent by Atlassian Jira
(v8.3.4#803005)