[
https://issues.apache.org/jira/browse/TIKA-3536?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
]
saurabh saxena closed TIKA-3536.
--------------------------------
Resolution: Not A Problem
> Text extraction from PDF never finishes
> ---------------------------------------
>
> Key: TIKA-3536
> URL: https://issues.apache.org/jira/browse/TIKA-3536
> Project: Tika
> Issue Type: Bug
> Affects Versions: 1.27
> Environment: Apache Tika version 1.24
> PDFBOX version 2.0.24
> Reporter: saurabh saxena
> Priority: Major
> Attachments: PdfExtTest.java
>
>
> Text extraction from a PDF using Apache Tika version 1.27 never finishes.
> Looking at the code and the thread dump, it looks like there is an infinite recursion loop.
> Link to pdf file : [https://arxiv.org/pdf/1707.00393]
> PDF File has been attached as well.
> When parsing the file directly with PDFBox, the text is extracted successfully.
> {code:java}
> public void runtestUsingPdfBox() {
> try(FileInputStream fileInputStream = new
> FileInputStream("1707.00393-arxiv.pdf")) {
> final PDDocument document = PDDocument.load(fileInputStream);
> PDFTextStripper pdfStripper = new PDFTextStripper();
> final String text = pdfStripper.getText(document);
> System.out.println(text);
> //Closing the document
> document.close();
> } catch (Exception e) {
> e.printStackTrace();
> }
> }
> {code}
> When the same file is parsed using the Tika parser, it never finishes.
> {code:java}
> public void runtestUsingTikaParser() {
> try(FileInputStream fileInputStream = new
> FileInputStream("1707.00393-arxiv.pdf")) {
> BodyContentHandler handler = new BodyContentHandler();
> Metadata metadata = new Metadata();
> ParseContext pcontext = new ParseContext();
> //parsing the document using PDF parser
> PDFParser pdfparser = new PDFParser();
> pdfparser.parse(fileInputStream, handler, metadata,pcontext);
> //getting the content of the document
> System.out.println("Contents of the PDF :" + handler.toString());
> //getting metadata of the document
> System.out.println("Metadata of the PDF:");
> String[] metadataNames = metadata.names();
> for(String name : metadataNames) {
> System.out.println(name+ " : " + metadata.get(name));
> }
> } catch (Exception e) {
> e.printStackTrace();
> }
> }
> {code}
>
> Thread stack:
> {code:java}
> "main" #1 prio=5 os_prio=31 tid=0x00007f8c5b809000 nid=0xe03 runnable
> [0x000070000303f000]"main" #1 prio=5 os_prio=31 tid=0x00007f8c5b809000
> nid=0xe03 runnable [0x000070000303f000] java.lang.Thread.State: RUNNABLE at
> sun.misc.FDBigInteger.multByPow52(FDBigInteger.java:727) at
> sun.misc.FloatingDecimal$ASCIIToBinaryBuffer.floatValue(FloatingDecimal.java:1560)
> at sun.misc.FloatingDecimal.parseFloat(FloatingDecimal.java:122) at
> java.lang.Float.parseFloat(Float.java:451) at
> java.math.BigDecimal.floatValue(BigDecimal.java:3233) at
> org.apache.pdfbox.cos.COSFloat.checkMinMaxValues(COSFloat.java:94) at
> org.apache.pdfbox.cos.COSFloat.<init>(COSFloat.java:60) at
> org.apache.pdfbox.cos.COSNumber.get(COSNumber.java:101) at
> org.apache.pdfbox.pdfparser.PDFStreamParser.parseNextToken(PDFStreamParser.java:240)
> at
> org.apache.pdfbox.contentstream.PDFStreamEngine.processStreamOperators(PDFStreamEngine.java:521)
> at
> org.apache.pdfbox.contentstream.PDFStreamEngine.processStream(PDFStreamEngine.java:492)
> at
> org.apache.pdfbox.contentstream.PDFStreamEngine.showForm(PDFStreamEngine.java:186)
> at
> org.apache.pdfbox.contentstream.operator.DrawObject.process(DrawObject.java:79)
> at
> org.apache.pdfbox.contentstream.PDFStreamEngine.processOperator(PDFStreamEngine.java:933)
> at org.apache.tika.parser.pdf.PDF2XHTML.processOperator(PDF2XHTML.java:240)
> at
> org.apache.pdfbox.contentstream.PDFStreamEngine.processStreamOperators(PDFStreamEngine.java:514)
> at
> org.apache.pdfbox.contentstream.PDFStreamEngine.processStream(PDFStreamEngine.java:492)
> at
> org.apache.pdfbox.contentstream.PDFStreamEngine.showForm(PDFStreamEngine.java:186)
> at
> org.apache.pdfbox.contentstream.operator.DrawObject.process(DrawObject.java:79)
> at
> org.apache.pdfbox.contentstream.PDFStreamEngine.processOperator(PDFStreamEngine.java:933)
> at org.apache.tika.parser.pdf.PDF2XHTML.processOperator(PDF2XHTML.java:240)
> at
> org.apache.pdfbox.contentstream.PDFStreamEngine.processStreamOperators(PDFStreamEngine.java:514)
> at
> org.apache.pdfbox.contentstream.PDFStreamEngine.processStream(PDFStreamEngine.java:492)
> at
> org.apache.pdfbox.contentstream.PDFStreamEngine.showForm(PDFStreamEngine.java:186)
> at org.apache.tika.parser.pdf.PDF2XHTML.processOperator(PDF2XHTML.java:237)
> at
> org.apache.pdfbox.contentstream.PDFStreamEngine.processStreamOperators(PDFStreamEngine.java:514)
> at
> org.apache.pdfbox.contentstream.PDFStreamEngine.processStream(PDFStreamEngine.java:492)
> at
> org.apache.pdfbox.contentstream.PDFStreamEngine.showForm(PDFStreamEngine.java:186)
> at
> org.apache.pdfbox.contentstream.operator.DrawObject.process(DrawObject.java:79)
> at
> org.apache.pdfbox.contentstream.PDFStreamEngine.processOperator(PDFStreamEngine.java:933)
> at org.apache.tika.parser.pdf.PDF2XHTML.processOperator(PDF2XHTML.java:240)
> at
> org.apache.pdfbox.contentstream.PDFStreamEngine.processStreamOperators(PDFStreamEngine.java:514)
> at
> org.apache.pdfbox.contentstream.PDFStreamEngine.processStream(PDFStreamEngine.java:492)
> at
> org.apache.pdfbox.contentstream.PDFStreamEngine.showForm(PDFStreamEngine.java:186)
> at
> org.apache.pdfbox.contentstream.operator.DrawObject.process(DrawObject.java:79)
> at
> org.apache.pdfbox.contentstream.PDFStreamEngine.processOperator(PDFStreamEngine.java:933)
> at org.apache.tika.parser.pdf.PDF2XHTML.processOperator(PDF2XHTML.java:240)
> at
> org.apache.pdfbox.contentstream.PDFStreamEngine.processStreamOperators(PDFStreamEngine.java:514)
> at
> org.apache.pdfbox.contentstream.PDFStreamEngine.processStream(PDFStreamEngine.java:492)
> at
> org.apache.pdfbox.contentstream.PDFStreamEngine.showForm(PDFStreamEngine.java:186)
> at
> org.apache.pdfbox.contentstream.operator.DrawObject.process(DrawObject.java:79)
> at
> org.apache.pdfbox.contentstream.PDFStreamEngine.processOperator(PDFStreamEngine.java:933)
> at org.apache.tika.parser.pdf.PDF2XHTML.processOperator(PDF2XHTML.java:240)
> at
> org.apache.pdfbox.contentstream.PDFStreamEngine.processStreamOperators(PDFStreamEngine.java:514)
> at
> org.apache.pdfbox.contentstream.PDFStreamEngine.processStream(PDFStreamEngine.java:492)
> at
> org.apache.pdfbox.contentstream.PDFStreamEngine.showForm(PDFStreamEngine.java:186)
> at org.apache.tika.parser.pdf.PDF2XHTML.processOperator(PDF2XHTML.java:237)
> at
> org.apache.pdfbox.contentstream.PDFStreamEngine.processStreamOperators(PDFStreamEngine.java:514)
> at
> org.apache.pdfbox.contentstream.PDFStreamEngine.processStream(PDFStreamEngine.java:492)
> at
> org.apache.pdfbox.contentstream.PDFStreamEngine.showForm(PDFStreamEngine.java:186)
> at org.apache.tika.parser.pdf.PDF2XHTML.processOperator(PDF2XHTML.java:237)
> at
> org.apache.pdfbox.contentstream.PDFStreamEngine.processStreamOperators(PDFStreamEngine.java:514)
> at
> org.apache.pdfbox.contentstream.PDFStreamEngine.processStream(PDFStreamEngine.java:492)
> at
> org.apache.pdfbox.contentstream.PDFStreamEngine.showForm(PDFStreamEngine.java:186)
> at org.apache.tika.parser.pdf.PDF2XHTML.processOperator(PDF2XHTML.java:237)
> at
> org.apache.pdfbox.contentstream.PDFStreamEngine.processStreamOperators(PDFStreamEngine.java:514)
> at
> org.apache.pdfbox.contentstream.PDFStreamEngine.processStream(PDFStreamEngine.java:492)
> at
> org.apache.pdfbox.contentstream.PDFStreamEngine.showForm(PDFStreamEngine.java:186)
> at org.apache.tika.parser.pdf.PDF2XHTML.processOperator(PDF2XHTML.java:237)
> at
> org.apache.pdfbox.contentstream.PDFStreamEngine.processStreamOperators(PDFStreamEngine.java:514)
> at
> org.apache.pdfbox.contentstream.PDFStreamEngine.processStream(PDFStreamEngine.java:492)
> at
> org.apache.pdfbox.contentstream.PDFStreamEngine.showForm(PDFStreamEngine.java:186)
> at org.apache.tika.parser.pdf.PDF2XHTML.processOperator(PDF2XHTML.java:237)
> at
> org.apache.pdfbox.contentstream.PDFStreamEngine.processStreamOperators(PDFStreamEngine.java:514)
> at
> org.apache.pdfbox.contentstream.PDFStreamEngine.processStream(PDFStreamEngine.java:492)
> at
> org.apache.pdfbox.contentstream.PDFStreamEngine.showForm(PDFStreamEngine.java:186)
> at org.apache.tika.parser.pdf.PDF2XHTML.processOperator(PDF2XHTML.java:237)
> at
> org.apache.pdfbox.contentstream.PDFStreamEngine.processStreamOperators(PDFStreamEngine.java:514)
> at
> org.apache.pdfbox.contentstream.PDFStreamEngine.processStream(PDFStreamEngine.java:492)
> at
> org.apache.pdfbox.contentstream.PDFStreamEngine.showForm(PDFStreamEngine.java:186)
> at org.apache.tika.parser.pdf.PDF2XHTML.processOperator(PDF2XHTML.java:237)
> at
> org.apache.pdfbox.contentstream.PDFStreamEngine.processStreamOperators(PDFStreamEngine.java:514)
> at
> org.apache.pdfbox.contentstream.PDFStreamEngine.processStream(PDFStreamEngine.java:492)
> at
> org.apache.pdfbox.contentstream.PDFStreamEngine.showForm(PDFStreamEngine.java:186)
> at org.apache.tika.parser.pdf.PDF2XHTML.processOperator(PDF2XHTML.java:237)
> at
> org.apache.pdfbox.contentstream.PDFStreamEngine.processStreamOperators(PDFStreamEngine.java:514)
> at
> org.apache.pdfbox.contentstream.PDFStreamEngine.processStream(PDFStreamEngine.java:492)
> at
> org.apache.pdfbox.contentstream.PDFStreamEngine.showForm(PDFStreamEngine.java:186)
> at org.apache.tika.parser.pdf.PDF2XHTML.processOperator(PDF2XHTML.java:237)
> at
> org.apache.pdfbox.contentstream.PDFStreamEngine.processStreamOperators(PDFStreamEngine.java:514)
> at
> org.apache.pdfbox.contentstream.PDFStreamEngine.processStream(PDFStreamEngine.java:492)
> at
> org.apache.pdfbox.contentstream.PDFStreamEngine.showForm(PDFStreamEngine.java:186)
> at org.apache.tika.parser.pdf.PDF2XHTML.processOperator(PDF2XHTML.java:237)
> at
> org.apache.pdfbox.contentstream.PDFStreamEngine.processStreamOperators(PDFStreamEngine.java:514)
> at
> org.apache.pdfbox.contentstream.PDFStreamEngine.processStream(PDFStreamEngine.java:492)
> at
> org.apache.pdfbox.contentstream.PDFStreamEngine.showForm(PDFStreamEngine.java:186)
> at org.apache.tika.parser.pdf.PDF2XHTML.processOperator(PDF2XHTML.java:237)
> at
> org.apache.pdfbox.contentstream.PDFStreamEngine.processStreamOperators(PDFStreamEngine.java:514)
> at
> org.apache.pdfbox.contentstream.PDFStreamEngine.processStream(PDFStreamEngine.java:492)
> at
> org.apache.pdfbox.contentstream.PDFStreamEngine.showForm(PDFStreamEngine.java:186)
> at org.apache.tika.parser.pdf.PDF2XHTML.processOperator(PDF2XHTML.java:237)
> at
> org.apache.pdfbox.contentstream.PDFStreamEngine.processStreamOperators(PDFStreamEngine.java:514)
> at
> org.apache.pdfbox.contentstream.PDFStreamEngine.processStream(PDFStreamEngine.java:492)
> at
> org.apache.pdfbox.contentstream.PDFStreamEngine.showForm(PDFStreamEngine.java:186)
> at org.apache.tika.parser.pdf.PDF2XHTML.processOperator(PDF2XHTML.java:237)
> at
> org.apache.pdfbox.contentstream.PDFStreamEngine.processStreamOperators(PDFStreamEngine.java:514)
> at
> org.apache.pdfbox.contentstream.PDFStreamEngine.processStream(PDFStreamEngine.java:492)
> at
> org.apache.pdfbox.contentstream.PDFStreamEngine.showForm(PDFStreamEngine.java:186)
> at org.apache.tika.parser.pdf.PDF2XHTML.processOperator(PDF2XHTML.java:237)
> at
> org.apache.pdfbox.contentstream.PDFStreamEngine.processStreamOperators(PDFStreamEngine.java:514)
> at
> org.apache.pdfbox.contentstream.PDFStreamEngine.processStream(PDFStreamEngine.java:492)
> at
> org.apache.pdfbox.contentstream.PDFStreamEngine.showForm(PDFStreamEngine.java:186)
> at org.apache.tika.parser.pdf.PDF2XHTML.processOperator(PDF2XHTML.java:237)
> at
> org.apache.pdfbox.contentstream.PDFStreamEngine.processStreamOperators(PDFStreamEngine.java:514)
> at
> org.apache.pdfbox.contentstream.PDFStreamEngine.processStream(PDFStreamEngine.java:492)
> at
> org.apache.pdfbox.contentstream.PDFStreamEngine.showForm(PDFStreamEngine.java:186)
> at org.apache.tika.parser.pdf.PDF2XHTML.processOperator(PDF2XHTML.java:237)
> at
> org.apache.pdfbox.contentstream.PDFStreamEngine.processStreamOperators(PDFStreamEngine.java:514)
> at
> org.apache.pdfbox.contentstream.PDFStreamEngine.processStream(PDFStreamEngine.java:492)
> at
> org.apache.pdfbox.contentstream.PDFStreamEngine.showForm(PDFStreamEngine.java:186)
> at org.apache.tika.parser.pdf.PDF2XHTML.processOperator(PDF2XHTML.java:237)
> at
> org.apache.pdfbox.contentstream.PDFStreamEngine.processStreamOperators(PDFStreamEngine.java:514)
> at
> org.apache.pdfbox.contentstream.PDFStreamEngine.processStream(PDFStreamEngine.java:492)
> at
> org.apache.pdfbox.contentstream.PDFStreamEngine.showForm(PDFStreamEngine.java:186)
> at org.apache.tika.parser.pdf.PDF2XHTML.processOperator(PDF2XHTML.java:237)
> at
> org.apache.pdfbox.contentstream.PDFStreamEngine.processStreamOperators(PDFStreamEngine.java:514)
> at
> org.apache.pdfbox.contentstream.PDFStreamEngine.processStream(PDFStreamEngine.java:492)
> at
> org.apache.pdfbox.contentstream.PDFStreamEngine.showForm(PDFStreamEngine.java:186)
> at org.apache.tika.parser.pdf.PDF2XHTML.processOperator(PDF2XHTML.java:237)
> at
> org.apache.pdfbox.contentstream.PDFStreamEngine.processStreamOperators(PDFStreamEngine.java:514)
> at
> org.apache.pdfbox.contentstream.PDFStreamEngine.processStream(PDFStreamEngine.java:492)
> at
> org.apache.pdfbox.contentstream.PDFStreamEngine.showForm(PDFStreamEngine.java:186)
> at org.apache.tika.parser.pdf.PDF2XHTML.processOperator(PDF2XHTML.java:237)
> at
> org.apache.pdfbox.contentstream.PDFStreamEngine.processStreamOperators(PDFStreamEngine.java:514)
> at
> org.apache.pdfbox.contentstream.PDFStreamEngine.processStream(PDFStreamEngine.java:492)
> at
> org.apache.pdfbox.contentstream.PDFStreamEngine.showForm(PDFStreamEngine.java:186)
> at org.apache.tika.parser.pdf.PDF2XHTML.processOperator(PDF2XHTML.java:237)
> at
> org.apache.pdfbox.contentstream.PDFStreamEngine.processStreamOperators(PDFStreamEngine.java:514)
> at
> org.apache.pdfbox.contentstream.PDFStreamEngine.processStream(PDFStreamEngine.java:492)
> at
> org.apache.pdfbox.contentstream.PDFStreamEngine.showForm(PDFStreamEngine.java:186)
> at org.apache.tika.parser.pdf.PDF2XHTML.processOperator(PDF2XHTML.java:237)
> at
> org.apache.pdfbox.contentstream.PDFStreamEngine.processStreamOperators(PDFStreamEngine.java:514)
> at
> org.apache.pdfbox.contentstream.PDFStreamEngine.processStream(PDFStreamEngine.java:492)
> at
> org.apache.pdfbox.contentstream.PDFStreamEngine.processPage(PDFStreamEngine.java:155)
> at
> org.apache.pdfbox.text.LegacyPDFStreamEngine.processPage(LegacyPDFStreamEngine.java:144)
> at
> org.apache.pdfbox.text.PDFTextStripper.processPage(PDFTextStripper.java:394)
> at org.apache.tika.parser.pdf.PDF2XHTML.processPage(PDF2XHTML.java:250) at
> org.apache.tika.parser.pdf.AbstractPDF2XHTML.processPages(AbstractPDF2XHTML.java:985)
> at
> org.apache.pdfbox.text.PDFTextStripper.writeText(PDFTextStripper.java:269) at
> org.apache.tika.parser.pdf.PDF2XHTML.process(PDF2XHTML.java:153) at
> org.apache.tika.parser.pdf.PDFParser.parse(PDFParser.java:177) at
> com.maarcus.extractor.tika.PdfExtTest.runtestUsingTikaParser(PdfExtTest.java:41)
> at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) at
> sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
> at
> sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
> at java.lang.reflect.Method.invoke(Method.java:498) at
> org.junit.runners.model.FrameworkMethod$1.runReflectiveCall(FrameworkMethod.java:50)
> at
> org.junit.internal.runners.model.ReflectiveCallable.run(ReflectiveCallable.java:12)
> at
> org.junit.runners.model.FrameworkMethod.invokeExplosively(FrameworkMethod.java:47)
> at
> org.junit.internal.runners.statements.InvokeMethod.evaluate(InvokeMethod.java:17)
> at org.junit.runners.ParentRunner.runLeaf(ParentRunner.java:325) at
> org.junit.runners.BlockJUnit4ClassRunner.runChild(BlockJUnit4ClassRunner.java:78)
> at
> org.junit.runners.BlockJUnit4ClassRunner.runChild(BlockJUnit4ClassRunner.java:57)
> at org.junit.runners.ParentRunner$3.run(ParentRunner.java:290) at
> org.junit.runners.ParentRunner$1.schedule(ParentRunner.java:71) at
> org.junit.runners.ParentRunner.runChildren(ParentRunner.java:288) at
> org.junit.runners.ParentRunner.access$000(ParentRunner.java:58) at
> org.junit.runners.ParentRunner$2.evaluate(ParentRunner.java:268) at
> org.junit.runners.ParentRunner.run(ParentRunner.java:363) at
> org.mockito.internal.runners.DefaultInternalRunner$1.run(DefaultInternalRunner.java:79)
> at
> org.mockito.internal.runners.DefaultInternalRunner.run(DefaultInternalRunner.java:85)
> at org.mockito.internal.runners.StrictRunner.run(StrictRunner.java:39) at
> org.mockito.junit.MockitoJUnitRunner.run(MockitoJUnitRunner.java:163) at
> org.junit.runner.JUnitCore.run(JUnitCore.java:137) at
> com.intellij.junit4.JUnit4IdeaTestRunner.startRunnerWithArgs(JUnit4IdeaTestRunner.java:69)
> at
> com.intellij.rt.junit.IdeaTestRunner$Repeater.startRunnerWithArgs(IdeaTestRunner.java:33)
> at
> com.intellij.rt.junit.JUnitStarter.prepareStreamsAndStart(JUnitStarter.java:235)
> at com.intellij.rt.junit.JUnitStarter.main(JUnitStarter.java:54)
> {code}
>
--
This message was sent by Atlassian Jira
(v8.3.4#803005)