[ 
https://issues.apache.org/jira/browse/TIKA-3536?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

saurabh saxena closed TIKA-3536.
--------------------------------
    Resolution: Not A Problem

> Text extraction from PDF never finishes
> ---------------------------------------
>
>                 Key: TIKA-3536
>                 URL: https://issues.apache.org/jira/browse/TIKA-3536
>             Project: Tika
>          Issue Type: Bug
>    Affects Versions: 1.27
>         Environment: Apache Tika version 1.24 
> PDFBOX version 2.0.24
>            Reporter: saurabh saxena
>            Priority: Major
>         Attachments: PdfExtTest.java
>
>
> Text extraction from PDF using Apache Tika version 1.27 never finishes. 
> Looking at the code and the thread dump, it looks like there is an infinite recursion loop.
> Link to pdf file : [https://arxiv.org/pdf/1707.00393]
> PDF File has been attached as well.
> When parsing directly via PDFBox, the text is extracted successfully. 
> {code:java}
> public void runtestUsingPdfBox() {
>      try(FileInputStream fileInputStream = new 
> FileInputStream("1707.00393-arxiv.pdf")) {
>          final PDDocument document = PDDocument.load(fileInputStream);
>          PDFTextStripper pdfStripper = new PDFTextStripper();
>          final String text = pdfStripper.getText(document);
>          System.out.println(text);
>          //Closing the document
>          document.close();
>      } catch (Exception e) {
>          e.printStackTrace();
>      }
> }
> {code}
> When tried using the Tika parser, it never finishes.
> {code:java}
> public void runtestUsingTikaParser() {
>      try(FileInputStream fileInputStream = new 
> FileInputStream("1707.00393-arxiv.pdf")) {
>          BodyContentHandler handler = new BodyContentHandler();
>          Metadata metadata = new Metadata();
>          ParseContext pcontext = new ParseContext();
>          //parsing the document using PDF parser
>          PDFParser pdfparser = new PDFParser();
>          pdfparser.parse(fileInputStream, handler, metadata,pcontext);
>          //getting the content of the document
>          System.out.println("Contents of the PDF :" + handler.toString());
>          //getting metadata of the document
>          System.out.println("Metadata of the PDF:");
>          String[] metadataNames = metadata.names();
>          for(String name : metadataNames) {
>              System.out.println(name+ " : " + metadata.get(name));
>          }
>      } catch (Exception e) {
>          e.printStackTrace();
>      }
> }
> {code}
>  
> Thread stack:
> {code:java}
> "main" #1 prio=5 os_prio=31 tid=0x00007f8c5b809000 nid=0xe03 runnable 
> [0x000070000303f000]
>    java.lang.Thread.State: RUNNABLE at 
> sun.misc.FDBigInteger.multByPow52(FDBigInteger.java:727) at 
> sun.misc.FloatingDecimal$ASCIIToBinaryBuffer.floatValue(FloatingDecimal.java:1560)
>  at sun.misc.FloatingDecimal.parseFloat(FloatingDecimal.java:122) at 
> java.lang.Float.parseFloat(Float.java:451) at 
> java.math.BigDecimal.floatValue(BigDecimal.java:3233) at 
> org.apache.pdfbox.cos.COSFloat.checkMinMaxValues(COSFloat.java:94) at 
> org.apache.pdfbox.cos.COSFloat.<init>(COSFloat.java:60) at 
> org.apache.pdfbox.cos.COSNumber.get(COSNumber.java:101) at 
> org.apache.pdfbox.pdfparser.PDFStreamParser.parseNextToken(PDFStreamParser.java:240)
>  at 
> org.apache.pdfbox.contentstream.PDFStreamEngine.processStreamOperators(PDFStreamEngine.java:521)
>  at 
> org.apache.pdfbox.contentstream.PDFStreamEngine.processStream(PDFStreamEngine.java:492)
>  at 
> org.apache.pdfbox.contentstream.PDFStreamEngine.showForm(PDFStreamEngine.java:186)
>  at 
> org.apache.pdfbox.contentstream.operator.DrawObject.process(DrawObject.java:79)
>  at 
> org.apache.pdfbox.contentstream.PDFStreamEngine.processOperator(PDFStreamEngine.java:933)
>  at org.apache.tika.parser.pdf.PDF2XHTML.processOperator(PDF2XHTML.java:240) 
> at 
> org.apache.pdfbox.contentstream.PDFStreamEngine.processStreamOperators(PDFStreamEngine.java:514)
>  at 
> org.apache.pdfbox.contentstream.PDFStreamEngine.processStream(PDFStreamEngine.java:492)
>  at 
> org.apache.pdfbox.contentstream.PDFStreamEngine.showForm(PDFStreamEngine.java:186)
>  at 
> org.apache.pdfbox.contentstream.operator.DrawObject.process(DrawObject.java:79)
>  at 
> org.apache.pdfbox.contentstream.PDFStreamEngine.processOperator(PDFStreamEngine.java:933)
>  at org.apache.tika.parser.pdf.PDF2XHTML.processOperator(PDF2XHTML.java:240) 
> at 
> org.apache.pdfbox.contentstream.PDFStreamEngine.processStreamOperators(PDFStreamEngine.java:514)
>  at 
> org.apache.pdfbox.contentstream.PDFStreamEngine.processStream(PDFStreamEngine.java:492)
>  at 
> org.apache.pdfbox.contentstream.PDFStreamEngine.showForm(PDFStreamEngine.java:186)
>  at org.apache.tika.parser.pdf.PDF2XHTML.processOperator(PDF2XHTML.java:237) 
> at 
> org.apache.pdfbox.contentstream.PDFStreamEngine.processStreamOperators(PDFStreamEngine.java:514)
>  at 
> org.apache.pdfbox.contentstream.PDFStreamEngine.processStream(PDFStreamEngine.java:492)
>  at 
> org.apache.pdfbox.contentstream.PDFStreamEngine.showForm(PDFStreamEngine.java:186)
>  at 
> org.apache.pdfbox.contentstream.operator.DrawObject.process(DrawObject.java:79)
>  at 
> org.apache.pdfbox.contentstream.PDFStreamEngine.processOperator(PDFStreamEngine.java:933)
>  at org.apache.tika.parser.pdf.PDF2XHTML.processOperator(PDF2XHTML.java:240) 
> at 
> org.apache.pdfbox.contentstream.PDFStreamEngine.processStreamOperators(PDFStreamEngine.java:514)
>  at 
> org.apache.pdfbox.contentstream.PDFStreamEngine.processStream(PDFStreamEngine.java:492)
>  at 
> org.apache.pdfbox.contentstream.PDFStreamEngine.showForm(PDFStreamEngine.java:186)
>  at 
> org.apache.pdfbox.contentstream.operator.DrawObject.process(DrawObject.java:79)
>  at 
> org.apache.pdfbox.contentstream.PDFStreamEngine.processOperator(PDFStreamEngine.java:933)
>  at org.apache.tika.parser.pdf.PDF2XHTML.processOperator(PDF2XHTML.java:240) 
> at 
> org.apache.pdfbox.contentstream.PDFStreamEngine.processStreamOperators(PDFStreamEngine.java:514)
>  at 
> org.apache.pdfbox.contentstream.PDFStreamEngine.processStream(PDFStreamEngine.java:492)
>  at 
> org.apache.pdfbox.contentstream.PDFStreamEngine.showForm(PDFStreamEngine.java:186)
>  at 
> org.apache.pdfbox.contentstream.operator.DrawObject.process(DrawObject.java:79)
>  at 
> org.apache.pdfbox.contentstream.PDFStreamEngine.processOperator(PDFStreamEngine.java:933)
>  at org.apache.tika.parser.pdf.PDF2XHTML.processOperator(PDF2XHTML.java:240) 
> at 
> org.apache.pdfbox.contentstream.PDFStreamEngine.processStreamOperators(PDFStreamEngine.java:514)
>  at 
> org.apache.pdfbox.contentstream.PDFStreamEngine.processStream(PDFStreamEngine.java:492)
>  at 
> org.apache.pdfbox.contentstream.PDFStreamEngine.showForm(PDFStreamEngine.java:186)
>  at org.apache.tika.parser.pdf.PDF2XHTML.processOperator(PDF2XHTML.java:237) 
> at 
> org.apache.pdfbox.contentstream.PDFStreamEngine.processStreamOperators(PDFStreamEngine.java:514)
>  at 
> org.apache.pdfbox.contentstream.PDFStreamEngine.processStream(PDFStreamEngine.java:492)
>  at 
> org.apache.pdfbox.contentstream.PDFStreamEngine.showForm(PDFStreamEngine.java:186)
>  at org.apache.tika.parser.pdf.PDF2XHTML.processOperator(PDF2XHTML.java:237) 
> at 
> org.apache.pdfbox.contentstream.PDFStreamEngine.processStreamOperators(PDFStreamEngine.java:514)
>  at 
> org.apache.pdfbox.contentstream.PDFStreamEngine.processStream(PDFStreamEngine.java:492)
>  at 
> org.apache.pdfbox.contentstream.PDFStreamEngine.showForm(PDFStreamEngine.java:186)
>  at org.apache.tika.parser.pdf.PDF2XHTML.processOperator(PDF2XHTML.java:237) 
> at 
> org.apache.pdfbox.contentstream.PDFStreamEngine.processStreamOperators(PDFStreamEngine.java:514)
>  at 
> org.apache.pdfbox.contentstream.PDFStreamEngine.processStream(PDFStreamEngine.java:492)
>  at 
> org.apache.pdfbox.contentstream.PDFStreamEngine.showForm(PDFStreamEngine.java:186)
>  at org.apache.tika.parser.pdf.PDF2XHTML.processOperator(PDF2XHTML.java:237) 
> at 
> org.apache.pdfbox.contentstream.PDFStreamEngine.processStreamOperators(PDFStreamEngine.java:514)
>  at 
> org.apache.pdfbox.contentstream.PDFStreamEngine.processStream(PDFStreamEngine.java:492)
>  at 
> org.apache.pdfbox.contentstream.PDFStreamEngine.showForm(PDFStreamEngine.java:186)
>  at org.apache.tika.parser.pdf.PDF2XHTML.processOperator(PDF2XHTML.java:237) 
> at 
> org.apache.pdfbox.contentstream.PDFStreamEngine.processStreamOperators(PDFStreamEngine.java:514)
>  at 
> org.apache.pdfbox.contentstream.PDFStreamEngine.processStream(PDFStreamEngine.java:492)
>  at 
> org.apache.pdfbox.contentstream.PDFStreamEngine.showForm(PDFStreamEngine.java:186)
>  at org.apache.tika.parser.pdf.PDF2XHTML.processOperator(PDF2XHTML.java:237) 
> at 
> org.apache.pdfbox.contentstream.PDFStreamEngine.processStreamOperators(PDFStreamEngine.java:514)
>  at 
> org.apache.pdfbox.contentstream.PDFStreamEngine.processStream(PDFStreamEngine.java:492)
>  at 
> org.apache.pdfbox.contentstream.PDFStreamEngine.showForm(PDFStreamEngine.java:186)
>  at org.apache.tika.parser.pdf.PDF2XHTML.processOperator(PDF2XHTML.java:237) 
> at 
> org.apache.pdfbox.contentstream.PDFStreamEngine.processStreamOperators(PDFStreamEngine.java:514)
>  at 
> org.apache.pdfbox.contentstream.PDFStreamEngine.processStream(PDFStreamEngine.java:492)
>  at 
> org.apache.pdfbox.contentstream.PDFStreamEngine.showForm(PDFStreamEngine.java:186)
>  at org.apache.tika.parser.pdf.PDF2XHTML.processOperator(PDF2XHTML.java:237) 
> at 
> org.apache.pdfbox.contentstream.PDFStreamEngine.processStreamOperators(PDFStreamEngine.java:514)
>  at 
> org.apache.pdfbox.contentstream.PDFStreamEngine.processStream(PDFStreamEngine.java:492)
>  at 
> org.apache.pdfbox.contentstream.PDFStreamEngine.showForm(PDFStreamEngine.java:186)
>  at org.apache.tika.parser.pdf.PDF2XHTML.processOperator(PDF2XHTML.java:237) 
> at 
> org.apache.pdfbox.contentstream.PDFStreamEngine.processStreamOperators(PDFStreamEngine.java:514)
>  at 
> org.apache.pdfbox.contentstream.PDFStreamEngine.processStream(PDFStreamEngine.java:492)
>  at 
> org.apache.pdfbox.contentstream.PDFStreamEngine.showForm(PDFStreamEngine.java:186)
>  at org.apache.tika.parser.pdf.PDF2XHTML.processOperator(PDF2XHTML.java:237) 
> at 
> org.apache.pdfbox.contentstream.PDFStreamEngine.processStreamOperators(PDFStreamEngine.java:514)
>  at 
> org.apache.pdfbox.contentstream.PDFStreamEngine.processStream(PDFStreamEngine.java:492)
>  at 
> org.apache.pdfbox.contentstream.PDFStreamEngine.showForm(PDFStreamEngine.java:186)
>  at org.apache.tika.parser.pdf.PDF2XHTML.processOperator(PDF2XHTML.java:237) 
> at 
> org.apache.pdfbox.contentstream.PDFStreamEngine.processStreamOperators(PDFStreamEngine.java:514)
>  at 
> org.apache.pdfbox.contentstream.PDFStreamEngine.processStream(PDFStreamEngine.java:492)
>  at 
> org.apache.pdfbox.contentstream.PDFStreamEngine.showForm(PDFStreamEngine.java:186)
>  at org.apache.tika.parser.pdf.PDF2XHTML.processOperator(PDF2XHTML.java:237) 
> at 
> org.apache.pdfbox.contentstream.PDFStreamEngine.processStreamOperators(PDFStreamEngine.java:514)
>  at 
> org.apache.pdfbox.contentstream.PDFStreamEngine.processStream(PDFStreamEngine.java:492)
>  at 
> org.apache.pdfbox.contentstream.PDFStreamEngine.showForm(PDFStreamEngine.java:186)
>  at org.apache.tika.parser.pdf.PDF2XHTML.processOperator(PDF2XHTML.java:237) 
> at 
> org.apache.pdfbox.contentstream.PDFStreamEngine.processStreamOperators(PDFStreamEngine.java:514)
>  at 
> org.apache.pdfbox.contentstream.PDFStreamEngine.processStream(PDFStreamEngine.java:492)
>  at 
> org.apache.pdfbox.contentstream.PDFStreamEngine.showForm(PDFStreamEngine.java:186)
>  at org.apache.tika.parser.pdf.PDF2XHTML.processOperator(PDF2XHTML.java:237) 
> at 
> org.apache.pdfbox.contentstream.PDFStreamEngine.processStreamOperators(PDFStreamEngine.java:514)
>  at 
> org.apache.pdfbox.contentstream.PDFStreamEngine.processStream(PDFStreamEngine.java:492)
>  at 
> org.apache.pdfbox.contentstream.PDFStreamEngine.showForm(PDFStreamEngine.java:186)
>  at org.apache.tika.parser.pdf.PDF2XHTML.processOperator(PDF2XHTML.java:237) 
> at 
> org.apache.pdfbox.contentstream.PDFStreamEngine.processStreamOperators(PDFStreamEngine.java:514)
>  at 
> org.apache.pdfbox.contentstream.PDFStreamEngine.processStream(PDFStreamEngine.java:492)
>  at 
> org.apache.pdfbox.contentstream.PDFStreamEngine.showForm(PDFStreamEngine.java:186)
>  at org.apache.tika.parser.pdf.PDF2XHTML.processOperator(PDF2XHTML.java:237) 
> at 
> org.apache.pdfbox.contentstream.PDFStreamEngine.processStreamOperators(PDFStreamEngine.java:514)
>  at 
> org.apache.pdfbox.contentstream.PDFStreamEngine.processStream(PDFStreamEngine.java:492)
>  at 
> org.apache.pdfbox.contentstream.PDFStreamEngine.showForm(PDFStreamEngine.java:186)
>  at org.apache.tika.parser.pdf.PDF2XHTML.processOperator(PDF2XHTML.java:237) 
> at 
> org.apache.pdfbox.contentstream.PDFStreamEngine.processStreamOperators(PDFStreamEngine.java:514)
>  at 
> org.apache.pdfbox.contentstream.PDFStreamEngine.processStream(PDFStreamEngine.java:492)
>  at 
> org.apache.pdfbox.contentstream.PDFStreamEngine.showForm(PDFStreamEngine.java:186)
>  at org.apache.tika.parser.pdf.PDF2XHTML.processOperator(PDF2XHTML.java:237) 
> at 
> org.apache.pdfbox.contentstream.PDFStreamEngine.processStreamOperators(PDFStreamEngine.java:514)
>  at 
> org.apache.pdfbox.contentstream.PDFStreamEngine.processStream(PDFStreamEngine.java:492)
>  at 
> org.apache.pdfbox.contentstream.PDFStreamEngine.showForm(PDFStreamEngine.java:186)
>  at org.apache.tika.parser.pdf.PDF2XHTML.processOperator(PDF2XHTML.java:237) 
> at 
> org.apache.pdfbox.contentstream.PDFStreamEngine.processStreamOperators(PDFStreamEngine.java:514)
>  at 
> org.apache.pdfbox.contentstream.PDFStreamEngine.processStream(PDFStreamEngine.java:492)
>  at 
> org.apache.pdfbox.contentstream.PDFStreamEngine.showForm(PDFStreamEngine.java:186)
>  at org.apache.tika.parser.pdf.PDF2XHTML.processOperator(PDF2XHTML.java:237) 
> at 
> org.apache.pdfbox.contentstream.PDFStreamEngine.processStreamOperators(PDFStreamEngine.java:514)
>  at 
> org.apache.pdfbox.contentstream.PDFStreamEngine.processStream(PDFStreamEngine.java:492)
>  at 
> org.apache.pdfbox.contentstream.PDFStreamEngine.showForm(PDFStreamEngine.java:186)
>  at org.apache.tika.parser.pdf.PDF2XHTML.processOperator(PDF2XHTML.java:237) 
> at 
> org.apache.pdfbox.contentstream.PDFStreamEngine.processStreamOperators(PDFStreamEngine.java:514)
>  at 
> org.apache.pdfbox.contentstream.PDFStreamEngine.processStream(PDFStreamEngine.java:492)
>  at 
> org.apache.pdfbox.contentstream.PDFStreamEngine.showForm(PDFStreamEngine.java:186)
>  at org.apache.tika.parser.pdf.PDF2XHTML.processOperator(PDF2XHTML.java:237) 
> at 
> org.apache.pdfbox.contentstream.PDFStreamEngine.processStreamOperators(PDFStreamEngine.java:514)
>  at 
> org.apache.pdfbox.contentstream.PDFStreamEngine.processStream(PDFStreamEngine.java:492)
>  at 
> org.apache.pdfbox.contentstream.PDFStreamEngine.processPage(PDFStreamEngine.java:155)
>  at 
> org.apache.pdfbox.text.LegacyPDFStreamEngine.processPage(LegacyPDFStreamEngine.java:144)
>  at 
> org.apache.pdfbox.text.PDFTextStripper.processPage(PDFTextStripper.java:394) 
> at org.apache.tika.parser.pdf.PDF2XHTML.processPage(PDF2XHTML.java:250) at 
> org.apache.tika.parser.pdf.AbstractPDF2XHTML.processPages(AbstractPDF2XHTML.java:985)
>  at 
> org.apache.pdfbox.text.PDFTextStripper.writeText(PDFTextStripper.java:269) at 
> org.apache.tika.parser.pdf.PDF2XHTML.process(PDF2XHTML.java:153) at 
> org.apache.tika.parser.pdf.PDFParser.parse(PDFParser.java:177) at 
> com.maarcus.extractor.tika.PdfExtTest.runtestUsingTikaParser(PdfExtTest.java:41)
>  at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) at 
> sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) 
> at 
> sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
>  at java.lang.reflect.Method.invoke(Method.java:498) at 
> org.junit.runners.model.FrameworkMethod$1.runReflectiveCall(FrameworkMethod.java:50)
>  at 
> org.junit.internal.runners.model.ReflectiveCallable.run(ReflectiveCallable.java:12)
>  at 
> org.junit.runners.model.FrameworkMethod.invokeExplosively(FrameworkMethod.java:47)
>  at 
> org.junit.internal.runners.statements.InvokeMethod.evaluate(InvokeMethod.java:17)
>  at org.junit.runners.ParentRunner.runLeaf(ParentRunner.java:325) at 
> org.junit.runners.BlockJUnit4ClassRunner.runChild(BlockJUnit4ClassRunner.java:78)
>  at 
> org.junit.runners.BlockJUnit4ClassRunner.runChild(BlockJUnit4ClassRunner.java:57)
>  at org.junit.runners.ParentRunner$3.run(ParentRunner.java:290) at 
> org.junit.runners.ParentRunner$1.schedule(ParentRunner.java:71) at 
> org.junit.runners.ParentRunner.runChildren(ParentRunner.java:288) at 
> org.junit.runners.ParentRunner.access$000(ParentRunner.java:58) at 
> org.junit.runners.ParentRunner$2.evaluate(ParentRunner.java:268) at 
> org.junit.runners.ParentRunner.run(ParentRunner.java:363) at 
> org.mockito.internal.runners.DefaultInternalRunner$1.run(DefaultInternalRunner.java:79)
>  at 
> org.mockito.internal.runners.DefaultInternalRunner.run(DefaultInternalRunner.java:85)
>  at org.mockito.internal.runners.StrictRunner.run(StrictRunner.java:39) at 
> org.mockito.junit.MockitoJUnitRunner.run(MockitoJUnitRunner.java:163) at 
> org.junit.runner.JUnitCore.run(JUnitCore.java:137) at 
> com.intellij.junit4.JUnit4IdeaTestRunner.startRunnerWithArgs(JUnit4IdeaTestRunner.java:69)
>  at 
> com.intellij.rt.junit.IdeaTestRunner$Repeater.startRunnerWithArgs(IdeaTestRunner.java:33)
>  at 
> com.intellij.rt.junit.JUnitStarter.prepareStreamsAndStart(JUnitStarter.java:235)
>  at com.intellij.rt.junit.JUnitStarter.main(JUnitStarter.java:54)
> {code}
>  



--
This message was sent by Atlassian Jira
(v8.3.4#803005)

Reply via email to