saurabh saxena created TIKA-3536:
------------------------------------

             Summary: Text extraction from PDF never finishes
                 Key: TIKA-3536
                 URL: https://issues.apache.org/jira/browse/TIKA-3536
             Project: Tika
          Issue Type: Bug
    Affects Versions: 1.27
         Environment: Apache Tika version 1.24 

PDFBOX version 2.0.24
            Reporter: saurabh saxena


Text extraction from a PDF using Apache Tika version 1.27 never finishes. Looking at the 
code and the thread dump, it looks like there is an infinite recursion loop.

Link to the PDF file: [https://arxiv.org/pdf/1707.00393]

The PDF file has been attached as well.

When parsing the file directly with PDFBox, the text is extracted successfully. 
{code:java}
/**
 * Baseline check: extracts the text of the sample PDF with PDFBox alone
 * (no Tika involved) and prints it to standard output.
 *
 * Both the input stream and the loaded document are managed by the
 * try-with-resources statement, so the document is closed even when text
 * extraction throws. Resources are released in reverse order of
 * declaration: the document first, then the stream.
 */
public void runtestUsingPdfBox() {
     try (FileInputStream fileInputStream = new FileInputStream("1707.00393-arxiv.pdf");
          PDDocument document = PDDocument.load(fileInputStream)) {
         PDFTextStripper pdfStripper = new PDFTextStripper();
         final String text = pdfStripper.getText(document);
         System.out.println(text);
     } catch (Exception e) {
         // Reproduction snippet: any failure is simply dumped to stderr.
         e.printStackTrace();
     }
}
{code}
When the same file is parsed using the Tika parser, it never finishes.
{code:java}
/**
 * Reproduction case: feeds the sample PDF to Tika's PDFParser, then prints
 * the extracted body text followed by every metadata entry as
 * "name : value". According to the report, the parse call does not return
 * for this file.
 */
public void runtestUsingTikaParser() {
     try (FileInputStream pdfStream = new FileInputStream("1707.00393-arxiv.pdf")) {
         final BodyContentHandler bodyHandler = new BodyContentHandler();
         final Metadata meta = new Metadata();
         final ParseContext context = new ParseContext();

         // Run the PDF parser directly rather than going through auto-detection.
         final PDFParser parser = new PDFParser();
         parser.parse(pdfStream, bodyHandler, meta, context);

         // Extracted document text.
         final String extracted = bodyHandler.toString();
         System.out.println("Contents of the PDF :" + extracted);

         // Metadata entries, one per line.
         System.out.println("Metadata of the PDF:");
         final String[] keys = meta.names();
         for (int i = 0; i < keys.length; i++) {
             System.out.println(keys[i] + " : " + meta.get(keys[i]));
         }
     } catch (Exception e) {
         e.printStackTrace();
     }
}
{code}
 

Thread stack:
{code:java}
"main" #1 prio=5 os_prio=31 tid=0x00007f8c5b809000 nid=0xe03 runnable 
[0x000070000303f000]   java.lang.Thread.State: RUNNABLE at 
sun.misc.FDBigInteger.multByPow52(FDBigInteger.java:727) at 
sun.misc.FloatingDecimal$ASCIIToBinaryBuffer.floatValue(FloatingDecimal.java:1560)
 at sun.misc.FloatingDecimal.parseFloat(FloatingDecimal.java:122) at 
java.lang.Float.parseFloat(Float.java:451) at 
java.math.BigDecimal.floatValue(BigDecimal.java:3233) at 
org.apache.pdfbox.cos.COSFloat.checkMinMaxValues(COSFloat.java:94) at 
org.apache.pdfbox.cos.COSFloat.<init>(COSFloat.java:60) at 
org.apache.pdfbox.cos.COSNumber.get(COSNumber.java:101) at 
org.apache.pdfbox.pdfparser.PDFStreamParser.parseNextToken(PDFStreamParser.java:240)
 at 
org.apache.pdfbox.contentstream.PDFStreamEngine.processStreamOperators(PDFStreamEngine.java:521)
 at 
org.apache.pdfbox.contentstream.PDFStreamEngine.processStream(PDFStreamEngine.java:492)
 at 
org.apache.pdfbox.contentstream.PDFStreamEngine.showForm(PDFStreamEngine.java:186)
 at 
org.apache.pdfbox.contentstream.operator.DrawObject.process(DrawObject.java:79) 
at 
org.apache.pdfbox.contentstream.PDFStreamEngine.processOperator(PDFStreamEngine.java:933)
 at org.apache.tika.parser.pdf.PDF2XHTML.processOperator(PDF2XHTML.java:240) at 
org.apache.pdfbox.contentstream.PDFStreamEngine.processStreamOperators(PDFStreamEngine.java:514)
 at 
org.apache.pdfbox.contentstream.PDFStreamEngine.processStream(PDFStreamEngine.java:492)
 at 
org.apache.pdfbox.contentstream.PDFStreamEngine.showForm(PDFStreamEngine.java:186)
 at 
org.apache.pdfbox.contentstream.operator.DrawObject.process(DrawObject.java:79) 
at 
org.apache.pdfbox.contentstream.PDFStreamEngine.processOperator(PDFStreamEngine.java:933)
 at org.apache.tika.parser.pdf.PDF2XHTML.processOperator(PDF2XHTML.java:240) at 
org.apache.pdfbox.contentstream.PDFStreamEngine.processStreamOperators(PDFStreamEngine.java:514)
 at 
org.apache.pdfbox.contentstream.PDFStreamEngine.processStream(PDFStreamEngine.java:492)
 at 
org.apache.pdfbox.contentstream.PDFStreamEngine.showForm(PDFStreamEngine.java:186)
 at org.apache.tika.parser.pdf.PDF2XHTML.processOperator(PDF2XHTML.java:237) at 
org.apache.pdfbox.contentstream.PDFStreamEngine.processStreamOperators(PDFStreamEngine.java:514)
 at 
org.apache.pdfbox.contentstream.PDFStreamEngine.processStream(PDFStreamEngine.java:492)
 at 
org.apache.pdfbox.contentstream.PDFStreamEngine.showForm(PDFStreamEngine.java:186)
 at 
org.apache.pdfbox.contentstream.operator.DrawObject.process(DrawObject.java:79) 
at 
org.apache.pdfbox.contentstream.PDFStreamEngine.processOperator(PDFStreamEngine.java:933)
 at org.apache.tika.parser.pdf.PDF2XHTML.processOperator(PDF2XHTML.java:240) at 
org.apache.pdfbox.contentstream.PDFStreamEngine.processStreamOperators(PDFStreamEngine.java:514)
 at 
org.apache.pdfbox.contentstream.PDFStreamEngine.processStream(PDFStreamEngine.java:492)
 at 
org.apache.pdfbox.contentstream.PDFStreamEngine.showForm(PDFStreamEngine.java:186)
 at 
org.apache.pdfbox.contentstream.operator.DrawObject.process(DrawObject.java:79) 
at 
org.apache.pdfbox.contentstream.PDFStreamEngine.processOperator(PDFStreamEngine.java:933)
 at org.apache.tika.parser.pdf.PDF2XHTML.processOperator(PDF2XHTML.java:240) at 
org.apache.pdfbox.contentstream.PDFStreamEngine.processStreamOperators(PDFStreamEngine.java:514)
 at 
org.apache.pdfbox.contentstream.PDFStreamEngine.processStream(PDFStreamEngine.java:492)
 at 
org.apache.pdfbox.contentstream.PDFStreamEngine.showForm(PDFStreamEngine.java:186)
 at 
org.apache.pdfbox.contentstream.operator.DrawObject.process(DrawObject.java:79) 
at 
org.apache.pdfbox.contentstream.PDFStreamEngine.processOperator(PDFStreamEngine.java:933)
 at org.apache.tika.parser.pdf.PDF2XHTML.processOperator(PDF2XHTML.java:240) at 
org.apache.pdfbox.contentstream.PDFStreamEngine.processStreamOperators(PDFStreamEngine.java:514)
 at 
org.apache.pdfbox.contentstream.PDFStreamEngine.processStream(PDFStreamEngine.java:492)
 at 
org.apache.pdfbox.contentstream.PDFStreamEngine.showForm(PDFStreamEngine.java:186)
 at org.apache.tika.parser.pdf.PDF2XHTML.processOperator(PDF2XHTML.java:237) at 
org.apache.pdfbox.contentstream.PDFStreamEngine.processStreamOperators(PDFStreamEngine.java:514)
 at 
org.apache.pdfbox.contentstream.PDFStreamEngine.processStream(PDFStreamEngine.java:492)
 at 
org.apache.pdfbox.contentstream.PDFStreamEngine.showForm(PDFStreamEngine.java:186)
 at org.apache.tika.parser.pdf.PDF2XHTML.processOperator(PDF2XHTML.java:237) at 
org.apache.pdfbox.contentstream.PDFStreamEngine.processStreamOperators(PDFStreamEngine.java:514)
 at 
org.apache.pdfbox.contentstream.PDFStreamEngine.processStream(PDFStreamEngine.java:492)
 at 
org.apache.pdfbox.contentstream.PDFStreamEngine.showForm(PDFStreamEngine.java:186)
 at org.apache.tika.parser.pdf.PDF2XHTML.processOperator(PDF2XHTML.java:237) at 
org.apache.pdfbox.contentstream.PDFStreamEngine.processStreamOperators(PDFStreamEngine.java:514)
 at 
org.apache.pdfbox.contentstream.PDFStreamEngine.processStream(PDFStreamEngine.java:492)
 at 
org.apache.pdfbox.contentstream.PDFStreamEngine.showForm(PDFStreamEngine.java:186)
 at org.apache.tika.parser.pdf.PDF2XHTML.processOperator(PDF2XHTML.java:237) at 
org.apache.pdfbox.contentstream.PDFStreamEngine.processStreamOperators(PDFStreamEngine.java:514)
 at 
org.apache.pdfbox.contentstream.PDFStreamEngine.processStream(PDFStreamEngine.java:492)
 at 
org.apache.pdfbox.contentstream.PDFStreamEngine.showForm(PDFStreamEngine.java:186)
 at org.apache.tika.parser.pdf.PDF2XHTML.processOperator(PDF2XHTML.java:237) at 
org.apache.pdfbox.contentstream.PDFStreamEngine.processStreamOperators(PDFStreamEngine.java:514)
 at 
org.apache.pdfbox.contentstream.PDFStreamEngine.processStream(PDFStreamEngine.java:492)
 at 
org.apache.pdfbox.contentstream.PDFStreamEngine.showForm(PDFStreamEngine.java:186)
 at org.apache.tika.parser.pdf.PDF2XHTML.processOperator(PDF2XHTML.java:237) at 
org.apache.pdfbox.contentstream.PDFStreamEngine.processStreamOperators(PDFStreamEngine.java:514)
 at 
org.apache.pdfbox.contentstream.PDFStreamEngine.processStream(PDFStreamEngine.java:492)
 at 
org.apache.pdfbox.contentstream.PDFStreamEngine.showForm(PDFStreamEngine.java:186)
 at org.apache.tika.parser.pdf.PDF2XHTML.processOperator(PDF2XHTML.java:237) at 
org.apache.pdfbox.contentstream.PDFStreamEngine.processStreamOperators(PDFStreamEngine.java:514)
 at 
org.apache.pdfbox.contentstream.PDFStreamEngine.processStream(PDFStreamEngine.java:492)
 at 
org.apache.pdfbox.contentstream.PDFStreamEngine.showForm(PDFStreamEngine.java:186)
 at org.apache.tika.parser.pdf.PDF2XHTML.processOperator(PDF2XHTML.java:237) at 
org.apache.pdfbox.contentstream.PDFStreamEngine.processStreamOperators(PDFStreamEngine.java:514)
 at 
org.apache.pdfbox.contentstream.PDFStreamEngine.processStream(PDFStreamEngine.java:492)
 at 
org.apache.pdfbox.contentstream.PDFStreamEngine.showForm(PDFStreamEngine.java:186)
 at org.apache.tika.parser.pdf.PDF2XHTML.processOperator(PDF2XHTML.java:237) at 
org.apache.pdfbox.contentstream.PDFStreamEngine.processStreamOperators(PDFStreamEngine.java:514)
 at 
org.apache.pdfbox.contentstream.PDFStreamEngine.processStream(PDFStreamEngine.java:492)
 at 
org.apache.pdfbox.contentstream.PDFStreamEngine.showForm(PDFStreamEngine.java:186)
 at org.apache.tika.parser.pdf.PDF2XHTML.processOperator(PDF2XHTML.java:237) at 
org.apache.pdfbox.contentstream.PDFStreamEngine.processStreamOperators(PDFStreamEngine.java:514)
 at 
org.apache.pdfbox.contentstream.PDFStreamEngine.processStream(PDFStreamEngine.java:492)
 at 
org.apache.pdfbox.contentstream.PDFStreamEngine.showForm(PDFStreamEngine.java:186)
 at org.apache.tika.parser.pdf.PDF2XHTML.processOperator(PDF2XHTML.java:237) at 
org.apache.pdfbox.contentstream.PDFStreamEngine.processStreamOperators(PDFStreamEngine.java:514)
 at 
org.apache.pdfbox.contentstream.PDFStreamEngine.processStream(PDFStreamEngine.java:492)
 at 
org.apache.pdfbox.contentstream.PDFStreamEngine.showForm(PDFStreamEngine.java:186)
 at org.apache.tika.parser.pdf.PDF2XHTML.processOperator(PDF2XHTML.java:237) at 
org.apache.pdfbox.contentstream.PDFStreamEngine.processStreamOperators(PDFStreamEngine.java:514)
 at 
org.apache.pdfbox.contentstream.PDFStreamEngine.processStream(PDFStreamEngine.java:492)
 at 
org.apache.pdfbox.contentstream.PDFStreamEngine.showForm(PDFStreamEngine.java:186)
 at org.apache.tika.parser.pdf.PDF2XHTML.processOperator(PDF2XHTML.java:237) at 
org.apache.pdfbox.contentstream.PDFStreamEngine.processStreamOperators(PDFStreamEngine.java:514)
 at 
org.apache.pdfbox.contentstream.PDFStreamEngine.processStream(PDFStreamEngine.java:492)
 at 
org.apache.pdfbox.contentstream.PDFStreamEngine.showForm(PDFStreamEngine.java:186)
 at org.apache.tika.parser.pdf.PDF2XHTML.processOperator(PDF2XHTML.java:237) at 
org.apache.pdfbox.contentstream.PDFStreamEngine.processStreamOperators(PDFStreamEngine.java:514)
 at 
org.apache.pdfbox.contentstream.PDFStreamEngine.processStream(PDFStreamEngine.java:492)
 at 
org.apache.pdfbox.contentstream.PDFStreamEngine.showForm(PDFStreamEngine.java:186)
 at org.apache.tika.parser.pdf.PDF2XHTML.processOperator(PDF2XHTML.java:237) at 
org.apache.pdfbox.contentstream.PDFStreamEngine.processStreamOperators(PDFStreamEngine.java:514)
 at 
org.apache.pdfbox.contentstream.PDFStreamEngine.processStream(PDFStreamEngine.java:492)
 at 
org.apache.pdfbox.contentstream.PDFStreamEngine.showForm(PDFStreamEngine.java:186)
 at org.apache.tika.parser.pdf.PDF2XHTML.processOperator(PDF2XHTML.java:237) at 
org.apache.pdfbox.contentstream.PDFStreamEngine.processStreamOperators(PDFStreamEngine.java:514)
 at 
org.apache.pdfbox.contentstream.PDFStreamEngine.processStream(PDFStreamEngine.java:492)
 at 
org.apache.pdfbox.contentstream.PDFStreamEngine.showForm(PDFStreamEngine.java:186)
 at org.apache.tika.parser.pdf.PDF2XHTML.processOperator(PDF2XHTML.java:237) at 
org.apache.pdfbox.contentstream.PDFStreamEngine.processStreamOperators(PDFStreamEngine.java:514)
 at 
org.apache.pdfbox.contentstream.PDFStreamEngine.processStream(PDFStreamEngine.java:492)
 at 
org.apache.pdfbox.contentstream.PDFStreamEngine.showForm(PDFStreamEngine.java:186)
 at org.apache.tika.parser.pdf.PDF2XHTML.processOperator(PDF2XHTML.java:237) at 
org.apache.pdfbox.contentstream.PDFStreamEngine.processStreamOperators(PDFStreamEngine.java:514)
 at 
org.apache.pdfbox.contentstream.PDFStreamEngine.processStream(PDFStreamEngine.java:492)
 at 
org.apache.pdfbox.contentstream.PDFStreamEngine.showForm(PDFStreamEngine.java:186)
 at org.apache.tika.parser.pdf.PDF2XHTML.processOperator(PDF2XHTML.java:237) at 
org.apache.pdfbox.contentstream.PDFStreamEngine.processStreamOperators(PDFStreamEngine.java:514)
 at 
org.apache.pdfbox.contentstream.PDFStreamEngine.processStream(PDFStreamEngine.java:492)
 at 
org.apache.pdfbox.contentstream.PDFStreamEngine.showForm(PDFStreamEngine.java:186)
 at org.apache.tika.parser.pdf.PDF2XHTML.processOperator(PDF2XHTML.java:237) at 
org.apache.pdfbox.contentstream.PDFStreamEngine.processStreamOperators(PDFStreamEngine.java:514)
 at 
org.apache.pdfbox.contentstream.PDFStreamEngine.processStream(PDFStreamEngine.java:492)
 at 
org.apache.pdfbox.contentstream.PDFStreamEngine.showForm(PDFStreamEngine.java:186)
 at org.apache.tika.parser.pdf.PDF2XHTML.processOperator(PDF2XHTML.java:237) at 
org.apache.pdfbox.contentstream.PDFStreamEngine.processStreamOperators(PDFStreamEngine.java:514)
 at 
org.apache.pdfbox.contentstream.PDFStreamEngine.processStream(PDFStreamEngine.java:492)
 at 
org.apache.pdfbox.contentstream.PDFStreamEngine.showForm(PDFStreamEngine.java:186)
 at org.apache.tika.parser.pdf.PDF2XHTML.processOperator(PDF2XHTML.java:237) at 
org.apache.pdfbox.contentstream.PDFStreamEngine.processStreamOperators(PDFStreamEngine.java:514)
 at 
org.apache.pdfbox.contentstream.PDFStreamEngine.processStream(PDFStreamEngine.java:492)
 at 
org.apache.pdfbox.contentstream.PDFStreamEngine.processPage(PDFStreamEngine.java:155)
 at 
org.apache.pdfbox.text.LegacyPDFStreamEngine.processPage(LegacyPDFStreamEngine.java:144)
 at 
org.apache.pdfbox.text.PDFTextStripper.processPage(PDFTextStripper.java:394) at 
org.apache.tika.parser.pdf.PDF2XHTML.processPage(PDF2XHTML.java:250) at 
org.apache.tika.parser.pdf.AbstractPDF2XHTML.processPages(AbstractPDF2XHTML.java:985)
 at org.apache.pdfbox.text.PDFTextStripper.writeText(PDFTextStripper.java:269) 
at org.apache.tika.parser.pdf.PDF2XHTML.process(PDF2XHTML.java:153) at 
org.apache.tika.parser.pdf.PDFParser.parse(PDFParser.java:177) at 
com.maarcus.extractor.tika.PdfExtTest.runtestUsingTikaParser(PdfExtTest.java:41)
 at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) at 
sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) 
at 
sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
 at java.lang.reflect.Method.invoke(Method.java:498) at 
org.junit.runners.model.FrameworkMethod$1.runReflectiveCall(FrameworkMethod.java:50)
 at 
org.junit.internal.runners.model.ReflectiveCallable.run(ReflectiveCallable.java:12)
 at 
org.junit.runners.model.FrameworkMethod.invokeExplosively(FrameworkMethod.java:47)
 at 
org.junit.internal.runners.statements.InvokeMethod.evaluate(InvokeMethod.java:17)
 at org.junit.runners.ParentRunner.runLeaf(ParentRunner.java:325) at 
org.junit.runners.BlockJUnit4ClassRunner.runChild(BlockJUnit4ClassRunner.java:78)
 at 
org.junit.runners.BlockJUnit4ClassRunner.runChild(BlockJUnit4ClassRunner.java:57)
 at org.junit.runners.ParentRunner$3.run(ParentRunner.java:290) at 
org.junit.runners.ParentRunner$1.schedule(ParentRunner.java:71) at 
org.junit.runners.ParentRunner.runChildren(ParentRunner.java:288) at 
org.junit.runners.ParentRunner.access$000(ParentRunner.java:58) at 
org.junit.runners.ParentRunner$2.evaluate(ParentRunner.java:268) at 
org.junit.runners.ParentRunner.run(ParentRunner.java:363) at 
org.mockito.internal.runners.DefaultInternalRunner$1.run(DefaultInternalRunner.java:79)
 at 
org.mockito.internal.runners.DefaultInternalRunner.run(DefaultInternalRunner.java:85)
 at org.mockito.internal.runners.StrictRunner.run(StrictRunner.java:39) at 
org.mockito.junit.MockitoJUnitRunner.run(MockitoJUnitRunner.java:163) at 
org.junit.runner.JUnitCore.run(JUnitCore.java:137) at 
com.intellij.junit4.JUnit4IdeaTestRunner.startRunnerWithArgs(JUnit4IdeaTestRunner.java:69)
 at 
com.intellij.rt.junit.IdeaTestRunner$Repeater.startRunnerWithArgs(IdeaTestRunner.java:33)
 at 
com.intellij.rt.junit.JUnitStarter.prepareStreamsAndStart(JUnitStarter.java:235)
 at com.intellij.rt.junit.JUnitStarter.main(JUnitStarter.java:54)
{code}
 



--
This message was sent by Atlassian Jira
(v8.3.4#803005)

Reply via email to