[
https://issues.apache.org/jira/browse/TIKA-3720?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17524386#comment-17524386
]
denisn edited comment on TIKA-3720 at 4/19/22 3:25 PM:
-------------------------------------------------------
I guess the problem only affects 1.x, starting with 1.23. Both 1.23 and 1.28.1
fail right after startup and do not extract anything from my PDF. My parser
extends ParserDecorator at some point:
{code:java}
private class TimeoutParser(parser: Parser) extends ParserDecorator(parser)
private class CEParser extends AutoDetectParser {
override protected def getParser(metadata: Metadata, context: ParseContext):
Parser = {
val parser = super.getParser(metadata, context)
new TimeoutParser(parser)
}
}
private val parser = new CEParser()
val contentParser = new AbstractParser {
def parse = parser.parse
}
val currentContext = new ParseContext
currentContext.set(classOf[TesseractOCRConfig], ocrConfig)
currentContext.set(classOf[PDFParserConfig], pdfConfig)
currentContext.set(classOf[Parser], contentParser){code}
2.x does not fail at startup (I just didn't wait long enough to get the
results).
was (Author: JIRAUSER288220):
I guess the problem is only in 1.x starting with 1.23. 1.23 and 1.28.1 are
failing right after startup and not extracting anything from my pdf. My parser
extends the ParserDecorator at some point:
{code:java}
private class TimeoutParser(parser: Parser) extends ParserDecorator(parser)
private class CEParser extends AutoDetectParser {
override protected def getParser(metadata: Metadata, context: ParseContext):
Parser = {
val parser = super.getParser(metadata, context)
val p = if (parser.isInstanceOf[DefaultParser]) new
DFP().getParserPublic(metadata, context) else parser
new TimeoutParser(parser)
}
}
private val parser = new CEParser()
val contentParser = new AbstractParser {
def parse = parser.parse
}
val currentContext = new ParseContext
currentContext.set(classOf[TesseractOCRConfig], ocrConfig)
currentContext.set(classOf[PDFParserConfig], pdfConfig)
currentContext.set(classOf[Parser], contentParser){code}
2.x is not failing at startup (i just didn't wait long enough to get the
results).
> IllegalArgumentException in PDF parser
> --------------------------------------
>
> Key: TIKA-3720
> URL: https://issues.apache.org/jira/browse/TIKA-3720
> Project: Tika
> Issue Type: Bug
> Affects Versions: 1.23
> Environment: Fedora 36, Java 11, Scala 2.13.4, Tika 1.28.1
> Reporter: denisn
> Priority: Major
> Attachments: test.pdf
>
>
> Tika packages:
> {code:java}
> "org.apache.tika" % "tika" % 1.28.1
> "org.apache.tika" % "tika-core" % 1.28.1
> "org.apache.tika" % "tika-parsers" % 1.28.1
> "org.apache.poi" % "poi" % "4.0.1"
> "org.apache.poi" % "poi-ooxml" % "4.0.1"{code}
> It seems to work fine in 1.22, but in 1.23 and all following versions there is
> an error. I've attached the PDF file which I've tested.
> Exception text:
> {code:java}
> java.lang.IllegalArgumentException
> at org.apache.xerces.jaxp.DocumentBuilderFactoryImpl.setAttribute(Unknown
> Source)
> at
> org.apache.tika.utils.XMLReaderUtils.trySetXercesSecurityManager(XMLReaderUtils.java:721)
> at
> org.apache.tika.utils.XMLReaderUtils.getDocumentBuilderFactory(XMLReaderUtils.java:289)
> at
> org.apache.tika.utils.XMLReaderUtils.getDocumentBuilder(XMLReaderUtils.java:305)
> at
> org.apache.tika.parser.external.ExternalParsersConfigReader.read(ExternalParsersConfigReader.java:58)
> at
> org.apache.tika.parser.external.ExternalParsersFactory.create(ExternalParsersFactory.java:67)
> at
> org.apache.tika.parser.external.ExternalParsersFactory.create(ExternalParsersFactory.java:59)
> at
> org.apache.tika.parser.external.ExternalParsersFactory.create(ExternalParsersFactory.java:49)
> at
> org.apache.tika.parser.external.ExternalParsersFactory.create(ExternalParsersFactory.java:44)
> at
> org.apache.tika.parser.external.CompositeExternalParser.<init>(CompositeExternalParser.java:44)
> at
> org.apache.tika.parser.external.CompositeExternalParser.<init>(CompositeExternalParser.java:37)
> at
> java.base/jdk.internal.reflect.NativeConstructorAccessorImpl.newInstance0(Native
> Method)
> at
> java.base/jdk.internal.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
> at
> java.base/jdk.internal.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
> at
> java.base/java.lang.reflect.Constructor.newInstance(Constructor.java:490)
> at java.base/java.lang.Class.newInstance(Class.java:584)
> at
> org.apache.tika.config.ServiceLoader.loadStaticServiceProviders(ServiceLoader.java:358)
> at
> org.apache.tika.parser.DefaultParser.getDefaultParsers(DefaultParser.java:55)
> at org.apache.tika.parser.DefaultParser.<init>(DefaultParser.java:85)
> at org.apache.tika.parser.DefaultParser.<init>(DefaultParser.java:100)
> at org.apache.tika.parser.DefaultParser.<init>(DefaultParser.java:112)
> at org.apache.tika.parser.DefaultParser.<init>(DefaultParser.java:116)
> at test.Main$DFP.<init>(Main.scala:55)
> at test.Main$CEParser.getParser(Main.scala:75)
> at org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:269)
> at
> org.apache.tika.parser.AutoDetectParser.parse(AutoDetectParser.java:143)
> at test.Main$.parseNode(Main.scala:194)
> at test.Main$$anon$1.parse(Main.scala:151)
> at org.apache.tika.parser.DelegatingParser.parse(DelegatingParser.java:72)
> at
> org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor.parseEmbedded(ParsingEmbeddedDocumentExtractor.java:104)
> at
> org.apache.tika.parser.pdf.ImageGraphicsEngine.processImage(ImageGraphicsEngine.java:321)
> at
> org.apache.tika.parser.pdf.ImageGraphicsEngine.drawImage(ImageGraphicsEngine.java:182)
> at
> org.apache.pdfbox.contentstream.operator.graphics.DrawObject.process(DrawObject.java:67)
> at
> org.apache.pdfbox.contentstream.PDFStreamEngine.processOperator(PDFStreamEngine.java:939)
> at
> org.apache.pdfbox.contentstream.PDFStreamEngine.processStreamOperators(PDFStreamEngine.java:514)
> at
> org.apache.pdfbox.contentstream.PDFStreamEngine.processStream(PDFStreamEngine.java:492)
> at
> org.apache.pdfbox.contentstream.PDFStreamEngine.processPage(PDFStreamEngine.java:155)
> at
> org.apache.tika.parser.pdf.ImageGraphicsEngine.run(ImageGraphicsEngine.java:128)
> at org.apache.tika.parser.pdf.PDF2XHTML.extractImages(PDF2XHTML.java:159)
> at org.apache.tika.parser.pdf.PDF2XHTML.endPage(PDF2XHTML.java:139)
> at
> org.apache.pdfbox.text.PDFTextStripper.processPage(PDFTextStripper.java:365)
> at org.apache.tika.parser.pdf.PDF2XHTML.processPage(PDF2XHTML.java:127)
> at
> org.apache.tika.parser.pdf.AbstractPDF2XHTML.processPages(AbstractPDF2XHTML.java:985)
> at
> org.apache.pdfbox.text.PDFTextStripper.writeText(PDFTextStripper.java:238)
> at org.apache.tika.parser.pdf.PDF2XHTML.process(PDF2XHTML.java:98)
> at org.apache.tika.parser.pdf.PDFParser.parse(PDFParser.java:177)
> at org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:281)
> at org.apache.tika.parser.ParserDecorator.parse(ParserDecorator.java:188)
> at test.Main$TimeoutParser.super$parse(Main.scala:67)
> at test.Main$TimeoutParser.$anonfun$parse$1(Main.scala:67)
> at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.scala:18)
> at
> cats.effect.internals.IORunLoop$.cats$effect$internals$IORunLoop$$loop(IORunLoop.scala:104)
> at
> cats.effect.internals.IORunLoop$RestartCallback.signal(IORunLoop.scala:463)
> at
> cats.effect.internals.IORunLoop$RestartCallback.apply(IORunLoop.scala:484)
> at
> cats.effect.internals.IORunLoop$RestartCallback.apply(IORunLoop.scala:422)
> at cats.effect.internals.IOShift$Tick.run(IOShift.scala:36)
> at
> java.base/java.util.concurrent.ForkJoinTask$RunnableExecuteAction.exec(ForkJoinTask.java:1426)
> at
> java.base/java.util.concurrent.ForkJoinTask.doExec(ForkJoinTask.java:290)
> at
> java.base/java.util.concurrent.ForkJoinPool$WorkQueue.topLevelExec(ForkJoinPool.java:1020)
> at
> java.base/java.util.concurrent.ForkJoinPool.scan(ForkJoinPool.java:1656)
> at
> java.base/java.util.concurrent.ForkJoinPool.runWorker(ForkJoinPool.java:1594)
> at
> java.base/java.util.concurrent.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:183)
> Error: org.apache.tika.exception.TikaException: Unexpected RuntimeException from
> org.apache.tika.parser.pdf.PDFParser@268b50a0
> org.apache.tika.exception.TikaException: Unexpected RuntimeException from
> org.apache.tika.parser.pdf.PDFParser@268b50a0
> at org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:297)
> at org.apache.tika.parser.ParserDecorator.parse(ParserDecorator.java:188)
> at test.Main$TimeoutParser.super$parse(Main.scala:67)
> at test.Main$TimeoutParser.$anonfun$parse$1(Main.scala:67)
> at unsafeRunSync @ test.Main$TimeoutParser.parse(Main.scala:68)
> Caused by: java.lang.NullPointerException
> at
> org.apache.tika.parser.pdf.AbstractPDF2XHTML.doOCROnCurrentPage(AbstractPDF2XHTML.java:450)
> at
> org.apache.tika.parser.pdf.AbstractPDF2XHTML.endPage(AbstractPDF2XHTML.java:557)
> at org.apache.tika.parser.pdf.PDF2XHTML.endPage(PDF2XHTML.java:143)
> at
> org.apache.pdfbox.text.PDFTextStripper.processPage(PDFTextStripper.java:365)
> at org.apache.tika.parser.pdf.PDF2XHTML.processPage(PDF2XHTML.java:127)
> at
> org.apache.tika.parser.pdf.AbstractPDF2XHTML.processPages(AbstractPDF2XHTML.java:985)
> at
> org.apache.pdfbox.text.PDFTextStripper.writeText(PDFTextStripper.java:238)
> at org.apache.tika.parser.pdf.PDF2XHTML.process(PDF2XHTML.java:98)
> at org.apache.tika.parser.pdf.PDFParser.parse(PDFParser.java:177)
> at org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:281)
> at org.apache.tika.parser.ParserDecorator.parse(ParserDecorator.java:188)
> at test.Main$TimeoutParser.super$parse(Main.scala:67)
> at test.Main$TimeoutParser.$anonfun$parse$1(Main.scala:67)
> at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.scala:18)
> at
> cats.effect.internals.IORunLoop$.cats$effect$internals$IORunLoop$$loop(IORunLoop.scala:104)
> at
> cats.effect.internals.IORunLoop$RestartCallback.signal(IORunLoop.scala:463)
> at
> cats.effect.internals.IORunLoop$RestartCallback.apply(IORunLoop.scala:484)
> at
> cats.effect.internals.IORunLoop$RestartCallback.apply(IORunLoop.scala:422)
> at cats.effect.internals.IOShift$Tick.run(IOShift.scala:36)
> at
> java.base/java.util.concurrent.ForkJoinTask$RunnableExecuteAction.exec(ForkJoinTask.java:1426)
> at
> java.base/java.util.concurrent.ForkJoinTask.doExec(ForkJoinTask.java:290)
> at
> java.base/java.util.concurrent.ForkJoinPool$WorkQueue.topLevelExec(ForkJoinPool.java:1020)
> at
> java.base/java.util.concurrent.ForkJoinPool.scan(ForkJoinPool.java:1656)
> at
> java.base/java.util.concurrent.ForkJoinPool.runWorker(ForkJoinPool.java:1594)
> at
> java.base/java.util.concurrent.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:183)
> {code}
>
> In 2.3.0 it still gives me an error but the extraction seems to work after
> all:
> {code:java}
> Error: org.apache.tika.exception.TikaException: Unable to extract PDF content
> org.apache.tika.exception.TikaException: Unable to extract PDF content
> at org.apache.tika.parser.pdf.PDF2XHTML.process(PDF2XHTML.java:119)
> at org.apache.tika.parser.pdf.PDFParser.parse(PDFParser.java:174)
> at org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:289)
> at org.apache.tika.parser.ParserDecorator.parse(ParserDecorator.java:152)
> at test.Main$TimeoutParser.super$parse(Main.scala:67)
> at test.Main$TimeoutParser.$anonfun$parse$1(Main.scala:67)
> at unsafeRunSync @ test.Main$TimeoutParser.parse(Main.scala:68)
> Caused by: java.io.IOException: Unable to end a page
> at
> org.apache.tika.parser.pdf.AbstractPDF2XHTML.endPage(AbstractPDF2XHTML.java:637)
> at org.apache.tika.parser.pdf.PDF2XHTML.endPage(PDF2XHTML.java:142)
> at
> org.apache.pdfbox.text.PDFTextStripper.processPage(PDFTextStripper.java:365)
> at org.apache.tika.parser.pdf.PDF2XHTML.processPage(PDF2XHTML.java:126)
> at
> org.apache.tika.parser.pdf.AbstractPDF2XHTML.processPages(AbstractPDF2XHTML.java:1089)
> at
> org.apache.pdfbox.text.PDFTextStripper.writeText(PDFTextStripper.java:238)
> at org.apache.tika.parser.pdf.PDF2XHTML.process(PDF2XHTML.java:97)
> at org.apache.tika.parser.pdf.PDFParser.parse(PDFParser.java:174)
> at org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:289)
> at org.apache.tika.parser.ParserDecorator.parse(ParserDecorator.java:152)
> at test.Main$TimeoutParser.super$parse(Main.scala:67)
> at test.Main$TimeoutParser.$anonfun$parse$1(Main.scala:67)
> at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.scala:18)
> at
> cats.effect.internals.IORunLoop$.cats$effect$internals$IORunLoop$$loop(IORunLoop.scala:104)
> at
> cats.effect.internals.IORunLoop$RestartCallback.signal(IORunLoop.scala:463)
> at
> cats.effect.internals.IORunLoop$RestartCallback.apply(IORunLoop.scala:484)
> at
> cats.effect.internals.IORunLoop$RestartCallback.apply(IORunLoop.scala:422)
> at cats.effect.internals.IOShift$Tick.run(IOShift.scala:36)
> at
> java.base/java.util.concurrent.ForkJoinTask$RunnableExecuteAction.exec(ForkJoinTask.java:1426)
> at
> java.base/java.util.concurrent.ForkJoinTask.doExec(ForkJoinTask.java:290)
> at
> java.base/java.util.concurrent.ForkJoinPool$WorkQueue.topLevelExec(ForkJoinPool.java:1020)
> at
> java.base/java.util.concurrent.ForkJoinPool.scan(ForkJoinPool.java:1656)
> at
> java.base/java.util.concurrent.ForkJoinPool.runWorker(ForkJoinPool.java:1594)
> at
> java.base/java.util.concurrent.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:183)
> Caused by: org.apache.tika.exception.TikaException: I regret that I couldn't
> find an OCR parser to handle image/ocr-png.Please set the OCR_STRATEGY to
> NO_OCR or configure yourOCR parser correctly
> at
> org.apache.tika.parser.pdf.AbstractPDF2XHTML.doOCROnCurrentPage(AbstractPDF2XHTML.java:473)
> at
> org.apache.tika.parser.pdf.AbstractPDF2XHTML.endPage(AbstractPDF2XHTML.java:614)
> ... 23 more
> {code}
>
--
This message was sent by Atlassian Jira
(v8.20.7#820007)