[ https://issues.apache.org/jira/browse/TIKA-3040?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17035049#comment-17035049 ]
Tilman Hausherr commented on TIKA-3040: --------------------------------------- Unrelated to this: I think the current code misses inline images, see e.g. the file from PDFBOX-52. [~tallison] > PDF inline OCR: Exception while processing certain image (others in same PDF > work) > ---------------------------------------------------------------------------------- > > Key: TIKA-3040 > URL: https://issues.apache.org/jira/browse/TIKA-3040 > Project: Tika > Issue Type: Bug > Components: ocr > Affects Versions: 1.23 > Environment: Debian 10 > Tesseract > Reporter: Markus Mandalka > Priority: Minor > > There is a PDF document (without plain text content) in which the text content > consists of scans of multiple pages. > OCR for one of the images (text of a page) fails in tika-server with > inline OCR for PDF activated. > My fallback in Open Semantic ETL / Open Semantic Search (using > pdfimages from the Debian package poppler-utils to extract the images) works for all > images in that PDF document. > I cannot attach/upload this document here publicly because of > copyright/classification issues, but if interested, I could send it to specific > developer(s). 
> Following tika-server exception in result field > X-TIKA:EXCEPTION:embedded_stream_exception: > javax.imageio.IIOException: Bogus input colorspace at > java.desktop/com.sun.imageio.plugins.jpeg.JPEGImageWriter.writeImage(Native > Method) at > java.desktop/com.sun.imageio.plugins.jpeg.JPEGImageWriter.writeOnThread(JPEGImageWriter.java:1007) > at > java.desktop/com.sun.imageio.plugins.jpeg.JPEGImageWriter.write(JPEGImageWriter.java:371) > at > org.apache.pdfbox.tools.imageio.ImageIOUtil.writeImage(ImageIOUtil.java:316) > at > org.apache.pdfbox.tools.imageio.ImageIOUtil.writeImage(ImageIOUtil.java:189) > at > org.apache.pdfbox.tools.imageio.ImageIOUtil.writeImage(ImageIOUtil.java:166) > at > org.apache.pdfbox.tools.imageio.ImageIOUtil.writeImage(ImageIOUtil.java:148) > at org.apache.tika.parser.pdf.PDF2XHTML.writeToBuffer(PDF2XHTML.java:304) at > org.apache.tika.parser.pdf.PDF2XHTML.processImageObject(PDF2XHTML.java:268) > at org.apache.tika.parser.pdf.PDF2XHTML.extractImages(PDF2XHTML.java:194) at > org.apache.tika.parser.pdf.PDF2XHTML.endPage(PDF2XHTML.java:165) at > org.apache.pdfbox.text.PDFTextStripper.processPage(PDFTextStripper.java:393) > at org.apache.tika.parser.pdf.PDF2XHTML.processPage(PDF2XHTML.java:153) at > org.apache.tika.parser.pdf.AbstractPDF2XHTML.processPages(AbstractPDF2XHTML.java:867) > at > org.apache.pdfbox.text.PDFTextStripper.writeText(PDFTextStripper.java:266) at > org.apache.tika.parser.pdf.PDF2XHTML.process(PDF2XHTML.java:124) at > org.apache.tika.parser.pdf.PDFParser.parse(PDFParser.java:162) at > org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:280) at > org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:280) at > org.apache.tika.parser.AutoDetectParser.parse(AutoDetectParser.java:143) at > org.apache.tika.parser.RecursiveParserWrapper.parse(RecursiveParserWrapper.java:233) > at org.apache.tika.server.resource.TikaResource.parse(TikaResource.java:409) > at > 
org.apache.tika.server.resource.RecursiveMetadataResource.parseMetadata(RecursiveMetadataResource.java:147) > at > org.apache.tika.server.resource.RecursiveMetadataResource.getMetadata(RecursiveMetadataResource.java:123) > at jdk.internal.reflect.GeneratedMethodAccessor5.invoke(Unknown Source) at > java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) > at java.base/java.lang.reflect.Method.invoke(Method.java:566) at > org.apache.cxf.service.invoker.AbstractInvoker.performInvocation(AbstractInvoker.java:179) > at > org.apache.cxf.service.invoker.AbstractInvoker.invoke(AbstractInvoker.java:96) > at org.apache.cxf.jaxrs.JAXRSInvoker.invoke(JAXRSInvoker.java:201) at > org.apache.cxf.jaxrs.JAXRSInvoker.invoke(JAXRSInvoker.java:104) at > org.apache.cxf.interceptor.ServiceInvokerInterceptor$1.run(ServiceInvokerInterceptor.java:59) > at > org.apache.cxf.interceptor.ServiceInvokerInterceptor.handleMessage(ServiceInvokerInterceptor.java:96) > at > org.apache.cxf.phase.PhaseInterceptorChain.doIntercept(PhaseInterceptorChain.java:308) > at > org.apache.cxf.transport.ChainInitiationObserver.onMessage(ChainInitiationObserver.java:121) > at > org.apache.cxf.transport.http.AbstractHTTPDestination.invoke(AbstractHTTPDestination.java:267) > at > org.apache.cxf.transport.http_jetty.JettyHTTPDestination.doService(JettyHTTPDestination.java:247) > at > org.apache.cxf.transport.http_jetty.JettyHTTPHandler.handle(JettyHTTPHandler.java:79) > at > org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:127) > at > org.eclipse.jetty.server.handler.ScopedHandler.nextHandle(ScopedHandler.java:235) > at > org.eclipse.jetty.server.handler.ContextHandler.doHandle(ContextHandler.java:1296) > at > org.eclipse.jetty.server.handler.ScopedHandler.nextScope(ScopedHandler.java:190) > at > org.eclipse.jetty.server.handler.ContextHandler.doScope(ContextHandler.java:1211) > at > 
org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:141) > at > org.eclipse.jetty.server.handler.ContextHandlerCollection.handle(ContextHandlerCollection.java:221) > at > org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:127) > at org.eclipse.jetty.server.Server.handle(Server.java:500) at > org.eclipse.jetty.server.HttpChannel.lambda$handle$1(HttpChannel.java:386) at > org.eclipse.jetty.server.HttpChannel.dispatch(HttpChannel.java:560) at > org.eclipse.jetty.server.HttpChannel.handle(HttpChannel.java:378) at > org.eclipse.jetty.server.HttpConnection.onFillable(HttpConnection.java:268) > at > org.eclipse.jetty.io.AbstractConnection$ReadCallback.succeeded(AbstractConnection.java:311) > at org.eclipse.jetty.io.FillInterest.fillable(FillInterest.java:103) at > org.eclipse.jetty.io.ChannelEndPoint$2.run(ChannelEndPoint.java:117) at > org.eclipse.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:782) > at > org.eclipse.jetty.util.thread.QueuedThreadPool$Runner.run(QueuedThreadPool.java:914) > at java.base/java.lang.Thread.run(Thread.java:834) -- This message was sent by Atlassian Jira (v8.3.4#803005)