[
https://issues.apache.org/jira/browse/TIKA-1967?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
]
kostali updated TIKA-1967:
--------------------------
Description:
I am trying to index rich documents (MS Word and PDF) using the DIH (Data
Import Handler) with Solr 5.4.1.
When I run a full import for the attached files, I get this error:
null:org.apache.solr.common.SolrException:
org.apache.tika.exception.TikaException: Unexpected RuntimeException from
org.apache.tika.parser.microsoft.OfficeParser@10b8c32
at
org.apache.solr.handler.extraction.ExtractingDocumentLoader.load(ExtractingDocumentLoader.java:234)
at
org.apache.solr.handler.ContentStreamHandlerBase.handleRequestBody(ContentStreamHandlerBase.java:70)
at
org.apache.solr.handler.RequestHandlerBase.handleRequest(RequestHandlerBase.java:156)
at org.apache.solr.core.SolrCore.execute(SolrCore.java:2073)
at org.apache.solr.servlet.HttpSolrCall.execute(HttpSolrCall.java:658)
at org.apache.solr.servlet.HttpSolrCall.call(HttpSolrCall.java:457)
at
org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:223)
at
org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:181)
at
org.eclipse.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1652)
at
org.eclipse.jetty.servlet.ServletHandler.doHandle(ServletHandler.java:585)
at
org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:143)
at
org.eclipse.jetty.security.SecurityHandler.handle(SecurityHandler.java:577)
at
org.eclipse.jetty.server.session.SessionHandler.doHandle(SessionHandler.java:223)
at
org.eclipse.jetty.server.handler.ContextHandler.doHandle(ContextHandler.java:1127)
at
org.eclipse.jetty.servlet.ServletHandler.doScope(ServletHandler.java:515)
at
org.eclipse.jetty.server.session.SessionHandler.doScope(SessionHandler.java:185)
at
org.eclipse.jetty.server.handler.ContextHandler.doScope(ContextHandler.java:1061)
at
org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:141)
at
org.eclipse.jetty.server.handler.ContextHandlerCollection.handle(ContextHandlerCollection.java:215)
at
org.eclipse.jetty.server.handler.HandlerCollection.handle(HandlerCollection.java:110)
at
org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:97)
at org.eclipse.jetty.server.Server.handle(Server.java:499)
at org.eclipse.jetty.server.HttpChannel.handle(HttpChannel.java:310)
at
org.eclipse.jetty.server.HttpConnection.onFillable(HttpConnection.java:257)
at
org.eclipse.jetty.io.AbstractConnection$2.run(AbstractConnection.java:540)
at
org.eclipse.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:635)
at
org.eclipse.jetty.util.thread.QueuedThreadPool$3.run(QueuedThreadPool.java:555)
at java.lang.Thread.run(Thread.java:745)
Caused by: org.apache.tika.exception.TikaException: Unexpected RuntimeException
from org.apache.tika.parser.microsoft.OfficeParser@10b8c32
at
org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:258)
at
org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:256)
at
org.apache.tika.parser.AutoDetectParser.parse(AutoDetectParser.java:120)
at
org.apache.solr.handler.extraction.ExtractingDocumentLoader.load(ExtractingDocumentLoader.java:228)
... 27 more
Caused by: java.lang.StringIndexOutOfBoundsException: String index out of
range: -1
at java.lang.String.substring(String.java:1955)
at
org.apache.tika.parser.microsoft.WordExtractor.handleSpecialCharacterRuns(WordExtractor.java:407)
at
org.apache.tika.parser.microsoft.WordExtractor.handleParagraph(WordExtractor.java:256)
at
org.apache.tika.parser.microsoft.WordExtractor.handleParagraph(WordExtractor.java:196)
at
org.apache.tika.parser.microsoft.WordExtractor.parse(WordExtractor.java:105)
at
org.apache.tika.parser.microsoft.OfficeParser.parse(OfficeParser.java:201)
at
org.apache.tika.parser.microsoft.OfficeParser.parse(OfficeParser.java:172)
at
org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:256)
... 30 more
was:
I am trying to index rich documents (MS Word and PDF) using the DIH (Data
Import Handler) with Solr 5.4.1. I get this error:
null:org.apache.solr.common.SolrException:
org.apache.tika.exception.TikaException: Unexpected RuntimeException from
org.apache.tika.parser.microsoft.OfficeParser@10b8c32
at
org.apache.solr.handler.extraction.ExtractingDocumentLoader.load(ExtractingDocumentLoader.java:234)
at
org.apache.solr.handler.ContentStreamHandlerBase.handleRequestBody(ContentStreamHandlerBase.java:70)
at
org.apache.solr.handler.RequestHandlerBase.handleRequest(RequestHandlerBase.java:156)
at org.apache.solr.core.SolrCore.execute(SolrCore.java:2073)
at org.apache.solr.servlet.HttpSolrCall.execute(HttpSolrCall.java:658)
at org.apache.solr.servlet.HttpSolrCall.call(HttpSolrCall.java:457)
at
org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:223)
at
org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:181)
at
org.eclipse.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1652)
at
org.eclipse.jetty.servlet.ServletHandler.doHandle(ServletHandler.java:585)
at
org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:143)
at
org.eclipse.jetty.security.SecurityHandler.handle(SecurityHandler.java:577)
at
org.eclipse.jetty.server.session.SessionHandler.doHandle(SessionHandler.java:223)
at
org.eclipse.jetty.server.handler.ContextHandler.doHandle(ContextHandler.java:1127)
at
org.eclipse.jetty.servlet.ServletHandler.doScope(ServletHandler.java:515)
at
org.eclipse.jetty.server.session.SessionHandler.doScope(SessionHandler.java:185)
at
org.eclipse.jetty.server.handler.ContextHandler.doScope(ContextHandler.java:1061)
at
org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:141)
at
org.eclipse.jetty.server.handler.ContextHandlerCollection.handle(ContextHandlerCollection.java:215)
at
org.eclipse.jetty.server.handler.HandlerCollection.handle(HandlerCollection.java:110)
at
org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:97)
at org.eclipse.jetty.server.Server.handle(Server.java:499)
at org.eclipse.jetty.server.HttpChannel.handle(HttpChannel.java:310)
at
org.eclipse.jetty.server.HttpConnection.onFillable(HttpConnection.java:257)
at
org.eclipse.jetty.io.AbstractConnection$2.run(AbstractConnection.java:540)
at
org.eclipse.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:635)
at
org.eclipse.jetty.util.thread.QueuedThreadPool$3.run(QueuedThreadPool.java:555)
at java.lang.Thread.run(Thread.java:745)
Caused by: org.apache.tika.exception.TikaException: Unexpected RuntimeException
from org.apache.tika.parser.microsoft.OfficeParser@10b8c32
at
org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:258)
at
org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:256)
at
org.apache.tika.parser.AutoDetectParser.parse(AutoDetectParser.java:120)
at
org.apache.solr.handler.extraction.ExtractingDocumentLoader.load(ExtractingDocumentLoader.java:228)
... 27 more
Caused by: java.lang.StringIndexOutOfBoundsException: String index out of
range: -1
at java.lang.String.substring(String.java:1955)
at
org.apache.tika.parser.microsoft.WordExtractor.handleSpecialCharacterRuns(WordExtractor.java:407)
at
org.apache.tika.parser.microsoft.WordExtractor.handleParagraph(WordExtractor.java:256)
at
org.apache.tika.parser.microsoft.WordExtractor.handleParagraph(WordExtractor.java:196)
at
org.apache.tika.parser.microsoft.WordExtractor.parse(WordExtractor.java:105)
at
org.apache.tika.parser.microsoft.OfficeParser.parse(OfficeParser.java:201)
at
org.apache.tika.parser.microsoft.OfficeParser.parse(OfficeParser.java:172)
at
org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:256)
... 30 more
> Unexpected RuntimeException from
> org.apache.tika.parser.microsoft.OfficeParser@10b8c32
> ---------------------------------------------------------------------------------------
>
> Key: TIKA-1967
> URL: https://issues.apache.org/jira/browse/TIKA-1967
> Project: Tika
> Issue Type: Bug
> Environment: windows
> Reporter: kostali
> Fix For: 1.7, 1.12
>
> Attachments: HAMOU-Newman-CV.doc, MAAZOUZ-Sofiane-DocComp.docx
>
>
> I am trying to index rich documents (MS Word and PDF) using the DIH (Data
> Import Handler) with Solr 5.4.1.
> When I run a full import for the attached files, I get this error:
> null:org.apache.solr.common.SolrException:
> org.apache.tika.exception.TikaException: Unexpected RuntimeException from
> org.apache.tika.parser.microsoft.OfficeParser@10b8c32
> at
> org.apache.solr.handler.extraction.ExtractingDocumentLoader.load(ExtractingDocumentLoader.java:234)
> at
> org.apache.solr.handler.ContentStreamHandlerBase.handleRequestBody(ContentStreamHandlerBase.java:70)
> at
> org.apache.solr.handler.RequestHandlerBase.handleRequest(RequestHandlerBase.java:156)
> at org.apache.solr.core.SolrCore.execute(SolrCore.java:2073)
> at org.apache.solr.servlet.HttpSolrCall.execute(HttpSolrCall.java:658)
> at org.apache.solr.servlet.HttpSolrCall.call(HttpSolrCall.java:457)
> at
> org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:223)
> at
> org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:181)
> at
> org.eclipse.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1652)
> at
> org.eclipse.jetty.servlet.ServletHandler.doHandle(ServletHandler.java:585)
> at
> org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:143)
> at
> org.eclipse.jetty.security.SecurityHandler.handle(SecurityHandler.java:577)
> at
> org.eclipse.jetty.server.session.SessionHandler.doHandle(SessionHandler.java:223)
> at
> org.eclipse.jetty.server.handler.ContextHandler.doHandle(ContextHandler.java:1127)
> at
> org.eclipse.jetty.servlet.ServletHandler.doScope(ServletHandler.java:515)
> at
> org.eclipse.jetty.server.session.SessionHandler.doScope(SessionHandler.java:185)
> at
> org.eclipse.jetty.server.handler.ContextHandler.doScope(ContextHandler.java:1061)
> at
> org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:141)
> at
> org.eclipse.jetty.server.handler.ContextHandlerCollection.handle(ContextHandlerCollection.java:215)
> at
> org.eclipse.jetty.server.handler.HandlerCollection.handle(HandlerCollection.java:110)
> at
> org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:97)
> at org.eclipse.jetty.server.Server.handle(Server.java:499)
> at org.eclipse.jetty.server.HttpChannel.handle(HttpChannel.java:310)
> at
> org.eclipse.jetty.server.HttpConnection.onFillable(HttpConnection.java:257)
> at
> org.eclipse.jetty.io.AbstractConnection$2.run(AbstractConnection.java:540)
> at
> org.eclipse.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:635)
> at
> org.eclipse.jetty.util.thread.QueuedThreadPool$3.run(QueuedThreadPool.java:555)
> at java.lang.Thread.run(Thread.java:745)
> Caused by: org.apache.tika.exception.TikaException: Unexpected
> RuntimeException from org.apache.tika.parser.microsoft.OfficeParser@10b8c32
> at
> org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:258)
> at
> org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:256)
> at
> org.apache.tika.parser.AutoDetectParser.parse(AutoDetectParser.java:120)
> at
> org.apache.solr.handler.extraction.ExtractingDocumentLoader.load(ExtractingDocumentLoader.java:228)
> ... 27 more
> Caused by: java.lang.StringIndexOutOfBoundsException: String index out of
> range: -1
> at java.lang.String.substring(String.java:1955)
> at
> org.apache.tika.parser.microsoft.WordExtractor.handleSpecialCharacterRuns(WordExtractor.java:407)
> at
> org.apache.tika.parser.microsoft.WordExtractor.handleParagraph(WordExtractor.java:256)
> at
> org.apache.tika.parser.microsoft.WordExtractor.handleParagraph(WordExtractor.java:196)
> at
> org.apache.tika.parser.microsoft.WordExtractor.parse(WordExtractor.java:105)
> at
> org.apache.tika.parser.microsoft.OfficeParser.parse(OfficeParser.java:201)
> at
> org.apache.tika.parser.microsoft.OfficeParser.parse(OfficeParser.java:172)
> at
> org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:256)
> ... 30 more
--
This message was sent by Atlassian JIRA
(v6.3.4#6332)