[ 
https://issues.apache.org/jira/browse/TIKA-1044?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Jonas Wilhelmsson updated TIKA-1044:
------------------------------------

    Attachment: test2.doc
                test.docx

Stacktrace while parseing test.docx:
2012-dec-13 15:51:39 org.apache.solr.common.SolrException log
ALLVARLIG: org.apache.solr.common.SolrException
        at 
org.apache.solr.handler.extraction.ExtractingDocumentLoader.load(ExtractingDocumentLoader.java:233)
        at 
org.apache.solr.handler.ContentStreamHandlerBase.handleRequestBody(ContentStreamHandlerBase.java:58)
        at 
org.apache.solr.handler.RequestHandlerBase.handleRequest(RequestHandlerBase.java:129)
        at 
org.apache.solr.core.RequestHandlers$LazyRequestHandlerWrapper.handleRequest(RequestHandlers.java:244)
        at org.apache.solr.core.SolrCore.execute(SolrCore.java:1376)
        at 
org.apache.solr.servlet.SolrDispatchFilter.execute(SolrDispatchFilter.java:365)
        at 
org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:260)
        at 
org.apache.catalina.core.ApplicationFilterChain.internalDoFilter(ApplicationFilterChain.java:235)
        at 
org.apache.catalina.core.ApplicationFilterChain.doFilter(ApplicationFilterChain.java:206)
        at 
org.apache.catalina.core.StandardWrapperValve.invoke(StandardWrapperValve.java:233)
        at 
org.apache.catalina.core.StandardContextValve.invoke(StandardContextValve.java:191)
        at 
org.apache.catalina.authenticator.AuthenticatorBase.invoke(AuthenticatorBase.java:563)
        at 
org.apache.catalina.core.StandardHostValve.invoke(StandardHostValve.java:127)
        at 
org.apache.catalina.valves.ErrorReportValve.invoke(ErrorReportValve.java:102)
        at 
org.apache.catalina.core.StandardEngineValve.invoke(StandardEngineValve.java:109)
        at 
org.apache.catalina.connector.CoyoteAdapter.service(CoyoteAdapter.java:293)
        at 
org.apache.coyote.http11.Http11Processor.process(Http11Processor.java:859)
        at 
org.apache.coyote.http11.Http11Protocol$Http11ConnectionHandler.process(Http11Protocol.java:602)
        at 
org.apache.tomcat.util.net.JIoEndpoint$Worker.run(JIoEndpoint.java:489)
        at java.lang.Thread.run(Thread.java:662)
Caused by: org.apache.tika.exception.TikaException: Unexpected RuntimeException 
from org.apache.tika.parser.microsoft.ooxml.OOXMLParser@299629
        at 
org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:244)
        at 
org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:242)
        at 
org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:242)
        at 
org.apache.tika.parser.AutoDetectParser.parse(AutoDetectParser.java:120)
        at 
org.apache.solr.handler.extraction.ExtractingDocumentLoader.load(ExtractingDocumentLoader.java:227)
        ... 19 more
Caused by: java.lang.NullPointerException
        at 
org.apache.tika.parser.microsoft.WordExtractor.buildParagraphTagAndStyle(WordExtractor.java:463)
        at 
org.apache.tika.parser.microsoft.ooxml.XWPFWordExtractorDecorator.extractParagraph(XWPFWordExtractorDecorator.java:108)
        at 
org.apache.tika.parser.microsoft.ooxml.XWPFWordExtractorDecorator.extractIBodyText(XWPFWordExtractorDecorator.java:76)
        at 
org.apache.tika.parser.microsoft.ooxml.XWPFWordExtractorDecorator.buildXHTML(XWPFWordExtractorDecorator.java:63)
        at 
org.apache.tika.parser.microsoft.ooxml.AbstractOOXMLExtractor.getXHTML(AbstractOOXMLExtractor.java:110)
        at 
org.apache.tika.parser.microsoft.ooxml.OOXMLExtractorFactory.parse(OOXMLExtractorFactory.java:97)
        at 
org.apache.tika.parser.microsoft.ooxml.OOXMLParser.parse(OOXMLParser.java:69)
        at 
org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:242)
        ... 23 more

Stacktrace while parseing test2.doc:
2012-dec-13 15:51:52 org.apache.solr.common.SolrException log
ALLVARLIG: org.apache.solr.common.SolrException
        at 
org.apache.solr.handler.extraction.ExtractingDocumentLoader.load(ExtractingDocumentLoader.java:233)
        at 
org.apache.solr.handler.ContentStreamHandlerBase.handleRequestBody(ContentStreamHandlerBase.java:58)
        at 
org.apache.solr.handler.RequestHandlerBase.handleRequest(RequestHandlerBase.java:129)
        at 
org.apache.solr.core.RequestHandlers$LazyRequestHandlerWrapper.handleRequest(RequestHandlers.java:244)
        at org.apache.solr.core.SolrCore.execute(SolrCore.java:1376)
        at 
org.apache.solr.servlet.SolrDispatchFilter.execute(SolrDispatchFilter.java:365)
        at 
org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:260)
        at 
org.apache.catalina.core.ApplicationFilterChain.internalDoFilter(ApplicationFilterChain.java:235)
        at 
org.apache.catalina.core.ApplicationFilterChain.doFilter(ApplicationFilterChain.java:206)
        at 
org.apache.catalina.core.StandardWrapperValve.invoke(StandardWrapperValve.java:233)
        at 
org.apache.catalina.core.StandardContextValve.invoke(StandardContextValve.java:191)
        at 
org.apache.catalina.authenticator.AuthenticatorBase.invoke(AuthenticatorBase.java:563)
        at 
org.apache.catalina.core.StandardHostValve.invoke(StandardHostValve.java:127)
        at 
org.apache.catalina.valves.ErrorReportValve.invoke(ErrorReportValve.java:102)
        at 
org.apache.catalina.core.StandardEngineValve.invoke(StandardEngineValve.java:109)
        at 
org.apache.catalina.connector.CoyoteAdapter.service(CoyoteAdapter.java:293)
        at 
org.apache.coyote.http11.Http11Processor.process(Http11Processor.java:859)
        at 
org.apache.coyote.http11.Http11Protocol$Http11ConnectionHandler.process(Http11Protocol.java:602)
        at 
org.apache.tomcat.util.net.JIoEndpoint$Worker.run(JIoEndpoint.java:489)
        at java.lang.Thread.run(Thread.java:662)
Caused by: org.apache.tika.exception.TikaException: Unexpected RuntimeException 
from org.apache.tika.parser.microsoft.OfficeParser@247418
        at 
org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:244)
        at 
org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:242)
        at 
org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:242)
        at 
org.apache.tika.parser.AutoDetectParser.parse(AutoDetectParser.java:120)
        at 
org.apache.solr.handler.extraction.ExtractingDocumentLoader.load(ExtractingDocumentLoader.java:227)
        ... 19 more
Caused by: java.lang.StringIndexOutOfBoundsException: String index out of 
range: 1
        at java.lang.String.substring(String.java:1934)
        at 
org.apache.tika.parser.microsoft.WordExtractor.buildParagraphTagAndStyle(WordExtractor.java:482)
        at 
org.apache.tika.parser.microsoft.WordExtractor.handleParagraph(WordExtractor.java:171)
        at 
org.apache.tika.parser.microsoft.WordExtractor.parse(WordExtractor.java:97)
        at 
org.apache.tika.parser.microsoft.OfficeParser.parse(OfficeParser.java:204)
        at 
org.apache.tika.parser.microsoft.OfficeParser.parse(OfficeParser.java:177)
        at 
org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:242)
        ... 23 more
                
> Can't parse Word files with no format set
> -----------------------------------------
>
>                 Key: TIKA-1044
>                 URL: https://issues.apache.org/jira/browse/TIKA-1044
>             Project: Tika
>          Issue Type: Bug
>          Components: parser
>    Affects Versions: 1.0
>            Reporter: Jonas Wilhelmsson
>            Priority: Trivial
>         Attachments: test2.doc, test.docx
>
>
> When we were using Solr for indexing we came over this Tika bug.
> While parsing a doc or docx file that contains text without any format set 
> (format inside Microsoft Word) the parser will throw exceptions.
> By setting a format to the text the file can be correctly parsed without 
> unexpected errors.

--
This message is automatically generated by JIRA.
If you think it was sent incorrectly, please contact your JIRA administrators
For more information on JIRA, see: http://www.atlassian.com/software/jira

Reply via email to