[
https://issues.apache.org/jira/browse/TIKA-1143?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=14362308#comment-14362308
]
Vincent Massol commented on TIKA-1143:
--------------------------------------
Thanks, Tyler. Could you please set the "fix version" and "assignee" fields? The
fix version is especially important so that we know which version of Tika
contains the fix (i.e. the POI upgrade). Thanks!
> Fails to parse some PPT file
> ----------------------------
>
> Key: TIKA-1143
> URL: https://issues.apache.org/jira/browse/TIKA-1143
> Project: Tika
> Issue Type: Bug
> Components: parser
> Affects Versions: 1.4
> Reporter: Vincent Massol
> Attachments: XWikiIExpoPresentation.ppt
>
>
> See also http://jira.xwiki.org/browse/XWIKI-9308
> Here's what I get with the attached file:
> {noformat}
> 2013-07-03 11:52:45,332 [XWiki Solr index thread] WARN
> a.t.p.m.AbstractPOIFSExtractor - Ignoring unexpected exception while parsing
> summary entry DocumentSummaryInformation
> java.lang.ClassCastException: [B cannot be cast to java.lang.String
> at
> org.apache.poi.hpsf.DocumentSummaryInformation.getCategory(DocumentSummaryInformation.java:78)
> ~[poi-3.9.jar:3.9]
> at
> org.apache.tika.parser.microsoft.SummaryExtractor.parse(SummaryExtractor.java:143)
> [tika-parsers-1.4.jar:na]
> at
> org.apache.tika.parser.microsoft.SummaryExtractor.parseSummaryEntryIfExists(SummaryExtractor.java:88)
> [tika-parsers-1.4.jar:na]
> at
> org.apache.tika.parser.microsoft.SummaryExtractor.parseSummaries(SummaryExtractor.java:73)
> [tika-parsers-1.4.jar:na]
> at
> org.apache.tika.parser.microsoft.OfficeParser.parse(OfficeParser.java:170)
> [tika-parsers-1.4.jar:na]
> at
> org.apache.tika.parser.microsoft.OfficeParser.parse(OfficeParser.java:161)
> [tika-parsers-1.4.jar:na]
> at
> org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:242)
> [tika-core-1.4.jar:na]
> at
> org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:242)
> [tika-core-1.4.jar:na]
> at
> org.apache.tika.parser.AutoDetectParser.parse(AutoDetectParser.java:120)
> [tika-core-1.4.jar:na]
> at org.apache.tika.Tika.parseToString(Tika.java:380)
> [tika-core-1.4.jar:na]
> at
> org.xwiki.search.solr.internal.metadata.AttachmentSolrMetadataExtractor.getContentAsText(AttachmentSolrMetadataExtractor.java:130)
> [xwiki-platform-search-solr-api-5.2-20130702.194010-10.jar:na]
> at
> org.xwiki.search.solr.internal.metadata.AttachmentSolrMetadataExtractor.setLocaleAndContentFields(AttachmentSolrMetadataExtractor.java:97)
> [xwiki-platform-search-solr-api-5.2-20130702.194010-10.jar:na]
> at
> org.xwiki.search.solr.internal.metadata.AttachmentSolrMetadataExtractor.setFieldsInternal(AttachmentSolrMetadataExtractor.java:79)
> [xwiki-platform-search-solr-api-5.2-20130702.194010-10.jar:na]
> at
> org.xwiki.search.solr.internal.metadata.AbstractSolrMetadataExtractor.getSolrDocument(AbstractSolrMetadataExtractor.java:114)
> [xwiki-platform-search-solr-api-5.2-20130702.194010-10.jar:na]
> at
> org.xwiki.search.solr.internal.DefaultSolrIndexer.getSolrDocument(DefaultSolrIndexer.java:465)
> [xwiki-platform-search-solr-api-5.2-20130702.194010-10.jar:na]
> at
> org.xwiki.search.solr.internal.DefaultSolrIndexer.processBatch(DefaultSolrIndexer.java:378)
> [xwiki-platform-search-solr-api-5.2-20130702.194010-10.jar:na]
> at
> org.xwiki.search.solr.internal.DefaultSolrIndexer.runInternal(DefaultSolrIndexer.java:353)
> [xwiki-platform-search-solr-api-5.2-20130702.194010-10.jar:na]
> at
> com.xpn.xwiki.util.AbstractXWikiRunnable.run(AbstractXWikiRunnable.java:121)
> [xwiki-platform-oldcore-5.2-20130702.190754-22.jar:na]
> at java.lang.Thread.run(Thread.java:680) [na:1.6.0_51]
> 2013-07-03 11:52:49,985 [Lucene Index Updater] WARN
> a.t.p.m.AbstractPOIFSExtractor - Ignoring unexpected exception while parsing
> summary entry DocumentSummaryInformation
> java.lang.ClassCastException: [B cannot be cast to java.lang.String
> at
> org.apache.poi.hpsf.DocumentSummaryInformation.getCategory(DocumentSummaryInformation.java:78)
> ~[poi-3.9.jar:3.9]
> at
> org.apache.tika.parser.microsoft.SummaryExtractor.parse(SummaryExtractor.java:143)
> [tika-parsers-1.4.jar:na]
> at
> org.apache.tika.parser.microsoft.SummaryExtractor.parseSummaryEntryIfExists(SummaryExtractor.java:88)
> [tika-parsers-1.4.jar:na]
> at
> org.apache.tika.parser.microsoft.SummaryExtractor.parseSummaries(SummaryExtractor.java:73)
> [tika-parsers-1.4.jar:na]
> at
> org.apache.tika.parser.microsoft.OfficeParser.parse(OfficeParser.java:170)
> [tika-parsers-1.4.jar:na]
> at
> org.apache.tika.parser.microsoft.OfficeParser.parse(OfficeParser.java:161)
> [tika-parsers-1.4.jar:na]
> at
> org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:242)
> [tika-core-1.4.jar:na]
> at
> org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:242)
> [tika-core-1.4.jar:na]
> at
> org.apache.tika.parser.AutoDetectParser.parse(AutoDetectParser.java:120)
> [tika-core-1.4.jar:na]
> at org.apache.tika.Tika.parseToString(Tika.java:380)
> [tika-core-1.4.jar:na]
> at
> com.xpn.xwiki.plugin.lucene.internal.AttachmentData.getContentAsText(AttachmentData.java:221)
> [xwiki-platform-search-lucene-api-5.2-20130702.191134-22.jar:na]
> at
> com.xpn.xwiki.plugin.lucene.internal.AttachmentData.getFullText(AttachmentData.java:197)
> [xwiki-platform-search-lucene-api-5.2-20130702.191134-22.jar:na]
> at
> com.xpn.xwiki.plugin.lucene.internal.AbstractDocumentData.getFullText(AbstractDocumentData.java:251)
> [xwiki-platform-search-lucene-api-5.2-20130702.191134-22.jar:na]
> at
> com.xpn.xwiki.plugin.lucene.internal.AbstractDocumentData.addDocumentDataToLuceneDocument(AbstractDocumentData.java:212)
> [xwiki-platform-search-lucene-api-5.2-20130702.191134-22.jar:na]
> at
> com.xpn.xwiki.plugin.lucene.internal.AbstractDocumentData.addDataToLuceneDocument(AbstractDocumentData.java:147)
> [xwiki-platform-search-lucene-api-5.2-20130702.191134-22.jar:na]
> at
> com.xpn.xwiki.plugin.lucene.internal.AttachmentData.addDataToLuceneDocument(AttachmentData.java:99)
> [xwiki-platform-search-lucene-api-5.2-20130702.191134-22.jar:na]
> at
> com.xpn.xwiki.plugin.lucene.internal.IndexUpdater.addToIndex(IndexUpdater.java:296)
> [xwiki-platform-search-lucene-api-5.2-20130702.191134-22.jar:na]
> at
> com.xpn.xwiki.plugin.lucene.internal.IndexUpdater.updateIndex(IndexUpdater.java:237)
> [xwiki-platform-search-lucene-api-5.2-20130702.191134-22.jar:na]
> at
> com.xpn.xwiki.plugin.lucene.internal.IndexUpdater.runMainLoop(IndexUpdater.java:183)
> [xwiki-platform-search-lucene-api-5.2-20130702.191134-22.jar:na]
> at
> com.xpn.xwiki.plugin.lucene.internal.IndexUpdater.runInternal(IndexUpdater.java:168)
> [xwiki-platform-search-lucene-api-5.2-20130702.191134-22.jar:na]
> at
> com.xpn.xwiki.util.AbstractXWikiRunnable.run(AbstractXWikiRunnable.java:121)
> [xwiki-platform-oldcore-5.2-20130702.190754-22.jar:na]
> at java.lang.Thread.run(Thread.java:680) [na:1.6.0_51]
> {noformat}
> Note that if I resave the file on which this happens, I don't get the error
> any more. I tried saving it with both LibreOffice 4.0.1.2 and MS PowerPoint.
--
This message was sent by Atlassian JIRA
(v6.3.4#6332)