Tim Allison created TIKA-2102:
---------------------------------
Summary: SecureContentHandler incorrectly calculates bytesCount
for TikaInputStreams with an open container
Key: TIKA-2102
URL: https://issues.apache.org/jira/browse/TIKA-2102
Project: Tika
Issue Type: Bug
Reporter: Tim Allison
Priority: Minor
On a handful of files in the latest regression run, we're now getting zip bomb
exceptions when an embedded document is an "open container" inside a
TikaInputStream. SecureContentHandler calculates a ratio of bytes written to
total bytes in the stream; if we're parsing an embedded "open container", the
length of the input stream is 0, so any sizable output is flagged as a
suspected zip bomb (see "0 input bytes produced 1000016 output characters" in
the stack trace below).
Should we add another if clause to handle parsing of open containers? That
would leave us vulnerable to zip bombs in that case, though. Is there an
obvious way to give a hint about the length?
{noformat}
org.apache.tika.exception.TikaException: Zip bomb detected!
at
org.apache.tika.sax.SecureContentHandler.throwIfCauseOf(SecureContentHandler.java:192)
at
org.apache.tika.parser.AutoDetectParser.parse(AutoDetectParser.java:123)
at
org.apache.tika.parser.ParserDecorator.parse(ParserDecorator.java:188)
at org.apache.tika.parser.DigestingParser.parse(DigestingParser.java:74)
at
org.apache.tika.parser.ParserDecorator.parse(ParserDecorator.java:188)
at
org.apache.tika.parser.RecursiveParserWrapper$EmbeddedParserDecorator.parse(RecursiveParserWrapper.java:317)
at
org.apache.tika.parser.DelegatingParser.parse(DelegatingParser.java:72)
at
org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor.parseEmbedded(ParsingEmbeddedDocumentExtractor.java:102)
at
org.apache.tika.parser.microsoft.AbstractPOIFSExtractor.handleEmbeddedResource(AbstractPOIFSExtractor.java:141)
at
org.apache.tika.parser.microsoft.AbstractPOIFSExtractor.handleEmbeddedResource(AbstractPOIFSExtractor.java:117)
at
org.apache.tika.parser.microsoft.HSLFExtractor.handleSlideEmbeddedResources(HSLFExtractor.java:393)
at
org.apache.tika.parser.microsoft.HSLFExtractor.parse(HSLFExtractor.java:142)
at
org.apache.tika.parser.microsoft.OfficeParser.parse(OfficeParser.java:172)
at
org.apache.tika.parser.microsoft.OfficeParser.parse(OfficeParser.java:130)
at
org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:280)
at
org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:280)
at
org.apache.tika.parser.AutoDetectParser.parse(AutoDetectParser.java:120)
at
org.apache.tika.parser.ParserDecorator.parse(ParserDecorator.java:188)
at org.apache.tika.parser.DigestingParser.parse(DigestingParser.java:74)
at
org.apache.tika.parser.RecursiveParserWrapper.parse(RecursiveParserWrapper.java:158)
at
org.apache.tika.batch.FileResourceConsumer.parse(FileResourceConsumer.java:407)
at
org.apache.tika.batch.fs.RecursiveParserWrapperFSConsumer.processFileResource(RecursiveParserWrapperFSConsumer.java:104)
at
org.apache.tika.batch.FileResourceConsumer._processFileResource(FileResourceConsumer.java:182)
at
org.apache.tika.batch.FileResourceConsumer.call(FileResourceConsumer.java:115)
at
org.apache.tika.batch.FileResourceConsumer.call(FileResourceConsumer.java:50)
at java.util.concurrent.FutureTask.run(FutureTask.java:266)
at
java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)
at java.util.concurrent.FutureTask.run(FutureTask.java:266)
at
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
Caused by: org.apache.tika.sax.SecureContentHandler$SecureSAXException:
Suspected zip bomb: 0 input bytes produced 1000016 output characters
at
org.apache.tika.sax.SecureContentHandler.advance(SecureContentHandler.java:221)
at
org.apache.tika.sax.SecureContentHandler.characters(SecureContentHandler.java:269)
at
org.apache.tika.sax.ContentHandlerDecorator.characters(ContentHandlerDecorator.java:146)
at
org.apache.tika.sax.ContentHandlerDecorator.characters(ContentHandlerDecorator.java:146)
at
org.apache.tika.sax.ContentHandlerDecorator.characters(ContentHandlerDecorator.java:146)
at
org.apache.tika.sax.SafeContentHandler.access$001(SafeContentHandler.java:46)
at
org.apache.tika.sax.SafeContentHandler$1.write(SafeContentHandler.java:82)
at
org.apache.tika.sax.SafeContentHandler.filter(SafeContentHandler.java:140)
at
org.apache.tika.sax.SafeContentHandler.characters(SafeContentHandler.java:287)
at
org.apache.tika.sax.XHTMLContentHandler.characters(XHTMLContentHandler.java:279)
at
org.apache.tika.sax.XHTMLContentHandler.characters(XHTMLContentHandler.java:306)
at org.apache.tika.parser.microsoft.TextCell.render(TextCell.java:34)
at
org.apache.tika.parser.microsoft.ExcelExtractor$TikaHSSFListener.processSheet(ExcelExtractor.java:585)
at
org.apache.tika.parser.microsoft.ExcelExtractor$TikaHSSFListener.internalProcessRecord(ExcelExtractor.java:385)
at
org.apache.tika.parser.microsoft.ExcelExtractor$TikaHSSFListener.processRecord(ExcelExtractor.java:336)
at
org.apache.poi.hssf.eventusermodel.FormatTrackingHSSFListener.processRecord(FormatTrackingHSSFListener.java:92)
at
org.apache.poi.hssf.eventusermodel.HSSFRequest.processRecord(HSSFRequest.java:109)
at
org.apache.poi.hssf.eventusermodel.HSSFEventFactory.genericProcessEvents(HSSFEventFactory.java:179)
at
org.apache.poi.hssf.eventusermodel.HSSFEventFactory.processEvents(HSSFEventFactory.java:136)
at
org.apache.tika.parser.microsoft.ExcelExtractor$TikaHSSFListener.processFile(ExcelExtractor.java:312)
at
org.apache.tika.parser.microsoft.ExcelExtractor.parse(ExcelExtractor.java:169)
at
org.apache.tika.parser.microsoft.OfficeParser.parse(OfficeParser.java:177)
at
org.apache.tika.parser.microsoft.OfficeParser.parse(OfficeParser.java:130)
at
org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:280)
at
org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:280)
at
org.apache.tika.parser.AutoDetectParser.parse(AutoDetectParser.java:120)
... 29 more
{noformat}
--
This message was sent by Atlassian JIRA
(v6.3.4#6332)