Hi
I am trying to create a very simple Boilerpipe parser for Tika.
The Scala class is:
/**
 * HTML parser that passes the parse events through a Boilerpipe handler.
 *
 * Every call to `parse` wraps the caller's handler in a new
 * `BoilerpipeContentHandler` built with `extractor`, then delegates to
 * `HtmlParser.parse`.
 *
 * @param extractor Boilerpipe extractor given to each
 *                  `BoilerpipeContentHandler` this parser creates
 */
class BoilerPipeParser(val extractor: BoilerpipeExtractor) extends HtmlParser {

  /** Creates a parser that uses `DefaultExtractor.INSTANCE`. */
  def this() = this(DefaultExtractor.INSTANCE)

  /**
   * Parses `stream` with the inherited HTML parser, sending its output to
   * `handler` wrapped in a `BoilerpipeContentHandler`.
   *
   * @param stream   input to parse; passed unchanged to `HtmlParser.parse`
   * @param handler  downstream handler that the Boilerpipe handler wraps
   * @param metadata passed unchanged to `HtmlParser.parse`
   * @param context  passed unchanged to `HtmlParser.parse`
   */
  override def parse(
      stream: InputStream,
      handler: ContentHandler,
      metadata: Metadata,
      context: ParseContext
  ): Unit = {
    // Explicit `: Unit =` replaces the deprecated procedure syntax.
    val boilerpipeHandler = new BoilerpipeContentHandler(handler, extractor)
    super.parse(stream, boilerpipeHandler, metadata, context)
  }
}
I then call parse on a Tika instance that is configured with the following
AutoDetectParser:
// Default auto-detecting parser; it supplies the detector and is also passed
// on as the first parser of the composite built below.
val generic = new AutoDetectParser
val detector = generic.getDetector
// NOTE(review): presumably limits the Boilerpipe parser to htmlMediaTypes;
// confirm against the Tika ParserDecorator documentation.
val boilerpipeDecorator =
  ParserDecorator.withTypes(BoilerPipeParser.defaultExtractor, htmlMediaTypes)
new AutoDetectParser(detector, generic, boilerpipeDecorator)
Invoking: tika.parse(stream, meta)
generates the following exception
[ERROR] [07/27/2012 14:49:04.189]
[pipeline-akka.actor.default-dispatcher-6] [akka://pipeline/user/$c]
java.io.IOException:
at org.apache.tika.parser.ParsingReader.read(ParsingReader.java:260)
at java.io.BufferedReader.fill(BufferedReader.java:154)
at java.io.BufferedReader.readLine(BufferedReader.java:317)
at java.io.BufferedReader.readLine(BufferedReader.java:382)
at com.eligotech.common.io.package$.readLine(package.scala:74)
at com.eligotech.common.io.package$.readLines(package.scala:79)
at com.eligotech.common.io.package$.readAllLines(package.scala:84)
at com.eligotech.samiksa.enhancers.tika.TextExtractor.com$eligotech$samiksa$enhancers$tika$TextExtractor$$extractText(TextExtractor.scala:28)
at
com.eligotech.samiksa.enhancers.tika.TextExtractor$$anonfun$extractText$1$$anonfun$apply$2.apply(TextExtractor.scala:
23)
at
com.eligotech.samiksa.enhancers.tika.TextExtractor$$anonfun$extractText$1$$anonfun$apply$2.apply(TextExtractor.scala:
23)
at
scala.util.control.Exception$Catch$$anonfun$either$1.apply(Exception.scala:110)
at
scala.util.control.Exception$Catch$$anonfun$either$1.apply(Exception.scala:110)
at scala.util.control.Exception$Catch.apply(Exception.scala:88)
at scala.util.control.Exception$Catch.either(Exception.scala:110)
at com.eligotech.samiksa.package$.expect(package.scala:17)
at
com.eligotech.samiksa.enhancers.tika.TextExtractor$$anonfun$extractText$1.apply(TextExtractor.scala:23)
at
com.eligotech.samiksa.enhancers.tika.TextExtractor$$anonfun$extractText$1.apply(TextExtractor.scala:22)
at com.eligotech.common.package$.closing(package.scala:7)
at
com.eligotech.samiksa.enhancers.tika.TextExtractor.extractText(TextExtractor.scala:22)
at
com.eligotech.samiksa.enhancers.tika.TextExtractor.apply(TextExtractor.scala:16)
at
com.eligotech.samiksa.enhancers.tika.TextExtractor.apply(TextExtractor.scala:15)
at
com.eligotech.samiksa.pipelines.akka.AkkaPipelinesBuilder$$anonfun$process$1.apply(AkkaPipelinesBuilder.scala:98)
at
com.eligotech.samiksa.pipelines.akka.AkkaPipelinesBuilder$$anonfun$process$1.apply(AkkaPipelinesBuilder.scala:98)
at
com.eligotech.samiksa.pipelines.akka.AkkaPipelinesBuilder$AkkaProcessor$$anonfun$receive$1.apply(AkkaPipelinesBuilder
.scala:40)
at
com.eligotech.samiksa.pipelines.akka.AkkaPipelinesBuilder$AkkaProcessor$$anonfun$receive$1.apply(AkkaPipelinesBuilder
.scala:39)
at akka.actor.Actor$class.apply(Actor.scala:318)
at
com.eligotech.samiksa.pipelines.akka.AkkaPipelinesBuilder$AkkaProcessor.apply(AkkaPipelinesBuilder.scala:25)
at akka.actor.ActorCell.invoke(ActorCell.scala:626)
at akka.dispatch.Mailbox.processMailbox(Mailbox.scala:197)
at akka.dispatch.Mailbox.run(Mailbox.scala:179)
at
akka.dispatch.ForkJoinExecutorConfigurator$MailboxExecutionTask.exec(AbstractDispatcher.scala:516)
at akka.jsr166y.ForkJoinTask.doExec(ForkJoinTask.java:259)
at
akka.jsr166y.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:975)
at akka.jsr166y.ForkJoinPool.runWorker(ForkJoinPool.java:1479)
at
akka.jsr166y.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:104)
Caused by: org.apache.tika.exception.TikaException: Zip bomb detected!
at
org.apache.tika.sax.SecureContentHandler.throwIfCauseOf(SecureContentHandler.java:192)
at
org.apache.tika.parser.AutoDetectParser.parse(AutoDetectParser.java:123)
at
org.apache.tika.parser.ParsingReader$ParsingTask.run(ParsingReader.java:221)
at java.lang.Thread.run(Thread.java:722)
Caused by: org.apache.tika.sax.SecureContentHandler$SecureSAXException:
Suspected zip bomb: 100 levels of XML element nesting
at
org.apache.tika.sax.SecureContentHandler.startElement(SecureContentHandler.java:234)
at
org.apache.tika.sax.ContentHandlerDecorator.startElement(ContentHandlerDecorator.java:126)
at
org.apache.tika.parser.html.BoilerpipeContentHandler.startElement(BoilerpipeContentHandler.java:205)
at
org.apache.tika.sax.ContentHandlerDecorator.startElement(ContentHandlerDecorator.java:126)
at
org.apache.tika.sax.SafeContentHandler.startElement(SafeContentHandler.java:264)
at
org.apache.tika.sax.XHTMLContentHandler.startElement(XHTMLContentHandler.java:245)
at
org.apache.tika.sax.XHTMLContentHandler.startElement(XHTMLContentHandler.java:275)
at
org.apache.tika.parser.html.HtmlHandler.startElementWithSafeAttributes(HtmlHandler.java:169)
at
org.apache.tika.parser.html.HtmlHandler.startElement(HtmlHandler.java:129)
at
org.apache.tika.sax.ContentHandlerDecorator.startElement(ContentHandlerDecorator.java:126)
at
org.apache.tika.parser.html.XHTMLDowngradeHandler.startElement(XHTMLDowngradeHandler.java:61)
at org.ccil.cowan.tagsoup.Parser.push(Parser.java:794)
at org.ccil.cowan.tagsoup.Parser.rectify(Parser.java:1061)
at org.ccil.cowan.tagsoup.Parser.stagc(Parser.java:1016)
at org.ccil.cowan.tagsoup.HTMLScanner.scan(HTMLScanner.java:567)
at org.ccil.cowan.tagsoup.Parser.parse(Parser.java:449)
at org.apache.tika.parser.html.HtmlParser.parse(HtmlParser.java:104)
at
com.eligotech.samiksa.enhancers.tika.BoilerPipeParser.parse(BoilerPipeParser.scala:18)
at
org.apache.tika.parser.ParserDecorator.parse(ParserDecorator.java:91)
at
org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:242)
at
org.apache.tika.parser.AutoDetectParser.parse(AutoDetectParser.java:120)
... 2 more
Did I miss something?
Kind regards
Mark