[
https://issues.apache.org/jira/browse/TIKA-2955?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
]
Luke Butters updated TIKA-2955:
-------------------------------
Description:
Hi, I am trying to parse: [^314.pdf]
When I try to convert it to XHTML, my XML parser fails with the following
error:
{code}
14:35:12.876 [main] ERROR com.funnelback.common.filter.TikaFilterProvider -
Unable to filter stream with document type '.pdf'
org.xml.sax.SAXException: net.sf.saxon.trans.XPathException: Illegal HTML
character: decimal 147
at
net.sf.saxon.event.ReceivingContentHandler.endElement(ReceivingContentHandler.java:538)
~[Saxon-HE-9.9.0-2.jar:?]
at
org.apache.tika.sax.ContentHandlerDecorator.endElement(ContentHandlerDecorator.java:136)
~[tika-core-1.19.1.jar:1.19.1]
at
org.apache.tika.sax.SecureContentHandler.endElement(SecureContentHandler.java:256)
~[tika-core-1.19.1.jar:1.19.1]
at
org.apache.tika.sax.ContentHandlerDecorator.endElement(ContentHandlerDecorator.java:136)
~[tika-core-1.19.1.jar:1.19.1]
at
org.apache.tika.sax.ContentHandlerDecorator.endElement(ContentHandlerDecorator.java:136)
~[tika-core-1.19.1.jar:1.19.1]
at
org.apache.tika.sax.ContentHandlerDecorator.endElement(ContentHandlerDecorator.java:136)
~[tika-core-1.19.1.jar:1.19.1]
at
org.apache.tika.sax.SafeContentHandler.endElement(SafeContentHandler.java:274)
~[tika-core-1.19.1.jar:1.19.1]
at
org.apache.tika.sax.XHTMLContentHandler.endDocument(XHTMLContentHandler.java:229)
~[tika-core-1.19.1.jar:1.19.1]
at
org.apache.tika.parser.pdf.AbstractPDF2XHTML.endDocument(AbstractPDF2XHTML.java:556)
~[tika-parsers-1.19.1.jar:1.19.1]
at org.apache.pdfbox.text.PDFTextStripper.writeText(PDFTextStripper.java:267)
~[pdfbox-2.0.12.jar:2.0.12]
at org.apache.tika.parser.pdf.PDF2XHTML.process(PDF2XHTML.java:117)
~[tika-parsers-1.19.1.jar:1.19.1]
at org.apache.tika.parser.pdf.PDFParser.parse(PDFParser.java:172)
~[tika-parsers-1.19.1.jar:1.19.1]
at org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:280)
~[tika-core-1.19.1.jar:1.19.1]
at org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:280)
~[tika-core-1.19.1.jar:1.19.1]
at org.apache.tika.parser.AutoDetectParser.parse(AutoDetectParser.java:143)
~[tika-core-1.19.1.jar:1.19.1]
at
[removed section of trace]
Caused by: net.sf.saxon.trans.XPathException: Illegal HTML character: decimal
147
at net.sf.saxon.serialize.HTMLEmitter.writeEscape(HTMLEmitter.java:379)
~[Saxon-HE-9.9.0-2.jar:?]
at net.sf.saxon.serialize.XMLEmitter.characters(XMLEmitter.java:662)
~[Saxon-HE-9.9.0-2.jar:?]
at net.sf.saxon.serialize.HTMLEmitter.characters(HTMLEmitter.java:441)
~[Saxon-HE-9.9.0-2.jar:?]
at net.sf.saxon.serialize.HTMLIndenter.characters(HTMLIndenter.java:216)
~[Saxon-HE-9.9.0-2.jar:?]
at net.sf.saxon.event.ProxyReceiver.characters(ProxyReceiver.java:193)
~[Saxon-HE-9.9.0-2.jar:?]
at net.sf.saxon.event.ProxyReceiver.characters(ProxyReceiver.java:193)
~[Saxon-HE-9.9.0-2.jar:?]
at net.sf.saxon.event.ProxyReceiver.characters(ProxyReceiver.java:193)
~[Saxon-HE-9.9.0-2.jar:?]
at
net.sf.saxon.event.SequenceNormalizer.characters(SequenceNormalizer.java:183)
~[Saxon-HE-9.9.0-2.jar:?]
at
net.sf.saxon.event.ReceivingContentHandler.flush(ReceivingContentHandler.java:646)
~[Saxon-HE-9.9.0-2.jar:?]
at
net.sf.saxon.event.ReceivingContentHandler.endElement(ReceivingContentHandler.java:526)
~[Saxon-HE-9.9.0-2.jar:?]
... 43 more
{code}
It looks like Tika is asking the XML library to handle character 147 (i.e. 0x93),
which is not allowed in HTML.
The Saxon XML library rejects that character. I think the default Java one
doesn't complain when given the invalid character; however, Tika is
probably wrong to write out that character when writing XHTML.
was:
Hi, I am trying to parse: [^314.pdf]
what is happening when I try to convert it to XHTML is my XML parser fails
because:
{code}
14:35:12.876 [main] ERROR com.funnelback.common.filter.TikaFilterProvider -
Unable to filter stream with document type '.pdf'
org.xml.sax.SAXException: net.sf.saxon.trans.XPathException: Illegal HTML
character: decimal 147
at
net.sf.saxon.event.ReceivingContentHandler.endElement(ReceivingContentHandler.java:538)
~[Saxon-HE-9.9.0-2.jar:?]
at
org.apache.tika.sax.ContentHandlerDecorator.endElement(ContentHandlerDecorator.java:136)
~[tika-core-1.19.1.jar:1.19.1]
at
org.apache.tika.sax.SecureContentHandler.endElement(SecureContentHandler.java:256)
~[tika-core-1.19.1.jar:1.19.1]
at
org.apache.tika.sax.ContentHandlerDecorator.endElement(ContentHandlerDecorator.java:136)
~[tika-core-1.19.1.jar:1.19.1]
at
org.apache.tika.sax.ContentHandlerDecorator.endElement(ContentHandlerDecorator.java:136)
~[tika-core-1.19.1.jar:1.19.1]
at
org.apache.tika.sax.ContentHandlerDecorator.endElement(ContentHandlerDecorator.java:136)
~[tika-core-1.19.1.jar:1.19.1]
at
org.apache.tika.sax.SafeContentHandler.endElement(SafeContentHandler.java:274)
~[tika-core-1.19.1.jar:1.19.1]
at
org.apache.tika.sax.XHTMLContentHandler.endDocument(XHTMLContentHandler.java:229)
~[tika-core-1.19.1.jar:1.19.1]
at
org.apache.tika.parser.pdf.AbstractPDF2XHTML.endDocument(AbstractPDF2XHTML.java:556)
~[tika-parsers-1.19.1.jar:1.19.1]
at org.apache.pdfbox.text.PDFTextStripper.writeText(PDFTextStripper.java:267)
~[pdfbox-2.0.12.jar:2.0.12]
at org.apache.tika.parser.pdf.PDF2XHTML.process(PDF2XHTML.java:117)
~[tika-parsers-1.19.1.jar:1.19.1]
at org.apache.tika.parser.pdf.PDFParser.parse(PDFParser.java:172)
~[tika-parsers-1.19.1.jar:1.19.1]
at org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:280)
~[tika-core-1.19.1.jar:1.19.1]
at org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:280)
~[tika-core-1.19.1.jar:1.19.1]
at org.apache.tika.parser.AutoDetectParser.parse(AutoDetectParser.java:143)
~[tika-core-1.19.1.jar:1.19.1]
at
[removed section of trace]
Caused by: net.sf.saxon.trans.XPathException: Illegal HTML character: decimal
147
at net.sf.saxon.serialize.HTMLEmitter.writeEscape(HTMLEmitter.java:379)
~[Saxon-HE-9.9.0-2.jar:?]
at net.sf.saxon.serialize.XMLEmitter.characters(XMLEmitter.java:662)
~[Saxon-HE-9.9.0-2.jar:?]
at net.sf.saxon.serialize.HTMLEmitter.characters(HTMLEmitter.java:441)
~[Saxon-HE-9.9.0-2.jar:?]
at net.sf.saxon.serialize.HTMLIndenter.characters(HTMLIndenter.java:216)
~[Saxon-HE-9.9.0-2.jar:?]
at net.sf.saxon.event.ProxyReceiver.characters(ProxyReceiver.java:193)
~[Saxon-HE-9.9.0-2.jar:?]
at net.sf.saxon.event.ProxyReceiver.characters(ProxyReceiver.java:193)
~[Saxon-HE-9.9.0-2.jar:?]
at net.sf.saxon.event.ProxyReceiver.characters(ProxyReceiver.java:193)
~[Saxon-HE-9.9.0-2.jar:?]
at
net.sf.saxon.event.SequenceNormalizer.characters(SequenceNormalizer.java:183)
~[Saxon-HE-9.9.0-2.jar:?]
at
net.sf.saxon.event.ReceivingContentHandler.flush(ReceivingContentHandler.java:646)
~[Saxon-HE-9.9.0-2.jar:?]
at
net.sf.saxon.event.ReceivingContentHandler.endElement(ReceivingContentHandler.java:526)
~[Saxon-HE-9.9.0-2.jar:?]
... 43 more
[code}
It looks like tika is asking the XML library to handle chracter 147 ie 0x93
which is not allowed in HTML.
This saxon XML library is not happy with that, I think the default java one
doesn't complain when given the invalid character though, however tika is
probably wrong to write out that character when writing XHTML.
> PDF parsing to XHTML results in tika attempting to write invalid HTML
> characters.
> ---------------------------------------------------------------------------------
>
> Key: TIKA-2955
> URL: https://issues.apache.org/jira/browse/TIKA-2955
> Project: Tika
> Issue Type: Bug
> Reporter: Luke Butters
> Priority: Major
> Attachments: 314.pdf
>
>
> Hi, I am trying to parse: [^314.pdf]
> When I try to convert it to XHTML, my XML parser fails with the following
> error:
> {code}
> 14:35:12.876 [main] ERROR com.funnelback.common.filter.TikaFilterProvider -
> Unable to filter stream with document type '.pdf'
> org.xml.sax.SAXException: net.sf.saxon.trans.XPathException: Illegal HTML
> character: decimal 147
> at
> net.sf.saxon.event.ReceivingContentHandler.endElement(ReceivingContentHandler.java:538)
> ~[Saxon-HE-9.9.0-2.jar:?]
> at
> org.apache.tika.sax.ContentHandlerDecorator.endElement(ContentHandlerDecorator.java:136)
> ~[tika-core-1.19.1.jar:1.19.1]
> at
> org.apache.tika.sax.SecureContentHandler.endElement(SecureContentHandler.java:256)
> ~[tika-core-1.19.1.jar:1.19.1]
> at
> org.apache.tika.sax.ContentHandlerDecorator.endElement(ContentHandlerDecorator.java:136)
> ~[tika-core-1.19.1.jar:1.19.1]
> at
> org.apache.tika.sax.ContentHandlerDecorator.endElement(ContentHandlerDecorator.java:136)
> ~[tika-core-1.19.1.jar:1.19.1]
> at
> org.apache.tika.sax.ContentHandlerDecorator.endElement(ContentHandlerDecorator.java:136)
> ~[tika-core-1.19.1.jar:1.19.1]
> at
> org.apache.tika.sax.SafeContentHandler.endElement(SafeContentHandler.java:274)
> ~[tika-core-1.19.1.jar:1.19.1]
> at
> org.apache.tika.sax.XHTMLContentHandler.endDocument(XHTMLContentHandler.java:229)
> ~[tika-core-1.19.1.jar:1.19.1]
> at
> org.apache.tika.parser.pdf.AbstractPDF2XHTML.endDocument(AbstractPDF2XHTML.java:556)
> ~[tika-parsers-1.19.1.jar:1.19.1]
> at
> org.apache.pdfbox.text.PDFTextStripper.writeText(PDFTextStripper.java:267)
> ~[pdfbox-2.0.12.jar:2.0.12]
> at org.apache.tika.parser.pdf.PDF2XHTML.process(PDF2XHTML.java:117)
> ~[tika-parsers-1.19.1.jar:1.19.1]
> at org.apache.tika.parser.pdf.PDFParser.parse(PDFParser.java:172)
> ~[tika-parsers-1.19.1.jar:1.19.1]
> at org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:280)
> ~[tika-core-1.19.1.jar:1.19.1]
> at org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:280)
> ~[tika-core-1.19.1.jar:1.19.1]
> at org.apache.tika.parser.AutoDetectParser.parse(AutoDetectParser.java:143)
> ~[tika-core-1.19.1.jar:1.19.1]
> at
> [removed section of trace]
> Caused by: net.sf.saxon.trans.XPathException: Illegal HTML character: decimal
> 147
> at net.sf.saxon.serialize.HTMLEmitter.writeEscape(HTMLEmitter.java:379)
> ~[Saxon-HE-9.9.0-2.jar:?]
> at net.sf.saxon.serialize.XMLEmitter.characters(XMLEmitter.java:662)
> ~[Saxon-HE-9.9.0-2.jar:?]
> at net.sf.saxon.serialize.HTMLEmitter.characters(HTMLEmitter.java:441)
> ~[Saxon-HE-9.9.0-2.jar:?]
> at net.sf.saxon.serialize.HTMLIndenter.characters(HTMLIndenter.java:216)
> ~[Saxon-HE-9.9.0-2.jar:?]
> at net.sf.saxon.event.ProxyReceiver.characters(ProxyReceiver.java:193)
> ~[Saxon-HE-9.9.0-2.jar:?]
> at net.sf.saxon.event.ProxyReceiver.characters(ProxyReceiver.java:193)
> ~[Saxon-HE-9.9.0-2.jar:?]
> at net.sf.saxon.event.ProxyReceiver.characters(ProxyReceiver.java:193)
> ~[Saxon-HE-9.9.0-2.jar:?]
> at
> net.sf.saxon.event.SequenceNormalizer.characters(SequenceNormalizer.java:183)
> ~[Saxon-HE-9.9.0-2.jar:?]
> at
> net.sf.saxon.event.ReceivingContentHandler.flush(ReceivingContentHandler.java:646)
> ~[Saxon-HE-9.9.0-2.jar:?]
> at
> net.sf.saxon.event.ReceivingContentHandler.endElement(ReceivingContentHandler.java:526)
> ~[Saxon-HE-9.9.0-2.jar:?]
> ... 43 more
> {code}
> It looks like Tika is asking the XML library to handle character 147 (i.e. 0x93),
> which is not allowed in HTML.
> The Saxon XML library rejects that character. I think the default Java one
> doesn't complain when given the invalid character; however, Tika is
> probably wrong to write out that character when writing XHTML.
--
This message was sent by Atlassian Jira
(v8.3.4#803005)