Author: jukka
Date: Thu Jan 15 15:52:48 2009
New Revision: 734861
URL: http://svn.apache.org/viewvc?rev=734861&view=rev
Log:
TIKA-185: XML files with (unsatisfied) SYSTEM entities can not be extracted
Implemented an OfflineContentHandler decorator class that makes the XML parser
silently ignore all external entities (they are replaced by an empty string).
Use the OfflineContentHandler in the XML and OpenOffice parsers. Also enable
the secure XML parsing feature.
Added:
lucene/tika/trunk/src/main/java/org/apache/tika/sax/OfflineContentHandler.java
lucene/tika/trunk/src/test/java/org/apache/tika/sax/OfflineContentHandlerTest.java
Modified:
lucene/tika/trunk/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeContentParser.java
lucene/tika/trunk/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeMetaParser.java
lucene/tika/trunk/src/main/java/org/apache/tika/parser/xml/DcXMLParser.java
lucene/tika/trunk/src/main/java/org/apache/tika/parser/xml/XMLParser.java
Modified:
lucene/tika/trunk/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeContentParser.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeContentParser.java?rev=734861&r1=734860&r2=734861&view=diff
==============================================================================
---
lucene/tika/trunk/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeContentParser.java
(original)
+++
lucene/tika/trunk/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeContentParser.java
Thu Jan 15 15:52:48 2009
@@ -23,6 +23,7 @@
import java.util.Collections;
import java.util.BitSet;
+import javax.xml.XMLConstants;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
@@ -33,6 +34,7 @@
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.ElementMappingContentHandler;
+import org.apache.tika.sax.OfflineContentHandler;
import org.apache.tika.sax.ElementMappingContentHandler.TargetElement;
import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.ContentHandler;
@@ -259,10 +261,12 @@
SAXParserFactory factory = SAXParserFactory.newInstance();
factory.setValidating(false);
factory.setNamespaceAware(true);
+ factory.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true);
SAXParser parser = factory.newSAXParser();
parser.parse(
new CloseShieldInputStream(stream),
- new NSNormalizerContentHandler(dh));
+ new OfflineContentHandler(
+ new NSNormalizerContentHandler(dh)));
} catch (ParserConfigurationException e) {
throw new TikaException("XML parser configuration error", e);
}
Modified:
lucene/tika/trunk/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeMetaParser.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeMetaParser.java?rev=734861&r1=734860&r2=734861&view=diff
==============================================================================
---
lucene/tika/trunk/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeMetaParser.java
(original)
+++
lucene/tika/trunk/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeMetaParser.java
Thu Jan 15 15:52:48 2009
@@ -25,7 +25,6 @@
import org.apache.tika.sax.xpath.MatchingContentHandler;
import org.apache.tika.sax.xpath.XPathParser;
import org.xml.sax.ContentHandler;
-import org.xml.sax.helpers.DefaultHandler;
/**
* Parser for OpenDocument <code>meta.xml</code> files.
@@ -35,7 +34,7 @@
private static final XPathParser META_XPATH = new XPathParser(
"meta", "urn:oasis:names:tc:opendocument:xmlns:meta:1.0");
- private static DefaultHandler getMeta(
+ private static ContentHandler getMeta(
ContentHandler ch, Metadata md, String name, String element) {
Matcher matcher = new CompositeMatcher(
META_XPATH.parse("//meta:" + element),
@@ -45,7 +44,7 @@
return new TeeContentHandler(ch, branch);
}
- private static DefaultHandler getStatistic(
+ private static ContentHandler getStatistic(
ContentHandler ch, Metadata md, String name, String attribute) {
Matcher matcher =
META_XPATH.parse("//meta:document-statistic/@meta:" + attribute);
@@ -54,19 +53,19 @@
return new TeeContentHandler(ch, branch);
}
- protected DefaultHandler getDefaultHandler(ContentHandler ch, Metadata md)
{
- DefaultHandler dh = super.getDefaultHandler(ch, md);
- dh = getMeta(dh, md, Metadata.KEYWORDS, "keyword");
- dh = getMeta(dh, md, "generator", "generator");
- dh = getStatistic(dh, md, "nbTab", "table-count");
- dh = getStatistic(dh, md, "nbObject", "object-count");
- dh = getStatistic(dh, md, "nbImg", "image-count");
- dh = getStatistic(dh, md, "nbPage", "page-count");
- dh = getStatistic(dh, md, "nbPara", "paragraph-count");
- dh = getStatistic(dh, md, "nbWord", "word-count");
- dh = getStatistic(dh, md, "nbCharacter", "character-count");
- dh = new NSNormalizerContentHandler(dh);
- return dh;
+ protected ContentHandler getContentHandler(ContentHandler ch, Metadata md)
{
+ ch = super.getContentHandler(ch, md);
+ ch = getMeta(ch, md, Metadata.KEYWORDS, "keyword");
+ ch = getMeta(ch, md, "generator", "generator");
+ ch = getStatistic(ch, md, "nbTab", "table-count");
+ ch = getStatistic(ch, md, "nbObject", "object-count");
+ ch = getStatistic(ch, md, "nbImg", "image-count");
+ ch = getStatistic(ch, md, "nbPage", "page-count");
+ ch = getStatistic(ch, md, "nbPara", "paragraph-count");
+ ch = getStatistic(ch, md, "nbWord", "word-count");
+ ch = getStatistic(ch, md, "nbCharacter", "character-count");
+ ch = new NSNormalizerContentHandler(ch);
+ return ch;
}
}
Modified:
lucene/tika/trunk/src/main/java/org/apache/tika/parser/xml/DcXMLParser.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/src/main/java/org/apache/tika/parser/xml/DcXMLParser.java?rev=734861&r1=734860&r2=734861&view=diff
==============================================================================
--- lucene/tika/trunk/src/main/java/org/apache/tika/parser/xml/DcXMLParser.java
(original)
+++ lucene/tika/trunk/src/main/java/org/apache/tika/parser/xml/DcXMLParser.java
Thu Jan 15 15:52:48 2009
@@ -24,7 +24,6 @@
import org.apache.tika.sax.xpath.MatchingContentHandler;
import org.apache.tika.sax.xpath.XPathParser;
import org.xml.sax.ContentHandler;
-import org.xml.sax.helpers.DefaultHandler;
/**
* Dublin Core metadata parser
@@ -34,7 +33,7 @@
private static final XPathParser DC_XPATH = new XPathParser(
"dc", "http://purl.org/dc/elements/1.1/");
- private static DefaultHandler getDublinCore(
+ private static ContentHandler getDublinCore(
ContentHandler ch, Metadata md, String name, String element) {
Matcher matcher = new CompositeMatcher(
DC_XPATH.parse("//dc:" + element),
@@ -44,21 +43,21 @@
return new TeeContentHandler(ch, branch);
}
- protected DefaultHandler getDefaultHandler(ContentHandler ch, Metadata md)
{
- DefaultHandler dh = super.getDefaultHandler(ch, md);
- dh = getDublinCore(dh, md, DublinCore.TITLE, "title");
- dh = getDublinCore(dh, md, DublinCore.SUBJECT, "subject");
- dh = getDublinCore(dh, md, DublinCore.CREATOR, "creator");
- dh = getDublinCore(dh, md, DublinCore.DESCRIPTION, "description");
- dh = getDublinCore(dh, md, DublinCore.PUBLISHER, "publisher");
- dh = getDublinCore(dh, md, DublinCore.CONTRIBUTOR, "contributor");
- dh = getDublinCore(dh, md, DublinCore.DATE, "date");
- dh = getDublinCore(dh, md, DublinCore.TYPE, "type");
- dh = getDublinCore(dh, md, DublinCore.FORMAT, "format");
- dh = getDublinCore(dh, md, DublinCore.IDENTIFIER, "identifier");
- dh = getDublinCore(dh, md, DublinCore.LANGUAGE, "language");
- dh = getDublinCore(dh, md, DublinCore.RIGHTS, "rights");
- return dh;
+ protected ContentHandler getContentHandler(ContentHandler ch, Metadata md)
{
+ ch = super.getContentHandler(ch, md);
+ ch = getDublinCore(ch, md, DublinCore.TITLE, "title");
+ ch = getDublinCore(ch, md, DublinCore.SUBJECT, "subject");
+ ch = getDublinCore(ch, md, DublinCore.CREATOR, "creator");
+ ch = getDublinCore(ch, md, DublinCore.DESCRIPTION, "description");
+ ch = getDublinCore(ch, md, DublinCore.PUBLISHER, "publisher");
+ ch = getDublinCore(ch, md, DublinCore.CONTRIBUTOR, "contributor");
+ ch = getDublinCore(ch, md, DublinCore.DATE, "date");
+ ch = getDublinCore(ch, md, DublinCore.TYPE, "type");
+ ch = getDublinCore(ch, md, DublinCore.FORMAT, "format");
+ ch = getDublinCore(ch, md, DublinCore.IDENTIFIER, "identifier");
+ ch = getDublinCore(ch, md, DublinCore.LANGUAGE, "language");
+ ch = getDublinCore(ch, md, DublinCore.RIGHTS, "rights");
+ return ch;
}
}
Modified:
lucene/tika/trunk/src/main/java/org/apache/tika/parser/xml/XMLParser.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/src/main/java/org/apache/tika/parser/xml/XMLParser.java?rev=734861&r1=734860&r2=734861&view=diff
==============================================================================
--- lucene/tika/trunk/src/main/java/org/apache/tika/parser/xml/XMLParser.java
(original)
+++ lucene/tika/trunk/src/main/java/org/apache/tika/parser/xml/XMLParser.java
Thu Jan 15 15:52:48 2009
@@ -19,6 +19,7 @@
import java.io.IOException;
import java.io.InputStream;
+import javax.xml.XMLConstants;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
@@ -27,11 +28,11 @@
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.OfflineContentHandler;
import org.apache.tika.sax.TextContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
-import org.xml.sax.helpers.DefaultHandler;
/**
* XML parser
@@ -53,10 +54,12 @@
try {
SAXParserFactory factory = SAXParserFactory.newInstance();
factory.setNamespaceAware(true);
+ factory.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true);
SAXParser parser = factory.newSAXParser();
parser.parse(
new CloseShieldInputStream(stream),
- getDefaultHandler(handler, metadata));
+ new OfflineContentHandler(
+ getContentHandler(handler, metadata)));
} catch (ParserConfigurationException e) {
throw new TikaException("XML parser configuration error", e);
}
@@ -65,7 +68,7 @@
xhtml.endDocument();
}
- protected DefaultHandler getDefaultHandler(
+ protected ContentHandler getContentHandler(
ContentHandler handler, Metadata metadata) {
return new TextContentHandler(handler);
}
Added:
lucene/tika/trunk/src/main/java/org/apache/tika/sax/OfflineContentHandler.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/src/main/java/org/apache/tika/sax/OfflineContentHandler.java?rev=734861&view=auto
==============================================================================
---
lucene/tika/trunk/src/main/java/org/apache/tika/sax/OfflineContentHandler.java
(added)
+++
lucene/tika/trunk/src/main/java/org/apache/tika/sax/OfflineContentHandler.java
Thu Jan 15 15:52:48 2009
@@ -0,0 +1,45 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.sax;
+
+import org.apache.commons.io.input.ClosedInputStream;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.InputSource;
+
+/**
+ * Content handler decorator that always returns an empty stream from the
+ * {@link #resolveEntity(String, String)} method to prevent potential
+ * network or other external resources from being accessed by an XML parser.
+ *
+ * @see <a href="https://issues.apache.org/jira/browse/TIKA-185">TIKA-185</a>
+ */
+public class OfflineContentHandler extends ContentHandlerDecorator {
+
+ /**
+ * Creates an offline decorator around the given content handler.
+ *
+ * @param handler the content handler to decorate; it is passed
+ * unchanged to the superclass constructor
+ */
+ public OfflineContentHandler(ContentHandler handler) {
+ super(handler);
+ }
+
+ /**
+ * Returns an empty stream. This will make an XML parser silently
+ * ignore any external entities.
+ *
+ * @param publicId public identifier of the requested entity; not used
+ * @param systemId system identifier of the requested entity; not used
+ * @return a new input source that wraps the shared
+ * {@link ClosedInputStream#CLOSED_INPUT_STREAM} instance
+ */
+ @Override
+ public InputSource resolveEntity(String publicId, String systemId) {
+ return new InputSource(ClosedInputStream.CLOSED_INPUT_STREAM);
+ }
+
+}
Added:
lucene/tika/trunk/src/test/java/org/apache/tika/sax/OfflineContentHandlerTest.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/src/test/java/org/apache/tika/sax/OfflineContentHandlerTest.java?rev=734861&view=auto
==============================================================================
---
lucene/tika/trunk/src/test/java/org/apache/tika/sax/OfflineContentHandlerTest.java
(added)
+++
lucene/tika/trunk/src/test/java/org/apache/tika/sax/OfflineContentHandlerTest.java
Thu Jan 15 15:52:48 2009
@@ -0,0 +1,66 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.sax;
+
+import java.io.StringReader;
+import java.net.ConnectException;
+
+import javax.xml.parsers.SAXParser;
+import javax.xml.parsers.SAXParserFactory;
+
+import junit.framework.TestCase;
+
+import org.xml.sax.InputSource;
+import org.xml.sax.helpers.DefaultHandler;
+
+/**
+ * Unit tests for the {@link OfflineContentHandler} class.
+ */
+public class OfflineContentHandlerTest extends TestCase {
+
+ /** SAX parser created from the default factory before each test. */
+ private SAXParser parser;
+
+ /** Handler under test, decorating a plain {@link DefaultHandler}. */
+ private DefaultHandler offline;
+
+ /**
+ * Creates a fresh parser and a fresh offline handler for every test.
+ *
+ * @throws Exception if the SAX parser can not be created
+ */
+ @Override
+ protected void setUp() throws Exception {
+ parser = SAXParserFactory.newInstance().newSAXParser();
+ offline = new OfflineContentHandler(new DefaultHandler());
+ }
+
+ /**
+ * Parses a document whose DOCTYPE declares an external SYSTEM DTD and
+ * fails if the parser reports a {@link ConnectException}, i.e. if it
+ * tried to open a connection to the DTD location.
+ *
+ * @throws Exception if parsing fails for any other reason
+ */
+ public void testExternalDTD() throws Exception {
+ // NOTE(review): the URL presumably points at a loopback address and
+ // port where nothing is listening -- confirm.
+ String xml =
+ "<!DOCTYPE foo SYSTEM \"http://127.234.172.38:7845/bar\"><foo/>";
+ try {
+ parser.parse(new InputSource(new StringReader(xml)), offline);
+ } catch (ConnectException e) {
+ fail("Parser tried to access the external DTD:" + e);
+ }
+ }
+
+ /**
+ * Parses a document that declares and references an external SYSTEM
+ * entity and fails if the parser reports a {@link ConnectException},
+ * i.e. if it tried to open a connection to the entity location.
+ *
+ * @throws Exception if parsing fails for any other reason
+ */
+ public void testExternalEntity() throws Exception {
+ String xml =
+ "<!DOCTYPE foo ["
+ + " <!ENTITY bar SYSTEM \"http://127.234.172.38:7845/bar\">"
+ + " ]><foo>&bar;</foo>";
+ try {
+ parser.parse(new InputSource(new StringReader(xml)), offline);
+ } catch (ConnectException e) {
+ // This test covers an external entity, not a DTD, so say so.
+ fail("Parser tried to access the external entity:" + e);
+ }
+ }
+
+}