This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4135 in repository https://gitbox.apache.org/repos/asf/tika.git
commit d6d13b3d3abf01fa047c3ad8105efadf0dd70cac Author: tallison <talli...@apache.org> AuthorDate: Tue Sep 19 14:11:12 2023 -0400 TIKA-4135 -- remove xerces2 --- tika-bundles/tika-bundle-standard/pom.xml | 9 ----- tika-core/pom.xml | 10 +----- .../java/org/apache/tika/utils/XMLReaderUtils.java | 10 ++++++ .../apache/tika/sax/CustomErrorHandlerTest.java | 2 ++ .../tika/sax/ErrorResistantSAXParserFactory.java | 39 ---------------------- tika-parent/pom.xml | 13 -------- .../tika-parsers-ml/tika-parser-nlp-module/pom.xml | 4 +++ .../tika-parser-microsoft-module/pom.xml | 2 -- .../tika-parser-xml-module/pom.xml | 4 --- .../java/org/apache/tika/parser/XMLTestBase.java | 3 +- 10 files changed, 18 insertions(+), 78 deletions(-) diff --git a/tika-bundles/tika-bundle-standard/pom.xml b/tika-bundles/tika-bundle-standard/pom.xml index db605c044..db6304bae 100644 --- a/tika-bundles/tika-bundle-standard/pom.xml +++ b/tika-bundles/tika-bundle-standard/pom.xml @@ -151,8 +151,6 @@ </Bundle-Activator> <Embed-Dependency>*;scope=compile;artifactId=tika-parsers-standard-package| javax.activation| - xerces| - xercesImpl| commons-compress| xz| commons-codec| @@ -317,10 +315,6 @@ org.apache.tools.ant;resolution:=optional, org.apache.tools.ant.taskdefs;resolution:=optional, org.apache.tools.ant.types;resolution:=optional, - org.apache.xerces.parsers;resolution:=optional, - org.apache.xerces.util;resolution:=optional, - org.apache.xerces.xni;resolution:=optional, - org.apache.xerces.xni.parser;resolution:=optional, org.apache.xml.resolver;resolution:=optional, org.apache.xml.resolver.readers;resolution:=optional, org.apache.xml.resolver.tools;resolution:=optional, @@ -411,9 +405,6 @@ INFO </org.ops4j.pax.logging.DefaultServiceLog.level> </systemPropertyVariables> - <classpathDependencyExcludes> - <classpathDependencyExclude>xerces:xercesImpl</classpathDependencyExclude> - </classpathDependencyExcludes> </configuration> </plugin> <plugin> diff --git a/tika-core/pom.xml b/tika-core/pom.xml index 3680cb8de..58f9fd298 100644 --- a/tika-core/pom.xml +++ b/tika-core/pom.xml @@ -104,12 +104,6 @@ <version>${junit5.version}</version> <scope>test</scope> </dependency> - <dependency> - <groupId>xerces</groupId> - <artifactId>xercesImpl</artifactId> - <version>2.12.2</version> - <scope>test</scope> -</dependency> </dependencies> <build> @@ -156,9 +150,7 @@ org.apache.tika.config.TikaActivator </Bundle-Activator> <Bundle-ActivationPolicy>lazy</Bundle-ActivationPolicy> - <Import-Package>!sun.misc,org.apache.xerces.util;resolution:=optional, - org.apache.commons.io.*;version="[2,3)", - *</Import-Package> + <Import-Package>!org.apache.commons.io.*;version="[2,3)",*</Import-Package> <Export-Package> org.apache.tika.* </Export-Package> diff --git a/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java b/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java index 53ebc154e..262ebfef9 100644 --- a/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java +++ b/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java @@ -215,6 +215,9 @@ public class XMLReaderUtils implements Serializable { */ public static SAXParserFactory getSAXParserFactory() { SAXParserFactory factory = SAXParserFactory.newInstance(); + if (LOG.isDebugEnabled()) { + LOG.debug("SAXParserFactory class {}", factory.getClass()); + } factory.setNamespaceAware(true); factory.setValidating(false); trySetSAXFeature(factory, XMLConstants.FEATURE_SECURE_PROCESSING, true); @@ -241,6 +244,10 @@ public class XMLReaderUtils implements Serializable { public static DocumentBuilderFactory getDocumentBuilderFactory() { //borrowed from Apache POI DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); + if (LOG.isDebugEnabled()) { + LOG.debug("DocumentBuilderFactory class {}", factory.getClass()); + } + factory.setExpandEntityReferences(false); factory.setNamespaceAware(true); factory.setValidating(false); @@ -290,6 +297,9 @@ public class XMLReaderUtils implements Serializable { */ public static XMLInputFactory getXMLInputFactory() { XMLInputFactory factory = XMLInputFactory.newFactory(); + if (LOG.isDebugEnabled()) { + LOG.debug("XMLInputFactory class {}", factory.getClass()); + } tryToSetStaxProperty(factory, XMLInputFactory.IS_NAMESPACE_AWARE, true); tryToSetStaxProperty(factory, XMLInputFactory.IS_VALIDATING, false); diff --git a/tika-core/src/test/java/org/apache/tika/sax/CustomErrorHandlerTest.java b/tika-core/src/test/java/org/apache/tika/sax/CustomErrorHandlerTest.java index 0cee6d21b..88643147f 100644 --- a/tika-core/src/test/java/org/apache/tika/sax/CustomErrorHandlerTest.java +++ b/tika-core/src/test/java/org/apache/tika/sax/CustomErrorHandlerTest.java @@ -27,6 +27,7 @@ import javax.xml.parsers.ParserConfigurationException; import org.apache.commons.io.output.ByteArrayOutputStream; import org.junit.jupiter.api.AfterAll; import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; import org.xml.sax.SAXException; @@ -40,6 +41,7 @@ import org.apache.tika.utils.XMLReaderUtils; * * @see <a href="https://issues.apache.org/jira/browse/TIKA-4062">TIKA-4062</a> */ +@Disabled("TODO -- rework without xerces") public class CustomErrorHandlerTest extends TikaTest { private static String DEFAULT_SAX_PARSER_FACTORY; diff --git a/tika-core/src/test/java/org/apache/tika/sax/ErrorResistantSAXParserFactory.java b/tika-core/src/test/java/org/apache/tika/sax/ErrorResistantSAXParserFactory.java deleted file mode 100644 index 9916b9df5..000000000 --- a/tika-core/src/test/java/org/apache/tika/sax/ErrorResistantSAXParserFactory.java +++ /dev/null @@ -1,39 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tika.sax; - -import javax.xml.parsers.ParserConfigurationException; -import javax.xml.parsers.SAXParserFactory; - -import org.apache.xerces.jaxp.SAXParserFactoryImpl; -import org.xml.sax.SAXNotRecognizedException; -import org.xml.sax.SAXNotSupportedException; - -/** - * - * A custom {@link SAXParserFactory} to force the parser to continue even after fatal error - * - */ -public class ErrorResistantSAXParserFactory extends SAXParserFactoryImpl { - - public ErrorResistantSAXParserFactory() throws SAXNotRecognizedException, SAXNotSupportedException, - ParserConfigurationException { - super(); - this.setFeature("http://apache.org/xml/features/continue-after-fatal-error", true); - } - -} diff --git a/tika-parent/pom.xml b/tika-parent/pom.xml index 662cb67e1..3b147da00 100644 --- a/tika-parent/pom.xml +++ b/tika-parent/pom.xml @@ -394,7 +394,6 @@ <uima.version>3.4.1</uima.version> <uimafit.version>3.4.0</uimafit.version> <vorbis.version>0.8</vorbis.version> - <xerces.version>2.12.2</xerces.version> <xmpcore.version>6.1.11</xmpcore.version> <zstd.version>1.5.5-5</zstd.version> <kafka.version>3.5.1</kafka.version> @@ -929,11 +928,6 @@ <artifactId>snappy-java</artifactId> <version>1.1.10.3</version> </dependency> - <dependency> - <groupId>xerces</groupId> - <artifactId>xercesImpl</artifactId> - <version>${xerces.version}</version> - </dependency> <dependency> <groupId>commons-fileupload</groupId> <artifactId>commons-fileupload</artifactId> @@ -1015,13 +1009,6 @@ <artifactId>netty-handler</artifactId> <version>${netty.version}</version> </exclude> - <exclude> - <!-- the most recent cve in sonatype for this artifact is 2.11.0, - not at all the version we're using...smh--> - <groupId>xerces</groupId> - <artifactId>xercesImpl</artifactId> - <version>${xerces.version}</version> - </exclude> <!-- these are used by the nlp-module --> <exclude> <groupId>org.apache.lucene</groupId> diff --git a/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/pom.xml b/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/pom.xml index 34327e839..5144584f7 100644 --- a/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/pom.xml +++ b/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/pom.xml @@ -58,6 +58,10 @@ <groupId>log4j</groupId> <artifactId>log4j</artifactId> </exclusion> + <exclusion> + <groupId>xerces</groupId> + <artifactId>xercesImpl</artifactId> + </exclusion> </exclusions> </dependency> <dependency> diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/pom.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/pom.xml index cd4b37326..c3edb9ffa 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/pom.xml +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/pom.xml @@ -42,8 +42,6 @@ <artifactId>tika-parser-text-module</artifactId> <version>${project.version}</version> </dependency> - <!-- needed for only for AbstractXML2003Parser, but it ensures that xerces - is used --> <dependency> <groupId>${project.groupId}</groupId> <artifactId>tika-parser-xml-module</artifactId> diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-xml-module/pom.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-xml-module/pom.xml index f5056ea97..8add50ab0 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-xml-module/pom.xml +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-xml-module/pom.xml @@ -34,10 +34,6 @@ <groupId>commons-codec</groupId> <artifactId>commons-codec</artifactId> </dependency> - <dependency> - <groupId>xerces</groupId> - <artifactId>xercesImpl</artifactId> - </dependency> </dependencies> <build> <plugins> diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/XMLTestBase.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/XMLTestBase.java index f69a7e948..f2ddf0e6b 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/XMLTestBase.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/XMLTestBase.java @@ -142,8 +142,7 @@ public class XMLTestBase extends TikaTest { TaggedContentHandler tagged = new TaggedContentHandler(handler); try { SAXParserFactory saxParserFactory = SAXParserFactory - .newInstance("org.apache.xerces.jaxp.SAXParserFactoryImpl", - this.getClass().getClassLoader()); + .newInstance(); SAXParser parser = saxParserFactory.newSAXParser(); parser.parse(stream, new TextContentHandler(handler, true)); } catch (ParserConfigurationException e) {