This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch branch_3x
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/branch_3x by this push:
new f7abd982e TIKA-4427 -- allow pool size to be zero, and set a
configurable max reuse value (#2239)
f7abd982e is described below
commit f7abd982ec148d78f6c0c2378b294231d43ff713
Author: Tim Allison <[email protected]>
AuthorDate: Tue Jun 3 08:54:50 2025 -0400
TIKA-4427 -- allow pool size to be zero, and set a configurable max reuse
value (#2239)
* TIKA-4427 -- allow pool size to be zero, and set a configurable max reuse
value
(cherry picked from commit 27e8c302d089cf7dcb378a0c51c94a3d38d8847c)
---
.../java/org/apache/tika/config/TikaConfig.java | 3 +
.../java/org/apache/tika/utils/XMLReaderUtils.java | 182 ++++++++++++++++-----
.../org/apache/tika/config/TikaConfigTest.java | 53 ++++++
.../org/apache/tika/utils/XMLReaderUtilsTest.java | 14 ++
.../tika/config/TIKA-4427-max-num-reuses.xml | 20 +++
.../apache/tika/config/TIKA-4427-no-sax-pool.xml | 20 +++
6 files changed, 248 insertions(+), 44 deletions(-)
diff --git a/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
b/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
index 8137a7ad8..fa3241d1e 100644
--- a/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
+++ b/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
@@ -579,6 +579,9 @@ public class TikaConfig {
XMLReaderUtils.setMaxEntityExpansions(Integer.parseInt(child.getAttribute("maxEntityExpansions")));
}
+ if (child.hasAttribute("maxNumReuses")) {
+
XMLReaderUtils.setMaxNumReuses(Integer.parseInt(child.getAttribute("maxNumReuses")));
+ }
// make sure to call this after set entity expansions
if (child.hasAttribute("poolSize")) {
XMLReaderUtils.setPoolSize(Integer.parseInt(child.getAttribute("poolSize")));
diff --git a/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java
b/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java
index efc9c019b..9c27cfc1d 100644
--- a/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java
+++ b/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java
@@ -73,6 +73,7 @@ public class XMLReaderUtils implements Serializable {
*/
public static final int DEFAULT_POOL_SIZE = 10;
public static final int DEFAULT_MAX_ENTITY_EXPANSIONS = 20;
+ public static final int DEFAULT_NUM_REUSES = 100;
/**
* Serial version UID
*/
@@ -128,6 +129,7 @@ public class XMLReaderUtils implements Serializable {
* Parser pool size
*/
private static int POOL_SIZE = DEFAULT_POOL_SIZE;
+ private static int MAX_NUM_REUSES = DEFAULT_NUM_REUSES;
private static long LAST_LOG = -1;
private static volatile int MAX_ENTITY_EXPANSIONS =
determineMaxEntityExpansions();
private static ArrayBlockingQueue<PoolSAXParser> SAX_PARSERS =
@@ -400,20 +402,22 @@ public class XMLReaderUtils implements Serializable {
DocumentBuilder builder = context.get(DocumentBuilder.class);
PoolDOMBuilder poolBuilder = null;
if (builder == null) {
- poolBuilder = acquireDOMBuilder();
- if (poolBuilder != null) {
- builder = poolBuilder.getDocumentBuilder();
- } else {
+ if (POOL_SIZE == 0) {
builder = getDocumentBuilder();
+ } else {
+ poolBuilder = acquireDOMBuilder();
+ if (poolBuilder != null) {
+ builder = poolBuilder.getDocumentBuilder();
+ } else {
+ builder = getDocumentBuilder();
+ }
}
}
try {
return builder.parse(is);
} finally {
- if (poolBuilder != null) {
- releaseDOMBuilder(poolBuilder);
- }
+ releaseDOMBuilder(poolBuilder);
}
}
@@ -434,16 +438,22 @@ public class XMLReaderUtils implements Serializable {
DocumentBuilder builder = context.get(DocumentBuilder.class);
PoolDOMBuilder poolBuilder = null;
if (builder == null) {
- poolBuilder = acquireDOMBuilder();
- builder = poolBuilder.getDocumentBuilder();
+ if (POOL_SIZE == 0) {
+ builder = getDocumentBuilder();
+ } else {
+ poolBuilder = acquireDOMBuilder();
+ if (poolBuilder != null) {
+ builder = poolBuilder.getDocumentBuilder();
+ } else {
+ builder = getDocumentBuilder();
+ }
+ }
}
try {
return builder.parse(new InputSource(reader));
} finally {
- if (poolBuilder != null) {
- releaseDOMBuilder(poolBuilder);
- }
+ releaseDOMBuilder(poolBuilder);
}
}
@@ -475,11 +485,23 @@ public class XMLReaderUtils implements Serializable {
*/
public static Document buildDOM(String uriString)
throws TikaException, IOException, SAXException {
- PoolDOMBuilder builder = acquireDOMBuilder();
+ PoolDOMBuilder poolBuilder = null;
+ DocumentBuilder builder = null;
+ if (POOL_SIZE == 0) {
+ builder = getDocumentBuilder();
+ } else {
+ poolBuilder = acquireDOMBuilder();
+ if (poolBuilder != null) {
+ builder = poolBuilder.getDocumentBuilder();
+ } else {
+ builder = getDocumentBuilder();
+ }
+ }
+
try {
- return builder.getDocumentBuilder().parse(uriString);
+ return builder.parse(uriString);
} finally {
- releaseDOMBuilder(builder);
+ releaseDOMBuilder(poolBuilder);
}
}
@@ -494,11 +516,23 @@ public class XMLReaderUtils implements Serializable {
*/
public static Document buildDOM(InputStream is)
throws TikaException, IOException, SAXException {
- PoolDOMBuilder builder = acquireDOMBuilder();
+ PoolDOMBuilder poolBuilder = null;
+ DocumentBuilder builder = null;
+ if (POOL_SIZE == 0) {
+ builder = getDocumentBuilder();
+ } else {
+ poolBuilder = acquireDOMBuilder();
+ if (poolBuilder != null) {
+ builder = poolBuilder.getDocumentBuilder();
+ } else {
+ builder = getDocumentBuilder();
+ }
+ }
+
try {
- return builder.getDocumentBuilder().parse(is);
+ return builder.parse(is);
} finally {
- releaseDOMBuilder(builder);
+ releaseDOMBuilder(poolBuilder);
}
}
@@ -522,19 +556,21 @@ public class XMLReaderUtils implements Serializable {
SAXParser saxParser = context.get(SAXParser.class);
PoolSAXParser poolSAXParser = null;
if (saxParser == null) {
- poolSAXParser = acquireSAXParser();
- if (poolSAXParser != null) {
- saxParser = poolSAXParser.getSAXParser();
- } else {
+ if (POOL_SIZE == 0) {
saxParser = getSAXParser();
+ } else {
+ poolSAXParser = acquireSAXParser();
+ if (poolSAXParser != null) {
+ saxParser = poolSAXParser.getSAXParser();
+ } else {
+ saxParser = getSAXParser();
+ }
}
}
try {
saxParser.parse(is, new OfflineContentHandler(contentHandler));
} finally {
- if (poolSAXParser != null) {
- releaseParser(poolSAXParser);
- }
+ releaseParser(poolSAXParser);
}
}
@@ -558,19 +594,21 @@ public class XMLReaderUtils implements Serializable {
SAXParser saxParser = context.get(SAXParser.class);
PoolSAXParser poolSAXParser = null;
if (saxParser == null) {
- poolSAXParser = acquireSAXParser();
- if (poolSAXParser != null) {
- saxParser = poolSAXParser.getSAXParser();
- } else {
+ if (POOL_SIZE == 0) {
saxParser = getSAXParser();
+ } else {
+ poolSAXParser = acquireSAXParser();
+ if (poolSAXParser != null) {
+ saxParser = poolSAXParser.getSAXParser();
+ } else {
+ saxParser = getSAXParser();
+ }
}
}
try {
saxParser.parse(new InputSource(reader), new
OfflineContentHandler(contentHandler));
} finally {
- if (poolSAXParser != null) {
- releaseParser(poolSAXParser);
- }
+ releaseParser(poolSAXParser);
}
}
@@ -609,6 +647,9 @@ public class XMLReaderUtils implements Serializable {
* @param builder builder to return
*/
private static void releaseDOMBuilder(PoolDOMBuilder builder) {
+ if (builder == null) {
+ return;
+ }
if (builder.getPoolGeneration() != POOL_GENERATION.get()) {
return;
}
@@ -619,6 +660,15 @@ public class XMLReaderUtils implements Serializable {
}
DOM_POOL_LOCK
.readLock().lock();
+ builder.incrementUses();
+ if (builder.numUses >= MAX_NUM_REUSES) {
+ try {
+ builder = new PoolDOMBuilder(builder.getPoolGeneration(),
getDocumentBuilderFactory().newDocumentBuilder());
+ } catch (ParserConfigurationException e) {
+ LOG.warn("Exception trying to configure a new dom builder?!",
e);
+ return;
+ }
+ }
try {
//if there are extra parsers (e.g. after a reset of the pool to a
smaller size),
// this parser will not be added and will then be gc'd
@@ -671,6 +721,9 @@ public class XMLReaderUtils implements Serializable {
* @param parser parser to return
*/
private static void releaseParser(PoolSAXParser parser) {
+ if (parser == null) {
+ return;
+ }
try {
parser.reset();
} catch (UnsupportedOperationException e) {
@@ -684,6 +737,15 @@ public class XMLReaderUtils implements Serializable {
SAX_POOL_LOCK
.readLock().lock();
try {
+ parser.incrementUses();
+ if (parser.numUses >= MAX_NUM_REUSES) {
+ try {
+ parser = buildPoolParser(parser.getGeneration(),
getSAXParserFactory().newSAXParser());
+ } catch (SAXException | ParserConfigurationException e) {
+ LOG.warn("Couldn't build new SAXParser after hitting max
reuses", e);
+ return;
+ }
+ }
//if there are extra parsers (e.g. after a reset of the pool to a
smaller size),
// this parser will not be added and will then be gc'd
boolean success = SAX_PARSERS.offer(parser);
@@ -804,6 +866,19 @@ public class XMLReaderUtils implements Serializable {
}
}
+ /**
+ * Get the maximum number of times a SAXParser or DOMBuilder may be reused.
+ *
+ * @return the configured maximum; {@link #DEFAULT_NUM_REUSES} unless changed via {@link #setMaxNumReuses(int)}
+ */
+ public static int getMaxNumReuses() {
+ return MAX_NUM_REUSES;
+ }
+
+ public static void setMaxNumReuses(int maxNumReuses) {
+ MAX_NUM_REUSES = maxNumReuses;
+ }
+
public static int getPoolSize() {
return POOL_SIZE;
}
@@ -813,10 +888,16 @@ public class XMLReaderUtils implements Serializable {
* effect of locking the pool, and rebuilding the pool from
* scratch with the most recent settings, such as {@link
#MAX_ENTITY_EXPANSIONS}
*
+ * As of Tika 3.2.1, if a value of <code>0</code> is passed in, no
SAXParsers or DOMBuilders
+ * will be pooled, and a new parser/builder will be built for each parse.
+ *
* @param poolSize
* @since Apache Tika 1.19
*/
public static void setPoolSize(int poolSize) throws TikaException {
+ if (poolSize < 0) {
+ throw new IllegalArgumentException("PoolSize must be >= 0");
+ }
//stop the world with a write lock.
//parsers that are currently in use will be offered later (once the
lock is released),
//but not accepted and will be gc'd. We have to do this locking and
@@ -831,14 +912,15 @@ public class XMLReaderUtils implements Serializable {
parser.reset();
}
SAX_PARSERS.clear();
- SAX_PARSERS = new ArrayBlockingQueue<>(poolSize);
- int generation = POOL_GENERATION.incrementAndGet();
- for (int i = 0; i < poolSize; i++) {
- try {
- SAX_PARSERS.offer(buildPoolParser(generation,
- getSAXParserFactory().newSAXParser()));
- } catch (SAXException | ParserConfigurationException e) {
- throw new TikaException("problem creating sax parser", e);
+ if (poolSize > 0) {
+ SAX_PARSERS = new ArrayBlockingQueue<>(poolSize);
+ int generation = POOL_GENERATION.incrementAndGet();
+ for (int i = 0; i < poolSize; i++) {
+ try {
+ SAX_PARSERS.offer(buildPoolParser(generation,
getSAXParserFactory().newSAXParser()));
+ } catch (SAXException | ParserConfigurationException e) {
+ throw new TikaException("problem creating sax parser",
e);
+ }
}
}
} finally {
@@ -850,9 +932,11 @@ public class XMLReaderUtils implements Serializable {
.writeLock().lock();
try {
DOM_BUILDERS.clear();
- DOM_BUILDERS = new ArrayBlockingQueue<>(poolSize);
- for (int i = 0; i < poolSize; i++) {
- DOM_BUILDERS.offer(new PoolDOMBuilder(POOL_GENERATION.get(),
getDocumentBuilder()));
+ if (poolSize > 0) {
+ DOM_BUILDERS = new ArrayBlockingQueue<>(poolSize);
+ for (int i = 0; i < poolSize; i++) {
+ DOM_BUILDERS.offer(new
PoolDOMBuilder(POOL_GENERATION.get(), getDocumentBuilder()));
+ }
}
} finally {
DOM_POOL_LOCK
@@ -973,6 +1057,7 @@ public class XMLReaderUtils implements Serializable {
private static class PoolDOMBuilder {
private final int poolGeneration;
private final DocumentBuilder documentBuilder;
+ int numUses = 0;
PoolDOMBuilder(int poolGeneration, DocumentBuilder documentBuilder) {
this.poolGeneration = poolGeneration;
@@ -992,12 +1077,16 @@ public class XMLReaderUtils implements Serializable {
documentBuilder.setEntityResolver(IGNORING_SAX_ENTITY_RESOLVER);
documentBuilder.setErrorHandler(null);
}
+
+ void incrementUses() { // records one more completed use of this builder
+ numUses++; // must accumulate: the release path compares numUses against MAX_NUM_REUSES
+ }
}
private abstract static class PoolSAXParser {
final int poolGeneration;
final SAXParser saxParser;
-
+ int numUses = 0;
PoolSAXParser(int poolGeneration, SAXParser saxParser) {
this.poolGeneration = poolGeneration;
this.saxParser = saxParser;
@@ -1012,6 +1101,11 @@ public class XMLReaderUtils implements Serializable {
public SAXParser getSAXParser() {
return saxParser;
}
+
+ void incrementUses() {
+ numUses++;
+ }
+
}
private static class XercesPoolSAXParser extends PoolSAXParser {
diff --git a/tika-core/src/test/java/org/apache/tika/config/TikaConfigTest.java
b/tika-core/src/test/java/org/apache/tika/config/TikaConfigTest.java
index 7fa021729..b89491414 100644
--- a/tika-core/src/test/java/org/apache/tika/config/TikaConfigTest.java
+++ b/tika-core/src/test/java/org/apache/tika/config/TikaConfigTest.java
@@ -348,6 +348,59 @@ public class TikaConfigTest extends AbstractTikaConfigTest
{
}
}
+ @Test
+ public void testXMLReaderUtilsReuse() throws Exception {
+ //this just tests that there's no exception thrown
+ try {
+ XMLReaderUtils.setPoolSize(10);
+ TikaConfig tikaConfig = TikaConfig.getDefaultConfig();
+ for (int i = 0; i < 500; i++) {
+ assertEquals("application/rdf+xml",
detect("test-difficult-rdf1.xml", tikaConfig).toString());
+ }
+ } finally {
+
XMLReaderUtils.setMaxEntityExpansions(XMLReaderUtils.DEFAULT_MAX_ENTITY_EXPANSIONS);
+ XMLReaderUtils.setPoolSize(XMLReaderUtils.DEFAULT_POOL_SIZE);
+ }
+ }
+
+ @Test
+ public void testXMLReaderUtilsConfigReuse() throws Exception {
+ TikaConfig tikaConfig = getConfig("TIKA-4427-max-num-reuses.xml");
+ try {
+ assertEquals(11, XMLReaderUtils.getPoolSize());
+ assertEquals(5000, XMLReaderUtils.getMaxEntityExpansions());
+ assertEquals(10000, XMLReaderUtils.getMaxNumReuses());
+ assertEquals("application/rdf+xml", detect("test-difficult-rdf1.xml", tikaConfig).toString());
+ } finally {
+ XMLReaderUtils.setMaxEntityExpansions(XMLReaderUtils.DEFAULT_MAX_ENTITY_EXPANSIONS);
+ XMLReaderUtils.setMaxNumReuses(XMLReaderUtils.DEFAULT_NUM_REUSES); // static state: don't leak 10000 into other tests
+ XMLReaderUtils.setPoolSize(XMLReaderUtils.DEFAULT_POOL_SIZE);
+ }
+ }
+
+ @Test
+ public void testXMLReaderUtilsNoPool() throws Exception {
+ //pool size may have been reset already by an
+ //earlier test. Can't test for default here.
+ assertEquals(XMLReaderUtils.DEFAULT_MAX_ENTITY_EXPANSIONS,
+ XMLReaderUtils.getMaxEntityExpansions());
+ //make sure that detection on this file actually works with
+ //default expansions
+ assertEquals("application/rdf+xml",
+ detect("test-difficult-rdf1.xml",
TikaConfig.getDefaultConfig()).toString());
+
+ TikaConfig tikaConfig = getConfig("TIKA-4427-no-sax-pool.xml");
+ try {
+ assertEquals(0, XMLReaderUtils.getPoolSize());
+ assertEquals(5, XMLReaderUtils.getMaxEntityExpansions());
+ //make sure that there's actually a change in behavior
+ assertEquals("text/plain", detect("test-difficult-rdf1.xml",
tikaConfig).toString());
+ } finally {
+
XMLReaderUtils.setMaxEntityExpansions(XMLReaderUtils.DEFAULT_MAX_ENTITY_EXPANSIONS);
+ XMLReaderUtils.setPoolSize(XMLReaderUtils.DEFAULT_POOL_SIZE);
+ }
+ }
+
private MediaType detect(String testFileName, TikaConfig tikaConfig)
throws Exception {
try (InputStream is =
MimeDetectionTest.class.getResourceAsStream(testFileName)) {
return tikaConfig.getDetector().detect(is, new Metadata());
diff --git
a/tika-core/src/test/java/org/apache/tika/utils/XMLReaderUtilsTest.java
b/tika-core/src/test/java/org/apache/tika/utils/XMLReaderUtilsTest.java
index 18e253587..1d5371019 100644
--- a/tika-core/src/test/java/org/apache/tika/utils/XMLReaderUtilsTest.java
+++ b/tika-core/src/test/java/org/apache/tika/utils/XMLReaderUtilsTest.java
@@ -52,4 +52,18 @@ public class XMLReaderUtilsTest {
fail("Parser tried to access the external DTD:" + e);
}
}
+
+ @Test
+ public void testExternalEntityLocal() throws Exception {
+ String xml =
+ "<!DOCTYPE foo [" +
+ "<!ENTITY % local_dtd SYSTEM
\"file:///usr/local/app/schema.dtd\">" +
+ "%local_dtd;]><foo/>";
+ try {
+ XMLReaderUtils.parseSAX(new
ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)),
+ new ToTextContentHandler(), new ParseContext());
+ } catch (ConnectException e) {
+ fail("Parser tried to access the external DTD:" + e);
+ }
+ }
}
diff --git
a/tika-core/src/test/resources/org/apache/tika/config/TIKA-4427-max-num-reuses.xml
b/tika-core/src/test/resources/org/apache/tika/config/TIKA-4427-max-num-reuses.xml
new file mode 100644
index 000000000..105ced933
--- /dev/null
+++
b/tika-core/src/test/resources/org/apache/tika/config/TIKA-4427-max-num-reuses.xml
@@ -0,0 +1,20 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<properties>
+ <xml-reader-utils maxEntityExpansions="5000" poolSize="11"
maxNumReuses="10000"/>
+</properties>
diff --git
a/tika-core/src/test/resources/org/apache/tika/config/TIKA-4427-no-sax-pool.xml
b/tika-core/src/test/resources/org/apache/tika/config/TIKA-4427-no-sax-pool.xml
new file mode 100644
index 000000000..1e2f8b371
--- /dev/null
+++
b/tika-core/src/test/resources/org/apache/tika/config/TIKA-4427-no-sax-pool.xml
@@ -0,0 +1,20 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<properties>
+ <xml-reader-utils maxEntityExpansions="5" poolSize="0"/>
+</properties>