This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_3x
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/branch_3x by this push:
     new f7abd982e TIKA-4427 -- allow pool size to be zero, and set a 
configurable max reuse value (#2239)
f7abd982e is described below

commit f7abd982ec148d78f6c0c2378b294231d43ff713
Author: Tim Allison <[email protected]>
AuthorDate: Tue Jun 3 08:54:50 2025 -0400

    TIKA-4427 -- allow pool size to be zero, and set a configurable max reuse 
value (#2239)
    
    * TIKA-4427 -- allow pool size to be zero, and set a configurable max reuse 
value
    
    (cherry picked from commit 27e8c302d089cf7dcb378a0c51c94a3d38d8847c)
---
 .../java/org/apache/tika/config/TikaConfig.java    |   3 +
 .../java/org/apache/tika/utils/XMLReaderUtils.java | 182 ++++++++++++++++-----
 .../org/apache/tika/config/TikaConfigTest.java     |  53 ++++++
 .../org/apache/tika/utils/XMLReaderUtilsTest.java  |  14 ++
 .../tika/config/TIKA-4427-max-num-reuses.xml       |  20 +++
 .../apache/tika/config/TIKA-4427-no-sax-pool.xml   |  20 +++
 6 files changed, 248 insertions(+), 44 deletions(-)

diff --git a/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java 
b/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
index 8137a7ad8..fa3241d1e 100644
--- a/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
+++ b/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
@@ -579,6 +579,9 @@ public class TikaConfig {
             
XMLReaderUtils.setMaxEntityExpansions(Integer.parseInt(child.getAttribute("maxEntityExpansions")));
         }
 
+        if (child.hasAttribute("maxNumReuses")) {
+            
XMLReaderUtils.setMaxNumReuses(Integer.parseInt(child.getAttribute("maxNumReuses")));
+        }
         // make sure to call this after set entity expansions
         if (child.hasAttribute("poolSize")) {
             
XMLReaderUtils.setPoolSize(Integer.parseInt(child.getAttribute("poolSize")));
diff --git a/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java 
b/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java
index efc9c019b..9c27cfc1d 100644
--- a/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java
+++ b/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java
@@ -73,6 +73,7 @@ public class XMLReaderUtils implements Serializable {
      */
     public static final int DEFAULT_POOL_SIZE = 10;
     public static final int DEFAULT_MAX_ENTITY_EXPANSIONS = 20;
+    public static final int DEFAULT_NUM_REUSES = 100;
     /**
      * Serial version UID
      */
@@ -128,6 +129,7 @@ public class XMLReaderUtils implements Serializable {
      * Parser pool size
      */
     private static int POOL_SIZE = DEFAULT_POOL_SIZE;
+    private static int MAX_NUM_REUSES = DEFAULT_NUM_REUSES;
     private static long LAST_LOG = -1;
     private static volatile int MAX_ENTITY_EXPANSIONS = 
determineMaxEntityExpansions();
     private static ArrayBlockingQueue<PoolSAXParser> SAX_PARSERS =
@@ -400,20 +402,22 @@ public class XMLReaderUtils implements Serializable {
         DocumentBuilder builder = context.get(DocumentBuilder.class);
         PoolDOMBuilder poolBuilder = null;
         if (builder == null) {
-            poolBuilder = acquireDOMBuilder();
-            if (poolBuilder != null) {
-                builder = poolBuilder.getDocumentBuilder();
-            } else {
+            if (POOL_SIZE == 0) {
                 builder = getDocumentBuilder();
+            } else {
+                poolBuilder = acquireDOMBuilder();
+                if (poolBuilder != null) {
+                    builder = poolBuilder.getDocumentBuilder();
+                } else {
+                    builder = getDocumentBuilder();
+                }
             }
         }
 
         try {
             return builder.parse(is);
         } finally {
-            if (poolBuilder != null) {
-                releaseDOMBuilder(poolBuilder);
-            }
+            releaseDOMBuilder(poolBuilder);
         }
     }
 
@@ -434,16 +438,22 @@ public class XMLReaderUtils implements Serializable {
         DocumentBuilder builder = context.get(DocumentBuilder.class);
         PoolDOMBuilder poolBuilder = null;
         if (builder == null) {
-            poolBuilder = acquireDOMBuilder();
-            builder = poolBuilder.getDocumentBuilder();
+            if (POOL_SIZE == 0) {
+                builder = getDocumentBuilder();
+            } else {
+                poolBuilder = acquireDOMBuilder();
+                if (poolBuilder != null) {
+                    builder = poolBuilder.getDocumentBuilder();
+                } else {
+                    builder = getDocumentBuilder();
+                }
+            }
         }
 
         try {
             return builder.parse(new InputSource(reader));
         } finally {
-            if (poolBuilder != null) {
-                releaseDOMBuilder(poolBuilder);
-            }
+            releaseDOMBuilder(poolBuilder);
         }
     }
 
@@ -475,11 +485,23 @@ public class XMLReaderUtils implements Serializable {
      */
     public static Document buildDOM(String uriString)
             throws TikaException, IOException, SAXException {
-        PoolDOMBuilder builder = acquireDOMBuilder();
+        PoolDOMBuilder poolBuilder = null;
+        DocumentBuilder builder = null;
+        if (POOL_SIZE == 0) {
+            builder = getDocumentBuilder();
+        } else {
+            poolBuilder = acquireDOMBuilder();
+            if (poolBuilder != null) {
+                builder = poolBuilder.getDocumentBuilder();
+            } else {
+                builder = getDocumentBuilder();
+            }
+        }
+
         try {
-            return builder.getDocumentBuilder().parse(uriString);
+            return builder.parse(uriString);
         } finally {
-            releaseDOMBuilder(builder);
+            releaseDOMBuilder(poolBuilder);
         }
     }
 
@@ -494,11 +516,23 @@ public class XMLReaderUtils implements Serializable {
      */
     public static Document buildDOM(InputStream is)
             throws TikaException, IOException, SAXException {
-        PoolDOMBuilder builder = acquireDOMBuilder();
+        PoolDOMBuilder poolBuilder = null;
+        DocumentBuilder builder = null;
+        if (POOL_SIZE == 0) {
+            builder = getDocumentBuilder();
+        } else {
+            poolBuilder = acquireDOMBuilder();
+            if (poolBuilder != null) {
+                builder = poolBuilder.getDocumentBuilder();
+            } else {
+                builder = getDocumentBuilder();
+            }
+        }
+
         try {
-            return builder.getDocumentBuilder().parse(is);
+            return builder.parse(is);
         } finally {
-            releaseDOMBuilder(builder);
+            releaseDOMBuilder(poolBuilder);
         }
     }
 
@@ -522,19 +556,21 @@ public class XMLReaderUtils implements Serializable {
         SAXParser saxParser = context.get(SAXParser.class);
         PoolSAXParser poolSAXParser = null;
         if (saxParser == null) {
-            poolSAXParser = acquireSAXParser();
-            if (poolSAXParser != null) {
-                saxParser = poolSAXParser.getSAXParser();
-            } else {
+            if (POOL_SIZE == 0) {
                 saxParser = getSAXParser();
+            } else {
+                poolSAXParser = acquireSAXParser();
+                if (poolSAXParser != null) {
+                    saxParser = poolSAXParser.getSAXParser();
+                } else {
+                    saxParser = getSAXParser();
+                }
             }
         }
         try {
             saxParser.parse(is, new OfflineContentHandler(contentHandler));
         } finally {
-            if (poolSAXParser != null) {
-                releaseParser(poolSAXParser);
-            }
+            releaseParser(poolSAXParser);
         }
     }
 
@@ -558,19 +594,21 @@ public class XMLReaderUtils implements Serializable {
         SAXParser saxParser = context.get(SAXParser.class);
         PoolSAXParser poolSAXParser = null;
         if (saxParser == null) {
-            poolSAXParser = acquireSAXParser();
-            if (poolSAXParser != null) {
-                saxParser = poolSAXParser.getSAXParser();
-            } else {
+            if (POOL_SIZE == 0) {
                 saxParser = getSAXParser();
+            } else {
+                poolSAXParser = acquireSAXParser();
+                if (poolSAXParser != null) {
+                    saxParser = poolSAXParser.getSAXParser();
+                } else {
+                    saxParser = getSAXParser();
+                }
             }
         }
         try {
             saxParser.parse(new InputSource(reader), new 
OfflineContentHandler(contentHandler));
         } finally {
-            if (poolSAXParser != null) {
-                releaseParser(poolSAXParser);
-            }
+            releaseParser(poolSAXParser);
         }
     }
 
@@ -609,6 +647,9 @@ public class XMLReaderUtils implements Serializable {
      * @param builder builder to return
      */
     private static void releaseDOMBuilder(PoolDOMBuilder builder) {
+        if (builder == null) {
+            return;
+        }
         if (builder.getPoolGeneration() != POOL_GENERATION.get()) {
             return;
         }
@@ -619,6 +660,15 @@ public class XMLReaderUtils implements Serializable {
         }
         DOM_POOL_LOCK
                 .readLock().lock();
+        builder.incrementUses();
+        if (builder.numUses >= MAX_NUM_REUSES) {
+            try {
+                builder = new PoolDOMBuilder(builder.getPoolGeneration(), 
getDocumentBuilderFactory().newDocumentBuilder());
+            } catch (ParserConfigurationException e) {
+                LOG.warn("Exception trying to configure a new dom builder?!", 
e);
+                return;
+            }
+        }
         try {
             //if there are extra parsers (e.g. after a reset of the pool to a 
smaller size),
             // this parser will not be added and will then be gc'd
@@ -671,6 +721,9 @@ public class XMLReaderUtils implements Serializable {
      * @param parser parser to return
      */
     private static void releaseParser(PoolSAXParser parser) {
+        if (parser == null) {
+            return;
+        }
         try {
             parser.reset();
         } catch (UnsupportedOperationException e) {
@@ -684,6 +737,15 @@ public class XMLReaderUtils implements Serializable {
         SAX_POOL_LOCK
                 .readLock().lock();
         try {
+            parser.incrementUses();
+            if (parser.numUses >= MAX_NUM_REUSES) {
+                try {
+                    parser = buildPoolParser(parser.getGeneration(), 
getSAXParserFactory().newSAXParser());
+                } catch (SAXException | ParserConfigurationException e) {
+                    LOG.warn("Couldn't build new SAXParser after hitting max 
reuses", e);
+                    return;
+                }
+            }
             //if there are extra parsers (e.g. after a reset of the pool to a 
smaller size),
             // this parser will not be added and will then be gc'd
             boolean success = SAX_PARSERS.offer(parser);
@@ -804,6 +866,19 @@ public class XMLReaderUtils implements Serializable {
         }
     }
 
+    /**
+     * Get the maximum number of times a SAXParser or DOMBuilder may be reused.
+     *
+     * @return the current value of the static {@code MAX_NUM_REUSES} setting
+     */
+    public static int getMaxNumReuses() {
+        return MAX_NUM_REUSES;
+    }
+
+    public static void setMaxNumReuses(int maxNumReuses) {
+        MAX_NUM_REUSES = maxNumReuses;
+    }
+
     public static int getPoolSize() {
         return POOL_SIZE;
     }
@@ -813,10 +888,16 @@ public class XMLReaderUtils implements Serializable {
      * effect of locking the pool, and rebuilding the pool from
      * scratch with the most recent settings, such as {@link 
#MAX_ENTITY_EXPANSIONS}
      *
+     * As of Tika 3.2.1, if a value of <code>0</code> is passed in, no 
SAXParsers or DOMBuilders
+     * will be pooled, and a new parser/builder will be built for each parse.
+     *
      * @param poolSize
      * @since Apache Tika 1.19
      */
     public static void setPoolSize(int poolSize) throws TikaException {
+        if (poolSize < 0) {
+            throw new IllegalArgumentException("PoolSize must be >= 0");
+        }
         //stop the world with a write lock.
         //parsers that are currently in use will be offered later (once the 
lock is released),
         //but not accepted and will be gc'd.  We have to do this locking and
@@ -831,14 +912,15 @@ public class XMLReaderUtils implements Serializable {
                 parser.reset();
             }
             SAX_PARSERS.clear();
-            SAX_PARSERS = new ArrayBlockingQueue<>(poolSize);
-            int generation = POOL_GENERATION.incrementAndGet();
-            for (int i = 0; i < poolSize; i++) {
-                try {
-                    SAX_PARSERS.offer(buildPoolParser(generation,
-                            getSAXParserFactory().newSAXParser()));
-                } catch (SAXException | ParserConfigurationException e) {
-                    throw new TikaException("problem creating sax parser", e);
+            if (poolSize > 0) {
+                SAX_PARSERS = new ArrayBlockingQueue<>(poolSize);
+                int generation = POOL_GENERATION.incrementAndGet();
+                for (int i = 0; i < poolSize; i++) {
+                    try {
+                        SAX_PARSERS.offer(buildPoolParser(generation, 
getSAXParserFactory().newSAXParser()));
+                    } catch (SAXException | ParserConfigurationException e) {
+                        throw new TikaException("problem creating sax parser", 
e);
+                    }
                 }
             }
         } finally {
@@ -850,9 +932,11 @@ public class XMLReaderUtils implements Serializable {
                 .writeLock().lock();
         try {
             DOM_BUILDERS.clear();
-            DOM_BUILDERS = new ArrayBlockingQueue<>(poolSize);
-            for (int i = 0; i < poolSize; i++) {
-                DOM_BUILDERS.offer(new PoolDOMBuilder(POOL_GENERATION.get(), 
getDocumentBuilder()));
+            if (poolSize > 0) {
+                DOM_BUILDERS = new ArrayBlockingQueue<>(poolSize);
+                for (int i = 0; i < poolSize; i++) {
+                    DOM_BUILDERS.offer(new 
PoolDOMBuilder(POOL_GENERATION.get(), getDocumentBuilder()));
+                }
             }
         } finally {
             DOM_POOL_LOCK
@@ -973,6 +1057,7 @@ public class XMLReaderUtils implements Serializable {
     private static class PoolDOMBuilder {
         private final int poolGeneration;
         private final DocumentBuilder documentBuilder;
+        int numUses = 0;
 
         PoolDOMBuilder(int poolGeneration, DocumentBuilder documentBuilder) {
             this.poolGeneration = poolGeneration;
@@ -992,12 +1077,16 @@ public class XMLReaderUtils implements Serializable {
             documentBuilder.setEntityResolver(IGNORING_SAX_ENTITY_RESOLVER);
             documentBuilder.setErrorHandler(null);
         }
+
+        void incrementUses() {
+            numUses++; // count this use; releaseDOMBuilder compares numUses against MAX_NUM_REUSES
+        }
     }
 
     private abstract static class PoolSAXParser {
         final int poolGeneration;
         final SAXParser saxParser;
-
+        int numUses = 0;
         PoolSAXParser(int poolGeneration, SAXParser saxParser) {
             this.poolGeneration = poolGeneration;
             this.saxParser = saxParser;
@@ -1012,6 +1101,11 @@ public class XMLReaderUtils implements Serializable {
         public SAXParser getSAXParser() {
             return saxParser;
         }
+
+        void incrementUses() {
+            numUses++;
+        }
+
     }
 
     private static class XercesPoolSAXParser extends PoolSAXParser {
diff --git a/tika-core/src/test/java/org/apache/tika/config/TikaConfigTest.java 
b/tika-core/src/test/java/org/apache/tika/config/TikaConfigTest.java
index 7fa021729..b89491414 100644
--- a/tika-core/src/test/java/org/apache/tika/config/TikaConfigTest.java
+++ b/tika-core/src/test/java/org/apache/tika/config/TikaConfigTest.java
@@ -348,6 +348,59 @@ public class TikaConfigTest extends AbstractTikaConfigTest 
{
         }
     }
 
+    @Test
+    public void testXMLReaderUtilsReuse() throws Exception {
+        //this just tests that there's no exception thrown
+        try {
+            XMLReaderUtils.setPoolSize(10);
+            TikaConfig tikaConfig = TikaConfig.getDefaultConfig();
+            for (int i = 0; i < 500; i++) {
+                assertEquals("application/rdf+xml", 
detect("test-difficult-rdf1.xml", tikaConfig).toString());
+            }
+        } finally {
+            
XMLReaderUtils.setMaxEntityExpansions(XMLReaderUtils.DEFAULT_MAX_ENTITY_EXPANSIONS);
+            XMLReaderUtils.setPoolSize(XMLReaderUtils.DEFAULT_POOL_SIZE);
+        }
+    }
+
+    @Test
+    public void testXMLReaderUtilsConfigReuse() throws Exception {
+        TikaConfig tikaConfig = getConfig("TIKA-4427-max-num-reuses.xml");
+        try {
+            assertEquals(11, XMLReaderUtils.getPoolSize());
+            assertEquals(5000, XMLReaderUtils.getMaxEntityExpansions());
+            assertEquals(10000, XMLReaderUtils.getMaxNumReuses());
+            //make sure that there's actually a change in behavior
+            assertEquals("application/rdf+xml", 
detect("test-difficult-rdf1.xml", tikaConfig).toString());
+        } finally {
+            
XMLReaderUtils.setMaxEntityExpansions(XMLReaderUtils.DEFAULT_MAX_ENTITY_EXPANSIONS);
+            XMLReaderUtils.setPoolSize(XMLReaderUtils.DEFAULT_POOL_SIZE);
+        }
+    }
+
+    @Test
+    public void testXMLReaderUtilsNoPool() throws Exception {
+        //pool size may have been reset already by an
+        //earlier test.  Can't test for default here.
+        assertEquals(XMLReaderUtils.DEFAULT_MAX_ENTITY_EXPANSIONS,
+                XMLReaderUtils.getMaxEntityExpansions());
+        //make sure that detection on this file actually works with
+        //default expansions
+        assertEquals("application/rdf+xml",
+                detect("test-difficult-rdf1.xml", 
TikaConfig.getDefaultConfig()).toString());
+
+        TikaConfig tikaConfig = getConfig("TIKA-4427-no-sax-pool.xml");
+        try {
+            assertEquals(0, XMLReaderUtils.getPoolSize());
+            assertEquals(5, XMLReaderUtils.getMaxEntityExpansions());
+            //make sure that there's actually a change in behavior
+            assertEquals("text/plain", detect("test-difficult-rdf1.xml", 
tikaConfig).toString());
+        } finally {
+            
XMLReaderUtils.setMaxEntityExpansions(XMLReaderUtils.DEFAULT_MAX_ENTITY_EXPANSIONS);
+            XMLReaderUtils.setPoolSize(XMLReaderUtils.DEFAULT_POOL_SIZE);
+        }
+    }
+
     private MediaType detect(String testFileName, TikaConfig tikaConfig) 
throws Exception {
         try (InputStream is = 
MimeDetectionTest.class.getResourceAsStream(testFileName)) {
             return tikaConfig.getDetector().detect(is, new Metadata());
diff --git 
a/tika-core/src/test/java/org/apache/tika/utils/XMLReaderUtilsTest.java 
b/tika-core/src/test/java/org/apache/tika/utils/XMLReaderUtilsTest.java
index 18e253587..1d5371019 100644
--- a/tika-core/src/test/java/org/apache/tika/utils/XMLReaderUtilsTest.java
+++ b/tika-core/src/test/java/org/apache/tika/utils/XMLReaderUtilsTest.java
@@ -52,4 +52,18 @@ public class XMLReaderUtilsTest {
             fail("Parser tried to access the external DTD:" + e);
         }
     }
+
+    @Test
+    public void testExternalEntityLocal() throws Exception {
+        String xml =
+                "<!DOCTYPE foo [" +
+                "<!ENTITY % local_dtd SYSTEM 
\"file:///usr/local/app/schema.dtd\">" +
+                "%local_dtd;]><foo/>";
+        try {
+            XMLReaderUtils.parseSAX(new 
ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)),
+                    new ToTextContentHandler(), new ParseContext());
+        } catch (ConnectException e) {
+            fail("Parser tried to access the external DTD:" + e);
+        }
+    }
 }
diff --git 
a/tika-core/src/test/resources/org/apache/tika/config/TIKA-4427-max-num-reuses.xml
 
b/tika-core/src/test/resources/org/apache/tika/config/TIKA-4427-max-num-reuses.xml
new file mode 100644
index 000000000..105ced933
--- /dev/null
+++ 
b/tika-core/src/test/resources/org/apache/tika/config/TIKA-4427-max-num-reuses.xml
@@ -0,0 +1,20 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<properties>
+    <xml-reader-utils maxEntityExpansions="5000" poolSize="11" 
maxNumReuses="10000"/>
+</properties>
diff --git 
a/tika-core/src/test/resources/org/apache/tika/config/TIKA-4427-no-sax-pool.xml 
b/tika-core/src/test/resources/org/apache/tika/config/TIKA-4427-no-sax-pool.xml
new file mode 100644
index 000000000..1e2f8b371
--- /dev/null
+++ 
b/tika-core/src/test/resources/org/apache/tika/config/TIKA-4427-no-sax-pool.xml
@@ -0,0 +1,20 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<properties>
+    <xml-reader-utils maxEntityExpansions="5" poolSize="0"/>
+</properties>

Reply via email to