This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push: new 8bf7e977d TIKA-4149 -- PipesServer should include throwOnWriteLimitReached in the constructor of the BasicContentHandlerFactory (#1374) 8bf7e977d is described below commit 8bf7e977db6e08fc84c2e72a7419f39d9fcc97bf Author: Tim Allison <talli...@apache.org> AuthorDate: Fri Sep 29 13:36:11 2023 -0400 TIKA-4149 -- PipesServer should include throwOnWriteLimitReached in the constructor of the BasicContentHandlerFactory (#1374) --- .../src/main/java/org/apache/tika/pipes/PipesServer.java | 14 ++++++++++---- .../org/apache/tika/pipes/pipesiterator/PipesIterator.java | 13 ++++++++++--- .../org/apache/tika/sax/BasicContentHandlerFactory.java | 1 + 3 files changed, 21 insertions(+), 7 deletions(-) diff --git a/tika-core/src/main/java/org/apache/tika/pipes/PipesServer.java b/tika-core/src/main/java/org/apache/tika/pipes/PipesServer.java index 64d0d602e..ed1e5bb5e 100644 --- a/tika-core/src/main/java/org/apache/tika/pipes/PipesServer.java +++ b/tika-core/src/main/java/org/apache/tika/pipes/PipesServer.java @@ -501,10 +501,14 @@ public class PipesServer implements Runnable { private List<Metadata> parseConcatenated(FetchEmitTuple fetchEmitTuple, HandlerConfig handlerConfig, InputStream stream, Metadata metadata) { + ParseContext parseContext = new ParseContext(); + ContentHandlerFactory contentHandlerFactory = - new BasicContentHandlerFactory(handlerConfig.getType(), handlerConfig.getWriteLimit()); + new BasicContentHandlerFactory(handlerConfig.getType(), + handlerConfig.getWriteLimit(), handlerConfig.isThrowOnWriteLimitReached(), + parseContext); + ContentHandler handler = contentHandlerFactory.getNewContentHandler(); - ParseContext parseContext = new ParseContext(); parseContext.set(DocumentSelector.class, new DocumentSelector() { final int maxEmbedded = handlerConfig.maxEmbeddedResources; int embedded = 0; @@ -549,12 +553,14 @@ public class PipesServer implements Runnable { private List<Metadata> parseRecursive(FetchEmitTuple fetchEmitTuple, HandlerConfig handlerConfig, InputStream stream, Metadata metadata) { + ParseContext parseContext = new ParseContext(); //Intentionally do not add the metadata filter here! //We need to let stacktraces percolate RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler( - new BasicContentHandlerFactory(handlerConfig.getType(), handlerConfig.getWriteLimit()), + new BasicContentHandlerFactory(handlerConfig.getType(), + handlerConfig.getWriteLimit(), handlerConfig.isThrowOnWriteLimitReached(), parseContext), handlerConfig.getMaxEmbeddedResources()); - ParseContext parseContext = new ParseContext(); + long start = System.currentTimeMillis(); preParse(fetchEmitTuple, stream, metadata, parseContext); try { diff --git a/tika-core/src/main/java/org/apache/tika/pipes/pipesiterator/PipesIterator.java b/tika-core/src/main/java/org/apache/tika/pipes/pipesiterator/PipesIterator.java index 98b766ce7..34706f7e8 100644 --- a/tika-core/src/main/java/org/apache/tika/pipes/pipesiterator/PipesIterator.java +++ b/tika-core/src/main/java/org/apache/tika/pipes/pipesiterator/PipesIterator.java @@ -73,6 +73,7 @@ public abstract class PipesIterator extends ConfigBase private HandlerConfig.PARSE_MODE parseMode = HandlerConfig.PARSE_MODE.RMETA; + private boolean throwOnWriteLimitReached = false; private int writeLimit = -1; private int maxEmbeddedResources = -1; @@ -146,6 +147,11 @@ public abstract class PipesIterator extends ConfigBase this.writeLimit = writeLimit; } + @Field + public void setThrowOnWriteLimitReached(boolean throwOnWriteLimitReached) { + this.throwOnWriteLimitReached = throwOnWriteLimitReached; + } + @Field public void setMaxEmbeddedResources(int maxEmbeddedResources) { this.maxEmbeddedResources = maxEmbeddedResources; @@ -156,8 +162,8 @@ public abstract class PipesIterator extends ConfigBase setParseMode(HandlerConfig.PARSE_MODE.parseMode(parseModeString)); } - public void setParseMode(HandlerConfig.PARSE_MODE parsePARSEMode) { - this.parseMode = parsePARSEMode; + public void setParseMode(HandlerConfig.PARSE_MODE parseMode) { + this.parseMode = parseMode; } public Integer call() throws Exception { @@ -168,7 +174,8 @@ public abstract class PipesIterator extends ConfigBase protected HandlerConfig getHandlerConfig() { //TODO: make throwOnWriteLimitReached configurable - return new HandlerConfig(handlerType, parseMode, writeLimit, maxEmbeddedResources, false); + return new HandlerConfig(handlerType, parseMode, writeLimit, maxEmbeddedResources, + throwOnWriteLimitReached); } protected abstract void enqueue() throws IOException, TimeoutException, InterruptedException; diff --git a/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java b/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java index 9de0d4071..d10e7adf5 100644 --- a/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java +++ b/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java @@ -41,6 +41,7 @@ public class BasicContentHandlerFactory implements ContentHandlerFactory, WriteL private final ParseContext parseContext; /** + * Create a BasicContentHandlerFactory with {@link #throwOnWriteLimitReached} is true * @param type basic type of handler * @param writeLimit max number of characters to store; if < 0, * the handler will store all characters