This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new edd9398  TIKA-3382 -- improve writelimit handling throughout various parsers
edd9398 is described below

commit edd939864216c635373c252c86576f9a2a66c4ab
Author: tballison <[email protected]>
AuthorDate: Tue May 4 15:17:48 2021 -0400

    TIKA-3382 -- improve writelimit handling throughout various parsers
---
 .../apache/tika/exception/RuntimeSAXException.java |  30 ++
 .../tika/exception/WriteLimitReachedException.java |  31 +-
 .../org/apache/tika/parser/CompositeParser.java    |   2 +
 .../apache/tika/parser/RecursiveParserWrapper.java |  35 +--
 .../apache/tika/sax/WriteOutContentHandler.java    |  11 +-
 .../apache/tika/parser/asm/XHTMLClassVisitor.java  |  17 +-
 .../org/apache/tika/parser/crypto/Pkcs7Parser.java |   2 +-
 .../org/apache/tika/parser/crypto/TSDParser.java   |   7 +-
 .../tika/parser/microsoft/JackcessExtractor.java   |   2 +
 .../tika/parser/microsoft/chm/ChmParser.java       |   4 +-
 .../microsoft/ooxml/AbstractOOXMLExtractor.java    |  10 +-
 .../microsoft/ooxml/OOXMLExtractorFactory.java     |   3 +
 .../microsoft/ooxml/OOXMLTikaBodyPartHandler.java  | 336 ++++++++-------------
 .../ooxml/OOXMLWordAndPowerPointTextHandler.java   |  54 ++--
 .../ooxml/XSSFExcelExtractorDecorator.java         |   6 +-
 .../ooxml/xwpf/XWPFEventBasedWordExtractor.java    |   8 +
 .../microsoft/xml/AbstractXML2003Parser.java       |   3 +-
 .../apache/tika/parser/odf/OpenDocumentParser.java |   2 +
 .../tika/parser/RecursiveParserWrapperTest.java    |   3 +-
 .../classic/RecursiveMetadataResourceTest.java     |   2 +-
 .../tika/server/core/resource/TikaResource.java    |   4 +-
 .../server/core/RecursiveMetadataResourceTest.java |  64 ++++
 .../apache/tika/server/core/TikaResourceTest.java  | 105 +++++--
 23 files changed, 421 insertions(+), 320 deletions(-)

diff --git 
a/tika-core/src/main/java/org/apache/tika/exception/RuntimeSAXException.java 
b/tika-core/src/main/java/org/apache/tika/exception/RuntimeSAXException.java
new file mode 100644
index 0000000..4e0bc43
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/exception/RuntimeSAXException.java
@@ -0,0 +1,30 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.exception;
+
+import org.xml.sax.SAXException;
+
+/**
+ * Use this to throw a SAXException in subclassed methods that don't throw SAXExceptions.
+ */
+public class RuntimeSAXException extends RuntimeException {
+
+    public RuntimeSAXException(SAXException t) {
+        super(t);
+    }
+
+}
diff --git 
a/tika-core/src/main/java/org/apache/tika/exception/WriteLimitReachedException.java
 
b/tika-core/src/main/java/org/apache/tika/exception/WriteLimitReachedException.java
index 5bf454f..fe0621e 100644
--- 
a/tika-core/src/main/java/org/apache/tika/exception/WriteLimitReachedException.java
+++ 
b/tika-core/src/main/java/org/apache/tika/exception/WriteLimitReachedException.java
@@ -23,10 +23,19 @@ public class WriteLimitReachedException extends 
SAXException {
     //in case of (hopefully impossible) cyclic exception
     private final static int MAX_DEPTH = 100;
 
-    public WriteLimitReachedException(String msg) {
-        super(msg);
+    private final int writeLimit;
+    public WriteLimitReachedException(int writeLimit) {
+        this.writeLimit = writeLimit;
     }
 
+    @Override
+    public String getMessage() {
+        return "Your document contained more than " + writeLimit
+                + " characters, and so your requested limit has been"
+                + " reached. To receive the full text of the document,"
+                + " increase your limit. (Text up to the limit is"
+                + " however available).";
+    }
     /**
      * Checks whether the given exception (or any of it's root causes) was
      * thrown by this handler as a signal of reaching the write limit.
@@ -53,4 +62,22 @@ public class WriteLimitReachedException extends SAXException 
{
             return t.getCause() != null && isWriteLimitReached(t.getCause(), 
depth + 1);
         }
     }
+
+    /**
+     * Rethrows the {@link WriteLimitReachedException} found in the given
+     * exception or anywhere in its cause chain. Returns normally when no
+     * such exception is present.
+     *
+     * @param ex exception to inspect; may be {@code null}
+     * @throws SAXException the WriteLimitReachedException located in the chain
+     */
+    public static void throwIfWriteLimitReached(Exception ex) throws SAXException {
+        throwIfWriteLimitReached(ex, 0);
+    }
+
+    /**
+     * Walks the cause chain looking for a WriteLimitReachedException and
+     * gives up after {@code MAX_DEPTH} levels in case the chain is cyclic.
+     *
+     * @param ex    current link of the cause chain; may be {@code null}
+     * @param depth number of links already followed
+     * @throws SAXException the WriteLimitReachedException, if one is found
+     */
+    private static void throwIfWriteLimitReached(Throwable ex, int depth) throws SAXException {
+        if (ex == null || depth > MAX_DEPTH) {
+            return;
+        }
+        if (ex instanceof WriteLimitReachedException) {
+            throw (SAXException) ex;
+        }
+        // Recurse with the throwing variant. Calling isWriteLimitReached(...)
+        // here only computes a boolean that would be discarded, so a wrapped
+        // write-limit exception would never be rethrown.
+        throwIfWriteLimitReached(ex.getCause(), depth + 1);
+    }
 }
diff --git 
a/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java 
b/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java
index 72e7dde..b838cc8 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java
@@ -31,6 +31,7 @@ import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 
 import org.apache.tika.exception.TikaException;
+import org.apache.tika.exception.WriteLimitReachedException;
 import org.apache.tika.io.TemporaryResources;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
@@ -293,6 +294,7 @@ public class CompositeParser extends AbstractParser {
                 taggedStream.throwIfCauseOf(e);
                 throw new TikaException("TIKA-198: Illegal IOException from " 
+ parser, e);
             } catch (SAXException e) {
+                WriteLimitReachedException.throwIfWriteLimitReached(e);
                 if (taggedHandler != null) {
                     taggedHandler.throwIfCauseOf(e);
                 }
diff --git 
a/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java 
b/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
index c98c8fb..ca09477 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
@@ -155,18 +155,14 @@ public class RecursiveParserWrapper extends 
ParserDecorator {
                     new RecursivelySecureContentHandler(localHandler, tis, 
writeLimit);
             context.set(RecursivelySecureContentHandler.class, 
secureContentHandler);
             getWrappedParser().parse(tis, secureContentHandler, metadata, 
context);
-        } catch (SAXException e) {
-            boolean wlr = WriteLimitReachedException.isWriteLimitReached(e);
-            if (wlr == false) {
+        } catch (Throwable e) {
+            if (WriteLimitReachedException.isWriteLimitReached(e)) {
+                metadata.set(TikaCoreProperties.WRITE_LIMIT_REACHED, "true");
+            } else {
+                String stackTrace = ExceptionUtils.getFilteredStackTrace(e);
+                metadata.add(TikaCoreProperties.CONTAINER_EXCEPTION, 
stackTrace);
                 throw e;
             }
-            metadata.set(TikaCoreProperties.WRITE_LIMIT_REACHED, "true");
-        } catch (Throwable e) {
-            //try our best to record the problem in the metadata object
-            //then rethrow
-            String stackTrace = ExceptionUtils.getFilteredStackTrace(e);
-            metadata.add(TikaCoreProperties.CONTAINER_EXCEPTION, stackTrace);
-            throw e;
         } finally {
             tmp.dispose();
             long elapsedMillis = System.currentTimeMillis() - started;
@@ -240,9 +236,9 @@ public class RecursiveParserWrapper extends ParserDecorator 
{
             try {
                 super.parse(stream, secureContentHandler, metadata, context);
             } catch (SAXException e) {
-                boolean wlr = 
WriteLimitReachedException.isWriteLimitReached(e);
-                if (wlr == true) {
+                if (WriteLimitReachedException.isWriteLimitReached(e)) {
                     metadata.add(TikaCoreProperties.WRITE_LIMIT_REACHED, 
"true");
+                    throw e;
                 } else {
                     if (catchEmbeddedExceptions) {
                         ParserUtils.recordParserFailure(this, e, metadata);
@@ -339,13 +335,7 @@ public class RecursiveParserWrapper extends 
ParserDecorator {
             int availableLength = Math.min(totalWriteLimit - totalChars, 
length);
             super.characters(ch, start, availableLength);
             if (availableLength < length) {
-                throw new WriteLimitReachedException(
-                        "Your document contained more than " + totalWriteLimit 
+
-                                " characters, and so your requested limit has 
been" +
-                                " reached. To receive the full text of the 
document," +
-                                " increase your limit. (Text up to the limit 
is" +
-                                " however available)."
-                );
+                throw new WriteLimitReachedException(totalWriteLimit);
             }
         }
 
@@ -358,12 +348,7 @@ public class RecursiveParserWrapper extends 
ParserDecorator {
             int availableLength = Math.min(totalWriteLimit - totalChars, 
length);
             super.ignorableWhitespace(ch, start, availableLength);
             if (availableLength < length) {
-                throw new WriteLimitReachedException("Your document contained 
more than "
-                        + totalWriteLimit +
-                        " characters, and so your requested limit has been" +
-                        " reached. To receive the full text of the document," +
-                        " increase your limit. (Text up to the limit is" + " 
however available)."
-                );
+                throw new WriteLimitReachedException(totalWriteLimit);
             }
         }
     }
diff --git 
a/tika-core/src/main/java/org/apache/tika/sax/WriteOutContentHandler.java 
b/tika-core/src/main/java/org/apache/tika/sax/WriteOutContentHandler.java
index 920afaf..2704d4c 100644
--- a/tika-core/src/main/java/org/apache/tika/sax/WriteOutContentHandler.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/WriteOutContentHandler.java
@@ -134,10 +134,7 @@ public class WriteOutContentHandler extends 
ContentHandlerDecorator {
         } else {
             super.characters(ch, start, writeLimit - writeCount);
             writeCount = writeLimit;
-            throw new WriteLimitReachedException("Your document contained more 
than " + writeLimit +
-                    " characters, and so your requested limit has been" +
-                    " reached. To receive the full text of the document," +
-                    " increase your limit. (Text up to the limit is" + " 
however available).");
+            throw new WriteLimitReachedException(writeLimit);
         }
     }
 
@@ -149,11 +146,7 @@ public class WriteOutContentHandler extends 
ContentHandlerDecorator {
         } else {
             super.ignorableWhitespace(ch, start, writeLimit - writeCount);
             writeCount = writeLimit;
-            throw new WriteLimitReachedException("Your document contained more 
than "
-                    + writeLimit +
-                    " characters, and so your requested limit has been" +
-                    " reached. To receive the full text of the document," +
-                    " increase your limit. (Text up to the limit is however 
available).");
+            throw new WriteLimitReachedException(writeLimit);
         }
     }
 }
diff --git 
a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/asm/XHTMLClassVisitor.java
 
b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/asm/XHTMLClassVisitor.java
index 199d5ca..d55528d 100644
--- 
a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/asm/XHTMLClassVisitor.java
+++ 
b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/asm/XHTMLClassVisitor.java
@@ -30,7 +30,9 @@ import org.objectweb.asm.Type;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 
+import org.apache.tika.exception.RuntimeSAXException;
 import org.apache.tika.exception.TikaException;
+import org.apache.tika.exception.WriteLimitReachedException;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.sax.XHTMLContentHandler;
@@ -64,11 +66,8 @@ class XHTMLClassVisitor extends ClassVisitor {
             ClassReader reader = new ClassReader(stream);
             reader.accept(this, ClassReader.SKIP_FRAMES | 
ClassReader.SKIP_CODE);
         } catch (RuntimeException e) {
-            if (e.getCause() instanceof SAXException) {
-                throw (SAXException) e.getCause();
-            } else {
-                throw new TikaException("Failed to parse a Java class", e);
-            }
+            WriteLimitReachedException.throwIfWriteLimitReached(e);
+            throw new TikaException("Failed to parse a Java class", e);
         }
     }
 
@@ -125,7 +124,7 @@ class XHTMLClassVisitor extends ClassVisitor {
             }
             xhtml.characters("{\n");
         } catch (SAXException e) {
-            throw new RuntimeException(e);
+            throw new RuntimeSAXException(e);
         }
     }
 
@@ -148,7 +147,7 @@ class XHTMLClassVisitor extends ClassVisitor {
             xhtml.endElement("pre");
             xhtml.endDocument();
         } catch (SAXException e) {
-            throw new RuntimeException(e);
+            throw new RuntimeSAXException(e);
         }
     }
 
@@ -204,7 +203,7 @@ class XHTMLClassVisitor extends ClassVisitor {
                 writeSemicolon();
                 writeNewline();
             } catch (SAXException e) {
-                throw new RuntimeException(e);
+                throw new RuntimeSAXException(e);
             }
         }
 
@@ -251,7 +250,7 @@ class XHTMLClassVisitor extends ClassVisitor {
                 writeSemicolon();
                 writeNewline();
             } catch (SAXException e) {
-                throw new RuntimeException(e);
+                throw new RuntimeSAXException(e);
             }
         }
 
diff --git 
a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-crypto-module/src/main/java/org/apache/tika/parser/crypto/Pkcs7Parser.java
 
b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-crypto-module/src/main/java/org/apache/tika/parser/crypto/Pkcs7Parser.java
index 64babca..7e42be8 100644
--- 
a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-crypto-module/src/main/java/org/apache/tika/parser/crypto/Pkcs7Parser.java
+++ 
b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-crypto-module/src/main/java/org/apache/tika/parser/crypto/Pkcs7Parser.java
@@ -74,7 +74,7 @@ public class Pkcs7Parser extends AbstractParser {
                 }
                 try (InputStream input = content.getContentStream()) {
                     Parser delegate = context.get(Parser.class, 
EmptyParser.INSTANCE);
-                    delegate.parse(input, handler, metadata, context);
+                    delegate.parse(input, handler, new Metadata(), context);
                 }
             } finally {
                 parser.close();
diff --git 
a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-crypto-module/src/main/java/org/apache/tika/parser/crypto/TSDParser.java
 
b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-crypto-module/src/main/java/org/apache/tika/parser/crypto/TSDParser.java
index 856bf40..7b1d44f 100644
--- 
a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-crypto-module/src/main/java/org/apache/tika/parser/crypto/TSDParser.java
+++ 
b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-crypto-module/src/main/java/org/apache/tika/parser/crypto/TSDParser.java
@@ -51,6 +51,7 @@ import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 
 import org.apache.tika.exception.TikaException;
+import org.apache.tika.exception.WriteLimitReachedException;
 import org.apache.tika.extractor.EmbeddedDocumentExtractor;
 import org.apache.tika.extractor.EmbeddedDocumentUtil;
 import org.apache.tika.io.TikaInputStream;
@@ -109,7 +110,7 @@ public class TSDParser extends AbstractParser {
         }
     }
 
-    private List<TSDMetas> extractMetas(InputStream stream) {
+    private List<TSDMetas> extractMetas(InputStream stream) throws 
SAXException {
         List<TSDMetas> tsdMetasList = new ArrayList<>();
 
         try {
@@ -130,6 +131,7 @@ public class TSDParser extends AbstractParser {
         } catch (SecurityException e) {
             throw e;
         } catch (Exception ex) {
+            WriteLimitReachedException.throwIfWriteLimitReached(ex);
             LOG.error("Error in TSDParser.buildMetas {}", ex.getMessage());
             tsdMetasList.clear();
         }
@@ -160,7 +162,7 @@ public class TSDParser extends AbstractParser {
     }
 
     private void parseTSDContent(InputStream stream, ContentHandler handler, 
Metadata metadata,
-                                 ParseContext context) {
+                                 ParseContext context) throws SAXException {
 
         CMSTimeStampedDataParser cmsTimeStampedDataParser = null;
         EmbeddedDocumentExtractor edx = 
EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
@@ -176,6 +178,7 @@ public class TSDParser extends AbstractParser {
             } catch (SecurityException e) {
                 throw e;
             } catch (Exception ex) {
+                WriteLimitReachedException.throwIfWriteLimitReached(ex);
                 LOG.error("Error in TSDParser.parseTSDContent {}", 
ex.getMessage());
             } finally {
                 this.closeCMSParser(cmsTimeStampedDataParser);
diff --git 
a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java
 
b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java
index 059027d..ff258c9 100644
--- 
a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java
+++ 
b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java
@@ -46,6 +46,7 @@ import org.apache.poi.poifs.filesystem.POIFSFileSystem;
 import org.xml.sax.SAXException;
 
 import org.apache.tika.exception.TikaException;
+import org.apache.tika.exception.WriteLimitReachedException;
 import org.apache.tika.extractor.EmbeddedDocumentUtil;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
@@ -224,6 +225,7 @@ class JackcessExtractor extends AbstractPOIFSExtractor {
                             .parse(new 
ByteArrayInputStream(v.getBytes(UTF_8)), h, m, parseContext);
                     handler.characters(h.toString());
                 } catch (SAXException e) {
+                    WriteLimitReachedException.throwIfWriteLimitReached(e);
                     //if something went wrong in htmlparser, just append the 
characters
                     handler.characters(v);
                 }
diff --git 
a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/chm/ChmParser.java
 
b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/chm/ChmParser.java
index 33e7d0c..56a9cad 100644
--- 
a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/chm/ChmParser.java
+++ 
b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/chm/ChmParser.java
@@ -94,15 +94,13 @@ public class ChmParser extends AbstractParser {
 
 
     private void parsePage(byte[] byteObject, Parser htmlParser, 
ContentHandler xhtml,
-                           ParseContext context) throws TikaException { // 
throws IOException
+                           ParseContext context) throws TikaException, 
SAXException { // throws IOException
         InputStream stream = null;
         Metadata metadata = new Metadata();
         ContentHandler handler = new EmbeddedContentHandler(new 
BodyContentHandler(xhtml));// -1
         try {
             stream = new ByteArrayInputStream(byteObject);
             htmlParser.parse(stream, handler, metadata, context);
-        } catch (SAXException e) {
-            throw new RuntimeException(e);
         } catch (IOException e) {
             // Pushback overflow from tagsoup
         }
diff --git 
a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
 
b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
index fc332a9..cd6c6e5 100644
--- 
a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
+++ 
b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
@@ -52,6 +52,7 @@ import org.xml.sax.SAXException;
 import org.xml.sax.helpers.AttributesImpl;
 
 import org.apache.tika.exception.TikaException;
+import org.apache.tika.exception.WriteLimitReachedException;
 import org.apache.tika.extractor.EmbeddedDocumentExtractor;
 import org.apache.tika.extractor.EmbeddedDocumentUtil;
 import org.apache.tika.io.TikaInputStream;
@@ -142,7 +143,7 @@ public abstract class AbstractOOXMLExtractor implements 
OOXMLExtractor {
         handleEmbeddedParts(xhtml, metadata);
 
         // thumbnail
-        handleThumbnail(xhtml);
+        handleThumbnail(xhtml, metadata);
 
         xhtml.endDocument();
     }
@@ -160,7 +161,7 @@ public abstract class AbstractOOXMLExtractor implements 
OOXMLExtractor {
         return desc;
     }
 
-    private void handleThumbnail(ContentHandler handler) {
+    private void handleThumbnail(ContentHandler handler, Metadata metadata) 
throws SAXException {
         try {
             OPCPackage opcPackage = extractor.getPackage();
             for (PackageRelationship rel : opcPackage
@@ -193,7 +194,10 @@ public abstract class AbstractOOXMLExtractor implements 
OOXMLExtractor {
         } catch (SecurityException e) {
             throw e;
         } catch (Exception ex) {
-            //swallow
+            WriteLimitReachedException.throwIfWriteLimitReached(ex);
+            //swallow otherwise
+            metadata.add(TikaCoreProperties.EMBEDDED_EXCEPTION,
+                    ExceptionUtils.getStackTrace(ex));
         }
     }
 
diff --git 
a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
 
b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
index 154efa2..180899e 100644
--- 
a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
+++ 
b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
@@ -51,6 +51,7 @@ import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 
 import org.apache.tika.detect.microsoft.ooxml.OPCPackageDetector;
+import org.apache.tika.exception.RuntimeSAXException;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
@@ -234,6 +235,8 @@ public class OOXMLExtractorFactory {
             throw new TikaException("Error creating OOXML extractor", e);
         } catch (XmlException e) {
             throw new TikaException("Error creating OOXML extractor", e);
+        } catch (RuntimeSAXException e) {
+            throw(SAXException) e.getCause();
         } finally {
             if (tmpRepairedCopy != null) {
                 if (pkg != null) {
diff --git 
a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLTikaBodyPartHandler.java
 
b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLTikaBodyPartHandler.java
index 9471237..1de5de6 100644
--- 
a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLTikaBodyPartHandler.java
+++ 
b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLTikaBodyPartHandler.java
@@ -82,115 +82,98 @@ public class OOXMLTikaBodyPartHandler
     }
 
     @Override
-    public void run(RunProperties runProperties, String contents) {
-        try {
-
-            // True if we are currently in the named style tag:
-            if (runProperties.isBold() != isBold) {
-                if (isStrikeThrough) {
-                    xhtml.endElement("strike");
-                    isStrikeThrough = false;
-                }
-                if (isUnderline) {
-                    xhtml.endElement("u");
-                    isUnderline = false;
-                    ;
-                }
-                if (isItalics) {
-                    xhtml.endElement("i");
-                    isItalics = false;
-                }
-                if (runProperties.isBold()) {
-                    xhtml.startElement("b");
-                } else {
-                    xhtml.endElement("b");
-                }
-                isBold = runProperties.isBold();
+    public void run(RunProperties runProperties, String contents) throws 
SAXException {
+
+        // True if we are currently in the named style tag:
+        if (runProperties.isBold() != isBold) {
+            if (isStrikeThrough) {
+                xhtml.endElement("strike");
+                isStrikeThrough = false;
+            }
+            if (isUnderline) {
+                xhtml.endElement("u");
+                isUnderline = false;
+            }
+            if (isItalics) {
+                xhtml.endElement("i");
+                isItalics = false;
             }
+            if (runProperties.isBold()) {
+                xhtml.startElement("b");
+            } else {
+                xhtml.endElement("b");
+            }
+            isBold = runProperties.isBold();
+        }
 
-            if (runProperties.isItalics() != isItalics) {
-                if (isStrikeThrough) {
-                    xhtml.endElement("strike");
-                    isStrikeThrough = false;
-                }
-                if (isUnderline) {
-                    xhtml.endElement("u");
-                    isUnderline = false;
-                }
-                if (runProperties.isItalics()) {
-                    xhtml.startElement("i");
-                } else {
-                    xhtml.endElement("i");
-                }
-                isItalics = runProperties.isItalics();
+        if (runProperties.isItalics() != isItalics) {
+            if (isStrikeThrough) {
+                xhtml.endElement("strike");
+                isStrikeThrough = false;
+            }
+            if (isUnderline) {
+                xhtml.endElement("u");
+                isUnderline = false;
+            }
+            if (runProperties.isItalics()) {
+                xhtml.startElement("i");
+            } else {
+                xhtml.endElement("i");
             }
+            isItalics = runProperties.isItalics();
+        }
 
-            if (runProperties.isStrikeThrough() != isStrikeThrough) {
-                if (isUnderline) {
-                    xhtml.endElement("u");
-                    isUnderline = false;
-                }
-                if (runProperties.isStrikeThrough()) {
-                    xhtml.startElement("strike");
-                } else {
-                    xhtml.endElement("strike");
-                }
-                isStrikeThrough = runProperties.isStrikeThrough();
+        if (runProperties.isStrikeThrough() != isStrikeThrough) {
+            if (isUnderline) {
+                xhtml.endElement("u");
+                isUnderline = false;
+            }
+            if (runProperties.isStrikeThrough()) {
+                xhtml.startElement("strike");
+            } else {
+                xhtml.endElement("strike");
             }
+            isStrikeThrough = runProperties.isStrikeThrough();
+        }
 
-            boolean runIsUnderlined = runProperties.getUnderline() != 
UnderlinePatterns.NONE;
-            if (runIsUnderlined != isUnderline) {
-                if (runIsUnderlined) {
-                    xhtml.startElement("u");
-                } else {
-                    xhtml.endElement("u");
-                }
-                isUnderline = runIsUnderlined;
+        boolean runIsUnderlined = runProperties.getUnderline() != 
UnderlinePatterns.NONE;
+        if (runIsUnderlined != isUnderline) {
+            if (runIsUnderlined) {
+                xhtml.startElement("u");
+            } else {
+                xhtml.endElement("u");
             }
+            isUnderline = runIsUnderlined;
+        }
 
-            xhtml.characters(contents);
+        xhtml.characters(contents);
 
-        } catch (SAXException e) {
-            //swallow
-        }
     }
 
     @Override
-    public void hyperlinkStart(String link) {
-        try {
-            if (link != null) {
-                xhtml.startElement("a", "href", link);
-                wroteHyperlinkStart = true;
-            }
-        } catch (SAXException e) {
-            //swallow
+    public void hyperlinkStart(String link) throws SAXException {
+        if (link != null) {
+            xhtml.startElement("a", "href", link);
+            wroteHyperlinkStart = true;
         }
     }
 
     @Override
-    public void hyperlinkEnd() {
-        try {
-            if (wroteHyperlinkStart) {
-                closeStyleTags();
-                wroteHyperlinkStart = false;
-                xhtml.endElement("a");
-            }
-        } catch (SAXException e) {
-            //swallow
+    public void hyperlinkEnd() throws SAXException {
+        if (wroteHyperlinkStart) {
+            closeStyleTags();
+            wroteHyperlinkStart = false;
+            xhtml.endElement("a");
         }
     }
 
     @Override
-    public void startParagraph(ParagraphProperties paragraphProperties) {
+    public void startParagraph(ParagraphProperties paragraphProperties) throws 
SAXException {
 
         //if you're in a table cell and your after the first paragraph
         //make sure to prepend a \n
         if (tableCellDepth > 0 && pWithinCell > 0) {
-            try {
-                xhtml.characters(NEWLINE, 0, 1);
-            } catch (SAXException e) {
-                //swallow
-            }
+            xhtml.characters(NEWLINE, 0, 1);
         }
 
         if (pDepth == 0 && tableDepth == 0 && sdtDepth == 0) {
@@ -208,41 +191,30 @@ public class OOXMLTikaBodyPartHandler
             }
 
 
-            try {
-                if (styleClass == null) {
-                    xhtml.startElement(paragraphTag);
-                } else {
-                    xhtml.startElement(paragraphTag, "class", styleClass);
-                }
-            } catch (SAXException e) {
-                //swallow
+            if (styleClass == null) {
+                xhtml.startElement(paragraphTag);
+            } else {
+                xhtml.startElement(paragraphTag, "class", styleClass);
             }
         }
 
-        try {
-            writeParagraphNumber(paragraphProperties.getNumId(), 
paragraphProperties.getIlvl(),
-                    listManager, xhtml);
-        } catch (SAXException e) {
-            //swallow
-        }
+        writeParagraphNumber(paragraphProperties.getNumId(), 
paragraphProperties.getIlvl(),
+                listManager, xhtml);
         pDepth++;
     }
 
 
     @Override
-    public void endParagraph() {
-        try {
-            closeStyleTags();
-            if (pDepth == 1 && tableDepth == 0) {
-                xhtml.endElement(paragraphTag);
-            } else if (tableCellDepth > 0 && pWithinCell > 0) {
-                xhtml.characters(NEWLINE, 0, 1);
-            } else if (tableCellDepth == 0) {
-                xhtml.characters(NEWLINE, 0, 1);
-            }
-        } catch (SAXException e) {
-            //swallow
+    public void endParagraph() throws SAXException {
+        closeStyleTags();
+        if (pDepth == 1 && tableDepth == 0) {
+            xhtml.endElement(paragraphTag);
+        } else if (tableCellDepth > 0 && pWithinCell > 0) {
+            xhtml.characters(NEWLINE, 0, 1);
+        } else if (tableCellDepth == 0) {
+            xhtml.characters(NEWLINE, 0, 1);
         }
+
         if (tableCellDepth > 0) {
             pWithinCell++;
         }
@@ -250,72 +222,48 @@ public class OOXMLTikaBodyPartHandler
     }
 
     @Override
-    public void startTable() {
-        try {
-            xhtml.startElement("table");
-            tableDepth++;
-        } catch (SAXException e) {
-            //swallow
-        }
+    public void startTable() throws SAXException {
+
+        xhtml.startElement("table");
+        tableDepth++;
+
     }
 
     @Override
-    public void endTable() {
-        try {
-            xhtml.endElement("table");
-            tableDepth--;
-        } catch (SAXException e) {
-            //swallow
-        }
+    public void endTable() throws SAXException {
+
+        xhtml.endElement("table");
+        tableDepth--;
+
     }
 
     @Override
-    public void startTableRow() {
-        try {
-            xhtml.startElement("tr");
-        } catch (SAXException e) {
-            //swallow
-        }
+    public void startTableRow() throws SAXException {
+        xhtml.startElement("tr");
     }
 
     @Override
-    public void endTableRow() {
-        try {
-            xhtml.endElement("tr");
-        } catch (SAXException e) {
-            //swallow
-        }
+    public void endTableRow() throws SAXException {
+        xhtml.endElement("tr");
     }
 
     @Override
-    public void startTableCell() {
-        try {
-            xhtml.startElement("td");
-        } catch (SAXException e) {
-            //swallow
-        }
+    public void startTableCell() throws SAXException {
+        xhtml.startElement("td");
         tableCellDepth++;
     }
 
     @Override
-    public void endTableCell() {
-        try {
-            xhtml.endElement("td");
-        } catch (SAXException e) {
-            //swallow
-        }
+    public void endTableCell() throws SAXException {
+        xhtml.endElement("td");
         pWithinCell = 0;
         tableCellDepth--;
     }
 
     @Override
-    public void startSDT() {
-        try {
-            closeStyleTags();
-            sdtDepth++;
-        } catch (SAXException e) {
-            //swallow
-        }
+    public void startSDT() throws SAXException {
+        closeStyleTags();
+        sdtDepth++;
     }
 
     @Override
@@ -340,28 +288,20 @@ public class OOXMLTikaBodyPartHandler
     }
 
     @Override
-    public void footnoteReference(String id) {
+    public void footnoteReference(String id) throws SAXException {
         if (id != null) {
-            try {
-                xhtml.characters("[");
-                xhtml.characters(id);
-                xhtml.characters("]");
-            } catch (SAXException e) {
-                //swallow
-            }
+            xhtml.characters("[");
+            xhtml.characters(id);
+            xhtml.characters("]");
         }
     }
 
     @Override
-    public void endnoteReference(String id) {
+    public void endnoteReference(String id) throws SAXException {
         if (id != null) {
-            try {
-                xhtml.characters("[");
-                xhtml.characters(id);
-                xhtml.characters("]");
-            } catch (SAXException e) {
-                //swallow
-            }
+            xhtml.characters("[");
+            xhtml.characters(id);
+            xhtml.characters("]");
         }
     }
 
@@ -371,52 +311,40 @@ public class OOXMLTikaBodyPartHandler
     }
 
     @Override
-    public void embeddedOLERef(String relId) {
+    public void embeddedOLERef(String relId) throws SAXException {
         if (relId == null) {
             return;
         }
-        try {
-            AttributesImpl attributes = new AttributesImpl();
-            attributes.addAttribute("", "class", "class", "CDATA", "embedded");
-            attributes.addAttribute("", "id", "id", "CDATA", relId);
-            xhtml.startElement("div", attributes);
-            xhtml.endElement("div");
-
-        } catch (SAXException e) {
-            //swallow
-        }
+        AttributesImpl attributes = new AttributesImpl();
+        attributes.addAttribute("", "class", "class", "CDATA", "embedded");
+        attributes.addAttribute("", "id", "id", "CDATA", relId);
+        xhtml.startElement("div", attributes);
+        xhtml.endElement("div");
     }
 
     @Override
-    public void embeddedPicRef(String picFileName, String picDescription) {
+    public void embeddedPicRef(String picFileName, String picDescription) 
throws SAXException {
 
-        try {
-            AttributesImpl attr = new AttributesImpl();
-            if (picFileName != null) {
-                attr.addAttribute("", "src", "src", "CDATA", "embedded:" + 
picFileName);
-            }
-            if (picDescription != null) {
-                attr.addAttribute("", "alt", "alt", "CDATA", picDescription);
-            }
+        AttributesImpl attr = new AttributesImpl();
+        if (picFileName != null) {
+            attr.addAttribute("", "src", "src", "CDATA", "embedded:" + 
picFileName);
+        }
+        if (picDescription != null) {
+            attr.addAttribute("", "alt", "alt", "CDATA", picDescription);
+        }
+
+        xhtml.startElement("img", attr);
+        xhtml.endElement("img");
 
-            xhtml.startElement("img", attr);
-            xhtml.endElement("img");
 
-        } catch (SAXException e) {
-            //swallow
-        }
     }
 
     @Override
-    public void startBookmark(String id, String name) {
+    public void startBookmark(String id, String name) throws SAXException {
         //skip bookmarks within hyperlinks
         if (name != null && !wroteHyperlinkStart) {
-            try {
-                xhtml.startElement("a", "name", name);
-                xhtml.endElement("a");
-            } catch (SAXException e) {
-                //swallow
-            }
+            xhtml.startElement("a", "name", name);
+            xhtml.endElement("a");
         }
     }
 
diff --git 
a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLWordAndPowerPointTextHandler.java
 
b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLWordAndPowerPointTextHandler.java
index 71567fd..77d0887 100644
--- 
a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLWordAndPowerPointTextHandler.java
+++ 
b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLWordAndPowerPointTextHandler.java
@@ -337,7 +337,7 @@ public class OOXMLWordAndPowerPointTextHandler extends 
DefaultHandler {
 
     }
 
-    private void startEditedSection(EditType editType, Attributes atts) {
+    private void startEditedSection(EditType editType, Attributes atts) throws 
SAXException {
         String editAuthor = atts.getValue(W_NS, "author");
         String editDateString = atts.getValue(W_NS, "date");
         Date editDate = null;
@@ -436,7 +436,7 @@ public class OOXMLWordAndPowerPointTextHandler extends 
DefaultHandler {
         }
     }
 
-    private void handleEndOfRuby() {
+    private void handleEndOfRuby() throws SAXException {
         if (rubyBuffer.length() > 0) {
             if (concatenatePhoneticRuns) {
                 bodyContentsHandler.run(currRunProperties, " (" + 
rubyBuffer.toString() + ")");
@@ -445,7 +445,7 @@ public class OOXMLWordAndPowerPointTextHandler extends 
DefaultHandler {
         }
     }
 
-    private void handleEndOfRun() {
+    private void handleEndOfRun() throws SAXException {
         bodyContentsHandler.run(currRunProperties, runBuffer.toString());
         if (inHlinkClick) {
             bodyContentsHandler.hyperlinkEnd();
@@ -459,7 +459,7 @@ public class OOXMLWordAndPowerPointTextHandler extends 
DefaultHandler {
         currRunProperties.setUnderline(UnderlinePatterns.NONE.name());
     }
 
-    private void handlePict() {
+    private void handlePict() throws SAXException {
         String picFileName = null;
         if (picRId != null) {
             picFileName = linkedRelationships.get(picRId);
@@ -522,53 +522,53 @@ public class OOXMLWordAndPowerPointTextHandler extends 
DefaultHandler {
 
     public interface XWPFBodyContentsHandler {
 
-        void run(RunProperties runProperties, String contents);
+        void run(RunProperties runProperties, String contents) throws 
SAXException;
 
         /**
          * @param link the link; can be null
          */
-        void hyperlinkStart(String link);
+        void hyperlinkStart(String link) throws SAXException;
 
-        void hyperlinkEnd();
+        void hyperlinkEnd() throws SAXException;
 
-        void startParagraph(ParagraphProperties paragraphProperties);
+        void startParagraph(ParagraphProperties paragraphProperties) throws 
SAXException;
 
-        void endParagraph();
+        void endParagraph() throws SAXException;
 
-        void startTable();
+        void startTable() throws SAXException;
 
-        void endTable();
+        void endTable() throws SAXException;
 
-        void startTableRow();
+        void startTableRow() throws SAXException;
 
-        void endTableRow();
+        void endTableRow() throws SAXException;
 
-        void startTableCell();
+        void startTableCell() throws SAXException;
 
-        void endTableCell();
+        void endTableCell() throws SAXException;
 
-        void startSDT();
+        void startSDT() throws SAXException;
 
-        void endSDT();
+        void endSDT() throws SAXException;
 
-        void startEditedSection(String editor, Date date, EditType editType);
+        void startEditedSection(String editor, Date date, EditType editType) 
throws SAXException;
 
-        void endEditedSection();
+        void endEditedSection() throws SAXException;
 
-        boolean isIncludeDeletedText();
+        boolean isIncludeDeletedText() throws SAXException;
 
-        void footnoteReference(String id);
+        void footnoteReference(String id) throws SAXException;
 
-        void endnoteReference(String id);
+        void endnoteReference(String id) throws SAXException;
 
-        boolean isIncludeMoveFromText();
+        boolean isIncludeMoveFromText() throws SAXException;
 
-        void embeddedOLERef(String refId);
+        void embeddedOLERef(String refId) throws SAXException;
 
-        void embeddedPicRef(String picFileName, String picDescription);
+        void embeddedPicRef(String picFileName, String picDescription) throws 
SAXException;
 
-        void startBookmark(String id, String name);
+        void startBookmark(String id, String name) throws SAXException;
 
-        void endBookmark(String id);
+        void endBookmark(String id) throws SAXException;
     }
 }
diff --git 
a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
 
b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
index 968b312..4fbe36e 100644
--- 
a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
+++ 
b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
@@ -65,6 +65,7 @@ import org.xml.sax.Locator;
 import org.xml.sax.SAXException;
 import org.xml.sax.helpers.DefaultHandler;
 
+import org.apache.tika.exception.RuntimeSAXException;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaCoreProperties;
@@ -443,6 +444,7 @@ public class XSSFExcelExtractorDecorator extends 
AbstractOOXMLExtractor {
                 lastSeenCol = -1;
             } catch (SAXException e) {
                 //swallow
+                throw new RuntimeSAXException(e);
             }
 
         }
@@ -451,7 +453,7 @@ public class XSSFExcelExtractorDecorator extends 
AbstractOOXMLExtractor {
             try {
                 xhtml.endElement("tr");
             } catch (SAXException e) {
-                //swallow
+                throw new RuntimeSAXException(e);
             }
         }
 
@@ -485,7 +487,7 @@ public class XSSFExcelExtractorDecorator extends 
AbstractOOXMLExtractor {
 
                 xhtml.endElement("td");
             } catch (SAXException e) {
-                //swallow
+                throw new RuntimeSAXException(e);
             }
         }
 
diff --git 
a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java
 
b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java
index 1244d35..b782f2f 100644
--- 
a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java
+++ 
b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java
@@ -46,6 +46,8 @@ import org.xml.sax.InputSource;
 import org.xml.sax.SAXException;
 import org.xml.sax.XMLReader;
 
+import org.apache.tika.exception.RuntimeSAXException;
+import org.apache.tika.exception.WriteLimitReachedException;
 import 
org.apache.tika.parser.microsoft.ooxml.OOXMLWordAndPowerPointTextHandler;
 import org.apache.tika.parser.microsoft.ooxml.ParagraphProperties;
 import org.apache.tika.parser.microsoft.ooxml.RunProperties;
@@ -119,6 +121,9 @@ public class XWPFEventBasedWordExtractor extends 
POIXMLTextExtractor {
                 } catch (IOException e) {
                     LOG.warn("IOException handling document part", e);
                 } catch (SAXException e) {
+                    if (WriteLimitReachedException.isWriteLimitReached(e)) {
+                        throw new RuntimeSAXException(e);
+                    }
                     //swallow this because we don't actually call it
                     LOG.warn("SAXException handling document part", e);
                 }
@@ -135,6 +140,9 @@ public class XWPFEventBasedWordExtractor extends 
POIXMLTextExtractor {
                 } catch (IOException e) {
                     LOG.warn("IOException handling glossary document part", e);
                 } catch (SAXException e) {
+                    if (WriteLimitReachedException.isWriteLimitReached(e)) {
+                        throw new RuntimeSAXException(e);
+                    }
                     //swallow this because we don't actually call it
                     LOG.warn("SAXException handling glossary document part", 
e);
                 }
diff --git 
a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/xml/AbstractXML2003Parser.java
 
b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/xml/AbstractXML2003Parser.java
index b57d506..01b3380 100644
--- 
a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/xml/AbstractXML2003Parser.java
+++ 
b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/xml/AbstractXML2003Parser.java
@@ -26,6 +26,7 @@ import org.xml.sax.SAXException;
 import org.xml.sax.helpers.AttributesImpl;
 
 import org.apache.tika.exception.TikaException;
+import org.apache.tika.exception.WriteLimitReachedException;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.Office;
 import org.apache.tika.metadata.OfficeOpenXMLCore;
@@ -98,7 +99,7 @@ public abstract class AbstractXML2003Parser extends 
AbstractParser {
                     new OfflineContentHandler(new EmbeddedContentHandler(
                             getContentHandler(tagged, metadata, context))));
         } catch (SAXException e) {
-            tagged.throwIfCauseOf(e);
+            WriteLimitReachedException.throwIfWriteLimitReached(e);
             throw new TikaException("XML parse error", e);
         } finally {
             xhtml.endDocument();
diff --git 
a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java
 
b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java
index 15a0669..ba2f0c3 100644
--- 
a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java
+++ 
b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java
@@ -40,6 +40,7 @@ import org.xml.sax.helpers.DefaultHandler;
 import org.apache.tika.config.Field;
 import org.apache.tika.exception.EncryptedDocumentException;
 import org.apache.tika.exception.TikaException;
+import org.apache.tika.exception.WriteLimitReachedException;
 import org.apache.tika.extractor.EmbeddedDocumentUtil;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
@@ -205,6 +206,7 @@ public class OpenDocumentParser extends AbstractParser {
                 handleZipEntry(entry, zipStream, metadata, context, handler,
                         embeddedDocumentUtil);
             } catch (SAXException e) {
+                WriteLimitReachedException.throwIfWriteLimitReached(e);
                 if (e.getCause() instanceof EncryptedDocumentException) {
                     throw (EncryptedDocumentException)e.getCause();
                 } else {
diff --git 
a/tika-parsers/tika-parsers-classic/tika-parsers-classic-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
 
b/tika-parsers/tika-parsers-classic/tika-parsers-classic-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
index 9e3fc5b..fd6c4d6 100644
--- 
a/tika-parsers/tika-parsers-classic/tika-parsers-classic-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
+++ 
b/tika-parsers/tika-parsers-classic/tika-parsers-classic-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
@@ -107,8 +107,7 @@ public class RecursiveParserWrapperTest extends TikaTest {
                 wlr++;
             }
         }
-        assertEquals(1, wlr);
-
+        assertEquals(2, wlr);
     }
 
 
diff --git 
a/tika-server/tika-server-classic/src/test/java/org/apache/tika/server/classic/RecursiveMetadataResourceTest.java
 
b/tika-server/tika-server-classic/src/test/java/org/apache/tika/server/classic/RecursiveMetadataResourceTest.java
index dec8a99..bd4b621 100644
--- 
a/tika-server/tika-server-classic/src/test/java/org/apache/tika/server/classic/RecursiveMetadataResourceTest.java
+++ 
b/tika-server/tika-server-classic/src/test/java/org/apache/tika/server/classic/RecursiveMetadataResourceTest.java
@@ -355,7 +355,7 @@ public class RecursiveMetadataResourceTest extends 
CXFTestBase {
         // Check results
         reader = new InputStreamReader((InputStream) response.getEntity(), 
UTF_8);
         metadataList = JsonMetadataList.fromJson(reader);
-        assertEquals(12, metadataList.size());
+        assertEquals(10, metadataList.size());
         assertEquals("true", 
metadataList.get(6).get(TikaCoreProperties.WRITE_LIMIT_REACHED));
         assertContains("When in the Course of human events it becomes 
necessary for one people",
                 metadataList.get(6).get(TikaCoreProperties.TIKA_CONTENT));
diff --git 
a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java
 
b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java
index 1d52857..260da2d 100644
--- 
a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java
+++ 
b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java
@@ -349,7 +349,9 @@ public class TikaResource {
             logger.warn("{}: Encrypted document ({})", path, fileName, e);
             throw new TikaServerParseException(e);
         } catch (Exception e) {
-            logger.warn("{}: Text extraction failed ({})", path, fileName, e);
+            if (! WriteLimitReachedException.isWriteLimitReached(e)) {
+                logger.warn("{}: Text extraction failed ({})", path, fileName, 
e);
+            }
             throw new TikaServerParseException(e);
         } catch (OutOfMemoryError e) {
             logger.warn("{}: OOM ({})", path, fileName, e);
diff --git 
a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/RecursiveMetadataResourceTest.java
 
b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/RecursiveMetadataResourceTest.java
index 971e0a5..95a9ca6 100644
--- 
a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/RecursiveMetadataResourceTest.java
+++ 
b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/RecursiveMetadataResourceTest.java
@@ -73,5 +73,69 @@ public class RecursiveMetadataResourceTest extends 
CXFTestBase {
                 metadata.get(TikaCoreProperties.CONTAINER_EXCEPTION));
 
     }
+    /*
+    @Test
+    public void testWriteLimitInAll() throws Exception {
+        //specify your file directory here
+        Path testDocs = 
Paths.get("..../tika-parsers/src/test/resources/test-documents");
+        for (File f : testDocs.toFile().listFiles()) {
+            if (f.isDirectory()) {
+                continue;
+            }
+            System.out.println(f.getName());
+            testWriteLimit(f);
+        }
+    }
+    private void testWriteLimit(File f) throws Exception {
+        Response response = WebClient.create(endPoint + 
META_PATH+"/text").accept(
+                "application/json")
+                .put(f);
+        assertEquals(200, response.getStatus());
+        Reader reader = new InputStreamReader((InputStream) 
response.getEntity(), UTF_8);
+        List<Metadata> metadataList = JsonMetadataList.fromJson(reader);
+        int totalLen = 0;
+        StringBuilder sb = new StringBuilder();
+        for (Metadata m : metadataList) {
+            String txt = 
m.get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT);
+            sb.append(txt);
+            totalLen += (txt == null) ? 0 : txt.length();
+        }
+        String fullText = sb.toString();
+        Random r = new Random();
+        for (int i = 0; i < 20; i++) {
+            int writeLimit = r.nextInt(totalLen+100);
+            response = WebClient.create(endPoint + META_PATH+"/text").accept(
+                    "application/json")
+                    .header("writeLimit", Integer.toString(writeLimit)).put(f);
+            assertEquals(200, response.getStatus());
+            reader = new InputStreamReader((InputStream) response.getEntity(), 
UTF_8);
+            List<Metadata> writeLimitMetadataList = 
JsonMetadataList.fromJson(reader);
+            int len = 0;
+            StringBuilder extracted = new StringBuilder();
+            for (Metadata m : writeLimitMetadataList) {
+                String txt = 
m.get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT);
+                len += (txt == null) ? 0 : txt.length();
+                extracted.append(txt);
+            }
+            if (totalLen > len) {
+                boolean wlr = false;
+                for (Metadata m : writeLimitMetadataList) {
+                    if 
("true".equals(m.get(AbstractRecursiveParserWrapperHandler.WRITE_LIMIT_REACHED)))
 {
+                        wlr = true;
+                    }
+                }
+                System.out.println(f.getName() + " actualLen:" + len + " : 
writeLimit: "
+                        + writeLimit + " : totalLen: "+totalLen);
+                assertTrue(f.getName() + ": writelimit: " + writeLimit + " 
len: "+len,
+                        len <= writeLimit);
+                assertEquals(f.getName() +" writeLimit: " + writeLimit +
+                                " : fullLen:" + totalLen + " limitedLen: " 
+len,
+                        true, wlr);
+            } else if (len > totalLen) {
+                fail("len should never be > totalLen "+len + "  : "+ totalLen);
+            }
+        }
+    }
+    */
 
 }
diff --git 
a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaResourceTest.java
 
b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaResourceTest.java
index e818dc7..79cea79 100644
--- 
a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaResourceTest.java
+++ 
b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaResourceTest.java
@@ -126,12 +126,11 @@ public class TikaResourceTest extends CXFTestBase {
 
     @Test
     public void testJson() throws Exception {
-        Response response = WebClient.create(endPoint + TIKA_PATH).accept(
-                "application/json")
+        Response response = WebClient.create(endPoint + 
TIKA_PATH).accept("application/json")
                 .put(ClassLoader.getSystemResourceAsStream(TEST_HELLO_WORLD));
-        Metadata metadata =
-                JsonMetadata.fromJson(new InputStreamReader(
-                        ((InputStream)response.getEntity()), 
StandardCharsets.UTF_8));
+        Metadata metadata = JsonMetadata.fromJson(
+                new InputStreamReader(((InputStream) response.getEntity()),
+                        StandardCharsets.UTF_8));
 
         assertEquals("Nikolai Lobachevsky", metadata.get("author"));
         assertEquals("application/mock+xml", 
metadata.get(Metadata.CONTENT_TYPE));
@@ -140,12 +139,11 @@ public class TikaResourceTest extends CXFTestBase {
 
     @Test
     public void testJsonNPE() throws Exception {
-        Response response = WebClient.create(endPoint + TIKA_PATH).accept(
-                "application/json")
+        Response response = WebClient.create(endPoint + 
TIKA_PATH).accept("application/json")
                 .put(ClassLoader.getSystemResourceAsStream(TEST_NULL_POINTER));
-        Metadata metadata =
-                JsonMetadata.fromJson(new InputStreamReader(
-                        ((InputStream)response.getEntity()), 
StandardCharsets.UTF_8));
+        Metadata metadata = JsonMetadata.fromJson(
+                new InputStreamReader(((InputStream) response.getEntity()),
+                        StandardCharsets.UTF_8));
 
         assertEquals("Nikolai Lobachevsky", metadata.get("author"));
         assertEquals("application/mock+xml", 
metadata.get(Metadata.CONTENT_TYPE));
@@ -156,32 +154,29 @@ public class TikaResourceTest extends CXFTestBase {
 
     @Test
     public void testJsonWriteLimit() throws Exception {
-        Response response = WebClient.create(endPoint + TIKA_PATH)
-                .header("writeLimit", "100")
+        Response response = WebClient.create(endPoint + 
TIKA_PATH).header("writeLimit", "100")
                 .accept("application/json")
                 
.put(ClassLoader.getSystemResourceAsStream(TEST_HELLO_WORLD_LONG));
-        Metadata metadata =
-                JsonMetadata.fromJson(new InputStreamReader(
-                        ((InputStream)response.getEntity()), 
StandardCharsets.UTF_8));
+        Metadata metadata = JsonMetadata.fromJson(
+                new InputStreamReader(((InputStream) response.getEntity()),
+                        StandardCharsets.UTF_8));
 
         assertEquals("Nikolai Lobachevsky", metadata.get("author"));
         assertEquals("application/mock+xml", 
metadata.get(Metadata.CONTENT_TYPE));
         assertContains("Hello world", 
metadata.get(TikaCoreProperties.TIKA_CONTENT));
         assertNotFound("dissolve", 
metadata.get(TikaCoreProperties.TIKA_CONTENT));
-        
assertTrue(metadata.get(TikaCoreProperties.CONTAINER_EXCEPTION).startsWith(
-                "org.apache.tika.exception.WriteLimitReachedException"
-        ));
+        assertTrue(metadata.get(TikaCoreProperties.CONTAINER_EXCEPTION)
+                
.startsWith("org.apache.tika.exception.WriteLimitReachedException"));
         assertEquals("true", 
metadata.get(TikaCoreProperties.WRITE_LIMIT_REACHED));
     }
 
     @Test
     public void testJsonHandlerType() throws Exception {
-        Response response = WebClient.create(endPoint + TIKA_PATH)
-                .accept("application/json")
+        Response response = WebClient.create(endPoint + 
TIKA_PATH).accept("application/json")
                 
.put(ClassLoader.getSystemResourceAsStream(TEST_HELLO_WORLD_LONG));
-        Metadata metadata =
-                JsonMetadata.fromJson(new InputStreamReader(
-                        ((InputStream)response.getEntity()), 
StandardCharsets.UTF_8));
+        Metadata metadata = JsonMetadata.fromJson(
+                new InputStreamReader(((InputStream) response.getEntity()),
+                        StandardCharsets.UTF_8));
 
         assertEquals("Nikolai Lobachevsky", metadata.get("author"));
         assertEquals("application/mock+xml", 
metadata.get(Metadata.CONTENT_TYPE));
@@ -189,16 +184,70 @@ public class TikaResourceTest extends CXFTestBase {
         //default is xhtml
         assertContains("<p>", metadata.get(TikaCoreProperties.TIKA_CONTENT));
 
-        response = WebClient.create(endPoint + TIKA_PATH + "/text")
-                .accept("application/json")
+        response = WebClient.create(endPoint + TIKA_PATH + 
"/text").accept("application/json")
                 
.put(ClassLoader.getSystemResourceAsStream(TEST_HELLO_WORLD_LONG));
-        metadata =
-                JsonMetadata.fromJson(new InputStreamReader(
-                        ((InputStream)response.getEntity()), 
StandardCharsets.UTF_8));
+        metadata = JsonMetadata.fromJson(new InputStreamReader(((InputStream) 
response.getEntity()),
+                StandardCharsets.UTF_8));
 
         assertEquals("Nikolai Lobachevsky", metadata.get("author"));
         assertEquals("application/mock+xml", 
metadata.get(Metadata.CONTENT_TYPE));
         assertContains("Hello world", 
metadata.get(TikaCoreProperties.TIKA_CONTENT));
         assertNotFound("<p>", metadata.get(TikaCoreProperties.TIKA_CONTENT));
     }
+
+    /*
+    @Test
+    public void testWriteLimitInAll() throws Exception {
+        //specify your file directory here
+        Path testDocs = 
Paths.get("..../tika-parsers/src/test/resources/test-documents");
+        for (File f : testDocs.toFile().listFiles()) {
+            if (f.isDirectory()) {
+                continue;
+            }
+            System.out.println(f.getName());
+            testWriteLimit(f);
+        }
+    }
+
+    private void testWriteLimit(File f) throws Exception {
+        Response response =
+                WebClient.create(endPoint + TIKA_PATH + "/text").accept("application/json").put(f);
+        assertEquals(200, response.getStatus());
+        Reader reader = new InputStreamReader((InputStream) response.getEntity(), UTF_8);
+        Metadata metadata = JsonMetadata.fromJson(reader);
+        int totalLen = 0;
+        StringBuilder sb = new StringBuilder();
+        String txt = metadata.get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT);
+        sb.append(txt);
+        totalLen += (txt == null) ? 0 : txt.length();
+        String fullText = sb.toString();
+        //        System.out.println(fullText);
+        Random r = new Random();
+        for (int i = 0; i < 20; i++) {
+            int writeLimit = r.nextInt(totalLen + 100);
+            response = WebClient.create(endPoint + TIKA_PATH + 
"/text").accept("application/json")
+                    .header("writeLimit", Integer.toString(writeLimit)).put(f);
+            assertEquals(200, response.getStatus());
+            reader = new InputStreamReader((InputStream) response.getEntity(), 
UTF_8);
+            Metadata writeLimitMetadata = JsonMetadata.fromJson(reader);
+            int len = 0;
+            StringBuilder extracted = new StringBuilder();
+            txt = 
writeLimitMetadata.get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT);
+            len += (txt == null) ? 0 : txt.length();
+            extracted.append(txt);
+            if (totalLen > len) {
+                boolean wlr = "true".equals(writeLimitMetadata
+                        
.get(AbstractRecursiveParserWrapperHandler.WRITE_LIMIT_REACHED));
+                System.out.println(f.getName() + " " + len + " : " + writeLimit);
+                assertTrue(f.getName() + ": writelimit: " + writeLimit + " len: " + len,
+                        len <= writeLimit);
+                assertEquals(
+                        f.getName() + " : " + writeLimit + " : " + len + " total len: " + totalLen,
+                        true, wlr);
+            } else if (len > totalLen) {
+                fail("len should never be > totalLen " + len + "  : " + totalLen);
+            }
+        }
+    }*/
+
 }

Reply via email to