This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new edd9398 TIKA-3382 -- improve writelimit handling throughout various
parsers
edd9398 is described below
commit edd939864216c635373c252c86576f9a2a66c4ab
Author: tballison <[email protected]>
AuthorDate: Tue May 4 15:17:48 2021 -0400
TIKA-3382 -- improve writelimit handling throughout various parsers
---
.../apache/tika/exception/RuntimeSAXException.java | 30 ++
.../tika/exception/WriteLimitReachedException.java | 31 +-
.../org/apache/tika/parser/CompositeParser.java | 2 +
.../apache/tika/parser/RecursiveParserWrapper.java | 35 +--
.../apache/tika/sax/WriteOutContentHandler.java | 11 +-
.../apache/tika/parser/asm/XHTMLClassVisitor.java | 17 +-
.../org/apache/tika/parser/crypto/Pkcs7Parser.java | 2 +-
.../org/apache/tika/parser/crypto/TSDParser.java | 7 +-
.../tika/parser/microsoft/JackcessExtractor.java | 2 +
.../tika/parser/microsoft/chm/ChmParser.java | 4 +-
.../microsoft/ooxml/AbstractOOXMLExtractor.java | 10 +-
.../microsoft/ooxml/OOXMLExtractorFactory.java | 3 +
.../microsoft/ooxml/OOXMLTikaBodyPartHandler.java | 336 ++++++++-------------
.../ooxml/OOXMLWordAndPowerPointTextHandler.java | 54 ++--
.../ooxml/XSSFExcelExtractorDecorator.java | 6 +-
.../ooxml/xwpf/XWPFEventBasedWordExtractor.java | 8 +
.../microsoft/xml/AbstractXML2003Parser.java | 3 +-
.../apache/tika/parser/odf/OpenDocumentParser.java | 2 +
.../tika/parser/RecursiveParserWrapperTest.java | 3 +-
.../classic/RecursiveMetadataResourceTest.java | 2 +-
.../tika/server/core/resource/TikaResource.java | 4 +-
.../server/core/RecursiveMetadataResourceTest.java | 64 ++++
.../apache/tika/server/core/TikaResourceTest.java | 105 +++++--
23 files changed, 421 insertions(+), 320 deletions(-)
diff --git
a/tika-core/src/main/java/org/apache/tika/exception/RuntimeSAXException.java
b/tika-core/src/main/java/org/apache/tika/exception/RuntimeSAXException.java
new file mode 100644
index 0000000..4e0bc43
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/exception/RuntimeSAXException.java
@@ -0,0 +1,30 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.exception;
+
+import org.xml.sax.SAXException;
+
+/**
+ * Unchecked wrapper that lets a {@link SAXException} escape from overridden
+ * methods whose signatures don't declare SAXException.
+ * <p>
+ * The wrapped SAXException is handed to the RuntimeException constructor as the
+ * cause, so code that catches this exception can recover the original via
+ * {@link #getCause()}.
+ */
+public class RuntimeSAXException extends RuntimeException {
+
+ public RuntimeSAXException(SAXException t) {
+ super(t);
+ }
+
+}
diff --git
a/tika-core/src/main/java/org/apache/tika/exception/WriteLimitReachedException.java
b/tika-core/src/main/java/org/apache/tika/exception/WriteLimitReachedException.java
index 5bf454f..fe0621e 100644
---
a/tika-core/src/main/java/org/apache/tika/exception/WriteLimitReachedException.java
+++
b/tika-core/src/main/java/org/apache/tika/exception/WriteLimitReachedException.java
@@ -23,10 +23,19 @@ public class WriteLimitReachedException extends
SAXException {
//in case of (hopefully impossible) cyclic exception
private final static int MAX_DEPTH = 100;
- public WriteLimitReachedException(String msg) {
- super(msg);
+ private final int writeLimit;
+ public WriteLimitReachedException(int writeLimit) {
+ this.writeLimit = writeLimit;
}
+ @Override
+ public String getMessage() {
+ return "Your document contained more than " + writeLimit
+ + " characters, and so your requested limit has been"
+ + " reached. To receive the full text of the document,"
+ + " increase your limit. (Text up to the limit is"
+ + " however available).";
+ }
/**
 * Checks whether the given exception (or any of its root causes) was
* thrown by this handler as a signal of reaching the write limit.
@@ -53,4 +62,22 @@ public class WriteLimitReachedException extends SAXException
{
return t.getCause() != null && isWriteLimitReached(t.getCause(),
depth + 1);
}
}
+
+ /**
+ * Rethrows the {@link WriteLimitReachedException} found in the given
+ * exception or anywhere in its cause chain; returns normally if none is found.
+ *
+ * @param ex exception to inspect; a null value is ignored
+ * @throws SAXException the WriteLimitReachedException located in the chain
+ */
+ public static void throwIfWriteLimitReached(Exception ex) throws SAXException {
+ throwIfWriteLimitReached(ex, 0);
+ }
+
+ //walks the cause chain; stops after MAX_DEPTH links in case the chain is cyclic
+ private static void throwIfWriteLimitReached(Throwable t, int depth) throws SAXException {
+ if (t == null) {
+ return;
+ }
+ if (depth > MAX_DEPTH) {
+ return;
+ }
+ if (t instanceof WriteLimitReachedException) {
+ throw (SAXException) t;
+ } else {
+ //recurse so that a wrapped write-limit exception is actually rethrown;
+ //calling isWriteLimitReached here would only compute a boolean and drop it
+ throwIfWriteLimitReached(t.getCause(), depth + 1);
+ }
+ }
}
diff --git
a/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java
b/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java
index 72e7dde..b838cc8 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java
@@ -31,6 +31,7 @@ import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.apache.tika.exception.TikaException;
+import org.apache.tika.exception.WriteLimitReachedException;
import org.apache.tika.io.TemporaryResources;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
@@ -293,6 +294,7 @@ public class CompositeParser extends AbstractParser {
taggedStream.throwIfCauseOf(e);
throw new TikaException("TIKA-198: Illegal IOException from "
+ parser, e);
} catch (SAXException e) {
+ WriteLimitReachedException.throwIfWriteLimitReached(e);
if (taggedHandler != null) {
taggedHandler.throwIfCauseOf(e);
}
diff --git
a/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
b/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
index c98c8fb..ca09477 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
@@ -155,18 +155,14 @@ public class RecursiveParserWrapper extends
ParserDecorator {
new RecursivelySecureContentHandler(localHandler, tis,
writeLimit);
context.set(RecursivelySecureContentHandler.class,
secureContentHandler);
getWrappedParser().parse(tis, secureContentHandler, metadata,
context);
- } catch (SAXException e) {
- boolean wlr = WriteLimitReachedException.isWriteLimitReached(e);
- if (wlr == false) {
+ } catch (Throwable e) {
+ if (WriteLimitReachedException.isWriteLimitReached(e)) {
+ metadata.set(TikaCoreProperties.WRITE_LIMIT_REACHED, "true");
+ } else {
+ String stackTrace = ExceptionUtils.getFilteredStackTrace(e);
+ metadata.add(TikaCoreProperties.CONTAINER_EXCEPTION,
stackTrace);
throw e;
}
- metadata.set(TikaCoreProperties.WRITE_LIMIT_REACHED, "true");
- } catch (Throwable e) {
- //try our best to record the problem in the metadata object
- //then rethrow
- String stackTrace = ExceptionUtils.getFilteredStackTrace(e);
- metadata.add(TikaCoreProperties.CONTAINER_EXCEPTION, stackTrace);
- throw e;
} finally {
tmp.dispose();
long elapsedMillis = System.currentTimeMillis() - started;
@@ -240,9 +236,9 @@ public class RecursiveParserWrapper extends ParserDecorator
{
try {
super.parse(stream, secureContentHandler, metadata, context);
} catch (SAXException e) {
- boolean wlr =
WriteLimitReachedException.isWriteLimitReached(e);
- if (wlr == true) {
+ if (WriteLimitReachedException.isWriteLimitReached(e)) {
metadata.add(TikaCoreProperties.WRITE_LIMIT_REACHED,
"true");
+ throw e;
} else {
if (catchEmbeddedExceptions) {
ParserUtils.recordParserFailure(this, e, metadata);
@@ -339,13 +335,7 @@ public class RecursiveParserWrapper extends
ParserDecorator {
int availableLength = Math.min(totalWriteLimit - totalChars,
length);
super.characters(ch, start, availableLength);
if (availableLength < length) {
- throw new WriteLimitReachedException(
- "Your document contained more than " + totalWriteLimit
+
- " characters, and so your requested limit has
been" +
- " reached. To receive the full text of the
document," +
- " increase your limit. (Text up to the limit
is" +
- " however available)."
- );
+ throw new WriteLimitReachedException(totalWriteLimit);
}
}
@@ -358,12 +348,7 @@ public class RecursiveParserWrapper extends
ParserDecorator {
int availableLength = Math.min(totalWriteLimit - totalChars,
length);
super.ignorableWhitespace(ch, start, availableLength);
if (availableLength < length) {
- throw new WriteLimitReachedException("Your document contained
more than "
- + totalWriteLimit +
- " characters, and so your requested limit has been" +
- " reached. To receive the full text of the document," +
- " increase your limit. (Text up to the limit is" + "
however available)."
- );
+ throw new WriteLimitReachedException(totalWriteLimit);
}
}
}
diff --git
a/tika-core/src/main/java/org/apache/tika/sax/WriteOutContentHandler.java
b/tika-core/src/main/java/org/apache/tika/sax/WriteOutContentHandler.java
index 920afaf..2704d4c 100644
--- a/tika-core/src/main/java/org/apache/tika/sax/WriteOutContentHandler.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/WriteOutContentHandler.java
@@ -134,10 +134,7 @@ public class WriteOutContentHandler extends
ContentHandlerDecorator {
} else {
super.characters(ch, start, writeLimit - writeCount);
writeCount = writeLimit;
- throw new WriteLimitReachedException("Your document contained more
than " + writeLimit +
- " characters, and so your requested limit has been" +
- " reached. To receive the full text of the document," +
- " increase your limit. (Text up to the limit is" + "
however available).");
+ throw new WriteLimitReachedException(writeLimit);
}
}
@@ -149,11 +146,7 @@ public class WriteOutContentHandler extends
ContentHandlerDecorator {
} else {
super.ignorableWhitespace(ch, start, writeLimit - writeCount);
writeCount = writeLimit;
- throw new WriteLimitReachedException("Your document contained more
than "
- + writeLimit +
- " characters, and so your requested limit has been" +
- " reached. To receive the full text of the document," +
- " increase your limit. (Text up to the limit is however
available).");
+ throw new WriteLimitReachedException(writeLimit);
}
}
}
diff --git
a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/asm/XHTMLClassVisitor.java
b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/asm/XHTMLClassVisitor.java
index 199d5ca..d55528d 100644
---
a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/asm/XHTMLClassVisitor.java
+++
b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/asm/XHTMLClassVisitor.java
@@ -30,7 +30,9 @@ import org.objectweb.asm.Type;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
+import org.apache.tika.exception.RuntimeSAXException;
import org.apache.tika.exception.TikaException;
+import org.apache.tika.exception.WriteLimitReachedException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.sax.XHTMLContentHandler;
@@ -64,11 +66,8 @@ class XHTMLClassVisitor extends ClassVisitor {
ClassReader reader = new ClassReader(stream);
reader.accept(this, ClassReader.SKIP_FRAMES |
ClassReader.SKIP_CODE);
} catch (RuntimeException e) {
- if (e.getCause() instanceof SAXException) {
- throw (SAXException) e.getCause();
- } else {
- throw new TikaException("Failed to parse a Java class", e);
- }
+ WriteLimitReachedException.throwIfWriteLimitReached(e);
+ throw new TikaException("Failed to parse a Java class", e);
}
}
@@ -125,7 +124,7 @@ class XHTMLClassVisitor extends ClassVisitor {
}
xhtml.characters("{\n");
} catch (SAXException e) {
- throw new RuntimeException(e);
+ throw new RuntimeSAXException(e);
}
}
@@ -148,7 +147,7 @@ class XHTMLClassVisitor extends ClassVisitor {
xhtml.endElement("pre");
xhtml.endDocument();
} catch (SAXException e) {
- throw new RuntimeException(e);
+ throw new RuntimeSAXException(e);
}
}
@@ -204,7 +203,7 @@ class XHTMLClassVisitor extends ClassVisitor {
writeSemicolon();
writeNewline();
} catch (SAXException e) {
- throw new RuntimeException(e);
+ throw new RuntimeSAXException(e);
}
}
@@ -251,7 +250,7 @@ class XHTMLClassVisitor extends ClassVisitor {
writeSemicolon();
writeNewline();
} catch (SAXException e) {
- throw new RuntimeException(e);
+ throw new RuntimeSAXException(e);
}
}
diff --git
a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-crypto-module/src/main/java/org/apache/tika/parser/crypto/Pkcs7Parser.java
b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-crypto-module/src/main/java/org/apache/tika/parser/crypto/Pkcs7Parser.java
index 64babca..7e42be8 100644
---
a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-crypto-module/src/main/java/org/apache/tika/parser/crypto/Pkcs7Parser.java
+++
b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-crypto-module/src/main/java/org/apache/tika/parser/crypto/Pkcs7Parser.java
@@ -74,7 +74,7 @@ public class Pkcs7Parser extends AbstractParser {
}
try (InputStream input = content.getContentStream()) {
Parser delegate = context.get(Parser.class,
EmptyParser.INSTANCE);
- delegate.parse(input, handler, metadata, context);
+ delegate.parse(input, handler, new Metadata(), context);
}
} finally {
parser.close();
diff --git
a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-crypto-module/src/main/java/org/apache/tika/parser/crypto/TSDParser.java
b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-crypto-module/src/main/java/org/apache/tika/parser/crypto/TSDParser.java
index 856bf40..7b1d44f 100644
---
a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-crypto-module/src/main/java/org/apache/tika/parser/crypto/TSDParser.java
+++
b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-crypto-module/src/main/java/org/apache/tika/parser/crypto/TSDParser.java
@@ -51,6 +51,7 @@ import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.apache.tika.exception.TikaException;
+import org.apache.tika.exception.WriteLimitReachedException;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.io.TikaInputStream;
@@ -109,7 +110,7 @@ public class TSDParser extends AbstractParser {
}
}
- private List<TSDMetas> extractMetas(InputStream stream) {
+ private List<TSDMetas> extractMetas(InputStream stream) throws
SAXException {
List<TSDMetas> tsdMetasList = new ArrayList<>();
try {
@@ -130,6 +131,7 @@ public class TSDParser extends AbstractParser {
} catch (SecurityException e) {
throw e;
} catch (Exception ex) {
+ WriteLimitReachedException.throwIfWriteLimitReached(ex);
LOG.error("Error in TSDParser.buildMetas {}", ex.getMessage());
tsdMetasList.clear();
}
@@ -160,7 +162,7 @@ public class TSDParser extends AbstractParser {
}
private void parseTSDContent(InputStream stream, ContentHandler handler,
Metadata metadata,
- ParseContext context) {
+ ParseContext context) throws SAXException {
CMSTimeStampedDataParser cmsTimeStampedDataParser = null;
EmbeddedDocumentExtractor edx =
EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
@@ -176,6 +178,7 @@ public class TSDParser extends AbstractParser {
} catch (SecurityException e) {
throw e;
} catch (Exception ex) {
+ WriteLimitReachedException.throwIfWriteLimitReached(ex);
LOG.error("Error in TSDParser.parseTSDContent {}",
ex.getMessage());
} finally {
this.closeCMSParser(cmsTimeStampedDataParser);
diff --git
a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java
b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java
index 059027d..ff258c9 100644
---
a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java
+++
b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java
@@ -46,6 +46,7 @@ import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.xml.sax.SAXException;
import org.apache.tika.exception.TikaException;
+import org.apache.tika.exception.WriteLimitReachedException;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
@@ -224,6 +225,7 @@ class JackcessExtractor extends AbstractPOIFSExtractor {
.parse(new
ByteArrayInputStream(v.getBytes(UTF_8)), h, m, parseContext);
handler.characters(h.toString());
} catch (SAXException e) {
+ WriteLimitReachedException.throwIfWriteLimitReached(e);
//if something went wrong in htmlparser, just append the
characters
handler.characters(v);
}
diff --git
a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/chm/ChmParser.java
b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/chm/ChmParser.java
index 33e7d0c..56a9cad 100644
---
a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/chm/ChmParser.java
+++
b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/chm/ChmParser.java
@@ -94,15 +94,13 @@ public class ChmParser extends AbstractParser {
private void parsePage(byte[] byteObject, Parser htmlParser,
ContentHandler xhtml,
- ParseContext context) throws TikaException { //
throws IOException
+ ParseContext context) throws TikaException,
SAXException { // throws IOException
InputStream stream = null;
Metadata metadata = new Metadata();
ContentHandler handler = new EmbeddedContentHandler(new
BodyContentHandler(xhtml));// -1
try {
stream = new ByteArrayInputStream(byteObject);
htmlParser.parse(stream, handler, metadata, context);
- } catch (SAXException e) {
- throw new RuntimeException(e);
} catch (IOException e) {
// Pushback overflow from tagsoup
}
diff --git
a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
index fc332a9..cd6c6e5 100644
---
a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
+++
b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
@@ -52,6 +52,7 @@ import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;
import org.apache.tika.exception.TikaException;
+import org.apache.tika.exception.WriteLimitReachedException;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.io.TikaInputStream;
@@ -142,7 +143,7 @@ public abstract class AbstractOOXMLExtractor implements
OOXMLExtractor {
handleEmbeddedParts(xhtml, metadata);
// thumbnail
- handleThumbnail(xhtml);
+ handleThumbnail(xhtml, metadata);
xhtml.endDocument();
}
@@ -160,7 +161,7 @@ public abstract class AbstractOOXMLExtractor implements
OOXMLExtractor {
return desc;
}
- private void handleThumbnail(ContentHandler handler) {
+ private void handleThumbnail(ContentHandler handler, Metadata metadata)
throws SAXException {
try {
OPCPackage opcPackage = extractor.getPackage();
for (PackageRelationship rel : opcPackage
@@ -193,7 +194,10 @@ public abstract class AbstractOOXMLExtractor implements
OOXMLExtractor {
} catch (SecurityException e) {
throw e;
} catch (Exception ex) {
- //swallow
+ WriteLimitReachedException.throwIfWriteLimitReached(ex);
+ //swallow otherwise
+ metadata.add(TikaCoreProperties.EMBEDDED_EXCEPTION,
+ ExceptionUtils.getStackTrace(ex));
}
}
diff --git
a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
index 154efa2..180899e 100644
---
a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
+++
b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
@@ -51,6 +51,7 @@ import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.apache.tika.detect.microsoft.ooxml.OPCPackageDetector;
+import org.apache.tika.exception.RuntimeSAXException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
@@ -234,6 +235,8 @@ public class OOXMLExtractorFactory {
throw new TikaException("Error creating OOXML extractor", e);
} catch (XmlException e) {
throw new TikaException("Error creating OOXML extractor", e);
+ } catch (RuntimeSAXException e) {
+ throw(SAXException) e.getCause();
} finally {
if (tmpRepairedCopy != null) {
if (pkg != null) {
diff --git
a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLTikaBodyPartHandler.java
b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLTikaBodyPartHandler.java
index 9471237..1de5de6 100644
---
a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLTikaBodyPartHandler.java
+++
b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLTikaBodyPartHandler.java
@@ -82,115 +82,98 @@ public class OOXMLTikaBodyPartHandler
}
@Override
- public void run(RunProperties runProperties, String contents) {
- try {
-
- // True if we are currently in the named style tag:
- if (runProperties.isBold() != isBold) {
- if (isStrikeThrough) {
- xhtml.endElement("strike");
- isStrikeThrough = false;
- }
- if (isUnderline) {
- xhtml.endElement("u");
- isUnderline = false;
- ;
- }
- if (isItalics) {
- xhtml.endElement("i");
- isItalics = false;
- }
- if (runProperties.isBold()) {
- xhtml.startElement("b");
- } else {
- xhtml.endElement("b");
- }
- isBold = runProperties.isBold();
+ public void run(RunProperties runProperties, String contents) throws
SAXException {
+
+ // True if we are currently in the named style tag:
+ if (runProperties.isBold() != isBold) {
+ if (isStrikeThrough) {
+ xhtml.endElement("strike");
+ isStrikeThrough = false;
+ }
+ if (isUnderline) {
+ xhtml.endElement("u");
+ isUnderline = false;
+ }
+ if (isItalics) {
+ xhtml.endElement("i");
+ isItalics = false;
}
+ if (runProperties.isBold()) {
+ xhtml.startElement("b");
+ } else {
+ xhtml.endElement("b");
+ }
+ isBold = runProperties.isBold();
+ }
- if (runProperties.isItalics() != isItalics) {
- if (isStrikeThrough) {
- xhtml.endElement("strike");
- isStrikeThrough = false;
- }
- if (isUnderline) {
- xhtml.endElement("u");
- isUnderline = false;
- }
- if (runProperties.isItalics()) {
- xhtml.startElement("i");
- } else {
- xhtml.endElement("i");
- }
- isItalics = runProperties.isItalics();
+ if (runProperties.isItalics() != isItalics) {
+ if (isStrikeThrough) {
+ xhtml.endElement("strike");
+ isStrikeThrough = false;
+ }
+ if (isUnderline) {
+ xhtml.endElement("u");
+ isUnderline = false;
+ }
+ if (runProperties.isItalics()) {
+ xhtml.startElement("i");
+ } else {
+ xhtml.endElement("i");
}
+ isItalics = runProperties.isItalics();
+ }
- if (runProperties.isStrikeThrough() != isStrikeThrough) {
- if (isUnderline) {
- xhtml.endElement("u");
- isUnderline = false;
- }
- if (runProperties.isStrikeThrough()) {
- xhtml.startElement("strike");
- } else {
- xhtml.endElement("strike");
- }
- isStrikeThrough = runProperties.isStrikeThrough();
+ if (runProperties.isStrikeThrough() != isStrikeThrough) {
+ if (isUnderline) {
+ xhtml.endElement("u");
+ isUnderline = false;
+ }
+ if (runProperties.isStrikeThrough()) {
+ xhtml.startElement("strike");
+ } else {
+ xhtml.endElement("strike");
}
+ isStrikeThrough = runProperties.isStrikeThrough();
+ }
- boolean runIsUnderlined = runProperties.getUnderline() !=
UnderlinePatterns.NONE;
- if (runIsUnderlined != isUnderline) {
- if (runIsUnderlined) {
- xhtml.startElement("u");
- } else {
- xhtml.endElement("u");
- }
- isUnderline = runIsUnderlined;
+ boolean runIsUnderlined = runProperties.getUnderline() !=
UnderlinePatterns.NONE;
+ if (runIsUnderlined != isUnderline) {
+ if (runIsUnderlined) {
+ xhtml.startElement("u");
+ } else {
+ xhtml.endElement("u");
}
+ isUnderline = runIsUnderlined;
+ }
- xhtml.characters(contents);
+ xhtml.characters(contents);
- } catch (SAXException e) {
- //swallow
- }
}
@Override
- public void hyperlinkStart(String link) {
- try {
- if (link != null) {
- xhtml.startElement("a", "href", link);
- wroteHyperlinkStart = true;
- }
- } catch (SAXException e) {
- //swallow
+ public void hyperlinkStart(String link) throws SAXException {
+ if (link != null) {
+ xhtml.startElement("a", "href", link);
+ wroteHyperlinkStart = true;
}
}
@Override
- public void hyperlinkEnd() {
- try {
- if (wroteHyperlinkStart) {
- closeStyleTags();
- wroteHyperlinkStart = false;
- xhtml.endElement("a");
- }
- } catch (SAXException e) {
- //swallow
+ public void hyperlinkEnd() throws SAXException {
+ if (wroteHyperlinkStart) {
+ closeStyleTags();
+ wroteHyperlinkStart = false;
+ xhtml.endElement("a");
}
}
@Override
- public void startParagraph(ParagraphProperties paragraphProperties) {
+ public void startParagraph(ParagraphProperties paragraphProperties) throws
SAXException {
 //if you're in a table cell and you're after the first paragraph
//make sure to prepend a \n
if (tableCellDepth > 0 && pWithinCell > 0) {
- try {
- xhtml.characters(NEWLINE, 0, 1);
- } catch (SAXException e) {
- //swallow
- }
+ xhtml.characters(NEWLINE, 0, 1);
}
if (pDepth == 0 && tableDepth == 0 && sdtDepth == 0) {
@@ -208,41 +191,30 @@ public class OOXMLTikaBodyPartHandler
}
- try {
- if (styleClass == null) {
- xhtml.startElement(paragraphTag);
- } else {
- xhtml.startElement(paragraphTag, "class", styleClass);
- }
- } catch (SAXException e) {
- //swallow
+ if (styleClass == null) {
+ xhtml.startElement(paragraphTag);
+ } else {
+ xhtml.startElement(paragraphTag, "class", styleClass);
}
}
- try {
- writeParagraphNumber(paragraphProperties.getNumId(),
paragraphProperties.getIlvl(),
- listManager, xhtml);
- } catch (SAXException e) {
- //swallow
- }
+ writeParagraphNumber(paragraphProperties.getNumId(),
paragraphProperties.getIlvl(),
+ listManager, xhtml);
pDepth++;
}
@Override
- public void endParagraph() {
- try {
- closeStyleTags();
- if (pDepth == 1 && tableDepth == 0) {
- xhtml.endElement(paragraphTag);
- } else if (tableCellDepth > 0 && pWithinCell > 0) {
- xhtml.characters(NEWLINE, 0, 1);
- } else if (tableCellDepth == 0) {
- xhtml.characters(NEWLINE, 0, 1);
- }
- } catch (SAXException e) {
- //swallow
+ public void endParagraph() throws SAXException {
+ closeStyleTags();
+ if (pDepth == 1 && tableDepth == 0) {
+ xhtml.endElement(paragraphTag);
+ } else if (tableCellDepth > 0 && pWithinCell > 0) {
+ xhtml.characters(NEWLINE, 0, 1);
+ } else if (tableCellDepth == 0) {
+ xhtml.characters(NEWLINE, 0, 1);
}
+
if (tableCellDepth > 0) {
pWithinCell++;
}
@@ -250,72 +222,48 @@ public class OOXMLTikaBodyPartHandler
}
@Override
- public void startTable() {
- try {
- xhtml.startElement("table");
- tableDepth++;
- } catch (SAXException e) {
- //swallow
- }
+ public void startTable() throws SAXException {
+
+ xhtml.startElement("table");
+ tableDepth++;
+
}
@Override
- public void endTable() {
- try {
- xhtml.endElement("table");
- tableDepth--;
- } catch (SAXException e) {
- //swallow
- }
+ public void endTable() throws SAXException {
+
+ xhtml.endElement("table");
+ tableDepth--;
+
}
@Override
- public void startTableRow() {
- try {
- xhtml.startElement("tr");
- } catch (SAXException e) {
- //swallow
- }
+ public void startTableRow() throws SAXException {
+ xhtml.startElement("tr");
}
@Override
- public void endTableRow() {
- try {
- xhtml.endElement("tr");
- } catch (SAXException e) {
- //swallow
- }
+ public void endTableRow() throws SAXException {
+ xhtml.endElement("tr");
}
@Override
- public void startTableCell() {
- try {
- xhtml.startElement("td");
- } catch (SAXException e) {
- //swallow
- }
+ public void startTableCell() throws SAXException {
+ xhtml.startElement("td");
tableCellDepth++;
}
@Override
- public void endTableCell() {
- try {
- xhtml.endElement("td");
- } catch (SAXException e) {
- //swallow
- }
+ public void endTableCell() throws SAXException {
+ xhtml.endElement("td");
pWithinCell = 0;
tableCellDepth--;
}
@Override
- public void startSDT() {
- try {
- closeStyleTags();
- sdtDepth++;
- } catch (SAXException e) {
- //swallow
- }
+ public void startSDT() throws SAXException {
+ closeStyleTags();
+ sdtDepth++;
}
@Override
@@ -340,28 +288,20 @@ public class OOXMLTikaBodyPartHandler
}
@Override
- public void footnoteReference(String id) {
+ public void footnoteReference(String id) throws SAXException {
if (id != null) {
- try {
- xhtml.characters("[");
- xhtml.characters(id);
- xhtml.characters("]");
- } catch (SAXException e) {
- //swallow
- }
+ xhtml.characters("[");
+ xhtml.characters(id);
+ xhtml.characters("]");
}
}
@Override
- public void endnoteReference(String id) {
+ public void endnoteReference(String id) throws SAXException {
if (id != null) {
- try {
- xhtml.characters("[");
- xhtml.characters(id);
- xhtml.characters("]");
- } catch (SAXException e) {
- //swallow
- }
+ xhtml.characters("[");
+ xhtml.characters(id);
+ xhtml.characters("]");
}
}
@@ -371,52 +311,40 @@ public class OOXMLTikaBodyPartHandler
}
@Override
- public void embeddedOLERef(String relId) {
+ public void embeddedOLERef(String relId) throws SAXException {
if (relId == null) {
return;
}
- try {
- AttributesImpl attributes = new AttributesImpl();
- attributes.addAttribute("", "class", "class", "CDATA", "embedded");
- attributes.addAttribute("", "id", "id", "CDATA", relId);
- xhtml.startElement("div", attributes);
- xhtml.endElement("div");
-
- } catch (SAXException e) {
- //swallow
- }
+ AttributesImpl attributes = new AttributesImpl();
+ attributes.addAttribute("", "class", "class", "CDATA", "embedded");
+ attributes.addAttribute("", "id", "id", "CDATA", relId);
+ xhtml.startElement("div", attributes);
+ xhtml.endElement("div");
}
@Override
- public void embeddedPicRef(String picFileName, String picDescription) {
+ public void embeddedPicRef(String picFileName, String picDescription)
throws SAXException {
- try {
- AttributesImpl attr = new AttributesImpl();
- if (picFileName != null) {
- attr.addAttribute("", "src", "src", "CDATA", "embedded:" +
picFileName);
- }
- if (picDescription != null) {
- attr.addAttribute("", "alt", "alt", "CDATA", picDescription);
- }
+ AttributesImpl attr = new AttributesImpl();
+ if (picFileName != null) {
+ attr.addAttribute("", "src", "src", "CDATA", "embedded:" +
picFileName);
+ }
+ if (picDescription != null) {
+ attr.addAttribute("", "alt", "alt", "CDATA", picDescription);
+ }
+
+ xhtml.startElement("img", attr);
+ xhtml.endElement("img");
- xhtml.startElement("img", attr);
- xhtml.endElement("img");
- } catch (SAXException e) {
- //swallow
- }
}
@Override
- public void startBookmark(String id, String name) {
+ public void startBookmark(String id, String name) throws SAXException {
//skip bookmarks within hyperlinks
if (name != null && !wroteHyperlinkStart) {
- try {
- xhtml.startElement("a", "name", name);
- xhtml.endElement("a");
- } catch (SAXException e) {
- //swallow
- }
+ xhtml.startElement("a", "name", name);
+ xhtml.endElement("a");
}
}
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLWordAndPowerPointTextHandler.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLWordAndPowerPointTextHandler.java
index 71567fd..77d0887 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLWordAndPowerPointTextHandler.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLWordAndPowerPointTextHandler.java
@@ -337,7 +337,7 @@ public class OOXMLWordAndPowerPointTextHandler extends
DefaultHandler {
}
- private void startEditedSection(EditType editType, Attributes atts) {
+ private void startEditedSection(EditType editType, Attributes atts) throws
SAXException {
String editAuthor = atts.getValue(W_NS, "author");
String editDateString = atts.getValue(W_NS, "date");
Date editDate = null;
@@ -436,7 +436,7 @@ public class OOXMLWordAndPowerPointTextHandler extends
DefaultHandler {
}
}
- private void handleEndOfRuby() {
+ private void handleEndOfRuby() throws SAXException {
if (rubyBuffer.length() > 0) {
if (concatenatePhoneticRuns) {
bodyContentsHandler.run(currRunProperties, " (" +
rubyBuffer.toString() + ")");
@@ -445,7 +445,7 @@ public class OOXMLWordAndPowerPointTextHandler extends
DefaultHandler {
}
}
- private void handleEndOfRun() {
+ private void handleEndOfRun() throws SAXException {
bodyContentsHandler.run(currRunProperties, runBuffer.toString());
if (inHlinkClick) {
bodyContentsHandler.hyperlinkEnd();
@@ -459,7 +459,7 @@ public class OOXMLWordAndPowerPointTextHandler extends
DefaultHandler {
currRunProperties.setUnderline(UnderlinePatterns.NONE.name());
}
- private void handlePict() {
+ private void handlePict() throws SAXException {
String picFileName = null;
if (picRId != null) {
picFileName = linkedRelationships.get(picRId);
@@ -522,53 +522,53 @@ public class OOXMLWordAndPowerPointTextHandler extends
DefaultHandler {
public interface XWPFBodyContentsHandler {
- void run(RunProperties runProperties, String contents);
+ void run(RunProperties runProperties, String contents) throws
SAXException;
/**
* @param link the link; can be null
*/
- void hyperlinkStart(String link);
+ void hyperlinkStart(String link) throws SAXException;
- void hyperlinkEnd();
+ void hyperlinkEnd() throws SAXException;
- void startParagraph(ParagraphProperties paragraphProperties);
+ void startParagraph(ParagraphProperties paragraphProperties) throws
SAXException;
- void endParagraph();
+ void endParagraph() throws SAXException;
- void startTable();
+ void startTable() throws SAXException;
- void endTable();
+ void endTable() throws SAXException;
- void startTableRow();
+ void startTableRow() throws SAXException;
- void endTableRow();
+ void endTableRow() throws SAXException;
- void startTableCell();
+ void startTableCell() throws SAXException;
- void endTableCell();
+ void endTableCell() throws SAXException;
- void startSDT();
+ void startSDT() throws SAXException;
- void endSDT();
+ void endSDT() throws SAXException;
- void startEditedSection(String editor, Date date, EditType editType);
+ void startEditedSection(String editor, Date date, EditType editType)
throws SAXException;
- void endEditedSection();
+ void endEditedSection() throws SAXException;
- boolean isIncludeDeletedText();
+ boolean isIncludeDeletedText() throws SAXException;
- void footnoteReference(String id);
+ void footnoteReference(String id) throws SAXException;
- void endnoteReference(String id);
+ void endnoteReference(String id) throws SAXException;
- boolean isIncludeMoveFromText();
+ boolean isIncludeMoveFromText() throws SAXException;
- void embeddedOLERef(String refId);
+ void embeddedOLERef(String refId) throws SAXException;
- void embeddedPicRef(String picFileName, String picDescription);
+ void embeddedPicRef(String picFileName, String picDescription) throws
SAXException;
- void startBookmark(String id, String name);
+ void startBookmark(String id, String name) throws SAXException;
- void endBookmark(String id);
+ void endBookmark(String id) throws SAXException;
}
}
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
index 968b312..4fbe36e 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
@@ -65,6 +65,7 @@ import org.xml.sax.Locator;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
+import org.apache.tika.exception.RuntimeSAXException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
@@ -443,6 +444,7 @@ public class XSSFExcelExtractorDecorator extends
AbstractOOXMLExtractor {
lastSeenCol = -1;
} catch (SAXException e) {
//swallow
+ throw new RuntimeSAXException(e);
}
}
@@ -451,7 +453,7 @@ public class XSSFExcelExtractorDecorator extends
AbstractOOXMLExtractor {
try {
xhtml.endElement("tr");
} catch (SAXException e) {
- //swallow
+ throw new RuntimeSAXException(e);
}
}
@@ -485,7 +487,7 @@ public class XSSFExcelExtractorDecorator extends
AbstractOOXMLExtractor {
xhtml.endElement("td");
} catch (SAXException e) {
- //swallow
+ throw new RuntimeSAXException(e);
}
}
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java
index 1244d35..b782f2f 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java
@@ -46,6 +46,8 @@ import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
+import org.apache.tika.exception.RuntimeSAXException;
+import org.apache.tika.exception.WriteLimitReachedException;
import
org.apache.tika.parser.microsoft.ooxml.OOXMLWordAndPowerPointTextHandler;
import org.apache.tika.parser.microsoft.ooxml.ParagraphProperties;
import org.apache.tika.parser.microsoft.ooxml.RunProperties;
@@ -119,6 +121,9 @@ public class XWPFEventBasedWordExtractor extends
POIXMLTextExtractor {
} catch (IOException e) {
LOG.warn("IOException handling document part", e);
} catch (SAXException e) {
+ if (WriteLimitReachedException.isWriteLimitReached(e)) {
+ throw new RuntimeSAXException(e);
+ }
//swallow this because we don't actually call it
LOG.warn("SAXException handling document part", e);
}
@@ -135,6 +140,9 @@ public class XWPFEventBasedWordExtractor extends
POIXMLTextExtractor {
} catch (IOException e) {
LOG.warn("IOException handling glossary document part", e);
} catch (SAXException e) {
+ if (WriteLimitReachedException.isWriteLimitReached(e)) {
+ throw new RuntimeSAXException(e);
+ }
//swallow this because we don't actually call it
LOG.warn("SAXException handling glossary document part",
e);
}
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/xml/AbstractXML2003Parser.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/xml/AbstractXML2003Parser.java
index b57d506..01b3380 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/xml/AbstractXML2003Parser.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/xml/AbstractXML2003Parser.java
@@ -26,6 +26,7 @@ import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;
import org.apache.tika.exception.TikaException;
+import org.apache.tika.exception.WriteLimitReachedException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Office;
import org.apache.tika.metadata.OfficeOpenXMLCore;
@@ -98,7 +99,7 @@ public abstract class AbstractXML2003Parser extends
AbstractParser {
new OfflineContentHandler(new EmbeddedContentHandler(
getContentHandler(tagged, metadata, context))));
} catch (SAXException e) {
- tagged.throwIfCauseOf(e);
+ WriteLimitReachedException.throwIfWriteLimitReached(e);
throw new TikaException("XML parse error", e);
} finally {
xhtml.endDocument();
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java
index 15a0669..ba2f0c3 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java
@@ -40,6 +40,7 @@ import org.xml.sax.helpers.DefaultHandler;
import org.apache.tika.config.Field;
import org.apache.tika.exception.EncryptedDocumentException;
import org.apache.tika.exception.TikaException;
+import org.apache.tika.exception.WriteLimitReachedException;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
@@ -205,6 +206,7 @@ public class OpenDocumentParser extends AbstractParser {
handleZipEntry(entry, zipStream, metadata, context, handler,
embeddedDocumentUtil);
} catch (SAXException e) {
+ WriteLimitReachedException.throwIfWriteLimitReached(e);
if (e.getCause() instanceof EncryptedDocumentException) {
throw (EncryptedDocumentException)e.getCause();
} else {
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
index 9e3fc5b..fd6c4d6 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
@@ -107,8 +107,7 @@ public class RecursiveParserWrapperTest extends TikaTest {
wlr++;
}
}
- assertEquals(1, wlr);
-
+ assertEquals(2, wlr);
}
diff --git a/tika-server/tika-server-classic/src/test/java/org/apache/tika/server/classic/RecursiveMetadataResourceTest.java b/tika-server/tika-server-classic/src/test/java/org/apache/tika/server/classic/RecursiveMetadataResourceTest.java
index dec8a99..bd4b621 100644
--- a/tika-server/tika-server-classic/src/test/java/org/apache/tika/server/classic/RecursiveMetadataResourceTest.java
+++ b/tika-server/tika-server-classic/src/test/java/org/apache/tika/server/classic/RecursiveMetadataResourceTest.java
@@ -355,7 +355,7 @@ public class RecursiveMetadataResourceTest extends
CXFTestBase {
// Check results
reader = new InputStreamReader((InputStream) response.getEntity(),
UTF_8);
metadataList = JsonMetadataList.fromJson(reader);
- assertEquals(12, metadataList.size());
+ assertEquals(10, metadataList.size());
assertEquals("true",
metadataList.get(6).get(TikaCoreProperties.WRITE_LIMIT_REACHED));
assertContains("When in the Course of human events it becomes necessary for one people",
metadataList.get(6).get(TikaCoreProperties.TIKA_CONTENT));
diff --git a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java
index 1d52857..260da2d 100644
--- a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java
+++ b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java
@@ -349,7 +349,9 @@ public class TikaResource {
logger.warn("{}: Encrypted document ({})", path, fileName, e);
throw new TikaServerParseException(e);
} catch (Exception e) {
- logger.warn("{}: Text extraction failed ({})", path, fileName, e);
+ if (! WriteLimitReachedException.isWriteLimitReached(e)) {
+ logger.warn("{}: Text extraction failed ({})", path, fileName,
e);
+ }
throw new TikaServerParseException(e);
} catch (OutOfMemoryError e) {
logger.warn("{}: OOM ({})", path, fileName, e);
diff --git a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/RecursiveMetadataResourceTest.java b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/RecursiveMetadataResourceTest.java
index 971e0a5..95a9ca6 100644
--- a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/RecursiveMetadataResourceTest.java
+++ b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/RecursiveMetadataResourceTest.java
@@ -73,5 +73,69 @@ public class RecursiveMetadataResourceTest extends
CXFTestBase {
metadata.get(TikaCoreProperties.CONTAINER_EXCEPTION));
}
+ /*
+ @Test
+ public void testWriteLimitInAll() throws Exception {
+ //specify your file directory here
+ Path testDocs =
Paths.get("..../tika-parsers/src/test/resources/test-documents");
+ for (File f : testDocs.toFile().listFiles()) {
+ if (f.isDirectory()) {
+ continue;
+ }
+ System.out.println(f.getName());
+ testWriteLimit(f);
+ }
+ }
+ private void testWriteLimit(File f) throws Exception {
+ Response response = WebClient.create(endPoint +
META_PATH+"/text").accept(
+ "application/json")
+ .put(f);
+ assertEquals(200, response.getStatus());
+ Reader reader = new InputStreamReader((InputStream)
response.getEntity(), UTF_8);
+ List<Metadata> metadataList = JsonMetadataList.fromJson(reader);
+ int totalLen = 0;
+ StringBuilder sb = new StringBuilder();
+ for (Metadata m : metadataList) {
+ String txt =
m.get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT);
+ sb.append(txt);
+ totalLen += (txt == null) ? 0 : txt.length();
+ }
+ String fullText = sb.toString();
+ Random r = new Random();
+ for (int i = 0; i < 20; i++) {
+ int writeLimit = r.nextInt(totalLen+100);
+ response = WebClient.create(endPoint + META_PATH+"/text").accept(
+ "application/json")
+ .header("writeLimit", Integer.toString(writeLimit)).put(f);
+ assertEquals(200, response.getStatus());
+ reader = new InputStreamReader((InputStream) response.getEntity(),
UTF_8);
+ List<Metadata> writeLimitMetadataList =
JsonMetadataList.fromJson(reader);
+ int len = 0;
+ StringBuilder extracted = new StringBuilder();
+ for (Metadata m : writeLimitMetadataList) {
+ String txt =
m.get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT);
+ len += (txt == null) ? 0 : txt.length();
+ extracted.append(txt);
+ }
+ if (totalLen > len) {
+ boolean wlr = false;
+ for (Metadata m : writeLimitMetadataList) {
+ if ("true".equals(m.get(AbstractRecursiveParserWrapperHandler.WRITE_LIMIT_REACHED))) {
+ wlr = true;
+ }
+ }
+ System.out.println(f.getName() + " actualLen:" + len + " : writeLimit: "
+ + writeLimit + " : totalLen: "+totalLen);
+ assertTrue(f.getName() + ": writelimit: " + writeLimit + " len: "+len,
+ len <= writeLimit);
+ assertEquals(f.getName() +" writeLimit: " + writeLimit +
+ " : fullLen:" + totalLen + " limitedLen: " +len,
+ true, wlr);
+ } else if (len > totalLen) {
+ fail("len should never be > totalLen "+len + " : "+ totalLen);
+ }
+ }
+ }
+ */
}
diff --git a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaResourceTest.java b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaResourceTest.java
index e818dc7..79cea79 100644
--- a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaResourceTest.java
+++ b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaResourceTest.java
@@ -126,12 +126,11 @@ public class TikaResourceTest extends CXFTestBase {
@Test
public void testJson() throws Exception {
- Response response = WebClient.create(endPoint + TIKA_PATH).accept(
- "application/json")
+ Response response = WebClient.create(endPoint +
TIKA_PATH).accept("application/json")
.put(ClassLoader.getSystemResourceAsStream(TEST_HELLO_WORLD));
- Metadata metadata =
- JsonMetadata.fromJson(new InputStreamReader(
- ((InputStream)response.getEntity()),
StandardCharsets.UTF_8));
+ Metadata metadata = JsonMetadata.fromJson(
+ new InputStreamReader(((InputStream) response.getEntity()),
+ StandardCharsets.UTF_8));
assertEquals("Nikolai Lobachevsky", metadata.get("author"));
assertEquals("application/mock+xml",
metadata.get(Metadata.CONTENT_TYPE));
@@ -140,12 +139,11 @@ public class TikaResourceTest extends CXFTestBase {
@Test
public void testJsonNPE() throws Exception {
- Response response = WebClient.create(endPoint + TIKA_PATH).accept(
- "application/json")
+ Response response = WebClient.create(endPoint +
TIKA_PATH).accept("application/json")
.put(ClassLoader.getSystemResourceAsStream(TEST_NULL_POINTER));
- Metadata metadata =
- JsonMetadata.fromJson(new InputStreamReader(
- ((InputStream)response.getEntity()),
StandardCharsets.UTF_8));
+ Metadata metadata = JsonMetadata.fromJson(
+ new InputStreamReader(((InputStream) response.getEntity()),
+ StandardCharsets.UTF_8));
assertEquals("Nikolai Lobachevsky", metadata.get("author"));
assertEquals("application/mock+xml",
metadata.get(Metadata.CONTENT_TYPE));
@@ -156,32 +154,29 @@ public class TikaResourceTest extends CXFTestBase {
@Test
public void testJsonWriteLimit() throws Exception {
- Response response = WebClient.create(endPoint + TIKA_PATH)
- .header("writeLimit", "100")
+ Response response = WebClient.create(endPoint +
TIKA_PATH).header("writeLimit", "100")
.accept("application/json")
.put(ClassLoader.getSystemResourceAsStream(TEST_HELLO_WORLD_LONG));
- Metadata metadata =
- JsonMetadata.fromJson(new InputStreamReader(
- ((InputStream)response.getEntity()),
StandardCharsets.UTF_8));
+ Metadata metadata = JsonMetadata.fromJson(
+ new InputStreamReader(((InputStream) response.getEntity()),
+ StandardCharsets.UTF_8));
assertEquals("Nikolai Lobachevsky", metadata.get("author"));
assertEquals("application/mock+xml",
metadata.get(Metadata.CONTENT_TYPE));
assertContains("Hello world",
metadata.get(TikaCoreProperties.TIKA_CONTENT));
assertNotFound("dissolve",
metadata.get(TikaCoreProperties.TIKA_CONTENT));
- assertTrue(metadata.get(TikaCoreProperties.CONTAINER_EXCEPTION).startsWith(
- "org.apache.tika.exception.WriteLimitReachedException"
- ));
+ assertTrue(metadata.get(TikaCoreProperties.CONTAINER_EXCEPTION)
+ .startsWith("org.apache.tika.exception.WriteLimitReachedException"));
assertEquals("true",
metadata.get(TikaCoreProperties.WRITE_LIMIT_REACHED));
}
@Test
public void testJsonHandlerType() throws Exception {
- Response response = WebClient.create(endPoint + TIKA_PATH)
- .accept("application/json")
+ Response response = WebClient.create(endPoint +
TIKA_PATH).accept("application/json")
.put(ClassLoader.getSystemResourceAsStream(TEST_HELLO_WORLD_LONG));
- Metadata metadata =
- JsonMetadata.fromJson(new InputStreamReader(
- ((InputStream)response.getEntity()),
StandardCharsets.UTF_8));
+ Metadata metadata = JsonMetadata.fromJson(
+ new InputStreamReader(((InputStream) response.getEntity()),
+ StandardCharsets.UTF_8));
assertEquals("Nikolai Lobachevsky", metadata.get("author"));
assertEquals("application/mock+xml",
metadata.get(Metadata.CONTENT_TYPE));
@@ -189,16 +184,70 @@ public class TikaResourceTest extends CXFTestBase {
//default is xhtml
assertContains("<p>", metadata.get(TikaCoreProperties.TIKA_CONTENT));
- response = WebClient.create(endPoint + TIKA_PATH + "/text")
- .accept("application/json")
+ response = WebClient.create(endPoint + TIKA_PATH +
"/text").accept("application/json")
.put(ClassLoader.getSystemResourceAsStream(TEST_HELLO_WORLD_LONG));
- metadata =
- JsonMetadata.fromJson(new InputStreamReader(
- ((InputStream)response.getEntity()),
StandardCharsets.UTF_8));
+ metadata = JsonMetadata.fromJson(new InputStreamReader(((InputStream)
response.getEntity()),
+ StandardCharsets.UTF_8));
assertEquals("Nikolai Lobachevsky", metadata.get("author"));
assertEquals("application/mock+xml",
metadata.get(Metadata.CONTENT_TYPE));
assertContains("Hello world",
metadata.get(TikaCoreProperties.TIKA_CONTENT));
assertNotFound("<p>", metadata.get(TikaCoreProperties.TIKA_CONTENT));
}
+
+ /*
+ @Test
+ public void testWriteLimitInAll() throws Exception {
+ //specify your file directory here
+ Path testDocs =
Paths.get("..../tika-parsers/src/test/resources/test-documents");
+ for (File f : testDocs.toFile().listFiles()) {
+ if (f.isDirectory()) {
+ continue;
+ }
+ System.out.println(f.getName());
+ testWriteLimit(f);
+ }
+ }
+
+ private void testWriteLimit(File f) throws Exception {
+ Response response =
+ WebClient.create(endPoint + TIKA_PATH +
"/text").accept("application/json").put(f);
+ assertEquals(200, response.getStatus());
+ Reader reader = new InputStreamReader((InputStream)
response.getEntity(), UTF_8);
+ Metadata metadata = JsonMetadata.fromJson(reader);
+ int totalLen = 0;
+ StringBuilder sb = new StringBuilder();
+ String txt =
metadata.get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT);
+ sb.append(txt);
+ totalLen += (txt == null) ? 0 : txt.length();
+ String fullText = sb.toString();
+ // System.out.println(fullText);
+ Random r = new Random();
+ for (int i = 0; i < 20; i++) {
+ int writeLimit = r.nextInt(totalLen + 100);
+ response = WebClient.create(endPoint + TIKA_PATH +
"/text").accept("application/json")
+ .header("writeLimit", Integer.toString(writeLimit)).put(f);
+ assertEquals(200, response.getStatus());
+ reader = new InputStreamReader((InputStream) response.getEntity(),
UTF_8);
+ Metadata writeLimitMetadata = JsonMetadata.fromJson(reader);
+ int len = 0;
+ StringBuilder extracted = new StringBuilder();
+ txt =
writeLimitMetadata.get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT);
+ len += (txt == null) ? 0 : txt.length();
+ extracted.append(txt);
+ if (totalLen > len) {
+ boolean wlr = "true".equals(writeLimitMetadata
+ .get(AbstractRecursiveParserWrapperHandler.WRITE_LIMIT_REACHED));
+ System.out.println(f.getName() + " " + len + " : " +
writeLimit);
+ assertTrue(f.getName() + ": writelimit: " + writeLimit + " len: " + len,
+ len <= writeLimit);
+ assertEquals(
+ f.getName() + " : " + writeLimit + " : " + len + " total len: " + totalLen,
+ true, wlr);
+ } else if (len > totalLen) {
+ fail("len should never be > totalLen " + len + " : " +
totalLen);
+ }
+ }
+ }*/
+
}