This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/branch_1x by this push:
new f58b01f TIKA-3196 -- multithreading issue in package parser
f58b01f is described below
commit f58b01f10950f04ab10dca70ab9d639178001f5c
Author: tallison <[email protected]>
AuthorDate: Mon Apr 19 09:35:18 2021 -0400
TIKA-3196 -- multithreading issue in package parser
---
.../org/apache/tika/parser/pkg/PackageParser.java | 21 ++++++++++-----------
1 file changed, 10 insertions(+), 11 deletions(-)
diff --git
a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
index 6f6db03..7632586 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
@@ -24,6 +24,7 @@ import java.util.Collections;
import java.util.Date;
import java.util.HashSet;
import java.util.Set;
+import java.util.concurrent.atomic.AtomicInteger;
import java.util.zip.ZipEntry;
import org.apache.commons.compress.PasswordRequiredException;
@@ -109,8 +110,6 @@ public class PackageParser extends AbstractParser {
// the mark limit used for stream
private static final int MARK_LIMIT = 100 * 1024 * 1024; // 100M
- // count of the entries in the archive, this is used for zip requires Data
Descriptor
- private int entryCnt = 0;
static final Set<MediaType> loadPackageSpecializations() {
Set<MediaType> zipSpecializations = new HashSet<>();
@@ -300,9 +299,10 @@ public class PackageParser extends AbstractParser {
// mark before we start parsing entries for potential reset
stream.mark(MARK_LIMIT);
- entryCnt = 0;
+ //require mutable int...this is not being used for thread safety.
+ AtomicInteger entryCnt = new AtomicInteger(0);
try {
- parseEntries(false, ais, metadata, extractor, xhtml);
+ parseEntries(false, ais, metadata, extractor, xhtml, entryCnt);
} catch (UnsupportedZipFeatureException zfe) {
// If this is a zip archive which requires a data descriptor,
parse it again
if (zfe.getFeature() == Feature.DATA_DESCRIPTOR) {
@@ -311,13 +311,11 @@ public class PackageParser extends AbstractParser {
// An exception would be thrown if MARK_LIMIT is not big enough
stream.reset();
ais = new ZipArchiveInputStream(new
CloseShieldInputStream(stream), encoding, true, true);
- parseEntries(true, ais, metadata, extractor, xhtml);
+ parseEntries(true, ais, metadata, extractor, xhtml, entryCnt);
}
} finally {
ais.close();
tmp.close();
- // reset the entryCnt
- entryCnt = 0;
}
xhtml.endDocument();
@@ -336,17 +334,18 @@ public class PackageParser extends AbstractParser {
* @throws SAXException if the SAX events could not be processed
*/
private void parseEntries(boolean shouldUseDataDescriptor,
ArchiveInputStream ais, Metadata metadata,
- EmbeddedDocumentExtractor extractor,
XHTMLContentHandler xhtml)
+ EmbeddedDocumentExtractor extractor,
XHTMLContentHandler xhtml,
+ AtomicInteger entryCnt)
throws TikaException, IOException, SAXException {
try {
ArchiveEntry entry = ais.getNextEntry();
while (entry != null) {
- if (shouldUseDataDescriptor && entryCnt > 0) {
+ if (shouldUseDataDescriptor && entryCnt.get() > 0) {
// With shouldUseDataDescriptor being true, we are reading
// the zip once again. The number of entryCnt entries have
// already been parsed in the last time, so we can just
// skip these entries.
- entryCnt--;
+ entryCnt.decrementAndGet();
entry = ais.getNextEntry();
continue;
}
@@ -359,7 +358,7 @@ public class PackageParser extends AbstractParser {
// Record the number of entries we have read, this is used
// for zip archives using Data Descriptor. It's used for
// skipping the entries we have already read
- entryCnt++;
+ entryCnt.incrementAndGet();
}
entry = ais.getNextEntry();