This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4048 in repository https://gitbox.apache.org/repos/asf/tika.git
commit 46832d718eb1572d3cca4d7d18415afff336d4c6 Author: tallison <talli...@apache.org> AuthorDate: Wed Aug 16 13:27:43 2023 -0400 TIKA-4048 -- revert change in default decompressConcatenated and add a gzip subtype detector for warc+gz --- CHANGES.txt | 3 +- .../org/apache/tika/mime/tika-mimetypes.xml | 6 ++ .../src/test/java/org/apache/tika/TikaTest.java | 63 +++++++++++++++ .../detect/gzip/GZipSpecializationDetector.java | 90 +++++++++++++++++++++ .../apache/tika/parser/pkg/CompressorParser.java | 2 +- .../services/org.apache.tika.detect.Detector | 16 ++++ .../tika-parser-webarchive-module/pom.xml | 8 +- .../org/apache/tika/parser/warc/WARCParser.java | 5 +- .../apache/tika/parser/warc/WARCParserTest.java | 23 +++++- .../test-documents/testWARC_multiple.warc | Bin 0 -> 6773 bytes .../test-documents/testWARC_multiple.warc.gz | Bin 0 -> 5907 bytes .../apache/tika/detect/TestDetectorLoading.java | 8 +- .../org/apache/tika/parser/pkg/GzipParserTest.java | 12 ++- .../resources/configs/tika-config-multiple-gz.xml | 29 +++++++ 14 files changed, 253 insertions(+), 12 deletions(-) diff --git a/CHANGES.txt b/CHANGES.txt index 55bd83671..015a55e43 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -7,8 +7,7 @@ Release 2.8.1 - ??? * Fix bug that led to duplicate extraction of macros from some OLE2 containers (TIKA-4116). - * Changed default decompressConcatenated to true in CompressorParser. - Users may revert to legacy behavior via tika-config.xml (TIKA-4048). + * Add detection of warc.gz as a specialization of gz and parse as if a standard WARC (TIKA-4048). * Allow users to modify the attachment limit size in the /unpack resource (TIKA-4039) diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml index 362ace8c1..47203a163 100644 --- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml +++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml @@ -3212,6 +3212,12 @@ <glob pattern="*.warc"/> </mime-type> + <mime-type type="application/warc+gz"> + <acronym>WARC</acronym> + <_comment>WARC</_comment> + <glob pattern="*.warc.gz"/> + </mime-type> + <mime-type type="application/wasm"> <acronym>Wasm</acronym> <_comment>Web Assembly</_comment> diff --git a/tika-core/src/test/java/org/apache/tika/TikaTest.java b/tika-core/src/test/java/org/apache/tika/TikaTest.java index a00d7b2b0..c20229b59 100644 --- a/tika-core/src/test/java/org/apache/tika/TikaTest.java +++ b/tika-core/src/test/java/org/apache/tika/TikaTest.java @@ -16,6 +16,7 @@ */ package org.apache.tika; +import static org.junit.jupiter.api.Assertions.assertArrayEquals; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertFalse; import static org.junit.jupiter.api.Assertions.assertTrue; @@ -101,6 +102,33 @@ public abstract class TikaTest { assertFalse(haystack.contains(needle), needle + " unexpectedly found in:\n" + haystack); } + public static void assertMetadataListEquals(List<Metadata> metadataListA, + List<Metadata> metadataListB, + Set<String> fieldsToIgnore) { + assertEquals(metadataListA.size(), metadataListB.size(), "different sizes"); + for (int i = 0; i < metadataListA.size(); i++) { + Metadata mA = metadataListA.get(i); + Metadata mB = metadataListB.get(i); + Set<String> mAFields = new HashSet<>(); + for (String n : mA.names()) { + if (fieldsToIgnore.contains(n)) { + continue; + } + mAFields.add(n); + assertArrayEquals(mA.getValues(n), mB.getValues(n), "problem with " + n + + " in metadata index=" + i); + } + Set<String> mBFields = new HashSet<>(); + for (String n : mB.names()) { + if (fieldsToIgnore.contains(n)) { + continue; + } + mBFields.add(n); + } + assertEquals(mAFields, mBFields); + } + } + /** * Test that in at least one item in metadataList, all keys and values * in minExpected are contained. @@ -315,6 +343,14 @@ public abstract class TikaTest { return getRecursiveMetadata(filePath, new ParseContext()); } + protected List<Metadata> getRecursiveMetadata(String filePath, + BasicContentHandlerFactory.HANDLER_TYPE handlerType) + throws Exception { + return getRecursiveMetadata(filePath, TikaTest.AUTO_DETECT_PARSER, new Metadata(), + new ParseContext(), true, + handlerType); + } + protected List<Metadata> getRecursiveMetadata(String filePath, Metadata metadata) throws Exception { return getRecursiveMetadata(filePath, metadata, new ParseContext()); @@ -340,6 +376,16 @@ public abstract class TikaTest { } } + protected List<Metadata> getRecursiveMetadata(String filePath, Parser wrapped, + Metadata metadata, ParseContext context, + boolean suppressException, + BasicContentHandlerFactory.HANDLER_TYPE handlerType) + throws Exception { + try (InputStream is = getResourceAsStream("/test-documents/" + filePath)) { + return getRecursiveMetadata(is, wrapped, metadata, context, suppressException, handlerType); + } + } + protected List<Metadata> getRecursiveMetadata(Path path, ParseContext context, boolean suppressException) throws Exception { Metadata metadata = new Metadata(); @@ -406,6 +452,23 @@ public abstract class TikaTest { return handler.getMetadataList(); } + protected List<Metadata> getRecursiveMetadata(InputStream is, Parser p, Metadata metadata, + ParseContext context, boolean suppressException, + BasicContentHandlerFactory.HANDLER_TYPE handlerType) + throws Exception { + RecursiveParserWrapper wrapper = new RecursiveParserWrapper(p); + RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler( + new BasicContentHandlerFactory(handlerType, -1)); + try { + wrapper.parse(is, handler, metadata, context); + } catch (Exception e) { + if (!suppressException) { + throw e; + } + } + return handler.getMetadataList(); + } + protected List<Metadata> getRecursiveMetadata(String filePath, ParseContext context) throws Exception { RecursiveParserWrapper wrapper = new RecursiveParserWrapper(AUTO_DETECT_PARSER); diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/detect/gzip/GZipSpecializationDetector.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/detect/gzip/GZipSpecializationDetector.java new file mode 100644 index 000000000..e3d743ad3 --- /dev/null +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/detect/gzip/GZipSpecializationDetector.java @@ -0,0 +1,90 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.detect.gzip; + +import java.io.IOException; +import java.io.InputStream; +import java.nio.charset.StandardCharsets; +import java.util.zip.GZIPInputStream; + +import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream; +import org.apache.commons.compress.utils.IOUtils; +import org.apache.commons.io.input.UnsynchronizedByteArrayInputStream; +import org.apache.commons.io.output.UnsynchronizedByteArrayOutputStream; + +import org.apache.tika.detect.Detector; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.mime.MediaType; + +/** + * This is designed to detect commonly gzipped file types such as warc.gz. + * This is a first step. We still need to implement tar.gz and svg.gz and ??? + */ +public class GZipSpecializationDetector implements Detector { + public static MediaType GZ = MediaType.application("gzip"); + public static MediaType WARC_GZ = MediaType.application("warc+gz"); + + @Override + public MediaType detect(InputStream input, Metadata metadata) throws IOException { + if (input == null) { + return MediaType.OCTET_STREAM; + } + input.mark(2); + byte[] firstTwo = new byte[2]; + try { + IOUtils.readFully(input, firstTwo); + } finally { + input.reset(); + } + int magic = ((firstTwo[1] & 0xff) << 8) | (firstTwo[0] & 0xff); + if (GZIPInputStream.GZIP_MAGIC != magic) { + return MediaType.OCTET_STREAM; + } + return detectSpecialization(input, metadata); + } + + private MediaType detectSpecialization(InputStream input, Metadata metadata) throws IOException { + + int buffSize = 1024; + UnsynchronizedByteArrayOutputStream gzippedBytes = new UnsynchronizedByteArrayOutputStream(); + try { + IOUtils.copyRange(input, buffSize, gzippedBytes); + } catch (IOException e) { + //swallow + } finally { + input.reset(); + } + UnsynchronizedByteArrayOutputStream bytes = new UnsynchronizedByteArrayOutputStream(); + try (InputStream is = new + GzipCompressorInputStream(new UnsynchronizedByteArrayInputStream(gzippedBytes.toByteArray()))) { + int c = is.read(); + //read bytes one at a time to avoid premature EOF from buffering + while (c > -1) { + bytes.write(c); + c = is.read(); + } + } catch (IOException e) { + //swallow + } + //TODO: something better than this + String s = new String(bytes.toByteArray(), StandardCharsets.UTF_8); + if (s.startsWith("WARC/")) { + return WARC_GZ; + } + return GZ; + } +} diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java index 6b42250b3..77f5b9647 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java @@ -86,7 +86,7 @@ public class CompressorParser extends AbstractParser { private static Set<MediaType> SUPPORTED_TYPES; private static Map<String, String> MIMES_TO_NAME; - private boolean decompressConcatenated = true; + private boolean decompressConcatenated = false; static { Set<MediaType> TMP_SET = new HashSet<>(MediaType diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/resources/META-INF/services/org.apache.tika.detect.Detector b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/resources/META-INF/services/org.apache.tika.detect.Detector new file mode 100644 index 000000000..a5d143217 --- /dev/null +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/resources/META-INF/services/org.apache.tika.detect.Detector @@ -0,0 +1,16 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +org.apache.tika.detect.gzip.GZipSpecializationDetector diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/pom.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/pom.xml index c3743cb6f..d194f190e 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/pom.xml +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/pom.xml @@ -38,7 +38,7 @@ <groupId>org.apache.commons</groupId> <artifactId>commons-compress</artifactId> </dependency> - <!-- need these for detection/ungzipping and html parsing in tests --> + <!-- need these for detection/gunzipping and html+txt parsing in tests --> <dependency> <groupId>${project.groupId}</groupId> <artifactId>tika-parser-html-module</artifactId> @@ -51,6 +51,12 @@ <version>${project.version}</version> <scope>test</scope> </dependency> + <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>tika-parser-text-module</artifactId> + <version>${project.version}</version> + <scope>test</scope> + </dependency> </dependencies> <build> diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/main/java/org/apache/tika/parser/warc/WARCParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/main/java/org/apache/tika/parser/warc/WARCParser.java index 7e1dcc17a..8025f6643 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/main/java/org/apache/tika/parser/warc/WARCParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/main/java/org/apache/tika/parser/warc/WARCParser.java @@ -52,7 +52,8 @@ import org.apache.tika.utils.StringUtils; public class WARCParser extends AbstractParser { private static final Set<MediaType> SUPPORTED_TYPES = Collections.unmodifiableSet( - new HashSet<>(Arrays.asList(MediaType.application("warc")))); + new HashSet<>(Arrays.asList(MediaType.application("warc"), + MediaType.application("warc+gz")))); public static String WARC_PREFIX = "warc:"; public static String WARC_HTTP_PREFIX = WARC_PREFIX + "http:"; @@ -137,6 +138,8 @@ public class WARCParser extends AbstractParser { metadata.set(Metadata.CONTENT_LENGTH, Long.toString(payload.body().size())); if (embeddedDocumentExtractor.shouldParseEmbedded(metadata)) { + //TODO check Content-Encoding on the warcResponse.http.headers and wrap the stream. + //May need to sniff first few bytes to confirm accuracy, e.g. gzip compression ? try (InputStream tis = TikaInputStream.get(payload.body().stream())) { embeddedDocumentExtractor.parseEmbedded(tis, xhtml, metadata, true); } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/java/org/apache/tika/parser/warc/WARCParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/java/org/apache/tika/parser/warc/WARCParserTest.java index e37203ce6..c92f8ec15 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/java/org/apache/tika/parser/warc/WARCParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/java/org/apache/tika/parser/warc/WARCParserTest.java @@ -18,13 +18,16 @@ package org.apache.tika.parser.warc; import static org.junit.jupiter.api.Assertions.assertEquals; +import java.util.HashSet; import java.util.List; +import java.util.Set; import org.junit.jupiter.api.Test; import org.apache.tika.TikaTest; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.sax.BasicContentHandlerFactory; public class WARCParserTest extends TikaTest { @@ -35,14 +38,30 @@ public class WARCParserTest extends TikaTest { public void testBasic() throws Exception { List<Metadata> metadataList = getRecursiveMetadata("cc.warc.gz"); - assertEquals(3, metadataList.size()); + assertEquals(2, metadataList.size()); + assertEquals("application/warc+gz", metadataList.get(0).get(Metadata.CONTENT_TYPE)); assertContains("text/html", metadataList.get(1).get(Metadata.CONTENT_TYPE)); assertContains("Common Crawl on Twitter", metadataList.get(1).get(TikaCoreProperties.TIKA_CONTENT)); - assertEquals("application/warc", metadataList.get(2).get(Metadata.CONTENT_TYPE)); assertEquals("<urn:uuid:c3f02271-44d2-4159-9cdb-3e3efeb16ba0>", metadataList.get(1).get("warc:WARC-Warcinfo-ID")); assertEquals("http://commoncrawl.org/", metadataList.get(1).get("warc:WARC-Target-URI")); + } + + @Test + public void testMultipleRecords() throws Exception { + //TIKA- + List<Metadata> metadataList = getRecursiveMetadata("testWARC_multiple.warc", + BasicContentHandlerFactory.HANDLER_TYPE.TEXT); + List<Metadata> gzMetadataList = getRecursiveMetadata("testWARC_multiple.warc.gz", + BasicContentHandlerFactory.HANDLER_TYPE.TEXT); + + Set<String> fieldsToIgnore = new HashSet<>(); + fieldsToIgnore.add("X-TIKA:parse_time_millis"); + fieldsToIgnore.add("Content-Type"); + assertMetadataListEquals(metadataList, gzMetadataList, fieldsToIgnore); + assertEquals("application/warc", metadataList.get(0).get(Metadata.CONTENT_TYPE)); + assertEquals("application/warc+gz", gzMetadataList.get(0).get(Metadata.CONTENT_TYPE)); } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/resources/test-documents/testWARC_multiple.warc b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/resources/test-documents/testWARC_multiple.warc new file mode 100644 index 000000000..e0bdf7e24 Binary files /dev/null and b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/resources/test-documents/testWARC_multiple.warc differ diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/resources/test-documents/testWARC_multiple.warc.gz b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/resources/test-documents/testWARC_multiple.warc.gz new file mode 100644 index 000000000..4a5dcbf5b Binary files /dev/null and b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/resources/test-documents/testWARC_multiple.warc.gz differ diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/detect/TestDetectorLoading.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/detect/TestDetectorLoading.java index a95212944..82a9e7df9 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/detect/TestDetectorLoading.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/detect/TestDetectorLoading.java @@ -32,11 +32,13 @@ public class TestDetectorLoading { //integration test Detector detector = TikaConfig.getDefaultConfig().getDetector(); List<Detector> detectors = ((CompositeDetector) detector).getDetectors(); - assertEquals(6, detectors.size()); + assertEquals(7, detectors.size()); assertEquals("org.gagravarr.tika.OggDetector", detectors.get(0).getClass().getName()); + assertEquals("org.apache.tika.detect.gzip.GZipSpecializationDetector", + detectors.get(2).getClass().getName()); assertEquals("org.apache.tika.detect.microsoft.POIFSContainerDetector", - detectors.get(2).getClass().getName()); - assertEquals("org.apache.tika.mime.MimeTypes", detectors.get(5).getClass().getName()); + detectors.get(3).getClass().getName()); + assertEquals("org.apache.tika.mime.MimeTypes", detectors.get(6).getClass().getName()); } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pkg/GzipParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pkg/GzipParserTest.java index fba465882..0bc80263a 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pkg/GzipParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pkg/GzipParserTest.java @@ -82,9 +82,17 @@ public class GzipParserTest extends AbstractPkgTest { } @Test - public void testDecompressConcatenatedDefault() throws Exception { + public void testDecompressConcatenated() throws Exception { + //test default + assertEquals(2, getRecursiveMetadata("multiple.gz").size()); + + //test config + TikaConfig tikaConfig = null; + try (InputStream is = getResourceAsStream("/configs/tika-config-multiple-gz.xml")) { + tikaConfig = new TikaConfig(is); + } assertContains("<p>ab</p>", - getRecursiveMetadata("multiple.gz").get(1) + getRecursiveMetadata("multiple.gz", new AutoDetectParser(tikaConfig)).get(1) .get(TikaCoreProperties.TIKA_CONTENT)); } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-multiple-gz.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-multiple-gz.xml new file mode 100644 index 000000000..370532af4 --- /dev/null +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-multiple-gz.xml @@ -0,0 +1,29 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<properties> + <parsers> + <parser class="org.apache.tika.parser.DefaultParser"> + <parser-exclude class="org.apache.tika.parser.pkg.CompressorParser"/> + </parser> + <parser class="org.apache.tika.parser.pkg.CompressorParser"> + <params> + <param name="decompressConcatenated" type="bool">true</param> + </params> + </parser> + </parsers> +</properties>