This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 5387cef917 TIKA-4553-rm-tika-config-parsers-standard-package (#2440)
5387cef917 is described below
commit 5387cef91797da3a0af6a5c70a9cceb6d41c3cc3
Author: Tim Allison <[email protected]>
AuthorDate: Thu Dec 11 10:05:04 2025 -0500
TIKA-4553-rm-tika-config-parsers-standard-package (#2440)
---
.../tika/language/translate/DefaultTranslator.java | 6 +
.../java/org/apache/tika/parser/EmptyParser.java | 2 +
.../java/org/apache/tika/parser/ErrorParser.java | 2 +
.../tika/config/TikaConfigSerializerTest.java | 15 +-
.../apache/tika/config/TikaDetectorConfigTest.java | 38 +---
.../tika/config/TikaEncodingDetectorTest.java | 3 +-
.../apache/tika/config/TikaParserConfigTest.java | 164 +++++++-------
.../tika/config/TikaTranslatorConfigTest.java | 54 ++---
.../org/apache/tika/detect/TestZipDetector.java | 7 +-
.../java/org/apache/tika/parser/TestXXEInXML.java | 244 ---------------------
.../parser/fork/ForkParserIntegrationTest.java | 8 +-
.../tika/parser/ocr/TesseractOCRParserTest.java | 9 +-
.../pkg/CompositeZipContainerDetectorTest.java | 14 +-
.../org/apache/tika/parser/pkg/GzipParserTest.java | 9 +-
.../apache/tika/parser/pkg/UnrarParserTest.java | 10 +-
.../configs/TIKA-1708-detector-composite.json | 13 ++
.../configs/TIKA-1708-detector-default.json | 11 +-
.../test/resources/configs/tika-4424-config.xml | 26 ---
.../src/test/resources/configs/tika-4441-120.xml | 36 ---
.../test/resources/configs/tika-4441-12000000.xml | 36 ---
.../src/test/resources/configs/tika-4441-neg1.xml | 36 ---
.../src/test/resources/configs/tika-4533.xml | 47 ----
.../configs/tika-config-digests-pdf-only.xml | 33 ---
.../configs/tika-config-digests-skip-container.xml | 33 ---
.../test/resources/configs/tika-config-digests.xml | 32 ---
...ka-config-doubling-custom-handler-decorator.xml | 27 ---
.../tika-config-geo-point-metadata-filter.xml | 24 --
.../resources/configs/tika-config-lib-pst.json | 11 +-
.../test/resources/configs/tika-config-lib-pst.xml | 26 ---
.../resources/configs/tika-config-multiple-gz.json | 14 ++
.../resources/configs/tika-config-multiple-gz.xml | 29 ---
.../test/resources/configs/tika-unrar-config.json | 12 +
.../org/apache/tika/config/TIKA-1558-exclude.json | 23 +-
.../org/apache/tika/config/TIKA-1558-exclude.xml | 29 ---
.../apache/tika/config/TIKA-1558-excludesub.json | 7 +-
.../apache/tika/config/TIKA-1558-excludesub.xml | 24 --
.../tika/config/TIKA-1702-detector-exclude.xml | 31 ---
.../tika/config/TIKA-1702-translator-default.json | 10 +-
.../tika/config/TIKA-1702-translator-default.xml | 24 --
.../config/TIKA-1702-translator-empty-default.json | 11 +-
.../config/TIKA-1702-translator-empty-default.xml | 22 --
.../tika/config/TIKA-1702-translator-empty.json | 8 +-
.../tika/config/TIKA-1702-translator-empty.xml | 20 --
.../tika/config/TIKA-1708-detector-composite.json | 13 +-
.../tika/config/TIKA-1708-detector-composite.xml | 25 ---
...-2273-encoding-detector-outside-static-init.xml | 34 ---
...TIKA-2273-exclude-encoding-detector-default.xml | 29 ---
.../TIKA-2273-no-icu4j-encoding-detector.xml | 27 ---
.../TIKA-2273-non-detecting-params-bad-charset.xml | 29 ---
.../tika/config/TIKA-2273-non-detecting-params.xml | 29 ---
.../TIKA-2273-parameterize-encoding-detector.xml | 30 ---
.../TIKA-2485-encoding-detector-mark-limits.xml | 38 ----
.../org/apache/tika/parser/TIKA-3137-include.xml | 34 ---
.../apache/tika/parser/ocr/tesseract-config.json | 14 +-
.../apache/tika/parser/ocr/tesseract-config.xml | 32 ---
.../apache/tika/config/loader/DetectorLoader.java | 7 +
.../apache/tika/config/loader/ParserLoader.java | 24 +-
57 files changed, 260 insertions(+), 1345 deletions(-)
diff --git
a/tika-core/src/main/java/org/apache/tika/language/translate/DefaultTranslator.java
b/tika-core/src/main/java/org/apache/tika/language/translate/DefaultTranslator.java
index e9d9636f52..404e5fc2a7 100644
---
a/tika-core/src/main/java/org/apache/tika/language/translate/DefaultTranslator.java
+++
b/tika-core/src/main/java/org/apache/tika/language/translate/DefaultTranslator.java
@@ -20,6 +20,7 @@ import java.io.IOException;
import java.util.List;
import org.apache.tika.config.ServiceLoader;
+import org.apache.tika.config.TikaComponent;
import org.apache.tika.exception.TikaException;
import org.apache.tika.utils.CompareUtils;
@@ -30,6 +31,7 @@ import org.apache.tika.utils.CompareUtils;
*
* @since Apache Tika 1.6
*/
+@TikaComponent
public class DefaultTranslator implements Translator {
private transient final ServiceLoader loader;
@@ -59,6 +61,10 @@ public class DefaultTranslator implements Translator {
*/
private static Translator getFirstAvailable(ServiceLoader loader) {
for (Translator t : getDefaultTranslators(loader)) {
+ // Skip DefaultTranslator to avoid infinite recursion
+ if (t instanceof DefaultTranslator) {
+ continue;
+ }
if (t.isAvailable()) {
return t;
}
diff --git a/tika-core/src/main/java/org/apache/tika/parser/EmptyParser.java
b/tika-core/src/main/java/org/apache/tika/parser/EmptyParser.java
index 546d0c2a71..83e88a1463 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/EmptyParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/EmptyParser.java
@@ -23,6 +23,7 @@ import java.util.Set;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
+import org.apache.tika.config.TikaComponent;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.sax.XHTMLContentHandler;
@@ -32,6 +33,7 @@ import org.apache.tika.sax.XHTMLContentHandler;
* attempting to parse the given document stream. Useful as a sentinel parser
* for unknown document types.
*/
+@TikaComponent(spi = false)
public class EmptyParser implements Parser {
/**
* Singleton instance of this class.
diff --git a/tika-core/src/main/java/org/apache/tika/parser/ErrorParser.java
b/tika-core/src/main/java/org/apache/tika/parser/ErrorParser.java
index b8071cb52f..f7d4063d6c 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/ErrorParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/ErrorParser.java
@@ -22,6 +22,7 @@ import java.util.Set;
import org.xml.sax.ContentHandler;
+import org.apache.tika.config.TikaComponent;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
@@ -31,6 +32,7 @@ import org.apache.tika.mime.MediaType;
* attempting to parse the given document stream. Useful as a sentinel parser
* for unknown document types.
*/
+@TikaComponent(spi = false)
public class ErrorParser implements Parser {
/**
* Singleton instance of this class.
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/config/TikaConfigSerializerTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/config/TikaConfigSerializerTest.java
index 60172383a9..9cb5df8d52 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/config/TikaConfigSerializerTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/config/TikaConfigSerializerTest.java
@@ -15,22 +15,11 @@
* limitations under the License.
*/
package org.apache.tika.config;
-
-import static org.apache.tika.TikaTest.assertContains;
-import static org.apache.tika.TikaTest.assertContainsCount;
-import static org.apache.tika.TikaTest.assertNotContained;
-
-import java.io.ByteArrayInputStream;
-import java.io.InputStream;
-import java.io.StringWriter;
-import java.nio.charset.StandardCharsets;
-
import org.junit.jupiter.api.Disabled;
-import org.junit.jupiter.api.Test;
@Disabled("TODO -- convert to TikaLoader/serializer")
public class TikaConfigSerializerTest {
-
+/*
@Test
public void testBasicParams() throws Exception {
TikaConfig tikaConfig = TikaConfig.getDefaultConfig();
@@ -73,6 +62,6 @@ public class TikaConfigSerializerTest {
assertContainsCount("<param name=\"concatenatePhoneticRuns\"
type=\"bool\">true</param>",
writer.toString(), 3);
}
-
+*/
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/config/TikaDetectorConfigTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/config/TikaDetectorConfigTest.java
index 203f86d845..5ae4237b0c 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/config/TikaDetectorConfigTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/config/TikaDetectorConfigTest.java
@@ -24,6 +24,7 @@ import static org.junit.jupiter.api.Assertions.fail;
import org.junit.jupiter.api.Test;
import org.apache.tika.TikaLoaderHelper;
+import org.apache.tika.TikaTest;
import org.apache.tika.config.loader.TikaLoader;
import org.apache.tika.detect.CompositeDetector;
import org.apache.tika.detect.DefaultDetector;
@@ -33,15 +34,12 @@ import
org.apache.tika.detect.microsoft.POIFSContainerDetector;
import org.apache.tika.detect.zip.DefaultZipContainerDetector;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
-import org.apache.tika.mime.MimeTypes;
import org.apache.tika.parser.microsoft.pst.OutlookPSTParser;
/**
- * Junit test class for {@link TikaConfig}, which cover things
- * that {@link TikaConfigTest} can't do due to a need for the
- * full set of detectors
+ * Junit test class for detector configuration via JSON.
*/
-public class TikaDetectorConfigTest extends AbstractTikaConfigTest {
+public class TikaDetectorConfigTest extends TikaTest {
@Test
public void testDetectorExcludeFromDefault() throws Exception {
@@ -53,30 +51,23 @@ public class TikaDetectorConfigTest extends
AbstractTikaConfigTest {
// Should be wrapping two detectors
assertEquals(2, detector.getDetectors().size());
-
// First should be DefaultDetector, second Empty, that order
assertEquals(DefaultDetector.class,
detector.getDetectors().get(0).getClass());
assertEquals(EmptyDetector.class,
detector.getDetectors().get(1).getClass());
-
// Get the DefaultDetector from the config
DefaultDetector confDetector = (DefaultDetector)
detector.getDetectors().get(0);
- // Get a fresh "default" DefaultParser
- TikaLoader.getMediaTypeRegistry().getTypes();
- MimeTypes mimeTypes = new MimeTypes();
+ // Get a fresh "default" DefaultDetector
DefaultDetector normDetector = new
DefaultDetector(TikaLoader.getMimeTypes());
-
// The default one will offer the Zip and POIFS detectors
assertDetectors(normDetector, true, true);
-
// The one from the config won't, as we excluded those
assertDetectors(confDetector, false, false);
}
-
/**
* TIKA-1708 - If the Zip detector is disabled, either explicitly,
* or via giving a list of detectors that it isn't part of, ensure
@@ -93,30 +84,24 @@ public class TikaDetectorConfigTest extends
AbstractTikaConfigTest {
// Check it has the POIFS one, but not the zip one
assertDetectors(detectorWX, true, false);
-
// Check the one with an explicit list
- TikaConfig configCL = getConfig("TIKA-1708-detector-composite.xml");
- assertNotNull(configCL.getParser());
- assertNotNull(configCL.getDetector());
- CompositeDetector detectorCL = (CompositeDetector)
configCL.getDetector();
+ TikaLoader configCL =
TikaLoaderHelper.getLoader("TIKA-1708-detector-composite.json");
+ assertNotNull(configCL.loadParsers());
+ assertNotNull(configCL.loadDetectors());
+ CompositeDetector detectorCL = (CompositeDetector)
configCL.loadDetectors();
assertEquals(2, detectorCL.getDetectors().size());
// Check it also has the POIFS one, but not the zip one
assertDetectors(detectorCL, true, false);
-
- // Check that both detectors have a mimetypes with entries
- assertTrue(configWX.getMediaTypeRegistry().getTypes().size() > 100,
- "Not enough mime types: " +
configWX.getMediaTypeRegistry().getTypes().size());
- assertTrue(configCL.getMediaTypeRegistry().getTypes().size() > 100,
- "Not enough mime types: " +
configCL.getMediaTypeRegistry().getTypes().size());
-
+ // Check that media type registry has entries
+ assertTrue(TikaLoader.getMediaTypeRegistry().getTypes().size() > 100,
+ "Not enough mime types: " +
TikaLoader.getMediaTypeRegistry().getTypes().size());
// Now check they detect PST files correctly
try (TikaInputStream outer = TikaInputStream
.get(getResourceAsStream("/test-documents/testPST.pst"))) {
try (TikaInputStream stream =
TikaInputStream.get(outer.getPath())) {
-
assertEquals(OutlookPSTParser.MS_OUTLOOK_PST_MIMETYPE,
detectorWX.detect(stream, new Metadata()));
assertEquals(OutlookPSTParser.MS_OUTLOOK_PST_MIMETYPE,
@@ -152,5 +137,4 @@ public class TikaDetectorConfigTest extends
AbstractTikaConfigTest {
assertTrue(hasZip, "Should have the ZipContainerDetector");
}
}
-
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/config/TikaEncodingDetectorTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/config/TikaEncodingDetectorTest.java
index a6328c00db..98175dde28 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/config/TikaEncodingDetectorTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/config/TikaEncodingDetectorTest.java
@@ -32,6 +32,7 @@ import org.junit.jupiter.api.Test;
import org.apache.tika.Tika;
import org.apache.tika.TikaLoaderHelper;
+import org.apache.tika.TikaTest;
import org.apache.tika.config.loader.TikaLoader;
import org.apache.tika.detect.CompositeEncodingDetector;
import org.apache.tika.detect.EncodingDetector;
@@ -49,7 +50,7 @@ import org.apache.tika.parser.txt.Icu4jEncodingDetector;
import org.apache.tika.parser.txt.TXTParser;
import org.apache.tika.parser.txt.UniversalEncodingDetector;
-public class TikaEncodingDetectorTest extends AbstractTikaConfigTest {
+public class TikaEncodingDetectorTest extends TikaTest {
@Test
public void testDefault() throws TikaConfigException {
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/config/TikaParserConfigTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/config/TikaParserConfigTest.java
index 948cae054e..c2ba6c233a 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/config/TikaParserConfigTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/config/TikaParserConfigTest.java
@@ -17,94 +17,87 @@
package org.apache.tika.config;
import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertTrue;
import static org.junit.jupiter.api.Assertions.fail;
-import java.net.URISyntaxException;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.List;
+import java.util.Set;
-import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
+import org.apache.tika.TikaTest;
import org.apache.tika.config.loader.TikaLoader;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.CompositeParser;
import org.apache.tika.parser.DefaultParser;
import org.apache.tika.parser.EmptyParser;
+import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.ParserDecorator;
import org.apache.tika.parser.executable.ExecutableParser;
import org.apache.tika.parser.xml.XMLParser;
/**
- * Junit test class for {@link TikaConfig}, which cover things
- * that {@link TikaConfigTest} can't do due to a need for the
- * full set of parsers
+ * Junit test class for parser configuration via JSON,
+ * covering things that require the full set of parsers.
*/
-public class TikaParserConfigTest extends AbstractTikaConfigTest {
+public class TikaParserConfigTest extends TikaTest {
+
+ protected static ParseContext context = new ParseContext();
+
+ private TikaLoader getLoader(String config) throws Exception {
+ Path path =
Paths.get(TikaParserConfigTest.class.getResource(config).toURI());
+ return TikaLoader.load(path);
+ }
@Test
public void testMimeExcludeInclude() throws Exception {
- TikaConfig config = getConfig("TIKA-1558-exclude.xml");
- assertNotNull(config.getParser());
- assertNotNull(config.getDetector());
- Parser parser = config.getParser();
+ TikaLoader loader = getLoader("TIKA-1558-exclude.json");
+ Parser parser = loader.loadParsers();
+ assertNotNull(parser);
+ assertNotNull(loader.loadDetectors());
MediaType PDF = MediaType.application("pdf");
MediaType JPEG = MediaType.image("jpeg");
-
- // Has two parsers
+ // Has two parsers: EmptyParser (decorated) and CompositeParser of SPI
parsers (decorated)
assertEquals(CompositeParser.class, parser.getClass());
CompositeParser cParser = (CompositeParser) parser;
assertEquals(2, cParser.getAllComponentParsers().size());
- // Both are decorated
- assertTrue(cParser.getAllComponentParsers().get(0) instanceof
ParserDecorator);
- assertTrue(cParser.getAllComponentParsers().get(1) instanceof
ParserDecorator);
- ParserDecorator p0 = (ParserDecorator)
cParser.getAllComponentParsers().get(0);
- ParserDecorator p1 = (ParserDecorator)
cParser.getAllComponentParsers().get(1);
-
-
- // DefaultParser will be wrapped with excludes
- assertEquals(DefaultParser.class, p0.getWrappedParser().getClass());
-
- assertNotContained(PDF, p0.getSupportedTypes(context));
- assertContains(PDF, p0.getWrappedParser().getSupportedTypes(context));
- assertNotContained(JPEG, p0.getSupportedTypes(context));
- assertContains(JPEG, p0.getWrappedParser().getSupportedTypes(context));
-
-
- // Will have an empty parser for PDF
- assertEquals(EmptyParser.class, p1.getWrappedParser().getClass());
- assertEquals(1, p1.getSupportedTypes(context).size());
- assertContains(PDF, p1.getSupportedTypes(context));
- assertNotContained(PDF,
p1.getWrappedParser().getSupportedTypes(context));
+ // First parser should be EmptyParser decorated with mimeInclude for
PDF
+ Parser p0 = cParser.getAllComponentParsers().get(0);
+ assertTrue(p0 instanceof ParserDecorator, "First parser should be
decorated");
+ ParserDecorator pd0 = (ParserDecorator) p0;
+ assertEquals(EmptyParser.class, pd0.getWrappedParser().getClass());
+ Set<MediaType> p0Types = pd0.getSupportedTypes(context);
+ assertContains(PDF, p0Types);
+ assertEquals(1, p0Types.size());
+
+ // Second parser should be SPI parsers decorated with mimeExclude for
PDF/JPEG
+ Parser p1 = cParser.getAllComponentParsers().get(1);
+ assertTrue(p1 instanceof ParserDecorator, "Second parser should be
decorated");
+ ParserDecorator pd1 = (ParserDecorator) p1;
+ Set<MediaType> p1Types = pd1.getSupportedTypes(context);
+ assertNotContained(PDF, p1Types);
+ assertNotContained(JPEG, p1Types);
}
@Test
public void testParserExcludeFromDefault() throws Exception {
- TikaConfig config = getConfig("TIKA-1558-exclude.xml");
- assertNotNull(config.getParser());
- assertNotNull(config.getDetector());
- CompositeParser parser = (CompositeParser) config.getParser();
+ TikaLoader loader = getLoader("TIKA-1558-exclude.json");
+ Parser parser = loader.loadParsers();
+ assertNotNull(parser);
MediaType PE_EXE = MediaType.application("x-msdownload");
MediaType ELF = MediaType.application("x-elf");
-
- // Get the DefaultParser from the config
- ParserDecorator confWrappedParser =
- (ParserDecorator)
parser.getParsers().get(MediaType.APPLICATION_XML);
- assertNotNull(confWrappedParser);
- DefaultParser confParser = (DefaultParser)
confWrappedParser.getWrappedParser();
-
- // Get a fresh "default" DefaultParser
- DefaultParser normParser = new
DefaultParser(config.getMediaTypeRegistry());
-
+ // Get a fresh "default" DefaultParser for comparison
+ DefaultParser normParser = new
DefaultParser(TikaLoader.getMediaTypeRegistry());
// The default one will offer the Executable Parser
assertContains(PE_EXE, normParser.getSupportedTypes(context));
@@ -119,16 +112,12 @@ public class TikaParserConfigTest extends
AbstractTikaConfigTest {
}
assertTrue(hasExec);
-
- // The one from the config won't
- assertNotContained(PE_EXE, confParser.getSupportedTypes(context));
- assertNotContained(ELF, confParser.getSupportedTypes(context));
-
- for (Parser p : confParser.getParsers().values()) {
- if (p instanceof ExecutableParser) {
- fail("Shouldn't have the Executable Parser from config");
- }
- }
+ // The config-loaded parser should NOT support executable types
+ // (ExecutableParser was excluded)
+ CompositeParser cParser = (CompositeParser) parser;
+ Set<MediaType> supportedTypes = cParser.getSupportedTypes(context);
+ assertNotContained(PE_EXE, supportedTypes);
+ assertNotContained(ELF, supportedTypes);
}
/**
@@ -137,10 +126,9 @@ public class TikaParserConfigTest extends
AbstractTikaConfigTest {
*/
@Test
public void defaultParserExclude() throws Exception {
- TikaConfig config = new TikaConfig();
- assertNotNull(config.getParser());
- assertNotNull(config.getDetector());
- CompositeParser cp = (CompositeParser) config.getParser();
+ // First verify default config includes XMLParser
+ TikaLoader defaultLoader = TikaLoader.loadDefault();
+ CompositeParser cp = (CompositeParser) defaultLoader.loadParsers();
List<Parser> parsers = cp.getAllComponentParsers();
boolean hasXML = false;
@@ -152,36 +140,46 @@ public class TikaParserConfigTest extends
AbstractTikaConfigTest {
}
assertTrue(hasXML, "Default config should include an XMLParser.");
- // This custom TikaConfig should exclude XMLParser and all of its
subclasses.
- config = getConfig("TIKA-1558-excludesub.xml");
- cp = (CompositeParser) config.getParser();
+ // This custom config should exclude XMLParser
+ TikaLoader loader = getLoader("TIKA-1558-excludesub.json");
+ cp = (CompositeParser) loader.loadParsers();
parsers = cp.getAllComponentParsers();
+ // Flatten nested CompositeParser if present
for (Parser p : parsers) {
- if (p instanceof XMLParser) {
+ if (p instanceof CompositeParser) {
+ for (Parser inner : ((CompositeParser)
p).getAllComponentParsers()) {
+ if (inner instanceof XMLParser) {
+ fail("Custom config should not include an XMLParser ("
+ inner.getClass() + ").");
+ }
+ }
+ } else if (p instanceof ParserDecorator) {
+ Parser wrapped = ((ParserDecorator) p).getWrappedParser();
+ if (wrapped instanceof XMLParser) {
+ fail("Custom config should not include an XMLParser (" +
wrapped.getClass() + ").");
+ }
+ if (wrapped instanceof CompositeParser) {
+ for (Parser inner : ((CompositeParser)
wrapped).getAllComponentParsers()) {
+ if (inner instanceof XMLParser) {
+ fail("Custom config should not include an
XMLParser (" + inner.getClass() + ").");
+ }
+ }
+ }
+ } else if (p instanceof XMLParser) {
fail("Custom config should not include an XMLParser (" +
p.getClass() + ").");
}
}
}
@Test
- @Disabled("TODO -- turn into actual unit test")
- public void testTesseractList() throws Exception {
- TikaLoader tikaLoader =
TikaLoader.load(getPath("tika-config-tesseract-arbitrary.json"));
- Parser p = tikaLoader.loadAutoDetectParser();
- Parser tesseract =
((CompositeParser)p).getAllComponentParsers().get(0);
-
- System.out.println(tesseract);
-
- }
-
- private Path getPath(String config) {
- try {
- return
Paths.get(TikaParserConfigTest.class.getResource("/configs/" + config)
- .toURI());
- } catch (URISyntaxException e) {
- throw new RuntimeException(e);
- }
+ public void testDefaultLoaderIncludesAllParsers() throws Exception {
+ TikaLoader loader = TikaLoader.loadDefault();
+ Parser parser = loader.loadParsers();
+ assertNotNull(parser);
+ assertTrue(parser instanceof CompositeParser);
+
+ CompositeParser cp = (CompositeParser) parser;
+ // Should have many parsers loaded from SPI
+ assertFalse(cp.getAllComponentParsers().isEmpty());
}
-
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/config/TikaTranslatorConfigTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/config/TikaTranslatorConfigTest.java
index 234d4ea298..7a527382f7 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/config/TikaTranslatorConfigTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/config/TikaTranslatorConfigTest.java
@@ -18,56 +18,50 @@ package org.apache.tika.config;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNotNull;
-import static org.junit.jupiter.api.Assertions.assertThrows;
+
+import java.nio.file.Path;
+import java.nio.file.Paths;
import org.junit.jupiter.api.Test;
-import org.apache.tika.exception.TikaConfigException;
+import org.apache.tika.config.loader.TikaLoader;
import org.apache.tika.language.translate.DefaultTranslator;
import org.apache.tika.language.translate.EmptyTranslator;
/**
- * Junit test class for {@link TikaConfig}, which cover things
- * that {@link TikaConfigTest} can't do due to a need for the
- * full set of translators
+ * Junit test class for translator configuration via JSON.
*/
-public class TikaTranslatorConfigTest extends AbstractTikaConfigTest {
+public class TikaTranslatorConfigTest {
+
+ private TikaLoader getLoader(String config) throws Exception {
+ Path path =
Paths.get(TikaTranslatorConfigTest.class.getResource(config).toURI());
+ return TikaLoader.load(path);
+ }
@Test
public void testDefaultBehaviour() throws Exception {
- TikaConfig config = TikaConfig.getDefaultConfig();
- assertNotNull(config.getTranslator());
- assertEquals(DefaultTranslator.class,
config.getTranslator().getClass());
+ TikaLoader loader = TikaLoader.loadDefault();
+ assertNotNull(loader.loadTranslator());
+ assertEquals(DefaultTranslator.class,
loader.loadTranslator().getClass());
}
@Test
public void testRequestsDefault() throws Exception {
- TikaConfig config = getConfig("TIKA-1702-translator-default.xml");
- assertNotNull(config.getParser());
- assertNotNull(config.getDetector());
- assertNotNull(config.getTranslator());
+ TikaLoader loader = getLoader("TIKA-1702-translator-default.json");
+ assertNotNull(loader.loadParsers());
+ assertNotNull(loader.loadDetectors());
+ assertNotNull(loader.loadTranslator());
- assertEquals(DefaultTranslator.class,
config.getTranslator().getClass());
+ assertEquals(DefaultTranslator.class,
loader.loadTranslator().getClass());
}
@Test
public void testRequestsEmpty() throws Exception {
- TikaConfig config = getConfig("TIKA-1702-translator-empty.xml");
- assertNotNull(config.getParser());
- assertNotNull(config.getDetector());
- assertNotNull(config.getTranslator());
+ TikaLoader loader = getLoader("TIKA-1702-translator-empty.json");
+ assertNotNull(loader.loadParsers());
+ assertNotNull(loader.loadDetectors());
+ assertNotNull(loader.loadTranslator());
- assertEquals(EmptyTranslator.class, config.getTranslator().getClass());
- }
-
- /**
- * Currently, Translators don't support Composites, so
- * if multiple translators are given, throw a TikaConfigException
- */
- @Test
- public void testRequestsMultiple() throws Exception {
- assertThrows(TikaConfigException.class, () -> {
- TikaConfig config =
getConfig("TIKA-1702-translator-empty-default.xml");
- });
+ assertEquals(EmptyTranslator.class,
loader.loadTranslator().getClass());
}
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/detect/TestZipDetector.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/detect/TestZipDetector.java
index a4327544c4..a60fb8df86 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/detect/TestZipDetector.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/detect/TestZipDetector.java
@@ -32,7 +32,7 @@ import org.junit.jupiter.api.Test;
import org.apache.tika.Tika;
import org.apache.tika.TikaTest;
-import org.apache.tika.config.TikaConfig;
+import org.apache.tika.config.loader.TikaLoader;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
@@ -40,13 +40,12 @@ public class TestZipDetector extends TikaTest {
private static final String ZIP_FILE = "testTika4424.zip";
- private static final Detector DETECTOR = TikaConfig
- .getDefaultConfig()
- .getDetector();
+ private static Detector DETECTOR;
private static Path DOCX;
@BeforeAll
public static void setUp() throws Exception {
+ DETECTOR = TikaLoader.loadDefault().loadDetectors();
DOCX = Files.createTempFile("test-zip-", ".docx");
Files.copy(TestZipDetector.class.getResourceAsStream("/test-documents/testWORD.docx"),
DOCX, StandardCopyOption.REPLACE_EXISTING);
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/TestXXEInXML.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/TestXXEInXML.java
deleted file mode 100644
index 2853b81854..0000000000
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/TestXXEInXML.java
+++ /dev/null
@@ -1,244 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser;
-
-import static org.junit.jupiter.api.Assertions.fail;
-
-import java.io.ByteArrayInputStream;
-import java.io.ByteArrayOutputStream;
-import java.io.FileNotFoundException;
-import java.io.InputStream;
-import java.nio.charset.StandardCharsets;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import javax.xml.parsers.SAXParser;
-import javax.xml.parsers.SAXParserFactory;
-
-import org.apache.commons.io.IOUtils;
-import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
-import org.junit.jupiter.api.Disabled;
-import org.junit.jupiter.api.Test;
-import org.xml.sax.ContentHandler;
-
-import org.apache.tika.config.TikaConfig;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.io.TikaInputStream;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.parser.microsoft.OfficeParserConfig;
-import org.apache.tika.sax.ToHTMLContentHandler;
-import org.apache.tika.utils.XMLReaderUtils;
-
-/**
- * This tests for XXE in basically xml type files, straight xml and zipped
- * xmls, e.g. ebook and ooxml.
- * It does not test for XXE prevention in files that may contain xml
- * files, such as PDFs and other XMP-containing files.
- */
-public class TestXXEInXML extends XMLTestBase {
- //TODO: figure out how to test XFA and xmp in PDFs
-
- private static final byte[] XXE =
- "<!DOCTYPE roottag PUBLIC \"-//OXML/XXE/EN\"
\"file:///couldnt_possibly_exist\">"
- .getBytes(StandardCharsets.UTF_8);
-
- @Test
- @Disabled("ignore vulnerable tests")
- public void testConfirmVulnerable() throws Exception {
- try {
- parse("testXXE.xml",
getResourceAsStream("/test-documents/testXXE.xml"),
- new VulnerableSAXParser(), new ParseContext());
- fail("should have failed!!!");
- } catch (FileNotFoundException e) {
- //expected
- }
- }
-
- @Test
- public void testXML() throws Exception {
- try (InputStream is =
getResourceAsStream("/test-documents/testXXE.xml")) {
- parse("testXXE.xml", is, AUTO_DETECT_PARSER, new ParseContext());
- }
- }
-
- @Test
- public void testInjectedXML() throws Exception {
- byte[] bytes = "<?xml version=\"1.0\"
encoding=\"UTF-8\"?><document>blah</document>"
- .getBytes(StandardCharsets.UTF_8);
- byte[] injected = injectXML(bytes, XXE);
- try {
- parse("injected", new ByteArrayInputStream(injected), new
VulnerableSAXParser(),
- new ParseContext());
- fail("injected should have triggered xxe");
- } catch (FileNotFoundException e) {
- //expected
- }
- }
-
- @Test
- public void test2003_2006xml() throws Exception {
- InputStream is =
getResourceAsStream("/test-documents/testWORD_2003ml.xml");
- ByteArrayOutputStream bos = new ByteArrayOutputStream();
- IOUtils.copy(is, bos);
- byte[] injected = injectXML(bos.toByteArray(), XXE);
- parse("testWORD_2003ml.xml", new ByteArrayInputStream(injected),
AUTO_DETECT_PARSER,
- new ParseContext());
- is.close();
-
- is = getResourceAsStream("/test-documents/testWORD_2006ml.xml");
- bos = new ByteArrayOutputStream();
- IOUtils.copy(is, bos);
- injected = injectXML(bos.toByteArray(), XXE);
- parse("testWORD_2006ml.xml", new ByteArrayInputStream(injected),
AUTO_DETECT_PARSER,
- new ParseContext());
- }
-
-
- @Test
- public void testPOIOOXMLs() throws Exception {
- for (String fileName : new String[]{"testWORD.docx",
"testWORD_1img.docx",
- "testWORD_2006ml.docx", "testWORD_embedded_pics.docx",
"testWORD_macros.docm",
- "testEXCEL_textbox.xlsx", "testEXCEL_macro.xlsm",
"testEXCEL_phonetic.xlsx",
- "testEXCEL_embeddedPDF_windows.xlsx", "testPPT_2imgs.pptx",
"testPPT_comment.pptx",
- "testPPT_EmbeddedPDF.pptx", "testPPT_macros.pptm"}) {
- _testPOIOOXMLs(fileName);
- }
- }
-
- private void _testPOIOOXMLs(String fileName) throws Exception {
- Path injected = null;
- try (TikaInputStream tis = TikaInputStream
- .get(getResourceAsStream("/test-documents/" + fileName))) {
- Path originalOOXML = tis.getPath();
- injected = injectZippedXMLs(originalOOXML, XXE, false);
-
-
- ContentHandler xhtml = new ToHTMLContentHandler();
- ParseContext parseContext = new ParseContext();
- //if the SafeContentHandler is turned off, this will throw an FNFE
- Metadata metadata = new Metadata();
- try {
- AUTO_DETECT_PARSER
- .parse(Files.newInputStream(injected), xhtml,
metadata, parseContext);
- } catch (TikaException e) {
- Throwable cause = e.getCause();
- if (!(cause instanceof InvalidFormatException)) {
- //as of POI 4.1.x
- fail("POI should have thrown an IFE complaining about " +
- "not being able to read content types part !");
- }
- } finally {
- Files.delete(injected);
- }
-
- try {
- metadata = new Metadata();
- xhtml = new ToHTMLContentHandler();
-
- OfficeParserConfig officeParserConfig = new
OfficeParserConfig();
- parseContext.set(OfficeParserConfig.class, officeParserConfig);
- officeParserConfig.setUseSAXDocxExtractor(true);
- officeParserConfig.setUseSAXPptxExtractor(true);
- injected = injectZippedXMLs(originalOOXML, XXE, true);
-
- AUTO_DETECT_PARSER
- .parse(Files.newInputStream(injected), xhtml,
metadata, parseContext);
- } catch (FileNotFoundException e) {
- e.printStackTrace();
- fail("problem with SAX-based: " + fileName + ": " +
e.getMessage());
- } finally {
- Files.delete(injected);
- }
- }
- }
-
- @Test
- public void testXMLInZips() throws Exception {
- for (String fileName : new String[]{"testEPUB.epub"}) {
- _testXMLInZips(fileName);
- }
- }
-
- private void _testXMLInZips(String fileName) throws Exception {
- Path injected = null;
- try (TikaInputStream tis = TikaInputStream
- .get(getResourceAsStream("/test-documents/" + fileName))) {
- injected = injectZippedXMLs(tis.getPath(), XXE, false);
- }
- Parser p = AUTO_DETECT_PARSER;
- ContentHandler xhtml = new ToHTMLContentHandler();
- ParseContext parseContext = new ParseContext();
- //if the SafeContentHandler is turned off, this will throw an FNFE
- Metadata metadata = new Metadata();
- try {
- p.parse(Files.newInputStream(injected), xhtml, metadata,
parseContext);
- } finally {
- Files.delete(injected);
- }
-
- }
-
-
- @Test
- public void testDOM() throws Exception {
- byte[] bytes = "<?xml version=\"1.0\"
encoding=\"UTF-8\"?><document>blah</document>"
- .getBytes(StandardCharsets.UTF_8);
- byte[] injected = injectXML(bytes, XXE);
- for (int i = 0; i < XMLReaderUtils.getPoolSize() * 2; i++) {
- //this shouldn't throw an exception
- XMLReaderUtils.buildDOM(new ByteArrayInputStream(injected), new
ParseContext());
- }
- }
-
- //use this to confirm that this works
- //by manually turning off the SafeContentHandler in
SXWPFWordExtractorDecorator's
- //handlePart
- public void testDocxWithIncorrectSAXConfiguration() throws Exception {
- Path injected = null;
-
- try (TikaInputStream tis = TikaInputStream
-
.get(getResourceAsStream("/test-documents/testWORD_macros.docm"))) {
- injected = injectZippedXMLs(tis.getPath(), XXE, true);
- }
-
- ContentHandler xhtml = new ToHTMLContentHandler();
- ParseContext parseContext = new ParseContext();
- OfficeParserConfig officeParserConfig = new OfficeParserConfig();
- officeParserConfig.setUseSAXDocxExtractor(true);
- parseContext.set(OfficeParserConfig.class, officeParserConfig);
- parseContext.set(SAXParser.class,
SAXParserFactory.newInstance().newSAXParser());
- //if the SafeContentHandler is turned off, this will throw an FNFE
- try {
- AUTO_DETECT_PARSER
- .parse(Files.newInputStream(injected), xhtml, new
Metadata(), parseContext);
- } finally {
- //Files.delete(injected);
- }
- }
-
- @Test
- public void testDOMTikaConfig() throws Exception {
- //tests the DOM reader in TikaConfig
- //if the safeguards aren't in place, this throws a FNFE
- try (InputStream is = getResourceAsStream(
- "/org/apache/tika/config/TIKA-1558-exclude.xml")) {
- ByteArrayOutputStream bos = new ByteArrayOutputStream();
- IOUtils.copy(is, bos);
- byte[] injected = injectXML(bos.toByteArray(), XXE);
- TikaConfig tikaConfig = new TikaConfig(new
ByteArrayInputStream(injected));
- }
- }
-}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/fork/ForkParserIntegrationTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/fork/ForkParserIntegrationTest.java
index a1ce0f05bd..4f3ce66ddd 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/fork/ForkParserIntegrationTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/fork/ForkParserIntegrationTest.java
@@ -37,13 +37,12 @@ import org.xml.sax.SAXException;
import org.apache.tika.MultiThreadedTikaTest;
import org.apache.tika.Tika;
-import org.apache.tika.config.TikaConfig;
+import org.apache.tika.TikaLoaderHelper;
import org.apache.tika.detect.Detector;
import org.apache.tika.exception.TikaException;
import org.apache.tika.fork.ForkParser;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.EmptyParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
@@ -218,10 +217,9 @@ public class ForkParserIntegrationTest extends
MultiThreadedTikaTest {
if (! new LibPstParser().checkQuietly()) {
return;
}
- TikaConfig tikaConfig = new TikaConfig(
-
ForkParserIntegrationTest.class.getResourceAsStream("/configs/tika-config-lib-pst.xml"));
+ Parser autoDetectParser =
TikaLoaderHelper.getLoader("tika-config-lib-pst.json").loadAutoDetectParser();
try (ForkParser parser = new
ForkParser(ForkParserIntegrationTest.class.getClassLoader(),
- new AutoDetectParser(tikaConfig))) {
+ autoDetectParser)) {
ContentHandler output = new BodyContentHandler();
InputStream stream =
getResourceAsStream("/test-documents/testPST.pst");
ParseContext context = new ParseContext();
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
index 03ff429752..d660a61728 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
@@ -22,6 +22,7 @@ import static org.junit.jupiter.api.Assertions.assertTrue;
import static org.junit.jupiter.api.Assumptions.assumeTrue;
import java.io.IOException;
+import java.nio.file.Paths;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
@@ -34,14 +35,13 @@ import org.junit.jupiter.api.Test;
import org.apache.tika.TikaTest;
import org.apache.tika.config.ConfigContainer;
import org.apache.tika.config.ParseContextConfig;
-import org.apache.tika.config.TikaConfig;
import org.apache.tika.config.loader.PolymorphicObjectMapperFactory;
+import org.apache.tika.config.loader.TikaLoader;
import org.apache.tika.exception.TikaConfigException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.PDF;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.DefaultParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
@@ -101,8 +101,9 @@ public class TesseractOCRParserTest extends TikaTest {
@Disabled("this requires manually moving the default tessdata directory")
@Test
public void testTessdataConfig() throws Exception {
- TikaConfig tikaConfig = new
TikaConfig(getResourceAsStream("tesseract-config.xml"));
- Parser p = new AutoDetectParser(tikaConfig);
+ TikaLoader loader = TikaLoader.load(
+
Paths.get(TesseractOCRParserTest.class.getResource("tesseract-config.json").toURI()));
+ Parser p = loader.loadAutoDetectParser();
List<Metadata> metadataList = getRecursiveMetadata("testOCR.pdf", p);
assertContains("Happy New Year 2003!",
metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT));
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pkg/CompositeZipContainerDetectorTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pkg/CompositeZipContainerDetectorTest.java
index 5fd27ca2c6..66ac671c6c 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pkg/CompositeZipContainerDetectorTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pkg/CompositeZipContainerDetectorTest.java
@@ -33,7 +33,7 @@ import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import org.apache.tika.TikaTest;
-import org.apache.tika.config.TikaConfig;
+import org.apache.tika.config.loader.TikaLoader;
import org.apache.tika.detect.Detector;
import org.apache.tika.detect.zip.DefaultZipContainerDetector;
import org.apache.tika.detect.zip.DeprecatedStreamingZipContainerDetector;
@@ -122,9 +122,9 @@ public class CompositeZipContainerDetectorTest extends
TikaTest {
@Disabled("for offline testing")
@Test
public void timeDetection() throws Exception {
- TikaConfig config = TikaConfig.getDefaultConfig();
- Detector detector = config.getDetector();
- MediaTypeRegistry registry = config.getMediaTypeRegistry();
+ TikaLoader loader = TikaLoader.loadDefault();
+ Detector detector = loader.loadDetectors();
+ MediaTypeRegistry registry = TikaLoader.getMediaTypeRegistry();
List<File> zips = getTestZipBasedFiles(detector, registry);
Set<MediaType> mediaTypeSet = new HashSet<>();
@@ -164,9 +164,9 @@ public class CompositeZipContainerDetectorTest extends
TikaTest {
@Test
@Disabled("to be used for offline timing tests")
public void timeParsing() throws Exception {
- TikaConfig config = TikaConfig.getDefaultConfig();
- Detector detector = config.getDetector();
- MediaTypeRegistry registry = config.getMediaTypeRegistry();
+ TikaLoader loader = TikaLoader.loadDefault();
+ Detector detector = loader.loadDetectors();
+ MediaTypeRegistry registry = TikaLoader.getMediaTypeRegistry();
List<File> zips = getTestZipBasedFiles(detector, registry);
System.out.println("zips size: " + zips.size());
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pkg/GzipParserTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pkg/GzipParserTest.java
index 4490847005..82233b6532 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pkg/GzipParserTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pkg/GzipParserTest.java
@@ -24,10 +24,8 @@ import org.junit.jupiter.api.Test;
import org.xml.sax.ContentHandler;
import org.apache.tika.TikaLoaderHelper;
-import org.apache.tika.config.TikaConfig;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
@@ -88,12 +86,9 @@ public class GzipParserTest extends AbstractPkgTest {
assertEquals(2, getRecursiveMetadata("multiple.gz").size());
//test config
- TikaConfig tikaConfig = null;
- try (InputStream is =
getResourceAsStream("/configs/tika-config-multiple-gz.xml")) {
- tikaConfig = new TikaConfig(is);
- }
+ Parser p =
TikaLoaderHelper.getLoader("tika-config-multiple-gz.json").loadAutoDetectParser();
assertContains("<p>ab</p>",
- getRecursiveMetadata("multiple.gz", new
AutoDetectParser(tikaConfig)).get(1)
+ getRecursiveMetadata("multiple.gz", p).get(1)
.get(TikaCoreProperties.TIKA_CONTENT));
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pkg/UnrarParserTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pkg/UnrarParserTest.java
index 80abcf8a0e..8c2aea0b7f 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pkg/UnrarParserTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pkg/UnrarParserTest.java
@@ -19,15 +19,13 @@ package org.apache.tika.parser.pkg;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assumptions.assumeTrue;
-import java.io.InputStream;
import java.util.List;
import org.junit.jupiter.api.Test;
-import org.apache.tika.config.TikaConfig;
+import org.apache.tika.TikaLoaderHelper;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.external.ExternalParser;
@@ -49,11 +47,7 @@ public class UnrarParserTest extends AbstractPkgTest {
String[] expectedResources = { "testHTML.html", "testEXCEL.xls",
"testOpenOffice2.odt", "testPDF.pdf",
"testPPT.ppt", "testRTF.rtf", "testTXT.txt", "testWORD.doc",
"testXML.xml"};
- TikaConfig tikaConfig = null;
- try (InputStream is = getResourceAsStream("tika-unrar-config.xml")) {
- tikaConfig = new TikaConfig(is);
- }
- Parser p = new AutoDetectParser(tikaConfig);
+ Parser p =
TikaLoaderHelper.getLoader("tika-unrar-config.json").loadAutoDetectParser();
List<Metadata> metadataList =
getRecursiveMetadata("test-documents.rar", p);
assertEquals("org.apache.tika.parser.pkg.UnrarParser",
metadataList.get(0).getValues(TikaCoreProperties.TIKA_PARSED_BY)[1]);
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/TIKA-1708-detector-composite.json
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/TIKA-1708-detector-composite.json
new file mode 100644
index 0000000000..545abecdc5
--- /dev/null
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/TIKA-1708-detector-composite.json
@@ -0,0 +1,13 @@
+{
+ "detectors": [
+ {
+ "poifs-container-detector": {}
+ },
+ {
+ "mime-types": {}
+ }
+ ],
+ "translator": {
+ "class": "default-translator"
+ }
+}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/TIKA-1708-detector-default.json
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/TIKA-1708-detector-default.json
index 2c2e0e676f..4d76bc86a9 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/TIKA-1708-detector-default.json
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/TIKA-1708-detector-default.json
@@ -1,17 +1,14 @@
{
- "parsers": [],
"detectors": [
{
- "default-detector" : {
+ "default-detector": {
"exclude": [
"default-zip-container-detector"
]
}
}
],
- "translator": [
- {
- "default-translator": {}
- }
- ]
+ "translator": {
+ "class": "default-translator"
+ }
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-4424-config.xml
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-4424-config.xml
deleted file mode 100644
index 6baf4b7fbc..0000000000
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-4424-config.xml
+++ /dev/null
@@ -1,26 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<properties>
- <detectors>
- <!-- All detectors except built-in container ones -->
- <detector class="org.apache.tika.detect.DefaultDetector">
- <!-- DefaultZipContainerDetector will identify *.zip files with
KML content as "kmz" files, this is correct behaviour -->
- <detector-exclude
class="org.apache.tika.detect.zip.DefaultZipContainerDetector"/>
- </detector>
- </detectors>
-</properties>
\ No newline at end of file
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-4441-120.xml
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-4441-120.xml
deleted file mode 100644
index 6e9bf35175..0000000000
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-4441-120.xml
+++ /dev/null
@@ -1,36 +0,0 @@
-<?xml version="1.0" encoding="UTF-8" standalone="no"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<properties>
- <detectors>
- <detector class="org.gagravarr.tika.OggDetector"/>
- <detector class="org.apache.tika.detect.apple.BPListDetector"/>
- <detector class="org.apache.tika.detect.gzip.GZipSpecializationDetector"/>
- <detector class="org.apache.tika.detect.microsoft.POIFSContainerDetector">
- <params>
- <param name="markLimit" type="int">120</param>
- </params>
- </detector>
- <detector class="org.apache.tika.detect.ole.MiscOLEDetector"/>
- <detector class="org.apache.tika.detect.zip.DefaultZipContainerDetector">
- <params>
- <param name="markLimit" type="int">16777216</param>
- </params>
- </detector>
- <detector class="org.apache.tika.mime.MimeTypes"/>
- </detectors>
-</properties>
\ No newline at end of file
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-4441-12000000.xml
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-4441-12000000.xml
deleted file mode 100644
index a438b5c63f..0000000000
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-4441-12000000.xml
+++ /dev/null
@@ -1,36 +0,0 @@
-<?xml version="1.0" encoding="UTF-8" standalone="no"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<properties>
- <detectors>
- <detector class="org.gagravarr.tika.OggDetector"/>
- <detector class="org.apache.tika.detect.apple.BPListDetector"/>
- <detector class="org.apache.tika.detect.gzip.GZipSpecializationDetector"/>
- <detector class="org.apache.tika.detect.microsoft.POIFSContainerDetector">
- <params>
- <param name="markLimit" type="int">12000000</param>
- </params>
- </detector>
- <detector class="org.apache.tika.detect.ole.MiscOLEDetector"/>
- <detector class="org.apache.tika.detect.zip.DefaultZipContainerDetector">
- <params>
- <param name="markLimit" type="int">16777216</param>
- </params>
- </detector>
- <detector class="org.apache.tika.mime.MimeTypes"/>
- </detectors>
-</properties>
\ No newline at end of file
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-4441-neg1.xml
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-4441-neg1.xml
deleted file mode 100644
index 74c0112682..0000000000
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-4441-neg1.xml
+++ /dev/null
@@ -1,36 +0,0 @@
-<?xml version="1.0" encoding="UTF-8" standalone="no"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<properties>
- <detectors>
- <detector class="org.gagravarr.tika.OggDetector"/>
- <detector class="org.apache.tika.detect.apple.BPListDetector"/>
- <detector class="org.apache.tika.detect.gzip.GZipSpecializationDetector"/>
- <detector class="org.apache.tika.detect.microsoft.POIFSContainerDetector">
- <params>
- <param name="markLimit" type="int">-1</param>
- </params>
- </detector>
- <detector class="org.apache.tika.detect.ole.MiscOLEDetector"/>
- <detector class="org.apache.tika.detect.zip.DefaultZipContainerDetector">
- <params>
- <param name="markLimit" type="int">16777216</param>
- </params>
- </detector>
- <detector class="org.apache.tika.mime.MimeTypes"/>
- </detectors>
-</properties>
\ No newline at end of file
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-4533.xml
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-4533.xml
deleted file mode 100644
index 83661eca51..0000000000
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-4533.xml
+++ /dev/null
@@ -1,47 +0,0 @@
-<?xml version="1.0" encoding="UTF-8" standalone="no"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<properties>
- <autoDetectParserConfig>
- <params>
- <!-- if the incoming metadata object has a ContentLength entry and it is
larger than this
- value, spool the file to disk; this is useful for some file formats
that are more efficiently
- processed via a file instead of an InputStream -->
- <spoolToDisk>0</spoolToDisk>
- <!-- the next four are parameters for the SecureContentHandler -->
- <!-- threshold used in zip bomb detection. This many characters must be
written
- before the maximum compression ratio is calculated -->
- <outputThreshold>10000</outputThreshold>
- <!-- maximum compression ratio between output characters and input bytes
-->
- <maximumCompressionRatio>100</maximumCompressionRatio>
- <!-- maximum XML element nesting level -->
- <maximumDepth>100</maximumDepth>
- <!-- maximum embedded file depth -->
- <maximumPackageEntryDepth>100</maximumPackageEntryDepth>
- <!-- throw an exception if a file has zero bytes -->
- <throwOnZeroBytes>false</throwOnZeroBytes>
- </params>
- <!-- as of Tika 2.5.x, this is the preferred way to configure digests -->
- <digesterFactory
class="org.apache.tika.parser.digestutils.CommonsDigesterFactory">
- <params>
- <markLimit>100000</markLimit>
- <!-- this specifies SHA256, base32 and MD5 -->
- <algorithmString>sha256</algorithmString>
- </params>
- </digesterFactory>
- </autoDetectParserConfig>
-</properties>
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests-pdf-only.xml
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests-pdf-only.xml
deleted file mode 100644
index 03be973bfc..0000000000
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests-pdf-only.xml
+++ /dev/null
@@ -1,33 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<properties>
- <parsers>
- <parser class="org.apache.tika.parser.DefaultParser">
- <parser-exclude class="org.apache.tika.parser.pdf.PDFParser"/>
- </parser>
- </parsers>
- <autoDetectParserConfig>
- <spoolToDisk>1000000</spoolToDisk>
- <outputThreshold>1000000</outputThreshold>
- <digesterFactory
- class="org.apache.tika.parser.digestutils.CommonsDigesterFactory">
- <markLimit>100000</markLimit>
- <algorithmString>sha256:32,md5</algorithmString>
- </digesterFactory>
- </autoDetectParserConfig>
-</properties>
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests-skip-container.xml
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests-skip-container.xml
deleted file mode 100644
index 22823dc3c7..0000000000
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests-skip-container.xml
+++ /dev/null
@@ -1,33 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<properties>
- <parsers>
- <parser class="org.apache.tika.parser.DefaultParser"/>
- </parsers>
- <autoDetectParserConfig>
- <spoolToDisk>1000000</spoolToDisk>
- <outputThreshold>1000000</outputThreshold>
- <digesterFactory
- class="org.apache.tika.parser.digestutils.CommonsDigesterFactory">
- <markLimit>100000</markLimit>
- <algorithmString>sha256:32,md5</algorithmString>
- <skipContainerDocument>true</skipContainerDocument>
- </digesterFactory>
- <throwOnZeroBytes>false</throwOnZeroBytes>
- </autoDetectParserConfig>
-</properties>
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests.xml
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests.xml
deleted file mode 100644
index c1fbb7b48a..0000000000
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests.xml
+++ /dev/null
@@ -1,32 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<properties>
- <parsers>
- <parser class="org.apache.tika.parser.DefaultParser"/>
- </parsers>
- <autoDetectParserConfig>
- <spoolToDisk>1000000</spoolToDisk>
- <outputThreshold>1000000</outputThreshold>
- <digesterFactory
- class="org.apache.tika.parser.digestutils.CommonsDigesterFactory">
- <markLimit>100000</markLimit>
- <algorithmString>sha256:32,md5</algorithmString>
- </digesterFactory>
- <throwOnZeroBytes>false</throwOnZeroBytes>
- </autoDetectParserConfig>
-</properties>
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-doubling-custom-handler-decorator.xml
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-doubling-custom-handler-decorator.xml
deleted file mode 100644
index 7892f4687a..0000000000
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-doubling-custom-handler-decorator.xml
+++ /dev/null
@@ -1,27 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<properties>
- <parsers>
- <parser class="org.apache.tika.parser.DefaultParser"/>
- </parsers>
- <autoDetectParserConfig>
- <spoolToDisk>123450</spoolToDisk>
- <outputThreshold>678900</outputThreshold>
- <contentHandlerDecoratorFactory class="org.apache.tika.sax.DoublingContentHandlerDecoratorFactory"/>
- </autoDetectParserConfig>
-</properties>
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-geo-point-metadata-filter.xml
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-geo-point-metadata-filter.xml
deleted file mode 100644
index 92942cfe11..0000000000
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-geo-point-metadata-filter.xml
+++ /dev/null
@@ -1,24 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<properties>
- <metadataFilters>
- <metadataFilter class="org.apache.tika.metadata.filter.GeoPointMetadataFilter">
- <geoPointFieldName>myGeoPoint</geoPointFieldName>
- </metadataFilter>
- </metadataFilters>
-</properties>
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-lib-pst.json
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-lib-pst.json
index 7666f82817..1396afc7af 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-lib-pst.json
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-lib-pst.json
@@ -1,16 +1,15 @@
{
"parsers": [
{
- "_name": "default-parser",
- "_decorate": {
- "parserExclude": [
- "org.apache.tika.parser.microsoft.pst.OutlookPSTParser",
- "org.apache.tika.parser.microsoft.pst.PSTMailItemParser"
+ "default-parser": {
+ "exclude": [
+ "outlook-pst-parser",
+ "pst-mail-item-parser"
]
}
},
{
- "_name": "org.apache.tika.parser.microsoft.libpst.LibPstParser"
+ "lib-pst-parser": {}
}
]
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-lib-pst.xml
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-lib-pst.xml
deleted file mode 100644
index df5a431271..0000000000
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-lib-pst.xml
+++ /dev/null
@@ -1,26 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<properties>
- <parsers>
- <parser class="org.apache.tika.parser.DefaultParser">
- <parser-exclude class="org.apache.tika.parser.microsoft.pst.OutlookPSTParser"/>
- <parser-exclude class="org.apache.tika.parser.microsoft.pst.PSTMailItemParser"/>
- </parser>
- <parser class="org.apache.tika.parser.microsoft.libpst.LibPstParser"/>
- </parsers>
-</properties>
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-multiple-gz.json
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-multiple-gz.json
new file mode 100644
index 0000000000..8e4a3465bd
--- /dev/null
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-multiple-gz.json
@@ -0,0 +1,14 @@
+{
+ "parsers": [
+ {
+ "default-parser": {
+ "exclude": ["compressor-parser"]
+ }
+ },
+ {
+ "compressor-parser": {
+ "decompressConcatenated": true
+ }
+ }
+ ]
+}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-multiple-gz.xml
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-multiple-gz.xml
deleted file mode 100644
index 370532af46..0000000000
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-multiple-gz.xml
+++ /dev/null
@@ -1,29 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<properties>
- <parsers>
- <parser class="org.apache.tika.parser.DefaultParser">
- <parser-exclude class="org.apache.tika.parser.pkg.CompressorParser"/>
- </parser>
- <parser class="org.apache.tika.parser.pkg.CompressorParser">
- <params>
- <param name="decompressConcatenated" type="bool">true</param>
- </params>
- </parser>
- </parsers>
-</properties>
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-unrar-config.json
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-unrar-config.json
new file mode 100644
index 0000000000..5511b90b7a
--- /dev/null
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-unrar-config.json
@@ -0,0 +1,12 @@
+{
+ "parsers": [
+ {
+ "default-parser": {
+ "exclude": ["rar-parser"]
+ }
+ },
+ {
+ "unrar-parser": {}
+ }
+ ]
+}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/config/TIKA-1558-exclude.json
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/config/TIKA-1558-exclude.json
index 4c28ead328..10101b8536 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/config/TIKA-1558-exclude.json
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/config/TIKA-1558-exclude.json
@@ -1,23 +1,18 @@
{
"parsers": [
{
- "_name": "default-parser",
- "_decorate": {
- "mimeExclude": [
- "image/jpeg",
- "application/pdf"
- ],
- "parserExclude": [
- "org.apache.tika.parser.executable.ExecutableParser"
- ]
+ "default-parser": {
+ "exclude": ["executable-parser"],
+ "_decorate": {
+ "mimeExclude": ["image/jpeg", "application/pdf"]
+ }
}
},
{
- "_name": "org.apache.tika.parser.EmptyParser",
- "_decorate": {
- "mime": [
- "application/pdf"
- ]
+ "empty-parser": {
+ "_decorate": {
+ "mimeInclude": ["application/pdf"]
+ }
}
}
]
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/config/TIKA-1558-exclude.xml
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/config/TIKA-1558-exclude.xml
deleted file mode 100644
index 6ab400097b..0000000000
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/config/TIKA-1558-exclude.xml
+++ /dev/null
@@ -1,29 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<properties>
- <parsers>
- <parser class="org.apache.tika.parser.DefaultParser">
- <mime-exclude>image/jpeg</mime-exclude>
- <mime-exclude>application/pdf</mime-exclude>
- <parser-exclude class="org.apache.tika.parser.executable.ExecutableParser"/>
- </parser>
- <parser class="org.apache.tika.parser.EmptyParser">
- <mime>application/pdf</mime>
- </parser>
- </parsers>
-</properties>
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/config/TIKA-1558-excludesub.json
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/config/TIKA-1558-excludesub.json
index 98c9ca11cb..aa34ec2fbd 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/config/TIKA-1558-excludesub.json
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/config/TIKA-1558-excludesub.json
@@ -1,11 +1,8 @@
{
"parsers": [
{
- "_name": "default-parser",
- "_decorate": {
- "parserExclude": [
- "org.apache.tika.parser.xml.XMLParser"
- ]
+ "default-parser": {
+ "exclude": ["xml-parser", "dc-xml-parser", "fiction-book-parser"]
}
}
]
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/config/TIKA-1558-excludesub.xml
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/config/TIKA-1558-excludesub.xml
deleted file mode 100644
index d62e592d1a..0000000000
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/config/TIKA-1558-excludesub.xml
+++ /dev/null
@@ -1,24 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<properties>
- <parsers>
- <parser class="org.apache.tika.parser.DefaultParser">
- <parser-exclude class="org.apache.tika.parser.xml.XMLParser"/>
- </parser>
- </parsers>
-</properties>
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/config/TIKA-1702-detector-exclude.xml
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/config/TIKA-1702-detector-exclude.xml
deleted file mode 100644
index e59af56320..0000000000
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/config/TIKA-1702-detector-exclude.xml
+++ /dev/null
@@ -1,31 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<properties>
- <!-- Explicitly request default parsers -->
- <parsers/>
- <detectors>
- <!-- All detectors except built-in container ones -->
- <detector class="org.apache.tika.detect.DefaultDetector">
- <detector-exclude class="org.apache.tika.detect.zip.DefaultZipContainerDetector"/>
- <detector-exclude class="org.apache.tika.detect.microsoft.POIFSContainerDetector"/>
- </detector>
- <!-- One other detector, to check ordering -->
- <detector class="org.apache.tika.detect.EmptyDetector">
- </detector>
- </detectors>
-</properties>
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/config/TIKA-1702-translator-default.json
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/config/TIKA-1702-translator-default.json
index 60e0449a9c..69f20d6784 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/config/TIKA-1702-translator-default.json
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/config/TIKA-1702-translator-default.json
@@ -1,9 +1,5 @@
{
- "parsers": [],
- "detectors": [],
- "translator": [
- {
- "_name": "org.apache.tika.language.translate.DefaultTranslator"
- }
- ]
+ "translator": {
+ "class": "default-translator"
+ }
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/config/TIKA-1702-translator-default.xml
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/config/TIKA-1702-translator-default.xml
deleted file mode 100644
index 975a189aaf..0000000000
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/config/TIKA-1702-translator-default.xml
+++ /dev/null
@@ -1,24 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<properties>
- <!-- Explicitly request default parsers and translators -->
- <parsers/>
- <detectors/>
- <!-- Explicitly request the default Translator -->
- <translator class="org.apache.tika.language.translate.DefaultTranslator"/>
-</properties>
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/config/TIKA-1702-translator-empty-default.json
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/config/TIKA-1702-translator-empty-default.json
index 5999a916fc..4e4b88fcc8 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/config/TIKA-1702-translator-empty-default.json
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/config/TIKA-1702-translator-empty-default.json
@@ -1,10 +1,5 @@
{
- "translator": [
- {
- "_name": "org.apache.tika.language.translate.EmptyTranslator"
- },
- {
- "_name": "org.apache.tika.language.translate.DefaultTranslator"
- }
- ]
+ "translator": {
+ "class": "empty-translator"
+ }
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/config/TIKA-1702-translator-empty-default.xml
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/config/TIKA-1702-translator-empty-default.xml
deleted file mode 100644
index 9a607ce960..0000000000
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/config/TIKA-1702-translator-empty-default.xml
+++ /dev/null
@@ -1,22 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<properties>
- <!-- As Translators don't support Composites, Empty used -->
- <translator class="org.apache.tika.language.translate.EmptyTranslator"/>
- <translator class="org.apache.tika.language.translate.DefaultTranslator"/>
-</properties>
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/config/TIKA-1702-translator-empty.json
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/config/TIKA-1702-translator-empty.json
index 682cff81d9..4e4b88fcc8 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/config/TIKA-1702-translator-empty.json
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/config/TIKA-1702-translator-empty.json
@@ -1,7 +1,5 @@
{
- "translator": [
- {
- "_name": "org.apache.tika.language.translate.EmptyTranslator"
- }
- ]
+ "translator": {
+ "class": "empty-translator"
+ }
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/config/TIKA-1702-translator-empty.xml
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/config/TIKA-1702-translator-empty.xml
deleted file mode 100644
index 06afe91084..0000000000
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/config/TIKA-1702-translator-empty.xml
+++ /dev/null
@@ -1,20 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<properties>
- <translator class="org.apache.tika.language.translate.EmptyTranslator"/>
-</properties>
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/config/TIKA-1708-detector-composite.json
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/config/TIKA-1708-detector-composite.json
index facb053f4e..545abecdc5 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/config/TIKA-1708-detector-composite.json
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/config/TIKA-1708-detector-composite.json
@@ -1,16 +1,13 @@
{
- "parsers": [],
"detectors": [
{
- "_name": "org.apache.tika.detect.microsoft.POIFSContainerDetector"
+ "poifs-container-detector": {}
},
{
- "_name": "org.apache.tika.mime.MimeTypes"
+ "mime-types": {}
}
],
- "translator": [
- {
- "_name": "org.apache.tika.language.translate.DefaultTranslator"
- }
- ]
+ "translator": {
+ "class": "default-translator"
+ }
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/config/TIKA-1708-detector-composite.xml
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/config/TIKA-1708-detector-composite.xml
deleted file mode 100644
index 48901e640e..0000000000
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/config/TIKA-1708-detector-composite.xml
+++ /dev/null
@@ -1,25 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<properties>
- <parsers/>
- <detectors>
- <detector class="org.apache.tika.detect.microsoft.POIFSContainerDetector"/>
- <detector class="org.apache.tika.mime.MimeTypes"/>
- </detectors>
- <translator class="org.apache.tika.language.translate.DefaultTranslator"/>
-</properties>
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/config/TIKA-2273-encoding-detector-outside-static-init.xml
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/config/TIKA-2273-encoding-detector-outside-static-init.xml
deleted file mode 100644
index 6f70448aec..0000000000
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/config/TIKA-2273-encoding-detector-outside-static-init.xml
+++ /dev/null
@@ -1,34 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<properties>
- <!-- exclude TXTParser from Default, add it as if custom
- and confirm that correct charset detector was added -->
- <parsers>
- <parser class="org.apache.tika.parser.DefaultParser">
- <parser-exclude class="org.apache.tika.parser.txt.TXTParser"/>
- </parser>
- <parser class="org.apache.tika.parser.txt.TXTParser">
- </parser>
- </parsers>
- <encodingDetectors>
- <!-- All detectors except Icu4jEncodingDetector-->
- <encodingDetector class="org.apache.tika.detect.DefaultEncodingDetector">
- <encodingDetector-exclude class="org.apache.tika.parser.txt.Icu4jEncodingDetector"/>
- </encodingDetector>
- </encodingDetectors>
-</properties>
\ No newline at end of file
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/config/TIKA-2273-exclude-encoding-detector-default.xml
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/config/TIKA-2273-exclude-encoding-detector-default.xml
deleted file mode 100644
index c4373f9cce..0000000000
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/config/TIKA-2273-exclude-encoding-detector-default.xml
+++ /dev/null
@@ -1,29 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<properties>
- <!-- Explicitly request default parsers -->
- <parsers/>
- <encodingDetectors>
- <!-- All detectors except HtmlEncodingDetector -->
- <encodingDetector
class="org.apache.tika.detect.DefaultEncodingDetector">
- <encodingDetector-exclude
class="org.apache.tika.parser.html.HtmlEncodingDetector"/>
- </encodingDetector>
- <!-- One other detector, to check ordering -->
- <encodingDetector
class="org.apache.tika.detect.OverrideEncodingDetector"/>
- </encodingDetectors>
-</properties>
\ No newline at end of file
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/config/TIKA-2273-no-icu4j-encoding-detector.xml
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/config/TIKA-2273-no-icu4j-encoding-detector.xml
deleted file mode 100644
index a0ec343c9a..0000000000
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/config/TIKA-2273-no-icu4j-encoding-detector.xml
+++ /dev/null
@@ -1,27 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<properties>
- <!-- Explicitly request default parsers -->
- <parsers/>
- <encodingDetectors>
- <!-- All detectors except Icu4jEncodingDetector-->
- <encodingDetector
class="org.apache.tika.detect.DefaultEncodingDetector">
- <encodingDetector-exclude
class="org.apache.tika.parser.txt.Icu4jEncodingDetector"/>
- </encodingDetector>
- </encodingDetectors>
-</properties>
\ No newline at end of file
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/config/TIKA-2273-non-detecting-params-bad-charset.xml
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/config/TIKA-2273-non-detecting-params-bad-charset.xml
deleted file mode 100644
index 66db57d144..0000000000
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/config/TIKA-2273-non-detecting-params-bad-charset.xml
+++ /dev/null
@@ -1,29 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<properties>
- <!-- Explicitly request default parsers -->
- <parsers/>
- <encodingDetectors>
- <!-- One other detector, to check ordering -->
- <encodingDetector
class="org.apache.tika.detect.OverrideEncodingDetector">
- <params>
- <param name="charset" type="string">wtf8</param>
- </params>
- </encodingDetector>
- </encodingDetectors>
-</properties>
\ No newline at end of file
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/config/TIKA-2273-non-detecting-params.xml
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/config/TIKA-2273-non-detecting-params.xml
deleted file mode 100644
index 264a23ba56..0000000000
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/config/TIKA-2273-non-detecting-params.xml
+++ /dev/null
@@ -1,29 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<properties>
- <!-- Explicitly request default parsers -->
- <parsers/>
- <encodingDetectors>
- <!-- One other detector, to check ordering -->
- <encodingDetector
class="org.apache.tika.detect.OverrideEncodingDetector">
- <params>
- <param name="charset" type="string">UTF-16LE</param>
- </params>
- </encodingDetector>
- </encodingDetectors>
-</properties>
\ No newline at end of file
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/config/TIKA-2273-parameterize-encoding-detector.xml
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/config/TIKA-2273-parameterize-encoding-detector.xml
deleted file mode 100644
index 76cea13882..0000000000
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/config/TIKA-2273-parameterize-encoding-detector.xml
+++ /dev/null
@@ -1,30 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<properties>
- <!-- Explicitly request default parsers -->
- <parsers/>
- <encodingDetectors>
- <!-- One other detector, to check ordering -->
- <encodingDetector
class="org.apache.tika.parser.txt.Icu4jEncodingDetector">
- <params>
- <param name="stripMarkup" type="bool">true</param>
- </params>
- </encodingDetector>
- <encodingDetector
class="org.apache.tika.detect.OverrideEncodingDetector"/>
- </encodingDetectors>
-</properties>
\ No newline at end of file
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/config/TIKA-2485-encoding-detector-mark-limits.xml
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/config/TIKA-2485-encoding-detector-mark-limits.xml
deleted file mode 100644
index 7ff326d2eb..0000000000
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/config/TIKA-2485-encoding-detector-mark-limits.xml
+++ /dev/null
@@ -1,38 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<properties>
- <!-- Explicitly request default parsers -->
- <parsers/>
- <encodingDetectors>
- <encodingDetector
class="org.apache.tika.parser.html.HtmlEncodingDetector">
- <params>
- <param name="markLimit" type="int">64000</param>
- </params>
- </encodingDetector>
- <encodingDetector
class="org.apache.tika.parser.txt.UniversalEncodingDetector">
- <params>
- <param name="markLimit" type="int">64001</param>
- </params>
- </encodingDetector>
- <encodingDetector
class="org.apache.tika.parser.txt.Icu4jEncodingDetector">
- <params>
- <param name="markLimit" type="int">64002</param>
- </params>
- </encodingDetector>
- </encodingDetectors>
-</properties>
\ No newline at end of file
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/parser/TIKA-3137-include.xml
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/parser/TIKA-3137-include.xml
deleted file mode 100644
index cff7e8f961..0000000000
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/parser/TIKA-3137-include.xml
+++ /dev/null
@@ -1,34 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<properties>
- <metadataFilters>
- <metadataFilter
class="org.apache.tika.metadata.filter.IncludeFieldMetadataFilter">
- <include>
- <field>X-TIKA:content</field>
- <field>extended-properties:Application</field>
- <field>Content-Type</field>
- </include>
- </metadataFilter>
- <metadataFilter
class="org.apache.tika.metadata.filter.ClearByMimeMetadataFilter">
- <mimes>
- <mime>image/emf</mime>
- <mime>text/plain</mime>
- </mimes>
- </metadataFilter>
- </metadataFilters>
-</properties>
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/parser/ocr/tesseract-config.json
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/parser/ocr/tesseract-config.json
index b4ee3da5ec..672584b483 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/parser/ocr/tesseract-config.json
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/parser/ocr/tesseract-config.json
@@ -1,17 +1,15 @@
{
"parsers": [
{
- "_name": "default-parser",
- "_decorate": {
- "parserExclude": [
- "org.apache.tika.parser.ocr.TesseractOCRParser"
- ]
+ "default-parser": {
+ "exclude": ["tesseract-ocr-parser"]
}
},
{
- "_name": "org.apache.tika.parser.ocr.TesseractOCRParser",
- "tesseractPath": "C:\\Program Files\\Tesseract OCR",
- "tessdataPath": "C:\\Program Files\\Tesseract OCR\\tessdata"
+ "tesseract-ocr-parser": {
+ "tesseractPath": "C:\\Program Files\\Tesseract OCR",
+ "tessdataPath": "C:\\Program Files\\Tesseract OCR\\tessdata"
+ }
}
]
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/parser/ocr/tesseract-config.xml
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/parser/ocr/tesseract-config.xml
deleted file mode 100644
index cd80503288..0000000000
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/parser/ocr/tesseract-config.xml
+++ /dev/null
@@ -1,32 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<properties>
- <parsers>
- <parser class="org.apache.tika.parser.DefaultParser">
- <parser-exclude class="org.apache.tika.parser.ocr.TesseractOCRParser"/>
- </parser>
- <parser class="org.apache.tika.parser.ocr.TesseractOCRParser">
- <params>
- <!-- Note that I manually changed the directory to "Tesseract OCR"
from "Tesseract-OCR"
- on my local machine for this test -->
- <param name="tesseractPath" type="string">C:\Program Files\Tesseract
OCR</param>
- <param name="tessdataPath" type="string">C:\Program Files\Tesseract
OCR\tessdata</param>
- </params>
- </parser>
- </parsers>
-</properties>
diff --git
a/tika-serialization/src/main/java/org/apache/tika/config/loader/DetectorLoader.java
b/tika-serialization/src/main/java/org/apache/tika/config/loader/DetectorLoader.java
index 3f19de4f13..79b0840abd 100644
---
a/tika-serialization/src/main/java/org/apache/tika/config/loader/DetectorLoader.java
+++
b/tika-serialization/src/main/java/org/apache/tika/config/loader/DetectorLoader.java
@@ -171,6 +171,13 @@ public class DetectorLoader {
ComponentRegistry registry)
throws TikaConfigException {
try {
+ // Special case: mime-types requires the initialized registry from TikaLoader
+ // The no-arg constructor creates an empty MimeTypes without the XML-loaded types
+ if ("mime-types".equals(name)) {
+ LOG.debug("Using TikaLoader.getMimeTypes() for mime-types detector");
+ return TikaLoader.getMimeTypes();
+ }
+
// Get detector class - try component name first, then FQCN fallback
Class<?> detectorClass;
try {
diff --git
a/tika-serialization/src/main/java/org/apache/tika/config/loader/ParserLoader.java
b/tika-serialization/src/main/java/org/apache/tika/config/loader/ParserLoader.java
index 96a8698e7b..95f0dc6166 100644
---
a/tika-serialization/src/main/java/org/apache/tika/config/loader/ParserLoader.java
+++
b/tika-serialization/src/main/java/org/apache/tika/config/loader/ParserLoader.java
@@ -100,9 +100,10 @@ public class ParserLoader {
ComponentRegistry registry = new ComponentRegistry("parsers",
classLoader);
List<Map.Entry<String, JsonNode>> parsers =
config.getArrayComponents("parsers");
- // Check if "default-parser" is in the list and extract exclusions
+ // Check if "default-parser" is in the list and extract exclusions and decorations
boolean hasDefaultParser = false;
Set<Class<?>> excludedParserClasses = new HashSet<>();
+ FrameworkConfig.ParserDecoration defaultParserDecoration = null;
for (Map.Entry<String, JsonNode> entry : parsers) {
if ("default-parser".equals(entry.getKey())) {
@@ -148,6 +149,16 @@ public class ParserLoader {
}
}
}
+
+ // Extract decoration (mimeInclude/mimeExclude) for default-parser
+ if (configNode != null) {
+ try {
+ FrameworkConfig frameworkConfig = FrameworkConfig.extract(configNode, objectMapper);
+ defaultParserDecoration = frameworkConfig.getDecoration();
+ } catch (Exception e) {
+ LOG.warn("Failed to extract decoration from default-parser: {}", e.getMessage());
+ }
+ }
break;
}
}
@@ -202,7 +213,16 @@ public class ParserLoader {
// If "default-parser" is NOT present, only load explicitly configured parsers
if (hasDefaultParser) {
List<Parser> spiParsers = loadSpiParsers(configuredParserClasses);
- parserList.addAll(spiParsers);
+
+ // Apply decoration to SPI parsers if specified on default-parser
+ if (defaultParserDecoration != null && defaultParserDecoration.hasFiltering()) {
+ // Wrap SPI parsers in a CompositeParser and apply decoration
+ Parser spiComposite = new CompositeParser(TikaLoader.getMediaTypeRegistry(), spiParsers);
+ spiComposite = applyMimeFiltering(spiComposite, defaultParserDecoration);
+ parserList.add(spiComposite);
+ } else {
+ parserList.addAll(spiParsers);
+ }
LOG.debug("Loading SPI parsers because 'default-parser' is in config");
} else {
LOG.debug("Skipping SPI parsers - 'default-parser' not in config");