Repository: tika Updated Branches: refs/heads/master 9056894da -> 5a3410715
TIKA-1657 move xmlification of TikaConfig to tika-core. Thank you, Nick! Project: http://git-wip-us.apache.org/repos/asf/tika/repo Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/3aa1dca4 Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/3aa1dca4 Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/3aa1dca4 Branch: refs/heads/master Commit: 3aa1dca4eef13de99b83989010fe02bfd391b378 Parents: 9056894 Author: tballison <[email protected]> Authored: Wed Mar 2 09:18:46 2016 -0500 Committer: tballison <[email protected]> Committed: Wed Mar 2 09:18:46 2016 -0500 ---------------------------------------------------------------------- .../main/java/org/apache/tika/cli/TikaCLI.java | 26 +- .../java/org/apache/tika/cli/TikaCLITest.java | 50 +++- .../test/resources/test-data/tika-config2.xml | 14 + .../tika/config/TikaConfigSerializer.java | 256 +++++++++++++++++++ .../tika/config/TikaConfigSerializerTest.java | 60 +++++ .../tika/example/DumpTikaConfigExample.java | 233 +---------------- .../tika/example/DumpTikaConfigExampleTest.java | 6 +- 7 files changed, 413 insertions(+), 232 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/tika/blob/3aa1dca4/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java ---------------------------------------------------------------------- diff --git a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java index 50f3463..4458526 100644 --- a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java +++ b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java @@ -73,6 +73,7 @@ import org.apache.poi.poifs.filesystem.POIFSFileSystem; import org.apache.tika.Tika; import org.apache.tika.batch.BatchProcessDriverCLI; import org.apache.tika.config.TikaConfig; +import org.apache.tika.config.TikaConfigSerializer; import org.apache.tika.detect.CompositeDetector; import org.apache.tika.detect.DefaultDetector; import org.apache.tika.detect.Detector; @@ -326,6 +327,8 @@ public class TikaCLI { private Parser parser; + private TikaConfig config; + private String configFilePath; private OutputType type = XML; @@ -405,6 +408,15 @@ public class TikaCLI { } else if (arg.startsWith("--compare-file-magic=")) { pipeMode = false; compareFileMagic(arg.substring(arg.indexOf('=')+1)); + } else if (arg.equals("--dump-minimal-config")) { + pipeMode = false; + dumpConfig(TikaConfigSerializer.Mode.MINIMAL); + } else if (arg.equals("--dump-current-config")) { + pipeMode = false; + dumpConfig(TikaConfigSerializer.Mode.CURRENT); + } else if (arg.equals("--dump-static-config")) { + pipeMode = false; + dumpConfig(TikaConfigSerializer.Mode.STATIC); } else if (arg.equals("--container-aware") || arg.equals("--container-aware-detector")) { // ignore, as container-aware detectors are now always used @@ -497,6 +509,13 @@ public class TikaCLI { } } + private void dumpConfig(TikaConfigSerializer.Mode mode) throws Exception { + TikaConfig localConfig = (config == null) ? TikaConfig.getDefaultConfig() : config; + + TikaConfigSerializer.serialize(localConfig, mode, + new OutputStreamWriter(System.out, UTF_8), UTF_8); + } + private void handleRecursiveJson(URL url, OutputStream output) throws IOException, SAXException, TikaException { Metadata metadata = new Metadata(); RecursiveParserWrapper wrapper = new RecursiveParserWrapper(parser, getContentHandlerFactory(type)); @@ -541,7 +560,10 @@ public class TikaCLI { out.println(" -f or --fork Use Fork Mode for out-of-process extraction"); out.println(); out.println(" --config=<tika-config.xml>"); - out.println(" TikaConfig file. Must be specified before -g, -s or -f!"); + out.println(" TikaConfig file. Must be specified before -g, -s, -f or the dump-x-config !"); + out.println(" --dump-minimal-config Print minimal TikaConfig"); + out.println(" --dump-current-config Print current TikaConfig"); + out.println(" --dump-static-config Print static config"); out.println(""); out.println(" -x or --xml Output XHTML content (default)"); out.println(" -h or --html Output HTML content"); @@ -673,7 +695,7 @@ public class TikaCLI { private void configure(String configFilePath) throws Exception { this.configFilePath = configFilePath; - TikaConfig config = new TikaConfig(new File(configFilePath)); + config = new TikaConfig(new File(configFilePath)); parser = new AutoDetectParser(config); if (digester != null) { parser = new DigestingParser(parser, digester); http://git-wip-us.apache.org/repos/asf/tika/blob/3aa1dca4/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java ---------------------------------------------------------------------- diff --git a/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java b/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java index f9d5a5d..9fc8ee8 100644 --- a/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java +++ b/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java @@ -377,7 +377,6 @@ public class TikaCLITest { " \"Character-Count-With-Spaces\": \"31\",")); assertTrue(content.contains("\"X-TIKA:embedded_resource_path\": \"/embed1.zip\"")); assertFalse(content.contains("X-TIKA:content")); - } @Test @@ -406,4 +405,53 @@ public class TikaCLITest { assertTrue(content.contains("\"X-TIKA:digest:MD5\": \"f9627095ef86c482e61d99f0cc1cf87d\"")); } + @Test + public void testConfigSerializationStaticAndCurrent() throws Exception { + String[] params = new String[]{"--dump-static-config"}; + TikaCLI.main(params); + String content = outContent.toString(UTF_8.name()); + //make sure at least one detector is there + assertTrue(content.contains("<detector class=\"org.apache.tika.parser.microsoft.POIFSContainerDetector\"/>")); + //make sure Executable is there because follow on tests of custom config + //test that it has been turned off. + assertTrue(content.contains("<parser class=\"org.apache.tika.parser.executable.ExecutableParser\"/>")); + + params = new String[]{"--dump-current-config"}; + TikaCLI.main(params); + content = outContent.toString(UTF_8.name()); + //make sure at least one detector is there + assertTrue(content.contains("<detector class=\"org.apache.tika.parser.microsoft.POIFSContainerDetector\"/>")); + //and at least one parser + assertTrue(content.contains("<parser class=\"org.apache.tika.parser.executable.ExecutableParser\"/>")); + } + + @Test + public void testConfigSerializationCustomMinimal() throws Exception { + String[] params = new String[]{ + "--config=" + testDataFile.toString() + "/tika-config2.xml", + "--dump-minimal-config"}; + TikaCLI.main(params); + String content = outContent.toString(UTF_8.name()).replaceAll("[\r\n\t ]+", " "); + + String expected = + "<parser class=\"org.apache.tika.parser.DefaultParser\">" + + " <mime-exclude>application/pdf</mime-exclude>" + + " <mime-exclude>image/jpeg</mime-exclude> " + + "</parser> " + + "<parser class=\"org.apache.tika.parser.EmptyParser\">" + + " <mime>application/pdf</mime> " + + "</parser>"; + assertTrue(content.contains(expected)); + } + + @Test + public void testConfigSerializationCustomStatic() throws Exception { + String[] params = new String[]{ + "--config=" + testDataFile.toString() + "/tika-config2.xml", "--dump-static-config"}; + TikaCLI.main(params); + String content = outContent.toString(UTF_8.name()); + assertFalse(content.contains("org.apache.tika.parser.executable.Executable")); + } + + } http://git-wip-us.apache.org/repos/asf/tika/blob/3aa1dca4/tika-app/src/test/resources/test-data/tika-config2.xml ---------------------------------------------------------------------- diff --git a/tika-app/src/test/resources/test-data/tika-config2.xml b/tika-app/src/test/resources/test-data/tika-config2.xml new file mode 100644 index 0000000..3a511ed --- /dev/null +++ b/tika-app/src/test/resources/test-data/tika-config2.xml @@ -0,0 +1,14 @@ +<properties> + <parsers> + <parser class="org.apache.tika.parser.DefaultParser"> + <mime-exclude>image/jpeg</mime-exclude> + <mime-exclude>application/pdf</mime-exclude> + <parser-exclude class="org.apache.tika.parser.executable.ExecutableParser"/> + <parser-exclu class="org.apache.tika.parser.executable.ExecutableParser2"/> + </parser> + <parser class="org.apache.tika.parser.EmptyParser"> + <mime>application/pdf</mime> + <no-mime>hello/world</no-mime> + </parser> + </parsers> +</properties> \ No newline at end of file http://git-wip-us.apache.org/repos/asf/tika/blob/3aa1dca4/tika-core/src/main/java/org/apache/tika/config/TikaConfigSerializer.java ---------------------------------------------------------------------- diff --git a/tika-core/src/main/java/org/apache/tika/config/TikaConfigSerializer.java b/tika-core/src/main/java/org/apache/tika/config/TikaConfigSerializer.java new file mode 100644 index 0000000..3c19cfd --- /dev/null +++ b/tika-core/src/main/java/org/apache/tika/config/TikaConfigSerializer.java @@ -0,0 +1,256 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.config; + +import javax.xml.parsers.DocumentBuilder; +import javax.xml.parsers.DocumentBuilderFactory; +import javax.xml.transform.OutputKeys; +import javax.xml.transform.Transformer; +import javax.xml.transform.TransformerFactory; +import javax.xml.transform.dom.DOMSource; +import javax.xml.transform.stream.StreamResult; +import java.io.Writer; +import java.nio.charset.Charset; +import java.util.Collections; +import java.util.List; +import java.util.Set; +import java.util.TreeSet; + +import org.apache.tika.detect.CompositeDetector; +import org.apache.tika.detect.DefaultDetector; +import org.apache.tika.detect.Detector; +import org.apache.tika.language.translate.DefaultTranslator; +import org.apache.tika.language.translate.Translator; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.CompositeParser; +import org.apache.tika.parser.DefaultParser; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.Parser; +import org.apache.tika.parser.ParserDecorator; +import org.w3c.dom.Document; +import org.w3c.dom.Element; +import org.w3c.dom.Node; + +public class TikaConfigSerializer { + + public enum Mode { + MINIMAL, CURRENT, STATIC; + } + + /** + * + * @param config config to serialize + * @param mode serialization mode + * @param writer writer + * @param charset charset + * @throws Exception + */ + public static void serialize(TikaConfig config, Mode mode, Writer writer, Charset charset) + throws Exception { + DocumentBuilderFactory docFactory = DocumentBuilderFactory.newInstance(); + DocumentBuilder docBuilder = docFactory.newDocumentBuilder(); + + // root elements + Document doc = docBuilder.newDocument(); + Element rootElement = doc.createElement("properties"); + + doc.appendChild(rootElement); + addMimeComment(mode, rootElement, doc); + addServiceLoader(mode, rootElement, doc, config); + addExecutorService(mode, rootElement, doc, config); + addTranslator(mode, rootElement, doc, config); + addDetectors(mode, rootElement, doc, config); + addParsers(mode, rootElement, doc, config); + // TODO Service Loader section + + // now write + TransformerFactory transformerFactory = TransformerFactory.newInstance(); + Transformer transformer = transformerFactory.newTransformer(); + transformer.setOutputProperty(OutputKeys.INDENT, "yes"); + transformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "2"); + transformer.setOutputProperty(OutputKeys.ENCODING, charset.name()); + DOMSource source = new DOMSource(doc); + StreamResult result = new StreamResult(writer); + + transformer.transform(source, result); + } + + private static void addExecutorService(Mode mode, Element rootElement, Document doc, TikaConfig config) { + //TODO + } + + private static void addServiceLoader(Mode mode, Element rootElement, Document doc, TikaConfig config) { + ServiceLoader loader = config.getServiceLoader(); + + if (mode == Mode.MINIMAL) { + // Is this the default? + if (loader.isDynamic() && loader.getLoadErrorHandler() == LoadErrorHandler.IGNORE) { + // Default config, no need to output anything + return; + } + } + + Element dslEl = doc.createElement("service-loader"); + dslEl.setAttribute("dynamic", Boolean.toString(loader.isDynamic())); + dslEl.setAttribute("loadErrorHandler", loader.getLoadErrorHandler().toString()); + rootElement.appendChild(dslEl); + } + + private static void addTranslator(Mode mode, Element rootElement, Document doc, TikaConfig config) { + // Unlike the other entries, TikaConfig only wants one of + // these, and no outer <translators> list + Translator translator = config.getTranslator(); + if (mode == Mode.MINIMAL && translator instanceof DefaultTranslator) { + Node mimeComment = doc.createComment( + "for example: <translator class=\"org.apache.tika.language.translate.GoogleTranslator\"/>"); + rootElement.appendChild(mimeComment); + } else { + if (translator instanceof DefaultTranslator && mode == Mode.STATIC) { + translator = ((DefaultTranslator)translator).getTranslator(); + } + if (translator != null) { + Element translatorElement = doc.createElement("translator"); + translatorElement.setAttribute("class", translator.getClass().getCanonicalName()); + rootElement.appendChild(translatorElement); + } else { + rootElement.appendChild(doc.createComment("No translators available")); + } + } + } + + private static void addMimeComment(Mode mode, Element rootElement, Document doc) { + Node mimeComment = doc.createComment( + "for example: <mimeTypeRepository resource=\"/org/apache/tika/mime/tika-mimetypes.xml\"/>"); + rootElement.appendChild(mimeComment); + } + + private static void addDetectors(Mode mode, Element rootElement, Document doc, TikaConfig config) throws Exception { + Detector detector = config.getDetector(); + + if (mode == Mode.MINIMAL && detector instanceof DefaultDetector) { + // Don't output anything, all using defaults + Node detComment = doc.createComment( + "for example: <detectors><detector class=\"org.apache.tika.detector.MimeTypes\"></detectors>"); + rootElement.appendChild(detComment); + return; + } + + Element detectorsElement = doc.createElement("detectors"); + if (mode == Mode.CURRENT && detector instanceof DefaultDetector || + ! (detector instanceof CompositeDetector)) { + Element detectorElement = doc.createElement("detector"); + detectorElement.setAttribute("class", detector.getClass().getCanonicalName()); + detectorsElement.appendChild(detectorElement); + } else { + List<Detector> children = ((CompositeDetector)detector).getDetectors(); + for (Detector d : children) { + Element detectorElement = doc.createElement("detector"); + detectorElement.setAttribute("class", d.getClass().getCanonicalName()); + detectorsElement.appendChild(detectorElement); + } + } + rootElement.appendChild(detectorsElement); + } + + private static void addParsers(Mode mode, Element rootElement, Document doc, TikaConfig config) throws Exception { + Parser parser = config.getParser(); + if (mode == Mode.MINIMAL && parser instanceof DefaultParser) { + // Don't output anything, all using defaults + return; + } else if (mode == Mode.MINIMAL) { + mode = Mode.CURRENT; + } + + Element parsersElement = doc.createElement("parsers"); + rootElement.appendChild(parsersElement); + + addParser(mode, parsersElement, doc, parser); + } + + private static void addParser(Mode mode, Element rootElement, Document doc, Parser parser) throws Exception { + // If the parser is decorated, is it a kind where we output the parser inside? + ParserDecorator decoration = null; + if (parser instanceof ParserDecorator) { + if (parser.getClass().getName().startsWith(ParserDecorator.class.getName()+"$")) { + decoration = ((ParserDecorator)parser); + parser = decoration.getWrappedParser(); + } + } + + boolean outputParser = true; + List<Parser> children = Collections.emptyList(); + if (mode == Mode.CURRENT && parser instanceof DefaultParser) { + // Only output the parser, not the children + } else if (parser instanceof CompositeParser) { + children = ((CompositeParser)parser).getAllComponentParsers(); + // Special case for a naked composite + if (parser.getClass().equals(CompositeParser.class)) { + outputParser = false; + } + // Special case for making Default to static + if (mode == Mode.STATIC && parser instanceof DefaultParser) { + outputParser = false; + } + } + + if (outputParser) { + rootElement = addParser(rootElement, doc, parser, decoration); + } + for (Parser childParser : children) { + addParser(mode, rootElement, doc, childParser); + } + // TODO Parser Exclusions + } + + private static Element addParser(Element rootElement, Document doc, Parser parser, ParserDecorator decorator) throws Exception { + ParseContext context = new ParseContext(); + + Set<MediaType> addedTypes = new TreeSet<>(); + Set<MediaType> excludedTypes = new TreeSet<>(); + if (decorator != null) { + Set<MediaType> types = new TreeSet<>(); + types.addAll(decorator.getSupportedTypes(context)); + addedTypes.addAll(types); + + for (MediaType type : parser.getSupportedTypes(context)) { + if (! types.contains(type)) { + excludedTypes.add(type); + } + addedTypes.remove(type); + } + } + + String className = parser.getClass().getCanonicalName(); + Element parserElement = doc.createElement("parser"); + parserElement.setAttribute("class", className); + rootElement.appendChild(parserElement); + + for (MediaType type : addedTypes) { + Element mimeElement = doc.createElement("mime"); + mimeElement.appendChild(doc.createTextNode(type.toString())); + parserElement.appendChild(mimeElement); + } + for (MediaType type : excludedTypes) { + Element mimeElement = doc.createElement("mime-exclude"); + mimeElement.appendChild(doc.createTextNode(type.toString())); + parserElement.appendChild(mimeElement); + } + + return parserElement; + } + +} http://git-wip-us.apache.org/repos/asf/tika/blob/3aa1dca4/tika-core/src/test/java/org/apache/tika/config/TikaConfigSerializerTest.java ---------------------------------------------------------------------- diff --git a/tika-core/src/test/java/org/apache/tika/config/TikaConfigSerializerTest.java b/tika-core/src/test/java/org/apache/tika/config/TikaConfigSerializerTest.java new file mode 100644 index 0000000..01a30eb --- /dev/null +++ b/tika-core/src/test/java/org/apache/tika/config/TikaConfigSerializerTest.java @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.tika.config; + + +import java.io.StringWriter; +import java.nio.charset.StandardCharsets; + +import org.junit.Ignore; +import org.junit.Test; + +public class TikaConfigSerializerTest extends TikaConfigTest { + + /** + * TIKA-1445 It should be possible to exclude DefaultParser from + * certain types, so another parser explicitly listed will take them + */ + @Test + public void defaultParserWithExcludes() throws Exception { + String xml = loadAndSerialize("TIKA-1445-default-except.xml", + TikaConfigSerializer.Mode.STATIC); + assertContains( + "<parser class=\"org.apache.tika.parser.ErrorParser\">" + + " <mime>fail/world</mime> " + + "</parser>", xml); + } + + @Test + @Ignore("TODO: executor-service info needs to be stored in TikaConfig for serialization") + public void testExecutors() throws Exception { + String xml = loadAndSerialize("TIKA-1762-executors.xml", + TikaConfigSerializer.Mode.STATIC); + assertContains("<executor-service class=\"org.apache.tika.config.DummyExecutor\">" + + " <core-threads>3</core-threads>" + + " <max-threads>10</max-threads>" + + "</executor-service>", xml); + } + + String loadAndSerialize(String configFile, TikaConfigSerializer.Mode mode) throws Exception { + TikaConfig config = getConfig(configFile); + StringWriter writer = new StringWriter(); + TikaConfigSerializer.serialize(config, mode, writer, StandardCharsets.UTF_8); + return writer.toString().replaceAll("[\r\n\t ]+", " "); + } +} http://git-wip-us.apache.org/repos/asf/tika/blob/3aa1dca4/tika-example/src/main/java/org/apache/tika/example/DumpTikaConfigExample.java ---------------------------------------------------------------------- diff --git a/tika-example/src/main/java/org/apache/tika/example/DumpTikaConfigExample.java b/tika-example/src/main/java/org/apache/tika/example/DumpTikaConfigExample.java index 0c51634..b312032 100644 --- a/tika-example/src/main/java/org/apache/tika/example/DumpTikaConfigExample.java +++ b/tika-example/src/main/java/org/apache/tika/example/DumpTikaConfigExample.java @@ -24,36 +24,9 @@ import java.io.OutputStreamWriter; import java.io.StringWriter; import java.io.Writer; import java.nio.charset.Charset; -import java.util.Collections; -import java.util.List; -import java.util.Set; -import java.util.TreeSet; -import javax.xml.parsers.DocumentBuilder; -import javax.xml.parsers.DocumentBuilderFactory; -import javax.xml.transform.OutputKeys; -import javax.xml.transform.Transformer; -import javax.xml.transform.TransformerFactory; -import javax.xml.transform.dom.DOMSource; -import javax.xml.transform.stream.StreamResult; - -import org.apache.tika.config.LoadErrorHandler; -import org.apache.tika.config.ServiceLoader; import org.apache.tika.config.TikaConfig; -import org.apache.tika.detect.CompositeDetector; -import org.apache.tika.detect.DefaultDetector; -import org.apache.tika.detect.Detector; -import org.apache.tika.language.translate.DefaultTranslator; -import org.apache.tika.language.translate.Translator; -import org.apache.tika.mime.MediaType; -import org.apache.tika.parser.CompositeParser; -import org.apache.tika.parser.DefaultParser; -import org.apache.tika.parser.ParseContext; -import org.apache.tika.parser.Parser; -import org.apache.tika.parser.ParserDecorator; -import org.w3c.dom.Document; -import org.w3c.dom.Element; -import org.w3c.dom.Node; +import org.apache.tika.config.TikaConfigSerializer; /** @@ -67,214 +40,24 @@ import org.w3c.dom.Node; * for your custom mime types. */ public class DumpTikaConfigExample { - /** - * @param config config file to dump - * @param writer writer to which to write - * @throws Exception - */ - public void dump(TikaConfig config, Mode mode, Writer writer, String encoding) throws Exception { - DocumentBuilderFactory docFactory = DocumentBuilderFactory.newInstance(); - DocumentBuilder docBuilder = docFactory.newDocumentBuilder(); - - // root elements - Document doc = docBuilder.newDocument(); - Element rootElement = doc.createElement("properties"); - - doc.appendChild(rootElement); - addMimeComment(mode, rootElement, doc); - addServiceLoader(mode, rootElement, doc, config); - addTranslator(mode, rootElement, doc, config); - addDetectors(mode, rootElement, doc, config); - addParsers(mode, rootElement, doc, config); - // TODO Service Loader section - // now write - TransformerFactory transformerFactory = TransformerFactory.newInstance(); - Transformer transformer = transformerFactory.newTransformer(); - transformer.setOutputProperty(OutputKeys.INDENT, "yes"); - transformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "2"); - transformer.setOutputProperty(OutputKeys.ENCODING, encoding); - DOMSource source = new DOMSource(doc); - StreamResult result = new StreamResult(writer); - - transformer.transform(source, result); - } - - private void addServiceLoader(Mode mode, Element rootElement, Document doc, TikaConfig config) { - ServiceLoader loader = config.getServiceLoader(); - - if (mode == Mode.MINIMAL) { - // Is this the default? - if (loader.isDynamic() && loader.getLoadErrorHandler() == LoadErrorHandler.IGNORE) { - // Default config, no need to output anything - return; - } - } - - Element dslEl = doc.createElement("service-loader"); - dslEl.setAttribute("dynamic", Boolean.toString(loader.isDynamic())); - dslEl.setAttribute("loadErrorHandler", loader.getLoadErrorHandler().toString()); - rootElement.appendChild(dslEl); - } - - private void addTranslator(Mode mode, Element rootElement, Document doc, TikaConfig config) { - // Unlike the other entries, TikaConfig only wants one of - // these, and no outer <translators> list - Translator translator = config.getTranslator(); - if (mode == Mode.MINIMAL && translator instanceof DefaultTranslator) { - Node mimeComment = doc.createComment( - "for example: <translator class=\"org.apache.tika.language.translate.GoogleTranslator\"/>"); - rootElement.appendChild(mimeComment); - } else { - if (translator instanceof DefaultTranslator && mode == Mode.STATIC) { - translator = ((DefaultTranslator)translator).getTranslator(); - } - if (translator != null) { - Element translatorElement = doc.createElement("translator"); - translatorElement.setAttribute("class", translator.getClass().getCanonicalName()); - rootElement.appendChild(translatorElement); - } else { - rootElement.appendChild(doc.createComment("No translators available")); - } - } - } - - private void addMimeComment(Mode mode, Element rootElement, Document doc) { - Node mimeComment = doc.createComment( - "for example: <mimeTypeRepository resource=\"/org/apache/tika/mime/tika-mimetypes.xml\"/>"); - rootElement.appendChild(mimeComment); - } - - private void addDetectors(Mode mode, Element rootElement, Document doc, TikaConfig config) throws Exception { - Detector detector = config.getDetector(); - - if (mode == Mode.MINIMAL && detector instanceof DefaultDetector) { - // Don't output anything, all using defaults - Node detComment = doc.createComment( - "for example: <detectors><detector class=\"org.apache.tika.detector.MimeTypes\"></detectors>"); - rootElement.appendChild(detComment); - return; - } - - Element detectorsElement = doc.createElement("detectors"); - if (mode == Mode.CURRENT && detector instanceof DefaultDetector || - ! (detector instanceof CompositeDetector)) { - Element detectorElement = doc.createElement("detector"); - detectorElement.setAttribute("class", detector.getClass().getCanonicalName()); - detectorsElement.appendChild(detectorElement); - } else { - List<Detector> children = ((CompositeDetector)detector).getDetectors(); - for (Detector d : children) { - Element detectorElement = doc.createElement("detector"); - detectorElement.setAttribute("class", d.getClass().getCanonicalName()); - detectorsElement.appendChild(detectorElement); - } - } - rootElement.appendChild(detectorsElement); - } - - private void addParsers(Mode mode, Element rootElement, Document doc, TikaConfig config) throws Exception { - Parser parser = config.getParser(); - if (mode == Mode.MINIMAL && parser instanceof DefaultParser) { - // Don't output anything, all using defaults - return; - } else if (mode == Mode.MINIMAL) { - mode = Mode.CURRENT; - } - - Element parsersElement = doc.createElement("parsers"); - rootElement.appendChild(parsersElement); - - addParser(mode, parsersElement, doc, parser); - } - private void addParser(Mode mode, Element rootElement, Document doc, Parser parser) throws Exception { - // If the parser is decorated, is it a kind where we output the parser inside? - ParserDecorator decoration = null; - if (parser instanceof ParserDecorator) { - if (parser.getClass().getName().startsWith(ParserDecorator.class.getName()+"$")) { - decoration = ((ParserDecorator)parser); - parser = decoration.getWrappedParser(); - } - } - - boolean outputParser = true; - List<Parser> children = Collections.emptyList(); - if (mode == Mode.CURRENT && parser instanceof DefaultParser) { - // Only output the parser, not the children - } else if (parser instanceof CompositeParser) { - children = ((CompositeParser)parser).getAllComponentParsers(); - // Special case for a naked composite - if (parser.getClass().equals(CompositeParser.class)) { - outputParser = false; - } - // Special case for making Default to static - if (mode == Mode.STATIC && parser instanceof DefaultParser) { - outputParser = false; - } - } - - if (outputParser) { - rootElement = addParser(rootElement, doc, parser, decoration); - } - for (Parser childParser : children) { - addParser(mode, rootElement, doc, childParser); - } - // TODO Parser Exclusions - } - private Element addParser(Element rootElement, Document doc, Parser parser, ParserDecorator decorator) throws Exception { - ParseContext context = new ParseContext(); - - Set<MediaType> addedTypes = new TreeSet<>(); - Set<MediaType> excludedTypes = new TreeSet<>(); - if (decorator != null) { - Set<MediaType> types = new TreeSet<>(); - types.addAll(decorator.getSupportedTypes(context)); - addedTypes.addAll(types); - - for (MediaType type : parser.getSupportedTypes(context)) { - if (! types.contains(type)) { - excludedTypes.add(type); - } - addedTypes.remove(type); - } - } - - String className = parser.getClass().getCanonicalName(); - Element parserElement = doc.createElement("parser"); - parserElement.setAttribute("class", className); - rootElement.appendChild(parserElement); - - for (MediaType type : addedTypes) { - Element mimeElement = doc.createElement("mime"); - mimeElement.appendChild(doc.createTextNode(type.toString())); - parserElement.appendChild(mimeElement); - } - for (MediaType type : excludedTypes) { - Element mimeElement = doc.createElement("mime-exclude"); - mimeElement.appendChild(doc.createTextNode(type.toString())); - parserElement.appendChild(mimeElement); - } - - return parserElement; - } - /** * @param args outputFile, outputEncoding, if args is empty, this prints to console * @throws Exception */ public static void main(String[] args) throws Exception { Charset encoding = UTF_8; - Mode mode = Mode.CURRENT; + TikaConfigSerializer.Mode mode = TikaConfigSerializer.Mode.CURRENT; String filename = null; for (String arg : args) { if (arg.startsWith("-")) { if (arg.contains("-dump-minimal")) { - mode = Mode.MINIMAL; + mode = TikaConfigSerializer.Mode.MINIMAL; } else if (arg.contains("-dump-current")) { - mode = Mode.CURRENT; + mode = TikaConfigSerializer.Mode.CURRENT; } else if (arg.contains("-dump-static")) { - mode = Mode.STATIC; + mode = TikaConfigSerializer.Mode.STATIC; } else { System.out.println("Use:"); System.out.println(" DumpTikaConfig [--dump-minimal] [--dump-current] [--dump-static] [filename] [encoding]"); @@ -299,7 +82,7 @@ public class DumpTikaConfigExample { } DumpTikaConfigExample ex = new DumpTikaConfigExample(); - ex.dump(TikaConfig.getDefaultConfig(), mode, writer, encoding.name()); + TikaConfigSerializer.serialize(TikaConfig.getDefaultConfig(), mode, writer, encoding); writer.flush(); @@ -308,7 +91,5 @@ public class DumpTikaConfigExample { } writer.close(); } - protected enum Mode { - MINIMAL, CURRENT, STATIC; - } + } http://git-wip-us.apache.org/repos/asf/tika/blob/3aa1dca4/tika-example/src/test/java/org/apache/tika/example/DumpTikaConfigExampleTest.java ---------------------------------------------------------------------- diff --git a/tika-example/src/test/java/org/apache/tika/example/DumpTikaConfigExampleTest.java b/tika-example/src/test/java/org/apache/tika/example/DumpTikaConfigExampleTest.java index 29acfab..3f40600 100644 --- a/tika-example/src/test/java/org/apache/tika/example/DumpTikaConfigExampleTest.java +++ b/tika-example/src/test/java/org/apache/tika/example/DumpTikaConfigExampleTest.java @@ -30,8 +30,8 @@ import java.io.Writer; import java.nio.charset.Charset; import org.apache.tika.config.TikaConfig; +import org.apache.tika.config.TikaConfigSerializer; import org.apache.tika.detect.CompositeDetector; -import org.apache.tika.example.DumpTikaConfigExample.Mode; import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.parser.CompositeParser; import org.apache.tika.parser.Parser; @@ -64,9 +64,9 @@ public class DumpTikaConfigExampleTest { public void testDump() throws Exception { DumpTikaConfigExample ex = new DumpTikaConfigExample(); for (Charset charset : new Charset[]{UTF_8, UTF_16LE}) { - for (Mode mode : Mode.values()) { + for (TikaConfigSerializer.Mode mode : TikaConfigSerializer.Mode.values()) { Writer writer = new OutputStreamWriter(new FileOutputStream(configFile), charset); - ex.dump(TikaConfig.getDefaultConfig(), mode, writer, charset.name()); + TikaConfigSerializer.serialize(TikaConfig.getDefaultConfig(), mode, writer, charset); writer.flush(); writer.close();
