Author: tallison Date: Fri Sep 19 14:02:16 2014 New Revision: 1626222 URL: http://svn.apache.org/r1626222 Log: TIKA-1418 add files
Added: tika/trunk/tika-app/src/test/resources/test-data/bad_xml.xml tika/trunk/tika-app/src/test/resources/test-data/tika-config1.xml tika/trunk/tika-example/src/main/java/org/apache/tika/example/DumpTikaConfigExample.java tika/trunk/tika-example/src/test/java/org/apache/tika/example/DumpTikaConfigExampleTest.java Added: tika/trunk/tika-app/src/test/resources/test-data/bad_xml.xml URL: http://svn.apache.org/viewvc/tika/trunk/tika-app/src/test/resources/test-data/bad_xml.xml?rev=1626222&view=auto ============================================================================== --- tika/trunk/tika-app/src/test/resources/test-data/bad_xml.xml (added) +++ tika/trunk/tika-app/src/test/resources/test-data/bad_xml.xml Fri Sep 19 14:02:16 2014 @@ -0,0 +1,6 @@ +<?xml version="1.0" encoding="UTF-8"?> +<grocery_list> + <item>apple</item> + <item>orange</item> + <item>pear<item> +</grocery_list> \ No newline at end of file Added: tika/trunk/tika-app/src/test/resources/test-data/tika-config1.xml URL: http://svn.apache.org/viewvc/tika/trunk/tika-app/src/test/resources/test-data/tika-config1.xml?rev=1626222&view=auto ============================================================================== --- tika/trunk/tika-app/src/test/resources/test-data/tika-config1.xml (added) +++ tika/trunk/tika-app/src/test/resources/test-data/tika-config1.xml Fri Sep 19 14:02:16 2014 @@ -0,0 +1,14 @@ +<?xml version="1.0" encoding="UTF-8" standalone="no"?> +<properties> + <parsers> + + <parser class="org.apache.tika.parser.html.HtmlParser"> + <mime>application/vnd.wap.xhtml+xml</mime> + <mime>application/x-asp</mime> + <mime>application/xhtml+xml</mime> + <mime>text/html</mime> + <mime>application/xml</mime> + <mime>text/xml</mime> + </parser> + </parsers> +</properties> Added: tika/trunk/tika-example/src/main/java/org/apache/tika/example/DumpTikaConfigExample.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-example/src/main/java/org/apache/tika/example/DumpTikaConfigExample.java?rev=1626222&view=auto ============================================================================== --- tika/trunk/tika-example/src/main/java/org/apache/tika/example/DumpTikaConfigExample.java (added) +++ tika/trunk/tika-example/src/main/java/org/apache/tika/example/DumpTikaConfigExample.java Fri Sep 19 14:02:16 2014 @@ -0,0 +1,212 @@ +package org.apache.tika.example; +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.tika.config.TikaConfig; +import org.apache.tika.detect.DefaultDetector; +import org.apache.tika.detect.Detector; +import org.apache.tika.exception.TikaException; +import org.apache.tika.language.translate.DefaultTranslator; +import org.apache.tika.language.translate.Translator; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.CompositeParser; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.Parser; +import org.w3c.dom.Document; +import org.w3c.dom.Element; +import org.w3c.dom.Node; + +import javax.xml.parsers.DocumentBuilder; +import javax.xml.parsers.DocumentBuilderFactory; +import javax.xml.transform.OutputKeys; +import javax.xml.transform.Transformer; +import javax.xml.transform.TransformerFactory; +import javax.xml.transform.dom.DOMSource; +import javax.xml.transform.stream.StreamResult; +import java.io.File; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.OutputStreamWriter; +import java.io.StringWriter; +import java.io.Writer; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.TreeMap; +import java.util.TreeSet; + + +/** + * This class shows how to dump a TikaConfig object to a configuration file. + * This allows users to easily dump the default TikaConfig as a base from which + * to start if they want to modify the default configuration file. + * <p> + * For those who want to modify the mimes file, take a look at + * tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml + * for inspiration. Consider adding org/apache/tika/mime/custom-mimetypes.xml + * for your custom mime types. + */ +public class DumpTikaConfigExample { + + /** + * + * @param config config file to dump + * @param writer writer to which to write + * @throws Exception + */ + public void dump(TikaConfig config, Writer writer, String encoding) throws Exception { + DocumentBuilderFactory docFactory = DocumentBuilderFactory.newInstance(); + DocumentBuilder docBuilder = docFactory.newDocumentBuilder(); + // root elements + Document doc = docBuilder.newDocument(); + Element rootElement = doc.createElement("properties"); + + doc.appendChild(rootElement); + addMimeComment(rootElement, doc); + addTranslator(rootElement, doc, config); + addDetectors(rootElement, doc, config); + addParsers(rootElement, doc, config); + + + //now write + TransformerFactory transformerFactory = TransformerFactory.newInstance(); + Transformer transformer = transformerFactory.newTransformer(); + transformer.setOutputProperty(OutputKeys.INDENT, "yes"); + transformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "2"); + transformer.setOutputProperty(OutputKeys.ENCODING, encoding); + DOMSource source = new DOMSource(doc); + StreamResult result = new StreamResult(writer); + + transformer.transform(source, result); + } + + private void addTranslator(Element rootElement, Document doc, TikaConfig config) { + //TikaConfig only reads the first translator from the list, + //but it looks like it expects a list + Translator translator = config.getTranslator(); + if (translator instanceof DefaultTranslator) { + Node mimeComment = doc.createComment( + "for example: "+ + "<translator class=\"org.apache.tika.language.translate.GoogleTranslator\"/>"); + rootElement.appendChild(mimeComment); + } else { + Element translatorElement = doc.createElement("translator"); + translatorElement.setAttribute("class", translator.getClass().getCanonicalName()); + rootElement.appendChild(translatorElement); + } + } + + private void addMimeComment(Element rootElement, Document doc) { + Node mimeComment = doc.createComment( + "for example: <mimeTypeRepository resource=\"/org/apache/tika/mime/tika-mimetypes.xml\"/>"); + rootElement.appendChild(mimeComment); + } + + private void addDetectors(Element rootElement, Document doc, TikaConfig config) throws Exception { + Detector detector = config.getDetector(); + Element detectorsElement = doc.createElement("detectors"); + + if (detector instanceof DefaultDetector) { + List<Detector> children = ((DefaultDetector)detector).getDetectors(); + for (Detector d : children) { + Element detectorElement = doc.createElement("detector"); + detectorElement.setAttribute("class", d.getClass().getCanonicalName()); + detectorsElement.appendChild(detectorElement); + } + } + rootElement.appendChild(detectorsElement); + } + + private void addParsers(Element rootElement, Document doc, TikaConfig config) throws Exception { + Map<String, Parser> parsers = getConcreteParsers(config.getParser()); + + Element parsersElement = doc.createElement("parsers"); + rootElement.appendChild(parsersElement); + + ParseContext context = new ParseContext(); + for (Map.Entry<String, Parser> e : parsers.entrySet()) { + Element parserElement = doc.createElement("parser"); + Parser child = e.getValue(); + String className = e.getKey(); + parserElement.setAttribute("class", className); + Set<MediaType> types = new TreeSet<MediaType>(); + types.addAll(child.getSupportedTypes(context)); + for (MediaType type : types){ + Element mimeElement = doc.createElement("mime"); + mimeElement.appendChild(doc.createTextNode(type.toString())); + parserElement.appendChild(mimeElement); + } + parsersElement.appendChild(parserElement); + } + rootElement.appendChild(parsersElement); + + } + + private Map<String, Parser> getConcreteParsers(Parser parentParser)throws TikaException, IOException { + Map<String, Parser> parsers = new TreeMap<String, Parser>(); + if (parentParser instanceof CompositeParser) { + addParsers((CompositeParser)parentParser, parsers); + } else { + addParser(parentParser, parsers); + } + return parsers; + } + + private void addParsers(CompositeParser p, Map<String, Parser> parsers) { + for (Parser child : p.getParsers().values()) { + System.out.println(child.getClass().getName()); + if (child instanceof CompositeParser) { + addParsers((CompositeParser)child, parsers); + } else { + addParser(child, parsers); + } + } + } + + private void addParser(Parser p, Map<String, Parser> parsers) { + parsers.put(p.getClass().getCanonicalName(), p); + } + + /** + * + * @param args outputFile, outputEncoding, if args is empty, this prints to console + * @throws Exception + */ + public static void main(String[] args) throws Exception { + + String encoding = "UTF-8"; + Writer writer = null; + if (args.length > 0) { + writer = new OutputStreamWriter(new FileOutputStream(new File(args[0]))); + } else { + writer = new StringWriter(); + } + + if (args.length > 1) { + encoding = args[1]; + } + DumpTikaConfigExample ex = new DumpTikaConfigExample(); + ex.dump(TikaConfig.getDefaultConfig(), writer, encoding); + + writer.flush(); + + if (writer instanceof StringWriter) { + System.out.println(writer.toString()); + } + writer.close(); + } +} Added: tika/trunk/tika-example/src/test/java/org/apache/tika/example/DumpTikaConfigExampleTest.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-example/src/test/java/org/apache/tika/example/DumpTikaConfigExampleTest.java?rev=1626222&view=auto ============================================================================== --- tika/trunk/tika-example/src/test/java/org/apache/tika/example/DumpTikaConfigExampleTest.java (added) +++ tika/trunk/tika-example/src/test/java/org/apache/tika/example/DumpTikaConfigExampleTest.java Fri Sep 19 14:02:16 2014 @@ -0,0 +1,83 @@ +package org.apache.tika.example; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +import org.apache.tika.config.TikaConfig; +import org.apache.tika.detect.CompositeDetector; +import org.apache.tika.parser.AutoDetectParser; +import org.apache.tika.parser.CompositeParser; +import org.apache.tika.parser.Parser; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + +import java.io.File; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.OutputStreamWriter; +import java.io.Writer; + +import static junit.framework.TestCase.assertEquals; +import static junit.framework.TestCase.assertTrue; + +public class DumpTikaConfigExampleTest { + private File configFile; + @Before + public void setUp() { + try { + configFile = File.createTempFile("tmp", ".xml"); + } catch (IOException e) { + throw new RuntimeException("Failed to create tmp file"); + } + } + + @After + public void tearDown() { + if (configFile != null && configFile.exists()) { + configFile.delete(); + } + if (configFile != null && configFile.exists()) { + throw new RuntimeException("Failed to clean up: "+configFile.getAbsolutePath()); + } + } + + @Test + public void testDump() throws Exception { + DumpTikaConfigExample ex = new DumpTikaConfigExample(); + for (String encoding : new String[]{ "UTF-8", "UTF-16LE"}) { + Writer writer = new OutputStreamWriter(new FileOutputStream(configFile), encoding); + ex.dump(TikaConfig.getDefaultConfig(), writer, encoding); + writer.flush(); + writer.close(); + + TikaConfig c = new TikaConfig(configFile); + assertEquals(CompositeParser.class, c.getParser().getClass()); + assertEquals(CompositeDetector.class, c.getDetector().getClass()); + + CompositeParser p = (CompositeParser) c.getParser(); + assertTrue("enough parsers?", p.getParsers().size() > 130); + + CompositeDetector d = (CompositeDetector) c.getDetector(); + assertTrue("enough detectors?", d.getDetectors().size() > 3); + //just try to load it into autodetect to make sure no errors are thrown + Parser auto = new AutoDetectParser(c); + } + } + +}