Author: nick Date: Sat Aug 1 17:02:26 2015 New Revision: 1693733 URL: http://svn.apache.org/r1693733 Log: TIKA-1702 Move the parser and detector creation logic to the config loader classes
Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java?rev=1693733&r1=1693732&r2=1693733&view=diff ============================================================================== --- tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java (original) +++ tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java Sat Aug 1 17:02:26 2015 @@ -129,7 +129,7 @@ public class TikaConfig { DetectorXmlLoader detectorLoader = new DetectorXmlLoader(); this.mimeTypes = typesFromDomElement(element); - this.detector = detectorFromDomElement(element, mimeTypes, loader); + this.detector = detectorLoader.loadOverall(element, mimeTypes, loader); this.parser = parserLoader.loadOverall(element, mimeTypes, loader); this.translator = translatorFromDomElement(element, loader); } @@ -213,8 +213,7 @@ public class TikaConfig { this.mimeTypes = typesFromDomElement(element); this.parser = parserLoader.loadOverall(element, mimeTypes, loader); - this.detector = - detectorFromDomElement(element, mimeTypes, loader); + this.detector = detectorLoader.loadOverall(element, mimeTypes, loader); this.translator = translatorFromDomElement(element, loader); } catch (SAXException e) { throw new TikaException( @@ -358,137 +357,6 @@ public class TikaConfig { return getDefaultMimeTypes(null); } } - -// private static CompositeParser parserFromDomElement( -// Element element, MimeTypes mimeTypes, ServiceLoader loader) -// throws TikaException, IOException { -// List<Parser> parsers = new ArrayList<Parser>(); -// -// // Find the parser children of the parsers tag, if any -// for (Element pe : getTopLevelElementChildren(element, "parsers", "parser")) { -// parsers.add(parserFromParserDomElement(pe, mimeTypes, loader)); -// } -// -// if (parsers.isEmpty()) { -// // No parsers defined, create a DefaultParser -// return getDefaultParser(mimeTypes, loader); -// } else if (parsers.size() == 1 && parsers.get(0) instanceof CompositeParser) { -// // Single Composite defined, use that -// return (CompositeParser)parsers.get(0); -// } else { -// // Wrap the defined parsers up in a Composite -// MediaTypeRegistry registry = mimeTypes.getMediaTypeRegistry(); -// return new CompositeParser(registry, parsers); -// } -// } - private static Parser parserFromParserDomElement( - Element parserNode, MimeTypes mimeTypes, ServiceLoader loader) - throws TikaException, IOException { - String name = parserNode.getAttribute("class"); - Parser parser = null; - - try { - Class<? extends Parser> parserClass = - loader.getServiceClass(Parser.class, name); - // https://issues.apache.org/jira/browse/TIKA-866 - if (AutoDetectParser.class.isAssignableFrom(parserClass)) { - throw new TikaException( - "AutoDetectParser not supported in a <parser>" - + " configuration element: " + name); - } - - // Is this a composite or decorated parser? If so, support recursion - if (CompositeParser.class.isAssignableFrom(parserClass) || - ParserDecorator.class.isAssignableFrom(parserClass)) { - - // Get the child parsers for it - List<Parser> childParsers = new ArrayList<Parser>(); - NodeList childParserNodes = parserNode.getElementsByTagName("parser"); - if (childParserNodes.getLength() > 0) { - for (int i = 0; i < childParserNodes.getLength(); i++) { - childParsers.add(parserFromParserDomElement( - (Element)childParserNodes.item(i), mimeTypes, loader - )); - } - } - - // Get the list of parsers to exclude - Set<Class<? extends Parser>> excludeParsers = new HashSet<Class<? extends Parser>>(); - NodeList excludeParserNodes = parserNode.getElementsByTagName("parser-exclude"); - if (excludeParserNodes.getLength() > 0) { - for (int i = 0; i < excludeParserNodes.getLength(); i++) { - Element excl = (Element)excludeParserNodes.item(i); - String exclName = excl.getAttribute("class"); - excludeParsers.add(loader.getServiceClass(Parser.class, exclName)); - } - } - - // Create the Composite Parser - Constructor<? extends Parser> c = null; - MediaTypeRegistry registry = mimeTypes.getMediaTypeRegistry(); - if (parser == null) { - try { - c = parserClass.getConstructor(MediaTypeRegistry.class, ServiceLoader.class, Collection.class); - parser = c.newInstance(registry, loader, excludeParsers); - } - catch (NoSuchMethodException me) {} - } - if (parser == null) { - try { - c = parserClass.getConstructor(MediaTypeRegistry.class, List.class, Collection.class); - parser = c.newInstance(registry, childParsers, excludeParsers); - } catch (NoSuchMethodException me) {} - } - // Create as a Parser Decorator - if (parser == null && ParserDecorator.class.isAssignableFrom(parserClass)) { - try { - CompositeParser cp = null; - if (childParsers.size() == 1 && excludeParsers.size() == 0 && - childParsers.get(0) instanceof CompositeParser) { - cp = (CompositeParser)childParsers.get(0); - } else { - cp = new CompositeParser(registry, childParsers, excludeParsers); - } - c = parserClass.getConstructor(Parser.class); - parser = c.newInstance(cp); - } catch (NoSuchMethodException me) {} - } - // Default constructor - if (parser == null) { - parser = parserClass.newInstance(); - } - } else { - // Regular parser, create as-is - parser = parserClass.newInstance(); - } - - // Is there an explicit list of mime types for this to handle? - Set<MediaType> parserTypes = mediaTypesListFromDomElement(parserNode, "mime"); - if (! parserTypes.isEmpty()) { - parser = ParserDecorator.withTypes(parser, parserTypes); - } - // Is there an explicit list of mime types this shouldn't handle? - Set<MediaType> parserExclTypes = mediaTypesListFromDomElement(parserNode, "mime-exclude"); - if (! parserExclTypes.isEmpty()) { - parser = ParserDecorator.withoutTypes(parser, parserExclTypes); - } - - // All done with setup - return parser; - } catch (ClassNotFoundException e) { - throw new TikaException( - "Unable to find a parser class: " + name, e); - } catch (IllegalAccessException e) { - throw new TikaException( - "Unable to access a parser class: " + name, e); - } catch (InvocationTargetException e) { - throw new TikaException( - "Unable to create a parser class: " + name, e); - } catch (InstantiationException e) { - throw new TikaException( - "Unable to instantiate a parser class: " + name, e); - } - } private static Set<MediaType> mediaTypesListFromDomElement( Element node, String tag) @@ -516,49 +384,6 @@ public class TikaConfig { return Collections.emptySet(); } - private static CompositeDetector detectorFromDomElement( - Element element, MimeTypes mimeTypes, ServiceLoader loader) - throws TikaException, IOException { - List<Detector> detectors = new ArrayList<Detector>(); - - // Find the detector children of the detectors tag, if any - for (Element de : getTopLevelElementChildren(element, "detectors", "detector")) { - detectors.add(detectorFromDetectorDomElement(de, mimeTypes, loader)); - } - - if (detectors.isEmpty()) { - // No detectors defined, create a DefaultDetector - return getDefaultDetector(mimeTypes, loader); - } else if (detectors.size() == 1 && detectors.get(0) instanceof CompositeDetector) { - // Single Composite defined, use that - return (CompositeDetector)detectors.get(0); - } else { - // Wrap the defined detectors up in a Composite - MediaTypeRegistry registry = mimeTypes.getMediaTypeRegistry(); - return new CompositeDetector(registry, detectors); - } - } - private static Detector detectorFromDetectorDomElement( - Element detectorNode, MimeTypes mimeTypes, ServiceLoader loader) - throws TikaException, IOException { - String name = detectorNode.getAttribute("class"); - - try { - Class<? extends Detector> detectorClass = - loader.getServiceClass(Detector.class, name); - return detectorClass.newInstance(); - } catch (ClassNotFoundException e) { - throw new TikaException( - "Unable to find a detector class: " + name, e); - } catch (IllegalAccessException e) { - throw new TikaException( - "Unable to access a detector class: " + name, e); - } catch (InstantiationException e) { - throw new TikaException( - "Unable to instantiate a detector class: " + name, e); - } - } - private static Translator translatorFromDomElement( Element element, ServiceLoader loader) throws TikaException, IOException { @@ -593,9 +418,17 @@ public class TikaConfig { private static abstract class XmlLoader<CT,T> { abstract String getParentTagName(); // eg parsers abstract String getLoaderTagName(); // eg parser + abstract Class<? extends T> getLoaderClass(); // Generics workaround abstract boolean isComposite(T loaded); + abstract boolean isComposite(Class<? extends T> loadedClass); abstract CT createDefault(MimeTypes mimeTypes, ServiceLoader loader); abstract CT createComposite(List<T> loaded, MimeTypes mimeTypes, ServiceLoader loader); + abstract T createComposite(Class<? extends T> compositeClass, + List<T> children, Set<Class<? extends T>> excludeChildren, + MimeTypes mimeTypes, ServiceLoader loader) + throws InvocationTargetException, IllegalAccessException, InstantiationException; + abstract T decorate(T created, Element element) + throws IOException, TikaException; // eg explicit mime types @SuppressWarnings("unchecked") CT loadOverall(Element element, MimeTypes mimeTypes, @@ -621,22 +454,100 @@ public class TikaConfig { // Wrap the defined parsers/detectors up in a Composite return createComposite(loaded, mimeTypes, loader); } - T loadOne(Element element, MimeTypes mimeTypes, - ServiceLoader loader) throws TikaException, IOException { - // TODO Do this properly - // TODO This is a cheat for parsers only! - return (T)parserFromParserDomElement(element, mimeTypes, loader); - } + T loadOne(Element element, MimeTypes mimeTypes, ServiceLoader loader) + throws TikaException, IOException { + String name = element.getAttribute("class"); + T loaded = null; + + try { + Class<? extends T> loadedClass = + loader.getServiceClass(getLoaderClass(), name); + + // Check for classes which can't be set in config + if (AutoDetectParser.class.isAssignableFrom(loadedClass)) { + // https://issues.apache.org/jira/browse/TIKA-866 + throw new TikaException( + "AutoDetectParser not supported in a <parser>" + + " configuration element: " + name); + } + + // Is this a composite or decorated class? If so, support recursion + if (isComposite(loadedClass)) { + // Get the child objects for it + List<T> children = new ArrayList<T>(); + NodeList childNodes = element.getElementsByTagName(getLoaderTagName()); + if (childNodes.getLength() > 0) { + for (int i = 0; i < childNodes.getLength(); i++) { + children.add(loadOne( + (Element)childNodes.item(i), mimeTypes, loader + )); + } + } + + // Get the list of children to exclude + Set<Class<? extends T>> excludeChildren = new HashSet<Class<? extends T>>(); + NodeList excludeChildNodes = element.getElementsByTagName(getLoaderTagName()+"-exclude"); + if (excludeChildNodes.getLength() > 0) { + for (int i = 0; i < excludeChildNodes.getLength(); i++) { + Element excl = (Element)excludeChildNodes.item(i); + String exclName = excl.getAttribute("class"); + excludeChildren.add(loader.getServiceClass(getLoaderClass(), exclName)); + } + } + + // Create the Composite + loaded = createComposite(loadedClass, children, excludeChildren, mimeTypes, loader); + + // Default constructor fallback + if (loaded == null) { + loaded = loadedClass.newInstance(); + } + } else { + // Regular class, create as-is + // TODO Support arguments, needed for Translators etc + loaded = loadedClass.newInstance(); + } + + // Have any decoration performed, eg explicit mimetypes + loaded = decorate(loaded, element); + + // All done with setup + return loaded; + } catch (ClassNotFoundException e) { + throw new TikaException( + "Unable to find a "+getLoaderTagName()+" class: " + name, e); + } catch (IllegalAccessException e) { + throw new TikaException( + "Unable to access a "+getLoaderTagName()+" class: " + name, e); + } catch (InvocationTargetException e) { + throw new TikaException( + "Unable to create a "+getLoaderTagName()+" class: " + name, e); + } catch (InstantiationException e) { + throw new TikaException( + "Unable to instantiate a "+getLoaderTagName()+" class: " + name, e); + } } } private static class ParserXmlLoader extends XmlLoader<CompositeParser,Parser> { String getParentTagName() { return "parsers"; } String getLoaderTagName() { return "parser"; } @Override + Class<? extends Parser> getLoaderClass() { + return Parser.class; + } + @Override boolean isComposite(Parser loaded) { return loaded instanceof CompositeParser; } @Override + boolean isComposite(Class<? extends Parser> loadedClass) { + if (CompositeParser.class.isAssignableFrom(loadedClass) || + ParserDecorator.class.isAssignableFrom(loadedClass)) { + return true; + } + return false; + } + @Override CompositeParser createDefault(MimeTypes mimeTypes, ServiceLoader loader) { return getDefaultParser(mimeTypes, loader); } @@ -644,17 +555,83 @@ public class TikaConfig { CompositeParser createComposite(List<Parser> parsers, MimeTypes mimeTypes, ServiceLoader loader) { MediaTypeRegistry registry = mimeTypes.getMediaTypeRegistry(); return new CompositeParser(registry, parsers); - } + } + @Override + Parser createComposite(Class<? extends Parser> parserClass, + List<Parser> childParsers, Set<Class<? extends Parser>> excludeParsers, + MimeTypes mimeTypes, ServiceLoader loader) + throws InvocationTargetException, IllegalAccessException, InstantiationException { + Parser parser = null; + Constructor<? extends Parser> c = null; + MediaTypeRegistry registry = mimeTypes.getMediaTypeRegistry(); + + // Try the possible parser constructors + if (parser == null) { + try { + c = parserClass.getConstructor(MediaTypeRegistry.class, ServiceLoader.class, Collection.class); + parser = c.newInstance(registry, loader, excludeParsers); + } + catch (NoSuchMethodException me) {} + } + if (parser == null) { + try { + c = parserClass.getConstructor(MediaTypeRegistry.class, List.class, Collection.class); + parser = c.newInstance(registry, childParsers, excludeParsers); + } catch (NoSuchMethodException me) {} + } + + // Create as a Parser Decorator + if (parser == null && ParserDecorator.class.isAssignableFrom(parserClass)) { + try { + CompositeParser cp = null; + if (childParsers.size() == 1 && excludeParsers.size() == 0 && + childParsers.get(0) instanceof CompositeParser) { + cp = (CompositeParser)childParsers.get(0); + } else { + cp = new CompositeParser(registry, childParsers, excludeParsers); + } + c = parserClass.getConstructor(Parser.class); + parser = c.newInstance(cp); + } catch (NoSuchMethodException me) {} + } + return parser; + } + @Override + Parser decorate(Parser created, Element element) throws IOException, TikaException { + Parser parser = created; + + // Is there an explicit list of mime types for this to handle? + Set<MediaType> parserTypes = mediaTypesListFromDomElement(element, "mime"); + if (! parserTypes.isEmpty()) { + parser = ParserDecorator.withTypes(parser, parserTypes); + } + // Is there an explicit list of mime types this shouldn't handle? + Set<MediaType> parserExclTypes = mediaTypesListFromDomElement(element, "mime-exclude"); + if (! parserExclTypes.isEmpty()) { + parser = ParserDecorator.withoutTypes(parser, parserExclTypes); + } + + // All done with decoration + return parser; + } } private static class DetectorXmlLoader extends XmlLoader<CompositeDetector,Detector> { String getParentTagName() { return "detectors"; } String getLoaderTagName() { return "detector"; } @Override + Class<? extends Detector> getLoaderClass() { + return Detector.class; + } + @Override boolean isComposite(Detector loaded) { return loaded instanceof CompositeDetector; } @Override + boolean isComposite(Class<? extends Detector> loadedClass) { + return CompositeDetector.class.isAssignableFrom(loadedClass); + } + @Override CompositeDetector createDefault(MimeTypes mimeTypes, ServiceLoader loader) { return getDefaultDetector(mimeTypes, loader); } @@ -662,6 +639,20 @@ public class TikaConfig { CompositeDetector createComposite(List<Detector> detectors, MimeTypes mimeTypes, ServiceLoader loader) { MediaTypeRegistry registry = mimeTypes.getMediaTypeRegistry(); return new CompositeDetector(registry, detectors); - } + } + @Override + Detector createComposite(Class<? extends Detector> compositeClass, + List<Detector> children, + Set<Class<? extends Detector>> excludeChildren, + MimeTypes mimeTypes, ServiceLoader loader) + throws InvocationTargetException, IllegalAccessException, + InstantiationException { + // TODO Implement properly + return compositeClass.newInstance(); + } + @Override + Detector decorate(Detector created, Element element) { + return created; // No decoration of Detectors + } } }