Author: jnioche Date: Tue Apr 6 11:38:26 2010 New Revision: 931098 URL: http://svn.apache.org/viewvc?rev=931098&view=rev Log: NUTCH-810 Upgraded to Tika 0.7
Added: lucene/nutch/trunk/lib/tika-core-0.7.jar (with props) lucene/nutch/trunk/src/plugin/parse-tika/lib/bcmail-jdk15-1.45.jar (with props) lucene/nutch/trunk/src/plugin/parse-tika/lib/bcprov-jdk15-1.45.jar (with props) lucene/nutch/trunk/src/plugin/parse-tika/lib/fontbox-1.1.0.jar (with props) lucene/nutch/trunk/src/plugin/parse-tika/lib/jempbox-1.1.0.jar (with props) lucene/nutch/trunk/src/plugin/parse-tika/lib/pdfbox-1.1.0.jar (with props) lucene/nutch/trunk/src/plugin/parse-tika/lib/tika-parsers-0.7.jar (with props) Removed: lucene/nutch/trunk/lib/tika-core-0.6.jar lucene/nutch/trunk/src/plugin/parse-tika/lib/fontbox-0.8.0-incubator.jar lucene/nutch/trunk/src/plugin/parse-tika/lib/jempbox-0.8.0-incubator.jar lucene/nutch/trunk/src/plugin/parse-tika/lib/pdfbox-0.8.0-incubating.jar lucene/nutch/trunk/src/plugin/parse-tika/lib/tika-parsers-0.6.jar Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/conf/tika-mimetypes.xml lucene/nutch/trunk/src/plugin/build.xml lucene/nutch/trunk/src/plugin/parse-tika/ivy.xml lucene/nutch/trunk/src/plugin/parse-tika/plugin.xml lucene/nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaConfig.java lucene/nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=931098&r1=931097&r2=931098&view=diff ============================================================================== --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Tue Apr 6 11:38:26 2010 @@ -2,6 +2,8 @@ Nutch Change Log Unreleased Changes +* NUTCH-810 Upgrade to Tika 0.7 (jnioche) + * NUTCH-785 Copy metadata from origin URL when redirecting in Fetcher + call scfilters.initialScore on newly created URL (jnioche) * NUTCH-779 Mechanism for passing metadata from parse to crawldb (jnioche) Modified: lucene/nutch/trunk/conf/tika-mimetypes.xml URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/conf/tika-mimetypes.xml?rev=931098&r1=931097&r2=931098&view=diff ============================================================================== --- lucene/nutch/trunk/conf/tika-mimetypes.xml (original) +++ lucene/nutch/trunk/conf/tika-mimetypes.xml Tue Apr 6 11:38:26 2010 @@ -2198,7 +2198,11 @@ <mime-type type="application/x-cpio"> <magic priority="50"> - <match value="070707" type="host16" offset="0"/> + <match value="070707" type="little16" offset="0"/> + <match value="070707" type="big16" offset="0"/> + <match value="070707" type="string" offset="0"/> + <match value="070701" type="string" offset="0"/> + <match value="070702" type="string" offset="0"/> </magic> <glob pattern="*.cpio"/> </mime-type> @@ -3551,7 +3555,13 @@ bad HTML, unfortunately. --> <root-XML localName="html"/> + <root-XML localName="HTML"/> <root-XML localName="link"/> + <root-XML localName="LINK"/> + <root-XML localName="body"/> + <root-XML localName="BODY"/> + <root-XML localName="p"/> + <root-XML localName="P"/> <magic priority="50"> <match value="<!DOCTYPE HTML" type="string" offset="0:64"/> <match value="<!doctype html" type="string" offset="0:64"/> Added: lucene/nutch/trunk/lib/tika-core-0.7.jar URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/tika-core-0.7.jar?rev=931098&view=auto ============================================================================== Binary file - no diff available. Propchange: lucene/nutch/trunk/lib/tika-core-0.7.jar ------------------------------------------------------------------------------ svn:mime-type = application/octet-stream Modified: lucene/nutch/trunk/src/plugin/build.xml URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/build.xml?rev=931098&r1=931097&r2=931098&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/build.xml (original) +++ lucene/nutch/trunk/src/plugin/build.xml Tue Apr 6 11:38:26 2010 @@ -32,8 +32,8 @@ <ant dir="index-basic" target="deploy"/> <ant dir="index-anchor" target="deploy"/> <ant dir="index-more" target="deploy"/> - <ant dir="field-basic" target="deploy"/> - <ant dir="field-boost" target="deploy"/> + <ant dir="field-basic" target="deploy"/> + <ant dir="field-boost" target="deploy"/> <ant dir="languageidentifier" target="deploy"/> <ant dir="lib-http" target="deploy"/> <ant dir="lib-jakarta-poi" target="deploy"/> @@ -65,12 +65,12 @@ <ant dir="query-basic" target="deploy"/> <ant dir="query-more" target="deploy"/> <ant dir="query-site" target="deploy"/> - <ant dir="query-custom" target="deploy"/> + <ant dir="query-custom" target="deploy"/> <ant dir="query-url" target="deploy"/> <ant dir="response-json" target="deploy"/> <ant dir="response-xml" target="deploy"/> <ant dir="scoring-opic" target="deploy"/> - <ant dir="scoring-link" target="deploy"/> + <ant dir="scoring-link" target="deploy"/> <ant dir="summary-basic" target="deploy"/> <ant dir="subcollection" target="deploy"/> <ant dir="summary-lucene" target="deploy"/> @@ -99,7 +99,6 @@ <ant dir="protocol-httpclient" target="test"/> <!--ant dir="parse-ext" target="test"/--> <ant dir="parse-html" target="test"/> - <!-- <ant dir="parse-mp3" target="test"/> --> <ant dir="parse-msexcel" target="test"/> <ant dir="parse-mspowerpoint" target="test"/> <ant dir="parse-msword" target="test"/> @@ -107,7 +106,6 @@ <ant dir="parse-pdf" target="test"/> <ant dir="parse-rss" target="test"/> <ant dir="feed" target="test"/> - <!-- <ant dir="parse-rtf" target="test"/> --> <ant dir="parse-swf" target="test"/> <ant dir="parse-tika" target="test"/> <ant dir="parse-zip" target="test"/> @@ -172,11 +170,11 @@ <ant dir="query-more" target="clean"/> <ant dir="query-site" target="clean"/> <ant dir="query-url" target="clean"/> - <ant dir="query-custom" target="clean"/> + <ant dir="query-custom" target="clean"/> <ant dir="response-json" target="clean"/> <ant dir="response-xml" target="clean"/> <ant dir="scoring-opic" target="clean"/> - <ant dir="scoring-link" target="clean"/> + <ant dir="scoring-link" target="clean"/> <ant dir="subcollection" target="clean"/> <ant dir="summary-basic" target="clean"/> <ant dir="summary-lucene" target="clean"/> Modified: lucene/nutch/trunk/src/plugin/parse-tika/ivy.xml URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-tika/ivy.xml?rev=931098&r1=931097&r2=931098&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-tika/ivy.xml (original) +++ lucene/nutch/trunk/src/plugin/parse-tika/ivy.xml Tue Apr 6 11:38:26 2010 @@ -1,7 +1,7 @@ <ivy-module version="2.0"> <info organisation="apache" module="parse-tika"/> <dependencies> - <dependency org="org.apache.tika" name="tika-parsers" rev="0.6"> + <dependency org="org.apache.tika" name="tika-parsers" rev="0.7"> <exclude module="lucene-*"/> <exclude module="tika-core"/> <exclude module="log4j"/> Added: lucene/nutch/trunk/src/plugin/parse-tika/lib/bcmail-jdk15-1.45.jar URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-tika/lib/bcmail-jdk15-1.45.jar?rev=931098&view=auto ============================================================================== Binary file - no diff available. Propchange: lucene/nutch/trunk/src/plugin/parse-tika/lib/bcmail-jdk15-1.45.jar ------------------------------------------------------------------------------ svn:mime-type = application/octet-stream Added: lucene/nutch/trunk/src/plugin/parse-tika/lib/bcprov-jdk15-1.45.jar URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-tika/lib/bcprov-jdk15-1.45.jar?rev=931098&view=auto ============================================================================== Binary file - no diff available. Propchange: lucene/nutch/trunk/src/plugin/parse-tika/lib/bcprov-jdk15-1.45.jar ------------------------------------------------------------------------------ svn:mime-type = application/octet-stream Added: lucene/nutch/trunk/src/plugin/parse-tika/lib/fontbox-1.1.0.jar URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-tika/lib/fontbox-1.1.0.jar?rev=931098&view=auto ============================================================================== Binary file - no diff available. Propchange: lucene/nutch/trunk/src/plugin/parse-tika/lib/fontbox-1.1.0.jar ------------------------------------------------------------------------------ svn:mime-type = application/octet-stream Added: lucene/nutch/trunk/src/plugin/parse-tika/lib/jempbox-1.1.0.jar URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-tika/lib/jempbox-1.1.0.jar?rev=931098&view=auto ============================================================================== Binary file - no diff available. Propchange: lucene/nutch/trunk/src/plugin/parse-tika/lib/jempbox-1.1.0.jar ------------------------------------------------------------------------------ svn:mime-type = application/octet-stream Added: lucene/nutch/trunk/src/plugin/parse-tika/lib/pdfbox-1.1.0.jar URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-tika/lib/pdfbox-1.1.0.jar?rev=931098&view=auto ============================================================================== Binary file - no diff available. Propchange: lucene/nutch/trunk/src/plugin/parse-tika/lib/pdfbox-1.1.0.jar ------------------------------------------------------------------------------ svn:mime-type = application/octet-stream Added: lucene/nutch/trunk/src/plugin/parse-tika/lib/tika-parsers-0.7.jar URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-tika/lib/tika-parsers-0.7.jar?rev=931098&view=auto ============================================================================== Binary file - no diff available. Propchange: lucene/nutch/trunk/src/plugin/parse-tika/lib/tika-parsers-0.7.jar ------------------------------------------------------------------------------ svn:mime-type = application/octet-stream Modified: lucene/nutch/trunk/src/plugin/parse-tika/plugin.xml URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-tika/plugin.xml?rev=931098&r1=931097&r2=931098&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-tika/plugin.xml (original) +++ lucene/nutch/trunk/src/plugin/parse-tika/plugin.xml Tue Apr 6 11:38:26 2010 @@ -25,23 +25,26 @@ <library name="parse-tika.jar"> <export name="*"/> </library> + <library name="asm-3.1.jar"/> - <library name="bcprov-jdk14-136.jar"/> <library name="bcmail-jdk14-136.jar"/> + <library name="bcmail-jdk15-1.45.jar"/> + <library name="bcprov-jdk14-136.jar"/> + <library name="bcprov-jdk15-1.45.jar"/> <library name="commons-compress-1.0.jar"/> <library name="commons-logging-1.1.1.jar"/> <library name="dom4j-1.6.1.jar"/> - <library name="fontbox-0.8.0-incubator.jar"/> + <library name="fontbox-1.1.0.jar"/> <library name="geronimo-stax-api_1.0_spec-1.0.1.jar"/> - <library name="jempbox-0.8.0-incubator.jar"/> + <library name="jempbox-1.1.0.jar"/> <library name="metadata-extractor-2.4.0-beta-1.jar"/> - <library name="pdfbox-0.8.0-incubating.jar"/> + <library name="pdfbox-1.1.0.jar"/> <library name="poi-3.6.jar"/> <library name="poi-ooxml-3.6.jar"/> <library name="poi-ooxml-schemas-3.6.jar"/> <library name="poi-scratchpad-3.6.jar"/> <library name="tagsoup-1.2.jar"/> - <library name="tika-parsers-0.6.jar"/> + <library name="tika-parsers-0.7.jar"/> <library name="xml-apis-1.0.b2.jar"/> <library name="xmlbeans-2.3.0.jar"/> </runtime> Modified: lucene/nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaConfig.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaConfig.java?rev=931098&r1=931097&r2=931098&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaConfig.java (original) +++ lucene/nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaConfig.java Tue Apr 6 11:38:26 2010 @@ -21,16 +21,22 @@ import java.io.IOException; import java.io.InputStream; import java.net.URL; import java.util.HashMap; +import java.util.Iterator; import java.util.Map; +import javax.imageio.spi.ServiceRegistry; import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.parsers.ParserConfigurationException; import org.apache.tika.exception.TikaException; +import org.apache.tika.mime.MediaType; +import org.apache.tika.mime.MimeTypeException; import org.apache.tika.mime.MimeTypes; import org.apache.tika.mime.MimeTypesFactory; +import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; +import org.mortbay.log.Log; import org.w3c.dom.Document; import org.w3c.dom.Element; import org.w3c.dom.Node; @@ -38,143 +44,208 @@ import org.w3c.dom.NodeList; import org.xml.sax.SAXException; /** - * Parse xml config file. Duplicates the Tika equivalent but allows the classes of the parser to be found - * by classloader + * Parse xml config file. */ -class TikaConfig { - - static final String DEFAULT_CONFIG_LOCATION = - "/org/apache/tika/tika-config.xml"; +public class TikaConfig { private final Map<String, Parser> parsers = new HashMap<String, Parser>(); - private static MimeTypes mimeTypes; + private final MimeTypes mimeTypes; - TikaConfig(String file) - throws TikaException, IOException, SAXException { - this(new File(file)); + public TikaConfig(String file) throws TikaException, IOException, + SAXException { + this(new File(file)); } - TikaConfig(File file) - throws TikaException, IOException, SAXException { - this(getBuilder().parse(file)); + public TikaConfig(File file) throws TikaException, IOException, + SAXException { + this(getBuilder().parse(file)); } - TikaConfig(URL url) - throws TikaException, IOException, SAXException { - this(getBuilder().parse(url.toString())); + public TikaConfig(URL url) throws TikaException, IOException, SAXException { + this(getBuilder().parse(url.toString())); } - TikaConfig(InputStream stream) - throws TikaException, IOException, SAXException { - this(getBuilder().parse(stream)); + public TikaConfig(InputStream stream) throws TikaException, IOException, + SAXException { + this(getBuilder().parse(stream)); } - TikaConfig(Document document) throws TikaException, IOException { - this(document.getDocumentElement()); + /** + * @deprecated This method will be removed in Apache Tika 1.0 + * @see <a + * href="https://issues.apache.org/jira/browse/TIKA-275">TIKA-275</a> + */ + public TikaConfig(InputStream stream, Parser delegate) + throws TikaException, IOException, SAXException { + this(stream); } - TikaConfig(Element element) throws TikaException, IOException { - Element mtr = getChild(element, "mimeTypeRepository"); - if (mtr != null) { - mimeTypes = MimeTypesFactory.create(mtr.getAttribute("resource")); - } + public TikaConfig(Document document) throws TikaException, IOException { + this(document.getDocumentElement()); + } - NodeList nodes = element.getElementsByTagName("parser"); - for (int i = 0; i < nodes.getLength(); i++) { - Element node = (Element) nodes.item(i); - String name = node.getAttribute("class"); - try { - Class<?> parserClass = Class.forName(name); - Parser parser = (Parser) parserClass.newInstance(); + /** + * @deprecated This method will be removed in Apache Tika 1.0 + * @see <a + * href="https://issues.apache.org/jira/browse/TIKA-275">TIKA-275</a> + */ + public TikaConfig(Document document, Parser delegate) throws TikaException, + IOException { + this(document); + } + + public TikaConfig(Element element) throws TikaException, IOException { + Element mtr = getChild(element, "mimeTypeRepository"); + if (mtr != null && mtr.hasAttribute("resource")) { + mimeTypes = MimeTypesFactory.create(mtr.getAttribute("resource")); + } else { + mimeTypes = MimeTypesFactory.create("tika-mimetypes.xml"); + } + + NodeList nodes = element.getElementsByTagName("parser"); + for (int i = 0; i < nodes.getLength(); i++) { + Element node = (Element) nodes.item(i); + String name = node.getAttribute("class"); + + try { + Class<?> parserClass = Class.forName(name); + Object instance = parserClass.newInstance(); + if (!(instance instanceof Parser)) { + throw new TikaException( + "Configured class is not a Tika Parser: " + name); + } + Parser parser = (Parser) instance; + + NodeList mimes = node.getElementsByTagName("mime"); + if (mimes.getLength() > 0) { + for (int j = 0; j < mimes.getLength(); j++) { + parsers.put(getText(mimes.item(j)).trim(), parser); + } + } else { + ParseContext context = new ParseContext(); + for (MediaType type : parser.getSupportedTypes(context)) { + parsers.put(type.toString(), parser); + } + } + } catch (ClassNotFoundException e) { + throw new TikaException("Configured parser class not found: " + + name, e); + } catch (IllegalAccessException e) { + throw new TikaException("Unable to access a parser class: " + + name, e); + } catch (InstantiationException e) { + throw new TikaException( + "Unable to instantiate a parser class: " + name, e); + } + } + } + + public TikaConfig() throws MimeTypeException, IOException { + ParseContext context = new ParseContext(); + Iterator<Parser> iterator = ServiceRegistry.lookupProviders( + Parser.class, this.getClass().getClassLoader()); + while (iterator.hasNext()) { + Parser parser = iterator.next(); + for (MediaType type : parser.getSupportedTypes(context)) { + parsers.put(type.toString(), parser); + } + } + mimeTypes = MimeTypesFactory.create("tika-mimetypes.xml"); + } - NodeList mimes = node.getElementsByTagName("mime"); - for (int j = 0; j < mimes.getLength(); j++) { - parsers.put(getText(mimes.item(j)).trim(), parser); - } - } catch (Throwable t) { - // TODO: Log warning about an invalid parser configuration - // For now we just ignore this parser class - } - } + /** + * @deprecated This method will be removed in Apache Tika 1.0 + * @see <a + * href="https://issues.apache.org/jira/browse/TIKA-275">TIKA-275</a> + */ + public TikaConfig(Element element, Parser delegate) throws TikaException, + IOException { + this(element); } private String getText(Node node) { - if (node.getNodeType() == Node.TEXT_NODE) { - return node.getNodeValue(); - } else if (node.getNodeType() == Node.ELEMENT_NODE) { - StringBuilder builder = new StringBuilder(); - NodeList list = node.getChildNodes(); - for (int i = 0; i < list.getLength(); i++) { - builder.append(getText(list.item(i))); - } - return builder.toString(); - } else { - return ""; - } + if (node.getNodeType() == Node.TEXT_NODE) { + return node.getNodeValue(); + } else if (node.getNodeType() == Node.ELEMENT_NODE) { + StringBuilder builder = new StringBuilder(); + NodeList list = node.getChildNodes(); + for (int i = 0; i < list.getLength(); i++) { + builder.append(getText(list.item(i))); + } + return builder.toString(); + } else { + return ""; + } } /** - * Returns the parser instance configured for the given MIME type. - * Returns <code>null</code> if the given MIME type is unknown. - * - * @param mimeType MIME type + * Returns the parser instance configured for the given MIME type. Returns + * <code>null</code> if the given MIME type is unknown. + * + * @param mimeType + * MIME type * @return configured Parser instance, or <code>null</code> */ - Parser getParser(String mimeType) { - return parsers.get(mimeType); + public Parser getParser(String mimeType) { + return parsers.get(mimeType); } - Map<String, Parser> getParsers() { - return parsers; + public Map<String, Parser> getParsers() { + return parsers; } - MimeTypes getMimeRepository(){ - return mimeTypes; + public MimeTypes getMimeRepository() { + return mimeTypes; } /** - * Provides a default configuration (TikaConfig). Currently creates a - * new instance each time it's called; we may be able to have it - * return a shared instance once it is completely immutable. - * + * Provides a default configuration (TikaConfig). Currently creates a new + * instance each time it's called; we may be able to have it return a shared + * instance once it is completely immutable. + * * @return default configuration */ - static TikaConfig getDefaultConfig() { - try { - InputStream stream = - TikaConfig.class.getResourceAsStream(DEFAULT_CONFIG_LOCATION); - return new TikaConfig(stream); - } catch (IOException e) { - throw new RuntimeException( - "Unable to read default configuration", e); - } catch (SAXException e) { - throw new RuntimeException( - "Unable to parse default configuration", e); - } catch (TikaException e) { - throw new RuntimeException( - "Unable to access default configuration", e); - } + public static TikaConfig getDefaultConfig() { + try { + return new TikaConfig(); + } catch (IOException e) { + throw new RuntimeException("Unable to read default configuration", + e); + } catch (TikaException e) { + throw new RuntimeException( + "Unable to access default configuration", e); + } + } + + /** + * @deprecated This method will be removed in Apache Tika 1.0 + * @see <a + * href="https://issues.apache.org/jira/browse/TIKA-275">TIKA-275</a> + */ + public static TikaConfig getDefaultConfig(Parser delegate) + throws TikaException { + return getDefaultConfig(); } private static DocumentBuilder getBuilder() throws TikaException { - try { - return DocumentBuilderFactory.newInstance().newDocumentBuilder(); - } catch (ParserConfigurationException e) { - throw new TikaException("XML parser not available", e); - } + try { + return DocumentBuilderFactory.newInstance().newDocumentBuilder(); + } catch (ParserConfigurationException e) { + throw new TikaException("XML parser not available", e); + } } private static Element getChild(Element element, String name) { - Node child = element.getFirstChild(); - while (child != null) { - if (child.getNodeType() == Node.ELEMENT_NODE - && name.equals(child.getNodeName())) { - return (Element) child; - } - child = child.getNextSibling(); - } - return null; + Node child = element.getFirstChild(); + while (child != null) { + if (child.getNodeType() == Node.ELEMENT_NODE + && name.equals(child.getNodeName())) { + return (Element) child; + } + child = child.getNextSibling(); + } + return null; } -} +} \ No newline at end of file Modified: lucene/nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java?rev=931098&r1=931097&r2=931098&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java (original) +++ lucene/nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java Tue Apr 6 11:38:26 2010 @@ -170,6 +170,8 @@ public class TikaParser implements org.a this.tikaConfig = null; // do we want a custom Tika configuration file + // deprecated since Tika 0.7 which is based on + // a service provider based configuration String customConfFile = conf.get("tika.config.file"); if (customConfFile != null) { try {