Author: jnioche
Date: Tue Apr  6 11:38:26 2010
New Revision: 931098

URL: http://svn.apache.org/viewvc?rev=931098&view=rev
Log:
NUTCH-810 Upgraded to Tika 0.7

Added:
    lucene/nutch/trunk/lib/tika-core-0.7.jar   (with props)
    lucene/nutch/trunk/src/plugin/parse-tika/lib/bcmail-jdk15-1.45.jar   (with props)
    lucene/nutch/trunk/src/plugin/parse-tika/lib/bcprov-jdk15-1.45.jar   (with props)
    lucene/nutch/trunk/src/plugin/parse-tika/lib/fontbox-1.1.0.jar   (with props)
    lucene/nutch/trunk/src/plugin/parse-tika/lib/jempbox-1.1.0.jar   (with props)
    lucene/nutch/trunk/src/plugin/parse-tika/lib/pdfbox-1.1.0.jar   (with props)
    lucene/nutch/trunk/src/plugin/parse-tika/lib/tika-parsers-0.7.jar   (with props)
Removed:
    lucene/nutch/trunk/lib/tika-core-0.6.jar
    lucene/nutch/trunk/src/plugin/parse-tika/lib/fontbox-0.8.0-incubator.jar
    lucene/nutch/trunk/src/plugin/parse-tika/lib/jempbox-0.8.0-incubator.jar
    lucene/nutch/trunk/src/plugin/parse-tika/lib/pdfbox-0.8.0-incubating.jar
    lucene/nutch/trunk/src/plugin/parse-tika/lib/tika-parsers-0.6.jar
Modified:
    lucene/nutch/trunk/CHANGES.txt
    lucene/nutch/trunk/conf/tika-mimetypes.xml
    lucene/nutch/trunk/src/plugin/build.xml
    lucene/nutch/trunk/src/plugin/parse-tika/ivy.xml
    lucene/nutch/trunk/src/plugin/parse-tika/plugin.xml
    lucene/nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaConfig.java
    lucene/nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java

Modified: lucene/nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=931098&r1=931097&r2=931098&view=diff
==============================================================================
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Tue Apr  6 11:38:26 2010
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Unreleased Changes
 
+* NUTCH-810 Upgrade to Tika 0.7 (jnioche)
+
 * NUTCH-785 Copy metadata from origin URL when redirecting in Fetcher + call scfilters.initialScore on newly created URL (jnioche)
 
 * NUTCH-779 Mechanism for passing metadata from parse to crawldb (jnioche)

Modified: lucene/nutch/trunk/conf/tika-mimetypes.xml
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/conf/tika-mimetypes.xml?rev=931098&r1=931097&r2=931098&view=diff
==============================================================================
--- lucene/nutch/trunk/conf/tika-mimetypes.xml (original)
+++ lucene/nutch/trunk/conf/tika-mimetypes.xml Tue Apr  6 11:38:26 2010
@@ -2198,7 +2198,11 @@
 
   <mime-type type="application/x-cpio">
     <magic priority="50">
-      <match value="070707" type="host16" offset="0"/>
+      <match value="070707" type="little16" offset="0"/>
+      <match value="070707" type="big16" offset="0"/>
+      <match value="070707" type="string" offset="0"/>
+      <match value="070701" type="string" offset="0"/>
+      <match value="070702" type="string" offset="0"/>
     </magic>
     <glob pattern="*.cpio"/>
   </mime-type>
@@ -3551,7 +3555,13 @@
           bad HTML, unfortunately.
      -->
     <root-XML localName="html"/>
+    <root-XML localName="HTML"/>
     <root-XML localName="link"/>
+    <root-XML localName="LINK"/>
+    <root-XML localName="body"/>
+    <root-XML localName="BODY"/>
+    <root-XML localName="p"/>
+    <root-XML localName="P"/>
     <magic priority="50">
       <match value="&lt;!DOCTYPE HTML" type="string" offset="0:64"/>
       <match value="&lt;!doctype html" type="string" offset="0:64"/>

Added: lucene/nutch/trunk/lib/tika-core-0.7.jar
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/tika-core-0.7.jar?rev=931098&view=auto
==============================================================================
Binary file - no diff available.

Propchange: lucene/nutch/trunk/lib/tika-core-0.7.jar
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Modified: lucene/nutch/trunk/src/plugin/build.xml
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/build.xml?rev=931098&r1=931097&r2=931098&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/build.xml (original)
+++ lucene/nutch/trunk/src/plugin/build.xml Tue Apr  6 11:38:26 2010
@@ -32,8 +32,8 @@
      <ant dir="index-basic" target="deploy"/>
      <ant dir="index-anchor" target="deploy"/>
      <ant dir="index-more" target="deploy"/>
-        <ant dir="field-basic" target="deploy"/>
-        <ant dir="field-boost" target="deploy"/>
+     <ant dir="field-basic" target="deploy"/>
+     <ant dir="field-boost" target="deploy"/>
      <ant dir="languageidentifier" target="deploy"/>
      <ant dir="lib-http" target="deploy"/>
      <ant dir="lib-jakarta-poi" target="deploy"/>
@@ -65,12 +65,12 @@
      <ant dir="query-basic" target="deploy"/>
      <ant dir="query-more" target="deploy"/>
      <ant dir="query-site" target="deploy"/>
-        <ant dir="query-custom" target="deploy"/>
+     <ant dir="query-custom" target="deploy"/>
      <ant dir="query-url" target="deploy"/>
      <ant dir="response-json" target="deploy"/>
      <ant dir="response-xml" target="deploy"/>
      <ant dir="scoring-opic" target="deploy"/>
-        <ant dir="scoring-link" target="deploy"/>
+     <ant dir="scoring-link" target="deploy"/>
      <ant dir="summary-basic" target="deploy"/>
      <ant dir="subcollection" target="deploy"/>
      <ant dir="summary-lucene" target="deploy"/>
@@ -99,7 +99,6 @@
      <ant dir="protocol-httpclient" target="test"/>
      <!--ant dir="parse-ext" target="test"/-->
      <ant dir="parse-html" target="test"/>
-     <!-- <ant dir="parse-mp3" target="test"/> -->
      <ant dir="parse-msexcel" target="test"/>
      <ant dir="parse-mspowerpoint" target="test"/>
      <ant dir="parse-msword" target="test"/>
@@ -107,7 +106,6 @@
      <ant dir="parse-pdf" target="test"/>
      <ant dir="parse-rss" target="test"/>
      <ant dir="feed" target="test"/>
-     <!-- <ant dir="parse-rtf" target="test"/> -->
      <ant dir="parse-swf" target="test"/>
      <ant dir="parse-tika" target="test"/>
      <ant dir="parse-zip" target="test"/>
@@ -172,11 +170,11 @@
     <ant dir="query-more" target="clean"/>
     <ant dir="query-site" target="clean"/>
     <ant dir="query-url" target="clean"/>
-       <ant dir="query-custom" target="clean"/>
+    <ant dir="query-custom" target="clean"/>
     <ant dir="response-json" target="clean"/>
     <ant dir="response-xml" target="clean"/>
     <ant dir="scoring-opic" target="clean"/>
-       <ant dir="scoring-link" target="clean"/>
+    <ant dir="scoring-link" target="clean"/>
     <ant dir="subcollection" target="clean"/>
     <ant dir="summary-basic" target="clean"/>
     <ant dir="summary-lucene" target="clean"/>

Modified: lucene/nutch/trunk/src/plugin/parse-tika/ivy.xml
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-tika/ivy.xml?rev=931098&r1=931097&r2=931098&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-tika/ivy.xml (original)
+++ lucene/nutch/trunk/src/plugin/parse-tika/ivy.xml Tue Apr  6 11:38:26 2010
@@ -1,7 +1,7 @@
 <ivy-module version="2.0">
     <info organisation="apache" module="parse-tika"/>
     <dependencies>
-       <dependency org="org.apache.tika" name="tika-parsers" rev="0.6">
+       <dependency org="org.apache.tika" name="tika-parsers" rev="0.7">
                <exclude module="lucene-*"/>
                <exclude module="tika-core"/>
                <exclude module="log4j"/>

Added: lucene/nutch/trunk/src/plugin/parse-tika/lib/bcmail-jdk15-1.45.jar
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-tika/lib/bcmail-jdk15-1.45.jar?rev=931098&view=auto
==============================================================================
Binary file - no diff available.

Propchange: lucene/nutch/trunk/src/plugin/parse-tika/lib/bcmail-jdk15-1.45.jar
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: lucene/nutch/trunk/src/plugin/parse-tika/lib/bcprov-jdk15-1.45.jar
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-tika/lib/bcprov-jdk15-1.45.jar?rev=931098&view=auto
==============================================================================
Binary file - no diff available.

Propchange: lucene/nutch/trunk/src/plugin/parse-tika/lib/bcprov-jdk15-1.45.jar
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: lucene/nutch/trunk/src/plugin/parse-tika/lib/fontbox-1.1.0.jar
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-tika/lib/fontbox-1.1.0.jar?rev=931098&view=auto
==============================================================================
Binary file - no diff available.

Propchange: lucene/nutch/trunk/src/plugin/parse-tika/lib/fontbox-1.1.0.jar
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: lucene/nutch/trunk/src/plugin/parse-tika/lib/jempbox-1.1.0.jar
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-tika/lib/jempbox-1.1.0.jar?rev=931098&view=auto
==============================================================================
Binary file - no diff available.

Propchange: lucene/nutch/trunk/src/plugin/parse-tika/lib/jempbox-1.1.0.jar
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: lucene/nutch/trunk/src/plugin/parse-tika/lib/pdfbox-1.1.0.jar
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-tika/lib/pdfbox-1.1.0.jar?rev=931098&view=auto
==============================================================================
Binary file - no diff available.

Propchange: lucene/nutch/trunk/src/plugin/parse-tika/lib/pdfbox-1.1.0.jar
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: lucene/nutch/trunk/src/plugin/parse-tika/lib/tika-parsers-0.7.jar
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-tika/lib/tika-parsers-0.7.jar?rev=931098&view=auto
==============================================================================
Binary file - no diff available.

Propchange: lucene/nutch/trunk/src/plugin/parse-tika/lib/tika-parsers-0.7.jar
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Modified: lucene/nutch/trunk/src/plugin/parse-tika/plugin.xml
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-tika/plugin.xml?rev=931098&r1=931097&r2=931098&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-tika/plugin.xml (original)
+++ lucene/nutch/trunk/src/plugin/parse-tika/plugin.xml Tue Apr  6 11:38:26 2010
@@ -25,23 +25,26 @@
       <library name="parse-tika.jar">
          <export name="*"/>
       </library>
+
       <library name="asm-3.1.jar"/>
-      <library name="bcprov-jdk14-136.jar"/>
       <library name="bcmail-jdk14-136.jar"/>
+      <library name="bcmail-jdk15-1.45.jar"/>
+      <library name="bcprov-jdk14-136.jar"/>           
+      <library name="bcprov-jdk15-1.45.jar"/>
       <library name="commons-compress-1.0.jar"/>
       <library name="commons-logging-1.1.1.jar"/>
       <library name="dom4j-1.6.1.jar"/>
-      <library name="fontbox-0.8.0-incubator.jar"/>
+      <library name="fontbox-1.1.0.jar"/>
       <library name="geronimo-stax-api_1.0_spec-1.0.1.jar"/>
-      <library name="jempbox-0.8.0-incubator.jar"/>
+      <library name="jempbox-1.1.0.jar"/>
       <library name="metadata-extractor-2.4.0-beta-1.jar"/>
-      <library name="pdfbox-0.8.0-incubating.jar"/>
+      <library name="pdfbox-1.1.0.jar"/>
       <library name="poi-3.6.jar"/>
       <library name="poi-ooxml-3.6.jar"/>
       <library name="poi-ooxml-schemas-3.6.jar"/>
       <library name="poi-scratchpad-3.6.jar"/>
       <library name="tagsoup-1.2.jar"/>
-      <library name="tika-parsers-0.6.jar"/>
+      <library name="tika-parsers-0.7.jar"/>
       <library name="xml-apis-1.0.b2.jar"/>
       <library name="xmlbeans-2.3.0.jar"/>
    </runtime>

Modified: 
lucene/nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaConfig.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaConfig.java?rev=931098&r1=931097&r2=931098&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaConfig.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaConfig.java Tue Apr  6 11:38:26 2010
@@ -21,16 +21,22 @@ import java.io.IOException;
 import java.io.InputStream;
 import java.net.URL;
 import java.util.HashMap;
+import java.util.Iterator;
 import java.util.Map;
 
+import javax.imageio.spi.ServiceRegistry;
 import javax.xml.parsers.DocumentBuilder;
 import javax.xml.parsers.DocumentBuilderFactory;
 import javax.xml.parsers.ParserConfigurationException;
 
 import org.apache.tika.exception.TikaException;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.mime.MimeTypeException;
 import org.apache.tika.mime.MimeTypes;
 import org.apache.tika.mime.MimeTypesFactory;
+import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
+import org.mortbay.log.Log;
 import org.w3c.dom.Document;
 import org.w3c.dom.Element;
 import org.w3c.dom.Node;
@@ -38,143 +44,208 @@ import org.w3c.dom.NodeList;
 import org.xml.sax.SAXException;
 
 /**
- * Parse xml config file. Duplicates the Tika equivalent but allows the classes of the parser to be found 
- * by classloader
+ * Parse xml config file.
  */
-class TikaConfig {
-
-    static final String DEFAULT_CONFIG_LOCATION = 
-        "/org/apache/tika/tika-config.xml";
+public class TikaConfig {
 
     private final Map<String, Parser> parsers = new HashMap<String, Parser>();
 
-    private static MimeTypes mimeTypes;
+    private final MimeTypes mimeTypes;
 
-    TikaConfig(String file)
-            throws TikaException, IOException, SAXException {
-        this(new File(file));
+    public TikaConfig(String file) throws TikaException, IOException,
+           SAXException {
+       this(new File(file));
     }
 
-    TikaConfig(File file)
-            throws TikaException, IOException, SAXException {
-        this(getBuilder().parse(file));
+    public TikaConfig(File file) throws TikaException, IOException,
+           SAXException {
+       this(getBuilder().parse(file));
     }
 
-    TikaConfig(URL url)
-            throws TikaException, IOException, SAXException {
-        this(getBuilder().parse(url.toString()));
+    public TikaConfig(URL url) throws TikaException, IOException, SAXException {
+       this(getBuilder().parse(url.toString()));
     }
 
-    TikaConfig(InputStream stream)
-            throws TikaException, IOException, SAXException {
-        this(getBuilder().parse(stream));
+    public TikaConfig(InputStream stream) throws TikaException, IOException,
+           SAXException {
+       this(getBuilder().parse(stream));
     }
 
-    TikaConfig(Document document) throws TikaException, IOException {
-        this(document.getDocumentElement());
+    /**
+     * @deprecated This method will be removed in Apache Tika 1.0
+     * @see <a
+     *      href="https://issues.apache.org/jira/browse/TIKA-275">TIKA-275</a>
+     */
+    public TikaConfig(InputStream stream, Parser delegate)
+           throws TikaException, IOException, SAXException {
+       this(stream);
     }
 
-    TikaConfig(Element element) throws TikaException, IOException {
-        Element mtr = getChild(element, "mimeTypeRepository");
-        if (mtr != null) {
-            mimeTypes = MimeTypesFactory.create(mtr.getAttribute("resource"));
-        }
+    public TikaConfig(Document document) throws TikaException, IOException {
+       this(document.getDocumentElement());
+    }
 
-        NodeList nodes = element.getElementsByTagName("parser");
-        for (int i = 0; i < nodes.getLength(); i++) {
-            Element node = (Element) nodes.item(i);
-            String name = node.getAttribute("class");
-            try {
-                Class<?> parserClass = Class.forName(name);
-                Parser parser = (Parser) parserClass.newInstance();
+    /**
+     * @deprecated This method will be removed in Apache Tika 1.0
+     * @see <a
+     *      href="https://issues.apache.org/jira/browse/TIKA-275">TIKA-275</a>
+     */
+    public TikaConfig(Document document, Parser delegate) throws TikaException,
+           IOException {
+       this(document);
+    }
+
+    public TikaConfig(Element element) throws TikaException, IOException {
+       Element mtr = getChild(element, "mimeTypeRepository");
+       if (mtr != null && mtr.hasAttribute("resource")) {
+           mimeTypes = MimeTypesFactory.create(mtr.getAttribute("resource"));
+       } else {
+           mimeTypes = MimeTypesFactory.create("tika-mimetypes.xml");
+       }
+
+       NodeList nodes = element.getElementsByTagName("parser");
+       for (int i = 0; i < nodes.getLength(); i++) {
+           Element node = (Element) nodes.item(i);
+           String name = node.getAttribute("class");
+
+           try {
+               Class<?> parserClass = Class.forName(name);
+               Object instance = parserClass.newInstance();
+               if (!(instance instanceof Parser)) {
+                   throw new TikaException(
+                           "Configured class is not a Tika Parser: " + name);
+               }
+               Parser parser = (Parser) instance;
+
+               NodeList mimes = node.getElementsByTagName("mime");
+               if (mimes.getLength() > 0) {
+                   for (int j = 0; j < mimes.getLength(); j++) {
+                       parsers.put(getText(mimes.item(j)).trim(), parser);
+                   }
+               } else {
+                   ParseContext context = new ParseContext();
+                   for (MediaType type : parser.getSupportedTypes(context)) {
+                       parsers.put(type.toString(), parser);
+                   }
+               }
+           } catch (ClassNotFoundException e) {
+               throw new TikaException("Configured parser class not found: "
+                       + name, e);
+           } catch (IllegalAccessException e) {
+               throw new TikaException("Unable to access a parser class: "
+                       + name, e);
+           } catch (InstantiationException e) {
+               throw new TikaException(
+                       "Unable to instantiate a parser class: " + name, e);
+           }
+       }
+    }
+
+    public TikaConfig() throws MimeTypeException, IOException {
+       ParseContext context = new ParseContext();
+       Iterator<Parser> iterator = ServiceRegistry.lookupProviders(
+               Parser.class, this.getClass().getClassLoader());
+       while (iterator.hasNext()) {
+           Parser parser = iterator.next();
+           for (MediaType type : parser.getSupportedTypes(context)) {
+               parsers.put(type.toString(), parser);
+           }
+       }
+       mimeTypes = MimeTypesFactory.create("tika-mimetypes.xml");
+    }
 
-                NodeList mimes = node.getElementsByTagName("mime");
-                for (int j = 0; j < mimes.getLength(); j++) {
-                    parsers.put(getText(mimes.item(j)).trim(), parser);
-                }
-            } catch (Throwable t) {
-                // TODO: Log warning about an invalid parser configuration
-                // For now we just ignore this parser class
-            }
-        }
+    /**
+     * @deprecated This method will be removed in Apache Tika 1.0
+     * @see <a
+     *      href="https://issues.apache.org/jira/browse/TIKA-275">TIKA-275</a>
+     */
+    public TikaConfig(Element element, Parser delegate) throws TikaException,
+           IOException {
+       this(element);
     }
 
     private String getText(Node node) {
-        if (node.getNodeType() == Node.TEXT_NODE) {
-            return node.getNodeValue();
-        } else if (node.getNodeType() == Node.ELEMENT_NODE) {
-            StringBuilder builder = new StringBuilder();
-            NodeList list = node.getChildNodes();
-            for (int i = 0; i < list.getLength(); i++) {
-                builder.append(getText(list.item(i)));
-            }
-            return builder.toString();
-        } else {
-            return "";
-        }
+       if (node.getNodeType() == Node.TEXT_NODE) {
+           return node.getNodeValue();
+       } else if (node.getNodeType() == Node.ELEMENT_NODE) {
+           StringBuilder builder = new StringBuilder();
+           NodeList list = node.getChildNodes();
+           for (int i = 0; i < list.getLength(); i++) {
+               builder.append(getText(list.item(i)));
+           }
+           return builder.toString();
+       } else {
+           return "";
+       }
     }
 
     /**
-     * Returns the parser instance configured for the given MIME type.
-     * Returns <code>null</code> if the given MIME type is unknown.
-     *
-     * @param mimeType MIME type
+     * Returns the parser instance configured for the given MIME type. Returns
+     * <code>null</code> if the given MIME type is unknown.
+     * 
+     * @param mimeType
+     *            MIME type
      * @return configured Parser instance, or <code>null</code>
      */
-    Parser getParser(String mimeType) {
-        return parsers.get(mimeType);
+    public Parser getParser(String mimeType) {
+       return parsers.get(mimeType);
     }
 
-    Map<String, Parser> getParsers() {
-        return parsers;
+    public Map<String, Parser> getParsers() {
+       return parsers;
     }
 
-    MimeTypes getMimeRepository(){
-        return mimeTypes;
+    public MimeTypes getMimeRepository() {
+       return mimeTypes;
     }
 
     /**
-     * Provides a default configuration (TikaConfig).  Currently creates a
-     * new instance each time it's called; we may be able to have it
-     * return a shared instance once it is completely immutable.
-     *
+     * Provides a default configuration (TikaConfig). Currently creates a new
+     * instance each time it's called; we may be able to have it return a shared
+     * instance once it is completely immutable.
+     * 
      * @return default configuration
      */
-    static TikaConfig getDefaultConfig() {
-        try {
-            InputStream stream =
-                TikaConfig.class.getResourceAsStream(DEFAULT_CONFIG_LOCATION);
-            return new TikaConfig(stream);
-        } catch (IOException e) {
-            throw new RuntimeException(
-                    "Unable to read default configuration", e);
-        } catch (SAXException e) {
-            throw new RuntimeException(
-                    "Unable to parse default configuration", e);
-        } catch (TikaException e) {
-            throw new RuntimeException(
-                    "Unable to access default configuration", e);
-        }
+    public static TikaConfig getDefaultConfig() {
+       try {
+           return new TikaConfig();
+       } catch (IOException e) {
+           throw new RuntimeException("Unable to read default configuration",
+                   e);
+       } catch (TikaException e) {
+           throw new RuntimeException(
+                   "Unable to access default configuration", e);
+       }
+    }
+
+    /**
+     * @deprecated This method will be removed in Apache Tika 1.0
+     * @see <a
+     *      href="https://issues.apache.org/jira/browse/TIKA-275">TIKA-275</a>
+     */
+    public static TikaConfig getDefaultConfig(Parser delegate)
+           throws TikaException {
+       return getDefaultConfig();
     }
 
     private static DocumentBuilder getBuilder() throws TikaException {
-        try {
-            return DocumentBuilderFactory.newInstance().newDocumentBuilder();
-        } catch (ParserConfigurationException e) {
-            throw new TikaException("XML parser not available", e);
-        }
+       try {
+           return DocumentBuilderFactory.newInstance().newDocumentBuilder();
+       } catch (ParserConfigurationException e) {
+           throw new TikaException("XML parser not available", e);
+       }
     }
 
     private static Element getChild(Element element, String name) {
-        Node child = element.getFirstChild();
-        while (child != null) {
-            if (child.getNodeType() == Node.ELEMENT_NODE
-                    && name.equals(child.getNodeName())) {
-                return (Element) child;
-            }
-            child = child.getNextSibling();
-        }
-        return null;
+       Node child = element.getFirstChild();
+       while (child != null) {
+           if (child.getNodeType() == Node.ELEMENT_NODE
+                   && name.equals(child.getNodeName())) {
+               return (Element) child;
+           }
+           child = child.getNextSibling();
+       }
+       return null;
     }
 
-}
+}
\ No newline at end of file

Modified: 
lucene/nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java?rev=931098&r1=931097&r2=931098&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java Tue Apr  6 11:38:26 2010
@@ -170,6 +170,8 @@ public class TikaParser implements org.a
                this.tikaConfig = null;
 
                // do we want a custom Tika configuration file
+               // deprecated since Tika 0.7 which is based on 
+               // a service provider based configuration
                String customConfFile = conf.get("tika.config.file");
                if (customConfFile != null) {
                        try {


Reply via email to