Author: markus
Date: Tue Dec 27 14:36:27 2011
New Revision: 1224916

URL: http://svn.apache.org/viewvc?rev=1224916&view=rev
Log:
NUTCH-1230 and NUTCH-1231 Upgrade to Tika 1.0 and using new Tika detect API

Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/ivy/ivy.xml
    nutch/trunk/src/java/org/apache/nutch/util/MimeUtil.java
    nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
    nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java
    nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java

Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1224916&r1=1224915&r2=1224916&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Tue Dec 27 14:36:27 2011
@@ -1,5 +1,9 @@
 Nutch Change Log
 
+* NUTCH-1231 Upgrade to Tika 1.0 (markus)
+
+* NUTCH-1230 MimeType API deprecated and breaks with Tika 1.0 (markus)
+
 * NUTCH-1235 Upgrade to new Hadoop 0.20.205.0 (markus)
 
 * NUTCH-1217 Update NOTICE.txt to drop some copyrights (lewismc)

Modified: nutch/trunk/ivy/ivy.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/ivy/ivy.xml?rev=1224916&r1=1224915&r2=1224916&view=diff
==============================================================================
--- nutch/trunk/ivy/ivy.xml (original)
+++ nutch/trunk/ivy/ivy.xml Tue Dec 27 14:36:27 2011
@@ -57,7 +57,7 @@
                </dependency>
 
                <dependency org="com.ibm.icu" name="icu4j" rev="4.0.1" />
-               <dependency org="org.apache.tika" name="tika-core" rev="0.10" />
+               <dependency org="org.apache.tika" name="tika-core" rev="1.0" />
                <dependency org="org.mortbay.jetty" name="jetty-client" rev="6.1.22" />
 
                <dependency org="log4j" name="log4j" rev="1.2.15" conf="*->master" />

Modified: nutch/trunk/src/java/org/apache/nutch/util/MimeUtil.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/MimeUtil.java?rev=1224916&r1=1224915&r2=1224916&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/util/MimeUtil.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/util/MimeUtil.java Tue Dec 27 14:36:27 2011
@@ -24,6 +24,7 @@ import java.io.File;
 import org.apache.hadoop.conf.Configuration;
 
 // Tika imports
+import org.apache.tika.Tika;
 import org.apache.tika.mime.MimeType;
 import org.apache.tika.mime.MimeTypeException;
 import org.apache.tika.mime.MimeTypes;
@@ -49,6 +50,9 @@ public final class MimeUtil {
   /* our Tika mime type registry */
   private MimeTypes mimeTypes;
 
+  /* the tika detectors */
+  private Tika tika;
+
   /* whether or not magic should be employed or not */
   private boolean mimeMagic;
 
@@ -56,6 +60,7 @@ public final class MimeUtil {
   private static final Logger LOG = LoggerFactory.getLogger(MimeUtil.class.getName());
 
   public MimeUtil(Configuration conf) {
+    tika = new Tika();
     ObjectCache objectCache = ObjectCache.get(conf);
     MimeTypes mimeTypez = (MimeTypes) objectCache.getObject(MimeTypes.class
         .getName());
@@ -133,6 +138,8 @@ public final class MimeUtil {
    * @return The correctly, automatically guessed {@link MimeType} name.
    */
   public String autoResolveContentType(String typeName, String url, byte[] data) {
+    String retType = null;
+    String magicType = null;
     MimeType type = null;
     String cleanedMimeType = null;
 
@@ -161,59 +168,65 @@ public final class MimeUtil {
           .getMimeType(url) : type;
     }
 
+    retType= type.getName();
+
     // if magic is enabled use mime magic to guess if the mime type returned
     // from the magic guess is different than the one that's already set so far
     // if it is, and it's not the default mime type, then go with the mime type
     // returned by the magic
     if (this.mimeMagic) {
-      MimeType magicType = this.mimeTypes.getMimeType(data);
-      if (magicType != null && !magicType.getName().equals(MimeTypes.OCTET_STREAM)
-          && !magicType.getName().equals(MimeTypes.PLAIN_TEXT)
-          && type != null && !type.getName().equals(magicType.getName())) {
+      magicType = tika.detect(data);
+
+      // Deprecated in Tika 1.0 See https://issues.apache.org/jira/browse/NUTCH-1230
+      //MimeType magicType = this.mimeTypes.getMimeType(data);
+      if (magicType != null && !magicType.equals(MimeTypes.OCTET_STREAM)
+          && !magicType.equals(MimeTypes.PLAIN_TEXT)
+          && retType != null && !retType.equals(magicType)) {
+
         // If magic enabled and the current mime type differs from that of the
         // one returned from the magic, take the magic mimeType
-        type = magicType;
+        retType = magicType;
       }
 
       // if type is STILL null after all the resolution strategies, go for the
       // default type
-      if (type == null) {
+      if (retType == null) {
         try {
-          type = this.mimeTypes.forName(MimeTypes.OCTET_STREAM);
+          retType = MimeTypes.OCTET_STREAM;
         } catch (Exception ignore) {
         }
       }
     }
 
-    return type.getName();
+    return retType;
   }
 
   /**
    * Facade interface to Tika's underlying {@link MimeTypes#getMimeType(String)}
    * method.
-   * 
+   *
    * @param url
    *          A string representation of the document {@link URL} to sense the
    *          {@link MimeType} for.
    * @return An appropriate {@link MimeType}, identified from the given
    *         Document url in string form.
    */
-  public MimeType getMimeType(String url) {
-    return this.mimeTypes.getMimeType(url);
+  public String getMimeType(String url) {
+    return tika.detect(url);
   }
 
   /**
    * A facade interface to Tika's underlying {@link MimeTypes#forName(String)}
    * method.
-   * 
+   *
    * @param name
    *          The name of a valid {@link MimeType} in the Tika mime registry.
    * @return The object representation of the {@link MimeType}, if it exists,
    *         or null otherwise.
    */
-  public MimeType forName(String name) {
+  public String forName(String name) {
     try {
-      return this.mimeTypes.forName(name);
+      return this.mimeTypes.forName(name).toString();
     } catch (MimeTypeException e) {
       LOG.error("Exception getting mime type by name: [" + name
           + "]: Message: " + e.getMessage());
@@ -224,14 +237,21 @@ public final class MimeUtil {
   /**
    * Facade interface to Tika's underlying {@link MimeTypes#getMimeType(File)}
    * method.
-   * 
+   *
    * @param f
    *          The {@link File} to sense the {@link MimeType} for.
    * @return The {@link MimeType} of the given {@link File}, or null if it
    *         cannot be determined.
    */
-  public MimeType getMimeType(File f) {
-    return this.mimeTypes.getMimeType(f);
+  public String getMimeType(File f) {
+    try {
+      return tika.detect(f);
+    } catch (Exception e) {
+      LOG.error("Exception getting mime type for file: [" + f.getPath()
+          + "]: Message: " + e.getMessage());
+      return null;
+    }
   }
 
-}
+
+}
\ No newline at end of file

Modified: nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java?rev=1224916&r1=1224915&r2=1224916&view=diff
==============================================================================
--- nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java (original)
+++ nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java Tue Dec 27 14:36:27 2011
@@ -185,7 +185,7 @@ public class MoreIndexingFilter implemen
    * @return
    */
   private NutchDocument addType(NutchDocument doc, ParseData data, String url) {
-    MimeType mimeType = null;
+    String mimeType = null;
     String contentType = data.getMeta(Response.CONTENT_TYPE);
     if (contentType == null) {
       // Note by Jerome Charron on 20050415:
@@ -209,13 +209,13 @@ public class MoreIndexingFilter implemen
       return doc;
     }
 
-    contentType = mimeType.getName();
+    contentType = mimeType;
 
     doc.add("type", contentType);
 
     // Check if we need to split the content type in sub parts
     if (conf.getBoolean("moreIndexingFilter.indexMimeTypeParts", true)) {
-      String[] parts = getParts(contentType.toString());
+      String[] parts = getParts(contentType);
 
       for(String part: parts) {
         doc.add("type", part);

Modified: nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java?rev=1224916&r1=1224915&r2=1224916&view=diff
==============================================================================
--- nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java (original)
+++ nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java Tue Dec 27 14:36:27 2011
@@ -93,7 +93,7 @@ public class ZipTextExtractor {
         int i = fname.lastIndexOf('.');
         if (i != -1) {
           // Trying to resolve the Mime-Type
-          String contentType = MIME.getMimeType(fname).getName();
+          String contentType = MIME.getMimeType(fname);
           try {
             Metadata metadata = new Metadata();
             metadata.set(Response.CONTENT_LENGTH, Long.toString(entry.getSize()));

Modified: nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java?rev=1224916&r1=1224915&r2=1224916&view=diff
==============================================================================
--- nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java (original)
+++ nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java Tue Dec 27 14:36:27 2011
@@ -33,7 +33,7 @@ import org.apache.nutch.net.protocols.Ht
 import org.apache.nutch.net.protocols.Response;
 
 // Tika imports
-import org.apache.tika.mime.MimeType;
+import org.apache.tika.Tika;
 
 // Hadoop imports
 import org.apache.hadoop.conf.Configuration;
@@ -74,6 +74,7 @@ public class FileResponse {
   private Configuration conf;
 
   private MimeUtil MIME;
+  private Tika tika;
 
   /** Returns the response code. */
   public int getCode() {
@@ -103,6 +104,7 @@ public class FileResponse {
     this.conf = conf;
     
     MIME = new MimeUtil(conf);
+    tika = new Tika();
 
     if (!"file".equals(url.getProtocol()))
       throw new FileException("Not a file url:" + url);
@@ -216,9 +218,9 @@ public class FileResponse {
     headers.set(Response.LAST_MODIFIED,
         HttpDateFormat.toString(f.lastModified()));
 
-    MimeType mimeType = MIME.getMimeType(f);
-    String mimeTypeString = mimeType != null ? mimeType.getName() : "";
-    headers.set(Response.CONTENT_TYPE, mimeTypeString);
+    String mimeType = MIME.getMimeType(f);
+
+    headers.set(Response.CONTENT_TYPE, mimeType != null ? mimeType : "");
 
     // response code
     this.code = 200; // http OK


Reply via email to