Author: tejasp
Date: Tue Apr 30 20:51:44 2013
New Revision: 1477821

URL: http://svn.apache.org/r1477821
Log:
NUTCH-1549 Fix deprecated use of Tika MimeType API in o.a.n.util.MimeUtil

Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/src/java/org/apache/nutch/util/MimeUtil.java
    nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
    nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java
    nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java

Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1477821&r1=1477820&r2=1477821&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Tue Apr 30 20:51:44 2013
@@ -2,6 +2,8 @@ Nutch Change Log
 
 (trunk): Current Development
 
+* NUTCH-1549 Fix deprecated use of Tika MimeType API in o.a.n.util.MimeUtil (tejasp)
+
 * NUTCH-346 Improve readability of logs/hadoop.log (Renaud Richardet via tejasp)
 
 * NUTCH-829 duplicate hadoop temp files (Mike Baranczak, lewismc, tejasp)

Modified: nutch/trunk/src/java/org/apache/nutch/util/MimeUtil.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/MimeUtil.java?rev=1477821&r1=1477820&r2=1477821&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/util/MimeUtil.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/util/MimeUtil.java Tue Apr 30 20:51:44 2013
@@ -25,6 +25,7 @@ import org.apache.hadoop.conf.Configurat
 
 // Tika imports
 import org.apache.tika.Tika;
+import org.apache.tika.config.TikaConfig;
 import org.apache.tika.mime.MimeType;
 import org.apache.tika.mime.MimeTypeException;
 import org.apache.tika.mime.MimeTypes;
@@ -169,12 +170,19 @@ public final class MimeUtil {
         || (type != null && type.getName().equals(MimeTypes.OCTET_STREAM))) {
       // If no mime-type header, or cannot find a corresponding registered
       // mime-type, then guess a mime-type from the url pattern
-      type = this.mimeTypes.getMimeType(url) != null ? this.mimeTypes
-          .getMimeType(url) : type;
+      try {
+        TikaConfig tikaConfig = TikaConfig.getDefaultConfig();
+        Tika tika = new Tika(tikaConfig);
+        retType = tika.detect(url) != null ? tika.detect(url) : null;
+      } catch (Exception e) {
+        String message = "Problem loading default Tika configuration";
+        LOG.error(message, e);
+        throw new RuntimeException(e);
+      }
+    } else {
+        retType = type.getName();
     }
 
-    retType= type.getName();
-
     // if magic is enabled use mime magic to guess if the mime type returned
     // from the magic guess is different than the one that's already set so far
     // if it is, and it's not the default mime type, then go with the mime type

Modified: nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java?rev=1477821&r1=1477820&r2=1477821&view=diff
==============================================================================
--- nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java (original)
+++ nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java Tue Apr 30 20:51:44 2013
@@ -16,8 +16,6 @@
  */
 package org.apache.nutch.indexer.more;
 
-import org.apache.tika.mime.MimeType;
-
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -36,19 +34,16 @@ import org.apache.nutch.crawl.CrawlDatum
 import org.apache.nutch.crawl.Inlinks;
 import org.apache.nutch.parse.ParseData;
 import org.apache.nutch.util.MimeUtil;
+import org.apache.tika.Tika;
 
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.io.Writable;
 
 import java.text.ParseException;
-import java.text.SimpleDateFormat;
 
 import java.io.BufferedReader;
-import java.io.FileReader;
 import java.io.IOException;
-import java.io.Reader;
-import java.io.StringReader;
 import java.util.Date;
 import java.util.regex.*;
 import java.util.HashMap;
@@ -72,11 +67,9 @@ import org.apache.commons.lang.time.Date
 public class MoreIndexingFilter implements IndexingFilter {
   public static final Logger LOG = 
LoggerFactory.getLogger(MoreIndexingFilter.class);
 
-  /** A flag that tells if magic resolution must be performed */
-  private boolean MAGIC;
-
   /** Get the MimeTypes resolver instance. */
   private MimeUtil MIME;
+  private Tika tika = new Tika();
 
   /** Map for mime-type substitution */
   private HashMap<String,String> mimeMap = null;
@@ -114,7 +107,6 @@ public class MoreIndexingFilter implemen
 
     // un-stored, indexed and un-tokenized
     doc.add("date", new Date(time));
-
     return doc;
   }
 
@@ -220,7 +212,8 @@ public class MoreIndexingFilter implemen
       // } else {
       //   contentType = MIME.getMimeType(url);
       // }
-      mimeType = MIME.getMimeType(url);
+
+      mimeType = tika.detect(url);
     } else {
       mimeType = MIME.forName(MimeUtil.cleanMimeType(contentType));
     }
@@ -240,7 +233,6 @@ public class MoreIndexingFilter implemen
     }
 
     contentType = mimeType;
-
     doc.add("type", contentType);
 
     // Check if we need to split the content type in sub parts

Modified: nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java?rev=1477821&r1=1477820&r2=1477821&view=diff
==============================================================================
--- nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java (original)
+++ nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java Tue Apr 30 20:51:44 2013
@@ -38,13 +38,10 @@ import org.apache.nutch.net.protocols.Re
 import org.apache.nutch.parse.Parse;
 import org.apache.nutch.parse.ParseData;
 import org.apache.nutch.parse.ParseUtil;
-import org.apache.nutch.parse.ParseImpl;
 import org.apache.nutch.parse.ParseException;
 import org.apache.nutch.parse.Outlink;
 import org.apache.nutch.protocol.Content;
-import org.apache.nutch.util.MimeUtil;
-
-
+import org.apache.tika.Tika;
 
 /**
  *
@@ -52,26 +49,18 @@ import org.apache.nutch.util.MimeUtil;
  */
 public class ZipTextExtractor {
   
-  /** Get the MimeTypes resolver instance. */
-  private MimeUtil MIME;
-  
   public static final Logger LOG = 
LoggerFactory.getLogger(ZipTextExtractor.class);
 
   private Configuration conf;
-  
-  
+
   /** Creates a new instance of ZipTextExtractor */
   public ZipTextExtractor(Configuration conf) {
     this.conf = conf;
-    this.MIME = new MimeUtil(conf);
   }
   
-  public String extractText(InputStream input, String url, List outLinksList) 
throws IOException {
+  public String extractText(InputStream input, String url, List<Outlink> 
outLinksList) throws IOException {
     String resultText = "";
-    byte temp;
-    
     ZipInputStream zin = new ZipInputStream(input);
-    
     ZipEntry entry;
     
     while ((entry = zin.getNextEntry()) != null) {
@@ -93,7 +82,8 @@ public class ZipTextExtractor {
         int i = fname.lastIndexOf('.');
         if (i != -1) {
           // Trying to resolve the Mime-Type
-          String contentType = MIME.getMimeType(fname);
+          Tika tika = new Tika();
+          String contentType = tika.detect(fname);
           try {
             Metadata metadata = new Metadata();
             metadata.set(Response.CONTENT_LENGTH, 
Long.toString(entry.getSize()));

Modified: nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java?rev=1477821&r1=1477820&r2=1477821&view=diff
==============================================================================
--- nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java (original)
+++ nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java Tue Apr 30 20:51:44 2013
@@ -224,7 +224,7 @@ public class FileResponse {
     headers.set(Response.LAST_MODIFIED,
         HttpDateFormat.toString(f.lastModified()));
 
-    String mimeType = MIME.getMimeType(f);
+    String mimeType = tika.detect(f);
 
     headers.set(Response.CONTENT_TYPE, mimeType != null ? mimeType : "");
 


Reply via email to