Author: tejasp
Date: Tue Apr 30 20:51:44 2013
New Revision: 1477821
URL: http://svn.apache.org/r1477821
Log:
NUTCH-1549 Fix deprecated use of Tika MimeType API in o.a.n.util.MimeUtil
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/src/java/org/apache/nutch/util/MimeUtil.java
nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java
nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java
Modified: nutch/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1477821&r1=1477820&r2=1477821&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Tue Apr 30 20:51:44 2013
@@ -2,6 +2,8 @@ Nutch Change Log
(trunk): Current Development
+* NUTCH-1549 Fix deprecated use of Tika MimeType API in o.a.n.util.MimeUtil (tejasp)
+
* NUTCH-346 Improve readability of logs/hadoop.log (Renaud Richardet via tejasp)
* NUTCH-829 duplicate hadoop temp files (Mike Baranczak, lewismc, tejasp)
Modified: nutch/trunk/src/java/org/apache/nutch/util/MimeUtil.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/MimeUtil.java?rev=1477821&r1=1477820&r2=1477821&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/util/MimeUtil.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/util/MimeUtil.java Tue Apr 30 20:51:44 2013
@@ -25,6 +25,7 @@ import org.apache.hadoop.conf.Configurat
// Tika imports
import org.apache.tika.Tika;
+import org.apache.tika.config.TikaConfig;
import org.apache.tika.mime.MimeType;
import org.apache.tika.mime.MimeTypeException;
import org.apache.tika.mime.MimeTypes;
@@ -169,12 +170,19 @@ public final class MimeUtil {
|| (type != null && type.getName().equals(MimeTypes.OCTET_STREAM))) {
// If no mime-type header, or cannot find a corresponding registered
// mime-type, then guess a mime-type from the url pattern
- type = this.mimeTypes.getMimeType(url) != null ? this.mimeTypes
- .getMimeType(url) : type;
+ try {
+ TikaConfig tikaConfig = TikaConfig.getDefaultConfig();
+ Tika tika = new Tika(tikaConfig);
+ retType = tika.detect(url) != null ? tika.detect(url) : null;
+ } catch (Exception e) {
+ String message = "Problem loading default Tika configuration";
+ LOG.error(message, e);
+ throw new RuntimeException(e);
+ }
+ } else {
+ retType = type.getName();
}
- retType= type.getName();
-
// if magic is enabled use mime magic to guess if the mime type returned
// from the magic guess is different than the one that's already set so far
// if it is, and it's not the default mime type, then go with the mime type
Modified: nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java?rev=1477821&r1=1477820&r2=1477821&view=diff
==============================================================================
--- nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java (original)
+++ nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java Tue Apr 30 20:51:44 2013
@@ -16,8 +16,6 @@
*/
package org.apache.nutch.indexer.more;
-import org.apache.tika.mime.MimeType;
-
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -36,19 +34,16 @@ import org.apache.nutch.crawl.CrawlDatum
import org.apache.nutch.crawl.Inlinks;
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.util.MimeUtil;
+import org.apache.tika.Tika;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import java.text.ParseException;
-import java.text.SimpleDateFormat;
import java.io.BufferedReader;
-import java.io.FileReader;
import java.io.IOException;
-import java.io.Reader;
-import java.io.StringReader;
import java.util.Date;
import java.util.regex.*;
import java.util.HashMap;
@@ -72,11 +67,9 @@ import org.apache.commons.lang.time.Date
public class MoreIndexingFilter implements IndexingFilter {
public static final Logger LOG =
LoggerFactory.getLogger(MoreIndexingFilter.class);
- /** A flag that tells if magic resolution must be performed */
- private boolean MAGIC;
-
/** Get the MimeTypes resolver instance. */
private MimeUtil MIME;
+ private Tika tika = new Tika();
/** Map for mime-type substitution */
private HashMap<String,String> mimeMap = null;
@@ -114,7 +107,6 @@ public class MoreIndexingFilter implemen
// un-stored, indexed and un-tokenized
doc.add("date", new Date(time));
-
return doc;
}
@@ -220,7 +212,8 @@ public class MoreIndexingFilter implemen
// } else {
// contentType = MIME.getMimeType(url);
// }
- mimeType = MIME.getMimeType(url);
+
+ mimeType = tika.detect(url);
} else {
mimeType = MIME.forName(MimeUtil.cleanMimeType(contentType));
}
@@ -240,7 +233,6 @@ public class MoreIndexingFilter implemen
}
contentType = mimeType;
-
doc.add("type", contentType);
// Check if we need to split the content type in sub parts
Modified: nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java?rev=1477821&r1=1477820&r2=1477821&view=diff
==============================================================================
--- nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java (original)
+++ nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java Tue Apr 30 20:51:44 2013
@@ -38,13 +38,10 @@ import org.apache.nutch.net.protocols.Re
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.parse.ParseUtil;
-import org.apache.nutch.parse.ParseImpl;
import org.apache.nutch.parse.ParseException;
import org.apache.nutch.parse.Outlink;
import org.apache.nutch.protocol.Content;
-import org.apache.nutch.util.MimeUtil;
-
-
+import org.apache.tika.Tika;
/**
*
@@ -52,26 +49,18 @@ import org.apache.nutch.util.MimeUtil;
*/
public class ZipTextExtractor {
- /** Get the MimeTypes resolver instance. */
- private MimeUtil MIME;
-
public static final Logger LOG =
LoggerFactory.getLogger(ZipTextExtractor.class);
private Configuration conf;
-
-
+
/** Creates a new instance of ZipTextExtractor */
public ZipTextExtractor(Configuration conf) {
this.conf = conf;
- this.MIME = new MimeUtil(conf);
}
- public String extractText(InputStream input, String url, List outLinksList)
throws IOException {
+ public String extractText(InputStream input, String url, List<Outlink>
outLinksList) throws IOException {
String resultText = "";
- byte temp;
-
ZipInputStream zin = new ZipInputStream(input);
-
ZipEntry entry;
while ((entry = zin.getNextEntry()) != null) {
@@ -93,7 +82,8 @@ public class ZipTextExtractor {
int i = fname.lastIndexOf('.');
if (i != -1) {
// Trying to resolve the Mime-Type
- String contentType = MIME.getMimeType(fname);
+ Tika tika = new Tika();
+ String contentType = tika.detect(fname);
try {
Metadata metadata = new Metadata();
metadata.set(Response.CONTENT_LENGTH,
Long.toString(entry.getSize()));
Modified: nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java?rev=1477821&r1=1477820&r2=1477821&view=diff
==============================================================================
--- nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java (original)
+++ nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java Tue Apr 30 20:51:44 2013
@@ -224,7 +224,7 @@ public class FileResponse {
headers.set(Response.LAST_MODIFIED,
HttpDateFormat.toString(f.lastModified()));
- String mimeType = MIME.getMimeType(f);
+ String mimeType = tika.detect(f);
headers.set(Response.CONTENT_TYPE, mimeType != null ? mimeType : "");