Author: markus
Date: Tue Dec 27 14:36:27 2011
New Revision: 1224916
URL: http://svn.apache.org/viewvc?rev=1224916&view=rev
Log:
NUTCH-1230 and NUTCH-1231: Upgrade to Tika 1.0 and use the new Tika detect API
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/ivy/ivy.xml
nutch/trunk/src/java/org/apache/nutch/util/MimeUtil.java
nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java
nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java
Modified: nutch/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1224916&r1=1224915&r2=1224916&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Tue Dec 27 14:36:27 2011
@@ -1,5 +1,9 @@
Nutch Change Log
+* NUTCH-1231 Upgrade to Tika 1.0 (markus)
+
+* NUTCH-1230 MimeType API deprecated and breaks with Tika 1.0 (markus)
+
* NUTCH-1235 Upgrade to new Hadoop 0.20.205.0 (markus)
* NUTCH-1217 Update NOTICE.txt to drop some copyrights (lewismc)
Modified: nutch/trunk/ivy/ivy.xml
URL:
http://svn.apache.org/viewvc/nutch/trunk/ivy/ivy.xml?rev=1224916&r1=1224915&r2=1224916&view=diff
==============================================================================
--- nutch/trunk/ivy/ivy.xml (original)
+++ nutch/trunk/ivy/ivy.xml Tue Dec 27 14:36:27 2011
@@ -57,7 +57,7 @@
</dependency>
<dependency org="com.ibm.icu" name="icu4j" rev="4.0.1" />
- <dependency org="org.apache.tika" name="tika-core" rev="0.10" />
+ <dependency org="org.apache.tika" name="tika-core" rev="1.0" />
<dependency org="org.mortbay.jetty" name="jetty-client"
rev="6.1.22" />
<dependency org="log4j" name="log4j" rev="1.2.15"
conf="*->master" />
Modified: nutch/trunk/src/java/org/apache/nutch/util/MimeUtil.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/MimeUtil.java?rev=1224916&r1=1224915&r2=1224916&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/util/MimeUtil.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/util/MimeUtil.java Tue Dec 27 14:36:27 2011
@@ -24,6 +24,7 @@ import java.io.File;
import org.apache.hadoop.conf.Configuration;
// Tika imports
+import org.apache.tika.Tika;
import org.apache.tika.mime.MimeType;
import org.apache.tika.mime.MimeTypeException;
import org.apache.tika.mime.MimeTypes;
@@ -49,6 +50,9 @@ public final class MimeUtil {
/* our Tika mime type registry */
private MimeTypes mimeTypes;
+ /* the tika detectors */
+ private Tika tika;
+
/* whether or not magic should be employed or not */
private boolean mimeMagic;
@@ -56,6 +60,7 @@ public final class MimeUtil {
private static final Logger LOG =
LoggerFactory.getLogger(MimeUtil.class.getName());
public MimeUtil(Configuration conf) {
+ tika = new Tika();
ObjectCache objectCache = ObjectCache.get(conf);
MimeTypes mimeTypez = (MimeTypes) objectCache.getObject(MimeTypes.class
.getName());
@@ -133,6 +138,8 @@ public final class MimeUtil {
* @return The correctly, automatically guessed {@link MimeType} name.
*/
public String autoResolveContentType(String typeName, String url, byte[]
data) {
+ String retType = null;
+ String magicType = null;
MimeType type = null;
String cleanedMimeType = null;
@@ -161,59 +168,65 @@ public final class MimeUtil {
.getMimeType(url) : type;
}
+ retType= type.getName();
+
// if magic is enabled use mime magic to guess if the mime type returned
// from the magic guess is different than the one that's already set so far
// if it is, and it's not the default mime type, then go with the mime type
// returned by the magic
if (this.mimeMagic) {
- MimeType magicType = this.mimeTypes.getMimeType(data);
- if (magicType != null &&
!magicType.getName().equals(MimeTypes.OCTET_STREAM)
- && !magicType.getName().equals(MimeTypes.PLAIN_TEXT)
- && type != null && !type.getName().equals(magicType.getName())) {
+ magicType = tika.detect(data);
+
+ // Deprecated in Tika 1.0. See https://issues.apache.org/jira/browse/NUTCH-1230
+ //MimeType magicType = this.mimeTypes.getMimeType(data);
+ if (magicType != null && !magicType.equals(MimeTypes.OCTET_STREAM)
+ && !magicType.equals(MimeTypes.PLAIN_TEXT)
+ && retType != null && !retType.equals(magicType)) {
+
// If magic enabled and the current mime type differs from that of the
// one returned from the magic, take the magic mimeType
- type = magicType;
+ retType = magicType;
}
// if type is STILL null after all the resolution strategies, go for the
// default type
- if (type == null) {
+ if (retType == null) {
try {
- type = this.mimeTypes.forName(MimeTypes.OCTET_STREAM);
+ retType = MimeTypes.OCTET_STREAM;
} catch (Exception ignore) {
}
}
}
- return type.getName();
+ return retType;
}
/**
* Facade interface to Tika's underlying {@link MimeTypes#getMimeType(String)}
* method.
- *
+ *
* @param url
* A string representation of the document {@link URL} to sense the
* {@link MimeType} for.
* @return An appropriate {@link MimeType}, identified from the given
* Document url in string form.
*/
- public MimeType getMimeType(String url) {
- return this.mimeTypes.getMimeType(url);
+ public String getMimeType(String url) {
+ return tika.detect(url);
}
/**
* A facade interface to Tika's underlying {@link MimeTypes#forName(String)}
* method.
- *
+ *
* @param name
* The name of a valid {@link MimeType} in the Tika mime registry.
* @return The object representation of the {@link MimeType}, if it exists,
* or null otherwise.
*/
- public MimeType forName(String name) {
+ public String forName(String name) {
try {
- return this.mimeTypes.forName(name);
+ return this.mimeTypes.forName(name).toString();
} catch (MimeTypeException e) {
LOG.error("Exception getting mime type by name: [" + name
+ "]: Message: " + e.getMessage());
@@ -224,14 +237,21 @@ public final class MimeUtil {
/**
* Facade interface to Tika's underlying {@link MimeTypes#getMimeType(File)}
* method.
- *
+ *
* @param f
* The {@link File} to sense the {@link MimeType} for.
* @return The {@link MimeType} of the given {@link File}, or null if it
* cannot be determined.
*/
- public MimeType getMimeType(File f) {
- return this.mimeTypes.getMimeType(f);
+ public String getMimeType(File f) {
+ try {
+ return tika.detect(f);
+ } catch (Exception e) {
+ LOG.error("Exception getting mime type for file: [" + f.getPath()
+ + "]: Message: " + e.getMessage());
+ return null;
+ }
}
-}
+
+}
\ No newline at end of file
Modified:
nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java?rev=1224916&r1=1224915&r2=1224916&view=diff
==============================================================================
--- nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java (original)
+++ nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java Tue Dec 27 14:36:27 2011
@@ -185,7 +185,7 @@ public class MoreIndexingFilter implemen
* @return
*/
private NutchDocument addType(NutchDocument doc, ParseData data, String url)
{
- MimeType mimeType = null;
+ String mimeType = null;
String contentType = data.getMeta(Response.CONTENT_TYPE);
if (contentType == null) {
// Note by Jerome Charron on 20050415:
@@ -209,13 +209,13 @@ public class MoreIndexingFilter implemen
return doc;
}
- contentType = mimeType.getName();
+ contentType = mimeType;
doc.add("type", contentType);
// Check if we need to split the content type in sub parts
if (conf.getBoolean("moreIndexingFilter.indexMimeTypeParts", true)) {
- String[] parts = getParts(contentType.toString());
+ String[] parts = getParts(contentType);
for(String part: parts) {
doc.add("type", part);
Modified:
nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java?rev=1224916&r1=1224915&r2=1224916&view=diff
==============================================================================
--- nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java (original)
+++ nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java Tue Dec 27 14:36:27 2011
@@ -93,7 +93,7 @@ public class ZipTextExtractor {
int i = fname.lastIndexOf('.');
if (i != -1) {
// Trying to resolve the Mime-Type
- String contentType = MIME.getMimeType(fname).getName();
+ String contentType = MIME.getMimeType(fname);
try {
Metadata metadata = new Metadata();
metadata.set(Response.CONTENT_LENGTH,
Long.toString(entry.getSize()));
Modified:
nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java?rev=1224916&r1=1224915&r2=1224916&view=diff
==============================================================================
--- nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java (original)
+++ nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java Tue Dec 27 14:36:27 2011
@@ -33,7 +33,7 @@ import org.apache.nutch.net.protocols.Ht
import org.apache.nutch.net.protocols.Response;
// Tika imports
-import org.apache.tika.mime.MimeType;
+import org.apache.tika.Tika;
// Hadoop imports
import org.apache.hadoop.conf.Configuration;
@@ -74,6 +74,7 @@ public class FileResponse {
private Configuration conf;
private MimeUtil MIME;
+ private Tika tika;
/** Returns the response code. */
public int getCode() {
@@ -103,6 +104,7 @@ public class FileResponse {
this.conf = conf;
MIME = new MimeUtil(conf);
+ tika = new Tika();
if (!"file".equals(url.getProtocol()))
throw new FileException("Not a file url:" + url);
@@ -216,9 +218,9 @@ public class FileResponse {
headers.set(Response.LAST_MODIFIED,
HttpDateFormat.toString(f.lastModified()));
- MimeType mimeType = MIME.getMimeType(f);
- String mimeTypeString = mimeType != null ? mimeType.getName() : "";
- headers.set(Response.CONTENT_TYPE, mimeTypeString);
+ String mimeType = MIME.getMimeType(f);
+
+ headers.set(Response.CONTENT_TYPE, mimeType != null ? mimeType : "");
// response code
this.code = 200; // http OK