Author: markus
Date: Mon Jun 11 10:26:37 2012
New Revision: 1348785
URL: http://svn.apache.org/viewvc?rev=1348785&view=rev
Log:
NUTCH-1262 Map `duplicating` content-types to a single type
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/conf/nutch-default.xml
nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
Modified: nutch/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1348785&r1=1348784&r2=1348785&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Mon Jun 11 10:26:37 2012
@@ -2,6 +2,8 @@ Nutch Change Log
(trunk) Current Development:
+* NUTCH-1262 Map `duplicating` content-types to a single type (markus)
+
* NUTCH-1384 Typo in ParseSegments's run-method (Matthias Agethle via markus)
* NUTCH-1385 More robust plug-in order properties in nutch-site.xml (Andy Xue
via markus)
Modified: nutch/trunk/conf/nutch-default.xml
URL:
http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1348785&r1=1348784&r2=1348785&view=diff
==============================================================================
--- nutch/trunk/conf/nutch-default.xml (original)
+++ nutch/trunk/conf/nutch-default.xml Mon Jun 11 10:26:37 2012
@@ -820,6 +820,16 @@
</description>
</property>
+<property>
+ <name>moreIndexingFilter.mapMimeTypes</name>
+ <value>false</value>
+ <description>Determines whether MIME-type mapping is enabled. When enabled,
+ the mappings are read from a plain text file, so the user can map both
+ application/xhtml+xml and text/html to the same target MIME-type and have
+ them treated equally in an index. See conf/contenttype-mapping.txt.
+ </description>
+</property>
+
<!-- AnchorIndexing filter plugin properties -->
<property>
Modified:
nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java?rev=1348785&r1=1348784&r2=1348785&view=diff
==============================================================================
---
nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
(original)
+++
nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
Mon Jun 11 10:26:37 2012
@@ -44,10 +44,16 @@ import org.apache.hadoop.io.Writable;
import java.text.ParseException;
import java.text.SimpleDateFormat;
+import java.io.BufferedReader;
+import java.io.FileReader;
+import java.io.IOException;
+import java.io.Reader;
+import java.io.StringReader;
import java.util.Date;
import java.util.regex.*;
+import java.util.HashMap;
-
+import org.apache.commons.lang.StringUtils;
import org.apache.commons.lang.time.DateUtils;
/**
@@ -72,6 +78,10 @@ public class MoreIndexingFilter implemen
/** Get the MimeTypes resolver instance. */
private MimeUtil MIME;
+ /** Map for mime-type substitution */
+ private HashMap<String,String> mimeMap = null;
+ private boolean mapMimes = false;
+
public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
CrawlDatum datum, Inlinks inlinks) throws IndexingException {
@@ -218,6 +228,15 @@ public class MoreIndexingFilter implemen
return doc;
}
+ // Check if we have to map mime types
+ if (mapMimes) {
+ // Check if the current mime is mapped
+ if (mimeMap.containsKey(mimeType)) {
+ // It's mapped, let's replace it
+ mimeType = mimeMap.get(mimeType);
+ }
+ }
+
contentType = mimeType;
doc.add("type", contentType);
@@ -289,10 +308,42 @@ public class MoreIndexingFilter implemen
public void setConf(Configuration conf) {
this.conf = conf;
MIME = new MimeUtil(conf);
+
+ if (conf.getBoolean("moreIndexingFilter.mapMimeTypes", false) == true) {
+ mapMimes = true;
+
+ // Load the mapping
+ try {
+ readConfiguration();
+ } catch (Exception e) {
+ LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
+ }
+ }
}
+  /**
+   * Returns the configuration previously stored in this filter's
+   * <code>conf</code> field.
+   *
+   * @return the current value of the <code>conf</code> field
+   */
public Configuration getConf() {
return this.conf;
}
+  /**
+   * Loads the MIME-type substitution table from the
+   * <code>contenttype-mapping.txt</code> configuration resource into
+   * <code>mimeMap</code>.
+   * <p>
+   * Blank lines and lines starting with <code>#</code> are skipped. Every
+   * other line is split on tab characters: the first field is the target
+   * type and each following non-empty field is a type that gets replaced by
+   * that target.
+   *
+   * @throws IOException if the resource cannot be found or read
+   */
+  private void readConfiguration() throws IOException {
+    // Install an empty map before touching the resource so that lookups on
+    // mimeMap stay safe even when loading fails part-way.
+    mimeMap = new HashMap<String,String>();
+
+    Reader resource = conf.getConfResourceAsReader("contenttype-mapping.txt");
+    if (resource == null) {
+      // Report a missing file explicitly instead of failing with a
+      // NullPointerException inside the BufferedReader constructor.
+      throw new IOException("Resource contenttype-mapping.txt not found");
+    }
+
+    BufferedReader reader = new BufferedReader(resource);
+    try {
+      String line;
+      while ((line = reader.readLine()) != null) {
+        // String.trim() returns a new string; keep the result so indented
+        // comments and lines with stray leading tabs are handled correctly.
+        line = line.trim();
+
+        if (StringUtils.isNotBlank(line) && !line.startsWith("#")) {
+          String[] parts = line.split("\t");
+
+          // Must be at least two parts: the target and one type to replace
+          if (parts.length > 1) {
+            String target = parts[0].trim();
+
+            for (int i = 1; i < parts.length; i++) {
+              String source = parts[i].trim();
+
+              // Doubled tabs yield empty fields; never register "" as a key
+              if (source.length() > 0) {
+                mimeMap.put(source, target);
+              }
+            }
+          }
+        }
+      }
+    } finally {
+      // Release the underlying stream whether or not reading succeeded
+      reader.close();
+    }
+  }
}