MoreIndexingFilter.java

markus Mon, 11 Jun 2012 03:27:07 -0700

Author: markus
Date: Mon Jun 11 10:26:37 2012
New Revision: 1348785

URL: http://svn.apache.org/viewvc?rev=1348785&view=rev
Log:
NUTCH-1262 Map `duplicating` content-types to a single type


Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/conf/nutch-default.xml
    
nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java

Modified: nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1348785&r1=1348784&r2=1348785&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Mon Jun 11 10:26:37 2012
@@ -2,6 +2,8 @@ Nutch Change Log
 
 (trunk) Current Development:
 
+* NUTCH-1262 Map `duplicating` content-types to a single type (markus)
+
 * NUTCH-1384 Typo in ParseSegments's run-method (Matthias Agethle via markus)
 
 * NUTCH-1385 More robust plug-in order properties in nutch-site.xml (Andy Xue 
via markus)

Modified: nutch/trunk/conf/nutch-default.xml
URL: 
http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1348785&r1=1348784&r2=1348785&view=diff
==============================================================================
--- nutch/trunk/conf/nutch-default.xml (original)
+++ nutch/trunk/conf/nutch-default.xml Mon Jun 11 10:26:37 2012
@@ -820,6 +820,16 @@
   </description>
 </property>
 
+<property>
+  <name>moreIndexingFilter.mapMimeTypes</name>
+  <value>false</value>
+  <description>Determines whether MIME-type mapping is enabled. The mapping
+  is read from a plain text file of mapped MIME-types. With it the user can
+  map both application/xhtml+xml and text/html to the same target MIME-type
+  so they are treated equally in an index. See conf/contenttype-mapping.txt.
+  </description>
+</property>
+
 <!-- AnchorIndexing filter plugin properties -->
 
 <property>

Modified: 
nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java?rev=1348785&r1=1348784&r2=1348785&view=diff
==============================================================================
--- 
nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
 (original)
+++ 
nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
 Mon Jun 11 10:26:37 2012
@@ -44,10 +44,16 @@ import org.apache.hadoop.io.Writable;
 import java.text.ParseException;
 import java.text.SimpleDateFormat;
 
+import java.io.BufferedReader;
+import java.io.FileReader;
+import java.io.IOException;
+import java.io.Reader;
+import java.io.StringReader;
 import java.util.Date;
 import java.util.regex.*;
+import java.util.HashMap;
 
-
+import org.apache.commons.lang.StringUtils;
 import org.apache.commons.lang.time.DateUtils;
 
 /**
@@ -72,6 +78,10 @@ public class MoreIndexingFilter implemen
   /** Get the MimeTypes resolver instance. */
   private MimeUtil MIME;
 
+  /** Map for mime-type substitution */
+  private HashMap<String,String> mimeMap = null;
+  private boolean mapMimes = false;
+
   public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
       CrawlDatum datum, Inlinks inlinks) throws IndexingException {
 
@@ -218,6 +228,15 @@ public class MoreIndexingFilter implemen
       return doc;
     }
 
+    // Check if we have to map mime types
+    if (mapMimes) {
+      // Check if the current mime is mapped
+      if (mimeMap.containsKey(mimeType)) {
+        // It's mapped, let's replace it
+        mimeType = mimeMap.get(mimeType);
+      }
+    }
+
     contentType = mimeType;
 
     doc.add("type", contentType);
@@ -289,10 +308,42 @@ public class MoreIndexingFilter implemen
   public void setConf(Configuration conf) {
     this.conf = conf;
     MIME = new MimeUtil(conf);
+
+    if (conf.getBoolean("moreIndexingFilter.mapMimeTypes", false) == true) {
+      mapMimes = true;
+
+      // Load the mapping
+      try {
+        readConfiguration();
+      } catch (Exception e) {
+        LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
+      }
+    }
   }
 
   public Configuration getConf() {
     return this.conf;
   }
 
+  private void readConfiguration() throws IOException {
+    BufferedReader reader = new 
BufferedReader(conf.getConfResourceAsReader("contenttype-mapping.txt"));
+    String line;
+    String parts[];
+
+    mimeMap = new HashMap<String,String>();
+
+    while ((line = reader.readLine()) != null) {
+      if (StringUtils.isNotBlank(line) && !line.startsWith("#")) {
+        line.trim();
+        parts = line.split("\t");
+
+        // Must be at least two parts
+        if (parts.length > 1) {
+          for (int i = 1; i < parts.length; i++) {
+            mimeMap.put(parts[i].trim(), parts[0].trim());
+          }
+        }
+      }
+    }
+  }
 }

svn commit: r1348785 - in /nutch/trunk: CHANGES.txt conf/nutch-default.xml src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java

Reply via email to