Author: snagel
Date: Sun Dec  6 21:14:06 2015
New Revision: 1718223

URL: http://svn.apache.org/viewvc?rev=1718223&view=rev
Log:
NUTCH-2172 index-more: document format of contenttype-mapping.txt

Added:
    nutch/trunk/conf/contenttype-mapping.txt.template
Modified:
    nutch/trunk/CHANGES.txt
    
nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java

Modified: nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1718223&r1=1718222&r2=1718223&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Sun Dec  6 21:14:06 2015
@@ -1,5 +1,7 @@
 Nutch Change Log
-    
+
+* NUTCH-2172 index-more: document format of contenttype-mapping.txt (Nicola Tonellotto, snagel)
+
 Nutch 1.11 Release 03/12/2015 (dd/mm/yyyy)
 Release Report: http://s.apache.org/nutch11
 

Added: nutch/trunk/conf/contenttype-mapping.txt.template
URL: 
http://svn.apache.org/viewvc/nutch/trunk/conf/contenttype-mapping.txt.template?rev=1718223&view=auto
==============================================================================
--- nutch/trunk/conf/contenttype-mapping.txt.template (added)
+++ nutch/trunk/conf/contenttype-mapping.txt.template Sun Dec  6 21:14:06 2015
@@ -0,0 +1,22 @@
+#
+# Mapping of detected content types (MIME types) to custom types (target types)
+# used by the plugin index-more when filling the index field `type'.
+#
+# Note: The mappings defined in this file are only active if the property
+# `moreIndexingFilter.mapMimeTypes' is true.
+#
+# Format (tab-separated plain text, comment lines start with `#'):
+#
+#  <target type> <TAB> <detected type1> [<TAB> <detected type2> ...]
+#
+# Examples (comment in to activate):
+#
+# map XHTML to HTML
+#text/html     application/xhtml+xml
+#
+# Map XHTML and HTML to a custom type "web page"
+#web page      text/html       application/xhtml+xml
+#
+# map various office document formats to a custom type "office document"
+#office document       application/vnd.oasis.opendocument.text application/x-tika-msoffice     application/msword
+#

Modified: 
nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java?rev=1718223&r1=1718222&r2=1718223&view=diff
==============================================================================
--- nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java (original)
+++ nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java Sun Dec  6 21:14:06 2015
@@ -312,10 +312,12 @@ public class MoreIndexingFilter implemen
   }
 
   private void readConfiguration() throws IOException {
+    LOG.info("Reading content type mappings from file contenttype-mapping.txt");
     BufferedReader reader = new BufferedReader(
         conf.getConfResourceAsReader("contenttype-mapping.txt"));
     String line;
     String parts[];
+    boolean formatWarningShown = false;
 
     mimeMap = new HashMap<String, String>();
 
@@ -329,6 +331,12 @@ public class MoreIndexingFilter implemen
           for (int i = 1; i < parts.length; i++) {
             mimeMap.put(parts[i].trim(), parts[0].trim());
           }
+        } else {
+          LOG.warn("Wrong format of line: {}", line);
+          if (!formatWarningShown) {
+            LOG.warn("Expected format: <target type> <tab> <type1> [<tab> <type2> ...]");
+            formatWarningShown = true;
+          }
         }
       }
     }


Reply via email to