Author: snagel Date: Sun Dec 6 21:14:06 2015 New Revision: 1718223 URL: http://svn.apache.org/viewvc?rev=1718223&view=rev Log: NUTCH-2172 index-more: document format of contenttype-mapping.txt
Added: nutch/trunk/conf/contenttype-mapping.txt.template Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1718223&r1=1718222&r2=1718223&view=diff ============================================================================== --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Sun Dec 6 21:14:06 2015 @@ -1,5 +1,7 @@ Nutch Change Log - + +* NUTCH-2172 index-more: document format of contenttype-mapping.txt (Nicola Tonellotto, snagel) + Nutch 1.11 Release 03/12/2015 (dd/mm/yyyy) Release Report: http://s.apache.org/nutch11 Added: nutch/trunk/conf/contenttype-mapping.txt.template URL: http://svn.apache.org/viewvc/nutch/trunk/conf/contenttype-mapping.txt.template?rev=1718223&view=auto ============================================================================== --- nutch/trunk/conf/contenttype-mapping.txt.template (added) +++ nutch/trunk/conf/contenttype-mapping.txt.template Sun Dec 6 21:14:06 2015 @@ -0,0 +1,22 @@ +# +# Mapping of detected content types (MIME types) to custom types (target types) +# used by the plugin index-more when filling the index field `type'. +# +# Note: The mappings defined in this file are only active if the property +# `moreIndexingFilter.mapMimeTypes' is true. +# +# Format (tab-separated plain text, comment lines start with `#'): +# +# <target type> <TAB> <detected type1> [<TAB> <detected type2> ...] 
+# +# Examples (uncomment to activate): +# +# map XHTML to HTML +#text/html application/xhtml+xml +# +# Map XHTML and HTML to a custom type "web page" +#web page text/html application/xhtml+xml +# +# map various office document formats to a custom type "office document" +#office document application/vnd.oasis.opendocument.text application/x-tika-msoffice application/msword +# Modified: nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java?rev=1718223&r1=1718222&r2=1718223&view=diff ============================================================================== --- nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java (original) +++ nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java Sun Dec 6 21:14:06 2015 @@ -312,10 +312,12 @@ public class MoreIndexingFilter implemen } private void readConfiguration() throws IOException { + LOG.info("Reading content type mappings from file contenttype-mapping.txt"); BufferedReader reader = new BufferedReader( conf.getConfResourceAsReader("contenttype-mapping.txt")); String line; String parts[]; + boolean formatWarningShown = false; mimeMap = new HashMap<String, String>(); @@ -329,6 +331,12 @@ public class MoreIndexingFilter implemen for (int i = 1; i < parts.length; i++) { mimeMap.put(parts[i].trim(), parts[0].trim()); } + } else { + LOG.warn("Wrong format of line: {}", line); + if (!formatWarningShown) { + LOG.warn("Expected format: <target type> <tab> <type1> [<tab> <type2> ...]"); + formatWarningShown = true; + } } } }