Author: lewismc Date: Wed Oct 12 18:22:20 2011 New Revision: 1182506 URL: http://svn.apache.org/viewvc?rev=1182506&view=rev Log: Commit to address NUTCH-1097 and update CHANGES.txt
Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/java/org/apache/nutch/parse/ParserFactory.java nutch/trunk/src/plugin/parse-html/plugin.xml Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1182506&r1=1182505&r2=1182506&view=diff ============================================================================== --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Wed Oct 12 18:22:20 2011 @@ -2,6 +2,8 @@ Nutch Change Log Release 1.4 - Current development +* NUTCH-1097 application/xhtml+xml should be enabled in plugin.xml of parse-html; allow multiple mimetypes for plugin.xml (Ferdy via lewismc) + * NUTCH-797 Fix parse-tika and parse-html to use relative URL resolution per RFC-3986 (Robert Hohman, ab) Modified: nutch/trunk/src/java/org/apache/nutch/parse/ParserFactory.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/parse/ParserFactory.java?rev=1182506&r1=1182505&r2=1182506&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/parse/ParserFactory.java (original) +++ nutch/trunk/src/java/org/apache/nutch/parse/ParserFactory.java Wed Oct 12 18:22:20 2011 @@ -343,14 +343,13 @@ public final class ParserFactory { // NotMappedParserException for (int i=0; i<extensions.length; i++) { - if (extensions[i].getAttribute("contentType") != null - && extensions[i].getAttribute("contentType").equals( - contentType)) { - extList.add(extensions[i]); - } - else if ("*".equals(extensions[i].getAttribute("contentType"))){ + if ("*".equals(extensions[i].getAttribute("contentType"))){ extList.add(0, extensions[i]); } + else if (extensions[i].getAttribute("contentType") != null + && contentType.matches(escapeContentType(extensions[i].getAttribute("contentType")))) { + extList.add(extensions[i]); + } } if (extList.size() > 0) { @@ -377,10 +376,18 @@ public final class ParserFactory { return (extList.size() > 0) ? 
extList : null; } + + private String escapeContentType(String contentType) { + // Escapes contentType in order to use as a regex + // (and keep backwards compatibility). + // This enables to accept multiple types for a single parser. + return contentType.replace("+", "\\+").replace(".", "\\."); + } private boolean match(Extension extension, String id, String type) { return ((id.equals(extension.getId())) && - (type.equals(extension.getAttribute("contentType")) || extension.getAttribute("contentType").equals("*") || + (extension.getAttribute("contentType").equals("*") || + type.matches(escapeContentType(extension.getAttribute("contentType"))) || type.equals(DEFAULT_PLUGIN))); } Modified: nutch/trunk/src/plugin/parse-html/plugin.xml URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-html/plugin.xml?rev=1182506&r1=1182505&r2=1182506&view=diff ============================================================================== --- nutch/trunk/src/plugin/parse-html/plugin.xml (original) +++ nutch/trunk/src/plugin/parse-html/plugin.xml Wed Oct 12 18:22:20 2011 @@ -39,7 +39,7 @@ <implementation id="org.apache.nutch.parse.html.HtmlParser" class="org.apache.nutch.parse.html.HtmlParser"> - <parameter name="contentType" value="text/html"/> + <parameter name="contentType" value="text/html|application/xhtml+xml"/> <parameter name="pathSuffix" value=""/> </implementation>