Author: lewismc
Date: Wed Oct 12 18:22:20 2011
New Revision: 1182506

URL: http://svn.apache.org/viewvc?rev=1182506&view=rev
Log:
commit to address NUTCH-1097 and update to changes.txt

Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/src/java/org/apache/nutch/parse/ParserFactory.java
    nutch/trunk/src/plugin/parse-html/plugin.xml

Modified: nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1182506&r1=1182505&r2=1182506&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Wed Oct 12 18:22:20 2011
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Release 1.4 - Current development
 
+* NUTCH-1097 application/xhtml+xml should be enabled in plugin.xml of parse-html; allow multiple mimetypes for plugin.xml (Ferdy via lewismc)
+
 * NUTCH-797 Fix parse-tika and parse-html to use relative URL resolution per RFC-3986
   (Robert Hohman, ab)
 

Modified: nutch/trunk/src/java/org/apache/nutch/parse/ParserFactory.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/parse/ParserFactory.java?rev=1182506&r1=1182505&r2=1182506&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/parse/ParserFactory.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/parse/ParserFactory.java Wed Oct 12 18:22:20 2011
@@ -343,14 +343,13 @@ public final class ParserFactory {
       // NotMappedParserException
       
       for (int i=0; i<extensions.length; i++) {
-        if (extensions[i].getAttribute("contentType") != null
-            && extensions[i].getAttribute("contentType").equals(
-                contentType)) {
-          extList.add(extensions[i]);
-        }
-        else if ("*".equals(extensions[i].getAttribute("contentType"))){
+       if ("*".equals(extensions[i].getAttribute("contentType"))){
           extList.add(0, extensions[i]);
         }
+        else if (extensions[i].getAttribute("contentType") != null
+            && contentType.matches(escapeContentType(extensions[i].getAttribute("contentType")))) {
+          extList.add(extensions[i]);
+        }
       }
       
       if (extList.size() > 0) {
@@ -377,10 +376,18 @@ public final class ParserFactory {
     
     return (extList.size() > 0) ? extList : null;
   }
+  
+  private String escapeContentType(String contentType) {
+       // Escapes contentType in order to use as a regex 
+       // (and keep backwards compatibility).
+       // This enables to accept multiple types for a single parser. 
+       return contentType.replace("+", "\\+").replace(".", "\\.");
+       }
 
   private boolean match(Extension extension, String id, String type) {
     return ((id.equals(extension.getId())) &&
-            (type.equals(extension.getAttribute("contentType")) || extension.getAttribute("contentType").equals("*") ||
+            (extension.getAttribute("contentType").equals("*") || 
+             type.matches(escapeContentType(extension.getAttribute("contentType"))) ||
              type.equals(DEFAULT_PLUGIN)));
   }
   

Modified: nutch/trunk/src/plugin/parse-html/plugin.xml
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-html/plugin.xml?rev=1182506&r1=1182505&r2=1182506&view=diff
==============================================================================
--- nutch/trunk/src/plugin/parse-html/plugin.xml (original)
+++ nutch/trunk/src/plugin/parse-html/plugin.xml Wed Oct 12 18:22:20 2011
@@ -39,7 +39,7 @@
 
       <implementation id="org.apache.nutch.parse.html.HtmlParser"
                       class="org.apache.nutch.parse.html.HtmlParser">
-        <parameter name="contentType" value="text/html"/>
+        <parameter name="contentType" value="text/html|application/xhtml+xml"/>
         <parameter name="pathSuffix" value=""/>
       </implementation>
 


Reply via email to