Author: markus
Date: Thu Sep 22 14:02:51 2011
New Revision: 1174147

URL: http://svn.apache.org/viewvc?rev=1174147&view=rev
Log:
NUTCH-1115 Option to disable fixing of URL embedded parameters in DOMContentUtils

Modified:
    nutch/branches/branch-1.4/conf/nutch-default.xml
    nutch/branches/branch-1.4/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
    nutch/branches/branch-1.4/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java

Modified: nutch/branches/branch-1.4/conf/nutch-default.xml
URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.4/conf/nutch-default.xml?rev=1174147&r1=1174146&r2=1174147&view=diff
==============================================================================
--- nutch/branches/branch-1.4/conf/nutch-default.xml (original)
+++ nutch/branches/branch-1.4/conf/nutch-default.xml Thu Sep 22 14:02:51 2011
@@ -921,7 +921,6 @@
   "all" doesn't show either content or summaries.</description>
 </property>
 
-
 <property>
   <name>parser.html.impl</name>
   <value>neko</value>
@@ -950,6 +949,13 @@
 </property>
 
 <property>
+  <name>parser.fix.embeddedparams</name>
+  <value>true</value>
+  <description>Whether to fix URL embedded params using semi-colons.
+  See NUTCH-436 and NUTCH-1115</description>
+</property>
+
+<property>
   <name>htmlparsefilter.order</name>
   <value></value>
   <description>The order by which HTMLParse filters are applied.

Modified: nutch/branches/branch-1.4/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.4/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java?rev=1174147&r1=1174146&r2=1174147&view=diff ============================================================================== --- nutch/branches/branch-1.4/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java (original) +++ nutch/branches/branch-1.4/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java Thu Sep 22 14:02:51 2011 @@ -39,6 +39,8 @@ import org.w3c.dom.*; */ public class DOMContentUtils { + private boolean fixEmbeddedParams; + public static class LinkParams { public String elName; public String attrName; @@ -87,6 +89,9 @@ public class DOMContentUtils { if ( ! forceTags.contains(ignoreTags[i]) ) linkParams.remove(ignoreTags[i]); } + + // https://issues.apache.org/jira/browse/NUTCH-1115 + fixEmbeddedParams = conf.getBoolean("parser.fix.embeddedparams", true); } /** @@ -321,7 +326,7 @@ public class DOMContentUtils { // the target contains params information or the base doesn't then no // conversion necessary, return regular URL - if (target.indexOf(';') >= 0 || base.toString().indexOf(';') == -1) { + if (!fixEmbeddedParams || target.indexOf(';') >= 0 || base.toString().indexOf(';') == -1) { return new URL(base, target); } Modified: nutch/branches/branch-1.4/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.4/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java?rev=1174147&r1=1174146&r2=1174147&view=diff ============================================================================== --- nutch/branches/branch-1.4/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java (original) +++ 
nutch/branches/branch-1.4/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java Thu Sep 22 14:02:51 2011 @@ -39,6 +39,8 @@ import org.w3c.dom.NodeList; */ class DOMContentUtils { + private boolean fixEmbeddedParams; + private static class LinkParams { private String elName; private String attrName; @@ -87,6 +89,9 @@ class DOMContentUtils { if ( ! forceTags.contains(ignoreTags[i]) ) linkParams.remove(ignoreTags[i]); } + + // https://issues.apache.org/jira/browse/NUTCH-1115 + fixEmbeddedParams = conf.getBoolean("parser.fix.embeddedparams", true); } /** @@ -318,10 +323,10 @@ class DOMContentUtils { */ private URL fixEmbeddedParams(URL base, String target) throws MalformedURLException{ - + // the target contains params information or the base doesn't then no // conversion necessary, return regular URL - if (target.indexOf(';') >= 0 || base.toString().indexOf(';') == -1) { + if (!fixEmbeddedParams || target.indexOf(';') >= 0 || base.toString().indexOf(';') == -1) { return new URL(base, target); } @@ -340,7 +345,7 @@ class DOMContentUtils { else { target += params; } - + return new URL(base, target); }