Author: snagel
Date: Wed Oct 10 21:06:27 2012
New Revision: 1396796
URL: http://svn.apache.org/viewvc?rev=1396796&view=rev
Log:
NUTCH-706 Url regex normalizer: pattern for session id removal not to match
"newsId"
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/conf/regex-normalize.xml.template
nutch/trunk/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.test
nutch/trunk/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.xml
Modified: nutch/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1396796&r1=1396795&r2=1396796&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Wed Oct 10 21:06:27 2012
@@ -2,6 +2,8 @@ Nutch Change Log
(trunk) Current Development:
+* NUTCH-706 Url regex normalizer: pattern for session id removal not to match "newsId" (Meghna Kukreja via snagel)
+
* NUTCH-1415 release packages to contain top level folder apache-nutch-x.x
(snagel)
* NUTCH-1441 AnchorIndexingFilter should use plain HashSet (ferdy via lewismc)
Modified: nutch/trunk/conf/regex-normalize.xml.template
URL:
http://svn.apache.org/viewvc/nutch/trunk/conf/regex-normalize.xml.template?rev=1396796&r1=1396795&r2=1396796&view=diff
==============================================================================
--- nutch/trunk/conf/regex-normalize.xml.template (original)
+++ nutch/trunk/conf/regex-normalize.xml.template Wed Oct 10 21:06:27 2012
@@ -29,7 +29,7 @@
<!-- removes session ids from urls (such as jsessionid and PHPSESSID) -->
<regex>
- <pattern>([;_]?((?i)l|j|bv_)?((?i)sid|phpsessid|sessionid)=.*?)(\?|&amp;|#|$)</pattern>
+ <pattern>([;_]?\b((?i)l|j|bv_)?((?i)sid|phpsessid|sessionid)=.*?)(\?|&amp;|#|$)</pattern>
<substitution>$4</substitution>
</regex>
Modified:
nutch/trunk/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.test
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.test?rev=1396796&r1=1396795&r2=1396796&view=diff
==============================================================================
---
nutch/trunk/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.test
(original)
+++
nutch/trunk/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.test
Wed Oct 10 21:06:27 2012
@@ -11,6 +11,8 @@ http://www.foo.com/foo.html;jsessionid=1
http://www.foo.com/foo.html?param=1&another=2;jsessionid=1E6FEC0D14D044541DD84D2D013D29ED http://www.foo.com/foo.html?param=1&another=2
http://www.foo.com/foo.html;jsessionid=1E6FEC0D14D044541DD84D2D013D29ED?param=1&another=2 http://www.foo.com/foo.html?param=1&another=2
http://www.foo.com/foo.php?&x=1&sid=xyz&something=1 http://www.foo.com/foo.php?x=1&something=1
+# but NewsId is not a session id (NUTCH-706, NUTCH-1328)
+http://www.foo.com/fa/newsdetail.aspx?NewsID=1567539 http://www.foo.com/fa/newsdetail.aspx?NewsID=1567539
# test removal default pages
http://www.foo.com/home/index.html http://www.foo.com/home/
Modified:
nutch/trunk/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.xml
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.xml?rev=1396796&r1=1396795&r2=1396796&view=diff
==============================================================================
---
nutch/trunk/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.xml
(original)
+++
nutch/trunk/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.xml
Wed Oct 10 21:06:27 2012
@@ -13,7 +13,7 @@
<!-- removes session ids from urls (such as jsessionid and PHPSESSID) -->
<regex>
- <pattern>([;_]?((?i)l|j|bv_)?((?i)sid|phpsessid|sessionid)=.*?)(\?|&amp;|#|$)</pattern>
+ <pattern>([;_]?\b((?i)l|j|bv_)?((?i)sid|phpsessid|sessionid)=.*?)(\?|&amp;|#|$)</pattern>
<substitution>$4</substitution>
</regex>