Author: snagel
Date: Wed Oct 10 21:05:19 2012
New Revision: 1396795

URL: http://svn.apache.org/viewvc?rev=1396795&view=rev
Log:
NUTCH-706 Url regex normalizer: pattern for session id removal not to match "newsId"

Modified:
    nutch/branches/2.x/CHANGES.txt
    nutch/branches/2.x/conf/regex-normalize.xml.template
    nutch/branches/2.x/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.test
    nutch/branches/2.x/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.xml

Modified: nutch/branches/2.x/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1396795&r1=1396794&r2=1396795&view=diff
==============================================================================
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Wed Oct 10 21:05:19 2012
@@ -2,7 +2,7 @@ Nutch Change Log
 
 Release 2.2 - Current Development
 
-* NUTCH-
+* NUTCH-706 Url regex normalizer: pattern for session id removal not to match "newsId" (Meghna Kukreja via snagel)
 
 Release 2.1 (19/09/2012) ddmmyyyy
 Full Jira Report - 
https://issues.apache.org/jira/secure/ReleaseNote.jspa?projectId=10680&version=12321040

Modified: nutch/branches/2.x/conf/regex-normalize.xml.template
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/conf/regex-normalize.xml.template?rev=1396795&r1=1396794&r2=1396795&view=diff
==============================================================================
--- nutch/branches/2.x/conf/regex-normalize.xml.template (original)
+++ nutch/branches/2.x/conf/regex-normalize.xml.template Wed Oct 10 21:05:19 
2012
@@ -29,7 +29,7 @@
 
 <!-- removes session ids from urls (such as jsessionid and PHPSESSID) -->
 <regex>
-  <pattern>([;_]?((?i)l|j|bv_)?((?i)sid|phpsessid|sessionid)=.*?)(\?|&amp;|#|$)</pattern>
+  <pattern>([;_]?\b((?i)l|j|bv_)?((?i)sid|phpsessid|sessionid)=.*?)(\?|&amp;|#|$)</pattern>
   <substitution>$4</substitution>
 </regex>
 

Modified: 
nutch/branches/2.x/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.test
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.test?rev=1396795&r1=1396794&r2=1396795&view=diff
==============================================================================
--- 
nutch/branches/2.x/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.test
 (original)
+++ 
nutch/branches/2.x/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.test
 Wed Oct 10 21:05:19 2012
@@ -11,6 +11,8 @@ http://www.foo.com/foo.html;jsessionid=1
 
http://www.foo.com/foo.html?param=1&another=2;jsessionid=1E6FEC0D14D044541DD84D2D013D29ED
 http://www.foo.com/foo.html?param=1&another=2
 
http://www.foo.com/foo.html;jsessionid=1E6FEC0D14D044541DD84D2D013D29ED?param=1&another=2
 http://www.foo.com/foo.html?param=1&another=2
 http://www.foo.com/foo.php?&x=1&sid=xyz&something=1 
http://www.foo.com/foo.php?x=1&something=1
+# but NewsId is not a session id (NUTCH-706, NUTCH-1328)
+http://www.foo.com/fa/newsdetail.aspx?NewsID=1567539 
http://www.foo.com/fa/newsdetail.aspx?NewsID=1567539
 
 # test removal default pages
 http://www.foo.com/home/index.html http://www.foo.com/home/

Modified: 
nutch/branches/2.x/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.xml
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.xml?rev=1396795&r1=1396794&r2=1396795&view=diff
==============================================================================
--- 
nutch/branches/2.x/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.xml
 (original)
+++ 
nutch/branches/2.x/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.xml
 Wed Oct 10 21:05:19 2012
@@ -13,7 +13,7 @@
 
 <!-- removes session ids from urls (such as jsessionid and PHPSESSID) -->
 <regex>
-  <pattern>([;_]?((?i)l|j|bv_)?((?i)sid|phpsessid|sessionid)=.*?)(\?|&amp;|#|$)</pattern>
+  <pattern>([;_]?\b((?i)l|j|bv_)?((?i)sid|phpsessid|sessionid)=.*?)(\?|&amp;|#|$)</pattern>
   <substitution>$4</substitution>
 </regex>
 


Reply via email to