Author: snagel Date: Wed Oct 10 21:06:27 2012 New Revision: 1396796 URL: http://svn.apache.org/viewvc?rev=1396796&view=rev Log: NUTCH-706 Url regex normalizer: pattern for session id removal not to match "newsId"
Modified: nutch/trunk/CHANGES.txt nutch/trunk/conf/regex-normalize.xml.template nutch/trunk/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.test nutch/trunk/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.xml Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1396796&r1=1396795&r2=1396796&view=diff ============================================================================== --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Wed Oct 10 21:06:27 2012 @@ -2,6 +2,8 @@ Nutch Change Log (trunk) Current Development: +* NUTCH-706 Url regex normalizer: pattern for session id removal not to match "newsId" (Meghna Kukreja via snagel) + * NUTCH-1415 release packages to contain top level folder apache-nutch-x.x (snagel) * NUTCH-1441 AnchorIndexingFilter should use plain HashSet (ferdy via lewismc) Modified: nutch/trunk/conf/regex-normalize.xml.template URL: http://svn.apache.org/viewvc/nutch/trunk/conf/regex-normalize.xml.template?rev=1396796&r1=1396795&r2=1396796&view=diff ============================================================================== --- nutch/trunk/conf/regex-normalize.xml.template (original) +++ nutch/trunk/conf/regex-normalize.xml.template Wed Oct 10 21:06:27 2012 @@ -29,7 +29,7 @@ <!-- removes session ids from urls (such as jsessionid and PHPSESSID) --> <regex> - <pattern>([;_]?((?i)l|j|bv_)?((?i)sid|phpsessid|sessionid)=.*?)(\?|&amp;|#|$)</pattern> + <pattern>([;_]?\b((?i)l|j|bv_)?((?i)sid|phpsessid|sessionid)=.*?)(\?|&amp;|#|$)</pattern> <substitution>$4</substitution> </regex> Modified: nutch/trunk/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.test URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.test?rev=1396796&r1=1396795&r2=1396796&view=diff ============================================================================== --- 
nutch/trunk/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.test (original) +++ nutch/trunk/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.test Wed Oct 10 21:06:27 2012 @@ -11,6 +11,8 @@ http://www.foo.com/foo.html;jsessionid=1 http://www.foo.com/foo.html?param=1&another=2;jsessionid=1E6FEC0D14D044541DD84D2D013D29ED http://www.foo.com/foo.html?param=1&another=2 http://www.foo.com/foo.html;jsessionid=1E6FEC0D14D044541DD84D2D013D29ED?param=1&another=2 http://www.foo.com/foo.html?param=1&another=2 http://www.foo.com/foo.php?&x=1&sid=xyz&something=1 http://www.foo.com/foo.php?x=1&something=1 +# but NewsId is not a session id (NUTCH-706, NUTCH-1328) +http://www.foo.com/fa/newsdetail.aspx?NewsID=1567539 http://www.foo.com/fa/newsdetail.aspx?NewsID=1567539 # test removal default pages http://www.foo.com/home/index.html http://www.foo.com/home/ Modified: nutch/trunk/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.xml URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.xml?rev=1396796&r1=1396795&r2=1396796&view=diff ============================================================================== --- nutch/trunk/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.xml (original) +++ nutch/trunk/src/plugin/urlnormalizer-regex/sample/regex-normalize-default.xml Wed Oct 10 21:06:27 2012 @@ -13,7 +13,7 @@ <!-- removes session ids from urls (such as jsessionid and PHPSESSID) --> <regex> - <pattern>([;_]?((?i)l|j|bv_)?((?i)sid|phpsessid|sessionid)=.*?)(\?|&amp;|#|$)</pattern> + <pattern>([;_]?\b((?i)l|j|bv_)?((?i)sid|phpsessid|sessionid)=.*?)(\?|&amp;|#|$)</pattern> <substitution>$4</substitution> </regex>