Author: jnioche
Date: Mon Jul 18 09:23:34 2011
New Revision: 1147796

URL: http://svn.apache.org/viewvc?rev=1147796&view=rev
Log:
NUTCH-1043 Add pattern for filtering .js in default url filters

Modified:
    nutch/branches/branch-1.4/CHANGES.txt
    nutch/branches/branch-1.4/conf/automaton-urlfilter.txt.template
    nutch/branches/branch-1.4/conf/regex-urlfilter.txt.template

Modified: nutch/branches/branch-1.4/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.4/CHANGES.txt?rev=1147796&r1=1147795&r2=1147796&view=diff
==============================================================================
--- nutch/branches/branch-1.4/CHANGES.txt (original)
+++ nutch/branches/branch-1.4/CHANGES.txt Mon Jul 18 09:23:34 2011
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Release 1.4 - Current development
 
+* NUTCH-1043 Add pattern for filtering .js in default url filters (jnioche)
+
 * NUTCH-1054 LinkDB optional during indexing (jnioche)
 
 * NUTCH-1029 Readdb throws EOFException (markus)

Modified: nutch/branches/branch-1.4/conf/automaton-urlfilter.txt.template
URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.4/conf/automaton-urlfilter.txt.template?rev=1147796&r1=1147795&r2=1147796&view=diff
==============================================================================
--- nutch/branches/branch-1.4/conf/automaton-urlfilter.txt.template (original)
+++ nutch/branches/branch-1.4/conf/automaton-urlfilter.txt.template Mon Jul 18 09:23:34 2011
@@ -25,7 +25,8 @@
 -(file|ftp|mailto):.*
 
 # skip image and other suffixes we can't yet parse
--.*\.(gif|GIF|jpg|JPG|ico|ICO|css|sit|eps|wmf|zip|ppt|mpg|xls|gz|rpm|tgz|mov|MOV|exe)
+# for a more extensive coverage use the urlfilter-suffix plugin
+-.*\.(gif|GIF|jpg|JPG|png|PNG|ico|ICO|css|CSS|sit|SIT|eps|EPS|wmf|WMF|zip|ZIP|ppt|PPT|mpg|MPG|xls|XLS|gz|GZ|rpm|RPM|tgz|TGZ|mov|MOV|exe|EXE|jpeg|JPEG|bmp|BMP|js|JS)
 
 # skip URLs containing certain characters as probable queries, etc.
 -.*[?*!@=].*

Modified: nutch/branches/branch-1.4/conf/regex-urlfilter.txt.template
URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.4/conf/regex-urlfilter.txt.template?rev=1147796&r1=1147795&r2=1147796&view=diff
==============================================================================
--- nutch/branches/branch-1.4/conf/regex-urlfilter.txt.template (original)
+++ nutch/branches/branch-1.4/conf/regex-urlfilter.txt.template Mon Jul 18 09:23:34 2011
@@ -26,7 +26,8 @@
 -^(file|ftp|mailto):
 
 # skip image and other suffixes we can't yet parse
--\.(gif|GIF|jpg|JPG|png|PNG|ico|ICO|css|sit|eps|wmf|zip|ppt|mpg|xls|gz|rpm|tgz|mov|MOV|exe|jpeg|JPEG|bmp|BMP)$
+# for a more extensive coverage use the urlfilter-suffix plugin
+-\.(gif|GIF|jpg|JPG|png|PNG|ico|ICO|css|CSS|sit|SIT|eps|EPS|wmf|WMF|zip|ZIP|ppt|PPT|mpg|MPG|xls|XLS|gz|GZ|rpm|RPM|tgz|TGZ|mov|MOV|exe|EXE|jpeg|JPEG|bmp|BMP|js|JS)$
 
 # skip URLs containing certain characters as probable queries, etc.
 -[?*!@=]


Reply via email to