Author: ben
Date: 2007-11-17 20:22:11 -0800 (Sat, 17 Nov 2007)
New Revision: 7311
Modified:
sandbox/ben/ruby/linkchecker.rb
Log:
Started adding control over whether external links are spidered (SPIDER_EXTERNAL_PAGES).
Modified: sandbox/ben/ruby/linkchecker.rb
===================================================================
--- sandbox/ben/ruby/linkchecker.rb 2007-11-18 03:48:16 UTC (rev 7310)
+++ sandbox/ben/ruby/linkchecker.rb 2007-11-18 04:22:11 UTC (rev 7311)
@@ -7,6 +7,8 @@
LASZLO_SERVER = "localhost"
LASZLO_PORT = 8080
CHECK_EXTERNAL_LINKS = true
+VERBOSE = true
+SPIDER_EXTERNAL_PAGES = false
report_broken_links = true
@@ -47,13 +49,17 @@
end
@response.body.scan( href_regexp ) { | m |
link = Link.new( @url, m[0])
- urls.push( link.absolute_url )
+ if (link.target_is_local? or SPIDER_EXTERNAL_PAGES) then
+ urls.push( link.absolute_url )
+ end
}
urls.uniq.sort
end
def is_local?
- (@url.slice(0,16) == "http://localhost") or (@url.slice(0..4) != "http:")
+ u = URI.parse(@url)
+ u.host == "localhost"
+ # (@url.slice(0,16) == "http://localhost") or (@url.slice(0..4) != "http:")
end
end
@@ -64,6 +70,16 @@
@target = target
end
+ def source_is_local?
+ u = URI.parse(@source)
+ u.host == "localhost"
+ end
+
+ def target_is_local?
+ u = URI.parse(absolute_url)
+ u.host == "localhost"
+ end
+
def absolute_url
if @target =~ %r{http:} then
return @target
@@ -73,7 +89,6 @@
# match up to the last slash
source_uri = URI.parse( @source )
# assume scheme is HTTP
-
relative_url = ""
if source_uri.path =~ %r{(.*)/[\w+-_.]*.html} then
@@ -85,7 +100,6 @@
relative_url += source_uri.path
end
-
monged_target = @target
while monged_target =~ %r{^\.\./(\S*)} do
monged_target = $1
@@ -126,7 +140,7 @@
if p.is_ok? then
@@urls_that_exist.push( url )
else
- @@missing_urls.push( url + "-error-" + p.response.code )
+ @@missing_urls.push( url + "[Error " + p.response.code + ": " + p.response.message + "]")
end
filenames = p.find_urls
@@files_to_check.concat( filenames )
@@ -134,7 +148,7 @@
def LinkChecker.check_next_file
url = @@files_to_check.pop
- puts "check_next_file about to check #{url}"
+ puts "check_next_file about to check #{url}" if VERBOSE
@@files_weve_checked.push( url )
page = HTMLPage.new( url )
if page.is_ok? then
@@ -145,7 +159,7 @@
unique_new_urls = urls.uniq - @@files_to_check - @@files_weve_checked;
@@files_to_check.concat( unique_new_urls )
else
- @@missing_urls.push( url + "-error-" + page.response.code )
+ @@missing_urls.push( url + " [Error " + page.response.code + ": " + page.response.message + "]" )
end
@@files_weve_checked.uniq!
@@ -166,10 +180,10 @@
end
def LinkChecker.generate_report
- "LinkChecker report****\n" +
- "\n\nUnique urls we checked: #{@@files_weve_checked.length}\n(?) " + @@files_weve_checked.join("\n(?) ") +
- "\n\nGood urls we checked: #{@@urls_that_exist.length} \n(OK) " + @@urls_that_exist.join("\n(OK) ") +
- "\n\nBroken links found: #{@@missing_urls.length}\n(-) " + @@missing_urls.join("\n(-) ")
+ "LinkChecker report****\n" +
+ # "\n\nUnique urls we checked: #{@@files_weve_checked.length}\n(?) " + @@files_weve_checked.join("\n(?) ") +
+ # "\n\nGood urls we checked: #{@@urls_that_exist.length} \n(OK) " + @@urls_that_exist.join("\n(OK) ") +
+ "\n\nBroken links found: #{@@missing_urls.length}\n(x) " + @@missing_urls.join("\n(x) ")
end
@@ -180,11 +194,13 @@
l = Link.new( "http://localhost:8080/trunk", "/docs/index.html" );
assert_not_nil l
assert_equal( "http://localhost:8080/trunk/docs/index.html",
l.absolute_url )
+ assert( l.target_is_local? )
l = Link.new( "foo", "bar")
assert_not_nil l
l = Link.new( "http://www.cnn.com", "grannymouse" )
assert_not_nil l
assert_equal( "http://www.cnn.com/grannymouse", l.absolute_url)
+ assert( ! l.target_is_local? )
end
def test_absolutify
@@ -273,13 +289,14 @@
num_checked_so_far = LinkChecker.num_files_checked;
i = 0;
- while( next_file_to_check = LinkChecker.next_url_to_check and i < 30 )
+ while( next_file_to_check = LinkChecker.next_url_to_check and i < 220 )
assert_not_nil( next_file_to_check )
LinkChecker.check_next_file
num_checked_now = LinkChecker.num_files_checked
assert( num_checked_now >= num_checked_so_far )
num_checked_so_far = num_checked_now
i += 1
+ if ( i % 30 == 0 ) then puts "Checked #{i} files..." end
end
puts "done. Checked #{i} files."
_______________________________________________
Laszlo-checkins mailing list
[email protected]
http://www.openlaszlo.org/mailman/listinfo/laszlo-checkins