Author: ben
Date: 2007-11-18 23:11:01 -0800 (Sun, 18 Nov 2007)
New Revision: 7315
Added:
sandbox/ben/ruby/spider.rb
Modified:
sandbox/ben/ruby/linkchecker.rb
Log:
Added an AnnotatedURL class to figure out whether a link is local or external,
valid or invalid. Added a new class, Spider, as a refactor of LinkChecker.
Modified: sandbox/ben/ruby/linkchecker.rb
===================================================================
--- sandbox/ben/ruby/linkchecker.rb 2007-11-19 02:31:04 UTC (rev 7314)
+++ sandbox/ben/ruby/linkchecker.rb 2007-11-19 07:11:01 UTC (rev 7315)
@@ -6,35 +6,35 @@
# start at a particular server...
LASZLO_SERVER = "localhost"
LASZLO_PORT = 8080
-CHECK_EXTERNAL_LINKS = true
-VERBOSE = true
-SPIDER_EXTERNAL_PAGES = false
+VERBOSE = false
report_broken_links = true
class HTMLPage
@url
@response
+ @skipped
attr_reader :response
+ attr_reader :skipped
def initialize( url, verbose=false )
@url = url
- if (!CHECK_EXTERNAL_LINKS) then
- if !is_local? then
- puts "skipping #{url}" if verbose
- return
- end
+
+ if !is_local? then
+ @response = nil
+ @skipped = true
+ else
+ puts "trying #{url}..." if verbose
+ @response = Net::HTTP.get_response( URI.parse(url) )
+ @skipped = false
+ puts "ok." if verbose
end
-
- puts "trying #{url}..." if verbose
- @response = Net::HTTP.get_response( URI.parse(url) )
- puts "ok." if verbose
- rescue SystemCallError
- puts "couldn't connect to load url #{url}." if (verbose or report_broken_links)
+ rescue SystemCallError
+ puts "couldn't connect to load url #{url}." if verbose
end
def is_ok?
- (@response and (@response.code =~ %r{[23]\d\d} ) )
+ (not @skipped and (@response and (@response.code =~ %r{[23]\d\d} ) ) )
end
def print_result
@@ -47,60 +47,58 @@
if not is_ok? then
return urls
end
+ # find all hrefs in the source
+ # TODO: also find source="", rel=""
@response.body.scan( href_regexp ) { | m |
link = Link.new( @url, m[0])
- if (link.target_is_local? or SPIDER_EXTERNAL_PAGES) then
- urls.push( link.absolute_url )
- end
+ urls.push( link.absolute_url )
}
urls.uniq.sort
end
def is_local?
u = URI.parse(@url)
- u.host == localhost
- # (@url.slice(0,16) == "http://localhost") or (@url.slice(0..4) != "http:")
+ u.host == "localhost"
end
end
+
class Link
+ attr_reader :source, :target
def initialize( source, target )
- @source = source
- @target = target
+ @source = AnnotatedURL.new( source )
+ @target_string = target
+ @target = AnnotatedURL.new( absolute_url )
end
def source_is_local?
- u = URI.parse(@source)
- u.host == "localhost"
+ @source.is_local?
end
def target_is_local?
- u = URI.parse(absolute_url)
- u.host == "localhost"
+ @target.is_local?
end
def absolute_url
- if @target =~ %r{http:} then
- return @target
+ if @target_string =~ %r{http:} then
+ return @target_string
end
- if @source =~ %r{http:} then
+ if @source.url =~ %r{http:} then
# the source url is absolute, and the target is not, so let's assemble a new absolute url
# match up to the last slash
- source_uri = URI.parse( @source )
- # assume scheme is HTTP
relative_url = ""
- if source_uri.path =~ %r{(.*)/[\w+-_.]*.html} then
+ if @source.uri.path =~ %r{(.*)/[\w+-_.]*.html} then
relative_url += $1 + "/"
else
- if source_uri.path !~ %r{^/}
+ if @source.uri.path !~ %r{^/}
relative_url += "/"
end
- relative_url += source_uri.path
+ relative_url += @source.uri.path
end
- monged_target = @target
+ monged_target = @target_string
while monged_target =~ %r{^\.\./(\S*)} do
monged_target = $1
# puts "relative_url is #{relative_url}"
@@ -118,20 +116,25 @@
relative_url += "/"
end
- relative_url = "http://" + source_uri.host +
- ( source_uri.port != 80 ? ":#{source_uri.port}" : "") + relative_url
+ relative_url = "http://" + @source.uri.host +
+ ( @source.uri.port != 80 ? ":#{@source.uri.port}" : "") + relative_url
return relative_url + monged_target
end
- return "[EMAIL PROTECTED]"
+ return "[EMAIL PROTECTED]"
end
end
+
class LinkChecker
@@files_weve_checked = Array.new
@@files_to_check = Array.new
@@urls_that_exist = Array.new
@@missing_urls = Array.new
+
+ @@verify_outgoing_links = false
+ @@spider_external_pages = false
+ @@verbose = false
def LinkChecker.check_one_file( url_base, filename )
url = url_base + "/" + filename
@@ -148,7 +151,7 @@
def LinkChecker.check_next_file
url = @@files_to_check.pop
- puts "check_next_file about to check #{url}" if VERBOSE
+ puts "check_next_file about to check #{url}" if @@verbose
@@files_weve_checked.push( url )
page = HTMLPage.new( url )
if page.is_ok? then
@@ -158,8 +161,11 @@
# and ignore any urls we're already planning to check
unique_new_urls = urls.uniq - @@files_to_check - @@files_weve_checked;
@@files_to_check.concat( unique_new_urls )
- else
- @@missing_urls.push( url + " [Error " + page.response.code + ": " + page.response.message + "]" )
+ else
+ @@missing_urls.push( url +
+ ( page.skipped ?
+ " [ skipped ] " :
+ " [Error " + page.response.code + ": " + page.response.message + "]" ) )
end
@@files_weve_checked.uniq!
@@ -189,6 +195,122 @@
end
+class AnnotatedURL
+ @url
+ @uri
+ @@total_created = 0
+ @bad
+ attr_reader :uri, :url
+
+ def initialize( url )
+ @url = url
+ @uri = URI.parse( url )
+ @@total_created += 1
+ @bad = false
+ rescue URI::InvalidURIError
+ @bad = true
+ end
+
+ def valid?
+ not @bad
+ end
+
+ def is_local?
+ not @bad and
+ ( @uri.host == "localhost" or @uri.host == "127.0.0.1" )
+ end
+end
+
+class TestAnnotatedURL < Test::Unit::TestCase
+ @@local_good_urls = [
+ "http://localhost:8080/trunk",
+ "http://localhost:8080/trunk/laszlo-explorer/",
+ "http://localhost:8080/trunk/docs/guide/index.html",
+ "http://localhost:8080/trunk/docs/deployers/",
+ ]
+
+ @@local_broken_urls = [
+ "http://localhost/foo",
+ "http://localhost:8121/foo",
+ "http://localhost:8080/bananabanana",
+ "http://localhost:8080/bananabanana/foo.html",
+ "http://127.0.0.1/bar/baz",
+ ]
+
+ @@remote_good_urls = [
+ "http://www.laszlosystems.com/" ,
+ "http://forum.openlaszlo.org",
+ "http://www.cnn.com",
+ "http://www.technorati.com"
+ ]
+
+ @@remote_broken_urls = [
+ "http://nononoidontexist.banana.com",
+ "http://www.cnn.com/there_couldnt_possibly_be_a_file_with_this_name.html",
+
+ ]
+
+ @@invalid_urls = [
+ "sjj:// ? foo.bar.baz",
+ "elf://",
+ ":",
+ "bb? wow + b",
+ "::bananafish_hello.h",
+ "banana-fish ! yah? ",
+ "htt:",
+ ]
+
+
+ @@all_test_urls = [].concat(@@local_good_urls).
+ concat( @@local_broken_urls ).
+ concat( @@remote_broken_urls ).
+ concat( @@remote_good_urls ).
+ concat( @@invalid_urls )
+
+ def test_data
+ assert( @@all_test_urls.length > 0 )
+ assert_equal( @@all_test_urls.length, @@all_test_urls.uniq.length)
+ assert_equal( @@all_test_urls.length,
+ @@local_good_urls.length +
+ @@local_broken_urls.length +
+ @@remote_good_urls.length +
+ @@remote_broken_urls.length +
+ @@invalid_urls.length)
+ end
+
+ def test_detecting_localness_and_validity
+ @@invalid_urls.each{ | u |
+ iu = AnnotatedURL.new( u )
+ assert_equal( false, iu.valid?, "invalid url #{iu.uri} should be invalid" )
+ }
+
+ @@local_good_urls.each { | u |
+ au = AnnotatedURL.new( u )
+ assert( au.valid?, "local good url #{au.uri} should be valid" )
+ assert( au.is_local?, "local good url #{au.uri} should be local" )
+ }
+ @@local_broken_urls.each { | u |
+ au = AnnotatedURL.new( u )
+ assert( au.valid?, "local broken url #{au.uri} should be valid" )
+ assert( au.is_local? , "local broken url #{au.uri} should still be local" )
+ }
+ @@remote_good_urls.each { | u |
+ aru = AnnotatedURL.new( u )
+ assert( aru.valid?, "remote good url #{aru.uri} should be valid" )
+ assert_equal( false, aru.is_local?, "remote url #{aru.uri} should not be local" )
+ }
+ @@remote_broken_urls.each{ | u |
+ aru = AnnotatedURL.new( u )
+ assert( aru.valid?, "remote broken url #{u} should be valid" )
+ assert_equal( false, aru.is_local?, "remote url #{u} should not be local" )
+ }
+
+ rescue URI::InvalidURIError => boom
+ puts "invalid uri error \n\t\"#{boom}\""
+ end
+
+end
+
class TestLink < Test::Unit::TestCase
def test_creation
l = Link.new( "http://localhost:8080/trunk", "/docs/index.html" );
@@ -289,7 +411,7 @@
num_checked_so_far = LinkChecker.num_files_checked;
i = 0;
- while( next_file_to_check = LinkChecker.next_url_to_check and i < 220 )
+ while( next_file_to_check = LinkChecker.next_url_to_check and i < 10 )
assert_not_nil( next_file_to_check )
LinkChecker.check_next_file
num_checked_now = LinkChecker.num_files_checked
@@ -299,7 +421,7 @@
if ( i % 30 == 0 ) then puts "Checked #{i} files..." end
end
- puts "done. Checked #{i} files."
+ puts "\n\ntest_something_simple done. Checked #{i} files."
puts LinkChecker.generate_report
end
Added: sandbox/ben/ruby/spider.rb
Property changes on: sandbox/ben/ruby/spider.rb
___________________________________________________________________
Name: svn:executable
+ *
Name: svn:mime-type
+ text/plain
Name: svn:eol-style
+ native
_______________________________________________
Laszlo-checkins mailing list
[email protected]
http://www.openlaszlo.org/mailman/listinfo/laszlo-checkins