Author: ben
Date: 2007-11-25 20:44:46 -0800 (Sun, 25 Nov 2007)
New Revision: 7362
Modified:
   sandbox/ben/ruby/linkchecker/lib/main.rb
   sandbox/ben/ruby/linkchecker/lib/spider.rb
   sandbox/ben/ruby/linkchecker/test/tc_spider.rb

Log:
Linkchecker handles timeouts gracefully.

Modified: sandbox/ben/ruby/linkchecker/lib/main.rb
===================================================================
--- sandbox/ben/ruby/linkchecker/lib/main.rb	2007-11-26 00:05:20 UTC (rev 7361)
+++ sandbox/ben/ruby/linkchecker/lib/main.rb	2007-11-26 04:44:46 UTC (rev 7362)
@@ -65,7 +65,8 @@
   :spider_external_pages => false,
   :root_url => "http://localhost:8080/trunk/docs/reference/ref.preface.html",
   :verbose => false,
-  :check_external_links => false
+  :check_external_links => true,
+  :skip_domains => []
 }
 
@@ -78,7 +79,8 @@
   ["-s", "--spider-external-pages", GetoptLong::NO_ARGUMENT],
   ["-u", "--root-url", GetoptLong::REQUIRED_ARGUMENT],
   ["-v", "--verbose", GetoptLong::NO_ARGUMENT],
-  ["-x", "--check-external-links", GetoptLong::NO_ARGUMENT]
+  ["-x", "--check-external-links", GetoptLong::NO_ARGUMENT],
+  ["-d", "--skip-domain", GetoptLong::REQUIRED_ARGUMENT]
 )
 
@@ -88,6 +90,8 @@
   break if not opt
 
   case opt
+  when "-d"
+    options[:skip_domains].push( arg )
   when "-h"
     puts usage_description
     exit
@@ -123,6 +127,9 @@
 spider = Spider.new( options[:root_url] )
 spider.ignore_filter = options[:ignore_filters]
 spider.log_info = options[:verbose]
+spider.validate_external_links = options[:check_external_links]
+spider.spider_external_pages = options[:spider_external_pages]
+spider.skip_domains = options[:skip_domains]
 spider.check_links( options[:max_links_to_check] )
 puts "Spidering done.\n"
 puts "\n\n"

Modified: sandbox/ben/ruby/linkchecker/lib/spider.rb
===================================================================
--- sandbox/ben/ruby/linkchecker/lib/spider.rb	2007-11-26 00:05:20 UTC (rev 7361)
+++ sandbox/ben/ruby/linkchecker/lib/spider.rb	2007-11-26 04:44:46 UTC (rev 7362)
@@ -6,6 +6,7 @@
 
   @validate_external_links
   @spider_external_pages
+  @skip_domains
 
   @log_info
   @report_progress
@@ -30,6 +31,10 @@
   attr_writer :ignore_filter
   attr_writer :log_info
 
+  attr_writer :validate_external_links, :spider_external_pages
+
+  attr_reader :skip_domains
+  attr_writer :skip_domains
 
   def initialize( root_url, validate_external_links = false, spider_external_pages = false, log_info = false, report_progress = true )
     @root = root_url
@@ -44,6 +49,7 @@
     @log_info = log_info
     @report_progress = report_progress
     @ignore_filter = Array.new
+    @skip_domains = Array.new
 
     @urls_to_visit.push( Link.new( @root, @root ) )
   end
@@ -75,15 +81,13 @@
         next
       end
 
-      if not current_link.target.is_local? then
-        # TODO if the url is not local, only check it if we're supposed to validate external links
-        # if the url is not local, only crawl it if we're supposed to spider external pages
+      if (not @validate_external_links) and (not current_link.target.is_local?) then
        puts "skipping non-local url #{current_link.target.url}" if @log_info
        @skipped_links.push current_link
        next
-      end
+      end
 
-      response = Net::HTTP.get_response( current_link.target.uri )
+      response = get_response( current_link.target.uri, 3, 2, true )
 
      if (response and (response.code =~ %r{[23]\d\d} ) ) then
        puts "got good response for #{current_link.target.uri}" if @log_info
@@ -94,13 +98,19 @@
        @skipped_links.push current_link
        next
      end
-
+
      @good_links.push( current_link )
-      # TODO also match src="foo.gif" and "Lz.swfEmbed({url: 'programs/LFC-$12.lzx"
+
+      if (not @spider_external_pages) and (not current_link.target.is_local?)
+ puts "skipping crawling the inside of external page: #{current_link.target.url}" if @log_info + next + end + + # TODO also match src="foo.gif" and "Lz.swfEmbed({url: 'programs/LFC-$12.lzx" href_regexp = %r{href=\"([^#"\s]*)} response.body.scan( href_regexp ) { | m | @urls_to_visit.push( Link.new( current_link.target.uri, m[0] ) ) - puts "found url to visit #{m[0]} from #{current_link.target.uri}" if @log_info + # puts "found url to visit #{m[0]} from #{current_link.target.uri}" if @log_info } else puts "got bad response for #{current_link.target.uri}" if @log_info @@ -108,12 +118,37 @@ end end end + + def get_response( uri, timeout_duration = 5, max_retries = 3, verbose = false) + retrycount = 0 + resp = nil + begin + timeout(timeout_duration) do + resp = Net::HTTP.get_response( uri ) + resp + end + rescue TimeoutError + if(retrycount < max_retries) + retrycount+=1 + puts "gonna retry #{retrycount}" if verbose + retry + else + puts("ERROR Timeout error in get_response(#{uri.to_s}), attempt #" + retrycount.to_s) if verbose + nil + end + end + resp + rescue Exception => exception + puts("ERROR in get_page for #{uri.to_s}") + nil + end + def unique_visited_urls visited_urls = @visited_urls.map { | au | au.uri.to_s } visited_urls.uniq end - + def generate_report if entirely_good then "All is well." @@ -149,6 +184,13 @@ report += "\n" else report += "Skipped links: none\n" + end + + if @good_links.length > 0 then + report += "Good links: ([EMAIL PROTECTED] total)\n" + @good_links.each { | l | report += "(+) " + l.target.url + "\n"} + else + report += "Good links: none\n" end report Modified: sandbox/ben/ruby/linkchecker/test/tc_spider.rb =================================================================== --- sandbox/ben/ruby/linkchecker/test/tc_spider.rb 2007-11-26 00:05:20 UTC (rev 7361) +++ sandbox/ben/ruby/linkchecker/test/tc_spider.rb 2007-11-26 04:44:46 UTC (rev 7362) @@ -85,5 +85,18 @@ assert_equal( 1, spider.skipped_links.length, "we should have skipped exactly one url.") end + def test_validate_external_links + # Create a spider set to validate, but not crawl, external links + spider = Spider.new( "http://localhost:8080/trunk/docs/reference/ref.preface.html", true, false) + spider.log_info = false + spider.check_links( 10 ) + + puts spider.generate_verbose_report + assert( spider.invalid_links.length == 0, "should have no invalid links" ) + + # We should only skip links once. + assert_equal( spider.skipped_links.uniq.length, spider.skipped_links.length, "Skipped links list should be unique.") + assert_equal( spider.unique_visited_urls.length, spider.visited_urls.length, "We visited too many urls.") + end end \ No newline at end of file _______________________________________________ Laszlo-checkins mailing list [email protected] http://www.openlaszlo.org/mailman/listinfo/laszlo-checkins
