Author: ben
Date: 2007-11-25 20:44:46 -0800 (Sun, 25 Nov 2007)
New Revision: 7362

Modified:
   sandbox/ben/ruby/linkchecker/lib/main.rb
   sandbox/ben/ruby/linkchecker/lib/spider.rb
   sandbox/ben/ruby/linkchecker/test/tc_spider.rb
Log:
Linkchecker handles timeouts gracefully.

Modified: sandbox/ben/ruby/linkchecker/lib/main.rb
===================================================================
--- sandbox/ben/ruby/linkchecker/lib/main.rb    2007-11-26 00:05:20 UTC (rev 
7361)
+++ sandbox/ben/ruby/linkchecker/lib/main.rb    2007-11-26 04:44:46 UTC (rev 
7362)
@@ -65,7 +65,8 @@
   :spider_external_pages => false,  
   :root_url => "http://localhost:8080/trunk/docs/reference/ref.preface.html",
   :verbose => false,
-  :check_external_links => false
+  :check_external_links => true,
+  :skip_domains => []  
 }
 
 
@@ -78,7 +79,8 @@
   ["-s", "--spider-external-pages", GetoptLong::NO_ARGUMENT],
   ["-u", "--root-url", GetoptLong::REQUIRED_ARGUMENT],  
   ["-v", "--verbose", GetoptLong::NO_ARGUMENT],
-  ["-x", "--check-external-links", GetoptLong::NO_ARGUMENT]
+  ["-x", "--check-external-links", GetoptLong::NO_ARGUMENT],
+  ["-d", "--skip-domain", GetoptLong::REQUIRED_ARGUMENT]
   )
 
 
@@ -88,6 +90,8 @@
     break if not opt
 
     case opt
+    when "-d"
+      options[:skip_domains].push( arg )
     when "-h" 
       puts usage_description
       exit
@@ -123,6 +127,9 @@
 spider = Spider.new( options[:root_url] ) 
 spider.ignore_filter = options[:ignore_filters]
 spider.log_info = options[:verbose]
+spider.validate_external_links = options[:check_external_links]
+spider.spider_external_pages = options[:spider_external_pages]
+spider.skip_domains = options[:skip_domains]
 spider.check_links( options[:max_links_to_check] )  
 puts "Spidering done.\n"
 puts "\n\n"

Modified: sandbox/ben/ruby/linkchecker/lib/spider.rb
===================================================================
--- sandbox/ben/ruby/linkchecker/lib/spider.rb  2007-11-26 00:05:20 UTC (rev 
7361)
+++ sandbox/ben/ruby/linkchecker/lib/spider.rb  2007-11-26 04:44:46 UTC (rev 
7362)
@@ -6,6 +6,7 @@
   
   @validate_external_links
   @spider_external_pages 
+  @skip_domains
   
   @log_info
   @report_progress
@@ -30,6 +31,10 @@
   
   attr_writer :ignore_filter
   attr_writer :log_info
+  attr_writer :validate_external_links, :spider_external_pages
+  
+  attr_reader :skip_domains
+  attr_writer :skip_domains
     
   def initialize( root_url, validate_external_links = false, 
spider_external_pages = false, log_info = false, report_progress = true )
     @root = root_url
@@ -44,6 +49,7 @@
     @log_info = log_info
     @report_progress = report_progress
     @ignore_filter = Array.new
+    @skip_domains = Array.new
     
     @urls_to_visit.push( Link.new( @root, @root ) )
   end
@@ -75,15 +81,13 @@
         next 
       end 
       
-      if not current_link.target.is_local? then
-        # TODO if the url is not local, only check it if we're supposed to 
validate external links         
-        # if the url is not local, only crawl it if we're supposed to spider 
external pages 
+      if (not @validate_external_links) and (not 
current_link.target.is_local?) then
         puts "skipping non-local url #{current_link.target.url}" if @log_info
         @skipped_links.push current_link 
         next
-      end       
+      end         
 
-      response = Net::HTTP.get_response( current_link.target.uri ) 
+      response = get_response( current_link.target.uri, 3, 2, true ) 
 
       if (response and (response.code =~ %r{[23]\d\d} ) ) then
         puts "got good response for #{current_link.target.uri}" if @log_info
@@ -94,13 +98,19 @@
           @skipped_links.push current_link           
           next
         end
-
+        
         @good_links.push( current_link ) 
-    # TODO also match src="foo.gif" and  "Lz.swfEmbed({url: 
'programs/LFC-$12.lzx"        
+        
+        if (not @spider_external_pages) and (not 
current_link.target.is_local?) 
+          puts "skipping crawling the inside of external page: 
#{current_link.target.url}" if @log_info
+          next
+        end 
+        
+       # TODO also match src="foo.gif" and  "Lz.swfEmbed({url: 
'programs/LFC-$12.lzx"        
         href_regexp = %r{href=\"([^#"\s]*)}    
         response.body.scan( href_regexp ) { | m |
           @urls_to_visit.push( Link.new( current_link.target.uri, m[0] ) )
-          puts "found url to visit #{m[0]} from #{current_link.target.uri}" if 
@log_info
+          # puts "found url to visit #{m[0]} from #{current_link.target.uri}" 
if @log_info
         }        
       else
           puts "got bad response for #{current_link.target.uri}" if @log_info
@@ -108,12 +118,37 @@
       end 
     end     
   end   
+    
+  def get_response( uri, timeout_duration = 5, max_retries = 3, verbose = 
false) 
+    retrycount = 0
+    resp = nil
+    begin
+      timeout(timeout_duration) do
+        resp = Net::HTTP.get_response( uri )
+        resp    
+      end
+    rescue TimeoutError      
+      if(retrycount < max_retries)
+        retrycount+=1
+        puts "gonna retry #{retrycount}" if verbose
+        retry
+      else
+        puts("ERROR Timeout error in get_response(#{uri.to_s}), attempt #" + 
retrycount.to_s) if verbose
+        nil
+      end
+    end
+    resp    
+  rescue Exception => exception
+    puts("ERROR in get_page for #{uri.to_s}")
+    nil
+  end 
+
   
   def unique_visited_urls 
     visited_urls = @visited_urls.map { | au | au.uri.to_s }
     visited_urls.uniq
   end
-  
+      
   def generate_report
     if entirely_good then 
       "All is well." 
@@ -149,6 +184,13 @@
       report += "\n"      
     else 
       report += "Skipped links: none\n"
+    end 
+    
+    if @good_links.length > 0 then
+      report += "Good links: (#{@good_links.length} total)\n"
+      @good_links.each { | l | report += "(+) " + l.target.url + "\n"}
+    else 
+      report += "Good links: none\n"        
     end    
         
     report    

Modified: sandbox/ben/ruby/linkchecker/test/tc_spider.rb
===================================================================
--- sandbox/ben/ruby/linkchecker/test/tc_spider.rb      2007-11-26 00:05:20 UTC 
(rev 7361)
+++ sandbox/ben/ruby/linkchecker/test/tc_spider.rb      2007-11-26 04:44:46 UTC 
(rev 7362)
@@ -85,5 +85,18 @@
     assert_equal( 1, spider.skipped_links.length, "we should have skipped 
exactly one url.")
   end 
   
+  def test_validate_external_links
+    # Create a spider set to validate, but not crawl, external links
+    spider = Spider.new( 
"http://localhost:8080/trunk/docs/reference/ref.preface.html", true, false)
+    spider.log_info = false
+    spider.check_links( 10 ) 
+
+    puts spider.generate_verbose_report    
+    assert( spider.invalid_links.length == 0, "should have no invalid links" ) 
   
+    
+    # We should only skip links once.
+    assert_equal( spider.skipped_links.uniq.length, 
spider.skipped_links.length, "Skipped links list should be unique.")
+    assert_equal( spider.unique_visited_urls.length, 
spider.visited_urls.length, "We visited too many urls.")    
+  end 
   
 end
\ No newline at end of file


_______________________________________________
Laszlo-checkins mailing list
[email protected]
http://www.openlaszlo.org/mailman/listinfo/laszlo-checkins

Reply via email to