Author: ben
Date: 2007-11-17 20:22:11 -0800 (Sat, 17 Nov 2007)
New Revision: 7311

Modified:
   sandbox/ben/ruby/linkchecker.rb
Log:
Started adding control over whether to spider external links.

Modified: sandbox/ben/ruby/linkchecker.rb
===================================================================
--- sandbox/ben/ruby/linkchecker.rb     2007-11-18 03:48:16 UTC (rev 7310)
+++ sandbox/ben/ruby/linkchecker.rb     2007-11-18 04:22:11 UTC (rev 7311)
@@ -7,6 +7,8 @@
 LASZLO_SERVER = "localhost"
 LASZLO_PORT = 8080
 CHECK_EXTERNAL_LINKS = true
+VERBOSE = true
+SPIDER_EXTERNAL_PAGES = false
 
 report_broken_links = true
 
@@ -47,13 +49,17 @@
     end
     @response.body.scan( href_regexp ) { | m |
       link = Link.new( @url, m[0]) 
-      urls.push( link.absolute_url )
+      if (link.target_is_local? or SPIDER_EXTERNAL_PAGES) then 
+        urls.push( link.absolute_url ) 
+      end
     }
     urls.uniq.sort
   end
   
   def is_local?
-    (@url.slice(0,16) == "http://localhost") or (@url.slice(0..4) != "http:") 
+    u = URI.parse(@url) 
+    u.host == localhost
+    # (@url.slice(0,16) == "http://localhost") or (@url.slice(0..4) != "http:") 
   end
       
 end
@@ -64,6 +70,16 @@
     @target = target
   end
   
+  def source_is_local? 
+    u = URI.parse(@source) 
+    u.host == "localhost"        
+  end 
+  
+  def target_is_local?
+    u = URI.parse(absolute_url) 
+    u.host == "localhost"    
+  end 
+  
   def absolute_url
     if @target =~ %r{http:} then
       return  @target
@@ -73,7 +89,6 @@
       # match up to the last slash
       source_uri = URI.parse( @source )
       # assume scheme is HTTP
-      
         
       relative_url = ""
       if source_uri.path =~  %r{(.*)/[\w+-_.]*.html} then
@@ -85,7 +100,6 @@
         relative_url += source_uri.path 
       end
       
-      
       monged_target = @target
       while monged_target =~ %r{^\.\./(\S*)} do
         monged_target = $1
@@ -126,7 +140,7 @@
     if p.is_ok? then
       @@urls_that_exist.push( url ) 
     else
-      @@missing_urls.push( url + "-error-" + p.response.code ) 
+      @@missing_urls.push( url + "[Error " + p.response.code + ": " + p.response.message + "]") 
     end
     filenames = p.find_urls 
     @@files_to_check.concat( filenames )
@@ -134,7 +148,7 @@
   
   def LinkChecker.check_next_file 
     url = @@files_to_check.pop
-    puts "check_next_file about to check #{url}"
+    puts "check_next_file about to check #{url}" if VERBOSE
     @@files_weve_checked.push( url ) 
     page = HTMLPage.new( url ) 
     if page.is_ok? then
@@ -145,7 +159,7 @@
       unique_new_urls = urls.uniq - @@files_to_check - @@files_weve_checked; 
       @@files_to_check.concat( unique_new_urls )
     else
-      @@missing_urls.push( url + "-error-" + page.response.code ) 
+      @@missing_urls.push( url + "     [Error " + page.response.code + ": " + page.response.message + "]" )
     end
     
     @@files_weve_checked.uniq!
@@ -166,10 +180,10 @@
   end
   
   def LinkChecker.generate_report 
-    "LinkChecker report****\n" + 
-    "\n\nUnique urls we checked:  #{@@files_weve_checked.length}\n(?) " + @@files_weve_checked.join("\n(?) ") +
-    "\n\nGood urls we checked:  #{@@urls_that_exist.length} \n(OK) " + @@urls_that_exist.join("\n(OK) ")  +  
-    "\n\nBroken links found: #{@@missing_urls.length}\n(-) " + @@missing_urls.join("\n(-) ")  
+    "LinkChecker report****\n" +     
+     # "\n\nUnique urls we checked:  #{@@files_weve_checked.length}\n(?) " + @@files_weve_checked.join("\n(?) ") +
+    #  "\n\nGood urls we checked:  #{@@urls_that_exist.length} \n(OK) " + @@urls_that_exist.join("\n(OK) ")  +  
+    "\n\nBroken links found: #{@@missing_urls.length}\n(x) " + @@missing_urls.join("\n(x) ")  
     
   end 
   
@@ -180,11 +194,13 @@
+    l = Link.new( "http://localhost:8080/trunk", "/docs/index.html" );
     assert_not_nil l 
+    assert_equal( "http://localhost:8080/trunk/docs/index.html", l.absolute_url )
+    assert( l.target_is_local? ) 
     l = Link.new( "foo", "bar")
     assert_not_nil l 
     l = Link.new( "http://www.cnn.com", "grannymouse" )
     assert_not_nil l
     assert_equal( "http://www.cnn.com/grannymouse", l.absolute_url) 
+    assert( ! l.target_is_local? )
   end
   
   def test_absolutify
@@ -273,13 +289,14 @@
     
     num_checked_so_far = LinkChecker.num_files_checked; 
     i = 0; 
-    while( next_file_to_check = LinkChecker.next_url_to_check  and i < 30 ) 
+    while( next_file_to_check = LinkChecker.next_url_to_check  and i < 220 ) 
       assert_not_nil( next_file_to_check ) 
       LinkChecker.check_next_file
       num_checked_now = LinkChecker.num_files_checked
       assert( num_checked_now >= num_checked_so_far ) 
       num_checked_so_far = num_checked_now       
       i += 1 
+      if ( i % 30 == 0 ) then puts "Checked #{i} files..." end
     end
 
     puts "done. Checked #{i} files."


_______________________________________________
Laszlo-checkins mailing list
[email protected]
http://www.openlaszlo.org/mailman/listinfo/laszlo-checkins

Reply via email to