Author: ben
Date: 2007-11-18 23:11:01 -0800 (Sun, 18 Nov 2007)
New Revision: 7315

Added:
   sandbox/ben/ruby/spider.rb
Modified:
   sandbox/ben/ruby/linkchecker.rb
Log:
added an AnnotatedURL class to figure out whether a link is local or external, 
valid or invalid. Added a new class, Spider, as a refactor of LinkChecker.

Modified: sandbox/ben/ruby/linkchecker.rb
===================================================================
--- sandbox/ben/ruby/linkchecker.rb     2007-11-19 02:31:04 UTC (rev 7314)
+++ sandbox/ben/ruby/linkchecker.rb     2007-11-19 07:11:01 UTC (rev 7315)
@@ -6,35 +6,35 @@
 # start at a particular server...
 LASZLO_SERVER = "localhost"
 LASZLO_PORT = 8080
-CHECK_EXTERNAL_LINKS = true
-VERBOSE = true
-SPIDER_EXTERNAL_PAGES = false
+VERBOSE = false
 
 report_broken_links = true
 
 class HTMLPage
   @url
   @response
+  @skipped
   attr_reader :response
+  attr_reader :skipped
   
   def initialize( url, verbose=false )
     @url = url
-    if (!CHECK_EXTERNAL_LINKS) then
-      if !is_local? then
-        puts "skipping #{url}" if verbose
-        return
-      end
+
+    if !is_local? then
+      @response = nil
+      @skipped = true
+    else 
+      puts "trying #{url}..." if verbose
+      @response = Net::HTTP.get_response( URI.parse(url) ) 
+      @skipped = false
+      puts "ok." if verbose
     end
-
-    puts "trying #{url}..." if verbose
-    @response = Net::HTTP.get_response( URI.parse(url) ) 
-    puts "ok." if verbose
-  rescue SystemCallError
-    puts "couldn't connect to load url #{url}." if (verbose or report_broken_links)
+    rescue SystemCallError
+      puts "couldn't connect to load url #{url}." if verbose
   end 
   
   def is_ok?
-    (@response and (@response.code =~ %r{[23]\d\d} ) )
+    (not @skipped and (@response and (@response.code =~ %r{[23]\d\d} ) ) )
   end
   
   def print_result
@@ -47,60 +47,58 @@
     if not is_ok? then
       return urls
     end
+    # find all hrefs in the source
+    # TODO: also find source="", rel=""
     @response.body.scan( href_regexp ) { | m |
       link = Link.new( @url, m[0]) 
-      if (link.target_is_local? or SPIDER_EXTERNAL_PAGES) then 
-        urls.push( link.absolute_url ) 
-      end
+      urls.push( link.absolute_url ) 
     }
     urls.uniq.sort
   end
   
   def is_local?
     u = URI.parse(@url) 
-    u.host == localhost
-    # (@url.slice(0,16) == "http://localhost") or (@url.slice(0..4) != "http:") 
+    u.host == "localhost"
   end
       
 end
 
+
 class Link
+  attr_reader :source, :target 
   def initialize( source, target ) 
-    @source = source
-    @target = target
+    @source = AnnotatedURL.new( source )
+    @target_string = target
+    @target = AnnotatedURL.new( absolute_url ) 
   end
   
   def source_is_local? 
-    u = URI.parse(@source) 
-    u.host == "localhost"        
+    @source.is_local?
   end 
   
   def target_is_local?
-    u = URI.parse(absolute_url) 
-    u.host == "localhost"    
+    @target.is_local?
   end 
   
   def absolute_url
-    if @target =~ %r{http:} then
-      return  @target
+    if @target_string =~ %r{http:} then
+      return  @target_string
     end 
-    if @source =~ %r{http:} then
+    if @source.url =~ %r{http:} then
      # the source url is absolute, and the target is not, so let's assemble a new absolute url
       # match up to the last slash
-      source_uri = URI.parse( @source )
-      # assume scheme is HTTP
         
       relative_url = ""
-      if source_uri.path =~  %r{(.*)/[\w+-_.]*.html} then
+      if @source.uri.path =~  %r{(.*)/[\w+-_.]*.html} then
         relative_url +=  $1 + "/" 
       else 
-        if source_uri.path !~ %r{^/}
+        if @source.uri.path !~ %r{^/}
           relative_url +=  "/" 
         end
-        relative_url += source_uri.path 
+        relative_url += @source.uri.path
       end
       
-      monged_target = @target
+      monged_target = @target_string
       while monged_target =~ %r{^\.\./(\S*)} do
         monged_target = $1
         # puts "relative_url is #{relative_url}"
@@ -118,20 +116,25 @@
         relative_url += "/" 
       end  
 
-      relative_url = "http://" + source_uri.host +     
-        ( source_uri.port != 80 ? ":#{source_uri.port}" : "") + relative_url
+      relative_url = "http://" + @source.uri.host +     
+        ( @source.uri.port != 80 ? ":#{@source.uri.port}" : "") + relative_url
 
       return relative_url + monged_target
     end 
-      return "#{@source}#{@target}"
+      return "#{@source.url}#{@target_string}"
   end
 end 
 
+
 class LinkChecker
   @@files_weve_checked  = Array.new
   @@files_to_check = Array.new
   @@urls_that_exist = Array.new
   @@missing_urls = Array.new
+  
+  @@verify_outgoing_links = false
+  @@spider_external_pages = false
+  @@verbose = false
     
   def LinkChecker.check_one_file( url_base, filename ) 
     url = url_base + "/" + filename 
@@ -148,7 +151,7 @@
   
   def LinkChecker.check_next_file 
     url = @@files_to_check.pop
-    puts "check_next_file about to check #{url}" if VERBOSE
+    puts "check_next_file about to check #{url}" if @@verbose
     @@files_weve_checked.push( url ) 
     page = HTMLPage.new( url ) 
     if page.is_ok? then
@@ -158,8 +161,11 @@
       # and ignore any urls we're already planning to check
       unique_new_urls = urls.uniq - @@files_to_check - @@files_weve_checked; 
       @@files_to_check.concat( unique_new_urls )
-    else
-      @@missing_urls.push( url + "     [Error " + page.response.code + ": " + page.response.message + "]" )
+    else 
+      @@missing_urls.push( url + 
+      ( page.skipped ? 
+        "     [ skipped ] " : 
+        "     [Error " + page.response.code + ": " + page.response.message + "]" )  )
     end
     
     @@files_weve_checked.uniq!
@@ -189,6 +195,122 @@
   
 end 
 
+class AnnotatedURL 
+  @url 
+  @uri
+  @@total_created = 0 
+  @bad 
+  attr_reader :uri, :url
+  
+  def initialize( url ) 
+    @url = url
+    @uri = URI.parse( url )
+    @@total_created += 1
+    @bad = false
+  rescue URI::InvalidURIError
+    @bad = true
+  end 
+  
+  def valid?
+    not @bad 
+  end 
+  
+  def is_local?
+    not @bad and 
+    ( @uri.host == "localhost" or @uri.host == "127.0.0.1" )
+  end
+end
+
+class TestAnnotatedURL < Test::Unit::TestCase 
+  @@local_good_urls = [
+    "http://localhost:8080/trunk",
+    "http://localhost:8080/trunk/laszlo-explorer/", 
+    "http://localhost:8080/trunk/docs/guide/index.html", 
+    "http://localhost:8080/trunk/docs/deployers/", 
+  ]
+
+  @@local_broken_urls = [
+    "http://localhost/foo", 
+    "http://localhost:8121/foo", 
+    "http://localhost:8080/bananabanana", 
+    "http://localhost:8080/bananabanana/foo.html",     
+    "http://127.0.0.1/bar/baz",
+  ]
+
+  @@remote_good_urls = [
+    "http://www.laszlosystems.com/" , 
+    "http://forum.openlaszlo.org",
+    "http://www.cnn.com", 
+    "http://www.technorati.com"
+  ]
+  
+  @@remote_broken_urls = [
+    "http://nononoidontexist.banana.com",
+    "http://www.cnn.com/there_couldnt_possibly_be_a_file_with_this_name.html",
+
+  ]
+  
+  @@invalid_urls = [
+    "sjj://  ? foo.bar.baz",
+    "elf://",
+    ":", 
+    "bb? wow + b",      
+    "::bananafish_hello.h", 
+    "banana-fish ! yah? ",
+    "htt:",    
+  ]
+  
+  
+  @@all_test_urls = [].concat(@@local_good_urls).
+    concat( @@local_broken_urls ).
+    concat( @@remote_broken_urls ).
+    concat( @@remote_good_urls ).
+    concat( @@invalid_urls )
+
+  def test_data
+    assert( @@all_test_urls.length > 0 )
+    assert_equal( @@all_test_urls.length, @@all_test_urls.uniq.length) 
+    assert_equal( @@all_test_urls.length, 
+      @@local_good_urls.length + 
+      @@local_broken_urls.length + 
+      @@remote_good_urls.length + 
+      @@remote_broken_urls.length + 
+      @@invalid_urls.length)     
+  end 
+  
+  def test_detecting_localness_and_validity 
+    @@invalid_urls.each{ | u |
+      iu = AnnotatedURL.new( u ) 
+      assert_equal( false, iu.valid?, "invalid url #{iu.uri} should be invalid" ) 
+    }
+
+    @@local_good_urls.each { | u | 
+      au = AnnotatedURL.new( u ) 
+      assert( au.valid?, "local good url #{au.uri} should be valid" )
+      assert( au.is_local?, "local good url #{au.uri} should be local" )
+    }
+    @@local_broken_urls.each { | u | 
+      au = AnnotatedURL.new( u ) 
+      assert( au.valid?, "local broken url #{au.uri} should be valid" )
+      assert( au.is_local? , "local broken url #{au.uri} should still be local" )
+    }
+    @@remote_good_urls.each { | u | 
+      aru = AnnotatedURL.new( u )
+      assert( aru.valid?, "remote good url #{aru.uri} should be valid" )
+      assert_equal( false, aru.is_local?, "remote url #{aru.uri} should not be local" )
+    }
+    @@remote_broken_urls.each{ | u |
+      aru = AnnotatedURL.new( u ) 
+      assert( aru.valid?, "remote broken url #{u} should be valid" )      
+      assert_equal( false, aru.is_local?, "remote url #{u} should not be local" )
+    }
+    
+  rescue URI::InvalidURIError => boom
+    puts "invalid uri error \n\t\"#{boom}\""    
+  end 
+
+end 
+
 class TestLink < Test::Unit::TestCase
   def test_creation
     l = Link.new( "http://localhost:8080/trunk", "/docs/index.html" );
@@ -289,7 +411,7 @@
     
     num_checked_so_far = LinkChecker.num_files_checked; 
     i = 0; 
-    while( next_file_to_check = LinkChecker.next_url_to_check  and i < 220 ) 
+    while( next_file_to_check = LinkChecker.next_url_to_check  and i < 10 ) 
       assert_not_nil( next_file_to_check ) 
       LinkChecker.check_next_file
       num_checked_now = LinkChecker.num_files_checked
@@ -299,7 +421,7 @@
       if ( i % 30 == 0 ) then puts "Checked #{i} files..." end
     end
 
-    puts "done. Checked #{i} files."
+    puts "\n\ntest_something_simple done. Checked #{i} files."
     puts LinkChecker.generate_report
   end 
   

Added: sandbox/ben/ruby/spider.rb


Property changes on: sandbox/ben/ruby/spider.rb
___________________________________________________________________
Name: svn:executable
   + *
Name: svn:mime-type
   + text/plain
Name: svn:eol-style
   + native


_______________________________________________
Laszlo-checkins mailing list
[email protected]
http://www.openlaszlo.org/mailman/listinfo/laszlo-checkins

Reply via email to