Author: ben
Date: 2007-11-25 16:05:20 -0800 (Sun, 25 Nov 2007)
New Revision: 7361

Added:
   sandbox/ben/ruby/linkchecker/lib/annotated_url.rb
   sandbox/ben/ruby/linkchecker/lib/link.rb
   sandbox/ben/ruby/linkchecker/lib/main.rb
   sandbox/ben/ruby/linkchecker/test/tc_annotatedurl.rb
   sandbox/ben/ruby/linkchecker/test/tc_link.rb
   sandbox/ben/ruby/linkchecker/test/ts_spider.rb
Modified:
   sandbox/ben/ruby/linkchecker/lib/linkchecker.rb
   sandbox/ben/ruby/linkchecker/lib/spider.rb
   sandbox/ben/ruby/linkchecker/test/tc_linkchecker.rb
   sandbox/ben/ruby/linkchecker/test/tc_spider.rb
Log:
Made linkchecker configurable and documented usage options.

Added: sandbox/ben/ruby/linkchecker/lib/annotated_url.rb


Property changes on: sandbox/ben/ruby/linkchecker/lib/annotated_url.rb
___________________________________________________________________
Name: svn:executable
   + *
Name: svn:mime-type
   + text/plain
Name: svn:eol-style
   + native

Added: sandbox/ben/ruby/linkchecker/lib/link.rb


Property changes on: sandbox/ben/ruby/linkchecker/lib/link.rb
___________________________________________________________________
Name: svn:executable
   + *
Name: svn:mime-type
   + text/plain
Name: svn:eol-style
   + native

Modified: sandbox/ben/ruby/linkchecker/lib/linkchecker.rb
===================================================================
--- sandbox/ben/ruby/linkchecker/lib/linkchecker.rb     2007-11-25 23:08:21 UTC 
(rev 7360)
+++ sandbox/ben/ruby/linkchecker/lib/linkchecker.rb     2007-11-26 00:05:20 UTC 
(rev 7361)
@@ -1,7 +1,7 @@
 require 'net/http'
 require 'uri'
-require 'test/unit'
 require 'rexml/document'
+require 'link'
 
 # start at a particular server...
 LASZLO_SERVER = "localhost"
@@ -64,68 +64,7 @@
 end
 
 
-class Link
-  attr_reader :source, :target 
-  def initialize( source, target ) 
-    @source = AnnotatedURL.new( source )
-    @target_string = target
-    @target = AnnotatedURL.new( absolute_url ) 
-  end
-  
-  def source_is_local? 
-    @source.is_local?
-  end 
-  
-  def target_is_local?
-    @target.is_local?
-  end 
-  
-  def absolute_url
-    if @target_string =~ %r{http:} then
-      return  @target_string
-    end 
-    if @source.url =~ %r{http:} then
-      # the source url is absolute, and the target is not, so let's assemble a 
new absolute url
-      # match up to the last slash
-        
-      relative_url = ""
-      if @source.uri.path =~  %r{(.*)/[\w+-_.]*.html} then
-        relative_url +=  $1 + "/" 
-      else 
-        if @source.uri.path !~ %r{^/}
-          relative_url +=  "/" 
-        end
-        relative_url += @source.uri.path
-      end
-      
-      monged_target = @target_string
-      while monged_target =~ %r{^\.\./(\S*)} do
-        monged_target = $1
-        # puts "relative_url is #{relative_url}"
 
-        # strip one directory off the end of the relative url so far
-        if relative_url =~ %r{(\S*)/\S+$} 
-          relative_url = $1
-        else
-          puts "TROUBLE didn't match relative_url #{relative_url}"
-        end
-        # puts "monged_target is now #{monged_target} and relative_url is now 
#{relative_url}"        
-      end
-
-      if relative_url !~ %r{/$} and monged_target !~ %r{^/} then
-        relative_url += "/" 
-      end  
-
-      relative_url = "http://" + @source.uri.host +     
-        ( @source.uri.port != 80 ? ":#{@source.uri.port}" : "") + relative_url
-
-      return relative_url + monged_target
-    end 
-      return "[EMAIL PROTECTED]"
-  end
-end 
-
-
 class LinkChecker
   @@files_weve_checked  = Array.new
   @@files_to_check = Array.new
@@ -195,31 +134,6 @@
   
 end 
 
-class AnnotatedURL 
-  @url 
-  @uri
-  @@total_created = 0 
-  @bad 
-  attr_reader :uri, :url
-  
-  def initialize( url ) 
-    @url = url
-    @uri = URI.parse( url )
-    @@total_created += 1
-    @bad = false
-  rescue URI::InvalidURIError
-    @bad = true
-  end 
-  
-  def valid?
-    not @bad 
-  end 
-  
-  def is_local?
-    not @bad and 
-    ( @uri.host == "localhost" or @uri.host == "127.0.0.1" )
-  end
-end
 
 
 

Added: sandbox/ben/ruby/linkchecker/lib/main.rb


Property changes on: sandbox/ben/ruby/linkchecker/lib/main.rb
___________________________________________________________________
Name: svn:executable
   + *
Name: svn:mime-type
   + text/plain
Name: svn:eol-style
   + native

Modified: sandbox/ben/ruby/linkchecker/lib/spider.rb
===================================================================
--- sandbox/ben/ruby/linkchecker/lib/spider.rb  2007-11-25 23:08:21 UTC (rev 
7360)
+++ sandbox/ben/ruby/linkchecker/lib/spider.rb  2007-11-26 00:05:20 UTC (rev 
7361)
@@ -1,4 +1,4 @@
-require 'linkchecker'
+require 'link'
 
 class Spider
   @root
@@ -7,7 +7,8 @@
   @validate_external_links
   @spider_external_pages 
   
-  @log_info = true 
+  @log_info
+  @report_progress
   
   @visited_urls
   @urls_to_visit
@@ -17,10 +18,20 @@
   @invalid_links
   @skipped_links 
   
+  @num_links_checked
+  
+  # an array of regexps to ignore the response if it matches. This will help us
+  # recognize 404s and directory listings that don't give HTTP 404s.
+  @ignore_filter 
+  
   attr_reader :good_links, :broken_links, :invalid_links, :skipped_links
   attr_reader :visited_urls, :urls_to_visit
+  attr_reader :num_links_checked
+  
+  attr_writer :ignore_filter
+  attr_writer :log_info
     
-  def initialize( root_url, validate_external_links = false, 
spider_external_pages = false )
+  def initialize( root_url, validate_external_links = false, 
spider_external_pages = false, log_info = false, report_progress = true )
     @root = root_url
     @validate_external_links = validate_external_links
     @spider_external_pages = spider_external_pages
@@ -30,51 +41,79 @@
     @broken_links = Array.new
     @invalid_links = Array.new 
     @skipped_links = Array.new
+    @log_info = log_info
+    @report_progress = report_progress
+    @ignore_filter = Array.new
     
     @urls_to_visit.push( Link.new( @root, @root ) )
   end
   
   def check_links( max_links_to_check=100 )  
-    num_links_checked = 0 
+    @num_links_checked = 0 
     while ( @urls_to_visit.length > 0 and num_links_checked < 
max_links_to_check ) 
       current_link = @urls_to_visit.pop 
-      puts "about to visit #{current_link.target}" if @log_info 
       
+      # if we've already visited this url, don't bother doing it again
+      checked_already = visited_urls.find { | u | u.url == 
current_link.target.url }
+      if (checked_already) 
+        puts "skipping #{current_link.target.url} because we've been there 
before" if @log_info
+        next
+      end 
+      
+      @visited_urls.push( current_link.target )            
+      
+      @num_links_checked += 1
+      if (@report_progress and @num_links_checked % 10 == 0) then 
+        puts "[EMAIL PROTECTED]"
+      end 
+      puts "about to visit #{current_link.target.url} which is valid? 
#{current_link.target.valid?}" if @log_info 
+      
       # if the url is invalid, don't bother checking it, just mark it as bad. 
-      if not current_link.target.valid?
-        puts "skipping invalid link #{current_link.target.url}"
+      if ( not current_link.target.valid? ) then
+        puts "skipping invalid link #{current_link.target.url}" if @log_info
+        @invalid_links.push current_link
         next 
       end 
       
-      if not current_link.target.is_local?
+      if not current_link.target.is_local? then
         # TODO if the url is not local, only check it if we're supposed to 
validate external links         
         # if the url is not local, only crawl it if we're supposed to spider 
external pages 
-        puts "skpping non-local url #{current_link.target.url}"
+        puts "skipping non-local url #{current_link.target.url}" if @log_info
+        @skipped_links.push current_link 
         next
       end       
 
       response = Net::HTTP.get_response( current_link.target.uri ) 
 
       if (response and (response.code =~ %r{[23]\d\d} ) ) then
-        puts "got good response for #{current_link.target.uri}" 
+        puts "got good response for #{current_link.target.uri}" if @log_info
+        
+        # Check whether we're supposed to ignore this kind of page
+        if (@ignore_filter.find { | filter | response.body =~ filter } ) then
+          puts "skipping page which matches ignore_filter: 
#{current_link.target.url}" if @log_info
+          @skipped_links.push current_link           
+          next
+        end
+
         @good_links.push( current_link ) 
-        href_regexp = %r{href=\"([^#"\s]*)} 
+    # TODO also match src="foo.gif" and  "Lz.swfEmbed({url: 
'programs/LFC-$12.lzx"        
+        href_regexp = %r{href=\"([^#"\s]*)}    
         response.body.scan( href_regexp ) { | m |
           @urls_to_visit.push( Link.new( current_link.target.uri, m[0] ) )
-          puts "found url to visit #{m[0]}" if @log_info
+          puts "found url to visit #{m[0]} from #{current_link.target.uri}" if 
@log_info
         }        
       else
-          puts "got bad response for #{current_link.target.uri}" 
+          puts "got bad response for #{current_link.target.uri}" if @log_info
           @broken_links.push( current_link ) 
       end 
-
-      
-      @visited_urls.push( current_link.target )
-      num_links_checked += 1
-    end 
-    
+    end     
   end   
   
+  def unique_visited_urls 
+    visited_urls = @visited_urls.map { | au | au.uri.to_s }
+    visited_urls.uniq
+  end
+  
   def generate_report
     if entirely_good then 
       "All is well." 
@@ -82,15 +121,42 @@
       "Something was broken." 
     end 
   end
+  
+  def generate_verbose_report
+    report = "****Spidering Report for #{@root} ****\n"
     
+    report += "Checked #{@num_links_checked} links total.\n"
+
+    if @broken_links.length > 0 then 
+      report += "Broken links: (#{@broken_links.length} total)\n"
+      @broken_links.each { | l | report += "(x) " + l.target.url + " 
referenced in #{l.source.url} \n" }
+      report += "\n"
+    else 
+      report += "Broken links: none\n"
+    end    
+    
+    if @invalid_links.length > 0 then 
+      report += "Invalid links: (#{@invalid_links.length} total)\n"
+      @invalid_links.each { | l | report += "(!) " + l.target.url + " 
referenced in #{l.source.url} \n" }
+      report += "\n"
+    else 
+      report += "Invalid links: none\n"
+    end    
+    
+    if @skipped_links.length > 0 then 
+      report += "Skipped links: (#{@skipped_links.length} total)\n"
+      @skipped_links.each { | l | report += "(?) " + l.target.url + " 
referenced in #{l.source.url} \n" }
+      report += "\n"      
+    else 
+      report += "Skipped links: none\n"
+    end    
+        
+    report    
+  end 
+    
   def entirely_good
     @broken_links.length == 0 and @visited_urls.length > 0 and 
@good_links.length > 0 and @invalid_links.length == 0 
   end 
 
   
 end
-
-
-#####
-# the main program
-#### 
\ No newline at end of file

Added: sandbox/ben/ruby/linkchecker/test/tc_annotatedurl.rb


Property changes on: sandbox/ben/ruby/linkchecker/test/tc_annotatedurl.rb
___________________________________________________________________
Name: svn:executable
   + *
Name: svn:mime-type
   + text/plain
Name: svn:eol-style
   + native

Added: sandbox/ben/ruby/linkchecker/test/tc_link.rb


Property changes on: sandbox/ben/ruby/linkchecker/test/tc_link.rb
___________________________________________________________________
Name: svn:executable
   + *
Name: svn:mime-type
   + text/plain
Name: svn:eol-style
   + native

Modified: sandbox/ben/ruby/linkchecker/test/tc_linkchecker.rb
===================================================================
--- sandbox/ben/ruby/linkchecker/test/tc_linkchecker.rb 2007-11-25 23:08:21 UTC 
(rev 7360)
+++ sandbox/ben/ruby/linkchecker/test/tc_linkchecker.rb 2007-11-26 00:05:20 UTC 
(rev 7361)
@@ -1,151 +1,11 @@
 $:.unshift File.join(File.dirname(__FILE__), "..", "lib")
 require 'linkchecker'
+require 'test/unit'
 
-class TestAnnotatedURL < Test::Unit::TestCase 
-  @@local_good_urls = [
-    "http://localhost:8080/trunk",
-    "http://localhost:8080/trunk/laszlo-explorer/", 
-    "http://localhost:8080/trunk/docs/guide/index.html", 
-    "http://localhost:8080/trunk/docs/deployers/", 
-  ]
 
-  @@local_broken_urls = [
-    "http://localhost/foo", 
-    "http://localhost:8121/foo", 
-    "http://localhost:8080/bananabanana", 
-    "http://localhost:8080/bananabanana/foo.html",     
-    "http://127.0.0.1/bar/baz",
-  ]
 
-  @@remote_good_urls = [
-    "http://www.laszlosystems.com/" , 
-    "http://forum.openlaszlo.org",
-    "http://www.cnn.com", 
-    "http://www.technorati.com"
-  ]
-  
-  @@remote_broken_urls = [
-    "http://nononoidontexist.banana.com",
-    "http://www.cnn.com/there_couldnt_possibly_be_a_file_with_this_name.html",
 
-  ]
-  
-  @@invalid_urls = [
-    "sjj://  ? foo.bar.baz",
-    "elf://",
-    ":", 
-    "bb? wow + b",      
-    "::bananafish_hello.h", 
-    "banana-fish ! yah? ",
-    "htt:",    
-  ]
-  
-  
-  @@all_test_urls = [].concat(@@local_good_urls).
-    concat( @@local_broken_urls ).
-    concat( @@remote_broken_urls ).
-    concat( @@remote_good_urls ).
-    concat( @@invalid_urls )
 
-  def test_data
-    assert( @@all_test_urls.length > 0 )
-    assert_equal( @@all_test_urls.length, @@all_test_urls.uniq.length) 
-    assert_equal( @@all_test_urls.length, 
-      @@local_good_urls.length + 
-      @@local_broken_urls.length + 
-      @@remote_good_urls.length + 
-      @@remote_broken_urls.length + 
-      @@invalid_urls.length)     
-  end 
-  
-  def test_detecting_localness_and_validity 
-    @@invalid_urls.each{ | u |
-      iu = AnnotatedURL.new( u ) 
-      assert_equal( false, iu.valid?, "invalid url #{iu.uri} should be 
invalid" ) 
-    }
-
-    @@local_good_urls.each { | u | 
-      au = AnnotatedURL.new( u ) 
-      assert( au.valid?, "local good url #{au.uri} should be valid" )
-      assert( au.is_local?, "local good url #{au.uri} should be local" )
-    }
-    @@local_broken_urls.each { | u | 
-      au = AnnotatedURL.new( u ) 
-      assert( au.valid?, "local broken url #{au.uri} should be valid" )
-      assert( au.is_local? , "local broken url #{au.uri} should still be 
local" )
-    }
-    @@remote_good_urls.each { | u | 
-      aru = AnnotatedURL.new( u )
-      assert( aru.valid?, "remote good url #{aru.uri} should be valid" )
-      assert_equal( false, aru.is_local?, "remote url #{aru.uri} should not be 
local" )
-    }
-    @@remote_broken_urls.each{ | u |
-      aru = AnnotatedURL.new( u ) 
-      assert( aru.valid?, "remote broken url #{u} should be valid" )      
-      assert_equal( false, aru.is_local?, "remote url #{u} should not be 
local" )
-    }
-    
-  rescue URI::InvalidURIError => boom
-    puts "invalid uri error \n\t\"#{boom}\""    
-  end 
-
-end 
-
-class TestLink < Test::Unit::TestCase
-  def test_creation
-    l = Link.new( "http://localhost:8080/trunk", "/docs/index.html" );
-    assert_not_nil l 
-    assert_equal( "http://localhost:8080/trunk/docs/index.html", 
l.absolute_url )
-    assert( l.target_is_local? ) 
-    l = Link.new( "foo", "bar")
-    assert_not_nil l 
-    l = Link.new( "http://www.cnn.com", "grannymouse" )
-    assert_not_nil l
-    assert_equal( "http://www.cnn.com/grannymouse", l.absolute_url) 
-    assert( ! l.target_is_local? )
-  end
-  
-  def test_absolutify
-    links = [
-      [ "http://localhost:8080", "manager/status", 
"http://localhost:8080/manager/status" ],
-      # this one is tricky. it breaks my code. [ "http://localhost:8080", 
"/manager/status", "http://localhost:8080/manager/status" ],      
-      [ "http://localhost:8080/tomcat-docs/changelog.html" , "faq", 
"http://localhost:8080/tomcat-docs/faq" ],
-      [ "http://localhost:8080/tomcat-docs/changelog.html" , 
"http://www.cnn.com", "http://www.cnn.com" ], 
-      [ "http://localhost:8080/tomcat-docs/" , "http://www.google.com", 
"http://www.google.com" ],         
-      [ "http://localhost:8080/tomcat-docs/changelog.html" , 
"http://www.google.com", "http://www.google.com" ], 
-      [ "http://localhost:8080/trunk/docs/reference/ref.preface.html", 
-          "../developers/tutorials.html", 
-          "http://localhost:8080/trunk/docs/developers/tutorials.html" ], 
-      [ "http://localhost:8080/trunk/docs/reference/ref.preface.html", 
-          "lz", 
-          "http://localhost:8080/trunk/docs/reference/lz" ], 
-      [ "http://localhost:8080/trunk/docs/reference/ref.preface.html", 
-          "../../docs/includes/lzx-pretty-print.css", 
-          "http://localhost:8080/trunk/docs/includes/lzx-pretty-print.css" ],
-      [ "http://localhost:8080/trunk/docs/reference/ref.preface.html", 
-          "peepers.html", 
-          "http://localhost:8080/trunk/docs/reference/peepers.html" ], 
-        ["http://localhost:8080/trunk/docs/reference/LZX.ref.html", 
-          "tag.splash-view.html", 
-          "http://localhost:8080/trunk/docs/reference/tag.splash-view.html"], 
-        ["http://localhost:8080/trunk/docs/reference/LZX.ref.html", 
-          "tag.splash+as2.html", 
-          "http://localhost:8080/trunk/docs/reference/tag.splash+as2.html"], 
-        ["http://localhost:8080/trunk/docs/reference/tag.splash+as2.html", 
-          "tag.splash-view.html", 
-          "http://localhost:8080/trunk/docs/reference/tag.splash-view.html"]
-          
-    ]
-    links.each { | t | 
-      l = Link.new( t[0], t[1] )
-      # puts l.absolute_url
-      trim_debugging_info = 
l.absolute_url.gsub(/(AAAA|BBBB|CCCC|DDDD|FFFF|GGGG|HHHH)/, '')
-      assert_equal( t[2], trim_debugging_info, "link check failed for #{t[2]}. 
original answer was #{l.absolute_url}" )
-      # puts "link check ok for #{t[2]}"
-    }
-  end
-end
-
 class TestLinkChecker < Test::Unit::TestCase
   def test_developers_guide
     url = "http://localhost:8080/trunk/docs/developers/index.html"

Modified: sandbox/ben/ruby/linkchecker/test/tc_spider.rb
===================================================================
--- sandbox/ben/ruby/linkchecker/test/tc_spider.rb      2007-11-25 23:08:21 UTC 
(rev 7360)
+++ sandbox/ben/ruby/linkchecker/test/tc_spider.rb      2007-11-26 00:05:20 UTC 
(rev 7361)
@@ -5,7 +5,8 @@
 class TestSpider < Test::Unit::TestCase
 
   @@simple_urls = [
-    "http://localhost:8080/trunk/docs/simple.html" 
+    "http://localhost:8080/trunk/docs/simple.html", 
+    "http://localhost:8080/trunk/docs/relative_links.html" 
   ]
   
   @@interesting_urls = [
@@ -24,6 +25,10 @@
       assert( spider.entirely_good, "simplest should be entirely good" )
       assert_equal( spider.visited_urls.length, spider.good_links.length, "all 
the links we visited should be good" )
       assert( spider.visited_urls.length > 0 , "we must have checked more than 
zero urls" )
+      assert_equal( spider.skipped_links.uniq.length, 
spider.skipped_links.length, "Skipped links list should be unique.")      
+      assert_equal( spider.visited_urls.uniq.length, 
spider.visited_urls.length, "Visited links list should be unique.")            
+      assert_equal( spider.good_links.uniq.length, spider.good_links.length, 
"Good links list should be unique.")                  
+      assert_equal( spider.unique_visited_urls.length, 
spider.visited_urls.length, "Uniqueness.")
       assert_equal( "All is well.", spider.generate_report )
     }
   end 
@@ -32,7 +37,53 @@
     @@interesting_urls.each{ | r | 
       spider = Spider.new( r ) 
       spider.check_links( 10 )
+      assert( spider.broken_links.length == 0, "should have zero broken links 
in simplest" )
+      assert( spider.visited_urls.length > 0, "should have visited at least 
one url" ) 
+      assert( spider.good_links.length > 0, "should have at least one good 
link" )
+      assert( spider.invalid_links.length == 0, "should have no invalid links" 
)
+      assert( spider.entirely_good, "simplest should be entirely good" )
+      assert_equal( spider.visited_urls.length, spider.good_links.length, "all 
the links we visited should be good" )
+      assert( spider.visited_urls.length > 0 , "we must have checked more than 
zero urls" )
+      assert_equal( spider.unique_visited_urls.length, 
spider.visited_urls.length, "Uniqueness.")      
+      assert_equal( "All is well.", spider.generate_report )      
     }
   end 
   
+  def test_start_with_preface
+    spider = Spider.new( 
"http://localhost:8080/trunk/docs/reference/ref.preface.html")
+    spider.check_links( 100 ) 
+
+    puts spider.generate_verbose_report    
+    assert( spider.invalid_links.length == 0, "should have no invalid links" ) 
   
+    
+    # We should only skip links once.
+    assert_equal( spider.skipped_links.uniq.length, 
spider.skipped_links.length, "Skipped links list should be unique.")
+    assert_equal( spider.unique_visited_urls.length, 
spider.visited_urls.length, "We visited too many urls.")    
+  end 
+  
+  def test_relative_links 
+    spider = Spider.new( 
"http://localhost:8080/trunk/docs/relative_links.html")
+    spider.check_links( 10 ) 
+
+    puts spider.generate_verbose_report    
+    assert( spider.invalid_links.length == 0, "should have no invalid links" ) 
+    assert_equal( spider.unique_visited_urls.length, 
spider.visited_urls.length, "Uniqueness.")      
+    assert_equal( "All is well.", spider.generate_report )          
+  end 
+  
+  def test_wtf 
+     l = Link.new( 
URI.parse("http://localhost:8080/trunk/docs/relative_links.html"), 
"simple.html" ) 
+  end 
+  
+  def test_ignore_filter
+    ignore_filter = [%r{Directory Listing}]
+    spider = Spider.new( "http://localhost:8080/trunk/lps/components/" )
+    spider.log_info = true
+    spider.ignore_filter = ignore_filter
+    spider.check_links( 10 )
+    assert_equal( 1, spider.visited_urls.length, "we should only have visited 
one url, because we should skip the contents of the directory listing.")
+    assert_equal( 1, spider.skipped_links.length, "we should have skipped 
exactly one url.")
+  end 
+  
+  
 end
\ No newline at end of file

Added: sandbox/ben/ruby/linkchecker/test/ts_spider.rb


Property changes on: sandbox/ben/ruby/linkchecker/test/ts_spider.rb
___________________________________________________________________
Name: svn:executable
   + *
Name: svn:mime-type
   + text/plain
Name: svn:eol-style
   + native


_______________________________________________
Laszlo-checkins mailing list
[email protected]
http://www.openlaszlo.org/mailman/listinfo/laszlo-checkins

Reply via email to