Author: ben
Date: 2007-11-25 16:05:20 -0800 (Sun, 25 Nov 2007)
New Revision: 7361
Added:
sandbox/ben/ruby/linkchecker/lib/annotated_url.rb
sandbox/ben/ruby/linkchecker/lib/link.rb
sandbox/ben/ruby/linkchecker/lib/main.rb
sandbox/ben/ruby/linkchecker/test/tc_annotatedurl.rb
sandbox/ben/ruby/linkchecker/test/tc_link.rb
sandbox/ben/ruby/linkchecker/test/ts_spider.rb
Modified:
sandbox/ben/ruby/linkchecker/lib/linkchecker.rb
sandbox/ben/ruby/linkchecker/lib/spider.rb
sandbox/ben/ruby/linkchecker/test/tc_linkchecker.rb
sandbox/ben/ruby/linkchecker/test/tc_spider.rb
Log:
Made linkchecker configurable and documented usage options.
Added: sandbox/ben/ruby/linkchecker/lib/annotated_url.rb
Property changes on: sandbox/ben/ruby/linkchecker/lib/annotated_url.rb
___________________________________________________________________
Name: svn:executable
+ *
Name: svn:mime-type
+ text/plain
Name: svn:eol-style
+ native
Added: sandbox/ben/ruby/linkchecker/lib/link.rb
Property changes on: sandbox/ben/ruby/linkchecker/lib/link.rb
___________________________________________________________________
Name: svn:executable
+ *
Name: svn:mime-type
+ text/plain
Name: svn:eol-style
+ native
Modified: sandbox/ben/ruby/linkchecker/lib/linkchecker.rb
===================================================================
--- sandbox/ben/ruby/linkchecker/lib/linkchecker.rb 2007-11-25 23:08:21 UTC
(rev 7360)
+++ sandbox/ben/ruby/linkchecker/lib/linkchecker.rb 2007-11-26 00:05:20 UTC
(rev 7361)
@@ -1,7 +1,7 @@
require 'net/http'
require 'uri'
-require 'test/unit'
require 'rexml/document'
+require 'link'
# start at a particular server...
LASZLO_SERVER = "localhost"
@@ -64,68 +64,7 @@
end
-class Link
- attr_reader :source, :target
- def initialize( source, target )
- @source = AnnotatedURL.new( source )
- @target_string = target
- @target = AnnotatedURL.new( absolute_url )
- end
-
- def source_is_local?
- @source.is_local?
- end
-
- def target_is_local?
- @target.is_local?
- end
-
- def absolute_url
- if @target_string =~ %r{http:} then
- return @target_string
- end
- if @source.url =~ %r{http:} then
- # the source url is absolute, and the target is not, so let's assemble a
new absolute url
- # match up to the last slash
-
- relative_url = ""
- if @source.uri.path =~ %r{(.*)/[\w+-_.]*.html} then
- relative_url += $1 + "/"
- else
- if @source.uri.path !~ %r{^/}
- relative_url += "/"
- end
- relative_url += @source.uri.path
- end
-
- monged_target = @target_string
- while monged_target =~ %r{^\.\./(\S*)} do
- monged_target = $1
- # puts "relative_url is #{relative_url}"
- # strip one directory off the end of the relative url so far
- if relative_url =~ %r{(\S*)/\S+$}
- relative_url = $1
- else
- puts "TROUBLE didn't match relative_url #{relative_url}"
- end
- # puts "monged_target is now #{monged_target} and relative_url is now
#{relative_url}"
- end
-
- if relative_url !~ %r{/$} and monged_target !~ %r{^/} then
- relative_url += "/"
- end
-
- relative_url = "http://" + @source.uri.host +
- ( @source.uri.port != 80 ? ":#{@source.uri.port}" : "") + relative_url
-
- return relative_url + monged_target
- end
- return "[EMAIL PROTECTED]"
- end
-end
-
-
class LinkChecker
@@files_weve_checked = Array.new
@@files_to_check = Array.new
@@ -195,31 +134,6 @@
end
-class AnnotatedURL
- @url
- @uri
- @@total_created = 0
- @bad
- attr_reader :uri, :url
-
- def initialize( url )
- @url = url
- @uri = URI.parse( url )
- @@total_created += 1
- @bad = false
- rescue URI::InvalidURIError
- @bad = true
- end
-
- def valid?
- not @bad
- end
-
- def is_local?
- not @bad and
- ( @uri.host == "localhost" or @uri.host == "127.0.0.1" )
- end
-end
Added: sandbox/ben/ruby/linkchecker/lib/main.rb
Property changes on: sandbox/ben/ruby/linkchecker/lib/main.rb
___________________________________________________________________
Name: svn:executable
+ *
Name: svn:mime-type
+ text/plain
Name: svn:eol-style
+ native
Modified: sandbox/ben/ruby/linkchecker/lib/spider.rb
===================================================================
--- sandbox/ben/ruby/linkchecker/lib/spider.rb 2007-11-25 23:08:21 UTC (rev
7360)
+++ sandbox/ben/ruby/linkchecker/lib/spider.rb 2007-11-26 00:05:20 UTC (rev
7361)
@@ -1,4 +1,4 @@
-require 'linkchecker'
+require 'link'
class Spider
@root
@@ -7,7 +7,8 @@
@validate_external_links
@spider_external_pages
- @log_info = true
+ @log_info
+ @report_progress
@visited_urls
@urls_to_visit
@@ -17,10 +18,20 @@
@invalid_links
@skipped_links
+ @num_links_checked
+
+ # an array of regexps to ignore the response if it matches. This will help us
+ # recognize 404s and directory listings that don't give HTTP 404s.
+ @ignore_filter
+
attr_reader :good_links, :broken_links, :invalid_links, :skipped_links
attr_reader :visited_urls, :urls_to_visit
+ attr_reader :num_links_checked
+
+ attr_writer :ignore_filter
+ attr_writer :log_info
- def initialize( root_url, validate_external_links = false,
spider_external_pages = false )
+ def initialize( root_url, validate_external_links = false,
spider_external_pages = false, log_info = false, report_progress = true )
@root = root_url
@validate_external_links = validate_external_links
@spider_external_pages = spider_external_pages
@@ -30,51 +41,79 @@
@broken_links = Array.new
@invalid_links = Array.new
@skipped_links = Array.new
+ @log_info = log_info
+ @report_progress = report_progress
+ @ignore_filter = Array.new
@urls_to_visit.push( Link.new( @root, @root ) )
end
def check_links( max_links_to_check=100 )
- num_links_checked = 0
+ @num_links_checked = 0
while ( @urls_to_visit.length > 0 and num_links_checked <
max_links_to_check )
current_link = @urls_to_visit.pop
- puts "about to visit #{current_link.target}" if @log_info
+ # if we've already visited this url, don't bother doing it again
+ checked_already = visited_urls.find { | u | u.url ==
current_link.target.url }
+ if (checked_already)
+ puts "skipping #{current_link.target.url} because we've been there
before" if @log_info
+ next
+ end
+
+ @visited_urls.push( current_link.target )
+
+ @num_links_checked += 1
+ if (@report_progress and @num_links_checked % 10 == 0) then
+ puts "[EMAIL PROTECTED]"
+ end
+ puts "about to visit #{current_link.target.url} which is valid?
#{current_link.target.valid?}" if @log_info
+
# if the url is invalid, don't bother checking it, just mark it as bad.
- if not current_link.target.valid?
- puts "skipping invalid link #{current_link.target.url}"
+ if ( not current_link.target.valid? ) then
+ puts "skipping invalid link #{current_link.target.url}" if @log_info
+ @invalid_links.push current_link
next
end
- if not current_link.target.is_local?
+ if not current_link.target.is_local? then
# TODO if the url is not local, only check it if we're supposed to
validate external links
# if the url is not local, only crawl it if we're supposed to spider
external pages
- puts "skpping non-local url #{current_link.target.url}"
+ puts "skipping non-local url #{current_link.target.url}" if @log_info
+ @skipped_links.push current_link
next
end
response = Net::HTTP.get_response( current_link.target.uri )
if (response and (response.code =~ %r{[23]\d\d} ) ) then
- puts "got good response for #{current_link.target.uri}"
+ puts "got good response for #{current_link.target.uri}" if @log_info
+
+ # Check whether we're supposed to ignore this kind of page
+ if (@ignore_filter.find { | filter | response.body =~ filter } ) then
+ puts "skipping page which matches ignore_filter:
#{current_link.target.url}" if @log_info
+ @skipped_links.push current_link
+ next
+ end
+
@good_links.push( current_link )
- href_regexp = %r{href=\"([^#"\s]*)}
+ # TODO also match src="foo.gif" and "Lz.swfEmbed({url:
'programs/LFC-$12.lzx"
+ href_regexp = %r{href=\"([^#"\s]*)}
response.body.scan( href_regexp ) { | m |
@urls_to_visit.push( Link.new( current_link.target.uri, m[0] ) )
- puts "found url to visit #{m[0]}" if @log_info
+ puts "found url to visit #{m[0]} from #{current_link.target.uri}" if
@log_info
}
else
- puts "got bad response for #{current_link.target.uri}"
+ puts "got bad response for #{current_link.target.uri}" if @log_info
@broken_links.push( current_link )
end
-
-
- @visited_urls.push( current_link.target )
- num_links_checked += 1
- end
-
+ end
end
+ def unique_visited_urls
+ visited_urls = @visited_urls.map { | au | au.uri.to_s }
+ visited_urls.uniq
+ end
+
def generate_report
if entirely_good then
"All is well."
@@ -82,15 +121,42 @@
"Something was broken."
end
end
+
+ def generate_verbose_report
+ report = "****Spidering Report for #{@root} ****\n"
+ report += "Checked #{@num_links_checked} links total.\n"
+
+ if @broken_links.length > 0 then
+ report += "Broken links: (#{@broken_links.length} total)\n"
+ @broken_links.each { | l | report += "(x) " + l.target.url + "
referenced in #{l.source.url} \n" }
+ report += "\n"
+ else
+ report += "Broken links: none\n"
+ end
+
+ if @invalid_links.length > 0 then
+ report += "Invalid links: (#{@invalid_links.length} total)\n"
+ @invalid_links.each { | l | report += "(!) " + l.target.url + "
referenced in #{l.source.url} \n" }
+ report += "\n"
+ else
+ report += "Invalid links: none\n"
+ end
+
+ if @skipped_links.length > 0 then
+ report += "Skipped links: (#{@skipped_links.length} total)\n"
+ @skipped_links.each { | l | report += "(?) " + l.target.url + "
referenced in #{l.source.url} \n" }
+ report += "\n"
+ else
+ report += "Skipped links: none\n"
+ end
+
+ report
+ end
+
def entirely_good
@broken_links.length == 0 and @visited_urls.length > 0 and
@good_links.length > 0 and @invalid_links.length == 0
end
end
-
-
-#####
-# the main program
-####
\ No newline at end of file
Added: sandbox/ben/ruby/linkchecker/test/tc_annotatedurl.rb
Property changes on: sandbox/ben/ruby/linkchecker/test/tc_annotatedurl.rb
___________________________________________________________________
Name: svn:executable
+ *
Name: svn:mime-type
+ text/plain
Name: svn:eol-style
+ native
Added: sandbox/ben/ruby/linkchecker/test/tc_link.rb
Property changes on: sandbox/ben/ruby/linkchecker/test/tc_link.rb
___________________________________________________________________
Name: svn:executable
+ *
Name: svn:mime-type
+ text/plain
Name: svn:eol-style
+ native
Modified: sandbox/ben/ruby/linkchecker/test/tc_linkchecker.rb
===================================================================
--- sandbox/ben/ruby/linkchecker/test/tc_linkchecker.rb 2007-11-25 23:08:21 UTC
(rev 7360)
+++ sandbox/ben/ruby/linkchecker/test/tc_linkchecker.rb 2007-11-26 00:05:20 UTC
(rev 7361)
@@ -1,151 +1,11 @@
$:.unshift File.join(File.dirname(__FILE__), "..", "lib")
require 'linkchecker'
+require 'test/unit'
-class TestAnnotatedURL < Test::Unit::TestCase
- @@local_good_urls = [
- "http://localhost:8080/trunk",
- "http://localhost:8080/trunk/laszlo-explorer/",
- "http://localhost:8080/trunk/docs/guide/index.html",
- "http://localhost:8080/trunk/docs/deployers/",
- ]
- @@local_broken_urls = [
- "http://localhost/foo",
- "http://localhost:8121/foo",
- "http://localhost:8080/bananabanana",
- "http://localhost:8080/bananabanana/foo.html",
- "http://127.0.0.1/bar/baz",
- ]
- @@remote_good_urls = [
- "http://www.laszlosystems.com/" ,
- "http://forum.openlaszlo.org",
- "http://www.cnn.com",
- "http://www.technorati.com"
- ]
-
- @@remote_broken_urls = [
- "http://nononoidontexist.banana.com",
- "http://www.cnn.com/there_couldnt_possibly_be_a_file_with_this_name.html",
- ]
-
- @@invalid_urls = [
- "sjj:// ? foo.bar.baz",
- "elf://",
- ":",
- "bb? wow + b",
- "::bananafish_hello.h",
- "banana-fish ! yah? ",
- "htt:",
- ]
-
-
- @@all_test_urls = [].concat(@@local_good_urls).
- concat( @@local_broken_urls ).
- concat( @@remote_broken_urls ).
- concat( @@remote_good_urls ).
- concat( @@invalid_urls )
- def test_data
- assert( @@all_test_urls.length > 0 )
- assert_equal( @@all_test_urls.length, @@all_test_urls.uniq.length)
- assert_equal( @@all_test_urls.length,
- @@local_good_urls.length +
- @@local_broken_urls.length +
- @@remote_good_urls.length +
- @@remote_broken_urls.length +
- @@invalid_urls.length)
- end
-
- def test_detecting_localness_and_validity
- @@invalid_urls.each{ | u |
- iu = AnnotatedURL.new( u )
- assert_equal( false, iu.valid?, "invalid url #{iu.uri} should be
invalid" )
- }
-
- @@local_good_urls.each { | u |
- au = AnnotatedURL.new( u )
- assert( au.valid?, "local good url #{au.uri} should be valid" )
- assert( au.is_local?, "local good url #{au.uri} should be local" )
- }
- @@local_broken_urls.each { | u |
- au = AnnotatedURL.new( u )
- assert( au.valid?, "local broken url #{au.uri} should be valid" )
- assert( au.is_local? , "local broken url #{au.uri} should still be
local" )
- }
- @@remote_good_urls.each { | u |
- aru = AnnotatedURL.new( u )
- assert( aru.valid?, "remote good url #{aru.uri} should be valid" )
- assert_equal( false, aru.is_local?, "remote url #{aru.uri} should not be
local" )
- }
- @@remote_broken_urls.each{ | u |
- aru = AnnotatedURL.new( u )
- assert( aru.valid?, "remote broken url #{u} should be valid" )
- assert_equal( false, aru.is_local?, "remote url #{u} should not be
local" )
- }
-
- rescue URI::InvalidURIError => boom
- puts "invalid uri error \n\t\"#{boom}\""
- end
-
-end
-
-class TestLink < Test::Unit::TestCase
- def test_creation
- l = Link.new( "http://localhost:8080/trunk", "/docs/index.html" );
- assert_not_nil l
- assert_equal( "http://localhost:8080/trunk/docs/index.html",
l.absolute_url )
- assert( l.target_is_local? )
- l = Link.new( "foo", "bar")
- assert_not_nil l
- l = Link.new( "http://www.cnn.com", "grannymouse" )
- assert_not_nil l
- assert_equal( "http://www.cnn.com/grannymouse", l.absolute_url)
- assert( ! l.target_is_local? )
- end
-
- def test_absolutify
- links = [
- [ "http://localhost:8080", "manager/status",
"http://localhost:8080/manager/status" ],
- # this one is tricky. it breaks my code. [ "http://localhost:8080",
"/manager/status", "http://localhost:8080/manager/status" ],
- [ "http://localhost:8080/tomcat-docs/changelog.html" , "faq",
"http://localhost:8080/tomcat-docs/faq" ],
- [ "http://localhost:8080/tomcat-docs/changelog.html" ,
"http://www.cnn.com", "http://www.cnn.com" ],
- [ "http://localhost:8080/tomcat-docs/" , "http://www.google.com",
"http://www.google.com" ],
- [ "http://localhost:8080/tomcat-docs/changelog.html" ,
"http://www.google.com", "http://www.google.com" ],
- [ "http://localhost:8080/trunk/docs/reference/ref.preface.html",
- "../developers/tutorials.html",
- "http://localhost:8080/trunk/docs/developers/tutorials.html" ],
- [ "http://localhost:8080/trunk/docs/reference/ref.preface.html",
- "lz",
- "http://localhost:8080/trunk/docs/reference/lz" ],
- [ "http://localhost:8080/trunk/docs/reference/ref.preface.html",
- "../../docs/includes/lzx-pretty-print.css",
- "http://localhost:8080/trunk/docs/includes/lzx-pretty-print.css" ],
- [ "http://localhost:8080/trunk/docs/reference/ref.preface.html",
- "peepers.html",
- "http://localhost:8080/trunk/docs/reference/peepers.html" ],
- ["http://localhost:8080/trunk/docs/reference/LZX.ref.html",
- "tag.splash-view.html",
- "http://localhost:8080/trunk/docs/reference/tag.splash-view.html"],
- ["http://localhost:8080/trunk/docs/reference/LZX.ref.html",
- "tag.splash+as2.html",
- "http://localhost:8080/trunk/docs/reference/tag.splash+as2.html"],
- ["http://localhost:8080/trunk/docs/reference/tag.splash+as2.html",
- "tag.splash-view.html",
- "http://localhost:8080/trunk/docs/reference/tag.splash-view.html"]
-
- ]
- links.each { | t |
- l = Link.new( t[0], t[1] )
- # puts l.absolute_url
- trim_debugging_info =
l.absolute_url.gsub(/(AAAA|BBBB|CCCC|DDDD|FFFF|GGGG|HHHH)/, '')
- assert_equal( t[2], trim_debugging_info, "link check failed for #{t[2]}.
original answer was #{l.absolute_url}" )
- # puts "link check ok for #{t[2]}"
- }
- end
-end
-
class TestLinkChecker < Test::Unit::TestCase
def test_developers_guide
url = "http://localhost:8080/trunk/docs/developers/index.html"
Modified: sandbox/ben/ruby/linkchecker/test/tc_spider.rb
===================================================================
--- sandbox/ben/ruby/linkchecker/test/tc_spider.rb 2007-11-25 23:08:21 UTC
(rev 7360)
+++ sandbox/ben/ruby/linkchecker/test/tc_spider.rb 2007-11-26 00:05:20 UTC
(rev 7361)
@@ -5,7 +5,8 @@
class TestSpider < Test::Unit::TestCase
@@simple_urls = [
- "http://localhost:8080/trunk/docs/simple.html"
+ "http://localhost:8080/trunk/docs/simple.html",
+ "http://localhost:8080/trunk/docs/relative_links.html"
]
@@interesting_urls = [
@@ -24,6 +25,10 @@
assert( spider.entirely_good, "simplest should be entirely good" )
assert_equal( spider.visited_urls.length, spider.good_links.length, "all
the links we visited should be good" )
assert( spider.visited_urls.length > 0 , "we must have checked more than
zero urls" )
+ assert_equal( spider.skipped_links.uniq.length,
spider.skipped_links.length, "Skipped links list should be unique.")
+ assert_equal( spider.visited_urls.uniq.length,
spider.visited_urls.length, "Visited links list should be unique.")
+ assert_equal( spider.good_links.uniq.length, spider.good_links.length,
"Good links list should be unique.")
+ assert_equal( spider.unique_visited_urls.length,
spider.visited_urls.length, "Uniqueness.")
assert_equal( "All is well.", spider.generate_report )
}
end
@@ -32,7 +37,53 @@
@@interesting_urls.each{ | r |
spider = Spider.new( r )
spider.check_links( 10 )
+ assert( spider.broken_links.length == 0, "should have zero broken links
in simplest" )
+ assert( spider.visited_urls.length > 0, "should have visited at least
one url" )
+ assert( spider.good_links.length > 0, "should have at least one good
link" )
+ assert( spider.invalid_links.length == 0, "should have no invalid links"
)
+ assert( spider.entirely_good, "simplest should be entirely good" )
+ assert_equal( spider.visited_urls.length, spider.good_links.length, "all
the links we visited should be good" )
+ assert( spider.visited_urls.length > 0 , "we must have checked more than
zero urls" )
+ assert_equal( spider.unique_visited_urls.length,
spider.visited_urls.length, "Uniqueness.")
+ assert_equal( "All is well.", spider.generate_report )
}
end
+ def test_start_with_preface
+ spider = Spider.new(
"http://localhost:8080/trunk/docs/reference/ref.preface.html")
+ spider.check_links( 100 )
+
+ puts spider.generate_verbose_report
+ assert( spider.invalid_links.length == 0, "should have no invalid links" )
+
+ # We should only skip links once.
+ assert_equal( spider.skipped_links.uniq.length,
spider.skipped_links.length, "Skipped links list should be unique.")
+ assert_equal( spider.unique_visited_urls.length,
spider.visited_urls.length, "We visited too many urls.")
+ end
+
+ def test_relative_links
+ spider = Spider.new(
"http://localhost:8080/trunk/docs/relative_links.html")
+ spider.check_links( 10 )
+
+ puts spider.generate_verbose_report
+ assert( spider.invalid_links.length == 0, "should have no invalid links" )
+ assert_equal( spider.unique_visited_urls.length,
spider.visited_urls.length, "Uniqueness.")
+ assert_equal( "All is well.", spider.generate_report )
+ end
+
+ def test_wtf
+ l = Link.new(
URI.parse("http://localhost:8080/trunk/docs/relative_links.html"),
"simple.html" )
+ end
+
+ def test_ignore_filter
+ ignore_filter = [%r{Directory Listing}]
+ spider = Spider.new( "http://localhost:8080/trunk/lps/components/" )
+ spider.log_info = true
+ spider.ignore_filter = ignore_filter
+ spider.check_links( 10 )
+ assert_equal( 1, spider.visited_urls.length, "we should only have visited
one url, because we should skip the contents of the directory listing.")
+ assert_equal( 1, spider.skipped_links.length, "we should have skipped
exactly one url.")
+ end
+
+
end
\ No newline at end of file
Added: sandbox/ben/ruby/linkchecker/test/ts_spider.rb
Property changes on: sandbox/ben/ruby/linkchecker/test/ts_spider.rb
___________________________________________________________________
Name: svn:executable
+ *
Name: svn:mime-type
+ text/plain
Name: svn:eol-style
+ native
_______________________________________________
Laszlo-checkins mailing list
[email protected]
http://www.openlaszlo.org/mailman/listinfo/laszlo-checkins