require 'net/http'
require 'test/unit'

require 'linkchecker'

# Crawls a site starting from a root URL, following href attributes found in
# each fetched page and sorting every link it processes into one of four
# buckets: good, broken, invalid or skipped.
#
# Link objects come from the project's linkchecker library. This class only
# relies on Link.new(source, target) and on the target responding to valid?,
# is_local?, url and uri (NOTE(review): exact semantics of those are defined
# in linkchecker, not here).
class Spider
  # Captures an href value up to the first '#', double quote or whitespace,
  # so fragments are stripped from the captured URL.
  HREF_REGEXP = %r{href=\"([^#"\s]*)}

  # A response counts as good when its status code is exactly 2xx or 3xx.
  GOOD_STATUS = /\A[23]\d\d\z/

  # Failures that mean "this link is broken", not "the spider is broken".
  FETCH_ERRORS = [
    SocketError, SystemCallError, Timeout::Error, EOFError, IOError,
    Net::ProtocolError, Net::HTTPBadResponse
  ].freeze

  attr_reader :root
  attr_reader :good_links, :broken_links, :invalid_links, :skipped_links
  attr_reader :visited_urls, :urls_to_visit

  # root_url                - URL the crawl starts from.
  # validate_external_links - stored but not yet acted upon (see the TODO in
  #                           check_links).
  # spider_external_pages   - stored but not yet acted upon (see the TODO in
  #                           check_links).
  # log_info                - when true, prints extra progress messages.
  #                           Defaults to false: the original flag was set on
  #                           the class object instead of the instance, so the
  #                           extra messages were never printed.
  def initialize(root_url, validate_external_links = false, spider_external_pages = false, log_info = false)
    @root = root_url
    @validate_external_links = validate_external_links
    @spider_external_pages = spider_external_pages
    @log_info = log_info

    @visited_urls = []
    @urls_to_visit = []
    @good_links = []
    @broken_links = []
    @invalid_links = []
    @skipped_links = []

    # target URI string => true (good) / false (broken), so a page that is
    # linked from many places is fetched and scanned only once.
    @outcomes = {}

    @urls_to_visit.push(Link.new(@root, @root))
  end

  # Processes queued links until the queue is empty or max_links_to_check
  # links have been classified as good or broken. Invalid and skipped links
  # are recorded but do not count against the limit.
  def check_links(max_links_to_check = 100)
    num_links_checked = 0

    while !@urls_to_visit.empty? && num_links_checked < max_links_to_check
      current_link = @urls_to_visit.pop
      puts "about to visit #{current_link.target}" if @log_info

      # If the url is invalid, don't bother checking it, just mark it as bad.
      unless current_link.target.valid?
        puts "skipping invalid link #{current_link.target.url}"
        @invalid_links.push(current_link)
        next
      end

      unless current_link.target.is_local?
        # TODO if the url is not local, only check it if we're supposed to validate external links
        # TODO if the url is not local, only crawl it if we're supposed to spider external pages
        puts "skpping non-local url #{current_link.target.url}"
        @skipped_links.push(current_link)
        next
      end

      record(current_link, outcome_for(current_link))
      num_links_checked += 1
    end
  end

  # Returns a one-line, human-readable summary of the crawl.
  def generate_report
    entirely_good ? "All is well." : "Something was broken."
  end

  # True when at least one link was visited and found good, and nothing was
  # found broken or invalid.
  def entirely_good
    @broken_links.empty? && !@visited_urls.empty? &&
      !@good_links.empty? && @invalid_links.empty?
  end

  private

  # Returns true/false for the link's target, fetching it only the first time
  # that target is seen; later links to the same target reuse the answer.
  def outcome_for(link)
    key = link.target.uri.to_s
    @outcomes[key] = visit(link) unless @outcomes.key?(key)
    @outcomes[key]
  end

  # Fetches the link's target, queues the links found in a good response and
  # returns whether the response was good.
  def visit(link)
    response = fetch(link)

    if response && response.code =~ GOOD_STATUS
      puts "got good response for #{link.target.uri}"
      # The body can be nil for some responses, hence to_s.
      enqueue_links(link, response.body.to_s)
      true
    else
      puts "got bad response for #{link.target.uri}"
      false
    end
  end

  # Performs the HTTP GET. Returns the response, or nil when the request
  # itself failed, so that one unreachable host cannot abort the whole crawl.
  def fetch(link)
    Net::HTTP.get_response(link.target.uri)
  rescue *FETCH_ERRORS => e
    puts "request failed for #{link.target.uri}: #{e.class}: #{e.message}"
    nil
  end

  # Queues every href found in body as a link originating from page_link.
  def enqueue_links(page_link, body)
    body.scan(HREF_REGEXP) do |captures|
      href = captures[0]
      # An empty capture is a fragment-only href ("#top"): same page, skip it.
      next if href.empty?

      @urls_to_visit.push(Link.new(page_link.target.uri, href))
      puts "found url to visit #{href}" if @log_info
    end
  end

  # Files the link under good or broken and notes its target as visited.
  def record(link, good)
    (good ? @good_links : @broken_links).push(link)
    @visited_urls.push(link.target)
  end
end

# Integration tests for Spider. They expect a web server on localhost:8080
# serving the documents named below.
class TestSpider < Test::Unit::TestCase
  SIMPLE_URLS = ["http://localhost:8080/trunk/docs/simple.html"].freeze
  INTERESTING_URLS = ["http://localhost:8080/trunk/docs/index.html"].freeze

  def test_simplest
    SIMPLE_URLS.each do |r|
      spider = Spider.new(r)
      spider.check_links(10)

      assert(spider.broken_links.length == 0, "should have zero broken links in simplest")
      assert(spider.visited_urls.length > 0, "should have visited at least one url")
      assert(spider.good_links.length > 0, "should have at least one good link")
      assert(spider.invalid_links.length == 0, "should have no invalid links")
      assert(spider.entirely_good, "simplest should be entirely good")
      assert_equal(spider.visited_urls.length, spider.good_links.length,
                   "all the links we visited should be good")
      assert(spider.visited_urls.length > 0, "we must have checked more than zero urls")
      assert_equal("All is well.", spider.generate_report)
    end
  end

  # Smoke test only: the crawl must complete without raising.
  def test_interesting
    INTERESTING_URLS.each do |r|
      spider = Spider.new(r)
      spider.check_links(10)
    end
  end
end

#####
# the main program
####