require 'net/http' require 'uri' require 'test/unit' require 'rexml/document' # start at a particular server... LASZLO_SERVER = "localhost" LASZLO_PORT = 8080 VERBOSE = false report_broken_links = true class HTMLPage @url @response @skipped attr_reader :response attr_reader :skipped def initialize( url, verbose=false ) @url = url if !is_local? then @response = nil @skipped = true else puts "trying #{url}..." if verbose @response = Net::HTTP.get_response( URI.parse(url) ) @skipped = false puts "ok." if verbose end rescue SystemCallError puts "couldn't connect to load url #{url}." if verbose end def is_ok? (not @skipped and (@response and (@response.code =~ %r{[23]\d\d} ) ) ) end def print_result puts "result: #{is_ok?} for url #{@url} with response " end def find_urls href_regexp = %r{href=\"([^#"\s]*)} urls = Array.new() if not is_ok? then return urls end # find all hrefs in the source # TODO: also find source="", rel="" @response.body.scan( href_regexp ) { | m | link = Link.new( @url, m[0]) urls.push( link.absolute_url ) } urls.uniq.sort end def is_local? u = URI.parse(@url) u.host == "localhost" end end class Link attr_reader :source, :target def initialize( source, target ) @source = AnnotatedURL.new( source ) @target_string = target @target = AnnotatedURL.new( absolute_url ) end def source_is_local? @source.is_local? end def target_is_local? @target.is_local? end def absolute_url if @target_string =~ %r{http:} then return @target_string end if @source.url =~ %r{http:} then # the source url is absolute, and the target is not, so let's assemble a new absolute url # match up to the last slash relative_url = "" if @source.uri.path =~ %r{(.*)/[\w+-_.]*.html} then relative_url += $1 + "/" else if @source.uri.path !~ %r{^/} relative_url += "/" end relative_url += @source.uri.path end monged_target = @target_string while monged_target =~ %r{^\.\./(\S*)} do monged_target = $1 # puts "relative_url is #{relative_url}" # strip one directory off the end of the relative url so far if relative_url =~ %r{(\S*)/\S+$} relative_url = $1 else puts "TROUBLE didn't match relative_url #{relative_url}" end # puts "monged_target is now #{monged_target} and relative_url is now #{relative_url}" end if relative_url !~ %r{/$} and monged_target !~ %r{^/} then relative_url += "/" end relative_url = "http://" + @source.uri.host + ( @source.uri.port != 80 ? ":#{@source.uri.port}" : "") + relative_url return relative_url + monged_target end return "not_the_right_thing-#{@target_string}" end end class LinkChecker @@files_weve_checked = Array.new @@files_to_check = Array.new @@urls_that_exist = Array.new @@missing_urls = Array.new @@verify_outgoing_links = false @@spider_external_pages = false @@verbose = false def LinkChecker.check_one_file( url_base, filename ) url = url_base + "/" + filename @@files_weve_checked.push( url ) p = HTMLPage.new( url ) if p.is_ok? then @@urls_that_exist.push( url ) else @@missing_urls.push( url + "[Error " + p.response.code + ": " + p.response.message + "]") end filenames = p.find_urls @@files_to_check.concat( filenames ) end def LinkChecker.check_next_file url = @@files_to_check.pop puts "check_next_file about to check #{url}" if @@verbose @@files_weve_checked.push( url ) page = HTMLPage.new( url ) if page.is_ok? then @@urls_that_exist.push( url ) urls = page.find_urls # ignore any urls we've already checked, # and ignore any urls we're already planning to check unique_new_urls = urls.uniq - @@files_to_check - @@files_weve_checked; @@files_to_check.concat( unique_new_urls ) else @@missing_urls.push( url + ( page.skipped ? " [ skipped ] " : " [Error " + page.response.code + ": " + page.response.message + "]" ) ) end @@files_weve_checked.uniq! @@urls_that_exist.uniq! @@missing_urls.uniq! end def LinkChecker.num_files_checked @@files_weve_checked.length end def LinkChecker.num_files_to_check @@files_to_check.length end def LinkChecker.next_url_to_check @@files_to_check.pop end def LinkChecker.generate_report "LinkChecker report****\n" + # "\n\nUnique urls we checked: #{@@files_weve_checked.length}\n(?) " + @@files_weve_checked.join("\n(?) ") + # "\n\nGood urls we checked: #{@@urls_that_exist.length} \n(OK) " + @@urls_that_exist.join("\n(OK) ") + "\n\nBroken links found: #{@@missing_urls.length}\n(x) " + @@missing_urls.join("\n(x) ") end end class AnnotatedURL @url @uri @@total_created = 0 @bad attr_reader :uri, :url def initialize( url ) @url = url @uri = URI.parse( url ) @@total_created += 1 @bad = false rescue URI::InvalidURIError @bad = true end def valid? not @bad end def is_local? not @bad and ( @uri.host == "localhost" or @uri.host == "127.0.0.1" ) end end class TestAnnotatedURL < Test::Unit::TestCase @@local_good_urls = [ "http://localhost:8080/trunk", "http://localhost:8080/trunk/laszlo-explorer/", "http://localhost:8080/trunk/docs/guide/index.html", "http://localhost:8080/trunk/docs/deployers/", ] @@local_broken_urls = [ "http://localhost/foo", "http://localhost:8121/foo", "http://localhost:8080/bananabanana", "http://localhost:8080/bananabanana/foo.html", "http://127.0.0.1/bar/baz", ] @@remote_good_urls = [ "http://www.laszlosystems.com/" , "http://forum.openlaszlo.org", "http://www.cnn.com", "http://www.technorati.com" ] @@remote_broken_urls = [ "http://nononoidontexist.banana.com", "http://www.cnn.com/there_couldnt_possibly_be_a_file_with_this_name.html", ] @@invalid_urls = [ "sjj:// ? foo.bar.baz", "elf://", ":", "bb? wow + b", "::bananafish_hello.h", "banana-fish ! yah? ", "htt:", ] @@all_test_urls = [].concat(@@local_good_urls). concat( @@local_broken_urls ). concat( @@remote_broken_urls ). concat( @@remote_good_urls ). concat( @@invalid_urls ) def test_data assert( @@all_test_urls.length > 0 ) assert_equal( @@all_test_urls.length, @@all_test_urls.uniq.length) assert_equal( @@all_test_urls.length, @@local_good_urls.length + @@local_broken_urls.length + @@remote_good_urls.length + @@remote_broken_urls.length + @@invalid_urls.length) end def test_detecting_localness_and_validity @@invalid_urls.each{ | u | iu = AnnotatedURL.new( u ) assert_equal( false, iu.valid?, "invalid url #{iu.uri} should be invalid" ) } @@local_good_urls.each { | u | au = AnnotatedURL.new( u ) assert( au.valid?, "local good url #{au.uri} should be valid" ) assert( au.is_local?, "local good url #{au.uri} should be local" ) } @@local_broken_urls.each { | u | au = AnnotatedURL.new( u ) assert( au.valid?, "local broken url #{au.uri} should be valid" ) assert( au.is_local? , "local broken url #{au.uri} should still be local" ) } @@remote_good_urls.each { | u | aru = AnnotatedURL.new( u ) assert( aru.valid?, "remote good url #{aru.uri} should be valid" ) assert_equal( false, aru.is_local?, "remote url #{aru.uri} should not be local" ) } @@remote_broken_urls.each{ | u | aru = AnnotatedURL.new( u ) assert( aru.valid?, "remote broken url #{u} should be valid" ) assert_equal( false, aru.is_local?, "remote url #{u} should not be local" ) } rescue URI::InvalidURIError => boom puts "invalid uri error \n\t\"#{boom}\"" end end class TestLink < Test::Unit::TestCase def test_creation l = Link.new( "http://localhost:8080/trunk", "/docs/index.html" ); assert_not_nil l assert_equal( "http://localhost:8080/trunk/docs/index.html", l.absolute_url ) assert( l.target_is_local? ) l = Link.new( "foo", "bar") assert_not_nil l l = Link.new( "http://www.cnn.com", "grannymouse" ) assert_not_nil l assert_equal( "http://www.cnn.com/grannymouse", l.absolute_url) assert( ! l.target_is_local? ) end def test_absolutify links = [ [ "http://localhost:8080", "manager/status", "http://localhost:8080/manager/status" ], # this one is tricky. it breaks my code. [ "http://localhost:8080", "/manager/status", "http://localhost:8080/manager/status" ], [ "http://localhost:8080/tomcat-docs/changelog.html" , "faq", "http://localhost:8080/tomcat-docs/faq" ], [ "http://localhost:8080/tomcat-docs/changelog.html" , "http://www.cnn.com", "http://www.cnn.com" ], [ "http://localhost:8080/tomcat-docs/" , "http://www.google.com", "http://www.google.com" ], [ "http://localhost:8080/tomcat-docs/changelog.html" , "http://www.google.com", "http://www.google.com" ], [ "http://localhost:8080/trunk/docs/reference/ref.preface.html", "../developers/tutorials.html", "http://localhost:8080/trunk/docs/developers/tutorials.html" ], [ "http://localhost:8080/trunk/docs/reference/ref.preface.html", "lz", "http://localhost:8080/trunk/docs/reference/lz" ], [ "http://localhost:8080/trunk/docs/reference/ref.preface.html", "../../docs/includes/lzx-pretty-print.css", "http://localhost:8080/trunk/docs/includes/lzx-pretty-print.css" ], [ "http://localhost:8080/trunk/docs/reference/ref.preface.html", "peepers.html", "http://localhost:8080/trunk/docs/reference/peepers.html" ], ["http://localhost:8080/trunk/docs/reference/LZX.ref.html", "tag.splash-view.html", "http://localhost:8080/trunk/docs/reference/tag.splash-view.html"], ["http://localhost:8080/trunk/docs/reference/LZX.ref.html", "tag.splash+as2.html", "http://localhost:8080/trunk/docs/reference/tag.splash+as2.html"], ["http://localhost:8080/trunk/docs/reference/tag.splash+as2.html", "tag.splash-view.html", "http://localhost:8080/trunk/docs/reference/tag.splash-view.html"] ] links.each { | t | l = Link.new( t[0], t[1] ) # puts l.absolute_url trim_debugging_info = l.absolute_url.gsub(/(AAAA|BBBB|CCCC|DDDD|FFFF|GGGG|HHHH)/, '') assert_equal( t[2], trim_debugging_info, "link check failed for #{t[2]}. original answer was #{l.absolute_url}" ) # puts "link check ok for #{t[2]}" } end end class TestLinkChecker < Test::Unit::TestCase def test_developers_guide url = "http://localhost:8080/trunk/docs/developers/index.html" p = HTMLPage.new( url ) assert( p.is_ok?, "developers index" ) urls = p.find_urls # urls.each { | u | puts u } end def test_reference_guide urlr = "http://localhost:8080/trunk/docs/reference/ref.preface.html" p = HTMLPage.new( urlr ) assert( p.is_ok?, "reference guide preface" ) urls = p.find_urls # urls.each { | u | puts "ref.preface.html: #{u}" } end def test_complex_url url = "http://localhost:8080/trunk/docs/reference/tag.splash+as2.html" p = HTMLPage.new( url ) assert( p.is_ok?, "spash as2 page" ) end def test_implicit_page_name url = "http://localhost:8080/trunk/docs/" p = HTMLPage.new( url ) assert( p.is_ok?, "docs root" ) end def test_something_simple assert_equal( LinkChecker.num_files_to_check, 0 ) assert_equal( LinkChecker.num_files_checked, 0 ) LinkChecker.check_one_file( "http://localhost:8080/trunk/docs/reference" , "ref.preface.html" ) assert( LinkChecker.num_files_checked > 0 ) assert( LinkChecker.num_files_to_check > 0 ) num_checked_so_far = LinkChecker.num_files_checked; i = 0; while( next_file_to_check = LinkChecker.next_url_to_check and i < 10 ) assert_not_nil( next_file_to_check ) LinkChecker.check_next_file num_checked_now = LinkChecker.num_files_checked assert( num_checked_now >= num_checked_so_far ) num_checked_so_far = num_checked_now i += 1 if ( i % 30 == 0 ) then puts "Checked #{i} files..." end end puts "\n\ntest_something_simple done. Checked #{i} files." puts LinkChecker.generate_report end end