On Wed, May 14, 2008 at 4:00 PM, gregsmit <[EMAIL PROTECTED]> wrote: > Does anyone know of an Ant task that I could use to walk through a website > (that I built with ant) to confirm that there are no broken links? I found > one really old project on sourceforge, but it looks pretty abandoned.
I wrote one a long time ago based on NekoHTML to do the HTML parsing, because all the ones I could find were online only, and thus checked public internet links only. I only made mine verify that the link fragments (#id) could be found in the link target (I was checking documentation cross-references). Unlike Canoo, it doesn't attempt to process JavaScript. Mine was simple-minded, looking only at <a href>, <link href>, and <img src>, and had filters to avoid checking links based on patterns (to restrict checking to local relative links for example, and skip http: links). This code is old, and hasn't been compiled or run in ages, but apparently I unit tested it, so it might still be useful ;-) I'm happy to share the code (although it uses a few utility classes, so it is not easy to extract the relevant pieces). That's assuming Canoo is not a good fit here. My stuff probably pales in comparison, but I'm throwing it out there just in case it might be useful. --DD /** * Checks an HTML page for bad links. * <p> * Uses <a href="http://www.apache.org/~andyc/neko/doc/html/">NekoHTML</a>, * but could also use <a href="http://jtidy.sourceforge.net/">JTidy</a> I guess. * <p> * Current limitations: * <ul> * <li>Cannot indicate line/column of the bad link</li> * <li>Does not support re-basing of document</li> * <li>Does not check URL in stylesheets</li> * <li>Slow!?</li> * </ul> * * @version May 2004 */ public class HtmlLinkChecker extends ConditionalAspect.AbstractTask { ... } <?xml version="1.0"?> <project name="HtmlLinkCheckerTest" default="tearDown" xmlns:bm="antlib:buildmagic"> <target name="setUp"> <property name="tmp" location="${basedir}/${ant.project.name}.tmp" /> <mkdir dir="${tmp}" /> </target> <target name="tearDown"> <delete dir="${tmp}" /> </target> <!-- Creates a few dummy HTML files, which by default have no bad links. Just override one of the properties to force some kind of bad link. 
--> <target name="setUpFiles" depends="setUp"> <property name="google.link" value="http://www.google.com" /> <property name="logo.file" value="logo.gif" /> <property name="bullet.file" value="bullet.gif" /> <property name="style.file" value="style.css" /> <property name="book.file" value="book.html" /> <property name="chapter1.file" value="chapter1.html" /> <property name="section1.id" value="section1" /> <property name="sectionA.id" value="sectionA" /> <property name="coucou.id" value="coucou" /> <echo file="${tmp}/logo.gif">I am a logo!</echo> <echo file="${tmp}/bullet.gif">I am a bullet!</echo> <echo file="${tmp}/style.css"> p { color: #000000 } ul { list-style: url(${bullet.file}) } </echo> <echo file="${tmp}/book.html"><![CDATA[ <html> <body> <a href="${google.link}">Search:</a> <p id="coucou">coucou</p> <a href="${chapter1.file}">Chapter 1</a> <a href="${chapter1.file}#${section1.id}">Section 1</a> <a href="${chapter1.file}#section2">Section 1</a> <a href="chapter2.html">Chapter 2</a> </body> </html> ]]></echo> <echo file="${tmp}/chapter1.html"><![CDATA[ <html> <head> <link href="${style.file}" rel="stylesheet"> </head> <body> <h2 id="section1">Section #1</h2> <h2 id="section2">Section #2</h2> <a href="book.html#${coucou.id}">Book Index</a> </body> </html> ]]></echo> <echo file="${tmp}/chapter2.html"><![CDATA[ <html> <head> <link href="${style.file}" rel="stylesheet"> </head> <body> <img src="${logo.file}"> See <a href="#${sectionA.id}">Section A</a> <h2 id="sectionA">Section A</h2> <h2 id="sectionB">Section B</h2> <a href="${book.file}">Book Index</a> </body> </html> ]]></echo> </target> <target name="test-generic" depends="setUpFiles"> <bm:checklinks verbose="true"> <bm:fileset dir="${tmp}" includes="*.html" /> </bm:checklinks> </target> <target name="test-patterns" depends="setUpFiles"> <bm:checklinks verbose="false"> <bm:fileset dir="${tmp}" includes="*.html" /> <bm:linkpatterns> <bm:include regexp=".*/images/.*" ifTrue="${+imgs}" /> <bm:exclude 
prefix="chapterOne.html" ifTrue="${-chap1}" /> <bm:exclude regexp=".*#.*" ifTrue="${-frag}" /> <bm:exclude prefix="http:" ifTrue="${-http}" /> </bm:linkpatterns> </bm:checklinks> </target> </project>

/**
 * Build-file driven tests for the <code>bm:checklinks</code> task.
 * <p>
 * Each test pre-sets one or more Ant properties and then runs either the
 * <code>test-generic</code> target (every link is checked) or the
 * <code>test-patterns</code> target (links are filtered by include/exclude
 * patterns that are switched on through the <code>+imgs</code>,
 * <code>-chap1</code>, <code>-frag</code> and <code>-http</code> properties).
 * The build file writes a small set of HTML fixtures whose links are derived
 * from those properties, so overriding a property is how a test injects a
 * broken link.
 */
public class HtmlLinkCheckerTest extends BuildFileTestCase {

    /**
     * Tests all the links are OK.
     * Note that it doesn't tell us if some links are not checked...
     * Note also that it requires an internet connection to go to Google.
     */
    public void testGoodLinks() {
        executeTarget("test-generic");
    }

    /** An unresolvable external http: link must be reported as the single bad link. */
    public void testBadExternalHttpLink() {
        setProperty("google.link", "http://zzz.google.com");
        expectSpecificBuildException("test-generic",
            "bad external http link", "1 bad link(s)");
        assertBadLink("http://zzz.google.com");
    }

    /**
     * Pointing the chapter 1 links at a file that is never written must flag
     * the plain link and both of its fragment links.
     */
    public void testBadInternalFileLink() {
        // Replace the Google link with a local one so no network is needed.
        setProperty("google.link", "book.html");
        setProperty("chapter1.file", "chapterOne.html");
        expectSpecificBuildException("test-generic",
            "bad internal file link", "3 bad link(s)");
        assertBadLink("chapterOne.html");
        assertBadLink("chapterOne.html#section1");
        assertBadLink("chapterOne.html#section2");
    }

    /** A fragment that does not exist in an existing target file must be reported. */
    public void testBadInternalFileFragment() {
        setProperty("google.link", "book.html");
        setProperty("section1.id", "sectionOne");
        expectSpecificBuildException("test-generic",
            "bad internal file frag", "1 bad link(s)");
        assertBadLink("chapter1.html#sectionOne");
    }

    /** A fragment-only link whose id is missing from its own page must be reported. */
    public void testBadSelfFragment() {
        setProperty("google.link", "book.html");
        setProperty("sectionA.id", "sectionABC");
        expectSpecificBuildException("test-generic",
            "bad self frag", "1 bad link(s)");
        assertBadLink("#sectionABC");
    }

    /** A broken <code>&lt;link href&gt;</code> in the document head must be reported. */
    public void testBadHeadLink() {
        setProperty("google.link", "book.html");
        setProperty("style.file", "stylesheet.CSS");
        expectSpecificBuildException("test-generic",
            "bad head link", "1 bad link(s)");
        assertBadLink("stylesheet.CSS");
    }

    /**
     * A broken <code>url(...)</code> inside the stylesheet should be reported,
     * but the checker does not look inside stylesheets yet, so a failure of
     * this expectation is tolerated for now.
     */
    public void testBadUrlInCss() {
        setProperty("google.link", "book.html");
        setProperty("bullet.file", "square.gif");
        try {
            expectSpecificBuildException("test-generic",
                "bad url in css", "1 bad link(s)");
            assertBadLink("square.gif");
        } catch (junit.framework.AssertionFailedError ignored) {
            // Deliberately swallowed: known limitation of the checker.
            // TODO: implement CSS link checks
        }
    }

    /** A broken <code>&lt;img src&gt;</code> must be reported. */
    public void testBadImage() {
        setProperty("google.link", "book.html");
        setProperty("logo.file", "logo.jpg");
        expectSpecificBuildException("test-generic",
            "bad image", "1 bad link(s)");
        assertBadLink("logo.jpg");
    }

    /** Links starting with the excluded file prefix must not fail the build. */
    public void testIgnoreBadInternalFileLink() {
        setProperty("google.link", "book.html");
        setProperty("chapter1.file", "chapterOne.html");
        setProperty("-chap1", "true");
        executeTarget("test-patterns");
    }

    /** Links starting with the excluded <code>http:</code> prefix must not fail the build. */
    public void testIgnoreBadExternalHttpLink() {
        setProperty("-http", "true");
        setProperty("google.link", "http://zzz.google.com");
        executeTarget("test-patterns");
    }

    /** Links containing a fragment are excluded, so broken fragments must not fail the build. */
    public void testIgnoreBadFragments() {
        setProperty("-frag", "true");
        setProperty("google.link", "book.html");
        setProperty("section1.id", "sectionOne");
        setProperty("sectionA.id", "sectionABC");
        executeTarget("test-patterns");
    }

    /** With only the images include pattern active, other broken links must not fail the build. */
    public void testCheckImagesOnly() {
        setProperty("+imgs", "true");
        setProperty("google.link", "book.html");
        // Creates a few broken links, to be ignored (since not checked)
        setProperty("section1.id", "sectionOne");
        setProperty("sectionA.id", "sectionABC");
        setProperty("chapter1.file", "chapterOne.html");
        executeTarget("test-patterns");
    }

    /**
     * Defines a project property before the target runs.
     *
     * @param name  the property name
     * @param value the property value
     */
    private void setProperty(String name, String value) {
        getProject().setNewProperty(name, value);
    }

    /**
     * Asserts that the build log reports the given link as bad, i.e. that it
     * contains the text <code>": " + link + ":"</code>.
     *
     * @param link the link text expected in the log
     */
    private void assertBadLink(String link) {
        final String log = getLog();
        // Include the link and the captured log so a failure is diagnosable.
        assertTrue("Expected bad link '" + link + "' to be reported in log: " + log,
            log.indexOf(": " + link + ":") > -1);
    }

} // END class HtmlLinkCheckerTest

--------------------------------------------------------------------- To unsubscribe, e-mail: [EMAIL PROTECTED] For additional commands, e-mail: [EMAIL PROTECTED]