Java Mailing List Archive

http://www.ant-tasks.com/

Home » Ant Users List »

Re: Ant task walk html and find broken links

Dominique Devienne

2008-05-14

Replies:

Author LoginPost Reply
On Wed, May 14, 2008 at 4:00 PM, gregsmit <gregsmit@(protected):
> Does anyone know of an Ant task that I could use to walk through a website
> (that I built with ant) to confirm that there are no broken links? I found
> one really old project on sourceforge, but it looks pretty abandoned.

I wrote one a long time ago based on NekoHTML to do the HTML parsing,
because all the ones I could find were online only, and thus checked
public internet links only. I only made mine verify the link fragments
(#id) could be found in the link target (I was checking documentation
cross-references).

Unlike Canoo, it doesn't attempt to process JavaScript. Mine was
simple-minded and looked only at <a href>, <link href>, and <img
src>, and had filters to avoid checking links based on patterns (to
restrict checking to local relative links, for example, and skip http:
links).

This code is old, and hasn't been compiled or run in ages, but
apparently I unit tested it, so it might still be useful ;-) I'm happy to
share the code (although it uses a few utility classes, so it is not easy
to extract the relevant pieces).

That's assuming Canoo is not a good fit here. My stuff probably pales
in comparison, but I'm throwing it out there just in case it might be
useful.

--DD

/**
* Ant task that checks HTML pages for bad links.
* <p>
* Parses HTML with
* <a href="http://www.apache.org/~andyc/neko/doc/html/">NekoHTML</a>;
* <a href="http://jtidy.sourceforge.net/">JTidy</a> could presumably be
* used instead.
* <p>
* Current limitations:
* <ul>
*  <li>Cannot indicate the line/column of a bad link</li>
*  <li>Does not support rebasing of the document</li>
*  <li>Does not check URLs inside stylesheets</li>
*  <li>Possibly slow (not measured)</li>
* </ul>
*
* @version May 2004
*/
public class HtmlLinkChecker extends ConditionalAspect.AbstractTask { ... }


<?xml version="1.0"?>

<project name="HtmlLinkCheckerTest" default="tearDown"
     xmlns:bm="antlib:buildmagic">

<!-- Defines the "tmp" property as a scratch directory under ${basedir},
    named after this project, and creates that directory. -->
<target name="setUp">
  <property name="tmp" location="${basedir}/${ant.project.name}.tmp" />
  <mkdir dir="${tmp}" />
</target>

<!-- Deletes the scratch directory ${tmp}. This is the project's default
    target. -->
<target name="tearDown">
  <delete dir="${tmp}" />
</target>

<!-- Creates a few dummy HTML files, which by default have no bad links.
    Override one of the properties below before running this target to
    force a specific kind of bad link.

    The files written to disk always use fixed names and ids; only the
    references to them (href, src, url(), #fragment) go through the
    properties. Overriding a property therefore makes a reference dangle
    without changing what actually exists in ${tmp}. -->
<target name="setUpFiles" depends="setUp">
  <property name="google.link"  value="http://www.google.com" />
  <property name="logo.file"   value="logo.gif" />
  <property name="bullet.file"  value="bullet.gif" />
  <property name="style.file"   value="style.css" />
  <property name="book.file"   value="book.html" />
  <property name="chapter1.file" value="chapter1.html" />
  <property name="section1.id"  value="section1" />
  <property name="sectionA.id"  value="sectionA" />
  <property name="coucou.id"   value="coucou" />

  <echo file="${tmp}/logo.gif">I am a logo!</echo>
  <echo file="${tmp}/bullet.gif">I am a bullet!</echo>
  <echo file="${tmp}/style.css">
   p { color: #000000 }
   ul { list-style: url(${bullet.file}) }
  </echo>
  <echo file="${tmp}/book.html"><![CDATA[
   <html>
    <body>
      <a href="${google.link}">Search:</a>
      <p id="coucou">coucou</p>
      <a  href="${chapter1.file}">Chapter 1</a>
       <a href="${chapter1.file}#${section1.id}">Section 1</a>
       <a href="${chapter1.file}#section2">Section 2</a>
      <a href="chapter2.html">Chapter 2</a>
    </body>
   </html>
  ]]></echo>

  <echo file="${tmp}/chapter1.html"><![CDATA[
   <html>
    <head>
      <link href="${style.file}" rel="stylesheet">
    </head>
    <body>
      <h2 id="section1">Section #1</h2>
      <h2 id="section2">Section #2</h2>
      <a href="book.html#${coucou.id}">Book Index</a>
    </body>
   </html>
  ]]></echo>

  <echo file="${tmp}/chapter2.html"><![CDATA[
   <html>
    <head>
      <link href="${style.file}" rel="stylesheet">
    </head>
    <body>
      <img src="${logo.file}">
      See <a href="#${sectionA.id}">Section A</a>
      <h2 id="sectionA">Section A</h2>
      <h2 id="sectionB">Section B</h2>
      <a href="${book.file}">Book Index</a>
    </body>
   </html>
  ]]></echo>
</target>

<!-- Runs the link checker, in verbose mode and without any link
    patterns, over every *.html file generated in ${tmp}. -->
<target name="test-generic" depends="setUpFiles">
  <bm:checklinks verbose="true">
   <bm:fileset dir="${tmp}" includes="*.html" />
  </bm:checklinks>
</target>

<!-- Runs the link checker over the generated *.html files with a set of
    include/exclude link patterns. Each pattern carries an ifTrue
    attribute bound to a property ("+imgs", "-chap1", "-frag", "-http")
    that the unit tests set to "true".
    NOTE(review): presumably a pattern is only active when its ifTrue
    value expands to true; confirm against ConditionalAspect, which is
    not shown here. -->
<target name="test-patterns" depends="setUpFiles">
  <bm:checklinks verbose="false">
   <bm:fileset dir="${tmp}" includes="*.html" />

   <bm:linkpatterns>
    <bm:include regexp=".*/images/.*" ifTrue="${+imgs}" />

    <bm:exclude prefix="chapterOne.html" ifTrue="${-chap1}" />
    <bm:exclude regexp=".*#.*" ifTrue="${-frag}" />
    <bm:exclude prefix="http:" ifTrue="${-http}" />
   </bm:linkpatterns>
  </bm:checklinks>
</target>

</project>

public class HtmlLinkCheckerTest
        extends BuildFileTestCase {

  /**
  * Tests all the links are OK.
  * Note that it doesn't tell us if some links are not checked...
  * Note also that it requires an internet connection to go to Google.
  */
  public void testGoodLinks() {
    executeTarget("test-generic");
  }

  public void testBadExternalHttpLink() {
    setProperty("google.link", "http://zzz.google.com");
    expectSpecificBuildException("test-generic", "bad external http link",
                        "1 bad link(s)");
    assertBadLink("http://zzz.google.com");
  }

  public void testBadInternalFileLink() {
    setProperty("google.link", "book.html");
    setProperty("chapter1.file", "chapterOne.html");
    expectSpecificBuildException("test-generic", "bad internal file link",
                        "3 bad link(s)");
    assertBadLink("chapterOne.html");
    assertBadLink("chapterOne.html#section1");
    assertBadLink("chapterOne.html#section2");
  }

  public void testBadInternalFileFragment() {
    setProperty("google.link", "book.html");
    setProperty("section1.id", "sectionOne");
    expectSpecificBuildException("test-generic", "bad internal file frag",
                        "1 bad link(s)");
    assertBadLink("chapter1.html#sectionOne");
  }

  public void testBadSelfFragment() {
    setProperty("google.link", "book.html");
    setProperty("sectionA.id", "sectionABC");
    expectSpecificBuildException("test-generic", "bad self frag",
                        "1 bad link(s)");
    assertBadLink("#sectionABC");
  }

  public void testBadHeadLink() {
    setProperty("google.link", "book.html");
    setProperty("style.file", "stylesheet.CSS");
    expectSpecificBuildException("test-generic", "bad head link",
                        "1 bad link(s)");
    assertBadLink("stylesheet.CSS");
  }

  public void testBadUrlInCss() {
    setProperty("google.link", "book.html");
    setProperty("bullet.file", "square.gif");
    try {
       expectSpecificBuildException("test-generic", "bad url in css",
                          "1 bad link(s)");
       assertBadLink("square.gif");
    }
    catch (junit.framework.AssertionFailedError e) {
       // TODO: implement CSS link checks
    }
  }

  public void testBadImage() {
    setProperty("google.link", "book.html");
    setProperty("logo.file", "logo.jpg");
    expectSpecificBuildException("test-generic", "bad image",
                        "1 bad link(s)");
    assertBadLink("logo.jpg");

    //System.out.println(getLog());
    //System.out.println(getOutput());
    //System.out.println(getFullLog());
    //System.err.println(getError());
  }

  public void testIgnoreBadInternalFileLink() {
    setProperty("google.link", "book.html");
    setProperty("chapter1.file", "chapterOne.html");

    setProperty("-chap1", "true");

    executeTarget("test-patterns");
  }

  public void testIgnoreBadExternalHttpLink() {
    setProperty("-http", "true");
    setProperty("google.link", "http://zzz.google.com");
    executeTarget("test-patterns");
  }

  public void testIgnoreBadFragments() {
    setProperty("-frag", "true");
    setProperty("google.link", "book.html");
    setProperty("section1.id", "sectionOne");
    setProperty("sectionA.id", "sectionABC");
    executeTarget("test-patterns");
  }

  public void testCheckImagesOnly() {
    setProperty("+imgs", "true");
    setProperty("google.link", "book.html");

    // Creates a few broken links, to be ignored (since not checked)
    setProperty("section1.id", "sectionOne");
    setProperty("sectionA.id", "sectionABC");
    setProperty("chapter1.file", "chapterOne.html");

    executeTarget("test-patterns");
  }

  private void setProperty(String name, String value) {
    getProject().setNewProperty(name, value);
  }

  private void assertBadLink(String link) {
    assertTrue(getLog().indexOf(": " + link + ":") > -1);
  }

} // END class HtmlLinkCheckerTest

---------------------------------------------------------------------
To unsubscribe, e-mail: user-unsubscribe@(protected)
For additional commands, e-mail: user-help@(protected)

©2008 ant-tasks.com - Jax Systems, LLC, U.S.A.