huber 2002/06/30 09:36:11 Modified: src/java/org/apache/cocoon/generation LinkStatusGenerator.java Log: Added more javadoc comments, fixed generating attribute values of url, not of url built for requesting its links Revision Changes Path 1.2 +68 -78 xml-cocoon2/src/java/org/apache/cocoon/generation/LinkStatusGenerator.java Index: LinkStatusGenerator.java =================================================================== RCS file: /home/cvs/xml-cocoon2/src/java/org/apache/cocoon/generation/LinkStatusGenerator.java,v retrieving revision 1.1 retrieving revision 1.2 diff -u -r1.1 -r1.2 --- LinkStatusGenerator.java 14 Jun 2002 16:19:14 -0000 1.1 +++ LinkStatusGenerator.java 30 Jun 2002 16:36:11 -0000 1.2 @@ -35,7 +35,8 @@ * * @author Michael Homeijer * @author Nicola Ken Barozzi ([EMAIL PROTECTED]) -*/ + * @author Bernhard Huber ([EMAIL PROTECTED]) + */ public class LinkStatusGenerator extends ComposerGenerator implements Recyclable, Configurable { /** The URI of the namespace of this generator. */ @@ -128,10 +129,8 @@ public final static String USER_AGENT_CONFIG = "user-agent"; /** * Default value of <code>user-agent</code> configuration value. - * <p> - * Its value is @see org.apache.cocoon.Constants#COMPLETE_NAME. - * </p> * + * @see org.apache.cocoon.Constants#COMPLETE_NAME * @since */ public final static String USER_AGENT_DEFAULT = Constants.COMPLETE_NAME; @@ -203,10 +202,12 @@ * query-string appended to each crawling request. * </p> * <pre><tt> - * <include>.*\.html?</exclude> or <exclude>.*\.html?, .*\.xsp</exclude> + * <include>.*\.html?</include> or <include>.*\.html?, .*\.xsp</include> * <exclude>.*\.gif</exclude> or <exclude>.*\.gif, .*\.jpe?g</exclude> * <link-content-type> application/x-cocoon-links </link-content-type> * <link-view-query> ?cocoon-view=links </link-view-query> + * <user-agent> Cocoon </user-agent> + * <accept> text/xml </accept> * </tt></pre> * * @param configuration XML configuration of this avalon component. 
@@ -298,8 +299,9 @@ /* Create a reusable attributes for creating nodes */ this.attributes = new AttributesImpl(); - excludeCrawlingURL = new HashSet(); - this.setDefaultExcludeFromCrawling(); + // already done in configure... + //excludeCrawlingURL = new HashSet(); + //this.setDefaultExcludeFromCrawling(); } /** @@ -342,12 +344,12 @@ // remove it from the to-do list linksToProcess.remove(link); - URLConnection conn = processURL(url, link.getReferrer()); + String new_url_link = processURL(url, link.getReferrer()); // calc all links from this url - if (conn != null) { + if (new_url_link != null) { - List url_links = getLinksFromConnection(conn, url); + List url_links = getLinksFromConnection(new_url_link, url); if (url_links != null) { // add links of this url to the to-do list linksToProcess.addAll(url_links); @@ -401,9 +403,22 @@ } - protected List getLinksFromConnection(URLConnection conn, URL url) { + /** + * Retrieve a list of links of a url + * + * @param url_link_string url for requesting links, it is assumed that + * url_link_string queries the cocoon view links, ie of the form + * <code>http://host/foo/bar?cocoon-view=links</code> + * @param url_of_referrer base url of which links are requested, ie of the form + * <code>http://host/foo/bar</code> + * @return List of links from url_of_referrer, as result of requesting url + * url_link_string + */ + protected List getLinksFromConnection(String url_link_string, URL url_of_referrer) { List url_links = null; try { + URL url_link = new URL( url_link_string ); + URLConnection conn = url_link.openConnection(); String content_type = conn.getContentType(); if (getLogger().isDebugEnabled()) { @@ -419,7 +434,7 @@ // content is supposed to be a list of links, // relative to current URL String line; - String referrer = url.toString(); + String referrer = url_of_referrer.toString(); while ((line = br.readLine()) != null) { URL new_url = new URL(url, line); @@ -459,15 +474,23 @@ return url_links; } - protected URLConnection 
processURL(URL url, String referrer) throws SAXException { + /** + * Generate xml attributes of a url, calculate url for retrieving links + * + * @param url to process + * @param referrer of the url + * @return String url for retrieving links, or null if url is an excluded-url, + * and not an included-url. + */ + protected String processURL(URL url, String referrer) throws SAXException { if (getLogger().isDebugEnabled()) { getLogger().debug("getLinks URL " + url); } - URLConnection result = null; + String result = null; - // don't try to investigate url which has been crawled already + // don't try to investigate a url which has been crawled already if (crawled.contains(url.toString())) { return null; } @@ -481,71 +504,38 @@ attributes.addAttribute("", REFERRER_ATTR_NAME, REFERRER_ATTR_NAME, "CDATA", referrer); - // don't try to get links for url which is excluded from crawling - if (isExcludedURL(url.toString())) { - // Check for status and output it. - - try { - URLConnection links_url_connection = url.openConnection(); - HttpURLConnection h = (HttpURLConnection)links_url_connection; - String content_type = links_url_connection.getContentType(); - - attributes.addAttribute("", CONTENT_ATTR_NAME, - CONTENT_ATTR_NAME, "CDATA", - content_type); - - attributes.addAttribute("", MESSAGE_ATTR_NAME, - MESSAGE_ATTR_NAME, "CDATA", - h.getResponseMessage()); - - attributes.addAttribute("", STATUS_ATTR_NAME, - STATUS_ATTR_NAME, "CDATA", - String.valueOf(h.getResponseCode())); - - - - } - catch (IOException ioe) - { - attributes.addAttribute("", MESSAGE_ATTR_NAME, - MESSAGE_ATTR_NAME, "CDATA", - ioe.getMessage()); - } - - } else { - - // Output url, referrer, content-type, status, message for traversable url's - // add prefix and query to get data from the linkserializer. - try { - URL links_url = new URL(url.toExternalForm() - + ((url.toExternalForm().indexOf("?") == -1) ? "?" 
: "&") - + linkViewQuery); - URLConnection links_url_connection = links_url.openConnection(); - HttpURLConnection h = (HttpURLConnection)links_url_connection; - - result = links_url_connection; - - attributes.addAttribute("", CONTENT_ATTR_NAME, - CONTENT_ATTR_NAME, "CDATA", - links_url_connection.getContentType()); - - attributes.addAttribute("", MESSAGE_ATTR_NAME, - MESSAGE_ATTR_NAME, "CDATA", - h.getResponseMessage()); - - attributes.addAttribute("", STATUS_ATTR_NAME, - STATUS_ATTR_NAME, "CDATA", - String.valueOf(h.getResponseCode())); - } - catch(IOException ioe ) { - // Output url referrer status message - attributes.addAttribute("", MESSAGE_ATTR_NAME, + // Output url, referrer, content-type, status, message for traversable url's + try { + URLConnection links_url_connection = url.openConnection(); + HttpURLConnection h = (HttpURLConnection)links_url_connection; + String content_type = links_url_connection.getContentType(); + + attributes.addAttribute("", CONTENT_ATTR_NAME, + CONTENT_ATTR_NAME, "CDATA", + content_type); + + attributes.addAttribute("", MESSAGE_ATTR_NAME, + MESSAGE_ATTR_NAME, "CDATA", + h.getResponseMessage()); + + attributes.addAttribute("", STATUS_ATTR_NAME, + STATUS_ATTR_NAME, "CDATA", + String.valueOf(h.getResponseCode())); + } catch (IOException ioe) { + attributes.addAttribute("", MESSAGE_ATTR_NAME, MESSAGE_ATTR_NAME, "CDATA", ioe.getMessage()); - - - } } + + // don't try to get links of a url which is excluded from crawling + // try to get links of a url which is included for crawling + if (!isExcludedURL(url.toString()) && isIncludedURL( url.toString() )) { + // add prefix and query to get data from the linkserializer. + result = url.toExternalForm() + + ((url.toExternalForm().indexOf("?") == -1) ? "?" 
: "&") + + linkViewQuery; + } + super.contentHandler.startElement(URI, LINK_NODE_NAME, URI+':'+LINK_NODE_NAME, attributes); super.contentHandler.endElement(URI, LINK_NODE_NAME, URI+':'+LINK_NODE_NAME); @@ -623,7 +613,7 @@ super.recycle(); this.attributes = null; - this.excludeCrawlingURL = null; + //this.excludeCrawlingURL = null; } }
---------------------------------------------------------------------- In case of trouble, e-mail: [EMAIL PROTECTED] To unsubscribe, e-mail: [EMAIL PROTECTED] For additional commands, e-mail: [EMAIL PROTECTED]