huber 2002/08/04 11:33:51 Modified: src/java/org/apache/cocoon/generation LinkStatusGenerator.java Log: Explicitly close BufferedReader and HttpURLConnection Revision Changes Path 1.5 +170 -101 xml-cocoon2/src/java/org/apache/cocoon/generation/LinkStatusGenerator.java Index: LinkStatusGenerator.java =================================================================== RCS file: /home/cvs/xml-cocoon2/src/java/org/apache/cocoon/generation/LinkStatusGenerator.java,v retrieving revision 1.4 retrieving revision 1.5 diff -u -r1.4 -r1.5 --- LinkStatusGenerator.java 2 Aug 2002 07:06:21 -0000 1.4 +++ LinkStatusGenerator.java 4 Aug 2002 18:33:51 -0000 1.5 @@ -1,3 +1,53 @@ +/* + + ============================================================================ + The Apache Software License, Version 1.1 + ============================================================================ + + Copyright (C) 1999-2002 The Apache Software Foundation. All rights reserved. + + Redistribution and use in source and binary forms, with or without modifica- + tion, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + + 3. The end-user documentation included with the redistribution, if any, must + include the following acknowledgment: "This product includes software + developed by the Apache Software Foundation (http://www.apache.org/)." + Alternately, this acknowledgment may appear in the software itself, if + and wherever such third-party acknowledgments normally appear. + + 4. The names "Apache Cocoon" and "Apache Software Foundation" must not be + used to endorse or promote products derived from this software without + prior written permission. 
For written permission, please contact + [EMAIL PROTECTED] + + 5. Products derived from this software may not be called "Apache", nor may + "Apache" appear in their name, without prior written permission of the + Apache Software Foundation. + + THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED WARRANTIES, + INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + APACHE SOFTWARE FOUNDATION OR ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLU- + DING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON + ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + This software consists of voluntary contributions made by many individuals + on behalf of the Apache Software Foundation and was originally created by + Stefano Mazzocchi <[EMAIL PROTECTED]>. For more information on the Apache + Software Foundation, please see <http://www.apache.org/>. + + */ package org.apache.cocoon.generation; import org.apache.avalon.excalibur.pool.Recyclable; @@ -40,23 +90,23 @@ public class LinkStatusGenerator extends ComposerGenerator implements Recyclable, Configurable { /** The URI of the namespace of this generator. */ protected static final String URI = - "http://apache.org/cocoon/linkstatus/2.0"; - + "http://apache.org/cocoon/linkstatus/2.0"; + /** The namespace prefix for this namespace. 
*/ protected static final String PREFIX = "linkstatus"; - + /* Node and attribute names */ protected static final String TOP_NODE_NAME = "linkstatus"; protected static final String LINK_NODE_NAME = "link"; - + protected static final String HREF_ATTR_NAME = "href"; protected static final String REFERRER_ATTR_NAME = "referrer"; protected static final String CONTENT_ATTR_NAME = "content"; protected static final String STATUS_ATTR_NAME = "status"; - protected static final String MESSAGE_ATTR_NAME = "message"; - + protected static final String MESSAGE_ATTR_NAME = "message"; + protected AttributesImpl attributes = new AttributesImpl(); - + /** * Config element name specifying expected link content-typ. * <p> @@ -66,7 +116,7 @@ * @since */ public final static String LINK_CONTENT_TYPE_CONFIG = "link-content-type"; - + /** * Default value of <code>link-content-type</code> configuration value. * <p> @@ -96,7 +146,7 @@ * @since */ public final static String LINK_VIEW_QUERY_DEFAULT = "cocoon-view=links"; - + /** * Config element name specifying excluding regular expression pattern. * <p> @@ -106,7 +156,7 @@ * @since */ public final static String EXCLUDE_CONFIG = "exclude"; - + /** * Config element name specifying including regular expression pattern. * <p> @@ -116,7 +166,7 @@ * @since */ public final static String INCLUDE_CONFIG = "include"; - + /** * Config element name specifying http header value for user-Agent. * <p> @@ -133,7 +183,7 @@ * @since */ public final static String USER_AGENT_DEFAULT = Constants.COMPLETE_NAME; - + /** * Config element name specifying http header value for accept. 
* <p> @@ -152,43 +202,42 @@ * @since */ public final static String ACCEPT_DEFAULT = "*/*"; - + private String linkViewQuery = LINK_VIEW_QUERY_DEFAULT; private String linkContentType = LINK_CONTENT_TYPE_DEFAULT; private HashSet excludeCrawlingURL; private HashSet includeCrawlingURL; private String userAgent = USER_AGENT_DEFAULT; private String accept = ACCEPT_DEFAULT; - + private HashSet crawled; private HashSet linksToProcess; - + /** * Stores links to process and the referrer links */ - private class Link { private URL url; private String referrer; - + public Link( URL url, String referrer ) { this.url = url; this.referrer = referrer; } - + public URL getURL() { return url; } - + public String getReferrer() { return referrer; } - + public boolean equals( Link l ) { return url.equals( l.getURL()); } } - + /** * Configure the crawler component. * <p> @@ -214,8 +263,8 @@ * @since */ public void configure(Configuration configuration) - throws ConfigurationException { - + throws ConfigurationException { + Configuration[] children; children = configuration.getChildren(INCLUDE_CONFIG); if (children != null && children.length > 0) { @@ -229,12 +278,12 @@ this.includeCrawlingURL.add(new RE(tokenized_pattern)); } } catch (RESyntaxException rese) { - getLogger().error("Cannot create includeing regular-expression for " + - pattern, rese); + getLogger().error("Cannot create including regular-expression for " + + pattern, rese); } } } - + children = configuration.getChildren(EXCLUDE_CONFIG); if (children != null && children.length > 0) { excludeCrawlingURL = new HashSet(); @@ -247,15 +296,15 @@ this.excludeCrawlingURL.add(new RE(tokenized_pattern)); } } catch (RESyntaxException rese) { - getLogger().error("Cannot create excluding regular-expression for " + - pattern, rese); + getLogger().error("Cannot create excluding regular-expression for " + + pattern, rese); } } } else { excludeCrawlingURL = new HashSet(); setDefaultExcludeFromCrawling(); } - + Configuration child; String 
value; child = configuration.getChild(LINK_CONTENT_TYPE_CONFIG, false); @@ -272,7 +321,7 @@ this.linkViewQuery = value.trim(); } } - + child = configuration.getChild(USER_AGENT_CONFIG, false); if (child != null) { value = child.getValue(); @@ -280,7 +329,7 @@ this.userAgent = value; } } - + child = configuration.getChild(ACCEPT_CONFIG, false); if (child != null) { value = child.getValue(); @@ -289,20 +338,20 @@ } } } - + public void setup(SourceResolver resolver, Map objectModel, String src, Parameters par) - throws ProcessingException, SAXException, IOException { - + throws ProcessingException, SAXException, IOException { + super.setup(resolver, objectModel, src, par); - + /* Create a reusable attributes for creating nodes */ this.attributes = new AttributesImpl(); - + // already done in configure... //excludeCrawlingURL = new HashSet(); //this.setDefaultExcludeFromCrawling(); } - + /** * Generate XML data. * @@ -312,42 +361,42 @@ * if the requsted URI wasn't found */ public void generate() - throws SAXException, ProcessingException { + throws SAXException, ProcessingException { try { - + crawled = new HashSet(); linksToProcess = new HashSet(); - + URL root = new URL(source); linksToProcess.add(new Link( root, "")); - - + + if (getLogger().isDebugEnabled()) { getLogger().debug("crawl URL " + root); } - + this.contentHandler.startDocument(); this.contentHandler.startPrefixMapping(PREFIX,URI); - + attributes.clear(); super.contentHandler.startElement(URI, TOP_NODE_NAME, URI+':'+TOP_NODE_NAME, attributes); - + while (linksToProcess.size() > 0) { Iterator i = linksToProcess.iterator(); - + if (i.hasNext()) { // fetch a URL Link link = (Link) i.next(); - URL url = link.getURL(); - + URL url = link.getURL(); + // remove it from the to-do list linksToProcess.remove(link); - + String new_url_link = processURL(url, link.getReferrer()); - + // calc all links from this url if (new_url_link != null) { - + List url_links = getLinksFromConnection(new_url_link, url); if 
(url_links != null) { // add links of this url to the to-do list @@ -356,7 +405,7 @@ } } } - + super.contentHandler.endElement(URI, TOP_NODE_NAME, URI+':'+TOP_NODE_NAME); this.contentHandler.endPrefixMapping(PREFIX); this.contentHandler.endDocument(); @@ -365,7 +414,7 @@ throw new ResourceNotFoundException("Could not read source ", ioe); } } - + /** * Default exclude patterns. * <p> @@ -389,19 +438,19 @@ ".*\\.js(\\?.*)?$", ".*\\.css(\\?.*)?$" }; - + for (int i = 0; i < EXCLUDE_FROM_CRAWLING_DEFAULT.length; i++) { String pattern = EXCLUDE_FROM_CRAWLING_DEFAULT[i]; try { excludeCrawlingURL.add(new RE(pattern)); } catch (RESyntaxException rese) { getLogger().error("Cannot create excluding regular-expression for " + - pattern, rese); + pattern, rese); } } } - - + + /** * Retrieve a list of links of a url * @@ -410,31 +459,38 @@ * <code>http://host/foo/bar?cocoon-view=links</code> * @param url_of_referrer base url of which links are requested, ie of the form * <code>http://host/foo/bar</code> - * @return List of links from url_of_referrer, as result of requesting url + * @return List of links from url_of_referrer, as result of requesting url * url_link_string */ protected List getLinksFromConnection(String url_link_string, URL url_of_referrer) { List url_links = null; + BufferedReader br = null; try { URL url_link = new URL( url_link_string ); URLConnection conn = url_link.openConnection(); String content_type = conn.getContentType(); - + + if (content_type == null) { + getLogger().warn( "No content type available for " + String.valueOf( url_link_string ) ); + // caller checks if null + return url_links; + } + if (getLogger().isDebugEnabled()) { getLogger().debug("Content-type: " + content_type); } - + if (content_type.equals(linkContentType)) { url_links = new ArrayList(); - + InputStream is = conn.getInputStream(); - BufferedReader br = new BufferedReader(new InputStreamReader(is)); - + br = new BufferedReader(new InputStreamReader(is)); + // content is supposed to 
be a list of links, // relative to current URL String line; String referrer = url_of_referrer.toString(); - + while ((line = br.readLine()) != null) { URL new_url = new URL(url_link, line); boolean add_url = true; @@ -442,22 +498,22 @@ if (add_url) { add_url &= !url_links.contains(new_url); } - + // don't add new_url if it has been crawled already if (add_url) { add_url &= !crawled.contains(new_url.toString()); } - + Link new_link = new Link( new_url, referrer ); if (add_url) { add_url &= !linksToProcess.contains(new_link); } - + // don't add if is not matched by existing include definition if (add_url) { add_url &= isIncludedURL(new_url.toString()); } - + if (add_url) { if (getLogger().isDebugEnabled()) { getLogger().debug("Add URL: " + new_url.toString()); @@ -469,10 +525,18 @@ } } catch (IOException ioe) { getLogger().warn("Problems get links of " + url_link_string, ioe); + } finally { + // explictly close the stream + if (br != null) { + try { + br.close(); + br = null; + } catch (IOException ignored) {} + } } return url_links; } - + /** * Generate xml attributes of a url, calculate url for retrieving links * @@ -482,57 +546,63 @@ * and not an included-url. 
*/ protected String processURL(URL url, String referrer) throws SAXException { - + if (getLogger().isDebugEnabled()) { getLogger().debug("getLinks URL " + url); } - + String result = null; - + // don't try to investigate a url which has been crawled already if (crawled.contains(url.toString())) { return null; } - + // mark it as crawled crawled.add(url.toString()); - + attributes.clear(); attributes.addAttribute("", HREF_ATTR_NAME, - HREF_ATTR_NAME, "CDATA", url.toString()); + HREF_ATTR_NAME, "CDATA", url.toString()); attributes.addAttribute("", REFERRER_ATTR_NAME, - REFERRER_ATTR_NAME, "CDATA", referrer); - + REFERRER_ATTR_NAME, "CDATA", referrer); + // Output url, referrer, content-type, status, message for traversable url's + HttpURLConnection h = null; try { + URLConnection links_url_connection = url.openConnection(); - HttpURLConnection h = (HttpURLConnection)links_url_connection; + h = (HttpURLConnection)links_url_connection; String content_type = links_url_connection.getContentType(); - + attributes.addAttribute("", CONTENT_ATTR_NAME, - CONTENT_ATTR_NAME, "CDATA", - content_type); - + CONTENT_ATTR_NAME, "CDATA", + content_type); + attributes.addAttribute("", MESSAGE_ATTR_NAME, - MESSAGE_ATTR_NAME, "CDATA", - h.getResponseMessage()); - + MESSAGE_ATTR_NAME, "CDATA", + h.getResponseMessage()); + attributes.addAttribute("", STATUS_ATTR_NAME, - STATUS_ATTR_NAME, "CDATA", - String.valueOf(h.getResponseCode())); + STATUS_ATTR_NAME, "CDATA", + String.valueOf(h.getResponseCode())); } catch (IOException ioe) { attributes.addAttribute("", MESSAGE_ATTR_NAME, - MESSAGE_ATTR_NAME, "CDATA", - ioe.getMessage()); + MESSAGE_ATTR_NAME, "CDATA", + ioe.getMessage()); + } finally { + if (h != null) { + h.disconnect(); + } } - + // don't try to get links of a url which is excluded from crawling - // try to get links of a url which is included for crawling + // try to get links of a url which is included for crawling if (!isExcludedURL(url.toString()) && isIncludedURL( 
url.toString() )) { // add prefix and query to get data from the linkserializer. result = url.toExternalForm() - + ((url.toExternalForm().indexOf("?") == -1) ? "?" : "&") - + linkViewQuery; + + ((url.toExternalForm().indexOf("?") == -1) ? "?" : "&") + + linkViewQuery; } super.contentHandler.startElement(URI, LINK_NODE_NAME, URI+':'+LINK_NODE_NAME, attributes); @@ -540,7 +610,7 @@ return result; } - + /** * check if URL is a candidate for indexing * @@ -556,7 +626,7 @@ } return false; } - + final String s = url.toString(); Iterator i = excludeCrawlingURL.iterator(); while (i.hasNext()) { @@ -573,8 +643,8 @@ } return false; } - - + + /** * check if URL is a candidate for indexing * @@ -590,7 +660,7 @@ } return true; } - + final String s = url.toString(); Iterator i = includeCrawlingURL.iterator(); while (i.hasNext()) { @@ -607,12 +677,11 @@ } return false; } - + public void recycle() { super.recycle(); - + this.attributes = null; //this.excludeCrawlingURL = null; } } -
---------------------------------------------------------------------- In case of trouble, e-mail: [EMAIL PROTECTED] To unsubscribe, e-mail: [EMAIL PROTECTED] For additional commands, e-mail: [EMAIL PROTECTED]