huber 2002/08/04 11:01:22 Modified: src/java/org/apache/cocoon/components/crawler SimpleCocoonCrawlerImpl.java Log: Check if contentType is null, close BufferedReader explicitly Submitted by: [EMAIL PROTECTED] Reviewed by: [EMAIL PROTECTED] Revision Changes Path 1.12 +111 -96 xml-cocoon2/src/java/org/apache/cocoon/components/crawler/SimpleCocoonCrawlerImpl.java Index: SimpleCocoonCrawlerImpl.java =================================================================== RCS file: /home/cvs/xml-cocoon2/src/java/org/apache/cocoon/components/crawler/SimpleCocoonCrawlerImpl.java,v retrieving revision 1.11 retrieving revision 1.12 diff -u -r1.11 -r1.12 --- SimpleCocoonCrawlerImpl.java 31 Jul 2002 13:13:21 -0000 1.11 +++ SimpleCocoonCrawlerImpl.java 4 Aug 2002 18:01:22 -0000 1.12 @@ -1,36 +1,36 @@ /* - + ============================================================================ The Apache Software License, Version 1.1 ============================================================================ - + Copyright (C) 1999-2002 The Apache Software Foundation. All rights reserved. - + Redistribution and use in source and binary forms, with or without modifica- tion, are permitted provided that the following conditions are met: - + 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - + 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - + 3. The end-user documentation included with the redistribution, if any, must include the following acknowledgment: "This product includes software developed by the Apache Software Foundation (http://www.apache.org/)." Alternately, this acknowledgment may appear in the software itself, if and wherever such third-party acknowledgments normally appear. - + 4. 
The names "Apache Cocoon" and "Apache Software Foundation" must not be used to endorse or promote products derived from this software without prior written permission. For written permission, please contact [EMAIL PROTECTED] - + 5. Products derived from this software may not be called "Apache", nor may "Apache" appear in their name, without prior written permission of the Apache Software Foundation. - + THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE @@ -41,13 +41,13 @@ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - + This software consists of voluntary contributions made by many individuals on behalf of the Apache Software Foundation and was originally created by Stefano Mazzocchi <[EMAIL PROTECTED]>. For more information on the Apache Software Foundation, please see <http://www.apache.org/>. - -*/ + + */ package org.apache.cocoon.components.crawler; import org.apache.avalon.framework.activity.Disposable; @@ -73,6 +73,7 @@ import java.io.IOException; import java.net.URL; import java.net.URLConnection; +import java.net.HttpURLConnection; import java.util.HashSet; import java.util.Iterator; import java.util.List; @@ -97,7 +98,7 @@ * @since */ public final static String LINK_CONTENT_TYPE_CONFIG = "link-content-type"; - + /** * Default value of <code>link-content-type</code> configuration value. * <p> @@ -107,7 +108,7 @@ * @since */ public final String LINK_CONTENT_TYPE_DEFAULT = "application/x-cocoon-links"; - + /** * Config element name specifying query-string appendend for requesting links * of an URL. 
@@ -118,7 +119,7 @@ * @since */ public final static String LINK_VIEW_QUERY_CONFIG = "link-view-query"; - + /** * Default value of <code>link-view-query</code> configuration value. * <p> @@ -128,7 +129,7 @@ * @since */ public final static String LINK_VIEW_QUERY_DEFAULT = "cocoon-view=links"; - + /** * Config element name specifying excluding regular expression pattern. * <p> @@ -138,7 +139,7 @@ * @since */ public final static String EXCLUDE_CONFIG = "exclude"; - + /** * Config element name specifying including regular expression pattern. * <p> @@ -148,7 +149,7 @@ * @since */ public final static String INCLUDE_CONFIG = "include"; - + /** * Config element name specifying http header value for user-Agent. * <p> @@ -158,17 +159,15 @@ * @since */ public final static String USER_AGENT_CONFIG = "user-agent"; - + /** * Default value of <code>user-agent</code> configuration value. - * <p> - * Its value is @see org.apache.cocoon.Constants#COMPLETE_NAME. - * </p> + * @see org.apache.cocoon.Constants#COMPLETE_NAME. * * @since */ public final static String USER_AGENT_DEFAULT = Constants.COMPLETE_NAME; - + /** * Config element name specifying http header value for accept. * <p> @@ -178,7 +177,7 @@ * @since */ public final static String ACCEPT_CONFIG = "accept"; - + /** * Default value of <code>accept</code> configuration value. * <p> @@ -188,18 +187,18 @@ * @since */ public final static String ACCEPT_DEFAULT = "*/*"; - + private String linkViewQuery = LINK_VIEW_QUERY_DEFAULT; private String linkContentType = LINK_CONTENT_TYPE_DEFAULT; private HashSet excludeCrawlingURL; private HashSet includeCrawlingURL; private String userAgent = USER_AGENT_DEFAULT; private String accept = ACCEPT_DEFAULT; - + private HashSet crawled; private HashSet urlsToProcess; - - + + /** * Constructor for the SimpleCocoonCrawlerImpl object * @@ -211,8 +210,8 @@ // by default exclude common image patterns excludeCrawlingURL = null; } - - + + /** * Configure the crawler component. 
* <p> @@ -236,8 +235,8 @@ * @since */ public void configure(Configuration configuration) - throws ConfigurationException { - + throws ConfigurationException { + Configuration[] children; children = configuration.getChildren(INCLUDE_CONFIG); if (children != null && children.length > 0) { @@ -252,7 +251,7 @@ } } catch (RESyntaxException rese) { getLogger().error("Cannot create including regular-expression for " + - pattern, rese); + pattern, rese); } } } else { @@ -260,7 +259,7 @@ getLogger().debug("Include all URLs"); } } - + children = configuration.getChildren(EXCLUDE_CONFIG); if (children != null && children.length > 0) { excludeCrawlingURL = new HashSet(); @@ -274,7 +273,7 @@ } } catch (RESyntaxException rese) { getLogger().error("Cannot create excluding regular-expression for " + - pattern, rese); + pattern, rese); } } } else { @@ -284,7 +283,7 @@ getLogger().debug("Exclude default URLs only"); } } - + Configuration child; String value; child = configuration.getChild(LINK_CONTENT_TYPE_CONFIG, false); @@ -301,7 +300,7 @@ this.linkViewQuery = value.trim(); } } - + child = configuration.getChild(USER_AGENT_CONFIG, false); if (child != null) { value = child.getValue(); @@ -309,7 +308,7 @@ this.userAgent = value; } } - + child = configuration.getChild(ACCEPT_CONFIG, false); if (child != null) { value = child.getValue(); @@ -318,8 +317,8 @@ } } } - - + + /** * dispose at end of life cycle, releasing all resources. * @@ -331,8 +330,8 @@ excludeCrawlingURL = null; includeCrawlingURL = null; } - - + + /** * recylcle this object, relasing resources * @@ -342,8 +341,8 @@ crawled = null; urlsToProcess = null; } - - + + /** * Start crawling a URL. * @@ -375,15 +374,15 @@ public void crawl(URL url) { crawled = new HashSet(); urlsToProcess = new HashSet(); - + if (getLogger().isDebugEnabled()) { getLogger().debug("crawl URL " + url); } - + urlsToProcess.add(url); } - - + + /** * Return iterator, iterating over all links of the currently crawled URL. 
* <p> @@ -397,8 +396,8 @@ public Iterator iterator() { return new CocoonCrawlerIterator(this); } - - + + /** * Default exclude patterns. * <p> @@ -416,25 +415,25 @@ */ private void setDefaultExcludeFromCrawling() { String[] EXCLUDE_FROM_CRAWLING_DEFAULT = { - ".*\\.gif(\\?.*)?$", - ".*\\.png(\\?.*)?$", - ".*\\.jpe?g(\\?.*)?$", - ".*\\.js(\\?.*)?$", - ".*\\.css(\\?.*)?$" - }; - + ".*\\.gif(\\?.*)?$", + ".*\\.png(\\?.*)?$", + ".*\\.jpe?g(\\?.*)?$", + ".*\\.js(\\?.*)?$", + ".*\\.css(\\?.*)?$" + }; + for (int i = 0; i < EXCLUDE_FROM_CRAWLING_DEFAULT.length; i++) { String pattern = EXCLUDE_FROM_CRAWLING_DEFAULT[i]; try { excludeCrawlingURL.add(new RE(pattern)); } catch (RESyntaxException rese) { getLogger().error("Cannot create excluding regular-expression for " + - pattern, rese); + pattern, rese); } } } - - + + /** * Compute list of links from the url. * <p> @@ -449,44 +448,54 @@ private List getLinks(URL url) { ArrayList url_links = null; String sURL = url.toString(); - + if (!isIncludedURL(sURL) || isExcludedURL(sURL)) { return null; } - + // don't try to get links for url which has been crawled already if (crawled.contains(sURL)) { return null; } - + // mark it as crawled crawled.add(sURL); - + // get links of url if (getLogger().isDebugEnabled()) { getLogger().debug("Getting links of URL " + sURL); } + BufferedReader br = null; try { sURL = url.getFile(); URL links = new URL(url, sURL - + ((sURL.indexOf("?") == -1) ? "?" : "&") - + linkViewQuery); + + ((sURL.indexOf("?") == -1) ? "?" 
: "&") + + linkViewQuery); URLConnection links_url_connection = links.openConnection(); InputStream is = links_url_connection.getInputStream(); - BufferedReader br = new BufferedReader(new InputStreamReader(is)); - + br = new BufferedReader(new InputStreamReader(is)); + String contentType = links_url_connection.getContentType(); + if (contentType == null) { + if (getLogger().isDebugEnabled()) { + getLogger().debug("Ignoring " + sURL + " (no content type)"); + } + // there is a check on null in the calling method + return null; + } + int index = contentType.indexOf(';'); - if (contentType != null && index != -1) { + if (index != -1) { contentType = contentType.substring(0, index); } + if (getLogger().isDebugEnabled()) { getLogger().debug("Content-type: " + contentType); } - + if (contentType.equals(linkContentType)) { url_links = new ArrayList(); - + // content is supposed to be a list of links, // relative to current URL String line; @@ -497,17 +506,17 @@ if (add_url) { add_url &= !url_links.contains(new_url); } - + // don't add new_url if it has been crawled already if (add_url) { add_url &= !crawled.contains(new_url.toString()); } - + // don't add if is not matched by existing include definition if (add_url) { add_url &= isIncludedURL(new_url.toString()); } - + // don't add if is matched by existing exclude definition if (add_url) { add_url &= !isExcludedURL(new_url.toString()); @@ -523,11 +532,18 @@ } } catch (IOException ioe) { getLogger().warn("Problems get links of " + url, ioe); + } finally { + if( br != null ) { + try { + br.close(); + br = null; + } catch (IOException ignored ) {} + } } return url_links; } - - + + /** * check if URL is a candidate for indexing * @@ -540,7 +556,7 @@ if (excludeCrawlingURL == null) { return false; } - + final String s = url.toString(); Iterator i = excludeCrawlingURL.iterator(); while (i.hasNext()) { @@ -557,8 +573,8 @@ } return false; } - - + + /** * check if URL is a candidate for indexing * @@ -571,7 +587,7 @@ if 
(includeCrawlingURL == null) { return true; } - + final String s = url.toString(); Iterator i = includeCrawlingURL.iterator(); while (i.hasNext()) { @@ -588,8 +604,8 @@ } return false; } - - + + /** * Helper class implementing an Iterator * <p> @@ -600,11 +616,10 @@ * @author <a href="mailto:[EMAIL PROTECTED]">Bernhard Huber</a> * @version $Id$ */ - public static class CocoonCrawlerIterator implements Iterator - { + public static class CocoonCrawlerIterator implements Iterator { private SimpleCocoonCrawlerImpl cocoonCrawler; - - + + /** *Constructor for the CocoonCrawlerIterator object * @@ -614,8 +629,8 @@ CocoonCrawlerIterator(SimpleCocoonCrawlerImpl cocoonCrawler) { this.cocoonCrawler = cocoonCrawler; } - - + + /** * check if crawling is finished. * @@ -625,8 +640,8 @@ public boolean hasNext() { return cocoonCrawler.urlsToProcess.size() > 0; } - - + + /** * return the next URL * @@ -639,10 +654,10 @@ if (i.hasNext()) { // fetch a URL url = (URL) i.next(); - + // remove it from the to-do list cocoonCrawler.urlsToProcess.remove(url); - + // calc all links from this url List url_links = cocoonCrawler.getLinks(url); if (url_links != null) { @@ -653,8 +668,8 @@ // finally return this url return url; } - - + + /** * remove is not implemented *
---------------------------------------------------------------------- In case of troubles, e-mail: [EMAIL PROTECTED] To unsubscribe, e-mail: [EMAIL PROTECTED] For additional commands, e-mail: [EMAIL PROTECTED]