jeremy 2003/10/06 09:30:41
Modified: src/java/org/apache/cocoon/components/crawler
CocoonCrawler.java SimpleCocoonCrawlerImpl.java
Log:
Applied patch to CocoonCrawler.java and SimpleCocoonCrawlerImpl.java from
[EMAIL PROTECTED] (Unico Hommes)
Revision Changes Path
1.3 +17 -11 cocoon-2.1/src/java/org/apache/cocoon/components/crawler/CocoonCrawler.java
Index: CocoonCrawler.java
===================================================================
RCS file: /home/cvs/cocoon-2.1/src/java/org/apache/cocoon/components/crawler/CocoonCrawler.java,v
retrieving revision 1.2
retrieving revision 1.3
diff -u -r1.2 -r1.3
--- CocoonCrawler.java 16 Mar 2003 17:49:11 -0000 1.2
+++ CocoonCrawler.java 6 Oct 2003 16:30:41 -0000 1.3
@@ -67,23 +67,30 @@
/**
* Role name of this avalon component.
* Its value is <code>org.apache.cocoon.components.crawler.CocoonCrawler</code>.
- *
- * @since
*/
String ROLE = CocoonCrawler.class.getName();
/**
- * start crawling the URL.
- * <p>
- * Calling this method initiates the crawling.
- * </p>
+ * This is the same as calling crawl(url,-1);
*
- * @param url Description of Parameter
- * @since
+ * @param url The URL to start crawling from.
*/
void crawl(URL url);
-
+
+
+ /**
+ * Start crawling the URL.
+ * <p>
+ * Calling this method initiates the crawling and tells the
+ * crawler not to crawl beyond a maximum depth.
+ * </p>
+ *
+ * @param url The URL to start crawling from.
+ * @param maxDepth The maximum depth to crawl to; -1 for no maximum.
+ */
+ void crawl(URL url, int maxDepth);
+
/**
* Iterate over crawling URLs.
@@ -93,7 +100,6 @@
* </p>
*
* @return Iterator iterating over crawling URLs.
- * @since
*/
Iterator iterator();
}
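
For context, here is a minimal usage sketch of the extended interface. The ComponentManager lookup/release boilerplate is illustrative only (any Avalon container would do); just the crawl() and iterator() calls come from the interface above.

    // Illustrative sketch, not part of the patch: assumes an Avalon
    // ComponentManager ("manager") is in scope, e.g. via Composable.
    CocoonCrawler crawler = (CocoonCrawler) manager.lookup(CocoonCrawler.ROLE);
    try {
        // crawl(url) is equivalent to crawl(url, -1), i.e. no depth limit;
        // here the crawl stops two levels below the start URL
        crawler.crawl(new URL("http://localhost:8888/"), 2);
        for (Iterator i = crawler.iterator(); i.hasNext();) {
            URL url = (URL) i.next();
            // process each crawled URL ...
        }
    } finally {
        manager.release((Component) crawler);
    }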
1.2 +92 -93 cocoon-2.1/src/java/org/apache/cocoon/components/crawler/SimpleCocoonCrawlerImpl.java
Index: SimpleCocoonCrawlerImpl.java
===================================================================
RCS file: /home/cvs/cocoon-2.1/src/java/org/apache/cocoon/components/crawler/SimpleCocoonCrawlerImpl.java,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -r1.1 -r1.2
--- SimpleCocoonCrawlerImpl.java 9 Mar 2003 00:08:48 -0000 1.1
+++ SimpleCocoonCrawlerImpl.java 6 Oct 2003 16:30:41 -0000 1.2
@@ -86,8 +86,6 @@
* <p>
* Its value is <code>link-content-type</code>.
* </p>
- *
- * @since
*/
public final static String LINK_CONTENT_TYPE_CONFIG = "link-content-type";
@@ -96,10 +94,8 @@
* <p>
* Its value is <code>application/x-cocoon-links</code>.
* </p>
- *
- * @since
*/
- public final String LINK_CONTENT_TYPE_DEFAULT = "application/x-cocoon-links";
+ public final String LINK_CONTENT_TYPE_DEFAULT = Constants.LINK_CONTENT_TYPE;
/**
* Config element name specifying query-string appended for requesting links
@@ -107,18 +103,14 @@
* <p>
* Its value is <code>link-view-query</code>.
* </p>
- *
- * @since
*/
public final static String LINK_VIEW_QUERY_CONFIG = "link-view-query";
/**
- * Default value of <code>link-view-query</code> configuration value.
+ * Default value of <code>link-view-query</code> configuration option.
* <p>
* Its value is <code>?cocoon-view=links</code>.
* </p>
- *
- * @since
*/
public final static String LINK_VIEW_QUERY_DEFAULT = "cocoon-view=links";
@@ -127,8 +119,6 @@
* <p>
* Its value is <code>exclude</code>.
* </p>
- *
- * @since
*/
public final static String EXCLUDE_CONFIG = "exclude";
@@ -137,8 +127,6 @@
* <p>
* Its value is <code>include</code>.
* </p>
- *
- * @since
*/
public final static String INCLUDE_CONFIG = "include";
@@ -147,16 +135,12 @@
* <p>
* Its value is <code>user-agent</code>.
* </p>
- *
- * @since
*/
public final static String USER_AGENT_CONFIG = "user-agent";
/**
- * Default value of <code>user-agent</code> configuration value.
+ * Default value of <code>user-agent</code> configuration option.
* @see Constants#COMPLETE_NAME
- *
- * @since
*/
public final static String USER_AGENT_DEFAULT = Constants.COMPLETE_NAME;
@@ -165,21 +149,18 @@
* <p>
* Its value is <code>accept</code>.
* </p>
- *
- * @since
*/
public final static String ACCEPT_CONFIG = "accept";
/**
- * Default value of <code>accept</code> configuration value.
+ * Default value of <code>accept</code> configuration option.
* <p>
* Its value is <code>* / *</code>
* </p>
- *
- * @since
*/
public final static String ACCEPT_DEFAULT = "*/*";
+
private String linkViewQuery = LINK_VIEW_QUERY_DEFAULT;
private String linkContentType = LINK_CONTENT_TYPE_DEFAULT;
private HashSet excludeCrawlingURL;
@@ -187,14 +168,16 @@
private String userAgent = USER_AGENT_DEFAULT;
private String accept = ACCEPT_DEFAULT;
+ private int depth;
+
private HashSet crawled;
private HashSet urlsToProcess;
-
-
+ private HashSet urlsNextDepth;
+
+
+
/**
* Constructor for the SimpleCocoonCrawlerImpl object
- *
- * @since
*/
public SimpleCocoonCrawlerImpl() {
// by default include everything
@@ -220,11 +203,11 @@
* <exclude>.*\.gif</exclude> or <exclude>.*\.gif, .*\.jpe?g</exclude>
* <link-content-type> application/x-cocoon-links </link-content-type>
* <link-view-query> ?cocoon-view=links </link-view-query>
+ * <crawl-domain> host | web </crawl-domain>
* </tt></pre>
*
* @param configuration XML configuration of this avalon component.
* @exception ConfigurationException is thrown if the configuration is invalid.
- * @since
*/
public void configure(Configuration configuration)
throws ConfigurationException {
@@ -308,17 +291,17 @@
this.accept = value;
}
}
+
}
/**
* dispose at end of life cycle, releasing all resources.
- *
- * @since
*/
public void dispose() {
crawled = null;
urlsToProcess = null;
+ urlsNextDepth = null;
excludeCrawlingURL = null;
includeCrawlingURL = null;
}
@@ -326,54 +309,65 @@
/**
* recycle this object, releasing resources
- *
- * @since
*/
public void recycle() {
crawled = null;
urlsToProcess = null;
+ urlsNextDepth = null;
+ depth = -1;
}
/**
- * Start crawling a URL.
- *
- * <p>
- * Use this method to start crawling.
- * Get the this url, and all its children by using <code>iterator()</code>.
- * The Iterator object will return URL objects.
- * </p>
- * <p>
- * You may use the crawl(), and iterator() methods the following way:
- * </p>
- * <pre><tt>
- * SimpleCocoonCrawlerImpl scci = ....;
- * scci.crawl( "http://foo/bar" );
- * Iterator i = scci.iterator();
- * while (i.hasNext()) {
- * URL url = (URL)i.next();
- * ...
- * }
- * </tt></pre>
- * <p>
- * The i.next() method returns a URL, and calculates the links of the
- * URL before return it.
- * </p>
- *
- * @param url Crawl this URL, getting all links from this URL.
- * @since
+ * The same as calling crawl(url,-1);
+ *
+ * @param url Crawl this URL, getting all links from this URL.
*/
public void crawl(URL url) {
- crawled = new HashSet();
- urlsToProcess = new HashSet();
-
- if (getLogger().isDebugEnabled()) {
- getLogger().debug("crawl URL " + url);
- }
-
- urlsToProcess.add(url);
+ crawl(url, -1);
}
+ /**
+ * Start crawling a URL.
+ *
+ * <p>
+ * Use this method to start crawling.
+ * Get this url, and all its children, by using <code>iterator()</code>.
+ * The Iterator object will return URL objects.
+ * </p>
+ * <p>
+ * You may use the crawl() and iterator() methods in the following way:
+ * </p>
+ * <pre><tt>
+ * SimpleCocoonCrawlerImpl scci = ....;
+ * scci.crawl( "http://foo/bar" );
+ * Iterator i = scci.iterator();
+ * while (i.hasNext()) {
+ * URL url = (URL)i.next();
+ * ...
+ * }
+ * </tt></pre>
+ * <p>
+ * The i.next() method returns a URL, and calculates the links of the
+ * URL before returning it.
+ * </p>
+ *
+ * @param url Crawl this URL, getting all links from this URL.
+ * @param maxDepth The maximum depth to crawl to; -1 for no maximum.
+ */
+ public void crawl(URL url, int maxDepth) {
+ crawled = new HashSet();
+ urlsToProcess = new HashSet();
+ urlsNextDepth = new HashSet();
+ depth = maxDepth;
+
+ if (getLogger().isDebugEnabled()) {
+ getLogger().debug("crawl URL " + url + " to depth " + maxDepth);
+ }
+
+ urlsToProcess.add(url);
+ }
+
/**
* Return iterator, iterating over all links of the currently crawled URL.
@@ -464,6 +458,9 @@
+ ((sURL.indexOf("?") == -1) ? "?" : "&")
+ linkViewQuery);
URLConnection links_url_connection = links.openConnection();
+ links_url_connection.setRequestProperty("Accept", accept);
+ links_url_connection.setRequestProperty("User-Agent", userAgent);
+ links_url_connection.connect();
InputStream is = links_url_connection.getInputStream();
br = new BufferedReader(new InputStreamReader(is));
@@ -503,7 +500,7 @@
if (add_url) {
add_url &= !crawled.contains(new_url.toString());
}
-
+
// don't add if it is not matched by an existing include definition
if (add_url) {
add_url &= isIncludedURL(new_url.toString());
@@ -536,13 +533,12 @@
return url_links;
}
-
+
/**
* check if URL is a candidate for indexing
*
- * @param url Description of Parameter
+ * @param url the URL to check
* @return The excludedURL value
- * @since
*/
private boolean isExcludedURL(String url) {
// by default do not exclude URL for crawling
@@ -573,7 +569,6 @@
*
* @param url Description of Parameter
* @return The includedURL value
- * @since
*/
private boolean isIncludedURL(String url) {
// by default include URL for crawling
@@ -597,7 +592,7 @@
}
return false;
}
-
+
/**
* Helper class implementing an Iterator
@@ -614,10 +609,9 @@
/**
- *Constructor for the CocoonCrawlerIterator object
+ * Constructor for the CocoonCrawlerIterator object
*
- * @param cocoonCrawler Description of Parameter
- * @since
+ * @param cocoonCrawler the containing CocoonCrawler instance.
*/
CocoonCrawlerIterator(SimpleCocoonCrawlerImpl cocoonCrawler) {
this.cocoonCrawler = cocoonCrawler;
@@ -627,21 +621,26 @@
/**
* check if crawling is finished.
*
- * @return Description of the Returned Value
- * @since
+ * @return <code>true</code> if crawling has finished,
+ * else <code>false</code>.
*/
public boolean hasNext() {
- return cocoonCrawler.urlsToProcess.size() > 0;
+ return cocoonCrawler.urlsToProcess.size() > 0
+ || cocoonCrawler.urlsNextDepth.size() > 0;
}
/**
- * return the next URL
- *
- * @return Description of the Returned Value
- * @since
+ * @return the next URL
*/
public Object next() {
+ if (cocoonCrawler.urlsToProcess.size() == 0
+ && cocoonCrawler.urlsNextDepth.size() > 0) {
+ // process queued urls belonging to the next depth level
+ cocoonCrawler.urlsToProcess = cocoonCrawler.urlsNextDepth;
+ cocoonCrawler.urlsNextDepth = new HashSet();
+ cocoonCrawler.depth--;
+ }
URL url = null;
Iterator i = cocoonCrawler.urlsToProcess.iterator();
if (i.hasNext()) {
@@ -651,12 +650,14 @@
// remove it from the to-do list
cocoonCrawler.urlsToProcess.remove(url);
- // calc all links from this url
- List url_links = cocoonCrawler.getLinks(url);
- if (url_links != null) {
- // add links of this url to the to-do list
- cocoonCrawler.urlsToProcess.addAll(url_links);
- }
+ if (cocoonCrawler.depth == -1 || cocoonCrawler.depth > 0) {
+ // calc all links from this url
+ List url_links = cocoonCrawler.getLinks(url);
+ if (url_links != null) {
+ // add links of this url to the to-do list
+ cocoonCrawler.urlsNextDepth.addAll(url_links);
+ }
+ }
}
// finally return this url
return url;
@@ -665,8 +666,6 @@
/**
* remove is not implemented
- *
- * @since
*/
public void remove() {
throw new UnsupportedOperationException("remove is not implemented");