upayavira 2003/10/06 05:40:14
Modified: src/java/org/apache/cocoon/bean CocoonBean.java src/java/org/apache/cocoon/bean/helpers OutputStreamListener.java Log: Moved crawling code into a separate crawler class. This has made it possible to implement all of my proposed reporting options. The CLI now reports: * [no. of pages generated/no. of pages left] [new links in page/total links in page] page URI Revision Changes Path 1.32 +52 -75 cocoon-2.1/src/java/org/apache/cocoon/bean/CocoonBean.java Index: CocoonBean.java =================================================================== RCS file: /home/cvs/cocoon-2.1/src/java/org/apache/cocoon/bean/CocoonBean.java,v retrieving revision 1.31 retrieving revision 1.32 diff -u -r1.31 -r1.32 --- CocoonBean.java 1 Oct 2003 20:27:49 -0000 1.31 +++ CocoonBean.java 6 Oct 2003 12:40:14 -0000 1.32 @@ -53,6 +53,7 @@ import org.apache.cocoon.Constants; import org.apache.cocoon.ResourceNotFoundException; import org.apache.cocoon.ProcessingException; +import org.apache.cocoon.bean.helpers.Crawler; import org.apache.cocoon.bean.helpers.DelayedOutputStream; import org.apache.cocoon.components.notification.SimpleNotifyingBean; import org.apache.cocoon.components.notification.Notifier; @@ -95,7 +96,6 @@ private boolean precompileOnly = false; private boolean confirmExtension = true; private String defaultFilename = Constants.INDEX_URI; - private List targets = new ArrayList(); private boolean brokenLinkGenerate = false; private String brokenLinkExtension = ""; private List excludePatterns = new ArrayList(); @@ -103,12 +103,15 @@ private List includeLinkExtensions = null; // Internal Objects - private Map allProcessedLinks; - private Map allTranslatedLinks; private boolean initialized; private List listeners = new ArrayList(); private boolean verbose; SourceResolver sourceResolver; + private Crawler crawler; + + public CocoonBean() { + this.crawler = new Crawler(); + } // // INITIALISATION METHOD @@ -118,7 +121,7 @@ if (this.initialized == false) { super.initialize(); - 
if (targets.size() == 0 && !precompileOnly) { + if (crawler.getRemainingCount() == 0 && !precompileOnly) { String error = "Please, specify at least one starting URI."; log.fatalError(error); throw new ProcessingException(error); @@ -196,7 +199,7 @@ target.setFollowLinks(this.followLinks); target.setConfirmExtension(this.confirmExtension); target.setLogger(this.logger); - targets.add(target); + crawler.addTarget(target); } public void addTarget(String type, String sourceURI, String destURI) @@ -206,7 +209,7 @@ target.setFollowLinks(this.followLinks); target.setConfirmExtension(this.confirmExtension); target.setLogger(this.logger); - targets.add(target); + crawler.addTarget(target); } public void addTarget(String sourceURI, String destURI) @@ -216,7 +219,7 @@ target.setFollowLinks(this.followLinks); target.setConfirmExtension(this.confirmExtension); target.setLogger(this.logger); - targets.add(target); + crawler.addTarget(target); } public void addTargets(List uris, String destURI) @@ -228,7 +231,7 @@ target.setFollowLinks(this.followLinks); target.setConfirmExtension(this.confirmExtension); target.setLogger(this.logger); - targets.add(target); + crawler.addTarget(target); } } @@ -254,7 +257,7 @@ target.setFollowLinks(followLinks); target.setConfirmExtension(confirmExtension); target.setLogger(logger); - targets.add(target); + crawler.addTarget(target); } public void addExcludePattern(String pattern) { @@ -351,60 +354,34 @@ this.initialize(); } - allProcessedLinks = new HashMap(); - allTranslatedLinks = new HashMap(); - - Map targetMap = new HashMap(); - Iterator i = targets.iterator(); - while (i.hasNext()) { - Target target = (Target) i.next(); - targetMap.put(target, target); - } - - int nCount = 0; - while (targetMap.size() > 0) { - Target target = (Target) targetMap.keySet().iterator().next(); - try { - if (!allProcessedLinks.containsKey(target)) { - if (precompileOnly) { - processXSP(target.getSourceURI()); - } else if (this.followLinks) { - i = 
processTarget(target).iterator(); - while (i.hasNext()) { - Target link = (Target) i.next(); - targetMap.put(link, link); - } - } else { - processTarget(target); - } + if (crawler.getRemainingCount()==0) { + super.precompile(); + } else { + Iterator iterator = crawler.iterator(); + while (iterator.hasNext()) { + Target target = (Target) iterator.next(); + if (precompileOnly) { + processXSP(target.getSourceURI()); + } else { + processTarget(crawler, target); } - } catch (ResourceNotFoundException rnfe) { - this.sendBrokenLinkWarning(target.getSourceURI(), rnfe.getMessage()); } - - targetMap.remove(target); - nCount++; - - if (log.isInfoEnabled()) { - log.info( - " Memory used: " - + (Runtime.getRuntime().totalMemory() - - Runtime.getRuntime().freeMemory())); - log.info( - " Processed, Translated & Left: " - + allProcessedLinks.size() - + ", " - + allTranslatedLinks.size() - + ", " - + targetMap.size()); - } - } - - if (nCount == 0) { - super.precompile(); } + if (log.isInfoEnabled()) { + log.info( + " Memory used: " + + (Runtime.getRuntime().totalMemory() + - Runtime.getRuntime().freeMemory())); + log.info( + " Processed, Translated & Left: " + + crawler.getProcessedCount() + + ", " + + crawler.getTranslatedCount() + + ", " + + crawler.getRemainingCount()); + } } - + /** * Processes the given Target and return all links. * @@ -439,25 +416,22 @@ * Target objects. 
* @exception Exception if an error occurs */ - private Collection processTarget(Target target) throws Exception { + private void processTarget(Crawler crawler, Target target) throws Exception { int status = 0; int linkCount = 0; int newLinkCount = 0; int pageSize = 0; - long startTimeMillis = System.currentTimeMillis(); if (target.confirmExtensions()) { - if (null == allTranslatedLinks.get(target.getSourceURI())) { + if (!crawler.hasTranslatedLink(target)) { final String mimeType = getType(target.getDeparameterizedSourceURI(), target.getParameters()); target.setMimeType(mimeType); - allTranslatedLinks.put(target.getSourceURI(), target.getDestinationURI()); + crawler.addTranslatedLink(target); } } - // Store processed URI list to avoid eternal loop - allProcessedLinks.put(target, target); // IS THIS STILL NEEDED? //if ("".equals(destinationURI)) { @@ -466,7 +440,6 @@ // Process links final HashMap translatedLinks = new HashMap(); - final List targets = new ArrayList(); if (target.followLinks() && target.confirmExtensions() && isCrawlablePage(target)) { final Iterator i = this.getLinks(target.getDeparameterizedSourceURI(), target.getParameters()).iterator(); @@ -485,18 +458,22 @@ continue; } - if (null == allTranslatedLinks.get(linkTarget.getSourceURI())) { + if (!crawler.hasTranslatedLink(linkTarget)) { try { final String mimeType = getType(linkTarget.getDeparameterizedSourceURI(), linkTarget.getParameters()); linkTarget.setMimeType(mimeType); - allTranslatedLinks.put(linkTarget.getSourceURI(), linkTarget.getDestinationURI()); + crawler.addTranslatedLink(linkTarget); log.info(" Link translated: " + linkTarget.getSourceURI()); - targets.add(linkTarget); + if (crawler.addTarget(linkTarget)) { + newLinkCount++; + } } catch (ProcessingException pe) { this.sendBrokenLinkWarning(linkTarget.getSourceURI(), pe.getMessage()); if (this.brokenLinkGenerate) { - targets.add(linkTarget); + if (crawler.addTarget(linkTarget)) { + newLinkCount++; + } } } } @@ -546,7 +523,9 @@ 
pageSkipped(linkTarget.getSourceURI(), "matched include/exclude rules"); continue; } - targets.add(linkTarget); + if (crawler.addTarget(linkTarget)) { + newLinkCount++; + } } linkCount = gatheredLinks.size(); } @@ -573,8 +552,8 @@ pageSize, linkCount, newLinkCount, - 0, //pagesRemaining, @TODO@ Implement this - 0, //pagesComplete, @TODO@ Implement this + crawler.getRemainingCount(), + crawler.getProcessedCount(), System.currentTimeMillis()- startTimeMillis); } catch (IOException ioex) { @@ -588,8 +567,6 @@ log.warn("Could not process URI: " + target.getSourceURI()); this.sendBrokenLinkWarning(target.getSourceURI(), "URI not found"); } - - return targets; } /** 1.5 +16 -5 cocoon-2.1/src/java/org/apache/cocoon/bean/helpers/OutputStreamListener.java Index: OutputStreamListener.java =================================================================== RCS file: /home/cvs/cocoon-2.1/src/java/org/apache/cocoon/bean/helpers/OutputStreamListener.java,v retrieving revision 1.4 retrieving revision 1.5 diff -u -r1.4 -r1.5 --- OutputStreamListener.java 27 Sep 2003 09:50:29 -0000 1.4 +++ OutputStreamListener.java 6 Oct 2003 12:40:14 -0000 1.5 @@ -75,6 +75,8 @@ private final long startTimeMillis; private String reportFile = null; private String reportType = "text"; + private long siteSize = 0L; + private int sitePages = 0; public OutputStreamListener(OutputStream os) { writer = new PrintWriter(os); @@ -97,6 +99,9 @@ int pagesRemaining, int pagesComplete, long timeTaken) { + this.siteSize += pageSize; + this.sitePages++; + double time = (((double)timeTaken)/1000); String size; @@ -109,7 +114,8 @@ if (linksInPage == -1) { this.print("* " + sourceURI); } else { - this.print(pad(8, "* ["+linksInPage + "] ") + + this.print(pad(12, "* [" + pagesComplete + "/" + pagesRemaining + "] ") + + pad(10, "[" + newLinksInPage + "/" + linksInPage + "] ") + pad(7,time + "s ") + pad(7, size) + " " + sourceURI); @@ -125,7 +131,7 @@ } public void brokenLinkFound(String uri, String parentURI, String 
message, Throwable t) { - this.print(pad(28,"X [0] ")+uri+"\tBROKEN: "+message); + this.print(pad(42,"X [0] ")+uri+"\tBROKEN: "+message); brokenLinks.add(uri + "\t" + message); // StringWriter sw = new StringWriter(); @@ -135,14 +141,19 @@ } public void pageSkipped(String uri, String message) { - this.print(pad(23, "^ ") + uri); + this.print(pad(37, "^ ") + uri); } public void complete() { outputBrokenLinks(); long duration = System.currentTimeMillis() - startTimeMillis; - this.print("Total time: " + (duration / 60000) + " minutes " + (duration % 60000)/1000 + " seconds"); + + this.print("Total time: " + + (duration / 60000) + " minutes " + + (duration % 60000)/1000 + " seconds, " + + " Site size: " + this.siteSize + + " Site pages: " + this.sitePages); this.close(); }