upayavira 2003/09/18 05:11:50
Modified: src/java/org/apache/cocoon Main.java src/java/org/apache/cocoon/bean BeanListener.java CocoonBean.java Target.java src/java/org/apache/cocoon/bean/helpers DelayedOutputStream.java OutputStreamListener.java src/java/org/apache/cocoon/util NetUtils.java Log: Added removeAuthorisation() to NetUtils, removes FTP and HTTP authentication details from a URI. Could do it with more protocols. Added page skipping reporting (link preceded with ^) Added facility to only crawl certain extensions (this didn't give any hoped for speed improvement, but I might as well leave it there, as some might want to crawl HTML but not PDFs) Prevented link gathering when running in link view mode Added code to bean to report time taken and page size Improved layout of console reporting of CLI (now shows links found, time taken, page size and URI, all in nice columns) Revision Changes Path 1.16 +13 -1 cocoon-2.1/src/java/org/apache/cocoon/Main.java Index: Main.java =================================================================== RCS file: /home/cvs/cocoon-2.1/src/java/org/apache/cocoon/Main.java,v retrieving revision 1.15 retrieving revision 1.16 diff -u -r1.15 -r1.16 --- Main.java 17 Sep 2003 01:13:44 -0000 1.15 +++ Main.java 18 Sep 2003 12:11:49 -0000 1.16 @@ -162,6 +162,9 @@ private static final String NODE_EXCLUDE = "exclude"; private static final String ATTR_INCLUDE_EXCLUDE_PATTERN = "pattern"; + private static final String NODE_INCLUDE_LINKS = "include-links"; + private static final String ATTR_LINK_EXTENSION = "extension"; + private static final String NODE_URI = "uri"; private static final String ATTR_URI_TYPE = "type"; private static final String ATTR_URI_SOURCEPREFIX = "src-prefix"; @@ -501,6 +504,9 @@ String pattern = Main.parseIncludeExcludeNode(cocoon, node, NODE_EXCLUDE); cocoon.addExcludePattern(pattern); + } else if (nodeName.equals(NODE_INCLUDE_LINKS)) { + Main.parseIncludeLinksNode(cocoon, node); + } else if (nodeName.equals(NODE_URI)) { 
Main.parseURINode(cocoon, node, destDir); @@ -532,6 +538,12 @@ NodeList nodes = node.getChildNodes(); if (nodes.getLength()!=0) { throw new IllegalArgumentException("Unexpected children of <" + NODE_LOGGING + "> node"); + } + } + + private static void parseIncludeLinksNode(CocoonBean cocoon, Node node) throws IllegalArgumentException { + if (Main.hasAttribute(node, ATTR_LINK_EXTENSION)) { + cocoon.addIncludeLinkExtension(Main.getAttributeValue(node, ATTR_LINK_EXTENSION)); } } 1.3 +3 -3 cocoon-2.1/src/java/org/apache/cocoon/bean/BeanListener.java Index: BeanListener.java =================================================================== RCS file: /home/cvs/cocoon-2.1/src/java/org/apache/cocoon/bean/BeanListener.java,v retrieving revision 1.2 retrieving revision 1.3 diff -u -r1.2 -r1.3 --- BeanListener.java 15 Sep 2003 19:18:17 -0000 1.2 +++ BeanListener.java 18 Sep 2003 12:11:50 -0000 1.3 @@ -71,7 +71,7 @@ int linksInPage, int newLinksinPage, int pagesRemaining, - int pageComplete, + int pagesComplete, long timeTaken); /** @@ -79,7 +79,7 @@ * include/exclude pattern. 
* @param msg */ - public void pageSkipped(String uri); + public void pageSkipped(String uri, String message); /** * Report a general message about operation of the bean 1.27 +68 -17 cocoon-2.1/src/java/org/apache/cocoon/bean/CocoonBean.java Index: CocoonBean.java =================================================================== RCS file: /home/cvs/cocoon-2.1/src/java/org/apache/cocoon/bean/CocoonBean.java,v retrieving revision 1.26 retrieving revision 1.27 diff -u -r1.26 -r1.27 --- CocoonBean.java 15 Sep 2003 19:18:17 -0000 1.26 +++ CocoonBean.java 18 Sep 2003 12:11:50 -0000 1.27 @@ -100,7 +100,8 @@ private String brokenLinkExtension = ""; private List excludePatterns = new ArrayList(); private List includePatterns = new ArrayList(); - + private List includeLinkExtensions = null; + // Internal Objects private Map allProcessedLinks; private Map allTranslatedLinks; @@ -223,15 +224,36 @@ includePatterns.add(preparedPattern); } + public void addIncludeLinkExtension(String extension) { + if (includeLinkExtensions == null) { + includeLinkExtensions = new ArrayList(); + } + includeLinkExtensions.add(extension); + } + public void addListener(BeanListener listener) { this.listeners.add(listener); } - public void pageGenerated(String uri, int linksInPage, int pagesRemaining) { + public void pageGenerated(String sourceURI, + String destURI, + int pageSize, + int linksInPage, + int newLinksInPage, + int pagesRemaining, + int pagesComplete, + long timeTaken) { Iterator i = listeners.iterator(); while (i.hasNext()) { BeanListener l = (BeanListener) i.next(); - l.pageGenerated(uri, "", 0, linksInPage, 0, pagesRemaining, 0, 0L); + l.pageGenerated(sourceURI, + destURI, + pageSize, + linksInPage, + newLinksInPage, + pagesRemaining, + pagesComplete, + timeTaken); } } @@ -259,6 +281,14 @@ } } + public void pageSkipped(String uri, String message) { + Iterator i = listeners.iterator(); + while (i.hasNext()) { + BeanListener l = (BeanListener) i.next(); + l.pageSkipped(uri, message); + 
} + } + public void dispose() { if (this.initialized) { if (this.sourceResolver != null) { @@ -371,6 +401,10 @@ int status = 0; int linkCount = 0; + int newLinkCount = 0; + int pageSize = 0; + + long startTimeMillis = System.currentTimeMillis(); if (confirmExtension) { if (null == allTranslatedLinks.get(target.getSourceURI())) { @@ -389,9 +423,8 @@ // Process links final HashMap translatedLinks = new HashMap(); - List gatheredLinks = new ArrayList(); final List targets = new ArrayList(); - if (followLinks && confirmExtension) { + if (followLinks && confirmExtension && isCrawlablePage(target)) { final Iterator i = this.getLinks(target.getDeparameterizedSourceURI(), target.getParameters()).iterator(); @@ -400,13 +433,12 @@ Target linkTarget = target.getDerivedTarget(linkURI); if (linkTarget == null) { - System.out.println("Skipping "+ linkURI); - //@TODO@ Log/report skipped link + pageSkipped(linkURI, "link does not share same root as parent"); continue; } if (!isIncluded(linkTarget.getSourceURI())) { - //@TODO@ Log/report skipped link + pageSkipped(linkTarget.getSourceURI(), "matched include/exclude rules"); continue; } @@ -436,6 +468,13 @@ // Process URI DelayedOutputStream output = new DelayedOutputStream(); try { + List gatheredLinks; + if (!confirmExtension && followLinks && isCrawlablePage(target)) { + gatheredLinks = new ArrayList(); + } else { + gatheredLinks = null; + } + status = getPage( target.getDeparameterizedSourceURI(), @@ -450,7 +489,7 @@ "Resource not found: " + status); } - if (followLinks && !confirmExtension) { + if (gatheredLinks != null) { for (Iterator it = gatheredLinks.iterator();it.hasNext();) { String linkURI = (String) it.next(); Target linkTarget = target.getDerivedTarget(linkURI); @@ -470,7 +509,6 @@ linkCount = gatheredLinks.size(); } - pageGenerated(target.getSourceURI(), linkCount, 0); // @todo@ get the number of pages remaining here } catch (ProcessingException pe) { output.close(); output = null; @@ -482,11 +520,21 @@ 
ModifiableSource source = getSource(target); try { + pageSize = output.size(); OutputStream stream = source.getOutputStream(); output.setFileOutputStream(stream); output.flush(); output.close(); + pageGenerated(target.getSourceURI(), + target.getAuthlessDestURI(), + pageSize, + linkCount, + newLinkCount, + 0, //pagesRemaining, @TODO@ Implement this + 0, //pagesComplete, @TODO@ Implement this + System.currentTimeMillis()- startTimeMillis); + } catch (IOException ioex) { log.warn(ioex.toString()); } finally { @@ -499,11 +547,6 @@ this.sendBrokenLinkWarning(target.getSourceURI(), "URI not found"); } -/* Commenting out timestamp - will reimplement properly using the BeanListener interface - double d = (System.currentTimeMillis()- startTimeMillis); - String time = " [" + (d/1000) + " seconds]"; - System.out.println(" "+ time); -*/ return targets; } @@ -520,7 +563,7 @@ //String brokenFile = NetUtils.decodePath(destinationURI); if (brokenLinkExtension != null) { - target.setExtension(brokenLinkExtension); + target.setExtraExtension(brokenLinkExtension); } SimpleNotifyingBean n = new SimpleNotifyingBean(this); n.setType("resource-not-found"); @@ -596,5 +639,13 @@ } } return included; + } + private boolean isCrawlablePage(Target target) { + if (includeLinkExtensions == null) { + return true; + } else { + String extension = target.getExtension(); + return includeLinkExtensions.contains(target.getExtension()); + } } } 1.6 +17 -2 cocoon-2.1/src/java/org/apache/cocoon/bean/Target.java Index: Target.java =================================================================== RCS file: /home/cvs/cocoon-2.1/src/java/org/apache/cocoon/bean/Target.java,v retrieving revision 1.5 retrieving revision 1.6 diff -u -r1.5 -r1.6 --- Target.java 15 Sep 2003 19:18:18 -0000 1.5 +++ Target.java 18 Sep 2003 12:11:50 -0000 1.6 @@ -186,7 +186,7 @@ * stand out, within the file structure of the generated site, by, for * example, adding '.error' to the end of the filename. 
*/ - public void setExtension(String extension) { + public void setExtraExtension(String extension) { this.extension = extension; this.finalDestinationURI = null; } @@ -230,6 +230,13 @@ return NetUtils.getPath(this.getSourceURI()); } + /** + * Gets the file extension for the source URI + */ + public String getExtension() { + return NetUtils.getExtension(this.getSourceURI()); + } + /** * Gets the parent URI (the URI of the page that contained * a link to this URI). null is returned if this page was @@ -325,6 +332,14 @@ return NetUtils.relativize(path, actualSourceURI); } + /** + * + * @return + */ + public String getAuthlessDestURI() throws ProcessingException { + return NetUtils.removeAuthorisation(this.getDestinationURI()); + } + /** * Gets the original URI used to create this Target. * This URI is completely unprocessed. 1.4 +10 -1 cocoon-2.1/src/java/org/apache/cocoon/bean/helpers/DelayedOutputStream.java Index: DelayedOutputStream.java =================================================================== RCS file: /home/cvs/cocoon-2.1/src/java/org/apache/cocoon/bean/helpers/DelayedOutputStream.java,v retrieving revision 1.3 retrieving revision 1.4 diff -u -r1.3 -r1.4 --- DelayedOutputStream.java 10 Jun 2003 11:17:25 -0000 1.3 +++ DelayedOutputStream.java 18 Sep 2003 12:11:50 -0000 1.4 @@ -250,4 +250,13 @@ throw new IOException("No outputstream available!"); } } + /** + * Gets the size of the content of the current output stream + */ + public int size() { + if (baos != null) { + return baos.size(); + } + return 0; + } } 1.2 +40 -10 cocoon-2.1/src/java/org/apache/cocoon/bean/helpers/OutputStreamListener.java Index: OutputStreamListener.java =================================================================== RCS file: /home/cvs/cocoon-2.1/src/java/org/apache/cocoon/bean/helpers/OutputStreamListener.java,v retrieving revision 1.1 retrieving revision 1.2 diff -u -r1.1 -r1.2 --- OutputStreamListener.java 15 Sep 2003 19:18:18 -0000 1.1 +++ OutputStreamListener.java 18 
Sep 2003 12:11:50 -0000 1.2 @@ -93,15 +93,28 @@ String destinationURI, int pageSize, int linksInPage, - int newLinksinPage, + int newLinksInPage, int pagesRemaining, - int pageComplete, + int pagesComplete, long timeTaken) { + double time = (((double)timeTaken)/1000); + + String size; + if (pageSize < 1024) { + size = pageSize + "b"; + } else { + size = ((float)((int)(pageSize/102.4)))/10 + "Kb"; + } + if (linksInPage == -1) { this.print("* " + sourceURI); } else { - this.print("* ["+linksInPage + "] "+sourceURI); - } + this.print(pad(13, "* ["+linksInPage + "/" + newLinksInPage + "] ") + + pad(7,time + "s ") + + pad(8, size) + + sourceURI); + } + } public void messageGenerated(String msg) { this.print(msg); @@ -112,7 +125,7 @@ } public void brokenLinkFound(String uri, String parentURI, String message, Throwable t) { - this.print("X [0] "+uri+"\tBROKEN: "+message); + this.print(pad(28,"X [0] ")+uri+"\tBROKEN: "+message); brokenLinks.add(uri + "\t" + message); // StringWriter sw = new StringWriter(); @@ -121,8 +134,8 @@ } - public void pageSkipped(String uri) { - // @TODO@ Do something + public void pageSkipped(String uri, String message) { + this.print("^ "+uri); } public void complete() { @@ -130,8 +143,7 @@ long duration = System.currentTimeMillis() - startTimeMillis; this.print("Total time: " + (duration / 60000) + " minutes " + (duration % 60000)/1000 + " seconds"); - writer.flush(); - writer.close(); + this.close(); } public boolean isSuccessful() { @@ -184,7 +196,25 @@ } } + private String pad(int chars, String str) { + int len = str.length(); + if (len < chars) { + StringBuffer sb = new StringBuffer(chars > len ? 
chars+1 : len+1); + sb.append(str); + for (int i=len; i<chars; i++) { + sb.append(" "); + } + return sb.toString(); + } + return str; + } + private void print(String message) { writer.println(message); + writer.flush(); + } + + private void close() { + writer.close(); } } 1.4 +10 -1 cocoon-2.1/src/java/org/apache/cocoon/util/NetUtils.java Index: NetUtils.java =================================================================== RCS file: /home/cvs/cocoon-2.1/src/java/org/apache/cocoon/util/NetUtils.java,v retrieving revision 1.3 retrieving revision 1.4 diff -u -r1.3 -r1.4 --- NetUtils.java 27 May 2003 08:46:58 -0000 1.3 +++ NetUtils.java 18 Sep 2003 12:11:50 -0000 1.4 @@ -447,4 +447,13 @@ return pars; } + /** + * Remove any authorisation details from a URI + */ + public static String removeAuthorisation(String uri) { + if (uri.indexOf("@")!=-1 && (uri.startsWith("ftp://") || uri.startsWith("http://"))) { + return uri.substring(0, uri.indexOf(":")+2)+uri.substring(uri.indexOf("@")+1); + } + return uri; + } }