upayavira    2003/09/18 05:11:50

  Modified:    src/java/org/apache/cocoon Main.java
               src/java/org/apache/cocoon/bean BeanListener.java
                        CocoonBean.java Target.java
               src/java/org/apache/cocoon/bean/helpers
                        DelayedOutputStream.java OutputStreamListener.java
               src/java/org/apache/cocoon/util NetUtils.java
  Log:
  Added removeAuthorisation() to NetUtils, which removes FTP and HTTP 
authentication details from a URI. This could be extended to more protocols.
  Added page skipping reporting (link preceded with ^)
  Added a facility to crawl only certain extensions (this didn't give the 
hoped-for speed improvement, but I might as well leave it there, as some might 
want to crawl HTML but not PDFs)
  Prevented link gathering when running in link view mode
  Added code to bean to report time taken and page size
  Improved layout of console reporting of CLI (now shows links found, time 
taken, page size and URI, all in nice columns)
  
  Revision  Changes    Path
  1.16      +13 -1     cocoon-2.1/src/java/org/apache/cocoon/Main.java
  
  Index: Main.java
  ===================================================================
  RCS file: /home/cvs/cocoon-2.1/src/java/org/apache/cocoon/Main.java,v
  retrieving revision 1.15
  retrieving revision 1.16
  diff -u -r1.15 -r1.16
  --- Main.java 17 Sep 2003 01:13:44 -0000      1.15
  +++ Main.java 18 Sep 2003 12:11:49 -0000      1.16
  @@ -162,6 +162,9 @@
       private static final String NODE_EXCLUDE = "exclude";
       private static final String ATTR_INCLUDE_EXCLUDE_PATTERN = "pattern";
       
  +    private static final String NODE_INCLUDE_LINKS = "include-links";
  +    private static final String ATTR_LINK_EXTENSION = "extension";
  +    
       private static final String NODE_URI = "uri";
       private static final String ATTR_URI_TYPE = "type";
       private static final String ATTR_URI_SOURCEPREFIX = "src-prefix";
  @@ -501,6 +504,9 @@
                           String pattern = 
Main.parseIncludeExcludeNode(cocoon, node, NODE_EXCLUDE);
                           cocoon.addExcludePattern(pattern);
   
  +                    } else if (nodeName.equals(NODE_INCLUDE_LINKS)) {
  +                        Main.parseIncludeLinksNode(cocoon, node);
  +
                       } else if (nodeName.equals(NODE_URI)) {
                           Main.parseURINode(cocoon, node, destDir);
   
  @@ -532,6 +538,12 @@
           NodeList nodes = node.getChildNodes();
           if (nodes.getLength()!=0) {
               throw new IllegalArgumentException("Unexpected children of <" + 
NODE_LOGGING + "> node");
  +        }
  +    }
  +
  +    private static void parseIncludeLinksNode(CocoonBean cocoon, Node node) 
throws IllegalArgumentException {
  +        if (Main.hasAttribute(node, ATTR_LINK_EXTENSION)) {
  +            cocoon.addIncludeLinkExtension(Main.getAttributeValue(node, 
ATTR_LINK_EXTENSION));
           }
       }
   
  
  
  
  1.3       +3 -3      
cocoon-2.1/src/java/org/apache/cocoon/bean/BeanListener.java
  
  Index: BeanListener.java
  ===================================================================
  RCS file: 
/home/cvs/cocoon-2.1/src/java/org/apache/cocoon/bean/BeanListener.java,v
  retrieving revision 1.2
  retrieving revision 1.3
  diff -u -r1.2 -r1.3
  --- BeanListener.java 15 Sep 2003 19:18:17 -0000      1.2
  +++ BeanListener.java 18 Sep 2003 12:11:50 -0000      1.3
  @@ -71,7 +71,7 @@
                                 int linksInPage, 
                                 int newLinksinPage, 
                                 int pagesRemaining, 
  -                              int pageComplete, 
  +                              int pagesComplete, 
                                 long timeTaken);
                                 
       /**
  @@ -79,7 +79,7 @@
        * include/exclude pattern.
        * @param msg            
        */
  -    public void pageSkipped(String uri);
  +    public void pageSkipped(String uri, String message);
       
       /**
        * Report a general message about operation of the bean
  
  
  
  1.27      +68 -17    
cocoon-2.1/src/java/org/apache/cocoon/bean/CocoonBean.java
  
  Index: CocoonBean.java
  ===================================================================
  RCS file: 
/home/cvs/cocoon-2.1/src/java/org/apache/cocoon/bean/CocoonBean.java,v
  retrieving revision 1.26
  retrieving revision 1.27
  diff -u -r1.26 -r1.27
  --- CocoonBean.java   15 Sep 2003 19:18:17 -0000      1.26
  +++ CocoonBean.java   18 Sep 2003 12:11:50 -0000      1.27
  @@ -100,7 +100,8 @@
       private String brokenLinkExtension = "";
       private List excludePatterns = new ArrayList();
       private List includePatterns = new ArrayList();
  -
  +    private List includeLinkExtensions = null;
  +    
       // Internal Objects
       private Map allProcessedLinks;
       private Map allTranslatedLinks;
  @@ -223,15 +224,36 @@
           includePatterns.add(preparedPattern);
       }
   
  +    public void addIncludeLinkExtension(String extension) {
  +        if (includeLinkExtensions == null) {
  +            includeLinkExtensions = new ArrayList();
  +        }
  +        includeLinkExtensions.add(extension);
  +    }
  +    
       public void addListener(BeanListener listener) {
           this.listeners.add(listener);
       }
   
  -    public void pageGenerated(String uri, int linksInPage, int 
pagesRemaining) {
  +    public void pageGenerated(String sourceURI, 
  +                              String destURI, 
  +                              int pageSize, 
  +                              int linksInPage, 
  +                              int newLinksInPage, 
  +                              int pagesRemaining, 
  +                              int pagesComplete, 
  +                              long timeTaken) {
           Iterator i = listeners.iterator();
           while (i.hasNext()) {
               BeanListener l = (BeanListener) i.next();
  -            l.pageGenerated(uri, "", 0, linksInPage, 0, pagesRemaining, 0, 
0L);
  +            l.pageGenerated(sourceURI, 
  +                            destURI, 
  +                            pageSize, 
  +                            linksInPage, 
  +                            newLinksInPage,
  +                            pagesRemaining,
  +                            pagesComplete,
  +                            timeTaken);
           }
       }
   
  @@ -259,6 +281,14 @@
           }
       }
   
  +    public void pageSkipped(String uri, String message) {
  +        Iterator i = listeners.iterator();
  +        while (i.hasNext()) {
  +            BeanListener l = (BeanListener) i.next();
  +            l.pageSkipped(uri, message);
  +        }
  +    }
  +
       public void dispose() {
           if (this.initialized) {
               if (this.sourceResolver != null) {
  @@ -371,6 +401,10 @@
           int status = 0;
           
           int linkCount = 0;
  +        int newLinkCount = 0;
  +        int pageSize = 0;
  +        
  +        long startTimeMillis = System.currentTimeMillis();
   
           if (confirmExtension) {
               if (null == allTranslatedLinks.get(target.getSourceURI())) {
  @@ -389,9 +423,8 @@
   
           // Process links
           final HashMap translatedLinks = new HashMap();
  -        List gatheredLinks = new ArrayList();
           final List targets = new ArrayList();
  -        if (followLinks && confirmExtension) {
  +        if (followLinks && confirmExtension && isCrawlablePage(target)) {
               final Iterator i =
                   this.getLinks(target.getDeparameterizedSourceURI(), 
target.getParameters()).iterator();
   
  @@ -400,13 +433,12 @@
                   Target linkTarget = target.getDerivedTarget(linkURI);
   
                   if (linkTarget == null) {
  -                    System.out.println("Skipping "+ linkURI);
  -                    //@TODO@ Log/report skipped link
  +                    pageSkipped(linkURI, "link does not share same root as 
parent");
                       continue;
                   }
   
                   if (!isIncluded(linkTarget.getSourceURI())) {
  -                    //@TODO@ Log/report skipped link
  +                    pageSkipped(linkTarget.getSourceURI(), "matched 
include/exclude rules");
                       continue;
                   }
   
  @@ -436,6 +468,13 @@
               // Process URI
               DelayedOutputStream output = new DelayedOutputStream();
               try {
  +                List gatheredLinks;
  +                if (!confirmExtension && followLinks && 
isCrawlablePage(target)) {
  +                    gatheredLinks = new ArrayList();
  +                } else {
  +                    gatheredLinks = null;
  +                }
  +        
                   status =
                       getPage(
                           target.getDeparameterizedSourceURI(),
  @@ -450,7 +489,7 @@
                           "Resource not found: " + status);
                   }
   
  -                if (followLinks && !confirmExtension) {
  +                if (gatheredLinks != null) {
                       for (Iterator it = 
gatheredLinks.iterator();it.hasNext();) {
                           String linkURI = (String) it.next();
                           Target linkTarget = target.getDerivedTarget(linkURI);
  @@ -470,7 +509,6 @@
                       linkCount = gatheredLinks.size();
                   }
   
  -                pageGenerated(target.getSourceURI(), linkCount, 0); // 
@todo@ get the number of pages remaining here
               } catch (ProcessingException pe) {
                   output.close();
                   output = null;
  @@ -482,11 +520,21 @@
   
                       ModifiableSource source = getSource(target);
                       try {
  +                        pageSize = output.size();
                           OutputStream stream = source.getOutputStream();
   
                           output.setFileOutputStream(stream);
                           output.flush();
                           output.close();
  +                        pageGenerated(target.getSourceURI(), 
  +                                      target.getAuthlessDestURI(), 
  +                                      pageSize,
  +                                      linkCount,
  +                                      newLinkCount,
  +                                      0, //pagesRemaining,  @TODO@ Implement 
this
  +                                      0, //pagesComplete,   @TODO@ Implement 
this
  +                                      System.currentTimeMillis()- 
startTimeMillis);
  +
                       } catch (IOException ioex) {
                           log.warn(ioex.toString());
                       } finally {
  @@ -499,11 +547,6 @@
               this.sendBrokenLinkWarning(target.getSourceURI(), "URI not 
found");
           }
   
  -/*  Commenting out timestamp - will reimplement properly using the 
BeanListener interface
  -        double d = (System.currentTimeMillis()- startTimeMillis);
  -        String time = " [" + (d/1000) + " seconds]";
  -        System.out.println("        "+ time);
  -*/
           return targets;
       }
   
  @@ -520,7 +563,7 @@
               //String brokenFile = NetUtils.decodePath(destinationURI);
               
               if (brokenLinkExtension != null) {
  -                target.setExtension(brokenLinkExtension);
  +                target.setExtraExtension(brokenLinkExtension);
               }
               SimpleNotifyingBean n = new SimpleNotifyingBean(this);
               n.setType("resource-not-found");
  @@ -596,5 +639,13 @@
               }
           }
           return included;
  +    }
  +    private boolean isCrawlablePage(Target target) {
  +        if (includeLinkExtensions == null) {
  +            return true;
  +        } else {
  +            String extension = target.getExtension();
  +            return includeLinkExtensions.contains(target.getExtension());
  +        }
       }
   }
  
  
  
  1.6       +17 -2     cocoon-2.1/src/java/org/apache/cocoon/bean/Target.java
  
  Index: Target.java
  ===================================================================
  RCS file: /home/cvs/cocoon-2.1/src/java/org/apache/cocoon/bean/Target.java,v
  retrieving revision 1.5
  retrieving revision 1.6
  diff -u -r1.5 -r1.6
  --- Target.java       15 Sep 2003 19:18:18 -0000      1.5
  +++ Target.java       18 Sep 2003 12:11:50 -0000      1.6
  @@ -186,7 +186,7 @@
        * stand out, within the file structure of the generated site, by, for
        * example, adding '.error' to the end of the filename.
        */
  -    public void setExtension(String extension) {
  +    public void setExtraExtension(String extension) {
           this.extension = extension;
           this.finalDestinationURI = null;
       }
  @@ -230,6 +230,13 @@
           return NetUtils.getPath(this.getSourceURI());
       }
   
  +    /**
  +     * Gets the file extension for the source URI
  +     */
  +    public String getExtension() {
  +        return NetUtils.getExtension(this.getSourceURI());
  +    }
  +    
       /** 
        * Gets the parent URI (the URI of the page that contained
        * a link to this URI). null is returned if this page was
  @@ -325,6 +332,14 @@
           return NetUtils.relativize(path, actualSourceURI);
       }
   
  +    /**
  +     * 
  +     * @return
  +     */
  +    public String getAuthlessDestURI() throws ProcessingException {
  +        return NetUtils.removeAuthorisation(this.getDestinationURI());
  +    }
  +    
       /**
        * Gets the original URI used to create this Target.
        * This URI is completely unprocessed.
  
  
  
  1.4       +10 -1     
cocoon-2.1/src/java/org/apache/cocoon/bean/helpers/DelayedOutputStream.java
  
  Index: DelayedOutputStream.java
  ===================================================================
  RCS file: 
/home/cvs/cocoon-2.1/src/java/org/apache/cocoon/bean/helpers/DelayedOutputStream.java,v
  retrieving revision 1.3
  retrieving revision 1.4
  diff -u -r1.3 -r1.4
  --- DelayedOutputStream.java  10 Jun 2003 11:17:25 -0000      1.3
  +++ DelayedOutputStream.java  18 Sep 2003 12:11:50 -0000      1.4
  @@ -250,4 +250,13 @@
               throw new IOException("No outputstream available!");
           }
       }
  +    /**
  +     * Gets the size of the content of the current output stream
  +     */
  +    public int size() {
  +        if (baos != null) {
  +            return baos.size();
  +        }
  +        return 0;
  +    }
   }
  
  
  
  1.2       +40 -10    
cocoon-2.1/src/java/org/apache/cocoon/bean/helpers/OutputStreamListener.java
  
  Index: OutputStreamListener.java
  ===================================================================
  RCS file: 
/home/cvs/cocoon-2.1/src/java/org/apache/cocoon/bean/helpers/OutputStreamListener.java,v
  retrieving revision 1.1
  retrieving revision 1.2
  diff -u -r1.1 -r1.2
  --- OutputStreamListener.java 15 Sep 2003 19:18:18 -0000      1.1
  +++ OutputStreamListener.java 18 Sep 2003 12:11:50 -0000      1.2
  @@ -93,15 +93,28 @@
                                 String destinationURI, 
                                 int pageSize,
                                 int linksInPage, 
  -                              int newLinksinPage, 
  +                              int newLinksInPage, 
                                 int pagesRemaining, 
  -                              int pageComplete, 
  +                              int pagesComplete, 
                                 long timeTaken) {
  +        double time = (((double)timeTaken)/1000);
  +        
  +        String size;
  +        if (pageSize < 1024) {
  +            size = pageSize + "b";
  +        } else {
  +            size = ((float)((int)(pageSize/102.4)))/10 + "Kb";
  +        }
  +        
           if (linksInPage == -1) {
               this.print("* " + sourceURI);
           } else {
  -            this.print("* ["+linksInPage + "] "+sourceURI);
  -        }
  +            this.print(pad(13, "* ["+linksInPage + "/" + newLinksInPage + "] 
") +
  +                       pad(7,time + "s ") +
  +                       pad(8, size) +
  +                       sourceURI);
  +        }     
  +           
       }
       public void messageGenerated(String msg) {
           this.print(msg);
  @@ -112,7 +125,7 @@
       }
   
       public void brokenLinkFound(String uri, String parentURI, String 
message, Throwable t) {
  -        this.print("X [0] "+uri+"\tBROKEN: "+message);
  +        this.print(pad(28,"X [0] ")+uri+"\tBROKEN: "+message);
           brokenLinks.add(uri + "\t" + message);
           
   //            StringWriter sw = new StringWriter();
  @@ -121,8 +134,8 @@
   
       }
   
  -    public void pageSkipped(String uri) {
  -        // @TODO@ Do something
  +    public void pageSkipped(String uri, String message) {
  +        this.print("^ "+uri);
       }
       
       public void complete() {
  @@ -130,8 +143,7 @@
   
           long duration = System.currentTimeMillis() - startTimeMillis;
           this.print("Total time: " + (duration / 60000) + " minutes " + 
(duration % 60000)/1000 + " seconds");
  -        writer.flush();
  -        writer.close();
  +        this.close();
       }
   
       public boolean isSuccessful() {
  @@ -184,7 +196,25 @@
           }
       }
   
  +    private String pad(int chars, String str) {
  +        int len = str.length();
  +        if (len < chars) {
  +            StringBuffer sb = new StringBuffer(chars > len ? chars+1 : 
len+1);
  +            sb.append(str);
  +            for (int i=len; i<chars; i++) {
  +                sb.append(" ");
  +            }
  +            return sb.toString();
  +        }
  +        return str;
  +    }
  +    
       private void print(String message) {
           writer.println(message);
  +        writer.flush();
  +    }
  +    
  +    private void close() {
  +        writer.close();
       }
   }
  
  
  
  1.4       +10 -1     cocoon-2.1/src/java/org/apache/cocoon/util/NetUtils.java
  
  Index: NetUtils.java
  ===================================================================
  RCS file: /home/cvs/cocoon-2.1/src/java/org/apache/cocoon/util/NetUtils.java,v
  retrieving revision 1.3
  retrieving revision 1.4
  diff -u -r1.3 -r1.4
  --- NetUtils.java     27 May 2003 08:46:58 -0000      1.3
  +++ NetUtils.java     18 Sep 2003 12:11:50 -0000      1.4
  @@ -447,4 +447,13 @@
           return pars;
       }
   
  +    /**
  +     * Remove any authorisation details from a URI
  +     */
  +    public static String removeAuthorisation(String uri) {
  +        if (uri.indexOf("@")!=-1 && (uri.startsWith("ftp://") || 
uri.startsWith("http://"))) {
  +            return uri.substring(0, 
uri.indexOf(":")+2)+uri.substring(uri.indexOf("@")+1);
  +        } 
  +        return uri;
  +    }
   }
  
  
  

Reply via email to