nicolaken 02/03/26 00:10:58 Modified: . changes.xml Added: src/scratchpad/webapp/mount/linkstatus linkstatus.xsl sitemap.xmap src/scratchpad/src/org/apache/cocoon/generation LinkStatusGenerator.java Log: Added LinkStatusGenerator donated by Michael Homeijer and accompanying sample sitemap to scratchpad. Revision Changes Path 1.130 +5 -1 xml-cocoon2/changes.xml Index: changes.xml =================================================================== RCS file: /home/cvs/xml-cocoon2/changes.xml,v retrieving revision 1.129 retrieving revision 1.130 diff -u -r1.129 -r1.130 --- changes.xml 26 Mar 2002 08:09:01 -0000 1.129 +++ changes.xml 26 Mar 2002 08:10:58 -0000 1.130 @@ -4,7 +4,7 @@ <!-- History of Cocoon changes - $Id: changes.xml,v 1.129 2002/03/26 08:09:01 nicolaken Exp $ + $Id: changes.xml,v 1.130 2002/03/26 08:10:58 nicolaken Exp $ --> <changes title="History of Changes"> @@ -35,6 +35,10 @@ </devs> <release version="@version@" date="@date@"> + <action dev="NKB" type="add"> + Added LinkStatusGenerator donated by Michael Homeijer and accompanying sample + sitemap to scratchpad. + </action> <action dev="NKB" type="update"> Moved castor scratchpad sample from /samples to /mount as other samples. 
Now the refactored sample page points to the mount dir, thus 1.1 xml-cocoon2/src/scratchpad/webapp/mount/linkstatus/linkstatus.xsl Index: linkstatus.xsl =================================================================== <?xml version="1.0"?> <xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform" xmlns:linkstatus="http://apache.org/cocoon/linkstatus/2.0"> <!-- Formats the linkstatus XML emitted by LinkStatusGenerator: one text line per crawled link (href, referrer, content-type, status, message) --> <xsl:template match="linkstatus:linkstatus"> <xsl:apply-templates/> </xsl:template> <xsl:template match="linkstatus:link"> HREF: <xsl:value-of select="@href"/>, REFERRER: <xsl:value-of select="@referrer"/>, CONTENT-TYPE: <xsl:value-of select="@content"/>, STATUS: <xsl:value-of select="@status"/>, MESSAGE: <xsl:value-of select="@message"/><BR/> </xsl:template> </xsl:stylesheet> 1.1 xml-cocoon2/src/scratchpad/webapp/mount/linkstatus/sitemap.xmap Index: sitemap.xmap =================================================================== <?xml version="1.0"?> <map:sitemap xmlns:map="http://apache.org/cocoon/sitemap/1.0"> <!-- Sample sitemap: declares the linkstatus generator and mounts it at the "linkstatus" URI --> <!-- =========================== Components ================================ --> <map:components> <map:generators default="file"> <map:generator name="linkstatus" logger="sitemap.generator.linkstatus" label="content,data" src="org.apache.cocoon.generation.LinkStatusGenerator"/> </map:generators> <map:transformers default="xslt"/> <map:readers default="resource"/> <map:serializers default="html"/> <map:selectors default="browser"/> <map:matchers default="wildcard"> <map:matcher name="wildcard" src="org.apache.cocoon.matching.WildcardURIMatcherFactory"/> </map:matchers> </map:components> <map:views> <map:view name="links" from-position="last"> <map:serialize type="links"/> </map:view> </map:views> <!-- =========================== Pipelines ================================= --> <map:pipelines> <map:pipeline> <map:match pattern=""> <map:redirect-to uri="linkstatus"/> </map:match> <map:match pattern="linkstatus"> <map:generate type="linkstatus"
src="http://localhost:8080/cocoon/welcome"/> <map:transform src="linkstatus.xsl"/> <map:serialize/> </map:match> </map:pipeline> </map:pipelines> </map:sitemap> <!-- end of file --> 1.1 xml-cocoon2/src/scratchpad/src/org/apache/cocoon/generation/LinkStatusGenerator.java Index: LinkStatusGenerator.java ===================================================================

package org.apache.cocoon.generation;

import org.apache.avalon.excalibur.pool.Recyclable;
import org.apache.avalon.framework.parameters.Parameters;
import org.apache.avalon.framework.configuration.Configurable;
import org.apache.avalon.framework.configuration.Configuration;
import org.apache.avalon.framework.configuration.ConfigurationException;

import org.apache.cocoon.ProcessingException;
import org.apache.cocoon.ResourceNotFoundException;
import org.apache.cocoon.environment.SourceResolver;
import org.apache.cocoon.Constants;
import org.apache.cocoon.util.Tokenizer;

import org.apache.regexp.RE;
import org.apache.regexp.RESyntaxException;

import org.apache.log.Logger;

import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

/**
 * Generates a list of links that are reachable from the src and their status.
 * <p>
 * Starting from the URL given as <code>src</code>, the generator requests the
 * Cocoon link view of each page (query string <code>cocoon-view=links</code>),
 * follows every link returned, and emits one <code>&lt;link&gt;</code> element
 * per visited URL carrying href, referrer, content-type, HTTP status and
 * message attributes.
 * </p>
 *
 * @author Michael Homeijer
 */
public class LinkStatusGenerator extends ComposerGenerator
        implements Recyclable, Configurable {

    /** The URI of the namespace of this generator. */
    protected static final String URI =
        "http://apache.org/cocoon/linkstatus/2.0";

    /** The namespace prefix for this namespace. */
    protected static final String PREFIX = "linkstatus";

    /* Node and attribute names used in the generated document. */
    protected static final String TOP_NODE_NAME = "linkstatus";
    protected static final String LINK_NODE_NAME = "link";
    protected static final String HREF_ATTR_NAME = "href";
    protected static final String REFERRER_ATTR_NAME = "referrer";
    protected static final String CONTENT_ATTR_NAME = "content";
    protected static final String STATUS_ATTR_NAME = "status";
    protected static final String MESSAGE_ATTR_NAME = "message";

    /** Reusable SAX attribute container; cleared before each element. */
    protected AttributesImpl attributes = new AttributesImpl();

    /**
     * Config element name specifying expected link content-type.
     * <p>Its value is <code>link-content-type</code>.</p>
     *
     * @since
     */
    public final static String LINK_CONTENT_TYPE_CONFIG = "link-content-type";

    /**
     * Default value of <code>link-content-type</code> configuration value.
     * <p>Its value is <code>application/x-cocoon-links</code>.</p>
     * <p>Now declared <code>static</code> like the other constants; it was an
     * instance field by mistake (source-compatible change).</p>
     *
     * @since
     */
    public final static String LINK_CONTENT_TYPE_DEFAULT =
        "application/x-cocoon-links";

    /**
     * Config element name specifying the query-string appended for requesting
     * the links of an URL.
     * <p>Its value is <code>link-view-query</code>.</p>
     *
     * @since
     */
    public final static String LINK_VIEW_QUERY_CONFIG = "link-view-query";

    /**
     * Default value of <code>link-view-query</code> configuration value.
     * <p>Its value is <code>cocoon-view=links</code>.</p>
     *
     * @since
     */
    public final static String LINK_VIEW_QUERY_DEFAULT = "cocoon-view=links";

    /**
     * Config element name specifying excluding regular expression pattern.
     * <p>Its value is <code>exclude</code>.</p>
     *
     * @since
     */
    public final static String EXCLUDE_CONFIG = "exclude";

    /**
     * Config element name specifying including regular expression pattern.
     * <p>Its value is <code>include</code>.</p>
     *
     * @since
     */
    public final static String INCLUDE_CONFIG = "include";

    /**
     * Config element name specifying http header value for user-Agent.
     * <p>Its value is <code>user-agent</code>.</p>
     *
     * @since
     */
    public final static String USER_AGENT_CONFIG = "user-agent";

    /**
     * Default value of <code>user-agent</code> configuration value.
     * <p>Its value is {@link org.apache.cocoon.Constants#COMPLETE_NAME}.</p>
     *
     * @since
     */
    public final static String USER_AGENT_DEFAULT = Constants.COMPLETE_NAME;

    /**
     * Config element name specifying http header value for accept.
     * <p>Its value is <code>accept</code>.</p>
     *
     * @since
     */
    public final static String ACCEPT_CONFIG = "accept";

    /**
     * Default value of <code>accept</code> configuration value.
     * <p>Its value is <code>*&#47;*</code>.</p>
     *
     * @since
     */
    public final static String ACCEPT_DEFAULT = "*/*";

    /* Effective settings, initialised to the defaults above. */
    private String linkViewQuery = LINK_VIEW_QUERY_DEFAULT;
    private String linkContentType = LINK_CONTENT_TYPE_DEFAULT;
    private HashSet excludeCrawlingURL;
    private HashSet includeCrawlingURL;
    // NOTE(review): userAgent and accept are configured but never placed on
    // any URLConnection request header in this class — confirm intent.
    private String userAgent = USER_AGENT_DEFAULT;
    private String accept = ACCEPT_DEFAULT;

    /** URLs (as Strings) already visited during the current generate() run. */
    private HashSet crawled;
    /** Links still waiting to be visited during the current generate() run. */
    private HashSet linksToProcess;

    /**
     * Stores a link to process together with the URL of its referrer.
     * <p>
     * Declared <code>static</code> (it never touches the enclosing instance)
     * and now properly overrides {@link Object#equals(Object)} and
     * {@link Object#hashCode()}.  The original declared
     * <code>equals(Link)</code>, which <i>overloads</i> rather than overrides,
     * so every <code>HashSet</code> membership test fell back to identity
     * comparison and duplicate links were never detected.  Equality is based
     * on the external form of the URL — consistent with the string keys kept
     * in <code>crawled</code>, and avoiding the blocking host resolution that
     * {@link URL#equals(Object)} performs.
     * </p>
     */
    private static class Link {
        private URL url;
        private String referrer;

        public Link(URL url, String referrer) {
            this.url = url;
            this.referrer = referrer;
        }

        public URL getURL() {
            return url;
        }

        public String getReferrer() {
            return referrer;
        }

        public boolean equals(Object o) {
            if (this == o) {
                return true;
            }
            if (!(o instanceof Link)) {
                return false;
            }
            return url.toExternalForm().equals(
                ((Link) o).url.toExternalForm());
        }

        public int hashCode() {
            return url.toExternalForm().hashCode();
        }
    }

    /**
     * Configure the crawler component.
     * <p>
     * Configure can specify which URI to include, and which URI to exclude
     * from crawling. You specify the patterns as regular expressions.
     * </p>
     * <p>
     * Moreover you can configure the required content-type of crawling
     * requests, and the query-string appended to each crawling request.
     * </p>
     * <pre><tt>
     * &lt;include&gt;.*\.html?&lt;/include&gt; or &lt;include&gt;.*\.html?, .*\.xsp&lt;/include&gt;
     * &lt;exclude&gt;.*\.gif&lt;/exclude&gt; or &lt;exclude&gt;.*\.gif, .*\.jpe?g&lt;/exclude&gt;
     * &lt;link-content-type&gt; application/x-cocoon-links &lt;/link-content-type&gt;
     * &lt;link-view-query&gt; ?cocoon-view=links &lt;/link-view-query&gt;
     * </tt></pre>
     *
     * @param configuration XML configuration of this avalon component.
     * @exception ConfigurationException is thrown if configuration is invalid.
     * @since
     */
    public void configure(Configuration configuration)
      throws ConfigurationException {

        Configuration[] children;

        children = configuration.getChildren(INCLUDE_CONFIG);
        if (children != null && children.length > 0) {
            includeCrawlingURL = new HashSet();
            addPatterns(children, includeCrawlingURL, "including");
        }

        children = configuration.getChildren(EXCLUDE_CONFIG);
        if (children != null && children.length > 0) {
            excludeCrawlingURL = new HashSet();
            addPatterns(children, excludeCrawlingURL, "excluding");
        } else {
            excludeCrawlingURL = new HashSet();
            setDefaultExcludeFromCrawling();
        }

        Configuration child;
        String value;

        child = configuration.getChild(LINK_CONTENT_TYPE_CONFIG, false);
        if (child != null) {
            value = child.getValue();
            if (value != null && value.length() > 0) {
                this.linkContentType = value.trim();
            }
        }

        child = configuration.getChild(LINK_VIEW_QUERY_CONFIG, false);
        if (child != null) {
            value = child.getValue();
            if (value != null && value.length() > 0) {
                this.linkViewQuery = value.trim();
            }
        }

        child = configuration.getChild(USER_AGENT_CONFIG, false);
        if (child != null) {
            value = child.getValue();
            if (value != null && value.length() > 0) {
                this.userAgent = value;
            }
        }

        child = configuration.getChild(ACCEPT_CONFIG, false);
        if (child != null) {
            value = child.getValue();
            if (value != null && value.length() > 0) {
                this.accept = value;
            }
        }
    }

    /**
     * Parses comma/space separated regular expressions from the given config
     * elements into the target set.  A pattern that fails to compile is
     * logged and skipped, matching the original per-element error handling.
     *
     * @param children config elements whose values hold the pattern lists
     * @param target   set receiving the compiled {@link RE} objects
     * @param kind     "including" or "excluding", used in the error message
     */
    private void addPatterns(Configuration[] children, HashSet target,
                             String kind) {
        for (int i = 0; i < children.length; i++) {
            String pattern = children[i].getValue();
            try {
                Tokenizer t = new Tokenizer(pattern, ", ");
                while (t.hasMoreTokens()) {
                    String tokenized_pattern = t.nextToken();
                    target.add(new RE(tokenized_pattern));
                }
            } catch (RESyntaxException rese) {
                getLogger().error("Cannot create " + kind
                    + " regular-expression for " + pattern, rese);
            }
        }
    }

    /**
     * Set up the generator for one request.
     * <p>
     * NOTE(review): this re-creates <code>excludeCrawlingURL</code> with the
     * built-in defaults on every request, silently discarding any
     * <code>&lt;exclude&gt;</code> patterns supplied through
     * {@link #configure(Configuration)}.  Kept as-is to preserve behaviour,
     * but it looks like a bug — confirm before relying on configured excludes.
     * </p>
     */
    public void setup(SourceResolver resolver, Map objectModel, String src,
                      Parameters par)
      throws ProcessingException, SAXException, IOException {
        super.setup(resolver, objectModel, src, par);

        /* Create a reusable attributes for creating nodes */
        this.attributes = new AttributesImpl();

        excludeCrawlingURL = new HashSet();
        this.setDefaultExcludeFromCrawling();
    }

    /**
     * Generate XML data: crawl outward from <code>source</code> and emit one
     * <code>&lt;link&gt;</code> element per URL visited.
     *
     * @throws SAXException        if an error occurs while outputting the
     *                             document
     * @throws ProcessingException if the requested URI wasn't found
     */
    public void generate() throws SAXException, ProcessingException {
        try {
            crawled = new HashSet();
            linksToProcess = new HashSet();

            URL root = new URL(source);
            linksToProcess.add(new Link(root, ""));

            if (getLogger().isDebugEnabled()) {
                getLogger().debug("crawl URL " + root);
            }

            this.contentHandler.startDocument();
            this.contentHandler.startPrefixMapping(PREFIX, URI);

            attributes.clear();
            // qName must use the declared prefix; the original built it from
            // the namespace URI ("http://...:linkstatus"), which is not a
            // legal qualified name.
            super.contentHandler.startElement(URI, TOP_NODE_NAME,
                PREFIX + ':' + TOP_NODE_NAME, attributes);

            while (linksToProcess.size() > 0) {
                Iterator i = linksToProcess.iterator();
                if (i.hasNext()) {
                    // fetch a URL
                    Link link = (Link) i.next();
                    URL url = link.getURL();

                    // remove it from the to-do list
                    linksToProcess.remove(link);

                    URLConnection conn = processURL(url, link.getReferrer());

                    // add all links found on this url to the to-do list
                    if (conn != null) {
                        List url_links = getLinksFromConnection(conn, url);
                        if (url_links != null) {
                            linksToProcess.addAll(url_links);
                        }
                    }
                }
            }

            super.contentHandler.endElement(URI, TOP_NODE_NAME,
                PREFIX + ':' + TOP_NODE_NAME);
            this.contentHandler.endPrefixMapping(PREFIX);
            this.contentHandler.endDocument();
        } catch (IOException ioe) {
            getLogger().warn("Could not read source ", ioe);
            throw new ResourceNotFoundException("Could not read source ", ioe);
        }
    }

    /**
     * Install the default exclude patterns.
     * <p>
     * By default URLs matching following patterns are excluded:
     * </p>
     * <ul>
     * <li>.*\\.gif(\\?.*)?$ - exclude gif images</li>
     * <li>.*\\.png(\\?.*)?$ - exclude png images</li>
     * <li>.*\\.jpe?g(\\?.*)?$ - exclude jpeg images</li>
     * <li>.*\\.js(\\?.*)?$ - exclude javascript</li>
     * <li>.*\\.css(\\?.*)?$ - exclude cascaded stylesheets</li>
     * </ul>
     *
     * @since
     */
    private void setDefaultExcludeFromCrawling() {
        // Hoisted out of the method body so the array isn't rebuilt per call.
        final String[] EXCLUDE_FROM_CRAWLING_DEFAULT = {
            ".*\\.gif(\\?.*)?$",
            ".*\\.png(\\?.*)?$",
            ".*\\.jpe?g(\\?.*)?$",
            ".*\\.js(\\?.*)?$",
            ".*\\.css(\\?.*)?$"
        };

        for (int i = 0; i < EXCLUDE_FROM_CRAWLING_DEFAULT.length; i++) {
            String pattern = EXCLUDE_FROM_CRAWLING_DEFAULT[i];
            try {
                excludeCrawlingURL.add(new RE(pattern));
            } catch (RESyntaxException rese) {
                getLogger().error("Cannot create excluding regular-expression for "
                    + pattern, rese);
            }
        }
    }

    /**
     * Extract the links from a connection whose content-type matches
     * <code>linkContentType</code> (the Cocoon link view: one link per line,
     * relative to the current URL).
     *
     * @param conn open connection to read the link list from
     * @param url  the URL the links are relative to; also used as referrer
     * @return list of new {@link Link}s to crawl, or <code>null</code> if the
     *         content-type did not match or reading failed
     */
    protected List getLinksFromConnection(URLConnection conn, URL url) {
        List url_links = null;
        try {
            String content_type = conn.getContentType();
            if (getLogger().isDebugEnabled()) {
                getLogger().debug("Content-type: " + content_type);
            }
            // getContentType() may return null (e.g. no response header); the
            // original dereferenced it unconditionally and could NPE here.
            if (content_type != null && content_type.equals(linkContentType)) {
                url_links = new ArrayList();

                BufferedReader br = new BufferedReader(
                    new InputStreamReader(conn.getInputStream()));
                try {
                    // content is supposed to be a list of links,
                    // relative to current URL
                    String line;
                    String referrer = url.toString();

                    while ((line = br.readLine()) != null) {
                        URL new_url = new URL(url, line);
                        Link new_link = new Link(new_url, referrer);
                        boolean add_url = true;

                        // don't add the same link twice.  (The original
                        // tested url_links.contains(new_url) — a URL against
                        // a list of Links — which could never match.)
                        if (add_url) {
                            add_url &= !url_links.contains(new_link);
                        }
                        // don't add new_url if it has been crawled already
                        if (add_url) {
                            add_url &= !crawled.contains(new_url.toString());
                        }
                        // don't add it if it is already queued
                        if (add_url) {
                            add_url &= !linksToProcess.contains(new_link);
                        }
                        // don't add if not matched by an include definition
                        if (add_url) {
                            add_url &= isIncludedURL(new_url.toString());
                        }
                        if (add_url) {
                            if (getLogger().isDebugEnabled()) {
                                getLogger().debug("Add URL: "
                                    + new_url.toString());
                            }
                            url_links.add(new_link);
                        }
                    }
                    // now we have a list of URLs which should be examined
                } finally {
                    // the original leaked the stream on every call
                    br.close();
                }
            }
        } catch (IOException ioe) {
            getLogger().warn("Problems get links of " + url, ioe);
        }
        return url_links;
    }

    /**
     * Emit one <code>&lt;link&gt;</code> element describing the status of the
     * given URL, and open a link-view connection for traversable URLs.
     *
     * @param url      the URL to check
     * @param referrer URL of the page this link was found on
     * @return an open connection to the link view of <code>url</code> when the
     *         URL should be traversed further, otherwise <code>null</code>
     * @throws SAXException if emitting the element fails
     */
    protected URLConnection processURL(URL url, String referrer)
      throws SAXException {

        if (getLogger().isDebugEnabled()) {
            getLogger().debug("getLinks URL " + url);
        }

        URLConnection result = null;

        // don't investigate a url which has been crawled already
        if (crawled.contains(url.toString())) {
            return null;
        }
        // mark it as crawled
        crawled.add(url.toString());

        attributes.clear();
        attributes.addAttribute("", HREF_ATTR_NAME, HREF_ATTR_NAME,
            "CDATA", url.toString());
        attributes.addAttribute("", REFERRER_ATTR_NAME, REFERRER_ATTR_NAME,
            "CDATA", referrer);

        if (isExcludedURL(url.toString())) {
            // Excluded from crawling: only check the status and report it.
            try {
                URLConnection links_url_connection = url.openConnection();
                String content_type = links_url_connection.getContentType();
                attributes.addAttribute("", CONTENT_ATTR_NAME,
                    CONTENT_ATTR_NAME, "CDATA", content_type);
                // guard the cast — non-http URLs (file:, ftp:, ...) caused a
                // ClassCastException in the original
                if (links_url_connection instanceof HttpURLConnection) {
                    HttpURLConnection h =
                        (HttpURLConnection) links_url_connection;
                    attributes.addAttribute("", MESSAGE_ATTR_NAME,
                        MESSAGE_ATTR_NAME, "CDATA", h.getResponseMessage());
                    attributes.addAttribute("", STATUS_ATTR_NAME,
                        STATUS_ATTR_NAME, "CDATA",
                        String.valueOf(h.getResponseCode()));
                }
            } catch (IOException ioe) {
                attributes.addAttribute("", MESSAGE_ATTR_NAME,
                    MESSAGE_ATTR_NAME, "CDATA", ioe.getMessage());
            }
        } else {
            // Traversable: request the link view and report url, referrer,
            // content-type, status and message.
            try {
                // NOTE(review): getPath() never contains a query string, so
                // the "?" test below always picks "?" and any query on the
                // original URL is dropped — kept as-is, confirm intent.
                URL links_url = new URL(url,
                    url.getPath()
                    + ((url.getPath().indexOf("?") == -1) ? "?" : "&")
                    + linkViewQuery);
                URLConnection links_url_connection = links_url.openConnection();
                result = links_url_connection;
                attributes.addAttribute("", CONTENT_ATTR_NAME,
                    CONTENT_ATTR_NAME, "CDATA",
                    links_url_connection.getContentType());
                if (links_url_connection instanceof HttpURLConnection) {
                    HttpURLConnection h =
                        (HttpURLConnection) links_url_connection;
                    attributes.addAttribute("", MESSAGE_ATTR_NAME,
                        MESSAGE_ATTR_NAME, "CDATA", h.getResponseMessage());
                    attributes.addAttribute("", STATUS_ATTR_NAME,
                        STATUS_ATTR_NAME, "CDATA",
                        String.valueOf(h.getResponseCode()));
                }
            } catch (IOException ioe) {
                // Output url referrer status message
                attributes.addAttribute("", MESSAGE_ATTR_NAME,
                    MESSAGE_ATTR_NAME, "CDATA", ioe.getMessage());
            }
        }

        // qName uses the prefix, not the namespace URI (see generate()).
        super.contentHandler.startElement(URI, LINK_NODE_NAME,
            PREFIX + ':' + LINK_NODE_NAME, attributes);
        super.contentHandler.endElement(URI, LINK_NODE_NAME,
            PREFIX + ':' + LINK_NODE_NAME);

        return result;
    }

    /**
     * Check if an URL is excluded from crawling.
     *
     * @param url the URL to test against the exclude patterns
     * @return true when the URL matches an exclude pattern; false when it
     *         matches none, or when no exclude set is present at all
     * @since
     */
    private boolean isExcludedURL(String url) {
        // by default include URL for crawling
        if (excludeCrawlingURL == null) {
            if (getLogger().isDebugEnabled()) {
                getLogger().debug("exclude no URL " + url);
            }
            return false;
        }

        final String s = url.toString();
        Iterator i = excludeCrawlingURL.iterator();
        while (i.hasNext()) {
            RE pattern = (RE) i.next();
            if (pattern.match(s)) {
                if (getLogger().isDebugEnabled()) {
                    getLogger().debug("exclude URL " + url);
                }
                return true;
            }
        }
        if (getLogger().isDebugEnabled()) {
            getLogger().debug("exclude not URL " + url);
        }
        return false;
    }

    /**
     * Check if an URL is a candidate for crawling.
     *
     * @param url the URL to test against the include patterns
     * @return true when the URL matches an include pattern, or when no
     *         include set was configured (include-all default)
     * @since
     */
    private boolean isIncludedURL(String url) {
        // by default include URL for crawling
        if (includeCrawlingURL == null) {
            if (getLogger().isDebugEnabled()) {
                getLogger().debug("include all URL " + url);
            }
            return true;
        }

        final String s = url.toString();
        Iterator i = includeCrawlingURL.iterator();
        while (i.hasNext()) {
            RE pattern = (RE) i.next();
            if (pattern.match(s)) {
                if (getLogger().isDebugEnabled()) {
                    getLogger().debug("include URL " + url);
                }
                return true;
            }
        }
        if (getLogger().isDebugEnabled()) {
            getLogger().debug("include not URL " + url);
        }
        return false;
    }

    /**
     * Release per-request state so the pooled instance can be reused.
     * Also drops <code>crawled</code> and <code>linksToProcess</code>, which
     * the original kept alive across requests.
     */
    public void recycle() {
        super.recycle();
        this.attributes = null;
        this.excludeCrawlingURL = null;  // re-created in setup()
        // includeCrawlingURL is configuration state, set once in configure();
        // clearing it here would lose the configured includes.
        this.crawled = null;
        this.linksToProcess = null;
    }
}
---------------------------------------------------------------------- In case of troubles, e-mail: [EMAIL PROTECTED] To unsubscribe, e-mail: [EMAIL PROTECTED] For additional commands, e-mail: [EMAIL PROTECTED]