DefaultOutputDocument.java

rgardler Tue, 21 Nov 2006 18:31:51 -0800

Author: rgardler
Date: Tue Nov 21 17:31:15 2006
New Revision: 478001

URL: http://svn.apache.org/viewvc?view=rev&rev=478001
Log:
Add a (very basic) crawler.


Modified:
    
forrest/trunk/whiteboard/forrest2/core/src/core/org/apache/forrest/cli/CLI.java
    
forrest/trunk/whiteboard/forrest2/core/src/core/org/apache/forrest/core/document/AbstractOutputDocument.java
    
forrest/trunk/whiteboard/forrest2/core/src/core/org/apache/forrest/core/document/DefaultOutputDocument.java

Modified: 
forrest/trunk/whiteboard/forrest2/core/src/core/org/apache/forrest/cli/CLI.java
URL: 
http://svn.apache.org/viewvc/forrest/trunk/whiteboard/forrest2/core/src/core/org/apache/forrest/cli/CLI.java?view=diff&rev=478001&r1=478000&r2=478001
==============================================================================
--- 
forrest/trunk/whiteboard/forrest2/core/src/core/org/apache/forrest/cli/CLI.java 
(original)
+++ 
forrest/trunk/whiteboard/forrest2/core/src/core/org/apache/forrest/cli/CLI.java 
Tue Nov 21 17:31:15 2006
@@ -16,13 +16,19 @@
  */
 package org.apache.forrest.cli;
 
+import java.io.IOException;
+import java.net.MalformedURLException;
 import java.net.URI;
+import java.net.URISyntaxException;
+import java.util.HashSet;
+import java.util.Set;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.forrest.core.Controller;
 import org.apache.forrest.core.IController;
 import org.apache.forrest.core.document.AbstractOutputDocument;
+import org.apache.forrest.core.exception.ProcessingException;
 
 /**
  * A command line interface for Forrest.
@@ -31,6 +37,12 @@
 public class CLI {
        private static final Log log = LogFactory.getLog(CLI.class);
 
+       private static Set<String> processedUris = new HashSet<String>();
+
+       private static Set<String> unProcessedUris = new HashSet<String>();
+
+       private static IController controller;
+
        /**
         * @param args
         */
@@ -45,19 +57,47 @@
 
                try {
                        AbstractOutputDocument doc = null;
+                       controller = new Controller();
                        System.out.println("\n Processing request for " + 
args[0]);
-                       final URI requestURI = new URI(args[0]);
-                       final IController controller = new Controller();
-                       doc = controller.getOutputDocument(requestURI);
-
-                       System.out.println("\n Resulting document for request " 
+ args[0]
-                                       + " is:\n");
-                       System.out.println(doc.getContentAsString());
-
+                       unProcessedUris.add(args[0]);
+                       while (unProcessedUris.size() > 0) {
+                               processURIs(unProcessedUris);
+                       }
                } catch (final Exception e) {
                        e.printStackTrace();
                        log.error(e);
                        System.exit(1);
+               }
+       }
+
+       /**
+        * Processes a URI to get the response document. Any local links found 
in
+        * the document are added to the list of documents to be processed.
+        * 
+        * @param uris
+        *            the URIs to process; any already processed are skipped
+        * @throws MalformedURLException
+        * @throws ProcessingException
+        * @throws IOException
+        * @throws URISyntaxException
+        */
+       private static void processURIs(final Set<String> uris)
+                       throws MalformedURLException, ProcessingException, 
IOException,
+                       URISyntaxException {
+               AbstractOutputDocument doc;
+               HashSet<String> processingUris = new HashSet<String>(uris);
+               unProcessedUris = new HashSet<String>();
+               for (String strUri : processingUris) {
+                       URI uri = new URI(strUri);
+                       if (!(processedUris.contains(strUri))) {
+                               log.debug("Processing: " + strUri);
+                               doc = controller.getOutputDocument(uri);
+                               
unProcessedUris.addAll(doc.getLocalDocumentLinks());
+                               System.out.println("\n Resulting document for 
request " + uri
+                                               + " is:\n");
+                               System.out.println(doc.getContentAsString());
+                               processedUris.add(strUri);
+                       }
                }
        }
 

Modified: 
forrest/trunk/whiteboard/forrest2/core/src/core/org/apache/forrest/core/document/AbstractOutputDocument.java
URL: 
http://svn.apache.org/viewvc/forrest/trunk/whiteboard/forrest2/core/src/core/org/apache/forrest/core/document/AbstractOutputDocument.java?view=diff&rev=478001&r1=478000&r2=478001
==============================================================================
--- 
forrest/trunk/whiteboard/forrest2/core/src/core/org/apache/forrest/core/document/AbstractOutputDocument.java
 (original)
+++ 
forrest/trunk/whiteboard/forrest2/core/src/core/org/apache/forrest/core/document/AbstractOutputDocument.java
 Tue Nov 21 17:31:15 2006
@@ -17,6 +17,7 @@
 package org.apache.forrest.core.document;
 
 import java.net.URI;
+import java.util.Set;
 
 /**
  * An output document is a single document that has been processed by Forrest
@@ -34,5 +35,14 @@
        public URI getRequestURI() {
                return this.requestURI;
        }
+
+       /**
+        * Get a set of links to local documents within this
+        * document. This is used to identify links that should
+        * be crawled when generating content.
+        * 
+        * @return
+        */
+       public abstract Set<String> getLocalDocumentLinks();
 
 }

Modified: 
forrest/trunk/whiteboard/forrest2/core/src/core/org/apache/forrest/core/document/DefaultOutputDocument.java
URL: 
http://svn.apache.org/viewvc/forrest/trunk/whiteboard/forrest2/core/src/core/org/apache/forrest/core/document/DefaultOutputDocument.java?view=diff&rev=478001&r1=478000&r2=478001
==============================================================================
--- 
forrest/trunk/whiteboard/forrest2/core/src/core/org/apache/forrest/core/document/DefaultOutputDocument.java
 (original)
+++ 
forrest/trunk/whiteboard/forrest2/core/src/core/org/apache/forrest/core/document/DefaultOutputDocument.java
 Tue Nov 21 17:31:15 2006
@@ -16,12 +16,24 @@
  */
 package org.apache.forrest.core.document;
 
+import java.util.HashSet;
+import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.log4j.Logger;
+
+import com.sun.org.apache.regexp.internal.RE;
+import com.sun.org.apache.regexp.internal.RESyntaxException;
+
 /**
  * The most basic of output documents. The document itself is nothing more than
  * a String.
  * 
  */
 public class DefaultOutputDocument extends AbstractOutputDocument {
+       
+       Logger log = Logger.getLogger(DefaultOutputDocument.class);
 
        public DefaultOutputDocument(final String content) {
                this.setContent(content);
@@ -30,6 +42,34 @@
        @Override
        public String getContentAsString() {
                return this.content;
+       }
+
+       /**
+        * Get the links that should be crawled from this document. Since the type 
of
+        * this document is not known (it's a string) it can be difficult to
+        * identify links. However, if the document appears to be an HTML string
+        * then href attributes of anchors are retrieved (only local links will 
be
+        * returned in the results).
+        */
+       @Override
+       public Set<String> getLocalDocumentLinks() {
+               Set<String> results = new HashSet<String>();
+               String content = getContentAsString();
+               if (content.contains("html") || content.contains("HTML")) {
+                       String rePattern = 
"<[a|A]\\s*href=\"([^\"#]+)\"\\s*>([^*<]+)</[a|A]>";
+                       Pattern pattern = Pattern.compile(rePattern);
+                       Matcher matcher = pattern.matcher(content);
+                       while (matcher.find()) {
+                               String href = matcher.group(1);
+                               if (href.startsWith("#") || 
href.startsWith("href://")) {
+                                       log.debug("Ignoring non-local href: " + 
href);
+                               } else {
+                           results.add(href);
+                           log.debug("Added local href: " + href);
+                               }
+               }
+               }
+               return results;
        }
 
 }

svn commit: r478001 - in /forrest/trunk/whiteboard/forrest2/core/src/core/org/apache/forrest: cli/CLI.java core/document/AbstractOutputDocument.java core/document/DefaultOutputDocument.java

Reply via email to