Revision: 6204
Author: [email protected]
Date: Thu Sep 24 08:24:11 2009
Log: Adds simple crawler to tools.
http://code.google.com/p/google-web-toolkit/source/detail?r=6204

Added:
 /branches/crawlability/eclipse/tools/simple-crawler
 /branches/crawlability/eclipse/tools/simple-crawler/.checkstyle
 /branches/crawlability/eclipse/tools/simple-crawler/.classpath
 /branches/crawlability/eclipse/tools/simple-crawler/.project
 /branches/crawlability/tools/simple-crawler
 /branches/crawlability/tools/simple-crawler/build.xml
 /branches/crawlability/tools/simple-crawler/src
 /branches/crawlability/tools/simple-crawler/src/com
 /branches/crawlability/tools/simple-crawler/src/com/google
 /branches/crawlability/tools/simple-crawler/src/com/google/gwt
 /branches/crawlability/tools/simple-crawler/src/com/google/gwt/simplecrawler
 /branches/crawlability/tools/simple-crawler/src/com/google/gwt/simplecrawler/Settings.java
 /branches/crawlability/tools/simple-crawler/src/com/google/gwt/simplecrawler/SimpleCrawler.java
 /branches/crawlability/tools/simple-crawler/src/com/google/gwt/simplecrawler/SimpleSitemapParser.java
Modified:
 /branches/crawlability/tools/build.xml

=======================================
--- /dev/null
+++ /branches/crawlability/eclipse/tools/simple-crawler/.checkstyle	Thu Sep 24 08:24:11 2009
@@ -0,0 +1,7 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<fileset-config file-format-version="1.2.0" simple-config="true">
+  <fileset name="all" enabled="true" check-config-name="GWT Checks" local="false">
+    <file-match-pattern match-pattern="." include-pattern="true"/>
+  </fileset>
+  <filter name="NonSrcDirs" enabled="true"/>
+</fileset-config>
=======================================
--- /dev/null
+++ /branches/crawlability/eclipse/tools/simple-crawler/.classpath	Thu Sep 24 08:24:11 2009
@@ -0,0 +1,7 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<classpath>
+  <classpathentry kind="src" path="src"/>
+  <classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER"/>
+  <classpathentry kind="con" path="org.eclipse.jdt.junit.JUNIT_CONTAINER/3"/>
+  <classpathentry kind="output" path="bin"/>
+</classpath>
=======================================
--- /dev/null
+++ /branches/crawlability/eclipse/tools/simple-crawler/.project	Thu Sep 24 08:24:11 2009
@@ -0,0 +1,24 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<projectDescription>
+  <name>simple-crawler</name>
+  <comment>Simple crawler</comment>
+  <projects>
+  </projects>
+  <buildSpec>
+    <buildCommand>
+      <name>org.eclipse.jdt.core.javabuilder</name>
+      <arguments>
+      </arguments>
+    </buildCommand>
+  </buildSpec>
+  <natures>
+    <nature>org.eclipse.jdt.core.javanature</nature>
+  </natures>
+  <linkedResources>
+    <link>
+      <name>src</name>
+      <type>2</type>
+      <locationURI>GWT_ROOT/tools/simple-crawler/src</locationURI>
+    </link>
+  </linkedResources>
+</projectDescription>
=======================================
--- /dev/null
+++ /branches/crawlability/tools/simple-crawler/build.xml	Thu Sep 24 08:24:11 2009
@@ -0,0 +1,31 @@
+<?xml version="1.0"?>
+
+<project name="simple-crawler" default="build" basedir=".">
+
+  <property name="gwt.root" location="../.." />
+  <property name="project.tail" value="tools/simple-crawler" />
+  <import file="${gwt.root}/common.ant.xml" />
+
+  <!-- Platform shouldn't matter here, just picking one -->
+  <property.ensure name="gwt.dev.jar" location="${gwt.build.lib}/gwt-dev-${build.host.platform}.jar" />
+
+  <target name="clean">
+    <delete dir="build"/>
+  </target>
+
+  <target name="build">
+
+    <mkdir dir="${javac.out}" />
+    <gwt.javac>
+      <classpath>
+        <pathelement location="${gwt.dev.jar}" />
+        <pathelement location="${gwt.user.jar}" />
+      </classpath>
+    </gwt.javac>
+  </target>
+
+  <target name="test"/>
+
+  <target name="checkstyle"/>
+
+</project>
=======================================
--- /dev/null
+++ /branches/crawlability/tools/simple-crawler/src/com/google/gwt/simplecrawler/Settings.java	Thu Sep 24 08:24:11 2009
@@ -0,0 +1,168 @@
+/*
+ * Copyright 2009 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+package com.google.gwt.simplecrawler;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.LinkedList;
+import java.util.List;
+
+/**
+ * Command-line settings for the simple crawler.
+ */
+public class Settings {
+  /**
+   * An exception indicating that there is a problem in an argument list.
+   */
+  public static class ArgumentListException extends Exception {
+    public ArgumentListException(String message) {
+      super(message);
+    }
+  }
+
+  /**
+   * One individual setting.
+   */
+  public abstract static class Setting<T> {
+    private final String help;
+    private T value;
+
+    public Setting(T initialValue, String help) {
+      value = initialValue;
+      this.help = help;
+    }
+
+    public T get() {
+      return value;
+    }
+
+    public String getHelp() {
+      return help;
+    }
+
+    public void set(T newValue) {
+      value = newValue;
+    }
+
+    /**
+     * Consumes arguments from the front of the list. If the front of the
+     * argument list is not a match, does nothing. If the front of the
+     * argument list is a match but has some problem, throws an exception.
+     */
+    abstract boolean consumeArguments(List<String> arguments)
+        throws ArgumentListException;
+  }
+
+  /**
+   * A setting that is an option followed by a string argument.
+   */
+  public static class StringSetting extends Setting<String> {
+    private final String option;
+
+    public StringSetting(String option, String argumentName,
+        String defaultSetting, String description) {
+      super(defaultSetting, option + " " + argumentName + " " + description);
+      this.option = option;
+    }
+
+    @Override
+    public String toString() {
+      return option + " " + get();
+    }
+
+    @Override
+    boolean consumeArguments(List<String> arguments)
+        throws ArgumentListException {
+      if (arguments.get(0).equals(option)) {
+        if (arguments.size() < 2) {
+          throw new ArgumentListException("Option " + option
+              + " requires an argument");
+        }
+        arguments.remove(0);
+        set(arguments.remove(0));
+        return true;
+      }
+      return false;
+    }
+  }
+
+  /**
+   * Processes the arguments from the command line.
+   *
+   * @param allArguments the raw command-line arguments
+   * @return processed settings
+   * @throws ArgumentListException
+   */
+  public static Settings fromArgumentList(String[] allArguments)
+      throws ArgumentListException {
+
+    Settings settings = new Settings();
+
+    List<String> remainingArguments = new LinkedList<String>(
+        Arrays.asList(allArguments));
+
+    // Handle hyphenated options
+    next_argument : while (!remainingArguments.isEmpty()) {
+      for (Setting<?> setting : settings.allSettings) {
+        if (setting.consumeArguments(remainingArguments)) {
+          continue next_argument;
+        }
+      }
+      System.err.println("Unknown argument: " + remainingArguments.get(0));
+      break; // No setting wanted the remaining arguments
+    }
+
+    // Enforce that either an initial URL or a sitemap file is supplied,
+    // but not both.
+    if ((settings.sitemap.get() == null) && (settings.initUrl.get() == null)) {
+      throw new ArgumentListException(
+          "Need to specify either a sitemap file or an initial URL.");
+    } else if ((settings.sitemap.get() != null)
+        && (settings.initUrl.get() != null)) {
+      throw new ArgumentListException(
+          "Please specify EITHER an initial URL OR a sitemap file.");
+    }
+    return settings;
+  }
+
+  /**
+   * Displays usage information.
+   *
+   * @return help message
+   */
+  public static String settingsHelp() {
+    StringBuffer help = new StringBuffer();
+    for (Setting<?> setting : new Settings().allSettings) {
+      help.append(setting.getHelp() + "\n");
+    }
+    return help.toString();
+  }
+
+  private List<Setting<?>> allSettings = new ArrayList<Setting<?>>();
+
+  public final Setting<String> sitemap = addSetting(new StringSetting(
+      "-sitemap", "filename", null, "name of the file that contains a sitemap"));
+
+  public final Setting<String> initUrl = addSetting(new StringSetting(
+      "-initUrl", "URL", null, "initial URL to start the crawl from"));
+
+  public final Setting<String> out = addSetting(new StringSetting("-out",
+      "filename", null, "file that output will be written to"));
+
+  private <T> Setting<T> addSetting(Setting<T> setting) {
+    allSettings.add(setting);
+    return setting;
+  }
+
+}
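
For context, this is how a caller drives the Settings class above — a minimal sketch, not part of this commit; the URL and file name are made up for illustration:

    // Hypothetical driver for Settings.fromArgumentList; the argument
    // values below are invented for illustration.
    import com.google.gwt.simplecrawler.Settings;

    public class SettingsDemo {
      public static void main(String[] args) {
        String[] fakeArgs = {
            "-initUrl", "http://localhost:8888/MyApp.html", "-out", "crawl.log"};
        try {
          Settings settings = Settings.fromArgumentList(fakeArgs);
          System.out.println(settings.initUrl.get()); // http://localhost:8888/MyApp.html
          System.out.println(settings.out.get());     // crawl.log
        } catch (Settings.ArgumentListException e) {
          // Thrown on a malformed option, or when neither/both of
          // -initUrl and -sitemap are given.
          System.err.println(e.getMessage());
          System.err.println(Settings.settingsHelp());
        }
      }
    }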
=======================================
--- /dev/null
+++ /branches/crawlability/tools/simple-crawler/src/com/google/gwt/simplecrawler/SimpleCrawler.java	Thu Sep 24 08:24:11 2009
@@ -0,0 +1,213 @@
+/*
+ * Copyright 2009 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+package com.google.gwt.simplecrawler;
+
+import org.xml.sax.SAXException;
+
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.PrintWriter;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.HashSet;
+import java.util.LinkedList;
+import java.util.Queue;
+import java.util.Scanner;
+import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import javax.xml.parsers.ParserConfigurationException;
+
+/**
+ * PLEASE READ, THIS IS NOT THE USUAL YADDA YADDA.
+ *
+ * This class performs a simple crawl of your web site. It is intended as a
+ * tool for developers who make their Ajax applications crawlable. While it
+ * was developed by Google engineers, it makes *ABSOLUTELY NO GUARANTEES*
+ * about behaving like the actual Google web crawler. This crawler is
+ * initialized either with a URL or with a sitemap, and it follows hyperlinks
+ * on the same site. This lets developers get an idea of what the real
+ * crawler will see.
+ */
+public class SimpleCrawler {
+  /**
+   * Set of all web pages already seen.
+   */
+  private static Set<String> alreadySeenSet = new HashSet<String>();
+
+  /**
+   * Pattern to extract HREFs from pages.
+   */
+  private static final String HREF = "<[^>]*href=\"([^\"]*)\"";
+
+  /**
+   * Defaults to System.out if nothing is specified by the user.
+   */
+  private static PrintWriter out = new PrintWriter(System.out, true);
+
+  /**
+   * Queue of pages yet to be crawled.
+   */
+  private static Queue<String> urlQueue = new LinkedList<String>();
+
+  /**
+   * Gets the content of a URL via a web connection.
+   *
+   * @param urlString the URL to fetch
+   * @return content of the URL, or null if it could not be fetched
+   */
+  private static String getUrlContent(String urlString) {
+    try {
+      URL url = new URL(urlString);
+      InputStream urlInputStream = url.openStream();
+      Scanner contentScanner = new Scanner(urlInputStream, "UTF-8");
+      String content = "";
+      if (contentScanner.hasNextLine()) {
+        // Reads the whole stream as a single token; trick documented here:
+        // http://weblogs.java.net/blog/pat/archive/2004/10/stupid_scanner_1.html
+        content = contentScanner.useDelimiter("\\A").next();
+      }
+      return content;
+    } catch (MalformedURLException e) {
+      System.err.println("Malformed URL: " + urlString);
+      return null;
+    } catch (IOException e) {
+      System.err.println("Could not open URL: " + e.getMessage());
+      return null;
+    }
+  }
+
+  /**
+   * Entry point.
+   */
+  public static void main(String[] args) throws ParserConfigurationException,
+      SAXException {
+    try {
+      Settings settings = Settings.fromArgumentList(args);
+      if (settings.out.get() != null) {
+        out = new PrintWriter(new FileOutputStream(settings.out.get()), true);
+      }
+      if (settings.initUrl.get() != null) {
+        String initialUrl = settings.initUrl.get();
+        urlQueue.add(initialUrl);
+      } else {
+        SimpleSitemapParser.parseSitemap(settings.sitemap.get(), urlQueue);
+      }
+      runUntilQueueEmpty();
+    } catch (Settings.ArgumentListException e) {
+      System.err.println(e.getMessage());
+      System.err.println("Usage: java com.google.gwt.simplecrawler.SimpleCrawler [-sitemap SiteMapFile | -initUrl URL] [-out outFile]");
+      System.err.println(Settings.settingsHelp());
+      System.exit(1);
+    } catch (IOException e) {
+      System.err.println("Could not open file: " + e.getMessage());
+      System.exit(1);
+    }
+  }
+
+  /**
+   * Creates a full URL from a hash fragment.
+   *
+   * @return full URL
+   */
+  private static String makeFullUrlFromFragment(String hashFragment,
+      String nextUrl) {
+    // Get the baseUrl up to the query parameters.
+    String baseUrl = nextUrl.replaceAll("\\?.*", "");
+    return baseUrl + hashFragment;
+  }
+
+  /**
+   * Maps a URL to its crawler form, which in short means #! is rewritten to
+   * _escaped_fragment_=. For example, www.example.com#!mystate is mapped to
+   * www.example.com?_escaped_fragment_=mystate.
+   *
+   * @param url the URL to map
+   * @return mapped URL
+   */
+  private static String mapToCrawlerUrl(String url) {
+    String toReturn = url;
+    if (toReturn.contains("#!")) {
+      if (!toReturn.contains("?")) {
+        toReturn = toReturn.replaceAll("#!", "?_escaped_fragment_=");
+      } else {
+        toReturn = toReturn.replaceAll("#!", "&_escaped_fragment_=");
+      }
+    }
+    return toReturn;
+  }
+
+  /**
+   * Maps back to the original URL, which can contain #!. For example,
+   * www.example.com?_escaped_fragment_=mystate is mapped to
+   * www.example.com#!mystate.
+   *
+   * @param url the URL to map
+   * @return mapped URL
+   */
+  private static String mapToOriginalUrl(String url) {
+    String toReturn = url;
+    if (toReturn.contains("_escaped_fragment_=")) {
+      // Also strip the '?' or '&' that introduced the query parameter, so
+      // the result matches the original #! form described above.
+      toReturn = toReturn.replaceAll("[?&]_escaped_fragment_=", "#!");
+    }
+    return toReturn;
+  }
+
+  /**
+   * Gets the content for all the URLs on the queue, extracts links from it,
+   * and follows the links.
+   */
+  private static void runUntilQueueEmpty() {
+    /*
+     * This pattern is correct in many, but not all, cases. It would be
+     * impossible to find a pattern that matches all cases, for example
+     * faulty HREFs.
+     */
+    Pattern hrefPattern = Pattern.compile(HREF, Pattern.CASE_INSENSITIVE);
+    while (!urlQueue.isEmpty()) {
+      String nextUrl = urlQueue.poll();
+      if (!alreadySeenSet.contains(nextUrl)) {
+        alreadySeenSet.add(nextUrl);
+        out.println("------- The original URL is: " + nextUrl + " ------");
+        nextUrl = mapToCrawlerUrl(nextUrl);
+        out.println("------- The crawler is requesting the following URL: " + nextUrl + " ------");
+        String indexedUrl = mapToOriginalUrl(nextUrl);
+        if (indexedUrl.compareTo(nextUrl) != 0) {
+          out.println("------- NOTE: This page will be indexed with the following URL: "
+              + indexedUrl + " ------");
+        }
+        String nextUrlContent = getUrlContent(nextUrl);
+        if (nextUrlContent == null) {
+          out.println("------ no content for this web page ------");
+        } else {
+          out.println(nextUrlContent);
+          Matcher matcher = hrefPattern.matcher(nextUrlContent);
+          while (matcher.find()) {
+            String extractedUrl = matcher.group(1);
+            // Only follow same-site links: relative URLs and fragments.
+            if (!extractedUrl.startsWith("http")) {
+              if (extractedUrl.startsWith("#")) {
+                extractedUrl = makeFullUrlFromFragment(extractedUrl, nextUrl);
+              }
+              if ((extractedUrl.length() > 0)
+                  && (!alreadySeenSet.contains(extractedUrl))) {
+                urlQueue.add(extractedUrl);
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+}
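
The #! rewriting done by mapToCrawlerUrl is easy to check in isolation. Below is a self-contained sketch, not part of this commit, that mirrors the same logic (the method itself is private, so it cannot be called directly); the URLs are made up:

    // Mirrors the private mapToCrawlerUrl logic for a quick sanity check.
    public class EscapedFragmentDemo {
      static String mapToCrawlerUrl(String url) {
        if (!url.contains("#!")) {
          return url;
        }
        // If the URL already has a query string, append with '&';
        // otherwise start one with '?'.
        return url.contains("?")
            ? url.replace("#!", "&_escaped_fragment_=")
            : url.replace("#!", "?_escaped_fragment_=");
      }

      public static void main(String[] args) {
        System.out.println(mapToCrawlerUrl("http://www.example.com/app#!mystate"));
        // -> http://www.example.com/app?_escaped_fragment_=mystate
        System.out.println(mapToCrawlerUrl("http://www.example.com/app?locale=en#!mystate"));
        // -> http://www.example.com/app?locale=en&_escaped_fragment_=mystate
      }
    }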
=======================================
--- /dev/null
+++ /branches/crawlability/tools/simple-crawler/src/com/google/gwt/simplecrawler/SimpleSitemapParser.java	Thu Sep 24 08:24:11 2009
@@ -0,0 +1,96 @@
+/*
+ * Copyright 2009 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+package com.google.gwt.simplecrawler;
+
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+import java.io.BufferedInputStream;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Queue;
+
+import javax.xml.parsers.ParserConfigurationException;
+import javax.xml.parsers.SAXParser;
+import javax.xml.parsers.SAXParserFactory;
+
+/**
+ * This simple sitemap parser only looks at url entries and extracts the loc
+ * elements from them. It ignores everything else.
+ * The sitemap format is documented here: http://www.sitemaps.org.
+ * A simple sitemap looks as follows:
+ *
+ * <?xml version="1.0" encoding="UTF-8"?>
+ * <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
+ *   <url><loc>http://j15r.com:8800/Showcase/Showcase.html?locale=ar</loc></url>
+ *   <url><loc>http://j15r.com:8800/Showcase/Showcase.html?locale=en</loc></url>
+ * </urlset>
+ */
+public class SimpleSitemapParser {
+
+  /**
+   * Parses a sitemap file.
+   *
+   * @param fileName file name of the sitemap. The sitemap format is
+   *          documented at {@link http://www.sitemaps.org/}
+   * @param urlQueue queue where all URLs in the sitemap will be stored
+   * @throws ParserConfigurationException
+   * @throws SAXException
+   * @throws IOException
+   */
+  public static void parseSitemap(String fileName, Queue<String> urlQueue)
+      throws ParserConfigurationException, SAXException, IOException {
+    DefaultHandler handler = makeSiteMapParserHandler(urlQueue);
+    SAXParserFactory factoryMain = SAXParserFactory.newInstance();
+    factoryMain.setNamespaceAware(true);
+    SAXParser saxParser = factoryMain.newSAXParser();
+    InputStream in = new FileInputStream(fileName);
+    in = new BufferedInputStream(in);
+    saxParser.parse(in, handler);
+  }
+
+  /**
+   * Creates the SAX handler that performs the parse.
+   *
+   * @param urlQueue queue where all URLs in the sitemap will be stored
+   * @return the SAX parser handler
+   */
+  private static DefaultHandler makeSiteMapParserHandler(
+      final Queue<String> urlQueue) {
+    return new DefaultHandler() {
+      StringBuilder valueBuilder = new StringBuilder();
+
+      @Override
+      public void characters(char ch[], int start, int length) {
+        valueBuilder.append(ch, start, length);
+      }
+
+      @Override
+      public void endElement(String uri, String localName, String qName) {
+        // Each <loc> element holds one URL to crawl.
+        if (localName.compareTo("loc") == 0) {
+          String url = valueBuilder.toString();
+          urlQueue.add(url);
+        }
+      }
+
+      @Override
+      public void startElement(String uri, String localName, String qName,
+          final Attributes attributes) {
+        valueBuilder.delete(0, valueBuilder.length());
+      }
+    };
+  }
+}
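
The parser can also be driven on its own — a minimal sketch, not part of this commit; the sitemap file name is invented:

    import java.util.LinkedList;
    import java.util.Queue;

    import com.google.gwt.simplecrawler.SimpleSitemapParser;

    public class SitemapDemo {
      public static void main(String[] args) throws Exception {
        // Collect every <loc> URL from a local sitemap file and print it.
        Queue<String> urls = new LinkedList<String>();
        SimpleSitemapParser.parseSitemap("sitemap.xml", urls);
        for (String url : urls) {
          System.out.println(url);
        }
      }
    }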
=======================================
--- /branches/crawlability/tools/build.xml	Mon Dec 1 20:28:46 2008
+++ /branches/crawlability/tools/build.xml	Thu Sep 24 08:24:11 2009
@@ -17,8 +17,12 @@
   <target name="soyc-vis" depends="" description="Compile SOYC dashboard">
     <gwt.ant dir="soyc-vis" />
   </target>
-
-  <target name="-do" depends="benchmark-viewer,api-checker,soyc-vis" description="Run all subfolders" />
+
+  <target name="simple-crawler" depends="" description="Compile simple crawler">
+    <gwt.ant dir="simple-crawler" />
+  </target>
+
+  <target name="-do" depends="benchmark-viewer,api-checker,soyc-vis,simple-crawler" description="Run all subfolders" />
 
   <target name="build" description="Build each subfolder">
     <antcall target="-do">
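
Once built, the crawler is launched per the usage string in SimpleCrawler.main; the class path placeholder and file names below are illustrative:

    java -cp <classes-and-gwt-jars> com.google.gwt.simplecrawler.SimpleCrawler -initUrl http://localhost:8888/MyApp.html -out crawl.log
    java -cp <classes-and-gwt-jars> com.google.gwt.simplecrawler.SimpleCrawler -sitemap sitemap.xml -out crawl.log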
