Revision: 6204
Author: [email protected]
Date: Thu Sep 24 08:24:11 2009
Log: Adds simple crawler to tools.
http://code.google.com/p/google-web-toolkit/source/detail?r=6204

Added:
 /branches/crawlability/eclipse/tools/simple-crawler
 /branches/crawlability/eclipse/tools/simple-crawler/.checkstyle
 /branches/crawlability/eclipse/tools/simple-crawler/.classpath
 /branches/crawlability/eclipse/tools/simple-crawler/.project
 /branches/crawlability/tools/simple-crawler
 /branches/crawlability/tools/simple-crawler/build.xml
 /branches/crawlability/tools/simple-crawler/src
 /branches/crawlability/tools/simple-crawler/src/com
 /branches/crawlability/tools/simple-crawler/src/com/google
 /branches/crawlability/tools/simple-crawler/src/com/google/gwt
 /branches/crawlability/tools/simple-crawler/src/com/google/gwt/simplecrawler
 /branches/crawlability/tools/simple-crawler/src/com/google/gwt/simplecrawler/Settings.java
 /branches/crawlability/tools/simple-crawler/src/com/google/gwt/simplecrawler/SimpleCrawler.java
 /branches/crawlability/tools/simple-crawler/src/com/google/gwt/simplecrawler/SimpleSitemapParser.java
Modified:
 /branches/crawlability/tools/build.xml

=======================================
--- /dev/null
+++ /branches/crawlability/eclipse/tools/simple-crawler/.checkstyle	Thu Sep 24 08:24:11 2009
@@ -0,0 +1,7 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<fileset-config file-format-version="1.2.0" simple-config="true">
+  <fileset name="all" enabled="true" check-config-name="GWT Checks" local="false">
+    <file-match-pattern match-pattern="." include-pattern="true"/>
+  </fileset>
+  <filter name="NonSrcDirs" enabled="true"/>
+</fileset-config>
=======================================
--- /dev/null
+++ /branches/crawlability/eclipse/tools/simple-crawler/.classpath	Thu Sep 24 08:24:11 2009
@@ -0,0 +1,7 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<classpath>
+  <classpathentry kind="src" path="src"/>
+  <classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER"/>
+  <classpathentry kind="con" path="org.eclipse.jdt.junit.JUNIT_CONTAINER/3"/>
+  <classpathentry kind="output" path="bin"/>
+</classpath>
=======================================
--- /dev/null
+++ /branches/crawlability/eclipse/tools/simple-crawler/.project	Thu Sep 24 08:24:11 2009
@@ -0,0 +1,24 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<projectDescription>
+  <name>simple-crawler</name>
+  <comment>Simple crawler</comment>
+  <projects>
+  </projects>
+  <buildSpec>
+    <buildCommand>
+      <name>org.eclipse.jdt.core.javabuilder</name>
+      <arguments>
+      </arguments>
+    </buildCommand>
+  </buildSpec>
+  <natures>
+    <nature>org.eclipse.jdt.core.javanature</nature>
+  </natures>
+  <linkedResources>
+    <link>
+      <name>src</name>
+      <type>2</type>
+      <locationURI>GWT_ROOT/tools/simple-crawler/src</locationURI>
+    </link>
+  </linkedResources>
+</projectDescription>
=======================================
--- /dev/null
+++ /branches/crawlability/tools/simple-crawler/build.xml	Thu Sep 24 08:24:11 2009
@@ -0,0 +1,31 @@
+<?xml version="1.0"?>
+
+<project name="simple-crawler" default="build" basedir=".">
+
+  <property name="gwt.root" location="../.." />
+  <property name="project.tail" value="tools/simple-crawler" />
+  <import file="${gwt.root}/common.ant.xml" />
+
+  <!-- Platform shouldn't matter here, just picking one -->
+  <property.ensure name="gwt.dev.jar" location="${gwt.build.lib}/gwt-dev-${build.host.platform}.jar" />
+
+  <target name="clean">
+    <delete dir="build"/>
+  </target>
+
+  <target name="build">
+
+    <mkdir dir="${javac.out}" />
+    <gwt.javac>
+      <classpath>
+        <pathelement location="${gwt.dev.jar}" />
+        <pathelement location="${gwt.user.jar}" />
+      </classpath>
+    </gwt.javac>
+  </target>
+
+  <target name="test"/>
+
+  <target name="checkstyle"/>
+
+</project>
=======================================
--- /dev/null
+++ /branches/crawlability/tools/simple-crawler/src/com/google/gwt/simplecrawler/Settings.java	Thu Sep 24 08:24:11 2009
@@ -0,0 +1,168 @@
+/*
+ * Copyright 2009 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+package com.google.gwt.simplecrawler;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.LinkedList;
+import java.util.List;
+
+/**
+ * Command-line settings for the simple crawler.
+ */
+public class Settings {
+  /**
+   * An exception indicating that there is a problem in an argument list.
+   */
+  public static class ArgumentListException extends Exception {
+    public ArgumentListException(String message) {
+      super(message);
+    }
+  }
+
+  /**
+   * One individual setting.
+   */
+  public abstract static class Setting<T> {
+    private final String help;
+    private T value;
+
+    public Setting(T initialValue, String help) {
+      value = initialValue;
+      this.help = help;
+    }
+
+    public T get() {
+      return value;
+    }
+
+    public String getHelp() {
+      return help;
+    }
+
+    public void set(T newValue) {
+      value = newValue;
+    }
+
+    /**
+     * Consumes arguments from the front of the list. If the front of the
+     * argument list is not a match, does nothing. If the front of the
+     * argument list is a match but has some problem, throws an exception.
+     */
+    abstract boolean consumeArguments(List<String> arguments)
+        throws ArgumentListException;
+  }
+
+  /**
+   * A setting that is an option followed by a string argument.
+   */
+  public static class StringSetting extends Setting<String> {
+    private final String option;
+
+    public StringSetting(String option, String argumentName,
+        String defaultSetting, String description) {
+      super(defaultSetting, option + " " + argumentName + " " + description);
+      this.option = option;
+    }
+
+    @Override
+    public String toString() {
+      return option + " " + get();
+    }
+
+    @Override
+    boolean consumeArguments(List<String> arguments)
+        throws ArgumentListException {
+      if (arguments.get(0).equals(option)) {
+        if (arguments.size() < 2) {
+          throw new ArgumentListException("Option " + option
+              + " requires an argument");
+        }
+        arguments.remove(0);
+        set(arguments.remove(0));
+        return true;
+      }
+      return false;
+    }
+  }
+
+  /**
+   * Processes the arguments from the command line.
+   *
+   * @param allArguments the raw command-line arguments
+   * @return processed settings
+   * @throws ArgumentListException
+   */
+  public static Settings fromArgumentList(String[] allArguments)
+      throws ArgumentListException {
+
+    Settings settings = new Settings();
+
+    List<String> remainingArguments = new LinkedList<String>(
+        Arrays.asList(allArguments));
+
+    // Handle hyphenated options
+    next_argument : while (!remainingArguments.isEmpty()) {
+      for (Setting<?> setting : settings.allSettings) {
+        if (setting.consumeArguments(remainingArguments)) {
+          continue next_argument;
+        }
+      }
+      System.err.println("Unknown argument: " + remainingArguments.get(0));
+      break; // No setting wanted the remaining arguments
+    }
+
+    // Enforce that either an initial URL or a sitemap file is supplied,
+    // but not both.
+    if ((settings.sitemap.get() == null) && (settings.initUrl.get() == null)) {
+      throw new ArgumentListException(
+          "Need to specify either a sitemap file or an initial URL.");
+    } else if ((settings.sitemap.get() != null)
+        && (settings.initUrl.get() != null)) {
+      throw new ArgumentListException(
+          "Please specify EITHER an initial URL OR a sitemap file.");
+    }
+    return settings;
+  }
+
+  /**
+   * Displays usage information.
+   *
+   * @return help message
+   */
+  public static String settingsHelp() {
+    StringBuffer help = new StringBuffer();
+    for (Setting<?> setting : new Settings().allSettings) {
+      help.append(setting.getHelp() + "\n");
+    }
+    return help.toString();
+  }
+
+  private List<Setting<?>> allSettings = new ArrayList<Setting<?>>();
+
+  public final Setting<String> sitemap = addSetting(new StringSetting(
+      "-sitemap", "filename", null, "name of the file that contains a sitemap"));
+
+  public final Setting<String> initUrl = addSetting(new StringSetting(
+      "-initUrl", "URL", null, "initial URL to start the crawl from"));
+
+  public final Setting<String> out = addSetting(new StringSetting("-out",
+      "filename", null, "file that output will be written to"));
+
+  private <T> Setting<T> addSetting(Setting<T> setting) {
+    allSettings.add(setting);
+    return setting;
+  }
+
+}
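
For context, this is how a caller drives the Settings class above — a minimal sketch, not part of this commit; the URL and file name are made up for illustration:

    // Hypothetical driver for Settings.fromArgumentList; the argument
    // values below are invented for illustration.
    import com.google.gwt.simplecrawler.Settings;

    public class SettingsDemo {
      public static void main(String[] args) {
        String[] fakeArgs = {
            "-initUrl", "http://localhost:8888/MyApp.html", "-out", "crawl.log"};
        try {
          Settings settings = Settings.fromArgumentList(fakeArgs);
          System.out.println(settings.initUrl.get()); // http://localhost:8888/MyApp.html
          System.out.println(settings.out.get());     // crawl.log
        } catch (Settings.ArgumentListException e) {
          // Thrown on a malformed option, or when neither/both of
          // -initUrl and -sitemap are given.
          System.err.println(e.getMessage());
          System.err.println(Settings.settingsHelp());
        }
      }
    }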
=======================================
--- /dev/null
+++ /branches/crawlability/tools/simple-crawler/src/com/google/gwt/simplecrawler/SimpleCrawler.java	Thu Sep 24 08:24:11 2009
@@ -0,0 +1,213 @@
+/*
+ * Copyright 2009 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+package com.google.gwt.simplecrawler;
+
+import org.xml.sax.SAXException;
+
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.PrintWriter;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.HashSet;
+import java.util.LinkedList;
+import java.util.Queue;
+import java.util.Scanner;
+import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import javax.xml.parsers.ParserConfigurationException;
+
+/**
+ * PLEASE READ, THIS IS NOT THE USUAL YADDA YADDA.
+ *
+ * This class performs a simple crawl of your web site. It is intended as a
+ * tool for developers who make their Ajax applications crawlable. While it
+ * was developed by Google engineers, it makes *ABSOLUTELY NO GUARANTEES*
+ * about behaving like the actual Google web crawler. This crawler is
+ * initialized either with a URL or with a sitemap, and it follows hyperlinks
+ * on the same site. This lets developers get an idea of what the real
+ * crawler will see.
+ */
+public class SimpleCrawler {
+  /**
+   * Set of all web pages already seen.
+   */
+  private static Set<String> alreadySeenSet = new HashSet<String>();
+
+  /**
+   * Pattern to extract HREFs from pages.
+   */
+  private static final String HREF = "<[^>]*href=\"([^\"]*)\"";
+
+  /**
+   * Defaults to System.out if nothing is specified by the user.
+   */
+  private static PrintWriter out = new PrintWriter(System.out, true);
+
+  /**
+   * Queue of pages yet to be crawled.
+   */
+  private static Queue<String> urlQueue = new LinkedList<String>();
+
+  /**
+   * Gets the content of a URL via a web connection.
+   *
+   * @param urlString the URL to fetch
+   * @return content of the URL, or null if it could not be fetched
+   */
+  private static String getUrlContent(String urlString) {
+    try {
+      URL url = new URL(urlString);
+      InputStream urlInputStream = url.openStream();
+      Scanner contentScanner = new Scanner(urlInputStream, "UTF-8");
+      String content = "";
+      if (contentScanner.hasNextLine()) {
+        // Reads the whole stream as a single token; trick documented here:
+        // http://weblogs.java.net/blog/pat/archive/2004/10/stupid_scanner_1.html
+        content = contentScanner.useDelimiter("\\A").next();
+      }
+      return content;
+    } catch (MalformedURLException e) {
+      System.err.println("Malformed URL: " + urlString);
+      return null;
+    } catch (IOException e) {
+      System.err.println("Could not open URL: " + e.getMessage());
+      return null;
+    }
+  }
+
+  /**
+   * Entry point.
+   */
+  public static void main(String[] args) throws ParserConfigurationException,
+      SAXException {
+    try {
+      Settings settings = Settings.fromArgumentList(args);
+      if (settings.out.get() != null) {
+        out = new PrintWriter(new FileOutputStream(settings.out.get()), true);
+      }
+      if (settings.initUrl.get() != null) {
+        String initialUrl = settings.initUrl.get();
+        urlQueue.add(initialUrl);
+      } else {
+        SimpleSitemapParser.parseSitemap(settings.sitemap.get(), urlQueue);
+      }
+      runUntilQueueEmpty();
+    } catch (Settings.ArgumentListException e) {
+      System.err.println(e.getMessage());
+      System.err.println("Usage: java com.google.gwt.simplecrawler.SimpleCrawler [-sitemap SiteMapFile | -initUrl URL] [-out outFile]");
+      System.err.println(Settings.settingsHelp());
+      System.exit(1);
+    } catch (IOException e) {
+      System.err.println("Could not open file: " + e.getMessage());
+      System.exit(1);
+    }
+  }
+
+  /**
+   * Creates a full URL from a hash fragment.
+   *
+   * @return full URL
+   */
+  private static String makeFullUrlFromFragment(String hashFragment,
+      String nextUrl) {
+    // Get the baseUrl up to the query parameters.
+    String baseUrl = nextUrl.replaceAll("\\?.*", "");
+    return baseUrl + hashFragment;
+  }
+
+  /**
+   * Maps a URL to its crawler form, which in short means #! is rewritten to
+   * _escaped_fragment_=. For example, www.example.com#!mystate is mapped to
+   * www.example.com?_escaped_fragment_=mystate.
+   *
+   * @param url the URL to map
+   * @return mapped URL
+   */
+  private static String mapToCrawlerUrl(String url) {
+    String toReturn = url;
+    if (toReturn.contains("#!")) {
+      if (!toReturn.contains("?")) {
+        toReturn = toReturn.replaceAll("#!", "?_escaped_fragment_=");
+      } else {
+        toReturn = toReturn.replaceAll("#!", "&_escaped_fragment_=");
+      }
+    }
+    return toReturn;
+  }
+
+  /**
+   * Maps back to the original URL, which can contain #!. For example,
+   * www.example.com?_escaped_fragment_=mystate is mapped to
+   * www.example.com#!mystate.
+   *
+   * @param url the URL to map
+   * @return mapped URL
+   */
+  private static String mapToOriginalUrl(String url) {
+    String toReturn = url;
+    if (toReturn.contains("_escaped_fragment_=")) {
+      // Also strip the '?' or '&' that introduced the query parameter, so
+      // the result matches the original #! form described above.
+      toReturn = toReturn.replaceAll("[?&]_escaped_fragment_=", "#!");
+    }
+    return toReturn;
+  }
+
+  /**
+   * Gets the content for all the URLs on the queue, extracts links from it,
+   * and follows the links.
+   */
+  private static void runUntilQueueEmpty() {
+    /*
+     * This pattern is correct in many, but not all, cases. It would be
+     * impossible to find a pattern that matches all cases, for example
+     * faulty HREFs.
+     */
+    Pattern hrefPattern = Pattern.compile(HREF, Pattern.CASE_INSENSITIVE);
+    while (!urlQueue.isEmpty()) {
+      String nextUrl = urlQueue.poll();
+      if (!alreadySeenSet.contains(nextUrl)) {
+        alreadySeenSet.add(nextUrl);
+        out.println("------- The original URL is: " + nextUrl + " ------");
+        nextUrl = mapToCrawlerUrl(nextUrl);
+        out.println("------- The crawler is requesting the following URL: " + nextUrl + " ------");
+        String indexedUrl = mapToOriginalUrl(nextUrl);
+        if (indexedUrl.compareTo(nextUrl) != 0) {
+          out.println("------- NOTE: This page will be indexed with the following URL: "
+              + indexedUrl + " ------");
+        }
+        String nextUrlContent = getUrlContent(nextUrl);
+        if (nextUrlContent == null) {
+          out.println("------ no content for this web page ------");
+        } else {
+          out.println(nextUrlContent);
+          Matcher matcher = hrefPattern.matcher(nextUrlContent);
+          while (matcher.find()) {
+            String extractedUrl = matcher.group(1);
+            // Only follow same-site links: relative URLs and fragments.
+            if (!extractedUrl.startsWith("http")) {
+              if (extractedUrl.startsWith("#")) {
+                extractedUrl = makeFullUrlFromFragment(extractedUrl, nextUrl);
+              }
+              if ((extractedUrl.length() > 0)
+                  && (!alreadySeenSet.contains(extractedUrl))) {
+                urlQueue.add(extractedUrl);
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+}
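
The #! rewriting done by mapToCrawlerUrl is easy to check in isolation. Below is a self-contained sketch, not part of this commit, that mirrors the same logic (the method itself is private, so it cannot be called directly); the URLs are made up:

    // Mirrors the private mapToCrawlerUrl logic for a quick sanity check.
    public class EscapedFragmentDemo {
      static String mapToCrawlerUrl(String url) {
        if (!url.contains("#!")) {
          return url;
        }
        // If the URL already has a query string, append with '&';
        // otherwise start one with '?'.
        return url.contains("?")
            ? url.replace("#!", "&_escaped_fragment_=")
            : url.replace("#!", "?_escaped_fragment_=");
      }

      public static void main(String[] args) {
        System.out.println(mapToCrawlerUrl("http://www.example.com/app#!mystate"));
        // -> http://www.example.com/app?_escaped_fragment_=mystate
        System.out.println(mapToCrawlerUrl("http://www.example.com/app?locale=en#!mystate"));
        // -> http://www.example.com/app?locale=en&_escaped_fragment_=mystate
      }
    }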
=======================================
--- /dev/null
+++ /branches/crawlability/tools/simple-crawler/src/com/google/gwt/simplecrawler/SimpleSitemapParser.java	Thu Sep 24 08:24:11 2009
@@ -0,0 +1,96 @@
+/*
+ * Copyright 2009 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+package com.google.gwt.simplecrawler;
+
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+import java.io.BufferedInputStream;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Queue;
+
+import javax.xml.parsers.ParserConfigurationException;
+import javax.xml.parsers.SAXParser;
+import javax.xml.parsers.SAXParserFactory;
+
+/**
+ * This simple sitemap parser only looks at url entries and extracts the loc
+ * elements from them. It ignores everything else.
+ * The sitemap format is documented here: http://www.sitemaps.org.
+ * A simple sitemap looks as follows:
+ *
+ * <?xml version="1.0" encoding="UTF-8"?>
+ * <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
+ *   <url><loc>http://j15r.com:8800/Showcase/Showcase.html?locale=ar</loc></url>
+ *   <url><loc>http://j15r.com:8800/Showcase/Showcase.html?locale=en</loc></url>
+ * </urlset>
+ */
+public class SimpleSitemapParser {
+
+  /**
+   * Parses a sitemap file.
+   *
+   * @param fileName file name of the sitemap. The sitemap format is
+   *          documented at {@link http://www.sitemaps.org/}
+   * @param urlQueue queue where all URLs in the sitemap will be stored
+   * @throws ParserConfigurationException
+   * @throws SAXException
+   * @throws IOException
+   */
+  public static void parseSitemap(String fileName, Queue<String> urlQueue)
+      throws ParserConfigurationException, SAXException, IOException {
+    DefaultHandler handler = makeSiteMapParserHandler(urlQueue);
+    SAXParserFactory factoryMain = SAXParserFactory.newInstance();
+    factoryMain.setNamespaceAware(true);
+    SAXParser saxParser = factoryMain.newSAXParser();
+    InputStream in = new FileInputStream(fileName);
+    in = new BufferedInputStream(in);
+    saxParser.parse(in, handler);
+  }
+
+  /**
+   * Creates the SAX handler that performs the parse.
+   *
+   * @param urlQueue queue where all URLs in the sitemap will be stored
+   * @return the SAX parser handler
+   */
+  private static DefaultHandler makeSiteMapParserHandler(
+      final Queue<String> urlQueue) {
+    return new DefaultHandler() {
+      StringBuilder valueBuilder = new StringBuilder();
+
+      @Override
+      public void characters(char ch[], int start, int length) {
+        valueBuilder.append(ch, start, length);
+      }
+
+      @Override
+      public void endElement(String uri, String localName, String qName) {
+        // Each <loc> element holds one URL to crawl.
+        if (localName.compareTo("loc") == 0) {
+          String url = valueBuilder.toString();
+          urlQueue.add(url);
+        }
+      }
+
+      @Override
+      public void startElement(String uri, String localName, String qName,
+          final Attributes attributes) {
+        valueBuilder.delete(0, valueBuilder.length());
+      }
+    };
+  }
+}
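
The parser can also be driven on its own — a minimal sketch, not part of this commit; the sitemap file name is invented:

    import java.util.LinkedList;
    import java.util.Queue;

    import com.google.gwt.simplecrawler.SimpleSitemapParser;

    public class SitemapDemo {
      public static void main(String[] args) throws Exception {
        // Collect every <loc> URL from a local sitemap file and print it.
        Queue<String> urls = new LinkedList<String>();
        SimpleSitemapParser.parseSitemap("sitemap.xml", urls);
        for (String url : urls) {
          System.out.println(url);
        }
      }
    }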
=======================================
--- /branches/crawlability/tools/build.xml	Mon Dec 1 20:28:46 2008
+++ /branches/crawlability/tools/build.xml	Thu Sep 24 08:24:11 2009
@@ -17,8 +17,12 @@
   <target name="soyc-vis" depends="" description="Compile SOYC dashboard">
     <gwt.ant dir="soyc-vis" />
   </target>
-
-  <target name="-do" depends="benchmark-viewer,api-checker,soyc-vis" description="Run all subfolders" />
+
+  <target name="simple-crawler" depends="" description="Compile simple crawler">
+    <gwt.ant dir="simple-crawler" />
+  </target>
+
+  <target name="-do" depends="benchmark-viewer,api-checker,soyc-vis,simple-crawler" description="Run all subfolders" />
 
   <target name="build" description="Build each subfolder">
     <antcall target="-do">
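
Once built, the crawler is launched per the usage string in SimpleCrawler.main; the class path placeholder and file names below are illustrative:

    java -cp <classes-and-gwt-jars> com.google.gwt.simplecrawler.SimpleCrawler -initUrl http://localhost:8888/MyApp.html -out crawl.log
    java -cp <classes-and-gwt-jars> com.google.gwt.simplecrawler.SimpleCrawler -sitemap sitemap.xml -out crawl.log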
