Author: olegk
Date: Tue Nov 11 03:47:18 2008
New Revision: 713022
URL: http://svn.apache.org/viewvc?rev=713022&view=rev
Log:
DROIDS-31: Ported Norobots unit tests from HttpComponents
Modified:
incubator/droids/trunk/droids-norobots/src/main/java/org/apache/droids/norobots/NoRobotClient.java
incubator/droids/trunk/droids-norobots/src/test/java/org/apache/droids/norobots/TestNorobotsClient.java
Modified:
incubator/droids/trunk/droids-norobots/src/main/java/org/apache/droids/norobots/NoRobotClient.java
URL:
http://svn.apache.org/viewvc/incubator/droids/trunk/droids-norobots/src/main/java/org/apache/droids/norobots/NoRobotClient.java?rev=713022&r1=713021&r2=713022&view=diff
==============================================================================
--- incubator/droids/trunk/droids-norobots/src/main/java/org/apache/droids/norobots/NoRobotClient.java (original)
+++ incubator/droids/trunk/droids-norobots/src/main/java/org/apache/droids/norobots/NoRobotClient.java Tue Nov 11 03:47:18 2008
@@ -53,11 +53,13 @@
private final String userAgent;
private URI baseURI;
+ private URI robotsURI;
private RulesEngine rules;
private RulesEngine wildcardRules;
/**
- * Create a Client for a particular user-agent name.
+ * Create a Client for a particular user-agent name and the given
+ * {@link ContentLoader}.
*
* @param userAgent name for the robot
*/
@@ -75,6 +77,15 @@
}
/**
+ * Create a Client for a particular user-agent name.
+ *
+ * @param userAgent name for the robot
+ */
+ public NoRobotClient(String userAgent) {
+ this(new SimpleContentLoader(), userAgent);
+ }
+
+ /**
* Head to a website and suck in their robots.txt file.
* Note that the URL passed in is for the website and does
* not include the robots.txt file itself.
@@ -84,7 +95,7 @@
public void parse(URI baseUri) throws IOException, NoRobotException {
URI uri;
try {
- uri = baseUri.resolve(new URI("/robots.txt"));
+ uri = baseUri.resolve(new URI("robots.txt"));
} catch (URISyntaxException ex) {
throw new NoRobotException("Invalid URI", ex);
}
@@ -93,23 +104,26 @@
return;
}
InputStream instream = contentLoader.load(uri);
- try {
- parseText(instream);
- } finally {
- instream.close();
- }
+ parseText(instream);
+ robotsURI = uri;
baseURI = baseUri;
}
public void parseText(InputStream instream) throws IOException {
- Map<String, RulesEngine> map = parse(instream);
- this.rules = map.get(this.userAgent);
- if (this.rules == null) {
- this.rules = new RulesEngine();
- }
- this.wildcardRules = map.get("*");
- if (this.wildcardRules == null) {
- this.wildcardRules = new RulesEngine();
+ baseURI = null;
+ robotsURI = null;
+ try {
+ Map<String, RulesEngine> map = doParse(instream);
+ this.rules = map.get(this.userAgent);
+ if (this.rules == null) {
+ this.rules = new RulesEngine();
+ }
+ this.wildcardRules = map.get("*");
+ if (this.wildcardRules == null) {
+ this.wildcardRules = new RulesEngine();
+ }
+ } finally {
+ instream.close();
}
}
@@ -208,29 +222,40 @@
throw new IllegalStateException("You must call parse before you call this method. ");
}
- if (baseURI != null && (!baseURI.getHost().equals(uri.getHost()) ||
+ if (baseURI != null &&
+ (!equals(baseURI.getHost(), uri.getHost()) ||
baseURI.getPort() != uri.getPort() ||
- !baseURI.getScheme().equals(uri.getScheme())))
+ !equals(baseURI.getScheme(), uri.getScheme())))
{
throw new IllegalArgumentException(
"Illegal to use a different url, " + uri.toString() +
", for this robots.txt: " + baseURI.toString());
}
- if((uri.equals(baseURI))) {
+ if (uri.equals(robotsURI)) {
return true;
}
- String uriStr = uri.toString();
+ String path = uri.getPath();
+ if (baseURI != null) {
+ String basepath = baseURI.getPath();
+ if (path.startsWith(basepath)) {
+ path = path.substring(basepath.length());
+ if (!path.startsWith("/")) {
+ path = "/" + path;
+ }
+ }
+ }
+
try {
- uriStr = URLDecoder.decode(uriStr, US_ASCII);
+ path = URLDecoder.decode(path, US_ASCII);
} catch (UnsupportedEncodingException ex) {
// ASCII always supported
return false;
}
- Boolean allowed = this.rules.isAllowed( uriStr );
+ Boolean allowed = this.rules.isAllowed( path );
if(allowed == null) {
- allowed = this.wildcardRules.isAllowed( uriStr );
+ allowed = this.wildcardRules.isAllowed( path );
}
if(allowed == null) {
allowed = Boolean.TRUE;
@@ -239,4 +264,9 @@
return allowed.booleanValue();
}
+ private static boolean equals(final Object obj1, final Object obj2) {
+ return obj1 == null ? obj2 == null : obj1.equals(obj2);
+ }
+
+
}
Modified:
incubator/droids/trunk/droids-norobots/src/test/java/org/apache/droids/norobots/TestNorobotsClient.java
URL:
http://svn.apache.org/viewvc/incubator/droids/trunk/droids-norobots/src/test/java/org/apache/droids/norobots/TestNorobotsClient.java?rev=713022&r1=713021&r2=713022&view=diff
==============================================================================
--- incubator/droids/trunk/droids-norobots/src/test/java/org/apache/droids/norobots/TestNorobotsClient.java (original)
+++ incubator/droids/trunk/droids-norobots/src/test/java/org/apache/droids/norobots/TestNorobotsClient.java Tue Nov 11 03:47:18 2008
@@ -1,16 +1,189 @@
package org.apache.droids.norobots;
import java.io.ByteArrayInputStream;
+import java.net.MalformedURLException;
import java.net.URI;
+import java.net.URISyntaxException;
+import java.net.URL;
import java.util.Map;
import junit.framework.Assert;
+import org.junit.BeforeClass;
import org.junit.Test;
public class TestNorobotsClient
{
+ private static URI BASE_URI;
+
+ @BeforeClass
+ public static void setupBaseURL() throws URISyntaxException {
+ ClassLoader cl = TestNorobotsClient.class.getClassLoader();
+ BASE_URI = cl.getResource("data/").toURI();
+ }
+
+ //-----------------------------------------------------------------------
+ // To test:
+ // create -> parse -> isUrlAllowed?
+
+ @Test
+ public void testAllowed() throws Exception {
+ URI target = BASE_URI.resolve("basic/");
+ NoRobotClient nrc = new NoRobotClient("Scabies-1.0");
+ nrc.parse(target);
+ Assert.assertTrue(nrc.isUrlAllowed(target.resolve("index.html")));
+ Assert.assertFalse(nrc.isUrlAllowed(target.resolve("view-cvs/")));
+ }
+
+ // Tests the example given in the RFC
+ @Test
+ public void testRfcExampleUnhipbot() throws Exception {
+ URI target = BASE_URI.resolve("rfc/");
+
+ NoRobotClient nrc = new NoRobotClient("unhipbot");
+ nrc.parse(target);
+
+ // Start of rfc test
+ Assert.assertFalse(nrc.isUrlAllowed(target));
+ Assert.assertFalse(nrc.isUrlAllowed(target.resolve("index.html")));
+ Assert.assertTrue(nrc.isUrlAllowed(target.resolve("robots.txt")));
+ Assert.assertFalse(nrc.isUrlAllowed(target.resolve("server.html")));
+ Assert.assertFalse(nrc.isUrlAllowed(target.resolve("services/fast.html")));
+ Assert.assertFalse(nrc.isUrlAllowed(target.resolve("services/slow.html")));
+ Assert.assertFalse(nrc.isUrlAllowed(target.resolve("orgo.gif")));
+ Assert.assertFalse(nrc.isUrlAllowed(target.resolve("org/about.html")));
+ Assert.assertFalse(nrc.isUrlAllowed(target.resolve("org/plans.html")));
+ Assert.assertFalse(nrc.isUrlAllowed(target.resolve("%7Ejim/jim.html")));
+ Assert.assertFalse(nrc.isUrlAllowed(target.resolve("%7Emak/mak.html")));
+ // End of rfc test
+ }
+
+ @Test
+ public void testRfcExampleWebcrawler() throws Exception {
+ URI target = BASE_URI.resolve("rfc/");
+
+ NoRobotClient nrc = new NoRobotClient("webcrawler");
+ nrc.parse(target);
+ // Start of rfc test
+ Assert.assertTrue(nrc.isUrlAllowed(target));
+ Assert.assertTrue(nrc.isUrlAllowed(target.resolve("index.html")));
+ Assert.assertTrue(nrc.isUrlAllowed(target.resolve("robots.txt")));
+ Assert.assertTrue(nrc.isUrlAllowed(target.resolve("server.html")));
+ Assert.assertTrue(nrc.isUrlAllowed(target.resolve("services/fast.html")));
+ Assert.assertTrue(nrc.isUrlAllowed(target.resolve("services/slow.html")));
+ Assert.assertTrue(nrc.isUrlAllowed(target.resolve("orgo.gif")));
+ Assert.assertTrue(nrc.isUrlAllowed(target.resolve("org/about.html")));
+ Assert.assertTrue(nrc.isUrlAllowed(target.resolve("org/plans.html")));
+ Assert.assertTrue(nrc.isUrlAllowed(target.resolve("%7Ejim/jim.html")));
+ Assert.assertTrue(nrc.isUrlAllowed(target.resolve("%7Emak/mak.html")));
+ // End of rfc test
+ }
+
+ @Test
+ public void testRfcExampleExcite() throws Exception {
+ URI target = BASE_URI.resolve("rfc/");
+
+ NoRobotClient nrc = new NoRobotClient("excite");
+ nrc.parse(target);
+ // Start of rfc test
+ Assert.assertTrue(nrc.isUrlAllowed(target));
+ Assert.assertTrue(nrc.isUrlAllowed(target.resolve("index.html")));
+ Assert.assertTrue(nrc.isUrlAllowed(target.resolve("robots.txt")));
+ Assert.assertTrue(nrc.isUrlAllowed(target.resolve("server.html")));
+ Assert.assertTrue(nrc.isUrlAllowed(target.resolve("services/fast.html")));
+ Assert.assertTrue(nrc.isUrlAllowed(target.resolve("services/slow.html")));
+ Assert.assertTrue(nrc.isUrlAllowed(target.resolve("orgo.gif")));
+ Assert.assertTrue(nrc.isUrlAllowed(target.resolve("org/about.html")));
+ Assert.assertTrue(nrc.isUrlAllowed(target.resolve("org/plans.html")));
+ Assert.assertTrue(nrc.isUrlAllowed(target.resolve("%7Ejim/jim.html")));
+ Assert.assertTrue(nrc.isUrlAllowed(target.resolve("%7Emak/mak.html")));
+ // End of rfc test
+ }
+
+ @Test
+ public void testRfcExampleOther() throws Exception {
+ URI target = BASE_URI.resolve("rfc/");
+
+ NoRobotClient nrc = new NoRobotClient("other");
+ nrc.parse(target);
+ // Start of rfc test
+ Assert.assertFalse(nrc.isUrlAllowed(target));
+ Assert.assertFalse(nrc.isUrlAllowed(target.resolve("index.html")));
+ Assert.assertTrue(nrc.isUrlAllowed(target.resolve("robots.txt")));
+ Assert.assertTrue(nrc.isUrlAllowed(target.resolve("server.html")));
+ Assert.assertTrue(nrc.isUrlAllowed(target.resolve("services/fast.html")));
+ Assert.assertTrue(nrc.isUrlAllowed(target.resolve("services/slow.html")));
+ Assert.assertFalse(nrc.isUrlAllowed(target.resolve("orgo.gif")));
+ Assert.assertTrue(nrc.isUrlAllowed(target.resolve("org/about.html")));
+ Assert.assertFalse(nrc.isUrlAllowed(target.resolve("org/plans.html")));
+ Assert.assertFalse(nrc.isUrlAllowed(target.resolve("%7Ejim/jim.html")));
+ Assert.assertTrue(nrc.isUrlAllowed(target.resolve("%7Emak/mak.html")));
+ // End of rfc test
+ }
+
+ @Test
+ public void testRfcBadWebDesigner() throws Exception {
+ URI target = BASE_URI.resolve("bad/");
+
+ NoRobotClient nrc = new NoRobotClient("other");
+ nrc.parse(target);
+
+ Assert.assertTrue(nrc.isUrlAllowed(target.resolve("%7Etest/%7Efoo.html")));
+ }
+
+ // Tests NRB-3
+ // http://www.osjava.org:8080/jira/secure/ViewIssue.jspa?key=NRB-3
+ @Test
+ public void testNrb3() throws Exception {
+ URI target = BASE_URI.resolve("basic/");
+ NoRobotClient nrc = new NoRobotClient("Scabies-1.0");
+ nrc.parse(target);
+ Assert.assertTrue(nrc.isUrlAllowed(target.resolve("basic")));
+ }
+
+ // Tests NRB-6
+ // http://issues.osjava.org/jira/secure/ViewIssue.jspa?key=NRB-6
+ @Test
+ public void testNrb6() throws Exception {
+ URI target = BASE_URI.resolve("order/");
+ NoRobotClient nrc = new NoRobotClient("Scabies-1.0");
+ nrc.parse(target);
+ Assert.assertTrue("Specific then Wildcard not working as expected", nrc
+ .isUrlAllowed(target.resolve("order/")));
+
+ target = BASE_URI.resolve("order-reverse/");
+ nrc = new NoRobotClient("Scabies-1.0");
+ nrc.parse(target);
+ Assert.assertTrue("Wildcard then Specific not working as expected", nrc
+ .isUrlAllowed(target.resolve("order/")));
+ }
+
+ // Tests NRB-9
+ // http://issues.osjava.org/jira/secure/ViewIssue.jspa?key=NRB-9
+ @Test
+ public void testNrb9() throws Exception {
+ URI target = BASE_URI.resolve("disallow-empty/");
+ NoRobotClient nrc = new NoRobotClient("test");
+ nrc.parse(target);
+ Assert.assertTrue("'Disallow: ' should mean to disallow nothing", nrc
+ .isUrlAllowed(target.resolve("index.html")));
+ }
+
+ // Tests NRB-8
+ // http://issues.osjava.org/jira/secure/ViewIssue.jspa?key=NRB-8
+ @Test
+ public void testNrb8() throws Exception {
+ URI target = BASE_URI.resolve("ua-case-insensitive/");
+ String[] names = new String[] {"test", "TEST", "tEsT"};
+ for (int i = 0; i < names.length; i++) {
+ NoRobotClient nrc = new NoRobotClient(names[i]);
+ nrc.parse(target);
+ Assert.assertFalse("User-Agent names should be case insensitive", nrc
+ .isUrlAllowed(target.resolve("index.html")));
+ }
+ }
+
@Test
public void testRobotsParsing() throws Exception {
String s =