Author: olegk
Date: Tue Nov 11 03:47:18 2008
New Revision: 713022

URL: http://svn.apache.org/viewvc?rev=713022&view=rev
Log:
DROIDS-31: Ported Norobots unit tests from HttpComponents

Modified:
    incubator/droids/trunk/droids-norobots/src/main/java/org/apache/droids/norobots/NoRobotClient.java
    incubator/droids/trunk/droids-norobots/src/test/java/org/apache/droids/norobots/TestNorobotsClient.java

Modified: incubator/droids/trunk/droids-norobots/src/main/java/org/apache/droids/norobots/NoRobotClient.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-norobots/src/main/java/org/apache/droids/norobots/NoRobotClient.java?rev=713022&r1=713021&r2=713022&view=diff
==============================================================================
--- incubator/droids/trunk/droids-norobots/src/main/java/org/apache/droids/norobots/NoRobotClient.java (original)
+++ incubator/droids/trunk/droids-norobots/src/main/java/org/apache/droids/norobots/NoRobotClient.java Tue Nov 11 03:47:18 2008
@@ -53,11 +53,13 @@
   private final String userAgent;
   
   private URI baseURI;
+  private URI robotsURI;
   private RulesEngine rules;
   private RulesEngine wildcardRules;
 
   /**
-   * Create a Client for a particular user-agent name. 
+   * Create a Client for a particular user-agent name and the given
+   * {@link ContentLoader}. 
    *
    * @param userAgent name for the robot
    */
@@ -75,6 +77,15 @@
   }
 
   /**
+   * Create a Client for a particular user-agent name. 
+   *
+   * @param userAgent name for the robot
+   */
+  public NoRobotClient(String userAgent) {
+    this(new SimpleContentLoader(), userAgent);
+  }
+  
+  /**
    * Head to a website and suck in their robots.txt file. 
    * Note that the URL passed in is for the website and does 
    * not include the robots.txt file itself.
@@ -84,7 +95,7 @@
   public void parse(URI baseUri) throws IOException, NoRobotException {
     URI uri;
     try {
-      uri = baseUri.resolve(new URI("/robots.txt"));
+      uri = baseUri.resolve(new URI("robots.txt"));
     } catch (URISyntaxException ex) {
       throw new NoRobotException("Invalid URI", ex);
     }
@@ -93,23 +104,26 @@
       return;
     }
     InputStream instream = contentLoader.load(uri);
-    try {
-      parseText(instream);
-    } finally {
-      instream.close();
-    }
+    parseText(instream);
+    robotsURI = uri;
     baseURI = baseUri;
   }
 
   public void parseText(InputStream instream) throws IOException {
-    Map<String, RulesEngine> map = parse(instream);
-    this.rules = map.get(this.userAgent);
-    if (this.rules == null) {
-      this.rules = new RulesEngine();
-    }
-    this.wildcardRules = map.get("*");
-    if (this.wildcardRules == null) {
-      this.wildcardRules = new RulesEngine();
+    baseURI = null;
+    robotsURI = null;
+    try {
+      Map<String, RulesEngine> map = doParse(instream);
+      this.rules = map.get(this.userAgent);
+      if (this.rules == null) {
+        this.rules = new RulesEngine();
+      }
+      this.wildcardRules = map.get("*");
+      if (this.wildcardRules == null) {
+        this.wildcardRules = new RulesEngine();
+      }
+    } finally {
+      instream.close();
     }
   }
 
@@ -208,29 +222,40 @@
       throw new IllegalStateException("You must call parse before you call this method.  ");
     }
 
-    if (baseURI != null && (!baseURI.getHost().equals(uri.getHost()) ||
+    if (baseURI != null && 
+        (!equals(baseURI.getHost(), uri.getHost()) ||
          baseURI.getPort() != uri.getPort() ||
-        !baseURI.getScheme().equals(uri.getScheme())))
+        !equals(baseURI.getScheme(), uri.getScheme())))
     {
       throw new IllegalArgumentException(
           "Illegal to use a different url, " + uri.toString() + 
           ",  for this robots.txt: " + baseURI.toString());
     }
     
-    if((uri.equals(baseURI))) {
+    if (uri.equals(robotsURI)) {
       return true;
     }
     
-    String uriStr = uri.toString();
+    String path = uri.getPath();
+    if (baseURI != null) {
+      String basepath = baseURI.getPath();
+      if (path.startsWith(basepath)) {
+        path = path.substring(basepath.length());
+        if (!path.startsWith("/")) {
+          path = "/" + path;
+        }
+      }
+    }
+    
     try {
-      uriStr = URLDecoder.decode(uriStr, US_ASCII);
+      path = URLDecoder.decode(path, US_ASCII);
     } catch (UnsupportedEncodingException ex) {
       // ASCII always supported
       return false;
     }
-    Boolean allowed = this.rules.isAllowed( uriStr );
+    Boolean allowed = this.rules.isAllowed( path );
     if(allowed == null) {
-      allowed = this.wildcardRules.isAllowed( uriStr );
+      allowed = this.wildcardRules.isAllowed( path );
     }
     if(allowed == null) {
       allowed = Boolean.TRUE;
@@ -239,4 +264,9 @@
     return allowed.booleanValue();
   }
 
+  private static boolean equals(final Object obj1, final Object obj2) {
+    return obj1 == null ? obj2 == null : obj1.equals(obj2);
+  }
+
+  
 }

Modified: incubator/droids/trunk/droids-norobots/src/test/java/org/apache/droids/norobots/TestNorobotsClient.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-norobots/src/test/java/org/apache/droids/norobots/TestNorobotsClient.java?rev=713022&r1=713021&r2=713022&view=diff
==============================================================================
--- incubator/droids/trunk/droids-norobots/src/test/java/org/apache/droids/norobots/TestNorobotsClient.java (original)
+++ incubator/droids/trunk/droids-norobots/src/test/java/org/apache/droids/norobots/TestNorobotsClient.java Tue Nov 11 03:47:18 2008
@@ -1,16 +1,189 @@
 package org.apache.droids.norobots;
 
 import java.io.ByteArrayInputStream;
+import java.net.MalformedURLException;
 import java.net.URI;
+import java.net.URISyntaxException;
+import java.net.URL;
 import java.util.Map;
 
 import junit.framework.Assert;
 
+import org.junit.BeforeClass;
 import org.junit.Test;
 
 public class TestNorobotsClient
 {
 
+  private static URI BASE_URI;
+  
+  @BeforeClass
+  public static void setupBaseURL() throws URISyntaxException {
+    ClassLoader cl = TestNorobotsClient.class.getClassLoader();
+    BASE_URI = cl.getResource("data/").toURI();
+  }
+  
+  //-----------------------------------------------------------------------
+  // To test: 
+  // create -> parse -> isUrlAllowed?
+
+  @Test
+  public void testAllowed() throws Exception {
+    URI target = BASE_URI.resolve("basic/");
+    NoRobotClient nrc = new NoRobotClient("Scabies-1.0");
+    nrc.parse(target);
+    Assert.assertTrue(nrc.isUrlAllowed(target.resolve("index.html")));
+    Assert.assertFalse(nrc.isUrlAllowed(target.resolve("view-cvs/")));
+  }
+
+  // Tests the example given in the RFC
+  @Test
+  public void testRfcExampleUnhipbot() throws Exception {
+    URI target = BASE_URI.resolve("rfc/");
+
+    NoRobotClient nrc = new NoRobotClient("unhipbot");
+    nrc.parse(target);
+
+    // Start of rfc test
+    Assert.assertFalse(nrc.isUrlAllowed(target));
+    Assert.assertFalse(nrc.isUrlAllowed(target.resolve("index.html")));
+    Assert.assertTrue(nrc.isUrlAllowed(target.resolve("robots.txt")));
+    Assert.assertFalse(nrc.isUrlAllowed(target.resolve("server.html")));
+    Assert.assertFalse(nrc.isUrlAllowed(target.resolve("services/fast.html")));
+    Assert.assertFalse(nrc.isUrlAllowed(target.resolve("services/slow.html")));
+    Assert.assertFalse(nrc.isUrlAllowed(target.resolve("orgo.gif")));
+    Assert.assertFalse(nrc.isUrlAllowed(target.resolve("org/about.html")));
+    Assert.assertFalse(nrc.isUrlAllowed(target.resolve("org/plans.html")));
+    Assert.assertFalse(nrc.isUrlAllowed(target.resolve("%7Ejim/jim.html")));
+    Assert.assertFalse(nrc.isUrlAllowed(target.resolve("%7Emak/mak.html")));
+    // End of rfc test
+  }
+
+  @Test
+  public void testRfcExampleWebcrawler() throws Exception {
+    URI target = BASE_URI.resolve("rfc/");
+
+    NoRobotClient nrc = new NoRobotClient("webcrawler");
+    nrc.parse(target);
+    // Start of rfc test
+    Assert.assertTrue(nrc.isUrlAllowed(target));
+    Assert.assertTrue(nrc.isUrlAllowed(target.resolve("index.html")));
+    Assert.assertTrue(nrc.isUrlAllowed(target.resolve("robots.txt")));
+    Assert.assertTrue(nrc.isUrlAllowed(target.resolve("server.html")));
+    Assert.assertTrue(nrc.isUrlAllowed(target.resolve("services/fast.html")));
+    Assert.assertTrue(nrc.isUrlAllowed(target.resolve("services/slow.html")));
+    Assert.assertTrue(nrc.isUrlAllowed(target.resolve("orgo.gif")));
+    Assert.assertTrue(nrc.isUrlAllowed(target.resolve("org/about.html")));
+    Assert.assertTrue(nrc.isUrlAllowed(target.resolve("org/plans.html")));
+    Assert.assertTrue(nrc.isUrlAllowed(target.resolve("%7Ejim/jim.html")));
+    Assert.assertTrue(nrc.isUrlAllowed(target.resolve("%7Emak/mak.html")));
+    // End of rfc test
+  }
+
+  @Test
+  public void testRfcExampleExcite() throws Exception {
+    URI target = BASE_URI.resolve("rfc/");
+
+    NoRobotClient nrc = new NoRobotClient("excite");
+    nrc.parse(target);
+    // Start of rfc test
+    Assert.assertTrue(nrc.isUrlAllowed(target));
+    Assert.assertTrue(nrc.isUrlAllowed(target.resolve("index.html")));
+    Assert.assertTrue(nrc.isUrlAllowed(target.resolve("robots.txt")));
+    Assert.assertTrue(nrc.isUrlAllowed(target.resolve("server.html")));
+    Assert.assertTrue(nrc.isUrlAllowed(target.resolve("services/fast.html")));
+    Assert.assertTrue(nrc.isUrlAllowed(target.resolve("services/slow.html")));
+    Assert.assertTrue(nrc.isUrlAllowed(target.resolve("orgo.gif")));
+    Assert.assertTrue(nrc.isUrlAllowed(target.resolve("org/about.html")));
+    Assert.assertTrue(nrc.isUrlAllowed(target.resolve("org/plans.html")));
+    Assert.assertTrue(nrc.isUrlAllowed(target.resolve("%7Ejim/jim.html")));
+    Assert.assertTrue(nrc.isUrlAllowed(target.resolve("%7Emak/mak.html")));
+    // End of rfc test
+  }
+
+  @Test
+  public void testRfcExampleOther() throws Exception {
+    URI target = BASE_URI.resolve("rfc/");
+
+    NoRobotClient nrc = new NoRobotClient("other");
+    nrc.parse(target);
+    // Start of rfc test
+    Assert.assertFalse(nrc.isUrlAllowed(target));
+    Assert.assertFalse(nrc.isUrlAllowed(target.resolve("index.html")));
+    Assert.assertTrue(nrc.isUrlAllowed(target.resolve("robots.txt")));
+    Assert.assertTrue(nrc.isUrlAllowed(target.resolve("server.html")));
+    Assert.assertTrue(nrc.isUrlAllowed(target.resolve("services/fast.html")));
+    Assert.assertTrue(nrc.isUrlAllowed(target.resolve("services/slow.html")));
+    Assert.assertFalse(nrc.isUrlAllowed(target.resolve("orgo.gif")));
+    Assert.assertTrue(nrc.isUrlAllowed(target.resolve("org/about.html")));
+    Assert.assertFalse(nrc.isUrlAllowed(target.resolve("org/plans.html")));
+    Assert.assertFalse(nrc.isUrlAllowed(target.resolve("%7Ejim/jim.html")));
+    Assert.assertTrue(nrc.isUrlAllowed(target.resolve("%7Emak/mak.html")));
+    // End of rfc test
+  }
+
+  @Test
+  public void testRfcBadWebDesigner() throws Exception {
+    URI target = BASE_URI.resolve("bad/");
+
+    NoRobotClient nrc = new NoRobotClient("other");
+    nrc.parse(target);
+
+    Assert.assertTrue(nrc.isUrlAllowed(target.resolve("%7Etest/%7Efoo.html")));
+  }
+
+  // Tests NRB-3
+  // http://www.osjava.org:8080/jira/secure/ViewIssue.jspa?key=NRB-3
+  @Test
+  public void testNrb3() throws Exception {
+    URI target = BASE_URI.resolve("basic/");
+    NoRobotClient nrc = new NoRobotClient("Scabies-1.0");
+    nrc.parse(target);
+    Assert.assertTrue(nrc.isUrlAllowed(target.resolve("basic")));
+  }
+
+  // Tests NRB-6
+  // http://issues.osjava.org/jira/secure/ViewIssue.jspa?key=NRB-6
+  @Test
+  public void testNrb6() throws Exception {
+    URI target = BASE_URI.resolve("order/");
+    NoRobotClient nrc = new NoRobotClient("Scabies-1.0");
+    nrc.parse(target);
+    Assert.assertTrue("Specific then Wildcard not working as expected", nrc
+        .isUrlAllowed(target.resolve("order/")));
+
+    target = BASE_URI.resolve("order-reverse/");
+    nrc = new NoRobotClient("Scabies-1.0");
+    nrc.parse(target);
+    Assert.assertTrue("Wildcard then Specific not working as expected", nrc
+        .isUrlAllowed(target.resolve("order/")));
+  }      
+
+  // Tests NRB-9
+  // http://issues.osjava.org/jira/secure/ViewIssue.jspa?key=NRB-9
+  @Test
+  public void testNrb9() throws Exception {
+    URI target = BASE_URI.resolve("disallow-empty/");
+    NoRobotClient nrc = new NoRobotClient("test");
+    nrc.parse(target);
+    Assert.assertTrue("'Disallow: ' should mean to disallow nothing", nrc
+        .isUrlAllowed(target.resolve("index.html")));
+  }
+
+  // Tests NRB-8
+  // http://issues.osjava.org/jira/secure/ViewIssue.jspa?key=NRB-8
+  @Test
+  public void testNrb8() throws Exception {
+    URI target = BASE_URI.resolve("ua-case-insensitive/");
+    String[] names = new String[] {"test", "TEST", "tEsT"};
+    for (int i = 0; i < names.length; i++) {
+      NoRobotClient nrc = new NoRobotClient(names[i]);
+      nrc.parse(target);
+      Assert.assertFalse("User-Agent names should be case insensitive", nrc
+          .isUrlAllowed(target.resolve("index.html")));
+    }
+  }
+
   @Test
   public void testRobotsParsing() throws Exception {
     String s = 


Reply via email to