Author: theli
Date: 2006-03-09 15:03:54 +0100 (Thu, 09 Mar 2006)
New Revision: 1872

Modified:
   trunk/source/de/anomic/data/robotsParser.java
   trunk/source/de/anomic/plasma/plasmaCrawlRobotsTxt.java
Log:
*) Adding support for the robots.txt Allow directive

Modified: trunk/source/de/anomic/data/robotsParser.java
===================================================================
--- trunk/source/de/anomic/data/robotsParser.java       2006-03-09 14:03:24 UTC 
(rev 1871)
+++ trunk/source/de/anomic/data/robotsParser.java       2006-03-09 14:03:54 UTC 
(rev 1872)
@@ -75,6 +75,11 @@
  */
 public final class robotsParser{
     
+    public static final String ROBOTS_USER_AGENT = "User-agent:".toUpperCase();
+    public static final String ROBOTS_DISALLOW = "Disallow:".toUpperCase();
+    public static final String ROBOTS_ALLOW = "Allow:".toUpperCase();
+    public static final String ROBOTS_COMMENT = "#";
+    
     /*public robotsParser(URL robotsUrl){
      }*/
     /*
@@ -119,9 +124,9 @@
                 // rule4Yacy = false; inBlock = false;
                 
                 // NEW: just ignore it
-            } else if (line.startsWith("#")) {
+            } else if (line.startsWith(ROBOTS_COMMENT)) {
                 // we can ignore this. Just a comment line
-            } else if (lineUpper.startsWith("User-agent:".toUpperCase())) {
+            } else if (lineUpper.startsWith(ROBOTS_USER_AGENT)) {
                 
                 if (inBlock) {
                     // we have detected the start of a new block
@@ -131,7 +136,7 @@
                 }
                 
                 // cutting off comments at the line end
-                pos = line.indexOf("#");
+                pos = line.indexOf(ROBOTS_COMMENT);
                 if (pos != -1) line = line.substring(0,pos).trim();
                 
                 // replacing all tabs with spaces
@@ -145,12 +150,14 @@
                     isRuleBlock4YaCyAgent |= 
userAgent.toLowerCase().indexOf("yacy") >=0;
                     if (isRuleBlock4YaCyAgent) rule4YaCyFound = true;
                 }
-            } else if (lineUpper.startsWith("Disallow:".toUpperCase())) {
+            } else if (lineUpper.startsWith(ROBOTS_DISALLOW) || 
+                       lineUpper.startsWith(ROBOTS_ALLOW)) {
                 inBlock = true;
+                boolean isDisallowRule = lineUpper.startsWith(ROBOTS_DISALLOW);
                 
                 if (isRuleBlock4YaCyAgent || isRuleBlock4AllAgents) {
                     // cutting off comments at the line end
-                    pos = line.indexOf("#");
+                    pos = line.indexOf(ROBOTS_COMMENT);
                     if (pos != -1) line = line.substring(0,pos).trim();
                                        
                     // cutting of tailing *
@@ -176,9 +183,10 @@
                         }
                         
                         // escaping all occurences of ; because this char is 
used as special char in the Robots DB
-                        path = path.replaceAll(";","%3B");                    
+                        path = 
path.replaceAll(plasmaCrawlRobotsTxt.ROBOTS_DB_PATH_SEPARATOR,"%3B");           
         
                         
                         // adding it to the pathlist
+                        if (!isDisallowRule) path = "!" + path;
                         if (isRuleBlock4AllAgents) deny4AllAgents.add(path);
                         if (isRuleBlock4YaCyAgent) deny4YaCyAgent.add(path);
                     }

Modified: trunk/source/de/anomic/plasma/plasmaCrawlRobotsTxt.java
===================================================================
--- trunk/source/de/anomic/plasma/plasmaCrawlRobotsTxt.java     2006-03-09 
14:03:24 UTC (rev 1871)
+++ trunk/source/de/anomic/plasma/plasmaCrawlRobotsTxt.java     2006-03-09 
14:03:54 UTC (rev 1872)
@@ -59,6 +59,9 @@
 import de.anomic.kelondro.kelondroException;
 
 public class plasmaCrawlRobotsTxt {
+    
+    public static final String ROBOTS_DB_PATH_SEPARATOR = ";";    
+    
     kelondroMap robotsTable;
     private final File robotsTableFile;
     private int bufferkb;
@@ -168,7 +171,7 @@
                 this.disallowPathList = new LinkedList();
                 String csPl = (String) this.mem.get(DISALLOW_PATH_LIST);
                 if (csPl.length() > 0){
-                    String[] pathArray = csPl.split(";");
+                    String[] pathArray = csPl.split(ROBOTS_DB_PATH_SEPARATOR);
                     if ((pathArray != null)&&(pathArray.length > 0)) {
                         this.disallowPathList.addAll(Arrays.asList(pathArray));
                     }
@@ -200,7 +203,7 @@
                 StringBuffer pathListStr = new StringBuffer();
                 for (int i=0; i<disallowPathList.size();i++) {
                     pathListStr.append(disallowPathList.get(i))
-                               .append(";");
+                               .append(ROBOTS_DB_PATH_SEPARATOR);
                 }
                 
this.mem.put(DISALLOW_PATH_LIST,pathListStr.substring(0,pathListStr.length()-1));
             }
@@ -246,13 +249,26 @@
         }          
         
         public boolean isDisallowed(String path) {
-            if ((this.mem == null) || (this.disallowPathList.size() == 0)) 
return false;            
-            if ((path == null) || (path.length() == 0)) path = "/";
+            if ((this.mem == null) || (this.disallowPathList.size() == 0)) 
return false;   
             
+            // if the path is null or empty we set it to /
+            if ((path == null) || (path.length() == 0)) path = "/";            
+            // escaping all occurences of ; because this char is used as 
special char in the Robots DB
+            else  path = path.replaceAll(ROBOTS_DB_PATH_SEPARATOR,"%3B");
+            
+            
             Iterator pathIter = this.disallowPathList.iterator();
             while (pathIter.hasNext()) {
                 String nextPath = (String) pathIter.next();
-                if (path.startsWith(nextPath)) return true;
+                // allow rule
+                if (nextPath.startsWith("!") && nextPath.length() > 1 && 
path.startsWith(nextPath.substring(1))) {
+                    return false;
+                }
+                    
+                // disallow rule
+                if (path.startsWith(nextPath)) {
+                    return true;
+                }
             }
             return false;
         }

_______________________________________________
YaCy-svn mailing list
[email protected]
http://lists.berlios.de/mailman/listinfo/yacy-svn

Antwort per Email an