Author: theli
Date: 2006-03-09 13:19:46 +0100 (Thu, 09 Mar 2006)
New Revision: 1867

Modified:
   trunk/source/de/anomic/data/robotsParser.java
Log:
*) better robots.txt support
   - previously, rules for all crawlers and special rules for yacy were 
     combined using AND. Now the general rule will be ignored if there is
     a special rule for yacy (according to rfc)

Modified: trunk/source/de/anomic/data/robotsParser.java
===================================================================
--- trunk/source/de/anomic/data/robotsParser.java       2006-03-09 11:31:17 UTC 
(rev 1866)
+++ trunk/source/de/anomic/data/robotsParser.java       2006-03-09 12:19:46 UTC 
(rev 1867)
@@ -95,59 +95,61 @@
     }
     
     public static ArrayList parse(BufferedReader reader) throws IOException{
-        ArrayList deny = new ArrayList();
+        ArrayList deny4AllAgents = new ArrayList();
+        ArrayList deny4YaCyAgent = new ArrayList();
         
         int pos;
         String line = null, lineUpper = null;
-        boolean rule4Yacy = false, inBlock = false;        
+        boolean isRuleBlock4AllAgents = false,
+                isRuleBlock4YaCyAgent = false,
+                rule4YaCyFound = false,
+                inBlock = false;        
         
         while ((line = reader.readLine()) != null) {
             line = line.trim();
             lineUpper = line.toUpperCase();
             
             if (line.length() == 0) {
-                // we have reached the end of the rule block
-                rule4Yacy = false; inBlock = false;
+                // OLD: we have reached the end of the rule block
+                // rule4Yacy = false; inBlock = false;
+                
+                // NEW: just ignore it
             } else if (line.startsWith("#")) {
                 // we can ignore this. Just a comment line
             } else if (lineUpper.startsWith("User-agent:".toUpperCase())) {
                 
                 if (inBlock) {
+                    // we have detected the start of a new block
                     inBlock = false;
-                    rule4Yacy = false;
+                    isRuleBlock4AllAgents = false;
+                    isRuleBlock4YaCyAgent = false;
                 }
                 
-                if (!rule4Yacy) {
-                    // cutting off comments at the line end
-                    pos = line.indexOf("#");
-                    if (pos != -1) {
-                        line = line.substring(0,pos).trim();
-                    }
-                    
-                    // replacing all tabs with spaces
-                    line = line.replaceAll("\t"," ");
-                    
-                    // getting out the robots name
-                    pos = line.indexOf(" ");
-                    if (pos != -1) {
-                        String userAgent = line.substring(pos).trim();
-                        rule4Yacy = (userAgent.equals("*") || 
(userAgent.toLowerCase().indexOf("yacy") >=0));
-                    }
+                // cutting off comments at the line end
+                pos = line.indexOf("#");
+                if (pos != -1) line = line.substring(0,pos).trim();
+                
+                // replacing all tabs with spaces
+                line = line.replaceAll("\t"," ");
+                
+                // getting out the robots name
+                pos = line.indexOf(" ");
+                if (pos != -1) {
+                    String userAgent = line.substring(pos).trim();
+                    isRuleBlock4AllAgents |= userAgent.equals("*");
+                    isRuleBlock4YaCyAgent |= 
userAgent.toLowerCase().indexOf("yacy") >=0;
+                    if (isRuleBlock4YaCyAgent) rule4YaCyFound = true;
                 }
             } else if (lineUpper.startsWith("Disallow:".toUpperCase())) {
                 inBlock = true;
                 
-                if (rule4Yacy) {
+                if (isRuleBlock4YaCyAgent || isRuleBlock4AllAgents) {
                     // cutting off comments at the line end
                     pos = line.indexOf("#");
-                    if (pos != -1) {
-                        line = line.substring(0,pos).trim();
-                    }
+                    if (pos != -1) line = line.substring(0,pos).trim();
                                        
                     // cutting of tailing *
-                    if (line.endsWith("*")) {
-                        line = line.substring(0,line.length()-1);
-                    }
+                    if (line.endsWith("*")) line = 
line.substring(0,line.length()-1);
                     
                     // replacing all tabs with spaces
                     line = line.replaceAll("\t"," ");
@@ -172,13 +174,14 @@
                         path = path.replaceAll(";","%3B");                    
                         
                         // adding it to the pathlist
-                        deny.add(path);
+                        if (isRuleBlock4AllAgents) deny4AllAgents.add(path);
+                        if (isRuleBlock4YaCyAgent) deny4YaCyAgent.add(path);
                     }
                 }
             }
         }
         
-        return deny;
+        return (rule4YaCyFound)?deny4YaCyAgent:deny4AllAgents;
     }        
     
     public static boolean isDisallowed(URL nexturl) {
@@ -293,11 +296,16 @@
             
             // if we previously have downloaded this robots.txt then we can 
set the if-modified-since header
             httpHeader reqHeaders = new httpHeader();
+            
+            // adding referer
+            reqHeaders.put(httpHeader.REFERER, (new 
URL(robotsURL,"/")).toString());
+            
             if (entry != null) {
                 oldEtag = entry.getETag();
                 reqHeaders = new httpHeader();
                 Date modDate = entry.getModDate();
                 if (modDate != null) 
reqHeaders.put(httpHeader.IF_MODIFIED_SINCE,httpc.dateString(entry.getModDate()));
+                
             }
             
             httpc.response res = con.GET(robotsURL.getFile(), reqHeaders);

_______________________________________________
YaCy-svn mailing list
[email protected]
http://lists.berlios.de/mailman/listinfo/yacy-svn

Antwort per Email an