Update of /cvsroot/nutch/nutch/src/java/net/nutch/fetcher
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv9264/src/java/net/nutch/fetcher
Modified Files:
Fetcher.java RobotRulesParser.java
Log Message:
The Fetcher used to be good at downloading URLs, but
lacked a few features that RequestScheduler has, such as
obeying robots.txt and following delay guidelines. It
now covers both cases.
Index: RobotRulesParser.java
===================================================================
RCS file: /cvsroot/nutch/nutch/src/java/net/nutch/fetcher/RobotRulesParser.java,v
retrieving revision 1.7
retrieving revision 1.8
diff -C2 -d -r1.7 -r1.8
*** RobotRulesParser.java 7 Apr 2003 06:46:07 -0000 1.7
--- RobotRulesParser.java 6 Apr 2004 23:20:53 -0000 1.8
***************
*** 22,352 ****
/**
* This class handles the parsing of <code>robots.txt</code> files.
*/
public class RobotRulesParser {
! public static final Logger LOG=
! LogFormatter.getLogger("net.nutch.fetcher.RobotRulesParser");
! private HashMap robotNames;
! private static final String CHARACTER_ENCODING= "UTF-8";
! private static final int NO_PRECEDENCE= Integer.MAX_VALUE;
! private static final RobotRuleSet EMPTY_RULES= new RobotRuleSet();
! /**
! * This class holds the rules which were parsed from a robots.txt
! * file, and can test paths against those rules.
! */
! public static class RobotRuleSet {
! ArrayList tmpEntries;
! RobotsEntry[] entries;
! private class RobotsEntry {
! String prefix;
! boolean allowed;
! RobotsEntry(String prefix, boolean allowed) {
! this.prefix= prefix;
! this.allowed= allowed;
! }
! }
! // should not be instantiated from outside RobotRulesParser
! private RobotRuleSet() {
! tmpEntries= new ArrayList();
! entries= null;
! }
! private void addPrefix(String prefix, boolean allow) {
! if (tmpEntries == null) {
! tmpEntries= new ArrayList();
! if (entries != null) {
! for (int i= 0; i < entries.length; i++)
! tmpEntries.add(entries[i]);
}
- entries= null;
- }
! tmpEntries.add(new RobotsEntry(prefix, allow));
! }
! private void clearPrefixes() {
! if (tmpEntries == null) {
! tmpEntries= new ArrayList();
! entries= null;
! } else {
! tmpEntries.clear();
! }
! }
! /**
! * Returns <code>false</code> if the <code>robots.txt</code> file
! * prohibits us from accessing the given <code>path</code>, or
! * <code>true</code> otherwise.
! */
! public boolean isAllowed(String path) {
! try {
! path= URLDecoder.decode(path, CHARACTER_ENCODING);
! } catch (Exception e) {
! // just ignore it- we can still try to match
! // path prefixes
! }
! if (entries == null) {
! entries= new RobotsEntry[tmpEntries.size()];
! entries= (RobotsEntry[])
! tmpEntries.toArray(entries);
! tmpEntries= null;
! }
! int pos= 0;
! int end= entries.length;
! while (pos < end) {
! if (path.startsWith(entries[pos].prefix))
! return entries[pos].allowed;
! pos++;
! }
! return true;
! }
! public String toString() {
! isAllowed("x"); // force String[] representation
! StringBuffer buf= new StringBuffer();
! for (int i= 0; i < entries.length; i++)
! if (entries[i].allowed)
! buf.append("Allow: " + entries[i].prefix
! + System.getProperty("line.separator"));
! else
! buf.append("Disallow: " + entries[i].prefix
! + System.getProperty("line.separator"));
! return buf.toString();
! }
! }
! /**
! * Creates a new <code>RobotRulesParser</code> which will use the
! * supplied <code>robotNames</code> when choosing which stanza to
! * follow in <code>robots.txt</code> files. Any name in the array
! * may be matched. The order of the <code>robotNames</code>
! * determines the precedence- if many names are matched, only the
! * rules associated with the robot name having the smallest index
! * will be used.
! */
! public RobotRulesParser(String[] robotNames) {
! this.robotNames= new HashMap();
! for (int i= 0; i < robotNames.length; i++) {
! this.robotNames.put(robotNames[i].toLowerCase(), new Integer(i));
}
- // always make sure "*" is included
- if (!this.robotNames.containsKey("*"))
- this.robotNames.put("*", new Integer(robotNames.length));
- }
! /**
!    * Returns a {@link RobotRuleSet} object which encapsulates the
! * rules parsed from the supplied <code>robotContent</code>.
! */
! RobotRuleSet parseRules(byte[] robotContent) {
! if (robotContent == null)
! return EMPTY_RULES;
! String content= new String (robotContent);
! StringTokenizer lineParser= new StringTokenizer(content, "\n\r");
! RobotRuleSet bestRulesSoFar= null;
! int bestPrecedenceSoFar= NO_PRECEDENCE;
! RobotRuleSet currentRules= new RobotRuleSet();
! int currentPrecedence= NO_PRECEDENCE;
! boolean addRules= false; // in stanza for our robot
! boolean doneAgents= false; // detect multiple agent lines
! while (lineParser.hasMoreTokens()) {
! String line= lineParser.nextToken();
! // trim out comments and whitespace
! int hashPos= line.indexOf("#");
! if (hashPos >= 0)
! line= line.substring(0, hashPos);
! line= line.trim();
! if ( (line.length() >= 11)
! && (line.substring(0, 11).equalsIgnoreCase("User-agent:")) ) {
! if (doneAgents) {
! if (currentPrecedence < bestPrecedenceSoFar) {
! bestPrecedenceSoFar= currentPrecedence;
! bestRulesSoFar= currentRules;
! currentPrecedence= NO_PRECEDENCE;
! currentRules= new RobotRuleSet();
! }
! addRules= false;
! }
! doneAgents= false;
! String agentNames= line.substring(line.indexOf(":") + 1);
! agentNames= agentNames.trim();
! StringTokenizer agentTokenizer= new StringTokenizer(agentNames);
! while (agentTokenizer.hasMoreTokens()) {
! // for each agent listed, see if it's us:
! String agentName= agentTokenizer.nextToken().toLowerCase();
! Integer precedenceInt= (Integer) robotNames.get(agentName);
! if (precedenceInt != null) {
! int precedence= precedenceInt.intValue();
! if ( (precedence < currentPrecedence)
! && (precedence < bestPrecedenceSoFar) )
! currentPrecedence= precedence;
! }
! }
! if (currentPrecedence < bestPrecedenceSoFar)
! addRules= true;
! } else if ( (line.length() >= 9)
! && (line.substring(0, 9).equalsIgnoreCase("Disallow:")) ) {
! doneAgents= true;
! String path= line.substring(line.indexOf(":") + 1);
! path= path.trim();
! try {
! path= URLDecoder.decode(path, CHARACTER_ENCODING);
! } catch (Exception e) {
! LOG.warning("error parsing robots rules- can't decode path: "
! + path);
! }
! if (path.length() == 0) { // "empty rule"
! if (addRules)
! currentRules.clearPrefixes();
! } else { // rule with path
! if (addRules)
! currentRules.addPrefix(path, false);
! }
! } else if ( (line.length() >= 6)
! && (line.substring(0, 6).equalsIgnoreCase("Allow:")) ) {
! doneAgents= true;
! String path= line.substring(line.indexOf(":") + 1);
! path= path.trim();
! if (path.length() == 0) {
! // "empty rule"- treat same as empty disallow
! if (addRules)
! currentRules.clearPrefixes();
! } else { // rule with path
! if (addRules)
! currentRules.addPrefix(path, true);
}
- }
- }
! if (currentPrecedence < bestPrecedenceSoFar) {
! bestPrecedenceSoFar= currentPrecedence;
! bestRulesSoFar= currentRules;
! }
! if (bestPrecedenceSoFar == NO_PRECEDENCE)
! return EMPTY_RULES;
! return bestRulesSoFar;
! }
! /**
! * Returns a <code>RobotRuleSet</code> object appropriate for use
! * when the <code>robots.txt</code> file is empty or missing; all
! * requests are allowed.
! */
! static RobotRuleSet getEmptyRules() {
! return EMPTY_RULES;
! }
! /**
! * Returns a <code>RobotRuleSet</code> object appropriate for use
! * when the <code>robots.txt</code> file is not fetched due to a
! * <code>403/Forbidden</code> response; all requests are
! * disallowed.
! */
! static RobotRuleSet getForbidAllRules() {
! RobotRuleSet rules= new RobotRuleSet();
! rules.addPrefix("", false);
! return rules;
! }
! private final static int BUFSIZE= 2048;
! /** command-line main for testing */
! public static void main(String[] argv) {
! if (argv.length != 3) {
! System.out.println("Usage:");
! System.out.println(" java <robots-file> <url-file> <agent-name>+");
! System.out.println("");
! System.out.println("The <robots-file> will be parsed as a robots.txt file,");
! System.out.println("using the given <agent-name> to select rules. URLs ");
! System.out.println("will be read (one per line) from <url-file>, and tested");
! System.out.println("against the rules.");
! System.exit(-1);
! }
! try {
! FileInputStream robotsIn= new FileInputStream(argv[0]);
! LineNumberReader testsIn= new LineNumberReader(new FileReader(argv[1]));
! String[] robotNames= new String[argv.length - 1];
! for (int i= 0; i < argv.length - 2; i++)
! robotNames[i]= argv[i+2];
! ArrayList bufs= new ArrayList();
! byte[] buf= new byte[BUFSIZE];
! int totBytes= 0;
! int rsize= robotsIn.read(buf);
! while (rsize >= 0) {
! totBytes+= rsize;
! if (rsize != BUFSIZE) {
! byte[] tmp= new byte[rsize];
! System.arraycopy(buf, 0, tmp, 0, rsize);
! bufs.add(tmp);
! } else {
! bufs.add(buf);
! buf= new byte[BUFSIZE];
! }
! rsize= robotsIn.read(buf);
! }
! byte[] robotsBytes= new byte[totBytes];
! int pos= 0;
! for (int i= 0; i < bufs.size(); i++) {
! byte[] currBuf= (byte[]) bufs.get(i);
! int currBufLen= currBuf.length;
! System.arraycopy(currBuf, 0, robotsBytes, pos, currBufLen);
! pos+= currBufLen;
! }
! RobotRulesParser parser=
! new RobotRulesParser(robotNames);
! RobotRuleSet rules= parser.parseRules(robotsBytes);
! System.out.println("Rules:");
! System.out.println(rules);
! System.out.println();
! String testPath= testsIn.readLine().trim();
! while (testPath != null) {
! System.out.println( (rules.isAllowed(testPath) ?
! "allowed" : "not allowed")
! + ":\t" + testPath);
! testPath= testsIn.readLine();
! }
! } catch (Exception e) {
! e.printStackTrace();
}
- }
}
--- 22,379 ----
/**
* This class handles the parsing of <code>robots.txt</code> files.
+ * It emits RobotRules objects, which describe the download permissions
+ * as described in RobotRulesParser.
+ *
+ * @author Tom Pierce, modified by Mike Cafarella
*/
public class RobotRulesParser {
+ public static final Logger LOG=
+ LogFormatter.getLogger("net.nutch.fetcher.RobotRulesParser");
! private HashMap robotNames;
! private static final String CHARACTER_ENCODING= "UTF-8";
! private static final int NO_PRECEDENCE= Integer.MAX_VALUE;
! private static final RobotRuleSet EMPTY_RULES= new RobotRuleSet();
! /**
! * This class holds the rules which were parsed from a robots.txt
! * file, and can test paths against those rules.
! */
! public static class RobotRuleSet {
! ArrayList tmpEntries;
! RobotsEntry[] entries;
! long expireTime;
! /**
! */
! private class RobotsEntry {
! String prefix;
! boolean allowed;
! RobotsEntry(String prefix, boolean allowed) {
! this.prefix= prefix;
! this.allowed= allowed;
! }
! }
! /**
! * should not be instantiated from outside RobotRulesParser
! */
! private RobotRuleSet() {
! tmpEntries= new ArrayList();
! entries= null;
! }
! /**
! */
! private void addPrefix(String prefix, boolean allow) {
! if (tmpEntries == null) {
! tmpEntries= new ArrayList();
! if (entries != null) {
! for (int i= 0; i < entries.length; i++)
! tmpEntries.add(entries[i]);
! }
! entries= null;
! }
! tmpEntries.add(new RobotsEntry(prefix, allow));
}
! /**
! */
! private void clearPrefixes() {
! if (tmpEntries == null) {
! tmpEntries= new ArrayList();
! entries= null;
! } else {
! tmpEntries.clear();
! }
! }
! /**
! * Change when the ruleset goes stale.
! */
! public void setExpireTime(long expireTime) {
! this.expireTime = expireTime;
! }
! /**
! * Get expire time
! */
! public long getExpireTime() {
! return expireTime;
! }
! /**
! * Returns <code>false</code> if the <code>robots.txt</code> file
! * prohibits us from accessing the given <code>path</code>, or
! * <code>true</code> otherwise.
! */
! public boolean isAllowed(String path) {
! try {
! path= URLDecoder.decode(path, CHARACTER_ENCODING);
! } catch (Exception e) {
! // just ignore it- we can still try to match
! // path prefixes
! }
! if (entries == null) {
! entries= new RobotsEntry[tmpEntries.size()];
! entries= (RobotsEntry[])
! tmpEntries.toArray(entries);
! tmpEntries= null;
! }
! int pos= 0;
! int end= entries.length;
! while (pos < end) {
! if (path.startsWith(entries[pos].prefix))
! return entries[pos].allowed;
! pos++;
! }
! return true;
! }
! /**
! */
! public String toString() {
! isAllowed("x"); // force String[] representation
! StringBuffer buf= new StringBuffer();
! for (int i= 0; i < entries.length; i++)
! if (entries[i].allowed)
! buf.append("Allow: " + entries[i].prefix
! + System.getProperty("line.separator"));
! else
! buf.append("Disallow: " + entries[i].prefix
! + System.getProperty("line.separator"));
! return buf.toString();
! }
! }
! /**
! * Creates a new <code>RobotRulesParser</code> which will use the
! * supplied <code>robotNames</code> when choosing which stanza to
! * follow in <code>robots.txt</code> files. Any name in the array
! * may be matched. The order of the <code>robotNames</code>
! * determines the precedence- if many names are matched, only the
! * rules associated with the robot name having the smallest index
! * will be used.
! */
! public RobotRulesParser(String[] robotNames) {
! this.robotNames= new HashMap();
! for (int i= 0; i < robotNames.length; i++) {
! this.robotNames.put(robotNames[i].toLowerCase(), new Integer(i));
! }
! // always make sure "*" is included
! if (!this.robotNames.containsKey("*"))
! this.robotNames.put("*", new Integer(robotNames.length));
}
! /**
!    * Returns a {@link RobotRuleSet} object which encapsulates the
! * rules parsed from the supplied <code>robotContent</code>.
! */
! RobotRuleSet parseRules(byte[] robotContent) {
! if (robotContent == null)
! return EMPTY_RULES;
! String content= new String (robotContent);
! StringTokenizer lineParser= new StringTokenizer(content, "\n\r");
! RobotRuleSet bestRulesSoFar= null;
! int bestPrecedenceSoFar= NO_PRECEDENCE;
! RobotRuleSet currentRules= new RobotRuleSet();
! int currentPrecedence= NO_PRECEDENCE;
! boolean addRules= false; // in stanza for our robot
! boolean doneAgents= false; // detect multiple agent lines
! while (lineParser.hasMoreTokens()) {
! String line= lineParser.nextToken();
! // trim out comments and whitespace
! int hashPos= line.indexOf("#");
! if (hashPos >= 0)
! line= line.substring(0, hashPos);
! line= line.trim();
! if ( (line.length() >= 11)
! && (line.substring(0, 11).equalsIgnoreCase("User-agent:")) ) {
! if (doneAgents) {
! if (currentPrecedence < bestPrecedenceSoFar) {
! bestPrecedenceSoFar= currentPrecedence;
! bestRulesSoFar= currentRules;
! currentPrecedence= NO_PRECEDENCE;
! currentRules= new RobotRuleSet();
! }
! addRules= false;
! }
! doneAgents= false;
! String agentNames= line.substring(line.indexOf(":") + 1);
! agentNames= agentNames.trim();
! StringTokenizer agentTokenizer= new StringTokenizer(agentNames);
! while (agentTokenizer.hasMoreTokens()) {
! // for each agent listed, see if it's us:
! String agentName= agentTokenizer.nextToken().toLowerCase();
! Integer precedenceInt= (Integer) robotNames.get(agentName);
! if (precedenceInt != null) {
! int precedence= precedenceInt.intValue();
! if ( (precedence < currentPrecedence)
! && (precedence < bestPrecedenceSoFar) )
! currentPrecedence= precedence;
! }
! }
! if (currentPrecedence < bestPrecedenceSoFar)
! addRules= true;
! } else if ( (line.length() >= 9)
! && (line.substring(0, 9).equalsIgnoreCase("Disallow:")) ) {
! doneAgents= true;
! String path= line.substring(line.indexOf(":") + 1);
! path= path.trim();
! try {
! path= URLDecoder.decode(path, CHARACTER_ENCODING);
! } catch (Exception e) {
! LOG.warning("error parsing robots rules- can't decode path: "
! + path);
! }
! if (path.length() == 0) { // "empty rule"
! if (addRules)
! currentRules.clearPrefixes();
! } else { // rule with path
! if (addRules)
! currentRules.addPrefix(path, false);
! }
! } else if ( (line.length() >= 6)
! && (line.substring(0, 6).equalsIgnoreCase("Allow:")) ) {
! doneAgents= true;
! String path= line.substring(line.indexOf(":") + 1);
! path= path.trim();
! if (path.length() == 0) {
! // "empty rule"- treat same as empty disallow
! if (addRules)
! currentRules.clearPrefixes();
! } else { // rule with path
! if (addRules)
! currentRules.addPrefix(path, true);
! }
! }
}
! if (currentPrecedence < bestPrecedenceSoFar) {
! bestPrecedenceSoFar= currentPrecedence;
! bestRulesSoFar= currentRules;
! }
! if (bestPrecedenceSoFar == NO_PRECEDENCE)
! return EMPTY_RULES;
! return bestRulesSoFar;
! }
! /**
! * Returns a <code>RobotRuleSet</code> object appropriate for use
! * when the <code>robots.txt</code> file is empty or missing; all
! * requests are allowed.
! */
! static RobotRuleSet getEmptyRules() {
! return EMPTY_RULES;
! }
! /**
! * Returns a <code>RobotRuleSet</code> object appropriate for use
! * when the <code>robots.txt</code> file is not fetched due to a
! * <code>403/Forbidden</code> response; all requests are
! * disallowed.
! */
! static RobotRuleSet getForbidAllRules() {
! RobotRuleSet rules= new RobotRuleSet();
! rules.addPrefix("", false);
! return rules;
! }
! private final static int BUFSIZE= 2048;
! /** command-line main for testing */
! public static void main(String[] argv) {
! if (argv.length != 3) {
! System.out.println("Usage:");
! System.out.println(" java <robots-file> <url-file> <agent-name>+");
! System.out.println("");
!       System.out.println("The <robots-file> will be parsed as a robots.txt file,");
!       System.out.println("using the given <agent-name> to select rules. URLs ");
!       System.out.println("will be read (one per line) from <url-file>, and tested");
! System.out.println("against the rules.");
! System.exit(-1);
! }
! try {
! FileInputStream robotsIn= new FileInputStream(argv[0]);
! LineNumberReader testsIn= new LineNumberReader(new FileReader(argv[1]));
! String[] robotNames= new String[argv.length - 1];
! for (int i= 0; i < argv.length - 2; i++)
! robotNames[i]= argv[i+2];
! ArrayList bufs= new ArrayList();
! byte[] buf= new byte[BUFSIZE];
! int totBytes= 0;
! int rsize= robotsIn.read(buf);
! while (rsize >= 0) {
! totBytes+= rsize;
! if (rsize != BUFSIZE) {
! byte[] tmp= new byte[rsize];
! System.arraycopy(buf, 0, tmp, 0, rsize);
! bufs.add(tmp);
! } else {
! bufs.add(buf);
! buf= new byte[BUFSIZE];
! }
! rsize= robotsIn.read(buf);
! }
! byte[] robotsBytes= new byte[totBytes];
! int pos= 0;
! for (int i= 0; i < bufs.size(); i++) {
! byte[] currBuf= (byte[]) bufs.get(i);
! int currBufLen= currBuf.length;
! System.arraycopy(currBuf, 0, robotsBytes, pos, currBufLen);
! pos+= currBufLen;
! }
! RobotRulesParser parser=
! new RobotRulesParser(robotNames);
! RobotRuleSet rules= parser.parseRules(robotsBytes);
! System.out.println("Rules:");
! System.out.println(rules);
! System.out.println();
! String testPath= testsIn.readLine().trim();
! while (testPath != null) {
! System.out.println( (rules.isAllowed(testPath) ?
! "allowed" : "not allowed")
! + ":\t" + testPath);
! testPath= testsIn.readLine();
! }
! } catch (Exception e) {
! e.printStackTrace();
! }
}
}
Index: Fetcher.java
===================================================================
RCS file: /cvsroot/nutch/nutch/src/java/net/nutch/fetcher/Fetcher.java,v
retrieving revision 1.31
retrieving revision 1.32
diff -C2 -d -r1.31 -r1.32
*** Fetcher.java 13 Feb 2004 19:53:48 -0000 1.31
--- Fetcher.java 6 Apr 2004 23:20:51 -0000 1.32
***************
*** 5,9 ****
import net.nutch.net.protocols.Response;
-
import net.nutch.pagedb.FetchListEntry;
import net.nutch.net.protocols.http.Http;
--- 5,8 ----
***************
*** 15,23 ****
import java.io.*;
import java.net.*;
[...973 lines suppressed...]
! } else if (args[i].equals("-verbose")) { // found -verbose option
! verbose = true;
! } else if (i != args.length-1) {
! System.err.println(usage);
! System.exit(-1);
! } else // root is required parameter
! directory = args[i];
! }
!
! Fetcher fetcher = new Fetcher(directory); // make a Fetcher
! if (timeout != -1) // set timeout option
! fetcher.getHttp().setTimeout(timeout);
! if (threadCount != -1) // set threadCount option
! fetcher.setThreadCount(threadCount);
! // set log level
! fetcher.setLogLevel(verbose ? Level.FINE : Level.INFO);
!
! fetcher.run(); // run the Fetcher
! }
}
-------------------------------------------------------
This SF.Net email is sponsored by: IBM Linux Tutorials
Free Linux tutorial presented by Daniel Robbins, President and CEO of
GenToo technologies. Learn everything from fundamentals to system
administration.http://ads.osdn.com/?ad_id=1470&alloc_id=3638&op=click
_______________________________________________
Nutch-cvs mailing list
Nutch-cvs@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/nutch-cvs