Modified: lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/RobotRulesParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/RobotRulesParser.java?rev=357197&r1=357196&r2=357197&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/RobotRulesParser.java (original)
+++ lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/RobotRulesParser.java Fri Dec 16 09:51:05 2005
@@ -53,6 +53,9 @@
   private static final boolean ALLOW_FORBIDDEN =
     NutchConf.get().getBoolean("http.robots.403.allow", false);
 
+  private static final int MAX_REDIRECTS =
+    NutchConf.get().getInt("http.redirect.max", 3);
+
   private static final String[] AGENTS = getAgents();
   private static final Hashtable CACHE = new Hashtable();
 
@@ -377,16 +380,30 @@
     RobotRuleSet robotRules = (RobotRuleSet)CACHE.get(host);
 
     if (robotRules == null) {                     // cache miss
-      HttpResponse response = new HttpResponse(new URL(url, "/robots.txt"));
-
-      if (response.getCode() == 200)              // found rules: parse them
-        robotRules = new RobotRulesParser().parseRules(response.getContent());
-      else if ( (response.getCode() == 403) && (!ALLOW_FORBIDDEN) )
-        robotRules = FORBID_ALL_RULES;            // use forbid all
-      else
-        robotRules = EMPTY_RULES;                 // use default rules
+      int redirects = 0;
+      do {
+        HttpResponse response = new HttpResponse(new URL(url, "/robots.txt"));
+
+        int code = response.getCode();
+
+        if (code == 200) {                        // found rules: parse them
+          robotRules = new RobotRulesParser().parseRules(response.getContent());
+        } else if ( (code == 403) && (!ALLOW_FORBIDDEN) ) {
+          robotRules = FORBID_ALL_RULES;          // use forbid all
+        } else if (code >= 300 && code < 400) {   // handle redirect
+          if (redirects == MAX_REDIRECTS) {
+            robotRules = EMPTY_RULES;
+          } else {
+            url = new URL(url, response.getHeader("Location"));
+            LOG.fine("redirect to " + url);
+            redirects++;
+          }
+        } else {
+          robotRules = EMPTY_RULES;               // use default rules
+        }
+      } while (robotRules == null);
 
-      CACHE.put(host, robotRules);                // cache rules for host
+      CACHE.put(host, robotRules);                // cache rules for host
     }
 
     String path = url.getPath();                  // check rules
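The pattern the patch introduces above (follow robots.txt redirects by hand, but give up after http.redirect.max hops) can be read in isolation in the rough sketch below. It uses plain java.net rather than Nutch's HttpResponse and RobotRuleSet classes, and the class and method names (RobotsFetchSketch, fetchRobots) are invented for illustration only:

    import java.io.ByteArrayOutputStream;
    import java.io.IOException;
    import java.io.InputStream;
    import java.net.HttpURLConnection;
    import java.net.URL;

    /** Illustrative only: not Nutch code. */
    public class RobotsFetchSketch {

      private static final int MAX_REDIRECTS = 3;  // mirrors the http.redirect.max default

      /** Returns the robots.txt body, or null when no usable rules were served. */
      static byte[] fetchRobots(URL site) throws IOException {
        URL target = new URL(site, "/robots.txt");
        for (int redirects = 0; redirects <= MAX_REDIRECTS; redirects++) {
          HttpURLConnection conn = (HttpURLConnection) target.openConnection();
          conn.setInstanceFollowRedirects(false);  // count redirects ourselves
          int code = conn.getResponseCode();

          if (code == 200) {                       // found rules: return them
            InputStream in = conn.getInputStream();
            try {
              ByteArrayOutputStream out = new ByteArrayOutputStream();
              byte[] buf = new byte[4096];
              int n;
              while ((n = in.read(buf)) != -1) {
                out.write(buf, 0, n);
              }
              return out.toByteArray();
            } finally {
              in.close();
            }
          } else if (code >= 300 && code < 400) {  // redirect: follow it, bounded
            String location = conn.getHeaderField("Location");
            if (location == null) {
              return null;                         // malformed redirect: treat as no rules
            }
            target = new URL(target, location);    // resolve a relative Location header
          } else {
            return null;                           // 403, 404, 5xx, ...: no rules
          }
        }
        return null;                               // gave up after MAX_REDIRECTS hops
      }
    }

As in the patch itself, hitting the redirect cap is treated the same as finding no robots.txt at all, so the caller falls back to the default (empty) rule set.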
Modified: lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java?rev=357197&r1=357196&r2=357197&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java (original)
+++ lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java Fri Dec 16 09:51:05 2005
@@ -60,10 +60,14 @@
   }
 
   public HttpResponse(URL url) throws IOException {
+    this(url, false);
+  }
+
+  HttpResponse(URL url, boolean followRedirects) throws IOException {
     this.base = url.toString();
     this.orig = url.toString();
     GetMethod get = new GetMethod(this.orig);
-    get.setFollowRedirects(false);
+    get.setFollowRedirects(followRedirects);
     get.setRequestHeader("User-Agent", Http.AGENT_STRING);
     HttpMethodParams params = get.getParams();
     // some servers cannot digest the new protocol

Modified: lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/RobotRulesParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/RobotRulesParser.java?rev=357197&r1=357196&r2=357197&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/RobotRulesParser.java (original)
+++ lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/RobotRulesParser.java Fri Dec 16 09:51:05 2005
@@ -379,7 +379,8 @@
     if (robotRules == null) {                     // cache miss
       LOG.fine("cache miss " + url);
       try {
-        HttpResponse response = new HttpResponse(new URL(url, "/robots.txt"));
+        HttpResponse response = new HttpResponse(new URL(url, "/robots.txt"),
+                                                 true);
 
         if (response.getCode() == 200)            // found rules: parse them
           robotRules = new RobotRulesParser().parseRules(response.getContent());
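The protocol-httpclient plugin reaches the same goal the other way around: the robots.txt request simply asks commons-httpclient to follow redirects itself via setFollowRedirects(true), while ordinary page fetches keep the old non-following behaviour. A minimal sketch of that library-level switch, assuming commons-httpclient 3.x on the classpath (the RedirectSketch class, the fetch helper and the agent string are made up for illustration):

    import java.io.IOException;

    import org.apache.commons.httpclient.HttpClient;
    import org.apache.commons.httpclient.methods.GetMethod;

    /** Illustrative only: not Nutch code. */
    public class RedirectSketch {

      /** Fetches a url, optionally letting the library chase redirects for us. */
      static String fetch(String url, boolean followRedirects) throws IOException {
        HttpClient client = new HttpClient();
        GetMethod get = new GetMethod(url);
        get.setFollowRedirects(followRedirects);         // true for robots.txt, false for pages
        get.setRequestHeader("User-Agent", "sketch-agent");  // placeholder agent string
        try {
          int code = client.executeMethod(get);
          return (code == 200) ? get.getResponseBodyAsString() : null;
        } finally {
          get.releaseConnection();
        }
      }
    }

When redirects are enabled this way, commons-httpclient applies its own ceiling on how many hops it will follow, so no manual counting is needed on this path.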
Modified: lucene/nutch/trunk/src/site/src/documentation/content/xdocs/tutorial.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/site/src/documentation/content/xdocs/tutorial.xml?rev=357197&r1=357196&r2=357197&view=diff
==============================================================================
--- lucene/nutch/trunk/src/site/src/documentation/content/xdocs/tutorial.xml (original)
+++ lucene/nutch/trunk/src/site/src/documentation/content/xdocs/tutorial.xml Fri Dec 16 09:51:05 2005
@@ -66,11 +66,11 @@
 
 <ol>
 
-<li>Create a flat file of root urls. For example, to crawl the
-<code>nutch</code> site you might start with a file named
-<code>urls</code> containing just the Nutch home page. All other
-Nutch pages should be reachable from this page. The <code>urls</code>
-file would thus look like:
+<li>Create a directory with a flat file of root urls. For example, to
+crawl the <code>nutch</code> site you might start with a file named
+<code>urls/nutch</code> containing the url of just the Nutch home
+page. All other Nutch pages should be reachable from this page. The
+<code>urls/nutch</code> file would thus contain:
 <source>
 http://lucene.apache.org/nutch/
 </source>
@@ -97,24 +97,28 @@
 <ul>
 <li><code>-dir</code> <em>dir</em> names the directory to put the
 crawl in.</li>
-<li><code>-depth</code> <em>depth</em> indicates the link depth from the root
-page that should be crawled.</li>
-<li><code>-delay</code> <em>delay</em> determines the number of seconds
-between accesses to each host.</li>
 <li><code>-threads</code> <em>threads</em> determines the number of
 threads that will fetch in parallel.</li>
+<li><code>-depth</code> <em>depth</em> indicates the link depth from the root
+page that should be crawled.</li>
+<li><code>-topN</code> <em>N</em> determines the maximum number of pages that
+will be retrieved at each level up to the depth.</li>
 </ul>
 
 <p>For example, a typical call might be:</p>
 
 <source>
-bin/nutch crawl urls -dir crawl.test -depth 3 >& crawl.log
+bin/nutch crawl urls -dir crawl -depth 3 -topN 50
 </source>
 
-<p>Typically one starts testing one's configuration by crawling at low
-depths, and watching the output to check that desired pages are found.
-Once one is more confident of the configuration, then an appropriate
-depth for a full crawl is around 10.</p>
+<p>Typically one starts testing one's configuration by crawling at
+shallow depths, sharply limiting the number of pages fetched at each
+level (<code>-topN</code>), and watching the output to check that
+desired pages are fetched and undesirable pages are not. Once one is
+confident of the configuration, then an appropriate depth for a full
+crawl is around 10. The number of pages per level
+(<code>-topN</code>) for a full crawl can be from tens of thousands to
+millions, depending on your resources.</p>
 
 <p>Once crawling has completed, one can skip to the Searching section
 below.</p>
@@ -131,54 +135,62 @@
 <section>
 <title>Whole-web: Concepts</title>
-<p>Nutch data is of two types:</p>
+<p>Nutch data is composed of:</p>
 
 <ol>
-  <li>The web database. This contains information about every
-page known to Nutch, and about links between those pages.</li>
-  <li>A set of segments. Each segment is a set of pages that are
-fetched and indexed as a unit. Segment data consists of the
-following types:</li>
+
+  <li>The crawl database, or <em>crawldb</em>. This contains
+information about every url known to Nutch, including whether it was
+fetched, and, if so, when.</li>
+
+  <li>The link database, or <em>linkdb</em>. This contains the list
+of known links to each url, including both the source url and anchor
+text of the link.</li>
+
+  <li>A set of <em>segments</em>. Each segment is a set of urls that are
+fetched as a unit. Segments are directories with the following
+subdirectories:</li>
+
 <li><ul>
-    <li>a <em>fetchlist</em> is a file
-that names a set of pages to be fetched</li>
-    <li>the<em> fetcher output</em> is a
-set of files containing the fetched pages</li>
-    <li>the <em>index </em>is a
-Lucene-format index of the fetcher output.</li>
+    <li>a <em>crawl_generate</em> names a set of urls to be fetched</li>
+    <li>a <em>crawl_fetch</em> contains the status of fetching each url</li>
+    <li>a <em>content</em> contains the content of each url</li>
+    <li>a <em>parse_text</em> contains the parsed text of each url</li>
+    <li>a <em>parse_data</em> contains outlinks and metadata parsed
+    from each url</li>
+    <li>a <em>crawl_parse</em> contains the outlink urls, used to
+    update the crawldb</li>
 </ul></li>
+
+<li>The <em>indexes</em>are Lucene-format indexes.</li>
+
 </ol>
 
-<p>In the following examples we will keep our web database in a directory
-named <code>db</code> and our segments
-in a directory named <code>segments</code>:</p>
-<source>mkdir db
-mkdir segments</source>
 
 </section>
 
 <section>
 <title>Whole-web: Boostrapping the Web Database</title>
 
-<p>The admin tool is used to create a new, empty database:</p>
-
-<source>bin/nutch admin db -create</source>
-<p>The <em>injector</em> adds urls into the database. Let's inject
-URLs from the <a href="http://dmoz.org/">DMOZ</a> Open
-Directory. First we must download and uncompress the file listing all
-of the DMOZ pages. (This is a 200+Mb file, so this will take a few
-minutes.)</p>
+<p>The <em>injector</em> adds urls to the crawldb. Let's inject URLs
+from the <a href="http://dmoz.org/">DMOZ</a> Open Directory. First we
+must download and uncompress the file listing all of the DMOZ pages.
+(This is a 200+Mb file, so this will take a few minutes.)</p>
 
 <source>wget http://rdf.dmoz.org/rdf/content.rdf.u8.gz
 gunzip content.rdf.u8.gz</source>
 
-<p>Next we inject a random subset of these pages into the web database.
+<p>Next we select a random subset of these pages.
 (We use a random subset so that everyone who runs this tutorial
 doesn't hammer the same sites.) DMOZ contains around three million
-URLs. We inject one out of every 3000, so that we end up with
+URLs. We select one out of every 5000, so that we end up with
 around 1000 URLs:</p>
 
-<source>bin/nutch inject db -dmozfile content.rdf.u8 -subset 3000</source>
+<source>mkdir dmoz
+bin/nutch org.apache.nutch.crawl.DmozParser content.rdf.u8 -subset 5000 > dmoz/urls</source>
 
-<p>This also takes a few minutes, as it must parse the full file.</p>
+<p>The parser also takes a few minutes, as it must parse the full
+file. Finally, we initialize the crawl db with the selected urls.</p>
+
+<source>bin/nutch inject crawl/crawldb dmoz</source>
 
 <p>Now we have a web database with around 1000 as-yet unfetched URLs
 in it.</p>
 
@@ -186,39 +198,39 @@
 <section>
 <title>Whole-web: Fetching</title>
 <p>To fetch, we first generate a fetchlist from the database:</p>
-<source>bin/nutch generate db segments
+<source>bin/nutch generate crawl/crawldb crawl/segments
 </source>
 <p>This generates a fetchlist for all of the pages due to be fetched.
 The fetchlist is placed in a newly created segment directory. The
 segment directory is named by the time it's created.
 We save the name of this segment in the shell variable
 <code>s1</code>:</p>
-<source>s1=`ls -d segments/2* | tail -1`
+<source>s1=`ls -d crawl/segments/2* | tail -1`
 echo $s1
 </source>
 <p>Now we run the fetcher on this segment with:</p>
 <source>bin/nutch fetch $s1</source>
 <p>When this is complete, we update the database with the results of
 the fetch:</p>
-<source>bin/nutch updatedb db $s1</source>
+<source>bin/nutch updatedb crawl/crawldb $s1</source>
 <p>Now the database has entries for all of the pages referenced by the
 initial set.</p>
 
 <p>Now we fetch a new segment with the top-scoring 1000 pages:</p>
-<source>bin/nutch generate db segments -topN 1000
-s2=`ls -d segments/2* | tail -1`
+<source>bin/nutch generate crawl/crawldb crawl/segments -topN 1000
+s2=`ls -d crawl/segments/2* | tail -1`
 echo $s2
 bin/nutch fetch $s2
-bin/nutch updatedb db $s2
+bin/nutch updatedb crawl/crawldb $s2
 </source>
 <p>Let's fetch one more round:</p>
 <source>
-bin/nutch generate db segments -topN 1000
-s3=`ls -d segments/2* | tail -1`
+bin/nutch generate crawl/crawldb crawl/segments -topN 1000
+s3=`ls -d crawl/segments/2* | tail -1`
 echo $s3
 bin/nutch fetch $s3
-bin/nutch updatedb db $s3
+bin/nutch updatedb crawl/crawldb $s3
 </source>
 
 <p>By this point we've fetched a few thousand pages. Let's index
@@ -227,16 +239,20 @@
 </section>
 <section>
 <title>Whole-web: Indexing</title>
-<p>To index each segment we use the <code>index</code>
-command, as follows:</p>
-<source>bin/nutch index $s1
-bin/nutch index $s2
-bin/nutch index $s3</source>
-<p>Then, before we can search a set of segments, we need to delete
-duplicate pages. This is done with:</p>
+<p>Before indexing we first invert all of the links, so that we may
+index incoming anchor text with the pages.</p>
+
+<source>bin/nutch invertlinks crawl/linkdb crawl/segments</source>
+
+<p>To index the segments we use the <code>index</code> command, as follows:</p>
+
+<source>bin/nutch index indexes crawl/linkdb crawl/segments/*</source>
+
+<!-- <p>Then, before we can search a set of segments, we need to delete -->
+<!-- duplicate pages. This is done with:</p> -->
 
-<source>bin/nutch dedup segments dedup.tmp</source>
+<!-- <source>bin/nutch dedup indexes</source> -->
 
 <p>Now we're ready to search!</p>
 
@@ -256,10 +272,8 @@
 <source>
 cp nutch*.war ~/local/tomcat/webapps/ROOT.war
 </source>
 
-<p>The webapp finds its indexes in <code>./segments</code>, relative
-to where you start Tomcat, so, if you've done intranet crawling,
-connect to your crawl directory, or, if you've done whole-web
-crawling, don't change directories, and give the command:</p>
+<p>The webapp finds its indexes in <code>./crawl</code>, relative
+to where you start Tomcat, so use a command like:</p>
 <source>~/local/tomcat/bin/catalina.sh start
 </source>

Modified: lucene/nutch/trunk/src/test/org/apache/nutch/io/TestSequenceFile.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/test/org/apache/nutch/io/TestSequenceFile.java?rev=357197&r1=357196&r2=357197&view=diff
==============================================================================
--- lucene/nutch/trunk/src/test/org/apache/nutch/io/TestSequenceFile.java (original)
+++ lucene/nutch/trunk/src/test/org/apache/nutch/io/TestSequenceFile.java Fri Dec 16 09:51:05 2005
@@ -42,7 +42,7 @@
     NutchFileSystem nfs = new LocalFileSystem();
     try {
       //LOG.setLevel(Level.FINE);
-      writeTest(nfs, count, seed, file);
+      writeTest(nfs, count, seed, file, false);
       readTest(nfs, count, seed, file);
 
       sortTest(nfs, count, megabytes, factor, false, file);
@@ -61,12 +61,14 @@
     }
   }
 
-  private static void writeTest(NutchFileSystem nfs, int count, int seed, String file)
+  private static void writeTest(NutchFileSystem nfs, int count, int seed,
+                                String file, boolean compress)
     throws IOException {
     new File(file).delete();
     LOG.fine("creating with " + count + " records");
     SequenceFile.Writer writer =
-      new SequenceFile.Writer(nfs, file, RandomDatum.class, RandomDatum.class);
+      new SequenceFile.Writer(nfs, file, RandomDatum.class, RandomDatum.class,
+                              compress);
     RandomDatum.Generator generator = new RandomDatum.Generator(seed);
     for (int i = 0; i < count; i++) {
       generator.next();
@@ -210,8 +212,9 @@
     boolean check = false;
     boolean fast = false;
     boolean merge = false;
+    boolean compress = false;
    String file = null;
-    String usage = "Usage: SequenceFile (-local | -ndfs <namenode:port>) [-count N] [-megabytes M] [-factor F] [-nocreate] [-check] [-fast] [-merge] file";
+    String usage = "Usage: SequenceFile (-local | -ndfs <namenode:port>) [-count N] [-megabytes M] [-factor F] [-nocreate] [-check] [-fast] [-merge] [-compress] file";
 
     if (args.length == 0) {
       System.err.println(usage);
@@ -237,6 +240,8 @@
         fast = true;
       } else if (args[i].equals("-merge")) {
         merge = true;
+      } else if (args[i].equals("-compress")) {
+        compress = true;
       } else {
         // file is required parameter
         file = args[i];
@@ -249,6 +254,7 @@
     LOG.info("check = " + check);
     LOG.info("fast = " + fast);
     LOG.info("merge = " + merge);
+    LOG.info("compress = " + compress);
     LOG.info("file = " + file);
 
     int seed = 0;
@@ -256,7 +262,7 @@
     LOG.setLevel(Level.FINE);
 
     if (create && !merge) {
-      writeTest(nfs, count, seed, file);
+      writeTest(nfs, count, seed, file, compress);
       readTest(nfs, count, seed, file);
     }

Modified: lucene/nutch/trunk/src/web/jsp/anchors.jsp
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/web/jsp/anchors.jsp?rev=357197&r1=357196&r2=357197&view=diff
==============================================================================
--- lucene/nutch/trunk/src/web/jsp/anchors.jsp (original)
+++ lucene/nutch/trunk/src/web/jsp/anchors.jsp Fri Dec 16 09:51:05 2005
@@ -55,8 +55,10 @@
 <ul>
 <%
   String[] anchors = bean.getAnchors(details);
-  for (int i = 0; i < anchors.length; i++) {
+  if (anchors != null) {
+    for (int i = 0; i < anchors.length; i++) {
 %><li><%=Entities.encode(anchors[i])%>
+<% } %>
 <% } %>
 </ul>

Modified: lucene/nutch/trunk/src/webapps/jobtracker/jobdetails.jsp
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/webapps/jobtracker/jobdetails.jsp?rev=357197&r1=357196&r2=357197&view=diff
==============================================================================
--- lucene/nutch/trunk/src/webapps/jobtracker/jobdetails.jsp (original)
+++ lucene/nutch/trunk/src/webapps/jobtracker/jobdetails.jsp Fri Dec 16 09:51:05 2005
@@ -4,7 +4,7 @@
   import="javax.servlet.http.*"
   import="java.io.*"
   import="java.util.*"
-  import="org.apache.nutch.mapReduce.*"
+  import="org.apache.nutch.mapred.*"
 %>
 <%
   String jobid = request.getParameter("jobid");
@@ -12,6 +12,9 @@
   JobTracker.JobInProgress job = (JobTracker.JobInProgress) tracker.getJob(jobid);
   JobProfile profile = (job != null) ? (job.getProfile()) : null;
   JobStatus status = (job != null) ? (job.getStatus()) : null;
+
+  Vector mapTaskReports[] = tracker.getMapTaskReport(jobid);
+  Vector reduceTaskReports[] = tracker.getReduceTaskReport(jobid);
 %>
 
 <html>
@@ -35,10 +38,21 @@
   <h2>Map Tasks</h2>
   <center>
   <table border=2 cellpadding="5" cellspacing="2">
-  <tr><td align="center" colspan="4">Map Tasks</td></tr>
+  <tr><td align="center">Map Task Id</td><td>Pct Complete</td><td>State</td><td>Diagnostic Text</td></tr>
   <%
-
+    for (int i = 0; i < mapTaskReports.length; i++) {
+      Vector v = mapTaskReports[i];
+      out.print("<tr><td>" + v.elementAt(0) + "</td><td>" + v.elementAt(1) + "</td><td>" + v.elementAt(2) + "</td>");
+      if (v.size() == 3) {
+        out.print("<td></td>");
+      } else {
+        for (int j = 3; j < v.size(); j++) {
+          out.print("<td>" + v.elementAt(j) + "</td>");
+        }
+      }
+      out.print("</tr>\n");
+    }
   %>
   </table>
   </center>
 
@@ -46,6 +60,26 @@
 
   <h2>Reduce Tasks</h2>
+  <center>
+  <table border=2 cellpadding="5" cellspacing="2">
+  <tr><td align="center">Reduce Task Id</td><td>Pct Complete</td><td>State</td><td>Diagnostic Text</td></tr>
+
+  <%
+    for (int i = 0; i < reduceTaskReports.length; i++) {
+      Vector v = reduceTaskReports[i];
+      out.print("<tr><td>" + v.elementAt(0) + "</td><td>" + v.elementAt(1) + "</td><td>" + v.elementAt(2) + "</td>");
+      if (v.size() == 3) {
+        out.print("<td></td>");
+      } else {
+        for (int j = 3; j < v.size(); j++) {
+          out.print("<td>" + v.elementAt(j) + "</td>");
+        }
+      }
+      out.print("</tr>\n");
+    }
+  %>
+  </table>
+  </center>
 
   <hr>

Modified: lucene/nutch/trunk/src/webapps/jobtracker/jobtracker.jsp
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/webapps/jobtracker/jobtracker.jsp?rev=357197&r1=357196&r2=357197&view=diff
==============================================================================
--- lucene/nutch/trunk/src/webapps/jobtracker/jobtracker.jsp (original)
+++ lucene/nutch/trunk/src/webapps/jobtracker/jobtracker.jsp Fri Dec 16 09:51:05 2005
@@ -4,7 +4,7 @@
   import="javax.servlet.http.*"
   import="java.io.*"
   import="java.util.*"
-  import="org.apache.nutch.mapReduce.*"
+  import="org.apache.nutch.mapred.*"
 %>
 <%!
   JobTracker tracker = JobTracker.getTracker();
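Both new loops in jobdetails.jsp render a task report Vector the same way: elements 0 through 2 are the task id, percent complete, and state, and anything after that is diagnostic text, with an empty cell emitted when there is none. Below is a rough sketch of that shared row-building logic pulled out into a plain Java helper purely for illustration (the TaskReportRows and toRow names are invented; the JSP above inlines this twice):

    import java.util.Vector;

    /** Illustrative only: mirrors the row-building logic inlined in jobdetails.jsp. */
    public class TaskReportRows {

      /** Renders one task report (id, pct complete, state, diagnostics...) as a table row. */
      static String toRow(Vector report) {
        StringBuffer row = new StringBuffer("<tr>");
        // First three elements: task id, percent complete, state.
        for (int i = 0; i < 3; i++) {
          row.append("<td>").append(report.elementAt(i)).append("</td>");
        }
        if (report.size() == 3) {
          row.append("<td></td>");               // no diagnostic text
        } else {
          for (int j = 3; j < report.size(); j++) {
            row.append("<td>").append(report.elementAt(j)).append("</td>");
          }
        }
        return row.append("</tr>\n").toString();
      }
    }

Factoring the loop out like this would also keep the map and reduce tables from drifting apart as columns are added.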