Modified: lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/RobotRulesParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/RobotRulesParser.java?rev=357197&r1=357196&r2=357197&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/RobotRulesParser.java (original)
+++ lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/RobotRulesParser.java Fri Dec 16 09:51:05 2005
@@ -53,6 +53,9 @@
   private static final boolean ALLOW_FORBIDDEN =
     NutchConf.get().getBoolean("http.robots.403.allow", false);
 
+  private static final int MAX_REDIRECTS =
+    NutchConf.get().getInt("http.redirect.max", 3);
+
   private static final String[] AGENTS = getAgents();
   private static final Hashtable CACHE = new Hashtable();
 
@@ -377,16 +380,30 @@
     RobotRuleSet robotRules = (RobotRuleSet)CACHE.get(host);
 
     if (robotRules == null) {                     // cache miss
-      HttpResponse response = new HttpResponse(new URL(url, "/robots.txt"));
-
-      if (response.getCode() == 200)              // found rules: parse them
-        robotRules = new RobotRulesParser().parseRules(response.getContent());
-      else if ( (response.getCode() == 403) && (!ALLOW_FORBIDDEN) )
-        robotRules = FORBID_ALL_RULES;            // use forbid all
-      else
-        robotRules = EMPTY_RULES;                 // use default rules
+      int redirects = 0;
+      do {
+        HttpResponse response = new HttpResponse(new URL(url, "/robots.txt"));
+
+        int code = response.getCode();
+
+        if (code == 200) {                        // found rules: parse them
+          robotRules = new RobotRulesParser().parseRules(response.getContent());
+        } else if ( (code == 403) && (!ALLOW_FORBIDDEN) ) {
+          robotRules = FORBID_ALL_RULES;          // use forbid all
+        } else if (code >= 300 && code < 400) {   // handle redirect
+          if (redirects == MAX_REDIRECTS) {
+            robotRules = EMPTY_RULES;
+          } else {
+            url = new URL(url, response.getHeader("Location"));
+            LOG.fine("redirect to " + url);
+            redirects++;
+          }
+        } else {
+          robotRules = EMPTY_RULES;               // use default rules
+        }
+      } while (robotRules == null);
 
-      CACHE.put(host, robotRules);                // cache rules for host
+      CACHE.put(host, robotRules);                // cache rules for host
     }
 
     String path = url.getPath();                  // check rules
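The pattern the patch introduces above (follow robots.txt redirects by hand, but give up after http.redirect.max hops) can be read in isolation in the rough sketch below. It uses plain java.net rather than Nutch's HttpResponse and RobotRuleSet classes, and the class and method names (RobotsFetchSketch, fetchRobots) are invented for illustration only:

    import java.io.ByteArrayOutputStream;
    import java.io.IOException;
    import java.io.InputStream;
    import java.net.HttpURLConnection;
    import java.net.URL;

    /** Illustrative only: not Nutch code. */
    public class RobotsFetchSketch {

      private static final int MAX_REDIRECTS = 3;  // mirrors the http.redirect.max default

      /** Returns the robots.txt body, or null when no usable rules were served. */
      static byte[] fetchRobots(URL site) throws IOException {
        URL target = new URL(site, "/robots.txt");
        for (int redirects = 0; redirects <= MAX_REDIRECTS; redirects++) {
          HttpURLConnection conn = (HttpURLConnection) target.openConnection();
          conn.setInstanceFollowRedirects(false);  // count redirects ourselves
          int code = conn.getResponseCode();

          if (code == 200) {                       // found rules: return them
            InputStream in = conn.getInputStream();
            try {
              ByteArrayOutputStream out = new ByteArrayOutputStream();
              byte[] buf = new byte[4096];
              int n;
              while ((n = in.read(buf)) != -1) {
                out.write(buf, 0, n);
              }
              return out.toByteArray();
            } finally {
              in.close();
            }
          } else if (code >= 300 && code < 400) {  // redirect: follow it, bounded
            String location = conn.getHeaderField("Location");
            if (location == null) {
              return null;                         // malformed redirect: treat as no rules
            }
            target = new URL(target, location);    // resolve a relative Location header
          } else {
            return null;                           // 403, 404, 5xx, ...: no rules
          }
        }
        return null;                               // gave up after MAX_REDIRECTS hops
      }
    }

As in the patch itself, hitting the redirect cap is treated the same as finding no robots.txt at all, so the caller falls back to the default (empty) rule set.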
Modified: lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java?rev=357197&r1=357196&r2=357197&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java (original)
+++ lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java Fri Dec 16 09:51:05 2005
@@ -60,10 +60,14 @@
   }
 
   public HttpResponse(URL url) throws IOException {
+    this(url, false);
+  }
+
+  HttpResponse(URL url, boolean followRedirects) throws IOException {
     this.base = url.toString();
     this.orig = url.toString();
     GetMethod get = new GetMethod(this.orig);
-    get.setFollowRedirects(false);
+    get.setFollowRedirects(followRedirects);
     get.setRequestHeader("User-Agent", Http.AGENT_STRING);
     HttpMethodParams params = get.getParams();
     // some servers cannot digest the new protocol

Modified: lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/RobotRulesParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/RobotRulesParser.java?rev=357197&r1=357196&r2=357197&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/RobotRulesParser.java (original)
+++ lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/RobotRulesParser.java Fri Dec 16 09:51:05 2005
@@ -379,7 +379,8 @@
     if (robotRules == null) {                     // cache miss
       LOG.fine("cache miss " + url);
       try {
-        HttpResponse response = new HttpResponse(new URL(url, "/robots.txt"));
+        HttpResponse response = new HttpResponse(new URL(url, "/robots.txt"),
+                                                 true);
 
         if (response.getCode() == 200)            // found rules: parse them
           robotRules = new RobotRulesParser().parseRules(response.getContent());
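The protocol-httpclient plugin reaches the same goal the other way around: the robots.txt request simply asks commons-httpclient to follow redirects itself via setFollowRedirects(true), while ordinary page fetches keep the old non-following behaviour. A minimal sketch of that library-level switch, assuming commons-httpclient 3.x on the classpath (the RedirectSketch class, the fetch helper and the agent string are made up for illustration):

    import java.io.IOException;

    import org.apache.commons.httpclient.HttpClient;
    import org.apache.commons.httpclient.methods.GetMethod;

    /** Illustrative only: not Nutch code. */
    public class RedirectSketch {

      /** Fetches a url, optionally letting the library chase redirects for us. */
      static String fetch(String url, boolean followRedirects) throws IOException {
        HttpClient client = new HttpClient();
        GetMethod get = new GetMethod(url);
        get.setFollowRedirects(followRedirects);         // true for robots.txt, false for pages
        get.setRequestHeader("User-Agent", "sketch-agent");  // placeholder agent string
        try {
          int code = client.executeMethod(get);
          return (code == 200) ? get.getResponseBodyAsString() : null;
        } finally {
          get.releaseConnection();
        }
      }
    }

When redirects are enabled this way, commons-httpclient applies its own ceiling on how many hops it will follow, so no manual counting is needed on this path.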
Modified: lucene/nutch/trunk/src/site/src/documentation/content/xdocs/tutorial.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/site/src/documentation/content/xdocs/tutorial.xml?rev=357197&r1=357196&r2=357197&view=diff
==============================================================================
--- lucene/nutch/trunk/src/site/src/documentation/content/xdocs/tutorial.xml (original)
+++ lucene/nutch/trunk/src/site/src/documentation/content/xdocs/tutorial.xml Fri Dec 16 09:51:05 2005
@@ -66,11 +66,11 @@
 
 <ol>
 
-<li>Create a flat file of root urls. For example, to crawl the
-<code>nutch</code> site you might start with a file named
-<code>urls</code> containing just the Nutch home page. All other
-Nutch pages should be reachable from this page. The <code>urls</code>
-file would thus look like:
+<li>Create a directory with a flat file of root urls. For example, to
+crawl the <code>nutch</code> site you might start with a file named
+<code>urls/nutch</code> containing the url of just the Nutch home
+page. All other Nutch pages should be reachable from this page. The
+<code>urls/nutch</code> file would thus contain:
 <source>
 http://lucene.apache.org/nutch/
 </source>
@@ -97,24 +97,28 @@
 <ul>
 <li><code>-dir</code> <em>dir</em> names the directory to put the
 crawl in.</li>
-<li><code>-depth</code> <em>depth</em> indicates the link depth from the root
-page that should be crawled.</li>
-<li><code>-delay</code> <em>delay</em> determines the number of seconds
-between accesses to each host.</li>
 <li><code>-threads</code> <em>threads</em> determines the number of
 threads that will fetch in parallel.</li>
+<li><code>-depth</code> <em>depth</em> indicates the link depth from the root
+page that should be crawled.</li>
+<li><code>-topN</code> <em>N</em> determines the maximum number of pages that
+will be retrieved at each level up to the depth.</li>
 </ul>
 
 <p>For example, a typical call might be:</p>
 
 <source>
-bin/nutch crawl urls -dir crawl.test -depth 3 >& crawl.log
+bin/nutch crawl urls -dir crawl -depth 3 -topN 50
 </source>
 
-<p>Typically one starts testing one's configuration by crawling at low
-depths, and watching the output to check that desired pages are found.
-Once one is more confident of the configuration, then an appropriate
-depth for a full crawl is around 10.</p>
+<p>Typically one starts testing one's configuration by crawling at
+shallow depths, sharply limiting the number of pages fetched at each
+level (<code>-topN</code>), and watching the output to check that
+desired pages are fetched and undesirable pages are not. Once one is
+confident of the configuration, then an appropriate depth for a full
+crawl is around 10. The number of pages per level
+(<code>-topN</code>) for a full crawl can be from tens of thousands to
+millions, depending on your resources.</p>
 
 <p>Once crawling has completed, one can skip to the Searching section
 below.</p>
@@ -131,54 +135,62 @@
 <section>
 <title>Whole-web: Concepts</title>
-<p>Nutch data is of two types:</p>
+<p>Nutch data is composed of:</p>
 
 <ol>
-  <li>The web database. This contains information about every
-page known to Nutch, and about links between those pages.</li>
-  <li>A set of segments. Each segment is a set of pages that are
-fetched and indexed as a unit. Segment data consists of the
-following types:</li>
+
+  <li>The crawl database, or <em>crawldb</em>. This contains
+information about every url known to Nutch, including whether it was
+fetched, and, if so, when.</li>
+
+  <li>The link database, or <em>linkdb</em>. This contains the list
+of known links to each url, including both the source url and anchor
+text of the link.</li>
+
+  <li>A set of <em>segments</em>. Each segment is a set of urls that are
+fetched as a unit. Segments are directories with the following
+subdirectories:</li>
+
 <li><ul>
-    <li>a <em>fetchlist</em> is a file
-that names a set of pages to be fetched</li>
-    <li>the<em> fetcher output</em> is a
-set of files containing the fetched pages</li>
-    <li>the <em>index </em>is a
-Lucene-format index of the fetcher output.</li>
+    <li>a <em>crawl_generate</em> names a set of urls to be fetched</li>
+    <li>a <em>crawl_fetch</em> contains the status of fetching each url</li>
+    <li>a <em>content</em> contains the content of each url</li>
+    <li>a <em>parse_text</em> contains the parsed text of each url</li>
+    <li>a <em>parse_data</em> contains outlinks and metadata parsed
+    from each url</li>
+    <li>a <em>crawl_parse</em> contains the outlink urls, used to
+    update the crawldb</li>
 </ul></li>
+
+<li>The <em>indexes</em>are Lucene-format indexes.</li>
+
 </ol>
 
-<p>In the following examples we will keep our web database in a directory
-named <code>db</code> and our segments
-in a directory named <code>segments</code>:</p>
-<source>mkdir db
-mkdir segments</source>
 
 </section>
 
 <section>
 <title>Whole-web: Boostrapping the Web Database</title>
 
-<p>The admin tool is used to create a new, empty database:</p>
-
-<source>bin/nutch admin db -create</source>
-<p>The <em>injector</em> adds urls into the database. Let's inject
-URLs from the <a href="http://dmoz.org/">DMOZ</a> Open
-Directory. First we must download and uncompress the file listing all
-of the DMOZ pages. (This is a 200+Mb file, so this will take a few
-minutes.)</p>
+<p>The <em>injector</em> adds urls to the crawldb. Let's inject URLs
+from the <a href="http://dmoz.org/">DMOZ</a> Open Directory. First we
+must download and uncompress the file listing all of the DMOZ pages.
+(This is a 200+Mb file, so this will take a few minutes.)</p>
 
 <source>wget http://rdf.dmoz.org/rdf/content.rdf.u8.gz
 gunzip content.rdf.u8.gz</source>
 
-<p>Next we inject a random subset of these pages into the web database.
+<p>Next we select a random subset of these pages.
 (We use a random subset so that everyone who runs this tutorial
 doesn't hammer the same sites.) DMOZ contains around three million
-URLs. We inject one out of every 3000, so that we end up with
+URLs. We select one out of every 5000, so that we end up with
 around 1000 URLs:</p>
 
-<source>bin/nutch inject db -dmozfile content.rdf.u8 -subset 3000</source>
+<source>mkdir dmoz
+bin/nutch org.apache.nutch.crawl.DmozParser content.rdf.u8 -subset 5000 > dmoz/urls</source>
 
-<p>This also takes a few minutes, as it must parse the full file.</p>
+<p>The parser also takes a few minutes, as it must parse the full
+file. Finally, we initialize the crawl db with the selected urls.</p>
+
+<source>bin/nutch inject crawl/crawldb dmoz</source>
 
 <p>Now we have a web database with around 1000 as-yet unfetched URLs
 in it.</p>
 
@@ -186,39 +198,39 @@
 <section>
 <title>Whole-web: Fetching</title>
 <p>To fetch, we first generate a fetchlist from the database:</p>
-<source>bin/nutch generate db segments
+<source>bin/nutch generate crawl/crawldb crawl/segments
 </source>
 <p>This generates a fetchlist for all of the pages due to be fetched.
 The fetchlist is placed in a newly created segment directory. The
 segment directory is named by the time it's created.
 We save the name of this segment in the shell variable
 <code>s1</code>:</p>
-<source>s1=`ls -d segments/2* | tail -1`
+<source>s1=`ls -d crawl/segments/2* | tail -1`
 echo $s1
 </source>
 <p>Now we run the fetcher on this segment with:</p>
 <source>bin/nutch fetch $s1</source>
 <p>When this is complete, we update the database with the results of
 the fetch:</p>
-<source>bin/nutch updatedb db $s1</source>
+<source>bin/nutch updatedb crawl/crawldb $s1</source>
 <p>Now the database has entries for all of the pages referenced by the
 initial set.</p>
 
 <p>Now we fetch a new segment with the top-scoring 1000 pages:</p>
-<source>bin/nutch generate db segments -topN 1000
-s2=`ls -d segments/2* | tail -1`
+<source>bin/nutch generate crawl/crawldb crawl/segments -topN 1000
+s2=`ls -d crawl/segments/2* | tail -1`
 echo $s2
 bin/nutch fetch $s2
-bin/nutch updatedb db $s2
+bin/nutch updatedb crawl/crawldb $s2
 </source>
 <p>Let's fetch one more round:</p>
 <source>
-bin/nutch generate db segments -topN 1000
-s3=`ls -d segments/2* | tail -1`
+bin/nutch generate crawl/crawldb crawl/segments -topN 1000
+s3=`ls -d crawl/segments/2* | tail -1`
 echo $s3
 bin/nutch fetch $s3
-bin/nutch updatedb db $s3
+bin/nutch updatedb crawl/crawldb $s3
 </source>
 
 <p>By this point we've fetched a few thousand pages. Let's index
@@ -227,16 +239,20 @@
 </section>
 <section>
 <title>Whole-web: Indexing</title>
-<p>To index each segment we use the <code>index</code>
-command, as follows:</p>
-<source>bin/nutch index $s1
-bin/nutch index $s2
-bin/nutch index $s3</source>
-<p>Then, before we can search a set of segments, we need to delete
-duplicate pages. This is done with:</p>
+<p>Before indexing we first invert all of the links, so that we may
+index incoming anchor text with the pages.</p>
+
+<source>bin/nutch invertlinks crawl/linkdb crawl/segments</source>
+
+<p>To index the segments we use the <code>index</code> command, as follows:</p>
+
+<source>bin/nutch index indexes crawl/linkdb crawl/segments/*</source>
+
+<!-- <p>Then, before we can search a set of segments, we need to delete -->
+<!-- duplicate pages. This is done with:</p> -->
 
-<source>bin/nutch dedup segments dedup.tmp</source>
+<!-- <source>bin/nutch dedup indexes</source> -->
 
 <p>Now we're ready to search!</p>
 
@@ -256,10 +272,8 @@
 <source>
 cp nutch*.war ~/local/tomcat/webapps/ROOT.war
 </source>
 
-<p>The webapp finds its indexes in <code>./segments</code>, relative
-to where you start Tomcat, so, if you've done intranet crawling,
-connect to your crawl directory, or, if you've done whole-web
-crawling, don't change directories, and give the command:</p>
+<p>The webapp finds its indexes in <code>./crawl</code>, relative
+to where you start Tomcat, so use a command like:</p>
 <source>~/local/tomcat/bin/catalina.sh start
 </source>

Modified: lucene/nutch/trunk/src/test/org/apache/nutch/io/TestSequenceFile.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/test/org/apache/nutch/io/TestSequenceFile.java?rev=357197&r1=357196&r2=357197&view=diff
==============================================================================
--- lucene/nutch/trunk/src/test/org/apache/nutch/io/TestSequenceFile.java (original)
+++ lucene/nutch/trunk/src/test/org/apache/nutch/io/TestSequenceFile.java Fri Dec 16 09:51:05 2005
@@ -42,7 +42,7 @@
     NutchFileSystem nfs = new LocalFileSystem();
     try {
       //LOG.setLevel(Level.FINE);
-      writeTest(nfs, count, seed, file);
+      writeTest(nfs, count, seed, file, false);
       readTest(nfs, count, seed, file);
 
       sortTest(nfs, count, megabytes, factor, false, file);
@@ -61,12 +61,14 @@
     }
   }
 
-  private static void writeTest(NutchFileSystem nfs, int count, int seed, String file)
+  private static void writeTest(NutchFileSystem nfs, int count, int seed,
+                                String file, boolean compress)
     throws IOException {
     new File(file).delete();
     LOG.fine("creating with " + count + " records");
     SequenceFile.Writer writer =
-      new SequenceFile.Writer(nfs, file, RandomDatum.class, RandomDatum.class);
+      new SequenceFile.Writer(nfs, file, RandomDatum.class, RandomDatum.class,
+                              compress);
     RandomDatum.Generator generator = new RandomDatum.Generator(seed);
     for (int i = 0; i < count; i++) {
       generator.next();
@@ -210,8 +212,9 @@
     boolean check = false;
     boolean fast = false;
     boolean merge = false;
+    boolean compress = false;
    String file = null;
-    String usage = "Usage: SequenceFile (-local | -ndfs <namenode:port>) [-count N] [-megabytes M] [-factor F] [-nocreate] [-check] [-fast] [-merge] file";
+    String usage = "Usage: SequenceFile (-local | -ndfs <namenode:port>) [-count N] [-megabytes M] [-factor F] [-nocreate] [-check] [-fast] [-merge] [-compress] file";
 
     if (args.length == 0) {
       System.err.println(usage);
@@ -237,6 +240,8 @@
         fast = true;
       } else if (args[i].equals("-merge")) {
         merge = true;
+      } else if (args[i].equals("-compress")) {
+        compress = true;
       } else {
         // file is required parameter
         file = args[i];
@@ -249,6 +254,7 @@
     LOG.info("check = " + check);
     LOG.info("fast = " + fast);
     LOG.info("merge = " + merge);
+    LOG.info("compress = " + compress);
     LOG.info("file = " + file);
 
     int seed = 0;
@@ -256,7 +262,7 @@
     LOG.setLevel(Level.FINE);
 
     if (create && !merge) {
-      writeTest(nfs, count, seed, file);
+      writeTest(nfs, count, seed, file, compress);
       readTest(nfs, count, seed, file);
     }

Modified: lucene/nutch/trunk/src/web/jsp/anchors.jsp
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/web/jsp/anchors.jsp?rev=357197&r1=357196&r2=357197&view=diff
==============================================================================
--- lucene/nutch/trunk/src/web/jsp/anchors.jsp (original)
+++ lucene/nutch/trunk/src/web/jsp/anchors.jsp Fri Dec 16 09:51:05 2005
@@ -55,8 +55,10 @@
 <ul>
 <%
   String[] anchors = bean.getAnchors(details);
-  for (int i = 0; i < anchors.length; i++) {
+  if (anchors != null) {
+    for (int i = 0; i < anchors.length; i++) {
 %><li><%=Entities.encode(anchors[i])%>
+<% } %>
 <% } %>
 </ul>

Modified: lucene/nutch/trunk/src/webapps/jobtracker/jobdetails.jsp
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/webapps/jobtracker/jobdetails.jsp?rev=357197&r1=357196&r2=357197&view=diff
==============================================================================
--- lucene/nutch/trunk/src/webapps/jobtracker/jobdetails.jsp (original)
+++ lucene/nutch/trunk/src/webapps/jobtracker/jobdetails.jsp Fri Dec 16 09:51:05 2005
@@ -4,7 +4,7 @@
   import="javax.servlet.http.*"
   import="java.io.*"
   import="java.util.*"
-  import="org.apache.nutch.mapReduce.*"
+  import="org.apache.nutch.mapred.*"
 %>
 <%
   String jobid = request.getParameter("jobid");
@@ -12,6 +12,9 @@
   JobTracker.JobInProgress job = (JobTracker.JobInProgress) tracker.getJob(jobid);
   JobProfile profile = (job != null) ? (job.getProfile()) : null;
   JobStatus status = (job != null) ? (job.getStatus()) : null;
+
+  Vector mapTaskReports[] = tracker.getMapTaskReport(jobid);
+  Vector reduceTaskReports[] = tracker.getReduceTaskReport(jobid);
 %>
 
 <html>
@@ -35,10 +38,21 @@
   <h2>Map Tasks</h2>
   <center>
   <table border=2 cellpadding="5" cellspacing="2">
-  <tr><td align="center" colspan="4">Map Tasks</td></tr>
+  <tr><td align="center">Map Task Id</td><td>Pct Complete</td><td>State</td><td>Diagnostic Text</td></tr>
   <%
-
+    for (int i = 0; i < mapTaskReports.length; i++) {
+      Vector v = mapTaskReports[i];
+      out.print("<tr><td>" + v.elementAt(0) + "</td><td>" + v.elementAt(1) + "</td><td>" + v.elementAt(2) + "</td>");
+      if (v.size() == 3) {
+        out.print("<td></td>");
+      } else {
+        for (int j = 3; j < v.size(); j++) {
+          out.print("<td>" + v.elementAt(j) + "</td>");
+        }
+      }
+      out.print("</tr>\n");
+    }
   %>
   </table>
   </center>
 
@@ -46,6 +60,26 @@
 
   <h2>Reduce Tasks</h2>
+  <center>
+  <table border=2 cellpadding="5" cellspacing="2">
+  <tr><td align="center">Reduce Task Id</td><td>Pct Complete</td><td>State</td><td>Diagnostic Text</td></tr>
+
+  <%
+    for (int i = 0; i < reduceTaskReports.length; i++) {
+      Vector v = reduceTaskReports[i];
+      out.print("<tr><td>" + v.elementAt(0) + "</td><td>" + v.elementAt(1) + "</td><td>" + v.elementAt(2) + "</td>");
+      if (v.size() == 3) {
+        out.print("<td></td>");
+      } else {
+        for (int j = 3; j < v.size(); j++) {
+          out.print("<td>" + v.elementAt(j) + "</td>");
+        }
+      }
+      out.print("</tr>\n");
+    }
+  %>
+  </table>
+  </center>
 
   <hr>

Modified: lucene/nutch/trunk/src/webapps/jobtracker/jobtracker.jsp
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/webapps/jobtracker/jobtracker.jsp?rev=357197&r1=357196&r2=357197&view=diff
==============================================================================
--- lucene/nutch/trunk/src/webapps/jobtracker/jobtracker.jsp (original)
+++ lucene/nutch/trunk/src/webapps/jobtracker/jobtracker.jsp Fri Dec 16 09:51:05 2005
@@ -4,7 +4,7 @@
   import="javax.servlet.http.*"
   import="java.io.*"
   import="java.util.*"
-  import="org.apache.nutch.mapReduce.*"
+  import="org.apache.nutch.mapred.*"
 %>
 <%!
   JobTracker tracker = JobTracker.getTracker();
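Both new loops in jobdetails.jsp render a task report Vector the same way: elements 0 through 2 are the task id, percent complete, and state, and anything after that is diagnostic text, with an empty cell emitted when there is none. Below is a rough sketch of that shared row-building logic pulled out into a plain Java helper purely for illustration (the TaskReportRows and toRow names are invented; the JSP above inlines this twice):

    import java.util.Vector;

    /** Illustrative only: mirrors the row-building logic inlined in jobdetails.jsp. */
    public class TaskReportRows {

      /** Renders one task report (id, pct complete, state, diagnostics...) as a table row. */
      static String toRow(Vector report) {
        StringBuffer row = new StringBuffer("<tr>");
        // First three elements: task id, percent complete, state.
        for (int i = 0; i < 3; i++) {
          row.append("<td>").append(report.elementAt(i)).append("</td>");
        }
        if (report.size() == 3) {
          row.append("<td></td>");               // no diagnostic text
        } else {
          for (int j = 3; j < report.size(); j++) {
            row.append("<td>").append(report.elementAt(j)).append("</td>");
          }
        }
        return row.append("</tr>\n").toString();
      }
    }

Factoring the loop out like this would also keep the map and reduce tables from drifting apart as columns are added.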