svn commit: r391958 - in /lucene/nutch/trunk: conf/nutch-default.xml src/java/org/apache/nutch/parse/ParseData.java src/test/org/apache/nutch/parse/TestParseData.java src/test/org/apache/nutch/util/Wr

2006-04-06 Thread jerome
Author: jerome
Date: Thu Apr  6 03:49:40 2006
New Revision: 391958

URL: http://svn.apache.org/viewcvs?rev=391958&view=rev
Log:
NUTCH-244, db.max.outlinks.per.page can now be negative for no limit of handled 
outlinks per page

Modified:
lucene/nutch/trunk/conf/nutch-default.xml
lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseData.java
lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParseData.java
lucene/nutch/trunk/src/test/org/apache/nutch/util/WritableTestUtils.java

Modified: lucene/nutch/trunk/conf/nutch-default.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/conf/nutch-default.xml?rev=391958&r1=391957&r2=391958&view=diff
==
--- lucene/nutch/trunk/conf/nutch-default.xml (original)
+++ lucene/nutch/trunk/conf/nutch-default.xml Thu Apr  6 03:49:40 2006
@@ -255,6 +255,8 @@
   <name>db.max.outlinks.per.page</name>
   <value>100</value>
   <description>The maximum number of outlinks that we'll process for a page.
+  If this value is nonnegative (>=0), at most db.max.outlinks.per.page outlinks
+  will be processed for a page; otherwise, all outlinks will be processed.
   </description>
 </property>
 

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseData.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseData.java?rev=391958&r1=391957&r2=391958&view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseData.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseData.java Thu Apr  
6 03:49:40 2006
@@ -119,12 +119,15 @@
 
 int totalOutlinks = in.readInt(); // read outlinks
 int maxOutlinksPerPage = this.conf.getInt("db.max.outlinks.per.page", 100);
-int outlinksToRead = Math.min(maxOutlinksPerPage, totalOutlinks);
+int outlinksToRead = totalOutlinks;
+if (maxOutlinksPerPage >= 0) {
+  outlinksToRead = Math.min(maxOutlinksPerPage, totalOutlinks);
+}
 outlinks = new Outlink[outlinksToRead];
 for (int i = 0; i < outlinksToRead; i++) {
   outlinks[i] = Outlink.read(in);
 }
-for (int i = maxOutlinksPerPage; i < totalOutlinks; i++) {
+for (int i = outlinksToRead; i < totalOutlinks; i++) {
   Outlink.skip(in);
 }
 

Modified: lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParseData.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParseData.java?rev=391958&r1=391957&r2=391958&view=diff
==
--- lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParseData.java 
(original)
+++ lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParseData.java Thu 
Apr  6 03:49:40 2006
@@ -51,4 +51,31 @@
 WritableTestUtils.testWritable(r, conf);
   }

+  public void testMaxOutlinks() throws Exception {
+Outlink[] outlinks = new Outlink[128];
+for (int i=0; i<outlinks.length; i++) {
+  outlinks[i] = new Outlink("http://outlink.com/" + i, "Outlink" + i, 
conf);
+}
+ParseData original = new ParseData(ParseStatus.STATUS_SUCCESS,
+   "Max Outlinks Title",
+   outlinks,
+   new Metadata());
+Configuration conf = NutchConfiguration.create();
+// No Outlinks
+conf.setInt("db.max.outlinks.per.page", 0);
+ParseData data = (ParseData) WritableTestUtils.writeRead(original, conf);
+assertEquals(0, data.getOutlinks().length);
+// Only 100 Outlinks
+conf.setInt("db.max.outlinks.per.page", 100);
+data = (ParseData) WritableTestUtils.writeRead(original, conf);
+assertEquals(100, data.getOutlinks().length);
+// 256 Outlinks
+conf.setInt("db.max.outlinks.per.page", 256);
+data = (ParseData) WritableTestUtils.writeRead(original, conf);
+assertEquals(outlinks.length, data.getOutlinks().length);
+// All Outlinks
+conf.setInt("db.max.outlinks.per.page", -1);
+data = (ParseData) WritableTestUtils.writeRead(original, conf);
+assertEquals(outlinks.length, data.getOutlinks().length);
+  }
 }

Modified: 
lucene/nutch/trunk/src/test/org/apache/nutch/util/WritableTestUtils.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/test/org/apache/nutch/util/WritableTestUtils.java?rev=391958&r1=391957&r2=391958&view=diff
==
--- lucene/nutch/trunk/src/test/org/apache/nutch/util/WritableTestUtils.java 
(original)
+++ lucene/nutch/trunk/src/test/org/apache/nutch/util/WritableTestUtils.java 
Thu Apr  6 03:49:40 2006
@@ -31,6 +31,14 @@
   /** Utility method for testing writables. */
   public static void testWritable(Writable before, Configuration conf)
   throws Exception {
+TestCase.assertEquals(before, writeRead(before, conf));
+ 

svn commit: r392056 - /lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java

2006-04-06 Thread ab
Author: ab
Date: Thu Apr  6 13:01:47 2006
New Revision: 392056

URL: http://svn.apache.org/viewcvs?rev=392056&view=rev
Log:
Pages with only STATUS_DB_GONE were unaccounted for, which caused an NPE.

Modified:
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java?rev=392056&r1=392055&r2=392056&view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java 
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java Thu 
Apr  6 13:01:47 2006
@@ -52,6 +52,7 @@
   switch (datum.getStatus()) {// find old entry, if any
   case CrawlDatum.STATUS_DB_UNFETCHED:
   case CrawlDatum.STATUS_DB_FETCHED:
+  case CrawlDatum.STATUS_DB_GONE:
 old = datum;
 break;
   case CrawlDatum.STATUS_LINKED: