svn commit: r798304 [3/3] - in /lucene/nutch/logos: ./ character-hand-big.png character.eps nutch_logo.eps nutch_logo.png

2009-07-27 Thread cutting
Added: lucene/nutch/logos/nutch_logo.eps
URL: 
http://svn.apache.org/viewvc/lucene/nutch/logos/nutch_logo.eps?rev=798304&view=auto
==
Binary file - no diff available.

Propchange: lucene/nutch/logos/nutch_logo.eps
--
svn:mime-type = application/octet-stream

Added: lucene/nutch/logos/nutch_logo.png
URL: 
http://svn.apache.org/viewvc/lucene/nutch/logos/nutch_logo.png?rev=798304&view=auto
==
Binary file - no diff available.

Propchange: lucene/nutch/logos/nutch_logo.png
--
svn:mime-type = application/octet-stream




svn commit: r475926 - /lucene/nutch/nightly/nightly.sh

2006-11-17 Thread cutting
Author: cutting
Date: Thu Nov 16 13:03:26 2006
New Revision: 475926

URL: http://svn.apache.org/viewvc?view=rev&rev=475926
Log:
Update nightly build location.

Modified:
lucene/nutch/nightly/nightly.sh

Modified: lucene/nutch/nightly/nightly.sh
URL: 
http://svn.apache.org/viewvc/lucene/nutch/nightly/nightly.sh?view=diff&rev=475926&r1=475925&r2=475926
==
--- lucene/nutch/nightly/nightly.sh (original)
+++ lucene/nutch/nightly/nightly.sh Thu Nov 16 13:03:26 2006
@@ -5,7 +5,7 @@
 TRUNK=http://svn.apache.org/repos/asf/lucene/nutch/trunk
 
 REL_SERVER=people.apache.org
-REL_DIR=/www/people.apache.org/dist/lucene/nutch/nightly
+REL_DIR=/www/people.apache.org/builds/lucene/nutch/nightly
 
 # create an empty build directory
 rm -rf /tmp/nutch-nightly




svn commit: r421185 - /lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java

2006-07-12 Thread cutting
Author: cutting
Date: Wed Jul 12 01:16:37 2006
New Revision: 421185

URL: http://svn.apache.org/viewvc?rev=421185&view=rev
Log:
Patch a bug introduced by Hadoop 0.4.0, which requires specified input
directories to exist.

Modified:
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java?rev=421185&r1=421184&r2=421185&view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java Wed Jul 12 
01:16:37 2006
@@ -65,7 +65,8 @@
 if (LOG.isInfoEnabled()) { LOG.info(CrawlDb update: done); }
   }
 
-  public static JobConf createJob(Configuration config, Path crawlDb) {
+  public static JobConf createJob(Configuration config, Path crawlDb)
+throws IOException {
 Path newCrawlDb =
   new Path(crawlDb,
Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
@@ -73,7 +74,11 @@
 JobConf job = new NutchJob(config);
 job.setJobName(crawldb  + crawlDb);
 
-job.addInputPath(new Path(crawlDb, CrawlDatum.DB_DIR_NAME));
+
+Path current = new Path(crawlDb, CrawlDatum.DB_DIR_NAME);
+if (FileSystem.get(job).exists(current)) {
+  job.addInputPath(current);
+}
 job.setInputFormat(SequenceFileInputFormat.class);
 job.setInputKeyClass(UTF8.class);
 job.setInputValueClass(CrawlDatum.class);




svn commit: r417884 - in /lucene/nutch/trunk: lib/ src/java/org/apache/nutch/fetcher/ src/java/org/apache/nutch/indexer/ src/java/org/apache/nutch/parse/ src/java/org/apache/nutch/segment/

2006-06-28 Thread cutting
Author: cutting
Date: Wed Jun 28 14:54:53 2006
New Revision: 417884

URL: http://svn.apache.org/viewvc?rev=417884&view=rev
Log:
NUTCH-312.  Upgrade to Hadoop 0.4.0.

Added:
lucene/nutch/trunk/lib/commons-cli-2.0-SNAPSHOT.jar   (with props)
lucene/nutch/trunk/lib/hadoop-0.4.0.jar   (with props)
Removed:
lucene/nutch/trunk/lib/hadoop-0.3.2.jar
Modified:

lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherOutputFormat.java
lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java
lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java
lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java
lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java
lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java

Added: lucene/nutch/trunk/lib/commons-cli-2.0-SNAPSHOT.jar
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/commons-cli-2.0-SNAPSHOT.jar?rev=417884view=auto
==
Binary file - no diff available.

Propchange: lucene/nutch/trunk/lib/commons-cli-2.0-SNAPSHOT.jar
--
svn:mime-type = application/octet-stream

Added: lucene/nutch/trunk/lib/hadoop-0.4.0.jar
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/hadoop-0.4.0.jar?rev=417884view=auto
==
Binary file - no diff available.

Propchange: lucene/nutch/trunk/lib/hadoop-0.4.0.jar
--
svn:mime-type = application/octet-stream

Modified: 
lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherOutputFormat.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherOutputFormat.java?rev=417884r1=417883r2=417884view=diff
==
--- 
lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherOutputFormat.java 
(original)
+++ 
lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherOutputFormat.java 
Wed Jun 28 14:54:53 2006
@@ -31,6 +31,7 @@
 import org.apache.hadoop.mapred.RecordWriter;
 import org.apache.hadoop.mapred.JobConf;
 import org.apache.hadoop.mapred.Reporter;
+import org.apache.hadoop.util.Progressable;
 
 import org.apache.nutch.parse.ParseOutputFormat;
 import org.apache.nutch.protocol.Content;
@@ -45,7 +46,8 @@
 
   public RecordWriter getRecordWriter(final FileSystem fs,
   final JobConf job,
-  final String name) throws IOException {
+  final String name,
+  final Progressable progress) throws 
IOException {
 
 final Path fetch =
   new Path(new Path(job.getOutputPath(), CrawlDatum.FETCH_DIR_NAME), name);
@@ -66,7 +68,7 @@
   }
 
   if (Fetcher.isParsing(job)) {
-parseOut = new ParseOutputFormat().getRecordWriter(fs, job, name);
+parseOut = new ParseOutputFormat().getRecordWriter(fs, job, name, 
null);
   }
 }
 

Modified: 
lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java?rev=417884r1=417883r2=417884view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java 
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java 
Wed Jun 28 14:54:53 2006
@@ -26,6 +26,7 @@
 import org.apache.hadoop.fs.*;
 import org.apache.hadoop.conf.*;
 import org.apache.hadoop.mapred.*;
+import org.apache.hadoop.util.Progressable;
 
 import org.apache.nutch.util.NutchConfiguration;
 import org.apache.nutch.util.NutchJob;
@@ -276,7 +277,8 @@
   /** Write nothing. */
   public RecordWriter getRecordWriter(final FileSystem fs,
   final JobConf job,
-  final String name) throws IOException {
+  final String name,
+  final Progressable progress) throws 
IOException {
 return new RecordWriter() {   
 public void write(WritableComparable key, Writable value)
   throws IOException {

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java?rev=417884r1=417883r2=417884view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java (original)
+++ lucene/nutch/trunk/src/java

svn commit: r413175 - in /lucene/nutch/trunk/lib: hadoop-0.3.1.jar hadoop-0.3.2.jar

2006-06-09 Thread cutting
Author: cutting
Date: Fri Jun  9 14:48:23 2006
New Revision: 413175

URL: http://svn.apache.org/viewvc?rev=413175view=rev
Log:
Upgrading to Hadoop 0.3.2 release.

Added:
lucene/nutch/trunk/lib/hadoop-0.3.2.jar   (with props)
Removed:
lucene/nutch/trunk/lib/hadoop-0.3.1.jar

Added: lucene/nutch/trunk/lib/hadoop-0.3.2.jar
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/hadoop-0.3.2.jar?rev=413175view=auto
==
Binary file - no diff available.

Propchange: lucene/nutch/trunk/lib/hadoop-0.3.2.jar
--
svn:mime-type = application/octet-stream




svn commit: r405861 - in /lucene/nutch/trunk/lib: hadoop-0.2.0.jar hadoop-0.2.1.jar

2006-05-12 Thread cutting
Author: cutting
Date: Fri May 12 13:31:59 2006
New Revision: 405861

URL: http://svn.apache.org/viewcvs?rev=405861view=rev
Log:
Upgrading to Hadoop 0.2.1.

Added:
lucene/nutch/trunk/lib/hadoop-0.2.1.jar   (with props)
Removed:
lucene/nutch/trunk/lib/hadoop-0.2.0.jar

Added: lucene/nutch/trunk/lib/hadoop-0.2.1.jar
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/lib/hadoop-0.2.1.jar?rev=405861view=auto
==
Binary file - no diff available.

Propchange: lucene/nutch/trunk/lib/hadoop-0.2.1.jar
--
svn:mime-type = application/octet-stream




svn commit: r400159 - /lucene/nutch/trunk/bin/

2006-05-05 Thread cutting
Author: cutting
Date: Fri May  5 13:01:44 2006
New Revision: 400159

URL: http://svn.apache.org/viewcvs?rev=400159view=rev
Log:
Ignore bin/rcc (from Hadoop).

Modified:
lucene/nutch/trunk/bin/   (props changed)

Propchange: lucene/nutch/trunk/bin/
--
--- svn:ignore (original)
+++ svn:ignore Fri May  5 13:01:44 2006
@@ -1,6 +1,7 @@
 hadoop
 hadoop-daemon.sh
 hadoop-daemons.sh
+rcc
 slaves.sh
 start-all.sh
 start-dfs.sh




svn commit: r400199 - in /lucene/nutch/trunk/lib: hadoop-0.1.1.jar hadoop-0.2.0.jar

2006-05-05 Thread cutting
Author: cutting
Date: Fri May  5 15:44:04 2006
New Revision: 400199

URL: http://svn.apache.org/viewcvs?rev=400199view=rev
Log:
Upgrading to Hadoop 0.2.0.

Added:
lucene/nutch/trunk/lib/hadoop-0.2.0.jar   (with props)
Removed:
lucene/nutch/trunk/lib/hadoop-0.1.1.jar

Added: lucene/nutch/trunk/lib/hadoop-0.2.0.jar
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/lib/hadoop-0.2.0.jar?rev=400199view=auto
==
Binary file - no diff available.

Propchange: lucene/nutch/trunk/lib/hadoop-0.2.0.jar
--
svn:mime-type = application/octet-stream




svn commit: r394781 - /lucene/nutch/trunk/bin/

2006-04-17 Thread cutting
Author: cutting
Date: Mon Apr 17 14:40:58 2006
New Revision: 394781

URL: http://svn.apache.org/viewcvs?rev=394781view=rev
Log:
Ignore more bin files.

Modified:
lucene/nutch/trunk/bin/   (props changed)

Propchange: lucene/nutch/trunk/bin/
--
--- svn:ignore (original)
+++ svn:ignore Mon Apr 17 14:40:58 2006
@@ -3,4 +3,8 @@
 hadoop-daemons.sh
 slaves.sh
 start-all.sh
+start-dfs.sh
+start-mapred.sh
 stop-all.sh
+stop-dfs.sh
+stop-mapred.sh




svn commit: r392458 - in /lucene/nutch/trunk/lib: hadoop-0.1.0.jar hadoop-0.1.1.jar

2006-04-07 Thread cutting
Author: cutting
Date: Fri Apr  7 16:48:10 2006
New Revision: 392458

URL: http://svn.apache.org/viewcvs?rev=392458view=rev
Log:
Upgrading to Hadoop release 0.1.1.

Added:
lucene/nutch/trunk/lib/hadoop-0.1.1.jar   (with props)
Removed:
lucene/nutch/trunk/lib/hadoop-0.1.0.jar

Added: lucene/nutch/trunk/lib/hadoop-0.1.1.jar
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/lib/hadoop-0.1.1.jar?rev=392458view=auto
==
Binary file - no diff available.

Propchange: lucene/nutch/trunk/lib/hadoop-0.1.1.jar
--
svn:mime-type = application/octet-stream




svn commit: r390745 - in /lucene/nutch/trunk/lib: hadoop-0.1-dev.jar hadoop-0.1.0.jar

2006-04-01 Thread cutting
Author: cutting
Date: Sat Apr  1 12:16:22 2006
New Revision: 390745

URL: http://svn.apache.org/viewcvs?rev=390745view=rev
Log:
Update to Hadoop 0.1.0 release.

Added:
lucene/nutch/trunk/lib/hadoop-0.1.0.jar   (with props)
Removed:
lucene/nutch/trunk/lib/hadoop-0.1-dev.jar

Added: lucene/nutch/trunk/lib/hadoop-0.1.0.jar
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/lib/hadoop-0.1.0.jar?rev=390745view=auto
==
Binary file - no diff available.

Propchange: lucene/nutch/trunk/lib/hadoop-0.1.0.jar
--
svn:mime-type = application/octet-stream




svn commit: r387310 - /lucene/nutch/trunk/lib/hadoop-0.1-dev.jar

2006-03-20 Thread cutting
Author: cutting
Date: Mon Mar 20 13:08:15 2006
New Revision: 387310

URL: http://svn.apache.org/viewcvs?rev=387310view=rev
Log:
Upgrade to current Hadoop.

Modified:
lucene/nutch/trunk/lib/hadoop-0.1-dev.jar

Modified: lucene/nutch/trunk/lib/hadoop-0.1-dev.jar
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/lib/hadoop-0.1-dev.jar?rev=387310r1=387309r2=387310view=diff
==
Binary files - no diff available.




svn commit: r386181 - in /lucene/nutch/branches/branch-0.7: site/issue_tracking.html site/issue_tracking.pdf src/site/src/documentation/content/xdocs/issue_tracking.xml

2006-03-15 Thread cutting
Author: cutting
Date: Wed Mar 15 14:20:40 2006
New Revision: 386181

URL: http://svn.apache.org/viewcvs?rev=386181view=rev
Log:
Updated link to jira.

Modified:
lucene/nutch/branches/branch-0.7/site/issue_tracking.html
lucene/nutch/branches/branch-0.7/site/issue_tracking.pdf

lucene/nutch/branches/branch-0.7/src/site/src/documentation/content/xdocs/issue_tracking.xml

Modified: lucene/nutch/branches/branch-0.7/site/issue_tracking.html
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/branch-0.7/site/issue_tracking.html?rev=386181r1=386180r2=386181view=diff
==
--- lucene/nutch/branches/branch-0.7/site/issue_tracking.html (original)
+++ lucene/nutch/branches/branch-0.7/site/issue_tracking.html Wed Mar 15 
14:20:40 2006
@@ -128,7 +128,7 @@
 
 p
   Nutch issues (bugs, as well as enhancement requests) are tracked in 
-  Apache JIRA a 
href=http://nagoya.apache.org/jira/browse/Nutch;here/a.
+  Apache JIRA a 
href=http://issues.apache.org/jira/browse/Nutch;here/a.
   If you aren't sure whether something is a bug, post a question on the
   Nutch user a href=mailing_lists.htmlmailing list/a.
 /p

Modified: lucene/nutch/branches/branch-0.7/site/issue_tracking.pdf
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/branch-0.7/site/issue_tracking.pdf?rev=386181r1=386180r2=386181view=diff
==
--- lucene/nutch/branches/branch-0.7/site/issue_tracking.pdf (original)
+++ lucene/nutch/branches/branch-0.7/site/issue_tracking.pdf Wed Mar 15 
14:20:40 2006
@@ -32,7 +32,7 @@
 /Rect [ 485.232 585.8 505.884 573.8 ]
 /C [ 0 0 0 ]
 /Border [ 0 0 0 ]
-/A  /URI (http://nagoya.apache.org/jira/browse/Nutch)
+/A  /URI (http://issues.apache.org/jira/browse/Nutch)
 /S /URI 
 /H /I
 

Modified: 
lucene/nutch/branches/branch-0.7/src/site/src/documentation/content/xdocs/issue_tracking.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/branch-0.7/src/site/src/documentation/content/xdocs/issue_tracking.xml?rev=386181r1=386180r2=386181view=diff
==
--- 
lucene/nutch/branches/branch-0.7/src/site/src/documentation/content/xdocs/issue_tracking.xml
 (original)
+++ 
lucene/nutch/branches/branch-0.7/src/site/src/documentation/content/xdocs/issue_tracking.xml
 Wed Mar 15 14:20:40 2006
@@ -11,7 +11,7 @@
   body
 p
   Nutch issues (bugs, as well as enhancement requests) are tracked in 
-  Apache JIRA a 
href=http://nagoya.apache.org/jira/browse/Nutch;here/a.
+  Apache JIRA a 
href=http://issues.apache.org/jira/browse/Nutch;here/a.
   If you aren't sure whether something is a bug, post a question on the
   Nutch user a href=mailing_lists.htmlmailing list/a.
 /p




svn commit: r383698 - /lucene/nutch/trunk/lib/hadoop-0.1-dev.jar

2006-03-06 Thread cutting
Author: cutting
Date: Mon Mar  6 14:54:20 2006
New Revision: 383698

URL: http://svn.apache.org/viewcvs?rev=383698view=rev
Log:
Upgrade to latest version of Hadoop.

Modified:
lucene/nutch/trunk/lib/hadoop-0.1-dev.jar

Modified: lucene/nutch/trunk/lib/hadoop-0.1-dev.jar
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/lib/hadoop-0.1-dev.jar?rev=383698r1=383697r2=383698view=diff
==
Binary files - no diff available.




svn commit: r382912 - in /lucene/nutch/trunk/src/java/org/apache/nutch: crawl/ fetcher/ indexer/ parse/ plugin/ searcher/ segment/

2006-03-03 Thread cutting
Author: cutting
Date: Fri Mar  3 11:05:41 2006
New Revision: 382912

URL: http://svn.apache.org/viewcvs?rev=382912view=rev
Log:
Undo unintentional changes made in r381751.  Thanks, Jerome, for catching this!

Modified:
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbReader.java
lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java
lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexMerger.java
lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java
lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java
lucene/nutch/trunk/src/java/org/apache/nutch/plugin/PluginRepository.java
lucene/nutch/trunk/src/java/org/apache/nutch/searcher/DistributedSearch.java
lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java?rev=382912r1=382911r2=382912view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java Fri Mar  3 
11:05:41 2006
@@ -44,11 +44,11 @@
 
 
   /* Perform complete crawling and indexing given a set of root urls. */
-  public static boolean doMain(String args[]) throws Exception {
+  public static void main(String args[]) throws Exception {
 if (args.length  1) {
   System.out.println
 (Usage: Crawl urlDir [-dir d] [-threads n] [-depth i] [-topN N]);
-  return false;
+  return;
 }
 
 Configuration conf = NutchConfiguration.create();
@@ -122,22 +122,5 @@
 new IndexMerger(fs, fs.listFiles(indexes), index, tmpDir, job).merge();
 
 LOG.info(crawl finished:  + dir);
-
-return true;
-  }
-
-  /**
-   * main() wrapper that returns proper exit status
-   */
-  public static void main(String[] args) {
-Runtime rt = Runtime.getRuntime();
-try {
-  boolean status = doMain(args);
-  rt.exit(status ? 0 : 1);
-}
-catch (Exception e) {
-  LOG.log(Level.SEVERE, error, caught Exception in main(), e);
-  rt.exit(1);
-}
   }
 }

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java?rev=382912r1=382911r2=382912view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java Fri Mar  3 
11:05:41 2006
@@ -90,31 +90,17 @@
 fs.delete(old);
   }
 
-  public static boolean doMain(String[] args) throws Exception {
+  public static void main(String[] args) throws Exception {
 CrawlDb crawlDb = new CrawlDb(NutchConfiguration.create());
 
 if (args.length  2) {
   System.err.println(Usage: crawldb segment);
-  return false;
+  return;
 }
 
 crawlDb.update(new File(args[0]), new File(args[1]));
-
-return true;
   }
 
-  /**
-   * main() wrapper that returns proper exit status
-   */
-  public static void main(String[] args) {
-Runtime rt = Runtime.getRuntime();
-try {
-  boolean status = doMain(args);
-  rt.exit(status ? 0 : 1);
-}
-catch (Exception e) {
-  LOG.log(Level.SEVERE, error, caught Exception in main(), e);
-  rt.exit(1);
-}
-  }
+
+
 }

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java?rev=382912r1=382911r2=382912view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java 
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java Fri 
Mar  3 11:05:41 2006
@@ -20,7 +20,7 @@
 import java.io.IOException;
 import java.util.Iterator;
 import java.util.TreeMap;
-import java.util.logging.*;
+import java.util.logging.Logger;
 
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.io.LongWritable;
@@ -241,7 +241,7 @@
 JobClient.runJob(job);
   }
 
-  public static boolean doMain(String[] args) throws IOException {
+  public static void main(String[] args) throws IOException {
 CrawlDbReader dbr = new CrawlDbReader();
 
 if (args.length  1) {
@@ -250,7 +250,7

svn commit: r382939 - /lucene/nutch/trunk/lib/hadoop-0.1-dev.jar

2006-03-03 Thread cutting
Author: cutting
Date: Fri Mar  3 13:46:21 2006
New Revision: 382939

URL: http://svn.apache.org/viewcvs?rev=382939view=rev
Log:
Upgrade hadoop to latest version with some important mapred bug fixes.

Modified:
lucene/nutch/trunk/lib/hadoop-0.1-dev.jar

Modified: lucene/nutch/trunk/lib/hadoop-0.1-dev.jar
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/lib/hadoop-0.1-dev.jar?rev=382939r1=382938r2=382939view=diff
==
Binary files - no diff available.




svn commit: r382512 - in /lucene/nutch/trunk/lib: lucene-core-1.9-final.jar lucene-core-1.9.1.jar lucene-misc-1.9-final.jar lucene-misc-1.9.1.jar

2006-03-02 Thread cutting
Author: cutting
Date: Thu Mar  2 12:59:09 2006
New Revision: 382512

URL: http://svn.apache.org/viewcvs?rev=382512view=rev
Log:
Upgrade to Lucene 1.9.1.

Added:
lucene/nutch/trunk/lib/lucene-core-1.9.1.jar   (with props)
lucene/nutch/trunk/lib/lucene-misc-1.9.1.jar   (with props)
Removed:
lucene/nutch/trunk/lib/lucene-core-1.9-final.jar
lucene/nutch/trunk/lib/lucene-misc-1.9-final.jar

Added: lucene/nutch/trunk/lib/lucene-core-1.9.1.jar
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/lib/lucene-core-1.9.1.jar?rev=382512view=auto
==
Binary file - no diff available.

Propchange: lucene/nutch/trunk/lib/lucene-core-1.9.1.jar
--
svn:mime-type = application/octet-stream

Added: lucene/nutch/trunk/lib/lucene-misc-1.9.1.jar
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/lib/lucene-misc-1.9.1.jar?rev=382512view=auto
==
Binary file - no diff available.

Propchange: lucene/nutch/trunk/lib/lucene-misc-1.9.1.jar
--
svn:mime-type = application/octet-stream




svn commit: r382573 - in /lucene/nutch/trunk: conf/hadoop-env.sh.template lib/hadoop-0.1-dev.jar

2006-03-02 Thread cutting
Author: cutting
Date: Thu Mar  2 15:59:24 2006
New Revision: 382573

URL: http://svn.apache.org/viewcvs?rev=382573view=rev
Log:
Update to latest Hadoop code.

Modified:
lucene/nutch/trunk/conf/hadoop-env.sh.template
lucene/nutch/trunk/lib/hadoop-0.1-dev.jar

Modified: lucene/nutch/trunk/conf/hadoop-env.sh.template
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/conf/hadoop-env.sh.template?rev=382573r1=382572r2=382573view=diff
==
--- lucene/nutch/trunk/conf/hadoop-env.sh.template (original)
+++ lucene/nutch/trunk/conf/hadoop-env.sh.template Thu Mar  2 15:59:24 2006
@@ -1,6 +1,11 @@
 # Set Hadoop-specific environment variables here.
 
-# The java implementation to use.
+# The only required environment variable is JAVA_HOME.  All others are
+# optional.  When running a distributed configuration it is best to
+# set JAVA_HOME in this file, so that it is correctly defined on
+# remote nodes.
+
+# The java implementation to use.  Required.
 # export JAVA_HOME=/usr/bin/java
 
 # The maximum amount of heap to use, in MB. Default is 1000.
@@ -8,6 +13,9 @@
 
 # Extra Java runtime options.  Empty by default.
 # export HADOOP_OPTS=-server
+
+# Extra ssh options.  Default: '-o ConnectTimeout=1 -o 
SendEnv=HADOOP_CONF_DIR'.
+# export HADOOP_SSH_OPTS=-o ConnectTimeout=1 -o SendEnv=HADOOP_CONF_DIR
 
 # Where log files are stored.  $HADOOP_HOME/logs by default.
 # export HADOOP_LOG_DIR=${HADOOP_HOME}/logs

Modified: lucene/nutch/trunk/lib/hadoop-0.1-dev.jar
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/lib/hadoop-0.1-dev.jar?rev=382573r1=382572r2=382573view=diff
==
Binary files - no diff available.




svn commit: r382579 - /lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java

2006-03-02 Thread cutting
Author: cutting
Date: Thu Mar  2 16:06:59 2006
New Revision: 382579

URL: http://svn.apache.org/viewcvs?rev=382579view=rev
Log:
Disable speculative execution, since input format has side effects.

Modified:
lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java

Modified: 
lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java?rev=382579r1=382578r2=382579view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java 
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java 
Thu Mar  2 16:06:59 2006
@@ -307,6 +307,7 @@
 job.setInputKeyClass(HashScore.class);
 job.setInputValueClass(IndexDoc.class);
 job.setInputFormat(InputFormat.class);
+job.setBoolean(mapred.speculative.execution, false);
 
 job.setPartitionerClass(HashPartitioner.class);
 job.setReducerClass(HashReducer.class);




svn commit: r381721 - in /lucene/nutch/trunk/lib: lucene-core-1.9-final.jar lucene-core-1.9-rc1-dev.jar lucene-misc-1.9-final.jar lucene-misc-1.9-rc1-dev.jar

2006-02-28 Thread cutting
Author: cutting
Date: Tue Feb 28 10:00:43 2006
New Revision: 381721

URL: http://svn.apache.org/viewcvs?rev=381721view=rev
Log:
Upgrade lucene version to final release.

Added:
lucene/nutch/trunk/lib/lucene-core-1.9-final.jar   (with props)
lucene/nutch/trunk/lib/lucene-misc-1.9-final.jar   (with props)
Removed:
lucene/nutch/trunk/lib/lucene-core-1.9-rc1-dev.jar
lucene/nutch/trunk/lib/lucene-misc-1.9-rc1-dev.jar

Added: lucene/nutch/trunk/lib/lucene-core-1.9-final.jar
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/lib/lucene-core-1.9-final.jar?rev=381721view=auto
==
Binary file - no diff available.

Propchange: lucene/nutch/trunk/lib/lucene-core-1.9-final.jar
--
svn:mime-type = application/octet-stream

Added: lucene/nutch/trunk/lib/lucene-misc-1.9-final.jar
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/lib/lucene-misc-1.9-final.jar?rev=381721view=auto
==
Binary file - no diff available.

Propchange: lucene/nutch/trunk/lib/lucene-misc-1.9-final.jar
--
svn:mime-type = application/octet-stream




svn commit: r381824 - /lucene/nutch/trunk/lib/hadoop-0.1-dev.jar

2006-02-28 Thread cutting
Author: cutting
Date: Tue Feb 28 15:30:02 2006
New Revision: 381824

URL: http://svn.apache.org/viewcvs?rev=381824view=rev
Log:
Updating hadoop jar.  Includes fixes for Windows.

Modified:
lucene/nutch/trunk/lib/hadoop-0.1-dev.jar

Modified: lucene/nutch/trunk/lib/hadoop-0.1-dev.jar
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/lib/hadoop-0.1-dev.jar?rev=381824r1=381823r2=381824view=diff
==
Binary files - no diff available.




svn commit: r380789 - /lucene/nutch/trunk/build.xml

2006-02-24 Thread cutting
Author: cutting
Date: Fri Feb 24 11:11:44 2006
New Revision: 380789

URL: http://svn.apache.org/viewcvs?rev=380789view=rev
Log:
Fix to not use 'exec', but rather 'untar' and 'chmod' which are more portable.

Modified:
lucene/nutch/trunk/build.xml

Modified: lucene/nutch/trunk/build.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/build.xml?rev=380789r1=380788r2=380789view=diff
==
--- lucene/nutch/trunk/build.xml (original)
+++ lucene/nutch/trunk/build.xml Fri Feb 24 11:11:44 2006
@@ -57,10 +57,9 @@
   fileset dir=${lib.dir} includes=hadoop*.jar/
   patternset includes=bin.tgz/
 /unjar
-exec dir=bin executable=tar
-  arg value=xzf/
-  arg value=../${build.dir}/hadoop/bin.tgz/
-/exec
+
+untar src=${build.dir}/hadoop/bin.tgz dest=bin compression=gzip/
+chmod dir=bin perm=ugo+rx includes=*.sh,hadoop/
 
 !-- unpack hadoop webapp from hadoop jar into build directory --
 mkdir dir=${build.dir}/webapps/




svn commit: r380840 - /lucene/nutch/trunk/lib/hadoop-0.1-dev.jar

2006-02-24 Thread cutting
Author: cutting
Date: Fri Feb 24 14:38:06 2006
New Revision: 380840

URL: http://svn.apache.org/viewcvs?rev=380840view=rev
Log:
Update hadoop jar, to get recent fixes from that project.

Modified:
lucene/nutch/trunk/lib/hadoop-0.1-dev.jar

Modified: lucene/nutch/trunk/lib/hadoop-0.1-dev.jar
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/lib/hadoop-0.1-dev.jar?rev=380840r1=380839r2=380840view=diff
==
Binary files - no diff available.




svn commit: r378381 - /lucene/nutch/trunk/src/site/src/documentation/content/xdocs/tabs.xml

2006-02-16 Thread cutting
Author: cutting
Date: Thu Feb 16 14:24:47 2006
New Revision: 378381

URL: http://svn.apache.org/viewcvs?rev=378381view=rev
Log:
Fix to work with Forrest 0.7, where ext: links seem to no longer work
in tabs.xml.

Modified:
lucene/nutch/trunk/src/site/src/documentation/content/xdocs/tabs.xml

Modified: lucene/nutch/trunk/src/site/src/documentation/content/xdocs/tabs.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/site/src/documentation/content/xdocs/tabs.xml?rev=378381r1=378380r2=378381view=diff
==
--- lucene/nutch/trunk/src/site/src/documentation/content/xdocs/tabs.xml 
(original)
+++ lucene/nutch/trunk/src/site/src/documentation/content/xdocs/tabs.xml Thu 
Feb 16 14:24:47 2006
@@ -15,6 +15,6 @@
   --
 
   tab label=Main dir=/  
-  tab label=Wiki href=ext:wiki/
+  tab label=Wiki href=http://wiki.apache.org/nutch//
   
 /tabs




svn commit: r378044 - /lucene/nutch/trunk/lib/hadoop-0.1-dev.jar

2006-02-15 Thread cutting
Author: cutting
Date: Wed Feb 15 09:56:54 2006
New Revision: 378044

URL: http://svn.apache.org/viewcvs?rev=378044view=rev
Log:
Upgrade to latest version of Hadoop.

Modified:
lucene/nutch/trunk/lib/hadoop-0.1-dev.jar

Modified: lucene/nutch/trunk/lib/hadoop-0.1-dev.jar
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/lib/hadoop-0.1-dev.jar?rev=378044r1=378043r2=378044view=diff
==
Binary files - no diff available.




svn commit: r378107 - in /lucene/nutch/trunk: conf/ conf/hadoop-env.sh.template conf/slaves.template lib/hadoop-0.1-dev.jar src/java/org/apache/nutch/fetcher/Fetcher.java

2006-02-15 Thread cutting
Author: cutting
Date: Wed Feb 15 14:45:31 2006
New Revision: 378107

URL: http://svn.apache.org/viewcvs?rev=378107&view=rev
Log:
Fix Fetcher to disable speculative execution, to keep it polite.  Also upgrade 
to latest hadoop jar that supports this feature.  Note that Hadoop's 
environment specification has changed, with all environment variables settable 
from conf/hadoop-env.sh, and the slaves file is now in conf/, rather than in 
one's home directory.

Added:
lucene/nutch/trunk/conf/hadoop-env.sh.template
lucene/nutch/trunk/conf/slaves.template
Modified:
lucene/nutch/trunk/conf/   (props changed)
lucene/nutch/trunk/lib/hadoop-0.1-dev.jar
lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java

Propchange: lucene/nutch/trunk/conf/
--
--- svn:ignore (original)
+++ svn:ignore Wed Feb 15 14:45:31 2006
@@ -1,5 +1,4 @@
-nutch-site.xml
-regex-normalize.xml
-crawl-urlfilter.txt
-regex-urlfilter.txt
-mapred-default.xml
+*.xml
+*.txt
+*.sh
+slaves

Added: lucene/nutch/trunk/conf/hadoop-env.sh.template
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/conf/hadoop-env.sh.template?rev=378107view=auto
==
--- lucene/nutch/trunk/conf/hadoop-env.sh.template (added)
+++ lucene/nutch/trunk/conf/hadoop-env.sh.template Wed Feb 15 14:45:31 2006
@@ -0,0 +1,25 @@
+# Set Hadoop-specific environment variables here.
+
+# The java implementation to use.
+# export JAVA_HOME=/usr/bin/java
+
+# The maximum amount of heap to use, in MB. Default is 1000.
+# export HADOOP_HEAPSIZE=2000
+
+# Extra Java runtime options.  Empty by default.
+# export HADOOP_OPTS=-server
+
+# Where log files are stored.  $HADOOP_HOME/logs by default.
+# export HADOOP_LOG_DIR=${HADOOP_HOME}/logs
+
+# File naming remote slave hosts.  $HADOOP_HOME/conf/slaves by default.
+# export HADOOP_SLAVES=${HADOOP_HOME}/conf/slaves
+
+# host:path where hadoop code should be rsync'd from.  Unset by default.
+# export HADOOP_MASTER=master:/home/$USER/src/hadoop
+
+# The directory where pid files are stored. /tmp by default.
+# export HADOOP_PID_DIR=/var/hadoop/pids
+
+# A string representing this instance of hadoop. $USER by default.
+# export HADOOP_IDENT_STRING=$USER

Added: lucene/nutch/trunk/conf/slaves.template
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/conf/slaves.template?rev=378107view=auto
==
--- lucene/nutch/trunk/conf/slaves.template (added)
+++ lucene/nutch/trunk/conf/slaves.template Wed Feb 15 14:45:31 2006
@@ -0,0 +1 @@
+localhost

Modified: lucene/nutch/trunk/lib/hadoop-0.1-dev.jar
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/lib/hadoop-0.1-dev.jar?rev=378107r1=378106r2=378107view=diff
==
Binary files - no diff available.

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java?rev=378107r1=378106r2=378107view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Wed Feb 
15 14:45:31 2006
@@ -348,6 +348,9 @@
 job.set(SEGMENT_NAME_KEY, segment.getName());
 job.setBoolean(fetcher.parse, parsing);
 
+// for politeness, don't permit parallel execution of a single task
+job.setBoolean(mapred.speculative.execution, false);
+
 job.setInputDir(new File(segment, CrawlDatum.GENERATE_DIR_NAME));
 job.setInputFormat(InputFormat.class);
 job.setInputKeyClass(UTF8.class);




svn commit: r376815 - /lucene/nutch/trunk/lib/hadoop-0.1-dev.jar

2006-02-10 Thread cutting
Author: cutting
Date: Fri Feb 10 11:44:47 2006
New Revision: 376815

URL: http://svn.apache.org/viewcvs?rev=376815&view=rev
Log:
Update Hadoop jar.

Modified:
lucene/nutch/trunk/lib/hadoop-0.1-dev.jar

Modified: lucene/nutch/trunk/lib/hadoop-0.1-dev.jar
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/lib/hadoop-0.1-dev.jar?rev=376815r1=376814r2=376815view=diff
==
Binary files - no diff available.




svn commit: r376435 - in /lucene/nutch/trunk: lib/ src/java/org/apache/nutch/crawl/ src/java/org/apache/nutch/fetcher/ src/java/org/apache/nutch/indexer/ src/java/org/apache/nutch/parse/ src/java/org/

2006-02-09 Thread cutting
Author: cutting
Date: Thu Feb  9 12:57:44 2006
New Revision: 376435

URL: http://svn.apache.org/viewcvs?rev=376435&view=rev
Log:
Updating to latest Hadoop jar, adding now-required close() methods to mapper 
and reducer implementations.

Modified:
lucene/nutch/trunk/lib/hadoop-0.1-dev.jar
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/PartitionUrlByHost.java
lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java
lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java
lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java
lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java

Modified: lucene/nutch/trunk/lib/hadoop-0.1-dev.jar
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/lib/hadoop-0.1-dev.jar?rev=376435r1=376434r2=376435view=diff
==
Binary files - no diff available.

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java?rev=376435r1=376434r2=376435view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java 
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java Thu 
Feb  9 12:57:44 2006
@@ -56,6 +56,7 @@
 
   public static class CrawlDbStatMapper implements Mapper {
 public void configure(JobConf job) {}
+public void close() {}
 public void map(WritableComparable key, Writable value, OutputCollector 
output, Reporter reporter)
 throws IOException {
   CrawlDatum cd = (CrawlDatum) value;
@@ -68,6 +69,7 @@
 
   public static class CrawlDbStatReducer implements Reducer {
 public void configure(JobConf job) {}
+public void close() {}
 public void reduce(WritableComparable key, Iterator values, 
OutputCollector output, Reporter reporter)
 throws IOException {
 
@@ -127,8 +129,8 @@
   }
 }
 
-public void configure(JobConf job) {
-}
+public void configure(JobConf job) {}
+public void close() {}
   }
   
   public void processStatJob(String crawlDb, Configuration config) throws 
IOException {

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java?rev=376435r1=376434r2=376435view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java 
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java Thu 
Feb  9 12:57:44 2006
@@ -30,6 +30,8 @@
 retryMax = job.getInt(db.fetch.retry.max, 3);
   }
 
+  public void close() {}
+
   public void reduce(WritableComparable key, Iterator values,
  OutputCollector output, Reporter reporter)
 throws IOException {

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java?rev=376435r1=376434r2=376435view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java Thu Feb  
9 12:57:44 2006
@@ -51,6 +51,8 @@
   maxPerHost = job.getInt(generate.max.per.host, -1);
 }
 
+public void close() {}
+
 /** Select & invert subset due for fetch. */
 public void map(WritableComparable key, Writable value,
 OutputCollector output, Reporter reporter)

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java?rev=376435r1=376434r2=376435view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java Thu Feb  9 
12:57:44 2006
@@ -48,6 +48,8 @@
   this.jobConf = job;
 }
 
+public void close() {}
+
 public void map(WritableComparable key, Writable val,
 OutputCollector output, Reporter reporter)
   throws IOException {
@@ -73,6 +75,7

svn commit: r376485 - in /lucene/nutch/trunk: ./ bin/ lib/ src/java/org/apache/nutch/crawl/ src/java/org/apache/nutch/fetcher/ src/java/org/apache/nutch/indexer/ src/java/org/apache/nutch/parse/ src/j

2006-02-09 Thread cutting
Author: cutting
Date: Thu Feb  9 15:20:28 2006
New Revision: 376485

URL: http://svn.apache.org/viewcvs?rev=376485&view=rev
Log:
Fix for NUTCH-209.  Nutch now supplies all code to remote MapReduce daemons 
through a job jar file.  So Hadoop daemons no longer need to be restarted when 
Nutch code changes.

Added:
lucene/nutch/trunk/src/java/org/apache/nutch/util/NutchJob.java
Modified:
lucene/nutch/trunk/bin/nutch
lucene/nutch/trunk/build.xml
lucene/nutch/trunk/lib/hadoop-0.1-dev.jar
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbReader.java
lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java
lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java
lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java
lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java
lucene/nutch/trunk/src/java/org/apache/nutch/util/NutchConfiguration.java

Modified: lucene/nutch/trunk/bin/nutch
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/bin/nutch?rev=376485r1=376484r2=376485view=diff
==
--- lucene/nutch/trunk/bin/nutch (original)
+++ lucene/nutch/trunk/bin/nutch Thu Feb  9 15:20:28 2006
@@ -82,13 +82,13 @@
 CLASSPATH=${NUTCH_CONF_DIR:=$NUTCH_HOME/conf}
 CLASSPATH=${CLASSPATH}:$JAVA_HOME/lib/tools.jar
 
-# for developers, add Nutch classes to CLASSPATH
-if [ -d $NUTCH_HOME/build/classes ]; then
-  CLASSPATH=${CLASSPATH}:$NUTCH_HOME/build/classes
-fi
+# for developers, add plugins, job & test code to CLASSPATH
 if [ -d $NUTCH_HOME/build/plugins ]; then
   CLASSPATH=${CLASSPATH}:$NUTCH_HOME/build
 fi
+for f in $NUTCH_HOME/build/nutch-*.job; do
+  CLASSPATH=${CLASSPATH}:$f;
+done
 if [ -d $NUTCH_HOME/build/test/classes ]; then
   CLASSPATH=${CLASSPATH}:$NUTCH_HOME/build/test/classes
 fi
@@ -96,14 +96,14 @@
 # so that filenames w/ spaces are handled correctly in loops below
 IFS=
 
-# for releases, add Nutch jar to CLASSPATH
-for f in $NUTCH_HOME/nutch-*.jar; do
+# for releases, add Nutch job to CLASSPATH
+for f in $NUTCH_HOME/nutch-*.job; do
   CLASSPATH=${CLASSPATH}:$f;
 done
 
 # add plugins to classpath
 if [ -d $NUTCH_HOME/plugins ]; then
-  CLASSPATH=${CLASSPATH}:$NUTCH_HOME
+  CLASSPATH=${NUTCH_HOME}:${CLASSPATH}
 fi
 
 # add libs to CLASSPATH

Modified: lucene/nutch/trunk/build.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/build.xml?rev=376485r1=376484r2=376485view=diff
==
--- lucene/nutch/trunk/build.xml (original)
+++ lucene/nutch/trunk/build.xml Thu Feb  9 15:20:28 2006
@@ -1,6 +1,6 @@
 ?xml version=1.0?
 
-project name=Nutch default=compile
+project name=Nutch default=job
 
   !-- Load all the default properties, and any the user wants--
   !-- to contribute (without having to type -D or edit this file --
@@ -100,7 +100,6 @@
   target name=dynamic depends=generate-src, compile
   /target
 
-
   !-- == --
   !-- Make nutch.jar --
   !-- == --
@@ -119,6 +118,21 @@
   /target
 
   !-- == --
+  !-- Make job jar   --
+  !-- == --
+  !----
+  !-- == --
+  target name=job depends=compile
+jar jarfile=${build.dir}/${final.name}.job
+  zipfileset dir=${build.classes}/
+  zipfileset dir=${conf.dir} excludes=*.template/
+  zipfileset dir=${lib.dir} prefix=lib
+  includes=**/*.jar excludes=hadoop-*.jar/
+  zipfileset dir=${build.plugins} prefix=plugins/
+/jar
+  /target
+
+  !-- == --
   !-- Make nutch.war --
   !-- == --
   !----
@@ -385,7 +399,7

svn commit: r376072 - /lucene/nutch/trunk/conf/nutch-default.xml

2006-02-08 Thread cutting
Author: cutting
Date: Wed Feb  8 13:25:30 2006
New Revision: 376072

URL: http://svn.apache.org/viewcvs?rev=376072&view=rev
Log:
Restore accidentally removed file defaults.

Modified:
lucene/nutch/trunk/conf/nutch-default.xml

Modified: lucene/nutch/trunk/conf/nutch-default.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/conf/nutch-default.xml?rev=376072r1=376071r2=376072view=diff
==
--- lucene/nutch/trunk/conf/nutch-default.xml (original)
+++ lucene/nutch/trunk/conf/nutch-default.xml Wed Feb  8 13:25:30 2006
@@ -7,6 +7,28 @@
 
 configuration
 
+!-- file properties --
+
+property
+  namefile.content.limit/name
+  value65536/value
+  descriptionThe length limit for downloaded content, in bytes.
+  If this value is larger than zero, content longer than it will be
+  truncated; otherwise (zero or negative), no truncation at all.
+  /description
+/property
+
+property
+  namefile.content.ignored/name
+  valuetrue/value
+  descriptionIf true, no file content will be saved during fetch.
+  And it is probably what we want to set most of time, since file:// URLs
+  are meant to be local and we can always use them directly at parsing
+  and indexing stages. Otherwise file contents will be saved.
+  !! NO IMPLEMENTED YET !!
+  /description
+/property
+
 !-- HTTP properties --
 
 property




svn commit: r375704 - in /lucene/nutch/trunk/lib: jetty-5.1.4.LICENSE.txt jetty-5.1.4.jar jetty-ext/

2006-02-07 Thread cutting
Author: cutting
Date: Tue Feb  7 13:02:46 2006
New Revision: 375704

URL: http://svn.apache.org/viewcvs?rev=375704&view=rev
Log:
Restoring jetty to Nutch lib: removed by mistake.

Added:
lucene/nutch/trunk/lib/jetty-5.1.4.LICENSE.txt
  - copied unchanged from r374759, 
lucene/hadoop/trunk/lib/jetty-5.1.4.LICENSE.txt
lucene/nutch/trunk/lib/jetty-5.1.4.jar
  - copied unchanged from r374759, lucene/hadoop/trunk/lib/jetty-5.1.4.jar
lucene/nutch/trunk/lib/jetty-ext/
  - copied from r374759, lucene/hadoop/trunk/lib/jetty-ext/



svn commit: r375333 - /lucene/nutch/nightly/nightly.properties

2006-02-06 Thread cutting
Author: cutting
Date: Mon Feb  6 10:57:09 2006
New Revision: 375333

URL: http://svn.apache.org/viewcvs?rev=375333&view=rev
Log:
Updated email parameters.

Modified:
lucene/nutch/nightly/nightly.properties

Modified: lucene/nutch/nightly/nightly.properties
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/nightly/nightly.properties?rev=375333r1=375332r2=375333view=diff
==
--- lucene/nutch/nightly/nightly.properties (original)
+++ lucene/nutch/nightly/nightly.properties Mon Feb  6 10:57:09 2006
@@ -1,5 +1,5 @@
-MailLogger.mailhost = mail.apache.org
-MailLogger.from = nutch-dev@incubator.apache.org
-MailLogger.failure.to = nutch-dev@incubator.apache.org
+MailLogger.mailhost = localhost
+MailLogger.from = nutch-dev@lucene.apache.org
+MailLogger.failure.to = nutch-dev@lucene.apache.org
 MailLogger.failure.subject = Nutch nightly build failure
 MailLogger.success.notify = false




svn commit: r372342 - /lucene/nutch/nightly/nightly.sh

2006-01-25 Thread cutting
Author: cutting
Date: Wed Jan 25 14:20:06 2006
New Revision: 372342

URL: http://svn.apache.org/viewcvs?rev=372342&view=rev
Log:
Fix remove command.

Modified:
lucene/nutch/nightly/nightly.sh

Modified: lucene/nutch/nightly/nightly.sh
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/nightly/nightly.sh?rev=372342r1=372341r2=372342view=diff
==
--- lucene/nutch/nightly/nightly.sh (original)
+++ lucene/nutch/nightly/nightly.sh Wed Jan 25 14:20:06 2006
@@ -25,4 +25,4 @@
 scp build/*.tar.gz $REL_SERVER:$REL_DIR/nutch-`/bin/date +%F`.tar.gz
 
 # remove all but five newest builds
-ssh $REL_SERVER rm `ssh $REL_SERVER find -type f $REL_DIR | sort -r | tail +5`
+ssh $REL_SERVER rm `ssh $REL_SERVER find $REL_DIR -type f | sort -r | tail +5`




svn commit: r370632 - /lucene/nutch/trunk/conf/nutch-default.xml

2006-01-19 Thread cutting
Author: cutting
Date: Thu Jan 19 12:58:54 2006
New Revision: 370632

URL: http://svn.apache.org/viewcvs?rev=370632&view=rev
Log:
Switch default to protocol-http, since it seems more reliable than 
protocol-httpclient.

Modified:
lucene/nutch/trunk/conf/nutch-default.xml

Modified: lucene/nutch/trunk/conf/nutch-default.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/conf/nutch-default.xml?rev=370632r1=370631r2=370632view=diff
==
--- lucene/nutch/trunk/conf/nutch-default.xml (original)
+++ lucene/nutch/trunk/conf/nutch-default.xml Thu Jan 19 12:58:54 2006
@@ -733,7 +733,7 @@
 
 property
   nameplugin.includes/name
-  
valueprotocol-httpclient|urlfilter-regex|parse-(text|html|js)|index-basic|query-(basic|site|url)/value
+  
valueprotocol-http|urlfilter-regex|parse-(text|html|js)|index-basic|query-(basic|site|url)/value
   descriptionRegular expression naming plugin directory names to
   include.  Any plugin not matching this expression is excluded.
   In any case you need at least include the nutch-extensionpoints plugin. By




svn commit: r370638 - /lucene/nutch/trunk/conf/nutch-default.xml

2006-01-19 Thread cutting
Author: cutting
Date: Thu Jan 19 13:24:58 2006
New Revision: 370638

URL: http://svn.apache.org/viewcvs?rev=370638&view=rev
Log:
Document a few more properties.  Contributed by Dominik Friedrich.

Modified:
lucene/nutch/trunk/conf/nutch-default.xml

Modified: lucene/nutch/trunk/conf/nutch-default.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/conf/nutch-default.xml?rev=370638r1=370637r2=370638view=diff
==
--- lucene/nutch/trunk/conf/nutch-default.xml (original)
+++ lucene/nutch/trunk/conf/nutch-default.xml Thu Jan 19 13:24:58 2006
@@ -379,6 +379,14 @@
   exception./description
 /property
   
+property
+  nameio.map.index.skip/name
+  value0/value
+  descriptionNumber of index entries to skip between each entry.
+  Zero by default. Setting this to values larger than zero can
+  facilitate opening large map files using less memory./description
+/property
+
 !-- file system properties --
 
 property
@@ -412,6 +420,14 @@
   directories, typically on different devices./description
 /property
 
+property
+  namendfs.replication/name
+  value3/value
+  descriptionHow many copies we try to have at all times. The actual
+  number of replications is at max the number of datanodes in the
+  cluster./description
+/property
+
 !-- map/reduce properties --
 
 property
@@ -509,6 +525,13 @@
   value200m/value
   descriptionThe heap size (-Xmx) that will be used for task tracker
   child processes./description
+/property
+
+property
+  namemapred.combine.buffer.size/name
+  value10/value
+  descriptionThe number of entries the combining collector caches before
+  combining them and writing to disk./description
 /property
 
 !-- indexer properties --




svn commit: r370657 - in /lucene/nutch/nightly: nightly.cron nightly.properties nightly.sh

2006-01-19 Thread cutting
Author: cutting
Date: Thu Jan 19 14:46:28 2006
New Revision: 370657

URL: http://svn.apache.org/viewcvs?rev=370657&view=rev
Log:
Moving nightly build to lucene.zones.apache.org.

Modified:
lucene/nutch/nightly/nightly.cron
lucene/nutch/nightly/nightly.properties
lucene/nutch/nightly/nightly.sh

Modified: lucene/nutch/nightly/nightly.cron
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/nightly/nightly.cron?rev=370657r1=370656r2=370657view=diff
==
--- lucene/nutch/nightly/nightly.cron (original)
+++ lucene/nutch/nightly/nightly.cron Thu Jan 19 14:46:28 2006
@@ -1,4 +1,4 @@
 # nightly crontab file
 # install with: crontab nightly.cron
 # run seventeen minutes after midnight, every day
-17 0 * * *   $HOME/src/nutch/nightly/nightly.sh  
$HOME/src/nutch/nightly/nightly.log 21
+17 0 * * *   $HOME/nutch-nightly/nightly.sh  
$HOME/nutch-nightly/nightly.log 21

Modified: lucene/nutch/nightly/nightly.properties
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/nightly/nightly.properties?rev=370657r1=370656r2=370657view=diff
==
--- lucene/nutch/nightly/nightly.properties (original)
+++ lucene/nutch/nightly/nightly.properties Thu Jan 19 14:46:28 2006
@@ -1,4 +1,4 @@
-MailLogger.mailhost = smtp.sbcglobal.net
+MailLogger.mailhost = mail.apache.org
 MailLogger.from = nutch-dev@incubator.apache.org
 MailLogger.failure.to = nutch-dev@incubator.apache.org
 MailLogger.failure.subject = Nutch nightly build failure

Modified: lucene/nutch/nightly/nightly.sh
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/nightly/nightly.sh?rev=370657r1=370656r2=370657view=diff
==
--- lucene/nutch/nightly/nightly.sh (original)
+++ lucene/nutch/nightly/nightly.sh Thu Jan 19 14:46:28 2006
@@ -1,6 +1,6 @@
 #!/bin/bash -vx
 
-export JAVA_HOME=$HOME/local/j2sdk1.4.2
+export JAVA_HOME=/usr/j2se
 
 TRUNK=http://svn.apache.org/repos/asf/lucene/nutch/trunk
 
@@ -12,12 +12,12 @@
 cd /tmp
 
 # export sources into it
-svn export $TRUNK nutch-nightly
+$HOME/bin/svn export $TRUNK nutch-nightly
 
 # run build
 cd nutch-nightly
-$HOME/local/ant/bin/ant \
- -propertyfile $HOME/src/nutch/nightly/nightly.properties \
+$HOME/bin/ant \
+ -propertyfile $HOME/nutch-nightly/nightly.properties \
  -logger org.apache.tools.ant.listener.MailLogger \
  -Dversion=nightly nightly
 




svn commit: r370281 - /lucene/nutch/trunk/build.xml

2006-01-18 Thread cutting
Author: cutting
Date: Wed Jan 18 14:03:28 2006
New Revision: 370281

URL: http://svn.apache.org/viewcvs?rev=370281&view=rev
Log:
Fix NUTCH-102: include webapps in packaged releases.

Modified:
lucene/nutch/trunk/build.xml

Modified: lucene/nutch/trunk/build.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/build.xml?rev=370281r1=370280r2=370281view=diff
==
--- lucene/nutch/trunk/build.xml (original)
+++ lucene/nutch/trunk/build.xml Wed Jan 18 14:03:28 2006
@@ -377,6 +377,10 @@
   fileset dir=lib/
 /copy
 
+copy todir=${dist.dir}/webapps
+  fileset dir=${build.webapps}/
+/copy
+
 copy todir=${dist.dir}/plugins
   fileset dir=${build.plugins}/
 /copy




svn commit: r367406 - in /lucene/nutch/trunk/src: java/org/apache/nutch/ipc/RPC.java test/org/apache/nutch/ipc/TestRPC.java

2006-01-09 Thread cutting
Author: cutting
Date: Mon Jan  9 13:50:48 2006
New Revision: 367406

URL: http://svn.apache.org/viewcvs?rev=367406&view=rev
Log:
Fix parallel RPC calls to work correctly with methods that return void.

Modified:
lucene/nutch/trunk/src/java/org/apache/nutch/ipc/RPC.java
lucene/nutch/trunk/src/test/org/apache/nutch/ipc/TestRPC.java

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/ipc/RPC.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/ipc/RPC.java?rev=367406r1=367405r2=367406view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/ipc/RPC.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/ipc/RPC.java Mon Jan  9 
13:50:48 2006
@@ -149,6 +149,10 @@
 
 Writable[] wrappedValues = CLIENT.call(invocations, addrs);
 
+if (method.getReturnType() == Void.TYPE) {
+  return null;
+}
+
 Object[] values =
   (Object[])Array.newInstance(method.getReturnType(),wrappedValues.length);
 for (int i = 0; i < values.length; i++)

Modified: lucene/nutch/trunk/src/test/org/apache/nutch/ipc/TestRPC.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/test/org/apache/nutch/ipc/TestRPC.java?rev=367406r1=367405r2=367406view=diff
==
--- lucene/nutch/trunk/src/test/org/apache/nutch/ipc/TestRPC.java (original)
+++ lucene/nutch/trunk/src/test/org/apache/nutch/ipc/TestRPC.java Mon Jan  9 
13:50:48 2006
@@ -110,13 +110,17 @@
 }
 assertTrue(caught);
 
-// try a multi-call
-Method method =
+// try some multi-calls
+Method echo =
   TestProtocol.class.getMethod(echo, new Class[] { String.class });
-String[] values = (String[])RPC.call(method, new String[][]{{a},{b}},
+String[] strings = (String[])RPC.call(echo, new String[][]{{a},{b}},
  new InetSocketAddress[] {addr, addr});
-assertTrue(Arrays.equals(values, new String[]{a,b}));
+assertTrue(Arrays.equals(strings, new String[]{a,b}));
 
+Method ping = TestProtocol.class.getMethod(ping, new Class[] {});
+Object[] voids = (Object[])RPC.call(ping, new Object[][]{{},{}},
+new InetSocketAddress[] {addr, addr});
+assertEquals(voids, null);
 
 server.stop();
   }




svn commit: r367408 - /lucene/nutch/trunk/src/plugin/urlfilter-regex/src/java/org/apache/nutch/net/RegexURLFilter.java

2006-01-09 Thread cutting
Author: cutting
Date: Mon Jan  9 13:55:31 2006
New Revision: 367408

URL: http://svn.apache.org/viewcvs?rev=367408&view=rev
Log:
NUTCH-160: Switch RegexURLFilter to use Java regex's rather than oro, since 
Java's seem to be faster & more reliable.  By Rod Taylor.

Modified:

lucene/nutch/trunk/src/plugin/urlfilter-regex/src/java/org/apache/nutch/net/RegexURLFilter.java

Modified: 
lucene/nutch/trunk/src/plugin/urlfilter-regex/src/java/org/apache/nutch/net/RegexURLFilter.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/urlfilter-regex/src/java/org/apache/nutch/net/RegexURLFilter.java?rev=367408r1=367407r2=367408view=diff
==
--- 
lucene/nutch/trunk/src/plugin/urlfilter-regex/src/java/org/apache/nutch/net/RegexURLFilter.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/urlfilter-regex/src/java/org/apache/nutch/net/RegexURLFilter.java
 Mon Jan  9 13:55:31 2006
@@ -32,12 +32,7 @@
 import java.util.ArrayList;
 import java.util.Iterator;
 import java.util.logging.Logger;
-
-import org.apache.oro.text.regex.Perl5Compiler;
-import org.apache.oro.text.regex.Perl5Matcher;
-import org.apache.oro.text.regex.Perl5Pattern;
-import org.apache.oro.text.regex.PatternMatcher;
-import org.apache.oro.text.regex.MalformedPatternException;
+import java.util.regex.*;
 
 /**
  * Filters URLs based on a file of regular expressions. The file is named by
@@ -80,15 +75,14 @@
   }
 
   private static class Rule {
-public Perl5Pattern pattern;
+public Pattern pattern;
 public boolean sign;
 public String regex;
   }
 
   private List rules;
-  private PatternMatcher matcher = new Perl5Matcher();
 
-  public RegexURLFilter() throws IOException, MalformedPatternException {
+  public RegexURLFilter() throws IOException, PatternSyntaxException {
 String file = NutchConf.get().get(urlfilter.regex.file);
 // attribute file takes precedence if defined
 if (attributeFile != null)
@@ -103,7 +97,7 @@
   }
 
   public RegexURLFilter(String filename)
-throws IOException, MalformedPatternException {
+throws IOException, PatternSyntaxException {
 rules = readConfigurationFile(new FileReader(filename));
   }
 
@@ -111,7 +105,9 @@
 Iterator i=rules.iterator();
 while(i.hasNext()) {
   Rule r=(Rule) i.next();
-  if (matcher.contains(url,r.pattern)) {
+  Matcher matcher = r.pattern.matcher(url);
+
+  if (matcher.find()) {
 //System.out.println(Matched  + r.regex);
 return r.sign ? url : null;
   }
@@ -129,10 +125,9 @@
   // 
 
   private static List readConfigurationFile(Reader reader)
-throws IOException, MalformedPatternException {
+throws IOException, PatternSyntaxException {
 
 BufferedReader in=new BufferedReader(reader);
-Perl5Compiler compiler=new Perl5Compiler();
 List rules=new ArrayList();
 String line;

@@ -157,7 +152,7 @@
   String regex=line.substring(1);
 
   Rule rule=new Rule();
-  rule.pattern=(Perl5Pattern) compiler.compile(regex);
+  rule.pattern=Pattern.compile(regex);
   rule.sign=sign;
   rule.regex=regex;
   rules.add(rule);
@@ -167,7 +162,7 @@
   }
 
   public static void main(String args[])
-throws IOException, MalformedPatternException {
+throws IOException, PatternSyntaxException {
 
 RegexURLFilter filter=new RegexURLFilter();
 BufferedReader in=new BufferedReader(new InputStreamReader(System.in));




svn commit: r366550 - /lucene/nutch/trunk/src/java/org/apache/nutch/ipc/Client.java

2006-01-06 Thread cutting
Author: cutting
Date: Fri Jan  6 11:14:46 2006
New Revision: 366550

URL: http://svn.apache.org/viewcvs?rev=366550&view=rev
Log:
Make it clearer why this optimization is valid.  For Stefan.

Modified:
lucene/nutch/trunk/src/java/org/apache/nutch/ipc/Client.java

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/ipc/Client.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/ipc/Client.java?rev=366550r1=366549r2=366550view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/ipc/Client.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/ipc/Client.java Fri Jan  6 
11:14:46 2006
@@ -306,7 +306,7 @@
* contains nulls for calls that timed out or errored.  */
   public Writable[] call(Writable[] params, InetSocketAddress[] addresses)
 throws IOException {
-if (params.length == 0) return new Writable[0];
+if (addresses.length == 0) return new Writable[0];
 
 ParallelResults results = new ParallelResults(params.length);
 synchronized (results) {




svn commit: r366242 - in /lucene/nutch/trunk: conf/nutch-default.xml src/java/org/apache/nutch/searcher/LuceneQueryOptimizer.java

2006-01-05 Thread cutting
Author: cutting
Date: Thu Jan  5 10:38:44 2006
New Revision: 366242

URL: http://svn.apache.org/viewcvs?rev=366242&view=rev
Log:
Fix NegativeArraySizeException.

Modified:
lucene/nutch/trunk/conf/nutch-default.xml

lucene/nutch/trunk/src/java/org/apache/nutch/searcher/LuceneQueryOptimizer.java

Modified: lucene/nutch/trunk/conf/nutch-default.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/conf/nutch-default.xml?rev=366242r1=366241r2=366242view=diff
==
--- lucene/nutch/trunk/conf/nutch-default.xml (original)
+++ lucene/nutch/trunk/conf/nutch-default.xml Thu Jan  5 10:38:44 2006
@@ -661,10 +661,11 @@
 
 property
   namesearcher.max.hits/name
-  value2147483647/value
-  descriptionSearch stops after this many hits are found.  Setting
-  this to smaller values can make searches much faster.  With a sorted
-  index, the quality of the hits suffers little./description
+  value-1/value
+  descriptionIf positive, search stops after this many hits are
+  found.  Setting this to small, positive values (e.g., 1000) can make
+  searches much faster.  With a sorted index, the quality of the hits
+  suffers little./description
 /property
 
 !-- URL normalizer properties --

Modified: 
lucene/nutch/trunk/src/java/org/apache/nutch/searcher/LuceneQueryOptimizer.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/searcher/LuceneQueryOptimizer.java?rev=366242r1=366241r2=366242view=diff
==
--- 
lucene/nutch/trunk/src/java/org/apache/nutch/searcher/LuceneQueryOptimizer.java 
(original)
+++ 
lucene/nutch/trunk/src/java/org/apache/nutch/searcher/LuceneQueryOptimizer.java 
Thu Jan  5 10:38:44 2006
@@ -37,8 +37,7 @@
  * which do not affect ranking but might otherwise slow search considerably. */
 class LuceneQueryOptimizer {
 
-  private static int MAX_HITS =
-NutchConf.get().getInt(searcher.max.hits, Integer.MAX_VALUE);
+  private static int MAX_HITS = NutchConf.get().getInt(searcher.max.hits,-1);
 
   private static class LimitExceeded extends RuntimeException {
 private int maxDoc;
@@ -150,6 +149,13 @@
   }
 }
 if (sortField == null && !reverse) {
+
+  // no hit limit
+  if (MAX_HITS <= 0) {
+return searcher.search(query, filter, numHits);
+  }
+
+  // hits limited -- use a LimitedCollector
   LimitedCollector collector = new LimitedCollector(numHits, MAX_HITS);
   LimitExceeded exceeded = null;
   try {




svn commit: r366271 - /lucene/nutch/trunk/src/java/org/apache/nutch/mapred/TaskTracker.java

2006-01-05 Thread cutting
Author: cutting
Date: Thu Jan  5 12:13:43 2006
New Revision: 366271

URL: http://svn.apache.org/viewcvs?rev=366271&view=rev
Log:
Fix for NUTCH-108: eliminate voluminous messages when reconnecting.
From Paul Baclace.

Modified:
lucene/nutch/trunk/src/java/org/apache/nutch/mapred/TaskTracker.java

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/mapred/TaskTracker.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/mapred/TaskTracker.java?rev=366271r1=366270r2=366271view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/mapred/TaskTracker.java 
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/mapred/TaskTracker.java Thu 
Jan  5 12:13:43 2006
@@ -287,8 +287,7 @@
 staleState = true;
 }
 } catch (Exception ex) {
-ex.printStackTrace();
-LOG.info(Lost connection to JobTracker [ + 
jobTrackAddr + ].  Retrying...);
+LOG.info(Lost connection to JobTracker [ + 
jobTrackAddr + ]. ex= + ex +   Retrying...);
 try {
 Thread.sleep(5000);
 } catch (InterruptedException ie) {




svn commit: r366322 - /lucene/nutch/trunk/src/java/org/apache/nutch/searcher/LuceneQueryOptimizer.java

2006-01-05 Thread cutting
Author: cutting
Date: Thu Jan  5 14:37:19 2006
New Revision: 366322

URL: http://svn.apache.org/viewcvs?rev=366322&view=rev
Log:
Fix a bug in LimitedCollector.

Modified:

lucene/nutch/trunk/src/java/org/apache/nutch/searcher/LuceneQueryOptimizer.java

Modified: 
lucene/nutch/trunk/src/java/org/apache/nutch/searcher/LuceneQueryOptimizer.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/searcher/LuceneQueryOptimizer.java?rev=366322r1=366321r2=366322view=diff
==
--- 
lucene/nutch/trunk/src/java/org/apache/nutch/searcher/LuceneQueryOptimizer.java 
(original)
+++ 
lucene/nutch/trunk/src/java/org/apache/nutch/searcher/LuceneQueryOptimizer.java 
Thu Jan  5 14:37:19 2006
@@ -48,7 +48,7 @@
 private int maxHits;
 
 public LimitedCollector(int numHits, int maxHits) {
-  super(maxHits);
+  super(numHits);
   this.maxHits = maxHits;
 }
 




svn commit: r357197 [5/5] - in /lucene/nutch: branches/mapred/ trunk/ trunk/bin/ trunk/conf/ trunk/lib/ trunk/lib/jetty-ext/ trunk/site/ trunk/src/java/org/apache/nutch/crawl/ trunk/src/java/org/apach

2005-12-16 Thread cutting
Modified: 
lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/RobotRulesParser.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/RobotRulesParser.java?rev=357197r1=357196r2=357197view=diff
==
--- 
lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/RobotRulesParser.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/RobotRulesParser.java
 Fri Dec 16 09:51:05 2005
@@ -53,6 +53,9 @@
   private static final boolean ALLOW_FORBIDDEN =
 NutchConf.get().getBoolean(http.robots.403.allow, false);
 
+  private static final int MAX_REDIRECTS =
+NutchConf.get().getInt(http.redirect.max, 3);
+
   private static final String[] AGENTS = getAgents();
   private static final Hashtable CACHE = new Hashtable();
   
@@ -377,16 +380,30 @@
 RobotRuleSet robotRules = (RobotRuleSet)CACHE.get(host);
 
 if (robotRules == null) { // cache miss
-  HttpResponse response = new HttpResponse(new URL(url, /robots.txt));
-
-  if (response.getCode() == 200)   // found rules: parse them
-robotRules = new RobotRulesParser().parseRules(response.getContent());
-  else if ( (response.getCode() == 403)  (!ALLOW_FORBIDDEN) )
-robotRules = FORBID_ALL_RULES;// use forbid all
-  else
-robotRules = EMPTY_RULES; // use default rules
+  int redirects = 0;
+  do {
+HttpResponse response = new HttpResponse(new URL(url, /robots.txt));
+
+int code = response.getCode();
+
+if (code == 200) {// found rules: parse them
+  robotRules = new 
RobotRulesParser().parseRules(response.getContent());
+} else if ( (code == 403)  (!ALLOW_FORBIDDEN) ) {
+  robotRules = FORBID_ALL_RULES;  // use forbid all
+} else if (code = 300  code  400) {   // handle redirect
+  if (redirects == MAX_REDIRECTS) {
+robotRules = EMPTY_RULES;
+  } else {
+url = new URL(url, response.getHeader(Location));
+LOG.fine(redirect to  + url); 
+redirects++;
+  }
+} else {
+  robotRules = EMPTY_RULES; // use default rules
+}
+  } while (robotRules == null);
 
-  CACHE.put(host, robotRules);// cache rules for host
+  CACHE.put(host, robotRules);  // cache rules for host
 }
 
 String path = url.getPath();  // check rules

Modified: 
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java?rev=357197&r1=357196&r2=357197&view=diff
==
--- 
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java
 Fri Dec 16 09:51:05 2005
@@ -60,10 +60,14 @@
   }
 
   public HttpResponse(URL url) throws IOException {
+this(url, false);
+  }
+
+  HttpResponse(URL url, boolean followRedirects) throws IOException {
 this.base = url.toString();
 this.orig = url.toString();
 GetMethod get = new GetMethod(this.orig);
-get.setFollowRedirects(false);
+get.setFollowRedirects(followRedirects);
 get.setRequestHeader(User-Agent, Http.AGENT_STRING);
 HttpMethodParams params = get.getParams();
 // some servers cannot digest the new protocol

Modified: 
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/RobotRulesParser.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/RobotRulesParser.java?rev=357197&r1=357196&r2=357197&view=diff
==
--- 
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/RobotRulesParser.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/RobotRulesParser.java
 Fri Dec 16 09:51:05 2005
@@ -379,7 +379,8 @@
 if (robotRules == null) { // cache miss
   LOG.fine(cache miss  + url);
   try {
-HttpResponse response = new HttpResponse(new URL(url, /robots.txt));
+HttpResponse response = new HttpResponse(new URL(url, /robots.txt),
+ true);
 
 

svn commit: r348210 - /lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/NFSDataInputStream.java

2005-11-22 Thread cutting
Author: cutting
Date: Tue Nov 22 10:46:43 2005
New Revision: 348210

URL: http://svn.apache.org/viewcvs?rev=348210&view=rev
Log:
Silently ignore missing checksum files.

Modified:

lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/NFSDataInputStream.java

Modified: 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/NFSDataInputStream.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/NFSDataInputStream.java?rev=348210&r1=348209&r2=348210&view=diff
==
--- 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/NFSDataInputStream.java
 (original)
+++ 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/NFSDataInputStream.java
 Tue Nov 22 10:46:43 2005
@@ -54,7 +54,9 @@
 if (!Arrays.equals(version, VERSION))
   throw new IOException(Not a checksum file: +sumFile);
 bytesPerSum = sums.readInt();
-  } catch (IOException e) {
+  } catch (FileNotFoundException e) { // quietly ignore
+stopSumming();
+  } catch (IOException e) {   // loudly ignore
 LOG.warning(Problem opening checksum file: +e+. Ignoring.);
 stopSumming();
   }




svn commit: r348212 - in /lucene/nutch/branches/mapred/conf: crawl-tool.xml nutch-default.xml

2005-11-22 Thread cutting
Author: cutting
Date: Tue Nov 22 10:55:26 2005
New Revision: 348212

URL: http://svn.apache.org/viewcvs?rev=348212&view=rev
Log:
Increase defaults for http.max.delays, since, with MapReduce's partitioning of 
fetchlists, delays are more likely.

Modified:
lucene/nutch/branches/mapred/conf/crawl-tool.xml
lucene/nutch/branches/mapred/conf/nutch-default.xml

Modified: lucene/nutch/branches/mapred/conf/crawl-tool.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/conf/crawl-tool.xml?rev=348212&r1=348211&r2=348212&view=diff
==
--- lucene/nutch/branches/mapred/conf/crawl-tool.xml (original)
+++ lucene/nutch/branches/mapred/conf/crawl-tool.xml Tue Nov 22 10:55:26 2005
@@ -33,7 +33,7 @@
 
 property
   namehttp.max.delays/name
-  value100/value
+  value1000/value
   descriptionThe number of times a thread will delay when trying to
   fetch a page.  When using the crawl tool there are likely to be very
   few different hosts, so we need to be willing to wait longer for

Modified: lucene/nutch/branches/mapred/conf/nutch-default.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/conf/nutch-default.xml?rev=348212&r1=348211&r2=348212&view=diff
==
--- lucene/nutch/branches/mapred/conf/nutch-default.xml (original)
+++ lucene/nutch/branches/mapred/conf/nutch-default.xml Tue Nov 22 10:55:26 2005
@@ -69,7 +69,7 @@
 
 property
   namehttp.max.delays/name
-  value3/value
+  value100/value
   descriptionThe number of times a thread will delay when trying to
   fetch a page.  Each time it finds that a host is busy, it will wait
   fetcher.server.delay.  After http.max.delays attepts, it will give




svn commit: r332371 - /lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Generator.java

2005-11-10 Thread cutting
Author: cutting
Date: Thu Nov 10 13:03:16 2005
New Revision: 332371

URL: http://svn.apache.org/viewcvs?rev=332371&view=rev
Log:
Fix to not increment count of urls when urls are filtered by
maxPerHost limit.  Patch contributed by Rod Taylor.

Modified:
lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Generator.java

Modified: 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Generator.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Generator.java?rev=332371&r1=332370&r2=332371&view=diff
==
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Generator.java 
(original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Generator.java 
Thu Nov 10 13:03:16 2005
@@ -76,23 +76,27 @@
OutputCollector output, Reporter reporter)
   throws IOException {
 
-  while (values.hasNext()  ++count  limit) {
+  while (values.hasNext()  count  limit) {
 
 UTF8 url = (UTF8)values.next();
 
-if (maxPerHost  0) {   // are we counting hosts?
+if (maxPerHost  0) { // are we counting hosts?
   String host = new URL(url.toString()).getHost();
-  Integer count = (Integer)hostCounts.get(host);
-  if (count != null) {
-if (count.intValue() = maxPerHost)
+  Integer hostCount = (Integer)hostCounts.get(host);
+  if (hostCount != null) {
+if (hostCount.intValue() = maxPerHost)
   continue;   // too many from host
-hostCounts.put(host, new Integer(count.intValue()+1));
+hostCounts.put(host, new Integer(hostCount.intValue()+1));
   } else {// update host count
 hostCounts.put(host, new Integer(1));
   }
 }
 
 output.collect(key, url);
+
+// Count is incremented only when we keep the URL
+// maxPerHost may cause us to skip it.
+count++;
   }
 
 }




svn commit: r328414 - /lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/ParseOutputFormat.java

2005-10-25 Thread cutting
Author: cutting
Date: Tue Oct 25 09:57:51 2005
New Revision: 328414

URL: http://svn.apache.org/viewcvs?rev=328414&view=rev
Log:
Fix a type error for JDK 1.4.

Modified:

lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/ParseOutputFormat.java

Modified: 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/ParseOutputFormat.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/ParseOutputFormat.java?rev=328414&r1=328413&r2=328414&view=diff
==
--- 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/ParseOutputFormat.java
 (original)
+++ 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/ParseOutputFormat.java
 Tue Oct 25 09:57:51 2005
@@ -66,7 +66,8 @@
   Outlink[] links = parse.getData().getOutlinks();
 
   // compute OPIC score contribution
-  float score = Float.valueOf(parse.getData().get(Fetcher.SCORE_KEY));
+  float score =
+Float.parseFloat(parse.getData().get(Fetcher.SCORE_KEY));
   score /= links.length;
   
   for (int i = 0; i  links.length; i++) {




svn commit: r327572 - /lucene/nutch/branches/mapred/bin/slaves.sh

2005-10-21 Thread cutting
Author: cutting
Date: Fri Oct 21 13:45:32 2005
New Revision: 327572

URL: http://svn.apache.org/viewcvs?rev=327572&view=rev
Log:
Tag standard error with hostname too.

Modified:
lucene/nutch/branches/mapred/bin/slaves.sh

Modified: lucene/nutch/branches/mapred/bin/slaves.sh
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/bin/slaves.sh?rev=327572&r1=327571&r2=327572&view=diff
==
--- lucene/nutch/branches/mapred/bin/slaves.sh (original)
+++ lucene/nutch/branches/mapred/bin/slaves.sh Fri Oct 21 13:45:32 2005
@@ -21,7 +21,7 @@
 
 for slave in `cat $NUTCH_SLAVES`; do
  ssh -o ConnectTimeout=1 -o SendEnv='NUTCH* JAVA*' $slave $@ \
-   | sed s/^/$slave: / 
+   21 | sed s/^/$slave: / 
 done
 
 wait




svn commit: r327581 - in /lucene/nutch/branches/mapred/src/plugin/parse-html/src: java/org/apache/nutch/parse/html/DOMContentUtils.java test/org/apache/nutch/parse/html/TestDOMContentUtils.java

2005-10-21 Thread cutting
Author: cutting
Date: Fri Oct 21 14:04:54 2005
New Revision: 327581

URL: http://svn.apache.org/viewcvs?rev=327581&view=rev
Log:
Ignore rel=nofollow links.

Modified:

lucene/nutch/branches/mapred/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java

lucene/nutch/branches/mapred/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java

Modified: 
lucene/nutch/branches/mapred/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java?rev=327581&r1=327580&r2=327581&view=diff
==
--- 
lucene/nutch/branches/mapred/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
 (original)
+++ 
lucene/nutch/branches/mapred/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
 Fri Oct 21 14:04:54 2005
@@ -306,13 +306,21 @@
 
   NamedNodeMap attrs = node.getAttributes();
   String target = null;
+  boolean noFollow = false;
   for (int i= 0; i  attrs.getLength(); i++ ) {
-if (params.attrName.equalsIgnoreCase(attrs.item(i).getNodeName())) 
{
-  target = attrs.item(i).getNodeValue();
-  break;
+Node attr = attrs.item(i);
+String attrName = attr.getNodeName();
+
+if (rel.equalsIgnoreCase(attrName) 
+nofollow.equalsIgnoreCase(attr.getNodeValue())) {
+  noFollow = true;
+}
+
+if (params.attrName.equalsIgnoreCase(attrName)) {
+  target = attr.getNodeValue();
 }
   }
-  if (target != null)
+  if (target != null  !noFollow)
 try {
   URL url = new URL(base, target);
   outlinks.add(new Outlink(url.toString(),

Modified: 
lucene/nutch/branches/mapred/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java?rev=327581&r1=327580&r2=327581&view=diff
==
--- 
lucene/nutch/branches/mapred/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java
 (original)
+++ 
lucene/nutch/branches/mapred/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java
 Fri Oct 21 14:04:54 2005
@@ -113,6 +113,12 @@
+ h2End\tthis\rmadness\n!/h2\r\n
+  ... .
+ /body  /html),
+
+// test that a rel=nofollow links are not returned
+new String(htmlhead/headbody
+   + a href=\http://www.nutch.org\; rel=\nofollow\ ignore 
/a
+   + a rel=\nofollow\ href=\http://www.nutch.org\; ignore 
/a
+   + /body/html),
   };
 
   private static String[] testBaseHrefs= {
@@ -123,6 +129,7 @@
 http://www.nutch.org/frames/;, 
 http://www.nutch.org/maps/;,
 http://www.nutch.org/whitespace/;,
+http://www.nutch.org//;,
   };
   
   private static final DocumentFragment testDOMs[]=
@@ -145,6 +152,7 @@
 + one two three space here space there no space 
 + one two two three three four put some text here and there. 
 + End this madness ! . . . .,
+ignore ignore,
   };
 
   private static final String[] answerTitle= {
@@ -155,6 +163,7 @@
 my title,
 my title,
 my title,
+,
   };
 
   // note: should be in page-order
@@ -214,6 +223,8 @@
  {
  new Outlink(http://www.nutch.org/index.html;, whitespace test),
  },
+ {
+ }
   };

 } catch (MalformedURLException e) {




svn commit: r327593 - /lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/TaskRunner.java

2005-10-21 Thread cutting
Author: cutting
Date: Fri Oct 21 15:07:00 2005
New Revision: 327593

URL: http://svn.apache.org/viewcvs?rev=327593&view=rev
Log:
Always create workdir so child can connect to it.

Modified:

lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/TaskRunner.java

Modified: 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/TaskRunner.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/TaskRunner.java?rev=327593&r1=327592&r2=327593&view=diff
==
--- 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/TaskRunner.java 
(original)
+++ 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/TaskRunner.java 
Fri Oct 21 15:07:00 2005
@@ -63,6 +63,7 @@
 
   String sep = System.getProperty(path.separator);
   File workDir = new File(new File(t.getJobFile()).getParent(), work);
+  workDir.mkdirs();

   StringBuffer classPath = new StringBuffer();
   // start with same classpath as parent process
@@ -72,7 +73,6 @@
   JobConf job = new JobConf(t.getJobFile());
   String jar = job.getJar();
   if (jar != null) {  // if jar exists, it into workDir
-workDir.mkdirs();
 runChild(new String[] { unzip, jar}, workDir);
 File[] libs = new File(workDir, lib).listFiles();
 for (int i = 0; i  libs.length; i++) {




svn commit: r326007 - /lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Indexer.java

2005-10-17 Thread cutting
Author: cutting
Date: Mon Oct 17 18:08:07 2005
New Revision: 326007

URL: http://svn.apache.org/viewcvs?rev=326007&view=rev
Log:
Fix bogus javadoc.

Modified:
lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Indexer.java

Modified: 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Indexer.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Indexer.java?rev=326007&r1=326006&r2=326007&view=diff
==
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Indexer.java 
(original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Indexer.java 
Mon Oct 17 18:08:07 2005
@@ -36,7 +36,7 @@
 import org.apache.lucene.index.*;
 import org.apache.lucene.document.*;
 
-/** Maintains an inverted link map, listing incoming links for each url. */
+/** Create indexes for segments. */
 public class Indexer extends NutchConfigured implements Reducer {
 
   public static final Logger LOG =




svn commit: r320835 - in /lucene/nutch/branches/mapred/src: java/org/apache/nutch/db/ java/org/apache/nutch/fs/ java/org/apache/nutch/indexer/ java/org/apache/nutch/io/ java/org/apache/nutch/mapred/ j

2005-10-13 Thread cutting
Author: cutting
Date: Thu Oct 13 10:59:30 2005
New Revision: 320835

URL: http://svn.apache.org/viewcvs?rev=320835&view=rev
Log:
Store checksums for all files written and verify them on read.  CRCs are stored 
for every 512 bytes of data, so that randomly accessed data may be verified.  
Errors are reported to the filesystem implementation.  Local file errors cause 
files to be moved to a bad file directory, so that bad disk areas are not 
reused.  NDFS file errors should cause blocks to be moved to a bad block 
directory on the datanode, forcing the use of replicas of the bad blocks with 
no loss of data.  This is not yet implemented for NDFS.

Modified:

lucene/nutch/branches/mapred/src/java/org/apache/nutch/db/DistributedWebDBWriter.java
lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/FileUtil.java

lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/LocalFileSystem.java

lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/NDFSFileSystem.java

lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/NFSDataInputStream.java

lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/NFSDataOutputStream.java

lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/NFSInputStream.java

lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/NutchFileSystem.java

lucene/nutch/branches/mapred/src/java/org/apache/nutch/indexer/NdfsDirectory.java
lucene/nutch/branches/mapred/src/java/org/apache/nutch/io/SequenceFile.java
lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/JobClient.java

lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/LocalJobRunner.java

lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/MapOutputFile.java

lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/TextInputFormat.java

lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/TextOutputFormat.java
lucene/nutch/branches/mapred/src/java/org/apache/nutch/ndfs/DF.java

lucene/nutch/branches/mapred/src/test/org/apache/nutch/fs/TestNutchFileSystem.java

Modified: 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/db/DistributedWebDBWriter.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/db/DistributedWebDBWriter.java?rev=320835&r1=320834&r2=320835&view=diff
==
--- 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/db/DistributedWebDBWriter.java
 (original)
+++ 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/db/DistributedWebDBWriter.java
 Thu Oct 13 10:59:30 2005
@@ -1656,7 +1656,7 @@
 }
 
 // Bump number by 1.
-DataOutputStream out = new DataOutputStream(nfs.create(openCounter, 
true));
+DataOutputStream out = nfs.create(openCounter);
 try {
 out.write(OPEN_COUNTER_VERSION);
 out.writeInt(numOpens + 1);
@@ -1793,7 +1793,7 @@
 // 7. Finally, write out the total num of pages and links
 //
 File sectionStats = new File(newSectionDir, STATS_FILE);
-DataOutputStream out = new DataOutputStream(nfs.create(sectionStats, 
true));
+DataOutputStream out = nfs.create(sectionStats);
 try {
 //
 // These counts are guaranteed to be correct; they're
@@ -1854,7 +1854,7 @@
 }
 
 // Bump that number by 1.
-out = new DataOutputStream(nfs.create(closeCounter, true));
+out = nfs.create(closeCounter);
 try {
 out.write(CLOSE_COUNTER_VERSION);
 out.writeInt(numCloses + 1);

Modified: 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/FileUtil.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/FileUtil.java?rev=320835&r1=320834&r2=320835&view=diff
==
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/FileUtil.java 
(original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/FileUtil.java Thu 
Oct 13 10:59:30 2005
@@ -54,9 +54,9 @@
 }
 
 if (nfs.isFile(src)) {
-DataInputStream in = new DataInputStream(nfs.open(src));
+NFSInputStream in = nfs.openRaw(src);
 try {
-DataOutputStream out = new DataOutputStream(nfs.create(dst));
+NFSOutputStream out = nfs.createRaw(dst, true);
 byte buf[] = new 
byte[NutchConf.get().getInt(io.file.buffer.size, 4096)];
 try {
 int readBytes = in.read(buf);

Modified: 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/LocalFileSystem.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/LocalFileSystem.java?rev=320835&r1=320834&r2=320835&view=diff

svn commit: r320893 - /lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/Seekable.java

2005-10-13 Thread cutting
Author: cutting
Date: Thu Oct 13 12:42:21 2005
New Revision: 320893

URL: http://svn.apache.org/viewcvs?rev=320893&view=rev
Log:
Add new file.

Added:
lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/Seekable.java

Added: lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/Seekable.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/Seekable.java?rev=320893&view=auto
==
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/Seekable.java 
(added)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/Seekable.java Thu 
Oct 13 12:42:21 2005
@@ -0,0 +1,28 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the License);
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.fs;
+
+import java.io.*;
+
+/* Stream which permits seeking. */
+public interface Seekable {
+  /**
+   * Seek to the given offset from the start of the file.
+   * The next read() will be from that location.  Can't
+   * seek past the end of the file.
+   */
+  void seek(long pos) throws IOException;
+}




svn commit: r320899 - /lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/DeleteDuplicates.java

2005-10-13 Thread cutting
Author: cutting
Date: Thu Oct 13 12:57:03 2005
New Revision: 320899

URL: http://svn.apache.org/viewcvs?rev=320899&view=rev
Log:
Fix progress reporting for dedup.

Modified:

lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/DeleteDuplicates.java

Modified: 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/DeleteDuplicates.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/DeleteDuplicates.java?rev=320899&r1=320898&r2=320899&view=diff
==
--- 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/DeleteDuplicates.java
 (original)
+++ 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/DeleteDuplicates.java
 Thu Oct 13 12:57:03 2005
@@ -127,7 +127,7 @@
   }
 
   public static class InputFormat extends InputFormatBase {
-private static final int INDEX_LENGTH = Integer.MAX_VALUE;
+private static final long INDEX_LENGTH = Integer.MAX_VALUE;
 
 /** Return each index as a split. */
 public FileSplit[] getSplits(NutchFileSystem fs, JobConf job,




svn commit: r320931 - /lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/NutchFileSystem.java

2005-10-13 Thread cutting
Author: cutting
Date: Thu Oct 13 14:43:23 2005
New Revision: 320931

URL: http://svn.apache.org/viewcvs?rev=320931&view=rev
Log:
Fix a NullPointerException.

Modified:

lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/NutchFileSystem.java

Modified: 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/NutchFileSystem.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/NutchFileSystem.java?rev=320931&r1=320930&r2=320931&view=diff
==
--- 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/NutchFileSystem.java 
(original)
+++ 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/NutchFileSystem.java 
Thu Oct 13 14:43:23 2005
@@ -268,10 +268,12 @@
 public File[] listFiles(File f, FileFilter filter) throws IOException {
 Vector results = new Vector();
 File listing[] = listFilesRaw(f);
-for (int i = 0; i  listing.length; i++) {
+if (listing != null) {
+  for (int i = 0; i  listing.length; i++) {
 if (filter.accept(listing[i])) {
-results.add(listing[i]);
+  results.add(listing[i]);
 }
+  }
 }
 return (File[]) results.toArray(new File[results.size()]);
 }




svn commit: r314958 - in /lucene/nutch/trunk/site: about.html bot.html credits.html i18n.html index.html index.pdf issue_tracking.html linkmap.html mailing_lists.html tutorial.html version_control.htm

2005-10-12 Thread cutting
Author: cutting
Date: Wed Oct 12 09:31:33 2005
New Revision: 314958

URL: http://svn.apache.org/viewcvs?rev=314958&view=rev
Log:
Use mirrors for downloads.

Modified:
lucene/nutch/trunk/site/about.html
lucene/nutch/trunk/site/bot.html
lucene/nutch/trunk/site/credits.html
lucene/nutch/trunk/site/i18n.html
lucene/nutch/trunk/site/index.html
lucene/nutch/trunk/site/index.pdf
lucene/nutch/trunk/site/issue_tracking.html
lucene/nutch/trunk/site/linkmap.html
lucene/nutch/trunk/site/mailing_lists.html
lucene/nutch/trunk/site/tutorial.html
lucene/nutch/trunk/site/version_control.html

Modified: lucene/nutch/trunk/site/about.html
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/site/about.html?rev=314958&r1=314957&r2=314958&view=diff
==
--- lucene/nutch/trunk/site/about.html (original)
+++ lucene/nutch/trunk/site/about.html Wed Oct 12 09:31:33 2005
@@ -138,7 +138,7 @@
 div onclick=SwitchMenu('menu_1.3', 'skin/') id=menu_1.3Title 
class=menutitleResources/div
 div id=menu_1.3 class=menuitemgroup
 div class=menuitem
-a title= href=release/Download/a
+a title= 
href=http://www.apache.org/dyn/closer.cgi/lucene/nutch/;Download/a
 /div
 div class=menuitem
 a title= href=mailing_lists.htmlMailing Lists/a

Modified: lucene/nutch/trunk/site/bot.html
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/site/bot.html?rev=314958&r1=314957&r2=314958&view=diff
==
--- lucene/nutch/trunk/site/bot.html (original)
+++ lucene/nutch/trunk/site/bot.html Wed Oct 12 09:31:33 2005
@@ -138,7 +138,7 @@
 div onclick=SwitchMenu('menu_1.3', 'skin/') id=menu_1.3Title 
class=menutitleResources/div
 div id=menu_1.3 class=menuitemgroup
 div class=menuitem
-a title= href=release/Download/a
+a title= 
href=http://www.apache.org/dyn/closer.cgi/lucene/nutch/;Download/a
 /div
 div class=menuitem
 a title= href=mailing_lists.htmlMailing Lists/a

Modified: lucene/nutch/trunk/site/credits.html
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/site/credits.html?rev=314958&r1=314957&r2=314958&view=diff
==
--- lucene/nutch/trunk/site/credits.html (original)
+++ lucene/nutch/trunk/site/credits.html Wed Oct 12 09:31:33 2005
@@ -138,7 +138,7 @@
 div onclick=SwitchMenu('menu_1.3', 'skin/') id=menu_1.3Title 
class=menutitleResources/div
 div id=menu_1.3 class=menuitemgroup
 div class=menuitem
-a title= href=release/Download/a
+a title= 
href=http://www.apache.org/dyn/closer.cgi/lucene/nutch/;Download/a
 /div
 div class=menuitem
 a title= href=mailing_lists.htmlMailing Lists/a

Modified: lucene/nutch/trunk/site/i18n.html
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/site/i18n.html?rev=314958&r1=314957&r2=314958&view=diff
==
--- lucene/nutch/trunk/site/i18n.html (original)
+++ lucene/nutch/trunk/site/i18n.html Wed Oct 12 09:31:33 2005
@@ -138,7 +138,7 @@
 div onclick=SwitchMenu('menu_1.3', 'skin/') id=menu_1.3Title 
class=menutitleResources/div
 div id=menu_1.3 class=menuitemgroup
 div class=menuitem
-a title= href=release/Download/a
+a title= 
href=http://www.apache.org/dyn/closer.cgi/lucene/nutch/;Download/a
 /div
 div class=menuitem
 a title= href=mailing_lists.htmlMailing Lists/a

Modified: lucene/nutch/trunk/site/index.html
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/site/index.html?rev=314958&r1=314957&r2=314958&view=diff
==
--- lucene/nutch/trunk/site/index.html (original)
+++ lucene/nutch/trunk/site/index.html Wed Oct 12 09:31:33 2005
@@ -138,7 +138,7 @@
 div onclick=SwitchMenu('menu_1.3', 'skin/') id=menu_1.3Title 
class=menutitleResources/div
 div id=menu_1.3 class=menuitemgroup
 div class=menuitem
-a title= href=release/Download/a
+a title= 
href=http://www.apache.org/dyn/closer.cgi/lucene/nutch/;Download/a
 /div
 div class=menuitem
 a title= href=mailing_lists.htmlMailing Lists/a
@@ -184,9 +184,6 @@
 a href=#NewsNews/a
 ul class=minitoc
 li
-a href=#1+October+2005%3A+Nutch+0.7.1+Released1 October 2005: Nutch 0.7.1 
Released/a
-/li
-li
 a href=#17+August+2005%3A+Nutch+0.7+Released17 August 2005: Nutch 0.7 
Released/a
 /li
 li
@@ -210,23 +207,17 @@
 a name=N1000C/aa name=News/a
 h2 class=h3News/h2
 div class=section
-a name=N10012/aa name=1+October+2005%3A+Nutch+0.7.1+Released/a
-h3 class=h41 October 2005: Nutch 0.7.1 Released/h3
-pThe 0.7.1 release of Nutch is now available. This is a bug fix release. See
-  a 
href=http://svn.apache.org/viewcvs.cgi/lucene/nutch/branches/branch-0.7/CHANGES.txt?rev=292986;
-  CHANGES.txt/a for details. The release is available
-  a href=http://lucene.apache.org/nutch/release/;here/a./p
-a name=N10024/aa name=17+August+2005%3A+Nutch+0.7+Released/a
+a name=N10012/aa name=17

svn commit: r314991 - /lucene/nutch/nightly/nightly.sh

2005-10-12 Thread cutting
Author: cutting
Date: Wed Oct 12 11:33:47 2005
New Revision: 314991

URL: http://svn.apache.org/viewcvs?rev=314991&view=rev
Log:
Put nightly releases on cvs.apache.org, not www, per Apache policy.

Modified:
lucene/nutch/nightly/nightly.sh

Modified: lucene/nutch/nightly/nightly.sh
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/nightly/nightly.sh?rev=314991&r1=314990&r2=314991&view=diff
==
--- lucene/nutch/nightly/nightly.sh (original)
+++ lucene/nutch/nightly/nightly.sh Wed Oct 12 11:33:47 2005
@@ -5,7 +5,7 @@
 TRUNK=http://svn.apache.org/repos/asf/lucene/nutch/trunk
 
 REL_SERVER=people.apache.org
-REL_DIR=/www/www.apache.org/dist/lucene/nutch/nightly
+REL_DIR=/www/cvs.apache.org/dist/lucene/nutch/nightly
 
 # create an empty build directory
 rm -rf /tmp/nutch-nightly




svn commit: r312693 - /lucene/nutch/branches/mapred/src/java/org/apache/nutch/io/MapFile.java

2005-10-10 Thread cutting
Author: cutting
Date: Mon Oct 10 10:40:21 2005
New Revision: 312693

URL: http://svn.apache.org/viewcvs?rev=312693&view=rev
Log:
Fix to permit non one-to-one mappings in index.

Modified:
lucene/nutch/branches/mapred/src/java/org/apache/nutch/io/MapFile.java

Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/io/MapFile.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/io/MapFile.java?rev=312693&r1=312692&r2=312693&view=diff
==
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/io/MapFile.java 
(original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/io/MapFile.java Mon 
Oct 10 10:40:21 2005
@@ -234,7 +234,7 @@
 break;
 
   // check order to make sure comparator is compatible
-  if (lastKey != null && comparator.compare(lastKey, k) >= 0)
+  if (lastKey != null && comparator.compare(lastKey, k) > 0)
 throw new IOException("key out of order: "+k+" after "+lastKey);
   lastKey = k;
   




svn commit: r307445 - /lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/NDFSFileSystem.java

2005-10-09 Thread cutting
Author: cutting
Date: Sun Oct  9 08:15:34 2005
New Revision: 307445

URL: http://svn.apache.org/viewcvs?rev=307445view=rev
Log:
Overwrite should be default now.  Use super's implementation.

Modified:

lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/NDFSFileSystem.java

Modified: 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/NDFSFileSystem.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/NDFSFileSystem.java?rev=307445r1=307444r2=307445view=diff
==
--- 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/NDFSFileSystem.java 
(original)
+++ 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/NDFSFileSystem.java 
Sun Oct  9 08:15:34 2005
@@ -64,14 +64,6 @@
   return ndfs.open(getPath(f));
 }
 
-/**
- * Create the file at f.
- */
-public NFSOutputStream create(File f) throws IOException {
-return create(f, false);
-}
-/**
- */
 public NFSOutputStream create(File f, boolean overwrite) throws 
IOException {
   return ndfs.create(getPath(f), overwrite);
 }




svn commit: r307203 - in /lucene/nutch/branches/mapred: bin/nutch src/java/org/apache/nutch/crawl/Crawl.java src/java/org/apache/nutch/crawl/DeleteDuplicates.java src/java/org/apache/nutch/indexer/NdfsDirectory.java

2005-10-07 Thread cutting
Author: cutting
Date: Fri Oct  7 15:16:27 2005
New Revision: 307203

URL: http://svn.apache.org/viewcvs?rev=307203view=rev
Log:
First working version of MapReduce-based dedup.

Added:

lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/DeleteDuplicates.java
Modified:
lucene/nutch/branches/mapred/bin/nutch
lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Crawl.java

lucene/nutch/branches/mapred/src/java/org/apache/nutch/indexer/NdfsDirectory.java

Modified: lucene/nutch/branches/mapred/bin/nutch
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/bin/nutch?rev=307203r1=307202r2=307203view=diff
==
--- lucene/nutch/branches/mapred/bin/nutch (original)
+++ lucene/nutch/branches/mapred/bin/nutch Fri Oct  7 15:16:27 2005
@@ -138,6 +138,8 @@
   CLASS=org.apache.nutch.crawl.LinkDb
 elif [ $COMMAND = index ] ; then
   CLASS=org.apache.nutch.crawl.Indexer
+elif [ $COMMAND = dedup ] ; then
+  CLASS=org.apache.nutch.crawl.DeleteDuplicates
 elif [ $COMMAND = merge ] ; then
   CLASS=org.apache.nutch.indexer.IndexMerger
 elif [ $COMMAND = server ] ; then

Modified: 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Crawl.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Crawl.java?rev=307203r1=307202r2=307203view=diff
==
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Crawl.java 
(original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Crawl.java Fri 
Oct  7 15:16:27 2005
@@ -109,8 +109,9 @@
   
 new LinkDb(conf).invert(linkDb, segments); // invert links
 
-// index
+// index & dedup
 new Indexer(conf).index(index, linkDb, fs.listFiles(segments));
+new DeleteDuplicates(conf).dedup(new File[] { index });
 
 LOG.info(crawl finished:  + dir);
   }

Added: 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/DeleteDuplicates.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/DeleteDuplicates.java?rev=307203view=auto
==
--- 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/DeleteDuplicates.java
 (added)
+++ 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/DeleteDuplicates.java
 Fri Oct  7 15:16:27 2005
@@ -0,0 +1,338 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.crawl;
+
+import java.io.*;
+import java.security.*;
+import java.text.*;
+import java.util.*;
+import java.util.logging.*;
+
+import org.apache.nutch.io.*;
+import org.apache.nutch.fs.*;
+import org.apache.nutch.util.*;
+import org.apache.nutch.mapred.*;
+import org.apache.nutch.indexer.*;
+
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.document.Document;
+
+/**
+ * Deletes duplicate documents in a set of Lucene indexes.
+ * Duplicates have either the same contents (via MD5 hash) or the same URL.
+ **/
+public class DeleteDuplicates extends NutchConfigured
+  implements Mapper, OutputFormat {
+  private static final Logger LOG =
+LogFormatter.getLogger(org.apache.nutch.crawl.DeleteDuplicates);
+
+//   Algorithm:
+//  
+//   1. map indexes - md5, score, urlLen, index,doc
+//  partition by md5
+//  reduce, deleting all but largest score w/ shortest url
+//
+//   2. map indexes - url, fetchdate, index,doc
+//  partition by url
+//  reduce, deleting all but most recent.
+//
+//   Part 2 is not yet implemented, but the Indexer currently only indexes one
+//   URL per page, so this is not a critical problem.
+
+  public static class IndexDoc implements WritableComparable {
+private UTF8 index;   // the segment index
+private int doc;  // within the index
+
+public void write(DataOutput out) throws IOException {
+  index.write(out);
+  out.writeInt(doc);
+}
+
+public void readFields(DataInput in) throws IOException {
+  if (index == null) {
+index = new UTF8();
+  }
+  index.readFields(in);
+  this.doc = in.readInt

svn commit: r306808 - /lucene/nutch/trunk/conf/parse-plugins.xml

2005-10-06 Thread cutting
Author: cutting
Date: Thu Oct  6 10:02:03 2005
New Revision: 306808

URL: http://svn.apache.org/viewcvs?rev=306808view=rev
Log:
Add parse-ext content-types so that unit tests pass.

Modified:
lucene/nutch/trunk/conf/parse-plugins.xml

Modified: lucene/nutch/trunk/conf/parse-plugins.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/conf/parse-plugins.xml?rev=306808r1=306807r2=306808view=diff
==
--- lucene/nutch/trunk/conf/parse-plugins.xml (original)
+++ lucene/nutch/trunk/conf/parse-plugins.xml Thu Oct  6 10:02:03 2005
@@ -204,4 +204,14 @@
plugin id=parse-text /
/mimeType
 
+   !-- Types for parse-ext plugin: required for unit tests to pass. --
+
+   mimeType name=application/vnd.nutch.example.cat
+   plugin id=parse-ext /
+   /mimeType
+
+   mimeType name=application/vnd.nutch.example.md5sum
+   plugin id=parse-ext /
+   /mimeType
+
 /parse-plugins




svn commit: r306812 - /lucene/nutch/nightly/nightly.properties

2005-10-06 Thread cutting
Author: cutting
Date: Thu Oct  6 10:18:01 2005
New Revision: 306812

URL: http://svn.apache.org/viewcvs?rev=306812view=rev
Log:
Update mailhost, since I moved and have a different ISP at home.

Modified:
lucene/nutch/nightly/nightly.properties

Modified: lucene/nutch/nightly/nightly.properties
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/nightly/nightly.properties?rev=306812r1=306811r2=306812view=diff
==
--- lucene/nutch/nightly/nightly.properties (original)
+++ lucene/nutch/nightly/nightly.properties Thu Oct  6 10:18:01 2005
@@ -1,4 +1,4 @@
-MailLogger.mailhost = smtp.comcast.net
+MailLogger.mailhost = smtp.sbcglobal.net
 MailLogger.from = nutch-dev@incubator.apache.org
 MailLogger.failure.to = nutch-dev@incubator.apache.org
 MailLogger.failure.subject = Nutch nightly build failure




svn commit: r306813 - /lucene/nutch/nightly/nightly.sh

2005-10-06 Thread cutting
Author: cutting
Date: Thu Oct  6 10:18:46 2005
New Revision: 306813

URL: http://svn.apache.org/viewcvs?rev=306813view=rev
Log:
Use /tmp/nutch-nightly instead of /tmp/nutch to avoid conflicts with mapred.

Modified:
lucene/nutch/nightly/nightly.sh

Modified: lucene/nutch/nightly/nightly.sh
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/nightly/nightly.sh?rev=306813r1=306812r2=306813view=diff
==
--- lucene/nutch/nightly/nightly.sh (original)
+++ lucene/nutch/nightly/nightly.sh Thu Oct  6 10:18:46 2005
@@ -8,14 +8,14 @@
 REL_DIR=/www/lucene.apache.org/nutch/release/nightly
 
 # create an empty build directory
-rm -rf /tmp/nutch
+rm -rf /tmp/nutch-nightly
 cd /tmp
 
 # export sources into it
-svn export $TRUNK nutch
+svn export $TRUNK nutch-nightly
 
 # run build
-cd nutch
+cd nutch-nightly
 $HOME/local/ant/bin/ant \
  -propertyfile $HOME/src/nutch/nightly/nightly.properties \
  -logger org.apache.tools.ant.listener.MailLogger \




svn commit: r294928 - in /lucene/nutch/branches/mapred: site/tutorial.html site/tutorial.pdf src/site/src/documentation/content/xdocs/tutorial.xml

2005-10-04 Thread cutting
Author: cutting
Date: Tue Oct  4 14:58:53 2005
New Revision: 294928

URL: http://svn.apache.org/viewcvs?rev=294928view=rev
Log:
Update tutorial for mapred changes.  Still does not describe mapred or NDFS 
configuration.

Modified:
lucene/nutch/branches/mapred/site/tutorial.html
lucene/nutch/branches/mapred/site/tutorial.pdf

lucene/nutch/branches/mapred/src/site/src/documentation/content/xdocs/tutorial.xml

Modified: lucene/nutch/branches/mapred/site/tutorial.html
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/site/tutorial.html?rev=294928r1=294927r2=294928view=diff
==
--- lucene/nutch/branches/mapred/site/tutorial.html (original)
+++ lucene/nutch/branches/mapred/site/tutorial.html Tue Oct  4 14:58:53 2005
@@ -276,11 +276,11 @@
 ol
 
 
-liCreate a flat file of root urls.  For example, to crawl the
-span class=codefragnutch/span site you might start with a file named
-span class=codefragurls/span containing just the Nutch home page.  All 
other
-Nutch pages should be reachable from this page.  The span 
class=codefragurls/span
-file would thus look like:
+liCreate a directory with a flat file of root urls.  For example, to
+crawl the span class=codefragnutch/span site you might start with a file 
named
+span class=codefragurls/nutch/span containing the url of just the Nutch 
home
+page.  All other Nutch pages should be reachable from this page.  The
+span class=codefragurls/nutch/span file would thus contain:
 pre class=code
 http://lucene.apache.org/nutch/
 /pre
@@ -310,138 +310,152 @@
 span class=codefrag-dir/span emdir/em names the directory to put the 
crawl in./li
 
 li
-span class=codefrag-depth/span emdepth/em indicates the link depth 
from the root
-page that should be crawled./li
+span class=codefrag-threads/span emthreads/em determines the number 
of
+threads that will fetch in parallel./li
 
 li
-span class=codefrag-delay/span emdelay/em determines the number of 
seconds
-between accesses to each host./li
+span class=codefrag-depth/span emdepth/em indicates the link depth 
from the root
+page that should be crawled./li
 
 li
-span class=codefrag-threads/span emthreads/em determines the number 
of
-threads that will fetch in parallel./li
+span class=codefrag-topN/span emN/em determines the maximum number 
of pages that
+will be retrieved at each level up to the depth./li
 
 /ul
 pFor example, a typical call might be:/p
 pre class=code
-bin/nutch crawl urls -dir crawl.test -depth 3 gt;amp; crawl.log
+bin/nutch crawl urls -dir crawl -depth 3 -topN 50
 /pre
-pTypically one starts testing one's configuration by crawling at low
-depths, and watching the output to check that desired pages are found.
-Once one is more confident of the configuration, then an appropriate
-depth for a full crawl is around 10./p
+pTypically one starts testing one's configuration by crawling at
+shallow depths, sharply limiting the number of pages fetched at each
+level (span class=codefrag-topN/span), and watching the output to check 
that
+desired pages are fetched and undesirable pages are not.  Once one is
+confident of the configuration, then an appropriate depth for a full
+crawl is around 10.  The number of pages per level
+(span class=codefrag-topN/span) for a full crawl can be from tens of 
thousands to
+millions, depending on your resources./p
 pOnce crawling has completed, one can skip to the Searching section
 below./p
 /div
 
 
-a name=N100E4/aa name=Whole-web+Crawling/a
+a name=N100EA/aa name=Whole-web+Crawling/a
 h2 class=h3Whole-web Crawling/h2
 div class=section
 pWhole-web crawling is designed to handle very large crawls which may
 take weeks to complete, running on multiple machines./p
-a name=N100ED/aa name=Whole-web%3A+Concepts/a
+a name=N100F3/aa name=Whole-web%3A+Concepts/a
 h3 class=h4Whole-web: Concepts/h3
-pNutch data is of two types:/p
+pNutch data is composed of:/p
 ol
+
   
-liThe web database.  This contains information about every
-page known to Nutch, and about links between those pages./li
+liThe crawl database, or emcrawldb/em.  This contains
+information about every url known to Nutch, including whether it was
+fetched, and, if so, when./li
+
   
-liA set of segments.  Each segment is a set of pages that are
-fetched and indexed as a unit.  Segment data consists of the
-following types:/li
+liThe link database, or emlinkdb/em.  This contains the list
+of known links to each url, including both the source url and anchor
+text of the link./li
+
+  
+liA set of emsegments/em.  Each segment is a set of urls that are
+fetched as a unit.  Segments are directories with the following
+subdirectories:/li
+
   
 li
 ul
 
-lia emfetchlist/em is a file
-that names a set of pages to be fetched/li
+lia emcrawl_generate/em names a set of urls to be fetched/li
+
+lia emcrawl_fetch/em contains the status of fetching each url/li
+
+lia emcontent/em contains the content of each url/li

svn commit: r293404 - /lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/TaskTracker.java

2005-10-03 Thread cutting
Author: cutting
Date: Mon Oct  3 10:33:32 2005
New Revision: 293404

URL: http://svn.apache.org/viewcvs?rev=293404view=rev
Log:
Remove redundant call to done(), observed by Stefan.

Modified:

lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/TaskTracker.java

Modified: 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/TaskTracker.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/TaskTracker.java?rev=293404r1=293403r2=293404view=diff
==
--- 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/TaskTracker.java 
(original)
+++ 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/TaskTracker.java 
Mon Oct  3 10:33:32 2005
@@ -578,7 +578,6 @@
 
   try {
   task.run(job, umbilical);   // run the task
-  umbilical.done(taskid);
   } catch (Throwable throwable) {
   LOG.log(Level.WARNING, Failed to spawn child, throwable);
   // Report back any failures, for diagnostic purposes




svn commit: r292509 - /lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java

2005-09-29 Thread cutting
Author: cutting
Date: Thu Sep 29 11:57:35 2005
New Revision: 292509

URL: http://svn.apache.org/viewcvs?rev=292509view=rev
Log:
Use a more reasonable value when timing out hung fetcher threads.

Modified:
lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java

Modified: 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java?rev=292509r1=292508r2=292509view=diff
==
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java 
(original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java 
Thu Sep 29 11:57:35 2005
@@ -290,7 +290,8 @@
   // some threads seem to hang, despite all intentions
   if (done) { // last entry read
 long doneTime = System.currentTimeMillis();
-long timeout = getConf().getLong(http.timeout, 1) * 10;
+long timeout = // select timeout that avoids a task timeout
+  NutchConf.get().getInt(mapred.task.timeout, 10*60*1000)/2;
 while (activeThreads  0
 System.currentTimeMillis()-doneTime  timeout) {
   try {




svn commit: r292532 - /lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/MRConstants.java

2005-09-29 Thread cutting
Author: cutting
Date: Thu Sep 29 13:30:11 2005
New Revision: 292532

URL: http://svn.apache.org/viewcvs?rev=292532view=rev
Log:
Increase timeout, as launching large jobs can sometimes cause the jobtracker to 
not see heartbeats for a bit.

Modified:

lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/MRConstants.java

Modified: 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/MRConstants.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/MRConstants.java?rev=292532r1=292531r2=292532view=diff
==
--- 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/MRConstants.java 
(original)
+++ 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/MRConstants.java 
Thu Sep 29 13:30:11 2005
@@ -25,7 +25,7 @@
 // Timeouts, constants
 //
 public static final long HEARTBEAT_INTERVAL = 3 * 1000;
-public static final long TASKTRACKER_EXPIRY_INTERVAL = 30 * 1000;
+public static final long TASKTRACKER_EXPIRY_INTERVAL = 10 * 60 * 1000;
 
 //
 // Result codes




svn commit: r292539 - in /lucene/nutch/branches/mapred/src/java/org/apache/nutch: fs/LocalFileSystem.java fs/NutchFileSystem.java ndfs/NDFSClient.java

2005-09-29 Thread cutting
Author: cutting
Date: Thu Sep 29 13:43:53 2005
New Revision: 292539

URL: http://svn.apache.org/viewcvs?rev=292539view=rev
Log:
Change so that default is to overwrite existing files, as this is normal under 
MapReduce, when tasks may be re-executed.

Modified:

lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/LocalFileSystem.java

lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/NutchFileSystem.java
lucene/nutch/branches/mapred/src/java/org/apache/nutch/ndfs/NDFSClient.java

Modified: 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/LocalFileSystem.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/LocalFileSystem.java?rev=292539r1=292538r2=292539view=diff
==
--- 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/LocalFileSystem.java 
(original)
+++ 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/LocalFileSystem.java 
Thu Sep 29 13:43:53 2005
@@ -95,13 +95,6 @@
 return new LocalNFSFileInputStream(f);
 }
 
-/**
- * Create the file at f.
- */
-public NFSOutputStream create(File f) throws IOException {
-return create(f, false);
-}
-
 /*
  * For create()'s NFSOutputStream.
  */
@@ -128,8 +121,6 @@
   public void write(int b) throws IOException { fos.write(b); }
 }
 
-/**
- */
 public NFSOutputStream create(File f, boolean overwrite) throws 
IOException {
 if (f.exists() && ! overwrite) {
 throw new IOException(File already exists:+f);

Modified: 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/NutchFileSystem.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/NutchFileSystem.java?rev=292539r1=292538r2=292539view=diff
==
--- 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/NutchFileSystem.java 
(original)
+++ 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/NutchFileSystem.java 
Thu Sep 29 13:43:53 2005
@@ -122,10 +122,18 @@
 public abstract NFSInputStream open(File f) throws IOException;
 
 /**
- * Opens an OutputStream at the indicated File, whether local
- * or via NDFS.
+ * Opens an OutputStream at the indicated File.
+ * Files are overwritten by default.
+ */
+public NFSOutputStream create(File f) throws IOException {
+return create(f, true);
+}
+
+/** Opens an OutputStream at the indicated File.
+ * @param f the file name to open
+ * @param overwrite if a file with this name already exists, then if true,
+ *   the file will be overwritten, and if false an error will be thrown.
  */
-public abstract NFSOutputStream create(File f) throws IOException;
 public abstract NFSOutputStream create(File f, boolean overwrite) throws 
IOException;
 
 /**

Modified: 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/ndfs/NDFSClient.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/ndfs/NDFSClient.java?rev=292539r1=292538r2=292539view=diff
==
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/ndfs/NDFSClient.java 
(original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/ndfs/NDFSClient.java 
Thu Sep 29 13:43:53 2005
@@ -71,14 +71,6 @@
 return new NDFSInputStream(src.toString());
 }
 
-/**
- * Create an output stream that writes to all the right places.
- * Basically creates instance of inner subclass of OutputStream
- * that handles datanode/namenode negotiation.
- */
-public NFSOutputStream create(UTF8 src) throws IOException {
-return create(src, false);
-}
 public NFSOutputStream create(UTF8 src, boolean overwrite) throws 
IOException {
 return new NDFSOutputStream(src, overwrite);
 }




svn commit: r292556 - /lucene/nutch/branches/mapred/conf/nutch-default.xml

2005-09-29 Thread cutting
Author: cutting
Date: Thu Sep 29 14:27:49 2005
New Revision: 292556

URL: http://svn.apache.org/viewcvs?rev=292556view=rev
Log:
Document mapred.tasktracker.tasks.maximum and provide a default.

Modified:
lucene/nutch/branches/mapred/conf/nutch-default.xml

Modified: lucene/nutch/branches/mapred/conf/nutch-default.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/conf/nutch-default.xml?rev=292556r1=292555r2=292556view=diff
==
--- lucene/nutch/branches/mapred/conf/nutch-default.xml (original)
+++ lucene/nutch/branches/mapred/conf/nutch-default.xml Thu Sep 29 14:27:49 2005
@@ -419,13 +419,20 @@
   /description
 /property
 
-
 property
   namemapred.task.timeout/name
   value60/value
   descriptionThe number of milliseconds before a task will be
   terminated if it neither reads an input, writes an output, nor
   updates its status string.
+  /description
+/property
+
+property
+  namemapred.tasktracker.tasks.maximum/name
+  value2/value
+  descriptionThe maximum number of tasks that will be run
+  simultaneously by a task tracker.
   /description
 /property
 




svn commit: r290602 - /lucene/nutch/branches/mapred/src/java/org/apache/nutch/ndfs/DF.java

2005-09-20 Thread cutting
Author: cutting
Date: Tue Sep 20 19:38:56 2005
New Revision: 290602

URL: http://svn.apache.org/viewcvs?rev=290602view=rev
Log:
Fix NUTCH-93: long filesystem names can wrap to a new line and were
not parsed correctly.

Modified:
lucene/nutch/branches/mapred/src/java/org/apache/nutch/ndfs/DF.java

Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/ndfs/DF.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/ndfs/DF.java?rev=290602r1=290601r2=290602view=diff
==
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/ndfs/DF.java 
(original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/ndfs/DF.java Tue Sep 
20 19:38:56 2005
@@ -48,6 +48,9 @@
   new StringTokenizer(lines.readLine(),  \t\n\r\f%);
 
 this.filesystem = tokens.nextToken();
+if (!tokens.hasMoreTokens()) {// for long filesystem name
+  tokens = new StringTokenizer(lines.readLine(),  \t\n\r\f%);
+}
 this.capacity = Long.parseLong(tokens.nextToken()) * 1024;
 this.used = Long.parseLong(tokens.nextToken()) * 1024;
 this.available = Long.parseLong(tokens.nextToken()) * 1024;




svn commit: r290067 - in /lucene/nutch/branches/mapred/src/java/org/apache/nutch: mapred/InputFormatBase.java util/NutchConf.java

2005-09-19 Thread cutting
Author: cutting
Date: Sun Sep 18 23:08:19 2005
New Revision: 290067

URL: http://svn.apache.org/viewcvs?rev=290067view=rev
Log:
Improved error string & javadoc.  Contributed by Paul Baclace.

Modified:

lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/InputFormatBase.java
lucene/nutch/branches/mapred/src/java/org/apache/nutch/util/NutchConf.java

Modified: 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/InputFormatBase.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/InputFormatBase.java?rev=290067r1=290066r2=290067view=diff
==
--- 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/InputFormatBase.java
 (original)
+++ 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/InputFormatBase.java
 Sun Sep 18 23:08:19 2005
@@ -46,8 +46,17 @@
Reporter reporter)
 throws IOException;
 
-  /** Subclasses may override to, e.g., select only files matching a regular
-   * expression.*/ 
+  /** List input directories.
+   * Subclasses may override to, e.g., select only files matching a regular
+   * expression.
+   * Property mapred.input.subdir, if set, names a subdirectory that
+   * is appended to all input dirs specified by job, and if the given fs
+   * lists those too, each is added to the returned array of File.
+   * @param fs
+   * @param job
+   * @return array of File objects, never zero length.
+   * @throws IOException if zero items.
+   */
   protected File[] listFiles(NutchFileSystem fs, JobConf job)
 throws IOException {
 File[] dirs = job.getInputDirs();
@@ -73,7 +82,7 @@
 }
 
 if (result.size() == 0) {
-  throw new IOException(No input files in: +job.getInputDirs());
+  throw new IOException(No input directories specified in: +job);
 }
 return (File[])result.toArray(new File[result.size()]);
   }

Modified: 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/util/NutchConf.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/util/NutchConf.java?rev=290067r1=290066r2=290067view=diff
==
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/util/NutchConf.java 
(original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/util/NutchConf.java 
Sun Sep 18 23:08:19 2005
@@ -30,14 +30,17 @@
 import javax.xml.transform.stream.StreamResult;
 
 /** Provides access to Nutch configuration parameters.
- *
+ * pAn ordered list of configuration parameter files with
+ * default and always-overrides site parameters.
  * pDefault values for all parameters are specified in a file named
  * ttnutch-default.xml/tt located on the classpath.  Overrides for these
  * defaults should be in an optional file named ttnutch-site.xml/tt, also
  * located on the classpath.  Typically these files reside in the
  * ttconf//tt subdirectory at the top-level of a Nutch installation.
+ * pThe resource files are read upon first access of values (set, get,
+ * or write) after {@link #addConfResource(String)} or
+ * {@link #addConfResource(File)}.
  */
-
 public class NutchConf {
   private static final Logger LOG =
 LogFormatter.getLogger(org.apache.nutch.util.NutchConf);
@@ -57,7 +60,7 @@
 resourceNames.add(nutch-site.xml);
   }
 
-  /** A new configuration with the same settings as another. */
+  /** A new configuration with the same settings cloned from another. */
   public NutchConf(NutchConf other) {
 this.resourceNames = (ArrayList)other.resourceNames.clone();
 if (other.properties != null)
@@ -392,6 +395,25 @@
 } catch (Exception e) {
   throw new RuntimeException(e);
 }
+  }
+
+
+  public String toString() {
+StringBuffer sb = new StringBuffer(resourceNames.size()*30);
+sb.append(NutchConf: );
+ListIterator i = resourceNames.listIterator();
+while (i.hasNext()) {
+  if (i.nextIndex() != 0) {
+sb.append( , );
+  }
+  Object obj = i.next();
+  if (obj instanceof File) {
+sb.append((File)obj);
+  } else {
+sb.append((String)obj);
+  }
+}
+return sb.toString();
   }
 
   /** For debugging.  List non-default properties to the terminal and exit. */




svn commit: r289281 - in /lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred: LocalJobRunner.java MapTask.java ReduceTask.java Task.java

2005-09-15 Thread cutting
Author: cutting
Date: Thu Sep 15 10:12:36 2005
New Revision: 289281

URL: http://svn.apache.org/viewcvs?rev=289281view=rev
Log:
Improve status reports: Always send final status when done; Have LocalJobRunner 
log status.

Modified:

lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/LocalJobRunner.java
lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/MapTask.java

lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/ReduceTask.java
lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/Task.java

Modified: 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/LocalJobRunner.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/LocalJobRunner.java?rev=289281r1=289280r2=289281view=diff
==
--- 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/LocalJobRunner.java
 (original)
+++ 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/LocalJobRunner.java
 Thu Sep 15 10:12:36 2005
@@ -18,12 +18,16 @@
 
 import java.io.*;
 import java.util.*;
+import java.util.logging.*;
 
 import org.apache.nutch.io.*;
 import org.apache.nutch.fs.*;
+import org.apache.nutch.util.*;
 
 /** Implements MapReduce locally, in-process, for debugging. */ 
 public class LocalJobRunner implements JobSubmissionProtocol {
+  public static final Logger LOG =
+LogFormatter.getLogger(org.apache.nutch.mapred.LocalJobRunner);
 
   private NutchFileSystem fs;
   private HashMap jobs = new HashMap();
@@ -102,6 +106,7 @@
 public Task getTask(String taskid) { return null; }
 
 public void progress(String taskId, float progress, String state) {
+  LOG.info(state);
   float taskIndex = mapIds.indexOf(taskId);
   if (taskIndex >= 0) {   // mapping
 float numTasks = mapIds.size();

Modified: 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/MapTask.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/MapTask.java?rev=289281r1=289280r2=289281view=diff
==
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/MapTask.java 
(original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/MapTask.java 
Thu Sep 15 10:12:36 2005
@@ -129,7 +129,7 @@
 }
   }
 }
-umbilical.done(getTaskId());
+done(umbilical);
   }
   
 }

Modified: 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/ReduceTask.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/ReduceTask.java?rev=289281r1=289280r2=289281view=diff
==
--- 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/ReduceTask.java 
(original)
+++ 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/ReduceTask.java 
Thu Sep 15 10:12:36 2005
@@ -270,7 +270,7 @@
   out.close(reporter);
 }
 
-umbilical.done(getTaskId());
+done(umbilical);
   }
 
   /** Construct output file names so that, when an output directory listing is

Modified: 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/Task.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/Task.java?rev=289281r1=289280r2=289281view=diff
==
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/Task.java 
(original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/Task.java Thu 
Sep 15 10:12:36 2005
@@ -114,4 +114,10 @@
 }
   }
 
+  public void done(TaskUmbilicalProtocol umbilical)
+throws IOException {
+umbilical.progress(getTaskId(),   // send a final status report
+   taskProgress.get(), taskProgress.toString());
+umbilical.done(getTaskId());
+  }
 }




svn commit: r289282 - /lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java

2005-09-15 Thread cutting
Author: cutting
Date: Thu Sep 15 10:15:16 2005
New Revision: 289282

URL: http://svn.apache.org/viewcvs?rev=289282&view=rev
Log:
Finish even when some threads hung.  Improve status reports.

Modified:
lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java

Modified: 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java?rev=289282&r1=289281&r2=289282&view=diff
==
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java 
(original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java 
Thu Sep 15 10:15:16 2005
@@ -59,6 +59,7 @@
   private String segmentName;
   private int activeThreads;
   private int maxRedirect;
+  private boolean done;
 
   private long start = System.currentTimeMillis(); // start time of fetcher run
 
@@ -70,6 +71,10 @@
   private boolean parsing;
 
   private class FetcherThread extends Thread {
+public FetcherThread() {
+  this.setDaemon(true);   // don't hang JVM on exit
+}
+
 public void run() {
   synchronized (Fetcher.this) {activeThreads++;} // count threads
   
@@ -82,8 +87,10 @@
 break;// exit
   
   try {   // get next entry from input
-if (!input.next(key, datum))
+if (!input.next(key, datum)) {
+  done = true;
   break;  // at eof, exit
+}
   } catch (IOException e) {
 LOG.severe("fetcher caught:"+e.toString());
 break;
@@ -125,8 +132,9 @@
 }
 break;
 
-  case ProtocolStatus.RETRY:  // retry
   case ProtocolStatus.EXCEPTION:
+logError(url, status.getMessage());
+  case ProtocolStatus.RETRY:  // retry
 output(key, datum, null, CrawlDatum.STATUS_FETCH_RETRY);
 break;
 
@@ -152,7 +160,7 @@
 
 
   } catch (Throwable t) { // unexpected exception
-logError(url, t);
+logError(url, t.toString());
 output(key, datum, null, CrawlDatum.STATUS_FETCH_GONE);
 
   }
@@ -165,9 +173,8 @@
   }
 }
 
-private void logError(String url, Throwable t) {
-  LOG.info("fetch of " + url + " failed with: " + t);
-  LOG.log(Level.FINE, "stack", t);// stack trace
+private void logError(String url, String message) {
+  LOG.info("fetch of " + url + " failed with: " + message);
   synchronized (Fetcher.this) {   // record failure
 errors++;
   }
@@ -225,19 +232,14 @@
   private synchronized void updateStatus(int bytesInPage) throws IOException {
 pages++;
 bytes += bytesInPage;
+  }
 
-if ((pages % 100) == 0) { // show status every 100pp
-  long elapsed = (System.currentTimeMillis() - start)/1000;
-  String line1 =
-pages+ pages, +errors+ errors, +bytes+ bytes, +elapsed+ secs;
-  String line2 = 
-+ ((float)pages)/elapsed+ pages/s, 
-+ float)bytes)*8)/1024)/elapsed+ kb/s, 
-+ ((float)bytes)/pages+ bytes/page;
-  LOG.info( status: +line1);
-  LOG.info( status: +line2);
-  reporter.setStatus(line2);
-}
+  private synchronized void reportStatus() throws IOException {
+long elapsed = (System.currentTimeMillis() - start)/1000;
+reporter.setStatus
+  (pages+" pages, "+errors+" errors, "
+   + Math.round(((float)pages*10)/elapsed)/10.0+" pages/s, "
+   + Math.round(((((float)bytes)*8)/1024)/elapsed)+" kb/s, ");
   }
 
   public void configure(JobConf job) {
@@ -266,7 +268,7 @@
 this.input = input;
 this.output = output;
 this.reporter = reporter;
-   
+
 this.maxRedirect = getConf().getInt("http.redirect.max", 3);
 
 int threadCount = getConf().getInt("fetcher.threads.fetch", 10);
@@ -278,6 +280,24 @@
   try {
 Thread.sleep(1000);
   } catch (InterruptedException e) {}
+
+  reportStatus();
+
+  // some threads seem to hang, despite all intentions
+  if (done) { // last entry read
+long doneTime = System.currentTimeMillis();
+long timeout = getConf().getLong(http.timeout, 1) * 10;
+while (activeThreads > 0
+&& System.currentTimeMillis()-doneTime < timeout) {
+  try {
+Thread.sleep(1000);   // wait for completion
+  } catch (InterruptedException e) {}
+}
+if (activeThreads > 0) {  // abort after timeout
+  LOG.warning("Aborting with "+activeThreads+" hung threads.");
+  return

svn commit: r289286 - /lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java

2005-09-15 Thread cutting
Author: cutting
Date: Thu Sep 15 11:11:39 2005
New Revision: 289286

URL: http://svn.apache.org/viewcvs?rev=289286&view=rev
Log:
Don't synchronize while making setStatus() RPC.

Modified:
lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java

Modified: 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java?rev=289286&r1=289285&r2=289286&view=diff
==
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java 
(original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java 
Thu Sep 15 11:11:39 2005
@@ -234,12 +234,16 @@
 bytes += bytesInPage;
   }
 
-  private synchronized void reportStatus() throws IOException {
-long elapsed = (System.currentTimeMillis() - start)/1000;
-reporter.setStatus
-  (pages+" pages, "+errors+" errors, "
-   + Math.round(((float)pages*10)/elapsed)/10.0+" pages/s, "
-   + Math.round(((((float)bytes)*8)/1024)/elapsed)+" kb/s, ");
+  private void reportStatus() throws IOException {
+String status;
+synchronized (this) {
+  long elapsed = (System.currentTimeMillis() - start)/1000;
+  status = 
+pages+" pages, "+errors+" errors, "
++ Math.round(((float)pages*10)/elapsed)/10.0+" pages/s, "
++ Math.round(((((float)bytes)*8)/1024)/elapsed)+" kb/s, ";
+}
+reporter.setStatus(status);
   }
 
   public void configure(JobConf job) {




svn commit: r280911 - in /lucene/nutch/branches/mapred/bin: nutch-daemons.sh slaves.sh

2005-09-14 Thread cutting
Author: cutting
Date: Wed Sep 14 12:04:07 2005
New Revision: 280911

URL: http://svn.apache.org/viewcvs?rev=280911&view=rev
Log:
Change scripts to pass environment, so that shared home directory is not 
required.

Modified:
lucene/nutch/branches/mapred/bin/nutch-daemons.sh
lucene/nutch/branches/mapred/bin/slaves.sh

Modified: lucene/nutch/branches/mapred/bin/nutch-daemons.sh
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/bin/nutch-daemons.sh?rev=280911&r1=280910&r2=280911&view=diff
==
--- lucene/nutch/branches/mapred/bin/nutch-daemons.sh (original)
+++ lucene/nutch/branches/mapred/bin/nutch-daemons.sh Wed Sep 14 12:04:07 2005
@@ -13,4 +13,4 @@
 bin=`dirname $0`
 bin=`cd $bin; pwd`
 
-exec $bin/slaves.sh /bin/bash --login $bin/nutch-daemon.sh $@
+exec $bin/slaves.sh $bin/nutch-daemon.sh $@

Modified: lucene/nutch/branches/mapred/bin/slaves.sh
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/bin/slaves.sh?rev=280911&r1=280910&r2=280911&view=diff
==
--- lucene/nutch/branches/mapred/bin/slaves.sh (original)
+++ lucene/nutch/branches/mapred/bin/slaves.sh Wed Sep 14 12:04:07 2005
@@ -21,5 +21,5 @@
 
 for slave in `cat $NUTCH_SLAVES`; do
  echo $slave:
- ssh -o ConnectTimeout=1 $slave $@
+ ssh -o ConnectTimeout=1 -o SendEnv='NUTCH* JAVA*' $slave $@
 done




svn commit: r280912 - /lucene/nutch/branches/mapred/bin/stop-all.sh

2005-09-14 Thread cutting
Author: cutting
Date: Wed Sep 14 12:04:41 2005
New Revision: 280912

URL: http://svn.apache.org/viewcvs?rev=280912&view=rev
Log:
Stop jobtracker first, to stop tasks faster.

Modified:
lucene/nutch/branches/mapred/bin/stop-all.sh

Modified: lucene/nutch/branches/mapred/bin/stop-all.sh
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/bin/stop-all.sh?rev=280912&r1=280911&r2=280912&view=diff
==
--- lucene/nutch/branches/mapred/bin/stop-all.sh (original)
+++ lucene/nutch/branches/mapred/bin/stop-all.sh Wed Sep 14 12:04:41 2005
@@ -5,7 +5,7 @@
 bin=`dirname $0`
 bin=`cd $bin; pwd`
 
-$bin/nutch-daemons.sh stop tasktracker
 $bin/nutch-daemon.sh stop jobtracker
+$bin/nutch-daemons.sh stop tasktracker
 $bin/nutch-daemon.sh stop namenode
 $bin/nutch-daemons.sh stop datanode




svn commit: r280913 - /lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/ReduceTaskRunner.java

2005-09-14 Thread cutting
Author: cutting
Date: Wed Sep 14 12:05:08 2005
New Revision: 280913

URL: http://svn.apache.org/viewcvs?rev=280913&view=rev
Log:
Log the stack trace, so we can debug this one better.

Modified:

lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/ReduceTaskRunner.java

Modified: 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/ReduceTaskRunner.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/ReduceTaskRunner.java?rev=280913&r1=280912&r2=280913&view=diff
==
--- 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/ReduceTaskRunner.java
 (original)
+++ 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/ReduceTaskRunner.java
 Wed Sep 14 12:05:08 2005
@@ -99,8 +99,9 @@
   copyPhase.startNextPhase();
   
 } catch (IOException e) { // failed: try again later
-  LOG.warning("copy failed: "+loc.getMapTaskId()+" from "+addr);
-  
+  LOG.log(Level.WARNING,
+  "copy failed: "+loc.getMapTaskId()+" from "+addr,
+  e);
 } finally {
   MapOutputFile.setProgressReporter(null);
 }




svn commit: r280368 - /lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/TestClient.java

2005-09-12 Thread cutting
Author: cutting
Date: Mon Sep 12 10:03:00 2005
New Revision: 280368

URL: http://svn.apache.org/viewcvs?rev=280368&view=rev
Log:
Change so that -du and -ls commands work with zero arguments.

Modified:
lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/TestClient.java

Modified: 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/TestClient.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/TestClient.java?rev=280368&r1=280367&r2=280368&view=diff
==
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/TestClient.java 
(original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/TestClient.java 
Mon Sep 12 10:03:00 2005
@@ -239,17 +239,17 @@
 } else if ("-moveToLocal".equals(cmd)) {
 tc.moveToLocal(argv[i++], new File(argv[i++]));
 } else if ("-ls".equals(cmd)) {
-tc.ls(argv[i++]);
+String arg = i < argv.length ? argv[i++] : "";
+tc.ls(arg);
 } else if ("-mv".equals(cmd)) {
 tc.rename(argv[i++], argv[i++]);
 } else if ("-cp".equals(cmd)) {
 tc.copy(argv[i++], argv[i++]);
 } else if ("-rm".equals(cmd)) {
 tc.delete(argv[i++]);
-} else if ("-ls".equals(cmd)) {
-tc.ls(argv[i++]);
 } else if ("-du".equals(cmd)) {
-tc.du(argv[i++]);
+String arg = i < argv.length ? argv[i++] : "";
+tc.du(arg);
 } else if ("-mkdir".equals(cmd)) {
 tc.mkdir(argv[i++]);
 } else if ("-report".equals(cmd)) {




svn commit: r280370 - /lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/NDFSFileSystem.java

2005-09-12 Thread cutting
Author: cutting
Date: Mon Sep 12 10:04:33 2005
New Revision: 280370

URL: http://svn.apache.org/viewcvs?rev=280370&view=rev
Log:
Fix to correctly convert empty path to home directory rather than root.

Modified:

lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/NDFSFileSystem.java

Modified: 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/NDFSFileSystem.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/NDFSFileSystem.java?rev=280370&r1=280369&r2=280370&view=diff
==
--- 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/NDFSFileSystem.java 
(original)
+++ 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/NDFSFileSystem.java 
Mon Sep 12 10:04:33 2005
@@ -50,12 +50,11 @@
 public String getName() { return name; }
 
 private UTF8 getPath(File file) {
-  File f = file;
   String path = getNDFSPath(file);
   if (!path.startsWith(NDFSFile.NDFS_FILE_SEPARATOR)) {
-f = new File(HOME_DIR, path);
+path = getNDFSPath(new File(HOME_DIR, path)); // make absolute
   }
-  return new UTF8(getNDFSPath(f));
+  return new UTF8(path);
 }
 
 /**
@@ -305,17 +304,10 @@
 parent = parent.getParentFile();
   }
   StringBuffer path = new StringBuffer();
-  String fname = (String) l.get(l.size() - 1);
-  if (!"".equals(fname)) {
-path.append(fname); //handle not absolute paths
-  } else {
-if (l.size() == 1)
-  path.append(NDFSFile.NDFS_FILE_SEPARATOR); //handle root path
-  }
+  path.append(l.get(l.size() - 1));
   for (int i = l.size() - 2; i >= 0; i--) {
-fname = (String) l.get(i);
 path.append(NDFSFile.NDFS_FILE_SEPARATOR);
-path.append(fname);
+path.append(l.get(i));
   }
   return path.toString();
 }




svn commit: r279596 - /lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/JobTracker.java

2005-09-08 Thread cutting
Author: cutting
Date: Thu Sep  8 11:09:28 2005
New Revision: 279596

URL: http://svn.apache.org/viewcvs?rev=279596&view=rev
Log:
Fix so that input splitting errors don't leave job hung.

Modified:

lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/JobTracker.java

Modified: 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/JobTracker.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/JobTracker.java?rev=279596&r1=279595&r2=279596&view=diff
==
--- 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/JobTracker.java 
(original)
+++ 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/JobTracker.java 
Thu Sep  8 11:09:28 2005
@@ -521,7 +521,17 @@
 JobInProgress createJob(String jobFile) throws IOException {
 JobInProgress job = new JobInProgress(jobFile);
 jobs.put(job.getProfile().getJobId(), job);
-job.launch();
+
+boolean error = true;
+try {
+  job.launch();
+  error = false;
+} finally {
+  if (error) {
+job.kill();
+  }
+}
+
 return job;
 }
 




svn commit: r279397 - /lucene/nutch/branches/mapred/src/test/org/apache/nutch/fs/TestNutchFileSystem.java

2005-09-07 Thread cutting
Author: cutting
Date: Wed Sep  7 11:42:11 2005
New Revision: 279397

URL: http://svn.apache.org/viewcvs?rev=279397&view=rev
Log:
Add seek test.

Modified:

lucene/nutch/branches/mapred/src/test/org/apache/nutch/fs/TestNutchFileSystem.java

Modified: 
lucene/nutch/branches/mapred/src/test/org/apache/nutch/fs/TestNutchFileSystem.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/test/org/apache/nutch/fs/TestNutchFileSystem.java?rev=279397&r1=279396&r2=279397&view=diff
==
--- 
lucene/nutch/branches/mapred/src/test/org/apache/nutch/fs/TestNutchFileSystem.java
 (original)
+++ 
lucene/nutch/branches/mapred/src/test/org/apache/nutch/fs/TestNutchFileSystem.java
 Wed Sep  7 11:42:11 2005
@@ -31,6 +31,7 @@
   private static final Logger LOG = InputFormatBase.LOG;
 
   private static final long MEGA = 1024 * 1024;
+  private static final int SEEKS_PER_FILE = 4;
 
   private static String ROOT = System.getProperty("test.build.data","fs_test");
   private static File CONTROL_DIR = new File(ROOT, "fs_control");
@@ -253,11 +254,94 @@
   }
 
 
+  public static class SeekMapper extends NutchConfigured implements Mapper {
+private Random random = new Random();
+private byte[] check  = new byte[8192];
+private NutchFileSystem fs;
+
+{
+  try {
+fs = NutchFileSystem.get();
+  } catch (IOException e) {
+throw new RuntimeException(e);
+  }
+}
+
+public SeekMapper() { super(null); }
+
+public SeekMapper(NutchConf conf) { super(conf); }
+
+public void configure(JobConf job) {
+  setConf(job);
+}
+
+public void map(WritableComparable key, Writable value,
+OutputCollector collector, Reporter reporter)
+  throws IOException {
+  String name = ((UTF8)key).toString();
+  long size = ((LongWritable)value).get();
+  long seed = Long.parseLong(name);
+
+  reporter.setStatus("opening " + name);
+
+  NFSDataInputStream in =
+new NFSDataInputStream(fs.open(new File(DATA_DIR, name)));
+
+  try {
+for (int i = 0; i < SEEKS_PER_FILE; i++) {
+  // generate a random position
+  long position = Math.abs(random.nextLong()) % size;
+  
+  // advance random state to that position
+  random.setSeed(seed);
+  for (int p = 0; p <= position; p+= check.length) {
+reporter.setStatus("generating data for " + name);
+random.nextBytes(check);
+  }
+  
+  // seek file to that position
+  reporter.setStatus("seeking " + name);
+  in.seek(position);
+  
+  // check that byte matches
+  assertEquals(in.readByte(), check[(int)(position % check.length)]);
+  
+}
+  } finally {
+in.close();
+  }
+}
+  }
+
+  public static void seekTest(NutchFileSystem fs)
+throws Exception {
+
+fs.delete(READ_DIR);
+
+JobConf job = new JobConf(NutchConf.get());
+
+job.setInputDir(CONTROL_DIR);
+job.setInputFormat(SequenceFileInputFormat.class);
+job.setInputKeyClass(UTF8.class);
+job.setInputValueClass(LongWritable.class);
+
+job.setMapperClass(SeekMapper.class);
+job.setReducerClass(LongSumReducer.class);
+
+job.setOutputDir(READ_DIR);
+job.setOutputKeyClass(UTF8.class);
+job.setOutputValueClass(LongWritable.class);
+job.setNumReduceTasks(1);
+JobClient.runJob(job);
+  }
+
+
   public static void main(String[] args) throws Exception {
 int megaBytes = 10;
 int files = 100;
 boolean noRead = false;
 boolean noWrite = false;
+boolean noSeek = false;
 long seed = new Random().nextLong();
 
 String usage = "Usage: TestNutchFileSystem -files N -megaBytes M [-noread] 
[-nowrite]";
@@ -290,6 +374,9 @@
 }
 if (!noRead) {
   readTest(fs);
+}
+if (!noSeek) {
+  seekTest(fs);
 }
   }
 




svn commit: r279417 - /lucene/nutch/branches/mapred/src/test/org/apache/nutch/fs/TestNutchFileSystem.java

2005-09-07 Thread cutting
Author: cutting
Date: Wed Sep  7 13:34:00 2005
New Revision: 279417

URL: http://svn.apache.org/viewcvs?rev=279417&view=rev
Log:
Run seek test as unit test; add -noseek command line option.

Modified:

lucene/nutch/branches/mapred/src/test/org/apache/nutch/fs/TestNutchFileSystem.java

Modified: 
lucene/nutch/branches/mapred/src/test/org/apache/nutch/fs/TestNutchFileSystem.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/test/org/apache/nutch/fs/TestNutchFileSystem.java?rev=279417&r1=279416&r2=279417&view=diff
==
--- 
lucene/nutch/branches/mapred/src/test/org/apache/nutch/fs/TestNutchFileSystem.java
 (original)
+++ 
lucene/nutch/branches/mapred/src/test/org/apache/nutch/fs/TestNutchFileSystem.java
 Wed Sep  7 13:34:00 2005
@@ -56,6 +56,7 @@
 createControlFile(fs, megaBytes, numFiles, seed);
 writeTest(fs);
 readTest(fs);
+seekTest(fs);
   }
 
   public static void createControlFile(NutchFileSystem fs,
@@ -344,7 +345,7 @@
 boolean noSeek = false;
 long seed = new Random().nextLong();
 
-String usage = "Usage: TestNutchFileSystem -files N -megaBytes M [-noread] 
[-nowrite]";
+String usage = "Usage: TestNutchFileSystem -files N -megaBytes M [-noread] 
[-nowrite] [-noseek]";
 
 if (args.length == 0) {
 System.err.println(usage);
@@ -359,6 +360,8 @@
 noRead = true;
   } else if (args[i].equals("-nowrite")) {
 noWrite = true;
+  } else if (args[i].equals("-noseek")) {
+noSeek = true;
   }
 }
 




svn commit: r265762 - /lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/FetchedSegments.java

2005-09-01 Thread cutting
Author: cutting
Date: Thu Sep  1 11:35:15 2005
New Revision: 265762

URL: http://svn.apache.org/viewcvs?rev=265762&view=rev
Log:
Use partitioner to get partition.

Modified:

lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/FetchedSegments.java

Modified: 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/FetchedSegments.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/FetchedSegments.java?rev=265762&r1=265761&r2=265762&view=diff
==
--- 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/FetchedSegments.java
 (original)
+++ 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/FetchedSegments.java
 Thu Sep  1 11:35:15 2005
@@ -31,6 +31,8 @@
 import org.apache.nutch.parse.*;
 import org.apache.nutch.pagedb.*;
 import org.apache.nutch.indexer.*;
+import org.apache.nutch.mapred.*;
+import org.apache.nutch.mapred.lib.*;
 
 /** Implements {@link HitSummarizer} and {@link 
HitContent} for a set of
  * fetched segments. */
@@ -44,6 +46,8 @@
 private MapFile.Reader[] parseText;
 private MapFile.Reader[] parseData;
 
+private Partitioner partitioner = new HashPartitioner();
+
 public Segment(NutchFileSystem nfs, File segmentDir) throws IOException {
   this.nfs = nfs;
   this.segmentDir = segmentDir;
@@ -93,7 +97,8 @@
 // hash the url to figure out which part its in
 private Writable getEntry(MapFile.Reader[] readers, UTF8 url,
   Writable entry) throws IOException {
-  return readers[url.hashCode()%readers.length].get(url, entry);
+  int part = partitioner.getPartition(url, null, readers.length);
+  return readers[part].get(url, entry);
 }
 
   }




svn commit: r265778 - in /lucene/nutch/branches/mapred/src: java/org/apache/nutch/crawl/ java/org/apache/nutch/mapred/ java/org/apache/nutch/searcher/ web/jsp/

2005-09-01 Thread cutting
Author: cutting
Date: Thu Sep  1 14:03:51 2005
New Revision: 265778

URL: http://svn.apache.org/viewcvs?rev=265778&view=rev
Log:
Fix anchor & inlink access.

Added:

lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/HitInlinks.java

lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/LinkDbReader.java
Modified:
lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/LinkDb.java

lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/MapFileOutputFormat.java

lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/DistributedSearch.java

lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/FetchedSegments.java

lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/HitContent.java

lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/NutchBean.java
lucene/nutch/branches/mapred/src/web/jsp/anchors.jsp

Modified: 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/LinkDb.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/LinkDb.java?rev=265778&r1=265777&r2=265778&view=diff
==
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/LinkDb.java 
(original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/LinkDb.java 
Thu Sep  1 14:03:51 2005
@@ -131,9 +131,6 @@
 
 JobConf job = new JobConf(config);
 
-job.setInt("partition.url.by.host.seed", new Random().nextInt());
-job.setPartitionerClass(PartitionUrlByHost.class);
-
 job.setInputFormat(SequenceFileInputFormat.class);
 job.setInputKeyClass(UTF8.class);
 job.setInputValueClass(ParseData.class);

Modified: 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/MapFileOutputFormat.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/MapFileOutputFormat.java?rev=265778&r1=265777&r2=265778&view=diff
==
--- 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/MapFileOutputFormat.java
 (original)
+++ 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/MapFileOutputFormat.java
 Thu Sep  1 14:03:51 2005
@@ -18,6 +18,7 @@
 
 import java.io.IOException;
 import java.io.File;
+import java.util.Arrays;
 
 import org.apache.nutch.fs.NutchFileSystem;
 
@@ -48,6 +49,31 @@
 
 public void close(Reporter reporter) throws IOException { out.close();}
   };
-  }  
+  }
+
+  /** Open the output generated by this format. */
+  public static MapFile.Reader[] getReaders(NutchFileSystem fs, File dir)
+throws IOException {
+File[] names = fs.listFiles(dir);
+
+// sort names, so that hash partitioning works
+Arrays.sort(names);
+
+MapFile.Reader[] parts = new MapFile.Reader[names.length];
+for (int i = 0; i < names.length; i++) {
+  parts[i] = new MapFile.Reader(fs, names[i].toString());
+}
+return parts;
+  }
+
+  /** Get an entry from output generated by this class. */
+  public static Writable getEntry(MapFile.Reader[] readers,
+  Partitioner partitioner,
+  WritableComparable key,
+  Writable value) throws IOException {
+int part = partitioner.getPartition(key, value, readers.length);
+return readers[part].get(key, value);
+  }
+
 }
 

Modified: 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/DistributedSearch.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/DistributedSearch.java?rev=265778&r1=265777&r2=265778&view=diff
==
--- 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/DistributedSearch.java
 (original)
+++ 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/DistributedSearch.java
 Thu Sep  1 14:03:51 2005
@@ -24,6 +24,7 @@
 
 import org.apache.nutch.parse.ParseData;
 import org.apache.nutch.parse.ParseText;
+import org.apache.nutch.crawl.Inlinks;
 import org.apache.nutch.util.LogFormatter;
 import org.apache.nutch.io.*;
 import org.apache.nutch.ipc.RPC;
@@ -37,7 +38,7 @@
 
   /** The distributed search protocol. */
   public interface Protocol
-extends Searcher, HitDetailer, HitSummarizer, HitContent {
+extends Searcher, HitDetailer, HitSummarizer, HitContent, HitInlinks {
 
 /** The name of the segments searched by this node. */
 String[] getSegmentNames();
@@ -71,7 +72,8 @@
 
   /** The search client. */
   public static class Client extends Thread
-implements Searcher, HitDetailer, HitSummarizer, HitContent, Runnable {
+implements Searcher, HitDetailer, HitSummarizer, HitContent, HitInlinks,
+   Runnable {
 
 private InetSocketAddress[] defaultAddresses

svn commit: r264880 - /lucene/nutch/branches/mapred/bin/slaves.sh

2005-08-30 Thread cutting
Author: cutting
Date: Tue Aug 30 15:18:55 2005
New Revision: 264880

URL: http://svn.apache.org/viewcvs?rev=264880&view=rev
Log:
Always put a newline after host name.

Modified:
lucene/nutch/branches/mapred/bin/slaves.sh

Modified: lucene/nutch/branches/mapred/bin/slaves.sh
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/bin/slaves.sh?rev=264880&r1=264879&r2=264880&view=diff
==
--- lucene/nutch/branches/mapred/bin/slaves.sh (original)
+++ lucene/nutch/branches/mapred/bin/slaves.sh Tue Aug 30 15:18:55 2005
@@ -20,6 +20,6 @@
 fi
 
 for slave in `cat $NUTCH_SLAVES`; do
- echo -n $slave:\ 
+ echo $slave:
  ssh -o ConnectTimeout=1 $slave $@
 done




svn commit: r264685 - in /lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred: InterTrackerProtocol.java JobTracker.java TaskTracker.java

2005-08-29 Thread cutting
Author: cutting
Date: Mon Aug 29 20:08:46 2005
New Revision: 264685

URL: http://svn.apache.org/viewcvs?rev=264685&view=rev
Log:
Synchronize things in TaskTracker.offerService() loop.  Also remove boxing in 
the heartbeat RPC.

Modified:

lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/InterTrackerProtocol.java

lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/JobTracker.java

lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/TaskTracker.java

Modified: 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/InterTrackerProtocol.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/InterTrackerProtocol.java?rev=264685&r1=264684&r2=264685&view=diff
==
--- 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/InterTrackerProtocol.java
 (original)
+++ 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/InterTrackerProtocol.java
 Mon Aug 29 20:08:46 2005
@@ -35,7 +35,7 @@
* TaskTracker must also indicate whether this is the first interaction
* (since state refresh)
*/
-  IntWritable emitHeartbeat(TaskTrackerStatus status, BooleanWritable 
initialContact);
+  int emitHeartbeat(TaskTrackerStatus status, boolean initialContact);
 
   /** Called to get new tasks from from the job tracker for this tracker.*/
   Task pollForNewTask(String trackerName);

Modified: 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/JobTracker.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/JobTracker.java?rev=264685&r1=264684&r2=264685&view=diff
==
--- 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/JobTracker.java 
(original)
+++ 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/JobTracker.java 
Mon Aug 29 20:08:46 2005
@@ -329,13 +329,13 @@
 /**
  * Process incoming heartbeat messages from the task trackers.
  */
-public synchronized IntWritable emitHeartbeat(TaskTrackerStatus 
trackerStatus, BooleanWritable initialContact) {
+public synchronized int emitHeartbeat(TaskTrackerStatus trackerStatus, 
boolean initialContact) {
 String trackerName = trackerStatus.getTrackerName();
 trackerStatus.setLastSeen(System.currentTimeMillis());
 
 synchronized (taskTrackers) {
 synchronized (trackerExpiryQueue) {
-if (initialContact.get()) {
+if (initialContact) {
 // If it's first contact, then clear out any state hanging 
around
 if (taskTrackers.get(trackerName) != null) {
 taskTrackers.remove(trackerName);
@@ -344,14 +344,14 @@
 } else {
 // If not first contact, there should be some record of 
the tracker
 if (taskTrackers.get(trackerName) == null) {
-return new 
IntWritable(InterTrackerProtocol.UNKNOWN_TASKTRACKER);
+return InterTrackerProtocol.UNKNOWN_TASKTRACKER;
 }
 }
 
 // Store latest state.  If first contact, then save current
 // state in expiry queue
 taskTrackers.put(trackerName, trackerStatus);
-if (initialContact.get()) {
+if (initialContact) {
 trackerExpiryQueue.add(trackerStatus);
 }
 }
@@ -359,7 +359,7 @@
 
 updateTaskStatuses(trackerStatus);
 //LOG.info(Got heartbeat from +trackerName);
-return new IntWritable(InterTrackerProtocol.TRACKERS_OK);
+return InterTrackerProtocol.TRACKERS_OK;
 }
 
 /**

Modified: 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/TaskTracker.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/TaskTracker.java?rev=264685&r1=264684&r2=264685&view=diff
==
--- 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/TaskTracker.java 
(original)
+++ 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/TaskTracker.java 
Mon Aug 29 20:08:46 2005
@@ -124,7 +124,7 @@
  * within the same process space might be restarted, so everything must be
  * clean.
  */
-public void close() throws IOException {
+public synchronized void close() throws IOException {
 // Kill running tasks
 Vector v = new Vector();
 for (Iterator it = tasks.values().iterator(); it.hasNext(); ) {
@@ -186,7 +186,7 @@
 // Emit standard hearbeat message to check in with JobTracker
 //
 Vector taskReports = new Vector();
-synchronized (runningTasks) {
+synchronized

svn commit: r240279 - in /lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred: MapFileOutputFormat.java MapTask.java RecordWriter.java ReduceTask.java SequenceFileOutputFormat.java TaskTracker.java TextOutputFormat.java

2005-08-26 Thread cutting
Author: cutting
Date: Fri Aug 26 09:37:55 2005
New Revision: 240279

URL: http://svn.apache.org/viewcvs?rev=240279&view=rev
Log:
Always call done() on tasks, setting final progress to 1.0.  Also permit 
RecordWriter.close() to emit progress reports to avoid task timeouts when 
closing is lengthy.

Modified:

lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/MapFileOutputFormat.java
lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/MapTask.java

lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/RecordWriter.java

lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/ReduceTask.java

lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/SequenceFileOutputFormat.java

lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/TaskTracker.java

lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/TextOutputFormat.java

Modified: 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/MapFileOutputFormat.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/MapFileOutputFormat.java?rev=240279&r1=240278&r2=240279&view=diff
==
--- 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/MapFileOutputFormat.java
 (original)
+++ 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/MapFileOutputFormat.java
 Fri Aug 26 09:37:55 2005
@@ -46,7 +46,7 @@
   out.append(key, value);
 }
 
-public void close() throws IOException { out.close(); }
+public void close(Reporter reporter) throws IOException { out.close();}
   };
   }  
 }

Modified: 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/MapTask.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/MapTask.java?rev=240279r1=240278r2=240279view=diff
==
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/MapTask.java 
(original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/MapTask.java 
Fri Aug 26 09:37:55 2005
@@ -118,8 +118,6 @@
   ((CombiningCollector)collector).flush();
 }
 
-reportProgress(umbilical, 1.0f);  // done
-
   } finally {
 in.close();   // close input
   }
@@ -130,6 +128,7 @@
 }
   }
 }
+umbilical.done(getTaskId());
   }
   
 }

Modified: 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/RecordWriter.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/RecordWriter.java?rev=240279r1=240278r2=240279view=diff
==
--- 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/RecordWriter.java 
(original)
+++ 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/RecordWriter.java 
Fri Aug 26 09:37:55 2005
@@ -35,5 +35,5 @@
   void write(WritableComparable key, Writable value) throws IOException;
 
   /** Close this to future operations.*/ 
-  void close() throws IOException;
+  void close(Reporter reporter) throws IOException;
 }

Modified: 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/ReduceTask.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/ReduceTask.java?rev=240279r1=240278r2=240279view=diff
==
--- 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/ReduceTask.java 
(original)
+++ 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/ReduceTask.java 
Fri Aug 26 09:37:55 2005
@@ -264,10 +264,10 @@
 } finally {
   in.close();
   lfs.delete(new File(sortedFile));   // remove sorted
-  out.close();
+  out.close(reporter);
 }
 
-reportProgress(umbilical);
+umbilical.done(getTaskId());
   }
 
   /** Construct output file names so that, when an output directory listing is

Modified: 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/SequenceFileOutputFormat.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/SequenceFileOutputFormat.java?rev=240279r1=240278r2=240279view=diff
==
--- 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/SequenceFileOutputFormat.java
 (original)
+++ 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/SequenceFileOutputFormat.java
 Fri Aug 26 09:37:55 2005
@@ -46,7 +46,7 @@
   out.append(key, value);
 }
 
-public void close() throws IOException { out.close(); }
+public void close(Reporter reporter) throws IOException { out.close

svn commit: r240280 - /lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/LinkDb.java

2005-08-26 Thread cutting
Author: cutting
Date: Fri Aug 26 09:39:11 2005
New Revision: 240280

URL: http://svn.apache.org/viewcvs?rev=240280&view=rev
Log:
Limit to 10,000 inlinks by default.  Also optimize a common case.

Modified:
lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/LinkDb.java

Modified: 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/LinkDb.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/LinkDb.java?rev=240280r1=240279r2=240280view=diff
==
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/LinkDb.java 
(original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/LinkDb.java 
Fri Aug 26 09:39:11 2005
@@ -49,7 +49,7 @@
 
   public void configure(JobConf job) {
 maxAnchorLength = job.getInt(db.max.anchor.length, 100);
-maxInlinks = job.getInt(db.max.inlinks, 10);
+maxInlinks = job.getInt(db.max.inlinks, 1);
   }
 
   public void map(WritableComparable key, Writable value,
@@ -74,9 +74,21 @@
   public void reduce(WritableComparable key, Iterator values,
  OutputCollector output, Reporter reporter)
 throws IOException {
-Inlinks result = new Inlinks();
+
+Inlinks result = null;
+
 while (values.hasNext()) {
   Inlinks inlinks = (Inlinks)values.next();
+
+  if (result == null) {   // optimize a common case
+if (inlinks.size()  maxInlinks) {
+  result = inlinks;
+  continue;
+} else {
+  result = new Inlinks();
+}
+  }
+
   int end = Math.min(maxInlinks - result.size(), inlinks.size());
   for (int i = 0; i  end; i++) {
 result.add(inlinks.get(i));




svn commit: r240346 - /lucene/nutch/branches/mapred/conf/nutch-default.xml

2005-08-26 Thread cutting
Author: cutting
Date: Fri Aug 26 14:21:08 2005
New Revision: 240346

URL: http://svn.apache.org/viewcvs?rev=240346&view=rev
Log:
Fix a crazy default.  This made indexing rather slow...

Modified:
lucene/nutch/branches/mapred/conf/nutch-default.xml

Modified: lucene/nutch/branches/mapred/conf/nutch-default.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/conf/nutch-default.xml?rev=240346r1=240345r2=240346view=diff
==
--- lucene/nutch/branches/mapred/conf/nutch-default.xml (original)
+++ lucene/nutch/branches/mapred/conf/nutch-default.xml Fri Aug 26 14:21:08 2005
@@ -488,12 +488,12 @@
 
 property
   nameindexer.maxMergeDocs/name
-  value50/value
+  value2147483647/value
   descriptionThis number determines the maximum number of Lucene
   Documents to be merged into a new Lucene segment. Larger values
-  increase indexing speed and reduce the number of Lucene segments,
+  increase batch indexing speed and reduce the number of Lucene segments,
   which reduces the number of open file handles; however, this also
-  increases RAM usage during indexing.
+  decreases incremental indexing performance.
   /description
 /property
 




svn commit: r235756 - /lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/TaskRunner.java

2005-08-22 Thread cutting
Author: cutting
Date: Mon Aug 22 10:08:17 2005
New Revision: 235756

URL: http://svn.apache.org/viewcvs?rev=235756&view=rev
Log:
Always kill forked child so that it doesn't consume file handles.

Modified:

lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/TaskRunner.java

Modified: 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/TaskRunner.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/TaskRunner.java?rev=235756r1=235755r2=235756view=diff
==
--- 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/TaskRunner.java 
(original)
+++ 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/TaskRunner.java 
Mon Aug 22 10:08:17 2005
@@ -134,6 +134,8 @@
   
 } catch (InterruptedException e) {
   throw new IOException(e.toString());
+} finally {
+  kill();
 }
   }
 




svn commit: r233569 - /lucene/nutch/branches/mapred/bin/nutch-daemon.sh

2005-08-19 Thread cutting
Author: cutting
Date: Fri Aug 19 15:54:04 2005
New Revision: 233569

URL: http://svn.apache.org/viewcvs?rev=233569&view=rev
Log:
Fix to sync whole tree.

Modified:
lucene/nutch/branches/mapred/bin/nutch-daemon.sh

Modified: lucene/nutch/branches/mapred/bin/nutch-daemon.sh
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/bin/nutch-daemon.sh?rev=233569r1=233568r2=233569view=diff
==
--- lucene/nutch/branches/mapred/bin/nutch-daemon.sh (original)
+++ lucene/nutch/branches/mapred/bin/nutch-daemon.sh Fri Aug 19 15:54:04 2005
@@ -57,7 +57,7 @@
 root=`dirname $this`/..
 if [ $NUTCH_MASTER !=  ]; then
   echo rsync from $NUTCH_MASTER
-  rsync -a --delete --exclude=.svn $NUTCH_MASTER/{build,bin,lib,conf} $root
+  rsync -a --delete --exclude=.svn $NUTCH_MASTER/ $root
 fi
 
 cd $root




svn commit: r233360 - /lucene/nutch/branches/mapred/src/java/org/apache/nutch/protocol/Content.java

2005-08-18 Thread cutting
Author: cutting
Date: Thu Aug 18 12:19:05 2005
New Revision: 233360

URL: http://svn.apache.org/viewcvs?rev=233360&view=rev
Log:
Fix a bug in equals(), where the other object may still be deflated.

Modified:
lucene/nutch/branches/mapred/src/java/org/apache/nutch/protocol/Content.java

Modified: 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/protocol/Content.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/protocol/Content.java?rev=233360r1=233359r2=233360view=diff
==
--- 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/protocol/Content.java 
(original)
+++ 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/protocol/Content.java 
Thu Aug 18 12:19:05 2005
@@ -158,6 +158,7 @@
   return false;
 }
 Content that = (Content)o;
+that.ensureInflated();
 return
   this.url.equals(that.url) 
   this.base.equals(that.base) 




svn commit: r232841 - in /lucene/nutch/branches/mapred/src/java/org/apache/nutch: io/CompressedWritable.java protocol/Content.java

2005-08-15 Thread cutting
Author: cutting
Date: Mon Aug 15 11:10:23 2005
New Revision: 232841

URL: http://svn.apache.org/viewcvs?rev=232841&view=rev
Log:
Lazily decompress content.

Added:

lucene/nutch/branches/mapred/src/java/org/apache/nutch/io/CompressedWritable.java
Modified:
lucene/nutch/branches/mapred/src/java/org/apache/nutch/protocol/Content.java

Added: 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/io/CompressedWritable.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/io/CompressedWritable.java?rev=232841view=auto
==
--- 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/io/CompressedWritable.java
 (added)
+++ 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/io/CompressedWritable.java
 Mon Aug 15 11:10:23 2005
@@ -0,0 +1,81 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the License);
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.io;
+
+import java.io.IOException;
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.DataOutputStream;
+import java.io.DataInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.ByteArrayInputStream;
+import java.util.zip.DeflaterOutputStream;
+import java.util.zip.InflaterInputStream;
+
+/** A base-class for Writables which store themselves compressed and lazily
+ * inflate on field access.  This is useful for large objects whose fields are
+ * not altered during a map or reduce operation: leaving the field data
+ * compressed makes copying the instance from one file to another much
+ * faster. */
+public abstract class CompressedWritable implements Writable {
+  // if non-null, the compressed field data of this instance.
+  private byte[] compressed;
+
+  public CompressedWritable() {}
+
+  public final void readFields(DataInput in) throws IOException {
+compressed = new byte[in.readInt()];
+in.readFully(compressed, 0, compressed.length);
+  }
+
+  /** Must be called by all methods which access fields to ensure that the data
+   * has been uncompressed. */
+  protected void ensureInflated() {
+if (compressed != null) {
+  try {
+ByteArrayInputStream deflated = new ByteArrayInputStream(compressed);
+DataInput inflater =
+  new DataInputStream(new InflaterInputStream(deflated));
+readFieldsCompressed(inflater);
+compressed = null;
+  } catch (IOException e) {
+throw new RuntimeException(e);
+  }
+}
+  }
+
+  /** Subclasses implement this instead of {@link #readFields(DataInput)}. */
+  protected abstract void readFieldsCompressed(DataInput in)
+throws IOException;
+
+  public final void write(DataOutput out) throws IOException {
+if (compressed == null) {
+  ByteArrayOutputStream deflated = new ByteArrayOutputStream();
+  DataOutputStream deflater =
+new DataOutputStream(new DeflaterOutputStream(deflated));
+  writeCompressed(deflater);
+  deflater.close();
+  compressed = deflated.toByteArray();
+}
+out.writeInt(compressed.length);
+out.write(compressed);
+  }
+
+  /** Subclasses implement this instead of {@link #write(DataOutput)}. */
+  protected abstract void writeCompressed(DataOutput out) throws IOException;
+
+}

Modified: 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/protocol/Content.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/protocol/Content.java?rev=232841r1=232840r2=232841view=diff
==
--- 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/protocol/Content.java 
(original)
+++ 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/protocol/Content.java 
Mon Aug 15 11:10:23 2005
@@ -23,12 +23,13 @@
 import org.apache.nutch.fs.*;
 import org.apache.nutch.util.*;
 
-public final class Content extends VersionedWritable {
+public final class Content extends CompressedWritable {
 
   public static final String DIR_NAME = content;
 
   private final static byte VERSION = 1;
 
+  private byte version;
   private String url;
   private String base;
   private byte[] content;
@@ -53,15 +54,16 @@
 this.metadata = metadata;
   }
 
-  public byte getVersion() { return VERSION; }
-
-  public final void readFields(DataInput in) throws IOException {
-super.readFields

svn commit: r225344 - /lucene/nutch/branches/mapred/src/java/org/apache/nutch/io/SequenceFile.java

2005-07-26 Thread cutting
Author: cutting
Date: Tue Jul 26 09:40:00 2005
New Revision: 225344

URL: http://svn.apache.org/viewcvs?rev=225344&view=rev
Log:
Fix bug with syncs in large merges.

Modified:
lucene/nutch/branches/mapred/src/java/org/apache/nutch/io/SequenceFile.java

Modified: 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/io/SequenceFile.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/io/SequenceFile.java?rev=225344r1=225343r2=225344view=diff
==
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/io/SequenceFile.java 
(original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/io/SequenceFile.java 
Tue Jul 26 09:40:00 2005
@@ -618,7 +618,8 @@
 this.pass = pass;
 this.last = last;
 
-this.queue = new MergeQueue(factor, last ? outFile : outFile+.+pass);
+this.queue =
+  new MergeQueue(factor, last ? outFile : outFile+.+pass, last);
 
 this.inName = outFile+.+(pass-1);
 this.in = new NFSDataInputStream(nfs.open(new File(inName)));
@@ -695,7 +696,7 @@
   private MergeQueue queue;
 
   public MergeFiles() throws IOException {
-this.queue = new MergeQueue(factor, outFile);
+this.queue = new MergeQueue(factor, outFile, true);
   }
 
   public void close() throws IOException {
@@ -741,12 +742,15 @@
 
 private class MergeQueue extends PriorityQueue {
   private NFSDataOutputStream out;
+  private boolean done;
 
-  public MergeQueue(int size, String outName) throws IOException {
+  public MergeQueue(int size, String outName, boolean done)
+throws IOException {
 initialize(size);
 this.out =
   new NFSDataOutputStream(nfs.create(new File(outName)),
   memory/(factor+1));
+this.done = done;
   }
 
   protected boolean lessThan(Object a, Object b) {
@@ -758,6 +762,9 @@
 
   public void merge() throws IOException {
 Writer writer = new Writer(out, keyClass, valClass);
+if (!done) {
+  writer.sync = null; // disable sync on temp files
+}
 
 while (size() != 0) {
   MergeStream ms = (MergeStream)top();




svn commit: r219566 - in /lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred: TaskRunner.java TaskTracker.java

2005-07-18 Thread cutting
Author: cutting
Date: Mon Jul 18 13:57:34 2005
New Revision: 219566

URL: http://svn.apache.org/viewcvs?rev=219566&view=rev
Log:
Catch Throwable, not just Exception, and always log and report it to tracker.

Modified:

lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/TaskRunner.java

lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/TaskTracker.java

Modified: 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/TaskRunner.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/TaskRunner.java?rev=219566r1=219565r2=219566view=diff
==
--- 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/TaskRunner.java 
(original)
+++ 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/TaskRunner.java 
Mon Jul 18 13:57:34 2005
@@ -94,8 +94,15 @@
 t.getTaskId() // pass task identifier
   }, null);
 
-} catch (Exception e) {
-  LOG.log(Level.WARNING, Child Error, e);
+} catch (Throwable throwable) {
+  LOG.log(Level.WARNING, Child Error, throwable);
+  ByteArrayOutputStream baos = new ByteArrayOutputStream();
+  throwable.printStackTrace(new PrintStream(baos));
+  try {
+tracker.reportDiagnosticInfo(t.getTaskId(), baos.toString());
+  } catch (IOException e) {
+LOG.log(Level.WARNING, Reporting Diagnostics, e);
+  }
 } finally {
   tracker.reportTaskFinished(t.getTaskId());
 }

Modified: 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/TaskTracker.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/TaskTracker.java?rev=219566r1=219565r2=219566view=diff
==
--- 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/TaskTracker.java 
(original)
+++ 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/TaskTracker.java 
Mon Jul 18 13:57:34 2005
@@ -502,7 +502,7 @@
  * The main() for child processes. 
  */
 public static class Child {
-public static void main(String[] args) throws Exception {
+public static void main(String[] args) throws Throwable {
   LogFormatter.showTime(false);
   LOG.info(Child starting);
 
@@ -516,12 +516,12 @@
   JobConf job = new JobConf(task.getJobFile());
   try {
   task.run(job, umbilical);   // run the task
-  } catch (Exception ie) {
+  } catch (Throwable throwable) {
+  LOG.log(Level.WARNING, Failed to spawn child, throwable);
   // Report back any failures, for diagnostic purposes
   ByteArrayOutputStream baos = new ByteArrayOutputStream();
-  ie.printStackTrace(new PrintStream(baos));
+  throwable.printStackTrace(new PrintStream(baos));
   umbilical.reportDiagnosticInfo(taskid, baos.toString());
-  throw ie;
   }
   umbilical.done(taskid);
 }




svn commit: r219563 - in /lucene/nutch/branches/mapred/conf: crawl-urlfilter.txt.template regex-urlfilter.txt.template

2005-07-18 Thread cutting
Author: cutting
Date: Mon Jul 18 13:42:37 2005
New Revision: 219563

URL: http://svn.apache.org/viewcvs?rev=219563&view=rev
Log:
Skip URLs with repeating segments.

Modified:
lucene/nutch/branches/mapred/conf/crawl-urlfilter.txt.template
lucene/nutch/branches/mapred/conf/regex-urlfilter.txt.template

Modified: lucene/nutch/branches/mapred/conf/crawl-urlfilter.txt.template
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/conf/crawl-urlfilter.txt.template?rev=219563r1=219562r2=219563view=diff
==
--- lucene/nutch/branches/mapred/conf/crawl-urlfilter.txt.template (original)
+++ lucene/nutch/branches/mapred/conf/crawl-urlfilter.txt.template Mon Jul 18 
13:42:37 2005
@@ -17,6 +17,9 @@
 # skip URLs containing certain characters as probable queries, etc.
 -[?*!@=]
 
+# skip URLs with slash-delimited segment that repeats 3+ times, to break loops
+-.*(/.+?)/.*?\1/.*?\1/
+
 # accept hosts in MY.DOMAIN.NAME
 +^http://([a-z0-9]*\.)*MY.DOMAIN.NAME/
 

Modified: lucene/nutch/branches/mapred/conf/regex-urlfilter.txt.template
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/conf/regex-urlfilter.txt.template?rev=219563r1=219562r2=219563view=diff
==
--- lucene/nutch/branches/mapred/conf/regex-urlfilter.txt.template (original)
+++ lucene/nutch/branches/mapred/conf/regex-urlfilter.txt.template Mon Jul 18 
13:42:37 2005
@@ -15,5 +15,8 @@
 # skip URLs containing certain characters as probable queries, etc.
 -[?*!@=]
 
+# skip URLs with slash-delimited segment that repeats 3+ times, to break loops
+-.*(/.+?)/.*?\1/.*?\1/
+
 # accept anything else
 +.




svn commit: r210201 - /lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Crawl.java

2005-07-11 Thread cutting
Author: cutting
Date: Mon Jul 11 13:05:28 2005
New Revision: 210201

URL: http://svn.apache.org/viewcvs?rev=210201&view=rev
Log:
Store indexes in indexes directory.  Use correct FS to list segments.

Modified:
lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Crawl.java

Modified: 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Crawl.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Crawl.java?rev=210201r1=210200r2=210201view=diff
==
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Crawl.java 
(original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Crawl.java Mon 
Jul 11 13:05:28 2005
@@ -91,7 +91,7 @@
 File crawlDb = new File(dir + /crawldb);
 File linkDb = new File(dir + /linkdb);
 File segments = new File(dir + /segments);
-File index = new File(dir + /index);
+File index = new File(dir + /indexes);
   
 // initialize crawlDb
 new Injector(conf).inject(crawlDb, rootUrlFile);
@@ -108,7 +108,7 @@
 new LinkDb(conf).invert(linkDb, segments); // invert links
 
 // index
-new Indexer(conf).index(index, linkDb, segments.listFiles());
+new Indexer(conf).index(index, linkDb, fs.listFiles(segments));
 
 LOG.info(crawl finished:  + dir);
   }




svn commit: r213607 - in /lucene/nutch/branches/mapred: ./ conf/ src/java/org/apache/nutch/crawl/ src/java/org/apache/nutch/indexer/ src/java/org/apache/nutch/searcher/

2005-07-11 Thread cutting
Author: cutting
Date: Mon Jul 11 14:30:22 2005
New Revision: 213607

URL: http://svn.apache.org/viewcvs?rev=213607&view=rev
Log:
Get search working on NDFS-resident, MapReduce-created crawl.

Modified:
lucene/nutch/branches/mapred/build.xml
lucene/nutch/branches/mapred/conf/nutch-default.xml
lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Indexer.java

lucene/nutch/branches/mapred/src/java/org/apache/nutch/indexer/NdfsDirectory.java

lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/FetchedSegments.java

lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/HitDetails.java

lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/IndexSearcher.java

lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/NutchBean.java

Modified: lucene/nutch/branches/mapred/build.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/build.xml?rev=213607r1=213606r2=213607view=diff
==
--- lucene/nutch/branches/mapred/build.xml (original)
+++ lucene/nutch/branches/mapred/build.xml Mon Jul 11 14:30:22 2005
@@ -119,7 +119,7 @@
   !-- == --
   !----
   !-- == --
-  target name=war depends=compile,generate-docs
+  target name=war depends=jar,compile,generate-docs
 war destfile=${build.dir}/${final.name}.war
 webxml=${web.src.dir}/web.xml
   fileset dir=${web.src.dir}/jsp/

Modified: lucene/nutch/branches/mapred/conf/nutch-default.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/conf/nutch-default.xml?rev=213607r1=213606r2=213607view=diff
==
--- lucene/nutch/branches/mapred/conf/nutch-default.xml (original)
+++ lucene/nutch/branches/mapred/conf/nutch-default.xml Mon Jul 11 14:30:22 2005
@@ -498,9 +498,9 @@
 
 property
   namesearcher.dir/name
-  value./value
+  valuecrawl/value
   description
-  Path to root of index directories.  This directory is searched (in
+  Path to root of crawl.  This directory is searched (in
   order) for either the file search-servers.txt, containing a list of
   distributed search servers, or the directory index containing
   merged indexes, or the directory segments containing segment

Modified: 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Indexer.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Indexer.java?rev=213607r1=213606r2=213607view=diff
==
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Indexer.java 
(original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Indexer.java 
Mon Jul 11 14:30:22 2005
@@ -100,6 +100,7 @@
 writer.optimize();
 writer.close();
 fs.completeLocalOutput(perm, temp);   // copy to ndfs
+fs.createNewFile(new File(perm, IndexSegment.DONE_NAME));
   }
 };
 }

Modified: 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/indexer/NdfsDirectory.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/indexer/NdfsDirectory.java?rev=213607r1=213606r2=213607view=diff
==
--- 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/indexer/NdfsDirectory.java
 (original)
+++ 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/indexer/NdfsDirectory.java
 Mon Jul 11 14:30:22 2005
@@ -1,5 +1,3 @@
-package org.apache.lucene.store;
-
 /**
  * Copyright 2004 The Apache Software Foundation
  *
@@ -15,6 +13,8 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+
+package org.apache.nutch.indexer;
 
 import java.io.*;
 import org.apache.lucene.store.*;

Modified: 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/FetchedSegments.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/FetchedSegments.java?rev=213607r1=213606r2=213607view=diff
==
--- 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/FetchedSegments.java
 (original)
+++ 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/FetchedSegments.java
 Mon Jul 11 14:30:22 2005
@@ -39,57 +39,64 @@
 private NutchFileSystem nfs;
 private File segmentDir;
 
-private ArrayFile.Reader fetcher;
-private ArrayFile.Reader content;
-private ArrayFile.Reader text;
-private ArrayFile.Reader parsedata;
+private MapFile.Reader[] content;
+private

svn commit: r210036 - /lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java

2005-07-10 Thread cutting
Author: cutting
Date: Sun Jul 10 14:20:46 2005
New Revision: 210036

URL: http://svn.apache.org/viewcvs?rev=210036&view=rev
Log:
Actually use the new InputFormat!

Modified:
lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java

Modified: 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java?rev=210036r1=210035r2=210036view=diff
==
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java 
(original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java 
Sun Jul 10 14:20:46 2005
@@ -38,7 +38,7 @@
   
   public static final String DIGEST_KEY = nutch.content.digest;
 
-  public class InputFormat extends SequenceFileInputFormat {
+  public static class InputFormat extends SequenceFileInputFormat {
 /** Don't split inputs, to keep things polite. */
 public FileSplit[] getSplits(NutchFileSystem fs, JobConf job, int nSplits)
   throws IOException {
@@ -253,7 +253,7 @@
 job.setInt(fetcher.threads.fetch, threads);
 
 job.setInputDir(new File(segment, CrawlDatum.GENERATE_DIR_NAME));
-job.setInputFormat(SequenceFileInputFormat.class);
+job.setInputFormat(InputFormat.class);
 job.setInputKeyClass(UTF8.class);
 job.setInputValueClass(CrawlDatum.class);
 




  1   2   >