svn commit: r798304 [3/3] - in /lucene/nutch/logos: ./ character-hand-big.png character.eps nutch_logo.eps nutch_logo.png

2009-07-27 Thread cutting
Added: lucene/nutch/logos/nutch_logo.eps
URL: 
http://svn.apache.org/viewvc/lucene/nutch/logos/nutch_logo.eps?rev=798304&view=auto
==
Binary file - no diff available.

Propchange: lucene/nutch/logos/nutch_logo.eps
--
svn:mime-type = application/octet-stream

Added: lucene/nutch/logos/nutch_logo.png
URL: 
http://svn.apache.org/viewvc/lucene/nutch/logos/nutch_logo.png?rev=798304&view=auto
==
Binary file - no diff available.

Propchange: lucene/nutch/logos/nutch_logo.png
--
svn:mime-type = application/octet-stream




svn commit: r798304 [1/3] - in /lucene/nutch/logos: ./ character-hand-big.png character.eps nutch_logo.eps nutch_logo.png

2009-07-27 Thread cutting
Author: cutting
Date: Mon Jul 27 22:06:52 2009
New Revision: 798304

URL: http://svn.apache.org/viewvc?rev=798304&view=rev
Log:
Adding high-resolution original logo artwork.

Added:
lucene/nutch/logos/
lucene/nutch/logos/character-hand-big.png   (with props)
lucene/nutch/logos/character.eps
lucene/nutch/logos/nutch_logo.eps   (with props)
lucene/nutch/logos/nutch_logo.png   (with props)

Added: lucene/nutch/logos/character-hand-big.png
URL: 
http://svn.apache.org/viewvc/lucene/nutch/logos/character-hand-big.png?rev=798304&view=auto
==
Binary file - no diff available.

Propchange: lucene/nutch/logos/character-hand-big.png
--
svn:mime-type = application/octet-stream




svn commit: r503832 - /lucene/nutch/nightly/nightly.cron

2007-02-05 Thread cutting
Author: cutting
Date: Mon Feb  5 11:22:43 2007
New Revision: 503832

URL: http://svn.apache.org/viewvc?view=rev&rev=503832
Log:
Changed to a time with a potentially lower load.

Modified:
lucene/nutch/nightly/nightly.cron

Modified: lucene/nutch/nightly/nightly.cron
URL: 
http://svn.apache.org/viewvc/lucene/nutch/nightly/nightly.cron?view=diff&rev=503832&r1=503831&r2=503832
==
--- lucene/nutch/nightly/nightly.cron (original)
+++ lucene/nutch/nightly/nightly.cron Mon Feb  5 11:22:43 2007
@@ -1,4 +1,4 @@
 # nightly crontab file
 # install with: crontab nightly.cron
-# run seventeen minutes after midnight, every day
-17 0 * * *   $HOME/nutch-nightly/nightly.sh > $HOME/nutch-nightly/nightly.log 2>&1
+# run at 6:51, every day
+51 6 * * *   $HOME/nutch-nightly/nightly.sh > $HOME/nutch-nightly/nightly.log 2>&1




svn commit: r475926 - /lucene/nutch/nightly/nightly.sh

2006-11-17 Thread cutting
Author: cutting
Date: Thu Nov 16 13:03:26 2006
New Revision: 475926

URL: http://svn.apache.org/viewvc?view=rev&rev=475926
Log:
Update nightly build location.

Modified:
lucene/nutch/nightly/nightly.sh

Modified: lucene/nutch/nightly/nightly.sh
URL: 
http://svn.apache.org/viewvc/lucene/nutch/nightly/nightly.sh?view=diff&rev=475926&r1=475925&r2=475926
==
--- lucene/nutch/nightly/nightly.sh (original)
+++ lucene/nutch/nightly/nightly.sh Thu Nov 16 13:03:26 2006
@@ -5,7 +5,7 @@
 TRUNK=http://svn.apache.org/repos/asf/lucene/nutch/trunk
 
 REL_SERVER=people.apache.org
-REL_DIR=/www/people.apache.org/dist/lucene/nutch/nightly
+REL_DIR=/www/people.apache.org/builds/lucene/nutch/nightly
 
 # create an empty build directory
 rm -rf /tmp/nutch-nightly




svn commit: r421185 - /lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java

2006-07-12 Thread cutting
Author: cutting
Date: Wed Jul 12 01:16:37 2006
New Revision: 421185

URL: http://svn.apache.org/viewvc?rev=421185&view=rev
Log:
Patch a bug introduced by Hadoop 0.4.0, which requires specified input
directories to exist.

Modified:
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java?rev=421185&r1=421184&r2=421185&view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java Wed Jul 12 
01:16:37 2006
@@ -65,7 +65,8 @@
 if (LOG.isInfoEnabled()) { LOG.info("CrawlDb update: done"); }
   }
 
-  public static JobConf createJob(Configuration config, Path crawlDb) {
+  public static JobConf createJob(Configuration config, Path crawlDb)
+throws IOException {
 Path newCrawlDb =
   new Path(crawlDb,
Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
@@ -73,7 +74,11 @@
 JobConf job = new NutchJob(config);
 job.setJobName("crawldb " + crawlDb);
 
-job.addInputPath(new Path(crawlDb, CrawlDatum.DB_DIR_NAME));
+
+Path current = new Path(crawlDb, CrawlDatum.DB_DIR_NAME);
+if (FileSystem.get(job).exists(current)) {
+  job.addInputPath(current);
+}
 job.setInputFormat(SequenceFileInputFormat.class);
 job.setInputKeyClass(UTF8.class);
 job.setInputValueClass(CrawlDatum.class);




svn commit: r418739 - /lucene/nutch/nightly/nightly.sh

2006-07-03 Thread cutting
Author: cutting
Date: Mon Jul  3 03:44:31 2006
New Revision: 418739

URL: http://svn.apache.org/viewvc?rev=418739&view=rev
Log:
Use JDK 1.5 for nightly build.

Modified:
lucene/nutch/nightly/nightly.sh

Modified: lucene/nutch/nightly/nightly.sh
URL: 
http://svn.apache.org/viewvc/lucene/nutch/nightly/nightly.sh?rev=418739&r1=418738&r2=418739&view=diff
==
--- lucene/nutch/nightly/nightly.sh (original)
+++ lucene/nutch/nightly/nightly.sh Mon Jul  3 03:44:31 2006
@@ -1,6 +1,6 @@
 #!/bin/bash -vx
 
-export JAVA_HOME=/usr/j2se
+export JAVA_HOME=$HOME/local/jdk
 
 TRUNK=http://svn.apache.org/repos/asf/lucene/nutch/trunk
 




svn commit: r417884 - in /lucene/nutch/trunk: lib/ src/java/org/apache/nutch/fetcher/ src/java/org/apache/nutch/indexer/ src/java/org/apache/nutch/parse/ src/java/org/apache/nutch/segment/

2006-06-28 Thread cutting
Author: cutting
Date: Wed Jun 28 14:54:53 2006
New Revision: 417884

URL: http://svn.apache.org/viewvc?rev=417884&view=rev
Log:
NUTCH-312.  Upgrade to Hadoop 0.4.0.

Added:
lucene/nutch/trunk/lib/commons-cli-2.0-SNAPSHOT.jar   (with props)
lucene/nutch/trunk/lib/hadoop-0.4.0.jar   (with props)
Removed:
lucene/nutch/trunk/lib/hadoop-0.3.2.jar
Modified:

lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherOutputFormat.java
lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java
lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java
lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java
lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java
lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java

Added: lucene/nutch/trunk/lib/commons-cli-2.0-SNAPSHOT.jar
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/commons-cli-2.0-SNAPSHOT.jar?rev=417884&view=auto
==
Binary file - no diff available.

Propchange: lucene/nutch/trunk/lib/commons-cli-2.0-SNAPSHOT.jar
--
svn:mime-type = application/octet-stream

Added: lucene/nutch/trunk/lib/hadoop-0.4.0.jar
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/hadoop-0.4.0.jar?rev=417884&view=auto
==
Binary file - no diff available.

Propchange: lucene/nutch/trunk/lib/hadoop-0.4.0.jar
--
svn:mime-type = application/octet-stream

Modified: 
lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherOutputFormat.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherOutputFormat.java?rev=417884&r1=417883&r2=417884&view=diff
==
--- 
lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherOutputFormat.java 
(original)
+++ 
lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherOutputFormat.java 
Wed Jun 28 14:54:53 2006
@@ -31,6 +31,7 @@
 import org.apache.hadoop.mapred.RecordWriter;
 import org.apache.hadoop.mapred.JobConf;
 import org.apache.hadoop.mapred.Reporter;
+import org.apache.hadoop.util.Progressable;
 
 import org.apache.nutch.parse.ParseOutputFormat;
 import org.apache.nutch.protocol.Content;
@@ -45,7 +46,8 @@
 
   public RecordWriter getRecordWriter(final FileSystem fs,
   final JobConf job,
-  final String name) throws IOException {
+  final String name,
+  final Progressable progress) throws 
IOException {
 
 final Path fetch =
   new Path(new Path(job.getOutputPath(), CrawlDatum.FETCH_DIR_NAME), name);
@@ -66,7 +68,7 @@
   }
 
   if (Fetcher.isParsing(job)) {
-parseOut = new ParseOutputFormat().getRecordWriter(fs, job, name);
+parseOut = new ParseOutputFormat().getRecordWriter(fs, job, name, 
null);
   }
 }
 

Modified: 
lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java?rev=417884&r1=417883&r2=417884&view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java 
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java 
Wed Jun 28 14:54:53 2006
@@ -26,6 +26,7 @@
 import org.apache.hadoop.fs.*;
 import org.apache.hadoop.conf.*;
 import org.apache.hadoop.mapred.*;
+import org.apache.hadoop.util.Progressable;
 
 import org.apache.nutch.util.NutchConfiguration;
 import org.apache.nutch.util.NutchJob;
@@ -276,7 +277,8 @@
   /** Write nothing. */
   public RecordWriter getRecordWriter(final FileSystem fs,
   final JobConf job,
-  final String name) throws IOException {
+  final String name,
+  final Progressable progress) throws 
IOException {
 return new RecordWriter() {   
 public void write(WritableComparable key, Writable value)
   throws IOException {

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java?rev=417884&r1=417883&r2=417884&view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/indexe

svn commit: r413175 - in /lucene/nutch/trunk/lib: hadoop-0.3.1.jar hadoop-0.3.2.jar

2006-06-09 Thread cutting
Author: cutting
Date: Fri Jun  9 14:48:23 2006
New Revision: 413175

URL: http://svn.apache.org/viewvc?rev=413175&view=rev
Log:
Upgrading to Hadoop 0.3.2 release.

Added:
lucene/nutch/trunk/lib/hadoop-0.3.2.jar   (with props)
Removed:
lucene/nutch/trunk/lib/hadoop-0.3.1.jar

Added: lucene/nutch/trunk/lib/hadoop-0.3.2.jar
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/hadoop-0.3.2.jar?rev=413175&view=auto
==
Binary file - no diff available.

Propchange: lucene/nutch/trunk/lib/hadoop-0.3.2.jar
--
svn:mime-type = application/octet-stream




svn commit: r411943 - in /lucene/nutch/trunk/lib: commons-logging-1.0.4.jar hadoop-0.2.1.jar hadoop-0.3.1.jar log4j-1.2.13.jar

2006-06-05 Thread cutting
Author: cutting
Date: Mon Jun  5 16:03:45 2006
New Revision: 411943

URL: http://svn.apache.org/viewvc?rev=411943&view=rev
Log:
Updating to Hadoop release 0.3.1.  Hadoop now uses Jakarta Commons Logging, 
configured for log4j by default.

Added:
lucene/nutch/trunk/lib/commons-logging-1.0.4.jar   (with props)
lucene/nutch/trunk/lib/hadoop-0.3.1.jar   (with props)
lucene/nutch/trunk/lib/log4j-1.2.13.jar   (with props)
Removed:
lucene/nutch/trunk/lib/hadoop-0.2.1.jar

Added: lucene/nutch/trunk/lib/commons-logging-1.0.4.jar
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/commons-logging-1.0.4.jar?rev=411943&view=auto
==
Binary file - no diff available.

Propchange: lucene/nutch/trunk/lib/commons-logging-1.0.4.jar
--
svn:mime-type = application/octet-stream

Added: lucene/nutch/trunk/lib/hadoop-0.3.1.jar
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/hadoop-0.3.1.jar?rev=411943&view=auto
==
Binary file - no diff available.

Propchange: lucene/nutch/trunk/lib/hadoop-0.3.1.jar
--
svn:mime-type = application/octet-stream

Added: lucene/nutch/trunk/lib/log4j-1.2.13.jar
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/log4j-1.2.13.jar?rev=411943&view=auto
==
Binary file - no diff available.

Propchange: lucene/nutch/trunk/lib/log4j-1.2.13.jar
--
svn:mime-type = application/octet-stream




svn commit: r409769 - in /lucene/nutch/trunk: LICENSE.txt NOTICE.txt

2006-05-26 Thread cutting
Author: cutting
Date: Fri May 26 15:27:07 2006
New Revision: 409769

URL: http://svn.apache.org/viewvc?rev=409769&view=rev
Log:
Add NOTICE.txt file and put full license in LICENSE.txt, to better conform to 
http://www.apache.org/dev/apply-license.html.

Added:
lucene/nutch/trunk/NOTICE.txt
Modified:
lucene/nutch/trunk/LICENSE.txt

Modified: lucene/nutch/trunk/LICENSE.txt
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/LICENSE.txt?rev=409769&r1=409768&r2=409769&view=diff
==
--- lucene/nutch/trunk/LICENSE.txt (original)
+++ lucene/nutch/trunk/LICENSE.txt Fri May 26 15:27:07 2006
@@ -1,15 +1,202 @@
-/**
- * Copyright 2004 The Apache Software Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
+
+ Apache License
+   Version 2.0, January 2004
+http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+  "License" shall mean the terms and conditions for use, reproduction,
+  and distribution as defined by Sections 1 through 9 of this document.
+
+  "Licensor" shall mean the copyright owner or entity authorized by
+  the copyright owner that is granting the License.
+
+  "Legal Entity" shall mean the union of the acting entity and all
+  other entities that control, are controlled by, or are under common
+  control with that entity. For the purposes of this definition,
+  "control" means (i) the power, direct or indirect, to cause the
+  direction or management of such entity, whether by contract or
+  otherwise, or (ii) ownership of fifty percent (50%) or more of the
+  outstanding shares, or (iii) beneficial ownership of such entity.
+
+  "You" (or "Your") shall mean an individual or Legal Entity
+  exercising permissions granted by this License.
+
+  "Source" form shall mean the preferred form for making modifications,
+  including but not limited to software source code, documentation
+  source, and configuration files.
+
+  "Object" form shall mean any form resulting from mechanical
+  transformation or translation of a Source form, including but
+  not limited to compiled object code, generated documentation,
+  and conversions to other media types.
+
+  "Work" shall mean the work of authorship, whether in Source or
+  Object form, made available under the License, as indicated by a
+  copyright notice that is included in or attached to the work
+  (an example is provided in the Appendix below).
+
+  "Derivative Works" shall mean any work, whether in Source or Object
+  form, that is based on (or derived from) the Work and for which the
+  editorial revisions, annotations, elaborations, or other modifications
+  represent, as a whole, an original work of authorship. For the purposes
+  of this License, Derivative Works shall not include works that remain
+  separable from, or merely link (or bind by name) to the interfaces of,
+  the Work and Derivative Works thereof.
+
+  "Contribution" shall mean any work of authorship, including
+  the original version of the Work and any modifications or additions
+  to that Work or Derivative Works thereof, that is intentionally
+  submitted to Licensor for inclusion in the Work by the copyright owner
+  or by an individual or Legal Entity authorized to submit on behalf of
+  the copyright owner. For the purposes of this definition, "submitted"
+  means any form of electronic, verbal, or written communication sent
+  to the Licensor or its representatives, including but not limited to
+  communication on electronic mailing lists, source code control systems,
+  and issue tracking systems that are managed by, or on behalf of, the
+  Licensor for the purpose of discussing and improving the Work, but
+  excluding communication that is conspicuously marked or otherwise
+  designated in writing by the copyright owner as "Not a Contribution."
+
+  "Contributor" shall mean Licensor and any individual or Legal Entity
+  on behalf of whom a Contribution has been 

svn commit: r405861 - in /lucene/nutch/trunk/lib: hadoop-0.2.0.jar hadoop-0.2.1.jar

2006-05-12 Thread cutting
Author: cutting
Date: Fri May 12 13:31:59 2006
New Revision: 405861

URL: http://svn.apache.org/viewcvs?rev=405861&view=rev
Log:
Upgrading to Hadoop 0.2.1.

Added:
lucene/nutch/trunk/lib/hadoop-0.2.1.jar   (with props)
Removed:
lucene/nutch/trunk/lib/hadoop-0.2.0.jar

Added: lucene/nutch/trunk/lib/hadoop-0.2.1.jar
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/lib/hadoop-0.2.1.jar?rev=405861&view=auto
==
Binary file - no diff available.

Propchange: lucene/nutch/trunk/lib/hadoop-0.2.1.jar
--
svn:mime-type = application/octet-stream




svn commit: r400199 - in /lucene/nutch/trunk/lib: hadoop-0.1.1.jar hadoop-0.2.0.jar

2006-05-05 Thread cutting
Author: cutting
Date: Fri May  5 15:44:04 2006
New Revision: 400199

URL: http://svn.apache.org/viewcvs?rev=400199&view=rev
Log:
Upgrading to Hadoop 0.2.0.

Added:
lucene/nutch/trunk/lib/hadoop-0.2.0.jar   (with props)
Removed:
lucene/nutch/trunk/lib/hadoop-0.1.1.jar

Added: lucene/nutch/trunk/lib/hadoop-0.2.0.jar
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/lib/hadoop-0.2.0.jar?rev=400199&view=auto
==
Binary file - no diff available.

Propchange: lucene/nutch/trunk/lib/hadoop-0.2.0.jar
--
svn:mime-type = application/octet-stream




svn commit: r400159 - /lucene/nutch/trunk/bin/

2006-05-05 Thread cutting
Author: cutting
Date: Fri May  5 13:01:44 2006
New Revision: 400159

URL: http://svn.apache.org/viewcvs?rev=400159&view=rev
Log:
Ignore bin/rcc (from Hadoop).

Modified:
lucene/nutch/trunk/bin/   (props changed)

Propchange: lucene/nutch/trunk/bin/
--
--- svn:ignore (original)
+++ svn:ignore Fri May  5 13:01:44 2006
@@ -1,6 +1,7 @@
 hadoop
 hadoop-daemon.sh
 hadoop-daemons.sh
+rcc
 slaves.sh
 start-all.sh
 start-dfs.sh




svn commit: r395676 - in /lucene/nutch/trunk: CHANGES.txt src/java/org/apache/nutch/crawl/Generator.java

2006-04-20 Thread cutting
Author: cutting
Date: Thu Apr 20 12:18:56 2006
New Revision: 395676

URL: http://svn.apache.org/viewcvs?rev=395676&view=rev
Log:
Fix NUTCH-108.  Log hosts that exceed generate.max.per.host.  Contributed by 
Rod Taylor.

Modified:
lucene/nutch/trunk/CHANGES.txt
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java

Modified: lucene/nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/CHANGES.txt?rev=395676&r1=395675&r2=395676&view=diff
==
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Thu Apr 20 12:18:56 2006
@@ -4,6 +4,10 @@
 
  1. NUTCH-107 - Typo in plugin/urlfilter-*/plugin.xml. (Stephen Cross).
 
+ 2. NUTCH-108 - Log hosts that exceed generate.max.per.host.
+   (Rod Taylor via cutting)
+
+
 Release 0.7 - 2005-08-17
 
  1. Added support for "type:" in queries. Search results are limited/qualified

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java?rev=395676&r1=395675&r2=395676&view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java Thu Apr 
20 12:18:56 2006
@@ -127,12 +127,18 @@
 if (maxPerHost > 0) { // are we counting hosts?
   String host = new URL(url.toString()).getHost();
   Integer hostCount = (Integer)hostCounts.get(host);
-  if (hostCount != null) {
-if (hostCount.intValue() >= maxPerHost)
-  continue;   // too many from host
-hostCounts.put(host, new Integer(hostCount.intValue()+1));
-  } else {// update host count
-hostCounts.put(host, new Integer(1));
+
+  // increment hostCount
+  hostCount = new Integer(hostCount==null ? 1 : 
hostCount.intValue()+1);
+  hostCounts.put(host, hostCount);
+
+  // skip URL if above the limit per host.
+  if (hostCount.intValue() > maxPerHost) {
+if (hostCount.intValue() == maxPerHost + 1) {
+  LOG.info("Host "+ host +" has more than "+ maxPerHost +" URLs."+
+   " Skipping additional.");
+}
+continue;
   }
 }
 




svn commit: r394781 - /lucene/nutch/trunk/bin/

2006-04-17 Thread cutting
Author: cutting
Date: Mon Apr 17 14:40:58 2006
New Revision: 394781

URL: http://svn.apache.org/viewcvs?rev=394781&view=rev
Log:
Ignore more bin files.

Modified:
lucene/nutch/trunk/bin/   (props changed)

Propchange: lucene/nutch/trunk/bin/
--
--- svn:ignore (original)
+++ svn:ignore Mon Apr 17 14:40:58 2006
@@ -3,4 +3,8 @@
 hadoop-daemons.sh
 slaves.sh
 start-all.sh
+start-dfs.sh
+start-mapred.sh
 stop-all.sh
+stop-dfs.sh
+stop-mapred.sh




svn commit: r392458 - in /lucene/nutch/trunk/lib: hadoop-0.1.0.jar hadoop-0.1.1.jar

2006-04-07 Thread cutting
Author: cutting
Date: Fri Apr  7 16:48:10 2006
New Revision: 392458

URL: http://svn.apache.org/viewcvs?rev=392458&view=rev
Log:
Upgrading to Hadoop release 0.1.1.

Added:
lucene/nutch/trunk/lib/hadoop-0.1.1.jar   (with props)
Removed:
lucene/nutch/trunk/lib/hadoop-0.1.0.jar

Added: lucene/nutch/trunk/lib/hadoop-0.1.1.jar
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/lib/hadoop-0.1.1.jar?rev=392458&view=auto
==
Binary file - no diff available.

Propchange: lucene/nutch/trunk/lib/hadoop-0.1.1.jar
--
svn:mime-type = application/octet-stream




svn commit: r391371 - /lucene/nutch/trunk/

2006-04-04 Thread cutting
Author: cutting
Date: Tue Apr  4 10:21:18 2006
New Revision: 391371

URL: http://svn.apache.org/viewcvs?rev=391371&view=rev
Log:
Ignore Eclipse .settings file.

Modified:
lucene/nutch/trunk/   (props changed)

Propchange: lucene/nutch/trunk/
--
--- svn:ignore (original)
+++ svn:ignore Tue Apr  4 10:21:18 2006
@@ -3,3 +3,4 @@
 nutch.jar
 .classpath
 .project
+.settings




svn commit: r390745 - in /lucene/nutch/trunk/lib: hadoop-0.1-dev.jar hadoop-0.1.0.jar

2006-04-01 Thread cutting
Author: cutting
Date: Sat Apr  1 12:16:22 2006
New Revision: 390745

URL: http://svn.apache.org/viewcvs?rev=390745&view=rev
Log:
Update to Hadoop 0.1.0 release.

Added:
lucene/nutch/trunk/lib/hadoop-0.1.0.jar   (with props)
Removed:
lucene/nutch/trunk/lib/hadoop-0.1-dev.jar

Added: lucene/nutch/trunk/lib/hadoop-0.1.0.jar
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/lib/hadoop-0.1.0.jar?rev=390745&view=auto
==
Binary file - no diff available.

Propchange: lucene/nutch/trunk/lib/hadoop-0.1.0.jar
--
svn:mime-type = application/octet-stream




svn commit: r389634 - /lucene/nutch/trunk/bin/nutch

2006-03-28 Thread cutting
Author: cutting
Date: Tue Mar 28 16:04:51 2006
New Revision: 389634

URL: http://svn.apache.org/viewcvs?rev=389634&view=rev
Log:
Fix a bug when there are spaces in CWD, as is common on Windows.

Modified:
lucene/nutch/trunk/bin/nutch

Modified: lucene/nutch/trunk/bin/nutch
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/bin/nutch?rev=389634&r1=389633&r2=389634&view=diff
==
--- lucene/nutch/trunk/bin/nutch (original)
+++ lucene/nutch/trunk/bin/nutch Tue Mar 28 16:04:51 2006
@@ -82,6 +82,9 @@
 CLASSPATH=${NUTCH_CONF_DIR:=$NUTCH_HOME/conf}
 CLASSPATH=${CLASSPATH}:$JAVA_HOME/lib/tools.jar
 
+# so that filenames w/ spaces are handled correctly in loops below
+IFS=
+
 # for developers, add plugins, job & test code to CLASSPATH
 if [ -d "$NUTCH_HOME/build/plugins" ]; then
   CLASSPATH=${CLASSPATH}:$NUTCH_HOME/build
@@ -92,9 +95,6 @@
 if [ -d "$NUTCH_HOME/build/test/classes" ]; then
   CLASSPATH=${CLASSPATH}:$NUTCH_HOME/build/test/classes
 fi
-
-# so that filenames w/ spaces are handled correctly in loops below
-IFS=
 
 # for releases, add Nutch job to CLASSPATH
 for f in $NUTCH_HOME/nutch-*.job; do




svn commit: r388310 - in /lucene/nutch/trunk: lib/ src/java/org/apache/nutch/crawl/ src/java/org/apache/nutch/fetcher/ src/java/org/apache/nutch/indexer/ src/java/org/apache/nutch/parse/ src/java/org/

2006-03-23 Thread cutting
Author: cutting
Date: Thu Mar 23 16:57:56 2006
New Revision: 388310

URL: http://svn.apache.org/viewcvs?rev=388310&view=rev
Log:
Upgrade to latest Hadoop jar.  Add job names to Nutch mapred jobs.  Update 
OutputFormat implementations to implement new checkOutputSpecs() method.

Modified:
lucene/nutch/trunk/lib/hadoop-0.1-dev.jar
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbReader.java
lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java

lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherOutputFormat.java
lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java
lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java
lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java
lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java
lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java

Modified: lucene/nutch/trunk/lib/hadoop-0.1-dev.jar
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/lib/hadoop-0.1-dev.jar?rev=388310&r1=388309&r2=388310&view=diff
==
Binary files - no diff available.

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java?rev=388310&r1=388309&r2=388310&view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java Thu Mar 23 
16:57:56 2006
@@ -63,6 +63,7 @@
Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
 
 JobConf job = new NutchJob(config);
+job.setJobName("crawldb " + crawlDb);
 
 job.addInputDir(new File(crawlDb, CrawlDatum.DB_DIR_NAME));
 job.setInputFormat(SequenceFileInputFormat.class);

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java?rev=388310&r1=388309&r2=388310&view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java 
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java Thu 
Mar 23 16:57:56 2006
@@ -140,6 +140,7 @@
 File tmpFolder = new File(crawlDb, "stat_tmp" + 
System.currentTimeMillis());
 
 JobConf job = new NutchJob(config);
+job.setJobName("stats " + crawlDb);
 
 job.addInputDir(new File(crawlDb, CrawlDatum.DB_DIR_NAME));
 job.setInputFormat(SequenceFileInputFormat.class);
@@ -227,6 +228,7 @@
 File outFolder = new File(output);
 
 JobConf job = new NutchJob(config);
+job.setJobName("dump " + crawlDb);
 
 job.addInputDir(new File(crawlDb, CrawlDatum.DB_DIR_NAME));
 job.setInputFormat(SequenceFileInputFormat.class);

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java?rev=388310&r1=388309&r2=388310&view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java Thu Mar 
23 16:57:56 2006
@@ -189,6 +189,7 @@
 // map to inverted subset due for fetch, sort by link count
 LOG.info("Generator: Selecting most-linked urls due for fetch.");
 JobConf job = new NutchJob(getConf());
+job.setJobName("generate: select " + segment);
 
 if (numLists == -1) { // for politeness make
   numLists = job.getNumMapTasks();// a partition per fetch task
@@ -215,6 +216,7 @@
 // invert again, paritition by host, sort by url hash
 LOG.info("Generator: Partitioning selected urls by host, for politeness.");
 job = new NutchJob(getConf());
+job.setJobName("generate: partition " + segment);
 
 job.setInt("partition.url.by.host.seed", new Random().nextInt());
 

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java?rev=388310&r1=388309&r2=388310&vie

svn commit: r387310 - /lucene/nutch/trunk/lib/hadoop-0.1-dev.jar

2006-03-20 Thread cutting
Author: cutting
Date: Mon Mar 20 13:08:15 2006
New Revision: 387310

URL: http://svn.apache.org/viewcvs?rev=387310&view=rev
Log:
Upgrade to current Hadoop.

Modified:
lucene/nutch/trunk/lib/hadoop-0.1-dev.jar

Modified: lucene/nutch/trunk/lib/hadoop-0.1-dev.jar
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/lib/hadoop-0.1-dev.jar?rev=387310&r1=387309&r2=387310&view=diff
==
Binary files - no diff available.




svn commit: r386181 - in /lucene/nutch/branches/branch-0.7: site/issue_tracking.html site/issue_tracking.pdf src/site/src/documentation/content/xdocs/issue_tracking.xml

2006-03-15 Thread cutting
Author: cutting
Date: Wed Mar 15 14:20:40 2006
New Revision: 386181

URL: http://svn.apache.org/viewcvs?rev=386181&view=rev
Log:
Updated link to jira.

Modified:
lucene/nutch/branches/branch-0.7/site/issue_tracking.html
lucene/nutch/branches/branch-0.7/site/issue_tracking.pdf

lucene/nutch/branches/branch-0.7/src/site/src/documentation/content/xdocs/issue_tracking.xml

Modified: lucene/nutch/branches/branch-0.7/site/issue_tracking.html
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/branch-0.7/site/issue_tracking.html?rev=386181&r1=386180&r2=386181&view=diff
==
--- lucene/nutch/branches/branch-0.7/site/issue_tracking.html (original)
+++ lucene/nutch/branches/branch-0.7/site/issue_tracking.html Wed Mar 15 
14:20:40 2006
@@ -128,7 +128,7 @@
 
 
   Nutch issues (bugs, as well as enhancement requests) are tracked in 
-  Apache JIRA <a href="http://nagoya.apache.org/jira/browse/Nutch">here</a>.
+  Apache JIRA <a href="http://issues.apache.org/jira/browse/Nutch">here</a>.
   If you aren't sure whether something is a bug, post a question on the
   Nutch user mailing list.
 

Modified: lucene/nutch/branches/branch-0.7/site/issue_tracking.pdf
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/branch-0.7/site/issue_tracking.pdf?rev=386181&r1=386180&r2=386181&view=diff
==
--- lucene/nutch/branches/branch-0.7/site/issue_tracking.pdf (original)
+++ lucene/nutch/branches/branch-0.7/site/issue_tracking.pdf Wed Mar 15 
14:20:40 2006
@@ -32,7 +32,7 @@
 /Rect [ 485.232 585.8 505.884 573.8 ]
 /C [ 0 0 0 ]
 /Border [ 0 0 0 ]
-/A << /URI (http://nagoya.apache.org/jira/browse/Nutch)
+/A << /URI (http://issues.apache.org/jira/browse/Nutch)
 /S /URI >>
 /H /I
 >>

Modified: 
lucene/nutch/branches/branch-0.7/src/site/src/documentation/content/xdocs/issue_tracking.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/branch-0.7/src/site/src/documentation/content/xdocs/issue_tracking.xml?rev=386181&r1=386180&r2=386181&view=diff
==
--- 
lucene/nutch/branches/branch-0.7/src/site/src/documentation/content/xdocs/issue_tracking.xml
 (original)
+++ 
lucene/nutch/branches/branch-0.7/src/site/src/documentation/content/xdocs/issue_tracking.xml
 Wed Mar 15 14:20:40 2006
@@ -11,7 +11,7 @@
   
 
   Nutch issues (bugs, as well as enhancement requests) are tracked in 
-  Apache JIRA http://nagoya.apache.org/jira/browse/Nutch";>here.
+  Apache JIRA http://issues.apache.org/jira/browse/Nutch";>here.
   If you aren't sure whether something is a bug, post a question on the
   Nutch user mailing list.
 




svn commit: r384843 - /lucene/nutch/trunk/lib/hadoop-0.1-dev.jar

2006-03-10 Thread cutting
Author: cutting
Date: Fri Mar 10 08:27:49 2006
New Revision: 384843

URL: http://svn.apache.org/viewcvs?rev=384843&view=rev
Log:
Upgrade to latest hadoop jar.

Modified:
lucene/nutch/trunk/lib/hadoop-0.1-dev.jar

Modified: lucene/nutch/trunk/lib/hadoop-0.1-dev.jar
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/lib/hadoop-0.1-dev.jar?rev=384843&r1=384842&r2=384843&view=diff
==
Binary files - no diff available.




svn commit: r383698 - /lucene/nutch/trunk/lib/hadoop-0.1-dev.jar

2006-03-06 Thread cutting
Author: cutting
Date: Mon Mar  6 14:54:20 2006
New Revision: 383698

URL: http://svn.apache.org/viewcvs?rev=383698&view=rev
Log:
Upgrade to latest version of Hadoop.

Modified:
lucene/nutch/trunk/lib/hadoop-0.1-dev.jar

Modified: lucene/nutch/trunk/lib/hadoop-0.1-dev.jar
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/lib/hadoop-0.1-dev.jar?rev=383698&r1=383697&r2=383698&view=diff
==
Binary files - no diff available.




svn commit: r382939 - /lucene/nutch/trunk/lib/hadoop-0.1-dev.jar

2006-03-03 Thread cutting
Author: cutting
Date: Fri Mar  3 13:46:21 2006
New Revision: 382939

URL: http://svn.apache.org/viewcvs?rev=382939&view=rev
Log:
Upgrade hadoop to latest version with some important mapred bug fixes.

Modified:
lucene/nutch/trunk/lib/hadoop-0.1-dev.jar

Modified: lucene/nutch/trunk/lib/hadoop-0.1-dev.jar
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/lib/hadoop-0.1-dev.jar?rev=382939&r1=382938&r2=382939&view=diff
==
Binary files - no diff available.




svn commit: r382912 - in /lucene/nutch/trunk/src/java/org/apache/nutch: crawl/ fetcher/ indexer/ parse/ plugin/ searcher/ segment/

2006-03-03 Thread cutting
Author: cutting
Date: Fri Mar  3 11:05:41 2006
New Revision: 382912

URL: http://svn.apache.org/viewcvs?rev=382912&view=rev
Log:
Undo unintentional changes made in r381751.  Thanks, Jerome, for catching this!

Modified:
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbReader.java
lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java
lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexMerger.java
lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java
lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java
lucene/nutch/trunk/src/java/org/apache/nutch/plugin/PluginRepository.java
lucene/nutch/trunk/src/java/org/apache/nutch/searcher/DistributedSearch.java
lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java?rev=382912&r1=382911&r2=382912&view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java Fri Mar  3 
11:05:41 2006
@@ -44,11 +44,11 @@
 
 
   /* Perform complete crawling and indexing given a set of root urls. */
-  public static boolean doMain(String args[]) throws Exception {
+  public static void main(String args[]) throws Exception {
 if (args.length < 1) {
   System.out.println
 ("Usage: Crawl  [-dir d] [-threads n] [-depth i] [-topN N]");
-  return false;
+  return;
 }
 
 Configuration conf = NutchConfiguration.create();
@@ -122,22 +122,5 @@
 new IndexMerger(fs, fs.listFiles(indexes), index, tmpDir, job).merge();
 
 LOG.info("crawl finished: " + dir);
-
-return true;
-  }
-
-  /**
-   * main() wrapper that returns proper exit status
-   */
-  public static void main(String[] args) {
-Runtime rt = Runtime.getRuntime();
-try {
-  boolean status = doMain(args);
-  rt.exit(status ? 0 : 1);
-}
-catch (Exception e) {
-  LOG.log(Level.SEVERE, "error, caught Exception in main()", e);
-  rt.exit(1);
-}
   }
 }

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java?rev=382912&r1=382911&r2=382912&view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java Fri Mar  3 
11:05:41 2006
@@ -90,31 +90,17 @@
 fs.delete(old);
   }
 
-  public static boolean doMain(String[] args) throws Exception {
+  public static void main(String[] args) throws Exception {
 CrawlDb crawlDb = new CrawlDb(NutchConfiguration.create());
 
 if (args.length < 2) {
   System.err.println("Usage:  ");
-  return false;
+  return;
 }
 
 crawlDb.update(new File(args[0]), new File(args[1]));
-
-return true;
   }
 
-  /**
-   * main() wrapper that returns proper exit status
-   */
-  public static void main(String[] args) {
-Runtime rt = Runtime.getRuntime();
-try {
-  boolean status = doMain(args);
-  rt.exit(status ? 0 : 1);
-}
-catch (Exception e) {
-  LOG.log(Level.SEVERE, "error, caught Exception in main()", e);
-  rt.exit(1);
-}
-  }
+
+
 }

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java?rev=382912&r1=382911&r2=382912&view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java 
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java Fri 
Mar  3 11:05:41 2006
@@ -20,7 +20,7 @@
 import java.io.IOException;
 import java.util.Iterator;
 import java.util.TreeMap;
-import java.util.logging.*;
+import java.util.logging.Logger;
 
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.io.LongWritable;
@@ -241,7 +241,7 @@
 JobClient.runJob(job);
   }
 
-  public static boolean doMain(String[] args) throws IOException {
+  public static void main(String[] args) throws IOE

svn commit: r382579 - /lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java

2006-03-02 Thread cutting
Author: cutting
Date: Thu Mar  2 16:06:59 2006
New Revision: 382579

URL: http://svn.apache.org/viewcvs?rev=382579&view=rev
Log:
Disable speculative execution, since input format has side effects.

Modified:
lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java

Modified: 
lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java?rev=382579&r1=382578&r2=382579&view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java 
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java 
Thu Mar  2 16:06:59 2006
@@ -307,6 +307,7 @@
 job.setInputKeyClass(HashScore.class);
 job.setInputValueClass(IndexDoc.class);
 job.setInputFormat(InputFormat.class);
+job.setBoolean("mapred.speculative.execution", false);
 
 job.setPartitionerClass(HashPartitioner.class);
 job.setReducerClass(HashReducer.class);




svn commit: r382573 - in /lucene/nutch/trunk: conf/hadoop-env.sh.template lib/hadoop-0.1-dev.jar

2006-03-02 Thread cutting
Author: cutting
Date: Thu Mar  2 15:59:24 2006
New Revision: 382573

URL: http://svn.apache.org/viewcvs?rev=382573&view=rev
Log:
Update to latest Hadoop code.

Modified:
lucene/nutch/trunk/conf/hadoop-env.sh.template
lucene/nutch/trunk/lib/hadoop-0.1-dev.jar

Modified: lucene/nutch/trunk/conf/hadoop-env.sh.template
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/conf/hadoop-env.sh.template?rev=382573&r1=382572&r2=382573&view=diff
==
--- lucene/nutch/trunk/conf/hadoop-env.sh.template (original)
+++ lucene/nutch/trunk/conf/hadoop-env.sh.template Thu Mar  2 15:59:24 2006
@@ -1,6 +1,11 @@
 # Set Hadoop-specific environment variables here.
 
-# The java implementation to use.
+# The only required environment variable is JAVA_HOME.  All others are
+# optional.  When running a distributed configuration it is best to
+# set JAVA_HOME in this file, so that it is correctly defined on
+# remote nodes.
+
+# The java implementation to use.  Required.
 # export JAVA_HOME=/usr/bin/java
 
 # The maximum amount of heap to use, in MB. Default is 1000.
@@ -8,6 +13,9 @@
 
 # Extra Java runtime options.  Empty by default.
 # export HADOOP_OPTS=-server
+
+# Extra ssh options.  Default: '-o ConnectTimeout=1 -o SendEnv=HADOOP_CONF_DIR'.
+# export HADOOP_SSH_OPTS="-o ConnectTimeout=1 -o SendEnv=HADOOP_CONF_DIR"
 
 # Where log files are stored.  $HADOOP_HOME/logs by default.
 # export HADOOP_LOG_DIR=${HADOOP_HOME}/logs

Modified: lucene/nutch/trunk/lib/hadoop-0.1-dev.jar
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/lib/hadoop-0.1-dev.jar?rev=382573&r1=382572&r2=382573&view=diff
==
Binary files - no diff available.




svn commit: r382512 - in /lucene/nutch/trunk/lib: lucene-core-1.9-final.jar lucene-core-1.9.1.jar lucene-misc-1.9-final.jar lucene-misc-1.9.1.jar

2006-03-02 Thread cutting
Author: cutting
Date: Thu Mar  2 12:59:09 2006
New Revision: 382512

URL: http://svn.apache.org/viewcvs?rev=382512&view=rev
Log:
Upgrade to Lucene 1.9.1.

Added:
lucene/nutch/trunk/lib/lucene-core-1.9.1.jar   (with props)
lucene/nutch/trunk/lib/lucene-misc-1.9.1.jar   (with props)
Removed:
lucene/nutch/trunk/lib/lucene-core-1.9-final.jar
lucene/nutch/trunk/lib/lucene-misc-1.9-final.jar

Added: lucene/nutch/trunk/lib/lucene-core-1.9.1.jar
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/lib/lucene-core-1.9.1.jar?rev=382512&view=auto
==
Binary file - no diff available.

Propchange: lucene/nutch/trunk/lib/lucene-core-1.9.1.jar
--
svn:mime-type = application/octet-stream

Added: lucene/nutch/trunk/lib/lucene-misc-1.9.1.jar
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/lib/lucene-misc-1.9.1.jar?rev=382512&view=auto
==
Binary file - no diff available.

Propchange: lucene/nutch/trunk/lib/lucene-misc-1.9.1.jar
--
svn:mime-type = application/octet-stream




svn commit: r381824 - /lucene/nutch/trunk/lib/hadoop-0.1-dev.jar

2006-02-28 Thread cutting
Author: cutting
Date: Tue Feb 28 15:30:02 2006
New Revision: 381824

URL: http://svn.apache.org/viewcvs?rev=381824&view=rev
Log:
Updating hadoop jar.  Includes fixes for Windows.

Modified:
lucene/nutch/trunk/lib/hadoop-0.1-dev.jar

Modified: lucene/nutch/trunk/lib/hadoop-0.1-dev.jar
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/lib/hadoop-0.1-dev.jar?rev=381824&r1=381823&r2=381824&view=diff
==
Binary files - no diff available.




svn commit: r381751 - in /lucene/nutch/trunk: site/ src/java/org/apache/nutch/crawl/ src/java/org/apache/nutch/fetcher/ src/java/org/apache/nutch/indexer/ src/java/org/apache/nutch/parse/ src/java/org

2006-02-28 Thread cutting
Author: cutting
Date: Tue Feb 28 11:25:12 2006
New Revision: 381751

URL: http://svn.apache.org/viewcvs?rev=381751&view=rev
Log:
Adding DOAP for Nutch.  Contributed by Chris Mattmann.

Added:
lucene/nutch/trunk/site/doap.rdf
Modified:
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbReader.java
lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java
lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexMerger.java
lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java
lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java
lucene/nutch/trunk/src/java/org/apache/nutch/plugin/PluginRepository.java
lucene/nutch/trunk/src/java/org/apache/nutch/searcher/DistributedSearch.java
lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java

Added: lucene/nutch/trunk/site/doap.rdf
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/site/doap.rdf?rev=381751&view=auto
==
--- lucene/nutch/trunk/site/doap.rdf (added)
+++ lucene/nutch/trunk/site/doap.rdf Tue Feb 28 11:25:12 2006
@@ -0,0 +1,47 @@
+
+
+http://usefulinc.com/ns/doap#"; 
+ xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"; 
+ xmlns:asfext="http://projects.apache.org/ns/asfext#";
+ xmlns:foaf="http://xmlns.com/foaf/0.1/";>
+
+  http://lucene.apache.org/nutch/";>
+2006-02-28
+http://usefulinc.com/doap/licenses/asl20"; />
+Apache Nutch
+http://lucene.apache.org/nutch/"; />
+http://lucene.apache.org"; />
+Nutch is the open-source search engine.
+Nutch is open source web-search software. It builds
+on Lucene Java and Hadoop, adding web-specifics, such as a
+crawler, a link-graph database, parsers for HTML and other
+document formats, etc.
+
+http://issues.apache.org/jira/browse/NUTCH"; />
+http://lucene.apache.org/nutch/mailing_lists.html"; />
+http://www.apache.org/dyn/closer.cgi/lucene/nutch/"; />
+Java
+http://projects.apache.org/category/web-framework"; 
/>
+
+  
+0.7
+2005-08-17
+0.7
+  
+
+
+  
+http://svn.apache.org/repos/asf/lucene/nutch/"/>
+http://svn.apache.org/viewcvs.cgi/lucene/nutch/"/>
+  
+
+  
+

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java?rev=381751&r1=381750&r2=381751&view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java Tue Feb 28 
11:25:12 2006
@@ -44,11 +44,11 @@
 
 
   /* Perform complete crawling and indexing given a set of root urls. */
-  public static void main(String args[]) throws Exception {
+  public static boolean doMain(String args[]) throws Exception {
 if (args.length < 1) {
   System.out.println
 ("Usage: Crawl  [-dir d] [-threads n] [-depth i] [-topN N]");
-  return;
+  return false;
 }
 
 Configuration conf = NutchConfiguration.create();
@@ -122,5 +122,22 @@
 new IndexMerger(fs, fs.listFiles(indexes), index, tmpDir, job).merge();
 
 LOG.info("crawl finished: " + dir);
+
+return true;
+  }
+
+  /**
+   * main() wrapper that returns proper exit status
+   */
+  public static void main(String[] args) {
+Runtime rt = Runtime.getRuntime();
+try {
+  boolean status = doMain(args);
+  rt.exit(status ? 0 : 1);
+}
+catch (Exception e) {
+  LOG.log(Level.SEVERE, "error, caught Exception in main()", e);
+  rt.exit(1);
+}
   }
 }

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java?rev=381751&r1=381750&r2=381751&view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java Tue Feb 28 
11:25:12 2006
@@ -90,17 +90,31 @@
 fs.delete(old);
   }
 
-  public static void main(String[] args) throws Exception {
+  public static boolean doMain(St

svn commit: r381721 - in /lucene/nutch/trunk/lib: lucene-core-1.9-final.jar lucene-core-1.9-rc1-dev.jar lucene-misc-1.9-final.jar lucene-misc-1.9-rc1-dev.jar

2006-02-28 Thread cutting
Author: cutting
Date: Tue Feb 28 10:00:43 2006
New Revision: 381721

URL: http://svn.apache.org/viewcvs?rev=381721&view=rev
Log:
Upgrade lucene version to final release.

Added:
lucene/nutch/trunk/lib/lucene-core-1.9-final.jar   (with props)
lucene/nutch/trunk/lib/lucene-misc-1.9-final.jar   (with props)
Removed:
lucene/nutch/trunk/lib/lucene-core-1.9-rc1-dev.jar
lucene/nutch/trunk/lib/lucene-misc-1.9-rc1-dev.jar

Added: lucene/nutch/trunk/lib/lucene-core-1.9-final.jar
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/lib/lucene-core-1.9-final.jar?rev=381721&view=auto
==
Binary file - no diff available.

Propchange: lucene/nutch/trunk/lib/lucene-core-1.9-final.jar
--
svn:mime-type = application/octet-stream

Added: lucene/nutch/trunk/lib/lucene-misc-1.9-final.jar
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/lib/lucene-misc-1.9-final.jar?rev=381721&view=auto
==
Binary file - no diff available.

Propchange: lucene/nutch/trunk/lib/lucene-misc-1.9-final.jar
--
svn:mime-type = application/octet-stream




svn commit: r380840 - /lucene/nutch/trunk/lib/hadoop-0.1-dev.jar

2006-02-24 Thread cutting
Author: cutting
Date: Fri Feb 24 14:38:06 2006
New Revision: 380840

URL: http://svn.apache.org/viewcvs?rev=380840&view=rev
Log:
Update hadoop jar, to get recent fixes from that project.

Modified:
lucene/nutch/trunk/lib/hadoop-0.1-dev.jar

Modified: lucene/nutch/trunk/lib/hadoop-0.1-dev.jar
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/lib/hadoop-0.1-dev.jar?rev=380840&r1=380839&r2=380840&view=diff
==
Binary files - no diff available.




svn commit: r380789 - /lucene/nutch/trunk/build.xml

2006-02-24 Thread cutting
Author: cutting
Date: Fri Feb 24 11:11:44 2006
New Revision: 380789

URL: http://svn.apache.org/viewcvs?rev=380789&view=rev
Log:
Fix to not use 'exec', but rather 'untar' and 'chmod' which are more portable.

Modified:
lucene/nutch/trunk/build.xml

Modified: lucene/nutch/trunk/build.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/build.xml?rev=380789&r1=380788&r2=380789&view=diff
==
--- lucene/nutch/trunk/build.xml (original)
+++ lucene/nutch/trunk/build.xml Fri Feb 24 11:11:44 2006
@@ -57,10 +57,9 @@
   
   
 
-
-  
-  
-
+
+
+
 
 
 




svn commit: r378396 - in /lucene/nutch/trunk/src/java/org/apache/nutch: crawl/ searcher/

2006-02-16 Thread cutting
Author: cutting
Date: Thu Feb 16 15:31:52 2006
New Revision: 378396

URL: http://svn.apache.org/viewcvs?rev=378396&view=rev
Log:
Fix for NUTCH-211: add close method to search classes.  Contributed by Stefan.

Modified:
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbReader.java
lucene/nutch/trunk/src/java/org/apache/nutch/searcher/FetchedSegments.java
lucene/nutch/trunk/src/java/org/apache/nutch/searcher/HitContent.java
lucene/nutch/trunk/src/java/org/apache/nutch/searcher/HitInlinks.java
lucene/nutch/trunk/src/java/org/apache/nutch/searcher/IndexSearcher.java
lucene/nutch/trunk/src/java/org/apache/nutch/searcher/LinkDbInlinks.java
lucene/nutch/trunk/src/java/org/apache/nutch/searcher/NutchBean.java
lucene/nutch/trunk/src/java/org/apache/nutch/searcher/Searcher.java

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbReader.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbReader.java?rev=378396&r1=378395&r2=378396&view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbReader.java 
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbReader.java Thu 
Feb 16 15:31:52 2006
@@ -32,7 +32,7 @@
 import java.util.logging.Logger;
 
 /** . */
-public class LinkDbReader {
+public class LinkDbReader implements Closeable {
   public static final Logger LOG = LogFormatter.getLogger(LinkDbReader.class.getName());
 
   private static final Partitioner PARTITIONER = new HashPartitioner();
@@ -66,6 +66,14 @@
 
 return (Inlinks)MapFileOutputFormat.getEntry
   (readers, PARTITIONER, url, new Inlinks());
+  }
+  
+  public void close() throws IOException {
+if (readers != null) {
+  for (int i = 0; i < readers.length; i++) {
+readers[i].close();
+  }
+}
   }
   
   public static void processDumpJob(String linkdb, String output, 
Configuration config) throws IOException {

Modified: 
lucene/nutch/trunk/src/java/org/apache/nutch/searcher/FetchedSegments.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/searcher/FetchedSegments.java?rev=378396&r1=378395&r2=378396&view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/searcher/FetchedSegments.java 
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/searcher/FetchedSegments.java 
Thu Feb 16 15:31:52 2006
@@ -20,6 +20,7 @@
 import java.io.File;
 
 import java.util.HashMap;
+import java.util.Iterator;
 
 import org.apache.hadoop.io.*;
 import org.apache.hadoop.fs.*;
@@ -34,7 +35,7 @@
  * fetched segments. */
 public class FetchedSegments implements HitSummarizer, HitContent {
 
-  private static class Segment {
+  private static class Segment implements Closeable {
 private static final Partitioner PARTITIONER = new HashPartitioner();
 
 private FileSystem fs;
@@ -93,6 +94,19 @@
   return MapFileOutputFormat.getEntry(readers, PARTITIONER, url, entry);
 }
 
+public void close() throws IOException {
+  if (content != null) { closeReaders(content); }
+  if (parseText != null) { closeReaders(parseText); }
+  if (parseData != null) { closeReaders(parseData); }
+  if (crawl != null) { closeReaders(crawl); }
+}
+
+private void closeReaders(MapFile.Reader[] readers) throws IOException {
+  for (int i = 0; i < readers.length; i++) {
+readers[i].close();
+  }
+}
+
   }
 
   private HashMap segments = new HashMap();
@@ -206,5 +220,11 @@
 return new UTF8(details.getValue("url"));
   }
 
-
+  public void close() throws IOException {
+Iterator iterator = segments.values().iterator();
+while (iterator.hasNext()) {
+  ((Segment) iterator.next()).close();
+}
+  }
+  
 }

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/searcher/HitContent.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/searcher/HitContent.java?rev=378396&r1=378395&r2=378396&view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/searcher/HitContent.java 
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/searcher/HitContent.java Thu 
Feb 16 15:31:52 2006
@@ -18,11 +18,12 @@
 
 import java.io.IOException;
 
+import org.apache.hadoop.io.Closeable;
 import org.apache.nutch.parse.ParseData;
 import org.apache.nutch.parse.ParseText;
 
 /** Service that returns the content of a hit. */
-public interface HitContent {
+public interface HitContent extends Closeable {
   /** Returns the content of a hit document. */
   byte[] getContent(HitDetails details) throws IOException;
 

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/searcher/HitInlinks.java
URL: 
http://svn.apache.org/viewc

svn commit: r378381 - /lucene/nutch/trunk/src/site/src/documentation/content/xdocs/tabs.xml

2006-02-16 Thread cutting
Author: cutting
Date: Thu Feb 16 14:24:47 2006
New Revision: 378381

URL: http://svn.apache.org/viewcvs?rev=378381&view=rev
Log:
Fix to work with Forrest 0.7, where ext: links seem to no longer work
in tabs.xml.

Modified:
lucene/nutch/trunk/src/site/src/documentation/content/xdocs/tabs.xml

Modified: lucene/nutch/trunk/src/site/src/documentation/content/xdocs/tabs.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/site/src/documentation/content/xdocs/tabs.xml?rev=378381&r1=378380&r2=378381&view=diff
==
--- lucene/nutch/trunk/src/site/src/documentation/content/xdocs/tabs.xml 
(original)
+++ lucene/nutch/trunk/src/site/src/documentation/content/xdocs/tabs.xml Thu 
Feb 16 14:24:47 2006
@@ -15,6 +15,6 @@
   -->
 
 
-  
+  http://wiki.apache.org/nutch/"/>
   
 




svn commit: r378108 - /lucene/nutch/trunk/

2006-02-15 Thread cutting
Author: cutting
Date: Wed Feb 15 14:47:00 2006
New Revision: 378108

URL: http://svn.apache.org/viewcvs?rev=378108&view=rev
Log:
Ignore logs directory.

Modified:
lucene/nutch/trunk/   (props changed)

Propchange: lucene/nutch/trunk/
--
--- svn:ignore (original)
+++ svn:ignore Wed Feb 15 14:47:00 2006
@@ -1,4 +1,5 @@
 build
+logs
 nutch.jar
 .classpath
 .project




svn commit: r378107 - in /lucene/nutch/trunk: conf/ conf/hadoop-env.sh.template conf/slaves.template lib/hadoop-0.1-dev.jar src/java/org/apache/nutch/fetcher/Fetcher.java

2006-02-15 Thread cutting
Author: cutting
Date: Wed Feb 15 14:45:31 2006
New Revision: 378107

URL: http://svn.apache.org/viewcvs?rev=378107&view=rev
Log:
Fix Fetcher to disable speculative execution, to keep it polite.  Also upgrade 
to latest hadoop jar that supports this feature.  Note that Hadoop's 
environment specification has changed, with all environment variables settable 
from conf/hadoop-env.sh, and the slaves file is now in conf/, rather than in 
one's home directory.

Added:
lucene/nutch/trunk/conf/hadoop-env.sh.template
lucene/nutch/trunk/conf/slaves.template
Modified:
lucene/nutch/trunk/conf/   (props changed)
lucene/nutch/trunk/lib/hadoop-0.1-dev.jar
lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java

Propchange: lucene/nutch/trunk/conf/
--
--- svn:ignore (original)
+++ svn:ignore Wed Feb 15 14:45:31 2006
@@ -1,5 +1,4 @@
-nutch-site.xml
-regex-normalize.xml
-crawl-urlfilter.txt
-regex-urlfilter.txt
-mapred-default.xml
+*.xml
+*.txt
+*.sh
+slaves

Added: lucene/nutch/trunk/conf/hadoop-env.sh.template
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/conf/hadoop-env.sh.template?rev=378107&view=auto
==
--- lucene/nutch/trunk/conf/hadoop-env.sh.template (added)
+++ lucene/nutch/trunk/conf/hadoop-env.sh.template Wed Feb 15 14:45:31 2006
@@ -0,0 +1,25 @@
+# Set Hadoop-specific environment variables here.
+
+# The java implementation to use.
+# export JAVA_HOME=/usr/bin/java
+
+# The maximum amount of heap to use, in MB. Default is 1000.
+# export HADOOP_HEAPSIZE=2000
+
+# Extra Java runtime options.  Empty by default.
+# export HADOOP_OPTS=-server
+
+# Where log files are stored.  $HADOOP_HOME/logs by default.
+# export HADOOP_LOG_DIR=${HADOOP_HOME}/logs
+
+# File naming remote slave hosts.  $HADOOP_HOME/conf/slaves by default.
+# export HADOOP_SLAVES=${HADOOP_HOME}/conf/slaves
+
+# host:path where hadoop code should be rsync'd from.  Unset by default.
+# export HADOOP_MASTER=master:/home/$USER/src/hadoop
+
+# The directory where pid files are stored. /tmp by default.
+# export HADOOP_PID_DIR=/var/hadoop/pids
+
+# A string representing this instance of hadoop. $USER by default.
+# export HADOOP_IDENT_STRING=$USER

Added: lucene/nutch/trunk/conf/slaves.template
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/conf/slaves.template?rev=378107&view=auto
==
--- lucene/nutch/trunk/conf/slaves.template (added)
+++ lucene/nutch/trunk/conf/slaves.template Wed Feb 15 14:45:31 2006
@@ -0,0 +1 @@
+localhost

Modified: lucene/nutch/trunk/lib/hadoop-0.1-dev.jar
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/lib/hadoop-0.1-dev.jar?rev=378107&r1=378106&r2=378107&view=diff
==
Binary files - no diff available.

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java?rev=378107&r1=378106&r2=378107&view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Wed Feb 
15 14:45:31 2006
@@ -348,6 +348,9 @@
 job.set(SEGMENT_NAME_KEY, segment.getName());
 job.setBoolean("fetcher.parse", parsing);
 
+// for politeness, don't permit parallel execution of a single task
+job.setBoolean("mapred.speculative.execution", false);
+
 job.setInputDir(new File(segment, CrawlDatum.GENERATE_DIR_NAME));
 job.setInputFormat(InputFormat.class);
 job.setInputKeyClass(UTF8.class);




svn commit: r378044 - /lucene/nutch/trunk/lib/hadoop-0.1-dev.jar

2006-02-15 Thread cutting
Author: cutting
Date: Wed Feb 15 09:56:54 2006
New Revision: 378044

URL: http://svn.apache.org/viewcvs?rev=378044&view=rev
Log:
Upgrade to latest version of Hadoop.

Modified:
lucene/nutch/trunk/lib/hadoop-0.1-dev.jar

Modified: lucene/nutch/trunk/lib/hadoop-0.1-dev.jar
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/lib/hadoop-0.1-dev.jar?rev=378044&r1=378043&r2=378044&view=diff
==
Binary files - no diff available.




svn commit: r376815 - /lucene/nutch/trunk/lib/hadoop-0.1-dev.jar

2006-02-10 Thread cutting
Author: cutting
Date: Fri Feb 10 11:44:47 2006
New Revision: 376815

URL: http://svn.apache.org/viewcvs?rev=376815&view=rev
Log:
Update Hadoop jar.

Modified:
lucene/nutch/trunk/lib/hadoop-0.1-dev.jar

Modified: lucene/nutch/trunk/lib/hadoop-0.1-dev.jar
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/lib/hadoop-0.1-dev.jar?rev=376815&r1=376814&r2=376815&view=diff
==
Binary files - no diff available.




svn commit: r376808 - in /lucene/nutch/trunk/conf: configuration.xsl hadoop-site.xml.template

2006-02-10 Thread cutting
Author: cutting
Date: Fri Feb 10 11:31:06 2006
New Revision: 376808

URL: http://svn.apache.org/viewcvs?rev=376808&view=rev
Log:
Add a template for hadoop-site.xml, and the stylesheet for config files.

Added:
lucene/nutch/trunk/conf/configuration.xsl
lucene/nutch/trunk/conf/hadoop-site.xml.template

Added: lucene/nutch/trunk/conf/configuration.xsl
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/conf/configuration.xsl?rev=376808&view=auto
==
--- lucene/nutch/trunk/conf/configuration.xsl (added)
+++ lucene/nutch/trunk/conf/configuration.xsl Fri Feb 10 11:31:06 2006
@@ -0,0 +1,24 @@
+
+http://www.w3.org/1999/XSL/Transform"; version="1.0">
+
+
+
+
+
+
+ name
+ value
+ description
+
+
+
+  
+  
+  
+
+
+
+
+
+
+

Added: lucene/nutch/trunk/conf/hadoop-site.xml.template
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/conf/hadoop-site.xml.template?rev=376808&view=auto
==
--- lucene/nutch/trunk/conf/hadoop-site.xml.template (added)
+++ lucene/nutch/trunk/conf/hadoop-site.xml.template Fri Feb 10 11:31:06 2006
@@ -0,0 +1,8 @@
+
+
+
+
+
+
+
+




svn commit: r376803 - in /lucene/nutch/trunk: build.xml lib/hadoop-0.1-dev.jar

2006-02-10 Thread cutting
Author: cutting
Date: Fri Feb 10 11:22:15 2006
New Revision: 376803

URL: http://svn.apache.org/viewcvs?rev=376803&view=rev
Log:
Unpack Hadoop webapps from jar so that they can be used.

Modified:
lucene/nutch/trunk/build.xml
lucene/nutch/trunk/lib/hadoop-0.1-dev.jar

Modified: lucene/nutch/trunk/build.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/build.xml?rev=376803&r1=376802&r2=376803&view=diff
==
--- lucene/nutch/trunk/build.xml (original)
+++ lucene/nutch/trunk/build.xml Fri Feb 10 11:22:15 2006
@@ -62,6 +62,13 @@
   
 
 
+
+
+
+  
+  
+
+
   
 
   
@@ -414,6 +421,10 @@
 
 
   
+
+
+
+  
 
 
 

Modified: lucene/nutch/trunk/lib/hadoop-0.1-dev.jar
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/lib/hadoop-0.1-dev.jar?rev=376803&r1=376802&r2=376803&view=diff
==
Binary files - no diff available.




svn commit: r376485 - in /lucene/nutch/trunk: ./ bin/ lib/ src/java/org/apache/nutch/crawl/ src/java/org/apache/nutch/fetcher/ src/java/org/apache/nutch/indexer/ src/java/org/apache/nutch/parse/ src/j

2006-02-09 Thread cutting
Author: cutting
Date: Thu Feb  9 15:20:28 2006
New Revision: 376485

URL: http://svn.apache.org/viewcvs?rev=376485&view=rev
Log:
Fix for NUTCH-209.  Nutch now supplies all code to remote MapReduce daemons 
through a job jar file.  So Hadoop daemons no longer need to be restarted when 
Nutch code changes.

Added:
lucene/nutch/trunk/src/java/org/apache/nutch/util/NutchJob.java
Modified:
lucene/nutch/trunk/bin/nutch
lucene/nutch/trunk/build.xml
lucene/nutch/trunk/lib/hadoop-0.1-dev.jar
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbReader.java
lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java
lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java
lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java
lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java
lucene/nutch/trunk/src/java/org/apache/nutch/util/NutchConfiguration.java

Modified: lucene/nutch/trunk/bin/nutch
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/bin/nutch?rev=376485&r1=376484&r2=376485&view=diff
==
--- lucene/nutch/trunk/bin/nutch (original)
+++ lucene/nutch/trunk/bin/nutch Thu Feb  9 15:20:28 2006
@@ -82,13 +82,13 @@
 CLASSPATH=${NUTCH_CONF_DIR:=$NUTCH_HOME/conf}
 CLASSPATH=${CLASSPATH}:$JAVA_HOME/lib/tools.jar
 
-# for developers, add Nutch classes to CLASSPATH
-if [ -d "$NUTCH_HOME/build/classes" ]; then
-  CLASSPATH=${CLASSPATH}:$NUTCH_HOME/build/classes
-fi
+# for developers, add plugins, job & test code to CLASSPATH
 if [ -d "$NUTCH_HOME/build/plugins" ]; then
   CLASSPATH=${CLASSPATH}:$NUTCH_HOME/build
 fi
+for f in $NUTCH_HOME/build/nutch-*.job; do
+  CLASSPATH=${CLASSPATH}:$f;
+done
 if [ -d "$NUTCH_HOME/build/test/classes" ]; then
   CLASSPATH=${CLASSPATH}:$NUTCH_HOME/build/test/classes
 fi
@@ -96,14 +96,14 @@
 # so that filenames w/ spaces are handled correctly in loops below
 IFS=
 
-# for releases, add Nutch jar to CLASSPATH
-for f in $NUTCH_HOME/nutch-*.jar; do
+# for releases, add Nutch job to CLASSPATH
+for f in $NUTCH_HOME/nutch-*.job; do
   CLASSPATH=${CLASSPATH}:$f;
 done
 
 # add plugins to classpath
 if [ -d "$NUTCH_HOME/plugins" ]; then
-  CLASSPATH=${CLASSPATH}:$NUTCH_HOME
+  CLASSPATH=${NUTCH_HOME}:${CLASSPATH}
 fi
 
 # add libs to CLASSPATH

Modified: lucene/nutch/trunk/build.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/build.xml?rev=376485&r1=376484&r2=376485&view=diff
==
--- lucene/nutch/trunk/build.xml (original)
+++ lucene/nutch/trunk/build.xml Thu Feb  9 15:20:28 2006
@@ -1,6 +1,6 @@
 
 
-
+
 
   
   
@@ -100,7 +100,6 @@
   
   
 
-
   
   
   
@@ -119,6 +118,21 @@
   
 
   
+  
+  
+  
+  
+  
+
+  
+  
+  
+  
+
+  
+
+  
   
   
   
@@ -385,7 +399,7 @@
   
   
   
-  
+  
 
 
 
@@ -402,7 +416,7 @@
 
 
 
-
+
 
 
 

Modified: lucene/nutch/trunk/lib/hadoop-0.1-dev.jar
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/lib/hadoop-0.1-dev.jar?rev=376485&r1=376484&r2=376485&view=diff
==
Binary files - no diff available.

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java?rev=376485&r1=376484&r2=376485&view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java Thu Feb  9 
15:20:28 2006
@@ -31,6 +31,7 @@
 import org.apache.nutch.indexer.IndexMerger;
 import org.apache.nutch.indexer.Indexer;
 import org.apache.nutch.util.NutchConfiguration;
+import org.apache.nutch.util.NutchJob;
 
 public class Crawl {
   public static final Logger LOG =
@@ -52,7 +53,7 @@
 
 Configuration conf = NutchConfiguration.create();
 conf.addDefaultResource("crawl-tool.xml");
-JobConf job = new JobConf(conf);
+JobConf job = new NutchJob(conf);
 
 File rootUrlDir = null;
 File dir = new File("crawl-" + getDate());

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java
URL: 
http://svn.apache.

svn commit: r376435 - in /lucene/nutch/trunk: lib/ src/java/org/apache/nutch/crawl/ src/java/org/apache/nutch/fetcher/ src/java/org/apache/nutch/indexer/ src/java/org/apache/nutch/parse/ src/java/org/

2006-02-09 Thread cutting
Author: cutting
Date: Thu Feb  9 12:57:44 2006
New Revision: 376435

URL: http://svn.apache.org/viewcvs?rev=376435&view=rev
Log:
Updating to latest Hadoop jar, adding now-required close() methods to mapper 
and reducer implementations.

Modified:
lucene/nutch/trunk/lib/hadoop-0.1-dev.jar
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/PartitionUrlByHost.java
lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java
lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java
lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java
lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java

Modified: lucene/nutch/trunk/lib/hadoop-0.1-dev.jar
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/lib/hadoop-0.1-dev.jar?rev=376435&r1=376434&r2=376435&view=diff
==
Binary files - no diff available.

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java?rev=376435&r1=376434&r2=376435&view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java 
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java Thu 
Feb  9 12:57:44 2006
@@ -56,6 +56,7 @@
 
   public static class CrawlDbStatMapper implements Mapper {
 public void configure(JobConf job) {}
+public void close() {}
 public void map(WritableComparable key, Writable value, OutputCollector 
output, Reporter reporter)
 throws IOException {
   CrawlDatum cd = (CrawlDatum) value;
@@ -68,6 +69,7 @@
 
   public static class CrawlDbStatReducer implements Reducer {
 public void configure(JobConf job) {}
+public void close() {}
 public void reduce(WritableComparable key, Iterator values, 
OutputCollector output, Reporter reporter)
 throws IOException {
 
@@ -127,8 +129,8 @@
   }
 }
 
-public void configure(JobConf job) {
-}
+public void configure(JobConf job) {}
+public void close() {}
   }
   
   public void processStatJob(String crawlDb, Configuration config) throws 
IOException {

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java?rev=376435&r1=376434&r2=376435&view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java 
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java Thu 
Feb  9 12:57:44 2006
@@ -30,6 +30,8 @@
 retryMax = job.getInt("db.fetch.retry.max", 3);
   }
 
+  public void close() {}
+
   public void reduce(WritableComparable key, Iterator values,
  OutputCollector output, Reporter reporter)
 throws IOException {

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java?rev=376435&r1=376434&r2=376435&view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java Thu Feb  
9 12:57:44 2006
@@ -51,6 +51,8 @@
   maxPerHost = job.getInt("generate.max.per.host", -1);
 }
 
+public void close() {}
+
 /** Select & invert subset due for fetch. */
 public void map(WritableComparable key, Writable value,
 OutputCollector output, Reporter reporter)

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java?rev=376435&r1=376434&r2=376435&view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java Thu Feb  9 
12:57:44 2006
@@ -48,6 +48,8 @@
   this.jobConf = job;
 }
 
+public void close() {}
+
 public void map(WritableComparable key, Writable val,
 

svn commit: r376072 - /lucene/nutch/trunk/conf/nutch-default.xml

2006-02-08 Thread cutting
Author: cutting
Date: Wed Feb  8 13:25:30 2006
New Revision: 376072

URL: http://svn.apache.org/viewcvs?rev=376072&view=rev
Log:
Restore accidentally removed file defaults.

Modified:
lucene/nutch/trunk/conf/nutch-default.xml

Modified: lucene/nutch/trunk/conf/nutch-default.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/conf/nutch-default.xml?rev=376072&r1=376071&r2=376072&view=diff
==
--- lucene/nutch/trunk/conf/nutch-default.xml (original)
+++ lucene/nutch/trunk/conf/nutch-default.xml Wed Feb  8 13:25:30 2006
@@ -7,6 +7,28 @@
 
 
 
+
+
+
+  file.content.limit
+  65536
+  The length limit for downloaded content, in bytes.
+  If this value is larger than zero, content longer than it will be
+  truncated; otherwise (zero or negative), no truncation at all.
+  
+
+
+
+  file.content.ignored
+  true
+  If true, no file content will be saved during fetch.
+  And it is probably what we want to set most of time, since file:// URLs
+  are meant to be local and we can always use them directly at parsing
+  and indexing stages. Otherwise file contents will be saved.
+  !! NO IMPLEMENTED YET !!
+  
+
+
 
 
 




svn commit: r375704 - in /lucene/nutch/trunk/lib: jetty-5.1.4.LICENSE.txt jetty-5.1.4.jar jetty-ext/

2006-02-07 Thread cutting
Author: cutting
Date: Tue Feb  7 13:02:46 2006
New Revision: 375704

URL: http://svn.apache.org/viewcvs?rev=375704&view=rev
Log:
Restoring jetty to Nutch lib: removed by mistake.

Added:
lucene/nutch/trunk/lib/jetty-5.1.4.LICENSE.txt
  - copied unchanged from r374759, 
lucene/hadoop/trunk/lib/jetty-5.1.4.LICENSE.txt
lucene/nutch/trunk/lib/jetty-5.1.4.jar
  - copied unchanged from r374759, lucene/hadoop/trunk/lib/jetty-5.1.4.jar
lucene/nutch/trunk/lib/jetty-ext/
  - copied from r374759, lucene/hadoop/trunk/lib/jetty-ext/



svn commit: r375414 - in /lucene/nutch/trunk: bin/ build.xml lib/hadoop-0.1-dev.jar

2006-02-06 Thread cutting
Author: cutting
Date: Mon Feb  6 15:36:01 2006
New Revision: 375414

URL: http://svn.apache.org/viewcvs?rev=375414&view=rev
Log:
Extract Hadoop's scripts from Hadoop's jar into bin/ directory.

Modified:
lucene/nutch/trunk/bin/   (props changed)
lucene/nutch/trunk/build.xml
lucene/nutch/trunk/lib/hadoop-0.1-dev.jar

Propchange: lucene/nutch/trunk/bin/
--
--- svn:ignore (added)
+++ svn:ignore Mon Feb  6 15:36:01 2006
@@ -0,0 +1,6 @@
+hadoop
+hadoop-daemon.sh
+hadoop-daemons.sh
+slaves.sh
+start-all.sh
+stop-all.sh

Modified: lucene/nutch/trunk/build.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/build.xml?rev=375414&r1=375413&r2=375414&view=diff
==
--- lucene/nutch/trunk/build.xml (original)
+++ lucene/nutch/trunk/build.xml Mon Feb  6 15:36:01 2006
@@ -51,6 +51,16 @@
   
 
 
+
+
+
+  
+  
+
+
+  
+  
+
 
   
 

Modified: lucene/nutch/trunk/lib/hadoop-0.1-dev.jar
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/lib/hadoop-0.1-dev.jar?rev=375414&r1=375413&r2=375414&view=diff
==
Binary files - no diff available.




svn commit: r375333 - /lucene/nutch/nightly/nightly.properties

2006-02-06 Thread cutting
Author: cutting
Date: Mon Feb  6 10:57:09 2006
New Revision: 375333

URL: http://svn.apache.org/viewcvs?rev=375333&view=rev
Log:
Updated email parameters.

Modified:
lucene/nutch/nightly/nightly.properties

Modified: lucene/nutch/nightly/nightly.properties
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/nightly/nightly.properties?rev=375333&r1=375332&r2=375333&view=diff
==
--- lucene/nutch/nightly/nightly.properties (original)
+++ lucene/nutch/nightly/nightly.properties Mon Feb  6 10:57:09 2006
@@ -1,5 +1,5 @@
-MailLogger.mailhost = mail.apache.org
-MailLogger.from = nutch-dev@incubator.apache.org
-MailLogger.failure.to = nutch-dev@incubator.apache.org
+MailLogger.mailhost = localhost
+MailLogger.from = nutch-dev@lucene.apache.org
+MailLogger.failure.to = nutch-dev@lucene.apache.org
 MailLogger.failure.subject = Nutch nightly build failure
 MailLogger.success.notify = false




svn commit: r375326 - in /lucene/nutch/trunk: conf/hadoop-default.xml conf/mapred-default.xml.template lib/hadoop-0.1-dev.jar src/java/org/apache/nutch/crawl/Crawl.java

2006-02-06 Thread cutting
Author: cutting
Date: Mon Feb  6 10:16:22 2006
New Revision: 375326

URL: http://svn.apache.org/viewcvs?rev=375326&view=rev
Log:
Remove Hadoop config files and update to latest Hadoop jar.

Removed:
lucene/nutch/trunk/conf/hadoop-default.xml
lucene/nutch/trunk/conf/mapred-default.xml.template
Modified:
lucene/nutch/trunk/lib/hadoop-0.1-dev.jar
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java

Modified: lucene/nutch/trunk/lib/hadoop-0.1-dev.jar
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/lib/hadoop-0.1-dev.jar?rev=375326&r1=375325&r2=375326&view=diff
==
Binary files - no diff available.

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java?rev=375326&r1=375325&r2=375326&view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java Mon Feb  6 
10:16:22 2006
@@ -51,7 +51,7 @@
 }
 
 Configuration conf = NutchConfiguration.create();
-conf.addAppResource("crawl-tool.xml");
+conf.addDefaultResource("crawl-tool.xml");
 JobConf job = new JobConf(conf);
 
 File rootUrlDir = null;




svn commit: r375321 - in /lucene/nutch/trunk/src/java/org/apache/nutch: fetcher/FetcherOutput.java parse/ParseData.java parse/ParseText.java protocol/Content.java util/NutchConfiguration.java

2006-02-06 Thread cutting
Author: cutting
Date: Mon Feb  6 09:52:30 2006
New Revision: 375321

URL: http://svn.apache.org/viewcvs?rev=375321&view=rev
Log:
Add aliases for some Writable classes for back-compatibility.

Modified:
lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherOutput.java
lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseData.java
lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseText.java
lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java
lucene/nutch/trunk/src/java/org/apache/nutch/util/NutchConfiguration.java

Modified: 
lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherOutput.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherOutput.java?rev=375321&r1=375320&r2=375321&view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherOutput.java 
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherOutput.java Mon 
Feb  6 09:52:30 2006
@@ -32,8 +32,6 @@
   private ParseImpl parse;
   private Configuration conf;
 
-  static { WritableName.setName(FetcherOutput.class, "FetcherOutput"); }
-
   public FetcherOutput() {}
 
   public FetcherOutput(CrawlDatum crawlDatum, Content content,

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseData.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseData.java?rev=375321&r1=375320&r2=375321&view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseData.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseData.java Mon Feb  
6 09:52:30 2006
@@ -41,9 +41,6 @@
   private ParseStatus status;
   private Configuration conf;
   
-  static { WritableName.setName(ParseData.class, "ParseData"); }
-
-
   // TODO [EMAIL PROTECTED]: should we really implement Configurable or should 
we add the
   // parameter Configuration to the default-constructor. NOTE: The test
   // TestWriteable instantiates ParseData with Class.newInstance() -> the 
default

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseText.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseText.java?rev=375321&r1=375320&r2=375321&view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseText.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseText.java Mon Feb  
6 09:52:30 2006
@@ -37,8 +37,6 @@
 this.text = text;
   }
 
-  static { WritableName.setName(ParseText.class, "ParseText"); }
-
   public byte getVersion() { return VERSION; }
 
   public void readFields(DataInput in) throws IOException {

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java?rev=375321&r1=375320&r2=375321&view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java 
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java Mon Feb  
6 09:52:30 2006
@@ -42,8 +42,6 @@
   private boolean mimeTypeMagic;
   private MimeTypes mimeTypes;
 
-  static { WritableName.setName(Content.class, "Content"); }
-
   public Content() {}
 
   public Content(String url, String base, byte[] content, String contentType,

Modified: 
lucene/nutch/trunk/src/java/org/apache/nutch/util/NutchConfiguration.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/util/NutchConfiguration.java?rev=375321&r1=375320&r2=375321&view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/util/NutchConfiguration.java 
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/util/NutchConfiguration.java 
Mon Feb  6 09:52:30 2006
@@ -17,10 +17,21 @@
 package org.apache.nutch.util;
 
 import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.WritableName;
 
 /** Utility to create Hadoop {@link Configuration}s that include 
Nutch-specific
  * resources.  */
 public class NutchConfiguration {
+
+  // for back-compatibility, add old aliases for these Writable classes
+  // this may be removed after the 0.8 release
+  static {
+WritableName.addName(org.apache.nutch.fetcher.FetcherOutput.class,
+ "FetcherOutput"); 
+WritableName.addName(org.apache.nutch.parse.ParseData.class, "ParseData"); 
+WritableName.addName(org.apache.nutch.parse.ParseText.class, "ParseText"); 
+

svn commit: r374799 - /lucene/nutch/trunk/build.xml

2006-02-03 Thread cutting
Author: cutting
Date: Fri Feb  3 16:55:20 2006
New Revision: 374799

URL: http://svn.apache.org/viewcvs?rev=374799&view=rev
Log:
Remove vestiges of mapred's webapp.

Modified:
lucene/nutch/trunk/build.xml

Modified: lucene/nutch/trunk/build.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/build.xml?rev=374799&r1=374798&r2=374799&view=diff
==
--- lucene/nutch/trunk/build.xml (original)
+++ lucene/nutch/trunk/build.xml Fri Feb  3 16:55:20 2006
@@ -38,7 +38,6 @@
   
 
 
-
 
 
 
@@ -47,10 +46,6 @@
   
 
 
-
-  
-
-
 
   
   
@@ -375,10 +370,6 @@
 
 
   
-
-
-
-  
 
 
 




svn commit: r374797 - /lucene/nutch/trunk/src/test/org/apache/nutch/pagedb/

2006-02-03 Thread cutting
Author: cutting
Date: Fri Feb  3 16:46:45 2006
New Revision: 374797

URL: http://svn.apache.org/viewcvs?rev=374797&view=rev
Log:
Removing unused directory.

Removed:
lucene/nutch/trunk/src/test/org/apache/nutch/pagedb/



svn commit: r374796 [5/5] - in /lucene/nutch/trunk: bin/ conf/ lib/ lib/jetty-ext/ src/java/org/apache/nutch/analysis/ src/java/org/apache/nutch/clustering/ src/java/org/apache/nutch/crawl/ src/java/o

2006-02-03 Thread cutting
Modified: 
lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestOutlinkExtractor.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestOutlinkExtractor.java?rev=374796&r1=374795&r2=374796&view=diff
==
--- 
lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestOutlinkExtractor.java 
(original)
+++ 
lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestOutlinkExtractor.java 
Fri Feb  3 16:38:32 2006
@@ -18,7 +18,8 @@
 
 import org.apache.nutch.parse.Outlink;
 import org.apache.nutch.parse.OutlinkExtractor;
-import org.apache.nutch.util.NutchConf;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.util.NutchConfiguration;
 
 import junit.framework.TestCase;
 
@@ -31,15 +32,15 @@
  */
 public class TestOutlinkExtractor extends TestCase {
 
-  private static NutchConf nutchConf = new NutchConf();
+  private static Configuration conf = NutchConfiguration.create();
   public void testGetNoOutlinks() {
 Outlink[]  outlinks = null;
 
-outlinks = OutlinkExtractor.getOutlinks(null, nutchConf);
+outlinks = OutlinkExtractor.getOutlinks(null, conf);
 assertNotNull(outlinks);
 assertEquals(0, outlinks.length);
 
-outlinks = OutlinkExtractor.getOutlinks("", nutchConf);
+outlinks = OutlinkExtractor.getOutlinks("", conf);
 assertNotNull(outlinks);
 assertEquals(0, outlinks.length);
   }
@@ -48,7 +49,7 @@
 Outlink[] outlinks = OutlinkExtractor.getOutlinks(
 "Test with http://www.nutch.org/index.html is it found? " +
 "What about www.google.com at http://www.google.de " +
-"A longer URL could be http://www.sybit.com/solutions/portals.html", 
nutchConf);
+"A longer URL could be http://www.sybit.com/solutions/portals.html", 
conf);
 
 assertTrue("Url not found!", outlinks.length == 3);
 assertEquals("Wrong URL", "http://www.nutch.org/index.html", 
outlinks[0].getToUrl());
@@ -60,7 +61,7 @@
 Outlink[] outlinks = OutlinkExtractor.getOutlinks(
 "Test with http://www.nutch.org/index.html is it found? " +
 "What about www.google.com at http://www.google.de " +
-"A longer URL could be http://www.sybit.com/solutions/portals.html", 
"http://www.sybit.de", nutchConf);
+"A longer URL could be http://www.sybit.com/solutions/portals.html", 
"http://www.sybit.de", conf);
 
 assertTrue("Url not found!", outlinks.length == 3);
 assertEquals("Wrong URL", "http://www.nutch.org/index.html", 
outlinks[0].getToUrl());
@@ -70,7 +71,7 @@
   public void testGetOutlinksFtp() {
 Outlink[] outlinks = OutlinkExtractor.getOutlinks(
 "Test with ftp://www.nutch.org is it found? " +
-"What about www.google.com at ftp://www.google.de", nutchConf);
+"What about www.google.com at ftp://www.google.de", conf);
 
 assertTrue("Url not found!", outlinks.length >1);
 assertEquals("Wrong URL", "ftp://www.nutch.org/", outlinks[0].getToUrl());

Modified: lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParseData.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParseData.java?rev=374796&r1=374795&r2=374796&view=diff
==
--- lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParseData.java 
(original)
+++ lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParseData.java Fri 
Feb  3 16:38:32 2006
@@ -16,9 +16,12 @@
 
 package org.apache.nutch.parse;
 
-import org.apache.nutch.io.*;
+import org.apache.hadoop.io.*;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.util.NutchConfiguration;
+
+import org.apache.nutch.util.WritableTestUtils;
 import org.apache.nutch.protocol.ContentProperties;
-import org.apache.nutch.util.NutchConf;
 
 import junit.framework.TestCase;
 
@@ -26,7 +29,7 @@
 
 public class TestParseData extends TestCase {
 
-  private NutchConf nutchConf = new NutchConf();
+  private Configuration conf = NutchConfiguration.create();
   
   public TestParseData(String name) { super(name); }
 
@@ -35,8 +38,8 @@
 String title = "The Foo Page";
 
 Outlink[] outlinks = new Outlink[] {
-  new Outlink("http://foo.com/", "Foo", nutchConf),
-  new Outlink("http://bar.com/", "Bar", nutchConf)
+  new Outlink("http://foo.com/", "Foo", conf),
+  new Outlink("http://bar.com/", "Bar", conf)
 };
 
 ContentProperties metaData = new ContentProperties();
@@ -44,9 +47,9 @@
 metaData.put("Charset", "UTF-8");
 
 ParseData r = new ParseData(ParseStatus.STATUS_SUCCESS, title, outlinks, 
metaData);
-r.setConf(nutchConf);
+r.setConf(conf);
 
-TestWritable.testWritable(r);
+WritableTestUtils.testWritable(r, conf);
   }

 }

Modified: lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParseText.java
URL:

svn commit: r374202 - /lucene/nutch/trunk/src/java/org/apache/nutch/mapred/TaskRunner.java

2006-02-01 Thread cutting
Author: cutting
Date: Wed Feb  1 15:19:54 2006
New Revision: 374202

URL: http://svn.apache.org/viewcvs?rev=374202&view=rev
Log:
Fix NUTCH-197: job fails when jar doesn't contain a lib directory.  Contributed 
by Owen O'Malley.

Modified:
lucene/nutch/trunk/src/java/org/apache/nutch/mapred/TaskRunner.java

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/mapred/TaskRunner.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/mapred/TaskRunner.java?rev=374202&r1=374201&r2=374202&view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/mapred/TaskRunner.java 
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/mapred/TaskRunner.java Wed Feb 
 1 15:19:54 2006
@@ -78,9 +78,11 @@
   if (jar != null) {  // if jar exists, it into workDir
 runChild(new String[] { "unzip", jar}, workDir);
 File[] libs = new File(workDir, "lib").listFiles();
-for (int i = 0; i < libs.length; i++) {
-  classPath.append(sep);  // add libs from jar to classpath
-  classPath.append(libs[i]);
+if (libs != null) {
+  for (int i = 0; i < libs.length; i++) {
+classPath.append(sep);// add libs from jar to classpath
+classPath.append(libs[i]);
+  }
 }
 classPath.append(sep);
 classPath.append(new File(workDir, "classes"));




svn commit: r372810 - /lucene/nutch/trunk/bin/nutch

2006-01-27 Thread cutting
Author: cutting
Date: Fri Jan 27 02:45:35 2006
New Revision: 372810

URL: http://svn.apache.org/viewcvs?rev=372810&view=rev
Log:
Explicitly specify bash, since this script requires some bash-specific features.

Modified:
lucene/nutch/trunk/bin/nutch

Modified: lucene/nutch/trunk/bin/nutch
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/bin/nutch?rev=372810&r1=372809&r2=372810&view=diff
==
--- lucene/nutch/trunk/bin/nutch (original)
+++ lucene/nutch/trunk/bin/nutch Fri Jan 27 02:45:35 2006
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # 
 # The Nutch command script
 #




svn commit: r372342 - /lucene/nutch/nightly/nightly.sh

2006-01-25 Thread cutting
Author: cutting
Date: Wed Jan 25 14:20:06 2006
New Revision: 372342

URL: http://svn.apache.org/viewcvs?rev=372342&view=rev
Log:
Fix remove command.

Modified:
lucene/nutch/nightly/nightly.sh

Modified: lucene/nutch/nightly/nightly.sh
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/nightly/nightly.sh?rev=372342&r1=372341&r2=372342&view=diff
==
--- lucene/nutch/nightly/nightly.sh (original)
+++ lucene/nutch/nightly/nightly.sh Wed Jan 25 14:20:06 2006
@@ -25,4 +25,4 @@
 scp build/*.tar.gz $REL_SERVER:$REL_DIR/nutch-`/bin/date +%F`.tar.gz
 
 # remove all but five newest builds
-ssh $REL_SERVER rm `ssh $REL_SERVER find -type f $REL_DIR | sort -r | tail +5`
+ssh $REL_SERVER rm `ssh $REL_SERVER find $REL_DIR -type f | sort -r | tail +5`




svn commit: r372315 - /lucene/nutch/nightly/nightly.sh

2006-01-25 Thread cutting
Author: cutting
Date: Wed Jan 25 13:12:13 2006
New Revision: 372315

URL: http://svn.apache.org/viewcvs?rev=372315&view=rev
Log:
Fix deletion of old versions.

Modified:
lucene/nutch/nightly/nightly.sh

Modified: lucene/nutch/nightly/nightly.sh
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/nightly/nightly.sh?rev=372315&r1=372314&r2=372315&view=diff
==
--- lucene/nutch/nightly/nightly.sh (original)
+++ lucene/nutch/nightly/nightly.sh Wed Jan 25 13:12:13 2006
@@ -24,5 +24,5 @@
 # release it
 scp build/*.tar.gz $REL_SERVER:$REL_DIR/nutch-`/bin/date +%F`.tar.gz
 
-# remove old release
-ssh $REL_SERVER rm -rf $REL_DIR/nutch-`/bin/date -d'week ago' +%F`.tar.gz
+# remove all but five newest builds
+ssh $REL_SERVER rm `ssh $REL_SERVER find -type f $REL_DIR | sort -r | tail +5`




svn commit: r370657 - in /lucene/nutch/nightly: nightly.cron nightly.properties nightly.sh

2006-01-19 Thread cutting
Author: cutting
Date: Thu Jan 19 14:46:28 2006
New Revision: 370657

URL: http://svn.apache.org/viewcvs?rev=370657&view=rev
Log:
Moving nightly build to lucene.zones.apache.org.

Modified:
lucene/nutch/nightly/nightly.cron
lucene/nutch/nightly/nightly.properties
lucene/nutch/nightly/nightly.sh

Modified: lucene/nutch/nightly/nightly.cron
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/nightly/nightly.cron?rev=370657&r1=370656&r2=370657&view=diff
==
--- lucene/nutch/nightly/nightly.cron (original)
+++ lucene/nutch/nightly/nightly.cron Thu Jan 19 14:46:28 2006
@@ -1,4 +1,4 @@
 # nightly crontab file
 # install with: crontab nightly.cron
 # run seventeen minutes after midnight, every day
-17 0 * * *   $HOME/src/nutch/nightly/nightly.sh > 
$HOME/src/nutch/nightly/nightly.log 2>&1
+17 0 * * *   $HOME/nutch-nightly/nightly.sh > 
$HOME/nutch-nightly/nightly.log 2>&1

Modified: lucene/nutch/nightly/nightly.properties
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/nightly/nightly.properties?rev=370657&r1=370656&r2=370657&view=diff
==
--- lucene/nutch/nightly/nightly.properties (original)
+++ lucene/nutch/nightly/nightly.properties Thu Jan 19 14:46:28 2006
@@ -1,4 +1,4 @@
-MailLogger.mailhost = smtp.sbcglobal.net
+MailLogger.mailhost = mail.apache.org
 MailLogger.from = nutch-dev@incubator.apache.org
 MailLogger.failure.to = nutch-dev@incubator.apache.org
 MailLogger.failure.subject = Nutch nightly build failure

Modified: lucene/nutch/nightly/nightly.sh
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/nightly/nightly.sh?rev=370657&r1=370656&r2=370657&view=diff
==
--- lucene/nutch/nightly/nightly.sh (original)
+++ lucene/nutch/nightly/nightly.sh Thu Jan 19 14:46:28 2006
@@ -1,6 +1,6 @@
 #!/bin/bash -vx
 
-export JAVA_HOME=$HOME/local/j2sdk1.4.2
+export JAVA_HOME=/usr/j2se
 
 TRUNK=http://svn.apache.org/repos/asf/lucene/nutch/trunk
 
@@ -12,12 +12,12 @@
 cd /tmp
 
 # export sources into it
-svn export $TRUNK nutch-nightly
+$HOME/bin/svn export $TRUNK nutch-nightly
 
 # run build
 cd nutch-nightly
-$HOME/local/ant/bin/ant \
- -propertyfile $HOME/src/nutch/nightly/nightly.properties \
+$HOME/bin/ant \
+ -propertyfile $HOME/nutch-nightly/nightly.properties \
  -logger org.apache.tools.ant.listener.MailLogger \
  -Dversion=nightly nightly
 




svn commit: r370638 - /lucene/nutch/trunk/conf/nutch-default.xml

2006-01-19 Thread cutting
Author: cutting
Date: Thu Jan 19 13:24:58 2006
New Revision: 370638

URL: http://svn.apache.org/viewcvs?rev=370638&view=rev
Log:
Document a few more properties.  Contributed by Dominik Friedrich.

Modified:
lucene/nutch/trunk/conf/nutch-default.xml

Modified: lucene/nutch/trunk/conf/nutch-default.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/conf/nutch-default.xml?rev=370638&r1=370637&r2=370638&view=diff
==
--- lucene/nutch/trunk/conf/nutch-default.xml (original)
+++ lucene/nutch/trunk/conf/nutch-default.xml Thu Jan 19 13:24:58 2006
@@ -379,6 +379,14 @@
   exception.
 
   
+
+  io.map.index.skip
+  0
+  Number of index entries to skip between each entry.
+  Zero by default. Setting this to values larger than zero can
+  facilitate opening large map files using less memory.
+
+
 
 
 
@@ -412,6 +420,14 @@
   directories, typically on different devices.
 
 
+
+  ndfs.replication
+  3
+  How many copies we try to have at all times. The actual
+  number of replications is at max the number of datanodes in the
+  cluster.
+
+
 
 
 
@@ -509,6 +525,13 @@
   200m
   The heap size (-Xmx) that will be used for task tracker
   child processes.
+
+
+
+  mapred.combine.buffer.size
+  10
+  The number of entries the combining collector caches before
+  combining them and writing to disk.
 
 
 




svn commit: r370632 - /lucene/nutch/trunk/conf/nutch-default.xml

2006-01-19 Thread cutting
Author: cutting
Date: Thu Jan 19 12:58:54 2006
New Revision: 370632

URL: http://svn.apache.org/viewcvs?rev=370632&view=rev
Log:
Switch default to protocol-http, since it seems more reliable than 
protocol-httpclient.

Modified:
lucene/nutch/trunk/conf/nutch-default.xml

Modified: lucene/nutch/trunk/conf/nutch-default.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/conf/nutch-default.xml?rev=370632&r1=370631&r2=370632&view=diff
==
--- lucene/nutch/trunk/conf/nutch-default.xml (original)
+++ lucene/nutch/trunk/conf/nutch-default.xml Thu Jan 19 12:58:54 2006
@@ -733,7 +733,7 @@
 
 
   plugin.includes
-  
protocol-httpclient|urlfilter-regex|parse-(text|html|js)|index-basic|query-(basic|site|url)
+  
protocol-http|urlfilter-regex|parse-(text|html|js)|index-basic|query-(basic|site|url)
   Regular expression naming plugin directory names to
   include.  Any plugin not matching this expression is excluded.
   In any case you need at least include the nutch-extensionpoints plugin. By




svn commit: r370281 - /lucene/nutch/trunk/build.xml

2006-01-18 Thread cutting
Author: cutting
Date: Wed Jan 18 14:03:28 2006
New Revision: 370281

URL: http://svn.apache.org/viewcvs?rev=370281&view=rev
Log:
Fix NUTCH-102: include webapps in packaged releases.

Modified:
lucene/nutch/trunk/build.xml

Modified: lucene/nutch/trunk/build.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/build.xml?rev=370281&r1=370280&r2=370281&view=diff
==
--- lucene/nutch/trunk/build.xml (original)
+++ lucene/nutch/trunk/build.xml Wed Jan 18 14:03:28 2006
@@ -377,6 +377,10 @@
   
 
 
+
+  
+
+
 
   
 




svn commit: r367408 - /lucene/nutch/trunk/src/plugin/urlfilter-regex/src/java/org/apache/nutch/net/RegexURLFilter.java

2006-01-09 Thread cutting
Author: cutting
Date: Mon Jan  9 13:55:31 2006
New Revision: 367408

URL: http://svn.apache.org/viewcvs?rev=367408&view=rev
Log:
NUTCH-160: Switch RegexURLFilter to use Java regexes rather than oro, since 
Java's seem to be faster & more reliable.  By Rod Taylor.

Modified:

lucene/nutch/trunk/src/plugin/urlfilter-regex/src/java/org/apache/nutch/net/RegexURLFilter.java

Modified: 
lucene/nutch/trunk/src/plugin/urlfilter-regex/src/java/org/apache/nutch/net/RegexURLFilter.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/urlfilter-regex/src/java/org/apache/nutch/net/RegexURLFilter.java?rev=367408&r1=367407&r2=367408&view=diff
==
--- 
lucene/nutch/trunk/src/plugin/urlfilter-regex/src/java/org/apache/nutch/net/RegexURLFilter.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/urlfilter-regex/src/java/org/apache/nutch/net/RegexURLFilter.java
 Mon Jan  9 13:55:31 2006
@@ -32,12 +32,7 @@
 import java.util.ArrayList;
 import java.util.Iterator;
 import java.util.logging.Logger;
-
-import org.apache.oro.text.regex.Perl5Compiler;
-import org.apache.oro.text.regex.Perl5Matcher;
-import org.apache.oro.text.regex.Perl5Pattern;
-import org.apache.oro.text.regex.PatternMatcher;
-import org.apache.oro.text.regex.MalformedPatternException;
+import java.util.regex.*;
 
 /**
  * Filters URLs based on a file of regular expressions. The file is named by
@@ -80,15 +75,14 @@
   }
 
   private static class Rule {
-public Perl5Pattern pattern;
+public Pattern pattern;
 public boolean sign;
 public String regex;
   }
 
   private List rules;
-  private PatternMatcher matcher = new Perl5Matcher();
 
-  public RegexURLFilter() throws IOException, MalformedPatternException {
+  public RegexURLFilter() throws IOException, PatternSyntaxException {
 String file = NutchConf.get().get("urlfilter.regex.file");
 // attribute "file" takes precedence if defined
 if (attributeFile != null)
@@ -103,7 +97,7 @@
   }
 
   public RegexURLFilter(String filename)
-throws IOException, MalformedPatternException {
+throws IOException, PatternSyntaxException {
 rules = readConfigurationFile(new FileReader(filename));
   }
 
@@ -111,7 +105,9 @@
 Iterator i=rules.iterator();
 while(i.hasNext()) {
   Rule r=(Rule) i.next();
-  if (matcher.contains(url,r.pattern)) {
+  Matcher matcher = r.pattern.matcher(url);
+
+  if (matcher.find()) {
 //System.out.println("Matched " + r.regex);
 return r.sign ? url : null;
   }
@@ -129,10 +125,9 @@
   // 
 
   private static List readConfigurationFile(Reader reader)
-throws IOException, MalformedPatternException {
+throws IOException, PatternSyntaxException {
 
 BufferedReader in=new BufferedReader(reader);
-Perl5Compiler compiler=new Perl5Compiler();
 List rules=new ArrayList();
 String line;

@@ -157,7 +152,7 @@
   String regex=line.substring(1);
 
   Rule rule=new Rule();
-  rule.pattern=(Perl5Pattern) compiler.compile(regex);
+  rule.pattern=Pattern.compile(regex);
   rule.sign=sign;
   rule.regex=regex;
   rules.add(rule);
@@ -167,7 +162,7 @@
   }
 
   public static void main(String args[])
-throws IOException, MalformedPatternException {
+throws IOException, PatternSyntaxException {
 
 RegexURLFilter filter=new RegexURLFilter();
 BufferedReader in=new BufferedReader(new InputStreamReader(System.in));




svn commit: r367406 - in /lucene/nutch/trunk/src: java/org/apache/nutch/ipc/RPC.java test/org/apache/nutch/ipc/TestRPC.java

2006-01-09 Thread cutting
Author: cutting
Date: Mon Jan  9 13:50:48 2006
New Revision: 367406

URL: http://svn.apache.org/viewcvs?rev=367406&view=rev
Log:
Fix parallel RPC calls to work correctly with methods that return void.

Modified:
lucene/nutch/trunk/src/java/org/apache/nutch/ipc/RPC.java
lucene/nutch/trunk/src/test/org/apache/nutch/ipc/TestRPC.java

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/ipc/RPC.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/ipc/RPC.java?rev=367406&r1=367405&r2=367406&view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/ipc/RPC.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/ipc/RPC.java Mon Jan  9 
13:50:48 2006
@@ -149,6 +149,10 @@
 
 Writable[] wrappedValues = CLIENT.call(invocations, addrs);
 
+if (method.getReturnType() == Void.TYPE) {
+  return null;
+}
+
 Object[] values =
   (Object[])Array.newInstance(method.getReturnType(),wrappedValues.length);
 for (int i = 0; i < values.length; i++)

Modified: lucene/nutch/trunk/src/test/org/apache/nutch/ipc/TestRPC.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/test/org/apache/nutch/ipc/TestRPC.java?rev=367406&r1=367405&r2=367406&view=diff
==
--- lucene/nutch/trunk/src/test/org/apache/nutch/ipc/TestRPC.java (original)
+++ lucene/nutch/trunk/src/test/org/apache/nutch/ipc/TestRPC.java Mon Jan  9 
13:50:48 2006
@@ -110,13 +110,17 @@
 }
 assertTrue(caught);
 
-// try a multi-call
-Method method =
+// try some multi-calls
+Method echo =
   TestProtocol.class.getMethod("echo", new Class[] { String.class });
-String[] values = (String[])RPC.call(method, new String[][]{{"a"},{"b"}},
+String[] strings = (String[])RPC.call(echo, new String[][]{{"a"},{"b"}},
  new InetSocketAddress[] {addr, addr});
-assertTrue(Arrays.equals(values, new String[]{"a","b"}));
+assertTrue(Arrays.equals(strings, new String[]{"a","b"}));
 
+Method ping = TestProtocol.class.getMethod("ping", new Class[] {});
+Object[] voids = (Object[])RPC.call(ping, new Object[][]{{},{}},
+new InetSocketAddress[] {addr, addr});
+assertEquals(voids, null);
 
 server.stop();
   }




svn commit: r366573 - /lucene/nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java

2006-01-06 Thread cutting
Author: cutting
Date: Fri Jan  6 13:42:25 2006
New Revision: 366573

URL: http://svn.apache.org/viewcvs?rev=366573&view=rev
Log:
Fix for NUTCH-150, by Paul Baclace.

1. Adds a comment that non-plain-text can be a problem.
2. Adds quantifiers to the regular expression to limit length of matched text.
3. Monitors the time spent doing matching and if more than 60 seconds,
it will stop looking for additional matches (this does not prevent the
first lengthy match).

Modified:
lucene/nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java

Modified: 
lucene/nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java?rev=366573&r1=366572&r2=366573&view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java 
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java 
Fri Jan  6 13:42:25 2006
@@ -55,11 +55,12 @@
*  
*/
   private static final String URL_PATTERN = 
-
"([A-Za-z][A-Za-z0-9+.-]+:[A-Za-z0-9/](([A-Za-z0-9$_.+!*,;/?:@&~=-])|%[A-Fa-f0-9]{2})+(#([a-zA-Z0-9][a-zA-Z0-9$_.+!*,;/?:@&~=%-]*))?)";
+
"([A-Za-z][A-Za-z0-9+.-]{1,120}:[A-Za-z0-9/](([A-Za-z0-9$_.+!*,;/?:@&~=-])|%[A-Fa-f0-9]{2}){1,333}(#([a-zA-Z0-9][a-zA-Z0-9$_.+!*,;/?:@&~=%-]{0,1000}))?)";
 
   /**
* Extracts Outlink from given plain text.
-   * 
+   * Applying this method to non-plain-text can result in extremely lengthy
+   * runtimes for parasitic cases (postscript is a known example).
* @param plainText  the plain text from wich URLs should be extracted.
* 
* @return Array of Outlinks within found in plainText
@@ -78,7 +79,7 @@
* @return Array of Outlinks within found in plainText
*/
   public static Outlink[] getOutlinks(final String plainText, String anchor) {
-
+long start = System.currentTimeMillis();
 final List outlinks = new ArrayList();
 
 try {
@@ -95,13 +96,19 @@
 
   //loop the matches
   while (matcher.contains(input, pattern)) {
+// if this is taking too long, stop matching
+//   (SHOULD really check cpu time used so that heavily loaded systems
+//   do not unnecessarily hit this limit.)
+if (System.currentTimeMillis() - start >= 60000L) {
+  LOG.warning("Time limit exceeded for getOutLinks");
+  break;
+}
 result = matcher.getMatch();
 url = result.group(0);
 outlinks.add(new Outlink(url, anchor));
   }
 } catch (Exception ex) {
-  // if it is a malformed URL we just throw it away and continue with
-  // extraction.
+  // if the matcher fails (perhaps a malformed URL) we just log it and 
move on
   LOG.throwing(OutlinkExtractor.class.getName(), "getOutlinks", ex);
 }
 




svn commit: r366571 - /lucene/nutch/trunk/src/java/org/apache/nutch/util/CommandRunner.java

2006-01-06 Thread cutting
Author: cutting
Date: Fri Jan  6 13:35:35 2006
New Revision: 366571

URL: http://svn.apache.org/viewcvs?rev=366571&view=rev
Log:
Fix for NUTCH-151: CommandRunner can hang after the main thread exec
is finished and has an inefficient busy loop.

  I encountered a case where the JVM of a Tasktracker child did not exit
after the main thread returned; a thread dump showed only the threads named
STDOUT and STDERR from CommandRunner as non-daemon threads, and both were
doing a read().  CommandRunner also had an excessively costly busy loop.
These problems were fixed by:
1. The pipe io threads should be daemons.
2. The main thread should always interrupt() the pipe io threads when
   finishing up, not just when a timeout occurs.
3. Sleep before testing whether the process has finished with
   Process.exitValue().
4. Increased the sleep time to be 1000msec.


By Paul Baclace.

Modified:
lucene/nutch/trunk/src/java/org/apache/nutch/util/CommandRunner.java

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/util/CommandRunner.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/util/CommandRunner.java?rev=366571&r1=366570&r2=366571&view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/util/CommandRunner.java 
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/util/CommandRunner.java Fri 
Jan  6 13:35:35 2006
@@ -18,6 +18,7 @@
  * Adopted by John Xing for Nutch Project from
  * http://blog.fivesight.com/prb/space/Call+an+External+Command+from+Java/,
  * which explains the code in detail.
+ * [Original author is moving his site to http://mult.ifario.us/   -peb]
  *
  * Comments by John Xing on 20040621:
  * (1) EDU.oswego.cs.dl.util.concurrent.* is in j2sdk 1.5 now.
@@ -31,6 +32,7 @@
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.OutputStream;
+import java.io.InterruptedIOException;
 
 import EDU.oswego.cs.dl.util.concurrent.BrokenBarrierException;
 import EDU.oswego.cs.dl.util.concurrent.CyclicBarrier;
@@ -80,40 +82,47 @@
   }
 
   public void evaluate() throws IOException {
-Process proc = Runtime.getRuntime().exec(_command);
+  this.exec();
+  }
 
+  /**
+   *
+   * @return process exit value (return code) or -1 if timed out.
+   * @throws IOException
+   */
+  public int exec() throws IOException {
+Process proc = Runtime.getRuntime().exec(_command);
 _barrier = new CyclicBarrier(3 + ((_stdin != null) ? 1 : 0));
 
 PullerThread so =
   new PullerThread("STDOUT", proc.getInputStream(), _stdout);
+so.setDaemon(true);
 so.start();
 
 PullerThread se =
   new PullerThread("STDERR", proc.getErrorStream(), _stderr);
+se.setDaemon(true);
 se.start();
 
 PusherThread si = null;
 if (_stdin != null) {
   si = new PusherThread("STDIN", _stdin, proc.getOutputStream());
+  si.setDaemon(true);
   si.start();
 }
 
 boolean _timedout = false;
 long end = System.currentTimeMillis() + _timeout * 1000;
 
+//
 try {
   if (_timeout == 0) {
-_barrier.barrier();
+_barrier.barrier(); // JDK 1.5: // _barrier.await();
   } else {
-_barrier.attemptBarrier(_timeout * 1000);
+_barrier.attemptBarrier(_timeout * 1000); // JDK 1.5: //  
_barrier.await(_timeout, TimeUnit.SECONDS);
   }
 } catch (TimeoutException ex) {
   _timedout = true;
-  if (si != null) {
-si.interrupt();
-  }
-  so.interrupt();
-  se.interrupt();
   if (_destroyOnTimeout) {
 proc.destroy();
   }
@@ -123,16 +132,27 @@
   /* IGNORE */
 }
 
+// tell the io threads we are finished
+if (si != null) {
+  si.interrupt();
+}
+so.interrupt();
+se.interrupt();
+
 _xit = -1;
 
 if (!_timedout) {
   if (_waitForExit) {
 do {
   try {
+Thread.sleep(1000);
 _xit = proc.exitValue();
-Thread.sleep(250);
   } catch (InterruptedException ie) {
-/* IGNORE */
+  if (Thread.interrupted()) {
+  break; // stop waiting on an interrupt for this thread
+  } else {
+  continue;
+  }
   } catch (IllegalThreadStateException iltse) {
 continue;
   }
@@ -152,6 +172,7 @@
 proc.destroy();
   }
 }
+return _xit;
   }
 
   public Throwable getThrownError() {
@@ -163,8 +184,6 @@
 private OutputStream _os;
 private InputStream _is;
 
-private volatile boolean _kaput;
-
 private boolean _closeInput;
 
 protected PumperThread(
@@ -179,7 +198,6 @@
 }
 
 public void run() {
-  _kaput = false;
   try {
 byte[] buf = new byte[BUF];
 int read = 0;
@@ -189,9 +207,10 @@
   _os.write(buf, 0, read);
   _os.flush();
 }
+  } catch (Inte

svn commit: r366550 - /lucene/nutch/trunk/src/java/org/apache/nutch/ipc/Client.java

2006-01-06 Thread cutting
Author: cutting
Date: Fri Jan  6 11:14:46 2006
New Revision: 366550

URL: http://svn.apache.org/viewcvs?rev=366550&view=rev
Log:
Make it clearer why this optimization is valid.  For Stefan.

Modified:
lucene/nutch/trunk/src/java/org/apache/nutch/ipc/Client.java

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/ipc/Client.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/ipc/Client.java?rev=366550&r1=366549&r2=366550&view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/ipc/Client.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/ipc/Client.java Fri Jan  6 
11:14:46 2006
@@ -306,7 +306,7 @@
* contains nulls for calls that timed out or errored.  */
   public Writable[] call(Writable[] params, InetSocketAddress[] addresses)
 throws IOException {
-if (params.length == 0) return new Writable[0];
+if (addresses.length == 0) return new Writable[0];
 
 ParallelResults results = new ParallelResults(params.length);
 synchronized (results) {




svn commit: r366322 - /lucene/nutch/trunk/src/java/org/apache/nutch/searcher/LuceneQueryOptimizer.java

2006-01-05 Thread cutting
Author: cutting
Date: Thu Jan  5 14:37:19 2006
New Revision: 366322

URL: http://svn.apache.org/viewcvs?rev=366322&view=rev
Log:
Fix a bug in LimitedCollector.

Modified:

lucene/nutch/trunk/src/java/org/apache/nutch/searcher/LuceneQueryOptimizer.java

Modified: 
lucene/nutch/trunk/src/java/org/apache/nutch/searcher/LuceneQueryOptimizer.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/searcher/LuceneQueryOptimizer.java?rev=366322&r1=366321&r2=366322&view=diff
==
--- 
lucene/nutch/trunk/src/java/org/apache/nutch/searcher/LuceneQueryOptimizer.java 
(original)
+++ 
lucene/nutch/trunk/src/java/org/apache/nutch/searcher/LuceneQueryOptimizer.java 
Thu Jan  5 14:37:19 2006
@@ -48,7 +48,7 @@
 private int maxHits;
 
 public LimitedCollector(int numHits, int maxHits) {
-  super(maxHits);
+  super(numHits);
   this.maxHits = maxHits;
 }
 




svn commit: r366280 - /lucene/nutch/trunk/conf/nutch-default.xml

2006-01-05 Thread cutting
Author: cutting
Date: Thu Jan  5 13:08:27 2006
New Revision: 366280

URL: http://svn.apache.org/viewcvs?rev=366280&view=rev
Log:
Fix NUTCH-131: add mapred.child.heap.size.  From Marko Bauhardt.

Modified:
lucene/nutch/trunk/conf/nutch-default.xml

Modified: lucene/nutch/trunk/conf/nutch-default.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/conf/nutch-default.xml?rev=366280&r1=366279&r2=366280&view=diff
==
--- lucene/nutch/trunk/conf/nutch-default.xml (original)
+++ lucene/nutch/trunk/conf/nutch-default.xml Thu Jan  5 13:08:27 2006
@@ -504,6 +504,13 @@
   
 
 
+
+  mapred.child.heap.size
+  200m
+  The heap size (-Xmx) that will be used for task tracker
+  child processes.
+
+
 
 
 




svn commit: r366271 - /lucene/nutch/trunk/src/java/org/apache/nutch/mapred/TaskTracker.java

2006-01-05 Thread cutting
Author: cutting
Date: Thu Jan  5 12:13:43 2006
New Revision: 366271

URL: http://svn.apache.org/viewcvs?rev=366271&view=rev
Log:
Fix for NUTCH-108: eliminate voluminous messages when reconnecting.
From Paul Baclace.

Modified:
lucene/nutch/trunk/src/java/org/apache/nutch/mapred/TaskTracker.java

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/mapred/TaskTracker.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/mapred/TaskTracker.java?rev=366271&r1=366270&r2=366271&view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/mapred/TaskTracker.java 
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/mapred/TaskTracker.java Thu 
Jan  5 12:13:43 2006
@@ -287,8 +287,7 @@
 staleState = true;
 }
 } catch (Exception ex) {
-ex.printStackTrace();
-LOG.info("Lost connection to JobTracker [" + 
jobTrackAddr + "].  Retrying...");
+LOG.info("Lost connection to JobTracker [" + 
jobTrackAddr + "]. ex=" + ex + "  Retrying...");
 try {
 Thread.sleep(5000);
 } catch (InterruptedException ie) {




svn commit: r366242 - in /lucene/nutch/trunk: conf/nutch-default.xml src/java/org/apache/nutch/searcher/LuceneQueryOptimizer.java

2006-01-05 Thread cutting
Author: cutting
Date: Thu Jan  5 10:38:44 2006
New Revision: 366242

URL: http://svn.apache.org/viewcvs?rev=366242&view=rev
Log:
Fix NegativeArraySizeException.

Modified:
lucene/nutch/trunk/conf/nutch-default.xml

lucene/nutch/trunk/src/java/org/apache/nutch/searcher/LuceneQueryOptimizer.java

Modified: lucene/nutch/trunk/conf/nutch-default.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/conf/nutch-default.xml?rev=366242&r1=366241&r2=366242&view=diff
==
--- lucene/nutch/trunk/conf/nutch-default.xml (original)
+++ lucene/nutch/trunk/conf/nutch-default.xml Thu Jan  5 10:38:44 2006
@@ -661,10 +661,11 @@
 
 
   searcher.max.hits
-  2147483647
-  Search stops after this many hits are found.  Setting
-  this to smaller values can make searches much faster.  With a sorted
-  index, the quality of the hits suffers little.
+  -1
+  If positive, search stops after this many hits are
+  found.  Setting this to small, positive values (e.g., 1000) can make
+  searches much faster.  With a sorted index, the quality of the hits
+  suffers little.
 
 
 

Modified: 
lucene/nutch/trunk/src/java/org/apache/nutch/searcher/LuceneQueryOptimizer.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/searcher/LuceneQueryOptimizer.java?rev=366242&r1=366241&r2=366242&view=diff
==
--- 
lucene/nutch/trunk/src/java/org/apache/nutch/searcher/LuceneQueryOptimizer.java 
(original)
+++ 
lucene/nutch/trunk/src/java/org/apache/nutch/searcher/LuceneQueryOptimizer.java 
Thu Jan  5 10:38:44 2006
@@ -37,8 +37,7 @@
  * which do not affect ranking but might otherwise slow search considerably. */
 class LuceneQueryOptimizer {
 
-  private static int MAX_HITS =
-NutchConf.get().getInt("searcher.max.hits", Integer.MAX_VALUE);
+  private static int MAX_HITS = NutchConf.get().getInt("searcher.max.hits",-1);
 
   private static class LimitExceeded extends RuntimeException {
 private int maxDoc;
@@ -150,6 +149,13 @@
   }
 }
 if (sortField == null && !reverse) {
+
+  // no hit limit
+  if (MAX_HITS <= 0) {
+return searcher.search(query, filter, numHits);
+  }
+
+  // hits limited -- use a LimitedCollector
   LimitedCollector collector = new LimitedCollector(numHits, MAX_HITS);
   LimitExceeded exceeded = null;
   try {




svn commit: r365459 - in /lucene/nutch/trunk: conf/nutch-default.xml src/java/org/apache/nutch/indexer/IndexSorter.java src/java/org/apache/nutch/searcher/LuceneQueryOptimizer.java

2006-01-02 Thread cutting
Author: cutting
Date: Mon Jan  2 15:27:50 2006
New Revision: 365459

URL: http://svn.apache.org/viewcvs?rev=365459&view=rev
Log:
Add index sorter & ability to stop searching after N hits.

Added:
lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSorter.java
Modified:
lucene/nutch/trunk/conf/nutch-default.xml

lucene/nutch/trunk/src/java/org/apache/nutch/searcher/LuceneQueryOptimizer.java

Modified: lucene/nutch/trunk/conf/nutch-default.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/conf/nutch-default.xml?rev=365459&r1=365458&r2=365459&view=diff
==
--- lucene/nutch/trunk/conf/nutch-default.xml (original)
+++ lucene/nutch/trunk/conf/nutch-default.xml Mon Jan  2 15:27:50 2006
@@ -659,6 +659,14 @@
   
 
 
+
+  searcher.max.hits
+  2147483647
+  Search stops after this many hits are found.  Setting
+  this to smaller values can make searches much faster.  With a sorted
+  index, the quality of the hits suffers little.
+
+
 
 
 

Added: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSorter.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSorter.java?rev=365459&view=auto
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSorter.java 
(added)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSorter.java Mon 
Jan  2 15:27:50 2006
@@ -0,0 +1,295 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.indexer;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.Date;
+import java.util.Arrays;
+
+import org.apache.lucene.index.*;
+import org.apache.lucene.document.*;
+import org.apache.lucene.store.*;
+import org.apache.lucene.search.*;
+
+import org.apache.nutch.util.NutchConf;
+
+/** Sort a Nutch index by page score.  Higher scoring documents are assigned
+ * smaller document numbers. */
+public class IndexSorter {
+
+  private static class PostingMap implements Comparable {
+private int newDoc;
+private long offset;
+
+public int compareTo(Object o) {  // order by newDoc id
+  return this.newDoc - ((PostingMap)o).newDoc;
+}
+  }
+
+  private static class SortedTermPositions implements TermPositions {
+private TermPositions original;
+private int[] oldToNew;
+
+private int docFreq;
+
+private PostingMap[] postingMaps = new PostingMap[0];
+private int pointer;
+
+private int freq;
+private int position;
+
+private static final String TEMP_FILE = "temp";
+private final RAMDirectory tempDir = new RAMDirectory();
+private final RAMOutputStream out =
+  (RAMOutputStream)tempDir.createOutput(TEMP_FILE);
+private IndexInput in;
+
+public SortedTermPositions(TermPositions original, int[] oldToNew) {
+  this.original = original;
+  this.oldToNew = oldToNew;
+}
+
+public void seek(Term term) throws IOException {
+  throw new UnsupportedOperationException();
+}
+
+public void seek(TermEnum terms) throws IOException {
+  original.seek(terms);
+
+  docFreq = terms.docFreq();
+  pointer = -1;
+
+  if (docFreq > postingMaps.length) { // grow postingsMap
+PostingMap[] newMap = new PostingMap[docFreq];
+System.arraycopy(postingMaps, 0, newMap, 0, postingMaps.length);
+for (int i = postingMaps.length; i < docFreq; i++) {
+  newMap[i] = new PostingMap();
+}
+postingMaps = newMap;
+  }
+
+  out.reset();
+
+  int i = 0;
+  while (original.next()) {
+PostingMap map = postingMaps[i++];
+map.newDoc = oldToNew[original.doc()];// remap the newDoc id
+map.offset = out.getFilePointer();// save pointer to buffer
+
+final int tf = original.freq();   // buffer tf & positions
+out.writeVInt(tf);
+int prevPosition = 0;
+for (int j = tf; j > 0; j--) {// delta encode positions
+  int p = original.nextPosition();
+  out.writeVInt(p - prevPosition);
+  prevPosition = p;
+}
+  }
+  out.flush();
+  docFreq = i;// allow for deletions
+  
+  Arra

svn commit: r365454 - /lucene/nutch/trunk/src/test/org/apache/nutch/ipc/TestIPC.java

2006-01-02 Thread cutting
Author: cutting
Date: Mon Jan  2 14:33:38 2006
New Revision: 365454

URL: http://svn.apache.org/viewcvs?rev=365454&view=rev
Log:
Revert unintended commit.

Modified:
lucene/nutch/trunk/src/test/org/apache/nutch/ipc/TestIPC.java

Modified: lucene/nutch/trunk/src/test/org/apache/nutch/ipc/TestIPC.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/test/org/apache/nutch/ipc/TestIPC.java?rev=365454&r1=365453&r2=365454&view=diff
==
--- lucene/nutch/trunk/src/test/org/apache/nutch/ipc/TestIPC.java (original)
+++ lucene/nutch/trunk/src/test/org/apache/nutch/ipc/TestIPC.java Mon Jan  2 
14:33:38 2006
@@ -35,8 +35,6 @@
   public static final Logger LOG =
 LogFormatter.getLogger("org.apache.nutch.ipc.TestIPC");
 
-  private static final int TIMEOUT = 1;
-
   // quiet during testing, since output ends up on console
   static {
 LOG.setLevel(Level.WARNING);
@@ -44,6 +42,8 @@
 Server.LOG.setLevel(Level.WARNING);
   }
 
+  public TestIPC(String name) { super(name); }
+
   private static final Random RANDOM = new Random();
 
   private static final int PORT = 1234;
@@ -53,7 +53,7 @@
 
 public TestServer(int port, int handlerCount, boolean sleep) {
   super(port, LongWritable.class, handlerCount);
-  this.setTimeout(TIMEOUT);
+  this.setTimeout(1000);
   this.sleep = sleep;
 }
 
@@ -75,7 +75,7 @@
 public SerialCaller(Client client, int count) {
   this.client = client;
   this.count = count;
-  client.setTimeout(TIMEOUT);
+  client.setTimeout(1000);
 }
 
 public void run() {
@@ -108,7 +108,7 @@
   this.client = client;
   this.addresses = addresses;
   this.count = count;
-  client.setTimeout(TIMEOUT);
+  client.setTimeout(1000);
 }
 
 public void run() {
@@ -157,10 +157,10 @@
   callers[i].join();
   assertFalse(callers[i].failed);
 }
-// for (int i = 0; i < clientCount; i++) {
-//   clients[i].stop();
-// }
-// server.stop();
+for (int i = 0; i < clientCount; i++) {
+  clients[i].stop();
+}
+server.stop();
   }

   public void testParallel() throws Exception {
@@ -207,17 +207,14 @@

   public static void main(String[] args) throws Exception {
 // crank up the volume!
-//  LOG.setLevel(Level.INFO);
-//  Client.LOG.setLevel(Level.INFO);
-//  Server.LOG.setLevel(Level.INFO);
-//  LogFormatter.setShowThreadIDs(true);
-
- new TestIPC().testSerial(10, false, 500, 500, 100);
-//new TestIPC().testParallel(10, false, 2, 4, 2, 4, 1000);
-
-// TestIPC test = new TestIPC();
-// test.testSerial();
-// test.testParallel();
+LOG.setLevel(Level.FINE);
+Client.LOG.setLevel(Level.FINE);
+Server.LOG.setLevel(Level.FINE);
+LogFormatter.setShowThreadIDs(true);
+
+//new TestIPC("test").testSerial(5, false, 2, 10, 1000);
+
+new TestIPC("test").testParallel(10, false, 2, 4, 2, 4, 1000);
 
   }
 




svn commit: r365450 - in /lucene/nutch/trunk: lib/lucene-1.9-rc1-dev.jar lib/lucene-core-1.9-rc1-dev.jar src/test/org/apache/nutch/ipc/TestIPC.java

2006-01-02 Thread cutting
Author: cutting
Date: Mon Jan  2 14:08:50 2006
New Revision: 365450

URL: http://svn.apache.org/viewcvs?rev=365450&view=rev
Log:
New version of Lucene that includes TopDocCollector.

Added:
lucene/nutch/trunk/lib/lucene-core-1.9-rc1-dev.jar   (with props)
Removed:
lucene/nutch/trunk/lib/lucene-1.9-rc1-dev.jar
Modified:
lucene/nutch/trunk/src/test/org/apache/nutch/ipc/TestIPC.java

Added: lucene/nutch/trunk/lib/lucene-core-1.9-rc1-dev.jar
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/lib/lucene-core-1.9-rc1-dev.jar?rev=365450&view=auto
==
Binary file - no diff available.

Propchange: lucene/nutch/trunk/lib/lucene-core-1.9-rc1-dev.jar
--
svn:mime-type = application/octet-stream

Modified: lucene/nutch/trunk/src/test/org/apache/nutch/ipc/TestIPC.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/test/org/apache/nutch/ipc/TestIPC.java?rev=365450&r1=365449&r2=365450&view=diff
==
--- lucene/nutch/trunk/src/test/org/apache/nutch/ipc/TestIPC.java (original)
+++ lucene/nutch/trunk/src/test/org/apache/nutch/ipc/TestIPC.java Mon Jan  2 
14:08:50 2006
@@ -35,6 +35,8 @@
   public static final Logger LOG =
 LogFormatter.getLogger("org.apache.nutch.ipc.TestIPC");
 
+  private static final int TIMEOUT = 1;
+
   // quiet during testing, since output ends up on console
   static {
 LOG.setLevel(Level.WARNING);
@@ -42,8 +44,6 @@
 Server.LOG.setLevel(Level.WARNING);
   }
 
-  public TestIPC(String name) { super(name); }
-
   private static final Random RANDOM = new Random();
 
   private static final int PORT = 1234;
@@ -53,7 +53,7 @@
 
 public TestServer(int port, int handlerCount, boolean sleep) {
   super(port, LongWritable.class, handlerCount);
-  this.setTimeout(1000);
+  this.setTimeout(TIMEOUT);
   this.sleep = sleep;
 }
 
@@ -75,7 +75,7 @@
 public SerialCaller(Client client, int count) {
   this.client = client;
   this.count = count;
-  client.setTimeout(1000);
+  client.setTimeout(TIMEOUT);
 }
 
 public void run() {
@@ -108,7 +108,7 @@
   this.client = client;
   this.addresses = addresses;
   this.count = count;
-  client.setTimeout(1000);
+  client.setTimeout(TIMEOUT);
 }
 
 public void run() {
@@ -157,10 +157,10 @@
   callers[i].join();
   assertFalse(callers[i].failed);
 }
-for (int i = 0; i < clientCount; i++) {
-  clients[i].stop();
-}
-server.stop();
+// for (int i = 0; i < clientCount; i++) {
+//   clients[i].stop();
+// }
+// server.stop();
   }

   public void testParallel() throws Exception {
@@ -207,14 +207,17 @@

   public static void main(String[] args) throws Exception {
 // crank up the volume!
-LOG.setLevel(Level.FINE);
-Client.LOG.setLevel(Level.FINE);
-Server.LOG.setLevel(Level.FINE);
-LogFormatter.setShowThreadIDs(true);
-
-//new TestIPC("test").testSerial(5, false, 2, 10, 1000);
-
-new TestIPC("test").testParallel(10, false, 2, 4, 2, 4, 1000);
+//  LOG.setLevel(Level.INFO);
+//  Client.LOG.setLevel(Level.INFO);
+//  Server.LOG.setLevel(Level.INFO);
+//  LogFormatter.setShowThreadIDs(true);
+
+ new TestIPC().testSerial(10, false, 500, 500, 100);
+//new TestIPC().testParallel(10, false, 2, 4, 2, 4, 1000);
+
+// TestIPC test = new TestIPC();
+// test.testSerial();
+// test.testParallel();
 
   }
 




svn commit: r365392 - /lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java

2006-01-02 Thread cutting
Author: cutting
Date: Mon Jan  2 10:51:18 2006
New Revision: 365392

URL: http://svn.apache.org/viewcvs?rev=365392&view=rev
Log:
Fix divide by zero error in DeleteDuplicates.java.

Modified:
lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java

Modified: 
lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java?rev=365392&r1=365391&r2=365392&view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java 
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java 
Mon Jan  2 10:51:18 2006
@@ -190,7 +190,7 @@
   }
 
   public long getPos() throws IOException {
-return (doc*INDEX_LENGTH)/maxDoc;
+return maxDoc==0 ? 0 : (doc*INDEX_LENGTH)/maxDoc;
   }
 
   public void close() throws IOException {




svn commit: r357197 [5/5] - in /lucene/nutch: branches/mapred/ trunk/ trunk/bin/ trunk/conf/ trunk/lib/ trunk/lib/jetty-ext/ trunk/site/ trunk/src/java/org/apache/nutch/crawl/ trunk/src/java/org/apach

2005-12-16 Thread cutting
Modified: 
lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/RobotRulesParser.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/RobotRulesParser.java?rev=357197&r1=357196&r2=357197&view=diff
==
--- 
lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/RobotRulesParser.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/RobotRulesParser.java
 Fri Dec 16 09:51:05 2005
@@ -53,6 +53,9 @@
   private static final boolean ALLOW_FORBIDDEN =
 NutchConf.get().getBoolean("http.robots.403.allow", false);
 
+  private static final int MAX_REDIRECTS =
+NutchConf.get().getInt("http.redirect.max", 3);
+
   private static final String[] AGENTS = getAgents();
   private static final Hashtable CACHE = new Hashtable();
   
@@ -377,16 +380,30 @@
 RobotRuleSet robotRules = (RobotRuleSet)CACHE.get(host);
 
 if (robotRules == null) { // cache miss
-  HttpResponse response = new HttpResponse(new URL(url, "/robots.txt"));
-
-  if (response.getCode() == 200)   // found rules: parse them
-robotRules = new RobotRulesParser().parseRules(response.getContent());
-  else if ( (response.getCode() == 403) && (!ALLOW_FORBIDDEN) )
-robotRules = FORBID_ALL_RULES;// use forbid all
-  else
-robotRules = EMPTY_RULES; // use default rules
+  int redirects = 0;
+  do {
+HttpResponse response = new HttpResponse(new URL(url, "/robots.txt"));
+
+int code = response.getCode();
+
+if (code == 200) {// found rules: parse them
+  robotRules = new 
RobotRulesParser().parseRules(response.getContent());
+} else if ( (code == 403) && (!ALLOW_FORBIDDEN) ) {
+  robotRules = FORBID_ALL_RULES;  // use forbid all
+} else if (code >= 300 && code < 400) {   // handle redirect
+  if (redirects == MAX_REDIRECTS) {
+robotRules = EMPTY_RULES;
+  } else {
+url = new URL(url, response.getHeader("Location"));
+LOG.fine("redirect to " + url); 
+redirects++;
+  }
+} else {
+  robotRules = EMPTY_RULES; // use default rules
+}
+  } while (robotRules == null);
 
-  CACHE.put(host, robotRules);// cache rules for host
+  CACHE.put(host, robotRules);  // cache rules for host
 }
 
 String path = url.getPath();  // check rules

Modified: 
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java?rev=357197&r1=357196&r2=357197&view=diff
==
--- 
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java
 Fri Dec 16 09:51:05 2005
@@ -60,10 +60,14 @@
   }
 
   public HttpResponse(URL url) throws IOException {
+this(url, false);
+  }
+
+  HttpResponse(URL url, boolean followRedirects) throws IOException {
 this.base = url.toString();
 this.orig = url.toString();
 GetMethod get = new GetMethod(this.orig);
-get.setFollowRedirects(false);
+get.setFollowRedirects(followRedirects);
 get.setRequestHeader("User-Agent", Http.AGENT_STRING);
 HttpMethodParams params = get.getParams();
 // some servers cannot digest the new protocol

Modified: 
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/RobotRulesParser.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/RobotRulesParser.java?rev=357197&r1=357196&r2=357197&view=diff
==
--- 
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/RobotRulesParser.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/RobotRulesParser.java
 Fri Dec 16 09:51:05 2005
@@ -379,7 +379,8 @@
 if (robotRules == null) { // cache miss
   LOG.fine("cache miss " + url);
   try {
-HttpResponse response = new HttpResponse(new URL(url, "/robots.txt"));
+HttpResponse response = new HttpResponse(new URL(url, "/robots.txt"),
+  

svn commit: r357197 [3/5] - in /lucene/nutch: branches/mapred/ trunk/ trunk/bin/ trunk/conf/ trunk/lib/ trunk/lib/jetty-ext/ trunk/site/ trunk/src/java/org/apache/nutch/crawl/ trunk/src/java/org/apach

2005-12-16 Thread cutting
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/ndfs/FSDataset.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/ndfs/FSDataset.java?rev=357197&r1=357196&r2=357197&view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/ndfs/FSDataset.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/ndfs/FSDataset.java Fri Dec 16 
09:51:05 2005
@@ -29,7 +29,8 @@
  ***/
 public class FSDataset implements FSConstants {
 static final double USABLE_DISK_PCT = 0.98;
-/**
+
+  /**
  * A node type that can be built into a tree reflecting the
  * hierarchy of blocks on the local disk.
  */
@@ -166,6 +167,13 @@
 blkid = blkid >> ((15 - halfByteIndex) * 4);
 return (int) ((0x000F) & blkid);
 }
+
+public String toString() {
+  return "FSDir{" +
+  "dir=" + dir +
+  ", children=" + (children == null ? null : 
Arrays.asList(children)) +
+  "}";
+}
 }
 
 //
@@ -282,17 +290,23 @@
 ongoingCreates.add(b);
 reserved += BLOCK_SIZE;
 f = getTmpFile(b);
-
-if (f.exists()) {
-throw new IOException("Unexpected problem in startBlock() for 
" + b + ".  File " + f + " should not be present, but is.");
-}
-}
-
-//
-// Create the zero-length temp file
-//
-if (!f.createNewFile()) {
-throw new IOException("Unexpected problem in startBlock() for " + 
b + ".  File " + f + " should be creatable, but is already present.");
+   try {
+   if (f.exists()) {
+   throw new IOException("Unexpected problem in startBlock() 
for " + b + ".  File " + f + " should not be present, but is.");
+   }
+
+   //
+   // Create the zero-length temp file
+   //
+   if (!f.createNewFile()) {
+   throw new IOException("Unexpected problem in startBlock() 
for " + b + ".  File " + f + " should be creatable, but is already present.");
+   }
+   } catch (IOException ie) {
+System.out.println("Exception!  " + ie);
+   ongoingCreates.remove(b);   
+   reserved -= BLOCK_SIZE;
+throw ie;
+   }
 }
 
 //
@@ -405,4 +419,11 @@
 // REMIND - mjc - should cache this result for performance
 return new File(tmp, b.getBlockName());
 }
+
+public String toString() {
+  return "FSDataset{" +
+"dirpath='" + dirpath + "'" +
+"}";
+}
+
 }

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/ndfs/FSDirectory.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/ndfs/FSDirectory.java?rev=357197&r1=357196&r2=357197&view=diff
==
--- lucene/nutch/trunk/src/java/org/apache/nutch/ndfs/FSDirectory.java 
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/ndfs/FSDirectory.java Fri Dec 
16 09:51:05 2005
@@ -47,7 +47,7 @@
 class INode {
 public String name;
 public INode parent;
-public Vector children = new Vector();
+public TreeMap children = new TreeMap();
 public Block blocks[];
 
 /**
@@ -59,61 +59,45 @@
 }
 
 /**
+ * This is the external interface
  */
 INode getNode(String target) {
-if (! target.startsWith("/")) {
+if (! target.startsWith("/") || target.length() == 0) {
 return null;
-}
-
-if (parent == null) {
-if ("/".equals(target)) {
-return this;
-} else {
-// Check with children
-for (Iterator it = children.iterator(); it.hasNext(); ) {
-INode child = (INode) it.next();
-INode result = child.getNode(target);
-if (result != null) {
-return result;
-}
-}
-}
+} else if (parent == null && "/".equals(target)) {
+return this;
 } else {
-// Strip the leading slash
-if (target.length() > 1) {
-target = target.substring(1);
-}
-
-// Check if it's the current node
-if (name.equals(target)) {
-return this;
+Vector components = new Vector();
+int start = 0;
+int slashid = 0;
+while (start < target.length() && (slash

svn commit: r351462 - in /lucene/nutch/branches/mapred/src: java/org/apache/nutch/ipc/ java/org/apache/nutch/ndfs/ test/org/apache/nutch/ndfs/

2005-12-01 Thread cutting
Author: cutting
Date: Thu Dec  1 12:28:49 2005
New Revision: 351462

URL: http://svn.apache.org/viewcvs?rev=351462&view=rev
Log:
Add TestNDFS, NUTCH-116.  Contributed by Paul Baclace.

Added:
lucene/nutch/branches/mapred/src/test/org/apache/nutch/ndfs/
lucene/nutch/branches/mapred/src/test/org/apache/nutch/ndfs/TestNDFS.java
Modified:
lucene/nutch/branches/mapred/src/java/org/apache/nutch/ipc/Server.java
lucene/nutch/branches/mapred/src/java/org/apache/nutch/ndfs/DataNode.java
lucene/nutch/branches/mapred/src/java/org/apache/nutch/ndfs/FSDataset.java

lucene/nutch/branches/mapred/src/java/org/apache/nutch/ndfs/FSNamesystem.java
lucene/nutch/branches/mapred/src/java/org/apache/nutch/ndfs/NameNode.java

Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/ipc/Server.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/ipc/Server.java?rev=351462&r1=351461&r2=351462&view=diff
==
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/ipc/Server.java 
(original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/ipc/Server.java Thu 
Dec  1 12:28:49 2005
@@ -95,7 +95,9 @@
   }
   try {
 socket.close();
-  } catch (IOException e) {}
+  } catch (IOException e) {
+LOG.info(getName() + ": e=" + e);
+  }
   LOG.info(getName() + ": exiting");
 }
   }
@@ -166,9 +168,9 @@
 
   /** Handles queued calls . */
   private class Handler extends Thread {
-public Handler() {
+public Handler(int instanceNumber) {
   this.setDaemon(true);
-  this.setName("Server handler on " + port);
+  this.setName("Server handler "+ instanceNumber + " on " + port);
 }
 
 public void run() {
@@ -242,25 +244,31 @@
 listener.start();
 
 for (int i = 0; i < handlerCount; i++) {
-  Handler handler = new Handler();
+  Handler handler = new Handler(i);
   handler.start();
 }
   }
 
-  /** Stops the service.  No calls will be handled after this is called.  All
-   * threads will exit. */
+  /** Stops the service.  No new calls will be handled after this is called.  
All
+   * subthreads will likely be finished after this returns.
+   */
   public synchronized void stop() {
 LOG.info("Stopping server on " + port);
 running = false;
 try {
-  Thread.sleep(timeout);// let all threads exit
+  Thread.sleep(timeout); //  inexactly wait for pending requests to 
finish
 } catch (InterruptedException e) {}
-notify();
+notifyAll();
   }
 
-  /** Wait for the server to be stopped. */
+  /** Wait for the server to be stopped.
+   * Does not wait for all subthreads to finish.
+   *  See {@link #stop()}.
+   */
   public synchronized void join() throws InterruptedException {
-wait();
+while (running) {
+  wait();
+}
   }
 
   /** Called for each call. */

Modified: 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/ndfs/DataNode.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/ndfs/DataNode.java?rev=351462&r1=351461&r2=351462&view=diff
==
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/ndfs/DataNode.java 
(original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/ndfs/DataNode.java 
Thu Dec  1 12:28:49 2005
@@ -37,7 +37,7 @@
  **/
 public class DataNode implements FSConstants, Runnable {
 public static final Logger LOG = 
LogFormatter.getLogger("org.apache.nutch.ndfs.DataNode");
-//
+  //
 // REMIND - mjc - I might bring "maxgigs" back so user can place 
 // artificial  limit on space
 //private static final long GIGABYTE = 1024 * 1024 * 1024;
@@ -59,6 +59,8 @@
 return new InetSocketAddress(host, port);
 }
 
+
+private static Vector subThreadList = null;
 DatanodeProtocol namenode;
 FSDataset data;
 String localName;
@@ -66,6 +68,8 @@
 Vector receivedBlockList = new Vector();
 int xmitsInProgress = 0;
 Daemon dataXceiveServer = null;
+long blockReportInterval;
+private long datanodeStartupPeriod;
 private NutchConf fConf;
 
 /**
@@ -98,6 +102,13 @@
 this.localName = machineName + ":" + tmpPort;
 this.dataXceiveServer = new Daemon(new DataXceiveServer(ss));
 this.dataXceiveServer.start();
+
+long blockReportIntervalBasis =
+  conf.getLong("ndfs.blockreport.intervalMsec", BLOCKREPORT_INTERVAL);
+this.blockReportInterval =
+  blockReportIntervalBasis - new 
Random().nextInt((int)(blockReportIntervalBasis/10));
+this.datanodeStartupPeriod =
+

svn commit: r350310 - in /lucene/nutch/branches/mapred/src/java/org/apache/nutch: crawl/Crawl.java fs/NDFSFileSystem.java fs/NutchFileSystem.java mapred/JobConf.java ndfs/DatanodeInfo.java ndfs/NDFSCl

2005-12-01 Thread cutting
Author: cutting
Date: Thu Dec  1 11:59:24 2005
New Revision: 350310

URL: http://svn.apache.org/viewcvs?rev=350310&view=rev
Log:
Paul Baclace's code & comment cleanups from NUTCH-116.

Modified:
lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Crawl.java

lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/NDFSFileSystem.java

lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/NutchFileSystem.java
lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/JobConf.java

lucene/nutch/branches/mapred/src/java/org/apache/nutch/ndfs/DatanodeInfo.java
lucene/nutch/branches/mapred/src/java/org/apache/nutch/ndfs/NDFSClient.java

Modified: 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Crawl.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Crawl.java?rev=350310&r1=350309&r2=350310&view=diff
==
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Crawl.java 
(original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Crawl.java Thu 
Dec  1 11:59:24 2005
@@ -52,7 +52,7 @@
 JobConf conf = new JobConf(NutchConf.get());
 //conf.addConfResource("crawl-tool.xml");
 
-File rootUrlFile = null;
+File rootUrlDir = null;
 File dir = new File("crawl-" + getDate());
 int threads = conf.getInt("fetcher.threads.fetch", 10);
 int depth = 5;
@@ -72,7 +72,7 @@
 topN = Integer.parseInt(args[i+1]);
 i++;
   } else if (args[i] != null) {
-rootUrlFile = new File(args[i]);
+rootUrlDir = new File(args[i]);
   }
 }
 
@@ -82,7 +82,7 @@
 }
 
 LOG.info("crawl started in: " + dir);
-LOG.info("rootUrlFile = " + rootUrlFile);
+LOG.info("rootUrlDir = " + rootUrlDir);
 LOG.info("threads = " + threads);
 LOG.info("depth = " + depth);
 
@@ -98,7 +98,7 @@
 File tmpDir = conf.getLocalFile("crawl", getDate());
   
 // initialize crawlDb
-new Injector(conf).inject(crawlDb, rootUrlFile);
+new Injector(conf).inject(crawlDb, rootUrlDir);
   
 for (int i = 0; i < depth; i++) { // generate new segment
   File segment =

Modified: 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/NDFSFileSystem.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/NDFSFileSystem.java?rev=350310&r1=350309&r2=350310&view=diff
==
--- 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/NDFSFileSystem.java 
(original)
+++ 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/NDFSFileSystem.java 
Thu Dec  1 11:59:24 2005
@@ -25,8 +25,9 @@
 import org.apache.nutch.util.NutchConf;
 
 /
- * Implement the NutchFileSystem interface for the NDFS system.
- *
+ * Implementation of the abstract NutchFileSystem for the NDFS system.
+ * This is the distributed file system.  It can be distributed over
+ * 1 or more machines 
  * @author Mike Cafarella
  */
 public class NDFSFileSystem extends NutchFileSystem {

Modified: 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/NutchFileSystem.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/NutchFileSystem.java?rev=350310&r1=350309&r2=350310&view=diff
==
--- 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/NutchFileSystem.java 
(original)
+++ 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/NutchFileSystem.java 
Thu Dec  1 11:59:24 2005
@@ -24,16 +24,21 @@
 import org.apache.nutch.util.*;
 
 /
- * NutchFileSystem is an interface for a fairly simple
- * distributed file system.  A Nutch installation might consist
+ * An abstract base class for a fairly simple
+ * distributed file system.
+ * A Nutch installation might consist
  * of multiple machines, which should swap files transparently.
  * This interface allows other Nutch systems to find and place
  * files into the distributed Nutch-controlled file world.
- *
+ * 
+ * A local implementation exists for testing and for small Nutch instances.
+ * 
  * The standard job of NutchFileSystem is to take the location-
  * independent NutchFile objects, and resolve them using local
  * knowledge and local instances of ShareGroup.
- * 
+ * 
+ * The local implementation is {@link LocalFileSystem} and distributed
+ * implementation is {@link NDFSFileSystem}.
  * @author Mike Cafarella
  ***

svn commit: r350294 - in /lucene/nutch/branches/mapred: build.xml default.properties src/plugin/build-plugin.xml

2005-12-01 Thread cutting
Author: cutting
Date: Thu Dec  1 10:24:07 2005
New Revision: 350294

URL: http://svn.apache.org/viewcvs?rev=350294&view=rev
Log:
Always specify java source & target versions to javac.  From Michael Stack.

Modified:
lucene/nutch/branches/mapred/build.xml
lucene/nutch/branches/mapred/default.properties
lucene/nutch/branches/mapred/src/plugin/build-plugin.xml

Modified: lucene/nutch/branches/mapred/build.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/build.xml?rev=350294&r1=350293&r2=350294&view=diff
==
--- lucene/nutch/branches/mapred/build.xml (original)
+++ lucene/nutch/branches/mapred/build.xml Thu Dec  1 10:24:07 2005
@@ -70,9 +70,11 @@
  srcdir="${src.dir}"
  includes="org/apache/nutch/**/*.java"
  destdir="${build.classes}"
- debug="${debug}"
- optimize="${optimize}"
- deprecation="${deprecation}">
+ debug="${javac.debug}"
+ optimize="${javac.optimize}"
+ target="${javac.version}"
+ source="${javac.version}"
+ deprecation="${javac.deprecation}">
   
 
   
@@ -154,8 +156,11 @@
  srcdir="${test.src.dir}"
  includes="org/apache/nutch/**/*.java"
  destdir="${test.build.classes}"
- debug="${debug}"
- deprecation="${deprecation}">
+ debug="${javac.debug}"
+ optimize="${javac.optimize}"
+ target="${javac.version}"
+ source="${javac.version}"
+ deprecation="${javac.deprecation}">
   
 
   

Modified: lucene/nutch/branches/mapred/default.properties
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/default.properties?rev=350294&r1=350293&r2=350294&view=diff
==
--- lucene/nutch/branches/mapred/default.properties (original)
+++ lucene/nutch/branches/mapred/default.properties Thu Dec  1 10:24:07 2005
@@ -38,9 +38,10 @@
 
 dist.dir=${build.dir}/${final.name}
 
-debug=on
-optimize=on
-deprecation=off
+javac.debug=on
+javac.optimize=on
+javac.deprecation=off
+javac.version= 1.4
 
 plugin.http=org.apache.nutch.protocol.http*
 plugin.httpclient=org.apache.nutch.protocol.httpclient*

Modified: lucene/nutch/branches/mapred/src/plugin/build-plugin.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/build-plugin.xml?rev=350294&r1=350293&r2=350294&view=diff
==
--- lucene/nutch/branches/mapred/src/plugin/build-plugin.xml (original)
+++ lucene/nutch/branches/mapred/src/plugin/build-plugin.xml Thu Dec  1 
10:24:07 2005
@@ -7,8 +7,7 @@
   
   
 
-  
-  
+  
   
   
 
@@ -27,13 +26,8 @@
 
   
 
-  
-  
-
-  http://java.sun.com/j2se/1.4/docs/api/"/>
-
-  
+  
+  
 
   
 
@@ -85,6 +79,9 @@
  includes="**/*.java"
  destdir="${build.classes}"
  debug="${javac.debug}"
+ optimize="${javac.optimize}"
+ target="${javac.version}"
+ source="${javac.version}"
  deprecation="${javac.deprecation}">
   
 
@@ -126,7 +123,11 @@
  srcdir="${src.test}"
  includes="**/*.java"
  destdir="${build.test}"
- debug="${debug}">
+ debug="${javac.debug}"
+ optimize="${javac.optimize}"
+ target="${javac.version}"
+ source="${javac.version}"
+ deprecation="${javac.deprecation}">
   
 
   




svn commit: r348533 - /lucene/nutch/branches/mapred/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java

2005-11-23 Thread cutting
Author: cutting
Date: Wed Nov 23 11:55:11 2005
New Revision: 348533

URL: http://svn.apache.org/viewcvs?rev=348533&view=rev
Log:
Fix to not extract urls whose method=post.

Modified:

lucene/nutch/branches/mapred/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java

Modified: 
lucene/nutch/branches/mapred/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java?rev=348533&r1=348532&r2=348533&view=diff
==
--- 
lucene/nutch/branches/mapred/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
 (original)
+++ 
lucene/nutch/branches/mapred/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
 Wed Nov 23 11:55:11 2005
@@ -296,10 +296,7 @@
 if (node.getNodeType() == Node.ELEMENT_NODE) {
   LinkParams params = 
(LinkParams)linkParams.get(node.getNodeName().toLowerCase());
   if (params != null) {
-if (shouldThrowAwayLink(node, children, childLen, params)) {
-  // this has no inner structure or just a single nested
-  // anchor-- toss it!
-} else {
+if (!shouldThrowAwayLink(node, children, childLen, params)) {
 
   StringBuffer linkText = new StringBuffer();
   getText(linkText, node, true);
@@ -307,20 +304,21 @@
   NamedNodeMap attrs = node.getAttributes();
   String target = null;
   boolean noFollow = false;
+  boolean post = false;
   for (int i= 0; i < attrs.getLength(); i++ ) {
 Node attr = attrs.item(i);
 String attrName = attr.getNodeName();
-
-if ("rel".equalsIgnoreCase(attrName) &&
-"nofollow".equalsIgnoreCase(attr.getNodeValue())) {
-  noFollow = true;
-}
-
 if (params.attrName.equalsIgnoreCase(attrName)) {
   target = attr.getNodeValue();
+} else if ("rel".equalsIgnoreCase(attrName) &&
+   "nofollow".equalsIgnoreCase(attr.getNodeValue())) {
+  noFollow = true;
+} else if ("method".equalsIgnoreCase(attrName) &&
+   "post".equalsIgnoreCase(attr.getNodeValue())) {
+  post = true;
 }
   }
-  if (target != null && !noFollow)
+  if (target != null && !noFollow && !post)
 try {
   URL url = new URL(base, target);
   outlinks.add(new Outlink(url.toString(),




svn commit: r348531 - /lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java

2005-11-23 Thread cutting
Author: cutting
Date: Wed Nov 23 11:46:05 2005
New Revision: 348531

URL: http://svn.apache.org/viewcvs?rev=348531&view=rev
Log:
Fix to increment retry count.

Modified:
lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java

Modified: 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java?rev=348531&r1=348530&r2=348531&view=diff
==
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java 
(original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java 
Wed Nov 23 11:46:05 2005
@@ -140,6 +140,7 @@
   case ProtocolStatus.EXCEPTION:
 logError(url, status.getMessage());
   case ProtocolStatus.RETRY:  // retry
+datum.setRetriesSinceFetch(datum.getRetriesSinceFetch()+1);
 output(key, datum, null, CrawlDatum.STATUS_FETCH_RETRY);
 break;
 




svn commit: r348284 - /lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/TaskTracker.java

2005-11-22 Thread cutting
Author: cutting
Date: Tue Nov 22 15:15:45 2005
New Revision: 348284

URL: http://svn.apache.org/viewcvs?rev=348284&view=rev
Log:
Fix illegal iterator access bug.

Modified:

lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/TaskTracker.java

Modified: 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/TaskTracker.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/TaskTracker.java?rev=348284&r1=348283&r2=348284&view=diff
==
--- 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/TaskTracker.java 
(original)
+++ 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/TaskTracker.java 
Tue Nov 22 15:15:45 2005
@@ -142,8 +142,8 @@
  */
 public synchronized void close() throws IOException {
 // Kill running tasks
-for (Iterator it = tasks.values().iterator(); it.hasNext(); ) {
-TaskInProgress tip = (TaskInProgress) it.next();
+while (tasks.size() > 0) {
+TaskInProgress tip = (TaskInProgress)tasks.get(tasks.firstKey());
 tip.jobHasFinished();
 }
 




svn commit: r348212 - in /lucene/nutch/branches/mapred/conf: crawl-tool.xml nutch-default.xml

2005-11-22 Thread cutting
Author: cutting
Date: Tue Nov 22 10:55:26 2005
New Revision: 348212

URL: http://svn.apache.org/viewcvs?rev=348212&view=rev
Log:
Increase defaults for http.max.delays, since, with MapReduce's partitioning of 
fetchlists, delays are more likely.

Modified:
lucene/nutch/branches/mapred/conf/crawl-tool.xml
lucene/nutch/branches/mapred/conf/nutch-default.xml

Modified: lucene/nutch/branches/mapred/conf/crawl-tool.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/conf/crawl-tool.xml?rev=348212&r1=348211&r2=348212&view=diff
==
--- lucene/nutch/branches/mapred/conf/crawl-tool.xml (original)
+++ lucene/nutch/branches/mapred/conf/crawl-tool.xml Tue Nov 22 10:55:26 2005
@@ -33,7 +33,7 @@
 
 
   http.max.delays
-  100
+  1000
   The number of times a thread will delay when trying to
   fetch a page.  When using the crawl tool there are likely to be very
   few different hosts, so we need to be willing to wait longer for

Modified: lucene/nutch/branches/mapred/conf/nutch-default.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/conf/nutch-default.xml?rev=348212&r1=348211&r2=348212&view=diff
==
--- lucene/nutch/branches/mapred/conf/nutch-default.xml (original)
+++ lucene/nutch/branches/mapred/conf/nutch-default.xml Tue Nov 22 10:55:26 2005
@@ -69,7 +69,7 @@
 
 
   http.max.delays
-  3
+  100
   The number of times a thread will delay when trying to
   fetch a page.  Each time it finds that a host is busy, it will wait
   fetcher.server.delay.  After http.max.delays attepts, it will give




svn commit: r348210 - /lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/NFSDataInputStream.java

2005-11-22 Thread cutting
Author: cutting
Date: Tue Nov 22 10:46:43 2005
New Revision: 348210

URL: http://svn.apache.org/viewcvs?rev=348210&view=rev
Log:
Silently ignore missing checksum files.

Modified:

lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/NFSDataInputStream.java

Modified: 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/NFSDataInputStream.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/NFSDataInputStream.java?rev=348210&r1=348209&r2=348210&view=diff
==
--- 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/NFSDataInputStream.java
 (original)
+++ 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/NFSDataInputStream.java
 Tue Nov 22 10:46:43 2005
@@ -54,7 +54,9 @@
 if (!Arrays.equals(version, VERSION))
   throw new IOException("Not a checksum file: "+sumFile);
 bytesPerSum = sums.readInt();
-  } catch (IOException e) {
+  } catch (FileNotFoundException e) { // quietly ignore
+stopSumming();
+  } catch (IOException e) {   // loudly ignore
 LOG.warning("Problem opening checksum file: "+e+". Ignoring.");
 stopSumming();
   }




svn commit: r344403 - in /lucene/nutch/branches/mapred: conf/nutch-default.xml src/java/org/apache/nutch/fs/ChecksumException.java src/java/org/apache/nutch/fs/NFSDataInputStream.java src/java/org/apa

2005-11-15 Thread cutting
Author: cutting
Date: Tue Nov 15 10:00:14 2005
New Revision: 344403

URL: http://svn.apache.org/viewcvs?rev=344403&view=rev
Log:
Add ability to skip over data with bad checksums.

Added:

lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/ChecksumException.java
Modified:
lucene/nutch/branches/mapred/conf/nutch-default.xml

lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/NFSDataInputStream.java
lucene/nutch/branches/mapred/src/java/org/apache/nutch/io/SequenceFile.java

Modified: lucene/nutch/branches/mapred/conf/nutch-default.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/conf/nutch-default.xml?rev=344403&r1=344402&r2=344403&view=diff
==
--- lucene/nutch/branches/mapred/conf/nutch-default.xml (original)
+++ lucene/nutch/branches/mapred/conf/nutch-default.xml Tue Nov 15 10:00:14 2005
@@ -339,6 +339,21 @@
   buffered during read and write operations.
 
   
+
+  io.bytes.per.checksum
+  512
+  The number of bytes per checksum.  Must not be larger than
+  io.file.buffer.size.
+
+
+
+  io.skip.checksum.errors
+  false
+  If true, when a checksum error is encountered while
+  reading a sequence file, entries are skipped, instead of throwing an
+  exception.
+
+  
 
 
 

Added: 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/ChecksumException.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/ChecksumException.java?rev=344403&view=auto
==
--- 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/ChecksumException.java
 (added)
+++ 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/ChecksumException.java
 Tue Nov 15 10:00:14 2005
@@ -0,0 +1,26 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.fs;
+
+import java.io.IOException;
+
+/** Thrown for checksum errors. */
+public class ChecksumException extends IOException {
+  public ChecksumException(String description) {
+super(description);
+  }
+}

Modified: 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/NFSDataInputStream.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/NFSDataInputStream.java?rev=344403&r1=344402&r2=344403&view=diff
==
--- 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/NFSDataInputStream.java
 (original)
+++ 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/NFSDataInputStream.java
 Tue Nov 15 10:00:14 2005
@@ -109,13 +109,15 @@
 stopSumming();
 return;
   }
-  if (crc != (int)sum.getValue()) {
-fs.reportChecksumFailure(file, (NFSInputStream)in,
- getPos()-delta, bytesPerSum, crc);
-throw new IOException("Checksum error: "+file);
-  }
+  int sumValue = (int)sum.getValue();
   sum.reset();
   inSum = 0;
+  if (crc != sumValue) {
+long pos = getPos() - delta;
+fs.reportChecksumFailure(file, (NFSInputStream)in,
+ pos, bytesPerSum, crc);
+throw new ChecksumException("Checksum error: "+file+" at "+pos);
+  }
 }
 
 public long getPos() throws IOException {

Modified: 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/io/SequenceFile.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/io/SequenceFile.java?rev=344403&r1=344402&r2=344403&view=diff
==
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/io/SequenceFile.java 
(original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/io/SequenceFile.java 
Tue Nov 15 10:00:14 2005
@@ -359,23 +359,39 @@
   if (in.getPos() >= end)
 return -1;
 
-  int length = in.readInt();
+  try {
+int length = in.readInt();
 
-  if (version[3] > 1 && sync != null &&
-  length == SYNC_ESCAPE) {// process a sync entry
-//LOG.info("sync@"+in.getPos());
-in.readFully(syncCheck);  // r

svn commit: r332371 - /lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Generator.java

2005-11-10 Thread cutting
Author: cutting
Date: Thu Nov 10 13:03:16 2005
New Revision: 332371

URL: http://svn.apache.org/viewcvs?rev=332371&view=rev
Log:
Fix to not increment count of urls when urls are filtered by
maxPerHost limit.  Patch contributed by Rod Taylor.

Modified:
lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Generator.java

Modified: 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Generator.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Generator.java?rev=332371&r1=332370&r2=332371&view=diff
==
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Generator.java 
(original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Generator.java 
Thu Nov 10 13:03:16 2005
@@ -76,23 +76,27 @@
OutputCollector output, Reporter reporter)
   throws IOException {
 
-  while (values.hasNext() && ++count < limit) {
+  while (values.hasNext() && count < limit) {
 
 UTF8 url = (UTF8)values.next();
 
-if (maxPerHost > 0) {   // are we counting hosts?
+if (maxPerHost > 0) { // are we counting hosts?
   String host = new URL(url.toString()).getHost();
-  Integer count = (Integer)hostCounts.get(host);
-  if (count != null) {
-if (count.intValue() >= maxPerHost)
+  Integer hostCount = (Integer)hostCounts.get(host);
+  if (hostCount != null) {
+if (hostCount.intValue() >= maxPerHost)
   continue;   // too many from host
-hostCounts.put(host, new Integer(count.intValue()+1));
+hostCounts.put(host, new Integer(hostCount.intValue()+1));
   } else {// update host count
 hostCounts.put(host, new Integer(1));
   }
 }
 
 output.collect(key, url);
+
+// Count is incremented only when we keep the URL
+// maxPerHost may cause us to skip it.
+count++;
   }
 
 }




svn commit: r332089 - /lucene/nutch/branches/mapred/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/RobotRulesParser.java

2005-11-09 Thread cutting
Author: cutting
Date: Wed Nov  9 09:46:16 2005
New Revision: 332089

URL: http://svn.apache.org/viewcvs?rev=332089&view=rev
Log:
Fix to follow redirects to robots.txt

Modified:

lucene/nutch/branches/mapred/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/RobotRulesParser.java

Modified: 
lucene/nutch/branches/mapred/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/RobotRulesParser.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/RobotRulesParser.java?rev=332089&r1=332088&r2=332089&view=diff
==
--- 
lucene/nutch/branches/mapred/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/RobotRulesParser.java
 (original)
+++ 
lucene/nutch/branches/mapred/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/RobotRulesParser.java
 Wed Nov  9 09:46:16 2005
@@ -53,6 +53,9 @@
   private static final boolean ALLOW_FORBIDDEN =
 NutchConf.get().getBoolean("http.robots.403.allow", false);
 
+  private static final int MAX_REDIRECTS =
+NutchConf.get().getInt("http.redirect.max", 3);
+
   private static final String[] AGENTS = getAgents();
   private static final Hashtable CACHE = new Hashtable();
   
@@ -377,16 +380,30 @@
 RobotRuleSet robotRules = (RobotRuleSet)CACHE.get(host);
 
 if (robotRules == null) { // cache miss
-  HttpResponse response = new HttpResponse(new URL(url, "/robots.txt"));
-
-  if (response.getCode() == 200)   // found rules: parse them
-robotRules = new RobotRulesParser().parseRules(response.getContent());
-  else if ( (response.getCode() == 403) && (!ALLOW_FORBIDDEN) )
-robotRules = FORBID_ALL_RULES;// use forbid all
-  else
-robotRules = EMPTY_RULES; // use default rules
+  int redirects = 0;
+  do {
+HttpResponse response = new HttpResponse(new URL(url, "/robots.txt"));
+
+int code = response.getCode();
+
+if (code == 200) {// found rules: parse them
+  robotRules = new 
RobotRulesParser().parseRules(response.getContent());
+} else if ( (code == 403) && (!ALLOW_FORBIDDEN) ) {
+  robotRules = FORBID_ALL_RULES;  // use forbid all
+} else if (code >= 300 && code < 400) {   // handle redirect
+  if (redirects == MAX_REDIRECTS) {
+robotRules = EMPTY_RULES;
+  } else {
+url = new URL(url, response.getHeader("Location"));
+LOG.fine("redirect to " + url); 
+redirects++;
+  }
+} else {
+  robotRules = EMPTY_RULES; // use default rules
+}
+  } while (robotRules == null);
 
-  CACHE.put(host, robotRules);// cache rules for host
+  CACHE.put(host, robotRules);  // cache rules for host
 }
 
 String path = url.getPath();  // check rules




svn commit: r332088 - /lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java

2005-11-09 Thread cutting
Author: cutting
Date: Wed Nov  9 09:45:18 2005
New Revision: 332088

URL: http://svn.apache.org/viewcvs?rev=332088&view=rev
Log:
Use a more informative thread name.

Modified:
lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java

Modified: 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java?rev=332088&r1=332087&r2=332088&view=diff
==
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java 
(original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java 
Wed Nov  9 09:45:18 2005
@@ -74,6 +74,7 @@
   private class FetcherThread extends Thread {
 public FetcherThread() {
   this.setDaemon(true);   // don't hang JVM on exit
+  this.setName("FetcherThread");  // use an informative name
 }
 
 public void run() {




svn commit: r331858 - in /lucene/nutch/branches/mapred: conf/nutch-default.xml src/java/org/apache/nutch/crawl/Generator.java src/java/org/apache/nutch/crawl/Injector.java

2005-11-08 Thread cutting
Author: cutting
Date: Tue Nov  8 10:25:11 2005
New Revision: 331858

URL: http://svn.apache.org/viewcvs?rev=331858&view=rev
Log:
Use absolute paths for temporary crawl files.

Modified:
lucene/nutch/branches/mapred/conf/nutch-default.xml
lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Generator.java
lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Injector.java

Modified: lucene/nutch/branches/mapred/conf/nutch-default.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/conf/nutch-default.xml?rev=331858&r1=331857&r2=331858&view=diff
==
--- lucene/nutch/branches/mapred/conf/nutch-default.xml (original)
+++ lucene/nutch/branches/mapred/conf/nutch-default.xml Tue Nov  8 10:25:11 2005
@@ -387,7 +387,14 @@
 
   mapred.system.dir
   /tmp/nutch/mapred/system
-  The local directory where MapReduce stores control files.
+  The shared directory where MapReduce stores control files.
+  
+
+
+
+  mapred.temp.dir
+  /tmp/nutch/mapred/temp
+  A shared directory for temporary files.
   
 
 

Modified: 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Generator.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Generator.java?rev=331858&r1=331857&r2=331858&view=diff
==
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Generator.java 
(original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Generator.java 
Tue Nov  8 10:25:11 2005
@@ -155,7 +155,8 @@
 throws IOException {
 
 File tempDir =
-  new File("generate-temp-"+
+  new File(NutchConf.get().get("mapred.temp.dir", ".") +
+   "/generate-temp-"+
Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
 
 File segment = new File(segments, getDate());

Modified: 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Injector.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Injector.java?rev=331858&r1=331857&r2=331858&view=diff
==
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Injector.java 
(original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Injector.java 
Tue Nov  8 10:25:11 2005
@@ -84,7 +84,8 @@
 LOG.info("Injector: urlDir: " + urlDir);
 
 File tempDir =
-  new File("inject-temp-"+
+  new File(NutchConf.get().get("mapred.temp.dir", ".") +
+   "/inject-temp-"+
Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
 
 // map text input file to a  file




svn commit: r331585 - /lucene/nutch/branches/mapred/conf/nutch-default.xml

2005-11-07 Thread cutting
Author: cutting
Date: Mon Nov  7 11:29:37 2005
New Revision: 331585

URL: http://svn.apache.org/viewcvs?rev=331585&view=rev
Log:
Document job tracker's administrative web UI port.

Modified:
lucene/nutch/branches/mapred/conf/nutch-default.xml

Modified: lucene/nutch/branches/mapred/conf/nutch-default.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/conf/nutch-default.xml?rev=331585&r1=331584&r2=331585&view=diff
==
--- lucene/nutch/branches/mapred/conf/nutch-default.xml (original)
+++ lucene/nutch/branches/mapred/conf/nutch-default.xml Mon Nov  7 11:29:37 2005
@@ -426,6 +426,14 @@
   
 
 
+
+  mapred.job.tracker.info.port
+  7845
+  The port number where the jobtracker runs its
+  administrative web interface.
+  
+
+
 
 
 




svn commit: r331556 - /lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java

2005-11-07 Thread cutting
Author: cutting
Date: Mon Nov  7 09:55:59 2005
New Revision: 331556

URL: http://svn.apache.org/viewcvs?rev=331556&view=rev
Log:
Fix to only try to parse successful fetches.  Also, log number of threads in
task process, not in controller, as this may be overridden by nutch-site.xml.

Modified:
lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java

Modified: 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java?rev=331556&r1=331555&r2=331556&view=diff
==
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java 
(original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java 
Mon Nov  7 09:55:59 2005
@@ -202,7 +202,7 @@
 (SCORE_KEY, Float.toString(datum.getScore()));
 
   Parse parse = null;
-  if (parsing) {
+  if (parsing && status == CrawlDatum.STATUS_FETCH_SUCCESS) {
 ParseStatus parseStatus;
 try {
   parse = ParseUtil.parse(content);
@@ -280,6 +280,8 @@
 this.maxRedirect = getConf().getInt("http.redirect.max", 3);
 
 int threadCount = getConf().getInt("fetcher.threads.fetch", 10);
+LOG.info("Fetcher: threads: " + threadCount);
+
 for (int i = 0; i < threadCount; i++) {   // spawn threads
   new FetcherThread().start();
 }
@@ -311,8 +313,6 @@
 
 LOG.info("Fetcher: starting");
 LOG.info("Fetcher: segment: " + segment);
-LOG.info("Fetcher: threads: " + threads);
-
 
 JobConf job = new JobConf(getConf());
 




svn commit: r331555 - in /lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient: HttpResponse.java RobotRulesParser.java

2005-11-07 Thread cutting
Author: cutting
Date: Mon Nov  7 09:53:54 2005
New Revision: 331555

URL: http://svn.apache.org/viewcvs?rev=331555&view=rev
Log:
NUTCH-124: Follow redirects when fetching robots.txt.

Modified:

lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java

lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/RobotRulesParser.java

Modified: 
lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java?rev=331555&r1=331554&r2=331555&view=diff
==
--- 
lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java
 (original)
+++ 
lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java
 Mon Nov  7 09:53:54 2005
@@ -82,10 +82,14 @@
   }
 
   public HttpResponse(URL url) throws IOException {
+this(url, false);
+  }
+
+  HttpResponse(URL url, boolean followRedirects) throws IOException {
 this.base = url.toString();
 this.orig = url.toString();
 GetMethod get = new GetMethod(this.orig);
-get.setFollowRedirects(false);
+get.setFollowRedirects(followRedirects);
 get.setRequestHeader("User-Agent", Http.AGENT_STRING);
 HttpMethodParams params = get.getParams();
 // some servers cannot digest the new protocol

Modified: 
lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/RobotRulesParser.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/RobotRulesParser.java?rev=331555&r1=331554&r2=331555&view=diff
==
--- 
lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/RobotRulesParser.java
 (original)
+++ 
lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/RobotRulesParser.java
 Mon Nov  7 09:53:54 2005
@@ -379,7 +379,8 @@
 if (robotRules == null) { // cache miss
   LOG.fine("cache miss " + url);
   try {
-HttpResponse response = new HttpResponse(new URL(url, "/robots.txt"));
+HttpResponse response = new HttpResponse(new URL(url, "/robots.txt"),
+ true);
 
 if (response.getCode() == 200)   // found rules: parse them
   robotRules = new 
RobotRulesParser().parseRules(response.getContent());




svn commit: r330641 - /lucene/nutch/branches/mapred/src/test/org/apache/nutch/fs/TestNutchFileSystem.java

2005-11-03 Thread cutting
Author: cutting
Date: Thu Nov  3 13:19:11 2005
New Revision: 330641

URL: http://svn.apache.org/viewcvs?rev=330641&view=rev
Log:
Fix to correctly handle zero-length files.

Modified:

lucene/nutch/branches/mapred/src/test/org/apache/nutch/fs/TestNutchFileSystem.java

Modified: 
lucene/nutch/branches/mapred/src/test/org/apache/nutch/fs/TestNutchFileSystem.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/test/org/apache/nutch/fs/TestNutchFileSystem.java?rev=330641&r1=330640&r2=330641&view=diff
==
--- 
lucene/nutch/branches/mapred/src/test/org/apache/nutch/fs/TestNutchFileSystem.java
 (original)
+++ 
lucene/nutch/branches/mapred/src/test/org/apache/nutch/fs/TestNutchFileSystem.java
 Thu Nov  3 13:19:11 2005
@@ -305,6 +305,8 @@
   long size = ((LongWritable)value).get();
   long seed = Long.parseLong(name);
 
+  if (size == 0) return;
+
   reporter.setStatus("opening " + name);
 
   NFSDataInputStream in = fs.open(new File(DATA_DIR, name));




svn commit: r330640 - /lucene/nutch/branches/mapred/src/java/org/apache/nutch/ndfs/DataNode.java

2005-11-03 Thread cutting
Author: cutting
Date: Thu Nov  3 13:18:05 2005
New Revision: 330640

URL: http://svn.apache.org/viewcvs?rev=330640&view=rev
Log:
Fix a buggy cast when files are longer than Integer.MAX_VALUE, and
improve some diagnostics.

Modified:
lucene/nutch/branches/mapred/src/java/org/apache/nutch/ndfs/DataNode.java

Modified: 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/ndfs/DataNode.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/ndfs/DataNode.java?rev=330640&r1=330639&r2=330640&view=diff
==
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/ndfs/DataNode.java 
(original)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/ndfs/DataNode.java 
Thu Nov  3 13:18:05 2005
@@ -366,8 +366,11 @@
 
 while (anotherChunk) {
 while (len > 0) {
-int bytesRead = in.read(buf, 0, 
Math.min(buf.length, (int) len));
-if (bytesRead >= 0) {
+int bytesRead = in.read(buf, 0, 
(int)Math.min(buf.length, len));
+if (bytesRead < 0) {
+  throw new EOFException("EOF 
reading from "+s.toString());
+}
+if (bytesRead > 0) {
 try {
 out.write(buf, 0, 
bytesRead);
 } catch (IOException iex) {
@@ -393,8 +396,8 @@
 }
 }
 }
+len -= bytesRead;
 }
-len -= bytesRead;
 }
 
 if (encodingType == 
RUNLENGTH_ENCODING) {
@@ -556,7 +559,7 @@
 in.close();
 }
 } catch (IOException ie) {
-ie.printStackTrace();
+  LOG.log(Level.WARNING, "DataXCeiver", ie);
 } finally {
 try {
 s.close();




svn commit: r330638 - /lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/JobTracker.java

2005-11-03 Thread cutting
Author: cutting
Date: Thu Nov  3 13:16:28 2005
New Revision: 330638

URL: http://svn.apache.org/viewcvs?rev=330638&view=rev
Log:
Keep trying to restart job tracker.

Modified:

lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/JobTracker.java

Modified: 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/JobTracker.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/JobTracker.java?rev=330638&r1=330637&r2=330638&view=diff
==
--- 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/JobTracker.java 
(original)
+++ 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/JobTracker.java 
Thu Nov  3 13:16:28 2005
@@ -40,10 +40,21 @@
 public static void startTracker(NutchConf conf) throws IOException {
   if (tracker != null)
 throw new IOException("JobTracker already running.");
-  tracker = new JobTracker(conf);
+  while (true) {
+try {
+  tracker = new JobTracker(conf);
+  break;
+} catch (IOException e) {
+  LOG.log(Level.WARNING, "Starting tracker", e);
+}
+try {
+  Thread.sleep(1000);
+} catch (InterruptedException e) {
+}
+  }
   tracker.offerService();
-
 }
+
 public static JobTracker getTracker() {
 return tracker;
 }




svn commit: r330636 - /lucene/nutch/branches/mapred/bin/slaves.sh

2005-11-03 Thread cutting
Author: cutting
Date: Thu Nov  3 13:15:11 2005
New Revision: 330636

URL: http://svn.apache.org/viewcvs?rev=330636&view=rev
Log:
Don't rely on ssh SendEnv.  Instead folks can use
$HOME/.ssh/environment to pass environment variables to slaves.  Note
that sshd must be configured on slaves with PermitUserEnvironment
enabled for this to operate.

Modified:
lucene/nutch/branches/mapred/bin/slaves.sh

Modified: lucene/nutch/branches/mapred/bin/slaves.sh
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/bin/slaves.sh?rev=330636&r1=330635&r2=330636&view=diff
==
--- lucene/nutch/branches/mapred/bin/slaves.sh (original)
+++ lucene/nutch/branches/mapred/bin/slaves.sh Thu Nov  3 13:15:11 2005
@@ -20,7 +20,7 @@
 fi
 
 for slave in `cat $NUTCH_SLAVES`; do
- ssh -o ConnectTimeout=1 -o SendEnv='NUTCH* JAVA*' $slave "$@" \
+ ssh -o ConnectTimeout=1 $slave "$@" \
2>&1 | sed "s/^/$slave: /" &
 done
 




svn commit: r328414 - /lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/ParseOutputFormat.java

2005-10-25 Thread cutting
Author: cutting
Date: Tue Oct 25 09:57:51 2005
New Revision: 328414

URL: http://svn.apache.org/viewcvs?rev=328414&view=rev
Log:
Fix a type error for JDK 1.4.

Modified:

lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/ParseOutputFormat.java

Modified: 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/ParseOutputFormat.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/ParseOutputFormat.java?rev=328414&r1=328413&r2=328414&view=diff
==
--- 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/ParseOutputFormat.java
 (original)
+++ 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/ParseOutputFormat.java
 Tue Oct 25 09:57:51 2005
@@ -66,7 +66,8 @@
   Outlink[] links = parse.getData().getOutlinks();
 
   // compute OPIC score contribution
-  float score = Float.valueOf(parse.getData().get(Fetcher.SCORE_KEY));
+  float score =
+Float.parseFloat(parse.getData().get(Fetcher.SCORE_KEY));
   score /= links.length;
   
   for (int i = 0; i < links.length; i++) {




svn commit: r327593 - /lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/TaskRunner.java

2005-10-21 Thread cutting
Author: cutting
Date: Fri Oct 21 15:07:00 2005
New Revision: 327593

URL: http://svn.apache.org/viewcvs?rev=327593&view=rev
Log:
Always create workdir so child can connect to it.

Modified:

lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/TaskRunner.java

Modified: 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/TaskRunner.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/TaskRunner.java?rev=327593&r1=327592&r2=327593&view=diff
==
--- 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/TaskRunner.java 
(original)
+++ 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/TaskRunner.java 
Fri Oct 21 15:07:00 2005
@@ -63,6 +63,7 @@
 
   String sep = System.getProperty("path.separator");
   File workDir = new File(new File(t.getJobFile()).getParent(), "work");
+  workDir.mkdirs();

   StringBuffer classPath = new StringBuffer();
   // start with same classpath as parent process
@@ -72,7 +73,6 @@
   JobConf job = new JobConf(t.getJobFile());
   String jar = job.getJar();
   if (jar != null) {  // if jar exists, it into workDir
-workDir.mkdirs();
 runChild(new String[] { "unzip", jar}, workDir);
 File[] libs = new File(workDir, "lib").listFiles();
 for (int i = 0; i < libs.length; i++) {




svn commit: r327581 - in /lucene/nutch/branches/mapred/src/plugin/parse-html/src: java/org/apache/nutch/parse/html/DOMContentUtils.java test/org/apache/nutch/parse/html/TestDOMContentUtils.java

2005-10-21 Thread cutting
Author: cutting
Date: Fri Oct 21 14:04:54 2005
New Revision: 327581

URL: http://svn.apache.org/viewcvs?rev=327581&view=rev
Log:
Ignore rel=nofollow links.

Modified:

lucene/nutch/branches/mapred/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java

lucene/nutch/branches/mapred/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java

Modified: 
lucene/nutch/branches/mapred/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java?rev=327581&r1=327580&r2=327581&view=diff
==
--- 
lucene/nutch/branches/mapred/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
 (original)
+++ 
lucene/nutch/branches/mapred/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
 Fri Oct 21 14:04:54 2005
@@ -306,13 +306,21 @@
 
   NamedNodeMap attrs = node.getAttributes();
   String target = null;
+  boolean noFollow = false;
   for (int i= 0; i < attrs.getLength(); i++ ) {
-if (params.attrName.equalsIgnoreCase(attrs.item(i).getNodeName())) 
{
-  target = attrs.item(i).getNodeValue();
-  break;
+Node attr = attrs.item(i);
+String attrName = attr.getNodeName();
+
+if ("rel".equalsIgnoreCase(attrName) &&
+"nofollow".equalsIgnoreCase(attr.getNodeValue())) {
+  noFollow = true;
+}
+
+if (params.attrName.equalsIgnoreCase(attrName)) {
+  target = attr.getNodeValue();
 }
   }
-  if (target != null)
+  if (target != null && !noFollow)
 try {
   URL url = new URL(base, target);
   outlinks.add(new Outlink(url.toString(),

Modified: 
lucene/nutch/branches/mapred/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java?rev=327581&r1=327580&r2=327581&view=diff
==
--- 
lucene/nutch/branches/mapred/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java
 (original)
+++ 
lucene/nutch/branches/mapred/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java
 Fri Oct 21 14:04:54 2005
@@ -113,6 +113,12 @@
+ "End\tthis\rmadness\n!\r\n"
+ " ... ."
+ "  "),
+
+// test that  links are not returned
+new String(""
+   + "http://www.nutch.org\"; rel=\"nofollow\"> ignore 
"
+   + "http://www.nutch.org\";> ignore 
"
+   + ""),
   };
 
   private static String[] testBaseHrefs= {
@@ -123,6 +129,7 @@
 "http://www.nutch.org/frames/";, 
 "http://www.nutch.org/maps/";,
 "http://www.nutch.org/whitespace/";,
+"http://www.nutch.org//";,
   };
   
   private static final DocumentFragment testDOMs[]=
@@ -145,6 +152,7 @@
 + "one two three space here space there no space "
 + "one two two three three four put some text here and there. "
 + "End this madness ! . . . .",
+"ignore ignore",
   };
 
   private static final String[] answerTitle= {
@@ -155,6 +163,7 @@
 "my title",
 "my title",
 "my title",
+"",
   };
 
   // note: should be in page-order
@@ -214,6 +223,8 @@
  {
  new Outlink("http://www.nutch.org/index.html";, "whitespace test"),
  },
+ {
+ }
   };

 } catch (MalformedURLException e) {




svn commit: r327573 - in /lucene/nutch/branches/mapred/src/java/org/apache/nutch: fs/FSError.java fs/LocalFileSystem.java mapred/LocalJobRunner.java mapred/TaskRunner.java mapred/TaskTracker.java mapred/TaskUmbilicalProtocol.java

2005-10-21 Thread cutting
Author: cutting
Date: Fri Oct 21 13:49:02 2005
New Revision: 327573

URL: http://svn.apache.org/viewcvs?rev=327573&view=rev
Log:
Exit the tasktracker when errors occur writing to the local disk, so that more
tasks will not fail on this host.  Also fix so that executables may be included
in job jar files, and so that a job's config can override static config
options, read with NutchConf.get().

Added:
lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/FSError.java
Modified:

lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/LocalFileSystem.java

lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/LocalJobRunner.java

lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/TaskRunner.java

lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/TaskTracker.java

lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/TaskUmbilicalProtocol.java

Added: lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/FSError.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/FSError.java?rev=327573&view=auto
==
--- lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/FSError.java 
(added)
+++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/FSError.java Fri 
Oct 21 13:49:02 2005
@@ -0,0 +1,25 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.fs;
+
+/** Thrown for unexpected filesystem errors, presumed to reflect disk errors
+ * in the native filesystem. */
+public class FSError extends Error {
+  FSError(Throwable cause) {
+super(cause);
+  }
+}

Modified: 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/LocalFileSystem.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/LocalFileSystem.java?rev=327573&r1=327572&r2=327573&view=diff
==
--- 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/LocalFileSystem.java 
(original)
+++ 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/LocalFileSystem.java 
Fri Oct 21 13:49:02 2005
@@ -78,11 +78,23 @@
 public int available() throws IOException { return fis.available(); }
 public void close() throws IOException { fis.close(); }
 public boolean markSupport() { return false; }
-public int read() throws IOException { return fis.read(); }
-public int read(byte[] b) throws IOException { return fis.read(b); }
+
+public int read() throws IOException {
+  try {
+return fis.read();
+  } catch (IOException e) {   // unexpected exception
+throw new FSError(e); // assume native fs error
+  }
+}
+
 public int read(byte[] b, int off, int len) throws IOException {
+  try {
 return fis.read(b, off, len);
+  } catch (IOException e) {   // unexpected exception
+throw new FSError(e); // assume native fs error
+  }
 }
+
 public long skip(long n) throws IOException { return fis.skip(n); }
 }
 
@@ -115,11 +127,21 @@
*/
   public void close() throws IOException { fos.close(); }
   public void flush() throws IOException { fos.flush(); }
-  public void write(byte[] b) throws IOException { fos.write(b); }
+
   public void write(byte[] b, int off, int len) throws IOException {
-fos.write(b, off, len);
+try {
+  fos.write(b, off, len);
+} catch (IOException e) {   // unexpected exception
+  throw new FSError(e); // assume native fs error
+}
+  }
+  public void write(int b) throws IOException {
+try {
+  fos.write(b);
+} catch (IOException e) {   // unexpected exception
+  throw new FSError(e); // assume native fs error
+}
   }
-  public void write(int b) throws IOException { fos.write(b); }
 }
 
 public NFSOutputStream createRaw(File f, boolean overwrite)

Modified: 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/LocalJobRunner.java
URL: 
http://svn.apache.org/viewcvs/lucene/nu

  1   2   3   >