svn commit: r798304 [3/3] - in /lucene/nutch/logos: ./ character-hand-big.png character.eps nutch_logo.eps nutch_logo.png
Added: lucene/nutch/logos/nutch_logo.eps URL: http://svn.apache.org/viewvc/lucene/nutch/logos/nutch_logo.eps?rev=798304&view=auto == Binary file - no diff available. Propchange: lucene/nutch/logos/nutch_logo.eps -- svn:mime-type = application/octet-stream Added: lucene/nutch/logos/nutch_logo.png URL: http://svn.apache.org/viewvc/lucene/nutch/logos/nutch_logo.png?rev=798304&view=auto == Binary file - no diff available. Propchange: lucene/nutch/logos/nutch_logo.png -- svn:mime-type = application/octet-stream
svn commit: r798304 [1/3] - in /lucene/nutch/logos: ./ character-hand-big.png character.eps nutch_logo.eps nutch_logo.png
Author: cutting Date: Mon Jul 27 22:06:52 2009 New Revision: 798304 URL: http://svn.apache.org/viewvc?rev=798304&view=rev Log: Adding high-resolution original logo artwork. Added: lucene/nutch/logos/ lucene/nutch/logos/character-hand-big.png (with props) lucene/nutch/logos/character.eps lucene/nutch/logos/nutch_logo.eps (with props) lucene/nutch/logos/nutch_logo.png (with props) Added: lucene/nutch/logos/character-hand-big.png URL: http://svn.apache.org/viewvc/lucene/nutch/logos/character-hand-big.png?rev=798304&view=auto == Binary file - no diff available. Propchange: lucene/nutch/logos/character-hand-big.png -- svn:mime-type = application/octet-stream
svn commit: r503832 - /lucene/nutch/nightly/nightly.cron
Author: cutting Date: Mon Feb 5 11:22:43 2007 New Revision: 503832 URL: http://svn.apache.org/viewvc?view=rev&rev=503832 Log: Changed to a time with a potentially lower load. Modified: lucene/nutch/nightly/nightly.cron Modified: lucene/nutch/nightly/nightly.cron URL: http://svn.apache.org/viewvc/lucene/nutch/nightly/nightly.cron?view=diff&rev=503832&r1=503831&r2=503832 == --- lucene/nutch/nightly/nightly.cron (original) +++ lucene/nutch/nightly/nightly.cron Mon Feb 5 11:22:43 2007 @@ -1,4 +1,4 @@ # nightly crontab file # install with: crontab nightly.cron -# run seventeen minutes after midnight, every day -17 0 * * * $HOME/nutch-nightly/nightly.sh > $HOME/nutch-nightly/nightly.log 2>&1 +# run at 6:51, every day +51 6 * * * $HOME/nutch-nightly/nightly.sh > $HOME/nutch-nightly/nightly.log 2>&1
svn commit: r475926 - /lucene/nutch/nightly/nightly.sh
Author: cutting Date: Thu Nov 16 13:03:26 2006 New Revision: 475926 URL: http://svn.apache.org/viewvc?view=rev&rev=475926 Log: Update nightly build location. Modified: lucene/nutch/nightly/nightly.sh Modified: lucene/nutch/nightly/nightly.sh URL: http://svn.apache.org/viewvc/lucene/nutch/nightly/nightly.sh?view=diff&rev=475926&r1=475925&r2=475926 == --- lucene/nutch/nightly/nightly.sh (original) +++ lucene/nutch/nightly/nightly.sh Thu Nov 16 13:03:26 2006 @@ -5,7 +5,7 @@ TRUNK=http://svn.apache.org/repos/asf/lucene/nutch/trunk REL_SERVER=people.apache.org -REL_DIR=/www/people.apache.org/dist/lucene/nutch/nightly +REL_DIR=/www/people.apache.org/builds/lucene/nutch/nightly # create an empty build directory rm -rf /tmp/nutch-nightly
svn commit: r421185 - /lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java
Author: cutting Date: Wed Jul 12 01:16:37 2006 New Revision: 421185 URL: http://svn.apache.org/viewvc?rev=421185&view=rev Log: Patch a bug introduced by Hadoop 0.4.0, which requires specified input directories to exist. Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java?rev=421185&r1=421184&r2=421185&view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java Wed Jul 12 01:16:37 2006 @@ -65,7 +65,8 @@ if (LOG.isInfoEnabled()) { LOG.info("CrawlDb update: done"); } } - public static JobConf createJob(Configuration config, Path crawlDb) { + public static JobConf createJob(Configuration config, Path crawlDb) +throws IOException { Path newCrawlDb = new Path(crawlDb, Integer.toString(new Random().nextInt(Integer.MAX_VALUE))); @@ -73,7 +74,11 @@ JobConf job = new NutchJob(config); job.setJobName("crawldb " + crawlDb); -job.addInputPath(new Path(crawlDb, CrawlDatum.DB_DIR_NAME)); + +Path current = new Path(crawlDb, CrawlDatum.DB_DIR_NAME); +if (FileSystem.get(job).exists(current)) { + job.addInputPath(current); +} job.setInputFormat(SequenceFileInputFormat.class); job.setInputKeyClass(UTF8.class); job.setInputValueClass(CrawlDatum.class);
svn commit: r418739 - /lucene/nutch/nightly/nightly.sh
Author: cutting Date: Mon Jul 3 03:44:31 2006 New Revision: 418739 URL: http://svn.apache.org/viewvc?rev=418739&view=rev Log: Use JDK 1.5 for nightly build. Modified: lucene/nutch/nightly/nightly.sh Modified: lucene/nutch/nightly/nightly.sh URL: http://svn.apache.org/viewvc/lucene/nutch/nightly/nightly.sh?rev=418739&r1=418738&r2=418739&view=diff == --- lucene/nutch/nightly/nightly.sh (original) +++ lucene/nutch/nightly/nightly.sh Mon Jul 3 03:44:31 2006 @@ -1,6 +1,6 @@ #!/bin/bash -vx -export JAVA_HOME=/usr/j2se +export JAVA_HOME=$HOME/local/jdk TRUNK=http://svn.apache.org/repos/asf/lucene/nutch/trunk
svn commit: r417884 - in /lucene/nutch/trunk: lib/ src/java/org/apache/nutch/fetcher/ src/java/org/apache/nutch/indexer/ src/java/org/apache/nutch/parse/ src/java/org/apache/nutch/segment/
Author: cutting Date: Wed Jun 28 14:54:53 2006 New Revision: 417884 URL: http://svn.apache.org/viewvc?rev=417884&view=rev Log: NUTCH-312. Upgrade to Hadoop 0.4.0. Added: lucene/nutch/trunk/lib/commons-cli-2.0-SNAPSHOT.jar (with props) lucene/nutch/trunk/lib/hadoop-0.4.0.jar (with props) Removed: lucene/nutch/trunk/lib/hadoop-0.3.2.jar Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherOutputFormat.java lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java Added: lucene/nutch/trunk/lib/commons-cli-2.0-SNAPSHOT.jar URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/commons-cli-2.0-SNAPSHOT.jar?rev=417884&view=auto == Binary file - no diff available. Propchange: lucene/nutch/trunk/lib/commons-cli-2.0-SNAPSHOT.jar -- svn:mime-type = application/octet-stream Added: lucene/nutch/trunk/lib/hadoop-0.4.0.jar URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/hadoop-0.4.0.jar?rev=417884&view=auto == Binary file - no diff available. 
Propchange: lucene/nutch/trunk/lib/hadoop-0.4.0.jar -- svn:mime-type = application/octet-stream Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherOutputFormat.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherOutputFormat.java?rev=417884&r1=417883&r2=417884&view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherOutputFormat.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherOutputFormat.java Wed Jun 28 14:54:53 2006 @@ -31,6 +31,7 @@ import org.apache.hadoop.mapred.RecordWriter; import org.apache.hadoop.mapred.JobConf; import org.apache.hadoop.mapred.Reporter; +import org.apache.hadoop.util.Progressable; import org.apache.nutch.parse.ParseOutputFormat; import org.apache.nutch.protocol.Content; @@ -45,7 +46,8 @@ public RecordWriter getRecordWriter(final FileSystem fs, final JobConf job, - final String name) throws IOException { + final String name, + final Progressable progress) throws IOException { final Path fetch = new Path(new Path(job.getOutputPath(), CrawlDatum.FETCH_DIR_NAME), name); @@ -66,7 +68,7 @@ } if (Fetcher.isParsing(job)) { -parseOut = new ParseOutputFormat().getRecordWriter(fs, job, name); +parseOut = new ParseOutputFormat().getRecordWriter(fs, job, name, null); } } Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java?rev=417884&r1=417883&r2=417884&view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java Wed Jun 28 14:54:53 2006 @@ -26,6 +26,7 @@ import org.apache.hadoop.fs.*; import org.apache.hadoop.conf.*; import org.apache.hadoop.mapred.*; +import org.apache.hadoop.util.Progressable; import org.apache.nutch.util.NutchConfiguration; import 
org.apache.nutch.util.NutchJob; @@ -276,7 +277,8 @@ /** Write nothing. */ public RecordWriter getRecordWriter(final FileSystem fs, final JobConf job, - final String name) throws IOException { + final String name, + final Progressable progress) throws IOException { return new RecordWriter() { public void write(WritableComparable key, Writable value) throws IOException { Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java?rev=417884&r1=417883&r2=417884&view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/indexe
svn commit: r413175 - in /lucene/nutch/trunk/lib: hadoop-0.3.1.jar hadoop-0.3.2.jar
Author: cutting Date: Fri Jun 9 14:48:23 2006 New Revision: 413175 URL: http://svn.apache.org/viewvc?rev=413175&view=rev Log: Upgrading to Hadoop 0.3.2 release. Added: lucene/nutch/trunk/lib/hadoop-0.3.2.jar (with props) Removed: lucene/nutch/trunk/lib/hadoop-0.3.1.jar Added: lucene/nutch/trunk/lib/hadoop-0.3.2.jar URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/hadoop-0.3.2.jar?rev=413175&view=auto == Binary file - no diff available. Propchange: lucene/nutch/trunk/lib/hadoop-0.3.2.jar -- svn:mime-type = application/octet-stream
svn commit: r411943 - in /lucene/nutch/trunk/lib: commons-logging-1.0.4.jar hadoop-0.2.1.jar hadoop-0.3.1.jar log4j-1.2.13.jar
Author: cutting Date: Mon Jun 5 16:03:45 2006 New Revision: 411943 URL: http://svn.apache.org/viewvc?rev=411943&view=rev Log: Updating to Hadoop release 0.3.1. Hadoop now uses Jakarta Commons Logging, configured for log4j by default. Added: lucene/nutch/trunk/lib/commons-logging-1.0.4.jar (with props) lucene/nutch/trunk/lib/hadoop-0.3.1.jar (with props) lucene/nutch/trunk/lib/log4j-1.2.13.jar (with props) Removed: lucene/nutch/trunk/lib/hadoop-0.2.1.jar Added: lucene/nutch/trunk/lib/commons-logging-1.0.4.jar URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/commons-logging-1.0.4.jar?rev=411943&view=auto == Binary file - no diff available. Propchange: lucene/nutch/trunk/lib/commons-logging-1.0.4.jar -- svn:mime-type = application/octet-stream Added: lucene/nutch/trunk/lib/hadoop-0.3.1.jar URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/hadoop-0.3.1.jar?rev=411943&view=auto == Binary file - no diff available. Propchange: lucene/nutch/trunk/lib/hadoop-0.3.1.jar -- svn:mime-type = application/octet-stream Added: lucene/nutch/trunk/lib/log4j-1.2.13.jar URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/log4j-1.2.13.jar?rev=411943&view=auto == Binary file - no diff available. Propchange: lucene/nutch/trunk/lib/log4j-1.2.13.jar -- svn:mime-type = application/octet-stream
svn commit: r409769 - in /lucene/nutch/trunk: LICENSE.txt NOTICE.txt
Author: cutting Date: Fri May 26 15:27:07 2006 New Revision: 409769 URL: http://svn.apache.org/viewvc?rev=409769&view=rev Log: Add NOTICE.txt file and put full license in LICENSE.txt, to better conform to http://www.apache.org/dev/apply-license.html. Added: lucene/nutch/trunk/NOTICE.txt Modified: lucene/nutch/trunk/LICENSE.txt Modified: lucene/nutch/trunk/LICENSE.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/LICENSE.txt?rev=409769&r1=409768&r2=409769&view=diff == --- lucene/nutch/trunk/LICENSE.txt (original) +++ lucene/nutch/trunk/LICENSE.txt Fri May 26 15:27:07 2006 @@ -1,15 +1,202 @@ -/** - * Copyright 2004 The Apache Software Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ + + Apache License + Version 2.0, January 2004 +http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. 
For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been
svn commit: r405861 - in /lucene/nutch/trunk/lib: hadoop-0.2.0.jar hadoop-0.2.1.jar
Author: cutting Date: Fri May 12 13:31:59 2006 New Revision: 405861 URL: http://svn.apache.org/viewcvs?rev=405861&view=rev Log: Upgrading to Hadoop 0.2.1. Added: lucene/nutch/trunk/lib/hadoop-0.2.1.jar (with props) Removed: lucene/nutch/trunk/lib/hadoop-0.2.0.jar Added: lucene/nutch/trunk/lib/hadoop-0.2.1.jar URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/lib/hadoop-0.2.1.jar?rev=405861&view=auto == Binary file - no diff available. Propchange: lucene/nutch/trunk/lib/hadoop-0.2.1.jar -- svn:mime-type = application/octet-stream
svn commit: r400199 - in /lucene/nutch/trunk/lib: hadoop-0.1.1.jar hadoop-0.2.0.jar
Author: cutting Date: Fri May 5 15:44:04 2006 New Revision: 400199 URL: http://svn.apache.org/viewcvs?rev=400199&view=rev Log: Upgrading to Hadoop 0.2.0. Added: lucene/nutch/trunk/lib/hadoop-0.2.0.jar (with props) Removed: lucene/nutch/trunk/lib/hadoop-0.1.1.jar Added: lucene/nutch/trunk/lib/hadoop-0.2.0.jar URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/lib/hadoop-0.2.0.jar?rev=400199&view=auto == Binary file - no diff available. Propchange: lucene/nutch/trunk/lib/hadoop-0.2.0.jar -- svn:mime-type = application/octet-stream
svn commit: r400159 - /lucene/nutch/trunk/bin/
Author: cutting Date: Fri May 5 13:01:44 2006 New Revision: 400159 URL: http://svn.apache.org/viewcvs?rev=400159&view=rev Log: Ignore bin/rcc (from Hadoop). Modified: lucene/nutch/trunk/bin/ (props changed) Propchange: lucene/nutch/trunk/bin/ -- --- svn:ignore (original) +++ svn:ignore Fri May 5 13:01:44 2006 @@ -1,6 +1,7 @@ hadoop hadoop-daemon.sh hadoop-daemons.sh +rcc slaves.sh start-all.sh start-dfs.sh
svn commit: r395676 - in /lucene/nutch/trunk: CHANGES.txt src/java/org/apache/nutch/crawl/Generator.java
Author: cutting Date: Thu Apr 20 12:18:56 2006 New Revision: 395676 URL: http://svn.apache.org/viewcvs?rev=395676&view=rev Log: Fix NUTCH-108. Log hosts that exceed generate.max.per.host. Contributed by Rod Taylor. Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/CHANGES.txt?rev=395676&r1=395675&r2=395676&view=diff == --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Thu Apr 20 12:18:56 2006 @@ -4,6 +4,10 @@ 1. NUTCH-107 - Typo in plugin/urlfilter-*/plugin.xml. (Stephen Cross). + 2. NUTCH-108 - Log hosts that exceed generate.max.per.host. + (Rod Taylor via cutting) + + Release 0.7 - 2005-08-17 1. Added support for "type:" in queries. Search results are limited/qualified Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java?rev=395676&r1=395675&r2=395676&view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java Thu Apr 20 12:18:56 2006 @@ -127,12 +127,18 @@ if (maxPerHost > 0) { // are we counting hosts? String host = new URL(url.toString()).getHost(); Integer hostCount = (Integer)hostCounts.get(host); - if (hostCount != null) { -if (hostCount.intValue() >= maxPerHost) - continue; // too many from host -hostCounts.put(host, new Integer(hostCount.intValue()+1)); - } else {// update host count -hostCounts.put(host, new Integer(1)); + + // increment hostCount + hostCount = new Integer(hostCount==null ? 1 : hostCount.intValue()+1); + hostCounts.put(host, hostCount); + + // skip URL if above the limit per host. 
+ if (hostCount.intValue() > maxPerHost) { +if (hostCount.intValue() == maxPerHost + 1) { + LOG.info("Host "+ host +" has more than "+ maxPerHost +" URLs."+ + " Skipping additional."); +} +continue; } }
svn commit: r394781 - /lucene/nutch/trunk/bin/
Author: cutting Date: Mon Apr 17 14:40:58 2006 New Revision: 394781 URL: http://svn.apache.org/viewcvs?rev=394781&view=rev Log: Ignore more bin files. Modified: lucene/nutch/trunk/bin/ (props changed) Propchange: lucene/nutch/trunk/bin/ -- --- svn:ignore (original) +++ svn:ignore Mon Apr 17 14:40:58 2006 @@ -3,4 +3,8 @@ hadoop-daemons.sh slaves.sh start-all.sh +start-dfs.sh +start-mapred.sh stop-all.sh +stop-dfs.sh +stop-mapred.sh
svn commit: r392458 - in /lucene/nutch/trunk/lib: hadoop-0.1.0.jar hadoop-0.1.1.jar
Author: cutting Date: Fri Apr 7 16:48:10 2006 New Revision: 392458 URL: http://svn.apache.org/viewcvs?rev=392458&view=rev Log: Upgrading to Hadoop release 0.1.1. Added: lucene/nutch/trunk/lib/hadoop-0.1.1.jar (with props) Removed: lucene/nutch/trunk/lib/hadoop-0.1.0.jar Added: lucene/nutch/trunk/lib/hadoop-0.1.1.jar URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/lib/hadoop-0.1.1.jar?rev=392458&view=auto == Binary file - no diff available. Propchange: lucene/nutch/trunk/lib/hadoop-0.1.1.jar -- svn:mime-type = application/octet-stream
svn commit: r391371 - /lucene/nutch/trunk/
Author: cutting Date: Tue Apr 4 10:21:18 2006 New Revision: 391371 URL: http://svn.apache.org/viewcvs?rev=391371&view=rev Log: Ignore Eclipse .settings file. Modified: lucene/nutch/trunk/ (props changed) Propchange: lucene/nutch/trunk/ -- --- svn:ignore (original) +++ svn:ignore Tue Apr 4 10:21:18 2006 @@ -3,3 +3,4 @@ nutch.jar .classpath .project +.settings
svn commit: r390745 - in /lucene/nutch/trunk/lib: hadoop-0.1-dev.jar hadoop-0.1.0.jar
Author: cutting Date: Sat Apr 1 12:16:22 2006 New Revision: 390745 URL: http://svn.apache.org/viewcvs?rev=390745&view=rev Log: Update to Hadoop 0.1.0 release. Added: lucene/nutch/trunk/lib/hadoop-0.1.0.jar (with props) Removed: lucene/nutch/trunk/lib/hadoop-0.1-dev.jar Added: lucene/nutch/trunk/lib/hadoop-0.1.0.jar URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/lib/hadoop-0.1.0.jar?rev=390745&view=auto == Binary file - no diff available. Propchange: lucene/nutch/trunk/lib/hadoop-0.1.0.jar -- svn:mime-type = application/octet-stream
svn commit: r389634 - /lucene/nutch/trunk/bin/nutch
Author: cutting Date: Tue Mar 28 16:04:51 2006 New Revision: 389634 URL: http://svn.apache.org/viewcvs?rev=389634&view=rev Log: Fix a bug when there are spaces in CWD, as is common on Windows. Modified: lucene/nutch/trunk/bin/nutch Modified: lucene/nutch/trunk/bin/nutch URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/bin/nutch?rev=389634&r1=389633&r2=389634&view=diff == --- lucene/nutch/trunk/bin/nutch (original) +++ lucene/nutch/trunk/bin/nutch Tue Mar 28 16:04:51 2006 @@ -82,6 +82,9 @@ CLASSPATH=${NUTCH_CONF_DIR:=$NUTCH_HOME/conf} CLASSPATH=${CLASSPATH}:$JAVA_HOME/lib/tools.jar +# so that filenames w/ spaces are handled correctly in loops below +IFS= + # for developers, add plugins, job & test code to CLASSPATH if [ -d "$NUTCH_HOME/build/plugins" ]; then CLASSPATH=${CLASSPATH}:$NUTCH_HOME/build @@ -92,9 +95,6 @@ if [ -d "$NUTCH_HOME/build/test/classes" ]; then CLASSPATH=${CLASSPATH}:$NUTCH_HOME/build/test/classes fi - -# so that filenames w/ spaces are handled correctly in loops below -IFS= # for releases, add Nutch job to CLASSPATH for f in $NUTCH_HOME/nutch-*.job; do
svn commit: r388310 - in /lucene/nutch/trunk: lib/ src/java/org/apache/nutch/crawl/ src/java/org/apache/nutch/fetcher/ src/java/org/apache/nutch/indexer/ src/java/org/apache/nutch/parse/ src/java/org/
Author: cutting Date: Thu Mar 23 16:57:56 2006 New Revision: 388310 URL: http://svn.apache.org/viewcvs?rev=388310&view=rev Log: Upgrade to latest Hadoop jar. Add job names to Nutch mapred jobs. Update OutputFormat implementations to implement new checkOutputSpecs() method. Modified: lucene/nutch/trunk/lib/hadoop-0.1-dev.jar lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbReader.java lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherOutputFormat.java lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java Modified: lucene/nutch/trunk/lib/hadoop-0.1-dev.jar URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/lib/hadoop-0.1-dev.jar?rev=388310&r1=388309&r2=388310&view=diff == Binary files - no diff available. 
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java?rev=388310&r1=388309&r2=388310&view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java Thu Mar 23 16:57:56 2006 @@ -63,6 +63,7 @@ Integer.toString(new Random().nextInt(Integer.MAX_VALUE))); JobConf job = new NutchJob(config); +job.setJobName("crawldb " + crawlDb); job.addInputDir(new File(crawlDb, CrawlDatum.DB_DIR_NAME)); job.setInputFormat(SequenceFileInputFormat.class); Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java?rev=388310&r1=388309&r2=388310&view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java Thu Mar 23 16:57:56 2006 @@ -140,6 +140,7 @@ File tmpFolder = new File(crawlDb, "stat_tmp" + System.currentTimeMillis()); JobConf job = new NutchJob(config); +job.setJobName("stats " + crawlDb); job.addInputDir(new File(crawlDb, CrawlDatum.DB_DIR_NAME)); job.setInputFormat(SequenceFileInputFormat.class); @@ -227,6 +228,7 @@ File outFolder = new File(output); JobConf job = new NutchJob(config); +job.setJobName("dump " + crawlDb); job.addInputDir(new File(crawlDb, CrawlDatum.DB_DIR_NAME)); job.setInputFormat(SequenceFileInputFormat.class); Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java?rev=388310&r1=388309&r2=388310&view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java Thu Mar 23 16:57:56 2006 @@ -189,6 +189,7 
@@ // map to inverted subset due for fetch, sort by link count LOG.info("Generator: Selecting most-linked urls due for fetch."); JobConf job = new NutchJob(getConf()); +job.setJobName("generate: select " + segment); if (numLists == -1) { // for politeness make numLists = job.getNumMapTasks();// a partition per fetch task @@ -215,6 +216,7 @@ // invert again, paritition by host, sort by url hash LOG.info("Generator: Partitioning selected urls by host, for politeness."); job = new NutchJob(getConf()); +job.setJobName("generate: partition " + segment); job.setInt("partition.url.by.host.seed", new Random().nextInt()); Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java?rev=388310&r1=388309&r2=388310&vie
svn commit: r387310 - /lucene/nutch/trunk/lib/hadoop-0.1-dev.jar
Author: cutting Date: Mon Mar 20 13:08:15 2006 New Revision: 387310 URL: http://svn.apache.org/viewcvs?rev=387310&view=rev Log: Upgrade to current Hadoop. Modified: lucene/nutch/trunk/lib/hadoop-0.1-dev.jar Modified: lucene/nutch/trunk/lib/hadoop-0.1-dev.jar URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/lib/hadoop-0.1-dev.jar?rev=387310&r1=387309&r2=387310&view=diff == Binary files - no diff available.
svn commit: r386181 - in /lucene/nutch/branches/branch-0.7: site/issue_tracking.html site/issue_tracking.pdf src/site/src/documentation/content/xdocs/issue_tracking.xml
Author: cutting Date: Wed Mar 15 14:20:40 2006 New Revision: 386181 URL: http://svn.apache.org/viewcvs?rev=386181&view=rev Log: Updated link to jira. Modified: lucene/nutch/branches/branch-0.7/site/issue_tracking.html lucene/nutch/branches/branch-0.7/site/issue_tracking.pdf lucene/nutch/branches/branch-0.7/src/site/src/documentation/content/xdocs/issue_tracking.xml Modified: lucene/nutch/branches/branch-0.7/site/issue_tracking.html URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/branch-0.7/site/issue_tracking.html?rev=386181&r1=386180&r2=386181&view=diff == --- lucene/nutch/branches/branch-0.7/site/issue_tracking.html (original) +++ lucene/nutch/branches/branch-0.7/site/issue_tracking.html Wed Mar 15 14:20:40 2006 @@ -128,7 +128,7 @@ Nutch issues (bugs, as well as enhancement requests) are tracked in - Apache JIRA <a href="http://nagoya.apache.org/jira/browse/Nutch">here</a>. + Apache JIRA <a href="http://issues.apache.org/jira/browse/Nutch">here</a>. If you aren't sure whether something is a bug, post a question on the Nutch user mailing list. 
Modified: lucene/nutch/branches/branch-0.7/site/issue_tracking.pdf URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/branch-0.7/site/issue_tracking.pdf?rev=386181&r1=386180&r2=386181&view=diff == --- lucene/nutch/branches/branch-0.7/site/issue_tracking.pdf (original) +++ lucene/nutch/branches/branch-0.7/site/issue_tracking.pdf Wed Mar 15 14:20:40 2006 @@ -32,7 +32,7 @@ /Rect [ 485.232 585.8 505.884 573.8 ] /C [ 0 0 0 ] /Border [ 0 0 0 ] -/A << /URI (http://nagoya.apache.org/jira/browse/Nutch) +/A << /URI (http://issues.apache.org/jira/browse/Nutch) /S /URI >> /H /I >> Modified: lucene/nutch/branches/branch-0.7/src/site/src/documentation/content/xdocs/issue_tracking.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/branch-0.7/src/site/src/documentation/content/xdocs/issue_tracking.xml?rev=386181&r1=386180&r2=386181&view=diff == --- lucene/nutch/branches/branch-0.7/src/site/src/documentation/content/xdocs/issue_tracking.xml (original) +++ lucene/nutch/branches/branch-0.7/src/site/src/documentation/content/xdocs/issue_tracking.xml Wed Mar 15 14:20:40 2006 @@ -11,7 +11,7 @@ Nutch issues (bugs, as well as enhancement requests) are tracked in - Apache JIRA http://nagoya.apache.org/jira/browse/Nutch";>here. + Apache JIRA http://issues.apache.org/jira/browse/Nutch";>here. If you aren't sure whether something is a bug, post a question on the Nutch user mailing list.
svn commit: r384843 - /lucene/nutch/trunk/lib/hadoop-0.1-dev.jar
Author: cutting Date: Fri Mar 10 08:27:49 2006 New Revision: 384843 URL: http://svn.apache.org/viewcvs?rev=384843&view=rev Log: Upgrade to latest hadoop jar. Modified: lucene/nutch/trunk/lib/hadoop-0.1-dev.jar Modified: lucene/nutch/trunk/lib/hadoop-0.1-dev.jar URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/lib/hadoop-0.1-dev.jar?rev=384843&r1=384842&r2=384843&view=diff == Binary files - no diff available.
svn commit: r383698 - /lucene/nutch/trunk/lib/hadoop-0.1-dev.jar
Author: cutting Date: Mon Mar 6 14:54:20 2006 New Revision: 383698 URL: http://svn.apache.org/viewcvs?rev=383698&view=rev Log: Upgrade to latest version of Hadoop. Modified: lucene/nutch/trunk/lib/hadoop-0.1-dev.jar Modified: lucene/nutch/trunk/lib/hadoop-0.1-dev.jar URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/lib/hadoop-0.1-dev.jar?rev=383698&r1=383697&r2=383698&view=diff == Binary files - no diff available.
svn commit: r382939 - /lucene/nutch/trunk/lib/hadoop-0.1-dev.jar
Author: cutting Date: Fri Mar 3 13:46:21 2006 New Revision: 382939 URL: http://svn.apache.org/viewcvs?rev=382939&view=rev Log: Upgrade hadoop to latest version with some important mapred bug fixes. Modified: lucene/nutch/trunk/lib/hadoop-0.1-dev.jar Modified: lucene/nutch/trunk/lib/hadoop-0.1-dev.jar URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/lib/hadoop-0.1-dev.jar?rev=382939&r1=382938&r2=382939&view=diff == Binary files - no diff available.
svn commit: r382912 - in /lucene/nutch/trunk/src/java/org/apache/nutch: crawl/ fetcher/ indexer/ parse/ plugin/ searcher/ segment/
Author: cutting Date: Fri Mar 3 11:05:41 2006 New Revision: 382912 URL: http://svn.apache.org/viewcvs?rev=382912&view=rev Log: Undo unintentional changes made in r381751. Thanks, Jerome, for catching this! Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbReader.java lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexMerger.java lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java lucene/nutch/trunk/src/java/org/apache/nutch/plugin/PluginRepository.java lucene/nutch/trunk/src/java/org/apache/nutch/searcher/DistributedSearch.java lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java?rev=382912&r1=382911&r2=382912&view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java Fri Mar 3 11:05:41 2006 @@ -44,11 +44,11 @@ /* Perform complete crawling and indexing given a set of root urls. 
*/ - public static boolean doMain(String args[]) throws Exception { + public static void main(String args[]) throws Exception { if (args.length < 1) { System.out.println ("Usage: Crawl [-dir d] [-threads n] [-depth i] [-topN N]"); - return false; + return; } Configuration conf = NutchConfiguration.create(); @@ -122,22 +122,5 @@ new IndexMerger(fs, fs.listFiles(indexes), index, tmpDir, job).merge(); LOG.info("crawl finished: " + dir); - -return true; - } - - /** - * main() wrapper that returns proper exit status - */ - public static void main(String[] args) { -Runtime rt = Runtime.getRuntime(); -try { - boolean status = doMain(args); - rt.exit(status ? 0 : 1); -} -catch (Exception e) { - LOG.log(Level.SEVERE, "error, caught Exception in main()", e); - rt.exit(1); -} } } Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java?rev=382912&r1=382911&r2=382912&view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java Fri Mar 3 11:05:41 2006 @@ -90,31 +90,17 @@ fs.delete(old); } - public static boolean doMain(String[] args) throws Exception { + public static void main(String[] args) throws Exception { CrawlDb crawlDb = new CrawlDb(NutchConfiguration.create()); if (args.length < 2) { System.err.println("Usage: "); - return false; + return; } crawlDb.update(new File(args[0]), new File(args[1])); - -return true; } - /** - * main() wrapper that returns proper exit status - */ - public static void main(String[] args) { -Runtime rt = Runtime.getRuntime(); -try { - boolean status = doMain(args); - rt.exit(status ? 
0 : 1); -} -catch (Exception e) { - LOG.log(Level.SEVERE, "error, caught Exception in main()", e); - rt.exit(1); -} - } + + } Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java?rev=382912&r1=382911&r2=382912&view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java Fri Mar 3 11:05:41 2006 @@ -20,7 +20,7 @@ import java.io.IOException; import java.util.Iterator; import java.util.TreeMap; -import java.util.logging.*; +import java.util.logging.Logger; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.io.LongWritable; @@ -241,7 +241,7 @@ JobClient.runJob(job); } - public static boolean doMain(String[] args) throws IOException { + public static void main(String[] args) throws IOE
svn commit: r382579 - /lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java
Author: cutting Date: Thu Mar 2 16:06:59 2006 New Revision: 382579 URL: http://svn.apache.org/viewcvs?rev=382579&view=rev Log: Disable speculative execution, since input format has side effects. Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java?rev=382579&r1=382578&r2=382579&view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java Thu Mar 2 16:06:59 2006 @@ -307,6 +307,7 @@ job.setInputKeyClass(HashScore.class); job.setInputValueClass(IndexDoc.class); job.setInputFormat(InputFormat.class); +job.setBoolean("mapred.speculative.execution", false); job.setPartitionerClass(HashPartitioner.class); job.setReducerClass(HashReducer.class);
svn commit: r382573 - in /lucene/nutch/trunk: conf/hadoop-env.sh.template lib/hadoop-0.1-dev.jar
Author: cutting Date: Thu Mar 2 15:59:24 2006 New Revision: 382573 URL: http://svn.apache.org/viewcvs?rev=382573&view=rev Log: Update to latest Hadoop code. Modified: lucene/nutch/trunk/conf/hadoop-env.sh.template lucene/nutch/trunk/lib/hadoop-0.1-dev.jar Modified: lucene/nutch/trunk/conf/hadoop-env.sh.template URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/conf/hadoop-env.sh.template?rev=382573&r1=382572&r2=382573&view=diff == --- lucene/nutch/trunk/conf/hadoop-env.sh.template (original) +++ lucene/nutch/trunk/conf/hadoop-env.sh.template Thu Mar 2 15:59:24 2006 @@ -1,6 +1,11 @@ # Set Hadoop-specific environment variables here. -# The java implementation to use. +# The only required environment variable is JAVA_HOME. All others are +# optional. When running a distributed configuration it is best to +# set JAVA_HOME in this file, so that it is correctly defined on +# remote nodes. + +# The java implementation to use. Required. # export JAVA_HOME=/usr/bin/java # The maximum amount of heap to use, in MB. Default is 1000. @@ -8,6 +13,9 @@ # Extra Java runtime options. Empty by default. # export HADOOP_OPTS=-server + +# Extra ssh options. Default: '-o ConnectTimeout=1 -o SendEnv=HADOOP_CONF_DIR'. +# export HADOOP_SSH_OPTS="-o ConnectTimeout=1 -o SendEnv=HADOOP_CONF_DIR" # Where log files are stored. $HADOOP_HOME/logs by default. # export HADOOP_LOG_DIR=${HADOOP_HOME}/logs Modified: lucene/nutch/trunk/lib/hadoop-0.1-dev.jar URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/lib/hadoop-0.1-dev.jar?rev=382573&r1=382572&r2=382573&view=diff == Binary files - no diff available.
svn commit: r382512 - in /lucene/nutch/trunk/lib: lucene-core-1.9-final.jar lucene-core-1.9.1.jar lucene-misc-1.9-final.jar lucene-misc-1.9.1.jar
Author: cutting Date: Thu Mar 2 12:59:09 2006 New Revision: 382512 URL: http://svn.apache.org/viewcvs?rev=382512&view=rev Log: Upgrade to Lucene 1.9.1. Added: lucene/nutch/trunk/lib/lucene-core-1.9.1.jar (with props) lucene/nutch/trunk/lib/lucene-misc-1.9.1.jar (with props) Removed: lucene/nutch/trunk/lib/lucene-core-1.9-final.jar lucene/nutch/trunk/lib/lucene-misc-1.9-final.jar Added: lucene/nutch/trunk/lib/lucene-core-1.9.1.jar URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/lib/lucene-core-1.9.1.jar?rev=382512&view=auto == Binary file - no diff available. Propchange: lucene/nutch/trunk/lib/lucene-core-1.9.1.jar -- svn:mime-type = application/octet-stream Added: lucene/nutch/trunk/lib/lucene-misc-1.9.1.jar URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/lib/lucene-misc-1.9.1.jar?rev=382512&view=auto == Binary file - no diff available. Propchange: lucene/nutch/trunk/lib/lucene-misc-1.9.1.jar -- svn:mime-type = application/octet-stream
svn commit: r381824 - /lucene/nutch/trunk/lib/hadoop-0.1-dev.jar
Author: cutting Date: Tue Feb 28 15:30:02 2006 New Revision: 381824 URL: http://svn.apache.org/viewcvs?rev=381824&view=rev Log: Updating hadoop jar. Includes fixes for Windows. Modified: lucene/nutch/trunk/lib/hadoop-0.1-dev.jar Modified: lucene/nutch/trunk/lib/hadoop-0.1-dev.jar URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/lib/hadoop-0.1-dev.jar?rev=381824&r1=381823&r2=381824&view=diff == Binary files - no diff available.
svn commit: r381751 - in /lucene/nutch/trunk: site/ src/java/org/apache/nutch/crawl/ src/java/org/apache/nutch/fetcher/ src/java/org/apache/nutch/indexer/ src/java/org/apache/nutch/parse/ src/java/org
Author: cutting Date: Tue Feb 28 11:25:12 2006 New Revision: 381751 URL: http://svn.apache.org/viewcvs?rev=381751&view=rev Log: Adding DOAP for Nutch. Contributed by Chris Mattmann. Added: lucene/nutch/trunk/site/doap.rdf Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbReader.java lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexMerger.java lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java lucene/nutch/trunk/src/java/org/apache/nutch/plugin/PluginRepository.java lucene/nutch/trunk/src/java/org/apache/nutch/searcher/DistributedSearch.java lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java Added: lucene/nutch/trunk/site/doap.rdf URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/site/doap.rdf?rev=381751&view=auto == --- lucene/nutch/trunk/site/doap.rdf (added) +++ lucene/nutch/trunk/site/doap.rdf Tue Feb 28 11:25:12 2006 @@ -0,0 +1,47 @@ + + +http://usefulinc.com/ns/doap#"; + xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"; + xmlns:asfext="http://projects.apache.org/ns/asfext#"; + xmlns:foaf="http://xmlns.com/foaf/0.1/";> + + http://lucene.apache.org/nutch/";> +2006-02-28 +http://usefulinc.com/doap/licenses/asl20"; /> +Apache Nutch +http://lucene.apache.org/nutch/"; /> +http://lucene.apache.org"; /> +Nutch is the open-source search engine. +Nutch is open source web-search software. 
It builds +on Lucene Java and Hadoop, adding web-specifics, such as a +crawler, a link-graph database, parsers for HTML and other +document formats, etc. + +http://issues.apache.org/jira/browse/NUTCH"; /> +http://lucene.apache.org/nutch/mailing_lists.html"; /> +http://www.apache.org/dyn/closer.cgi/lucene/nutch/"; /> +Java +http://projects.apache.org/category/web-framework"; /> + + +0.7 +2005-08-17 +0.7 + + + + +http://svn.apache.org/repos/asf/lucene/nutch/"/> +http://svn.apache.org/viewcvs.cgi/lucene/nutch/"/> + + + + Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java?rev=381751&r1=381750&r2=381751&view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java Tue Feb 28 11:25:12 2006 @@ -44,11 +44,11 @@ /* Perform complete crawling and indexing given a set of root urls. */ - public static void main(String args[]) throws Exception { + public static boolean doMain(String args[]) throws Exception { if (args.length < 1) { System.out.println ("Usage: Crawl [-dir d] [-threads n] [-depth i] [-topN N]"); - return; + return false; } Configuration conf = NutchConfiguration.create(); @@ -122,5 +122,22 @@ new IndexMerger(fs, fs.listFiles(indexes), index, tmpDir, job).merge(); LOG.info("crawl finished: " + dir); + +return true; + } + + /** + * main() wrapper that returns proper exit status + */ + public static void main(String[] args) { +Runtime rt = Runtime.getRuntime(); +try { + boolean status = doMain(args); + rt.exit(status ? 
0 : 1); +} +catch (Exception e) { + LOG.log(Level.SEVERE, "error, caught Exception in main()", e); + rt.exit(1); +} } } Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java?rev=381751&r1=381750&r2=381751&view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java Tue Feb 28 11:25:12 2006 @@ -90,17 +90,31 @@ fs.delete(old); } - public static void main(String[] args) throws Exception { + public static boolean doMain(St
svn commit: r381721 - in /lucene/nutch/trunk/lib: lucene-core-1.9-final.jar lucene-core-1.9-rc1-dev.jar lucene-misc-1.9-final.jar lucene-misc-1.9-rc1-dev.jar
Author: cutting Date: Tue Feb 28 10:00:43 2006 New Revision: 381721 URL: http://svn.apache.org/viewcvs?rev=381721&view=rev Log: Upgrade lucene version to final release. Added: lucene/nutch/trunk/lib/lucene-core-1.9-final.jar (with props) lucene/nutch/trunk/lib/lucene-misc-1.9-final.jar (with props) Removed: lucene/nutch/trunk/lib/lucene-core-1.9-rc1-dev.jar lucene/nutch/trunk/lib/lucene-misc-1.9-rc1-dev.jar Added: lucene/nutch/trunk/lib/lucene-core-1.9-final.jar URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/lib/lucene-core-1.9-final.jar?rev=381721&view=auto == Binary file - no diff available. Propchange: lucene/nutch/trunk/lib/lucene-core-1.9-final.jar -- svn:mime-type = application/octet-stream Added: lucene/nutch/trunk/lib/lucene-misc-1.9-final.jar URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/lib/lucene-misc-1.9-final.jar?rev=381721&view=auto == Binary file - no diff available. Propchange: lucene/nutch/trunk/lib/lucene-misc-1.9-final.jar -- svn:mime-type = application/octet-stream
svn commit: r380840 - /lucene/nutch/trunk/lib/hadoop-0.1-dev.jar
Author: cutting Date: Fri Feb 24 14:38:06 2006 New Revision: 380840 URL: http://svn.apache.org/viewcvs?rev=380840&view=rev Log: Update hadoop jar, to get recent fixes from that project. Modified: lucene/nutch/trunk/lib/hadoop-0.1-dev.jar Modified: lucene/nutch/trunk/lib/hadoop-0.1-dev.jar URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/lib/hadoop-0.1-dev.jar?rev=380840&r1=380839&r2=380840&view=diff == Binary files - no diff available.
svn commit: r380789 - /lucene/nutch/trunk/build.xml
Author: cutting Date: Fri Feb 24 11:11:44 2006 New Revision: 380789 URL: http://svn.apache.org/viewcvs?rev=380789&view=rev Log: Fix to not use 'exec', but rather 'untar' and 'chmod' which are more portable. Modified: lucene/nutch/trunk/build.xml Modified: lucene/nutch/trunk/build.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/build.xml?rev=380789&r1=380788&r2=380789&view=diff == --- lucene/nutch/trunk/build.xml (original) +++ lucene/nutch/trunk/build.xml Fri Feb 24 11:11:44 2006 @@ -57,10 +57,9 @@ - - - - + + +
svn commit: r378396 - in /lucene/nutch/trunk/src/java/org/apache/nutch: crawl/ searcher/
Author: cutting Date: Thu Feb 16 15:31:52 2006 New Revision: 378396 URL: http://svn.apache.org/viewcvs?rev=378396&view=rev Log: Fix for NUTCH-211: add close method to search classes. Contributed by Stefan. Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbReader.java lucene/nutch/trunk/src/java/org/apache/nutch/searcher/FetchedSegments.java lucene/nutch/trunk/src/java/org/apache/nutch/searcher/HitContent.java lucene/nutch/trunk/src/java/org/apache/nutch/searcher/HitInlinks.java lucene/nutch/trunk/src/java/org/apache/nutch/searcher/IndexSearcher.java lucene/nutch/trunk/src/java/org/apache/nutch/searcher/LinkDbInlinks.java lucene/nutch/trunk/src/java/org/apache/nutch/searcher/NutchBean.java lucene/nutch/trunk/src/java/org/apache/nutch/searcher/Searcher.java Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbReader.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbReader.java?rev=378396&r1=378395&r2=378396&view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbReader.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbReader.java Thu Feb 16 15:31:52 2006 @@ -32,7 +32,7 @@ import java.util.logging.Logger; /** . 
*/ -public class LinkDbReader { +public class LinkDbReader implements Closeable { public static final Logger LOG = LogFormatter.getLogger(LinkDbReader.class.getName()); private static final Partitioner PARTITIONER = new HashPartitioner(); @@ -66,6 +66,14 @@ return (Inlinks)MapFileOutputFormat.getEntry (readers, PARTITIONER, url, new Inlinks()); + } + + public void close() throws IOException { +if (readers != null) { + for (int i = 0; i < readers.length; i++) { +readers[i].close(); + } +} } public static void processDumpJob(String linkdb, String output, Configuration config) throws IOException { Modified: lucene/nutch/trunk/src/java/org/apache/nutch/searcher/FetchedSegments.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/searcher/FetchedSegments.java?rev=378396&r1=378395&r2=378396&view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/searcher/FetchedSegments.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/searcher/FetchedSegments.java Thu Feb 16 15:31:52 2006 @@ -20,6 +20,7 @@ import java.io.File; import java.util.HashMap; +import java.util.Iterator; import org.apache.hadoop.io.*; import org.apache.hadoop.fs.*; @@ -34,7 +35,7 @@ * fetched segments. 
*/ public class FetchedSegments implements HitSummarizer, HitContent { - private static class Segment { + private static class Segment implements Closeable { private static final Partitioner PARTITIONER = new HashPartitioner(); private FileSystem fs; @@ -93,6 +94,19 @@ return MapFileOutputFormat.getEntry(readers, PARTITIONER, url, entry); } +public void close() throws IOException { + if (content != null) { closeReaders(content); } + if (parseText != null) { closeReaders(parseText); } + if (parseData != null) { closeReaders(parseData); } + if (crawl != null) { closeReaders(crawl); } +} + +private void closeReaders(MapFile.Reader[] readers) throws IOException { + for (int i = 0; i < readers.length; i++) { +readers[i].close(); + } +} + } private HashMap segments = new HashMap(); @@ -206,5 +220,11 @@ return new UTF8(details.getValue("url")); } - + public void close() throws IOException { +Iterator iterator = segments.values().iterator(); +while (iterator.hasNext()) { + ((Segment) iterator.next()).close(); +} + } + } Modified: lucene/nutch/trunk/src/java/org/apache/nutch/searcher/HitContent.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/searcher/HitContent.java?rev=378396&r1=378395&r2=378396&view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/searcher/HitContent.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/searcher/HitContent.java Thu Feb 16 15:31:52 2006 @@ -18,11 +18,12 @@ import java.io.IOException; +import org.apache.hadoop.io.Closeable; import org.apache.nutch.parse.ParseData; import org.apache.nutch.parse.ParseText; /** Service that returns the content of a hit. */ -public interface HitContent { +public interface HitContent extends Closeable { /** Returns the content of a hit document. */ byte[] getContent(HitDetails details) throws IOException; Modified: lucene/nutch/trunk/src/java/org/apache/nutch/searcher/HitInlinks.java URL: http://svn.apache.org/viewc
svn commit: r378381 - /lucene/nutch/trunk/src/site/src/documentation/content/xdocs/tabs.xml
Author: cutting Date: Thu Feb 16 14:24:47 2006 New Revision: 378381 URL: http://svn.apache.org/viewcvs?rev=378381&view=rev Log: Fix to work with Forrest 0.7, where ext: links seem to no longer work in tabs.xml. Modified: lucene/nutch/trunk/src/site/src/documentation/content/xdocs/tabs.xml Modified: lucene/nutch/trunk/src/site/src/documentation/content/xdocs/tabs.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/site/src/documentation/content/xdocs/tabs.xml?rev=378381&r1=378380&r2=378381&view=diff == --- lucene/nutch/trunk/src/site/src/documentation/content/xdocs/tabs.xml (original) +++ lucene/nutch/trunk/src/site/src/documentation/content/xdocs/tabs.xml Thu Feb 16 14:24:47 2006 @@ -15,6 +15,6 @@ --> - + http://wiki.apache.org/nutch/"/>
svn commit: r378108 - /lucene/nutch/trunk/
Author: cutting Date: Wed Feb 15 14:47:00 2006 New Revision: 378108 URL: http://svn.apache.org/viewcvs?rev=378108&view=rev Log: Ignore logs directory. Modified: lucene/nutch/trunk/ (props changed) Propchange: lucene/nutch/trunk/ -- --- svn:ignore (original) +++ svn:ignore Wed Feb 15 14:47:00 2006 @@ -1,4 +1,5 @@ build +logs nutch.jar .classpath .project
svn commit: r378107 - in /lucene/nutch/trunk: conf/ conf/hadoop-env.sh.template conf/slaves.template lib/hadoop-0.1-dev.jar src/java/org/apache/nutch/fetcher/Fetcher.java
Author: cutting Date: Wed Feb 15 14:45:31 2006 New Revision: 378107 URL: http://svn.apache.org/viewcvs?rev=378107&view=rev Log: Fix Fetcher to disable speculative execution, to keep it polite. Also upgrade to latest hadoop jar that supports this feature. Note that Hadoop's environment specification has changed, with all environment variables settable from conf/hadoop-env.sh, and the slaves file is now in conf/, rather than in one's home directory. Added: lucene/nutch/trunk/conf/hadoop-env.sh.template lucene/nutch/trunk/conf/slaves.template Modified: lucene/nutch/trunk/conf/ (props changed) lucene/nutch/trunk/lib/hadoop-0.1-dev.jar lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Propchange: lucene/nutch/trunk/conf/ -- --- svn:ignore (original) +++ svn:ignore Wed Feb 15 14:45:31 2006 @@ -1,5 +1,4 @@ -nutch-site.xml -regex-normalize.xml -crawl-urlfilter.txt -regex-urlfilter.txt -mapred-default.xml +*.xml +*.txt +*.sh +slaves Added: lucene/nutch/trunk/conf/hadoop-env.sh.template URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/conf/hadoop-env.sh.template?rev=378107&view=auto == --- lucene/nutch/trunk/conf/hadoop-env.sh.template (added) +++ lucene/nutch/trunk/conf/hadoop-env.sh.template Wed Feb 15 14:45:31 2006 @@ -0,0 +1,25 @@ +# Set Hadoop-specific environment variables here. + +# The java implementation to use. +# export JAVA_HOME=/usr/bin/java + +# The maximum amount of heap to use, in MB. Default is 1000. +# export HADOOP_HEAPSIZE=2000 + +# Extra Java runtime options. Empty by default. +# export HADOOP_OPTS=-server + +# Where log files are stored. $HADOOP_HOME/logs by default. +# export HADOOP_LOG_DIR=${HADOOP_HOME}/logs + +# File naming remote slave hosts. $HADOOP_HOME/conf/slaves by default. +# export HADOOP_SLAVES=${HADOOP_HOME}/conf/slaves + +# host:path where hadoop code should be rsync'd from. Unset by default. +# export HADOOP_MASTER=master:/home/$USER/src/hadoop + +# The directory where pid files are stored. /tmp by default. 
+# export HADOOP_PID_DIR=/var/hadoop/pids + +# A string representing this instance of hadoop. $USER by default. +# export HADOOP_IDENT_STRING=$USER Added: lucene/nutch/trunk/conf/slaves.template URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/conf/slaves.template?rev=378107&view=auto == --- lucene/nutch/trunk/conf/slaves.template (added) +++ lucene/nutch/trunk/conf/slaves.template Wed Feb 15 14:45:31 2006 @@ -0,0 +1 @@ +localhost Modified: lucene/nutch/trunk/lib/hadoop-0.1-dev.jar URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/lib/hadoop-0.1-dev.jar?rev=378107&r1=378106&r2=378107&view=diff == Binary files - no diff available. Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java?rev=378107&r1=378106&r2=378107&view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Wed Feb 15 14:45:31 2006 @@ -348,6 +348,9 @@ job.set(SEGMENT_NAME_KEY, segment.getName()); job.setBoolean("fetcher.parse", parsing); +// for politeness, don't permit parallel execution of a single task +job.setBoolean("mapred.speculative.execution", false); + job.setInputDir(new File(segment, CrawlDatum.GENERATE_DIR_NAME)); job.setInputFormat(InputFormat.class); job.setInputKeyClass(UTF8.class);
svn commit: r378044 - /lucene/nutch/trunk/lib/hadoop-0.1-dev.jar
Author: cutting Date: Wed Feb 15 09:56:54 2006 New Revision: 378044 URL: http://svn.apache.org/viewcvs?rev=378044&view=rev Log: Upgrade to latest version of Hadoop. Modified: lucene/nutch/trunk/lib/hadoop-0.1-dev.jar Modified: lucene/nutch/trunk/lib/hadoop-0.1-dev.jar URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/lib/hadoop-0.1-dev.jar?rev=378044&r1=378043&r2=378044&view=diff == Binary files - no diff available.
svn commit: r376815 - /lucene/nutch/trunk/lib/hadoop-0.1-dev.jar
Author: cutting Date: Fri Feb 10 11:44:47 2006 New Revision: 376815 URL: http://svn.apache.org/viewcvs?rev=376815&view=rev Log: Update Hadoop jar. Modified: lucene/nutch/trunk/lib/hadoop-0.1-dev.jar Modified: lucene/nutch/trunk/lib/hadoop-0.1-dev.jar URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/lib/hadoop-0.1-dev.jar?rev=376815&r1=376814&r2=376815&view=diff == Binary files - no diff available.
svn commit: r376808 - in /lucene/nutch/trunk/conf: configuration.xsl hadoop-site.xml.template
Author: cutting Date: Fri Feb 10 11:31:06 2006 New Revision: 376808 URL: http://svn.apache.org/viewcvs?rev=376808&view=rev Log: Add a template for hadoop-site.xml, and the stylesheet for config files. Added: lucene/nutch/trunk/conf/configuration.xsl lucene/nutch/trunk/conf/hadoop-site.xml.template Added: lucene/nutch/trunk/conf/configuration.xsl URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/conf/configuration.xsl?rev=376808&view=auto == --- lucene/nutch/trunk/conf/configuration.xsl (added) +++ lucene/nutch/trunk/conf/configuration.xsl Fri Feb 10 11:31:06 2006 @@ -0,0 +1,24 @@ + +http://www.w3.org/1999/XSL/Transform"; version="1.0"> + + + + + + + name + value + description + + + + + + + + + + + + + Added: lucene/nutch/trunk/conf/hadoop-site.xml.template URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/conf/hadoop-site.xml.template?rev=376808&view=auto == --- lucene/nutch/trunk/conf/hadoop-site.xml.template (added) +++ lucene/nutch/trunk/conf/hadoop-site.xml.template Fri Feb 10 11:31:06 2006 @@ -0,0 +1,8 @@ + + + + + + + +
svn commit: r376803 - in /lucene/nutch/trunk: build.xml lib/hadoop-0.1-dev.jar
Author: cutting Date: Fri Feb 10 11:22:15 2006 New Revision: 376803 URL: http://svn.apache.org/viewcvs?rev=376803&view=rev Log: Unpack Hadoop webapps from jar so that they can be used. Modified: lucene/nutch/trunk/build.xml lucene/nutch/trunk/lib/hadoop-0.1-dev.jar Modified: lucene/nutch/trunk/build.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/build.xml?rev=376803&r1=376802&r2=376803&view=diff == --- lucene/nutch/trunk/build.xml (original) +++ lucene/nutch/trunk/build.xml Fri Feb 10 11:22:15 2006 @@ -62,6 +62,13 @@ + + + + + + + @@ -414,6 +421,10 @@ + + + + Modified: lucene/nutch/trunk/lib/hadoop-0.1-dev.jar URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/lib/hadoop-0.1-dev.jar?rev=376803&r1=376802&r2=376803&view=diff == Binary files - no diff available.
svn commit: r376485 - in /lucene/nutch/trunk: ./ bin/ lib/ src/java/org/apache/nutch/crawl/ src/java/org/apache/nutch/fetcher/ src/java/org/apache/nutch/indexer/ src/java/org/apache/nutch/parse/ src/j
Author: cutting Date: Thu Feb 9 15:20:28 2006 New Revision: 376485 URL: http://svn.apache.org/viewcvs?rev=376485&view=rev Log: Fix for NUTCH-209. Nutch now supplies all code to remote MapReduce daemons through a job jar file. So Hadoop daemons no longer need to be restarted when Nutch code changes. Added: lucene/nutch/trunk/src/java/org/apache/nutch/util/NutchJob.java Modified: lucene/nutch/trunk/bin/nutch lucene/nutch/trunk/build.xml lucene/nutch/trunk/lib/hadoop-0.1-dev.jar lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbReader.java lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java lucene/nutch/trunk/src/java/org/apache/nutch/util/NutchConfiguration.java Modified: lucene/nutch/trunk/bin/nutch URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/bin/nutch?rev=376485&r1=376484&r2=376485&view=diff == --- lucene/nutch/trunk/bin/nutch (original) +++ lucene/nutch/trunk/bin/nutch Thu Feb 9 15:20:28 2006 @@ -82,13 +82,13 @@ CLASSPATH=${NUTCH_CONF_DIR:=$NUTCH_HOME/conf} CLASSPATH=${CLASSPATH}:$JAVA_HOME/lib/tools.jar -# for developers, add Nutch classes to CLASSPATH -if [ -d "$NUTCH_HOME/build/classes" ]; then - CLASSPATH=${CLASSPATH}:$NUTCH_HOME/build/classes -fi +# for developers, add plugins, job & test code to CLASSPATH if [ -d "$NUTCH_HOME/build/plugins" ]; then 
CLASSPATH=${CLASSPATH}:$NUTCH_HOME/build fi +for f in $NUTCH_HOME/build/nutch-*.job; do + CLASSPATH=${CLASSPATH}:$f; +done if [ -d "$NUTCH_HOME/build/test/classes" ]; then CLASSPATH=${CLASSPATH}:$NUTCH_HOME/build/test/classes fi @@ -96,14 +96,14 @@ # so that filenames w/ spaces are handled correctly in loops below IFS= -# for releases, add Nutch jar to CLASSPATH -for f in $NUTCH_HOME/nutch-*.jar; do +# for releases, add Nutch job to CLASSPATH +for f in $NUTCH_HOME/nutch-*.job; do CLASSPATH=${CLASSPATH}:$f; done # add plugins to classpath if [ -d "$NUTCH_HOME/plugins" ]; then - CLASSPATH=${CLASSPATH}:$NUTCH_HOME + CLASSPATH=${NUTCH_HOME}:${CLASSPATH} fi # add libs to CLASSPATH Modified: lucene/nutch/trunk/build.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/build.xml?rev=376485&r1=376484&r2=376485&view=diff == --- lucene/nutch/trunk/build.xml (original) +++ lucene/nutch/trunk/build.xml Thu Feb 9 15:20:28 2006 @@ -1,6 +1,6 @@ - + @@ -100,7 +100,6 @@ - @@ -119,6 +118,21 @@ + + + + + + + + + + + + + + @@ -385,7 +399,7 @@ - + @@ -402,7 +416,7 @@ - + Modified: lucene/nutch/trunk/lib/hadoop-0.1-dev.jar URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/lib/hadoop-0.1-dev.jar?rev=376485&r1=376484&r2=376485&view=diff == Binary files - no diff available. 
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java?rev=376485&r1=376484&r2=376485&view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java Thu Feb 9 15:20:28 2006 @@ -31,6 +31,7 @@ import org.apache.nutch.indexer.IndexMerger; import org.apache.nutch.indexer.Indexer; import org.apache.nutch.util.NutchConfiguration; +import org.apache.nutch.util.NutchJob; public class Crawl { public static final Logger LOG = @@ -52,7 +53,7 @@ Configuration conf = NutchConfiguration.create(); conf.addDefaultResource("crawl-tool.xml"); -JobConf job = new JobConf(conf); +JobConf job = new NutchJob(conf); File rootUrlDir = null; File dir = new File("crawl-" + getDate()); Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java URL: http://svn.apache.
svn commit: r376435 - in /lucene/nutch/trunk: lib/ src/java/org/apache/nutch/crawl/ src/java/org/apache/nutch/fetcher/ src/java/org/apache/nutch/indexer/ src/java/org/apache/nutch/parse/ src/java/org/
Author: cutting Date: Thu Feb 9 12:57:44 2006 New Revision: 376435 URL: http://svn.apache.org/viewcvs?rev=376435&view=rev Log: Updating to latest Hadoop jar, adding now-required close() methods to mapper and reducer implementations. Modified: lucene/nutch/trunk/lib/hadoop-0.1-dev.jar lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java lucene/nutch/trunk/src/java/org/apache/nutch/crawl/PartitionUrlByHost.java lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java Modified: lucene/nutch/trunk/lib/hadoop-0.1-dev.jar URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/lib/hadoop-0.1-dev.jar?rev=376435&r1=376434&r2=376435&view=diff == Binary files - no diff available. 
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java?rev=376435&r1=376434&r2=376435&view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java Thu Feb 9 12:57:44 2006 @@ -56,6 +56,7 @@ public static class CrawlDbStatMapper implements Mapper { public void configure(JobConf job) {} +public void close() {} public void map(WritableComparable key, Writable value, OutputCollector output, Reporter reporter) throws IOException { CrawlDatum cd = (CrawlDatum) value; @@ -68,6 +69,7 @@ public static class CrawlDbStatReducer implements Reducer { public void configure(JobConf job) {} +public void close() {} public void reduce(WritableComparable key, Iterator values, OutputCollector output, Reporter reporter) throws IOException { @@ -127,8 +129,8 @@ } } -public void configure(JobConf job) { -} +public void configure(JobConf job) {} +public void close() {} } public void processStatJob(String crawlDb, Configuration config) throws IOException { Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java?rev=376435&r1=376434&r2=376435&view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java Thu Feb 9 12:57:44 2006 @@ -30,6 +30,8 @@ retryMax = job.getInt("db.fetch.retry.max", 3); } + public void close() {} + public void reduce(WritableComparable key, Iterator values, OutputCollector output, Reporter reporter) throws IOException { Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java?rev=376435&r1=376434&r2=376435&view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java Thu Feb 9 12:57:44 2006 @@ -51,6 +51,8 @@ maxPerHost = job.getInt("generate.max.per.host", -1); } +public void close() {} + /** Select & invert subset due for fetch. */ public void map(WritableComparable key, Writable value, OutputCollector output, Reporter reporter) Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java?rev=376435&r1=376434&r2=376435&view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java Thu Feb 9 12:57:44 2006 @@ -48,6 +48,8 @@ this.jobConf = job; } +public void close() {} + public void map(WritableComparable key, Writable val,
svn commit: r376072 - /lucene/nutch/trunk/conf/nutch-default.xml
Author: cutting Date: Wed Feb 8 13:25:30 2006 New Revision: 376072 URL: http://svn.apache.org/viewcvs?rev=376072&view=rev Log: Restore accidentally removed file defaults. Modified: lucene/nutch/trunk/conf/nutch-default.xml Modified: lucene/nutch/trunk/conf/nutch-default.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/conf/nutch-default.xml?rev=376072&r1=376071&r2=376072&view=diff == --- lucene/nutch/trunk/conf/nutch-default.xml (original) +++ lucene/nutch/trunk/conf/nutch-default.xml Wed Feb 8 13:25:30 2006 @@ -7,6 +7,28 @@ + + + + file.content.limit + 65536 + The length limit for downloaded content, in bytes. + If this value is larger than zero, content longer than it will be + truncated; otherwise (zero or negative), no truncation at all. + + + + + file.content.ignored + true + If true, no file content will be saved during fetch. + And it is probably what we want to set most of time, since file:// URLs + are meant to be local and we can always use them directly at parsing + and indexing stages. Otherwise file contents will be saved. + !! NO IMPLEMENTED YET !! + + +
svn commit: r375704 - in /lucene/nutch/trunk/lib: jetty-5.1.4.LICENSE.txt jetty-5.1.4.jar jetty-ext/
Author: cutting Date: Tue Feb 7 13:02:46 2006 New Revision: 375704 URL: http://svn.apache.org/viewcvs?rev=375704&view=rev Log: Restoring jetty to Nutch lib: removed by mistake. Added: lucene/nutch/trunk/lib/jetty-5.1.4.LICENSE.txt - copied unchanged from r374759, lucene/hadoop/trunk/lib/jetty-5.1.4.LICENSE.txt lucene/nutch/trunk/lib/jetty-5.1.4.jar - copied unchanged from r374759, lucene/hadoop/trunk/lib/jetty-5.1.4.jar lucene/nutch/trunk/lib/jetty-ext/ - copied from r374759, lucene/hadoop/trunk/lib/jetty-ext/
svn commit: r375414 - in /lucene/nutch/trunk: bin/ build.xml lib/hadoop-0.1-dev.jar
Author: cutting Date: Mon Feb 6 15:36:01 2006 New Revision: 375414 URL: http://svn.apache.org/viewcvs?rev=375414&view=rev Log: Extract Hadoop's scripts from Hadoop's jar into bin/ directory. Modified: lucene/nutch/trunk/bin/ (props changed) lucene/nutch/trunk/build.xml lucene/nutch/trunk/lib/hadoop-0.1-dev.jar Propchange: lucene/nutch/trunk/bin/ -- --- svn:ignore (added) +++ svn:ignore Mon Feb 6 15:36:01 2006 @@ -0,0 +1,6 @@ +hadoop +hadoop-daemon.sh +hadoop-daemons.sh +slaves.sh +start-all.sh +stop-all.sh Modified: lucene/nutch/trunk/build.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/build.xml?rev=375414&r1=375413&r2=375414&view=diff == --- lucene/nutch/trunk/build.xml (original) +++ lucene/nutch/trunk/build.xml Mon Feb 6 15:36:01 2006 @@ -51,6 +51,16 @@ + + + + + + + + + + Modified: lucene/nutch/trunk/lib/hadoop-0.1-dev.jar URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/lib/hadoop-0.1-dev.jar?rev=375414&r1=375413&r2=375414&view=diff == Binary files - no diff available.
svn commit: r375333 - /lucene/nutch/nightly/nightly.properties
Author: cutting Date: Mon Feb 6 10:57:09 2006 New Revision: 375333 URL: http://svn.apache.org/viewcvs?rev=375333&view=rev Log: Updated email parameters. Modified: lucene/nutch/nightly/nightly.properties Modified: lucene/nutch/nightly/nightly.properties URL: http://svn.apache.org/viewcvs/lucene/nutch/nightly/nightly.properties?rev=375333&r1=375332&r2=375333&view=diff == --- lucene/nutch/nightly/nightly.properties (original) +++ lucene/nutch/nightly/nightly.properties Mon Feb 6 10:57:09 2006 @@ -1,5 +1,5 @@ -MailLogger.mailhost = mail.apache.org -MailLogger.from = nutch-dev@incubator.apache.org -MailLogger.failure.to = nutch-dev@incubator.apache.org +MailLogger.mailhost = localhost +MailLogger.from = nutch-dev@lucene.apache.org +MailLogger.failure.to = nutch-dev@lucene.apache.org MailLogger.failure.subject = Nutch nightly build failure MailLogger.success.notify = false
svn commit: r375326 - in /lucene/nutch/trunk: conf/hadoop-default.xml conf/mapred-default.xml.template lib/hadoop-0.1-dev.jar src/java/org/apache/nutch/crawl/Crawl.java
Author: cutting Date: Mon Feb 6 10:16:22 2006 New Revision: 375326 URL: http://svn.apache.org/viewcvs?rev=375326&view=rev Log: Remove Hadoop config files and update to latest Hadoop jar. Removed: lucene/nutch/trunk/conf/hadoop-default.xml lucene/nutch/trunk/conf/mapred-default.xml.template Modified: lucene/nutch/trunk/lib/hadoop-0.1-dev.jar lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java Modified: lucene/nutch/trunk/lib/hadoop-0.1-dev.jar URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/lib/hadoop-0.1-dev.jar?rev=375326&r1=375325&r2=375326&view=diff == Binary files - no diff available. Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java?rev=375326&r1=375325&r2=375326&view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java Mon Feb 6 10:16:22 2006 @@ -51,7 +51,7 @@ } Configuration conf = NutchConfiguration.create(); -conf.addAppResource("crawl-tool.xml"); +conf.addDefaultResource("crawl-tool.xml"); JobConf job = new JobConf(conf); File rootUrlDir = null;
svn commit: r375321 - in /lucene/nutch/trunk/src/java/org/apache/nutch: fetcher/FetcherOutput.java parse/ParseData.java parse/ParseText.java protocol/Content.java util/NutchConfiguration.java
Author: cutting Date: Mon Feb 6 09:52:30 2006 New Revision: 375321 URL: http://svn.apache.org/viewcvs?rev=375321&view=rev Log: Add aliases for some Writable classes for back-compatibility. Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherOutput.java lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseData.java lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseText.java lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java lucene/nutch/trunk/src/java/org/apache/nutch/util/NutchConfiguration.java Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherOutput.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherOutput.java?rev=375321&r1=375320&r2=375321&view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherOutput.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherOutput.java Mon Feb 6 09:52:30 2006 @@ -32,8 +32,6 @@ private ParseImpl parse; private Configuration conf; - static { WritableName.setName(FetcherOutput.class, "FetcherOutput"); } - public FetcherOutput() {} public FetcherOutput(CrawlDatum crawlDatum, Content content, Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseData.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseData.java?rev=375321&r1=375320&r2=375321&view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseData.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseData.java Mon Feb 6 09:52:30 2006 @@ -41,9 +41,6 @@ private ParseStatus status; private Configuration conf; - static { WritableName.setName(ParseData.class, "ParseData"); } - - // TODO [EMAIL PROTECTED]: should we really implement Configurable or should we add the // parameter Configuration to the default-constructor. 
NOTE: The test // TestWriteable instantiates ParseData with Class.newInstance() -> the default Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseText.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseText.java?rev=375321&r1=375320&r2=375321&view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseText.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseText.java Mon Feb 6 09:52:30 2006 @@ -37,8 +37,6 @@ this.text = text; } - static { WritableName.setName(ParseText.class, "ParseText"); } - public byte getVersion() { return VERSION; } public void readFields(DataInput in) throws IOException { Modified: lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java?rev=375321&r1=375320&r2=375321&view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java Mon Feb 6 09:52:30 2006 @@ -42,8 +42,6 @@ private boolean mimeTypeMagic; private MimeTypes mimeTypes; - static { WritableName.setName(Content.class, "Content"); } - public Content() {} public Content(String url, String base, byte[] content, String contentType, Modified: lucene/nutch/trunk/src/java/org/apache/nutch/util/NutchConfiguration.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/util/NutchConfiguration.java?rev=375321&r1=375320&r2=375321&view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/util/NutchConfiguration.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/util/NutchConfiguration.java Mon Feb 6 09:52:30 2006 @@ -17,10 +17,21 @@ package org.apache.nutch.util; import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.WritableName; /** Utility to create Hadoop [EMAIL PROTECTED] Configuration}s that include Nutch-specific * 
resources. */ public class NutchConfiguration { + + // for back-compatibility, add old aliases for these Writable classes + // this may be removed after the 0.8 release + static { +WritableName.addName(org.apache.nutch.fetcher.FetcherOutput.class, + "FetcherOutput"); +WritableName.addName(org.apache.nutch.parse.ParseData.class, "ParseData"); +WritableName.addName(org.apache.nutch.parse.ParseText.class, "ParseText"); +
svn commit: r374799 - /lucene/nutch/trunk/build.xml
Author: cutting Date: Fri Feb 3 16:55:20 2006 New Revision: 374799 URL: http://svn.apache.org/viewcvs?rev=374799&view=rev Log: Remove vestiges of mapred's webapp. Modified: lucene/nutch/trunk/build.xml Modified: lucene/nutch/trunk/build.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/build.xml?rev=374799&r1=374798&r2=374799&view=diff == --- lucene/nutch/trunk/build.xml (original) +++ lucene/nutch/trunk/build.xml Fri Feb 3 16:55:20 2006 @@ -38,7 +38,6 @@ - @@ -47,10 +46,6 @@ - - - - @@ -375,10 +370,6 @@ - - - -
svn commit: r374797 - /lucene/nutch/trunk/src/test/org/apache/nutch/pagedb/
Author: cutting Date: Fri Feb 3 16:46:45 2006 New Revision: 374797 URL: http://svn.apache.org/viewcvs?rev=374797&view=rev Log: Removing unused directory. Removed: lucene/nutch/trunk/src/test/org/apache/nutch/pagedb/
svn commit: r374796 [5/5] - in /lucene/nutch/trunk: bin/ conf/ lib/ lib/jetty-ext/ src/java/org/apache/nutch/analysis/ src/java/org/apache/nutch/clustering/ src/java/org/apache/nutch/crawl/ src/java/o
Modified: lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestOutlinkExtractor.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestOutlinkExtractor.java?rev=374796&r1=374795&r2=374796&view=diff == --- lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestOutlinkExtractor.java (original) +++ lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestOutlinkExtractor.java Fri Feb 3 16:38:32 2006 @@ -18,7 +18,8 @@ import org.apache.nutch.parse.Outlink; import org.apache.nutch.parse.OutlinkExtractor; -import org.apache.nutch.util.NutchConf; +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.util.NutchConfiguration; import junit.framework.TestCase; @@ -31,15 +32,15 @@ */ public class TestOutlinkExtractor extends TestCase { - private static NutchConf nutchConf = new NutchConf(); + private static Configuration conf = NutchConfiguration.create(); public void testGetNoOutlinks() { Outlink[] outlinks = null; -outlinks = OutlinkExtractor.getOutlinks(null, nutchConf); +outlinks = OutlinkExtractor.getOutlinks(null, conf); assertNotNull(outlinks); assertEquals(0, outlinks.length); -outlinks = OutlinkExtractor.getOutlinks("", nutchConf); +outlinks = OutlinkExtractor.getOutlinks("", conf); assertNotNull(outlinks); assertEquals(0, outlinks.length); } @@ -48,7 +49,7 @@ Outlink[] outlinks = OutlinkExtractor.getOutlinks( "Test with http://www.nutch.org/index.html is it found? " + "What about www.google.com at http://www.google.de " + -"A longer URL could be http://www.sybit.com/solutions/portals.html";, nutchConf); +"A longer URL could be http://www.sybit.com/solutions/portals.html";, conf); assertTrue("Url not found!", outlinks.length == 3); assertEquals("Wrong URL", "http://www.nutch.org/index.html";, outlinks[0].getToUrl()); @@ -60,7 +61,7 @@ Outlink[] outlinks = OutlinkExtractor.getOutlinks( "Test with http://www.nutch.org/index.html is it found? 
" + "What about www.google.com at http://www.google.de " + -"A longer URL could be http://www.sybit.com/solutions/portals.html";, "http://www.sybit.de";, nutchConf); +"A longer URL could be http://www.sybit.com/solutions/portals.html";, "http://www.sybit.de";, conf); assertTrue("Url not found!", outlinks.length == 3); assertEquals("Wrong URL", "http://www.nutch.org/index.html";, outlinks[0].getToUrl()); @@ -70,7 +71,7 @@ public void testGetOutlinksFtp() { Outlink[] outlinks = OutlinkExtractor.getOutlinks( "Test with ftp://www.nutch.org is it found? " + -"What about www.google.com at ftp://www.google.de";, nutchConf); +"What about www.google.com at ftp://www.google.de";, conf); assertTrue("Url not found!", outlinks.length >1); assertEquals("Wrong URL", "ftp://www.nutch.org/";, outlinks[0].getToUrl()); Modified: lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParseData.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParseData.java?rev=374796&r1=374795&r2=374796&view=diff == --- lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParseData.java (original) +++ lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParseData.java Fri Feb 3 16:38:32 2006 @@ -16,9 +16,12 @@ package org.apache.nutch.parse; -import org.apache.nutch.io.*; +import org.apache.hadoop.io.*; +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.util.NutchConfiguration; + +import org.apache.nutch.util.WritableTestUtils; import org.apache.nutch.protocol.ContentProperties; -import org.apache.nutch.util.NutchConf; import junit.framework.TestCase; @@ -26,7 +29,7 @@ public class TestParseData extends TestCase { - private NutchConf nutchConf = new NutchConf(); + private Configuration conf = NutchConfiguration.create(); public TestParseData(String name) { super(name); } @@ -35,8 +38,8 @@ String title = "The Foo Page"; Outlink[] outlinks = new Outlink[] { - new Outlink("http://foo.com/";, "Foo", nutchConf), - new 
Outlink("http://bar.com/";, "Bar", nutchConf) + new Outlink("http://foo.com/";, "Foo", conf), + new Outlink("http://bar.com/";, "Bar", conf) }; ContentProperties metaData = new ContentProperties(); @@ -44,9 +47,9 @@ metaData.put("Charset", "UTF-8"); ParseData r = new ParseData(ParseStatus.STATUS_SUCCESS, title, outlinks, metaData); -r.setConf(nutchConf); +r.setConf(conf); -TestWritable.testWritable(r); +WritableTestUtils.testWritable(r, conf); } } Modified: lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParseText.java URL:
svn commit: r374202 - /lucene/nutch/trunk/src/java/org/apache/nutch/mapred/TaskRunner.java
Author: cutting Date: Wed Feb 1 15:19:54 2006 New Revision: 374202 URL: http://svn.apache.org/viewcvs?rev=374202&view=rev Log: Fix NUTCH-197: job fails when jar doesn't contain a lib directory. Contributed by Owen O'Malley. Modified: lucene/nutch/trunk/src/java/org/apache/nutch/mapred/TaskRunner.java Modified: lucene/nutch/trunk/src/java/org/apache/nutch/mapred/TaskRunner.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/mapred/TaskRunner.java?rev=374202&r1=374201&r2=374202&view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/mapred/TaskRunner.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/mapred/TaskRunner.java Wed Feb 1 15:19:54 2006 @@ -78,9 +78,11 @@ if (jar != null) { // if jar exists, it into workDir runChild(new String[] { "unzip", jar}, workDir); File[] libs = new File(workDir, "lib").listFiles(); -for (int i = 0; i < libs.length; i++) { - classPath.append(sep); // add libs from jar to classpath - classPath.append(libs[i]); +if (libs != null) { + for (int i = 0; i < libs.length; i++) { +classPath.append(sep);// add libs from jar to classpath +classPath.append(libs[i]); + } } classPath.append(sep); classPath.append(new File(workDir, "classes"));
svn commit: r372810 - /lucene/nutch/trunk/bin/nutch
Author: cutting Date: Fri Jan 27 02:45:35 2006 New Revision: 372810 URL: http://svn.apache.org/viewcvs?rev=372810&view=rev Log: Explicitly specify bash, since this script requires some bash-specific features. Modified: lucene/nutch/trunk/bin/nutch Modified: lucene/nutch/trunk/bin/nutch URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/bin/nutch?rev=372810&r1=372809&r2=372810&view=diff == --- lucene/nutch/trunk/bin/nutch (original) +++ lucene/nutch/trunk/bin/nutch Fri Jan 27 02:45:35 2006 @@ -1,4 +1,4 @@ -#!/bin/sh +#!/bin/bash # # The Nutch command script #
svn commit: r372342 - /lucene/nutch/nightly/nightly.sh
Author: cutting Date: Wed Jan 25 14:20:06 2006 New Revision: 372342 URL: http://svn.apache.org/viewcvs?rev=372342&view=rev Log: Fix remove command. Modified: lucene/nutch/nightly/nightly.sh Modified: lucene/nutch/nightly/nightly.sh URL: http://svn.apache.org/viewcvs/lucene/nutch/nightly/nightly.sh?rev=372342&r1=372341&r2=372342&view=diff == --- lucene/nutch/nightly/nightly.sh (original) +++ lucene/nutch/nightly/nightly.sh Wed Jan 25 14:20:06 2006 @@ -25,4 +25,4 @@ scp build/*.tar.gz $REL_SERVER:$REL_DIR/nutch-`/bin/date +%F`.tar.gz # remove all but five newest builds -ssh $REL_SERVER rm `ssh $REL_SERVER find -type f $REL_DIR | sort -r | tail +5` +ssh $REL_SERVER rm `ssh $REL_SERVER find $REL_DIR -type f | sort -r | tail +5`
svn commit: r372315 - /lucene/nutch/nightly/nightly.sh
Author: cutting Date: Wed Jan 25 13:12:13 2006 New Revision: 372315 URL: http://svn.apache.org/viewcvs?rev=372315&view=rev Log: Fix deletion of old versions. Modified: lucene/nutch/nightly/nightly.sh Modified: lucene/nutch/nightly/nightly.sh URL: http://svn.apache.org/viewcvs/lucene/nutch/nightly/nightly.sh?rev=372315&r1=372314&r2=372315&view=diff == --- lucene/nutch/nightly/nightly.sh (original) +++ lucene/nutch/nightly/nightly.sh Wed Jan 25 13:12:13 2006 @@ -24,5 +24,5 @@ # release it scp build/*.tar.gz $REL_SERVER:$REL_DIR/nutch-`/bin/date +%F`.tar.gz -# remove old release -ssh $REL_SERVER rm -rf $REL_DIR/nutch-`/bin/date -d'week ago' +%F`.tar.gz +# remove all but five newest builds +ssh $REL_SERVER rm `ssh $REL_SERVER find -type f $REL_DIR | sort -r | tail +5`
svn commit: r370657 - in /lucene/nutch/nightly: nightly.cron nightly.properties nightly.sh
Author: cutting Date: Thu Jan 19 14:46:28 2006 New Revision: 370657 URL: http://svn.apache.org/viewcvs?rev=370657&view=rev Log: Moving nightly build to lucene.zones.apache.org. Modified: lucene/nutch/nightly/nightly.cron lucene/nutch/nightly/nightly.properties lucene/nutch/nightly/nightly.sh Modified: lucene/nutch/nightly/nightly.cron URL: http://svn.apache.org/viewcvs/lucene/nutch/nightly/nightly.cron?rev=370657&r1=370656&r2=370657&view=diff == --- lucene/nutch/nightly/nightly.cron (original) +++ lucene/nutch/nightly/nightly.cron Thu Jan 19 14:46:28 2006 @@ -1,4 +1,4 @@ # nightly crontab file # install with: crontab nightly.cron # run seventeen minutes after midnight, every day -17 0 * * * $HOME/src/nutch/nightly/nightly.sh > $HOME/src/nutch/nightly/nightly.log 2>&1 +17 0 * * * $HOME/nutch-nightly/nightly.sh > $HOME/nutch-nightly/nightly.log 2>&1 Modified: lucene/nutch/nightly/nightly.properties URL: http://svn.apache.org/viewcvs/lucene/nutch/nightly/nightly.properties?rev=370657&r1=370656&r2=370657&view=diff == --- lucene/nutch/nightly/nightly.properties (original) +++ lucene/nutch/nightly/nightly.properties Thu Jan 19 14:46:28 2006 @@ -1,4 +1,4 @@ -MailLogger.mailhost = smtp.sbcglobal.net +MailLogger.mailhost = mail.apache.org MailLogger.from = nutch-dev@incubator.apache.org MailLogger.failure.to = nutch-dev@incubator.apache.org MailLogger.failure.subject = Nutch nightly build failure Modified: lucene/nutch/nightly/nightly.sh URL: http://svn.apache.org/viewcvs/lucene/nutch/nightly/nightly.sh?rev=370657&r1=370656&r2=370657&view=diff == --- lucene/nutch/nightly/nightly.sh (original) +++ lucene/nutch/nightly/nightly.sh Thu Jan 19 14:46:28 2006 @@ -1,6 +1,6 @@ #!/bin/bash -vx -export JAVA_HOME=$HOME/local/j2sdk1.4.2 +export JAVA_HOME=/usr/j2se TRUNK=http://svn.apache.org/repos/asf/lucene/nutch/trunk @@ -12,12 +12,12 @@ cd /tmp # export sources into it -svn export $TRUNK nutch-nightly +$HOME/bin/svn export $TRUNK nutch-nightly # run build cd nutch-nightly 
-$HOME/local/ant/bin/ant \ - -propertyfile $HOME/src/nutch/nightly/nightly.properties \ +$HOME/bin/ant \ + -propertyfile $HOME/nutch-nightly/nightly.properties \ -logger org.apache.tools.ant.listener.MailLogger \ -Dversion=nightly nightly
svn commit: r370638 - /lucene/nutch/trunk/conf/nutch-default.xml
Author: cutting Date: Thu Jan 19 13:24:58 2006 New Revision: 370638 URL: http://svn.apache.org/viewcvs?rev=370638&view=rev Log: Document a few more properties. Contributed by Dominik Friedrich. Modified: lucene/nutch/trunk/conf/nutch-default.xml Modified: lucene/nutch/trunk/conf/nutch-default.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/conf/nutch-default.xml?rev=370638&r1=370637&r2=370638&view=diff == --- lucene/nutch/trunk/conf/nutch-default.xml (original) +++ lucene/nutch/trunk/conf/nutch-default.xml Thu Jan 19 13:24:58 2006 @@ -379,6 +379,14 @@ exception. + + io.map.index.skip + 0 + Number of index entries to skip between each entry. + Zero by default. Setting this to values larger than zero can + facilitate opening large map files using less memory. + + @@ -412,6 +420,14 @@ directories, typically on different devices. + + ndfs.replication + 3 + How many copies we try to have at all times. The actual + number of replications is at max the number of datanodes in the + cluster. + + @@ -509,6 +525,13 @@ 200m The heap size (-Xmx) that will be used for task tracker child processes. + + + + mapred.combine.buffer.size + 10 + The number of entries the combining collector caches before + combining them and writing to disk.
svn commit: r370632 - /lucene/nutch/trunk/conf/nutch-default.xml
Author: cutting Date: Thu Jan 19 12:58:54 2006 New Revision: 370632 URL: http://svn.apache.org/viewcvs?rev=370632&view=rev Log: Switch default to protocol-http, since it seems more reliable than protocol-httpclient. Modified: lucene/nutch/trunk/conf/nutch-default.xml Modified: lucene/nutch/trunk/conf/nutch-default.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/conf/nutch-default.xml?rev=370632&r1=370631&r2=370632&view=diff == --- lucene/nutch/trunk/conf/nutch-default.xml (original) +++ lucene/nutch/trunk/conf/nutch-default.xml Thu Jan 19 12:58:54 2006 @@ -733,7 +733,7 @@ plugin.includes - protocol-httpclient|urlfilter-regex|parse-(text|html|js)|index-basic|query-(basic|site|url) + protocol-http|urlfilter-regex|parse-(text|html|js)|index-basic|query-(basic|site|url) Regular expression naming plugin directory names to include. Any plugin not matching this expression is excluded. In any case you need at least include the nutch-extensionpoints plugin. By
svn commit: r370281 - /lucene/nutch/trunk/build.xml
Author: cutting Date: Wed Jan 18 14:03:28 2006 New Revision: 370281 URL: http://svn.apache.org/viewcvs?rev=370281&view=rev Log: Fix NUTCH-102: include webapps in packaged releases. Modified: lucene/nutch/trunk/build.xml Modified: lucene/nutch/trunk/build.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/build.xml?rev=370281&r1=370280&r2=370281&view=diff == --- lucene/nutch/trunk/build.xml (original) +++ lucene/nutch/trunk/build.xml Wed Jan 18 14:03:28 2006 @@ -377,6 +377,10 @@ + + + +
svn commit: r367408 - /lucene/nutch/trunk/src/plugin/urlfilter-regex/src/java/org/apache/nutch/net/RegexURLFilter.java
Author: cutting Date: Mon Jan 9 13:55:31 2006 New Revision: 367408 URL: http://svn.apache.org/viewcvs?rev=367408&view=rev Log: NUTCH-160: Switch RegexURLFilter to use Java regex's rather than oro, since Java's seem to be faster & more reliable. By Rod Taylor. Modified: lucene/nutch/trunk/src/plugin/urlfilter-regex/src/java/org/apache/nutch/net/RegexURLFilter.java Modified: lucene/nutch/trunk/src/plugin/urlfilter-regex/src/java/org/apache/nutch/net/RegexURLFilter.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/urlfilter-regex/src/java/org/apache/nutch/net/RegexURLFilter.java?rev=367408&r1=367407&r2=367408&view=diff == --- lucene/nutch/trunk/src/plugin/urlfilter-regex/src/java/org/apache/nutch/net/RegexURLFilter.java (original) +++ lucene/nutch/trunk/src/plugin/urlfilter-regex/src/java/org/apache/nutch/net/RegexURLFilter.java Mon Jan 9 13:55:31 2006 @@ -32,12 +32,7 @@ import java.util.ArrayList; import java.util.Iterator; import java.util.logging.Logger; - -import org.apache.oro.text.regex.Perl5Compiler; -import org.apache.oro.text.regex.Perl5Matcher; -import org.apache.oro.text.regex.Perl5Pattern; -import org.apache.oro.text.regex.PatternMatcher; -import org.apache.oro.text.regex.MalformedPatternException; +import java.util.regex.*; /** * Filters URLs based on a file of regular expressions. 
The file is named by @@ -80,15 +75,14 @@ } private static class Rule { -public Perl5Pattern pattern; +public Pattern pattern; public boolean sign; public String regex; } private List rules; - private PatternMatcher matcher = new Perl5Matcher(); - public RegexURLFilter() throws IOException, MalformedPatternException { + public RegexURLFilter() throws IOException, PatternSyntaxException { String file = NutchConf.get().get("urlfilter.regex.file"); // attribute "file" takes precedence if defined if (attributeFile != null) @@ -103,7 +97,7 @@ } public RegexURLFilter(String filename) -throws IOException, MalformedPatternException { +throws IOException, PatternSyntaxException { rules = readConfigurationFile(new FileReader(filename)); } @@ -111,7 +105,9 @@ Iterator i=rules.iterator(); while(i.hasNext()) { Rule r=(Rule) i.next(); - if (matcher.contains(url,r.pattern)) { + Matcher matcher = r.pattern.matcher(url); + + if (matcher.find()) { //System.out.println("Matched " + r.regex); return r.sign ? url : null; } @@ -129,10 +125,9 @@ // private static List readConfigurationFile(Reader reader) -throws IOException, MalformedPatternException { +throws IOException, PatternSyntaxException { BufferedReader in=new BufferedReader(reader); -Perl5Compiler compiler=new Perl5Compiler(); List rules=new ArrayList(); String line; @@ -157,7 +152,7 @@ String regex=line.substring(1); Rule rule=new Rule(); - rule.pattern=(Perl5Pattern) compiler.compile(regex); + rule.pattern=Pattern.compile(regex); rule.sign=sign; rule.regex=regex; rules.add(rule); @@ -167,7 +162,7 @@ } public static void main(String args[]) -throws IOException, MalformedPatternException { +throws IOException, PatternSyntaxException { RegexURLFilter filter=new RegexURLFilter(); BufferedReader in=new BufferedReader(new InputStreamReader(System.in));
svn commit: r367406 - in /lucene/nutch/trunk/src: java/org/apache/nutch/ipc/RPC.java test/org/apache/nutch/ipc/TestRPC.java
Author: cutting Date: Mon Jan 9 13:50:48 2006 New Revision: 367406 URL: http://svn.apache.org/viewcvs?rev=367406&view=rev Log: Fix parallel RPC calls to work correctly with methods that return void. Modified: lucene/nutch/trunk/src/java/org/apache/nutch/ipc/RPC.java lucene/nutch/trunk/src/test/org/apache/nutch/ipc/TestRPC.java Modified: lucene/nutch/trunk/src/java/org/apache/nutch/ipc/RPC.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/ipc/RPC.java?rev=367406&r1=367405&r2=367406&view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/ipc/RPC.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/ipc/RPC.java Mon Jan 9 13:50:48 2006 @@ -149,6 +149,10 @@ Writable[] wrappedValues = CLIENT.call(invocations, addrs); +if (method.getReturnType() == Void.TYPE) { + return null; +} + Object[] values = (Object[])Array.newInstance(method.getReturnType(),wrappedValues.length); for (int i = 0; i < values.length; i++) Modified: lucene/nutch/trunk/src/test/org/apache/nutch/ipc/TestRPC.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/test/org/apache/nutch/ipc/TestRPC.java?rev=367406&r1=367405&r2=367406&view=diff == --- lucene/nutch/trunk/src/test/org/apache/nutch/ipc/TestRPC.java (original) +++ lucene/nutch/trunk/src/test/org/apache/nutch/ipc/TestRPC.java Mon Jan 9 13:50:48 2006 @@ -110,13 +110,17 @@ } assertTrue(caught); -// try a multi-call -Method method = +// try some multi-calls +Method echo = TestProtocol.class.getMethod("echo", new Class[] { String.class }); -String[] values = (String[])RPC.call(method, new String[][]{{"a"},{"b"}}, +String[] strings = (String[])RPC.call(echo, new String[][]{{"a"},{"b"}}, new InetSocketAddress[] {addr, addr}); -assertTrue(Arrays.equals(values, new String[]{"a","b"})); +assertTrue(Arrays.equals(strings, new String[]{"a","b"})); +Method ping = TestProtocol.class.getMethod("ping", new Class[] {}); +Object[] voids = (Object[])RPC.call(ping, new Object[][]{{},{}}, +new 
InetSocketAddress[] {addr, addr}); +assertEquals(voids, null); server.stop(); }
svn commit: r366573 - /lucene/nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java
Author: cutting Date: Fri Jan 6 13:42:25 2006 New Revision: 366573 URL: http://svn.apache.org/viewcvs?rev=366573&view=rev Log: Fix for NUTCH-150, by Paul Baclace. 1. Adds a comment that non-plain-text can be a problem. 2. Adds quantifiers to the regular expression to limit length of matched text. 3. Monitors the time spent doing matching and if more than 60 seconds, it will stop looking for additional matches (this does not prevent the first lengthy match). Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java?rev=366573&r1=366572&r2=366573&view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java Fri Jan 6 13:42:25 2006 @@ -55,11 +55,12 @@ * */ private static final String URL_PATTERN = - "([A-Za-z][A-Za-z0-9+.-]+:[A-Za-z0-9/](([A-Za-z0-9$_.+!*,;/?:@&~=-])|%[A-Fa-f0-9]{2})+(#([a-zA-Z0-9][a-zA-Z0-9$_.+!*,;/?:@&~=%-]*))?)"; + "([A-Za-z][A-Za-z0-9+.-]{1,120}:[A-Za-z0-9/](([A-Za-z0-9$_.+!*,;/?:@&~=-])|%[A-Fa-f0-9]{2}){1,333}(#([a-zA-Z0-9][a-zA-Z0-9$_.+!*,;/?:@&~=%-]{0,1000}))?)"; /** * Extracts Outlink from given plain text. - * + * Applying this method to non-plain-text can result in extremely lengthy + * runtimes for parasitic cases (postscript is a known example). * @param plainText the plain text from wich URLs should be extracted. 
* * @return Array of Outlinks within found in plainText @@ -78,7 +79,7 @@ * @return Array of Outlinks within found in plainText */ public static Outlink[] getOutlinks(final String plainText, String anchor) { - +long start = System.currentTimeMillis(); final List outlinks = new ArrayList(); try { @@ -95,13 +96,19 @@ //loop the matches while (matcher.contains(input, pattern)) { +// if this is taking too long, stop matching +// (SHOULD really check cpu time used so that heavily loaded systems +// do not unnecessarily hit this limit.) +if (System.currentTimeMillis() - start >= 60000L) { + LOG.warning("Time limit exceeded for getOutLinks"); + break; +} result = matcher.getMatch(); url = result.group(0); outlinks.add(new Outlink(url, anchor)); } } catch (Exception ex) { - // if it is a malformed URL we just throw it away and continue with - // extraction. + // if the matcher fails (perhaps a malformed URL) we just log it and move on LOG.throwing(OutlinkExtractor.class.getName(), "getOutlinks", ex); }
svn commit: r366571 - /lucene/nutch/trunk/src/java/org/apache/nutch/util/CommandRunner.java
Author: cutting Date: Fri Jan 6 13:35:35 2006 New Revision: 366571 URL: http://svn.apache.org/viewcvs?rev=366571&view=rev Log: Fix for NUTCH-151: CommandRunner can hang after the main thread exec is finished and has inefficient busy loop. I encountered a case where the JVM of a Tasktracker child did not exit after the main thread returned; a thread dump showed only the threads named STDOUT and STDERR from CommandRunner as non-daemon threads, and both were doing a read(). CommandRunner also had an excessively costly busy loop. These problems were fixed by: 1. The pipe io threads should be daemons. 2. The main thread should always interrupt() the pipe io threads when finishing up, not just when a timeout occurs. 3. Sleep before testing whether the process has finished with Process.exitValue(). 4. Increased the sleep time to be 1000msec. By Paul Baclace. Modified: lucene/nutch/trunk/src/java/org/apache/nutch/util/CommandRunner.java Modified: lucene/nutch/trunk/src/java/org/apache/nutch/util/CommandRunner.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/util/CommandRunner.java?rev=366571&r1=366570&r2=366571&view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/util/CommandRunner.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/util/CommandRunner.java Fri Jan 6 13:35:35 2006 @@ -18,6 +18,7 @@ * Adopted by John Xing for Nutch Project from * http://blog.fivesight.com/prb/space/Call+an+External+Command+from+Java/, * which explains the code in detail. + * [Original author is moving his site to http://mult.ifario.us/ -peb] * * Comments by John Xing on 20040621: * (1) EDU.oswego.cs.dl.util.concurrent.* is in j2sdk 1.5 now. 
@@ -31,6 +32,7 @@ import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; +import java.io.InterruptedIOException; import EDU.oswego.cs.dl.util.concurrent.BrokenBarrierException; import EDU.oswego.cs.dl.util.concurrent.CyclicBarrier; @@ -80,40 +82,47 @@ } public void evaluate() throws IOException { -Process proc = Runtime.getRuntime().exec(_command); + this.exec(); + } + /** + * + * @return process exit value (return code) or -1 if timed out. + * @throws IOException + */ + public int exec() throws IOException { +Process proc = Runtime.getRuntime().exec(_command); _barrier = new CyclicBarrier(3 + ((_stdin != null) ? 1 : 0)); PullerThread so = new PullerThread("STDOUT", proc.getInputStream(), _stdout); +so.setDaemon(true); so.start(); PullerThread se = new PullerThread("STDERR", proc.getErrorStream(), _stderr); +se.setDaemon(true); se.start(); PusherThread si = null; if (_stdin != null) { si = new PusherThread("STDIN", _stdin, proc.getOutputStream()); + si.setDaemon(true); si.start(); } boolean _timedout = false; long end = System.currentTimeMillis() + _timeout * 1000; +// try { if (_timeout == 0) { -_barrier.barrier(); +_barrier.barrier(); // JDK 1.5: // _barrier.await(); } else { -_barrier.attemptBarrier(_timeout * 1000); +_barrier.attemptBarrier(_timeout * 1000); // JDK 1.5: // _barrier.await(_timeout, TimeUnit.SECONDS); } } catch (TimeoutException ex) { _timedout = true; - if (si != null) { -si.interrupt(); - } - so.interrupt(); - se.interrupt(); if (_destroyOnTimeout) { proc.destroy(); } @@ -123,16 +132,27 @@ /* IGNORE */ } +// tell the io threads we are finished +if (si != null) { + si.interrupt(); +} +so.interrupt(); +se.interrupt(); + _xit = -1; if (!_timedout) { if (_waitForExit) { do { try { +Thread.sleep(1000); _xit = proc.exitValue(); -Thread.sleep(250); } catch (InterruptedException ie) { -/* IGNORE */ + if (Thread.interrupted()) { + break; // stop waiting on an interrupt for this thread + } else { + continue; + } } catch 
(IllegalThreadStateException iltse) { continue; } @@ -152,6 +172,7 @@ proc.destroy(); } } +return _xit; } public Throwable getThrownError() { @@ -163,8 +184,6 @@ private OutputStream _os; private InputStream _is; -private volatile boolean _kaput; - private boolean _closeInput; protected PumperThread( @@ -179,7 +198,6 @@ } public void run() { - _kaput = false; try { byte[] buf = new byte[BUF]; int read = 0; @@ -189,9 +207,10 @@ _os.write(buf, 0, read); _os.flush(); } + } catch (Inte
svn commit: r366550 - /lucene/nutch/trunk/src/java/org/apache/nutch/ipc/Client.java
Author: cutting Date: Fri Jan 6 11:14:46 2006 New Revision: 366550 URL: http://svn.apache.org/viewcvs?rev=366550&view=rev Log: Make it clearer why this optimization is valid. For Stefan. Modified: lucene/nutch/trunk/src/java/org/apache/nutch/ipc/Client.java Modified: lucene/nutch/trunk/src/java/org/apache/nutch/ipc/Client.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/ipc/Client.java?rev=366550&r1=366549&r2=366550&view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/ipc/Client.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/ipc/Client.java Fri Jan 6 11:14:46 2006 @@ -306,7 +306,7 @@ * contains nulls for calls that timed out or errored. */ public Writable[] call(Writable[] params, InetSocketAddress[] addresses) throws IOException { -if (params.length == 0) return new Writable[0]; +if (addresses.length == 0) return new Writable[0]; ParallelResults results = new ParallelResults(params.length); synchronized (results) {
svn commit: r366322 - /lucene/nutch/trunk/src/java/org/apache/nutch/searcher/LuceneQueryOptimizer.java
Author: cutting Date: Thu Jan 5 14:37:19 2006 New Revision: 366322 URL: http://svn.apache.org/viewcvs?rev=366322&view=rev Log: Fix a bug in LimitedCollector. Modified: lucene/nutch/trunk/src/java/org/apache/nutch/searcher/LuceneQueryOptimizer.java Modified: lucene/nutch/trunk/src/java/org/apache/nutch/searcher/LuceneQueryOptimizer.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/searcher/LuceneQueryOptimizer.java?rev=366322&r1=366321&r2=366322&view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/searcher/LuceneQueryOptimizer.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/searcher/LuceneQueryOptimizer.java Thu Jan 5 14:37:19 2006 @@ -48,7 +48,7 @@ private int maxHits; public LimitedCollector(int numHits, int maxHits) { - super(maxHits); + super(numHits); this.maxHits = maxHits; }
svn commit: r366280 - /lucene/nutch/trunk/conf/nutch-default.xml
Author: cutting Date: Thu Jan 5 13:08:27 2006 New Revision: 366280 URL: http://svn.apache.org/viewcvs?rev=366280&view=rev Log: Fix NUTCH-131: add mapred.child.heap.size. From Marko Bauhardt. Modified: lucene/nutch/trunk/conf/nutch-default.xml Modified: lucene/nutch/trunk/conf/nutch-default.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/conf/nutch-default.xml?rev=366280&r1=366279&r2=366280&view=diff == --- lucene/nutch/trunk/conf/nutch-default.xml (original) +++ lucene/nutch/trunk/conf/nutch-default.xml Thu Jan 5 13:08:27 2006 @@ -504,6 +504,13 @@ + + mapred.child.heap.size + 200m + The heap size (-Xmx) that will be used for task tracker + child processes. + +
svn commit: r366271 - /lucene/nutch/trunk/src/java/org/apache/nutch/mapred/TaskTracker.java
Author: cutting Date: Thu Jan 5 12:13:43 2006 New Revision: 366271 URL: http://svn.apache.org/viewcvs?rev=366271&view=rev Log: Fix for NUTCH-108: eliminate voluminous messages when reconnecting. From Paul Baclace. Modified: lucene/nutch/trunk/src/java/org/apache/nutch/mapred/TaskTracker.java Modified: lucene/nutch/trunk/src/java/org/apache/nutch/mapred/TaskTracker.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/mapred/TaskTracker.java?rev=366271&r1=366270&r2=366271&view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/mapred/TaskTracker.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/mapred/TaskTracker.java Thu Jan 5 12:13:43 2006 @@ -287,8 +287,7 @@ staleState = true; } } catch (Exception ex) { -ex.printStackTrace(); -LOG.info("Lost connection to JobTracker [" + jobTrackAddr + "]. Retrying..."); +LOG.info("Lost connection to JobTracker [" + jobTrackAddr + "]. ex=" + ex + " Retrying..."); try { Thread.sleep(5000); } catch (InterruptedException ie) {
svn commit: r366242 - in /lucene/nutch/trunk: conf/nutch-default.xml src/java/org/apache/nutch/searcher/LuceneQueryOptimizer.java
Author: cutting Date: Thu Jan 5 10:38:44 2006 New Revision: 366242 URL: http://svn.apache.org/viewcvs?rev=366242&view=rev Log: Fix NegativeArraySizeException. Modified: lucene/nutch/trunk/conf/nutch-default.xml lucene/nutch/trunk/src/java/org/apache/nutch/searcher/LuceneQueryOptimizer.java Modified: lucene/nutch/trunk/conf/nutch-default.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/conf/nutch-default.xml?rev=366242&r1=366241&r2=366242&view=diff == --- lucene/nutch/trunk/conf/nutch-default.xml (original) +++ lucene/nutch/trunk/conf/nutch-default.xml Thu Jan 5 10:38:44 2006 @@ -661,10 +661,11 @@ searcher.max.hits - 2147483647 - Search stops after this many hits are found. Setting - this to smaller values can make searches much faster. With a sorted - index, the quality of the hits suffers little. + -1 + If positive, search stops after this many hits are + found. Setting this to small, positive values (e.g., 1000) can make + searches much faster. With a sorted index, the quality of the hits + suffers little. Modified: lucene/nutch/trunk/src/java/org/apache/nutch/searcher/LuceneQueryOptimizer.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/searcher/LuceneQueryOptimizer.java?rev=366242&r1=366241&r2=366242&view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/searcher/LuceneQueryOptimizer.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/searcher/LuceneQueryOptimizer.java Thu Jan 5 10:38:44 2006 @@ -37,8 +37,7 @@ * which do not affect ranking but might otherwise slow search considerably. 
*/ class LuceneQueryOptimizer { - private static int MAX_HITS = -NutchConf.get().getInt("searcher.max.hits", Integer.MAX_VALUE); + private static int MAX_HITS = NutchConf.get().getInt("searcher.max.hits",-1); private static class LimitExceeded extends RuntimeException { private int maxDoc; @@ -150,6 +149,13 @@ } } if (sortField == null && !reverse) { + + // no hit limit + if (MAX_HITS <= 0) { +return searcher.search(query, filter, numHits); + } + + // hits limited -- use a LimitedCollector LimitedCollector collector = new LimitedCollector(numHits, MAX_HITS); LimitExceeded exceeded = null; try {
svn commit: r365459 - in /lucene/nutch/trunk: conf/nutch-default.xml src/java/org/apache/nutch/indexer/IndexSorter.java src/java/org/apache/nutch/searcher/LuceneQueryOptimizer.java
Author: cutting Date: Mon Jan 2 15:27:50 2006 New Revision: 365459 URL: http://svn.apache.org/viewcvs?rev=365459&view=rev Log: Add index sorter & ability to stop searching after N hits. Added: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSorter.java Modified: lucene/nutch/trunk/conf/nutch-default.xml lucene/nutch/trunk/src/java/org/apache/nutch/searcher/LuceneQueryOptimizer.java Modified: lucene/nutch/trunk/conf/nutch-default.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/conf/nutch-default.xml?rev=365459&r1=365458&r2=365459&view=diff == --- lucene/nutch/trunk/conf/nutch-default.xml (original) +++ lucene/nutch/trunk/conf/nutch-default.xml Mon Jan 2 15:27:50 2006 @@ -659,6 +659,14 @@ + + searcher.max.hits + 2147483647 + Search stops after this many hits are found. Setting + this to smaller values can make searches much faster. With a sorted + index, the quality of the hits suffers little. + + Added: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSorter.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSorter.java?rev=365459&view=auto == --- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSorter.java (added) +++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSorter.java Mon Jan 2 15:27:50 2006 @@ -0,0 +1,295 @@ +/** + * Copyright 2005 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.nutch.indexer; + +import java.io.File; +import java.io.IOException; +import java.util.Date; +import java.util.Arrays; + +import org.apache.lucene.index.*; +import org.apache.lucene.document.*; +import org.apache.lucene.store.*; +import org.apache.lucene.search.*; + +import org.apache.nutch.util.NutchConf; + +/** Sort a Nutch index by page score. Higher scoring documents are assigned + * smaller document numbers. */ +public class IndexSorter { + + private static class PostingMap implements Comparable { +private int newDoc; +private long offset; + +public int compareTo(Object o) { // order by newDoc id + return this.newDoc - ((PostingMap)o).newDoc; +} + } + + private static class SortedTermPositions implements TermPositions { +private TermPositions original; +private int[] oldToNew; + +private int docFreq; + +private PostingMap[] postingMaps = new PostingMap[0]; +private int pointer; + +private int freq; +private int position; + +private static final String TEMP_FILE = "temp"; +private final RAMDirectory tempDir = new RAMDirectory(); +private final RAMOutputStream out = + (RAMOutputStream)tempDir.createOutput(TEMP_FILE); +private IndexInput in; + +public SortedTermPositions(TermPositions original, int[] oldToNew) { + this.original = original; + this.oldToNew = oldToNew; +} + +public void seek(Term term) throws IOException { + throw new UnsupportedOperationException(); +} + +public void seek(TermEnum terms) throws IOException { + original.seek(terms); + + docFreq = terms.docFreq(); + pointer = -1; + + if (docFreq > postingMaps.length) { // grow postingsMap +PostingMap[] newMap = new PostingMap[docFreq]; +System.arraycopy(postingMaps, 0, newMap, 0, postingMaps.length); +for (int i = postingMaps.length; i < docFreq; i++) { + newMap[i] = new PostingMap(); +} +postingMaps = newMap; + } + + out.reset(); + + int i = 0; + while (original.next()) { +PostingMap map = postingMaps[i++]; +map.newDoc = oldToNew[original.doc()];// remap the newDoc id 
+map.offset = out.getFilePointer();// save pointer to buffer + +final int tf = original.freq(); // buffer tf & positions +out.writeVInt(tf); +int prevPosition = 0; +for (int j = tf; j > 0; j--) {// delta encode positions + int p = original.nextPosition(); + out.writeVInt(p - prevPosition); + prevPosition = p; +} + } + out.flush(); + docFreq = i;// allow for deletions + + Arra
svn commit: r365454 - /lucene/nutch/trunk/src/test/org/apache/nutch/ipc/TestIPC.java
Author: cutting Date: Mon Jan 2 14:33:38 2006 New Revision: 365454 URL: http://svn.apache.org/viewcvs?rev=365454&view=rev Log: Revert unintended commit. Modified: lucene/nutch/trunk/src/test/org/apache/nutch/ipc/TestIPC.java Modified: lucene/nutch/trunk/src/test/org/apache/nutch/ipc/TestIPC.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/test/org/apache/nutch/ipc/TestIPC.java?rev=365454&r1=365453&r2=365454&view=diff == --- lucene/nutch/trunk/src/test/org/apache/nutch/ipc/TestIPC.java (original) +++ lucene/nutch/trunk/src/test/org/apache/nutch/ipc/TestIPC.java Mon Jan 2 14:33:38 2006 @@ -35,8 +35,6 @@ public static final Logger LOG = LogFormatter.getLogger("org.apache.nutch.ipc.TestIPC"); - private static final int TIMEOUT = 1; - // quiet during testing, since output ends up on console static { LOG.setLevel(Level.WARNING); @@ -44,6 +42,8 @@ Server.LOG.setLevel(Level.WARNING); } + public TestIPC(String name) { super(name); } + private static final Random RANDOM = new Random(); private static final int PORT = 1234; @@ -53,7 +53,7 @@ public TestServer(int port, int handlerCount, boolean sleep) { super(port, LongWritable.class, handlerCount); - this.setTimeout(TIMEOUT); + this.setTimeout(1000); this.sleep = sleep; } @@ -75,7 +75,7 @@ public SerialCaller(Client client, int count) { this.client = client; this.count = count; - client.setTimeout(TIMEOUT); + client.setTimeout(1000); } public void run() { @@ -108,7 +108,7 @@ this.client = client; this.addresses = addresses; this.count = count; - client.setTimeout(TIMEOUT); + client.setTimeout(1000); } public void run() { @@ -157,10 +157,10 @@ callers[i].join(); assertFalse(callers[i].failed); } -// for (int i = 0; i < clientCount; i++) { -// clients[i].stop(); -// } -// server.stop(); +for (int i = 0; i < clientCount; i++) { + clients[i].stop(); +} +server.stop(); } public void testParallel() throws Exception { @@ -207,17 +207,14 @@ public static void main(String[] args) throws Exception { // crank up the 
volume! -// LOG.setLevel(Level.INFO); -// Client.LOG.setLevel(Level.INFO); -// Server.LOG.setLevel(Level.INFO); -// LogFormatter.setShowThreadIDs(true); - - new TestIPC().testSerial(10, false, 500, 500, 100); -//new TestIPC().testParallel(10, false, 2, 4, 2, 4, 1000); - -// TestIPC test = new TestIPC(); -// test.testSerial(); -// test.testParallel(); +LOG.setLevel(Level.FINE); +Client.LOG.setLevel(Level.FINE); +Server.LOG.setLevel(Level.FINE); +LogFormatter.setShowThreadIDs(true); + +//new TestIPC("test").testSerial(5, false, 2, 10, 1000); + +new TestIPC("test").testParallel(10, false, 2, 4, 2, 4, 1000); }
svn commit: r365450 - in /lucene/nutch/trunk: lib/lucene-1.9-rc1-dev.jar lib/lucene-core-1.9-rc1-dev.jar src/test/org/apache/nutch/ipc/TestIPC.java
Author: cutting Date: Mon Jan 2 14:08:50 2006 New Revision: 365450 URL: http://svn.apache.org/viewcvs?rev=365450&view=rev Log: New version of Lucene that includes TopDocCollector. Added: lucene/nutch/trunk/lib/lucene-core-1.9-rc1-dev.jar (with props) Removed: lucene/nutch/trunk/lib/lucene-1.9-rc1-dev.jar Modified: lucene/nutch/trunk/src/test/org/apache/nutch/ipc/TestIPC.java Added: lucene/nutch/trunk/lib/lucene-core-1.9-rc1-dev.jar URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/lib/lucene-core-1.9-rc1-dev.jar?rev=365450&view=auto == Binary file - no diff available. Propchange: lucene/nutch/trunk/lib/lucene-core-1.9-rc1-dev.jar -- svn:mime-type = application/octet-stream Modified: lucene/nutch/trunk/src/test/org/apache/nutch/ipc/TestIPC.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/test/org/apache/nutch/ipc/TestIPC.java?rev=365450&r1=365449&r2=365450&view=diff == --- lucene/nutch/trunk/src/test/org/apache/nutch/ipc/TestIPC.java (original) +++ lucene/nutch/trunk/src/test/org/apache/nutch/ipc/TestIPC.java Mon Jan 2 14:08:50 2006 @@ -35,6 +35,8 @@ public static final Logger LOG = LogFormatter.getLogger("org.apache.nutch.ipc.TestIPC"); + private static final int TIMEOUT = 1; + // quiet during testing, since output ends up on console static { LOG.setLevel(Level.WARNING); @@ -42,8 +44,6 @@ Server.LOG.setLevel(Level.WARNING); } - public TestIPC(String name) { super(name); } - private static final Random RANDOM = new Random(); private static final int PORT = 1234; @@ -53,7 +53,7 @@ public TestServer(int port, int handlerCount, boolean sleep) { super(port, LongWritable.class, handlerCount); - this.setTimeout(1000); + this.setTimeout(TIMEOUT); this.sleep = sleep; } @@ -75,7 +75,7 @@ public SerialCaller(Client client, int count) { this.client = client; this.count = count; - client.setTimeout(1000); + client.setTimeout(TIMEOUT); } public void run() { @@ -108,7 +108,7 @@ this.client = client; this.addresses = addresses; this.count = count; - 
client.setTimeout(1000); + client.setTimeout(TIMEOUT); } public void run() { @@ -157,10 +157,10 @@ callers[i].join(); assertFalse(callers[i].failed); } -for (int i = 0; i < clientCount; i++) { - clients[i].stop(); -} -server.stop(); +// for (int i = 0; i < clientCount; i++) { +// clients[i].stop(); +// } +// server.stop(); } public void testParallel() throws Exception { @@ -207,14 +207,17 @@ public static void main(String[] args) throws Exception { // crank up the volume! -LOG.setLevel(Level.FINE); -Client.LOG.setLevel(Level.FINE); -Server.LOG.setLevel(Level.FINE); -LogFormatter.setShowThreadIDs(true); - -//new TestIPC("test").testSerial(5, false, 2, 10, 1000); - -new TestIPC("test").testParallel(10, false, 2, 4, 2, 4, 1000); +// LOG.setLevel(Level.INFO); +// Client.LOG.setLevel(Level.INFO); +// Server.LOG.setLevel(Level.INFO); +// LogFormatter.setShowThreadIDs(true); + + new TestIPC().testSerial(10, false, 500, 500, 100); +//new TestIPC().testParallel(10, false, 2, 4, 2, 4, 1000); + +// TestIPC test = new TestIPC(); +// test.testSerial(); +// test.testParallel(); }
svn commit: r365392 - /lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java
Author: cutting Date: Mon Jan 2 10:51:18 2006 New Revision: 365392 URL: http://svn.apache.org/viewcvs?rev=365392&view=rev Log: Fix divide by zero error in DeleteDuplicates.java. Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java?rev=365392&r1=365391&r2=365392&view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java Mon Jan 2 10:51:18 2006 @@ -190,7 +190,7 @@ } public long getPos() throws IOException { -return (doc*INDEX_LENGTH)/maxDoc; +return maxDoc==0 ? 0 : (doc*INDEX_LENGTH)/maxDoc; } public void close() throws IOException {
svn commit: r357197 [5/5] - in /lucene/nutch: branches/mapred/ trunk/ trunk/bin/ trunk/conf/ trunk/lib/ trunk/lib/jetty-ext/ trunk/site/ trunk/src/java/org/apache/nutch/crawl/ trunk/src/java/org/apach
Modified: lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/RobotRulesParser.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/RobotRulesParser.java?rev=357197&r1=357196&r2=357197&view=diff == --- lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/RobotRulesParser.java (original) +++ lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/RobotRulesParser.java Fri Dec 16 09:51:05 2005 @@ -53,6 +53,9 @@ private static final boolean ALLOW_FORBIDDEN = NutchConf.get().getBoolean("http.robots.403.allow", false); + private static final int MAX_REDIRECTS = +NutchConf.get().getInt("http.redirect.max", 3); + private static final String[] AGENTS = getAgents(); private static final Hashtable CACHE = new Hashtable(); @@ -377,16 +380,30 @@ RobotRuleSet robotRules = (RobotRuleSet)CACHE.get(host); if (robotRules == null) { // cache miss - HttpResponse response = new HttpResponse(new URL(url, "/robots.txt")); - - if (response.getCode() == 200) // found rules: parse them -robotRules = new RobotRulesParser().parseRules(response.getContent()); - else if ( (response.getCode() == 403) && (!ALLOW_FORBIDDEN) ) -robotRules = FORBID_ALL_RULES;// use forbid all - else -robotRules = EMPTY_RULES; // use default rules + int redirects = 0; + do { +HttpResponse response = new HttpResponse(new URL(url, "/robots.txt")); + +int code = response.getCode(); + +if (code == 200) {// found rules: parse them + robotRules = new RobotRulesParser().parseRules(response.getContent()); +} else if ( (code == 403) && (!ALLOW_FORBIDDEN) ) { + robotRules = FORBID_ALL_RULES; // use forbid all +} else if (code >= 300 && code < 400) { // handle redirect + if (redirects == MAX_REDIRECTS) { +robotRules = EMPTY_RULES; + } else { +url = new URL(url, response.getHeader("Location")); +LOG.fine("redirect to " + url); +redirects++; + } +} else { + robotRules = 
EMPTY_RULES; // use default rules +} + } while (robotRules == null); - CACHE.put(host, robotRules);// cache rules for host + CACHE.put(host, robotRules); // cache rules for host } String path = url.getPath(); // check rules Modified: lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java?rev=357197&r1=357196&r2=357197&view=diff == --- lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java (original) +++ lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java Fri Dec 16 09:51:05 2005 @@ -60,10 +60,14 @@ } public HttpResponse(URL url) throws IOException { +this(url, false); + } + + HttpResponse(URL url, boolean followRedirects) throws IOException { this.base = url.toString(); this.orig = url.toString(); GetMethod get = new GetMethod(this.orig); -get.setFollowRedirects(false); +get.setFollowRedirects(followRedirects); get.setRequestHeader("User-Agent", Http.AGENT_STRING); HttpMethodParams params = get.getParams(); // some servers cannot digest the new protocol Modified: lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/RobotRulesParser.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/RobotRulesParser.java?rev=357197&r1=357196&r2=357197&view=diff == --- lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/RobotRulesParser.java (original) +++ lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/RobotRulesParser.java Fri Dec 16 09:51:05 2005 @@ -379,7 +379,8 @@ if (robotRules == null) { // cache miss LOG.fine("cache miss " + url); try { 
-HttpResponse response = new HttpResponse(new URL(url, "/robots.txt")); +HttpResponse response = new HttpResponse(new URL(url, "/robots.txt"), +
svn commit: r357197 [3/5] - in /lucene/nutch: branches/mapred/ trunk/ trunk/bin/ trunk/conf/ trunk/lib/ trunk/lib/jetty-ext/ trunk/site/ trunk/src/java/org/apache/nutch/crawl/ trunk/src/java/org/apach
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/ndfs/FSDataset.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/ndfs/FSDataset.java?rev=357197&r1=357196&r2=357197&view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/ndfs/FSDataset.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/ndfs/FSDataset.java Fri Dec 16 09:51:05 2005 @@ -29,7 +29,8 @@ ***/ public class FSDataset implements FSConstants { static final double USABLE_DISK_PCT = 0.98; -/** + + /** * A node type that can be built into a tree reflecting the * hierarchy of blocks on the local disk. */ @@ -166,6 +167,13 @@ blkid = blkid >> ((15 - halfByteIndex) * 4); return (int) ((0x000F) & blkid); } + +public String toString() { + return "FSDir{" + + "dir=" + dir + + ", children=" + (children == null ? null : Arrays.asList(children)) + + "}"; +} } // @@ -282,17 +290,23 @@ ongoingCreates.add(b); reserved += BLOCK_SIZE; f = getTmpFile(b); - -if (f.exists()) { -throw new IOException("Unexpected problem in startBlock() for " + b + ". File " + f + " should not be present, but is."); -} -} - -// -// Create the zero-length temp file -// -if (!f.createNewFile()) { -throw new IOException("Unexpected problem in startBlock() for " + b + ". File " + f + " should be creatable, but is already present."); + try { + if (f.exists()) { + throw new IOException("Unexpected problem in startBlock() for " + b + ". File " + f + " should not be present, but is."); + } + + // + // Create the zero-length temp file + // + if (!f.createNewFile()) { + throw new IOException("Unexpected problem in startBlock() for " + b + ". File " + f + " should be creatable, but is already present."); + } + } catch (IOException ie) { +System.out.println("Exception! 
" + ie); + ongoingCreates.remove(b); + reserved -= BLOCK_SIZE; +throw ie; + } } // @@ -405,4 +419,11 @@ // REMIND - mjc - should cache this result for performance return new File(tmp, b.getBlockName()); } + +public String toString() { + return "FSDataset{" + +"dirpath='" + dirpath + "'" + +"}"; +} + } Modified: lucene/nutch/trunk/src/java/org/apache/nutch/ndfs/FSDirectory.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/ndfs/FSDirectory.java?rev=357197&r1=357196&r2=357197&view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/ndfs/FSDirectory.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/ndfs/FSDirectory.java Fri Dec 16 09:51:05 2005 @@ -47,7 +47,7 @@ class INode { public String name; public INode parent; -public Vector children = new Vector(); +public TreeMap children = new TreeMap(); public Block blocks[]; /** @@ -59,61 +59,45 @@ } /** + * This is the external interface */ INode getNode(String target) { -if (! target.startsWith("/")) { +if (! target.startsWith("/") || target.length() == 0) { return null; -} - -if (parent == null) { -if ("/".equals(target)) { -return this; -} else { -// Check with children -for (Iterator it = children.iterator(); it.hasNext(); ) { -INode child = (INode) it.next(); -INode result = child.getNode(target); -if (result != null) { -return result; -} -} -} +} else if (parent == null && "/".equals(target)) { +return this; } else { -// Strip the leading slash -if (target.length() > 1) { -target = target.substring(1); -} - -// Check if it's the current node -if (name.equals(target)) { -return this; +Vector components = new Vector(); +int start = 0; +int slashid = 0; +while (start < target.length() && (slash
svn commit: r351462 - in /lucene/nutch/branches/mapred/src: java/org/apache/nutch/ipc/ java/org/apache/nutch/ndfs/ test/org/apache/nutch/ndfs/
Author: cutting Date: Thu Dec 1 12:28:49 2005 New Revision: 351462 URL: http://svn.apache.org/viewcvs?rev=351462&view=rev Log: Add TestNDFS, NUTCH-116. Contributed by Paul Baclace. Added: lucene/nutch/branches/mapred/src/test/org/apache/nutch/ndfs/ lucene/nutch/branches/mapred/src/test/org/apache/nutch/ndfs/TestNDFS.java Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/ipc/Server.java lucene/nutch/branches/mapred/src/java/org/apache/nutch/ndfs/DataNode.java lucene/nutch/branches/mapred/src/java/org/apache/nutch/ndfs/FSDataset.java lucene/nutch/branches/mapred/src/java/org/apache/nutch/ndfs/FSNamesystem.java lucene/nutch/branches/mapred/src/java/org/apache/nutch/ndfs/NameNode.java Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/ipc/Server.java URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/ipc/Server.java?rev=351462&r1=351461&r2=351462&view=diff == --- lucene/nutch/branches/mapred/src/java/org/apache/nutch/ipc/Server.java (original) +++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/ipc/Server.java Thu Dec 1 12:28:49 2005 @@ -95,7 +95,9 @@ } try { socket.close(); - } catch (IOException e) {} + } catch (IOException e) { +LOG.info(getName() + ": e=" + e); + } LOG.info(getName() + ": exiting"); } } @@ -166,9 +168,9 @@ /** Handles queued calls . */ private class Handler extends Thread { -public Handler() { +public Handler(int instanceNumber) { this.setDaemon(true); - this.setName("Server handler on " + port); + this.setName("Server handler "+ instanceNumber + " on " + port); } public void run() { @@ -242,25 +244,31 @@ listener.start(); for (int i = 0; i < handlerCount; i++) { - Handler handler = new Handler(); + Handler handler = new Handler(i); handler.start(); } } - /** Stops the service. No calls will be handled after this is called. All - * threads will exit. */ + /** Stops the service. No new calls will be handled after this is called. 
All + * subthreads will likely be finished after this returns. + */ public synchronized void stop() { LOG.info("Stopping server on " + port); running = false; try { - Thread.sleep(timeout);// let all threads exit + Thread.sleep(timeout); // inexactly wait for pending requests to finish } catch (InterruptedException e) {} -notify(); +notifyAll(); } - /** Wait for the server to be stopped. */ + /** Wait for the server to be stopped. + * Does not wait for all subthreads to finish. + * See {@link #stop()}. + */ public synchronized void join() throws InterruptedException { -wait(); +while (running) { + wait(); +} } /** Called for each call. */ Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/ndfs/DataNode.java URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/ndfs/DataNode.java?rev=351462&r1=351461&r2=351462&view=diff == --- lucene/nutch/branches/mapred/src/java/org/apache/nutch/ndfs/DataNode.java (original) +++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/ndfs/DataNode.java Thu Dec 1 12:28:49 2005 @@ -37,7 +37,7 @@ **/ public class DataNode implements FSConstants, Runnable { public static final Logger LOG = LogFormatter.getLogger("org.apache.nutch.ndfs.DataNode"); -// + // // REMIND - mjc - I might bring "maxgigs" back so user can place // artificial limit on space //private static final long GIGABYTE = 1024 * 1024 * 1024; @@ -59,6 +59,8 @@ return new InetSocketAddress(host, port); } + +private static Vector subThreadList = null; DatanodeProtocol namenode; FSDataset data; String localName; @@ -66,6 +68,8 @@ Vector receivedBlockList = new Vector(); int xmitsInProgress = 0; Daemon dataXceiveServer = null; +long blockReportInterval; +private long datanodeStartupPeriod; private NutchConf fConf; /** @@ -98,6 +102,13 @@ this.localName = machineName + ":" + tmpPort; this.dataXceiveServer = new Daemon(new DataXceiveServer(ss)); this.dataXceiveServer.start(); + +long blockReportIntervalBasis = +
conf.getLong("ndfs.blockreport.intervalMsec", BLOCKREPORT_INTERVAL); +this.blockReportInterval = + blockReportIntervalBasis - new Random().nextInt((int)(blockReportIntervalBasis/10)); +this.datanodeStartupPeriod = +
svn commit: r350310 - in /lucene/nutch/branches/mapred/src/java/org/apache/nutch: crawl/Crawl.java fs/NDFSFileSystem.java fs/NutchFileSystem.java mapred/JobConf.java ndfs/DatanodeInfo.java ndfs/NDFSCl
Author: cutting Date: Thu Dec 1 11:59:24 2005 New Revision: 350310 URL: http://svn.apache.org/viewcvs?rev=350310&view=rev Log: Paul Baclace's code & comment cleanups from NUTCH-116. Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Crawl.java lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/NDFSFileSystem.java lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/NutchFileSystem.java lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/JobConf.java lucene/nutch/branches/mapred/src/java/org/apache/nutch/ndfs/DatanodeInfo.java lucene/nutch/branches/mapred/src/java/org/apache/nutch/ndfs/NDFSClient.java Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Crawl.java URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Crawl.java?rev=350310&r1=350309&r2=350310&view=diff == --- lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Crawl.java (original) +++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Crawl.java Thu Dec 1 11:59:24 2005 @@ -52,7 +52,7 @@ JobConf conf = new JobConf(NutchConf.get()); //conf.addConfResource("crawl-tool.xml"); -File rootUrlFile = null; +File rootUrlDir = null; File dir = new File("crawl-" + getDate()); int threads = conf.getInt("fetcher.threads.fetch", 10); int depth = 5; @@ -72,7 +72,7 @@ topN = Integer.parseInt(args[i+1]); i++; } else if (args[i] != null) { -rootUrlFile = new File(args[i]); +rootUrlDir = new File(args[i]); } } @@ -82,7 +82,7 @@ } LOG.info("crawl started in: " + dir); -LOG.info("rootUrlFile = " + rootUrlFile); +LOG.info("rootUrlDir = " + rootUrlDir); LOG.info("threads = " + threads); LOG.info("depth = " + depth); @@ -98,7 +98,7 @@ File tmpDir = conf.getLocalFile("crawl", getDate()); // initialize crawlDb -new Injector(conf).inject(crawlDb, rootUrlFile); +new Injector(conf).inject(crawlDb, rootUrlDir); for (int i = 0; i < depth; i++) { // generate new segment File segment = Modified: 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/NDFSFileSystem.java URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/NDFSFileSystem.java?rev=350310&r1=350309&r2=350310&view=diff == --- lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/NDFSFileSystem.java (original) +++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/NDFSFileSystem.java Thu Dec 1 11:59:24 2005 @@ -25,8 +25,9 @@ import org.apache.nutch.util.NutchConf; / - * Implement the NutchFileSystem interface for the NDFS system. - * + * Implementation of the abstract NutchFileSystem for the NDFS system. + * This is the distributed file system. It can be distributed over + * 1 or more machines * @author Mike Cafarella */ public class NDFSFileSystem extends NutchFileSystem { Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/NutchFileSystem.java URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/NutchFileSystem.java?rev=350310&r1=350309&r2=350310&view=diff == --- lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/NutchFileSystem.java (original) +++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/NutchFileSystem.java Thu Dec 1 11:59:24 2005 @@ -24,16 +24,21 @@ import org.apache.nutch.util.*; / - * NutchFileSystem is an interface for a fairly simple - * distributed file system. A Nutch installation might consist + * An abstract base class for a fairly simple + * distributed file system. + * A Nutch installation might consist * of multiple machines, which should swap files transparently. * This interface allows other Nutch systems to find and place * files into the distributed Nutch-controlled file world. - * + * + * A local implementation exists for testing and for small Nutch instances. + * * The standard job of NutchFileSystem is to take the location- * independent NutchFile objects, and resolve them using local * knowledge and local instances of ShareGroup. 
- * + * + * The local implementation is {@link LocalFileSystem} and distributed + * implementation is {@link NDFSFileSystem}. * @author Mike Cafarella ***
svn commit: r350294 - in /lucene/nutch/branches/mapred: build.xml default.properties src/plugin/build-plugin.xml
Author: cutting Date: Thu Dec 1 10:24:07 2005 New Revision: 350294 URL: http://svn.apache.org/viewcvs?rev=350294&view=rev Log: Always specify java source & target versions to javac. From Michael Stack. Modified: lucene/nutch/branches/mapred/build.xml lucene/nutch/branches/mapred/default.properties lucene/nutch/branches/mapred/src/plugin/build-plugin.xml Modified: lucene/nutch/branches/mapred/build.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/build.xml?rev=350294&r1=350293&r2=350294&view=diff == --- lucene/nutch/branches/mapred/build.xml (original) +++ lucene/nutch/branches/mapred/build.xml Thu Dec 1 10:24:07 2005 @@ -70,9 +70,11 @@ srcdir="${src.dir}" includes="org/apache/nutch/**/*.java" destdir="${build.classes}" - debug="${debug}" - optimize="${optimize}" - deprecation="${deprecation}"> + debug="${javac.debug}" + optimize="${javac.optimize}" + target="${javac.version}" + source="${javac.version}" + deprecation="${javac.deprecation}"> @@ -154,8 +156,11 @@ srcdir="${test.src.dir}" includes="org/apache/nutch/**/*.java" destdir="${test.build.classes}" - debug="${debug}" - deprecation="${deprecation}"> + debug="${javac.debug}" + optimize="${javac.optimize}" + target="${javac.version}" + source="${javac.version}" + deprecation="${javac.deprecation}"> Modified: lucene/nutch/branches/mapred/default.properties URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/default.properties?rev=350294&r1=350293&r2=350294&view=diff == --- lucene/nutch/branches/mapred/default.properties (original) +++ lucene/nutch/branches/mapred/default.properties Thu Dec 1 10:24:07 2005 @@ -38,9 +38,10 @@ dist.dir=${build.dir}/${final.name} -debug=on -optimize=on -deprecation=off +javac.debug=on +javac.optimize=on +javac.deprecation=off +javac.version= 1.4 plugin.http=org.apache.nutch.protocol.http* plugin.httpclient=org.apache.nutch.protocol.httpclient* Modified: lucene/nutch/branches/mapred/src/plugin/build-plugin.xml URL: 
http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/build-plugin.xml?rev=350294&r1=350293&r2=350294&view=diff == --- lucene/nutch/branches/mapred/src/plugin/build-plugin.xml (original) +++ lucene/nutch/branches/mapred/src/plugin/build-plugin.xml Thu Dec 1 10:24:07 2005 @@ -7,8 +7,7 @@ - - + @@ -27,13 +26,8 @@ - - - - http://java.sun.com/j2se/1.4/docs/api/"/> - - + + @@ -85,6 +79,9 @@ includes="**/*.java" destdir="${build.classes}" debug="${javac.debug}" + optimize="${javac.optimize}" + target="${javac.version}" + source="${javac.version}" deprecation="${javac.deprecation}"> @@ -126,7 +123,11 @@ srcdir="${src.test}" includes="**/*.java" destdir="${build.test}" - debug="${debug}"> + debug="${javac.debug}" + optimize="${javac.optimize}" + target="${javac.version}" + source="${javac.version}" + deprecation="${javac.deprecation}">
svn commit: r348533 - /lucene/nutch/branches/mapred/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
Author: cutting Date: Wed Nov 23 11:55:11 2005 New Revision: 348533 URL: http://svn.apache.org/viewcvs?rev=348533&view=rev Log: Fix to not extract urls whose method=post. Modified: lucene/nutch/branches/mapred/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java Modified: lucene/nutch/branches/mapred/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java?rev=348533&r1=348532&r2=348533&view=diff == --- lucene/nutch/branches/mapred/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java (original) +++ lucene/nutch/branches/mapred/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java Wed Nov 23 11:55:11 2005 @@ -296,10 +296,7 @@ if (node.getNodeType() == Node.ELEMENT_NODE) { LinkParams params = (LinkParams)linkParams.get(node.getNodeName().toLowerCase()); if (params != null) { -if (shouldThrowAwayLink(node, children, childLen, params)) { - // this has no inner structure or just a single nested - // anchor-- toss it! 
-} else { +if (!shouldThrowAwayLink(node, children, childLen, params)) { StringBuffer linkText = new StringBuffer(); getText(linkText, node, true); @@ -307,20 +304,21 @@ NamedNodeMap attrs = node.getAttributes(); String target = null; boolean noFollow = false; + boolean post = false; for (int i= 0; i < attrs.getLength(); i++ ) { Node attr = attrs.item(i); String attrName = attr.getNodeName(); - -if ("rel".equalsIgnoreCase(attrName) && -"nofollow".equalsIgnoreCase(attr.getNodeValue())) { - noFollow = true; -} - if (params.attrName.equalsIgnoreCase(attrName)) { target = attr.getNodeValue(); +} else if ("rel".equalsIgnoreCase(attrName) && + "nofollow".equalsIgnoreCase(attr.getNodeValue())) { + noFollow = true; +} else if ("method".equalsIgnoreCase(attrName) && + "post".equalsIgnoreCase(attr.getNodeValue())) { + post = true; } } - if (target != null && !noFollow) + if (target != null && !noFollow && !post) try { URL url = new URL(base, target); outlinks.add(new Outlink(url.toString(),
svn commit: r348531 - /lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java
Author: cutting Date: Wed Nov 23 11:46:05 2005 New Revision: 348531 URL: http://svn.apache.org/viewcvs?rev=348531&view=rev Log: Fix to increment retry count. Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java?rev=348531&r1=348530&r2=348531&view=diff == --- lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java (original) +++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java Wed Nov 23 11:46:05 2005 @@ -140,6 +140,7 @@ case ProtocolStatus.EXCEPTION: logError(url, status.getMessage()); case ProtocolStatus.RETRY: // retry +datum.setRetriesSinceFetch(datum.getRetriesSinceFetch()+1); output(key, datum, null, CrawlDatum.STATUS_FETCH_RETRY); break;
svn commit: r348284 - /lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/TaskTracker.java
Author: cutting Date: Tue Nov 22 15:15:45 2005 New Revision: 348284 URL: http://svn.apache.org/viewcvs?rev=348284&view=rev Log: Fix illegal iterator access bug. Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/TaskTracker.java Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/TaskTracker.java URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/TaskTracker.java?rev=348284&r1=348283&r2=348284&view=diff == --- lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/TaskTracker.java (original) +++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/TaskTracker.java Tue Nov 22 15:15:45 2005 @@ -142,8 +142,8 @@ */ public synchronized void close() throws IOException { // Kill running tasks -for (Iterator it = tasks.values().iterator(); it.hasNext(); ) { -TaskInProgress tip = (TaskInProgress) it.next(); +while (tasks.size() > 0) { +TaskInProgress tip = (TaskInProgress)tasks.get(tasks.firstKey()); tip.jobHasFinished(); }
svn commit: r348212 - in /lucene/nutch/branches/mapred/conf: crawl-tool.xml nutch-default.xml
Author: cutting Date: Tue Nov 22 10:55:26 2005 New Revision: 348212 URL: http://svn.apache.org/viewcvs?rev=348212&view=rev Log: Increase defaults for http.max.delays, since, with MapReduce's partitioning of fetchlists, delays are more likely. Modified: lucene/nutch/branches/mapred/conf/crawl-tool.xml lucene/nutch/branches/mapred/conf/nutch-default.xml Modified: lucene/nutch/branches/mapred/conf/crawl-tool.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/conf/crawl-tool.xml?rev=348212&r1=348211&r2=348212&view=diff == --- lucene/nutch/branches/mapred/conf/crawl-tool.xml (original) +++ lucene/nutch/branches/mapred/conf/crawl-tool.xml Tue Nov 22 10:55:26 2005 @@ -33,7 +33,7 @@ http.max.delays - 100 + 1000 The number of times a thread will delay when trying to fetch a page. When using the crawl tool there are likely to be very few different hosts, so we need to be willing to wait longer for Modified: lucene/nutch/branches/mapred/conf/nutch-default.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/conf/nutch-default.xml?rev=348212&r1=348211&r2=348212&view=diff == --- lucene/nutch/branches/mapred/conf/nutch-default.xml (original) +++ lucene/nutch/branches/mapred/conf/nutch-default.xml Tue Nov 22 10:55:26 2005 @@ -69,7 +69,7 @@ http.max.delays - 3 + 100 The number of times a thread will delay when trying to fetch a page. Each time it finds that a host is busy, it will wait fetcher.server.delay. After http.max.delays attepts, it will give
svn commit: r348210 - /lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/NFSDataInputStream.java
Author: cutting Date: Tue Nov 22 10:46:43 2005 New Revision: 348210 URL: http://svn.apache.org/viewcvs?rev=348210&view=rev Log: Silently ignore missing checksum files. Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/NFSDataInputStream.java Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/NFSDataInputStream.java URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/NFSDataInputStream.java?rev=348210&r1=348209&r2=348210&view=diff == --- lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/NFSDataInputStream.java (original) +++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/NFSDataInputStream.java Tue Nov 22 10:46:43 2005 @@ -54,7 +54,9 @@ if (!Arrays.equals(version, VERSION)) throw new IOException("Not a checksum file: "+sumFile); bytesPerSum = sums.readInt(); - } catch (IOException e) { + } catch (FileNotFoundException e) { // quietly ignore +stopSumming(); + } catch (IOException e) { // loudly ignore LOG.warning("Problem opening checksum file: "+e+". Ignoring."); stopSumming(); }
svn commit: r344403 - in /lucene/nutch/branches/mapred: conf/nutch-default.xml src/java/org/apache/nutch/fs/ChecksumException.java src/java/org/apache/nutch/fs/NFSDataInputStream.java src/java/org/apa
Author: cutting Date: Tue Nov 15 10:00:14 2005 New Revision: 344403 URL: http://svn.apache.org/viewcvs?rev=344403&view=rev Log: Add ability to skip over data with bad checksums. Added: lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/ChecksumException.java Modified: lucene/nutch/branches/mapred/conf/nutch-default.xml lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/NFSDataInputStream.java lucene/nutch/branches/mapred/src/java/org/apache/nutch/io/SequenceFile.java Modified: lucene/nutch/branches/mapred/conf/nutch-default.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/conf/nutch-default.xml?rev=344403&r1=344402&r2=344403&view=diff == --- lucene/nutch/branches/mapred/conf/nutch-default.xml (original) +++ lucene/nutch/branches/mapred/conf/nutch-default.xml Tue Nov 15 10:00:14 2005 @@ -339,6 +339,21 @@ buffered during read and write operations. + + io.bytes.per.checksum + 512 + The number of bytes per checksum. Must not be larger than + io.file.buffer.size. + + + + io.skip.checksum.errors + false + If true, when a checksum error is encountered while + reading a sequence file, entries are skipped, instead of throwing an + exception. + + Added: lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/ChecksumException.java URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/ChecksumException.java?rev=344403&view=auto == --- lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/ChecksumException.java (added) +++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/ChecksumException.java Tue Nov 15 10:00:14 2005 @@ -0,0 +1,26 @@ +/** + * Copyright 2005 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.fs; + +import java.io.IOException; + +/** Thrown for checksum errors. */ +public class ChecksumException extends IOException { + public ChecksumException(String description) { +super(description); + } +} Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/NFSDataInputStream.java URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/NFSDataInputStream.java?rev=344403&r1=344402&r2=344403&view=diff == --- lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/NFSDataInputStream.java (original) +++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/NFSDataInputStream.java Tue Nov 15 10:00:14 2005 @@ -109,13 +109,15 @@ stopSumming(); return; } - if (crc != (int)sum.getValue()) { -fs.reportChecksumFailure(file, (NFSInputStream)in, - getPos()-delta, bytesPerSum, crc); -throw new IOException("Checksum error: "+file); - } + int sumValue = (int)sum.getValue(); sum.reset(); inSum = 0; + if (crc != sumValue) { +long pos = getPos() - delta; +fs.reportChecksumFailure(file, (NFSInputStream)in, + pos, bytesPerSum, crc); +throw new ChecksumException("Checksum error: "+file+" at "+pos); + } } public long getPos() throws IOException { Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/io/SequenceFile.java URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/io/SequenceFile.java?rev=344403&r1=344402&r2=344403&view=diff == --- lucene/nutch/branches/mapred/src/java/org/apache/nutch/io/SequenceFile.java (original) +++ 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/io/SequenceFile.java Tue Nov 15 10:00:14 2005 @@ -359,23 +359,39 @@ if (in.getPos() >= end) return -1; - int length = in.readInt(); + try { +int length = in.readInt(); - if (version[3] > 1 && sync != null && - length == SYNC_ESCAPE) {// process a sync entry -//LOG.info("sync@"+in.getPos()); -in.readFully(syncCheck); // r
svn commit: r332371 - /lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Generator.java
Author: cutting Date: Thu Nov 10 13:03:16 2005 New Revision: 332371 URL: http://svn.apache.org/viewcvs?rev=332371&view=rev Log: Fix to not increment count of urls when urls are filtered by maxPerHost limit. Patch contributed by Rod Taylor. Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Generator.java Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Generator.java URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Generator.java?rev=332371&r1=332370&r2=332371&view=diff == --- lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Generator.java (original) +++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Generator.java Thu Nov 10 13:03:16 2005 @@ -76,23 +76,27 @@ OutputCollector output, Reporter reporter) throws IOException { - while (values.hasNext() && ++count < limit) { + while (values.hasNext() && count < limit) { UTF8 url = (UTF8)values.next(); -if (maxPerHost > 0) { // are we counting hosts? +if (maxPerHost > 0) { // are we counting hosts? String host = new URL(url.toString()).getHost(); - Integer count = (Integer)hostCounts.get(host); - if (count != null) { -if (count.intValue() >= maxPerHost) + Integer hostCount = (Integer)hostCounts.get(host); + if (hostCount != null) { +if (hostCount.intValue() >= maxPerHost) continue; // too many from host -hostCounts.put(host, new Integer(count.intValue()+1)); +hostCounts.put(host, new Integer(hostCount.intValue()+1)); } else {// update host count hostCounts.put(host, new Integer(1)); } } output.collect(key, url); + +// Count is incremented only when we keep the URL +// maxPerHost may cause us to skip it. +count++; } }
svn commit: r332089 - /lucene/nutch/branches/mapred/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/RobotRulesParser.java
Author: cutting Date: Wed Nov 9 09:46:16 2005 New Revision: 332089 URL: http://svn.apache.org/viewcvs?rev=332089&view=rev Log: Fix to follow redirects to robots.txt Modified: lucene/nutch/branches/mapred/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/RobotRulesParser.java Modified: lucene/nutch/branches/mapred/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/RobotRulesParser.java URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/RobotRulesParser.java?rev=332089&r1=332088&r2=332089&view=diff == --- lucene/nutch/branches/mapred/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/RobotRulesParser.java (original) +++ lucene/nutch/branches/mapred/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/RobotRulesParser.java Wed Nov 9 09:46:16 2005 @@ -53,6 +53,9 @@ private static final boolean ALLOW_FORBIDDEN = NutchConf.get().getBoolean("http.robots.403.allow", false); + private static final int MAX_REDIRECTS = +NutchConf.get().getInt("http.redirect.max", 3); + private static final String[] AGENTS = getAgents(); private static final Hashtable CACHE = new Hashtable(); @@ -377,16 +380,30 @@ RobotRuleSet robotRules = (RobotRuleSet)CACHE.get(host); if (robotRules == null) { // cache miss - HttpResponse response = new HttpResponse(new URL(url, "/robots.txt")); - - if (response.getCode() == 200) // found rules: parse them -robotRules = new RobotRulesParser().parseRules(response.getContent()); - else if ( (response.getCode() == 403) && (!ALLOW_FORBIDDEN) ) -robotRules = FORBID_ALL_RULES;// use forbid all - else -robotRules = EMPTY_RULES; // use default rules + int redirects = 0; + do { +HttpResponse response = new HttpResponse(new URL(url, "/robots.txt")); + +int code = response.getCode(); + +if (code == 200) {// found rules: parse them + robotRules = new RobotRulesParser().parseRules(response.getContent()); +} else if ( (code == 403) && 
(!ALLOW_FORBIDDEN) ) { + robotRules = FORBID_ALL_RULES; // use forbid all +} else if (code >= 300 && code < 400) { // handle redirect + if (redirects == MAX_REDIRECTS) { +robotRules = EMPTY_RULES; + } else { +url = new URL(url, response.getHeader("Location")); +LOG.fine("redirect to " + url); +redirects++; + } +} else { + robotRules = EMPTY_RULES; // use default rules +} + } while (robotRules == null); - CACHE.put(host, robotRules);// cache rules for host + CACHE.put(host, robotRules); // cache rules for host } String path = url.getPath(); // check rules
svn commit: r332088 - /lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java
Author: cutting Date: Wed Nov 9 09:45:18 2005 New Revision: 332088 URL: http://svn.apache.org/viewcvs?rev=332088&view=rev Log: Use a more informative thread name. Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java?rev=332088&r1=332087&r2=332088&view=diff == --- lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java (original) +++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java Wed Nov 9 09:45:18 2005 @@ -74,6 +74,7 @@ private class FetcherThread extends Thread { public FetcherThread() { this.setDaemon(true); // don't hang JVM on exit + this.setName("FetcherThread"); // use an informative name } public void run() {
svn commit: r331858 - in /lucene/nutch/branches/mapred: conf/nutch-default.xml src/java/org/apache/nutch/crawl/Generator.java src/java/org/apache/nutch/crawl/Injector.java
Author: cutting Date: Tue Nov 8 10:25:11 2005 New Revision: 331858 URL: http://svn.apache.org/viewcvs?rev=331858&view=rev Log: Use absolute paths for temporary crawl files. Modified: lucene/nutch/branches/mapred/conf/nutch-default.xml lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Generator.java lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Injector.java Modified: lucene/nutch/branches/mapred/conf/nutch-default.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/conf/nutch-default.xml?rev=331858&r1=331857&r2=331858&view=diff == --- lucene/nutch/branches/mapred/conf/nutch-default.xml (original) +++ lucene/nutch/branches/mapred/conf/nutch-default.xml Tue Nov 8 10:25:11 2005 @@ -387,7 +387,14 @@ mapred.system.dir /tmp/nutch/mapred/system - The local directory where MapReduce stores control files. + The shared directory where MapReduce stores control files. + + + + + mapred.temp.dir + /tmp/nutch/mapred/temp + A shared directory for temporary files. 
Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Generator.java URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Generator.java?rev=331858&r1=331857&r2=331858&view=diff == --- lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Generator.java (original) +++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Generator.java Tue Nov 8 10:25:11 2005 @@ -155,7 +155,8 @@ throws IOException { File tempDir = - new File("generate-temp-"+ + new File(NutchConf.get().get("mapred.temp.dir", ".") + + "/generate-temp-"+ Integer.toString(new Random().nextInt(Integer.MAX_VALUE))); File segment = new File(segments, getDate()); Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Injector.java URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Injector.java?rev=331858&r1=331857&r2=331858&view=diff == --- lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Injector.java (original) +++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Injector.java Tue Nov 8 10:25:11 2005 @@ -84,7 +84,8 @@ LOG.info("Injector: urlDir: " + urlDir); File tempDir = - new File("inject-temp-"+ + new File(NutchConf.get().get("mapred.temp.dir", ".") + + "/inject-temp-"+ Integer.toString(new Random().nextInt(Integer.MAX_VALUE))); // map text input file to a file
svn commit: r331585 - /lucene/nutch/branches/mapred/conf/nutch-default.xml
Author: cutting Date: Mon Nov 7 11:29:37 2005 New Revision: 331585 URL: http://svn.apache.org/viewcvs?rev=331585&view=rev Log: Document job tracker's adminstrative web ui port. Modified: lucene/nutch/branches/mapred/conf/nutch-default.xml Modified: lucene/nutch/branches/mapred/conf/nutch-default.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/conf/nutch-default.xml?rev=331585&r1=331584&r2=331585&view=diff == --- lucene/nutch/branches/mapred/conf/nutch-default.xml (original) +++ lucene/nutch/branches/mapred/conf/nutch-default.xml Mon Nov 7 11:29:37 2005 @@ -426,6 +426,14 @@ + + mapred.job.tracker.info.port + 7845 + The port number where the jobtracker runs its + administrative web interface. + + +
svn commit: r331556 - /lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java
Author: cutting Date: Mon Nov 7 09:55:59 2005 New Revision: 331556 URL: http://svn.apache.org/viewcvs?rev=331556&view=rev Log: Fix to only try to parse successful fetches. Also, log number of threads in task process, not in controller, as this may be overridden by nutch-site.xml. Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java?rev=331556&r1=331555&r2=331556&view=diff == --- lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java (original) +++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java Mon Nov 7 09:55:59 2005 @@ -202,7 +202,7 @@ (SCORE_KEY, Float.toString(datum.getScore())); Parse parse = null; - if (parsing) { + if (parsing && status == CrawlDatum.STATUS_FETCH_SUCCESS) { ParseStatus parseStatus; try { parse = ParseUtil.parse(content); @@ -280,6 +280,8 @@ this.maxRedirect = getConf().getInt("http.redirect.max", 3); int threadCount = getConf().getInt("fetcher.threads.fetch", 10); +LOG.info("Fetcher: threads: " + threadCount); + for (int i = 0; i < threadCount; i++) { // spawn threads new FetcherThread().start(); } @@ -311,8 +313,6 @@ LOG.info("Fetcher: starting"); LOG.info("Fetcher: segment: " + segment); -LOG.info("Fetcher: threads: " + threads); - JobConf job = new JobConf(getConf());
svn commit: r331555 - in /lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient: HttpResponse.java RobotRulesParser.java
Author: cutting Date: Mon Nov 7 09:53:54 2005 New Revision: 331555 URL: http://svn.apache.org/viewcvs?rev=331555&view=rev Log: NUTCH-124: Follow redirects when fetching robots.txt. Modified: lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/RobotRulesParser.java Modified: lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java?rev=331555&r1=331554&r2=331555&view=diff == --- lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java (original) +++ lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java Mon Nov 7 09:53:54 2005 @@ -82,10 +82,14 @@ } public HttpResponse(URL url) throws IOException { +this(url, false); + } + + HttpResponse(URL url, boolean followRedirects) throws IOException { this.base = url.toString(); this.orig = url.toString(); GetMethod get = new GetMethod(this.orig); -get.setFollowRedirects(false); +get.setFollowRedirects(followRedirects); get.setRequestHeader("User-Agent", Http.AGENT_STRING); HttpMethodParams params = get.getParams(); // some servers cannot digest the new protocol Modified: lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/RobotRulesParser.java URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/RobotRulesParser.java?rev=331555&r1=331554&r2=331555&view=diff == --- 
lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/RobotRulesParser.java (original) +++ lucene/nutch/branches/mapred/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/RobotRulesParser.java Mon Nov 7 09:53:54 2005 @@ -379,7 +379,8 @@ if (robotRules == null) { // cache miss LOG.fine("cache miss " + url); try { -HttpResponse response = new HttpResponse(new URL(url, "/robots.txt")); +HttpResponse response = new HttpResponse(new URL(url, "/robots.txt"), + true); if (response.getCode() == 200) // found rules: parse them robotRules = new RobotRulesParser().parseRules(response.getContent());
svn commit: r330641 - /lucene/nutch/branches/mapred/src/test/org/apache/nutch/fs/TestNutchFileSystem.java
Author: cutting Date: Thu Nov 3 13:19:11 2005 New Revision: 330641 URL: http://svn.apache.org/viewcvs?rev=330641&view=rev Log: Fix to correctly handle zero-length files. Modified: lucene/nutch/branches/mapred/src/test/org/apache/nutch/fs/TestNutchFileSystem.java Modified: lucene/nutch/branches/mapred/src/test/org/apache/nutch/fs/TestNutchFileSystem.java URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/test/org/apache/nutch/fs/TestNutchFileSystem.java?rev=330641&r1=330640&r2=330641&view=diff == --- lucene/nutch/branches/mapred/src/test/org/apache/nutch/fs/TestNutchFileSystem.java (original) +++ lucene/nutch/branches/mapred/src/test/org/apache/nutch/fs/TestNutchFileSystem.java Thu Nov 3 13:19:11 2005 @@ -305,6 +305,8 @@ long size = ((LongWritable)value).get(); long seed = Long.parseLong(name); + if (size == 0) return; + reporter.setStatus("opening " + name); NFSDataInputStream in = fs.open(new File(DATA_DIR, name));
svn commit: r330640 - /lucene/nutch/branches/mapred/src/java/org/apache/nutch/ndfs/DataNode.java
Author: cutting Date: Thu Nov 3 13:18:05 2005 New Revision: 330640 URL: http://svn.apache.org/viewcvs?rev=330640&view=rev Log: Fix a buggy cast when files are longer than Integer.MAX_VALUE, and improve some diagnostics. Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/ndfs/DataNode.java Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/ndfs/DataNode.java URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/ndfs/DataNode.java?rev=330640&r1=330639&r2=330640&view=diff == --- lucene/nutch/branches/mapred/src/java/org/apache/nutch/ndfs/DataNode.java (original) +++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/ndfs/DataNode.java Thu Nov 3 13:18:05 2005 @@ -366,8 +366,11 @@ while (anotherChunk) { while (len > 0) { -int bytesRead = in.read(buf, 0, Math.min(buf.length, (int) len)); -if (bytesRead >= 0) { +int bytesRead = in.read(buf, 0, (int)Math.min(buf.length, len)); +if (bytesRead < 0) { + throw new EOFException("EOF reading from "+s.toString()); +} +if (bytesRead > 0) { try { out.write(buf, 0, bytesRead); } catch (IOException iex) { @@ -393,8 +396,8 @@ } } } +len -= bytesRead; } -len -= bytesRead; } if (encodingType == RUNLENGTH_ENCODING) { @@ -556,7 +559,7 @@ in.close(); } } catch (IOException ie) { -ie.printStackTrace(); + LOG.log(Level.WARNING, "DataXCeiver", ie); } finally { try { s.close();
svn commit: r330638 - /lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/JobTracker.java
Author: cutting Date: Thu Nov 3 13:16:28 2005 New Revision: 330638 URL: http://svn.apache.org/viewcvs?rev=330638&view=rev Log: Keep trying to restart job tracker. Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/JobTracker.java Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/JobTracker.java URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/JobTracker.java?rev=330638&r1=330637&r2=330638&view=diff == --- lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/JobTracker.java (original) +++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/JobTracker.java Thu Nov 3 13:16:28 2005 @@ -40,10 +40,21 @@ public static void startTracker(NutchConf conf) throws IOException { if (tracker != null) throw new IOException("JobTracker already running."); - tracker = new JobTracker(conf); + while (true) { +try { + tracker = new JobTracker(conf); + break; +} catch (IOException e) { + LOG.log(Level.WARNING, "Starting tracker", e); +} +try { + Thread.sleep(1000); +} catch (InterruptedException e) { +} + } tracker.offerService(); - } + public static JobTracker getTracker() { return tracker; }
svn commit: r330636 - /lucene/nutch/branches/mapred/bin/slaves.sh
Author: cutting Date: Thu Nov 3 13:15:11 2005 New Revision: 330636 URL: http://svn.apache.org/viewcvs?rev=330636&view=rev Log: Don't rely on ssh SendEnv. Instead folks can use $HOME/.ssh/environment to pass environment variables to slaves. Note that sshd must be configured on slaves with PermitUserEnvironment enabled for this to operate. Modified: lucene/nutch/branches/mapred/bin/slaves.sh Modified: lucene/nutch/branches/mapred/bin/slaves.sh URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/bin/slaves.sh?rev=330636&r1=330635&r2=330636&view=diff == --- lucene/nutch/branches/mapred/bin/slaves.sh (original) +++ lucene/nutch/branches/mapred/bin/slaves.sh Thu Nov 3 13:15:11 2005 @@ -20,7 +20,7 @@ fi for slave in `cat $NUTCH_SLAVES`; do - ssh -o ConnectTimeout=1 -o SendEnv='NUTCH* JAVA*' $slave "$@" \ + ssh -o ConnectTimeout=1 $slave "$@" \ 2>&1 | sed "s/^/$slave: /" & done
svn commit: r328414 - /lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/ParseOutputFormat.java
Author: cutting Date: Tue Oct 25 09:57:51 2005 New Revision: 328414 URL: http://svn.apache.org/viewcvs?rev=328414&view=rev Log: Fix a type error for JDK 1.4. Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/ParseOutputFormat.java Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/ParseOutputFormat.java URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/ParseOutputFormat.java?rev=328414&r1=328413&r2=328414&view=diff == --- lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/ParseOutputFormat.java (original) +++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/ParseOutputFormat.java Tue Oct 25 09:57:51 2005 @@ -66,7 +66,8 @@ Outlink[] links = parse.getData().getOutlinks(); // compute OPIC score contribution - float score = Float.valueOf(parse.getData().get(Fetcher.SCORE_KEY)); + float score = +Float.parseFloat(parse.getData().get(Fetcher.SCORE_KEY)); score /= links.length; for (int i = 0; i < links.length; i++) {
svn commit: r327593 - /lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/TaskRunner.java
Author: cutting Date: Fri Oct 21 15:07:00 2005 New Revision: 327593 URL: http://svn.apache.org/viewcvs?rev=327593&view=rev Log: Always create workdir so child can connect to it. Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/TaskRunner.java Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/TaskRunner.java URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/TaskRunner.java?rev=327593&r1=327592&r2=327593&view=diff == --- lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/TaskRunner.java (original) +++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/TaskRunner.java Fri Oct 21 15:07:00 2005 @@ -63,6 +63,7 @@ String sep = System.getProperty("path.separator"); File workDir = new File(new File(t.getJobFile()).getParent(), "work"); + workDir.mkdirs(); StringBuffer classPath = new StringBuffer(); // start with same classpath as parent process @@ -72,7 +73,6 @@ JobConf job = new JobConf(t.getJobFile()); String jar = job.getJar(); if (jar != null) { // if jar exists, it into workDir -workDir.mkdirs(); runChild(new String[] { "unzip", jar}, workDir); File[] libs = new File(workDir, "lib").listFiles(); for (int i = 0; i < libs.length; i++) {
svn commit: r327581 - in /lucene/nutch/branches/mapred/src/plugin/parse-html/src: java/org/apache/nutch/parse/html/DOMContentUtils.java test/org/apache/nutch/parse/html/TestDOMContentUtils.java
Author: cutting Date: Fri Oct 21 14:04:54 2005 New Revision: 327581 URL: http://svn.apache.org/viewcvs?rev=327581&view=rev Log: Ignore rel=nofollow links. Modified: lucene/nutch/branches/mapred/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java lucene/nutch/branches/mapred/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java Modified: lucene/nutch/branches/mapred/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java?rev=327581&r1=327580&r2=327581&view=diff == --- lucene/nutch/branches/mapred/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java (original) +++ lucene/nutch/branches/mapred/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java Fri Oct 21 14:04:54 2005 @@ -306,13 +306,21 @@ NamedNodeMap attrs = node.getAttributes(); String target = null; + boolean noFollow = false; for (int i= 0; i < attrs.getLength(); i++ ) { -if (params.attrName.equalsIgnoreCase(attrs.item(i).getNodeName())) { - target = attrs.item(i).getNodeValue(); - break; +Node attr = attrs.item(i); +String attrName = attr.getNodeName(); + +if ("rel".equalsIgnoreCase(attrName) && +"nofollow".equalsIgnoreCase(attr.getNodeValue())) { + noFollow = true; +} + +if (params.attrName.equalsIgnoreCase(attrName)) { + target = attr.getNodeValue(); } } - if (target != null) + if (target != null && !noFollow) try { URL url = new URL(base, target); outlinks.add(new Outlink(url.toString(), Modified: lucene/nutch/branches/mapred/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java?rev=327581&r1=327580&r2=327581&view=diff == --- 
lucene/nutch/branches/mapred/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java (original) +++ lucene/nutch/branches/mapred/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java Fri Oct 21 14:04:54 2005 @@ -113,6 +113,12 @@ + "End\tthis\rmadness\n!\r\n" + " ... ." + " "), + +// test that links are not returned +new String("" + + "http://www.nutch.org\"; rel=\"nofollow\"> ignore " + + "http://www.nutch.org\";> ignore " + + ""), }; private static String[] testBaseHrefs= { @@ -123,6 +129,7 @@ "http://www.nutch.org/frames/";, "http://www.nutch.org/maps/";, "http://www.nutch.org/whitespace/";, +"http://www.nutch.org//";, }; private static final DocumentFragment testDOMs[]= @@ -145,6 +152,7 @@ + "one two three space here space there no space " + "one two two three three four put some text here and there. " + "End this madness ! . . . .", +"ignore ignore", }; private static final String[] answerTitle= { @@ -155,6 +163,7 @@ "my title", "my title", "my title", +"", }; // note: should be in page-order @@ -214,6 +223,8 @@ { new Outlink("http://www.nutch.org/index.html";, "whitespace test"), }, + { + } }; } catch (MalformedURLException e) {
svn commit: r327573 - in /lucene/nutch/branches/mapred/src/java/org/apache/nutch: fs/FSError.java fs/LocalFileSystem.java mapred/LocalJobRunner.java mapred/TaskRunner.java mapred/TaskTracker.java mapr
Author: cutting Date: Fri Oct 21 13:49:02 2005 New Revision: 327573 URL: http://svn.apache.org/viewcvs?rev=327573&view=rev Log: Exit tasktracker when errors occur writing to the local disk, so that more tasks will not fail on this host. Also fix so that executables may be included in job jar files, and so that a job's config can override static config options, read with NutchConf.get(). Added: lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/FSError.java Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/LocalFileSystem.java lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/LocalJobRunner.java lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/TaskRunner.java lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/TaskTracker.java lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/TaskUmbilicalProtocol.java Added: lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/FSError.java URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/FSError.java?rev=327573&view=auto == --- lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/FSError.java (added) +++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/FSError.java Fri Oct 21 13:49:02 2005 @@ -0,0 +1,25 @@ +/** + * Copyright 2005 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.nutch.fs; + +/** Thrown for unexpected filesystem errors, presumed to reflect disk errors + * in the native filesystem. */ +public class FSError extends Error { + FSError(Throwable cause) { +super(cause); + } +} Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/LocalFileSystem.java URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/LocalFileSystem.java?rev=327573&r1=327572&r2=327573&view=diff == --- lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/LocalFileSystem.java (original) +++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/LocalFileSystem.java Fri Oct 21 13:49:02 2005 @@ -78,11 +78,23 @@ public int available() throws IOException { return fis.available(); } public void close() throws IOException { fis.close(); } public boolean markSupport() { return false; } -public int read() throws IOException { return fis.read(); } -public int read(byte[] b) throws IOException { return fis.read(b); } + +public int read() throws IOException { + try { +return fis.read(); + } catch (IOException e) { // unexpected exception +throw new FSError(e); // assume native fs error + } +} + public int read(byte[] b, int off, int len) throws IOException { + try { return fis.read(b, off, len); + } catch (IOException e) { // unexpected exception +throw new FSError(e); // assume native fs error + } } + public long skip(long n) throws IOException { return fis.skip(n); } } @@ -115,11 +127,21 @@ */ public void close() throws IOException { fos.close(); } public void flush() throws IOException { fos.flush(); } - public void write(byte[] b) throws IOException { fos.write(b); } + public void write(byte[] b, int off, int len) throws IOException { -fos.write(b, off, len); +try { + fos.write(b, off, len); +} catch (IOException e) { // unexpected exception + throw new FSError(e); // assume native fs error +} + } + public void write(int b) throws IOException { +try { + fos.write(b); +} catch 
(IOException e) { // unexpected exception + throw new FSError(e); // assume native fs error +} } - public void write(int b) throws IOException { fos.write(b); } } public NFSOutputStream createRaw(File f, boolean overwrite) Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/LocalJobRunner.java URL: http://svn.apache.org/viewcvs/lucene/nu