svn commit: r798304 [3/3] - in /lucene/nutch/logos: ./ character-hand-big.png character.eps nutch_logo.eps nutch_logo.png
Added: lucene/nutch/logos/nutch_logo.eps URL: http://svn.apache.org/viewvc/lucene/nutch/logos/nutch_logo.eps?rev=798304&view=auto == Binary file - no diff available. Propchange: lucene/nutch/logos/nutch_logo.eps -- svn:mime-type = application/octet-stream Added: lucene/nutch/logos/nutch_logo.png URL: http://svn.apache.org/viewvc/lucene/nutch/logos/nutch_logo.png?rev=798304&view=auto == Binary file - no diff available. Propchange: lucene/nutch/logos/nutch_logo.png -- svn:mime-type = application/octet-stream
svn commit: r475926 - /lucene/nutch/nightly/nightly.sh
Author: cutting Date: Thu Nov 16 13:03:26 2006 New Revision: 475926 URL: http://svn.apache.org/viewvc?view=rev&rev=475926 Log: Update nightly build location. Modified: lucene/nutch/nightly/nightly.sh Modified: lucene/nutch/nightly/nightly.sh URL: http://svn.apache.org/viewvc/lucene/nutch/nightly/nightly.sh?view=diff&rev=475926&r1=475925&r2=475926 == --- lucene/nutch/nightly/nightly.sh (original) +++ lucene/nutch/nightly/nightly.sh Thu Nov 16 13:03:26 2006 @@ -5,7 +5,7 @@ TRUNK=http://svn.apache.org/repos/asf/lucene/nutch/trunk REL_SERVER=people.apache.org -REL_DIR=/www/people.apache.org/dist/lucene/nutch/nightly +REL_DIR=/www/people.apache.org/builds/lucene/nutch/nightly # create an empty build directory rm -rf /tmp/nutch-nightly
svn commit: r421185 - /lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java
Author: cutting Date: Wed Jul 12 01:16:37 2006 New Revision: 421185 URL: http://svn.apache.org/viewvc?rev=421185view=rev Log: Patch a bug introduced by Hadoop 0.4.0, which requires specified input directories to exist. Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java?rev=421185r1=421184r2=421185view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java Wed Jul 12 01:16:37 2006 @@ -65,7 +65,8 @@ if (LOG.isInfoEnabled()) { LOG.info(CrawlDb update: done); } } - public static JobConf createJob(Configuration config, Path crawlDb) { + public static JobConf createJob(Configuration config, Path crawlDb) +throws IOException { Path newCrawlDb = new Path(crawlDb, Integer.toString(new Random().nextInt(Integer.MAX_VALUE))); @@ -73,7 +74,11 @@ JobConf job = new NutchJob(config); job.setJobName(crawldb + crawlDb); -job.addInputPath(new Path(crawlDb, CrawlDatum.DB_DIR_NAME)); + +Path current = new Path(crawlDb, CrawlDatum.DB_DIR_NAME); +if (FileSystem.get(job).exists(current)) { + job.addInputPath(current); +} job.setInputFormat(SequenceFileInputFormat.class); job.setInputKeyClass(UTF8.class); job.setInputValueClass(CrawlDatum.class);
svn commit: r417884 - in /lucene/nutch/trunk: lib/ src/java/org/apache/nutch/fetcher/ src/java/org/apache/nutch/indexer/ src/java/org/apache/nutch/parse/ src/java/org/apache/nutch/segment/
Author: cutting Date: Wed Jun 28 14:54:53 2006 New Revision: 417884 URL: http://svn.apache.org/viewvc?rev=417884view=rev Log: NUTCH-312. Upgrade to Hadoop 0.4.0. Added: lucene/nutch/trunk/lib/commons-cli-2.0-SNAPSHOT.jar (with props) lucene/nutch/trunk/lib/hadoop-0.4.0.jar (with props) Removed: lucene/nutch/trunk/lib/hadoop-0.3.2.jar Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherOutputFormat.java lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java Added: lucene/nutch/trunk/lib/commons-cli-2.0-SNAPSHOT.jar URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/commons-cli-2.0-SNAPSHOT.jar?rev=417884view=auto == Binary file - no diff available. Propchange: lucene/nutch/trunk/lib/commons-cli-2.0-SNAPSHOT.jar -- svn:mime-type = application/octet-stream Added: lucene/nutch/trunk/lib/hadoop-0.4.0.jar URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/hadoop-0.4.0.jar?rev=417884view=auto == Binary file - no diff available. 
Propchange: lucene/nutch/trunk/lib/hadoop-0.4.0.jar -- svn:mime-type = application/octet-stream Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherOutputFormat.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherOutputFormat.java?rev=417884r1=417883r2=417884view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherOutputFormat.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherOutputFormat.java Wed Jun 28 14:54:53 2006 @@ -31,6 +31,7 @@ import org.apache.hadoop.mapred.RecordWriter; import org.apache.hadoop.mapred.JobConf; import org.apache.hadoop.mapred.Reporter; +import org.apache.hadoop.util.Progressable; import org.apache.nutch.parse.ParseOutputFormat; import org.apache.nutch.protocol.Content; @@ -45,7 +46,8 @@ public RecordWriter getRecordWriter(final FileSystem fs, final JobConf job, - final String name) throws IOException { + final String name, + final Progressable progress) throws IOException { final Path fetch = new Path(new Path(job.getOutputPath(), CrawlDatum.FETCH_DIR_NAME), name); @@ -66,7 +68,7 @@ } if (Fetcher.isParsing(job)) { -parseOut = new ParseOutputFormat().getRecordWriter(fs, job, name); +parseOut = new ParseOutputFormat().getRecordWriter(fs, job, name, null); } } Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java?rev=417884r1=417883r2=417884view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java Wed Jun 28 14:54:53 2006 @@ -26,6 +26,7 @@ import org.apache.hadoop.fs.*; import org.apache.hadoop.conf.*; import org.apache.hadoop.mapred.*; +import org.apache.hadoop.util.Progressable; import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.NutchJob; @@ 
-276,7 +277,8 @@ /** Write nothing. */ public RecordWriter getRecordWriter(final FileSystem fs, final JobConf job, - final String name) throws IOException { + final String name, + final Progressable progress) throws IOException { return new RecordWriter() { public void write(WritableComparable key, Writable value) throws IOException { Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java?rev=417884r1=417883r2=417884view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java (original) +++ lucene/nutch/trunk/src/java
svn commit: r413175 - in /lucene/nutch/trunk/lib: hadoop-0.3.1.jar hadoop-0.3.2.jar
Author: cutting Date: Fri Jun 9 14:48:23 2006 New Revision: 413175 URL: http://svn.apache.org/viewvc?rev=413175&view=rev Log: Upgrading to Hadoop 0.3.2 release. Added: lucene/nutch/trunk/lib/hadoop-0.3.2.jar (with props) Removed: lucene/nutch/trunk/lib/hadoop-0.3.1.jar Added: lucene/nutch/trunk/lib/hadoop-0.3.2.jar URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/hadoop-0.3.2.jar?rev=413175&view=auto == Binary file - no diff available. Propchange: lucene/nutch/trunk/lib/hadoop-0.3.2.jar -- svn:mime-type = application/octet-stream
svn commit: r405861 - in /lucene/nutch/trunk/lib: hadoop-0.2.0.jar hadoop-0.2.1.jar
Author: cutting Date: Fri May 12 13:31:59 2006 New Revision: 405861 URL: http://svn.apache.org/viewcvs?rev=405861view=rev Log: Upgrading to Hadoop 0.2.1. Added: lucene/nutch/trunk/lib/hadoop-0.2.1.jar (with props) Removed: lucene/nutch/trunk/lib/hadoop-0.2.0.jar Added: lucene/nutch/trunk/lib/hadoop-0.2.1.jar URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/lib/hadoop-0.2.1.jar?rev=405861view=auto == Binary file - no diff available. Propchange: lucene/nutch/trunk/lib/hadoop-0.2.1.jar -- svn:mime-type = application/octet-stream
svn commit: r400159 - /lucene/nutch/trunk/bin/
Author: cutting Date: Fri May 5 13:01:44 2006 New Revision: 400159 URL: http://svn.apache.org/viewcvs?rev=400159view=rev Log: Ignore bin/rcc (from Hadoop). Modified: lucene/nutch/trunk/bin/ (props changed) Propchange: lucene/nutch/trunk/bin/ -- --- svn:ignore (original) +++ svn:ignore Fri May 5 13:01:44 2006 @@ -1,6 +1,7 @@ hadoop hadoop-daemon.sh hadoop-daemons.sh +rcc slaves.sh start-all.sh start-dfs.sh
svn commit: r400199 - in /lucene/nutch/trunk/lib: hadoop-0.1.1.jar hadoop-0.2.0.jar
Author: cutting Date: Fri May 5 15:44:04 2006 New Revision: 400199 URL: http://svn.apache.org/viewcvs?rev=400199view=rev Log: Upgrading to Hadoop 0.2.0. Added: lucene/nutch/trunk/lib/hadoop-0.2.0.jar (with props) Removed: lucene/nutch/trunk/lib/hadoop-0.1.1.jar Added: lucene/nutch/trunk/lib/hadoop-0.2.0.jar URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/lib/hadoop-0.2.0.jar?rev=400199view=auto == Binary file - no diff available. Propchange: lucene/nutch/trunk/lib/hadoop-0.2.0.jar -- svn:mime-type = application/octet-stream
svn commit: r394781 - /lucene/nutch/trunk/bin/
Author: cutting Date: Mon Apr 17 14:40:58 2006 New Revision: 394781 URL: http://svn.apache.org/viewcvs?rev=394781view=rev Log: Ignore more bin files. Modified: lucene/nutch/trunk/bin/ (props changed) Propchange: lucene/nutch/trunk/bin/ -- --- svn:ignore (original) +++ svn:ignore Mon Apr 17 14:40:58 2006 @@ -3,4 +3,8 @@ hadoop-daemons.sh slaves.sh start-all.sh +start-dfs.sh +start-mapred.sh stop-all.sh +stop-dfs.sh +stop-mapred.sh
svn commit: r392458 - in /lucene/nutch/trunk/lib: hadoop-0.1.0.jar hadoop-0.1.1.jar
Author: cutting Date: Fri Apr 7 16:48:10 2006 New Revision: 392458 URL: http://svn.apache.org/viewcvs?rev=392458view=rev Log: Upgrading to Hadoop release 0.1.1. Added: lucene/nutch/trunk/lib/hadoop-0.1.1.jar (with props) Removed: lucene/nutch/trunk/lib/hadoop-0.1.0.jar Added: lucene/nutch/trunk/lib/hadoop-0.1.1.jar URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/lib/hadoop-0.1.1.jar?rev=392458view=auto == Binary file - no diff available. Propchange: lucene/nutch/trunk/lib/hadoop-0.1.1.jar -- svn:mime-type = application/octet-stream
svn commit: r390745 - in /lucene/nutch/trunk/lib: hadoop-0.1-dev.jar hadoop-0.1.0.jar
Author: cutting Date: Sat Apr 1 12:16:22 2006 New Revision: 390745 URL: http://svn.apache.org/viewcvs?rev=390745view=rev Log: Update to Hadoop 0.1.0 release. Added: lucene/nutch/trunk/lib/hadoop-0.1.0.jar (with props) Removed: lucene/nutch/trunk/lib/hadoop-0.1-dev.jar Added: lucene/nutch/trunk/lib/hadoop-0.1.0.jar URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/lib/hadoop-0.1.0.jar?rev=390745view=auto == Binary file - no diff available. Propchange: lucene/nutch/trunk/lib/hadoop-0.1.0.jar -- svn:mime-type = application/octet-stream
svn commit: r387310 - /lucene/nutch/trunk/lib/hadoop-0.1-dev.jar
Author: cutting Date: Mon Mar 20 13:08:15 2006 New Revision: 387310 URL: http://svn.apache.org/viewcvs?rev=387310view=rev Log: Upgrade to current Hadoop. Modified: lucene/nutch/trunk/lib/hadoop-0.1-dev.jar Modified: lucene/nutch/trunk/lib/hadoop-0.1-dev.jar URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/lib/hadoop-0.1-dev.jar?rev=387310r1=387309r2=387310view=diff == Binary files - no diff available.
svn commit: r386181 - in /lucene/nutch/branches/branch-0.7: site/issue_tracking.html site/issue_tracking.pdf src/site/src/documentation/content/xdocs/issue_tracking.xml
Author: cutting Date: Wed Mar 15 14:20:40 2006 New Revision: 386181 URL: http://svn.apache.org/viewcvs?rev=386181view=rev Log: Updated link to jira. Modified: lucene/nutch/branches/branch-0.7/site/issue_tracking.html lucene/nutch/branches/branch-0.7/site/issue_tracking.pdf lucene/nutch/branches/branch-0.7/src/site/src/documentation/content/xdocs/issue_tracking.xml Modified: lucene/nutch/branches/branch-0.7/site/issue_tracking.html URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/branch-0.7/site/issue_tracking.html?rev=386181r1=386180r2=386181view=diff == --- lucene/nutch/branches/branch-0.7/site/issue_tracking.html (original) +++ lucene/nutch/branches/branch-0.7/site/issue_tracking.html Wed Mar 15 14:20:40 2006 @@ -128,7 +128,7 @@ p Nutch issues (bugs, as well as enhancement requests) are tracked in - Apache JIRA a href=http://nagoya.apache.org/jira/browse/Nutch;here/a. + Apache JIRA a href=http://issues.apache.org/jira/browse/Nutch;here/a. If you aren't sure whether something is a bug, post a question on the Nutch user a href=mailing_lists.htmlmailing list/a. 
/p Modified: lucene/nutch/branches/branch-0.7/site/issue_tracking.pdf URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/branch-0.7/site/issue_tracking.pdf?rev=386181r1=386180r2=386181view=diff == --- lucene/nutch/branches/branch-0.7/site/issue_tracking.pdf (original) +++ lucene/nutch/branches/branch-0.7/site/issue_tracking.pdf Wed Mar 15 14:20:40 2006 @@ -32,7 +32,7 @@ /Rect [ 485.232 585.8 505.884 573.8 ] /C [ 0 0 0 ] /Border [ 0 0 0 ] -/A /URI (http://nagoya.apache.org/jira/browse/Nutch) +/A /URI (http://issues.apache.org/jira/browse/Nutch) /S /URI /H /I Modified: lucene/nutch/branches/branch-0.7/src/site/src/documentation/content/xdocs/issue_tracking.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/branch-0.7/src/site/src/documentation/content/xdocs/issue_tracking.xml?rev=386181r1=386180r2=386181view=diff == --- lucene/nutch/branches/branch-0.7/src/site/src/documentation/content/xdocs/issue_tracking.xml (original) +++ lucene/nutch/branches/branch-0.7/src/site/src/documentation/content/xdocs/issue_tracking.xml Wed Mar 15 14:20:40 2006 @@ -11,7 +11,7 @@ body p Nutch issues (bugs, as well as enhancement requests) are tracked in - Apache JIRA a href=http://nagoya.apache.org/jira/browse/Nutch;here/a. + Apache JIRA a href=http://issues.apache.org/jira/browse/Nutch;here/a. If you aren't sure whether something is a bug, post a question on the Nutch user a href=mailing_lists.htmlmailing list/a. /p
svn commit: r383698 - /lucene/nutch/trunk/lib/hadoop-0.1-dev.jar
Author: cutting Date: Mon Mar 6 14:54:20 2006 New Revision: 383698 URL: http://svn.apache.org/viewcvs?rev=383698view=rev Log: Upgrade to latest version of Hadoop. Modified: lucene/nutch/trunk/lib/hadoop-0.1-dev.jar Modified: lucene/nutch/trunk/lib/hadoop-0.1-dev.jar URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/lib/hadoop-0.1-dev.jar?rev=383698r1=383697r2=383698view=diff == Binary files - no diff available.
svn commit: r382912 - in /lucene/nutch/trunk/src/java/org/apache/nutch: crawl/ fetcher/ indexer/ parse/ plugin/ searcher/ segment/
Author: cutting Date: Fri Mar 3 11:05:41 2006 New Revision: 382912 URL: http://svn.apache.org/viewcvs?rev=382912view=rev Log: Undo unintentional changes made in r381751. Thanks, Jerome, for catching this! Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbReader.java lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexMerger.java lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java lucene/nutch/trunk/src/java/org/apache/nutch/plugin/PluginRepository.java lucene/nutch/trunk/src/java/org/apache/nutch/searcher/DistributedSearch.java lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java?rev=382912r1=382911r2=382912view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java Fri Mar 3 11:05:41 2006 @@ -44,11 +44,11 @@ /* Perform complete crawling and indexing given a set of root urls. 
*/ - public static boolean doMain(String args[]) throws Exception { + public static void main(String args[]) throws Exception { if (args.length 1) { System.out.println (Usage: Crawl urlDir [-dir d] [-threads n] [-depth i] [-topN N]); - return false; + return; } Configuration conf = NutchConfiguration.create(); @@ -122,22 +122,5 @@ new IndexMerger(fs, fs.listFiles(indexes), index, tmpDir, job).merge(); LOG.info(crawl finished: + dir); - -return true; - } - - /** - * main() wrapper that returns proper exit status - */ - public static void main(String[] args) { -Runtime rt = Runtime.getRuntime(); -try { - boolean status = doMain(args); - rt.exit(status ? 0 : 1); -} -catch (Exception e) { - LOG.log(Level.SEVERE, error, caught Exception in main(), e); - rt.exit(1); -} } } Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java?rev=382912r1=382911r2=382912view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java Fri Mar 3 11:05:41 2006 @@ -90,31 +90,17 @@ fs.delete(old); } - public static boolean doMain(String[] args) throws Exception { + public static void main(String[] args) throws Exception { CrawlDb crawlDb = new CrawlDb(NutchConfiguration.create()); if (args.length 2) { System.err.println(Usage: crawldb segment); - return false; + return; } crawlDb.update(new File(args[0]), new File(args[1])); - -return true; } - /** - * main() wrapper that returns proper exit status - */ - public static void main(String[] args) { -Runtime rt = Runtime.getRuntime(); -try { - boolean status = doMain(args); - rt.exit(status ? 
0 : 1); -} -catch (Exception e) { - LOG.log(Level.SEVERE, error, caught Exception in main(), e); - rt.exit(1); -} - } + + } Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java?rev=382912r1=382911r2=382912view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java Fri Mar 3 11:05:41 2006 @@ -20,7 +20,7 @@ import java.io.IOException; import java.util.Iterator; import java.util.TreeMap; -import java.util.logging.*; +import java.util.logging.Logger; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.io.LongWritable; @@ -241,7 +241,7 @@ JobClient.runJob(job); } - public static boolean doMain(String[] args) throws IOException { + public static void main(String[] args) throws IOException { CrawlDbReader dbr = new CrawlDbReader(); if (args.length 1) { @@ -250,7 +250,7
svn commit: r382939 - /lucene/nutch/trunk/lib/hadoop-0.1-dev.jar
Author: cutting Date: Fri Mar 3 13:46:21 2006 New Revision: 382939 URL: http://svn.apache.org/viewcvs?rev=382939view=rev Log: Upgrade hadoop to latest version with some important mapred bug fixes. Modified: lucene/nutch/trunk/lib/hadoop-0.1-dev.jar Modified: lucene/nutch/trunk/lib/hadoop-0.1-dev.jar URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/lib/hadoop-0.1-dev.jar?rev=382939r1=382938r2=382939view=diff == Binary files - no diff available.
svn commit: r382512 - in /lucene/nutch/trunk/lib: lucene-core-1.9-final.jar lucene-core-1.9.1.jar lucene-misc-1.9-final.jar lucene-misc-1.9.1.jar
Author: cutting Date: Thu Mar 2 12:59:09 2006 New Revision: 382512 URL: http://svn.apache.org/viewcvs?rev=382512view=rev Log: Upgrade to Lucene 1.9.1. Added: lucene/nutch/trunk/lib/lucene-core-1.9.1.jar (with props) lucene/nutch/trunk/lib/lucene-misc-1.9.1.jar (with props) Removed: lucene/nutch/trunk/lib/lucene-core-1.9-final.jar lucene/nutch/trunk/lib/lucene-misc-1.9-final.jar Added: lucene/nutch/trunk/lib/lucene-core-1.9.1.jar URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/lib/lucene-core-1.9.1.jar?rev=382512view=auto == Binary file - no diff available. Propchange: lucene/nutch/trunk/lib/lucene-core-1.9.1.jar -- svn:mime-type = application/octet-stream Added: lucene/nutch/trunk/lib/lucene-misc-1.9.1.jar URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/lib/lucene-misc-1.9.1.jar?rev=382512view=auto == Binary file - no diff available. Propchange: lucene/nutch/trunk/lib/lucene-misc-1.9.1.jar -- svn:mime-type = application/octet-stream
svn commit: r382573 - in /lucene/nutch/trunk: conf/hadoop-env.sh.template lib/hadoop-0.1-dev.jar
Author: cutting Date: Thu Mar 2 15:59:24 2006 New Revision: 382573 URL: http://svn.apache.org/viewcvs?rev=382573view=rev Log: Update to latest Hadoop code. Modified: lucene/nutch/trunk/conf/hadoop-env.sh.template lucene/nutch/trunk/lib/hadoop-0.1-dev.jar Modified: lucene/nutch/trunk/conf/hadoop-env.sh.template URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/conf/hadoop-env.sh.template?rev=382573r1=382572r2=382573view=diff == --- lucene/nutch/trunk/conf/hadoop-env.sh.template (original) +++ lucene/nutch/trunk/conf/hadoop-env.sh.template Thu Mar 2 15:59:24 2006 @@ -1,6 +1,11 @@ # Set Hadoop-specific environment variables here. -# The java implementation to use. +# The only required environment variable is JAVA_HOME. All others are +# optional. When running a distributed configuration it is best to +# set JAVA_HOME in this file, so that it is correctly defined on +# remote nodes. + +# The java implementation to use. Required. # export JAVA_HOME=/usr/bin/java # The maximum amount of heap to use, in MB. Default is 1000. @@ -8,6 +13,9 @@ # Extra Java runtime options. Empty by default. # export HADOOP_OPTS=-server + +# Extra ssh options. Default: '-o ConnectTimeout=1 -o SendEnv=HADOOP_CONF_DIR'. +# export HADOOP_SSH_OPTS=-o ConnectTimeout=1 -o SendEnv=HADOOP_CONF_DIR # Where log files are stored. $HADOOP_HOME/logs by default. # export HADOOP_LOG_DIR=${HADOOP_HOME}/logs Modified: lucene/nutch/trunk/lib/hadoop-0.1-dev.jar URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/lib/hadoop-0.1-dev.jar?rev=382573r1=382572r2=382573view=diff == Binary files - no diff available.
svn commit: r382579 - /lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java
Author: cutting Date: Thu Mar 2 16:06:59 2006 New Revision: 382579 URL: http://svn.apache.org/viewcvs?rev=382579view=rev Log: Disable speculative execution, since input format has side effects. Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java?rev=382579r1=382578r2=382579view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java Thu Mar 2 16:06:59 2006 @@ -307,6 +307,7 @@ job.setInputKeyClass(HashScore.class); job.setInputValueClass(IndexDoc.class); job.setInputFormat(InputFormat.class); +job.setBoolean(mapred.speculative.execution, false); job.setPartitionerClass(HashPartitioner.class); job.setReducerClass(HashReducer.class);
svn commit: r381721 - in /lucene/nutch/trunk/lib: lucene-core-1.9-final.jar lucene-core-1.9-rc1-dev.jar lucene-misc-1.9-final.jar lucene-misc-1.9-rc1-dev.jar
Author: cutting Date: Tue Feb 28 10:00:43 2006 New Revision: 381721 URL: http://svn.apache.org/viewcvs?rev=381721view=rev Log: Upgrade lucene version to final release. Added: lucene/nutch/trunk/lib/lucene-core-1.9-final.jar (with props) lucene/nutch/trunk/lib/lucene-misc-1.9-final.jar (with props) Removed: lucene/nutch/trunk/lib/lucene-core-1.9-rc1-dev.jar lucene/nutch/trunk/lib/lucene-misc-1.9-rc1-dev.jar Added: lucene/nutch/trunk/lib/lucene-core-1.9-final.jar URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/lib/lucene-core-1.9-final.jar?rev=381721view=auto == Binary file - no diff available. Propchange: lucene/nutch/trunk/lib/lucene-core-1.9-final.jar -- svn:mime-type = application/octet-stream Added: lucene/nutch/trunk/lib/lucene-misc-1.9-final.jar URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/lib/lucene-misc-1.9-final.jar?rev=381721view=auto == Binary file - no diff available. Propchange: lucene/nutch/trunk/lib/lucene-misc-1.9-final.jar -- svn:mime-type = application/octet-stream
svn commit: r381824 - /lucene/nutch/trunk/lib/hadoop-0.1-dev.jar
Author: cutting Date: Tue Feb 28 15:30:02 2006 New Revision: 381824 URL: http://svn.apache.org/viewcvs?rev=381824view=rev Log: Updating hadoop jar. Includes fixes for Windows. Modified: lucene/nutch/trunk/lib/hadoop-0.1-dev.jar Modified: lucene/nutch/trunk/lib/hadoop-0.1-dev.jar URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/lib/hadoop-0.1-dev.jar?rev=381824r1=381823r2=381824view=diff == Binary files - no diff available.
svn commit: r380789 - /lucene/nutch/trunk/build.xml
Author: cutting Date: Fri Feb 24 11:11:44 2006 New Revision: 380789 URL: http://svn.apache.org/viewcvs?rev=380789view=rev Log: Fix to not use 'exec', but rather 'untar' and 'chmod' which are more portable. Modified: lucene/nutch/trunk/build.xml Modified: lucene/nutch/trunk/build.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/build.xml?rev=380789r1=380788r2=380789view=diff == --- lucene/nutch/trunk/build.xml (original) +++ lucene/nutch/trunk/build.xml Fri Feb 24 11:11:44 2006 @@ -57,10 +57,9 @@ fileset dir=${lib.dir} includes=hadoop*.jar/ patternset includes=bin.tgz/ /unjar -exec dir=bin executable=tar - arg value=xzf/ - arg value=../${build.dir}/hadoop/bin.tgz/ -/exec + +untar src=${build.dir}/hadoop/bin.tgz dest=bin compression=gzip/ +chmod dir=bin perm=ugo+rx includes=*.sh,hadoop/ !-- unpack hadoop webapp from hadoop jar into build directory -- mkdir dir=${build.dir}/webapps/
svn commit: r380840 - /lucene/nutch/trunk/lib/hadoop-0.1-dev.jar
Author: cutting Date: Fri Feb 24 14:38:06 2006 New Revision: 380840 URL: http://svn.apache.org/viewcvs?rev=380840view=rev Log: Update hadoop jar, to get recent fixes from that project. Modified: lucene/nutch/trunk/lib/hadoop-0.1-dev.jar Modified: lucene/nutch/trunk/lib/hadoop-0.1-dev.jar URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/lib/hadoop-0.1-dev.jar?rev=380840r1=380839r2=380840view=diff == Binary files - no diff available.
svn commit: r378381 - /lucene/nutch/trunk/src/site/src/documentation/content/xdocs/tabs.xml
Author: cutting Date: Thu Feb 16 14:24:47 2006 New Revision: 378381 URL: http://svn.apache.org/viewcvs?rev=378381view=rev Log: Fix to work with Forrest 0.7, where ext: links seem to no longer work in tabs.xml. Modified: lucene/nutch/trunk/src/site/src/documentation/content/xdocs/tabs.xml Modified: lucene/nutch/trunk/src/site/src/documentation/content/xdocs/tabs.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/site/src/documentation/content/xdocs/tabs.xml?rev=378381r1=378380r2=378381view=diff == --- lucene/nutch/trunk/src/site/src/documentation/content/xdocs/tabs.xml (original) +++ lucene/nutch/trunk/src/site/src/documentation/content/xdocs/tabs.xml Thu Feb 16 14:24:47 2006 @@ -15,6 +15,6 @@ -- tab label=Main dir=/ - tab label=Wiki href=ext:wiki/ + tab label=Wiki href=http://wiki.apache.org/nutch// /tabs
svn commit: r378044 - /lucene/nutch/trunk/lib/hadoop-0.1-dev.jar
Author: cutting Date: Wed Feb 15 09:56:54 2006 New Revision: 378044 URL: http://svn.apache.org/viewcvs?rev=378044view=rev Log: Upgrade to latest version of Hadoop. Modified: lucene/nutch/trunk/lib/hadoop-0.1-dev.jar Modified: lucene/nutch/trunk/lib/hadoop-0.1-dev.jar URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/lib/hadoop-0.1-dev.jar?rev=378044r1=378043r2=378044view=diff == Binary files - no diff available.
svn commit: r378107 - in /lucene/nutch/trunk: conf/ conf/hadoop-env.sh.template conf/slaves.template lib/hadoop-0.1-dev.jar src/java/org/apache/nutch/fetcher/Fetcher.java
Author: cutting Date: Wed Feb 15 14:45:31 2006 New Revision: 378107 URL: http://svn.apache.org/viewcvs?rev=378107&view=rev Log: Fix Fetcher to disable speculative execution, to keep it polite. Also upgrade to latest hadoop jar that supports this feature. Note that Hadoop's environment specification has changed, with all environment variables settable from conf/hadoop-env.sh, and the slaves file is now in conf/, rather than in one's home directory. Added: lucene/nutch/trunk/conf/hadoop-env.sh.template lucene/nutch/trunk/conf/slaves.template Modified: lucene/nutch/trunk/conf/ (props changed) lucene/nutch/trunk/lib/hadoop-0.1-dev.jar lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Propchange: lucene/nutch/trunk/conf/ -- --- svn:ignore (original) +++ svn:ignore Wed Feb 15 14:45:31 2006 @@ -1,5 +1,4 @@ -nutch-site.xml -regex-normalize.xml -crawl-urlfilter.txt -regex-urlfilter.txt -mapred-default.xml +*.xml +*.txt +*.sh +slaves Added: lucene/nutch/trunk/conf/hadoop-env.sh.template URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/conf/hadoop-env.sh.template?rev=378107&view=auto == --- lucene/nutch/trunk/conf/hadoop-env.sh.template (added) +++ lucene/nutch/trunk/conf/hadoop-env.sh.template Wed Feb 15 14:45:31 2006 @@ -0,0 +1,25 @@ +# Set Hadoop-specific environment variables here. + +# The java implementation to use. +# export JAVA_HOME=/usr/bin/java + +# The maximum amount of heap to use, in MB. Default is 1000. +# export HADOOP_HEAPSIZE=2000 + +# Extra Java runtime options. Empty by default. +# export HADOOP_OPTS=-server + +# Where log files are stored. $HADOOP_HOME/logs by default. +# export HADOOP_LOG_DIR=${HADOOP_HOME}/logs + +# File naming remote slave hosts. $HADOOP_HOME/conf/slaves by default. +# export HADOOP_SLAVES=${HADOOP_HOME}/conf/slaves + +# host:path where hadoop code should be rsync'd from. Unset by default. +# export HADOOP_MASTER=master:/home/$USER/src/hadoop + +# The directory where pid files are stored. /tmp by default. 
+# export HADOOP_PID_DIR=/var/hadoop/pids + +# A string representing this instance of hadoop. $USER by default. +# export HADOOP_IDENT_STRING=$USER Added: lucene/nutch/trunk/conf/slaves.template URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/conf/slaves.template?rev=378107view=auto == --- lucene/nutch/trunk/conf/slaves.template (added) +++ lucene/nutch/trunk/conf/slaves.template Wed Feb 15 14:45:31 2006 @@ -0,0 +1 @@ +localhost Modified: lucene/nutch/trunk/lib/hadoop-0.1-dev.jar URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/lib/hadoop-0.1-dev.jar?rev=378107r1=378106r2=378107view=diff == Binary files - no diff available. Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java?rev=378107r1=378106r2=378107view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Wed Feb 15 14:45:31 2006 @@ -348,6 +348,9 @@ job.set(SEGMENT_NAME_KEY, segment.getName()); job.setBoolean(fetcher.parse, parsing); +// for politeness, don't permit parallel execution of a single task +job.setBoolean(mapred.speculative.execution, false); + job.setInputDir(new File(segment, CrawlDatum.GENERATE_DIR_NAME)); job.setInputFormat(InputFormat.class); job.setInputKeyClass(UTF8.class);
svn commit: r376815 - /lucene/nutch/trunk/lib/hadoop-0.1-dev.jar
Author: cutting Date: Fri Feb 10 11:44:47 2006 New Revision: 376815 URL: http://svn.apache.org/viewcvs?rev=376815view=rev Log: Update Hadoop jar. Modified: lucene/nutch/trunk/lib/hadoop-0.1-dev.jar Modified: lucene/nutch/trunk/lib/hadoop-0.1-dev.jar URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/lib/hadoop-0.1-dev.jar?rev=376815r1=376814r2=376815view=diff == Binary files - no diff available.
svn commit: r376435 - in /lucene/nutch/trunk: lib/ src/java/org/apache/nutch/crawl/ src/java/org/apache/nutch/fetcher/ src/java/org/apache/nutch/indexer/ src/java/org/apache/nutch/parse/ src/java/org/
Author: cutting Date: Thu Feb 9 12:57:44 2006 New Revision: 376435 URL: http://svn.apache.org/viewcvs?rev=376435view=rev Log: Updating to latest Hadoop jar, adding now-required close() methods to mapper and reducer implementations. Modified: lucene/nutch/trunk/lib/hadoop-0.1-dev.jar lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java lucene/nutch/trunk/src/java/org/apache/nutch/crawl/PartitionUrlByHost.java lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java Modified: lucene/nutch/trunk/lib/hadoop-0.1-dev.jar URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/lib/hadoop-0.1-dev.jar?rev=376435r1=376434r2=376435view=diff == Binary files - no diff available. 
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java?rev=376435r1=376434r2=376435view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java Thu Feb 9 12:57:44 2006 @@ -56,6 +56,7 @@ public static class CrawlDbStatMapper implements Mapper { public void configure(JobConf job) {} +public void close() {} public void map(WritableComparable key, Writable value, OutputCollector output, Reporter reporter) throws IOException { CrawlDatum cd = (CrawlDatum) value; @@ -68,6 +69,7 @@ public static class CrawlDbStatReducer implements Reducer { public void configure(JobConf job) {} +public void close() {} public void reduce(WritableComparable key, Iterator values, OutputCollector output, Reporter reporter) throws IOException { @@ -127,8 +129,8 @@ } } -public void configure(JobConf job) { -} +public void configure(JobConf job) {} +public void close() {} } public void processStatJob(String crawlDb, Configuration config) throws IOException { Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java?rev=376435r1=376434r2=376435view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java Thu Feb 9 12:57:44 2006 @@ -30,6 +30,8 @@ retryMax = job.getInt(db.fetch.retry.max, 3); } + public void close() {} + public void reduce(WritableComparable key, Iterator values, OutputCollector output, Reporter reporter) throws IOException { Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java?rev=376435r1=376434r2=376435view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java Thu Feb 9 12:57:44 2006 @@ -51,6 +51,8 @@ maxPerHost = job.getInt(generate.max.per.host, -1); } +public void close() {} + /** Select invert subset due for fetch. */ public void map(WritableComparable key, Writable value, OutputCollector output, Reporter reporter) Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java?rev=376435r1=376434r2=376435view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java Thu Feb 9 12:57:44 2006 @@ -48,6 +48,8 @@ this.jobConf = job; } +public void close() {} + public void map(WritableComparable key, Writable val, OutputCollector output, Reporter reporter) throws IOException { @@ -73,6 +75,7
svn commit: r376485 - in /lucene/nutch/trunk: ./ bin/ lib/ src/java/org/apache/nutch/crawl/ src/java/org/apache/nutch/fetcher/ src/java/org/apache/nutch/indexer/ src/java/org/apache/nutch/parse/ src/j
Author: cutting Date: Thu Feb 9 15:20:28 2006 New Revision: 376485 URL: http://svn.apache.org/viewcvs?rev=376485view=rev Log: Fix for NUTCH-209. Nutch now supplies all code to remote MapReduce daemons through a job jar file. So Hadoop daemons no longer need to be restarted when Nutch code changes. Added: lucene/nutch/trunk/src/java/org/apache/nutch/util/NutchJob.java Modified: lucene/nutch/trunk/bin/nutch lucene/nutch/trunk/build.xml lucene/nutch/trunk/lib/hadoop-0.1-dev.jar lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Crawl.java lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbReader.java lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java lucene/nutch/trunk/src/java/org/apache/nutch/util/NutchConfiguration.java Modified: lucene/nutch/trunk/bin/nutch URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/bin/nutch?rev=376485r1=376484r2=376485view=diff == --- lucene/nutch/trunk/bin/nutch (original) +++ lucene/nutch/trunk/bin/nutch Thu Feb 9 15:20:28 2006 @@ -82,13 +82,13 @@ CLASSPATH=${NUTCH_CONF_DIR:=$NUTCH_HOME/conf} CLASSPATH=${CLASSPATH}:$JAVA_HOME/lib/tools.jar -# for developers, add Nutch classes to CLASSPATH -if [ -d $NUTCH_HOME/build/classes ]; then - CLASSPATH=${CLASSPATH}:$NUTCH_HOME/build/classes -fi +# for developers, add plugins, job test code to CLASSPATH if [ -d $NUTCH_HOME/build/plugins ]; then 
CLASSPATH=${CLASSPATH}:$NUTCH_HOME/build fi +for f in $NUTCH_HOME/build/nutch-*.job; do + CLASSPATH=${CLASSPATH}:$f; +done if [ -d $NUTCH_HOME/build/test/classes ]; then CLASSPATH=${CLASSPATH}:$NUTCH_HOME/build/test/classes fi @@ -96,14 +96,14 @@ # so that filenames w/ spaces are handled correctly in loops below IFS= -# for releases, add Nutch jar to CLASSPATH -for f in $NUTCH_HOME/nutch-*.jar; do +# for releases, add Nutch job to CLASSPATH +for f in $NUTCH_HOME/nutch-*.job; do CLASSPATH=${CLASSPATH}:$f; done # add plugins to classpath if [ -d $NUTCH_HOME/plugins ]; then - CLASSPATH=${CLASSPATH}:$NUTCH_HOME + CLASSPATH=${NUTCH_HOME}:${CLASSPATH} fi # add libs to CLASSPATH Modified: lucene/nutch/trunk/build.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/build.xml?rev=376485r1=376484r2=376485view=diff == --- lucene/nutch/trunk/build.xml (original) +++ lucene/nutch/trunk/build.xml Thu Feb 9 15:20:28 2006 @@ -1,6 +1,6 @@ ?xml version=1.0? -project name=Nutch default=compile +project name=Nutch default=job !-- Load all the default properties, and any the user wants-- !-- to contribute (without having to type -D or edit this file -- @@ -100,7 +100,6 @@ target name=dynamic depends=generate-src, compile /target - !-- == -- !-- Make nutch.jar -- !-- == -- @@ -119,6 +118,21 @@ /target !-- == -- + !-- Make job jar -- + !-- == -- + !---- + !-- == -- + target name=job depends=compile +jar jarfile=${build.dir}/${final.name}.job + zipfileset dir=${build.classes}/ + zipfileset dir=${conf.dir} excludes=*.template/ + zipfileset dir=${lib.dir} prefix=lib + includes=**/*.jar excludes=hadoop-*.jar/ + zipfileset dir=${build.plugins} prefix=plugins/ +/jar + /target + + !-- == -- !-- Make nutch.war -- !-- == -- !---- @@ -385,7 +399,7
svn commit: r376072 - /lucene/nutch/trunk/conf/nutch-default.xml
Author: cutting Date: Wed Feb 8 13:25:30 2006 New Revision: 376072 URL: http://svn.apache.org/viewcvs?rev=376072view=rev Log: Restore accidentally removed file defaults. Modified: lucene/nutch/trunk/conf/nutch-default.xml Modified: lucene/nutch/trunk/conf/nutch-default.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/conf/nutch-default.xml?rev=376072r1=376071r2=376072view=diff == --- lucene/nutch/trunk/conf/nutch-default.xml (original) +++ lucene/nutch/trunk/conf/nutch-default.xml Wed Feb 8 13:25:30 2006 @@ -7,6 +7,28 @@ configuration +!-- file properties -- + +property + namefile.content.limit/name + value65536/value + descriptionThe length limit for downloaded content, in bytes. + If this value is larger than zero, content longer than it will be + truncated; otherwise (zero or negative), no truncation at all. + /description +/property + +property + namefile.content.ignored/name + valuetrue/value + descriptionIf true, no file content will be saved during fetch. + And it is probably what we want to set most of time, since file:// URLs + are meant to be local and we can always use them directly at parsing + and indexing stages. Otherwise file contents will be saved. + !! NO IMPLEMENTED YET !! + /description +/property + !-- HTTP properties -- property
svn commit: r375704 - in /lucene/nutch/trunk/lib: jetty-5.1.4.LICENSE.txt jetty-5.1.4.jar jetty-ext/
Author: cutting Date: Tue Feb 7 13:02:46 2006 New Revision: 375704 URL: http://svn.apache.org/viewcvs?rev=375704view=rev Log: Restoring jetty to Nutch lib: removed by mistake. Added: lucene/nutch/trunk/lib/jetty-5.1.4.LICENSE.txt - copied unchanged from r374759, lucene/hadoop/trunk/lib/jetty-5.1.4.LICENSE.txt lucene/nutch/trunk/lib/jetty-5.1.4.jar - copied unchanged from r374759, lucene/hadoop/trunk/lib/jetty-5.1.4.jar lucene/nutch/trunk/lib/jetty-ext/ - copied from r374759, lucene/hadoop/trunk/lib/jetty-ext/
svn commit: r375333 - /lucene/nutch/nightly/nightly.properties
Author: cutting Date: Mon Feb 6 10:57:09 2006 New Revision: 375333 URL: http://svn.apache.org/viewcvs?rev=375333view=rev Log: Updated email parameters. Modified: lucene/nutch/nightly/nightly.properties Modified: lucene/nutch/nightly/nightly.properties URL: http://svn.apache.org/viewcvs/lucene/nutch/nightly/nightly.properties?rev=375333r1=375332r2=375333view=diff == --- lucene/nutch/nightly/nightly.properties (original) +++ lucene/nutch/nightly/nightly.properties Mon Feb 6 10:57:09 2006 @@ -1,5 +1,5 @@ -MailLogger.mailhost = mail.apache.org -MailLogger.from = nutch-dev@incubator.apache.org -MailLogger.failure.to = nutch-dev@incubator.apache.org +MailLogger.mailhost = localhost +MailLogger.from = nutch-dev@lucene.apache.org +MailLogger.failure.to = nutch-dev@lucene.apache.org MailLogger.failure.subject = Nutch nightly build failure MailLogger.success.notify = false
svn commit: r372342 - /lucene/nutch/nightly/nightly.sh
Author: cutting Date: Wed Jan 25 14:20:06 2006 New Revision: 372342 URL: http://svn.apache.org/viewcvs?rev=372342view=rev Log: Fix remove command. Modified: lucene/nutch/nightly/nightly.sh Modified: lucene/nutch/nightly/nightly.sh URL: http://svn.apache.org/viewcvs/lucene/nutch/nightly/nightly.sh?rev=372342r1=372341r2=372342view=diff == --- lucene/nutch/nightly/nightly.sh (original) +++ lucene/nutch/nightly/nightly.sh Wed Jan 25 14:20:06 2006 @@ -25,4 +25,4 @@ scp build/*.tar.gz $REL_SERVER:$REL_DIR/nutch-`/bin/date +%F`.tar.gz # remove all but five newest builds -ssh $REL_SERVER rm `ssh $REL_SERVER find -type f $REL_DIR | sort -r | tail +5` +ssh $REL_SERVER rm `ssh $REL_SERVER find $REL_DIR -type f | sort -r | tail +5`
svn commit: r370632 - /lucene/nutch/trunk/conf/nutch-default.xml
Author: cutting Date: Thu Jan 19 12:58:54 2006 New Revision: 370632 URL: http://svn.apache.org/viewcvs?rev=370632view=rev Log: Switch default to protocol-http, since it seems more reliable than protocol-httpclient. Modified: lucene/nutch/trunk/conf/nutch-default.xml Modified: lucene/nutch/trunk/conf/nutch-default.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/conf/nutch-default.xml?rev=370632r1=370631r2=370632view=diff == --- lucene/nutch/trunk/conf/nutch-default.xml (original) +++ lucene/nutch/trunk/conf/nutch-default.xml Thu Jan 19 12:58:54 2006 @@ -733,7 +733,7 @@ property nameplugin.includes/name - valueprotocol-httpclient|urlfilter-regex|parse-(text|html|js)|index-basic|query-(basic|site|url)/value + valueprotocol-http|urlfilter-regex|parse-(text|html|js)|index-basic|query-(basic|site|url)/value descriptionRegular expression naming plugin directory names to include. Any plugin not matching this expression is excluded. In any case you need at least include the nutch-extensionpoints plugin. By
svn commit: r370638 - /lucene/nutch/trunk/conf/nutch-default.xml
Author: cutting Date: Thu Jan 19 13:24:58 2006 New Revision: 370638 URL: http://svn.apache.org/viewcvs?rev=370638view=rev Log: Document a few more properties. Contributed by Dominik Friedrich. Modified: lucene/nutch/trunk/conf/nutch-default.xml Modified: lucene/nutch/trunk/conf/nutch-default.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/conf/nutch-default.xml?rev=370638r1=370637r2=370638view=diff == --- lucene/nutch/trunk/conf/nutch-default.xml (original) +++ lucene/nutch/trunk/conf/nutch-default.xml Thu Jan 19 13:24:58 2006 @@ -379,6 +379,14 @@ exception./description /property +property + nameio.map.index.skip/name + value0/value + descriptionNumber of index entries to skip between each entry. + Zero by default. Setting this to values larger than zero can + facilitate opening large map files using less memory./description +/property + !-- file system properties -- property @@ -412,6 +420,14 @@ directories, typically on different devices./description /property +property + namendfs.replication/name + value3/value + descriptionHow many copies we try to have at all times. The actual + number of replications is at max the number of datanodes in the + cluster./description +/property + !-- map/reduce properties -- property @@ -509,6 +525,13 @@ value200m/value descriptionThe heap size (-Xmx) that will be used for task tracker child processes./description +/property + +property + namemapred.combine.buffer.size/name + value10/value + descriptionThe number of entries the combining collector caches before + combining them and writing to disk./description /property !-- indexer properties --
svn commit: r370657 - in /lucene/nutch/nightly: nightly.cron nightly.properties nightly.sh
Author: cutting Date: Thu Jan 19 14:46:28 2006 New Revision: 370657 URL: http://svn.apache.org/viewcvs?rev=370657view=rev Log: Moving nightly build to lucene.zones.apache.org. Modified: lucene/nutch/nightly/nightly.cron lucene/nutch/nightly/nightly.properties lucene/nutch/nightly/nightly.sh Modified: lucene/nutch/nightly/nightly.cron URL: http://svn.apache.org/viewcvs/lucene/nutch/nightly/nightly.cron?rev=370657r1=370656r2=370657view=diff == --- lucene/nutch/nightly/nightly.cron (original) +++ lucene/nutch/nightly/nightly.cron Thu Jan 19 14:46:28 2006 @@ -1,4 +1,4 @@ # nightly crontab file # install with: crontab nightly.cron # run seventeen minutes after midnight, every day -17 0 * * * $HOME/src/nutch/nightly/nightly.sh $HOME/src/nutch/nightly/nightly.log 21 +17 0 * * * $HOME/nutch-nightly/nightly.sh $HOME/nutch-nightly/nightly.log 21 Modified: lucene/nutch/nightly/nightly.properties URL: http://svn.apache.org/viewcvs/lucene/nutch/nightly/nightly.properties?rev=370657r1=370656r2=370657view=diff == --- lucene/nutch/nightly/nightly.properties (original) +++ lucene/nutch/nightly/nightly.properties Thu Jan 19 14:46:28 2006 @@ -1,4 +1,4 @@ -MailLogger.mailhost = smtp.sbcglobal.net +MailLogger.mailhost = mail.apache.org MailLogger.from = nutch-dev@incubator.apache.org MailLogger.failure.to = nutch-dev@incubator.apache.org MailLogger.failure.subject = Nutch nightly build failure Modified: lucene/nutch/nightly/nightly.sh URL: http://svn.apache.org/viewcvs/lucene/nutch/nightly/nightly.sh?rev=370657r1=370656r2=370657view=diff == --- lucene/nutch/nightly/nightly.sh (original) +++ lucene/nutch/nightly/nightly.sh Thu Jan 19 14:46:28 2006 @@ -1,6 +1,6 @@ #!/bin/bash -vx -export JAVA_HOME=$HOME/local/j2sdk1.4.2 +export JAVA_HOME=/usr/j2se TRUNK=http://svn.apache.org/repos/asf/lucene/nutch/trunk @@ -12,12 +12,12 @@ cd /tmp # export sources into it -svn export $TRUNK nutch-nightly +$HOME/bin/svn export $TRUNK nutch-nightly # run build cd nutch-nightly -$HOME/local/ant/bin/ant \ - 
-propertyfile $HOME/src/nutch/nightly/nightly.properties \ +$HOME/bin/ant \ + -propertyfile $HOME/nutch-nightly/nightly.properties \ -logger org.apache.tools.ant.listener.MailLogger \ -Dversion=nightly nightly
svn commit: r370281 - /lucene/nutch/trunk/build.xml
Author: cutting Date: Wed Jan 18 14:03:28 2006 New Revision: 370281 URL: http://svn.apache.org/viewcvs?rev=370281view=rev Log: Fix NUTCH-102: include webapps in packaged releases. Modified: lucene/nutch/trunk/build.xml Modified: lucene/nutch/trunk/build.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/build.xml?rev=370281r1=370280r2=370281view=diff == --- lucene/nutch/trunk/build.xml (original) +++ lucene/nutch/trunk/build.xml Wed Jan 18 14:03:28 2006 @@ -377,6 +377,10 @@ fileset dir=lib/ /copy +copy todir=${dist.dir}/webapps + fileset dir=${build.webapps}/ +/copy + copy todir=${dist.dir}/plugins fileset dir=${build.plugins}/ /copy
svn commit: r367406 - in /lucene/nutch/trunk/src: java/org/apache/nutch/ipc/RPC.java test/org/apache/nutch/ipc/TestRPC.java
Author: cutting Date: Mon Jan 9 13:50:48 2006 New Revision: 367406 URL: http://svn.apache.org/viewcvs?rev=367406view=rev Log: Fix parallel RPC calls to work correctly with methods that return void. Modified: lucene/nutch/trunk/src/java/org/apache/nutch/ipc/RPC.java lucene/nutch/trunk/src/test/org/apache/nutch/ipc/TestRPC.java Modified: lucene/nutch/trunk/src/java/org/apache/nutch/ipc/RPC.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/ipc/RPC.java?rev=367406r1=367405r2=367406view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/ipc/RPC.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/ipc/RPC.java Mon Jan 9 13:50:48 2006 @@ -149,6 +149,10 @@ Writable[] wrappedValues = CLIENT.call(invocations, addrs); +if (method.getReturnType() == Void.TYPE) { + return null; +} + Object[] values = (Object[])Array.newInstance(method.getReturnType(),wrappedValues.length); for (int i = 0; i values.length; i++) Modified: lucene/nutch/trunk/src/test/org/apache/nutch/ipc/TestRPC.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/test/org/apache/nutch/ipc/TestRPC.java?rev=367406r1=367405r2=367406view=diff == --- lucene/nutch/trunk/src/test/org/apache/nutch/ipc/TestRPC.java (original) +++ lucene/nutch/trunk/src/test/org/apache/nutch/ipc/TestRPC.java Mon Jan 9 13:50:48 2006 @@ -110,13 +110,17 @@ } assertTrue(caught); -// try a multi-call -Method method = +// try some multi-calls +Method echo = TestProtocol.class.getMethod(echo, new Class[] { String.class }); -String[] values = (String[])RPC.call(method, new String[][]{{a},{b}}, +String[] strings = (String[])RPC.call(echo, new String[][]{{a},{b}}, new InetSocketAddress[] {addr, addr}); -assertTrue(Arrays.equals(values, new String[]{a,b})); +assertTrue(Arrays.equals(strings, new String[]{a,b})); +Method ping = TestProtocol.class.getMethod(ping, new Class[] {}); +Object[] voids = (Object[])RPC.call(ping, new Object[][]{{},{}}, +new InetSocketAddress[] {addr, addr}); 
+assertEquals(voids, null); server.stop(); }
svn commit: r367408 - /lucene/nutch/trunk/src/plugin/urlfilter-regex/src/java/org/apache/nutch/net/RegexURLFilter.java
Author: cutting Date: Mon Jan 9 13:55:31 2006 New Revision: 367408 URL: http://svn.apache.org/viewcvs?rev=367408view=rev Log: NUTCH-160: Switch RegexURLFilter to use Java regex's rather than oro, since Java's seem to be faster and more reliable. By Rod Taylor. Modified: lucene/nutch/trunk/src/plugin/urlfilter-regex/src/java/org/apache/nutch/net/RegexURLFilter.java Modified: lucene/nutch/trunk/src/plugin/urlfilter-regex/src/java/org/apache/nutch/net/RegexURLFilter.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/urlfilter-regex/src/java/org/apache/nutch/net/RegexURLFilter.java?rev=367408r1=367407r2=367408view=diff == --- lucene/nutch/trunk/src/plugin/urlfilter-regex/src/java/org/apache/nutch/net/RegexURLFilter.java (original) +++ lucene/nutch/trunk/src/plugin/urlfilter-regex/src/java/org/apache/nutch/net/RegexURLFilter.java Mon Jan 9 13:55:31 2006 @@ -32,12 +32,7 @@ import java.util.ArrayList; import java.util.Iterator; import java.util.logging.Logger; - -import org.apache.oro.text.regex.Perl5Compiler; -import org.apache.oro.text.regex.Perl5Matcher; -import org.apache.oro.text.regex.Perl5Pattern; -import org.apache.oro.text.regex.PatternMatcher; -import org.apache.oro.text.regex.MalformedPatternException; +import java.util.regex.*; /** * Filters URLs based on a file of regular expressions. 
The file is named by @@ -80,15 +75,14 @@ } private static class Rule { -public Perl5Pattern pattern; +public Pattern pattern; public boolean sign; public String regex; } private List rules; - private PatternMatcher matcher = new Perl5Matcher(); - public RegexURLFilter() throws IOException, MalformedPatternException { + public RegexURLFilter() throws IOException, PatternSyntaxException { String file = NutchConf.get().get(urlfilter.regex.file); // attribute file takes precedence if defined if (attributeFile != null) @@ -103,7 +97,7 @@ } public RegexURLFilter(String filename) -throws IOException, MalformedPatternException { +throws IOException, PatternSyntaxException { rules = readConfigurationFile(new FileReader(filename)); } @@ -111,7 +105,9 @@ Iterator i=rules.iterator(); while(i.hasNext()) { Rule r=(Rule) i.next(); - if (matcher.contains(url,r.pattern)) { + Matcher matcher = r.pattern.matcher(url); + + if (matcher.find()) { //System.out.println(Matched + r.regex); return r.sign ? url : null; } @@ -129,10 +125,9 @@ // private static List readConfigurationFile(Reader reader) -throws IOException, MalformedPatternException { +throws IOException, PatternSyntaxException { BufferedReader in=new BufferedReader(reader); -Perl5Compiler compiler=new Perl5Compiler(); List rules=new ArrayList(); String line; @@ -157,7 +152,7 @@ String regex=line.substring(1); Rule rule=new Rule(); - rule.pattern=(Perl5Pattern) compiler.compile(regex); + rule.pattern=Pattern.compile(regex); rule.sign=sign; rule.regex=regex; rules.add(rule); @@ -167,7 +162,7 @@ } public static void main(String args[]) -throws IOException, MalformedPatternException { +throws IOException, PatternSyntaxException { RegexURLFilter filter=new RegexURLFilter(); BufferedReader in=new BufferedReader(new InputStreamReader(System.in));
svn commit: r366550 - /lucene/nutch/trunk/src/java/org/apache/nutch/ipc/Client.java
Author: cutting Date: Fri Jan 6 11:14:46 2006 New Revision: 366550 URL: http://svn.apache.org/viewcvs?rev=366550view=rev Log: Make it clearer why this optimization is valid. For Stefan. Modified: lucene/nutch/trunk/src/java/org/apache/nutch/ipc/Client.java Modified: lucene/nutch/trunk/src/java/org/apache/nutch/ipc/Client.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/ipc/Client.java?rev=366550r1=366549r2=366550view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/ipc/Client.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/ipc/Client.java Fri Jan 6 11:14:46 2006 @@ -306,7 +306,7 @@ * contains nulls for calls that timed out or errored. */ public Writable[] call(Writable[] params, InetSocketAddress[] addresses) throws IOException { -if (params.length == 0) return new Writable[0]; +if (addresses.length == 0) return new Writable[0]; ParallelResults results = new ParallelResults(params.length); synchronized (results) {
svn commit: r366242 - in /lucene/nutch/trunk: conf/nutch-default.xml src/java/org/apache/nutch/searcher/LuceneQueryOptimizer.java
Author: cutting Date: Thu Jan 5 10:38:44 2006 New Revision: 366242 URL: http://svn.apache.org/viewcvs?rev=366242view=rev Log: Fix NegativeArraySizeException. Modified: lucene/nutch/trunk/conf/nutch-default.xml lucene/nutch/trunk/src/java/org/apache/nutch/searcher/LuceneQueryOptimizer.java Modified: lucene/nutch/trunk/conf/nutch-default.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/conf/nutch-default.xml?rev=366242r1=366241r2=366242view=diff == --- lucene/nutch/trunk/conf/nutch-default.xml (original) +++ lucene/nutch/trunk/conf/nutch-default.xml Thu Jan 5 10:38:44 2006 @@ -661,10 +661,11 @@ property namesearcher.max.hits/name - value2147483647/value - descriptionSearch stops after this many hits are found. Setting - this to smaller values can make searches much faster. With a sorted - index, the quality of the hits suffers little./description + value-1/value + descriptionIf positive, search stops after this many hits are + found. Setting this to small, positive values (e.g., 1000) can make + searches much faster. With a sorted index, the quality of the hits + suffers little./description /property !-- URL normalizer properties -- Modified: lucene/nutch/trunk/src/java/org/apache/nutch/searcher/LuceneQueryOptimizer.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/searcher/LuceneQueryOptimizer.java?rev=366242r1=366241r2=366242view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/searcher/LuceneQueryOptimizer.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/searcher/LuceneQueryOptimizer.java Thu Jan 5 10:38:44 2006 @@ -37,8 +37,7 @@ * which do not affect ranking but might otherwise slow search considerably. 
*/ class LuceneQueryOptimizer { - private static int MAX_HITS = -NutchConf.get().getInt(searcher.max.hits, Integer.MAX_VALUE); + private static int MAX_HITS = NutchConf.get().getInt(searcher.max.hits,-1); private static class LimitExceeded extends RuntimeException { private int maxDoc; @@ -150,6 +149,13 @@ } } if (sortField == null !reverse) { + + // no hit limit + if (MAX_HITS = 0) { +return searcher.search(query, filter, numHits); + } + + // hits limited -- use a LimitedCollector LimitedCollector collector = new LimitedCollector(numHits, MAX_HITS); LimitExceeded exceeded = null; try {
svn commit: r366271 - /lucene/nutch/trunk/src/java/org/apache/nutch/mapred/TaskTracker.java
Author: cutting Date: Thu Jan 5 12:13:43 2006 New Revision: 366271 URL: http://svn.apache.org/viewcvs?rev=366271view=rev Log: Fix for NUTCH-108: eliminate voluminous messages when reconnecting. From Paul Baclace. Modified: lucene/nutch/trunk/src/java/org/apache/nutch/mapred/TaskTracker.java Modified: lucene/nutch/trunk/src/java/org/apache/nutch/mapred/TaskTracker.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/mapred/TaskTracker.java?rev=366271r1=366270r2=366271view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/mapred/TaskTracker.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/mapred/TaskTracker.java Thu Jan 5 12:13:43 2006 @@ -287,8 +287,7 @@ staleState = true; } } catch (Exception ex) { -ex.printStackTrace(); -LOG.info(Lost connection to JobTracker [ + jobTrackAddr + ]. Retrying...); +LOG.info(Lost connection to JobTracker [ + jobTrackAddr + ]. ex= + ex + Retrying...); try { Thread.sleep(5000); } catch (InterruptedException ie) {
svn commit: r366322 - /lucene/nutch/trunk/src/java/org/apache/nutch/searcher/LuceneQueryOptimizer.java
Author: cutting Date: Thu Jan 5 14:37:19 2006 New Revision: 366322 URL: http://svn.apache.org/viewcvs?rev=366322view=rev Log: Fix a bug in LimitedCollector. Modified: lucene/nutch/trunk/src/java/org/apache/nutch/searcher/LuceneQueryOptimizer.java Modified: lucene/nutch/trunk/src/java/org/apache/nutch/searcher/LuceneQueryOptimizer.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/searcher/LuceneQueryOptimizer.java?rev=366322r1=366321r2=366322view=diff == --- lucene/nutch/trunk/src/java/org/apache/nutch/searcher/LuceneQueryOptimizer.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/searcher/LuceneQueryOptimizer.java Thu Jan 5 14:37:19 2006 @@ -48,7 +48,7 @@ private int maxHits; public LimitedCollector(int numHits, int maxHits) { - super(maxHits); + super(numHits); this.maxHits = maxHits; }
svn commit: r357197 [5/5] - in /lucene/nutch: branches/mapred/ trunk/ trunk/bin/ trunk/conf/ trunk/lib/ trunk/lib/jetty-ext/ trunk/site/ trunk/src/java/org/apache/nutch/crawl/ trunk/src/java/org/apach
Modified: lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/RobotRulesParser.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/RobotRulesParser.java?rev=357197r1=357196r2=357197view=diff == --- lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/RobotRulesParser.java (original) +++ lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/RobotRulesParser.java Fri Dec 16 09:51:05 2005 @@ -53,6 +53,9 @@ private static final boolean ALLOW_FORBIDDEN = NutchConf.get().getBoolean(http.robots.403.allow, false); + private static final int MAX_REDIRECTS = +NutchConf.get().getInt(http.redirect.max, 3); + private static final String[] AGENTS = getAgents(); private static final Hashtable CACHE = new Hashtable(); @@ -377,16 +380,30 @@ RobotRuleSet robotRules = (RobotRuleSet)CACHE.get(host); if (robotRules == null) { // cache miss - HttpResponse response = new HttpResponse(new URL(url, /robots.txt)); - - if (response.getCode() == 200) // found rules: parse them -robotRules = new RobotRulesParser().parseRules(response.getContent()); - else if ( (response.getCode() == 403) (!ALLOW_FORBIDDEN) ) -robotRules = FORBID_ALL_RULES;// use forbid all - else -robotRules = EMPTY_RULES; // use default rules + int redirects = 0; + do { +HttpResponse response = new HttpResponse(new URL(url, /robots.txt)); + +int code = response.getCode(); + +if (code == 200) {// found rules: parse them + robotRules = new RobotRulesParser().parseRules(response.getContent()); +} else if ( (code == 403) (!ALLOW_FORBIDDEN) ) { + robotRules = FORBID_ALL_RULES; // use forbid all +} else if (code = 300 code 400) { // handle redirect + if (redirects == MAX_REDIRECTS) { +robotRules = EMPTY_RULES; + } else { +url = new URL(url, response.getHeader(Location)); +LOG.fine(redirect to + url); +redirects++; + } +} else { + robotRules = EMPTY_RULES; // use default 
rules +} + } while (robotRules == null); - CACHE.put(host, robotRules);// cache rules for host + CACHE.put(host, robotRules); // cache rules for host } String path = url.getPath(); // check rules Modified: lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java?rev=357197r1=357196r2=357197view=diff == --- lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java (original) +++ lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java Fri Dec 16 09:51:05 2005 @@ -60,10 +60,14 @@ } public HttpResponse(URL url) throws IOException { +this(url, false); + } + + HttpResponse(URL url, boolean followRedirects) throws IOException { this.base = url.toString(); this.orig = url.toString(); GetMethod get = new GetMethod(this.orig); -get.setFollowRedirects(false); +get.setFollowRedirects(followRedirects); get.setRequestHeader(User-Agent, Http.AGENT_STRING); HttpMethodParams params = get.getParams(); // some servers cannot digest the new protocol Modified: lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/RobotRulesParser.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/RobotRulesParser.java?rev=357197r1=357196r2=357197view=diff == --- lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/RobotRulesParser.java (original) +++ lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/RobotRulesParser.java Fri Dec 16 09:51:05 2005 @@ -379,7 +379,8 @@ if (robotRules == null) { // cache miss LOG.fine(cache miss + url); try { -HttpResponse response = new 
HttpResponse(new URL(url, /robots.txt)); +HttpResponse response = new HttpResponse(new URL(url, /robots.txt), + true);
svn commit: r348210 - /lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/NFSDataInputStream.java
Author: cutting Date: Tue Nov 22 10:46:43 2005 New Revision: 348210 URL: http://svn.apache.org/viewcvs?rev=348210&view=rev Log: Silently ignore missing checksum files. Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/NFSDataInputStream.java Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/NFSDataInputStream.java URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/NFSDataInputStream.java?rev=348210&r1=348209&r2=348210&view=diff == --- lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/NFSDataInputStream.java (original) +++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/NFSDataInputStream.java Tue Nov 22 10:46:43 2005 @@ -54,7 +54,9 @@ if (!Arrays.equals(version, VERSION)) throw new IOException("Not a checksum file: "+sumFile); bytesPerSum = sums.readInt(); - } catch (IOException e) { + } catch (FileNotFoundException e) { // quietly ignore +stopSumming(); + } catch (IOException e) { // loudly ignore LOG.warning("Problem opening checksum file: "+e+". Ignoring."); stopSumming(); }
svn commit: r348212 - in /lucene/nutch/branches/mapred/conf: crawl-tool.xml nutch-default.xml
Author: cutting Date: Tue Nov 22 10:55:26 2005 New Revision: 348212 URL: http://svn.apache.org/viewcvs?rev=348212view=rev Log: Increase defaults for http.max.delays, since, with MapReduce's partitioning of fetchlists, delays are more likely. Modified: lucene/nutch/branches/mapred/conf/crawl-tool.xml lucene/nutch/branches/mapred/conf/nutch-default.xml Modified: lucene/nutch/branches/mapred/conf/crawl-tool.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/conf/crawl-tool.xml?rev=348212r1=348211r2=348212view=diff == --- lucene/nutch/branches/mapred/conf/crawl-tool.xml (original) +++ lucene/nutch/branches/mapred/conf/crawl-tool.xml Tue Nov 22 10:55:26 2005 @@ -33,7 +33,7 @@ property namehttp.max.delays/name - value100/value + value1000/value descriptionThe number of times a thread will delay when trying to fetch a page. When using the crawl tool there are likely to be very few different hosts, so we need to be willing to wait longer for Modified: lucene/nutch/branches/mapred/conf/nutch-default.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/conf/nutch-default.xml?rev=348212r1=348211r2=348212view=diff == --- lucene/nutch/branches/mapred/conf/nutch-default.xml (original) +++ lucene/nutch/branches/mapred/conf/nutch-default.xml Tue Nov 22 10:55:26 2005 @@ -69,7 +69,7 @@ property namehttp.max.delays/name - value3/value + value100/value descriptionThe number of times a thread will delay when trying to fetch a page. Each time it finds that a host is busy, it will wait fetcher.server.delay. After http.max.delays attepts, it will give
svn commit: r332371 - /lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Generator.java
Author: cutting Date: Thu Nov 10 13:03:16 2005 New Revision: 332371 URL: http://svn.apache.org/viewcvs?rev=332371&view=rev Log: Fix to not increment count of urls when urls are filtered by maxPerHost limit. Patch contributed by Rod Taylor. Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Generator.java Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Generator.java URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Generator.java?rev=332371&r1=332370&r2=332371&view=diff == --- lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Generator.java (original) +++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Generator.java Thu Nov 10 13:03:16 2005 @@ -76,23 +76,27 @@ OutputCollector output, Reporter reporter) throws IOException { - while (values.hasNext() && ++count < limit) { + while (values.hasNext() && count < limit) { UTF8 url = (UTF8)values.next(); -if (maxPerHost > 0) { // are we counting hosts? +if (maxPerHost > 0) { // are we counting hosts? String host = new URL(url.toString()).getHost(); - Integer count = (Integer)hostCounts.get(host); - if (count != null) { -if (count.intValue() >= maxPerHost) + Integer hostCount = (Integer)hostCounts.get(host); + if (hostCount != null) { +if (hostCount.intValue() >= maxPerHost) continue; // too many from host -hostCounts.put(host, new Integer(count.intValue()+1)); +hostCounts.put(host, new Integer(hostCount.intValue()+1)); } else {// update host count hostCounts.put(host, new Integer(1)); } } output.collect(key, url); + +// Count is incremented only when we keep the URL +// maxPerHost may cause us to skip it. +count++; } }
svn commit: r328414 - /lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/ParseOutputFormat.java
Author: cutting Date: Tue Oct 25 09:57:51 2005 New Revision: 328414 URL: http://svn.apache.org/viewcvs?rev=328414&view=rev Log: Fix a type error for JDK 1.4. Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/ParseOutputFormat.java Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/ParseOutputFormat.java URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/ParseOutputFormat.java?rev=328414&r1=328413&r2=328414&view=diff == --- lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/ParseOutputFormat.java (original) +++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/ParseOutputFormat.java Tue Oct 25 09:57:51 2005 @@ -66,7 +66,8 @@ Outlink[] links = parse.getData().getOutlinks(); // compute OPIC score contribution - float score = Float.valueOf(parse.getData().get(Fetcher.SCORE_KEY)); + float score = +Float.parseFloat(parse.getData().get(Fetcher.SCORE_KEY)); score /= links.length; for (int i = 0; i < links.length; i++) {
svn commit: r327572 - /lucene/nutch/branches/mapred/bin/slaves.sh
Author: cutting Date: Fri Oct 21 13:45:32 2005 New Revision: 327572 URL: http://svn.apache.org/viewcvs?rev=327572&view=rev Log: Tag standard error with hostname too. Modified: lucene/nutch/branches/mapred/bin/slaves.sh Modified: lucene/nutch/branches/mapred/bin/slaves.sh URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/bin/slaves.sh?rev=327572&r1=327571&r2=327572&view=diff == --- lucene/nutch/branches/mapred/bin/slaves.sh (original) +++ lucene/nutch/branches/mapred/bin/slaves.sh Fri Oct 21 13:45:32 2005 @@ -21,7 +21,7 @@ for slave in `cat $NUTCH_SLAVES`; do ssh -o ConnectTimeout=1 -o SendEnv='NUTCH* JAVA*' $slave $@ \ - | sed "s/^/$slave: /" + 2>&1 | sed "s/^/$slave: /" done wait
svn commit: r327581 - in /lucene/nutch/branches/mapred/src/plugin/parse-html/src: java/org/apache/nutch/parse/html/DOMContentUtils.java test/org/apache/nutch/parse/html/TestDOMContentUtils.java
Author: cutting Date: Fri Oct 21 14:04:54 2005 New Revision: 327581 URL: http://svn.apache.org/viewcvs?rev=327581view=rev Log: Ignore rel=nofollow links. Modified: lucene/nutch/branches/mapred/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java lucene/nutch/branches/mapred/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java Modified: lucene/nutch/branches/mapred/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java?rev=327581r1=327580r2=327581view=diff == --- lucene/nutch/branches/mapred/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java (original) +++ lucene/nutch/branches/mapred/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java Fri Oct 21 14:04:54 2005 @@ -306,13 +306,21 @@ NamedNodeMap attrs = node.getAttributes(); String target = null; + boolean noFollow = false; for (int i= 0; i attrs.getLength(); i++ ) { -if (params.attrName.equalsIgnoreCase(attrs.item(i).getNodeName())) { - target = attrs.item(i).getNodeValue(); - break; +Node attr = attrs.item(i); +String attrName = attr.getNodeName(); + +if (rel.equalsIgnoreCase(attrName) +nofollow.equalsIgnoreCase(attr.getNodeValue())) { + noFollow = true; +} + +if (params.attrName.equalsIgnoreCase(attrName)) { + target = attr.getNodeValue(); } } - if (target != null) + if (target != null !noFollow) try { URL url = new URL(base, target); outlinks.add(new Outlink(url.toString(), Modified: lucene/nutch/branches/mapred/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java?rev=327581r1=327580r2=327581view=diff == --- 
lucene/nutch/branches/mapred/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java (original) +++ lucene/nutch/branches/mapred/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java Fri Oct 21 14:04:54 2005 @@ -113,6 +113,12 @@ + h2End\tthis\rmadness\n!/h2\r\n + ... . + /body /html), + +// test that a rel=nofollow links are not returned +new String(htmlhead/headbody + + a href=\http://www.nutch.org\; rel=\nofollow\ ignore /a + + a rel=\nofollow\ href=\http://www.nutch.org\; ignore /a + + /body/html), }; private static String[] testBaseHrefs= { @@ -123,6 +129,7 @@ http://www.nutch.org/frames/;, http://www.nutch.org/maps/;, http://www.nutch.org/whitespace/;, +http://www.nutch.org//;, }; private static final DocumentFragment testDOMs[]= @@ -145,6 +152,7 @@ + one two three space here space there no space + one two two three three four put some text here and there. + End this madness ! . . . ., +ignore ignore, }; private static final String[] answerTitle= { @@ -155,6 +163,7 @@ my title, my title, my title, +, }; // note: should be in page-order @@ -214,6 +223,8 @@ { new Outlink(http://www.nutch.org/index.html;, whitespace test), }, + { + } }; } catch (MalformedURLException e) {
svn commit: r327593 - /lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/TaskRunner.java
Author: cutting Date: Fri Oct 21 15:07:00 2005 New Revision: 327593 URL: http://svn.apache.org/viewcvs?rev=327593view=rev Log: Always create workdir so child can connect to it. Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/TaskRunner.java Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/TaskRunner.java URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/TaskRunner.java?rev=327593r1=327592r2=327593view=diff == --- lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/TaskRunner.java (original) +++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/TaskRunner.java Fri Oct 21 15:07:00 2005 @@ -63,6 +63,7 @@ String sep = System.getProperty(path.separator); File workDir = new File(new File(t.getJobFile()).getParent(), work); + workDir.mkdirs(); StringBuffer classPath = new StringBuffer(); // start with same classpath as parent process @@ -72,7 +73,6 @@ JobConf job = new JobConf(t.getJobFile()); String jar = job.getJar(); if (jar != null) { // if jar exists, it into workDir -workDir.mkdirs(); runChild(new String[] { unzip, jar}, workDir); File[] libs = new File(workDir, lib).listFiles(); for (int i = 0; i libs.length; i++) {
svn commit: r326007 - /lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Indexer.java
Author: cutting Date: Mon Oct 17 18:08:07 2005 New Revision: 326007 URL: http://svn.apache.org/viewcvs?rev=326007view=rev Log: Fix bogus javadoc. Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Indexer.java Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Indexer.java URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Indexer.java?rev=326007r1=326006r2=326007view=diff == --- lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Indexer.java (original) +++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Indexer.java Mon Oct 17 18:08:07 2005 @@ -36,7 +36,7 @@ import org.apache.lucene.index.*; import org.apache.lucene.document.*; -/** Maintains an inverted link map, listing incoming links for each url. */ +/** Create indexes for segments. */ public class Indexer extends NutchConfigured implements Reducer { public static final Logger LOG =
svn commit: r320835 - in /lucene/nutch/branches/mapred/src: java/org/apache/nutch/db/ java/org/apache/nutch/fs/ java/org/apache/nutch/indexer/ java/org/apache/nutch/io/ java/org/apache/nutch/mapred/ j
Author: cutting Date: Thu Oct 13 10:59:30 2005 New Revision: 320835 URL: http://svn.apache.org/viewcvs?rev=320835view=rev Log: Store checksums for all files written and verify them on read. CRCs are stored for every 512 bytes of data, so that randomly accessed data may be verified. Errors are reported to the filesystem implementation. Local file errors cause files to be moved to a bad file directory, so that bad disk areas are not reused. NDFS file errors should cause blocks to be moved to a bad block directory on the datanode, forcing the use of replicas of the bad blocks with no loss of data. This is not yet implemented for NDFS. Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/db/DistributedWebDBWriter.java lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/FileUtil.java lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/LocalFileSystem.java lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/NDFSFileSystem.java lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/NFSDataInputStream.java lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/NFSDataOutputStream.java lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/NFSInputStream.java lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/NutchFileSystem.java lucene/nutch/branches/mapred/src/java/org/apache/nutch/indexer/NdfsDirectory.java lucene/nutch/branches/mapred/src/java/org/apache/nutch/io/SequenceFile.java lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/JobClient.java lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/LocalJobRunner.java lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/MapOutputFile.java lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/TextInputFormat.java lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/TextOutputFormat.java lucene/nutch/branches/mapred/src/java/org/apache/nutch/ndfs/DF.java 
lucene/nutch/branches/mapred/src/test/org/apache/nutch/fs/TestNutchFileSystem.java Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/db/DistributedWebDBWriter.java URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/db/DistributedWebDBWriter.java?rev=320835r1=320834r2=320835view=diff == --- lucene/nutch/branches/mapred/src/java/org/apache/nutch/db/DistributedWebDBWriter.java (original) +++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/db/DistributedWebDBWriter.java Thu Oct 13 10:59:30 2005 @@ -1656,7 +1656,7 @@ } // Bump number by 1. -DataOutputStream out = new DataOutputStream(nfs.create(openCounter, true)); +DataOutputStream out = nfs.create(openCounter); try { out.write(OPEN_COUNTER_VERSION); out.writeInt(numOpens + 1); @@ -1793,7 +1793,7 @@ // 7. Finally, write out the total num of pages and links // File sectionStats = new File(newSectionDir, STATS_FILE); -DataOutputStream out = new DataOutputStream(nfs.create(sectionStats, true)); +DataOutputStream out = nfs.create(sectionStats); try { // // These counts are guaranteed to be correct; they're @@ -1854,7 +1854,7 @@ } // Bump that number by 1. 
-out = new DataOutputStream(nfs.create(closeCounter, true)); +out = nfs.create(closeCounter); try { out.write(CLOSE_COUNTER_VERSION); out.writeInt(numCloses + 1); Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/FileUtil.java URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/FileUtil.java?rev=320835r1=320834r2=320835view=diff == --- lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/FileUtil.java (original) +++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/FileUtil.java Thu Oct 13 10:59:30 2005 @@ -54,9 +54,9 @@ } if (nfs.isFile(src)) { -DataInputStream in = new DataInputStream(nfs.open(src)); +NFSInputStream in = nfs.openRaw(src); try { -DataOutputStream out = new DataOutputStream(nfs.create(dst)); +NFSOutputStream out = nfs.createRaw(dst, true); byte buf[] = new byte[NutchConf.get().getInt(io.file.buffer.size, 4096)]; try { int readBytes = in.read(buf); Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/LocalFileSystem.java URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/LocalFileSystem.java?rev=320835r1=320834r2=320835view=diff
svn commit: r320893 - /lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/Seekable.java
Author: cutting Date: Thu Oct 13 12:42:21 2005 New Revision: 320893 URL: http://svn.apache.org/viewcvs?rev=320893view=rev Log: Add new file. Added: lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/Seekable.java Added: lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/Seekable.java URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/Seekable.java?rev=320893view=auto == --- lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/Seekable.java (added) +++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/Seekable.java Thu Oct 13 12:42:21 2005 @@ -0,0 +1,28 @@ +/** + * Copyright 2005 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the License); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.fs; + +import java.io.*; + +/* Stream which permits seeking. */ +public interface Seekable { + /** + * Seek to the given offset from the start of the file. + * The next read() will be from that location. Can't + * seek past the end of the file. + */ + void seek(long pos) throws IOException; +}
svn commit: r320899 - /lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/DeleteDuplicates.java
Author: cutting Date: Thu Oct 13 12:57:03 2005 New Revision: 320899 URL: http://svn.apache.org/viewcvs?rev=320899view=rev Log: Fix progress reporting for dedup. Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/DeleteDuplicates.java Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/DeleteDuplicates.java URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/DeleteDuplicates.java?rev=320899r1=320898r2=320899view=diff == --- lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/DeleteDuplicates.java (original) +++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/DeleteDuplicates.java Thu Oct 13 12:57:03 2005 @@ -127,7 +127,7 @@ } public static class InputFormat extends InputFormatBase { -private static final int INDEX_LENGTH = Integer.MAX_VALUE; +private static final long INDEX_LENGTH = Integer.MAX_VALUE; /** Return each index as a split. */ public FileSplit[] getSplits(NutchFileSystem fs, JobConf job,
svn commit: r320931 - /lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/NutchFileSystem.java
Author: cutting Date: Thu Oct 13 14:43:23 2005 New Revision: 320931 URL: http://svn.apache.org/viewcvs?rev=320931&view=rev Log: Fix a NullPointerException. Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/NutchFileSystem.java Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/NutchFileSystem.java URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/NutchFileSystem.java?rev=320931&r1=320930&r2=320931&view=diff == --- lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/NutchFileSystem.java (original) +++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/NutchFileSystem.java Thu Oct 13 14:43:23 2005 @@ -268,10 +268,12 @@ public File[] listFiles(File f, FileFilter filter) throws IOException { Vector results = new Vector(); File listing[] = listFilesRaw(f); -for (int i = 0; i < listing.length; i++) { +if (listing != null) { + for (int i = 0; i < listing.length; i++) { if (filter.accept(listing[i])) { -results.add(listing[i]); + results.add(listing[i]); } + } } return (File[]) results.toArray(new File[results.size()]); }
svn commit: r314958 - in /lucene/nutch/trunk/site: about.html bot.html credits.html i18n.html index.html index.pdf issue_tracking.html linkmap.html mailing_lists.html tutorial.html version_control.htm
Author: cutting Date: Wed Oct 12 09:31:33 2005 New Revision: 314958 URL: http://svn.apache.org/viewcvs?rev=314958view=rev Log: Use mirrors for downloads. Modified: lucene/nutch/trunk/site/about.html lucene/nutch/trunk/site/bot.html lucene/nutch/trunk/site/credits.html lucene/nutch/trunk/site/i18n.html lucene/nutch/trunk/site/index.html lucene/nutch/trunk/site/index.pdf lucene/nutch/trunk/site/issue_tracking.html lucene/nutch/trunk/site/linkmap.html lucene/nutch/trunk/site/mailing_lists.html lucene/nutch/trunk/site/tutorial.html lucene/nutch/trunk/site/version_control.html Modified: lucene/nutch/trunk/site/about.html URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/site/about.html?rev=314958r1=314957r2=314958view=diff == --- lucene/nutch/trunk/site/about.html (original) +++ lucene/nutch/trunk/site/about.html Wed Oct 12 09:31:33 2005 @@ -138,7 +138,7 @@ div onclick=SwitchMenu('menu_1.3', 'skin/') id=menu_1.3Title class=menutitleResources/div div id=menu_1.3 class=menuitemgroup div class=menuitem -a title= href=release/Download/a +a title= href=http://www.apache.org/dyn/closer.cgi/lucene/nutch/;Download/a /div div class=menuitem a title= href=mailing_lists.htmlMailing Lists/a Modified: lucene/nutch/trunk/site/bot.html URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/site/bot.html?rev=314958r1=314957r2=314958view=diff == --- lucene/nutch/trunk/site/bot.html (original) +++ lucene/nutch/trunk/site/bot.html Wed Oct 12 09:31:33 2005 @@ -138,7 +138,7 @@ div onclick=SwitchMenu('menu_1.3', 'skin/') id=menu_1.3Title class=menutitleResources/div div id=menu_1.3 class=menuitemgroup div class=menuitem -a title= href=release/Download/a +a title= href=http://www.apache.org/dyn/closer.cgi/lucene/nutch/;Download/a /div div class=menuitem a title= href=mailing_lists.htmlMailing Lists/a Modified: lucene/nutch/trunk/site/credits.html URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/site/credits.html?rev=314958r1=314957r2=314958view=diff == --- 
lucene/nutch/trunk/site/credits.html (original) +++ lucene/nutch/trunk/site/credits.html Wed Oct 12 09:31:33 2005 @@ -138,7 +138,7 @@ div onclick=SwitchMenu('menu_1.3', 'skin/') id=menu_1.3Title class=menutitleResources/div div id=menu_1.3 class=menuitemgroup div class=menuitem -a title= href=release/Download/a +a title= href=http://www.apache.org/dyn/closer.cgi/lucene/nutch/;Download/a /div div class=menuitem a title= href=mailing_lists.htmlMailing Lists/a Modified: lucene/nutch/trunk/site/i18n.html URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/site/i18n.html?rev=314958r1=314957r2=314958view=diff == --- lucene/nutch/trunk/site/i18n.html (original) +++ lucene/nutch/trunk/site/i18n.html Wed Oct 12 09:31:33 2005 @@ -138,7 +138,7 @@ div onclick=SwitchMenu('menu_1.3', 'skin/') id=menu_1.3Title class=menutitleResources/div div id=menu_1.3 class=menuitemgroup div class=menuitem -a title= href=release/Download/a +a title= href=http://www.apache.org/dyn/closer.cgi/lucene/nutch/;Download/a /div div class=menuitem a title= href=mailing_lists.htmlMailing Lists/a Modified: lucene/nutch/trunk/site/index.html URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/site/index.html?rev=314958r1=314957r2=314958view=diff == --- lucene/nutch/trunk/site/index.html (original) +++ lucene/nutch/trunk/site/index.html Wed Oct 12 09:31:33 2005 @@ -138,7 +138,7 @@ div onclick=SwitchMenu('menu_1.3', 'skin/') id=menu_1.3Title class=menutitleResources/div div id=menu_1.3 class=menuitemgroup div class=menuitem -a title= href=release/Download/a +a title= href=http://www.apache.org/dyn/closer.cgi/lucene/nutch/;Download/a /div div class=menuitem a title= href=mailing_lists.htmlMailing Lists/a @@ -184,9 +184,6 @@ a href=#NewsNews/a ul class=minitoc li -a href=#1+October+2005%3A+Nutch+0.7.1+Released1 October 2005: Nutch 0.7.1 Released/a -/li -li a href=#17+August+2005%3A+Nutch+0.7+Released17 August 2005: Nutch 0.7 Released/a /li li @@ -210,23 +207,17 @@ a name=N1000C/aa name=News/a h2 
class=h3News/h2 div class=section -a name=N10012/aa name=1+October+2005%3A+Nutch+0.7.1+Released/a -h3 class=h41 October 2005: Nutch 0.7.1 Released/h3 -pThe 0.7.1 release of Nutch is now available. This is a bug fix release. See - a href=http://svn.apache.org/viewcvs.cgi/lucene/nutch/branches/branch-0.7/CHANGES.txt?rev=292986; - CHANGES.txt/a for details. The release is available - a href=http://lucene.apache.org/nutch/release/;here/a./p -a name=N10024/aa name=17+August+2005%3A+Nutch+0.7+Released/a +a name=N10012/aa name=17
svn commit: r314991 - /lucene/nutch/nightly/nightly.sh
Author: cutting Date: Wed Oct 12 11:33:47 2005 New Revision: 314991 URL: http://svn.apache.org/viewcvs?rev=314991view=rev Log: Put nightly releases on cvs.apache.org, not www, per Apache policy. Modified: lucene/nutch/nightly/nightly.sh Modified: lucene/nutch/nightly/nightly.sh URL: http://svn.apache.org/viewcvs/lucene/nutch/nightly/nightly.sh?rev=314991r1=314990r2=314991view=diff == --- lucene/nutch/nightly/nightly.sh (original) +++ lucene/nutch/nightly/nightly.sh Wed Oct 12 11:33:47 2005 @@ -5,7 +5,7 @@ TRUNK=http://svn.apache.org/repos/asf/lucene/nutch/trunk REL_SERVER=people.apache.org -REL_DIR=/www/www.apache.org/dist/lucene/nutch/nightly +REL_DIR=/www/cvs.apache.org/dist/lucene/nutch/nightly # create an empty build directory rm -rf /tmp/nutch-nightly
svn commit: r312693 - /lucene/nutch/branches/mapred/src/java/org/apache/nutch/io/MapFile.java
Author: cutting Date: Mon Oct 10 10:40:21 2005 New Revision: 312693 URL: http://svn.apache.org/viewcvs?rev=312693&view=rev Log: Fix to permit non one-to-one mappings in index. Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/io/MapFile.java Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/io/MapFile.java URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/io/MapFile.java?rev=312693&r1=312692&r2=312693&view=diff == --- lucene/nutch/branches/mapred/src/java/org/apache/nutch/io/MapFile.java (original) +++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/io/MapFile.java Mon Oct 10 10:40:21 2005 @@ -234,7 +234,7 @@ break; // check order to make sure comparator is compatible - if (lastKey != null && comparator.compare(lastKey, k) >= 0) + if (lastKey != null && comparator.compare(lastKey, k) > 0) throw new IOException("key out of order: "+k+" after "+lastKey); lastKey = k;
svn commit: r307445 - /lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/NDFSFileSystem.java
Author: cutting Date: Sun Oct 9 08:15:34 2005 New Revision: 307445 URL: http://svn.apache.org/viewcvs?rev=307445view=rev Log: Overwrite should be default now. Use super's implementation. Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/NDFSFileSystem.java Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/NDFSFileSystem.java URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/NDFSFileSystem.java?rev=307445r1=307444r2=307445view=diff == --- lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/NDFSFileSystem.java (original) +++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/NDFSFileSystem.java Sun Oct 9 08:15:34 2005 @@ -64,14 +64,6 @@ return ndfs.open(getPath(f)); } -/** - * Create the file at f. - */ -public NFSOutputStream create(File f) throws IOException { -return create(f, false); -} -/** - */ public NFSOutputStream create(File f, boolean overwrite) throws IOException { return ndfs.create(getPath(f), overwrite); }
svn commit: r307203 - in /lucene/nutch/branches/mapred: bin/nutch src/java/org/apache/nutch/crawl/Crawl.java src/java/org/apache/nutch/crawl/DeleteDuplicates.java src/java/org/apache/nutch/indexer/NdfsDirectory.java
Author: cutting Date: Fri Oct 7 15:16:27 2005 New Revision: 307203 URL: http://svn.apache.org/viewcvs?rev=307203view=rev Log: First working version of MapReduce-based dedup. Added: lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/DeleteDuplicates.java Modified: lucene/nutch/branches/mapred/bin/nutch lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Crawl.java lucene/nutch/branches/mapred/src/java/org/apache/nutch/indexer/NdfsDirectory.java Modified: lucene/nutch/branches/mapred/bin/nutch URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/bin/nutch?rev=307203r1=307202r2=307203view=diff == --- lucene/nutch/branches/mapred/bin/nutch (original) +++ lucene/nutch/branches/mapred/bin/nutch Fri Oct 7 15:16:27 2005 @@ -138,6 +138,8 @@ CLASS=org.apache.nutch.crawl.LinkDb elif [ $COMMAND = index ] ; then CLASS=org.apache.nutch.crawl.Indexer +elif [ $COMMAND = dedup ] ; then + CLASS=org.apache.nutch.crawl.DeleteDuplicates elif [ $COMMAND = merge ] ; then CLASS=org.apache.nutch.indexer.IndexMerger elif [ $COMMAND = server ] ; then Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Crawl.java URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Crawl.java?rev=307203r1=307202r2=307203view=diff == --- lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Crawl.java (original) +++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Crawl.java Fri Oct 7 15:16:27 2005 @@ -109,8 +109,9 @@ new LinkDb(conf).invert(linkDb, segments); // invert links -// index +// index dedup new Indexer(conf).index(index, linkDb, fs.listFiles(segments)); +new DeleteDuplicates(conf).dedup(new File[] { index }); LOG.info(crawl finished: + dir); } Added: lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/DeleteDuplicates.java URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/DeleteDuplicates.java?rev=307203view=auto == --- 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/DeleteDuplicates.java (added) +++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/DeleteDuplicates.java Fri Oct 7 15:16:27 2005 @@ -0,0 +1,338 @@ +/** + * Copyright 2005 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the License); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.crawl; + +import java.io.*; +import java.security.*; +import java.text.*; +import java.util.*; +import java.util.logging.*; + +import org.apache.nutch.io.*; +import org.apache.nutch.fs.*; +import org.apache.nutch.util.*; +import org.apache.nutch.mapred.*; +import org.apache.nutch.indexer.*; + +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.document.Document; + +/** + * Deletes duplicate documents in a set of Lucene indexes. + * Duplicates have either the same contents (via MD5 hash) or the same URL. + **/ +public class DeleteDuplicates extends NutchConfigured + implements Mapper, OutputFormat { + private static final Logger LOG = +LogFormatter.getLogger(org.apache.nutch.crawl.DeleteDuplicates); + +// Algorithm: +// +// 1. map indexes - md5, score, urlLen, index,doc +// partition by md5 +// reduce, deleting all but largest score w/ shortest url +// +// 2. map indexes - url, fetchdate, index,doc +// partition by url +// reduce, deleting all but most recent. +// +// Part 2 is not yet implemented, but the Indexer currently only indexes one +// URL per page, so this is not a critical problem. 
+ + public static class IndexDoc implements WritableComparable { +private UTF8 index; // the segment index +private int doc; // within the index + +public void write(DataOutput out) throws IOException { + index.write(out); + out.writeInt(doc); +} + +public void readFields(DataInput in) throws IOException { + if (index == null) { +index = new UTF8(); + } + index.readFields(in); + this.doc = in.readInt
svn commit: r306808 - /lucene/nutch/trunk/conf/parse-plugins.xml
Author: cutting Date: Thu Oct 6 10:02:03 2005 New Revision: 306808 URL: http://svn.apache.org/viewcvs?rev=306808&view=rev Log: Add parse-ext content-types so that unit tests pass. Modified: lucene/nutch/trunk/conf/parse-plugins.xml Modified: lucene/nutch/trunk/conf/parse-plugins.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/conf/parse-plugins.xml?rev=306808&r1=306807&r2=306808&view=diff == --- lucene/nutch/trunk/conf/parse-plugins.xml (original) +++ lucene/nutch/trunk/conf/parse-plugins.xml Thu Oct 6 10:02:03 2005 @@ -204,4 +204,14 @@ <plugin id="parse-text" /> </mimeType> + <!-- Types for parse-ext plugin: required for unit tests to pass. --> + + <mimeType name="application/vnd.nutch.example.cat"> + <plugin id="parse-ext" /> + </mimeType> + + <mimeType name="application/vnd.nutch.example.md5sum"> + <plugin id="parse-ext" /> + </mimeType> + </parse-plugins>
svn commit: r306812 - /lucene/nutch/nightly/nightly.properties
Author: cutting Date: Thu Oct 6 10:18:01 2005 New Revision: 306812 URL: http://svn.apache.org/viewcvs?rev=306812&view=rev Log: Update mailhost, since I moved and have a different ISP at home. Modified: lucene/nutch/nightly/nightly.properties Modified: lucene/nutch/nightly/nightly.properties URL: http://svn.apache.org/viewcvs/lucene/nutch/nightly/nightly.properties?rev=306812&r1=306811&r2=306812&view=diff == --- lucene/nutch/nightly/nightly.properties (original) +++ lucene/nutch/nightly/nightly.properties Thu Oct 6 10:18:01 2005 @@ -1,4 +1,4 @@ -MailLogger.mailhost = smtp.comcast.net +MailLogger.mailhost = smtp.sbcglobal.net MailLogger.from = nutch-dev@incubator.apache.org MailLogger.failure.to = nutch-dev@incubator.apache.org MailLogger.failure.subject = Nutch nightly build failure
svn commit: r306813 - /lucene/nutch/nightly/nightly.sh
Author: cutting Date: Thu Oct 6 10:18:46 2005 New Revision: 306813 URL: http://svn.apache.org/viewcvs?rev=306813&view=rev Log: Use /tmp/nutch-nightly instead of /tmp/nutch to avoid conflicts with mapred. Modified: lucene/nutch/nightly/nightly.sh Modified: lucene/nutch/nightly/nightly.sh URL: http://svn.apache.org/viewcvs/lucene/nutch/nightly/nightly.sh?rev=306813&r1=306812&r2=306813&view=diff == --- lucene/nutch/nightly/nightly.sh (original) +++ lucene/nutch/nightly/nightly.sh Thu Oct 6 10:18:46 2005 @@ -8,14 +8,14 @@ REL_DIR=/www/lucene.apache.org/nutch/release/nightly # create an empty build directory -rm -rf /tmp/nutch +rm -rf /tmp/nutch-nightly cd /tmp # export sources into it -svn export $TRUNK nutch +svn export $TRUNK nutch-nightly # run build -cd nutch +cd nutch-nightly $HOME/local/ant/bin/ant \ -propertyfile $HOME/src/nutch/nightly/nightly.properties \ -logger org.apache.tools.ant.listener.MailLogger \
svn commit: r294928 - in /lucene/nutch/branches/mapred: site/tutorial.html site/tutorial.pdf src/site/src/documentation/content/xdocs/tutorial.xml
Author: cutting Date: Tue Oct 4 14:58:53 2005 New Revision: 294928 URL: http://svn.apache.org/viewcvs?rev=294928view=rev Log: Update tutorial for mapred changes. Still does not describe mapred or NDFS configuration. Modified: lucene/nutch/branches/mapred/site/tutorial.html lucene/nutch/branches/mapred/site/tutorial.pdf lucene/nutch/branches/mapred/src/site/src/documentation/content/xdocs/tutorial.xml Modified: lucene/nutch/branches/mapred/site/tutorial.html URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/site/tutorial.html?rev=294928r1=294927r2=294928view=diff == --- lucene/nutch/branches/mapred/site/tutorial.html (original) +++ lucene/nutch/branches/mapred/site/tutorial.html Tue Oct 4 14:58:53 2005 @@ -276,11 +276,11 @@ ol -liCreate a flat file of root urls. For example, to crawl the -span class=codefragnutch/span site you might start with a file named -span class=codefragurls/span containing just the Nutch home page. All other -Nutch pages should be reachable from this page. The span class=codefragurls/span -file would thus look like: +liCreate a directory with a flat file of root urls. For example, to +crawl the span class=codefragnutch/span site you might start with a file named +span class=codefragurls/nutch/span containing the url of just the Nutch home +page. All other Nutch pages should be reachable from this page. 
The +span class=codefragurls/nutch/span file would thus contain: pre class=code http://lucene.apache.org/nutch/ /pre @@ -310,138 +310,152 @@ span class=codefrag-dir/span emdir/em names the directory to put the crawl in./li li -span class=codefrag-depth/span emdepth/em indicates the link depth from the root -page that should be crawled./li +span class=codefrag-threads/span emthreads/em determines the number of +threads that will fetch in parallel./li li -span class=codefrag-delay/span emdelay/em determines the number of seconds -between accesses to each host./li +span class=codefrag-depth/span emdepth/em indicates the link depth from the root +page that should be crawled./li li -span class=codefrag-threads/span emthreads/em determines the number of -threads that will fetch in parallel./li +span class=codefrag-topN/span emN/em determines the maximum number of pages that +will be retrieved at each level up to the depth./li /ul pFor example, a typical call might be:/p pre class=code -bin/nutch crawl urls -dir crawl.test -depth 3 gt;amp; crawl.log +bin/nutch crawl urls -dir crawl -depth 3 -topN 50 /pre -pTypically one starts testing one's configuration by crawling at low -depths, and watching the output to check that desired pages are found. -Once one is more confident of the configuration, then an appropriate -depth for a full crawl is around 10./p +pTypically one starts testing one's configuration by crawling at +shallow depths, sharply limiting the number of pages fetched at each +level (span class=codefrag-topN/span), and watching the output to check that +desired pages are fetched and undesirable pages are not. Once one is +confident of the configuration, then an appropriate depth for a full +crawl is around 10. 
The number of pages per level +(span class=codefrag-topN/span) for a full crawl can be from tens of thousands to +millions, depending on your resources./p pOnce crawling has completed, one can skip to the Searching section below./p /div -a name=N100E4/aa name=Whole-web+Crawling/a +a name=N100EA/aa name=Whole-web+Crawling/a h2 class=h3Whole-web Crawling/h2 div class=section pWhole-web crawling is designed to handle very large crawls which may take weeks to complete, running on multiple machines./p -a name=N100ED/aa name=Whole-web%3A+Concepts/a +a name=N100F3/aa name=Whole-web%3A+Concepts/a h3 class=h4Whole-web: Concepts/h3 -pNutch data is of two types:/p +pNutch data is composed of:/p ol + -liThe web database. This contains information about every -page known to Nutch, and about links between those pages./li +liThe crawl database, or emcrawldb/em. This contains +information about every url known to Nutch, including whether it was +fetched, and, if so, when./li + -liA set of segments. Each segment is a set of pages that are -fetched and indexed as a unit. Segment data consists of the -following types:/li +liThe link database, or emlinkdb/em. This contains the list +of known links to each url, including both the source url and anchor +text of the link./li + + +liA set of emsegments/em. Each segment is a set of urls that are +fetched as a unit. Segments are directories with the following +subdirectories:/li + li ul -lia emfetchlist/em is a file -that names a set of pages to be fetched/li +lia emcrawl_generate/em names a set of urls to be fetched/li + +lia emcrawl_fetch/em contains the status of fetching each url/li + +lia emcontent/em contains the content of each url/li
svn commit: r293404 - /lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/TaskTracker.java
Author: cutting Date: Mon Oct 3 10:33:32 2005 New Revision: 293404 URL: http://svn.apache.org/viewcvs?rev=293404&view=rev Log: Remove redundant call to done(), observed by Stefan. Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/TaskTracker.java Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/TaskTracker.java URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/TaskTracker.java?rev=293404&r1=293403&r2=293404&view=diff == --- lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/TaskTracker.java (original) +++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/TaskTracker.java Mon Oct 3 10:33:32 2005 @@ -578,7 +578,6 @@ try { task.run(job, umbilical); // run the task - umbilical.done(taskid); } catch (Throwable throwable) { LOG.log(Level.WARNING, "Failed to spawn child", throwable); // Report back any failures, for diagnostic purposes
svn commit: r292509 - /lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java
Author: cutting Date: Thu Sep 29 11:57:35 2005 New Revision: 292509 URL: http://svn.apache.org/viewcvs?rev=292509view=rev Log: Use a more reasonable value when timing out hung fetcher threads. Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java?rev=292509r1=292508r2=292509view=diff == --- lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java (original) +++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java Thu Sep 29 11:57:35 2005 @@ -290,7 +290,8 @@ // some threads seem to hang, despite all intentions if (done) { // last entry read long doneTime = System.currentTimeMillis(); -long timeout = getConf().getLong(http.timeout, 1) * 10; +long timeout = // select timeout that avoids a task timeout + NutchConf.get().getInt(mapred.task.timeout, 10*60*1000)/2; while (activeThreads 0 System.currentTimeMillis()-doneTime timeout) { try {
svn commit: r292532 - /lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/MRConstants.java
Author: cutting Date: Thu Sep 29 13:30:11 2005 New Revision: 292532 URL: http://svn.apache.org/viewcvs?rev=292532&view=rev Log: Increase timeout, as launching large jobs can sometimes cause the jobtracker to not see heartbeats for a bit. Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/MRConstants.java Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/MRConstants.java URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/MRConstants.java?rev=292532&r1=292531&r2=292532&view=diff == --- lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/MRConstants.java (original) +++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/MRConstants.java Thu Sep 29 13:30:11 2005 @@ -25,7 +25,7 @@ // Timeouts, constants // public static final long HEARTBEAT_INTERVAL = 3 * 1000; -public static final long TASKTRACKER_EXPIRY_INTERVAL = 30 * 1000; +public static final long TASKTRACKER_EXPIRY_INTERVAL = 10 * 60 * 1000; // // Result codes
svn commit: r292539 - in /lucene/nutch/branches/mapred/src/java/org/apache/nutch: fs/LocalFileSystem.java fs/NutchFileSystem.java ndfs/NDFSClient.java
Author: cutting Date: Thu Sep 29 13:43:53 2005 New Revision: 292539 URL: http://svn.apache.org/viewcvs?rev=292539view=rev Log: Change so that default is to overwrite existing files, as this is normal under MapReduce, when tasks may be re-executed. Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/LocalFileSystem.java lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/NutchFileSystem.java lucene/nutch/branches/mapred/src/java/org/apache/nutch/ndfs/NDFSClient.java Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/LocalFileSystem.java URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/LocalFileSystem.java?rev=292539r1=292538r2=292539view=diff == --- lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/LocalFileSystem.java (original) +++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/LocalFileSystem.java Thu Sep 29 13:43:53 2005 @@ -95,13 +95,6 @@ return new LocalNFSFileInputStream(f); } -/** - * Create the file at f. - */ -public NFSOutputStream create(File f) throws IOException { -return create(f, false); -} - /* * For create()'s NFSOutputStream. */ @@ -128,8 +121,6 @@ public void write(int b) throws IOException { fos.write(b); } } -/** - */ public NFSOutputStream create(File f, boolean overwrite) throws IOException { if (f.exists() ! 
overwrite) { throw new IOException(File already exists:+f); Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/NutchFileSystem.java URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/NutchFileSystem.java?rev=292539r1=292538r2=292539view=diff == --- lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/NutchFileSystem.java (original) +++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/NutchFileSystem.java Thu Sep 29 13:43:53 2005 @@ -122,10 +122,18 @@ public abstract NFSInputStream open(File f) throws IOException; /** - * Opens an OutputStream at the indicated File, whether local - * or via NDFS. + * Opens an OutputStream at the indicated File. + * Files are overwritten by default. + */ +public NFSOutputStream create(File f) throws IOException { +return create(f, true); +} + +/** Opens an OutputStream at the indicated File. + * @param f the file name to open + * @param overwrite if a file with this name already exists, then if true, + * the file will be overwritten, and if false an error will be thrown. */ -public abstract NFSOutputStream create(File f) throws IOException; public abstract NFSOutputStream create(File f, boolean overwrite) throws IOException; /** Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/ndfs/NDFSClient.java URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/ndfs/NDFSClient.java?rev=292539r1=292538r2=292539view=diff == --- lucene/nutch/branches/mapred/src/java/org/apache/nutch/ndfs/NDFSClient.java (original) +++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/ndfs/NDFSClient.java Thu Sep 29 13:43:53 2005 @@ -71,14 +71,6 @@ return new NDFSInputStream(src.toString()); } -/** - * Create an output stream that writes to all the right places. - * Basically creates instance of inner subclass of OutputStream - * that handles datanode/namenode negotiation. 
- */ -public NFSOutputStream create(UTF8 src) throws IOException { -return create(src, false); -} public NFSOutputStream create(UTF8 src, boolean overwrite) throws IOException { return new NDFSOutputStream(src, overwrite); }
svn commit: r292556 - /lucene/nutch/branches/mapred/conf/nutch-default.xml
Author: cutting Date: Thu Sep 29 14:27:49 2005 New Revision: 292556 URL: http://svn.apache.org/viewcvs?rev=292556view=rev Log: Document mapred.tasktracker.tasks.maximum and provide a default. Modified: lucene/nutch/branches/mapred/conf/nutch-default.xml Modified: lucene/nutch/branches/mapred/conf/nutch-default.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/conf/nutch-default.xml?rev=292556r1=292555r2=292556view=diff == --- lucene/nutch/branches/mapred/conf/nutch-default.xml (original) +++ lucene/nutch/branches/mapred/conf/nutch-default.xml Thu Sep 29 14:27:49 2005 @@ -419,13 +419,20 @@ /description /property - property namemapred.task.timeout/name value60/value descriptionThe number of milliseconds before a task will be terminated if it neither reads an input, writes an output, nor updates its status string. + /description +/property + +property + namemapred.tasktracker.tasks.maximum/name + value2/value + descriptionThe maximum number of tasks that will be run + simultaneously by a task tracker. /description /property
svn commit: r290602 - /lucene/nutch/branches/mapred/src/java/org/apache/nutch/ndfs/DF.java
Author: cutting Date: Tue Sep 20 19:38:56 2005 New Revision: 290602 URL: http://svn.apache.org/viewcvs?rev=290602&view=rev Log: Fix NUTCH-93: long filesystem names can wrap to a new line and were not parsed correctly. Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/ndfs/DF.java Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/ndfs/DF.java URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/ndfs/DF.java?rev=290602&r1=290601&r2=290602&view=diff == --- lucene/nutch/branches/mapred/src/java/org/apache/nutch/ndfs/DF.java (original) +++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/ndfs/DF.java Tue Sep 20 19:38:56 2005 @@ -48,6 +48,9 @@ new StringTokenizer(lines.readLine(), " \t\n\r\f%"); this.filesystem = tokens.nextToken(); +if (!tokens.hasMoreTokens()) {// for long filesystem name + tokens = new StringTokenizer(lines.readLine(), " \t\n\r\f%"); +} this.capacity = Long.parseLong(tokens.nextToken()) * 1024; this.used = Long.parseLong(tokens.nextToken()) * 1024; this.available = Long.parseLong(tokens.nextToken()) * 1024;
svn commit: r290067 - in /lucene/nutch/branches/mapred/src/java/org/apache/nutch: mapred/InputFormatBase.java util/NutchConf.java
Author: cutting Date: Sun Sep 18 23:08:19 2005 New Revision: 290067 URL: http://svn.apache.org/viewcvs?rev=290067view=rev Log: Improved error string javadoc. Contributed by Paul Baclace. Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/InputFormatBase.java lucene/nutch/branches/mapred/src/java/org/apache/nutch/util/NutchConf.java Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/InputFormatBase.java URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/InputFormatBase.java?rev=290067r1=290066r2=290067view=diff == --- lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/InputFormatBase.java (original) +++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/InputFormatBase.java Sun Sep 18 23:08:19 2005 @@ -46,8 +46,17 @@ Reporter reporter) throws IOException; - /** Subclasses may override to, e.g., select only files matching a regular - * expression.*/ + /** List input directories. + * Subclasses may override to, e.g., select only files matching a regular + * expression. + * Property mapred.input.subdir, if set, names a subdirectory that + * is appended to all input dirs specified by job, and if the given fs + * lists those too, each is added to the returned array of File. + * @param fs + * @param job + * @return array of File objects, never zero length. + * @throws IOException if zero items. 
+ */ protected File[] listFiles(NutchFileSystem fs, JobConf job) throws IOException { File[] dirs = job.getInputDirs(); @@ -73,7 +82,7 @@ } if (result.size() == 0) { - throw new IOException(No input files in: +job.getInputDirs()); + throw new IOException(No input directories specified in: +job); } return (File[])result.toArray(new File[result.size()]); } Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/util/NutchConf.java URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/util/NutchConf.java?rev=290067r1=290066r2=290067view=diff == --- lucene/nutch/branches/mapred/src/java/org/apache/nutch/util/NutchConf.java (original) +++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/util/NutchConf.java Sun Sep 18 23:08:19 2005 @@ -30,14 +30,17 @@ import javax.xml.transform.stream.StreamResult; /** Provides access to Nutch configuration parameters. - * + * pAn ordered list of configuration parameter files with + * default and always-overrides site parameters. * pDefault values for all parameters are specified in a file named * ttnutch-default.xml/tt located on the classpath. Overrides for these * defaults should be in an optional file named ttnutch-site.xml/tt, also * located on the classpath. Typically these files reside in the * ttconf//tt subdirectory at the top-level of a Nutch installation. + * pThe resource files are read upon first access of values (set, get, + * or write) after [EMAIL PROTECTED] #addConfResource(String)} or + * [EMAIL PROTECTED] #addConfResource(File)}. */ - public class NutchConf { private static final Logger LOG = LogFormatter.getLogger(org.apache.nutch.util.NutchConf); @@ -57,7 +60,7 @@ resourceNames.add(nutch-site.xml); } - /** A new configuration with the same settings as another. */ + /** A new configuration with the same settings cloned from another. 
*/ public NutchConf(NutchConf other) { this.resourceNames = (ArrayList)other.resourceNames.clone(); if (other.properties != null) @@ -392,6 +395,25 @@ } catch (Exception e) { throw new RuntimeException(e); } + } + + + public String toString() { +StringBuffer sb = new StringBuffer(resourceNames.size()*30); +sb.append(NutchConf: ); +ListIterator i = resourceNames.listIterator(); +while (i.hasNext()) { + if (i.nextIndex() != 0) { +sb.append( , ); + } + Object obj = i.next(); + if (obj instanceof File) { +sb.append((File)obj); + } else { +sb.append((String)obj); + } +} +return sb.toString(); } /** For debugging. List non-default properties to the terminal and exit. */
svn commit: r289281 - in /lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred: LocalJobRunner.java MapTask.java ReduceTask.java Task.java
Author: cutting Date: Thu Sep 15 10:12:36 2005 New Revision: 289281 URL: http://svn.apache.org/viewcvs?rev=289281view=rev Log: Improve status reports: Always send final status when done; Have LocalJobRunner log status. Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/LocalJobRunner.java lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/MapTask.java lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/ReduceTask.java lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/Task.java Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/LocalJobRunner.java URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/LocalJobRunner.java?rev=289281r1=289280r2=289281view=diff == --- lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/LocalJobRunner.java (original) +++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/LocalJobRunner.java Thu Sep 15 10:12:36 2005 @@ -18,12 +18,16 @@ import java.io.*; import java.util.*; +import java.util.logging.*; import org.apache.nutch.io.*; import org.apache.nutch.fs.*; +import org.apache.nutch.util.*; /** Implements MapReduce locally, in-process, for debugging. 
*/ public class LocalJobRunner implements JobSubmissionProtocol { + public static final Logger LOG = +LogFormatter.getLogger(org.apache.nutch.mapred.LocalJobRunner); private NutchFileSystem fs; private HashMap jobs = new HashMap(); @@ -102,6 +106,7 @@ public Task getTask(String taskid) { return null; } public void progress(String taskId, float progress, String state) { + LOG.info(state); float taskIndex = mapIds.indexOf(taskId); if (taskIndex = 0) { // mapping float numTasks = mapIds.size(); Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/MapTask.java URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/MapTask.java?rev=289281r1=289280r2=289281view=diff == --- lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/MapTask.java (original) +++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/MapTask.java Thu Sep 15 10:12:36 2005 @@ -129,7 +129,7 @@ } } } -umbilical.done(getTaskId()); +done(umbilical); } } Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/ReduceTask.java URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/ReduceTask.java?rev=289281r1=289280r2=289281view=diff == --- lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/ReduceTask.java (original) +++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/ReduceTask.java Thu Sep 15 10:12:36 2005 @@ -270,7 +270,7 @@ out.close(reporter); } -umbilical.done(getTaskId()); +done(umbilical); } /** Construct output file names so that, when an output directory listing is Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/Task.java URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/Task.java?rev=289281r1=289280r2=289281view=diff == --- lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/Task.java (original) +++ 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/Task.java Thu Sep 15 10:12:36 2005 @@ -114,4 +114,10 @@ } } + public void done(TaskUmbilicalProtocol umbilical) +throws IOException { +umbilical.progress(getTaskId(), // send a final status report + taskProgress.get(), taskProgress.toString()); +umbilical.done(getTaskId()); + } }
svn commit: r289282 - /lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java
Author: cutting Date: Thu Sep 15 10:15:16 2005 New Revision: 289282 URL: http://svn.apache.org/viewcvs?rev=289282view=rev Log: Finish even when some threads hung. Improve status reports. Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java?rev=289282r1=289281r2=289282view=diff == --- lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java (original) +++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java Thu Sep 15 10:15:16 2005 @@ -59,6 +59,7 @@ private String segmentName; private int activeThreads; private int maxRedirect; + private boolean done; private long start = System.currentTimeMillis(); // start time of fetcher run @@ -70,6 +71,10 @@ private boolean parsing; private class FetcherThread extends Thread { +public FetcherThread() { + this.setDaemon(true); // don't hang JVM on exit +} + public void run() { synchronized (Fetcher.this) {activeThreads++;} // count threads @@ -82,8 +87,10 @@ break;// exit try { // get next entry from input -if (!input.next(key, datum)) +if (!input.next(key, datum)) { + done = true; break; // at eof, exit +} } catch (IOException e) { LOG.severe(fetcher caught:+e.toString()); break; @@ -125,8 +132,9 @@ } break; - case ProtocolStatus.RETRY: // retry case ProtocolStatus.EXCEPTION: +logError(url, status.getMessage()); + case ProtocolStatus.RETRY: // retry output(key, datum, null, CrawlDatum.STATUS_FETCH_RETRY); break; @@ -152,7 +160,7 @@ } catch (Throwable t) { // unexpected exception -logError(url, t); +logError(url, t.toString()); output(key, datum, null, CrawlDatum.STATUS_FETCH_GONE); } @@ -165,9 +173,8 @@ } } -private void logError(String url, Throwable t) { - LOG.info(fetch of + url + failed with: + t); - LOG.log(Level.FINE, stack, t);// stack trace +private void 
logError(String url, String message) { + LOG.info(fetch of + url + failed with: + message); synchronized (Fetcher.this) { // record failure errors++; } @@ -225,19 +232,14 @@ private synchronized void updateStatus(int bytesInPage) throws IOException { pages++; bytes += bytesInPage; + } -if ((pages % 100) == 0) { // show status every 100pp - long elapsed = (System.currentTimeMillis() - start)/1000; - String line1 = -pages+ pages, +errors+ errors, +bytes+ bytes, +elapsed+ secs; - String line2 = -+ ((float)pages)/elapsed+ pages/s, -+ float)bytes)*8)/1024)/elapsed+ kb/s, -+ ((float)bytes)/pages+ bytes/page; - LOG.info( status: +line1); - LOG.info( status: +line2); - reporter.setStatus(line2); -} + private synchronized void reportStatus() throws IOException { +long elapsed = (System.currentTimeMillis() - start)/1000; +reporter.setStatus + (pages+ pages, +errors+ errors, + + Math.round(((float)pages*10)/elapsed)/10.0+ pages/s, + + Math.round(float)bytes)*8)/1024)/elapsed)+ kb/s, ); } public void configure(JobConf job) { @@ -266,7 +268,7 @@ this.input = input; this.output = output; this.reporter = reporter; - + this.maxRedirect = getConf().getInt(http.redirect.max, 3); int threadCount = getConf().getInt(fetcher.threads.fetch, 10); @@ -278,6 +280,24 @@ try { Thread.sleep(1000); } catch (InterruptedException e) {} + + reportStatus(); + + // some threads seem to hang, despite all intentions + if (done) { // last entry read +long doneTime = System.currentTimeMillis(); +long timeout = getConf().getLong(http.timeout, 1) * 10; +while (activeThreads 0 +System.currentTimeMillis()-doneTime timeout) { + try { +Thread.sleep(1000); // wait for completion + } catch (InterruptedException e) {} +} +if (activeThreads 0) { // abort after timeout + LOG.warning(Aborting with +activeThreads+ hung threads.); + return
svn commit: r289286 - /lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java
Author: cutting Date: Thu Sep 15 11:11:39 2005 New Revision: 289286 URL: http://svn.apache.org/viewcvs?rev=289286view=rev Log: Don't synchronize while making setStatus() RPC. Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java?rev=289286r1=289285r2=289286view=diff == --- lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java (original) +++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java Thu Sep 15 11:11:39 2005 @@ -234,12 +234,16 @@ bytes += bytesInPage; } - private synchronized void reportStatus() throws IOException { -long elapsed = (System.currentTimeMillis() - start)/1000; -reporter.setStatus - (pages+ pages, +errors+ errors, - + Math.round(((float)pages*10)/elapsed)/10.0+ pages/s, - + Math.round(float)bytes)*8)/1024)/elapsed)+ kb/s, ); + private void reportStatus() throws IOException { +String status; +synchronized (this) { + long elapsed = (System.currentTimeMillis() - start)/1000; + status = +pages+ pages, +errors+ errors, ++ Math.round(((float)pages*10)/elapsed)/10.0+ pages/s, ++ Math.round(float)bytes)*8)/1024)/elapsed)+ kb/s, ; +} +reporter.setStatus(status); } public void configure(JobConf job) {
svn commit: r280911 - in /lucene/nutch/branches/mapred/bin: nutch-daemons.sh slaves.sh
Author: cutting Date: Wed Sep 14 12:04:07 2005 New Revision: 280911 URL: http://svn.apache.org/viewcvs?rev=280911view=rev Log: Change scripts to pass environment, so that shared home directory is not required. Modified: lucene/nutch/branches/mapred/bin/nutch-daemons.sh lucene/nutch/branches/mapred/bin/slaves.sh Modified: lucene/nutch/branches/mapred/bin/nutch-daemons.sh URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/bin/nutch-daemons.sh?rev=280911r1=280910r2=280911view=diff == --- lucene/nutch/branches/mapred/bin/nutch-daemons.sh (original) +++ lucene/nutch/branches/mapred/bin/nutch-daemons.sh Wed Sep 14 12:04:07 2005 @@ -13,4 +13,4 @@ bin=`dirname $0` bin=`cd $bin; pwd` -exec $bin/slaves.sh /bin/bash --login $bin/nutch-daemon.sh $@ +exec $bin/slaves.sh $bin/nutch-daemon.sh $@ Modified: lucene/nutch/branches/mapred/bin/slaves.sh URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/bin/slaves.sh?rev=280911r1=280910r2=280911view=diff == --- lucene/nutch/branches/mapred/bin/slaves.sh (original) +++ lucene/nutch/branches/mapred/bin/slaves.sh Wed Sep 14 12:04:07 2005 @@ -21,5 +21,5 @@ for slave in `cat $NUTCH_SLAVES`; do echo $slave: - ssh -o ConnectTimeout=1 $slave $@ + ssh -o ConnectTimeout=1 -o SendEnv='NUTCH* JAVA*' $slave $@ done
svn commit: r280912 - /lucene/nutch/branches/mapred/bin/stop-all.sh
Author: cutting Date: Wed Sep 14 12:04:41 2005 New Revision: 280912 URL: http://svn.apache.org/viewcvs?rev=280912view=rev Log: Stop jobtracker first, to stop tasks faster. Modified: lucene/nutch/branches/mapred/bin/stop-all.sh Modified: lucene/nutch/branches/mapred/bin/stop-all.sh URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/bin/stop-all.sh?rev=280912r1=280911r2=280912view=diff == --- lucene/nutch/branches/mapred/bin/stop-all.sh (original) +++ lucene/nutch/branches/mapred/bin/stop-all.sh Wed Sep 14 12:04:41 2005 @@ -5,7 +5,7 @@ bin=`dirname $0` bin=`cd $bin; pwd` -$bin/nutch-daemons.sh stop tasktracker $bin/nutch-daemon.sh stop jobtracker +$bin/nutch-daemons.sh stop tasktracker $bin/nutch-daemon.sh stop namenode $bin/nutch-daemons.sh stop datanode
svn commit: r280913 - /lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/ReduceTaskRunner.java
Author: cutting Date: Wed Sep 14 12:05:08 2005 New Revision: 280913 URL: http://svn.apache.org/viewcvs?rev=280913&view=rev Log: Log the stack trace, so we can debug this one better. Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/ReduceTaskRunner.java Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/ReduceTaskRunner.java URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/ReduceTaskRunner.java?rev=280913&r1=280912&r2=280913&view=diff == --- lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/ReduceTaskRunner.java (original) +++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/ReduceTaskRunner.java Wed Sep 14 12:05:08 2005 @@ -99,8 +99,9 @@ copyPhase.startNextPhase(); } catch (IOException e) { // failed: try again later - LOG.warning("copy failed: "+loc.getMapTaskId()+" from "+addr); - + LOG.log(Level.WARNING, + "copy failed: "+loc.getMapTaskId()+" from "+addr, + e); } finally { MapOutputFile.setProgressReporter(null); }
svn commit: r280368 - /lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/TestClient.java
Author: cutting Date: Mon Sep 12 10:03:00 2005 New Revision: 280368 URL: http://svn.apache.org/viewcvs?rev=280368&view=rev Log: Change so that -du and -ls commands work with zero arguments. Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/TestClient.java Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/TestClient.java URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/TestClient.java?rev=280368&r1=280367&r2=280368&view=diff == --- lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/TestClient.java (original) +++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/TestClient.java Mon Sep 12 10:03:00 2005 @@ -239,17 +239,17 @@ } else if ("-moveToLocal".equals(cmd)) { tc.moveToLocal(argv[i++], new File(argv[i++])); } else if ("-ls".equals(cmd)) { -tc.ls(argv[i++]); +String arg = i < argv.length ? argv[i++] : ""; +tc.ls(arg); } else if ("-mv".equals(cmd)) { tc.rename(argv[i++], argv[i++]); } else if ("-cp".equals(cmd)) { tc.copy(argv[i++], argv[i++]); } else if ("-rm".equals(cmd)) { tc.delete(argv[i++]); -} else if ("-ls".equals(cmd)) { -tc.ls(argv[i++]); } else if ("-du".equals(cmd)) { -tc.du(argv[i++]); +String arg = i < argv.length ? argv[i++] : ""; +tc.du(arg); } else if ("-mkdir".equals(cmd)) { tc.mkdir(argv[i++]); } else if ("-report".equals(cmd)) {
svn commit: r280370 - /lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/NDFSFileSystem.java
Author: cutting Date: Mon Sep 12 10:04:33 2005 New Revision: 280370 URL: http://svn.apache.org/viewcvs?rev=280370view=rev Log: Fix to correctly convert empty path to home directory rather than root. Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/NDFSFileSystem.java Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/NDFSFileSystem.java URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/NDFSFileSystem.java?rev=280370r1=280369r2=280370view=diff == --- lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/NDFSFileSystem.java (original) +++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/fs/NDFSFileSystem.java Mon Sep 12 10:04:33 2005 @@ -50,12 +50,11 @@ public String getName() { return name; } private UTF8 getPath(File file) { - File f = file; String path = getNDFSPath(file); if (!path.startsWith(NDFSFile.NDFS_FILE_SEPARATOR)) { -f = new File(HOME_DIR, path); +path = getNDFSPath(new File(HOME_DIR, path)); // make absolute } - return new UTF8(getNDFSPath(f)); + return new UTF8(path); } /** @@ -305,17 +304,10 @@ parent = parent.getParentFile(); } StringBuffer path = new StringBuffer(); - String fname = (String) l.get(l.size() - 1); - if (!.equals(fname)) { -path.append(fname); //handle not absolute paths - } else { -if (l.size() == 1) - path.append(NDFSFile.NDFS_FILE_SEPARATOR); //handle root path - } + path.append(l.get(l.size() - 1)); for (int i = l.size() - 2; i = 0; i--) { -fname = (String) l.get(i); path.append(NDFSFile.NDFS_FILE_SEPARATOR); -path.append(fname); +path.append(l.get(i)); } return path.toString(); }
svn commit: r279596 - /lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/JobTracker.java
Author: cutting Date: Thu Sep 8 11:09:28 2005 New Revision: 279596 URL: http://svn.apache.org/viewcvs?rev=279596view=rev Log: Fix so that input splitting errors don't leave job hung. Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/JobTracker.java Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/JobTracker.java URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/JobTracker.java?rev=279596r1=279595r2=279596view=diff == --- lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/JobTracker.java (original) +++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/JobTracker.java Thu Sep 8 11:09:28 2005 @@ -521,7 +521,17 @@ JobInProgress createJob(String jobFile) throws IOException { JobInProgress job = new JobInProgress(jobFile); jobs.put(job.getProfile().getJobId(), job); -job.launch(); + +boolean error = true; +try { + job.launch(); + error = false; +} finally { + if (error) { +job.kill(); + } +} + return job; }
svn commit: r279397 - /lucene/nutch/branches/mapred/src/test/org/apache/nutch/fs/TestNutchFileSystem.java
Author: cutting Date: Wed Sep 7 11:42:11 2005 New Revision: 279397 URL: http://svn.apache.org/viewcvs?rev=279397view=rev Log: Add seek test. Modified: lucene/nutch/branches/mapred/src/test/org/apache/nutch/fs/TestNutchFileSystem.java Modified: lucene/nutch/branches/mapred/src/test/org/apache/nutch/fs/TestNutchFileSystem.java URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/test/org/apache/nutch/fs/TestNutchFileSystem.java?rev=279397r1=279396r2=279397view=diff == --- lucene/nutch/branches/mapred/src/test/org/apache/nutch/fs/TestNutchFileSystem.java (original) +++ lucene/nutch/branches/mapred/src/test/org/apache/nutch/fs/TestNutchFileSystem.java Wed Sep 7 11:42:11 2005 @@ -31,6 +31,7 @@ private static final Logger LOG = InputFormatBase.LOG; private static final long MEGA = 1024 * 1024; + private static final int SEEKS_PER_FILE = 4; private static String ROOT = System.getProperty(test.build.data,fs_test); private static File CONTROL_DIR = new File(ROOT, fs_control); @@ -253,11 +254,94 @@ } + public static class SeekMapper extends NutchConfigured implements Mapper { +private Random random = new Random(); +private byte[] check = new byte[8192]; +private NutchFileSystem fs; + +{ + try { +fs = NutchFileSystem.get(); + } catch (IOException e) { +throw new RuntimeException(e); + } +} + +public SeekMapper() { super(null); } + +public SeekMapper(NutchConf conf) { super(conf); } + +public void configure(JobConf job) { + setConf(job); +} + +public void map(WritableComparable key, Writable value, +OutputCollector collector, Reporter reporter) + throws IOException { + String name = ((UTF8)key).toString(); + long size = ((LongWritable)value).get(); + long seed = Long.parseLong(name); + + reporter.setStatus(opening + name); + + NFSDataInputStream in = +new NFSDataInputStream(fs.open(new File(DATA_DIR, name))); + + try { +for (int i = 0; i SEEKS_PER_FILE; i++) { + // generate a random position + long position = Math.abs(random.nextLong()) % size; + + // advance 
random state to that position + random.setSeed(seed); + for (int p = 0; p = position; p+= check.length) { +reporter.setStatus(generating data for + name); +random.nextBytes(check); + } + + // seek file to that position + reporter.setStatus(seeking + name); + in.seek(position); + + // check that byte matches + assertEquals(in.readByte(), check[(int)(position % check.length)]); + +} + } finally { +in.close(); + } +} + } + + public static void seekTest(NutchFileSystem fs) +throws Exception { + +fs.delete(READ_DIR); + +JobConf job = new JobConf(NutchConf.get()); + +job.setInputDir(CONTROL_DIR); +job.setInputFormat(SequenceFileInputFormat.class); +job.setInputKeyClass(UTF8.class); +job.setInputValueClass(LongWritable.class); + +job.setMapperClass(SeekMapper.class); +job.setReducerClass(LongSumReducer.class); + +job.setOutputDir(READ_DIR); +job.setOutputKeyClass(UTF8.class); +job.setOutputValueClass(LongWritable.class); +job.setNumReduceTasks(1); +JobClient.runJob(job); + } + + public static void main(String[] args) throws Exception { int megaBytes = 10; int files = 100; boolean noRead = false; boolean noWrite = false; +boolean noSeek = false; long seed = new Random().nextLong(); String usage = Usage: TestNutchFileSystem -files N -megaBytes M [-noread] [-nowrite]; @@ -290,6 +374,9 @@ } if (!noRead) { readTest(fs); +} +if (!noSeek) { + seekTest(fs); } }
svn commit: r279417 - /lucene/nutch/branches/mapred/src/test/org/apache/nutch/fs/TestNutchFileSystem.java
Author: cutting Date: Wed Sep 7 13:34:00 2005 New Revision: 279417 URL: http://svn.apache.org/viewcvs?rev=279417view=rev Log: Run seek test as unit test; add -noseek command line option. Modified: lucene/nutch/branches/mapred/src/test/org/apache/nutch/fs/TestNutchFileSystem.java Modified: lucene/nutch/branches/mapred/src/test/org/apache/nutch/fs/TestNutchFileSystem.java URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/test/org/apache/nutch/fs/TestNutchFileSystem.java?rev=279417r1=279416r2=279417view=diff == --- lucene/nutch/branches/mapred/src/test/org/apache/nutch/fs/TestNutchFileSystem.java (original) +++ lucene/nutch/branches/mapred/src/test/org/apache/nutch/fs/TestNutchFileSystem.java Wed Sep 7 13:34:00 2005 @@ -56,6 +56,7 @@ createControlFile(fs, megaBytes, numFiles, seed); writeTest(fs); readTest(fs); +seekTest(fs); } public static void createControlFile(NutchFileSystem fs, @@ -344,7 +345,7 @@ boolean noSeek = false; long seed = new Random().nextLong(); -String usage = Usage: TestNutchFileSystem -files N -megaBytes M [-noread] [-nowrite]; +String usage = Usage: TestNutchFileSystem -files N -megaBytes M [-noread] [-nowrite] [-noseek]; if (args.length == 0) { System.err.println(usage); @@ -359,6 +360,8 @@ noRead = true; } else if (args[i].equals(-nowrite)) { noWrite = true; + } else if (args[i].equals(-noseek)) { +noSeek = true; } }
svn commit: r265762 - /lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/FetchedSegments.java
Author: cutting Date: Thu Sep 1 11:35:15 2005 New Revision: 265762 URL: http://svn.apache.org/viewcvs?rev=265762&view=rev Log: Use partitioner to get partition. Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/FetchedSegments.java Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/FetchedSegments.java URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/FetchedSegments.java?rev=265762&r1=265761&r2=265762&view=diff == --- lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/FetchedSegments.java (original) +++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/FetchedSegments.java Thu Sep 1 11:35:15 2005 @@ -31,6 +31,8 @@ import org.apache.nutch.parse.*; import org.apache.nutch.pagedb.*; import org.apache.nutch.indexer.*; +import org.apache.nutch.mapred.*; +import org.apache.nutch.mapred.lib.*; /** Implements {@link HitSummarizer} and {@link HitContent} for a set of * fetched segments. */ @@ -44,6 +46,8 @@ private MapFile.Reader[] parseText; private MapFile.Reader[] parseData; +private Partitioner partitioner = new HashPartitioner(); + public Segment(NutchFileSystem nfs, File segmentDir) throws IOException { this.nfs = nfs; this.segmentDir = segmentDir; @@ -93,7 +97,8 @@ // hash the url to figure out which part its in private Writable getEntry(MapFile.Reader[] readers, UTF8 url, Writable entry) throws IOException { - return readers[url.hashCode()%readers.length].get(url, entry); + int part = partitioner.getPartition(url, null, readers.length); + return readers[part].get(url, entry); } }
svn commit: r265778 - in /lucene/nutch/branches/mapred/src: java/org/apache/nutch/crawl/ java/org/apache/nutch/mapred/ java/org/apache/nutch/searcher/ web/jsp/
Author: cutting Date: Thu Sep 1 14:03:51 2005 New Revision: 265778 URL: http://svn.apache.org/viewcvs?rev=265778view=rev Log: Fix anchor inlink access. Added: lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/HitInlinks.java lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/LinkDbReader.java Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/LinkDb.java lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/MapFileOutputFormat.java lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/DistributedSearch.java lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/FetchedSegments.java lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/HitContent.java lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/NutchBean.java lucene/nutch/branches/mapred/src/web/jsp/anchors.jsp Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/LinkDb.java URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/LinkDb.java?rev=265778r1=265777r2=265778view=diff == --- lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/LinkDb.java (original) +++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/LinkDb.java Thu Sep 1 14:03:51 2005 @@ -131,9 +131,6 @@ JobConf job = new JobConf(config); -job.setInt(partition.url.by.host.seed, new Random().nextInt()); -job.setPartitionerClass(PartitionUrlByHost.class); - job.setInputFormat(SequenceFileInputFormat.class); job.setInputKeyClass(UTF8.class); job.setInputValueClass(ParseData.class); Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/MapFileOutputFormat.java URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/MapFileOutputFormat.java?rev=265778r1=265777r2=265778view=diff == --- lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/MapFileOutputFormat.java (original) +++ 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/MapFileOutputFormat.java Thu Sep 1 14:03:51 2005 @@ -18,6 +18,7 @@ import java.io.IOException; import java.io.File; +import java.util.Arrays; import org.apache.nutch.fs.NutchFileSystem; @@ -48,6 +49,31 @@ public void close(Reporter reporter) throws IOException { out.close();} }; - } + } + + /** Open the output generated by this format. */ + public static MapFile.Reader[] getReaders(NutchFileSystem fs, File dir) +throws IOException { +File[] names = fs.listFiles(dir); + +// sort names, so that hash partitioning works +Arrays.sort(names); + +MapFile.Reader[] parts = new MapFile.Reader[names.length]; +for (int i = 0; i names.length; i++) { + parts[i] = new MapFile.Reader(fs, names[i].toString()); +} +return parts; + } + + /** Get an entry from output generated by this class. */ + public static Writable getEntry(MapFile.Reader[] readers, + Partitioner partitioner, + WritableComparable key, + Writable value) throws IOException { +int part = partitioner.getPartition(key, value, readers.length); +return readers[part].get(key, value); + } + } Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/DistributedSearch.java URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/DistributedSearch.java?rev=265778r1=265777r2=265778view=diff == --- lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/DistributedSearch.java (original) +++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/DistributedSearch.java Thu Sep 1 14:03:51 2005 @@ -24,6 +24,7 @@ import org.apache.nutch.parse.ParseData; import org.apache.nutch.parse.ParseText; +import org.apache.nutch.crawl.Inlinks; import org.apache.nutch.util.LogFormatter; import org.apache.nutch.io.*; import org.apache.nutch.ipc.RPC; @@ -37,7 +38,7 @@ /** The distributed search protocol. 
*/ public interface Protocol -extends Searcher, HitDetailer, HitSummarizer, HitContent { +extends Searcher, HitDetailer, HitSummarizer, HitContent, HitInlinks { /** The name of the segments searched by this node. */ String[] getSegmentNames(); @@ -71,7 +72,8 @@ /** The search client. */ public static class Client extends Thread -implements Searcher, HitDetailer, HitSummarizer, HitContent, Runnable { +implements Searcher, HitDetailer, HitSummarizer, HitContent, HitInlinks, + Runnable { private InetSocketAddress[] defaultAddresses
svn commit: r264880 - /lucene/nutch/branches/mapred/bin/slaves.sh
Author: cutting Date: Tue Aug 30 15:18:55 2005 New Revision: 264880 URL: http://svn.apache.org/viewcvs?rev=264880view=rev Log: Always put a newline after host name. Modified: lucene/nutch/branches/mapred/bin/slaves.sh Modified: lucene/nutch/branches/mapred/bin/slaves.sh URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/bin/slaves.sh?rev=264880r1=264879r2=264880view=diff == --- lucene/nutch/branches/mapred/bin/slaves.sh (original) +++ lucene/nutch/branches/mapred/bin/slaves.sh Tue Aug 30 15:18:55 2005 @@ -20,6 +20,6 @@ fi for slave in `cat $NUTCH_SLAVES`; do - echo -n $slave:\ + echo $slave: ssh -o ConnectTimeout=1 $slave $@ done
svn commit: r264685 - in /lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred: InterTrackerProtocol.java JobTracker.java TaskTracker.java
Author: cutting Date: Mon Aug 29 20:08:46 2005 New Revision: 264685 URL: http://svn.apache.org/viewcvs?rev=264685view=rev Log: Synchronize things in TaskTracker.offerService() loop. Also remove boxing in the heartbeat RPC. Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/InterTrackerProtocol.java lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/JobTracker.java lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/TaskTracker.java Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/InterTrackerProtocol.java URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/InterTrackerProtocol.java?rev=264685r1=264684r2=264685view=diff == --- lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/InterTrackerProtocol.java (original) +++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/InterTrackerProtocol.java Mon Aug 29 20:08:46 2005 @@ -35,7 +35,7 @@ * TaskTracker must also indicate whether this is the first interaction * (since state refresh) */ - IntWritable emitHeartbeat(TaskTrackerStatus status, BooleanWritable initialContact); + int emitHeartbeat(TaskTrackerStatus status, boolean initialContact); /** Called to get new tasks from from the job tracker for this tracker.*/ Task pollForNewTask(String trackerName); Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/JobTracker.java URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/JobTracker.java?rev=264685r1=264684r2=264685view=diff == --- lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/JobTracker.java (original) +++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/JobTracker.java Mon Aug 29 20:08:46 2005 @@ -329,13 +329,13 @@ /** * Process incoming heartbeat messages from the task trackers. 
*/ -public synchronized IntWritable emitHeartbeat(TaskTrackerStatus trackerStatus, BooleanWritable initialContact) { +public synchronized int emitHeartbeat(TaskTrackerStatus trackerStatus, boolean initialContact) { String trackerName = trackerStatus.getTrackerName(); trackerStatus.setLastSeen(System.currentTimeMillis()); synchronized (taskTrackers) { synchronized (trackerExpiryQueue) { -if (initialContact.get()) { +if (initialContact) { // If it's first contact, then clear out any state hanging around if (taskTrackers.get(trackerName) != null) { taskTrackers.remove(trackerName); @@ -344,14 +344,14 @@ } else { // If not first contact, there should be some record of the tracker if (taskTrackers.get(trackerName) == null) { -return new IntWritable(InterTrackerProtocol.UNKNOWN_TASKTRACKER); +return InterTrackerProtocol.UNKNOWN_TASKTRACKER; } } // Store latest state. If first contact, then save current // state in expiry queue taskTrackers.put(trackerName, trackerStatus); -if (initialContact.get()) { +if (initialContact) { trackerExpiryQueue.add(trackerStatus); } } @@ -359,7 +359,7 @@ updateTaskStatuses(trackerStatus); //LOG.info(Got heartbeat from +trackerName); -return new IntWritable(InterTrackerProtocol.TRACKERS_OK); +return InterTrackerProtocol.TRACKERS_OK; } /** Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/TaskTracker.java URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/TaskTracker.java?rev=264685r1=264684r2=264685view=diff == --- lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/TaskTracker.java (original) +++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/TaskTracker.java Mon Aug 29 20:08:46 2005 @@ -124,7 +124,7 @@ * within the same process space might be restarted, so everything must be * clean. 
*/ -public void close() throws IOException { +public synchronized void close() throws IOException { // Kill running tasks Vector v = new Vector(); for (Iterator it = tasks.values().iterator(); it.hasNext(); ) { @@ -186,7 +186,7 @@ // Emit standard hearbeat message to check in with JobTracker // Vector taskReports = new Vector(); -synchronized (runningTasks) { +synchronized
svn commit: r240279 - in /lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred: MapFileOutputFormat.java MapTask.java RecordWriter.java ReduceTask.java SequenceFileOutputFormat.java TaskTracker.java TextOutputFormat.java
Author: cutting Date: Fri Aug 26 09:37:55 2005 New Revision: 240279 URL: http://svn.apache.org/viewcvs?rev=240279view=rev Log: Always call done() on tasks, setting final progress to 1.0. Also permit RecordWriter.close() to emit progress reports to avoid task timeouts when closing is lengthy. Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/MapFileOutputFormat.java lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/MapTask.java lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/RecordWriter.java lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/ReduceTask.java lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/SequenceFileOutputFormat.java lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/TaskTracker.java lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/TextOutputFormat.java Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/MapFileOutputFormat.java URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/MapFileOutputFormat.java?rev=240279r1=240278r2=240279view=diff == --- lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/MapFileOutputFormat.java (original) +++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/MapFileOutputFormat.java Fri Aug 26 09:37:55 2005 @@ -46,7 +46,7 @@ out.append(key, value); } -public void close() throws IOException { out.close(); } +public void close(Reporter reporter) throws IOException { out.close();} }; } } Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/MapTask.java URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/MapTask.java?rev=240279r1=240278r2=240279view=diff == --- lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/MapTask.java (original) +++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/MapTask.java Fri Aug 26 09:37:55 2005 @@ -118,8 +118,6 @@ 
((CombiningCollector)collector).flush(); } -reportProgress(umbilical, 1.0f); // done - } finally { in.close(); // close input } @@ -130,6 +128,7 @@ } } } +umbilical.done(getTaskId()); } } Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/RecordWriter.java URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/RecordWriter.java?rev=240279r1=240278r2=240279view=diff == --- lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/RecordWriter.java (original) +++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/RecordWriter.java Fri Aug 26 09:37:55 2005 @@ -35,5 +35,5 @@ void write(WritableComparable key, Writable value) throws IOException; /** Close this to future operations.*/ - void close() throws IOException; + void close(Reporter reporter) throws IOException; } Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/ReduceTask.java URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/ReduceTask.java?rev=240279r1=240278r2=240279view=diff == --- lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/ReduceTask.java (original) +++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/ReduceTask.java Fri Aug 26 09:37:55 2005 @@ -264,10 +264,10 @@ } finally { in.close(); lfs.delete(new File(sortedFile)); // remove sorted - out.close(); + out.close(reporter); } -reportProgress(umbilical); +umbilical.done(getTaskId()); } /** Construct output file names so that, when an output directory listing is Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/SequenceFileOutputFormat.java URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/SequenceFileOutputFormat.java?rev=240279r1=240278r2=240279view=diff == --- lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/SequenceFileOutputFormat.java (original) +++ 
lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/SequenceFileOutputFormat.java Fri Aug 26 09:37:55 2005 @@ -46,7 +46,7 @@ out.append(key, value); } -public void close() throws IOException { out.close(); } +public void close(Reporter reporter) throws IOException { out.close
svn commit: r240280 - /lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/LinkDb.java
Author: cutting Date: Fri Aug 26 09:39:11 2005 New Revision: 240280 URL: http://svn.apache.org/viewcvs?rev=240280view=rev Log: Limit to 10,000 inlinks by default. Also optimize a common case. Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/LinkDb.java Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/LinkDb.java URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/LinkDb.java?rev=240280r1=240279r2=240280view=diff == --- lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/LinkDb.java (original) +++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/LinkDb.java Fri Aug 26 09:39:11 2005 @@ -49,7 +49,7 @@ public void configure(JobConf job) { maxAnchorLength = job.getInt(db.max.anchor.length, 100); -maxInlinks = job.getInt(db.max.inlinks, 10); +maxInlinks = job.getInt(db.max.inlinks, 1); } public void map(WritableComparable key, Writable value, @@ -74,9 +74,21 @@ public void reduce(WritableComparable key, Iterator values, OutputCollector output, Reporter reporter) throws IOException { -Inlinks result = new Inlinks(); + +Inlinks result = null; + while (values.hasNext()) { Inlinks inlinks = (Inlinks)values.next(); + + if (result == null) { // optimize a common case +if (inlinks.size() maxInlinks) { + result = inlinks; + continue; +} else { + result = new Inlinks(); +} + } + int end = Math.min(maxInlinks - result.size(), inlinks.size()); for (int i = 0; i end; i++) { result.add(inlinks.get(i));
svn commit: r240346 - /lucene/nutch/branches/mapred/conf/nutch-default.xml
Author: cutting Date: Fri Aug 26 14:21:08 2005 New Revision: 240346 URL: http://svn.apache.org/viewcvs?rev=240346view=rev Log: Fix a crazy default. This made indexing rather slow... Modified: lucene/nutch/branches/mapred/conf/nutch-default.xml Modified: lucene/nutch/branches/mapred/conf/nutch-default.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/conf/nutch-default.xml?rev=240346r1=240345r2=240346view=diff == --- lucene/nutch/branches/mapred/conf/nutch-default.xml (original) +++ lucene/nutch/branches/mapred/conf/nutch-default.xml Fri Aug 26 14:21:08 2005 @@ -488,12 +488,12 @@ property nameindexer.maxMergeDocs/name - value50/value + value2147483647/value descriptionThis number determines the maximum number of Lucene Documents to be merged into a new Lucene segment. Larger values - increase indexing speed and reduce the number of Lucene segments, + increase batch indexing speed and reduce the number of Lucene segments, which reduces the number of open file handles; however, this also - increases RAM usage during indexing. + decreases incremental indexing performance. /description /property
svn commit: r235756 - /lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/TaskRunner.java
Author: cutting Date: Mon Aug 22 10:08:17 2005 New Revision: 235756 URL: http://svn.apache.org/viewcvs?rev=235756view=rev Log: Always kill forked child so that it doesn't consume file handles. Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/TaskRunner.java Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/TaskRunner.java URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/TaskRunner.java?rev=235756r1=235755r2=235756view=diff == --- lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/TaskRunner.java (original) +++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/TaskRunner.java Mon Aug 22 10:08:17 2005 @@ -134,6 +134,8 @@ } catch (InterruptedException e) { throw new IOException(e.toString()); +} finally { + kill(); } }
svn commit: r233569 - /lucene/nutch/branches/mapred/bin/nutch-daemon.sh
Author: cutting Date: Fri Aug 19 15:54:04 2005 New Revision: 233569 URL: http://svn.apache.org/viewcvs?rev=233569view=rev Log: Fix to sync whole tree. Modified: lucene/nutch/branches/mapred/bin/nutch-daemon.sh Modified: lucene/nutch/branches/mapred/bin/nutch-daemon.sh URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/bin/nutch-daemon.sh?rev=233569r1=233568r2=233569view=diff == --- lucene/nutch/branches/mapred/bin/nutch-daemon.sh (original) +++ lucene/nutch/branches/mapred/bin/nutch-daemon.sh Fri Aug 19 15:54:04 2005 @@ -57,7 +57,7 @@ root=`dirname $this`/.. if [ $NUTCH_MASTER != ]; then echo rsync from $NUTCH_MASTER - rsync -a --delete --exclude=.svn $NUTCH_MASTER/{build,bin,lib,conf} $root + rsync -a --delete --exclude=.svn $NUTCH_MASTER/ $root fi cd $root
svn commit: r233360 - /lucene/nutch/branches/mapred/src/java/org/apache/nutch/protocol/Content.java
Author: cutting Date: Thu Aug 18 12:19:05 2005 New Revision: 233360 URL: http://svn.apache.org/viewcvs?rev=233360&view=rev Log: Fix a bug in equals(), where the other object may still be deflated. Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/protocol/Content.java Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/protocol/Content.java URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/protocol/Content.java?rev=233360&r1=233359&r2=233360&view=diff == --- lucene/nutch/branches/mapred/src/java/org/apache/nutch/protocol/Content.java (original) +++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/protocol/Content.java Thu Aug 18 12:19:05 2005 @@ -158,6 +158,7 @@ return false; } Content that = (Content)o; +that.ensureInflated(); return this.url.equals(that.url) && this.base.equals(that.base) &&
svn commit: r232841 - in /lucene/nutch/branches/mapred/src/java/org/apache/nutch: io/CompressedWritable.java protocol/Content.java
Author: cutting Date: Mon Aug 15 11:10:23 2005 New Revision: 232841 URL: http://svn.apache.org/viewcvs?rev=232841&view=rev Log: Lazily decompress content. Added: lucene/nutch/branches/mapred/src/java/org/apache/nutch/io/CompressedWritable.java Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/protocol/Content.java Added: lucene/nutch/branches/mapred/src/java/org/apache/nutch/io/CompressedWritable.java URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/io/CompressedWritable.java?rev=232841&view=auto == --- lucene/nutch/branches/mapred/src/java/org/apache/nutch/io/CompressedWritable.java (added) +++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/io/CompressedWritable.java Mon Aug 15 11:10:23 2005 @@ -0,0 +1,81 @@ +/** + * Copyright 2005 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.io; + +import java.io.IOException; +import java.io.DataInput; +import java.io.DataOutput; +import java.io.DataOutputStream; +import java.io.DataInputStream; +import java.io.ByteArrayOutputStream; +import java.io.ByteArrayInputStream; +import java.util.zip.DeflaterOutputStream; +import java.util.zip.InflaterInputStream; + +/** A base-class for Writables which store themselves compressed and lazily + * inflate on field access. 
This is useful for large objects whose fields are + * not be altered during a map or reduce operation: leaving the field data + * compressed makes copying the instance from one file to another much + * faster. */ +public abstract class CompressedWritable implements Writable { + // if non-null, the compressed field data of this instance. + private byte[] compressed; + + public CompressedWritable() {} + + public final void readFields(DataInput in) throws IOException { +compressed = new byte[in.readInt()]; +in.readFully(compressed, 0, compressed.length); + } + + /** Must be called by all methods which access fields to ensure that the data + * has been uncompressed. */ + protected void ensureInflated() { +if (compressed != null) { + try { +ByteArrayInputStream deflated = new ByteArrayInputStream(compressed); +DataInput inflater = + new DataInputStream(new InflaterInputStream(deflated)); +readFieldsCompressed(inflater); +compressed = null; + } catch (IOException e) { +throw new RuntimeException(e); + } +} + } + + /** Subclasses implement this instead of {@link #readFields(DataInput)}. */ + protected abstract void readFieldsCompressed(DataInput in) +throws IOException; + + public final void write(DataOutput out) throws IOException { +if (compressed == null) { + ByteArrayOutputStream deflated = new ByteArrayOutputStream(); + DataOutputStream deflater = +new DataOutputStream(new DeflaterOutputStream(deflated)); + writeCompressed(deflater); + deflater.close(); + compressed = deflated.toByteArray(); +} +out.writeInt(compressed.length); +out.write(compressed); + } + + /** Subclasses implement this instead of {@link #write(DataOutput)}. 
*/ + protected abstract void writeCompressed(DataOutput out) throws IOException; + +} Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/protocol/Content.java URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/protocol/Content.java?rev=232841&r1=232840&r2=232841&view=diff == --- lucene/nutch/branches/mapred/src/java/org/apache/nutch/protocol/Content.java (original) +++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/protocol/Content.java Mon Aug 15 11:10:23 2005 @@ -23,12 +23,13 @@ import org.apache.nutch.fs.*; import org.apache.nutch.util.*; -public final class Content extends VersionedWritable { +public final class Content extends CompressedWritable { public static final String DIR_NAME = "content"; private final static byte VERSION = 1; + private byte version; private String url; private String base; private byte[] content; @@ -53,15 +54,16 @@ this.metadata = metadata; } - public byte getVersion() { return VERSION; } - - public final void readFields(DataInput in) throws IOException { -super.readFields
svn commit: r225344 - /lucene/nutch/branches/mapred/src/java/org/apache/nutch/io/SequenceFile.java
Author: cutting Date: Tue Jul 26 09:40:00 2005 New Revision: 225344 URL: http://svn.apache.org/viewcvs?rev=225344&view=rev Log: Fix bug with syncs in large merges. Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/io/SequenceFile.java Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/io/SequenceFile.java URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/io/SequenceFile.java?rev=225344&r1=225343&r2=225344&view=diff == --- lucene/nutch/branches/mapred/src/java/org/apache/nutch/io/SequenceFile.java (original) +++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/io/SequenceFile.java Tue Jul 26 09:40:00 2005 @@ -618,7 +618,8 @@ this.pass = pass; this.last = last; -this.queue = new MergeQueue(factor, last ? outFile : outFile+"."+pass); +this.queue = + new MergeQueue(factor, last ? outFile : outFile+"."+pass, last); this.inName = outFile+"."+(pass-1); this.in = new NFSDataInputStream(nfs.open(new File(inName))); @@ -695,7 +696,7 @@ private MergeQueue queue; public MergeFiles() throws IOException { -this.queue = new MergeQueue(factor, outFile); +this.queue = new MergeQueue(factor, outFile, true); } public void close() throws IOException { @@ -741,12 +742,15 @@ private class MergeQueue extends PriorityQueue { private NFSDataOutputStream out; + private boolean done; - public MergeQueue(int size, String outName) throws IOException { + public MergeQueue(int size, String outName, boolean done) +throws IOException { initialize(size); this.out = new NFSDataOutputStream(nfs.create(new File(outName)), memory/(factor+1)); +this.done = done; } protected boolean lessThan(Object a, Object b) { @@ -758,6 +762,9 @@ public void merge() throws IOException { Writer writer = new Writer(out, keyClass, valClass); +if (!done) { + writer.sync = null; // disable sync on temp files +} while (size() != 0) { MergeStream ms = (MergeStream)top();
svn commit: r219566 - in /lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred: TaskRunner.java TaskTracker.java
Author: cutting Date: Mon Jul 18 13:57:34 2005 New Revision: 219566 URL: http://svn.apache.org/viewcvs?rev=219566&view=rev Log: Catch Throwable, not just Exception, and always log and report it to tracker. Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/TaskRunner.java lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/TaskTracker.java Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/TaskRunner.java URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/TaskRunner.java?rev=219566&r1=219565&r2=219566&view=diff == --- lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/TaskRunner.java (original) +++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/TaskRunner.java Mon Jul 18 13:57:34 2005 @@ -94,8 +94,15 @@ t.getTaskId() // pass task identifier }, null); -} catch (Exception e) { - LOG.log(Level.WARNING, "Child Error", e); +} catch (Throwable throwable) { + LOG.log(Level.WARNING, "Child Error", throwable); + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + throwable.printStackTrace(new PrintStream(baos)); + try { +tracker.reportDiagnosticInfo(t.getTaskId(), baos.toString()); + } catch (IOException e) { +LOG.log(Level.WARNING, "Reporting Diagnostics", e); + } } finally { tracker.reportTaskFinished(t.getTaskId()); } Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/TaskTracker.java URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/TaskTracker.java?rev=219566&r1=219565&r2=219566&view=diff == --- lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/TaskTracker.java (original) +++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/mapred/TaskTracker.java Mon Jul 18 13:57:34 2005 @@ -502,7 +502,7 @@ * The main() for child processes. 
*/ public static class Child { -public static void main(String[] args) throws Exception { +public static void main(String[] args) throws Throwable { LogFormatter.showTime(false); LOG.info("Child starting"); @@ -516,12 +516,12 @@ JobConf job = new JobConf(task.getJobFile()); try { task.run(job, umbilical); // run the task - } catch (Exception ie) { + } catch (Throwable throwable) { + LOG.log(Level.WARNING, "Failed to spawn child", throwable); // Report back any failures, for diagnostic purposes ByteArrayOutputStream baos = new ByteArrayOutputStream(); - ie.printStackTrace(new PrintStream(baos)); + throwable.printStackTrace(new PrintStream(baos)); umbilical.reportDiagnosticInfo(taskid, baos.toString()); - throw ie; } umbilical.done(taskid); }
svn commit: r219563 - in /lucene/nutch/branches/mapred/conf: crawl-urlfilter.txt.template regex-urlfilter.txt.template
Author: cutting Date: Mon Jul 18 13:42:37 2005 New Revision: 219563 URL: http://svn.apache.org/viewcvs?rev=219563&view=rev Log: Skip URLs with repeating segments. Modified: lucene/nutch/branches/mapred/conf/crawl-urlfilter.txt.template lucene/nutch/branches/mapred/conf/regex-urlfilter.txt.template Modified: lucene/nutch/branches/mapred/conf/crawl-urlfilter.txt.template URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/conf/crawl-urlfilter.txt.template?rev=219563&r1=219562&r2=219563&view=diff == --- lucene/nutch/branches/mapred/conf/crawl-urlfilter.txt.template (original) +++ lucene/nutch/branches/mapred/conf/crawl-urlfilter.txt.template Mon Jul 18 13:42:37 2005 @@ -17,6 +17,9 @@ # skip URLs containing certain characters as probable queries, etc. -[?*!@=] +# skip URLs with slash-delimited segment that repeats 3+ times, to break loops +-.*(/.+?)/.*?\1/.*?\1/ + # accept hosts in MY.DOMAIN.NAME +^http://([a-z0-9]*\.)*MY.DOMAIN.NAME/ Modified: lucene/nutch/branches/mapred/conf/regex-urlfilter.txt.template URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/conf/regex-urlfilter.txt.template?rev=219563&r1=219562&r2=219563&view=diff == --- lucene/nutch/branches/mapred/conf/regex-urlfilter.txt.template (original) +++ lucene/nutch/branches/mapred/conf/regex-urlfilter.txt.template Mon Jul 18 13:42:37 2005 @@ -15,5 +15,8 @@ # skip URLs containing certain characters as probable queries, etc. -[?*!@=] +# skip URLs with slash-delimited segment that repeats 3+ times, to break loops +-.*(/.+?)/.*?\1/.*?\1/ + # accept anything else +.
svn commit: r210201 - /lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Crawl.java
Author: cutting Date: Mon Jul 11 13:05:28 2005 New Revision: 210201 URL: http://svn.apache.org/viewcvs?rev=210201&view=rev Log: Store indexes in indexes directory. Use correct FS to list segments. Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Crawl.java Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Crawl.java URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Crawl.java?rev=210201&r1=210200&r2=210201&view=diff == --- lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Crawl.java (original) +++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Crawl.java Mon Jul 11 13:05:28 2005 @@ -91,7 +91,7 @@ File crawlDb = new File(dir + "/crawldb"); File linkDb = new File(dir + "/linkdb"); File segments = new File(dir + "/segments"); -File index = new File(dir + "/index"); +File index = new File(dir + "/indexes"); // initialize crawlDb new Injector(conf).inject(crawlDb, rootUrlFile); @@ -108,7 +108,7 @@ new LinkDb(conf).invert(linkDb, segments); // invert links // index -new Indexer(conf).index(index, linkDb, segments.listFiles()); +new Indexer(conf).index(index, linkDb, fs.listFiles(segments)); LOG.info("crawl finished: " + dir); }
svn commit: r213607 - in /lucene/nutch/branches/mapred: ./ conf/ src/java/org/apache/nutch/crawl/ src/java/org/apache/nutch/indexer/ src/java/org/apache/nutch/searcher/
Author: cutting Date: Mon Jul 11 14:30:22 2005 New Revision: 213607 URL: http://svn.apache.org/viewcvs?rev=213607&view=rev Log: Get search working on NDFS-resident, MapReduce-created crawl. Modified: lucene/nutch/branches/mapred/build.xml lucene/nutch/branches/mapred/conf/nutch-default.xml lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Indexer.java lucene/nutch/branches/mapred/src/java/org/apache/nutch/indexer/NdfsDirectory.java lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/FetchedSegments.java lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/HitDetails.java lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/IndexSearcher.java lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/NutchBean.java Modified: lucene/nutch/branches/mapred/build.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/build.xml?rev=213607&r1=213606&r2=213607&view=diff == --- lucene/nutch/branches/mapred/build.xml (original) +++ lucene/nutch/branches/mapred/build.xml Mon Jul 11 14:30:22 2005 @@ -119,7 +119,7 @@ !-- == -- !---- !-- == -- - target name=war depends=compile,generate-docs + target name=war depends=jar,compile,generate-docs war destfile=${build.dir}/${final.name}.war webxml=${web.src.dir}/web.xml fileset dir=${web.src.dir}/jsp/ Modified: lucene/nutch/branches/mapred/conf/nutch-default.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/conf/nutch-default.xml?rev=213607&r1=213606&r2=213607&view=diff == --- lucene/nutch/branches/mapred/conf/nutch-default.xml (original) +++ lucene/nutch/branches/mapred/conf/nutch-default.xml Mon Jul 11 14:30:22 2005 @@ -498,9 +498,9 @@ property namesearcher.dir/name - value./value + valuecrawl/value description - Path to root of index directories. This directory is searched (in + Path to root of crawl. 
This directory is searched (in order) for either the file search-servers.txt, containing a list of distributed search servers, or the directory index containing merged indexes, or the directory segments containing segment Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Indexer.java URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Indexer.java?rev=213607&r1=213606&r2=213607&view=diff == --- lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Indexer.java (original) +++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Indexer.java Mon Jul 11 14:30:22 2005 @@ -100,6 +100,7 @@ writer.optimize(); writer.close(); fs.completeLocalOutput(perm, temp); // copy to ndfs +fs.createNewFile(new File(perm, IndexSegment.DONE_NAME)); } }; } Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/indexer/NdfsDirectory.java URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/indexer/NdfsDirectory.java?rev=213607&r1=213606&r2=213607&view=diff == --- lucene/nutch/branches/mapred/src/java/org/apache/nutch/indexer/NdfsDirectory.java (original) +++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/indexer/NdfsDirectory.java Mon Jul 11 14:30:22 2005 @@ -1,5 +1,3 @@ -package org.apache.lucene.store; - /** * Copyright 2004 The Apache Software Foundation * @@ -15,6 +13,8 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ + +package org.apache.nutch.indexer; import java.io.*; import org.apache.lucene.store.*; Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/FetchedSegments.java URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/FetchedSegments.java?rev=213607&r1=213606&r2=213607&view=diff == --- lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/FetchedSegments.java (original) +++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/searcher/FetchedSegments.java Mon Jul 11 14:30:22 2005 @@ -39,57 +39,64 @@ private NutchFileSystem nfs; private File segmentDir; -private ArrayFile.Reader fetcher; -private ArrayFile.Reader content; -private ArrayFile.Reader text; -private ArrayFile.Reader parsedata; +private MapFile.Reader[] content; +private
svn commit: r210036 - /lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java
Author: cutting Date: Sun Jul 10 14:20:46 2005 New Revision: 210036 URL: http://svn.apache.org/viewcvs?rev=210036&view=rev Log: Actually use the new InputFormat! Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java Modified: lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java URL: http://svn.apache.org/viewcvs/lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java?rev=210036&r1=210035&r2=210036&view=diff == --- lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java (original) +++ lucene/nutch/branches/mapred/src/java/org/apache/nutch/crawl/Fetcher.java Sun Jul 10 14:20:46 2005 @@ -38,7 +38,7 @@ public static final String DIGEST_KEY = "nutch.content.digest"; - public class InputFormat extends SequenceFileInputFormat { + public static class InputFormat extends SequenceFileInputFormat { /** Don't split inputs, to keep things polite. */ public FileSplit[] getSplits(NutchFileSystem fs, JobConf job, int nSplits) throws IOException { @@ -253,7 +253,7 @@ job.setInt("fetcher.threads.fetch", threads); job.setInputDir(new File(segment, CrawlDatum.GENERATE_DIR_NAME)); -job.setInputFormat(SequenceFileInputFormat.class); +job.setInputFormat(InputFormat.class); job.setInputKeyClass(UTF8.class); job.setInputValueClass(CrawlDatum.class);