Author: kubes
Date: Tue Dec  2 06:47:35 2008
New Revision: 722477

URL: http://svn.apache.org/viewvc?rev=722477&view=rev
Log:
NUTCH-663: Upgrade Nutch to use Hadoop 0.19
Added:
    lucene/nutch/trunk/lib/hadoop-0.19.0-core.jar   (with props)
Removed:
    lucene/nutch/trunk/lib/hadoop-0.18.1-core.jar
Modified:
    lucene/nutch/trunk/CHANGES.txt
    lucene/nutch/trunk/lib/native/Linux-amd64-64/libhadoop.a
    lucene/nutch/trunk/lib/native/Linux-amd64-64/libhadoop.so
    lucene/nutch/trunk/lib/native/Linux-amd64-64/libhadoop.so.1
    lucene/nutch/trunk/lib/native/Linux-amd64-64/libhadoop.so.1.0.0
    lucene/nutch/trunk/lib/native/Linux-i386-32/libhadoop.a
    lucene/nutch/trunk/lib/native/Linux-i386-32/libhadoop.so
    lucene/nutch/trunk/lib/native/Linux-i386-32/libhadoop.so.1
    lucene/nutch/trunk/lib/native/Linux-i386-32/libhadoop.so.1.0.0
    lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
    lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java
    lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java
    lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java

Modified: lucene/nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=722477&r1=722476&r2=722477&view=diff
==============================================================================
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Tue Dec  2 06:47:35 2008
@@ -285,6 +285,8 @@
      (dogacan)
 
 105. NUTCH-662 - Upgrade Nutch to use Lucene 2.4. (kubes)
+
+106. NUTCH-663 - Upgrade Nutch to use Hadoop 0.19 (kubes)
 
 Release 0.9 - 2007-04-02

Added: lucene/nutch/trunk/lib/hadoop-0.19.0-core.jar
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/hadoop-0.19.0-core.jar?rev=722477&view=auto
==============================================================================
Binary file - no diff available.

Propchange: lucene/nutch/trunk/lib/hadoop-0.19.0-core.jar
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Modified: lucene/nutch/trunk/lib/native/Linux-amd64-64/libhadoop.a
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/native/Linux-amd64-64/libhadoop.a?rev=722477&r1=722476&r2=722477&view=diff
==============================================================================
Binary files - no diff available.

Modified: lucene/nutch/trunk/lib/native/Linux-amd64-64/libhadoop.so
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/native/Linux-amd64-64/libhadoop.so?rev=722477&r1=722476&r2=722477&view=diff
==============================================================================
Binary files - no diff available.

Modified: lucene/nutch/trunk/lib/native/Linux-amd64-64/libhadoop.so.1
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/native/Linux-amd64-64/libhadoop.so.1?rev=722477&r1=722476&r2=722477&view=diff
==============================================================================
Binary files - no diff available.

Modified: lucene/nutch/trunk/lib/native/Linux-amd64-64/libhadoop.so.1.0.0
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/native/Linux-amd64-64/libhadoop.so.1.0.0?rev=722477&r1=722476&r2=722477&view=diff
==============================================================================
Binary files - no diff available.

Modified: lucene/nutch/trunk/lib/native/Linux-i386-32/libhadoop.a
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/native/Linux-i386-32/libhadoop.a?rev=722477&r1=722476&r2=722477&view=diff
==============================================================================
Binary files - no diff available.
Modified: lucene/nutch/trunk/lib/native/Linux-i386-32/libhadoop.so
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/native/Linux-i386-32/libhadoop.so?rev=722477&r1=722476&r2=722477&view=diff
==============================================================================
Binary files - no diff available.

Modified: lucene/nutch/trunk/lib/native/Linux-i386-32/libhadoop.so.1
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/native/Linux-i386-32/libhadoop.so.1?rev=722477&r1=722476&r2=722477&view=diff
==============================================================================
Binary files - no diff available.

Modified: lucene/nutch/trunk/lib/native/Linux-i386-32/libhadoop.so.1.0.0
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/native/Linux-i386-32/libhadoop.so.1.0.0?rev=722477&r1=722476&r2=722477&view=diff
==============================================================================
Binary files - no diff available.

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java?rev=722477&r1=722476&r2=722477&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Tue Dec  2 06:47:35 2008
@@ -58,12 +58,13 @@
 
     /** Don't split inputs, to keep things polite. */
     public InputSplit[] getSplits(JobConf job, int nSplits) throws IOException {
-      Path[] files = listPaths(job);
+      FileStatus[] files = listStatus(job);
       FileSystem fs = FileSystem.get(job);
       InputSplit[] splits = new InputSplit[files.length];
       for (int i = 0; i < files.length; i++) {
-        splits[i] = new FileSplit(files[i], 0,
-            fs.getFileStatus(files[i]).getLen(), (String[])null);
+        FileStatus cur = files[i];
+        splits[i] = new FileSplit(cur.getPath(), 0,
+            cur.getLen(), (String[])null);
       }
       return splits;
     }
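Note: Hadoop 0.19 drops FileInputFormat.listPaths() in favour of the protected listStatus(), and the returned FileStatus objects already carry the file length, so the per-file fs.getFileStatus() round-trip above goes away (the now-unused FileSystem handle is left in place by this commit). Fetcher2 below gets the identical change. A minimal self-contained sketch of the resulting pattern against the 0.19 mapred API; the class name and the Text key/value types are illustrative, not Nutch's:

import java.io.IOException;

import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.SequenceFileInputFormat;

/** Illustrative 0.19-style input format that never splits its inputs. */
public class WholeFileInputFormat extends SequenceFileInputFormat<Text, Text> {

  public InputSplit[] getSplits(JobConf job, int nSplits) throws IOException {
    // listStatus() replaces the removed listPaths(); each FileStatus
    // already knows its length, so no extra getFileStatus() call is needed.
    FileStatus[] files = listStatus(job);
    InputSplit[] splits = new InputSplit[files.length];
    for (int i = 0; i < files.length; i++) {
      FileStatus cur = files[i];
      // One split per file: start at 0, span the whole length, no
      // preferred hosts.
      splits[i] = new FileSplit(cur.getPath(), 0, cur.getLen(), (String[]) null);
    }
    return splits;
  }
}

Leaving each file as a single split preserves the host partitioning of the generated fetch lists, which appears to be what the "keep things polite" comment refers to.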
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java?rev=722477&r1=722476&r2=722477&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java Tue Dec  2 06:47:35 2008
@@ -92,12 +92,13 @@
     /** Don't split inputs, to keep things polite.
      */
     public InputSplit[] getSplits(JobConf job, int nSplits) throws IOException {
-      Path[] files = listPaths(job);
+      FileStatus[] files = listStatus(job);
       FileSplit[] splits = new FileSplit[files.length];
       FileSystem fs = FileSystem.get(job);
       for (int i = 0; i < files.length; i++) {
-        splits[i] = new FileSplit(files[i], 0,
-            fs.getFileStatus(files[i]).getLen(), (String[])null);
+        FileStatus cur = files[i];
+        splits[i] = new FileSplit(cur.getPath(), 0,
+            cur.getLen(), (String[])null);
       }
       return splits;
     }

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java?rev=722477&r1=722476&r2=722477&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java Tue Dec  2 06:47:35 2008
@@ -161,7 +161,6 @@
       try {
         return new SequenceFileRecordReader<Text, MetaWrapper>(job, fSplit) {
-          @Override
           public synchronized boolean next(Text key, MetaWrapper wrapper) throws IOException {
             LOG.debug("Running OIF.next()");
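Note: the SegmentMerger hunk only drops an @Override annotation, presumably because it no longer compiles against 0.19's generic signatures, but the surrounding pattern is worth seeing whole: the reader (the "OIF" in the log line) subclasses the stock SequenceFileRecordReader anonymously so it can hook every next() call. A minimal self-contained sketch of that pattern, with Text/Text standing in for Nutch's Text/MetaWrapper and a println for LOG.debug:

import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileRecordReader;

/** Illustrative input format that logs every record it hands out. */
public class LoggingInputFormat extends SequenceFileInputFormat<Text, Text> {

  public RecordReader<Text, Text> getRecordReader(InputSplit split,
      JobConf job, Reporter reporter) throws IOException {
    final FileSplit fSplit = (FileSplit) split;
    // Same shape as SegmentMerger's reader: subclass the stock
    // SequenceFile reader anonymously and intercept next().
    return new SequenceFileRecordReader<Text, Text>(job, fSplit) {
      public synchronized boolean next(Text key, Text value)
          throws IOException {
        System.err.println("Running next() on " + fSplit.getPath());
        return super.next(key, value);
      }
    };
  }
}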
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java?rev=722477&r1=722476&r2=722477&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java Tue Dec  2 06:47:35 2008
@@ -47,6 +47,7 @@
 import org.apache.hadoop.io.UTF8;
 import org.apache.hadoop.io.Writable;
 import org.apache.hadoop.io.WritableComparable;
+import org.apache.hadoop.mapred.FileInputFormat;
 import org.apache.hadoop.mapred.FileOutputFormat;
 import org.apache.hadoop.mapred.JobClient;
 import org.apache.hadoop.mapred.JobConf;
@@ -203,21 +204,21 @@
     JobConf job = createJobConf();
     job.setJobName("read " + segment);
 
-    if (ge) job.addInputPath(new Path(segment, CrawlDatum.GENERATE_DIR_NAME));
-    if (fe) job.addInputPath(new Path(segment, CrawlDatum.FETCH_DIR_NAME));
-    if (pa) job.addInputPath(new Path(segment, CrawlDatum.PARSE_DIR_NAME));
-    if (co) job.addInputPath(new Path(segment, Content.DIR_NAME));
-    if (pd) job.addInputPath(new Path(segment, ParseData.DIR_NAME));
-    if (pt) job.addInputPath(new Path(segment, ParseText.DIR_NAME));
+    if (ge) FileInputFormat.addInputPath(job, new Path(segment, CrawlDatum.GENERATE_DIR_NAME));
+    if (fe) FileInputFormat.addInputPath(job, new Path(segment, CrawlDatum.FETCH_DIR_NAME));
+    if (pa) FileInputFormat.addInputPath(job, new Path(segment, CrawlDatum.PARSE_DIR_NAME));
+    if (co) FileInputFormat.addInputPath(job, new Path(segment, Content.DIR_NAME));
+    if (pd) FileInputFormat.addInputPath(job, new Path(segment, ParseData.DIR_NAME));
+    if (pt) FileInputFormat.addInputPath(job, new Path(segment, ParseText.DIR_NAME));
 
     job.setInputFormat(SequenceFileInputFormat.class);
     job.setMapperClass(InputCompatMapper.class);
     job.setReducerClass(SegmentReader.class);
 
     Path tempDir = new Path(job.get("hadoop.tmp.dir", "/tmp") + "/segread-" + new java.util.Random().nextInt());
-    fs.delete(tempDir);
+    fs.delete(tempDir, true);
 
-    job.setOutputPath(tempDir);
+    FileOutputFormat.setOutputPath(job, tempDir);
     job.setOutputFormat(TextOutputFormat.class);
     job.setOutputKeyClass(Text.class);
     job.setOutputValueClass(NutchWritable.class);
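Note: the SegmentReader hunks collect the remaining 0.19 deprecation moves in one place: input and output paths are now set through the static FileInputFormat.addInputPath(job, path) and FileOutputFormat.setOutputPath(job, path) helpers rather than methods on JobConf, and FileSystem.delete() now takes an explicit recursive flag. A minimal sketch of the new calls in isolation; the class name and paths are made up:

import java.io.IOException;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobConf;

/** Illustrative 0.19-style job setup. */
public class JobSetupSketch {
  public static void main(String[] args) throws IOException {
    JobConf job = new JobConf(JobSetupSketch.class);

    // 0.18 wrote job.addInputPath(...) and job.setOutputPath(...);
    // 0.19 moves these helpers to static methods on the format classes.
    FileInputFormat.addInputPath(job, new Path("segments/part-00000"));
    Path tempDir = new Path("/tmp/segread-sketch");
    FileOutputFormat.setOutputPath(job, tempDir);

    // The one-argument FileSystem.delete() is deprecated in 0.19;
    // the recursive flag must now be explicit.
    FileSystem fs = FileSystem.get(job);
    fs.delete(tempDir, true);
  }
}

Passing true for the recursive flag matches the old single-argument delete(), which removed directories recursively.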