Author: ab
Date: Wed Mar 19 03:34:14 2008
New Revision: 638779

URL: http://svn.apache.org/viewvc?rev=638779&view=rev
Log:
NUTCH-598 - Remove deprecated use of ToolBase. Use generics in Hadoop API.
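For readers following the patch, the pattern repeated in every tool below is the standard migration away from the deprecated org.apache.hadoop.util.ToolBase: each class now extends Configured and implements Tool, and each main() delegates to ToolRunner. A minimal sketch of the resulting shape (the MyTool name is illustrative, not part of this patch):

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.conf.Configured;
    import org.apache.hadoop.util.Tool;
    import org.apache.hadoop.util.ToolRunner;

    // After the migration: extend Configured, implement Tool.
    public class MyTool extends Configured implements Tool {

      // The old doMain()/run() body moves here; ToolRunner handles the
      // generic Hadoop options (-conf, -D, ...) before calling run().
      public int run(String[] args) throws Exception {
        Configuration conf = getConf();   // no protected 'conf' field anymore
        // ... job setup and submission ...
        return 0;
      }

      public static void main(String[] args) throws Exception {
        // Replaces the deprecated 'new MyTool().doMain(conf, args)'.
        int res = ToolRunner.run(new Configuration(), new MyTool(), args);
        System.exit(res);
      }
    }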
Modified:
    lucene/nutch/trunk/CHANGES.txt
    lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java
    lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbMerger.java
    lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java
    lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java
    lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java
    lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbMerger.java
    lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbReader.java
    lucene/nutch/trunk/src/java/org/apache/nutch/crawl/PartitionUrlByHost.java
    lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
    lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java
    lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexMerger.java
    lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSorter.java
    lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java
    lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java
    lucene/nutch/trunk/src/java/org/apache/nutch/tools/FreeGenerator.java
    lucene/nutch/trunk/src/java/org/apache/nutch/tools/arc/ArcSegmentCreator.java
    lucene/nutch/trunk/src/java/org/apache/nutch/util/HadoopFSUtil.java

Modified: lucene/nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=638779&r1=638778&r2=638779&view=diff
==============================================================================
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Wed Mar 19 03:34:14 2008
@@ -239,6 +239,9 @@
 87. NUTCH-223 - Crawl.java uses Integer.MAX_VALUE (Jeff Ritchie via ab)
 
+88. NUTCH-598 - Remove deprecated use of ToolBase. Use generics in Hadoop API.
+    (Emmanuel Joke, dogacan, ab)
+
 Release 0.9 - 2007-04-02
 
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java?rev=638779&r1=638778&r2=638779&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDb.java Wed Mar 19 03:34:14 2008
@@ -28,8 +28,7 @@
 import org.apache.hadoop.fs.*;
 import org.apache.hadoop.conf.*;
 import org.apache.hadoop.mapred.*;
-import org.apache.hadoop.util.StringUtils;
-import org.apache.hadoop.util.ToolBase;
+import org.apache.hadoop.util.*;
 
 import org.apache.nutch.util.HadoopFSUtil;
 import org.apache.nutch.util.LockUtil;
@@ -40,7 +39,7 @@
  * This class takes the output of the fetcher and updates the
  * crawldb accordingly.
  */
-public class CrawlDb extends ToolBase {
+public class CrawlDb extends Configured implements Tool {
   public static final Log LOG = LogFactory.getLog(CrawlDb.class);
 
   public static final String CRAWLDB_ADDITIONS_ALLOWED = "db.update.additions.allowed";
@@ -48,11 +47,8 @@
   public static final String CURRENT_NAME = "current";
 
   public static final String LOCK_NAME = ".locked";
-
-  public CrawlDb() {
-
-  }
+  public CrawlDb() {}
 
   public CrawlDb(Configuration conf) {
     setConf(conf);
@@ -150,7 +146,7 @@
   }
 
   public static void main(String[] args) throws Exception {
-    int res = new CrawlDb().doMain(NutchConfiguration.create(), args);
+    int res = ToolRunner.run(NutchConfiguration.create(), new CrawlDb(), args);
     System.exit(res);
   }
 
@@ -182,8 +178,8 @@
       } else if (args[i].equals("-noAdditions")) {
         additionsAllowed = false;
       } else if (args[i].equals("-dir")) {
-        Path[] paths = fs.listPaths(new Path(args[++i]), HadoopFSUtil.getPassDirectoriesFilter(fs));
-        dirs.addAll(Arrays.asList(paths));
+        FileStatus[] paths = fs.listStatus(new Path(args[++i]), HadoopFSUtil.getPassDirectoriesFilter(fs));
+        dirs.addAll(Arrays.asList(HadoopFSUtil.getPaths(paths)));
       } else {
         dirs.add(new Path(args[i]));
       }
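The -dir change above is the other recurring edit in this patch: FileSystem.listPaths() was deprecated in favour of listStatus(), which returns FileStatus[] rather than Path[], so call sites unwrap the paths via the new HadoopFSUtil.getPaths() helper added at the bottom of this patch. A hedged sketch of the call-site pattern (ListDirs is an illustrative name):

    import java.io.IOException;

    import org.apache.hadoop.fs.FileStatus;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;
    import org.apache.nutch.util.HadoopFSUtil;

    public class ListDirs {
      /** Lists subdirectories of 'dir' the way the patched call sites do. */
      public static Path[] listDirs(FileSystem fs, Path dir) throws IOException {
        // listStatus() replaces the deprecated listPaths() and returns
        // FileStatus[]; getPaths() (added by this patch) unwraps the Paths.
        FileStatus[] stats = fs.listStatus(dir, HadoopFSUtil.getPassDirectoriesFilter(fs));
        return HadoopFSUtil.getPaths(stats);
      }
    }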
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbMerger.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbMerger.java?rev=638779&r1=638778&r2=638779&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbMerger.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbMerger.java Wed Mar 19 03:34:14 2008
@@ -28,10 +28,9 @@
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.Text;
-import org.apache.hadoop.io.WritableComparable;
 import org.apache.hadoop.mapred.*;
-import org.apache.hadoop.util.StringUtils;
-import org.apache.hadoop.util.ToolBase;
+import org.apache.hadoop.util.*;
+import org.apache.hadoop.conf.*;
 
 import org.apache.nutch.util.NutchConfiguration;
 import org.apache.nutch.util.NutchJob;
@@ -50,10 +49,10 @@
  *
  * @author Andrzej Bialecki
 */
-public class CrawlDbMerger extends ToolBase {
+public class CrawlDbMerger extends Configured implements Tool {
   private static final Log LOG = LogFactory.getLog(CrawlDbMerger.class);
 
-  public static class Merger extends MapReduceBase implements Reducer {
+  public static class Merger extends MapReduceBase implements Reducer<Text, CrawlDatum, Text, CrawlDatum> {
     MapWritable meta = new MapWritable();
     private FetchSchedule schedule;
 
@@ -63,13 +62,13 @@
       schedule = FetchScheduleFactory.getFetchSchedule(conf);
     }
 
-    public void reduce(WritableComparable key, Iterator values, OutputCollector output, Reporter reporter)
+    public void reduce(Text key, Iterator<CrawlDatum> values, OutputCollector<Text, CrawlDatum> output, Reporter reporter)
             throws IOException {
       CrawlDatum res = null;
       long resTime = 0L;
       meta.clear();
       while (values.hasNext()) {
-        CrawlDatum val = (CrawlDatum) values.next();
+        CrawlDatum val = values.next();
         if (res == null) {
           res = val;
           resTime = schedule.calculateLastFetchTime(res);
@@ -138,7 +137,7 @@
    * @param args
    */
   public static void main(String[] args) throws Exception {
-    int res = new CrawlDbMerger().doMain(NutchConfiguration.create(), args);
+    int res = ToolRunner.run(NutchConfiguration.create(), new CrawlDbMerger(), args);
     System.exit(res);
   }

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java?rev=638779&r1=638778&r2=638779&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java Wed Mar 19 03:34:14 2008
@@ -29,8 +29,7 @@
 import org.apache.hadoop.io.*;
 import org.apache.hadoop.conf.*;
 import org.apache.hadoop.mapred.*;
-import org.apache.hadoop.util.StringUtils;
-import org.apache.hadoop.util.ToolBase;
+import org.apache.hadoop.util.*;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 
@@ -45,7 +44,7 @@
 import org.apache.nutch.util.NutchJob;
 
 /** Generates a subset of a crawl db to fetch. */
-public class Generator extends ToolBase {
+public class Generator extends Configured implements Tool {
 
   public static final String CRAWL_GENERATE_FILTER = "crawl.generate.filter";
   public static final String GENERATE_MAX_PER_HOST_BY_IP = "generate.max.per.host.by.ip";
@@ -81,7 +80,7 @@
   }
 
   /** Selects entries due for fetch. */
-  public static class Selector implements Mapper, Partitioner, Reducer {
+  public static class Selector implements Mapper<Text, CrawlDatum, FloatWritable, SelectorEntry>, Partitioner<FloatWritable, Writable>, Reducer<FloatWritable, SelectorEntry, FloatWritable, SelectorEntry> {
     private LongWritable genTime = new LongWritable(System.currentTimeMillis());
     private long curTime;
     private long limit;
@@ -89,7 +88,7 @@
     private HashMap<String, IntWritable> hostCounts =
       new HashMap<String, IntWritable>();
     private int maxPerHost;
-    private Partitioner hostPartitioner = new PartitionUrlByHost();
+    private Partitioner<Text, Writable> hostPartitioner = new PartitionUrlByHost();
     private URLFilters filters;
     private URLNormalizers normalizers;
     private ScoringFilters scfilters;
@@ -120,10 +119,10 @@
     public void close() {}
 
     /** Select & invert subset due for fetch. */
-    public void map(WritableComparable key, Writable value,
-                    OutputCollector output, Reporter reporter)
+    public void map(Text key, CrawlDatum value,
+                    OutputCollector<FloatWritable, SelectorEntry> output, Reporter reporter)
       throws IOException {
-      Text url = (Text)key;
+      Text url = key;
       if (filter) {
         // If filtering is on don't generate URLs that don't pass URLFilters
         try {
@@ -136,7 +135,7 @@
           }
         }
       }
-      CrawlDatum crawlDatum = (CrawlDatum)value;
+      CrawlDatum crawlDatum = value;
 
       // check fetch schedule
       if (!schedule.shouldFetch(url, crawlDatum, curTime)) {
@@ -167,20 +166,21 @@
     }
 
     /** Partition by host. */
-    public int getPartition(WritableComparable key, Writable value,
+    public int getPartition(FloatWritable key, Writable value,
                             int numReduceTasks) {
       return hostPartitioner.getPartition(((SelectorEntry)value).url, key, numReduceTasks);
     }
 
     /** Collect until limit is reached. */
-    public void reduce(WritableComparable key, Iterator values,
-                       OutputCollector output, Reporter reporter)
+    public void reduce(FloatWritable key, Iterator<SelectorEntry> values,
+                       OutputCollector<FloatWritable, SelectorEntry> output,
+                       Reporter reporter)
       throws IOException {
 
       while (values.hasNext() && count < limit) {
 
-        SelectorEntry entry = (SelectorEntry)values.next();
+        SelectorEntry entry = values.next();
         Text url = entry.url;
         String urlString = url.toString();
         URL u = null;
@@ -268,22 +268,23 @@
     }
   }
 
-  public static class SelectorInverseMapper extends MapReduceBase implements Mapper {
+  public static class SelectorInverseMapper extends MapReduceBase implements Mapper<FloatWritable, SelectorEntry, Text, SelectorEntry> {
 
-    public void map(WritableComparable key, Writable value, OutputCollector output, Reporter reporter) throws IOException {
+    public void map(FloatWritable key, SelectorEntry value, OutputCollector<Text, SelectorEntry> output, Reporter reporter) throws IOException {
       SelectorEntry entry = (SelectorEntry)value;
       output.collect(entry.url, entry);
     }
   }
 
-  public static class PartitionReducer extends MapReduceBase implements Reducer {
+  public static class PartitionReducer extends MapReduceBase
+      implements Reducer<Text, SelectorEntry, Text, CrawlDatum> {
 
-    public void reduce(WritableComparable key, Iterator values,
-        OutputCollector output, Reporter reporter) throws IOException {
+    public void reduce(Text key, Iterator<SelectorEntry> values,
+        OutputCollector<Text, CrawlDatum> output, Reporter reporter) throws IOException {
       // if using HashComparator, we get only one input key in case of hash collision
       // so use only URLs from values
       while (values.hasNext()) {
-        SelectorEntry entry = (SelectorEntry)values.next();
+        SelectorEntry entry = values.next();
         output.collect(entry.url, entry.datum);
       }
     }
@@ -323,27 +324,27 @@
   /**
    * Update the CrawlDB so that the next generate won't include the same URLs.
   */
-  public static class CrawlDbUpdater extends MapReduceBase implements Mapper, Reducer {
+  public static class CrawlDbUpdater extends MapReduceBase implements Mapper<WritableComparable, Writable, Text, CrawlDatum>, Reducer<Text, CrawlDatum, Text, CrawlDatum> {
    long generateTime;
 
     public void configure(JobConf job) {
       generateTime = job.getLong(Nutch.GENERATE_TIME_KEY, 0L);
     }
 
-    public void map(WritableComparable key, Writable value, OutputCollector output, Reporter reporter) throws IOException {
+    public void map(WritableComparable key, Writable value, OutputCollector<Text, CrawlDatum> output, Reporter reporter) throws IOException {
       if (key instanceof FloatWritable) { // tempDir source
         SelectorEntry se = (SelectorEntry)value;
         output.collect(se.url, se.datum);
       } else {
-        output.collect(key, value);
+        output.collect((Text)key, (CrawlDatum)value);
       }
     }
 
-    public void reduce(WritableComparable key, Iterator values, OutputCollector output, Reporter reporter) throws IOException {
+    public void reduce(Text key, Iterator<CrawlDatum> values, OutputCollector<Text, CrawlDatum> output, Reporter reporter) throws IOException {
       CrawlDatum orig = null;
       LongWritable genTime = null;
       while (values.hasNext()) {
-        CrawlDatum val = (CrawlDatum)values.next();
+        CrawlDatum val = values.next();
         if (val.getMetaData().containsKey(Nutch.WRITABLE_GENERATE_TIME_KEY)) {
           genTime = (LongWritable)val.getMetaData().get(Nutch.WRITABLE_GENERATE_TIME_KEY);
           if (genTime.get() != generateTime) {
@@ -359,13 +360,10 @@
         orig.getMetaData().put(Nutch.WRITABLE_GENERATE_TIME_KEY, genTime);
       }
       output.collect(key, orig);
-    }
-
+    }
   }
 
-  public Generator() {
-
-  }
+  public Generator() {}
 
   public Generator(Configuration conf) {
     setConf(conf);
@@ -564,7 +562,7 @@
    * Generate a fetchlist from the crawldb.
    */
   public static void main(String args[]) throws Exception {
-    int res = new Generator().doMain(NutchConfiguration.create(), args);
+    int res = ToolRunner.run(NutchConfiguration.create(), new Generator(), args);
     System.exit(res);
   }

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java?rev=638779&r1=638778&r2=638779&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java Wed Mar 19 03:34:14 2008
@@ -28,8 +28,7 @@
 import org.apache.hadoop.fs.*;
 import org.apache.hadoop.conf.*;
 import org.apache.hadoop.mapred.*;
-import org.apache.hadoop.util.StringUtils;
-import org.apache.hadoop.util.ToolBase;
+import org.apache.hadoop.util.*;
 
 import org.apache.nutch.net.*;
 import org.apache.nutch.scoring.ScoringFilterException;
@@ -39,12 +38,12 @@
 
 /** This class takes a flat file of URLs and adds them to the of pages to be
  * crawled.  Useful for bootstrapping the system. */
-public class Injector extends ToolBase {
+public class Injector extends Configured implements Tool {
   public static final Log LOG = LogFactory.getLog(Injector.class);
 
   /** Normalize and filter injected urls. */
-  public static class InjectMapper implements Mapper {
+  public static class InjectMapper implements Mapper<WritableComparable, Text, Text, CrawlDatum> {
     private URLNormalizers urlNormalizers;
     private int interval;
     private float scoreInjected;
@@ -65,12 +64,10 @@
 
     public void close() {}
 
-    public void map(WritableComparable key, Writable val,
-                    OutputCollector output, Reporter reporter)
+    public void map(WritableComparable key, Text value,
+                    OutputCollector<Text, CrawlDatum> output, Reporter reporter)
       throws IOException {
-      Text value = (Text)val;
       String url = value.toString();              // value is line of text
-      // System.out.println("url: " +url);
       try {
         url = urlNormalizers.normalize(url, URLNormalizers.SCOPE_INJECT);
         url = filters.filter(url);             // filter the url
@@ -98,17 +95,17 @@
   }
 
   /** Combine multiple new entries for a url. */
-  public static class InjectReducer implements Reducer {
+  public static class InjectReducer implements Reducer<Text, CrawlDatum, Text, CrawlDatum> {
     public void configure(JobConf job) {}
     public void close() {}
 
-    public void reduce(WritableComparable key, Iterator values,
-                       OutputCollector output, Reporter reporter)
+    public void reduce(Text key, Iterator<CrawlDatum> values,
+                       OutputCollector<Text, CrawlDatum> output, Reporter reporter)
       throws IOException {
       CrawlDatum old = null;
       CrawlDatum injected = null;
       while (values.hasNext()) {
-        CrawlDatum val = (CrawlDatum)values.next();
+        CrawlDatum val = values.next();
         if (val.getStatus() == CrawlDatum.STATUS_INJECTED) {
           injected = val;
           injected.setStatus(CrawlDatum.STATUS_DB_UNFETCHED);
@@ -124,9 +121,7 @@
     }
   }
 
-  public Injector() {
-
-  }
+  public Injector() {}
 
   public Injector(Configuration conf) {
     setConf(conf);
@@ -179,7 +174,7 @@
   }
 
   public static void main(String[] args) throws Exception {
-    int res = new Injector().doMain(NutchConfiguration.create(), args);
+    int res = ToolRunner.run(NutchConfiguration.create(), new Injector(), args);
     System.exit(res);
  }
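The signature changes in Generator and Injector above show the second half of NUTCH-598: the org.apache.hadoop.mapred interfaces are now parameterized (Mapper<K1,V1,K2,V2>, Reducer<K1,V1,K2,V2>), which removes the raw WritableComparable/Writable parameters and the manual casts inside map() and reduce(). A minimal typed reducer in the same style (FirstValueReducer is an illustrative name; it keeps the first CrawlDatum per key, much as ParseSegment's reduce below keeps the first value):

    import java.io.IOException;
    import java.util.Iterator;

    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapred.MapReduceBase;
    import org.apache.hadoop.mapred.OutputCollector;
    import org.apache.hadoop.mapred.Reducer;
    import org.apache.hadoop.mapred.Reporter;
    import org.apache.nutch.crawl.CrawlDatum;

    public class FirstValueReducer extends MapReduceBase
        implements Reducer<Text, CrawlDatum, Text, CrawlDatum> {

      public void reduce(Text key, Iterator<CrawlDatum> values,
          OutputCollector<Text, CrawlDatum> output, Reporter reporter)
          throws IOException {
        // Typed iterator: no '(CrawlDatum) values.next()' cast needed.
        if (values.hasNext()) {
          output.collect(key, values.next());
        }
      }
    }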
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java?rev=638779&r1=638778&r2=638779&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java Wed Mar 19 03:34:14 2008
@@ -30,8 +30,7 @@
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.conf.*;
 import org.apache.hadoop.mapred.*;
-import org.apache.hadoop.util.StringUtils;
-import org.apache.hadoop.util.ToolBase;
+import org.apache.hadoop.util.*;
 
 import org.apache.nutch.net.URLFilters;
 import org.apache.nutch.net.URLNormalizers;
@@ -42,7 +41,7 @@
 import org.apache.nutch.util.NutchJob;
 
 /** Maintains an inverted link map, listing incoming links for each url. */
-public class LinkDb extends ToolBase implements Mapper {
+public class LinkDb extends Configured implements Tool, Mapper<Text, ParseData, Text, Inlinks> {
 
   public static final Log LOG = LogFactory.getLog(LinkDb.class);
 
@@ -54,9 +53,7 @@
   private URLFilters urlFilters;
   private URLNormalizers urlNormalizers;
 
-  public LinkDb() {
-
-  }
+  public LinkDb() {}
 
   public LinkDb(Configuration conf) {
     setConf(conf);
@@ -75,8 +72,8 @@
 
   public void close() {}
 
-  public void map(WritableComparable key, Writable value,
-                  OutputCollector output, Reporter reporter)
+  public void map(Text key, ParseData parseData,
+                  OutputCollector<Text, Inlinks> output, Reporter reporter)
     throws IOException {
     String fromUrl = key.toString();
     String fromHost = getHost(fromUrl);
@@ -97,7 +94,6 @@
       }
     }
     if (fromUrl == null) return; // discard all outlinks
-    ParseData parseData = (ParseData)value;
     Outlink[] outlinks = parseData.getOutlinks();
     Inlinks inlinks = new Inlinks();
     for (int i = 0; i < outlinks.length; i++) {
@@ -147,8 +143,8 @@
 
   public void invert(Path linkDb, final Path segmentsDir, boolean normalize, boolean filter, boolean force) throws IOException {
     final FileSystem fs = FileSystem.get(getConf());
-    Path[] files = fs.listPaths(segmentsDir, HadoopFSUtil.getPassDirectoriesFilter(fs));
-    invert(linkDb, files, normalize, filter, force);
+    FileStatus[] files = fs.listStatus(segmentsDir, HadoopFSUtil.getPassDirectoriesFilter(fs));
+    invert(linkDb, HadoopFSUtil.getPaths(files), normalize, filter, force);
   }
 
   public void invert(Path linkDb, Path[] segments, boolean normalize, boolean filter, boolean force) throws IOException {
@@ -249,7 +245,7 @@
   }
 
   public static void main(String[] args) throws Exception {
-    int res = new LinkDb().doMain(NutchConfiguration.create(), args);
+    int res = ToolRunner.run(NutchConfiguration.create(), new LinkDb(), args);
     System.exit(res);
   }
 
@@ -265,7 +261,7 @@
       return -1;
     }
     Path segDir = null;
-    final FileSystem fs = FileSystem.get(conf);
+    final FileSystem fs = FileSystem.get(getConf());
     Path db = new Path(args[0]);
     ArrayList<Path> segs = new ArrayList<Path>();
     boolean filter = true;
@@ -274,15 +270,8 @@
     for (int i = 1; i < args.length; i++) {
       if (args[i].equals("-dir")) {
         segDir = new Path(args[++i]);
-        Path[] files = fs.listPaths(segDir, new PathFilter() {
-          public boolean accept(Path f) {
-            try {
-              if (fs.getFileStatus(f).isDir()) return true;
-            } catch (IOException ioe) {};
-            return false;
-          }
-        });
-        if (files != null) segs.addAll(Arrays.asList(files));
+        FileStatus[] files = fs.listStatus(segDir, HadoopFSUtil.getPassDirectoriesFilter(fs));
+        if (files != null) segs.addAll(Arrays.asList(HadoopFSUtil.getPaths(files)));
         break;
       } else if (args[i].equalsIgnoreCase("-noNormalize")) {
         normalize = false;

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbMerger.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbMerger.java?rev=638779&r1=638778&r2=638779&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbMerger.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbMerger.java Wed Mar 19 03:34:14 2008
@@ -24,10 +24,10 @@
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.Text;
-import org.apache.hadoop.io.WritableComparable;
 import org.apache.hadoop.mapred.JobClient;
 import org.apache.hadoop.mapred.JobConf;
 import org.apache.hadoop.mapred.MapFileOutputFormat;
@@ -36,7 +36,8 @@
 import org.apache.hadoop.mapred.Reporter;
 import org.apache.hadoop.mapred.SequenceFileInputFormat;
 import org.apache.hadoop.util.StringUtils;
-import org.apache.hadoop.util.ToolBase;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.util.ToolRunner;
 
 import org.apache.nutch.util.NutchConfiguration;
 import org.apache.nutch.util.NutchJob;
@@ -58,7 +59,7 @@
  *
  * @author Andrzej Bialecki
 */
-public class LinkDbMerger extends ToolBase implements Reducer {
+public class LinkDbMerger extends Configured implements Tool, Reducer<Text, Inlinks, Text, Inlinks> {
   private static final Log LOG = LogFactory.getLog(LinkDbMerger.class);
 
   private int maxInlinks;
@@ -71,12 +72,12 @@
     setConf(conf);
   }
 
-  public void reduce(WritableComparable key, Iterator values, OutputCollector output, Reporter reporter) throws IOException {
+  public void reduce(Text key, Iterator<Inlinks> values, OutputCollector<Text, Inlinks> output, Reporter reporter) throws IOException {
 
     Inlinks result = new Inlinks();
 
     while (values.hasNext()) {
-      Inlinks inlinks = (Inlinks)values.next();
+      Inlinks inlinks = values.next();
 
       int end = Math.min(maxInlinks - result.size(), inlinks.size());
       Iterator<Inlink> it = inlinks.iterator();
@@ -135,7 +136,7 @@
    * @param args
    */
   public static void main(String[] args) throws Exception {
-    int res = new LinkDbMerger().doMain(NutchConfiguration.create(), args);
+    int res = ToolRunner.run(NutchConfiguration.create(), new LinkDbMerger(), args);
     System.exit(res);
  }
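LinkDbMerger's typed reduce() also illustrates its capped merge nicely: it concatenates the Inlinks values for a URL but stops once a configured maximum (maxInlinks) is reached. The loop in isolation, lifted into a helper (CappedInlinksMerge is an illustrative name, not part of the patch):

    import java.io.IOException;
    import java.util.Iterator;

    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapred.OutputCollector;
    import org.apache.hadoop.mapred.Reporter;
    import org.apache.nutch.crawl.Inlink;
    import org.apache.nutch.crawl.Inlinks;

    public class CappedInlinksMerge {
      /** Merges Inlinks values for one key, keeping at most maxInlinks. */
      public static void merge(Text key, Iterator<Inlinks> values, int maxInlinks,
          OutputCollector<Text, Inlinks> output, Reporter reporter) throws IOException {
        Inlinks result = new Inlinks();
        while (values.hasNext()) {
          Inlinks inlinks = values.next();
          // Take only as many inlinks as still fit under the cap.
          int end = Math.min(maxInlinks - result.size(), inlinks.size());
          Iterator<Inlink> it = inlinks.iterator();
          int taken = 0;
          while (it.hasNext() && taken++ < end) {
            result.add(it.next());
          }
          if (result.size() >= maxInlinks) break;
        }
        output.collect(key, result);
      }
    }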
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbReader.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbReader.java?rev=638779&r1=638778&r2=638779&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbReader.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbReader.java Wed Mar 19 03:34:14 2008
@@ -23,12 +23,12 @@
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configured;
 import org.apache.hadoop.io.*;
 import org.apache.hadoop.fs.*;
 import org.apache.hadoop.mapred.*;
 import org.apache.hadoop.mapred.lib.HashPartitioner;
-import org.apache.hadoop.util.StringUtils;
-import org.apache.hadoop.util.ToolBase;
+import org.apache.hadoop.util.*;
 import org.apache.hadoop.conf.Configuration;
 
 import org.apache.nutch.util.NutchConfiguration;
@@ -37,10 +37,10 @@
 import java.util.Iterator;
 
 /** . */
-public class LinkDbReader extends ToolBase implements Closeable {
+public class LinkDbReader extends Configured implements Tool, Closeable {
   public static final Log LOG = LogFactory.getLog(LinkDbReader.class);
 
-  private static final Partitioner PARTITIONER = new HashPartitioner();
+  private static final Partitioner<WritableComparable, Writable> PARTITIONER = new HashPartitioner<WritableComparable, Writable>();
 
   private FileSystem fs;
   private Path directory;
@@ -111,7 +111,7 @@
   }
 
   public static void main(String[] args) throws Exception {
-    int res = new LinkDbReader().doMain(NutchConfiguration.create(), args);
+    int res = ToolRunner.run(NutchConfiguration.create(), new LinkDbReader(), args);
     System.exit(res);
   }

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/PartitionUrlByHost.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/PartitionUrlByHost.java?rev=638779&r1=638778&r2=638779&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/PartitionUrlByHost.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/PartitionUrlByHost.java Wed Mar 19 03:34:14 2008
@@ -27,7 +27,7 @@
 import org.apache.nutch.net.URLNormalizers;
 
 /** Partition urls by hostname. */
-public class PartitionUrlByHost implements Partitioner {
+public class PartitionUrlByHost implements Partitioner<Text, Writable> {
   private static final Log LOG = LogFactory.getLog(PartitionUrlByHost.class);
 
   private int seed;
@@ -41,9 +41,9 @@
   public void close() {}
 
   /** Hash by hostname. */
-  public int getPartition(WritableComparable key, Writable value,
+  public int getPartition(Text key, Writable value,
                           int numReduceTasks) {
-    String urlString = ((Text)key).toString();
+    String urlString = key.toString();
     try {
       urlString = normalizers.normalize(urlString, URLNormalizers.SCOPE_PARTITION);
     } catch (Exception e) {

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java?rev=638779&r1=638778&r2=638779&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Wed Mar 19 03:34:14 2008
@@ -29,8 +29,7 @@
 import org.apache.hadoop.fs.*;
 import org.apache.hadoop.conf.*;
 import org.apache.hadoop.mapred.*;
-import org.apache.hadoop.util.StringUtils;
-import org.apache.hadoop.util.ToolBase;
+import org.apache.hadoop.util.*;
 
 import org.apache.nutch.crawl.CrawlDatum;
 import org.apache.nutch.crawl.NutchWritable;
@@ -45,7 +44,7 @@
 
 /** The fetcher. Most of the work is done by plugins. */
-public class Fetcher extends ToolBase implements MapRunnable {
+public class Fetcher extends Configured implements Tool, MapRunnable<WritableComparable, Writable, Text, NutchWritable> {
 
   public static final Log LOG = LogFactory.getLog(Fetcher.class);
 
@@ -55,7 +54,7 @@
 
   public static final String PROTOCOL_REDIR = "protocol";
 
-  public static class InputFormat extends SequenceFileInputFormat {
+  public static class InputFormat extends SequenceFileInputFormat<WritableComparable, Writable> {
     /** Don't split inputs, to keep things polite. */
    public InputSplit[] getSplits(JobConf job, int nSplits)
      throws IOException {
@@ -69,8 +68,8 @@
     }
   }
 
-  private RecordReader input;
-  private OutputCollector output;
+  private RecordReader<WritableComparable, Writable> input;
+  private OutputCollector<Text, NutchWritable> output;
   private Reporter reporter;
 
   private String segmentName;
@@ -455,7 +454,7 @@
     return conf.getBoolean("fetcher.store.content", true);
   }
 
-  public void run(RecordReader input, OutputCollector output,
+  public void run(RecordReader<WritableComparable, Writable> input, OutputCollector<Text, NutchWritable> output,
                   Reporter reporter) throws IOException {
 
     this.input = input;
@@ -529,7 +528,7 @@
 
   /** Run the fetcher. */
   public static void main(String[] args) throws Exception {
-    int res = new Fetcher().doMain(NutchConfiguration.create(), args);
+    int res = ToolRunner.run(NutchConfiguration.create(), new Fetcher(), args);
     System.exit(res);
  }
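PartitionUrlByHost above now implements Partitioner<Text, Writable>, so getPartition() receives the URL as Text directly. For reference, a self-contained typed partitioner of the same shape (HostHashPartitioner and its naive host parsing are illustrative only; the real class normalizes with URLNormalizers and java.net.URL):

    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.io.Writable;
    import org.apache.hadoop.mapred.JobConf;
    import org.apache.hadoop.mapred.Partitioner;

    public class HostHashPartitioner implements Partitioner<Text, Writable> {
      public void configure(JobConf job) {}

      public int getPartition(Text key, Writable value, int numReduceTasks) {
        // Typed key: no '(Text) key' cast, as in the patched getPartition().
        String host = hostOf(key.toString());
        return (host.hashCode() & Integer.MAX_VALUE) % numReduceTasks;
      }

      // Crude host extraction, just for the sketch.
      private static String hostOf(String url) {
        int p = url.indexOf("://");
        String rest = p < 0 ? url : url.substring(p + 3);
        int slash = rest.indexOf('/');
        return slash < 0 ? rest : rest.substring(0, slash);
      }
    }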
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java?rev=638779&r1=638778&r2=638779&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java Wed Mar 19 03:34:14 2008
@@ -28,9 +28,7 @@
 import org.apache.hadoop.fs.*;
 import org.apache.hadoop.conf.*;
 import org.apache.hadoop.mapred.*;
-import org.apache.hadoop.util.Progressable;
-import org.apache.hadoop.util.StringUtils;
-import org.apache.hadoop.util.ToolBase;
+import org.apache.hadoop.util.*;
 
 import org.apache.nutch.util.NutchConfiguration;
 import org.apache.nutch.util.NutchJob;
@@ -63,8 +61,8 @@
  *
  * @author Andrzej Bialecki
 */
-public class DeleteDuplicates extends ToolBase
-  implements Mapper, Reducer, OutputFormat {
+public class DeleteDuplicates extends Configured
+  implements Tool, Mapper<WritableComparable, Writable, Text, IntWritable>, Reducer<Text, IntWritable, WritableComparable, Writable>, OutputFormat<WritableComparable, Writable> {
   private static final Log LOG = LogFactory.getLog(DeleteDuplicates.class);
 
   // Algorithm:
@@ -141,7 +139,7 @@
 
   }
 
-  public static class InputFormat extends InputFormatBase {
+  public static class InputFormat extends FileInputFormat<Text, IndexDoc> {
     private static final long INDEX_LENGTH = Integer.MAX_VALUE;
 
     /** Return each index as a split. */
@@ -155,7 +153,7 @@
       return splits;
     }
 
-    public class DDRecordReader implements RecordReader {
+    public class DDRecordReader implements RecordReader<Text, IndexDoc> {
 
       private IndexReader indexReader;
       private int maxDoc = 0;
@@ -174,7 +172,7 @@
         this.index = index;
       }
 
-      public boolean next(WritableComparable key, Writable value)
+      public boolean next(Text key, IndexDoc indexDoc)
        throws IOException {
 
        // skip empty indexes
@@ -189,9 +187,8 @@
        Document document = indexReader.document(doc);
 
        // fill in key
-        ((Text)key).set(document.get("url"));
+        key.set(document.get("url"));
        // fill in value
-        IndexDoc indexDoc = (IndexDoc)value;
        indexDoc.keep = true;
        indexDoc.url.set(document.get("url"));
        indexDoc.hash.setDigest(document.get("digest"));
@@ -226,11 +223,11 @@
         indexReader.close();
       }
 
-      public WritableComparable createKey() {
+      public Text createKey() {
         return new Text();
       }
 
-      public Writable createValue() {
+      public IndexDoc createValue() {
         return new IndexDoc();
       }
 
@@ -240,7 +237,7 @@
     }
 
     /** Return each index as a split. */
-    public RecordReader getRecordReader(InputSplit split,
+    public RecordReader<Text, IndexDoc> getRecordReader(InputSplit split,
                                         JobConf job,
                                         Reporter reporter) throws IOException {
       FileSplit fsplit = (FileSplit)split;
@@ -250,27 +247,27 @@
     }
   }
 
-  public static class HashPartitioner implements Partitioner {
+  public static class HashPartitioner implements Partitioner<MD5Hash, Writable> {
     public void configure(JobConf job) {}
     public void close() {}
-    public int getPartition(WritableComparable key, Writable value,
+    public int getPartition(MD5Hash key, Writable value,
                             int numReduceTasks) {
-      int hashCode = ((MD5Hash)key).hashCode();
+      int hashCode = key.hashCode();
       return (hashCode & Integer.MAX_VALUE) % numReduceTasks;
     }
   }
 
-  public static class UrlsReducer implements Reducer {
+  public static class UrlsReducer implements Reducer<Text, IndexDoc, MD5Hash, IndexDoc> {
 
     public void configure(JobConf job) {}
 
     public void close() {}
 
-    public void reduce(WritableComparable key, Iterator values,
-        OutputCollector output, Reporter reporter) throws IOException {
+    public void reduce(Text key, Iterator<IndexDoc> values,
+        OutputCollector<MD5Hash, IndexDoc> output, Reporter reporter) throws IOException {
      IndexDoc latest = null;
      while (values.hasNext()) {
-        IndexDoc value = (IndexDoc)values.next();
+        IndexDoc value = values.next();
        if (latest == null) {
          latest = value;
          continue;
@@ -296,7 +293,7 @@
     }
   }
 
-  public static class HashReducer implements Reducer {
+  public static class HashReducer implements Reducer<MD5Hash, IndexDoc, Text, IndexDoc> {
     boolean byScore;
 
     public void configure(JobConf job) {
@@ -304,12 +301,12 @@
     }
 
     public void close() {}
-    public void reduce(WritableComparable key, Iterator values,
-        OutputCollector output, Reporter reporter)
+    public void reduce(MD5Hash key, Iterator<IndexDoc> values,
+        OutputCollector<Text, IndexDoc> output, Reporter reporter)
      throws IOException {
      IndexDoc highest = null;
      while (values.hasNext()) {
-        IndexDoc value = (IndexDoc)values.next();
+        IndexDoc value = values.next();
        // skip already deleted
        if (!value.keep) {
          LOG.debug("-discard " + value + " (already marked)");
@@ -355,7 +352,7 @@
   public void setConf(Configuration conf) {
     super.setConf(conf);
     try {
-      fs = FileSystem.get(conf);
+      if(conf != null) fs = FileSystem.get(conf);
     } catch (IOException e) {
       throw new RuntimeException(e);
     }
@@ -365,7 +362,7 @@
   }
 
   /** Map [*,IndexDoc] pairs to [index,doc] pairs. */
   public void map(WritableComparable key, Writable value,
-                  OutputCollector output, Reporter reporter)
+                  OutputCollector<Text, IntWritable> output, Reporter reporter)
     throws IOException {
     IndexDoc indexDoc = (IndexDoc)value;
     // don't delete these
@@ -375,14 +372,14 @@
   }
 
   /** Delete docs named in values from index named in key. */
-  public void reduce(WritableComparable key, Iterator values,
-                     OutputCollector output, Reporter reporter)
+  public void reduce(Text key, Iterator<IntWritable> values,
+                     OutputCollector<WritableComparable, Writable> output, Reporter reporter)
     throws IOException {
     Path index = new Path(key.toString());
     IndexReader reader = IndexReader.open(new FsDirectory(fs, index, false, getConf()));
     try {
       while (values.hasNext()) {
-        IntWritable value = (IntWritable)values.next();
+        IntWritable value = values.next();
         LOG.debug("-delete " + index + " doc=" + value);
         reader.deleteDocument(value.get());
       }
@@ -392,11 +389,11 @@
   }
 
   /** Write nothing. */
-  public RecordWriter getRecordWriter(final FileSystem fs,
+  public RecordWriter<WritableComparable, Writable> getRecordWriter(final FileSystem fs,
                                       final JobConf job,
                                       final String name,
                                       final Progressable progress) throws IOException {
-    return new RecordWriter() {
+    return new RecordWriter<WritableComparable, Writable>() {
       public void write(WritableComparable key, Writable value)
         throws IOException {
         throw new UnsupportedOperationException();
@@ -496,7 +493,7 @@
   }
 
   public static void main(String[] args) throws Exception {
-    int res = new DeleteDuplicates().doMain(NutchConfiguration.create(), args);
+    int res = ToolRunner.run(NutchConfiguration.create(), new DeleteDuplicates(), args);
     System.exit(res);
  }
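DDRecordReader above shows how RecordReader was generified: createKey()/createValue() now return the concrete types and next() fills them in without casts. The same shape on a toy input, for reference (CountingRecordReader is illustrative; it emits ("row-i", i) for i in [0, n)):

    import java.io.IOException;

    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapred.RecordReader;

    public class CountingRecordReader implements RecordReader<Text, IntWritable> {
      private final int n;
      private int pos = 0;

      public CountingRecordReader(int n) { this.n = n; }

      public boolean next(Text key, IntWritable value) throws IOException {
        if (pos >= n) return false;
        key.set("row-" + pos);   // fill in key, as DDRecordReader fills the URL
        value.set(pos);          // fill in value, no '(IndexDoc) value' cast
        pos++;
        return true;
      }

      public Text createKey() { return new Text(); }
      public IntWritable createValue() { return new IntWritable(); }
      public long getPos() { return pos; }
      public float getProgress() { return n == 0 ? 1.0f : (float) pos / n; }
      public void close() {}
    }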
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexMerger.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexMerger.java?rev=638779&r1=638778&r2=638779&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexMerger.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexMerger.java Wed Mar 19 03:34:14 2008
@@ -25,8 +25,7 @@
 
 import org.apache.hadoop.fs.*;
 import org.apache.hadoop.mapred.FileAlreadyExistsException;
-import org.apache.hadoop.util.StringUtils;
-import org.apache.hadoop.util.ToolBase;
+import org.apache.hadoop.util.*;
 import org.apache.hadoop.conf.*;
 
 import org.apache.nutch.util.HadoopFSUtil;
@@ -43,7 +42,7 @@
  * @author Doug Cutting
 * @author Mike Cafarella
*************************************************************************/
-public class IndexMerger extends ToolBase {
+public class IndexMerger extends Configured implements Tool {
   public static final Log LOG = LogFactory.getLog(IndexMerger.class);
 
   public static final String DONE_NAME = "merge.done";
@@ -81,17 +80,17 @@
     Directory[] dirs = new Directory[indexes.length];
     for (int i = 0; i < indexes.length; i++) {
       if (LOG.isInfoEnabled()) { LOG.info("Adding " + indexes[i]); }
-      dirs[i] = new FsDirectory(fs, indexes[i], false, this.conf);
+      dirs[i] = new FsDirectory(fs, indexes[i], false, getConf());
     }
 
     //
     // Merge indices
     //
     IndexWriter writer = new IndexWriter(localOutput.toString(), null, true);
-    writer.setMergeFactor(conf.getInt("indexer.mergeFactor", IndexWriter.DEFAULT_MERGE_FACTOR));
-    writer.setMaxBufferedDocs(conf.getInt("indexer.minMergeDocs", IndexWriter.DEFAULT_MAX_BUFFERED_DOCS));
-    writer.setMaxMergeDocs(conf.getInt("indexer.maxMergeDocs", IndexWriter.DEFAULT_MAX_MERGE_DOCS));
-    writer.setTermIndexInterval(conf.getInt("indexer.termIndexInterval", IndexWriter.DEFAULT_TERM_INDEX_INTERVAL));
+    writer.setMergeFactor(getConf().getInt("indexer.mergeFactor", IndexWriter.DEFAULT_MERGE_FACTOR));
+    writer.setMaxBufferedDocs(getConf().getInt("indexer.minMergeDocs", IndexWriter.DEFAULT_MAX_BUFFERED_DOCS));
+    writer.setMaxMergeDocs(getConf().getInt("indexer.maxMergeDocs", IndexWriter.DEFAULT_MAX_MERGE_DOCS));
+    writer.setTermIndexInterval(getConf().getInt("indexer.termIndexInterval", IndexWriter.DEFAULT_TERM_INDEX_INTERVAL));
     writer.setInfoStream(LogUtil.getDebugStream(LOG));
     writer.setUseCompoundFile(false);
     writer.setSimilarity(new NutchSimilarity());
@@ -109,7 +108,7 @@
    * Create an index for the input files in the named directory.
   */
   public static void main(String[] args) throws Exception {
-    int res = new IndexMerger().doMain(NutchConfiguration.create(), args);
+    int res = ToolRunner.run(NutchConfiguration.create(), new IndexMerger(), args);
     System.exit(res);
   }
 
@@ -123,7 +122,7 @@
     //
     // Parse args, read all index directories to be processed
     //
-    FileSystem fs = FileSystem.get(conf);
+    FileSystem fs = FileSystem.get(getConf());
     List<Path> indexDirs = new ArrayList<Path>();
 
     Path workDir = new Path("indexmerger-" + System.currentTimeMillis());
@@ -152,7 +151,7 @@
       LOG.fatal("IndexMerger: " + StringUtils.stringifyException(e));
       return -1;
     } finally {
-      FileSystem.getLocal(conf).delete(workDir);
+      FileSystem.getLocal(getConf()).delete(workDir);
     }
   }
 }

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSorter.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSorter.java?rev=638779&r1=638778&r2=638779&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSorter.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSorter.java Wed Mar 19 03:34:14 2008
@@ -32,12 +32,12 @@
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.util.StringUtils;
-import org.apache.hadoop.util.ToolBase;
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.util.*;
 
 /** Sort a Nutch index by page score.  Higher scoring documents are assigned
  * smaller document numbers. */
-public class IndexSorter extends ToolBase {
+public class IndexSorter extends Configured implements Tool {
   private static final Log LOG = LogFactory.getLog(IndexSorter.class);
 
   private static class PostingMap implements Comparable<PostingMap> {
@@ -300,7 +300,7 @@
 
   /** */
   public static void main(String[] args) throws Exception {
-    int res = new IndexSorter().doMain(NutchConfiguration.create(), args);
+    int res = ToolRunner.run(NutchConfiguration.create(), new IndexSorter(), args);
     System.exit(res);
   }

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java?rev=638779&r1=638778&r2=638779&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java Wed Mar 19 03:34:14 2008
@@ -27,9 +27,7 @@
 import org.apache.hadoop.fs.*;
 import org.apache.hadoop.conf.*;
 import org.apache.hadoop.mapred.*;
-import org.apache.hadoop.util.Progressable;
-import org.apache.hadoop.util.StringUtils;
-import org.apache.hadoop.util.ToolBase;
+import org.apache.hadoop.util.*;
 
 import org.apache.nutch.parse.*;
 import org.apache.nutch.analysis.*;
@@ -51,7 +49,7 @@
 import org.apache.nutch.metadata.Nutch;
 
 /** Create indexes for segments. */
-public class Indexer extends ToolBase implements Reducer, Mapper {
+public class Indexer extends Configured implements Tool, Reducer<Text, NutchWritable, Text, Writable>, Mapper<Text, Writable, Text, NutchWritable> {
 
   public static final String DONE_NAME = "index.done";
 
@@ -85,8 +83,8 @@
 
   /** Unwrap Lucene Documents created by reduce and add them to an index. */
   public static class OutputFormat
-    extends org.apache.hadoop.mapred.OutputFormatBase {
-    public RecordWriter getRecordWriter(final FileSystem fs, JobConf job,
+    extends org.apache.hadoop.mapred.OutputFormatBase<WritableComparable, LuceneDocumentWrapper> {
+    public RecordWriter<WritableComparable, LuceneDocumentWrapper> getRecordWriter(final FileSystem fs, JobConf job,
                                         String name, final Progressable progress)
      throws IOException {
      final Path perm = new Path(job.getOutputPath(), name);
      final Path temp =
@@ -109,12 +107,12 @@
       writer.setUseCompoundFile(false);
       writer.setSimilarity(new NutchSimilarity());
 
-      return new RecordWriter() {
+      return new RecordWriter<WritableComparable, LuceneDocumentWrapper>() {
           boolean closed;
 
-          public void write(WritableComparable key, Writable value)
+          public void write(WritableComparable key, LuceneDocumentWrapper value)
            throws IOException {                  // unwrap & index doc
-            Document doc = ((LuceneDocumentWrapper) value).get();
+            Document doc = value.get();
            NutchAnalyzer analyzer = factory.get(doc.get("lang"));
            if (LOG.isInfoEnabled()) {
              LOG.info(" Indexing [" + doc.getField("url").stringValue() + "]" +
@@ -174,8 +172,8 @@
 
   public void close() {}
 
-  public void reduce(WritableComparable key, Iterator values,
-                     OutputCollector output, Reporter reporter)
+  public void reduce(Text key, Iterator<NutchWritable> values,
+                     OutputCollector<Text, Writable> output, Reporter reporter)
     throws IOException {
     Inlinks inlinks = null;
     CrawlDatum dbDatum = null;
@@ -183,7 +181,7 @@
     ParseData parseData = null;
     ParseText parseText = null;
     while (values.hasNext()) {
-      Writable value = ((NutchWritable)values.next()).get(); // unwrap
+      Writable value = values.next().get(); // unwrap
       if (value instanceof Inlinks) {
         inlinks = (Inlinks)value;
       } else if (value instanceof CrawlDatum) {
@@ -248,7 +246,7 @@
         fetchDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY, url);
       }
       // run indexing filters
-      doc = this.filters.filter(doc, parse, (Text)key, fetchDatum, inlinks);
+      doc = this.filters.filter(doc, parse, key, fetchDatum, inlinks);
     } catch (IndexingException e) {
       if (LOG.isWarnEnabled()) { LOG.warn("Error indexing "+key+": "+e); }
       return;
@@ -315,7 +313,7 @@
   }
 
   public static void main(String[] args) throws Exception {
-    int res = new Indexer().doMain(NutchConfiguration.create(), args);
+    int res = ToolRunner.run(NutchConfiguration.create(), new Indexer(), args);
     System.exit(res);
   }
 
@@ -341,8 +339,8 @@
     }
   }
 
-  public void map(WritableComparable key, Writable value,
-      OutputCollector output, Reporter reporter) throws IOException {
+  public void map(Text key, Writable value,
+      OutputCollector<Text, NutchWritable> output, Reporter reporter) throws IOException {
     output.collect(key, new NutchWritable(value));
  }
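IndexMerger above also shows the third mechanical change in this patch: ToolBase exposed a protected conf field, while Configured keeps its Configuration private, so direct conf references become getConf() calls. Sketched in isolation (GetConfSketch is an illustrative name):

    import java.io.IOException;

    import org.apache.hadoop.conf.Configured;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.util.Tool;

    public class GetConfSketch extends Configured implements Tool {
      public int run(String[] args) throws IOException {
        // was: FileSystem.get(conf) / conf.getInt(...) under ToolBase
        FileSystem fs = FileSystem.get(getConf());
        int mergeFactor = getConf().getInt("indexer.mergeFactor", 10);
        System.out.println("fs=" + fs + ", mergeFactor=" + mergeFactor);
        return 0;
      }
    }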
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java?rev=638779&r1=638778&r2=638779&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java Wed Mar 19 03:34:14 2008
@@ -23,7 +23,7 @@
 import org.apache.nutch.crawl.SignatureFactory;
 import org.apache.hadoop.io.*;
 import org.apache.hadoop.mapred.*;
-import org.apache.hadoop.util.StringUtils;
+import org.apache.hadoop.util.*;
 import org.apache.hadoop.conf.*;
 import org.apache.nutch.metadata.Nutch;
 import org.apache.nutch.protocol.*;
@@ -37,7 +37,7 @@
 import java.util.Map.Entry;
 
 /* Parse content in a segment. */
-public class ParseSegment extends Configured implements Mapper, Reducer {
+public class ParseSegment extends Configured implements Tool, Mapper<WritableComparable, Content, Text, ParseImpl>, Reducer<Text, Writable, Text, Writable> {
 
   public static final Log LOG = LogFactory.getLog(Parser.class);
 
@@ -60,15 +60,14 @@
 
   private Text newKey = new Text();
 
-  public void map(WritableComparable key, Writable value,
-                  OutputCollector output, Reporter reporter)
+  public void map(WritableComparable key, Content content,
+                  OutputCollector<Text, ParseImpl> output, Reporter reporter)
     throws IOException {
     // convert on the fly from old UTF8 keys
     if (key instanceof UTF8) {
       newKey.set(key.toString());
       key = newKey;
     }
-    Content content = (Content) value;
 
     ParseResult parseResult = null;
     try {
@@ -111,8 +110,8 @@
     }
   }
 
-  public void reduce(WritableComparable key, Iterator values,
-                     OutputCollector output, Reporter reporter)
+  public void reduce(Text key, Iterator<Writable> values,
+                     OutputCollector<Text, Writable> output, Reporter reporter)
     throws IOException {
     output.collect(key, (Writable)values.next()); // collect first value
   }
@@ -144,6 +143,11 @@
 
 
   public static void main(String[] args) throws Exception {
+    int res = ToolRunner.run(NutchConfiguration.create(), new ParseSegment(), args);
+    System.exit(res);
+  }
+
+  public int run(String[] args) throws Exception {
     Path segment;
 
     String usage = "Usage: ParseSegment segment";
@@ -151,11 +155,9 @@
     if (args.length == 0) {
       System.err.println(usage);
       System.exit(-1);
-    }
-
+    }
     segment = new Path(args[0]);
-
-    ParseSegment parseSegment = new ParseSegment(NutchConfiguration.create());
-    parseSegment.parse(segment);
+    parse(segment);
+    return 0;
   }
 }

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/tools/FreeGenerator.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/tools/FreeGenerator.java?rev=638779&r1=638778&r2=638779&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/tools/FreeGenerator.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/tools/FreeGenerator.java Wed Mar 19 03:34:14 2008
@@ -25,8 +25,8 @@
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.conf.Configured;
 import org.apache.hadoop.io.Text;
-import org.apache.hadoop.io.Writable;
 import org.apache.hadoop.io.WritableComparable;
 import org.apache.hadoop.mapred.JobClient;
 import org.apache.hadoop.mapred.JobConf;
@@ -38,7 +38,8 @@
 import org.apache.hadoop.mapred.SequenceFileOutputFormat;
 import org.apache.hadoop.mapred.TextInputFormat;
 import org.apache.hadoop.util.StringUtils;
-import org.apache.hadoop.util.ToolBase;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.util.ToolRunner;
 import org.apache.nutch.crawl.CrawlDatum;
 import org.apache.nutch.crawl.Generator;
 import org.apache.nutch.crawl.PartitionUrlByHost;
@@ -55,13 +56,15 @@
  *
 * @author Andrzej Bialecki
*/
-public class FreeGenerator extends ToolBase {
+public class FreeGenerator extends Configured implements Tool {
 
  private static final Log LOG = LogFactory.getLog(FreeGenerator.class);
 
  private static final String FILTER_KEY = "free.generator.filter";
  private static final String NORMALIZE_KEY = "free.generator.normalize";
 
-  public static class FG extends MapReduceBase implements Mapper, Reducer {
+  public static class FG extends MapReduceBase
+      implements Mapper<WritableComparable, Text, Text, Generator.SelectorEntry>,
+                 Reducer<Text, Generator.SelectorEntry, Text, CrawlDatum> {
    private URLNormalizers normalizers = null;
    private URLFilters filters = null;
    private ScoringFilters scfilters;
@@ -82,7 +85,8 @@
 
     Generator.SelectorEntry entry = new Generator.SelectorEntry();
 
-    public void map(WritableComparable key, Writable value, OutputCollector output, Reporter reporter) throws IOException {
+    public void map(WritableComparable key, Text value, OutputCollector<Text,
+        Generator.SelectorEntry> output, Reporter reporter) throws IOException {
       // value is a line of text
       String urlString = value.toString();
       try {
@@ -111,7 +115,8 @@
       output.collect(url, entry);
     }
 
-    public void reduce(WritableComparable key, Iterator values, OutputCollector output, Reporter reporter) throws IOException {
+    public void reduce(Text key, Iterator<Generator.SelectorEntry> values,
+        OutputCollector<Text, CrawlDatum> output, Reporter reporter) throws IOException {
       // pick unique urls from values - discard the reduce key due to hash collisions
       HashMap<Text, CrawlDatum> unique = new HashMap<Text, CrawlDatum>();
       while (values.hasNext()) {
@@ -177,7 +182,7 @@
   }
 
   public static void main(String[] args) throws Exception {
-    int res = new FreeGenerator().doMain(NutchConfiguration.create(), args);
+    int res = ToolRunner.run(NutchConfiguration.create(), new FreeGenerator(), args);
     System.exit(res);
   }
 }
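ParseSegment above demonstrates the remaining refactor for tools whose main() used to do its own argument handling: the body moves into run(String[]), which reports failure through its return value, and main() shrinks to a ToolRunner call. The shape in isolation (RunSketch and doWork() are illustrative; note the committed ParseSegment still calls System.exit(-1) on bad usage, while returning a nonzero code is the Tool-friendly variant sketched here):

    import org.apache.hadoop.conf.Configured;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.util.Tool;
    import org.apache.hadoop.util.ToolRunner;
    import org.apache.nutch.util.NutchConfiguration;

    public class RunSketch extends Configured implements Tool {

      public int run(String[] args) throws Exception {
        if (args.length == 0) {
          System.err.println("Usage: RunSketch segment");
          return -1;           // exit status flows back through ToolRunner
        }
        doWork(new Path(args[0]));
        return 0;
      }

      private void doWork(Path segment) { /* job submission would go here */ }

      public static void main(String[] args) throws Exception {
        int res = ToolRunner.run(NutchConfiguration.create(), new RunSketch(), args);
        System.exit(res);
      }
    }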
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/tools/arc/ArcSegmentCreator.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/tools/arc/ArcSegmentCreator.java?rev=638779&r1=638778&r2=638779&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/tools/arc/ArcSegmentCreator.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/tools/arc/ArcSegmentCreator.java Wed Mar 19 03:34:14 2008
@@ -24,18 +24,18 @@
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.BytesWritable;
 import org.apache.hadoop.io.Text;
-import org.apache.hadoop.io.Writable;
-import org.apache.hadoop.io.WritableComparable;
 import org.apache.hadoop.mapred.JobClient;
 import org.apache.hadoop.mapred.JobConf;
 import org.apache.hadoop.mapred.Mapper;
 import org.apache.hadoop.mapred.OutputCollector;
 import org.apache.hadoop.mapred.Reporter;
 import org.apache.hadoop.util.StringUtils;
-import org.apache.hadoop.util.ToolBase;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.util.ToolRunner;
 import org.apache.nutch.crawl.CrawlDatum;
 import org.apache.nutch.crawl.NutchWritable;
 import org.apache.nutch.crawl.SignatureFactory;
@@ -65,11 +65,10 @@
  * <p>Arc files are tars of compressed gzips which are produced by both the
 * internet archive project and the grub distributed crawler project.</p>
 *
- * TODO: This class needs to be changed to use ToolRunner instead of ToolBase.
*/
 public class ArcSegmentCreator
-  extends ToolBase
-  implements Mapper {
+  extends Configured
+  implements Tool, Mapper<Text, BytesWritable, Text, NutchWritable> {
 
   public static final Log LOG = LogFactory.getLog(ArcSegmentCreator.class);
   public static final String URL_VERSION = "arc.url.version";
@@ -145,7 +144,7 @@
   *
   * @return The result of the parse in a ParseStatus object.
   */
-  private ParseStatus output(OutputCollector output, String segmentName,
+  private ParseStatus output(OutputCollector<Text, NutchWritable> output, String segmentName,
                              Text key, CrawlDatum datum, Content content,
                              ProtocolStatus pstatus, int status) {
 
@@ -184,7 +183,7 @@
       // set the content signature
       if (parseResult == null) {
         byte[] signature = SignatureFactory.getSignature(getConf()).calculate(
-          content, new ParseStatus().getEmptyParse(conf));
+          content, new ParseStatus().getEmptyParse(getConf()));
         datum.setSignature(signature);
       }
 
@@ -266,12 +265,12 @@
   * segments.</p>
   *
   * @param key The arc record header.
-   * @param value The arc record raw content bytes.
+   * @param bytes The arc record raw content bytes.
   * @param output The output collecter.
   * @param reporter The progress reporter.
   */
-  public void map(WritableComparable key, Writable value,
-    OutputCollector output, Reporter reporter)
+  public void map(Text key, BytesWritable bytes,
+    OutputCollector<Text, NutchWritable> output, Reporter reporter)
    throws IOException {
 
    String[] headers = key.toString().split("\\s+");
@@ -289,7 +288,6 @@
 
    // get the raw bytes from the arc file, create a new crawldatum
    Text url = new Text();
-    BytesWritable bytes = (BytesWritable)value;
    CrawlDatum datum = new CrawlDatum(CrawlDatum.STATUS_DB_FETCHED, interval,
      1.0f);
    String segmentName = getConf().get(Nutch.SEGMENT_NAME_KEY);
@@ -371,7 +369,7 @@
 
   public static void main(String args[]) throws Exception {
-    int res = new ArcSegmentCreator().doMain(NutchConfiguration.create(), args);
+    int res = ToolRunner.run(NutchConfiguration.create(), new ArcSegmentCreator(), args);
     System.exit(res);
   }

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/util/HadoopFSUtil.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/util/HadoopFSUtil.java?rev=638779&r1=638778&r2=638779&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/util/HadoopFSUtil.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/util/HadoopFSUtil.java Wed Mar 19 03:34:14 2008
@@ -18,6 +18,7 @@
 
 import java.io.IOException;
 
+import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.fs.PathFilter;
@@ -49,6 +50,23 @@
         }
     };
+  }
+
+  /**
+   * Turns an array of FileStatus into an array of Paths.
+   */
+  public static Path[] getPaths(FileStatus[] stats) {
+    if (stats == null) {
+      return null;
+    }
+    if (stats.length == 0) {
+      return new Path[0];
+    }
+    Path[] res = new Path[stats.length];
+    for (int i = 0; i < stats.length; i++) {
+      res[i] = stats[i].getPath();
+    }
+    return res;
   }
 }
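A quick usage note on the new helper: getPaths() mirrors its input, returning null for a null listing and an empty array for an empty one, so callers keep the same null-guards they had with listPaths(). Illustrative use (GetPathsDemo is not part of the patch):

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileStatus;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;
    import org.apache.nutch.util.HadoopFSUtil;

    public class GetPathsDemo {
      public static void main(String[] args) throws Exception {
        FileSystem fs = FileSystem.get(new Configuration());
        FileStatus[] stats = fs.listStatus(new Path(args[0]));
        Path[] paths = HadoopFSUtil.getPaths(stats);
        if (paths != null) {              // null when the path does not exist
          for (Path p : paths) {
            System.out.println(p);
          }
        }
      }
    }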