Modified: nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java (original) +++ nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java Thu Jan 29 05:38:59 2015 @@ -51,9 +51,9 @@ import org.apache.nutch.util.URLUtil; * Generates a subset of a crawl db to fetch. This version allows to generate * fetchlists for several segments in one go. Unlike in the initial version * (OldGenerator), the IP resolution is done ONLY on the entries which have been - * selected for fetching. The URLs are partitioned by IP, domain or host within a - * segment. We can chose separately how to count the URLS i.e. by domain or host - * to limit the entries. + * selected for fetching. The URLs are partitioned by IP, domain or host within + * a segment. We can chose separately how to count the URLS i.e. by domain or + * host to limit the entries. **/ public class Generator extends Configured implements Tool { @@ -73,7 +73,7 @@ public class Generator extends Configure public static final String GENERATOR_CUR_TIME = "generate.curTime"; public static final String GENERATOR_DELAY = "crawl.gen.delay"; public static final String GENERATOR_MAX_NUM_SEGMENTS = "generate.max.num.segments"; - + public static class SelectorEntry implements Writable { public Text url; public CrawlDatum datum; @@ -98,25 +98,25 @@ public class Generator extends Configure } public String toString() { - return "url=" + url.toString() + ", datum=" + datum.toString() + ", segnum=" - + segnum.toString(); + return "url=" + url.toString() + ", datum=" + datum.toString() + + ", segnum=" + segnum.toString(); } } /** Selects entries due for fetch. 
*/ public static class Selector implements - Mapper<Text,CrawlDatum,FloatWritable,SelectorEntry>, - Partitioner<FloatWritable,Writable>, - Reducer<FloatWritable,SelectorEntry,FloatWritable,SelectorEntry> { + Mapper<Text, CrawlDatum, FloatWritable, SelectorEntry>, + Partitioner<FloatWritable, Writable>, + Reducer<FloatWritable, SelectorEntry, FloatWritable, SelectorEntry> { private LongWritable genTime = new LongWritable(System.currentTimeMillis()); private long curTime; private long limit; private long count; - private HashMap<String,int[]> hostCounts = new HashMap<String,int[]>(); + private HashMap<String, int[]> hostCounts = new HashMap<String, int[]>(); private int segCounts[]; private int maxCount; private boolean byDomain = false; - private Partitioner<Text,Writable> partitioner = new URLPartitioner(); + private Partitioner<Text, Writable> partitioner = new URLPartitioner(); private URLFilters filters; private URLNormalizers normalizers; private ScoringFilters scfilters; @@ -134,22 +134,26 @@ public class Generator extends Configure public void configure(JobConf job) { curTime = job.getLong(GENERATOR_CUR_TIME, System.currentTimeMillis()); - limit = job.getLong(GENERATOR_TOP_N, Long.MAX_VALUE) / job.getNumReduceTasks(); + limit = job.getLong(GENERATOR_TOP_N, Long.MAX_VALUE) + / job.getNumReduceTasks(); maxCount = job.getInt(GENERATOR_MAX_COUNT, -1); - if (maxCount==-1){ + if (maxCount == -1) { byDomain = false; } - if (GENERATOR_COUNT_VALUE_DOMAIN.equals(job.get(GENERATOR_COUNT_MODE))) byDomain = true; + if (GENERATOR_COUNT_VALUE_DOMAIN.equals(job.get(GENERATOR_COUNT_MODE))) + byDomain = true; filters = new URLFilters(job); normalise = job.getBoolean(GENERATOR_NORMALISE, true); - if (normalise) normalizers = new URLNormalizers(job, - URLNormalizers.SCOPE_GENERATE_HOST_COUNT); + if (normalise) + normalizers = new URLNormalizers(job, + URLNormalizers.SCOPE_GENERATE_HOST_COUNT); scfilters = new ScoringFilters(job); partitioner.configure(job); filter = job.getBoolean(GENERATOR_FILTER, true); genDelay = job.getLong(GENERATOR_DELAY, 7L) * 3600L * 24L * 1000L; long time = job.getLong(Nutch.GENERATE_TIME_KEY, 0L); - if (time > 0) genTime.set(time); + if (time > 0) + genTime.set(time); schedule = FetchScheduleFactory.getFetchSchedule(job); scoreThreshold = job.getFloat(GENERATOR_MIN_SCORE, Float.NaN); intervalThreshold = job.getInt(GENERATOR_MIN_INTERVAL, -1); @@ -158,21 +162,24 @@ public class Generator extends Configure segCounts = new int[maxNumSegments]; } - public void close() {} + public void close() { + } /** Select & invert subset due for fetch. 
*/ public void map(Text key, CrawlDatum value, - OutputCollector<FloatWritable,SelectorEntry> output, Reporter reporter) + OutputCollector<FloatWritable, SelectorEntry> output, Reporter reporter) throws IOException { Text url = key; if (filter) { // If filtering is on don't generate URLs that don't pass // URLFilters try { - if (filters.filter(url.toString()) == null) return; + if (filters.filter(url.toString()) == null) + return; } catch (URLFilterException e) { if (LOG.isWarnEnabled()) { - LOG.warn("Couldn't filter url: " + url + " (" + e.getMessage() + ")"); + LOG.warn("Couldn't filter url: " + url + " (" + e.getMessage() + + ")"); } } } @@ -189,8 +196,8 @@ public class Generator extends Configure Nutch.WRITABLE_GENERATE_TIME_KEY); if (oldGenTime != null) { // awaiting fetch & update if (oldGenTime.get() + genDelay > curTime) // still wait for - // update - return; + // update + return; } float sort = 1.0f; try { @@ -202,13 +209,19 @@ public class Generator extends Configure } if (restrictStatus != null - && !restrictStatus.equalsIgnoreCase(CrawlDatum.getStatusName(crawlDatum.getStatus()))) return; + && !restrictStatus.equalsIgnoreCase(CrawlDatum + .getStatusName(crawlDatum.getStatus()))) + return; // consider only entries with a score superior to the threshold - if (scoreThreshold != Float.NaN && sort < scoreThreshold) return; + if (scoreThreshold != Float.NaN && sort < scoreThreshold) + return; - // consider only entries with a retry (or fetch) interval lower than threshold - if (intervalThreshold != -1 && crawlDatum.getFetchInterval() > intervalThreshold) return; + // consider only entries with a retry (or fetch) interval lower than + // threshold + if (intervalThreshold != -1 + && crawlDatum.getFetchInterval() > intervalThreshold) + return; // sort by decreasing score, using DecreasingFloatComparator sortValue.set(sort); @@ -220,13 +233,15 @@ public class Generator extends Configure } /** Partition by host / domain or IP. */ - public int getPartition(FloatWritable key, Writable value, int numReduceTasks) { - return partitioner.getPartition(((SelectorEntry) value).url, key, numReduceTasks); + public int getPartition(FloatWritable key, Writable value, + int numReduceTasks) { + return partitioner.getPartition(((SelectorEntry) value).url, key, + numReduceTasks); } /** Collect until limit is reached. 
*/ public void reduce(FloatWritable key, Iterator<SelectorEntry> values, - OutputCollector<FloatWritable,SelectorEntry> output, Reporter reporter) + OutputCollector<FloatWritable, SelectorEntry> output, Reporter reporter) throws IOException { while (values.hasNext()) { @@ -236,7 +251,8 @@ public class Generator extends Configure if (currentsegmentnum < maxNumSegments) { count = 0; currentsegmentnum++; - } else break; + } else + break; } SelectorEntry entry = values.next(); @@ -270,7 +286,7 @@ public class Generator extends Configure if (maxCount > 0) { int[] hostCount = hostCounts.get(hostordomain); if (hostCount == null) { - hostCount = new int[] {1, 0}; + hostCount = new int[] { 1, 0 }; hostCounts.put(hostordomain, hostCount); } @@ -278,7 +294,8 @@ public class Generator extends Configure hostCount[1]++; // check if topN reached, select next segment if it is - while (segCounts[hostCount[0]-1] >= limit && hostCount[0] < maxNumSegments) { + while (segCounts[hostCount[0] - 1] >= limit + && hostCount[0] < maxNumSegments) { hostCount[0]++; hostCount[1] = 0; } @@ -291,18 +308,23 @@ public class Generator extends Configure hostCount[1] = 0; } else { if (hostCount[1] == maxCount + 1 && LOG.isInfoEnabled()) { - LOG.info("Host or domain " + hostordomain + " has more than " + maxCount - + " URLs for all " + maxNumSegments + " segments. Additional URLs won't be included in the fetchlist."); + LOG.info("Host or domain " + + hostordomain + + " has more than " + + maxCount + + " URLs for all " + + maxNumSegments + + " segments. Additional URLs won't be included in the fetchlist."); } // skip this entry continue; } } entry.segnum = new IntWritable(hostCount[0]); - segCounts[hostCount[0]-1]++; + segCounts[hostCount[0] - 1]++; } else { entry.segnum = new IntWritable(currentsegmentnum); - segCounts[currentsegmentnum-1]++; + segCounts[currentsegmentnum - 1]++; } output.collect(key, entry); @@ -316,16 +338,17 @@ public class Generator extends Configure // Allows the reducers to generate one subfile per public static class GeneratorOutputFormat extends - MultipleSequenceFileOutputFormat<FloatWritable,SelectorEntry> { + MultipleSequenceFileOutputFormat<FloatWritable, SelectorEntry> { // generate a filename based on the segnum stored for this entry - protected String generateFileNameForKeyValue(FloatWritable key, SelectorEntry value, - String name) { + protected String generateFileNameForKeyValue(FloatWritable key, + SelectorEntry value, String name) { return "fetchlist-" + value.segnum.toString() + "/" + name; } } - public static class DecreasingFloatComparator extends FloatWritable.Comparator { + public static class DecreasingFloatComparator extends + FloatWritable.Comparator { /** Compares two FloatWritables decreasing. 
*/ public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) { @@ -334,20 +357,22 @@ public class Generator extends Configure } public static class SelectorInverseMapper extends MapReduceBase implements - Mapper<FloatWritable,SelectorEntry,Text,SelectorEntry> { + Mapper<FloatWritable, SelectorEntry, Text, SelectorEntry> { public void map(FloatWritable key, SelectorEntry value, - OutputCollector<Text,SelectorEntry> output, Reporter reporter) throws IOException { + OutputCollector<Text, SelectorEntry> output, Reporter reporter) + throws IOException { SelectorEntry entry = value; output.collect(entry.url, entry); } } public static class PartitionReducer extends MapReduceBase implements - Reducer<Text,SelectorEntry,Text,CrawlDatum> { + Reducer<Text, SelectorEntry, Text, CrawlDatum> { public void reduce(Text key, Iterator<SelectorEntry> values, - OutputCollector<Text,CrawlDatum> output, Reporter reporter) throws IOException { + OutputCollector<Text, CrawlDatum> output, Reporter reporter) + throws IOException { // if using HashComparator, we get only one input key in case of // hash collision // so use only URLs from values @@ -365,7 +390,7 @@ public class Generator extends Configure super(Text.class); } - @SuppressWarnings("rawtypes" ) + @SuppressWarnings("rawtypes") public int compare(WritableComparable a, WritableComparable b) { Text url1 = (Text) a; Text url2 = (Text) b; @@ -395,15 +420,17 @@ public class Generator extends Configure * Update the CrawlDB so that the next generate won't include the same URLs. */ public static class CrawlDbUpdater extends MapReduceBase implements - Mapper<Text,CrawlDatum,Text,CrawlDatum>, Reducer<Text,CrawlDatum,Text,CrawlDatum> { + Mapper<Text, CrawlDatum, Text, CrawlDatum>, + Reducer<Text, CrawlDatum, Text, CrawlDatum> { long generateTime; public void configure(JobConf job) { generateTime = job.getLong(Nutch.GENERATE_TIME_KEY, 0L); } - public void map(Text key, CrawlDatum value, OutputCollector<Text,CrawlDatum> output, - Reporter reporter) throws IOException { + public void map(Text key, CrawlDatum value, + OutputCollector<Text, CrawlDatum> output, Reporter reporter) + throws IOException { output.collect(key, value); } @@ -411,7 +438,8 @@ public class Generator extends Configure private LongWritable genTime = new LongWritable(0L); public void reduce(Text key, Iterator<CrawlDatum> values, - OutputCollector<Text,CrawlDatum> output, Reporter reporter) throws IOException { + OutputCollector<Text, CrawlDatum> output, Reporter reporter) + throws IOException { genTime.set(0L); while (values.hasNext()) { CrawlDatum val = values.next(); @@ -435,19 +463,21 @@ public class Generator extends Configure } } - public Generator() {} + public Generator() { + } public Generator(Configuration conf) { setConf(conf); } - public Path[] generate(Path dbDir, Path segments, int numLists, long topN, long curTime) - throws IOException { + public Path[] generate(Path dbDir, Path segments, int numLists, long topN, + long curTime) throws IOException { JobConf job = new NutchJob(getConf()); boolean filter = job.getBoolean(GENERATOR_FILTER, true); boolean normalise = job.getBoolean(GENERATOR_NORMALISE, true); - return generate(dbDir, segments, numLists, topN, curTime, filter, normalise, false, 1); + return generate(dbDir, segments, numLists, topN, curTime, filter, + normalise, false, 1); } /** @@ -456,7 +486,8 @@ public class Generator extends Configure **/ public Path[] generate(Path dbDir, Path segments, int numLists, long topN, long curTime, boolean filter, boolean force) 
throws IOException { - return generate(dbDir, segments, numLists, topN, curTime, filter, true, force, 1); + return generate(dbDir, segments, numLists, topN, curTime, filter, true, + force, 1); } /** @@ -482,11 +513,11 @@ public class Generator extends Configure * When an I/O error occurs */ public Path[] generate(Path dbDir, Path segments, int numLists, long topN, - long curTime, boolean filter, boolean norm, boolean force, int maxNumSegments) - throws IOException { + long curTime, boolean filter, boolean norm, boolean force, + int maxNumSegments) throws IOException { - Path tempDir = new Path(getConf().get("mapred.temp.dir", ".") + "/generate-temp-" - + java.util.UUID.randomUUID().toString()); + Path tempDir = new Path(getConf().get("mapred.temp.dir", ".") + + "/generate-temp-" + java.util.UUID.randomUUID().toString()); Path lock = new Path(dbDir, CrawlDb.LOCK_NAME); FileSystem fs = FileSystem.get(getConf()); @@ -501,7 +532,7 @@ public class Generator extends Configure if (topN != Long.MAX_VALUE) { LOG.info("Generator: topN: " + topN); } - + // map to inverted subset due for fetch, sort by score JobConf job = new NutchJob(getConf()); job.setJobName("generate: select from " + dbDir); @@ -553,7 +584,8 @@ public class Generator extends Configure try { for (FileStatus stat : status) { Path subfetchlist = stat.getPath(); - if (!subfetchlist.getName().startsWith("fetchlist-")) continue; + if (!subfetchlist.getName().startsWith("fetchlist-")) + continue; // start a new partition job for this segment Path newSeg = partitionSegment(fs, segments, subfetchlist, numLists); generatedSegments.add(newSeg); @@ -573,8 +605,8 @@ public class Generator extends Configure if (getConf().getBoolean(GENERATE_UPDATE_CRAWLDB, false)) { // update the db from tempDir - Path tempDir2 = new Path(getConf().get("mapred.temp.dir", ".") + "/generate-temp-" - + java.util.UUID.randomUUID().toString()); + Path tempDir2 = new Path(getConf().get("mapred.temp.dir", ".") + + "/generate-temp-" + java.util.UUID.randomUUID().toString()); job = new NutchJob(getConf()); job.setJobName("generate: updatedb " + dbDir); @@ -607,7 +639,8 @@ public class Generator extends Configure fs.delete(tempDir, true); long end = System.currentTimeMillis(); - LOG.info("Generator: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end)); + LOG.info("Generator: finished at " + sdf.format(end) + ", elapsed: " + + TimingUtil.elapsedTime(start, end)); Path[] patharray = new Path[generatedSegments.size()]; return generatedSegments.toArray(patharray); @@ -653,7 +686,8 @@ public class Generator extends Configure public static synchronized String generateSegmentName() { try { Thread.sleep(1000); - } catch (Throwable t) {} + } catch (Throwable t) { + } ; return sdf.format(new Date(System.currentTimeMillis())); } @@ -662,7 +696,8 @@ public class Generator extends Configure * Generate a fetchlist from the crawldb. 
*/ public static void main(String args[]) throws Exception { - int res = ToolRunner.run(NutchConfiguration.create(), new Generator(), args); + int res = ToolRunner + .run(NutchConfiguration.create(), new Generator(), args); System.exit(res); } @@ -706,9 +741,10 @@ public class Generator extends Configure } try { - Path[] segs = generate(dbDir, segmentsDir, numFetchers, topN, curTime, filter, - norm, force, maxNumSegments); - if (segs == null) return 1; + Path[] segs = generate(dbDir, segmentsDir, numFetchers, topN, curTime, + filter, norm, force, maxNumSegments); + if (segs == null) + return 1; } catch (Exception e) { LOG.error("Generator: " + StringUtils.stringifyException(e)); return -1;
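
The Selector.reduce() changes above are whitespace-only, but the logic they reformat is worth restating: each host or domain is capped at maxCount (GENERATOR_MAX_COUNT) URLs per segment, and once a host's current segment is full its counter rolls over to the next segment until maxNumSegments (GENERATOR_MAX_NUM_SEGMENTS) is exhausted, after which further URLs for that host are dropped from the fetchlist. The following standalone sketch uses hypothetical class and method names, is not Nutch code, and omits the additional per-segment topN check kept in segCounts; it only illustrates the counting and overflow behavior:

    // Minimal, self-contained sketch (not Nutch code) of the per-host counting idea
    // used in Selector.reduce(): each host/domain gets at most maxCount URLs per
    // segment; once a host's current segment is full, its counter moves to the next
    // segment until maxNumSegments is exhausted, after which further URLs are skipped.
    import java.util.HashMap;
    import java.util.Map;

    public class HostSegmentAssigner {
      private final int maxCount;        // cap per host/domain per segment
      private final int maxNumSegments;  // how many segments may be generated
      // value[0] = current segment (1-based), value[1] = URLs counted for that segment
      private final Map<String, int[]> hostCounts = new HashMap<>();

      public HostSegmentAssigner(int maxCount, int maxNumSegments) {
        this.maxCount = maxCount;
        this.maxNumSegments = maxNumSegments;
      }

      /** Returns the segment number for this host's next URL, or -1 to skip it. */
      public int assign(String hostOrDomain) {
        int[] count = hostCounts.computeIfAbsent(hostOrDomain, k -> new int[] { 1, 0 });
        count[1]++;
        if (count[1] > maxCount) {
          if (count[0] < maxNumSegments) { // spill into the next segment
            count[0]++;
            count[1] = 1;
          } else {
            return -1;                     // all segments full for this host
          }
        }
        return count[0];
      }

      public static void main(String[] args) {
        HostSegmentAssigner assigner = new HostSegmentAssigner(2, 2);
        for (int i = 0; i < 6; i++) {
          System.out.println("example.com url#" + i + " -> segment "
              + assigner.assign("example.com"));
        }
        // prints segments 1, 1, 2, 2 and then -1, -1 once both segments hold maxCount URLs
      }
    }
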
Modified: nutch/trunk/src/java/org/apache/nutch/crawl/Inlink.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/Inlink.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/crawl/Inlink.java (original) +++ nutch/trunk/src/java/org/apache/nutch/crawl/Inlink.java Thu Jan 29 05:38:59 2015 @@ -26,7 +26,8 @@ public class Inlink implements Writable private String fromUrl; private String anchor; - public Inlink() {} + public Inlink() { + } public Inlink(String fromUrl, String anchor) { this.fromUrl = fromUrl; @@ -40,8 +41,8 @@ public class Inlink implements Writable /** Skips over one Inlink in the input. */ public static void skip(DataInput in) throws IOException { - Text.skip(in); // skip fromUrl - Text.skip(in); // skip anchor + Text.skip(in); // skip fromUrl + Text.skip(in); // skip anchor } public void write(DataOutput out) throws IOException { @@ -55,16 +56,20 @@ public class Inlink implements Writable return inlink; } - public String getFromUrl() { return fromUrl; } - public String getAnchor() { return anchor; } + public String getFromUrl() { + return fromUrl; + } + + public String getAnchor() { + return anchor; + } public boolean equals(Object o) { if (!(o instanceof Inlink)) return false; - Inlink other = (Inlink)o; - return - this.fromUrl.equals(other.fromUrl) && - this.anchor.equals(other.anchor); + Inlink other = (Inlink) o; + return this.fromUrl.equals(other.fromUrl) + && this.anchor.equals(other.anchor); } public int hashCode() { Modified: nutch/trunk/src/java/org/apache/nutch/crawl/Inlinks.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/Inlinks.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/crawl/Inlinks.java (original) +++ nutch/trunk/src/java/org/apache/nutch/crawl/Inlinks.java Thu Jan 29 05:38:59 2015 @@ -27,17 +27,25 @@ import org.apache.hadoop.io.*; public class Inlinks implements Writable { private HashSet<Inlink> inlinks = new HashSet<Inlink>(1); - public void add(Inlink inlink) { inlinks.add(inlink); } + public void add(Inlink inlink) { + inlinks.add(inlink); + } - public void add(Inlinks inlinks) { this.inlinks.addAll(inlinks.inlinks); } + public void add(Inlinks inlinks) { + this.inlinks.addAll(inlinks.inlinks); + } public Iterator<Inlink> iterator() { return this.inlinks.iterator(); } - - public int size() { return inlinks.size(); } - public void clear() { inlinks.clear(); } + public int size() { + return inlinks.size(); + } + + public void clear() { + inlinks.clear(); + } public void readFields(DataInput in) throws IOException { int length = in.readInt(); @@ -67,30 +75,32 @@ public class Inlinks implements Writable return buffer.toString(); } - /** Return the set of anchor texts. Only a single anchor with a given text - * is permitted from a given domain. */ + /** + * Return the set of anchor texts. Only a single anchor with a given text is + * permitted from a given domain. 
+ */ public String[] getAnchors() { - HashMap<String, Set<String>> domainToAnchors = - new HashMap<String, Set<String>>(); + HashMap<String, Set<String>> domainToAnchors = new HashMap<String, Set<String>>(); ArrayList<String> results = new ArrayList<String>(); Iterator<Inlink> it = inlinks.iterator(); while (it.hasNext()) { Inlink inlink = it.next(); String anchor = inlink.getAnchor(); - if (anchor.length() == 0) // skip empty anchors + if (anchor.length() == 0) // skip empty anchors continue; - String domain = null; // extract domain name + String domain = null; // extract domain name try { domain = new URL(inlink.getFromUrl()).getHost(); - } catch (MalformedURLException e) {} + } catch (MalformedURLException e) { + } Set<String> domainAnchors = domainToAnchors.get(domain); if (domainAnchors == null) { domainAnchors = new HashSet<String>(); domainToAnchors.put(domain, domainAnchors); } - if (domainAnchors.add(anchor)) { // new anchor from domain - results.add(anchor); // collect it + if (domainAnchors.add(anchor)) { // new anchor from domain + results.add(anchor); // collect it } } Modified: nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java (original) +++ nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java Thu Jan 29 05:38:59 2015 @@ -43,7 +43,8 @@ import org.apache.nutch.util.NutchJob; import org.apache.nutch.util.TimingUtil; /** Maintains an inverted link map, listing incoming links for each url. */ -public class LinkDb extends Configured implements Tool, Mapper<Text, ParseData, Text, Inlinks> { +public class LinkDb extends Configured implements Tool, + Mapper<Text, ParseData, Text, Inlinks> { public static final Logger LOG = LoggerFactory.getLogger(LinkDb.class); @@ -56,13 +57,14 @@ public class LinkDb extends Configured i private boolean ignoreInternalLinks; private URLFilters urlFilters; private URLNormalizers urlNormalizers; - - public LinkDb() {} - + + public LinkDb() { + } + public LinkDb(Configuration conf) { setConf(conf); } - + public void configure(JobConf job) { maxAnchorLength = job.getInt("db.max.anchor.length", 100); ignoreInternalLinks = job.getBoolean(IGNORE_INTERNAL_LINKS, true); @@ -74,16 +76,19 @@ public class LinkDb extends Configured i } } - public void close() {} + public void close() { + } public void map(Text key, ParseData parseData, - OutputCollector<Text, Inlinks> output, Reporter reporter) - throws IOException { + OutputCollector<Text, Inlinks> output, Reporter reporter) + throws IOException { String fromUrl = key.toString(); String fromHost = getHost(fromUrl); if (urlNormalizers != null) { try { - fromUrl = urlNormalizers.normalize(fromUrl, URLNormalizers.SCOPE_LINKDB); // normalize the url + fromUrl = urlNormalizers + .normalize(fromUrl, URLNormalizers.SCOPE_LINKDB); // normalize the + // url } catch (Exception e) { LOG.warn("Skipping " + fromUrl + ":" + e); fromUrl = null; @@ -97,7 +102,8 @@ public class LinkDb extends Configured i fromUrl = null; } } - if (fromUrl == null) return; // discard all outlinks + if (fromUrl == null) + return; // discard all outlinks Outlink[] outlinks = parseData.getOutlinks(); Inlinks inlinks = new Inlinks(); for (int i = 0; i < outlinks.length; i++) { @@ -107,12 +113,14 @@ public class LinkDb extends Configured i if (ignoreInternalLinks) { String 
toHost = getHost(toUrl); if (toHost == null || toHost.equals(fromHost)) { // internal link - continue; // skip it + continue; // skip it } } if (urlNormalizers != null) { try { - toUrl = urlNormalizers.normalize(toUrl, URLNormalizers.SCOPE_LINKDB); // normalize the url + toUrl = urlNormalizers.normalize(toUrl, URLNormalizers.SCOPE_LINKDB); // normalize + // the + // url } catch (Exception e) { LOG.warn("Skipping " + toUrl + ":" + e); toUrl = null; @@ -126,13 +134,14 @@ public class LinkDb extends Configured i toUrl = null; } } - if (toUrl == null) continue; + if (toUrl == null) + continue; inlinks.clear(); - String anchor = outlink.getAnchor(); // truncate long anchors + String anchor = outlink.getAnchor(); // truncate long anchors if (anchor.length() > maxAnchorLength) { anchor = anchor.substring(0, maxAnchorLength); } - inlinks.add(new Inlink(fromUrl, anchor)); // collect inverted link + inlinks.add(new Inlink(fromUrl, anchor)); // collect inverted link output.collect(new Text(toUrl), inlinks); } } @@ -145,13 +154,16 @@ public class LinkDb extends Configured i } } - public void invert(Path linkDb, final Path segmentsDir, boolean normalize, boolean filter, boolean force) throws IOException { + public void invert(Path linkDb, final Path segmentsDir, boolean normalize, + boolean filter, boolean force) throws IOException { final FileSystem fs = FileSystem.get(getConf()); - FileStatus[] files = fs.listStatus(segmentsDir, HadoopFSUtil.getPassDirectoriesFilter(fs)); + FileStatus[] files = fs.listStatus(segmentsDir, + HadoopFSUtil.getPassDirectoriesFilter(fs)); invert(linkDb, HadoopFSUtil.getPaths(files), normalize, filter, force); } - public void invert(Path linkDb, Path[] segments, boolean normalize, boolean filter, boolean force) throws IOException { + public void invert(Path linkDb, Path[] segments, boolean normalize, + boolean filter, boolean force) throws IOException { JobConf job = LinkDb.createJob(getConf(), linkDb, normalize, filter); Path lock = new Path(linkDb, LOCK_NAME); FileSystem fs = FileSystem.get(getConf()); @@ -174,7 +186,8 @@ public class LinkDb extends Configured i if (LOG.isInfoEnabled()) { LOG.info("LinkDb: adding segment: " + segments[i]); } - FileInputFormat.addInputPath(job, new Path(segments[i], ParseData.DIR_NAME)); + FileInputFormat.addInputPath(job, new Path(segments[i], + ParseData.DIR_NAME)); } try { JobClient.runJob(job); @@ -203,13 +216,14 @@ public class LinkDb extends Configured i LinkDb.install(job, linkDb); long end = System.currentTimeMillis(); - LOG.info("LinkDb: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end)); + LOG.info("LinkDb: finished at " + sdf.format(end) + ", elapsed: " + + TimingUtil.elapsedTime(start, end)); } - private static JobConf createJob(Configuration config, Path linkDb, boolean normalize, boolean filter) { - Path newLinkDb = - new Path("linkdb-" + - Integer.toString(new Random().nextInt(Integer.MAX_VALUE))); + private static JobConf createJob(Configuration config, Path linkDb, + boolean normalize, boolean filter) { + Path newLinkDb = new Path("linkdb-" + + Integer.toString(new Random().nextInt(Integer.MAX_VALUE))); JobConf job = new NutchJob(config); job.setJobName("linkdb " + linkDb); @@ -247,12 +261,14 @@ public class LinkDb extends Configured i Path old = new Path(linkDb, "old"); Path current = new Path(linkDb, CURRENT_NAME); if (fs.exists(current)) { - if (fs.exists(old)) fs.delete(old, true); + if (fs.exists(old)) + fs.delete(old, true); fs.rename(current, old); } fs.mkdirs(linkDb); 
fs.rename(newLinkDb, current); - if (fs.exists(old)) fs.delete(old, true); + if (fs.exists(old)) + fs.delete(old, true); LockUtil.removeLockFile(fs, new Path(linkDb, LOCK_NAME)); } @@ -263,11 +279,14 @@ public class LinkDb extends Configured i public int run(String[] args) throws Exception { if (args.length < 2) { - System.err.println("Usage: LinkDb <linkdb> (-dir <segmentsDir> | <seg1> <seg2> ...) [-force] [-noNormalize] [-noFilter]"); + System.err + .println("Usage: LinkDb <linkdb> (-dir <segmentsDir> | <seg1> <seg2> ...) [-force] [-noNormalize] [-noFilter]"); System.err.println("\tlinkdb\toutput LinkDb to create or update"); - System.err.println("\t-dir segmentsDir\tparent directory of several segments, OR"); + System.err + .println("\t-dir segmentsDir\tparent directory of several segments, OR"); System.err.println("\tseg1 seg2 ...\t list of segment directories"); - System.err.println("\t-force\tforce update even if LinkDb appears to be locked (CAUTION advised)"); + System.err + .println("\t-force\tforce update even if LinkDb appears to be locked (CAUTION advised)"); System.err.println("\t-noNormalize\tdon't normalize link URLs"); System.err.println("\t-noFilter\tdon't apply URLFilters to link URLs"); return -1; @@ -281,7 +300,8 @@ public class LinkDb extends Configured i boolean force = false; for (int i = 1; i < args.length; i++) { if (args[i].equals("-dir")) { - FileStatus[] paths = fs.listStatus(new Path(args[++i]), HadoopFSUtil.getPassDirectoriesFilter(fs)); + FileStatus[] paths = fs.listStatus(new Path(args[++i]), + HadoopFSUtil.getPassDirectoriesFilter(fs)); segs.addAll(Arrays.asList(HadoopFSUtil.getPaths(paths))); } else if (args[i].equalsIgnoreCase("-noNormalize")) { normalize = false; @@ -289,7 +309,8 @@ public class LinkDb extends Configured i filter = false; } else if (args[i].equalsIgnoreCase("-force")) { force = true; - } else segs.add(new Path(args[i])); + } else + segs.add(new Path(args[i])); } try { invert(db, segs.toArray(new Path[segs.size()]), normalize, filter, force); Modified: nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbFilter.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbFilter.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbFilter.java (original) +++ nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbFilter.java Thu Jan 29 05:38:59 2015 @@ -31,8 +31,8 @@ import org.apache.nutch.net.URLFilters; import org.apache.nutch.net.URLNormalizers; /** - * This class provides a way to separate the URL normalization - * and filtering steps from the rest of LinkDb manipulation code. + * This class provides a way to separate the URL normalization and filtering + * steps from the rest of LinkDb manipulation code. 
* * @author Andrzej Bialecki */ @@ -50,13 +50,13 @@ public class LinkDbFilter implements Map private URLFilters filters; private URLNormalizers normalizers; - + private String scope; - + public static final Logger LOG = LoggerFactory.getLogger(LinkDbFilter.class); private Text newKey = new Text(); - + public void configure(JobConf job) { filter = job.getBoolean(URL_FILTERING, false); normalize = job.getBoolean(URL_NORMALIZING, false); @@ -69,10 +69,12 @@ public class LinkDbFilter implements Map } } - public void close() {} + public void close() { + } public void map(Text key, Inlinks value, - OutputCollector<Text, Inlinks> output, Reporter reporter) throws IOException { + OutputCollector<Text, Inlinks> output, Reporter reporter) + throws IOException { String url = key.toString(); Inlinks result = new Inlinks(); if (normalize) { @@ -91,7 +93,8 @@ public class LinkDbFilter implements Map url = null; } } - if (url == null) return; // didn't pass the filters + if (url == null) + return; // didn't pass the filters Iterator<Inlink> it = value.iterator(); String fromUrl = null; while (it.hasNext()) { @@ -113,7 +116,7 @@ public class LinkDbFilter implements Map fromUrl = null; } } - if (fromUrl != null) { + if (fromUrl != null) { result.add(new Inlink(fromUrl, inlink.getAnchor())); } } Modified: nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbMerger.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbMerger.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbMerger.java (original) +++ nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbMerger.java Thu Jan 29 05:38:59 2015 @@ -46,37 +46,44 @@ import org.apache.nutch.util.NutchJob; import org.apache.nutch.util.TimingUtil; /** - * This tool merges several LinkDb-s into one, optionally filtering - * URLs through the current URLFilters, to skip prohibited URLs and - * links. + * This tool merges several LinkDb-s into one, optionally filtering URLs through + * the current URLFilters, to skip prohibited URLs and links. * - * <p>It's possible to use this tool just for filtering - in that case - * only one LinkDb should be specified in arguments.</p> - * <p>If more than one LinkDb contains information about the same URL, - * all inlinks are accumulated, but only at most <code>db.max.inlinks</code> - * inlinks will ever be added.</p> - * <p>If activated, URLFilters will be applied to both the target URLs and - * to any incoming link URL. If a target URL is prohibited, all - * inlinks to that target will be removed, including the target URL. If - * some of incoming links are prohibited, only they will be removed, and they - * won't count when checking the above-mentioned maximum limit. + * <p> + * It's possible to use this tool just for filtering - in that case only one + * LinkDb should be specified in arguments. + * </p> + * <p> + * If more than one LinkDb contains information about the same URL, all inlinks + * are accumulated, but only at most <code>db.max.inlinks</code> inlinks will + * ever be added. + * </p> + * <p> + * If activated, URLFilters will be applied to both the target URLs and to any + * incoming link URL. If a target URL is prohibited, all inlinks to that target + * will be removed, including the target URL. If some of incoming links are + * prohibited, only they will be removed, and they won't count when checking the + * above-mentioned maximum limit. 
* * @author Andrzej Bialecki */ -public class LinkDbMerger extends Configured implements Tool, Reducer<Text, Inlinks, Text, Inlinks> { +public class LinkDbMerger extends Configured implements Tool, + Reducer<Text, Inlinks, Text, Inlinks> { private static final Logger LOG = LoggerFactory.getLogger(LinkDbMerger.class); - + private int maxInlinks; - + public LinkDbMerger() { - + } - + public LinkDbMerger(Configuration conf) { setConf(conf); } - public void reduce(Text key, Iterator<Inlinks> values, OutputCollector<Text, Inlinks> output, Reporter reporter) throws IOException { + public void reduce(Text key, Iterator<Inlinks> values, + OutputCollector<Text, Inlinks> output, Reporter reporter) + throws IOException { Inlinks result = new Inlinks(); @@ -86,43 +93,48 @@ public class LinkDbMerger extends Config int end = Math.min(maxInlinks - result.size(), inlinks.size()); Iterator<Inlink> it = inlinks.iterator(); int i = 0; - while(it.hasNext() && i++ < end) { + while (it.hasNext() && i++ < end) { result.add(it.next()); } } - if (result.size() == 0) return; + if (result.size() == 0) + return; output.collect(key, result); - + } public void configure(JobConf job) { maxInlinks = job.getInt("db.max.inlinks", 10000); } - public void close() throws IOException { } + public void close() throws IOException { + } - public void merge(Path output, Path[] dbs, boolean normalize, boolean filter) throws Exception { + public void merge(Path output, Path[] dbs, boolean normalize, boolean filter) + throws Exception { SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); long start = System.currentTimeMillis(); LOG.info("LinkDb merge: starting at " + sdf.format(start)); JobConf job = createMergeJob(getConf(), output, normalize, filter); for (int i = 0; i < dbs.length; i++) { - FileInputFormat.addInputPath(job, new Path(dbs[i], LinkDb.CURRENT_NAME)); + FileInputFormat.addInputPath(job, new Path(dbs[i], LinkDb.CURRENT_NAME)); } JobClient.runJob(job); FileSystem fs = FileSystem.get(getConf()); fs.mkdirs(output); - fs.rename(FileOutputFormat.getOutputPath(job), new Path(output, LinkDb.CURRENT_NAME)); + fs.rename(FileOutputFormat.getOutputPath(job), new Path(output, + LinkDb.CURRENT_NAME)); long end = System.currentTimeMillis(); - LOG.info("LinkDb merge: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end)); + LOG.info("LinkDb merge: finished at " + sdf.format(end) + ", elapsed: " + + TimingUtil.elapsedTime(start, end)); } - public static JobConf createMergeJob(Configuration config, Path linkDb, boolean normalize, boolean filter) { - Path newLinkDb = - new Path("linkdb-merge-" + - Integer.toString(new Random().nextInt(Integer.MAX_VALUE))); + public static JobConf createMergeJob(Configuration config, Path linkDb, + boolean normalize, boolean filter) { + Path newLinkDb = new Path("linkdb-merge-" + + Integer.toString(new Random().nextInt(Integer.MAX_VALUE))); JobConf job = new NutchJob(config); job.setJobName("linkdb merge " + linkDb); @@ -145,22 +157,27 @@ public class LinkDbMerger extends Config return job; } - + /** * @param args */ public static void main(String[] args) throws Exception { - int res = ToolRunner.run(NutchConfiguration.create(), new LinkDbMerger(), args); + int res = ToolRunner.run(NutchConfiguration.create(), new LinkDbMerger(), + args); System.exit(res); } - + public int run(String[] args) throws Exception { if (args.length < 2) { - System.err.println("Usage: LinkDbMerger <output_linkdb> <linkdb1> [<linkdb2> <linkdb3> ...] 
[-normalize] [-filter]"); + System.err + .println("Usage: LinkDbMerger <output_linkdb> <linkdb1> [<linkdb2> <linkdb3> ...] [-normalize] [-filter]"); System.err.println("\toutput_linkdb\toutput LinkDb"); - System.err.println("\tlinkdb1 ...\tinput LinkDb-s (single input LinkDb is ok)"); - System.err.println("\t-normalize\tuse URLNormalizer on both fromUrls and toUrls in linkdb(s) (usually not needed)"); - System.err.println("\t-filter\tuse URLFilters on both fromUrls and toUrls in linkdb(s)"); + System.err + .println("\tlinkdb1 ...\tinput LinkDb-s (single input LinkDb is ok)"); + System.err + .println("\t-normalize\tuse URLNormalizer on both fromUrls and toUrls in linkdb(s) (usually not needed)"); + System.err + .println("\t-filter\tuse URLFilters on both fromUrls and toUrls in linkdb(s)"); return -1; } Path output = new Path(args[0]); @@ -172,7 +189,8 @@ public class LinkDbMerger extends Config filter = true; } else if (args[i].equals("-normalize")) { normalize = true; - } else dbs.add(new Path(args[i])); + } else + dbs.add(new Path(args[i])); } try { merge(output, dbs.toArray(new Path[dbs.size()]), normalize, filter); Modified: nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbReader.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbReader.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbReader.java (original) +++ nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbReader.java Thu Jan 29 05:38:59 2015 @@ -50,14 +50,14 @@ public class LinkDbReader extends Config private MapFile.Reader[] readers; public LinkDbReader() { - + } - + public LinkDbReader(Configuration conf, Path directory) throws Exception { setConf(conf); init(directory); } - + public void init(Path directory) throws Exception { this.fs = FileSystem.get(getConf()); this.directory = directory; @@ -73,16 +73,16 @@ public class LinkDbReader extends Config public Inlinks getInlinks(Text url) throws IOException { if (readers == null) { - synchronized(this) { - readers = MapFileOutputFormat.getReaders - (fs, new Path(directory, LinkDb.CURRENT_NAME), getConf()); + synchronized (this) { + readers = MapFileOutputFormat.getReaders(fs, new Path(directory, + LinkDb.CURRENT_NAME), getConf()); } } - - return (Inlinks)MapFileOutputFormat.getEntry - (readers, PARTITIONER, url, new Inlinks()); + + return (Inlinks) MapFileOutputFormat.getEntry(readers, PARTITIONER, url, + new Inlinks()); } - + public void close() throws IOException { if (readers != null) { for (int i = 0; i < readers.length; i++) { @@ -90,7 +90,7 @@ public class LinkDbReader extends Config } } } - + public void processDumpJob(String linkdb, String output) throws IOException { SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); long start = System.currentTimeMillis(); @@ -114,19 +114,24 @@ public class LinkDbReader extends Config JobClient.runJob(job); long end = System.currentTimeMillis(); - LOG.info("LinkDb dump: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end)); + LOG.info("LinkDb dump: finished at " + sdf.format(end) + ", elapsed: " + + TimingUtil.elapsedTime(start, end)); } - + public static void main(String[] args) throws Exception { - int res = ToolRunner.run(NutchConfiguration.create(), new LinkDbReader(), args); + int res = ToolRunner.run(NutchConfiguration.create(), new LinkDbReader(), + args); System.exit(res); } - + public int run(String[] args) 
throws Exception { if (args.length < 2) { - System.err.println("Usage: LinkDbReader <linkdb> (-dump <out_dir> | -url <url>)"); - System.err.println("\t-dump <out_dir>\tdump whole link db to a text file in <out_dir>"); - System.err.println("\t-url <url>\tprint information about <url> to System.out"); + System.err + .println("Usage: LinkDbReader <linkdb> (-dump <out_dir> | -url <url>)"); + System.err + .println("\t-dump <out_dir>\tdump whole link db to a text file in <out_dir>"); + System.err + .println("\t-url <url>\tprint information about <url> to System.out"); return -1; } try { Modified: nutch/trunk/src/java/org/apache/nutch/crawl/MD5Signature.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/MD5Signature.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/crawl/MD5Signature.java (original) +++ nutch/trunk/src/java/org/apache/nutch/crawl/MD5Signature.java Thu Jan 29 05:38:59 2015 @@ -22,9 +22,9 @@ import org.apache.nutch.parse.Parse; import org.apache.nutch.protocol.Content; /** - * Default implementation of a page signature. It calculates an MD5 hash - * of the raw binary content of a page. In case there is no content, it - * calculates a hash from the page's URL. + * Default implementation of a page signature. It calculates an MD5 hash of the + * raw binary content of a page. In case there is no content, it calculates a + * hash from the page's URL. * * @author Andrzej Bialecki <[email protected]> */ @@ -32,7 +32,8 @@ public class MD5Signature extends Signat public byte[] calculate(Content content, Parse parse) { byte[] data = content.getContent(); - if (data == null) data = content.getUrl().getBytes(); + if (data == null) + data = content.getUrl().getBytes(); return MD5Hash.digest(data).getDigest(); } } Modified: nutch/trunk/src/java/org/apache/nutch/crawl/MapWritable.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/MapWritable.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/crawl/MapWritable.java (original) +++ nutch/trunk/src/java/org/apache/nutch/crawl/MapWritable.java Thu Jan 29 05:38:59 2015 @@ -47,19 +47,19 @@ import org.apache.hadoop.util.StringUtil import org.apache.nutch.protocol.ProtocolStatus; /** - * A writable map, with a similar behavior as <code>java.util.HashMap</code>. - * In addition to the size of key and value writable tuple two additional bytes - * are stored to identify the Writable classes. This means that a maximum of - * 255 different class types can be used for key and value objects. - * A binary-id to class mapping is defined in a static block of this class. - * However it is possible to use custom implementations of Writable. - * For these custom Writables we write the byte id - utf class name tuple - * into the header of each MapWritable that uses these types. - * + * A writable map, with a similar behavior as <code>java.util.HashMap</code>. In + * addition to the size of key and value writable tuple two additional bytes are + * stored to identify the Writable classes. This means that a maximum of 255 + * different class types can be used for key and value objects. A binary-id to + * class mapping is defined in a static block of this class. However it is + * possible to use custom implementations of Writable. 
For these custom + * Writables we write the byte id - utf class name tuple into the header of each + * MapWritable that uses these types. + * * @author Stefan Groschupf * @deprecated Use org.apache.hadoop.io.MapWritable instead. */ - + @Deprecated public class MapWritable implements Writable { @@ -105,14 +105,16 @@ public class MapWritable implements Writ CLASS_ID_MAP.put(clazz, byteId); ID_CLASS_MAP.put(byteId, clazz); } - - public MapWritable() { } - + + public MapWritable() { + } + /** * Copy constructor. This constructor makes a deep copy, using serialization / * deserialization to break any possible references to contained objects. * - * @param map map to copy from + * @param map + * map to copy from */ public MapWritable(MapWritable map) { if (map != null) { @@ -123,8 +125,8 @@ public class MapWritable implements Writ dib.reset(dob.getData(), dob.getLength()); readFields(dib); } catch (IOException e) { - throw new IllegalArgumentException("this map cannot be copied: " + - StringUtils.stringifyException(e)); + throw new IllegalArgumentException("this map cannot be copied: " + + StringUtils.stringifyException(e)); } } } @@ -177,7 +179,8 @@ public class MapWritable implements Writ public Set<Writable> keySet() { HashSet<Writable> set = new HashSet<Writable>(); - if (isEmpty()) return set; + if (isEmpty()) + return set; set.add(fFirst.fKey); KeyValueEntry entry = fFirst; while ((entry = entry.fNextEntry) != null) { @@ -257,7 +260,8 @@ public class MapWritable implements Writ public boolean equals(Object obj) { if (obj instanceof MapWritable) { MapWritable map = (MapWritable) obj; - if (fSize != map.fSize) return false; + if (fSize != map.fSize) + return false; HashSet<KeyValueEntry> set1 = new HashSet<KeyValueEntry>(); KeyValueEntry e1 = fFirst; while (e1 != null) { @@ -345,7 +349,7 @@ public class MapWritable implements Writ clazz = Class.forName(Text.readString(in)); addIdEntry(id, clazz); } catch (Exception e) { - if (LOG.isWarnEnabled()) { + if (LOG.isWarnEnabled()) { LOG.warn("Unable to load internal map entry" + e.toString()); } fIdCount--; @@ -364,8 +368,8 @@ public class MapWritable implements Writ } } catch (IOException e) { if (LOG.isWarnEnabled()) { - LOG.warn("Unable to load meta data entry, ignoring.. : " + - e.toString()); + LOG.warn("Unable to load meta data entry, ignoring.. : " + + e.toString()); } fSize--; } Modified: nutch/trunk/src/java/org/apache/nutch/crawl/MimeAdaptiveFetchSchedule.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/MimeAdaptiveFetchSchedule.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/crawl/MimeAdaptiveFetchSchedule.java (original) +++ nutch/trunk/src/java/org/apache/nutch/crawl/MimeAdaptiveFetchSchedule.java Thu Jan 29 05:38:59 2015 @@ -34,29 +34,31 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** - * Extension of @see AdaptiveFetchSchedule that allows for more flexible configuration - * of DEC and INC factors for various MIME-types. - * - * This class can be typically used in cases where a recrawl consists of many different - * MIME-types. It's not very common for MIME-types other than text/html to change frequently. - * Using this class you can configure different factors per MIME-type so to prefer frequently - * changing MIME-types over others. 
+ * Extension of @see AdaptiveFetchSchedule that allows for more flexible + * configuration of DEC and INC factors for various MIME-types. + * + * This class can be typically used in cases where a recrawl consists of many + * different MIME-types. It's not very common for MIME-types other than + * text/html to change frequently. Using this class you can configure different + * factors per MIME-type so to prefer frequently changing MIME-types over + * others. + * + * For it to work this class relies on the Content-Type MetaData key being + * present in the CrawlDB. This can either be done when injecting new URL's or + * by adding "Content-Type" to the db.parsemeta.to.crawldb configuration setting + * to force MIME-types of newly discovered URL's to be added to the CrawlDB. * - * For it to work this class relies on the Content-Type MetaData key being present in the CrawlDB. - * This can either be done when injecting new URL's or by adding "Content-Type" to the - * db.parsemeta.to.crawldb configuration setting to force MIME-types of newly discovered URL's to - * be added to the CrawlDB. - * * @author markus */ public class MimeAdaptiveFetchSchedule extends AdaptiveFetchSchedule { // Loggg - public static final Logger LOG = LoggerFactory.getLogger(MimeAdaptiveFetchSchedule.class); + public static final Logger LOG = LoggerFactory + .getLogger(MimeAdaptiveFetchSchedule.class); // Conf directives public static final String SCHEDULE_INC_RATE = "db.fetch.schedule.adaptive.inc_rate"; public static final String SCHEDULE_DEC_RATE = "db.fetch.schedule.adaptive.dec_rate"; - public static final String SCHEDULE_MIME_FILE= "db.fetch.schedule.mime.file"; + public static final String SCHEDULE_MIME_FILE = "db.fetch.schedule.mime.file"; // Default values for DEC and INC rate private float defaultIncRate; @@ -74,18 +76,21 @@ public class MimeAdaptiveFetchSchedule e } // Here we store the mime's and their delta's - private HashMap<String,AdaptiveRate> mimeMap; + private HashMap<String, AdaptiveRate> mimeMap; public void setConf(Configuration conf) { super.setConf(conf); - if (conf == null) return; + if (conf == null) + return; - // Read and set the default INC and DEC rates in case we cannot set values based on MIME-type + // Read and set the default INC and DEC rates in case we cannot set values + // based on MIME-type defaultIncRate = conf.getFloat(SCHEDULE_INC_RATE, 0.2f); defaultDecRate = conf.getFloat(SCHEDULE_DEC_RATE, 0.2f); // Where's the mime/factor file? 
- Reader mimeFile = conf.getConfResourceAsReader(conf.get(SCHEDULE_MIME_FILE, "adaptive-mimetypes.txt")); + Reader mimeFile = conf.getConfResourceAsReader(conf.get(SCHEDULE_MIME_FILE, + "adaptive-mimetypes.txt")); try { readMimeFile(mimeFile); @@ -96,8 +101,8 @@ public class MimeAdaptiveFetchSchedule e @Override public CrawlDatum setFetchSchedule(Text url, CrawlDatum datum, - long prevFetchTime, long prevModifiedTime, - long fetchTime, long modifiedTime, int state) { + long prevFetchTime, long prevModifiedTime, long fetchTime, + long modifiedTime, int state) { // Set defaults INC_RATE = defaultIncRate; @@ -106,7 +111,8 @@ public class MimeAdaptiveFetchSchedule e // Check if the Content-Type field is available in the CrawlDatum if (datum.getMetaData().containsKey(HttpHeaders.WRITABLE_CONTENT_TYPE)) { // Get the MIME-type of the current URL - String currentMime = datum.getMetaData().get(HttpHeaders.WRITABLE_CONTENT_TYPE).toString(); + String currentMime = datum.getMetaData() + .get(HttpHeaders.WRITABLE_CONTENT_TYPE).toString(); // Get rid of charset currentMime = currentMime.substring(0, currentMime.indexOf(';')); @@ -120,18 +126,19 @@ public class MimeAdaptiveFetchSchedule e } return super.setFetchSchedule(url, datum, prevFetchTime, prevModifiedTime, - fetchTime, modifiedTime, state); + fetchTime, modifiedTime, state); } /** * Reads the mime types and their associated INC/DEC factors in a HashMap - * - * @param mimeFile Reader + * + * @param mimeFile + * Reader * @return void */ private void readMimeFile(Reader mimeFile) throws IOException { // Instance of our mime/factor map - mimeMap = new HashMap<String,AdaptiveRate>(); + mimeMap = new HashMap<String, AdaptiveRate>(); // Open a reader BufferedReader reader = new BufferedReader(mimeFile); @@ -149,7 +156,8 @@ public class MimeAdaptiveFetchSchedule e // Sanity check, we need two or three items if (splits.length == 3) { // Add a lower cased MIME-type and the factor to the map - mimeMap.put(StringUtils.lowerCase(splits[0]), new AdaptiveRate(new Float(splits[1]), new Float(splits[2]))); + mimeMap.put(StringUtils.lowerCase(splits[0]), new AdaptiveRate( + new Float(splits[1]), new Float(splits[2]))); } else { LOG.warn("Invalid configuration line in: " + line); } @@ -178,7 +186,8 @@ public class MimeAdaptiveFetchSchedule e // Set a default MIME-type to test with org.apache.hadoop.io.MapWritable x = new org.apache.hadoop.io.MapWritable(); - x.put(HttpHeaders.WRITABLE_CONTENT_TYPE, new Text("text/html; charset=utf-8")); + x.put(HttpHeaders.WRITABLE_CONTENT_TYPE, new Text( + "text/html; charset=utf-8")); p.setMetaData(x); p.setFetchTime(0); @@ -187,37 +196,45 @@ public class MimeAdaptiveFetchSchedule e // let's move the timeline a couple of deltas for (int i = 0; i < 10000; i++) { if (lastModified + update < curTime) { - //System.out.println("i=" + i + ", lastModified=" + lastModified + ", update=" + update + ", curTime=" + curTime); + // System.out.println("i=" + i + ", lastModified=" + lastModified + + // ", update=" + update + ", curTime=" + curTime); changed = true; changeCnt++; lastModified = curTime; } - LOG.info(i + ". " + changed + "\twill fetch at " + (p.getFetchTime() / delta) + "\tinterval " - + (p.getFetchInterval() / SECONDS_PER_DAY ) + " days" + "\t missed " + miss); + LOG.info(i + ". 
" + changed + "\twill fetch at " + + (p.getFetchTime() / delta) + "\tinterval " + + (p.getFetchInterval() / SECONDS_PER_DAY) + " days" + "\t missed " + + miss); if (p.getFetchTime() <= curTime) { fetchCnt++; - fs.setFetchSchedule(new Text("http://www.example.com"), p, - p.getFetchTime(), p.getModifiedTime(), curTime, lastModified, - changed ? FetchSchedule.STATUS_MODIFIED : FetchSchedule.STATUS_NOTMODIFIED); - - LOG.info("\tfetched & adjusted: " + "\twill fetch at " + (p.getFetchTime() / delta) + "\tinterval " - + (p.getFetchInterval() / SECONDS_PER_DAY ) + " days"); - - if (!changed) miss++; - if (miss > maxMiss) maxMiss = miss; + fs.setFetchSchedule(new Text("http://www.example.com"), p, p + .getFetchTime(), p.getModifiedTime(), curTime, lastModified, + changed ? FetchSchedule.STATUS_MODIFIED + : FetchSchedule.STATUS_NOTMODIFIED); + + LOG.info("\tfetched & adjusted: " + "\twill fetch at " + + (p.getFetchTime() / delta) + "\tinterval " + + (p.getFetchInterval() / SECONDS_PER_DAY) + " days"); + + if (!changed) + miss++; + if (miss > maxMiss) + maxMiss = miss; changed = false; totalMiss += miss; miss = 0; } - if (changed) miss++; + if (changed) + miss++; curTime += delta; } LOG.info("Total missed: " + totalMiss + ", max miss: " + maxMiss); - LOG.info("Page changed " + changeCnt + " times, fetched " + fetchCnt + " times."); + LOG.info("Page changed " + changeCnt + " times, fetched " + fetchCnt + + " times."); } - } \ No newline at end of file Modified: nutch/trunk/src/java/org/apache/nutch/crawl/NutchWritable.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/NutchWritable.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/crawl/NutchWritable.java (original) +++ nutch/trunk/src/java/org/apache/nutch/crawl/NutchWritable.java Thu Jan 29 05:38:59 2015 @@ -26,32 +26,31 @@ public class NutchWritable extends Gener static { CLASSES = (Class<? 
extends Writable>[]) new Class<?>[] { - org.apache.hadoop.io.NullWritable.class, - org.apache.hadoop.io.BooleanWritable.class, - org.apache.hadoop.io.LongWritable.class, - org.apache.hadoop.io.BytesWritable.class, - org.apache.hadoop.io.FloatWritable.class, - org.apache.hadoop.io.IntWritable.class, - org.apache.hadoop.io.MapWritable.class, - org.apache.hadoop.io.Text.class, - org.apache.hadoop.io.MD5Hash.class, - org.apache.nutch.crawl.CrawlDatum.class, - org.apache.nutch.crawl.Inlink.class, - org.apache.nutch.crawl.Inlinks.class, - org.apache.nutch.indexer.NutchIndexAction.class, - org.apache.nutch.metadata.Metadata.class, - org.apache.nutch.parse.Outlink.class, - org.apache.nutch.parse.ParseText.class, - org.apache.nutch.parse.ParseData.class, - org.apache.nutch.parse.ParseImpl.class, - org.apache.nutch.parse.ParseStatus.class, - org.apache.nutch.protocol.Content.class, - org.apache.nutch.protocol.ProtocolStatus.class, - org.apache.nutch.scoring.webgraph.LinkDatum.class - }; + org.apache.hadoop.io.NullWritable.class, + org.apache.hadoop.io.BooleanWritable.class, + org.apache.hadoop.io.LongWritable.class, + org.apache.hadoop.io.BytesWritable.class, + org.apache.hadoop.io.FloatWritable.class, + org.apache.hadoop.io.IntWritable.class, + org.apache.hadoop.io.MapWritable.class, + org.apache.hadoop.io.Text.class, org.apache.hadoop.io.MD5Hash.class, + org.apache.nutch.crawl.CrawlDatum.class, + org.apache.nutch.crawl.Inlink.class, + org.apache.nutch.crawl.Inlinks.class, + org.apache.nutch.indexer.NutchIndexAction.class, + org.apache.nutch.metadata.Metadata.class, + org.apache.nutch.parse.Outlink.class, + org.apache.nutch.parse.ParseText.class, + org.apache.nutch.parse.ParseData.class, + org.apache.nutch.parse.ParseImpl.class, + org.apache.nutch.parse.ParseStatus.class, + org.apache.nutch.protocol.Content.class, + org.apache.nutch.protocol.ProtocolStatus.class, + org.apache.nutch.scoring.webgraph.LinkDatum.class }; } - public NutchWritable() { } + public NutchWritable() { + } public NutchWritable(Writable instance) { set(instance); Modified: nutch/trunk/src/java/org/apache/nutch/crawl/Signature.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/Signature.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/crawl/Signature.java (original) +++ nutch/trunk/src/java/org/apache/nutch/crawl/Signature.java Thu Jan 29 05:38:59 2015 @@ -24,7 +24,7 @@ import org.apache.hadoop.conf.Configurab public abstract class Signature implements Configurable { protected Configuration conf; - + public abstract byte[] calculate(Content content, Parse parse); public Configuration getConf() { Modified: nutch/trunk/src/java/org/apache/nutch/crawl/SignatureComparator.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/SignatureComparator.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/crawl/SignatureComparator.java (original) +++ nutch/trunk/src/java/org/apache/nutch/crawl/SignatureComparator.java Thu Jan 29 05:38:59 2015 @@ -23,25 +23,34 @@ public class SignatureComparator impleme public int compare(Object o1, Object o2) { return _compare(o1, o2); } - + public static int _compare(Object o1, Object o2) { - if (o1 == null && o2 == null) return 0; - if (o1 == null) return -1; - if (o2 == null) return 1; - if (!(o1 instanceof 
Modified: nutch/trunk/src/java/org/apache/nutch/crawl/Signature.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/Signature.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/Signature.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/Signature.java Thu Jan 29 05:38:59 2015
@@ -24,7 +24,7 @@ import org.apache.hadoop.conf.Configurab
 public abstract class Signature implements Configurable {
   protected Configuration conf;
-  
+
   public abstract byte[] calculate(Content content, Parse parse);
 
   public Configuration getConf() {

Modified: nutch/trunk/src/java/org/apache/nutch/crawl/SignatureComparator.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/SignatureComparator.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/SignatureComparator.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/SignatureComparator.java Thu Jan 29 05:38:59 2015
@@ -23,25 +23,34 @@ public class SignatureComparator impleme
   public int compare(Object o1, Object o2) {
     return _compare(o1, o2);
   }
-  
+
   public static int _compare(Object o1, Object o2) {
-    if (o1 == null && o2 == null) return 0;
-    if (o1 == null) return -1;
-    if (o2 == null) return 1;
-    if (!(o1 instanceof byte[])) return -1;
-    if (!(o2 instanceof byte[])) return 1;
-    byte[] data1 = (byte[])o1;
-    byte[] data2 = (byte[])o2;
+    if (o1 == null && o2 == null)
+      return 0;
+    if (o1 == null)
+      return -1;
+    if (o2 == null)
+      return 1;
+    if (!(o1 instanceof byte[]))
+      return -1;
+    if (!(o2 instanceof byte[]))
+      return 1;
+    byte[] data1 = (byte[]) o1;
+    byte[] data2 = (byte[]) o2;
     return _compare(data1, 0, data1.length, data2, 0, data2.length);
   }
-  
-  public static int _compare(byte[] data1, int s1, int l1, byte[] data2, int s2, int l2) {
-    if (l2 > l1) return -1;
-    if (l2 < l1) return 1;
+
+  public static int _compare(byte[] data1, int s1, int l1, byte[] data2,
+      int s2, int l2) {
+    if (l2 > l1)
+      return -1;
+    if (l2 < l1)
+      return 1;
     int res = 0;
     for (int i = 0; i < l1; i++) {
       res = (data1[s1 + i] - data2[s2 + i]);
-      if (res != 0) return res;
+      if (res != 0)
+        return res;
     }
     return 0;
   }

Modified: nutch/trunk/src/java/org/apache/nutch/crawl/SignatureFactory.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/SignatureFactory.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/SignatureFactory.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/SignatureFactory.java Thu Jan 29 05:38:59 2015
@@ -27,28 +27,30 @@ import org.apache.nutch.util.ObjectCache
 /**
  * Factory class, which instantiates a Signature implementation according to the
- * current Configuration configuration. This newly created instance is cached in the
- * Configuration instance, so that it could be later retrieved.
+ * current Configuration configuration. This newly created instance is cached in
+ * the Configuration instance, so that it could be later retrieved.
  *
  * @author Andrzej Bialecki <[email protected]>
  */
 public class SignatureFactory {
-  private static final Logger LOG = LoggerFactory.getLogger(SignatureFactory.class);
+  private static final Logger LOG = LoggerFactory
+      .getLogger(SignatureFactory.class);
 
-  private SignatureFactory() {} // no public ctor
+  private SignatureFactory() {
+  } // no public ctor
 
   /** Return the default Signature implementation. */
   public synchronized static Signature getSignature(Configuration conf) {
     String clazz = conf.get("db.signature.class", MD5Signature.class.getName());
     ObjectCache objectCache = ObjectCache.get(conf);
-    Signature impl = (Signature)objectCache.getObject(clazz);
+    Signature impl = (Signature) objectCache.getObject(clazz);
     if (impl == null) {
       try {
         if (LOG.isInfoEnabled()) {
           LOG.info("Using Signature impl: " + clazz);
         }
         Class<?> implClass = Class.forName(clazz);
-        impl = (Signature)implClass.newInstance();
+        impl = (Signature) implClass.newInstance();
         impl.setConf(conf);
         objectCache.setObject(clazz, impl);
       } catch (Exception e) {
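The SignatureFactory hunk documents the lookup-and-cache behaviour: the class named by db.signature.class (MD5Signature by default) is instantiated once per Configuration and then served from ObjectCache. A minimal usage sketch, not part of the commit; the commented-out property value is only an example.

import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.crawl.Signature;
import org.apache.nutch.crawl.SignatureFactory;
import org.apache.nutch.util.NutchConfiguration;

public class SignatureFactorySketch {
  public static void main(String[] args) {
    Configuration conf = NutchConfiguration.create();
    // Optionally switch the implementation; MD5Signature is the default above.
    // conf.set("db.signature.class", "org.apache.nutch.crawl.TextProfileSignature");
    Signature first = SignatureFactory.getSignature(conf);
    Signature second = SignatureFactory.getSignature(conf);
    // Same Configuration, same cached instance, per the ObjectCache lookup above.
    System.out.println(first == second);
  }
}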
Modified: nutch/trunk/src/java/org/apache/nutch/crawl/TextMD5Signature.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/TextMD5Signature.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/TextMD5Signature.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/TextMD5Signature.java Thu Jan 29 05:38:59 2015
@@ -22,9 +22,9 @@ import org.apache.nutch.parse.Parse;
 import org.apache.nutch.protocol.Content;
 
 /**
- * Implementation of a page signature. It calculates an MD5 hash
- * of the textual content of a page. In case there is no content, it
- * calculates a hash from the page's URL.
+ * Implementation of a page signature. It calculates an MD5 hash of the textual
+ * content of a page. In case there is no content, it calculates a hash from the
+ * page's URL.
  */
 public class TextMD5Signature extends Signature {
 
@@ -36,7 +36,7 @@ public class TextMD5Signature extends Si
     if (text == null || text.length() == 0) {
       return fallback.calculate(content, parse);
     }
-    
+
     return MD5Hash.digest(text).getDigest();
   }
 }
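As the rewrapped javadoc above says, TextMD5Signature boils a page down to the MD5 of its parse text, delegating to the MD5Signature fallback when there is no text. The only hashing call involved is MD5Hash.digest(...).getDigest(); below is a self-contained illustration with a made-up input string, not code from the commit.

import org.apache.hadoop.io.MD5Hash;

public class TextMD5Sketch {
  public static void main(String[] args) {
    String text = "Plain text extracted from a parsed page";
    byte[] signature = MD5Hash.digest(text).getDigest();
    System.out.println(signature.length);      // 16-byte MD5 digest
    System.out.println(MD5Hash.digest(text));  // same hash, hex-encoded
  }
}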
Modified: nutch/trunk/src/java/org/apache/nutch/crawl/TextProfileSignature.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/TextProfileSignature.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/TextProfileSignature.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/TextProfileSignature.java Thu Jan 29 05:38:59 2015
@@ -35,41 +35,50 @@ import org.apache.nutch.util.StringUtil;
 import org.apache.nutch.util.NutchConfiguration;
 
 /**
- * <p>An implementation of a page signature. It calculates an MD5 hash
- * of a plain text "profile" of a page. In case there is no text, it
- * calculates a hash using the {@link MD5Signature}.</p>
- * <p>The algorithm to calculate a page "profile" takes the plain text version of
- * a page and performs the following steps:
+ * <p>
+ * An implementation of a page signature. It calculates an MD5 hash of a plain
+ * text "profile" of a page. In case there is no text, it calculates a hash
+ * using the {@link MD5Signature}.
+ * </p>
+ * <p>
+ * The algorithm to calculate a page "profile" takes the plain text version of a
+ * page and performs the following steps:
  * <ul>
  * <li>remove all characters except letters and digits, and bring all characters
  * to lower case,</li>
 * <li>split the text into tokens (all consecutive non-whitespace characters),</li>
- * <li>discard tokens equal or shorter than MIN_TOKEN_LEN (default 2 characters),</li>
+ * <li>discard tokens equal or shorter than MIN_TOKEN_LEN (default 2
+ * characters),</li>
 * <li>sort the list of tokens by decreasing frequency,</li>
- * <li>round down the counts of tokens to the nearest multiple of QUANT
- * (<code>QUANT = QUANT_RATE * maxFreq</code>, where <code>QUANT_RATE</code> is 0.01f
- * by default, and <code>maxFreq</code> is the maximum token frequency). If
- * <code>maxFreq</code> is higher than 1, then QUANT is always higher than 2 (which
- * means that tokens with frequency 1 are always discarded).</li>
- * <li>tokens, which frequency after quantization falls below QUANT, are discarded.</li>
- * <li>create a list of tokens and their quantized frequency, separated by spaces,
- * in the order of decreasing frequency.</li>
+ * <li>round down the counts of tokens to the nearest multiple of QUANT (
+ * <code>QUANT = QUANT_RATE * maxFreq</code>, where <code>QUANT_RATE</code> is
+ * 0.01f by default, and <code>maxFreq</code> is the maximum token frequency).
+ * If <code>maxFreq</code> is higher than 1, then QUANT is always higher than 2
+ * (which means that tokens with frequency 1 are always discarded).</li>
+ * <li>tokens, which frequency after quantization falls below QUANT, are
+ * discarded.</li>
+ * <li>create a list of tokens and their quantized frequency, separated by
+ * spaces, in the order of decreasing frequency.</li>
 * </ul>
 * This list is then submitted to an MD5 hash calculation.
 *
 * @author Andrzej Bialecki <[email protected]>
 */
 public class TextProfileSignature extends Signature {
-  
+
   Signature fallback = new MD5Signature();
 
   public byte[] calculate(Content content, Parse parse) {
-    int MIN_TOKEN_LEN = getConf().getInt("db.signature.text_profile.min_token_len", 2);
-    float QUANT_RATE = getConf().getFloat("db.signature.text_profile.quant_rate", 0.01f);
+    int MIN_TOKEN_LEN = getConf().getInt(
+        "db.signature.text_profile.min_token_len", 2);
+    float QUANT_RATE = getConf().getFloat(
+        "db.signature.text_profile.quant_rate", 0.01f);
     HashMap<String, Token> tokens = new HashMap<String, Token>();
     String text = null;
-    if (parse != null) text = parse.getText();
-    if (text == null || text.length() == 0) return fallback.calculate(content, parse);
+    if (parse != null)
+      text = parse.getText();
+    if (text == null || text.length() == 0)
+      return fallback.calculate(content, parse);
     StringBuffer curToken = new StringBuffer();
     int maxFreq = 0;
     for (int i = 0; i < text.length(); i++) {
@@ -87,7 +96,8 @@ public class TextProfileSignature extend
           tokens.put(s, tok);
         }
         tok.cnt++;
-        if (tok.cnt > maxFreq) maxFreq = tok.cnt;
+        if (tok.cnt > maxFreq)
+          maxFreq = tok.cnt;
       }
       curToken.setLength(0);
     }
@@ -103,17 +113,20 @@ public class TextProfileSignature extend
         tokens.put(s, tok);
       }
       tok.cnt++;
-      if (tok.cnt > maxFreq) maxFreq = tok.cnt;
+      if (tok.cnt > maxFreq)
+        maxFreq = tok.cnt;
     }
     Iterator<Token> it = tokens.values().iterator();
     ArrayList<Token> profile = new ArrayList<Token>();
     // calculate the QUANT value
     int QUANT = Math.round(maxFreq * QUANT_RATE);
     if (QUANT < 2) {
-      if (maxFreq > 1) QUANT = 2;
-      else QUANT = 1;
+      if (maxFreq > 1)
+        QUANT = 2;
+      else
+        QUANT = 1;
     }
-    while(it.hasNext()) {
+    while (it.hasNext()) {
       Token t = it.next();
       // round down to the nearest QUANT
       t.cnt = (t.cnt / QUANT) * QUANT;
@@ -128,32 +141,33 @@ public class TextProfileSignature extend
     it = profile.iterator();
     while (it.hasNext()) {
       Token t = it.next();
-      if (newText.length() > 0) newText.append("\n");
+      if (newText.length() > 0)
+        newText.append("\n");
       newText.append(t.toString());
     }
     return MD5Hash.digest(newText.toString()).getDigest();
   }
-  
+
   private static class Token {
     public int cnt;
     public String val;
-    
+
     public Token(int cnt, String val) {
       this.cnt = cnt;
      this.val = val;
    }
-    
    public String toString() {
      return val + " " + cnt;
    }
  }
-  
  private static class TokenComparator implements Comparator<Token> {
    public int compare(Token t1, Token t2) {
      return t2.cnt - t1.cnt;
    }
  }
-  
  public static void main(String[] args) throws Exception {
    TextProfileSignature sig = new TextProfileSignature();
    sig.setConf(NutchConfiguration.create());
@@ -161,15 +175,18 @@ public class TextProfileSignature extend
     File[] files = new File(args[0]).listFiles();
     for (int i = 0; i < files.length; i++) {
       FileInputStream fis = new FileInputStream(files[i]);
-      BufferedReader br = new BufferedReader(new InputStreamReader(fis, "UTF-8"));
+      BufferedReader br = new BufferedReader(
+          new InputStreamReader(fis, "UTF-8"));
       StringBuffer text = new StringBuffer();
       String line = null;
       while ((line = br.readLine()) != null) {
-        if (text.length() > 0) text.append("\n");
+        if (text.length() > 0)
+          text.append("\n");
         text.append(line);
       }
       br.close();
-      byte[] signature = sig.calculate(null, new ParseImpl(text.toString(), null));
+      byte[] signature = sig.calculate(null, new ParseImpl(text.toString(),
+          null));
       res.put(files[i].toString(), signature);
     }
     Iterator<String> it = res.keySet().iterator();
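The javadoc above fully specifies the profile algorithm: lowercase and strip non-alphanumerics, tokenize, drop short tokens, quantize counts by QUANT, sort by frequency, then hash the token/count list. As an illustrative aside that is not part of the commit, the sketch below feeds two near-duplicate strings through the class the same way its own main() does (ParseImpl with a null ParseData); because punctuation and low-frequency differences quantize away, the two profiles usually hash to the same signature. The sample strings are made up.

import org.apache.nutch.crawl.TextProfileSignature;
import org.apache.nutch.parse.ParseImpl;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.StringUtil;

public class TextProfileSketch {
  public static void main(String[] args) {
    TextProfileSignature sig = new TextProfileSignature();
    sig.setConf(NutchConfiguration.create());
    byte[] a = sig.calculate(null, new ParseImpl(
        "the quick brown fox jumps over the lazy dog the fox", null));
    byte[] b = sig.calculate(null, new ParseImpl(
        "The quick brown fox -- jumps over the lazy dog; the FOX!", null));
    System.out.println(StringUtil.toHexString(a));
    System.out.println(StringUtil.toHexString(b)); // typically identical to the first
  }
}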
Modified: nutch/trunk/src/java/org/apache/nutch/crawl/URLPartitioner.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/URLPartitioner.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/URLPartitioner.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/URLPartitioner.java Thu Jan 29 05:38:59 2015
@@ -33,8 +33,9 @@ import org.apache.nutch.util.URLUtil;
 * Partition urls by host, domain name or IP depending on the value of the
 * parameter 'partition.url.mode' which can be 'byHost', 'byDomain' or 'byIP'
 */
-public class URLPartitioner implements Partitioner<Text,Writable> {
-  private static final Logger LOG = LoggerFactory.getLogger(URLPartitioner.class);
+public class URLPartitioner implements Partitioner<Text, Writable> {
+  private static final Logger LOG = LoggerFactory
+      .getLogger(URLPartitioner.class);
 
   public static final String PARTITION_MODE_KEY = "partition.url.mode";
 
@@ -58,7 +59,8 @@ public class URLPartitioner implements P
     normalizers = new URLNormalizers(job, URLNormalizers.SCOPE_PARTITION);
   }
 
-  public void close() {}
+  public void close() {
+  }
 
   /** Hash by domain name. */
   public int getPartition(Text key, Writable value, int numReduceTasks) {
@@ -66,15 +68,16 @@ public class URLPartitioner implements P
     URL url = null;
     int hashCode = urlString.hashCode();
     try {
-      urlString = normalizers.normalize(urlString, URLNormalizers.SCOPE_PARTITION);
+      urlString = normalizers.normalize(urlString,
+          URLNormalizers.SCOPE_PARTITION);
       url = new URL(urlString);
       hashCode = url.getHost().hashCode();
     } catch (MalformedURLException e) {
       LOG.warn("Malformed URL: '" + urlString + "'");
     }
-    if (mode.equals(PARTITION_MODE_DOMAIN) && url != null) hashCode = URLUtil
-        .getDomainName(url).hashCode();
+    if (mode.equals(PARTITION_MODE_DOMAIN) && url != null)
+      hashCode = URLUtil.getDomainName(url).hashCode();
     else if (mode.equals(PARTITION_MODE_IP)) {
       try {
         InetAddress address = InetAddress.getByName(url.getHost());
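The URLPartitioner hunk is truncated here, but the class comment above states the contract: the reduce partition is derived from the URL's host, domain or IP, selected by the partition.url.mode property ('byHost', 'byDomain' or 'byIP'). The standalone sketch below is illustrative only, with made-up URLs; in a real job the class is installed on the JobConf rather than called by hand, and the partition values depend on the configured normalizers.

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.nutch.crawl.URLPartitioner;
import org.apache.nutch.util.NutchConfiguration;

public class URLPartitionerSketch {
  public static void main(String[] args) {
    JobConf job = new JobConf(NutchConfiguration.create());
    job.set(URLPartitioner.PARTITION_MODE_KEY, "byHost");
    URLPartitioner partitioner = new URLPartitioner();
    partitioner.configure(job);
    int reducers = 10;
    int a = partitioner.getPartition(new Text("http://example.com/a.html"),
        NullWritable.get(), reducers);
    int b = partitioner.getPartition(new Text("http://example.com/b.html"),
        NullWritable.get(), reducers);
    System.out.println(a == b); // same host, so both URLs land in the same partition
  }
}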
