Author: dogacan
Date: Mon Sep 3 06:37:24 2007
New Revision: 572335
URL: http://svn.apache.org/viewvc?rev=572335&view=rev
Log: NUTCH-532 - CrawlDbMerger: wrong computation of last fetch time. Contributed by Emmanuel Joke.
Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java lucene/nutch/trunk/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbMerger.java lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java lucene/nutch/trunk/src/java/org/apache/nutch/crawl/DefaultFetchSchedule.java lucene/nutch/trunk/src/java/org/apache/nutch/crawl/FetchSchedule.java lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestMapWritable.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=572335&r1=572334&r2=572335&view=diff ============================================================================== --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Mon Sep 3 06:37:24 2007 @@ -126,6 +126,9 @@ 42. NUTCH-545 - Configuration and OnlineClusterer get initialized in every request. (Dawid Weiss via dogacan) +43. NUTCH-532 - CrawlDbMerger: wrong computation of last fetch time. + (Emmanuel Joke via dogacan) + Release 0.9 - 2007-04-02 1. 
Changed log4j confiquration to log to stdout on commandline Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java?rev=572335&r1=572334&r2=572335&view=diff ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java Mon Sep 3 06:37:24 2007 @@ -33,8 +33,8 @@ public abstract class AbstractFetchSchedule extends Configured implements FetchSchedule { private static final Log LOG = LogFactory.getLog(AbstractFetchSchedule.class); - private float defaultInterval; - private float maxInterval; + private int defaultInterval; + private int maxInterval; public AbstractFetchSchedule() { super(null); @@ -48,9 +48,11 @@ super.setConf(conf); if (conf == null) return; int oldDefaultInterval = conf.getInt("db.default.fetch.interval", 0); - defaultInterval = conf.getFloat("db.fetch.interval.default", 0); + defaultInterval = conf.getInt("db.fetch.interval.default", 0); if (oldDefaultInterval > 0 && defaultInterval == 0) defaultInterval = oldDefaultInterval * SECONDS_PER_DAY; - maxInterval = conf.getFloat("db.fetch.interval.max", 30.0f * SECONDS_PER_DAY); + int oldMaxInterval = conf.getInt("db.max.fetch.interval", 0); + maxInterval = conf.getInt("db.fetch.interval.max", 0 ); + if (oldMaxInterval > 0 && maxInterval == 0) maxInterval = oldMaxInterval * FetchSchedule.SECONDS_PER_DAY; LOG.info("defaultInterval=" + defaultInterval); LOG.info("maxInterval=" + maxInterval); } @@ -91,7 +93,7 @@ // no page is truly GONE ... just increase the interval by 50% // and try much later. 
datum.setFetchInterval(datum.getFetchInterval() * 1.5f); - datum.setFetchTime(fetchTime + Math.round(datum.getFetchInterval() * 1000.0d)); + datum.setFetchTime(fetchTime + (long)datum.getFetchInterval() * 1000); if (maxInterval < datum.getFetchInterval()) forceRefetch(url, datum, false); return datum; } @@ -117,6 +119,14 @@ } /** + * This method return the last fetch time of the CrawlDatum + * @return the date as a long. + */ + public long calculateLastFetchTime(CrawlDatum datum){ + return datum.getFetchTime() - (long)datum.getFetchInterval() * 1000; + } + + /** * This method provides information whether the page is suitable for * selection in the current fetchlist. NOTE: a true return value does not * guarantee that the page will be fetched, it just allows it to be @@ -136,7 +146,7 @@ // pages are never truly GONE - we have to check them from time to time. // pages with too long fetchInterval are adjusted so that they fit within // maximum fetchInterval (segment retention period). - if (datum.getFetchTime() - curTime > maxInterval * 1000) { + if (datum.getFetchTime() - curTime > (long) maxInterval * 1000) { datum.setFetchInterval(maxInterval * 0.9f); datum.setFetchTime(curTime); } Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java?rev=572335&r1=572334&r2=572335&view=diff ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java Mon Sep 3 06:37:24 2007 @@ -57,9 +57,9 @@ private float DEC_RATE; - private float MAX_INTERVAL; + private int MAX_INTERVAL; - private float MIN_INTERVAL; + private int MIN_INTERVAL; private boolean SYNC_DELTA; @@ -70,8 +70,8 @@ if (conf == null) return; INC_RATE = 
conf.getFloat("db.fetch.schedule.adaptive.inc_rate", 0.2f); DEC_RATE = conf.getFloat("db.fetch.schedule.adaptive.dec_rate", 0.2f); - MIN_INTERVAL = conf.getFloat("db.fetch.schedule.adaptive.min_interval", 60.0f); - MAX_INTERVAL = conf.getFloat("db.fetch.schedule.adaptive.max_interval", (float) (3600 * 24 * 365)); // 1 year + MIN_INTERVAL = conf.getInt("db.fetch.schedule.adaptive.min_interval", 60); + MAX_INTERVAL = conf.getInt("db.fetch.schedule.adaptive.max_interval", SECONDS_PER_DAY * 365 ); // 1 year SYNC_DELTA = conf.getBoolean("db.fetch.schedule.adaptive.sync_delta", true); SYNC_DELTA_RATE = conf.getFloat("db.fetch.schedule.adaptive.sync_delta_rate", 0.2f); } @@ -101,7 +101,7 @@ } if (interval < MIN_INTERVAL) interval = MIN_INTERVAL; if (interval > MAX_INTERVAL) interval = MAX_INTERVAL; - datum.setFetchTime(refTime + Math.round(1000.0f * datum.getFetchInterval())); + datum.setFetchTime(refTime + (long)datum.getFetchInterval() * 1000 ); datum.setModifiedTime(modifiedTime); return datum; } @@ -134,14 +134,14 @@ lastModified = curTime; } System.out.println(i + ". " + changed + "\twill fetch at " + (p.getFetchTime() / delta) + "\tinterval " - + (p.getFetchInterval() / (float) (3600 * 24)) + " days" + "\t missed " + miss); + + (p.getFetchInterval() / SECONDS_PER_DAY ) + " days" + "\t missed " + miss); if (p.getFetchTime() <= curTime) { fetchCnt++; fs.setFetchSchedule(new Text("http://www.example.com"), p, p.getFetchTime(), p.getModifiedTime(), curTime, lastModified, changed ? 
FetchSchedule.STATUS_MODIFIED : FetchSchedule.STATUS_NOTMODIFIED); System.out.println("\tfetched & adjusted: " + "\twill fetch at " + (p.getFetchTime() / delta) + "\tinterval " - + (p.getFetchInterval() / (float) (3600 * 24)) + " days"); + + (p.getFetchInterval() / SECONDS_PER_DAY ) + " days"); if (!changed) miss++; if (miss > maxMiss) maxMiss = miss; changed = false; Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java?rev=572335&r1=572334&r2=572335&view=diff ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java Mon Sep 3 06:37:24 2007 @@ -29,7 +29,7 @@ public static final String FETCH_DIR_NAME = "crawl_fetch"; public static final String PARSE_DIR_NAME = "crawl_parse"; - private final static byte CUR_VERSION = 5; + private final static byte CUR_VERSION = 6; /** Compatibility values for on-the-fly conversion from versions < 5. 
*/ private static final byte OLD_STATUS_SIGNATURE = 0; @@ -114,7 +114,7 @@ private byte status; private long fetchTime = System.currentTimeMillis(); private byte retries; - private float fetchInterval; + private int fetchInterval; private float score = 1.0f; private byte[] signature = null; private long modifiedTime; @@ -134,12 +134,12 @@ metaData = new MapWritable(); } - public CrawlDatum(int status, float fetchInterval) { + public CrawlDatum(int status, int fetchInterval) { this.status = (byte)status; this.fetchInterval = fetchInterval; } - public CrawlDatum(int status, float fetchInterval, float score) { + public CrawlDatum(int status, int fetchInterval, float score) { this(status, fetchInterval); this.score = score; } @@ -172,10 +172,13 @@ public byte getRetriesSinceFetch() { return retries; } public void setRetriesSinceFetch(int retries) {this.retries = (byte)retries;} - public float getFetchInterval() { return fetchInterval; } - public void setFetchInterval(float fetchInterval) { + public int getFetchInterval() { return fetchInterval; } + public void setFetchInterval(int fetchInterval) { this.fetchInterval = fetchInterval; } + public void setFetchInterval(float fetchInterval) { + this.fetchInterval = Math.round(fetchInterval); + } public float getScore() { return score; } public void setScore(float score) { this.score = score; } @@ -221,7 +224,9 @@ status = in.readByte(); fetchTime = in.readLong(); retries = in.readByte(); - fetchInterval = in.readFloat(); + if (version > 5) { + fetchInterval = in.readInt(); + } else fetchInterval = Math.round(in.readFloat()); score = in.readFloat(); if (version > 2) { modifiedTime = in.readLong(); @@ -256,7 +261,7 @@ out.writeByte(status); out.writeLong(fetchTime); out.writeByte(retries); - out.writeFloat(fetchInterval); + out.writeInt(fetchInterval); out.writeFloat(score); out.writeLong(modifiedTime); if (signature == null) { @@ -330,8 +335,8 @@ int retries2 = b2[s2+1+1+8]; if (retries2 != retries1) return retries2 - 
retries1; - float fetchInterval1 = readFloat(b1, s1+1+1+8+1); - float fetchInterval2 = readFloat(b2, s2+1+1+8+1); + int fetchInterval1 = readInt(b1, s1+1+1+8+1); + int fetchInterval2 = readInt(b2, s2+1+1+8+1); if (fetchInterval2 != fetchInterval1) return (fetchInterval2 - fetchInterval1) > 0 ? 1 : -1; long modifiedTime1 = readLong(b1, s1 + SCORE_OFFSET + 4); @@ -409,7 +414,7 @@ ((int)fetchTime) ^ ((int)modifiedTime) ^ retries ^ - Float.floatToIntBits(fetchInterval) ^ + fetchInterval ^ Float.floatToIntBits(score); } Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbMerger.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbMerger.java?rev=572335&r1=572334&r2=572335&view=diff ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbMerger.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbMerger.java Mon Sep 3 06:37:24 2007 @@ -55,10 +55,12 @@ public static class Merger extends MapReduceBase implements Reducer { MapWritable meta = new MapWritable(); + private FetchSchedule schedule; public void close() throws IOException {} public void configure(JobConf conf) { + schedule = FetchScheduleFactory.getFetchSchedule(conf); } public void reduce(WritableComparable key, Iterator values, OutputCollector output, Reporter reporter) @@ -70,17 +72,17 @@ CrawlDatum val = (CrawlDatum) values.next(); if (res == null) { res = val; - resTime = res.getFetchTime() - Math.round(res.getFetchInterval() * 3600 * 24 * 1000); + resTime = schedule.calculateLastFetchTime(res); meta.putAll(res.getMetaData()); continue; } // compute last fetch time, and pick the latest - long valTime = val.getFetchTime() - Math.round(val.getFetchInterval() * 3600 * 24 * 1000); + long valTime = schedule.calculateLastFetchTime(val); if (valTime > resTime) { // collect all metadata, newer values override older values 
meta.putAll(val.getMetaData()); res = val; - resTime = res.getFetchTime() - Math.round(res.getFetchInterval() * 3600 * 24 * 1000); + resTime = valTime ; } else { // insert older metadata before newer val.getMetaData().putAll(meta); Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java?rev=572335&r1=572334&r2=572335&view=diff ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java Mon Sep 3 06:37:24 2007 @@ -40,14 +40,16 @@ private ArrayList<CrawlDatum> linked = new ArrayList<CrawlDatum>(); private ScoringFilters scfilters = null; private boolean additionsAllowed; - private float maxInterval; + private int maxInterval; private FetchSchedule schedule; public void configure(JobConf job) { retryMax = job.getInt("db.fetch.retry.max", 3); scfilters = new ScoringFilters(job); additionsAllowed = job.getBoolean(CrawlDb.CRAWLDB_ADDITIONS_ALLOWED, true); - maxInterval = (float)(job.getInt("db.max.fetch.interval", 30) * 3600 * 24); + int oldMaxInterval = job.getInt("db.max.fetch.interval", 0); + maxInterval = job.getInt("db.fetch.interval.max", 0 ); + if (oldMaxInterval > 0 && maxInterval == 0) maxInterval = oldMaxInterval * FetchSchedule.SECONDS_PER_DAY; schedule = FetchScheduleFactory.getFetchSchedule(job); } Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/DefaultFetchSchedule.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/DefaultFetchSchedule.java?rev=572335&r1=572334&r2=572335&view=diff ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/DefaultFetchSchedule.java (original) +++ 
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/DefaultFetchSchedule.java Mon Sep 3 06:37:24 2007 @@ -32,7 +32,7 @@ public CrawlDatum setFetchSchedule(Text url, CrawlDatum datum, long prevFetchTime, long prevModifiedTime, long fetchTime, long modifiedTime, int state) { - datum.setFetchTime(fetchTime + Math.round(datum.getFetchInterval() * 1000.0d)); + datum.setFetchTime(fetchTime + (long)datum.getFetchInterval() * 1000); datum.setModifiedTime(modifiedTime); return datum; } Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/FetchSchedule.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/FetchSchedule.java?rev=572335&r1=572334&r2=572335&view=diff ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/FetchSchedule.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/FetchSchedule.java Mon Sep 3 06:37:24 2007 @@ -35,7 +35,7 @@ /** Page is known to remain unmodified since our last visit. */ public static final int STATUS_NOTMODIFIED = 2; - public static final float SECONDS_PER_DAY = 3600.0f * 24.0f; + public static final int SECONDS_PER_DAY = 3600 * 24; /** * Initialize fetch schedule related data. Implementations should at least * set the <code>fetchTime</code> and <code>fetchInterval</code>. The default @@ -111,6 +111,12 @@ public CrawlDatum setPageRetrySchedule(Text url, CrawlDatum datum, long prevFetchTime, long prevModifiedTime, long fetchTime); + /** + * Calculates last fetch time of the given CrawlDatum. + * @return the date as a long. + */ + public long calculateLastFetchTime(CrawlDatum datum); + /** * This method provides information whether the page is suitable for * selection in the current fetchlist. 
NOTE: a true return value does not Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java?rev=572335&r1=572334&r2=572335&view=diff ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java Mon Sep 3 06:37:24 2007 @@ -46,7 +46,7 @@ /** Normalize and filter injected urls. */ public static class InjectMapper implements Mapper { private URLNormalizers urlNormalizers; - private float interval; + private int interval; private float scoreInjected; private JobConf jobConf; private URLFilters filters; @@ -57,7 +57,7 @@ public void configure(JobConf job) { this.jobConf = job; urlNormalizers = new URLNormalizers(job, URLNormalizers.SCOPE_INJECT); - interval = jobConf.getFloat("db.fetch.interval.default", 2592000.0f); + interval = jobConf.getInt("db.fetch.interval.default", 2592000); filters = new URLFilters(jobConf); scfilters = new ScoringFilters(jobConf); scoreInjected = jobConf.getFloat("db.score.injected", 1.0f); Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java?rev=572335&r1=572334&r2=572335&view=diff ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java Mon Sep 3 06:37:24 2007 @@ -82,7 +82,7 @@ this.filters = new URLFilters(job); this.scfilters = new ScoringFilters(job); final UrlValidator validator = UrlValidator.get(); - final float interval = job.getFloat("db.fetch.interval.default", 2592000.0f); + final int interval = 
job.getInt("db.fetch.interval.default", 2592000); final boolean ignoreExternalLinks = job.getBoolean("db.ignore.external.links", false); final int maxOutlinks = job.getInt("db.max.outlinks.per.page", 100); final CompressionType compType = SequenceFile.getCompressionType(job); @@ -125,7 +125,7 @@ byte[] signature = StringUtil.fromHexString(sig); if (signature != null) { // append a CrawlDatum with a signature - CrawlDatum d = new CrawlDatum(CrawlDatum.STATUS_SIGNATURE, 0.0f); + CrawlDatum d = new CrawlDatum(CrawlDatum.STATUS_SIGNATURE, 0); d.setSignature(signature); crawlOut.append(key, d); } Modified: lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java?rev=572335&r1=572334&r2=572335&view=diff ============================================================================== --- lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java (original) +++ lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java Mon Sep 3 06:37:24 2007 @@ -341,7 +341,7 @@ * @return Constructed object */ private URLCrawlDatum createURLCrawlDatum(final String url, - final float fetchInterval, final float score) { + final int fetchInterval, final float score) { return new CrawlDBTestUtil.URLCrawlDatum(new Text(url), new CrawlDatum( CrawlDatum.STATUS_DB_UNFETCHED, fetchInterval, score)); } Modified: lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestMapWritable.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestMapWritable.java?rev=572335&r1=572334&r2=572335&view=diff ============================================================================== --- lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestMapWritable.java (original) +++ lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestMapWritable.java Mon Sep 3 06:37:24 2007 @@ -84,7 +84,7 @@ assertEquals(100, datum2.size()); testWritable(datum2); - 
CrawlDatum c = new CrawlDatum(CrawlDatum.STATUS_DB_FETCHED, 1f); + CrawlDatum c = new CrawlDatum(CrawlDatum.STATUS_DB_FETCHED, 1); c.setMetaData(new MapWritable()); for (int i = 0; i < 100; i++) { c.getMetaData().put(new LongWritable(i), new Text("" + 1));