Author: markus Date: Fri Jul 20 14:22:19 2012 New Revision: 1363793 URL: http://svn.apache.org/viewvc?rev=1363793&view=rev Log: NUTCH-1388 Optionally maintain custom fetch interval despite AdaptiveFetchSchedule
Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java nutch/trunk/src/java/org/apache/nutch/metadata/Nutch.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1363793&r1=1363792&r2=1363793&view=diff ============================================================================== --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Fri Jul 20 14:22:19 2012 @@ -2,6 +2,10 @@ Nutch Change Log (trunk) Current Development: +* NUTCH-1388 Optionally maintain custom fetch interval despite AdaptiveFetchSchedule (markus) + +* NUTCH-1430 Freegenerator records overwrite CrawlDB records with AdaptiveFetchSchedule (markus) + * NUTCH-1087 Deprecate crawl command and replace with example script (jnioche) * NUTCH-1306 Add option to not commit and clarify existing solr.commit.size (ferdy) Modified: nutch/trunk/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java?rev=1363793&r1=1363792&r2=1363793&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java (original) +++ nutch/trunk/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java Fri Jul 20 14:22:19 2012 @@ -94,9 +94,9 @@ public class AdaptiveFetchSchedule exten float interval = datum.getFetchInterval(); long refTime = fetchTime; - if (datum.getMetaData().containsKey(Nutch.WRITABLE_CUSTOM_INTERVAL_KEY)) { + if (datum.getMetaData().containsKey(Nutch.WRITABLE_FIXED_INTERVAL_KEY)) { // Is fetch interval preset in CrawlDatum MD? 
Then use preset interval - FloatWritable customIntervalWritable= (FloatWritable)(datum.getMetaData().get(Nutch.WRITABLE_CUSTOM_INTERVAL_KEY)); + FloatWritable customIntervalWritable= (FloatWritable)(datum.getMetaData().get(Nutch.WRITABLE_FIXED_INTERVAL_KEY)); interval = customIntervalWritable.get(); } else { if (modifiedTime <= 0) modifiedTime = fetchTime; Modified: nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java?rev=1363793&r1=1363792&r2=1363793&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java (original) +++ nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java Fri Jul 20 14:22:19 2012 @@ -32,6 +32,7 @@ import org.apache.hadoop.mapred.*; import org.apache.hadoop.util.*; import org.apache.nutch.net.*; +import org.apache.nutch.metadata.Nutch; import org.apache.nutch.scoring.ScoringFilterException; import org.apache.nutch.scoring.ScoringFilters; import org.apache.nutch.util.NutchConfiguration; @@ -45,6 +46,7 @@ import org.apache.nutch.util.TimingUtil; * Note that some metadata keys are reserved : <br> * - <i>nutch.score</i> : allows to set a custom score for a specific URL <br> * - <i>nutch.fetchInterval</i> : allows to set a custom fetch interval for a specific URL <br> + * - <i>nutch.fetchInterval.fixed</i> : allows to set a custom fetch interval for a specific URL that is not changed by AdaptiveFetchSchedule <br> * e.g. 
http://www.nutch.org/ \t nutch.score=10 \t nutch.fetchInterval=2592000 \t userType=open_source **/ public class Injector extends Configured implements Tool { @@ -54,6 +56,8 @@ public class Injector extends Configured public static String nutchScoreMDName = "nutch.score"; /** metadata key reserved for setting a custom fetchInterval for a specific URL */ public static String nutchFetchIntervalMDName = "nutch.fetchInterval"; + /** metadata key reserved for setting a fixed custom fetchInterval for a specific URL */ + public static String nutchFixedFetchIntervalMDName = "nutch.fetchInterval.fixed"; /** Normalize and filter injected urls. */ public static class InjectMapper implements Mapper<WritableComparable, Text, Text, CrawlDatum> { @@ -91,6 +95,7 @@ public class Injector extends Configured // must be name=value and separated by \t float customScore = -1f; int customInterval = interval; + int fixedInterval = -1; Map<String,String> metadata = new TreeMap<String,String>(); if (url.indexOf("\t")!=-1){ String[] splits = url.split("\t"); @@ -109,11 +114,16 @@ public class Injector extends Configured customScore = Float.parseFloat(metavalue);} catch (NumberFormatException nfe){} } - else if (metaname.equals(nutchFetchIntervalMDName)) { - try { - customInterval = Integer.parseInt(metavalue);} - catch (NumberFormatException nfe){} - } + else if (metaname.equals(nutchFetchIntervalMDName)) { + try { + customInterval = Integer.parseInt(metavalue);} + catch (NumberFormatException nfe){} + } + else if (metaname.equals(nutchFixedFetchIntervalMDName)) { + try { + fixedInterval = Integer.parseInt(metavalue);} + catch (NumberFormatException nfe){} + } else metadata.put(metaname,metavalue); } } @@ -126,7 +136,18 @@ public class Injector extends Configured } if (url != null) { // if it passes value.set(url); // collect it - CrawlDatum datum = new CrawlDatum(CrawlDatum.STATUS_INJECTED, customInterval); + CrawlDatum datum = new CrawlDatum(); + datum.setStatus(CrawlDatum.STATUS_INJECTED); 
+ + // Is interval custom? Then set as meta data + if (fixedInterval > -1) { + // Set writable using float. Float is used by AdaptiveFetchSchedule + datum.getMetaData().put(Nutch.WRITABLE_FIXED_INTERVAL_KEY, new FloatWritable(fixedInterval)); + datum.setFetchInterval(fixedInterval); + } else { + datum.setFetchInterval(customInterval); + } + datum.setFetchTime(curTime); // now add the metadata Iterator<String> keysIter = metadata.keySet().iterator(); Modified: nutch/trunk/src/java/org/apache/nutch/metadata/Nutch.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/metadata/Nutch.java?rev=1363793&r1=1363792&r2=1363793&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/metadata/Nutch.java (original) +++ nutch/trunk/src/java/org/apache/nutch/metadata/Nutch.java Fri Jul 20 14:22:19 2012 @@ -68,7 +68,7 @@ public interface Nutch { public static final Text WRITABLE_REPR_URL_KEY = new Text(REPR_URL_KEY); /** Used by AdaptiveFetchSchedule to maintain custom fetch interval */ - public static final String CUSTOM_INTERVAL_KEY = "interval"; + public static final String FIXED_INTERVAL_KEY = "fixedInterval"; - public static final Text WRITABLE_CUSTOM_INTERVAL_KEY = new Text(CUSTOM_INTERVAL_KEY); + public static final Text WRITABLE_FIXED_INTERVAL_KEY = new Text(FIXED_INTERVAL_KEY); }