Author: markus
Date: Fri Jul 20 14:22:19 2012
New Revision: 1363793

URL: http://svn.apache.org/viewvc?rev=1363793&view=rev
Log:
NUTCH-1388 Optionally maintain custom fetch interval despite 
AdaptiveFetchSchedule

Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java
    nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java
    nutch/trunk/src/java/org/apache/nutch/metadata/Nutch.java

Modified: nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1363793&r1=1363792&r2=1363793&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Fri Jul 20 14:22:19 2012
@@ -2,6 +2,10 @@ Nutch Change Log
 
 (trunk) Current Development:
 
+* NUTCH-1388 Optionally maintain custom fetch interval despite 
AdaptiveFetchSchedule (markus)
+
+* NUTCH-1430 Freegenerator records overwrite CrawlDB records with 
AdaptiveFetchSchedule (markus)
+
 * NUTCH-1087 Deprecate crawl command and replace with example script (jnioche)
 
 * NUTCH-1306 Add option to not commit and clarify existing solr.commit.size 
(ferdy)

Modified: nutch/trunk/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java?rev=1363793&r1=1363792&r2=1363793&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java 
(original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java Fri 
Jul 20 14:22:19 2012
@@ -94,9 +94,9 @@ public class AdaptiveFetchSchedule exten
     float interval = datum.getFetchInterval();
     long refTime = fetchTime;
 
-    if (datum.getMetaData().containsKey(Nutch.WRITABLE_CUSTOM_INTERVAL_KEY)) {
+    if (datum.getMetaData().containsKey(Nutch.WRITABLE_FIXED_INTERVAL_KEY)) {
       // Is fetch interval preset in CrawlDatum MD? Then use preset interval
-      FloatWritable customIntervalWritable= 
(FloatWritable)(datum.getMetaData().get(Nutch.WRITABLE_CUSTOM_INTERVAL_KEY));
+      FloatWritable customIntervalWritable= 
(FloatWritable)(datum.getMetaData().get(Nutch.WRITABLE_FIXED_INTERVAL_KEY));
       interval = customIntervalWritable.get();
     } else {
       if (modifiedTime <= 0) modifiedTime = fetchTime;

Modified: nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java?rev=1363793&r1=1363792&r2=1363793&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java Fri Jul 20 
14:22:19 2012
@@ -32,6 +32,7 @@ import org.apache.hadoop.mapred.*;
 import org.apache.hadoop.util.*;
 
 import org.apache.nutch.net.*;
+import org.apache.nutch.metadata.Nutch;
 import org.apache.nutch.scoring.ScoringFilterException;
 import org.apache.nutch.scoring.ScoringFilters;
 import org.apache.nutch.util.NutchConfiguration;
@@ -45,6 +46,7 @@ import org.apache.nutch.util.TimingUtil;
  * Note that some metadata keys are reserved : <br>
  * - <i>nutch.score</i> : allows to set a custom score for a specific URL <br>
  * - <i>nutch.fetchInterval</i> : allows to set a custom fetch interval for a 
specific URL <br>
+ * - <i>nutch.fetchInterval.fixed</i> : allows to set a custom fetch interval 
for a specific URL that is not changed by AdaptiveFetchSchedule <br>
  * e.g. http://www.nutch.org/ \t nutch.score=10 \t nutch.fetchInterval=2592000 
\t userType=open_source
  **/
 public class Injector extends Configured implements Tool {
@@ -54,6 +56,8 @@ public class Injector extends Configured
   public static String nutchScoreMDName = "nutch.score";
   /** metadata key reserved for setting a custom fetchInterval for a specific 
URL */
   public static String nutchFetchIntervalMDName = "nutch.fetchInterval";
+  /** metadata key reserved for setting a fixed custom fetchInterval for a 
specific URL */
+  public static String nutchFixedFetchIntervalMDName = 
"nutch.fetchInterval.fixed";
 
   /** Normalize and filter injected urls. */
   public static class InjectMapper implements Mapper<WritableComparable, Text, 
Text, CrawlDatum> {
@@ -91,6 +95,7 @@ public class Injector extends Configured
       // must be name=value and separated by \t
       float customScore = -1f;
       int customInterval = interval;
+      int fixedInterval = -1;
       Map<String,String> metadata = new TreeMap<String,String>();
       if (url.indexOf("\t")!=-1){
          String[] splits = url.split("\t");
@@ -109,11 +114,16 @@ public class Injector extends Configured
                          customScore = Float.parseFloat(metavalue);}
                          catch (NumberFormatException nfe){}
                  }
-                 else if (metaname.equals(nutchFetchIntervalMDName)) {
-                         try {
-                                 customInterval = Integer.parseInt(metavalue);}
-                         catch (NumberFormatException nfe){}
-                 }
+                  else if (metaname.equals(nutchFetchIntervalMDName)) {
+                          try {
+                                  customInterval = 
Integer.parseInt(metavalue);}
+                          catch (NumberFormatException nfe){}
+                  }
+                  else if (metaname.equals(nutchFixedFetchIntervalMDName)) {
+                          try {
+                                  fixedInterval = Integer.parseInt(metavalue);}
+                          catch (NumberFormatException nfe){}
+                  }
                  else metadata.put(metaname,metavalue);
          }
       }
@@ -126,7 +136,18 @@ public class Injector extends Configured
       }
       if (url != null) {                          // if it passes
         value.set(url);                           // collect it
-        CrawlDatum datum = new CrawlDatum(CrawlDatum.STATUS_INJECTED, 
customInterval);
+        CrawlDatum datum = new CrawlDatum();
+        datum.setStatus(CrawlDatum.STATUS_INJECTED);
+
+        // Is a fixed interval given? Then also store it as metadata
+        if (fixedInterval > -1) {
+          // Set writable using float. Float is used by AdaptiveFetchSchedule
+          datum.getMetaData().put(Nutch.WRITABLE_FIXED_INTERVAL_KEY, new 
FloatWritable(fixedInterval));
+          datum.setFetchInterval(fixedInterval);
+        } else {
+          datum.setFetchInterval(customInterval);
+        }
+
         datum.setFetchTime(curTime);
         // now add the metadata
         Iterator<String> keysIter = metadata.keySet().iterator();

Modified: nutch/trunk/src/java/org/apache/nutch/metadata/Nutch.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/metadata/Nutch.java?rev=1363793&r1=1363792&r2=1363793&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/metadata/Nutch.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/metadata/Nutch.java Fri Jul 20 
14:22:19 2012
@@ -68,7 +68,7 @@ public interface Nutch {
   public static final Text WRITABLE_REPR_URL_KEY = new Text(REPR_URL_KEY);
 
   /** Used by AdaptiveFetchSchedule to maintain custom fetch interval */
-  public static final String CUSTOM_INTERVAL_KEY = "interval";
+  public static final String FIXED_INTERVAL_KEY = "fixedInterval";
 
-  public static final Text WRITABLE_CUSTOM_INTERVAL_KEY = new 
Text(CUSTOM_INTERVAL_KEY);
+  public static final Text WRITABLE_FIXED_INTERVAL_KEY = new 
Text(FIXED_INTERVAL_KEY);
 }


Reply via email to