Author: markus
Date: Tue Jun 12 10:11:47 2012
New Revision: 1349226

URL: http://svn.apache.org/viewvc?rev=1349226&view=rev
Log:
NUTCH-1024 Dynamically set fetchInterval by MIME-type

Added:
    nutch/trunk/conf/adaptive-mimetypes.txt
    nutch/trunk/src/java/org/apache/nutch/crawl/MimeAdaptiveFetchSchedule.java
Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/conf/nutch-default.xml
    nutch/trunk/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java
    nutch/trunk/src/java/org/apache/nutch/metadata/HttpHeaders.java
    nutch/trunk/src/java/org/apache/nutch/metadata/Nutch.java

Modified: nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1349226&r1=1349225&r2=1349226&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Tue Jun 12 10:11:47 2012
@@ -2,6 +2,8 @@ Nutch Change Log
 
 (trunk) Current Development:
 
+* NUTCH-1024 Dynamically set fetchInterval by MIME-type (markus)
+
 * NUTCH-1364 Add a counter in Generator for malformed urls (lewismc)
 
 * NUTCH-1360 Suport the storing of IP address connected to when web crawling 
(lewismc)

Added: nutch/trunk/conf/adaptive-mimetypes.txt
URL: 
http://svn.apache.org/viewvc/nutch/trunk/conf/adaptive-mimetypes.txt?rev=1349226&view=auto
==============================================================================
--- nutch/trunk/conf/adaptive-mimetypes.txt (added)
+++ nutch/trunk/conf/adaptive-mimetypes.txt Tue Jun 12 10:11:47 2012
@@ -0,0 +1,23 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This configuration file is used by the MimeAdaptiveFetchSchedule and
+# allows the user to set the INC and DEC rates for the AdaptiveFetchSchedule
+# by MIME-type. Values are separated by tab.
+
+# MIME-type    inc_rate        dec_rate
+text/html      0.2     0.2
+application/xhtml+xml  0.2     0.2
+application/pdf        0.1     0.4

Modified: nutch/trunk/conf/nutch-default.xml
URL: 
http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1349226&r1=1349225&r2=1349226&view=diff
==============================================================================
--- nutch/trunk/conf/nutch-default.xml (original)
+++ nutch/trunk/conf/nutch-default.xml Tue Jun 12 10:11:47 2012
@@ -409,6 +409,13 @@
 </property>
 
 <property>
+  <name>db.fetch.schedule.mime.file</name>
+  <value>adaptive-mimetypes.txt</value>
+  <description>The configuration file for the MimeAdaptiveFetchSchedule.
+  </description>
+</property>
+
+<property>
   <name>db.update.additions.allowed</name>
   <value>true</value>
   <description>If true, updatedb will add newly discovered URLs, if false

Modified: nutch/trunk/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java?rev=1349226&r1=1349225&r2=1349226&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java 
(original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java Tue 
Jun 12 10:11:47 2012
@@ -19,9 +19,14 @@ package org.apache.nutch.crawl;
 
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.FloatWritable;
 import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.metadata.Nutch;
 import org.apache.nutch.util.NutchConfiguration;
 
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
 /**
  * This class implements an adaptive re-fetch algorithm. This works as follows:
  * <ul>
@@ -53,9 +58,12 @@ import org.apache.nutch.util.NutchConfig
  */
 public class AdaptiveFetchSchedule extends AbstractFetchSchedule {
 
-  private float INC_RATE;
+  // Logging
+  public static final Logger LOG = 
LoggerFactory.getLogger(AbstractFetchSchedule.class);
+
+  protected float INC_RATE;
 
-  private float DEC_RATE;
+  protected float DEC_RATE;
 
   private int MAX_INTERVAL;
 
@@ -82,30 +90,39 @@ public class AdaptiveFetchSchedule exten
           long fetchTime, long modifiedTime, int state) {
     super.setFetchSchedule(url, datum, prevFetchTime, prevModifiedTime,
         fetchTime, modifiedTime, state);
-    long refTime = fetchTime;
-    if (modifiedTime <= 0) modifiedTime = fetchTime;
+
     float interval = datum.getFetchInterval();
-    switch (state) {
-      case FetchSchedule.STATUS_MODIFIED:
-        interval *= (1.0f - DEC_RATE);
-        break;
-      case FetchSchedule.STATUS_NOTMODIFIED:
-        interval *= (1.0f + INC_RATE);
-        break;
-      case FetchSchedule.STATUS_UNKNOWN:
-        break;
-    }
-    if (SYNC_DELTA) {
-      // try to synchronize with the time of change
-      long delta = (fetchTime - modifiedTime) / 1000L;
-      if (delta > interval) interval = delta;
-      refTime = fetchTime - Math.round(delta * SYNC_DELTA_RATE * 1000);
-    }
-    if (interval < MIN_INTERVAL) {
-      interval = MIN_INTERVAL;
-    } else if (interval > MAX_INTERVAL) {
-      interval = MAX_INTERVAL;
+    long refTime = fetchTime;
+
+    if (datum.getMetaData().containsKey(Nutch.WRITABLE_CUSTOM_INTERVAL_KEY)) {
+      // Is fetch interval preset in CrawlDatum MD? Then use preset interval
+      FloatWritable customIntervalWritable= 
(FloatWritable)(datum.getMetaData().get(Nutch.WRITABLE_CUSTOM_INTERVAL_KEY));
+      interval = customIntervalWritable.get();
+    } else {
+      if (modifiedTime <= 0) modifiedTime = fetchTime;
+      switch (state) {
+        case FetchSchedule.STATUS_MODIFIED:
+          interval *= (1.0f - DEC_RATE);
+          break;
+        case FetchSchedule.STATUS_NOTMODIFIED:
+          interval *= (1.0f + INC_RATE);
+          break;
+        case FetchSchedule.STATUS_UNKNOWN:
+          break;
+      }
+      if (SYNC_DELTA) {
+        // try to synchronize with the time of change
+        long delta = (fetchTime - modifiedTime) / 1000L;
+        if (delta > interval) interval = delta;
+        refTime = fetchTime - Math.round(delta * SYNC_DELTA_RATE * 1000);
+      }
+      if (interval < MIN_INTERVAL) {
+        interval = MIN_INTERVAL;
+      } else if (interval > MAX_INTERVAL) {
+        interval = MAX_INTERVAL;
+      }
     }
+
     datum.setFetchInterval(interval);
     datum.setFetchTime(refTime + Math.round(interval * 1000.0));
     datum.setModifiedTime(modifiedTime);
@@ -130,7 +147,7 @@ public class AdaptiveFetchSchedule exten
     // initial fetchInterval is 10 days
     CrawlDatum p = new CrawlDatum(1, 3600 * 24 * 30, 1.0f);
     p.setFetchTime(0);
-    System.out.println(p);
+    LOG.info(p.toString());
     // let's move the timeline a couple of deltas
     for (int i = 0; i < 10000; i++) {
       if (lastModified + update < curTime) {
@@ -139,14 +156,14 @@ public class AdaptiveFetchSchedule exten
         changeCnt++;
         lastModified = curTime;
       }
-      System.out.println(i + ". " + changed + "\twill fetch at " + 
(p.getFetchTime() / delta) + "\tinterval "
+      LOG.info(i + ". " + changed + "\twill fetch at " + (p.getFetchTime() / 
delta) + "\tinterval "
               + (p.getFetchInterval() / SECONDS_PER_DAY ) + " days" + "\t 
missed " + miss);
       if (p.getFetchTime() <= curTime) {
         fetchCnt++;
        fs.setFetchSchedule(new Text("http://www.example.com"), p,
                 p.getFetchTime(), p.getModifiedTime(), curTime, lastModified,
                 changed ? FetchSchedule.STATUS_MODIFIED : 
FetchSchedule.STATUS_NOTMODIFIED);
-        System.out.println("\tfetched & adjusted: " + "\twill fetch at " + 
(p.getFetchTime() / delta) + "\tinterval "
+        LOG.info("\tfetched & adjusted: " + "\twill fetch at " + 
(p.getFetchTime() / delta) + "\tinterval "
                 + (p.getFetchInterval() / SECONDS_PER_DAY ) + " days");
         if (!changed) miss++;
         if (miss > maxMiss) maxMiss = miss;
@@ -157,7 +174,7 @@ public class AdaptiveFetchSchedule exten
       if (changed) miss++;
       curTime += delta;
     }
-    System.out.println("Total missed: " + totalMiss + ", max miss: " + 
maxMiss);
-    System.out.println("Page changed " + changeCnt + " times, fetched " + 
fetchCnt + " times.");
+    LOG.info("Total missed: " + totalMiss + ", max miss: " + maxMiss);
+    LOG.info("Page changed " + changeCnt + " times, fetched " + fetchCnt + " 
times.");
   }
 }

Added: 
nutch/trunk/src/java/org/apache/nutch/crawl/MimeAdaptiveFetchSchedule.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/MimeAdaptiveFetchSchedule.java?rev=1349226&view=auto
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/MimeAdaptiveFetchSchedule.java 
(added)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/MimeAdaptiveFetchSchedule.java 
Tue Jun 12 10:11:47 2012
@@ -0,0 +1,223 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.crawl;
+
+import java.io.BufferedReader;
+import java.io.FileReader;
+import java.io.IOException;
+import java.io.Reader;
+import java.util.HashMap;
+
+import org.apache.commons.lang.StringUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.*;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.metadata.HttpHeaders;
+import org.apache.nutch.util.NutchConfiguration;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Extension of @see AdaptiveFetchSchedule that allows for more flexible 
configuration
+ * of DEC and INC factors for various MIME-types.
+ *
+ * This class can be typically used in cases where a recrawl consists of many 
different
+ * MIME-types. It's not very common for MIME-types other than text/html to 
change frequently.
+ * Using this class you can configure different factors per MIME-type so to 
prefer frequently
+ * changing MIME-types over others.
+ * 
+ * For it to work this class relies on the Content-Type MetaData key being 
present in the CrawlDB.
+ * This can either be done when injecting new URL's or by adding 
"Content-Type" to the
+ * db.parsemeta.to.crawldb configuration setting to force MIME-types of newly 
discovered URL's to
+ * be added to the CrawlDB.
+ *
+ * @author markus
+ */
+public class MimeAdaptiveFetchSchedule extends AdaptiveFetchSchedule {
+  // Logging
+  public static final Logger LOG = 
LoggerFactory.getLogger(MimeAdaptiveFetchSchedule.class);
+
+  // Conf directives
+  public static final String SCHEDULE_INC_RATE = 
"db.fetch.schedule.adaptive.inc_rate";
+  public static final String SCHEDULE_DEC_RATE = 
"db.fetch.schedule.adaptive.dec_rate";
+  public static final String SCHEDULE_MIME_FILE= "db.fetch.schedule.mime.file";
+
+  // Default values for DEC and INC rate
+  private float defaultIncRate;
+  private float defaultDecRate;
+
+  // Structure to store inc and dec rates per MIME-type
+  private class AdaptiveRate {
+    public float inc;
+    public float dec;
+
+    public AdaptiveRate(Float inc, Float dec) {
+      this.inc = inc;
+      this.dec = dec;
+    }
+  }
+
+  // Here we store the mime's and their delta's
+  private HashMap<String,AdaptiveRate> mimeMap;
+
+  public void setConf(Configuration conf) {
+    super.setConf(conf);
+    if (conf == null) return;
+
+    // Read and set the default INC and DEC rates in case we cannot set values 
based on MIME-type
+    defaultIncRate = conf.getFloat(SCHEDULE_INC_RATE, 0.2f);
+    defaultDecRate = conf.getFloat(SCHEDULE_DEC_RATE, 0.2f);
+
+    // Where's the mime/factor file?
+    Reader mimeFile = 
conf.getConfResourceAsReader(conf.get(SCHEDULE_MIME_FILE, 
"adaptive-mimetypes.txt"));
+
+    try {
+      readMimeFile(mimeFile);
+    } catch (IOException e) {
+      LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
+    }
+  }
+
+  @Override
+  public CrawlDatum setFetchSchedule(Text url, CrawlDatum datum,
+          long prevFetchTime, long prevModifiedTime,
+          long fetchTime, long modifiedTime, int state) {
+
+    // Set defaults
+    INC_RATE = defaultIncRate;
+    DEC_RATE = defaultDecRate;
+
+    // Check if the Content-Type field is available in the CrawlDatum
+    if (datum.getMetaData().containsKey(HttpHeaders.WRITABLE_CONTENT_TYPE)) {
+      // Get the MIME-type of the current URL
+      String currentMime = 
datum.getMetaData().get(HttpHeaders.WRITABLE_CONTENT_TYPE).toString();
+
+      // Get rid of charset
+      currentMime = currentMime.substring(0, currentMime.indexOf(';'));
+
+      // Check if this MIME-type exists in our map
+      if (mimeMap.containsKey(currentMime)) {
+        // Yes, set the INC and DEC rates for this MIME-type
+        INC_RATE = mimeMap.get(currentMime).inc;
+        DEC_RATE = mimeMap.get(currentMime).dec;
+      }
+    }
+
+    return super.setFetchSchedule(url, datum, prevFetchTime, prevModifiedTime,
+      fetchTime, modifiedTime, state);
+  }
+
+  /**
+   * Reads the mime types and their associated INC/DEC factors in a HashMap
+   *
+   * @param mimeFile Reader
+   * @return void
+   */
+  private void readMimeFile(Reader mimeFile) throws IOException {
+    // Instance of our mime/factor map
+    mimeMap = new HashMap<String,AdaptiveRate>();
+
+    // Open a reader
+    BufferedReader reader = new BufferedReader(mimeFile);
+
+    String line = null;
+    String[] splits = null;
+
+    // Read all lines
+    while ((line = reader.readLine()) != null) {
+      // Skip blank lines and comments
+      if (StringUtils.isNotBlank(line) && !line.startsWith("#")) {
+        // Split the line by TAB
+        splits = line.split("\t");
+
+        // Sanity check, we need exactly three items
+        if (splits.length == 3) {
+          // Add a lower cased MIME-type and the factor to the map
+          mimeMap.put(StringUtils.lowerCase(splits[0]), new AdaptiveRate(new 
Float(splits[1]), new Float(splits[2])));
+        } else {
+          LOG.warn("Invalid configuration line in: " + line);
+        }
+      }
+    }
+  }
+
+  public static void main(String[] args) throws Exception {
+    FetchSchedule fs = new MimeAdaptiveFetchSchedule();
+    fs.setConf(NutchConfiguration.create());
+    // we start the time at 0, for simplicity
+    long curTime = 0;
+    long delta = 1000L * 3600L * 24L; // 1 day
+    // we trigger the update of the page every 30 days
+    long update = 1000L * 3600L * 24L * 30L; // 30 days
+    boolean changed = true;
+    long lastModified = 0;
+    int miss = 0;
+    int totalMiss = 0;
+    int maxMiss = 0;
+    int fetchCnt = 0;
+    int changeCnt = 0;
+
+    // initial fetchInterval is 30 days
+    CrawlDatum p = new CrawlDatum(1, 3600 * 24 * 30, 1.0f);
+
+    // Set a default MIME-type to test with
+    org.apache.hadoop.io.MapWritable x = new 
org.apache.hadoop.io.MapWritable();
+    x.put(HttpHeaders.WRITABLE_CONTENT_TYPE, new Text("text/html; 
charset=utf-8"));
+    p.setMetaData(x);
+
+    p.setFetchTime(0);
+    LOG.info(p.toString());
+
+    // let's move the timeline a couple of deltas
+    for (int i = 0; i < 10000; i++) {
+      if (lastModified + update < curTime) {
+        //System.out.println("i=" + i + ", lastModified=" + lastModified + ", 
update=" + update + ", curTime=" + curTime);
+        changed = true;
+        changeCnt++;
+        lastModified = curTime;
+      }
+
+      LOG.info(i + ". " + changed + "\twill fetch at " + (p.getFetchTime() / 
delta) + "\tinterval "
+              + (p.getFetchInterval() / SECONDS_PER_DAY ) + " days" + "\t 
missed " + miss);
+
+      if (p.getFetchTime() <= curTime) {
+        fetchCnt++;
+        fs.setFetchSchedule(new Text("http://www.example.com"), p,
+                p.getFetchTime(), p.getModifiedTime(), curTime, lastModified,
+                changed ? FetchSchedule.STATUS_MODIFIED : 
FetchSchedule.STATUS_NOTMODIFIED);
+
+        LOG.info("\tfetched & adjusted: " + "\twill fetch at " + 
(p.getFetchTime() / delta) + "\tinterval "
+                + (p.getFetchInterval() / SECONDS_PER_DAY ) + " days");
+
+        if (!changed) miss++;
+        if (miss > maxMiss) maxMiss = miss;
+        changed = false;
+        totalMiss += miss;
+        miss = 0;
+      }
+
+      if (changed) miss++;
+      curTime += delta;
+    }
+    LOG.info("Total missed: " + totalMiss + ", max miss: " + maxMiss);
+    LOG.info("Page changed " + changeCnt + " times, fetched " + fetchCnt + " 
times.");
+  }
+
+
+}
\ No newline at end of file

Modified: nutch/trunk/src/java/org/apache/nutch/metadata/HttpHeaders.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/metadata/HttpHeaders.java?rev=1349226&r1=1349225&r2=1349226&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/metadata/HttpHeaders.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/metadata/HttpHeaders.java Tue Jun 12 
10:11:47 2012
@@ -16,6 +16,7 @@
  */
 package org.apache.nutch.metadata;
 
+import org.apache.hadoop.io.Text;
 
 /**
  * A collection of HTTP header names.
@@ -41,6 +42,8 @@ public interface HttpHeaders {
   public final static String CONTENT_MD5 = "Content-MD5";
   
   public final static String CONTENT_TYPE = "Content-Type";
+
+  public static final Text WRITABLE_CONTENT_TYPE = new Text(CONTENT_TYPE);
   
   public final static String LAST_MODIFIED = "Last-Modified";
   

Modified: nutch/trunk/src/java/org/apache/nutch/metadata/Nutch.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/metadata/Nutch.java?rev=1349226&r1=1349225&r2=1349226&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/metadata/Nutch.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/metadata/Nutch.java Tue Jun 12 
10:11:47 2012
@@ -66,4 +66,9 @@ public interface Nutch {
   public static final String REPR_URL_KEY = "_repr_";
 
   public static final Text WRITABLE_REPR_URL_KEY = new Text(REPR_URL_KEY);
+
+  /** Used by AdaptiveFetchSchedule to maintain custom fetch interval */
+  public static final String CUSTOM_INTERVAL_KEY = "interval";
+
+  public static final Text WRITABLE_CUSTOM_INTERVAL_KEY = new 
Text(CUSTOM_INTERVAL_KEY);
 }


Reply via email to