Here is a patch that implements this. I'm still testing it. If it
appears to work well, I will commit it.
Doug Cutting wrote:
Massimo Miccoli wrote:
Any news about integration of OPIC in mapred? I have time to develop
OPIC on Nutch Mapred. Can you help me to start?
Judging from the email by Carlos Alberto-Alejandro CASTILLO-Ocaranza, it seems
that the best way to integrate OPIC was designed around the old webdb. Is that
approach also valid for the CrawlDb in Mapred?
Yes. I think the way to implement this in the mapred branch is:
1. In CrawlDatum.java, replace 'int linkCount' with 'float score'. The
default value of this should be 1.0f. This will require changes to
accessors, write, readFields, compareTo etc. A constructor which
specifies the score should be added. The comparator should sort by
decreasing score.
2. In crawl/Fetcher.java, add the score to the Content's metadata:
public static String SCORE_KEY = "org.apache.nutch.crawl.score";
...
private void output(...) {
...
content.getMetadata().setProperty(SCORE_KEY, datum.getScore());
...
}
3. In ParseOutputFormat.java, when writing the CrawlDatum for each
outlink (line 77), set the score of the link CrawlDatum to be the score
of the page:
float score =
Float.valueOf(parse.getData().get(Fetcher.SCORE_KEY));
score /= links.length;
for (int i = 0; i < links.length; ...) {
...
new CrawlDatum(CrawlDatum.STATUS_LINKED,
interval, score);
...
}
4. In CrawlDbReducer.java, remove linkCount calculations. Replace these
with something like:
float scoreIncrement = 0.0f;
while (values.hasNext()) {
...
switch (datum.getStatus()) {
...
case CrawlDatum.STATUS_LINKED:
scoreIncrement += datum.getScore();
break;
...
}
...
result.setScore(result.getScore() + scoreIncrement);
I think that should do it, no?
Doug
Index: conf/crawl-tool.xml
===================================================================
--- conf/crawl-tool.xml (revision 326624)
+++ conf/crawl-tool.xml (working copy)
@@ -15,13 +15,6 @@
</property>
<property>
- <name>indexer.boost.by.link.count</name>
- <value>true</value>
- <description>When true scores for a page are multipled by the log of
- the number of incoming links to the page.</description>
-</property>
-
-<property>
<name>db.ignore.internal.links</name>
<value>false</value>
<description>If true, when adding new links to a page, links from
Index: conf/nutch-default.xml
===================================================================
--- conf/nutch-default.xml (revision 326624)
+++ conf/nutch-default.xml (working copy)
@@ -440,24 +440,6 @@
<!-- indexer properties -->
<property>
- <name>indexer.score.power</name>
- <value>0.5</value>
- <description>Determines the power of link analyis scores. Each
- pages's boost is set to <i>score<sup>scorePower</sup></i> where
- <i>score</i> is its link analysis score and <i>scorePower</i> is the
- value of this parameter. This is compiled into indexes, so, when
- this is changed, pages must be re-indexed for it to take
- effect.</description>
-</property>
-
-<property>
- <name>indexer.boost.by.link.count</name>
- <value>true</value>
- <description>When true scores for a page are multipled by the log of
- the number of incoming links to the page.</description>
-</property>
-
-<property>
<name>indexer.max.title.length</name>
<value>100</value>
<description>The maximum number of characters of a title that are indexed.
Index: src/java/org/apache/nutch/crawl/CrawlDatum.java
===================================================================
--- src/java/org/apache/nutch/crawl/CrawlDatum.java (revision 326624)
+++ src/java/org/apache/nutch/crawl/CrawlDatum.java (working copy)
@@ -31,7 +31,7 @@
public static final String FETCH_DIR_NAME = "crawl_fetch";
public static final String PARSE_DIR_NAME = "crawl_parse";
- private final static byte CUR_VERSION = 1;
+ private final static byte CUR_VERSION = 2;
public static final byte STATUS_DB_UNFETCHED = 1;
public static final byte STATUS_DB_FETCHED = 2;
@@ -47,17 +47,20 @@
private long fetchTime = System.currentTimeMillis();
private byte retries;
private float fetchInterval;
- private int linkCount;
+ private float score = 1.0f;
public CrawlDatum() {}
public CrawlDatum(int status, float fetchInterval) {
this.status = (byte)status;
this.fetchInterval = fetchInterval;
- if (status == STATUS_LINKED)
- linkCount = 1;
}
+ public CrawlDatum(int status, float fetchInterval, float score) {
+ this(status, fetchInterval);
+ this.score = score;
+ }
+
//
// accessor methods
//
@@ -80,8 +83,8 @@
this.fetchInterval = fetchInterval;
}
- public int getLinkCount() { return linkCount; }
- public void setLinkCount(int linkCount) { this.linkCount = linkCount; }
+ public float getScore() { return score; }
+ public void setScore(float score) { this.score = score; }
//
// writable methods
@@ -96,18 +99,18 @@
public void readFields(DataInput in) throws IOException {
byte version = in.readByte(); // read version
- if (version > CUR_VERSION) // check version
+ if (version != CUR_VERSION) // check version
throw new VersionMismatchException(CUR_VERSION, version);
status = in.readByte();
fetchTime = in.readLong();
retries = in.readByte();
fetchInterval = in.readFloat();
- linkCount = in.readInt();
+ score = in.readFloat();
}
- /** The number of bytes into a CrawlDatum that the linkCount is stored. */
- private static final int LINK_COUNT_OFFSET = 1 + 1 + 8 + 1 + 4;
+ /** The number of bytes into a CrawlDatum that the score is stored. */
+ private static final int SCORE_OFFSET = 1 + 1 + 8 + 1 + 4;
public void write(DataOutput out) throws IOException {
out.writeByte(CUR_VERSION); // store current version
@@ -115,7 +118,7 @@
out.writeLong(fetchTime);
out.writeByte(retries);
out.writeFloat(fetchInterval);
- out.writeInt(linkCount);
+ out.writeFloat(score);
}
/** Copy the contents of another instance into this instance. */
@@ -124,7 +127,7 @@
this.fetchTime = that.fetchTime;
this.retries = that.retries;
this.fetchInterval = that.fetchInterval;
- this.linkCount = that.linkCount;
+ this.score = that.score;
}
@@ -132,11 +135,11 @@
// compare methods
//
- /** Sort by decreasing link count. */
+ /** Sort by decreasing score. */
public int compareTo(Object o) {
CrawlDatum that = (CrawlDatum)o;
- if (that.linkCount != this.linkCount)
- return that.linkCount - this.linkCount;
+ if (that.score != this.score)
+ return (that.score - this.score) > 0 ? 1 : -1;
if (that.status != this.status)
return this.status - that.status;
if (that.fetchTime != this.fetchTime)
@@ -153,10 +156,10 @@
public Comparator() { super(CrawlDatum.class); }
public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
- int linkCount1 = readInt(b1,s1+LINK_COUNT_OFFSET);
- int linkCount2 = readInt(b2,s2+LINK_COUNT_OFFSET);
- if (linkCount2 != linkCount1) {
- return linkCount2 - linkCount1;
+ float score1 = readFloat(b1,s1+SCORE_OFFSET);
+ float score2 = readFloat(b2,s2+SCORE_OFFSET);
+ if (score2 != score1) {
+ return (score2 - score1) > 0 ? 1 : -1;
}
int status1 = b1[s1+1];
int status2 = b2[s2+1];
@@ -194,7 +197,7 @@
buf.append("Fetch time: " + new Date(getFetchTime()) + "\n");
buf.append("Retries since fetch: " + getRetriesSinceFetch() + "\n");
buf.append("Retry interval: " + getFetchInterval() + " days\n");
- buf.append("Link Count: " + getLinkCount() + "\n");
+ buf.append("Score: " + getScore() + "\n");
return buf.toString();
}
@@ -207,7 +210,7 @@
(this.fetchTime == other.fetchTime) &&
(this.retries == other.retries) &&
(this.fetchInterval == other.fetchInterval) &&
- (this.linkCount == other.linkCount);
+ (this.score == other.score);
}
public int hashCode() {
@@ -216,7 +219,7 @@
((int)fetchTime) ^
retries ^
Float.floatToIntBits(fetchInterval) ^
- linkCount;
+ Float.floatToIntBits(score);
}
public Object clone() {
Index: src/java/org/apache/nutch/crawl/ParseOutputFormat.java
===================================================================
--- src/java/org/apache/nutch/crawl/ParseOutputFormat.java (revision 326624)
+++ src/java/org/apache/nutch/crawl/ParseOutputFormat.java (working copy)
@@ -64,6 +64,11 @@
// collect outlinks for subsequent db update
Outlink[] links = parse.getData().getOutlinks();
+
+ // compute OPIC score contribution
+ float score = Float.valueOf(parse.getData().get(Fetcher.SCORE_KEY));
+ score /= links.length;
+
for (int i = 0; i < links.length; i++) {
String toUrl = links[i].getToUrl();
try {
@@ -75,7 +80,7 @@
if (toUrl != null)
crawlOut.append(new UTF8(toUrl),
new CrawlDatum(CrawlDatum.STATUS_LINKED,
- interval));
+ interval, score));
}
}
Index: src/java/org/apache/nutch/crawl/Fetcher.java
===================================================================
--- src/java/org/apache/nutch/crawl/Fetcher.java (revision 326624)
+++ src/java/org/apache/nutch/crawl/Fetcher.java (working copy)
@@ -38,6 +38,7 @@
public static final String DIGEST_KEY = "nutch.content.digest";
public static final String SEGMENT_NAME_KEY = "nutch.segment.name";
+ public static final String SCORE_KEY = "nutch.crawl.score";
public static class InputFormat extends SequenceFileInputFormat {
/** Don't split inputs, to keep things polite. */
@@ -197,6 +198,8 @@
(DIGEST_KEY, MD5Hash.digest(content.getContent()).toString());
content.getMetadata().setProperty // add segment to metadata
(SEGMENT_NAME_KEY, segmentName);
+ content.getMetadata().setProperty // add score to metadata
+ (SCORE_KEY, Float.toString(datum.getScore()));
Parse parse = null;
if (parsing) {
Index: src/java/org/apache/nutch/crawl/Generator.java
===================================================================
--- src/java/org/apache/nutch/crawl/Generator.java (revision 326624)
+++ src/java/org/apache/nutch/crawl/Generator.java (working copy)
@@ -61,7 +61,7 @@
if (crawlDatum.getFetchTime() > curTime)
return; // not time yet
- output.collect(crawlDatum, key); // invert for sort by linkCount
+ output.collect(crawlDatum, key); // invert for sort by score
}
/** Partition by host (value). */
Index: src/java/org/apache/nutch/crawl/CrawlDbReducer.java
===================================================================
--- src/java/org/apache/nutch/crawl/CrawlDbReducer.java (revision 326624)
+++ src/java/org/apache/nutch/crawl/CrawlDbReducer.java (working copy)
@@ -38,11 +38,10 @@
CrawlDatum highest = null;
CrawlDatum old = null;
- int linkCount = 0;
+ float scoreIncrement = 0.0f;
while (values.hasNext()) {
CrawlDatum datum = (CrawlDatum)values.next();
- linkCount += datum.getLinkCount(); // sum link counts
if (highest == null || datum.getStatus() > highest.getStatus()) {
highest = datum; // find highest status
@@ -52,6 +51,10 @@
case CrawlDatum.STATUS_DB_UNFETCHED:
case CrawlDatum.STATUS_DB_FETCHED:
old = datum;
+ break;
+ case CrawlDatum.STATUS_LINKED:
+ scoreIncrement += datum.getScore();
+ break;
}
}
@@ -99,7 +102,7 @@
}
if (result != null) {
- result.setLinkCount(linkCount);
+ result.setScore(result.getScore() + scoreIncrement);
output.collect(key, result);
}
}
Index: src/java/org/apache/nutch/crawl/Indexer.java
===================================================================
--- src/java/org/apache/nutch/crawl/Indexer.java (revision 326624)
+++ src/java/org/apache/nutch/crawl/Indexer.java (working copy)
@@ -138,12 +138,7 @@
super(conf);
}
- private boolean boostByLinkCount;
- private float scorePower;
-
public void configure(JobConf job) {
- boostByLinkCount = job.getBoolean("indexer.boost.by.link.count", false);
- scorePower = job.getFloat("indexer.score.power", 0.5f);
}
public void reduce(WritableComparable key, Iterator values,
@@ -183,10 +178,8 @@
// add digest, used by dedup
doc.add(Field.UnIndexed("digest", meta.getProperty(Fetcher.DIGEST_KEY)));
- // compute boost
- float boost =
- IndexSegment.calculateBoost(1.0f, scorePower, boostByLinkCount,
- anchors.length);
+ // boost is log(opic)
+ float boost = (float)Math.log(Math.E + crawlDatum.getScore());
// apply boost to all indexed fields.
doc.setBoost(boost);
// store boost for use by explain and dedup