Hi Doug,

Many thanks for your patch; I'm trying it now. I'm also thinking about integrating an algorithm for near-duplicate URL detection, something like shingling.
Is dedup the best place to integrate it?
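For concreteness, the kind of shingling I mean could be sketched roughly like this (a hypothetical illustration with made-up names, not Nutch code): character k-shingles of two strings compared by Jaccard similarity.

```java
import java.util.HashSet;
import java.util.Set;

// Hypothetical sketch of shingle-based near-duplicate detection,
// not part of the patch: break a string into overlapping k-grams
// and compare two shingle sets by Jaccard similarity.
public class ShingleSketch {

  /** All overlapping substrings of length k. */
  static Set<String> shingles(String s, int k) {
    Set<String> out = new HashSet<>();
    for (int i = 0; i + k <= s.length(); i++)
      out.add(s.substring(i, i + k));
    return out;
  }

  /** |intersection| / |union|; 1.0 means identical shingle sets. */
  static double jaccard(Set<String> a, Set<String> b) {
    Set<String> inter = new HashSet<>(a);
    inter.retainAll(b);
    Set<String> union = new HashSet<>(a);
    union.addAll(b);
    return union.isEmpty() ? 1.0 : (double) inter.size() / union.size();
  }
}
```

Two URLs differing only in a query parameter would then score close to 1.0, while unrelated URLs score near 0, so a threshold picks out near duplicates.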

Thanks,

Massimo

Doug Cutting wrote:

Here is a patch that implements this. I'm still testing it. If it appears to work well, I will commit it.

Doug Cutting wrote:

Massimo Miccoli wrote:

Any news about integrating OPIC into mapred? I have time to work on OPIC for Nutch mapred; can you help me get started? From the email by Carlos Alberto-Alejandro CASTILLO-Ocaranza, it seems the best way was to integrate OPIC into the old WebDB; is that approach also valid for the
CrawlDb in mapred?



Yes.  I think the way to implement this in the mapred branch is:

1. In CrawlDatum.java, replace 'int linkCount' with 'float score'. The default value of this should be 1.0f. This will require changes to accessors, write, readFields, compareTo etc. A constructor which specifies the score should be added. The comparator should sort by decreasing score.
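Step 1 in miniature might look like the following (a hypothetical stripped-down class, just to show the score field, its 1.0f default, and the decreasing-score ordering; the real CrawlDatum also carries status, fetch time, retries, and a raw-byte Comparator):

```java
// Hypothetical miniature of step 1: only the score-related parts of
// CrawlDatum; not the real class.
public class CrawlDatumSketch implements Comparable<CrawlDatumSketch> {
  private float score = 1.0f;                   // default score is 1.0f

  public CrawlDatumSketch() {}

  // constructor which specifies the score
  public CrawlDatumSketch(float score) { this.score = score; }

  public float getScore() { return score; }
  public void setScore(float score) { this.score = score; }

  /** Sort by decreasing score, so higher-scored pages come first. */
  public int compareTo(CrawlDatumSketch that) {
    if (that.score != this.score)
      return (that.score - this.score) > 0 ? 1 : -1;
    return 0;
  }
}
```

Note the comparison cannot simply return `that.score - this.score` as the int version did, since the float difference must be mapped to an int sign.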

2. In crawl/Fetcher.java, add the score to the Content's metadata:

  public static String SCORE_KEY = "org.apache.nutch.crawl.score";
  ...
  private void output(...) {
    ...
    content.getMetadata().setProperty(SCORE_KEY, Float.toString(datum.getScore()));
    ...
  }


3. In ParseOutputFormat.java, when writing the CrawlDatum for each outlink (line 77), set the score of the link CrawlDatum to be the score of the page:

   float score =
     Float.valueOf(parse.getData().get(Fetcher.SCORE_KEY));
   score /= links.length;
   for (int i = 0; i < links.length; i++) {
     ...
       new CrawlDatum(CrawlDatum.STATUS_LINKED,
                      interval, score);
     ...
   }

4. In CrawlDbReducer.java, remove linkCount calculations. Replace these with something like:

  float scoreIncrement = 0.0f;
  while (values.next()) {
    ...
    switch (datum.getStatus()) {
    ...
    case CrawlDatum.STATUS_LINKED:
      scoreIncrement += datum.getScore();
      break;
    ...
  }
  ...
  result.setScore(result.getScore() + scoreIncrement);

I think that should do it, no?
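Putting steps 3 and 4 together, the score flow can be checked with a toy example (my own illustration, not code from the patch): a fetched page splits its score evenly over its outlinks, and the reduce step folds the STATUS_LINKED contributions into each target's score, which defaults to 1.0f.

```java
import java.util.HashMap;
import java.util.Map;

// Toy illustration (not from the patch) of the OPIC score flow:
// step 3 divides a page's score by its outlink count; step 4 adds
// each increment to the target URL's existing score (1.0f default).
public class OpicFlowSketch {

  /** Fold one page's outlink contributions into a score map. */
  public static Map<String, Float> update(Map<String, Float> db,
                                          float pageScore,
                                          String[] outlinks) {
    float perLink = pageScore / outlinks.length;   // step 3
    for (String url : outlinks) {
      float old = db.getOrDefault(url, 1.0f);      // new datum starts at 1.0f
      db.put(url, old + perLink);                  // step 4: sum increments
    }
    return db;
  }
}
```

So a page with score 3.0 and three outlinks contributes 1.0 per link, and a URL linked twice from it gains 2.0 on top of whatever score it already had.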

Doug

------------------------------------------------------------------------

Index: conf/crawl-tool.xml
===================================================================
--- conf/crawl-tool.xml (revision 326624)
+++ conf/crawl-tool.xml (working copy)
@@ -15,13 +15,6 @@
</property>

<property>
-  <name>indexer.boost.by.link.count</name>
-  <value>true</value>
-  <description>When true scores for a page are multipled by the log of
-  the number of incoming links to the page.</description>
-</property>
-
-<property>
  <name>db.ignore.internal.links</name>
  <value>false</value>
  <description>If true, when adding new links to a page, links from
Index: conf/nutch-default.xml
===================================================================
--- conf/nutch-default.xml      (revision 326624)
+++ conf/nutch-default.xml      (working copy)
@@ -440,24 +440,6 @@
<!-- indexer properties -->

<property>
-  <name>indexer.score.power</name>
-  <value>0.5</value>
-  <description>Determines the power of link analyis scores.  Each
-  pages's boost is set to <i>score<sup>scorePower</sup></i> where
-  <i>score</i> is its link analysis score and <i>scorePower</i> is the
-  value of this parameter.  This is compiled into indexes, so, when
-  this is changed, pages must be re-indexed for it to take
-  effect.</description>
-</property>
-
-<property>
-  <name>indexer.boost.by.link.count</name>
-  <value>true</value>
-  <description>When true scores for a page are multipled by the log of
-  the number of incoming links to the page.</description>
-</property>
-
-<property>
  <name>indexer.max.title.length</name>
  <value>100</value>
  <description>The maximum number of characters of a title that are indexed.
Index: src/java/org/apache/nutch/crawl/CrawlDatum.java
===================================================================
--- src/java/org/apache/nutch/crawl/CrawlDatum.java     (revision 326624)
+++ src/java/org/apache/nutch/crawl/CrawlDatum.java     (working copy)
@@ -31,7 +31,7 @@
  public static final String FETCH_DIR_NAME = "crawl_fetch";
  public static final String PARSE_DIR_NAME = "crawl_parse";

-  private final static byte CUR_VERSION = 1;
+  private final static byte CUR_VERSION = 2;

  public static final byte STATUS_DB_UNFETCHED = 1;
  public static final byte STATUS_DB_FETCHED = 2;
@@ -47,17 +47,20 @@
  private long fetchTime = System.currentTimeMillis();
  private byte retries;
  private float fetchInterval;
-  private int linkCount;
+  private float score = 1.0f;

  public CrawlDatum() {}

  public CrawlDatum(int status, float fetchInterval) {
    this.status = (byte)status;
    this.fetchInterval = fetchInterval;
-    if (status == STATUS_LINKED)
-      linkCount = 1;
  }

+  public CrawlDatum(int status, float fetchInterval, float score) {
+    this(status, fetchInterval);
+    this.score = score;
+  }
+
  //
  // accessor methods
  //
@@ -80,8 +83,8 @@
    this.fetchInterval = fetchInterval;
  }

-  public int getLinkCount() { return linkCount; }
-  public void setLinkCount(int linkCount) { this.linkCount = linkCount; }
+  public float getScore() { return score; }
+  public void setScore(float score) { this.score = score; }

  //
  // writable methods
@@ -96,18 +99,18 @@

  public void readFields(DataInput in) throws IOException {
    byte version = in.readByte();                 // read version
-    if (version > CUR_VERSION)                    // check version
+    if (version != CUR_VERSION)                   // check version
      throw new VersionMismatchException(CUR_VERSION, version);

    status = in.readByte();
    fetchTime = in.readLong();
    retries = in.readByte();
    fetchInterval = in.readFloat();
-    linkCount = in.readInt();
+    score = in.readFloat();
  }

-  /** The number of bytes into a CrawlDatum that the linkCount is stored. */
-  private static final int LINK_COUNT_OFFSET = 1 + 1 + 8 + 1 + 4;
+  /** The number of bytes into a CrawlDatum that the score is stored. */
+  private static final int SCORE_OFFSET = 1 + 1 + 8 + 1 + 4;

  public void write(DataOutput out) throws IOException {
    out.writeByte(CUR_VERSION);                   // store current version
@@ -115,7 +118,7 @@
    out.writeLong(fetchTime);
    out.writeByte(retries);
    out.writeFloat(fetchInterval);
-    out.writeInt(linkCount);
+    out.writeFloat(score);
  }

  /** Copy the contents of another instance into this instance. */
@@ -124,7 +127,7 @@
    this.fetchTime = that.fetchTime;
    this.retries = that.retries;
    this.fetchInterval = that.fetchInterval;
-    this.linkCount = that.linkCount;
+    this.score = that.score;
  }


@@ -132,11 +135,11 @@
  // compare methods
  //
-  /** Sort by decreasing link count. */
+  /** Sort by decreasing score. */
  public int compareTo(Object o) {
     CrawlDatum that = (CrawlDatum)o;
-    if (that.linkCount != this.linkCount)
-      return that.linkCount - this.linkCount;
+    if (that.score != this.score)
+      return (that.score - this.score) > 0 ? 1 : -1;
    if (that.status != this.status)
      return this.status - that.status;
    if (that.fetchTime != this.fetchTime)
@@ -153,10 +156,10 @@
    public Comparator() { super(CrawlDatum.class); }

    public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
-      int linkCount1 = readInt(b1,s1+LINK_COUNT_OFFSET);
-      int linkCount2 = readInt(b2,s2+LINK_COUNT_OFFSET);
-      if (linkCount2 != linkCount1) {
-        return linkCount2 - linkCount1;
+      float score1 = readFloat(b1,s1+SCORE_OFFSET);
+      float score2 = readFloat(b2,s2+SCORE_OFFSET);
+      if (score2 != score1) {
+        return (score2 - score1) > 0 ? 1 : -1;
      }
      int status1 = b1[s1+1];
      int status2 = b2[s2+1];
@@ -194,7 +197,7 @@
    buf.append("Fetch time: " + new Date(getFetchTime()) + "\n");
    buf.append("Retries since fetch: " + getRetriesSinceFetch() + "\n");
    buf.append("Retry interval: " + getFetchInterval() + " days\n");
-    buf.append("Link Count: " + getLinkCount() + "\n");
+    buf.append("Score: " + getScore() + "\n");
    return buf.toString();
  }

@@ -207,7 +210,7 @@
      (this.fetchTime == other.fetchTime) &&
      (this.retries == other.retries) &&
      (this.fetchInterval == other.fetchInterval) &&
-      (this.linkCount == other.linkCount);
+      (this.score == other.score);
  }

  public int hashCode() {
@@ -216,7 +219,7 @@
      ((int)fetchTime) ^
      retries ^
      Float.floatToIntBits(fetchInterval) ^
-      linkCount;
+      Float.floatToIntBits(score);
  }

  public Object clone() {
Index: src/java/org/apache/nutch/crawl/ParseOutputFormat.java
===================================================================
--- src/java/org/apache/nutch/crawl/ParseOutputFormat.java      (revision 326624)
+++ src/java/org/apache/nutch/crawl/ParseOutputFormat.java      (working copy)
@@ -64,6 +64,11 @@

          // collect outlinks for subsequent db update
          Outlink[] links = parse.getData().getOutlinks();
+
+          // compute OPIC score contribution
+          float score = Float.valueOf(parse.getData().get(Fetcher.SCORE_KEY));
+          score /= links.length;
+          for (int i = 0; i < links.length; i++) {
            String toUrl = links[i].getToUrl();
            try {
@@ -75,7 +80,7 @@
            if (toUrl != null)
              crawlOut.append(new UTF8(toUrl),
                              new CrawlDatum(CrawlDatum.STATUS_LINKED,
-                                             interval));
+                                             interval, score));
          }
        }
Index: src/java/org/apache/nutch/crawl/Fetcher.java
===================================================================
--- src/java/org/apache/nutch/crawl/Fetcher.java        (revision 326624)
+++ src/java/org/apache/nutch/crawl/Fetcher.java        (working copy)
@@ -38,6 +38,7 @@
  public static final String DIGEST_KEY = "nutch.content.digest";
  public static final String SEGMENT_NAME_KEY = "nutch.segment.name";
+  public static final String SCORE_KEY = "nutch.crawl.score";

  public static class InputFormat extends SequenceFileInputFormat {
    /** Don't split inputs, to keep things polite. */
@@ -197,6 +198,8 @@
        (DIGEST_KEY, MD5Hash.digest(content.getContent()).toString());
      content.getMetadata().setProperty           // add segment to metadata
        (SEGMENT_NAME_KEY, segmentName);
+      content.getMetadata().setProperty           // add score to metadata
+        (SCORE_KEY, Float.toString(datum.getScore()));

      Parse parse = null;
      if (parsing) {
Index: src/java/org/apache/nutch/crawl/Generator.java
===================================================================
--- src/java/org/apache/nutch/crawl/Generator.java      (revision 326624)
+++ src/java/org/apache/nutch/crawl/Generator.java      (working copy)
@@ -61,7 +61,7 @@
      if (crawlDatum.getFetchTime() > curTime)
        return;                                   // not time yet

-      output.collect(crawlDatum, key);          // invert for sort by linkCount
+      output.collect(crawlDatum, key);          // invert for sort by score
    }

    /** Partition by host (value). */
Index: src/java/org/apache/nutch/crawl/CrawlDbReducer.java
===================================================================
--- src/java/org/apache/nutch/crawl/CrawlDbReducer.java (revision 326624)
+++ src/java/org/apache/nutch/crawl/CrawlDbReducer.java (working copy)
@@ -38,11 +38,10 @@

    CrawlDatum highest = null;
    CrawlDatum old = null;
-    int linkCount = 0;
+    float scoreIncrement = 0.0f;

    while (values.hasNext()) {
      CrawlDatum datum = (CrawlDatum)values.next();
-      linkCount += datum.getLinkCount();          // sum link counts

      if (highest == null || datum.getStatus() > highest.getStatus()) {
        highest = datum;                          // find highest status
@@ -52,6 +51,10 @@
      case CrawlDatum.STATUS_DB_UNFETCHED:
      case CrawlDatum.STATUS_DB_FETCHED:
        old = datum;
+        break;
+      case CrawlDatum.STATUS_LINKED:
+        scoreIncrement += datum.getScore();
+        break;
      }
    }

@@ -99,7 +102,7 @@
    }
     if (result != null) {
-      result.setLinkCount(linkCount);
+      result.setScore(result.getScore() + scoreIncrement);
      output.collect(key, result);
    }
  }
Index: src/java/org/apache/nutch/crawl/Indexer.java
===================================================================
--- src/java/org/apache/nutch/crawl/Indexer.java        (revision 326624)
+++ src/java/org/apache/nutch/crawl/Indexer.java        (working copy)
@@ -138,12 +138,7 @@
    super(conf);
  }

-  private boolean boostByLinkCount;
-  private float scorePower;
-
  public void configure(JobConf job) {
-    boostByLinkCount = job.getBoolean("indexer.boost.by.link.count", false);
-    scorePower = job.getFloat("indexer.score.power", 0.5f);
  }

  public void reduce(WritableComparable key, Iterator values,
@@ -183,10 +178,8 @@
    // add digest, used by dedup
    doc.add(Field.UnIndexed("digest", meta.getProperty(Fetcher.DIGEST_KEY)));

-    // compute boost
-    float boost =
-      IndexSegment.calculateBoost(1.0f, scorePower, boostByLinkCount,
-                                  anchors.length);
+    // boost is log(opic)
+    float boost = (float)Math.log(Math.E + crawlDatum.getScore());
    // apply boost to all indexed fields.
    doc.setBoost(boost);
    // store boost for use by explain and dedup
