Here is a patch that implements this. I'm still testing it. If it
appears to work well, I will commit it.
Doug Cutting wrote:
Massimo Miccoli wrote:
Any news about integration of OPIC in mapred? I have time to develop
OPIC on Nutch Mapred. Can you help me to start?
Judging from the email by Carlos Alberto-Alejandro CASTILLO-Ocaranza, it seems
that the best way to integrate OPIC was designed around the old webdb. Is that
approach also valid for the CrawlDb in Mapred?
Yes. I think the way to implement this in the mapred branch is:
1. In CrawlDatum.java, replace 'int linkCount' with 'float score'. The
default value of this should be 1.0f. This will require changes to
accessors, write, readFields, compareTo etc. A constructor which
specifies the score should be added. The comparator should sort by
decreasing score.
2. In crawl/Fetcher.java, add the score to the Content's metadata:
public static String SCORE_KEY = "org.apache.nutch.crawl.score";
...
private void output(...) {
...
content.getMetadata().setProperty(SCORE_KEY, datum.getScore());
...
}
3. In ParseOutputFormat.java, when writing the CrawlDatum for each
outlink (line 77), set the score of the link CrawlDatum to be the score
of the page:
float score =
Float.valueOf(parse.getData().get(Fetcher.SCORE_KEY));
score /= links.length;
for (int i = 0; i < links.length; ...) {
...
new CrawlDatum(CrawlDatum.STATUS_LINKED,
interval, score);
...
}
4. In CrawlDbReducer.java, remove linkCount calculations. Replace these
with something like:
float scoreIncrement = 0.0f;
while (values.hasNext()) {
...
switch (datum.getStatus()) {
...
case CrawlDatum.STATUS_LINKED:
scoreIncrement += datum.getScore();
break;
...
}
...
result.setScore(result.getScore() + scoreIncrement);
I think that should do it, no?
Doug
Index: conf/crawl-tool.xml
===================================================================
--- conf/crawl-tool.xml (revision 326624)
+++ conf/crawl-tool.xml (working copy)
@@ -15,13 +15,6 @@
</property>
<property>
- <name>indexer.boost.by.link.count</name>
- <value>true</value>
- <description>When true scores for a page are multipled by the log of
- the number of incoming links to the page.</description>
-</property>
-
-<property>
<name>db.ignore.internal.links</name>
<value>false</value>
<description>If true, when adding new links to a page, links from
Index: conf/nutch-default.xml
===================================================================
--- conf/nutch-default.xml (revision 326624)
+++ conf/nutch-default.xml (working copy)
@@ -440,24 +440,6 @@
<!-- indexer properties -->
<property>
- <name>indexer.score.power</name>
- <value>0.5</value>
- <description>Determines the power of link analyis scores. Each
- pages's boost is set to <i>score<sup>scorePower</sup></i> where
- <i>score</i> is its link analysis score and <i>scorePower</i> is the
- value of this parameter. This is compiled into indexes, so, when
- this is changed, pages must be re-indexed for it to take
- effect.</description>
-</property>
-
-<property>
- <name>indexer.boost.by.link.count</name>
- <value>true</value>
- <description>When true scores for a page are multipled by the log of
- the number of incoming links to the page.</description>
-</property>
-
-<property>
<name>indexer.max.title.length</name>
<value>100</value>
<description>The maximum number of characters of a title that are indexed.
Index: src/java/org/apache/nutch/crawl/CrawlDatum.java
===================================================================
--- src/java/org/apache/nutch/crawl/CrawlDatum.java (revision 326624)
+++ src/java/org/apache/nutch/crawl/CrawlDatum.java (working copy)
@@ -31,7 +31,7 @@
public static final String FETCH_DIR_NAME = "crawl_fetch";
public static final String PARSE_DIR_NAME = "crawl_parse";
- private final static byte CUR_VERSION = 1;
+ private final static byte CUR_VERSION = 2;
public static final byte STATUS_DB_UNFETCHED = 1;
public static final byte STATUS_DB_FETCHED = 2;
@@ -47,17 +47,20 @@
private long fetchTime = System.currentTimeMillis();
private byte retries;
private float fetchInterval;
- private int linkCount;
+ private float score = 1.0f;
public CrawlDatum() {}
public CrawlDatum(int status, float fetchInterval) {
this.status = (byte)status;
this.fetchInterval = fetchInterval;
- if (status == STATUS_LINKED)
- linkCount = 1;
}
+ public CrawlDatum(int status, float fetchInterval, float score) {
+ this(status, fetchInterval);
+ this.score = score;
+ }
+
//
// accessor methods
//
@@ -80,8 +83,8 @@
this.fetchInterval = fetchInterval;
}
- public int getLinkCount() { return linkCount; }
- public void setLinkCount(int linkCount) { this.linkCount = linkCount; }
+ public float getScore() { return score; }
+ public void setScore(float score) { this.score = score; }
//
// writable methods
@@ -96,18 +99,18 @@
public void readFields(DataInput in) throws IOException {
byte version = in.readByte(); // read version
- if (version > CUR_VERSION) // check version
+ if (version != CUR_VERSION) // check version
throw new VersionMismatchException(CUR_VERSION, version);
status = in.readByte();
fetchTime = in.readLong();
retries = in.readByte();
fetchInterval = in.readFloat();
- linkCount = in.readInt();
+ score = in.readFloat();
}
- /** The number of bytes into a CrawlDatum that the linkCount is stored. */
- private static final int LINK_COUNT_OFFSET = 1 + 1 + 8 + 1 + 4;
+ /** The number of bytes into a CrawlDatum that the score is stored. */
+ private static final int SCORE_OFFSET = 1 + 1 + 8 + 1 + 4;
public void write(DataOutput out) throws IOException {
out.writeByte(CUR_VERSION); // store current version
@@ -115,7 +118,7 @@
out.writeLong(fetchTime);
out.writeByte(retries);
out.writeFloat(fetchInterval);
- out.writeInt(linkCount);
+ out.writeFloat(score);
}
/** Copy the contents of another instance into this instance. */
@@ -124,7 +127,7 @@
this.fetchTime = that.fetchTime;
this.retries = that.retries;
this.fetchInterval = that.fetchInterval;
- this.linkCount = that.linkCount;
+ this.score = that.score;
}
@@ -132,11 +135,11 @@
// compare methods
//
- /** Sort by decreasing link count. */
+ /** Sort by decreasing score. */
public int compareTo(Object o) {
CrawlDatum that = (CrawlDatum)o;
- if (that.linkCount != this.linkCount)
- return that.linkCount - this.linkCount;
+ if (that.score != this.score)
+ return (that.score - this.score) > 0 ? 1 : -1;
if (that.status != this.status)
return this.status - that.status;
if (that.fetchTime != this.fetchTime)
@@ -153,10 +156,10 @@
public Comparator() { super(CrawlDatum.class); }
public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
- int linkCount1 = readInt(b1,s1+LINK_COUNT_OFFSET);
- int linkCount2 = readInt(b2,s2+LINK_COUNT_OFFSET);
- if (linkCount2 != linkCount1) {
- return linkCount2 - linkCount1;
+ float score1 = readFloat(b1,s1+SCORE_OFFSET);
+ float score2 = readFloat(b2,s2+SCORE_OFFSET);
+ if (score2 != score1) {
+ return (score2 - score1) > 0 ? 1 : -1;
}
int status1 = b1[s1+1];
int status2 = b2[s2+1];
@@ -194,7 +197,7 @@
buf.append("Fetch time: " + new Date(getFetchTime()) + "\n");
buf.append("Retries since fetch: " + getRetriesSinceFetch() + "\n");
buf.append("Retry interval: " + getFetchInterval() + " days\n");
- buf.append("Link Count: " + getLinkCount() + "\n");
+ buf.append("Score: " + getScore() + "\n");
return buf.toString();
}
@@ -207,7 +210,7 @@
(this.fetchTime == other.fetchTime) &&
(this.retries == other.retries) &&
(this.fetchInterval == other.fetchInterval) &&
- (this.linkCount == other.linkCount);
+ (this.score == other.score);
}
public int hashCode() {
@@ -216,7 +219,7 @@
((int)fetchTime) ^
retries ^
Float.floatToIntBits(fetchInterval) ^
- linkCount;
+ Float.floatToIntBits(score);
}
public Object clone() {
Index: src/java/org/apache/nutch/crawl/ParseOutputFormat.java
===================================================================
--- src/java/org/apache/nutch/crawl/ParseOutputFormat.java (revision 326624)
+++ src/java/org/apache/nutch/crawl/ParseOutputFormat.java (working copy)
@@ -64,6 +64,11 @@
// collect outlinks for subsequent db update
Outlink[] links = parse.getData().getOutlinks();
+
+ // compute OPIC score contribution
+ float score = Float.valueOf(parse.getData().get(Fetcher.SCORE_KEY));
+ score /= links.length;
+
for (int i = 0; i < links.length; i++) {
String toUrl = links[i].getToUrl();
try {
@@ -75,7 +80,7 @@
if (toUrl != null)
crawlOut.append(new UTF8(toUrl),
new CrawlDatum(CrawlDatum.STATUS_LINKED,
- interval));
+ interval, score));
}
}
Index: src/java/org/apache/nutch/crawl/Fetcher.java
===================================================================
--- src/java/org/apache/nutch/crawl/Fetcher.java (revision 326624)
+++ src/java/org/apache/nutch/crawl/Fetcher.java (working copy)
@@ -38,6 +38,7 @@
public static final String DIGEST_KEY = "nutch.content.digest";
public static final String SEGMENT_NAME_KEY = "nutch.segment.name";
+ public static final String SCORE_KEY = "nutch.crawl.score";
public static class InputFormat extends SequenceFileInputFormat {
/** Don't split inputs, to keep things polite. */
@@ -197,6 +198,8 @@
(DIGEST_KEY, MD5Hash.digest(content.getContent()).toString());
content.getMetadata().setProperty // add segment to metadata
(SEGMENT_NAME_KEY, segmentName);
+ content.getMetadata().setProperty // add score to metadata
+ (SCORE_KEY, Float.toString(datum.getScore()));
Parse parse = null;
if (parsing) {
Index: src/java/org/apache/nutch/crawl/Generator.java
===================================================================
--- src/java/org/apache/nutch/crawl/Generator.java (revision 326624)
+++ src/java/org/apache/nutch/crawl/Generator.java (working copy)
@@ -61,7 +61,7 @@
if (crawlDatum.getFetchTime() > curTime)
return; // not time yet
- output.collect(crawlDatum, key); // invert for sort by linkCount
+ output.collect(crawlDatum, key); // invert for sort by score
}
/** Partition by host (value). */
Index: src/java/org/apache/nutch/crawl/CrawlDbReducer.java
===================================================================
--- src/java/org/apache/nutch/crawl/CrawlDbReducer.java (revision 326624)
+++ src/java/org/apache/nutch/crawl/CrawlDbReducer.java (working copy)
@@ -38,11 +38,10 @@
CrawlDatum highest = null;
CrawlDatum old = null;
- int linkCount = 0;
+ float scoreIncrement = 0.0f;
while (values.hasNext()) {
CrawlDatum datum = (CrawlDatum)values.next();
- linkCount += datum.getLinkCount(); // sum link counts
if (highest == null || datum.getStatus() > highest.getStatus()) {
highest = datum; // find highest status
@@ -52,6 +51,10 @@
case CrawlDatum.STATUS_DB_UNFETCHED:
case CrawlDatum.STATUS_DB_FETCHED:
old = datum;
+ break;
+ case CrawlDatum.STATUS_LINKED:
+ scoreIncrement += datum.getScore();
+ break;
}
}
@@ -99,7 +102,7 @@
}
if (result != null) {
- result.setLinkCount(linkCount);
+ result.setScore(result.getScore() + scoreIncrement);
output.collect(key, result);
}
}
Index: src/java/org/apache/nutch/crawl/Indexer.java
===================================================================
--- src/java/org/apache/nutch/crawl/Indexer.java (revision 326624)
+++ src/java/org/apache/nutch/crawl/Indexer.java (working copy)
@@ -138,12 +138,7 @@
super(conf);
}
- private boolean boostByLinkCount;
- private float scorePower;
-
public void configure(JobConf job) {
- boostByLinkCount = job.getBoolean("indexer.boost.by.link.count", false);
- scorePower = job.getFloat("indexer.score.power", 0.5f);
}
public void reduce(WritableComparable key, Iterator values,
@@ -183,10 +178,8 @@
// add digest, used by dedup
doc.add(Field.UnIndexed("digest", meta.getProperty(Fetcher.DIGEST_KEY)));
- // compute boost
- float boost =
- IndexSegment.calculateBoost(1.0f, scorePower, boostByLinkCount,
- anchors.length);
+ // boost is log(opic)
+ float boost = (float)Math.log(Math.E + crawlDatum.getScore());
// apply boost to all indexed fields.
doc.setBoost(boost);
// store boost for use by explain and dedup