Author: ab
Date: Thu May 19 08:55:53 2005
New Revision: 170951
URL: http://svn.apache.org/viewcvs?rev=170951&view=rev
Log:
Fix SegmentMergeTool so that it uses the same dedup algorithm as
DeleteDuplicates.java. Slightly refactor IndexSegment so that the boost
calculation code can be reused.
Modified:
incubator/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSegment.java
incubator/nutch/trunk/src/java/org/apache/nutch/tools/SegmentMergeTool.java
Modified:
incubator/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSegment.java
URL:
http://svn.apache.org/viewcvs/incubator/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSegment.java?rev=170951&r1=170950&r2=170951&view=diff
==============================================================================
--- incubator/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSegment.java
(original)
+++ incubator/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSegment.java
Thu May 19 08:55:53 2005
@@ -202,14 +202,8 @@
// add digest, used by dedup
doc.add(Field.UnIndexed("digest", fo.getMD5Hash().toString()));
- // compute boost
- // 1. Start with page's score from DB -- 1.0 if no link analysis.
- float boost = fo.getFetchListEntry().getPage().getScore();
- // 2. Apply scorePower to this.
- boost = (float)Math.pow(boost, scorePower);
- // 3. Optionally boost by log of incoming anchor count.
- if (boostByLinkCount)
- boost *= (float)Math.log(Math.E + fo.getAnchors().length);
+ float boost = calculateBoost(fo.getFetchListEntry().getPage().getScore(),
+ scorePower, boostByLinkCount, fo.getAnchors().length);
// 4. Apply boost to all indexed fields.
doc.setBoost(boost);
@@ -219,6 +213,17 @@
return doc;
}
+ public static float calculateBoost(float pageScore, float scorePower,
+ boolean boostByLinkCount, int linkCount) { // returns pageScore^scorePower, optionally scaled by ln(e + linkCount)
+ // 1. Start with page's score from DB -- 1.0 if no link analysis.
+ float res = pageScore; // NOTE(review): dead store -- overwritten by the next statement
+ // 2. Apply scorePower to this, i.e. res = pageScore raised to scorePower.
+ res = (float)Math.pow(pageScore, scorePower);
+ // 3. Optionally boost by log of incoming anchor count; ln(e + 0) == 1, so a zero count leaves res unchanged.
+ if (boostByLinkCount)
+ res *= (float)Math.log(Math.E + linkCount);
+ return res;
+ }
/**
* Create an index for the input files in the named directory.
Modified:
incubator/nutch/trunk/src/java/org/apache/nutch/tools/SegmentMergeTool.java
URL:
http://svn.apache.org/viewcvs/incubator/nutch/trunk/src/java/org/apache/nutch/tools/SegmentMergeTool.java?rev=170951&r1=170950&r2=170951&view=diff
==============================================================================
--- incubator/nutch/trunk/src/java/org/apache/nutch/tools/SegmentMergeTool.java
(original)
+++ incubator/nutch/trunk/src/java/org/apache/nutch/tools/SegmentMergeTool.java
Thu May 19 08:55:53 2005
@@ -36,6 +36,7 @@
import org.apache.nutch.segment.SegmentReader;
import org.apache.nutch.segment.SegmentWriter;
import org.apache.nutch.util.LogFormatter;
+import org.apache.nutch.util.NutchConf;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.document.DateField;
@@ -95,6 +96,11 @@
public static int INDEX_MERGE_FACTOR = 30;
public static int INDEX_MIN_MERGE_DOCS = 100;
+ private boolean boostByLinkCount =
+ NutchConf.get().getBoolean("indexer.boost.by.link.count", false);
+
+ private float scorePower = NutchConf.get().getFloat("indexer.score.power",
0.5f);
+
private NutchFileSystem nfs = null;
private File[] segments = null;
private int stage = SegmentMergeStatus.STAGE_OPENING;
@@ -231,10 +237,16 @@
if (!sr.get(i, fo, null, null, null)) break;
Document doc = new Document();
+
+ // compute boost
+ float boost =
IndexSegment.calculateBoost(fo.getFetchListEntry().getPage().getScore(),
+ scorePower, boostByLinkCount, fo.getAnchors().length);
doc.add(new Field("sd", name + "|" + i, true, false, false));
doc.add(new Field("uh",
MD5Hash.digest(fo.getUrl().toString()).toString(), true, true, false));
doc.add(new Field("ch", fo.getMD5Hash().toString(), true, true,
false));
doc.add(new Field("time",
DateField.timeToString(fo.getFetchDate()), true, false, false));
+ doc.add(new Field("score", boost + "", true, false, false));
+ doc.add(new Field("ul", fo.getUrl().toString().length() + "",
true, false, false));
iw.addDocument(doc);
processedRecords++;
if (processedRecords > 0 && (processedRecords % LOG_STEP == 0)) {
@@ -304,32 +316,82 @@
// Enumerate all docs with the same URL hash or content hash
TermDocs td = ir.termDocs(t);
if (td == null) continue;
- int id = -1;
- String time = null;
- Document doc = null;
- // Keep only the latest version of the document with
- // the same hash (url or content). Note: even if the content
- // hash is identical, other metadata may be different, so even
- // in this case it makes sense to keep the latest version.
- while (td.next()) {
- int docid = td.doc();
- if (!ir.isDeleted(docid)) {
- doc = ir.document(docid);
- if (time == null) {
- time = doc.get("time");
- id = docid;
- continue;
+ if (t.field().equals("uh")) {
+ // Keep only the latest version of the document with
+ // the same url hash. Note: even if the content
+ // hash is identical, other metadata may be different, so even
+ // in this case it makes sense to keep the latest version.
+ int id = -1;
+ String time = null;
+ Document doc = null;
+ while (td.next()) {
+ int docid = td.doc();
+ if (!ir.isDeleted(docid)) {
+ doc = ir.document(docid);
+ if (time == null) {
+ time = doc.get("time");
+ id = docid;
+ continue;
+ }
+ String dtime = doc.get("time");
+ // "time" is a DateField, and can be compared lexicographically
+ if (dtime.compareTo(time) > 0) {
+ if (id != -1) {
+ ir.delete(id);
+ }
+ time = dtime;
+ id = docid;
+ } else {
+ ir.delete(docid);
+ }
}
- String dtime = doc.get("time");
- // "time" is a DateField, and can be compared lexicographically
- if (dtime.compareTo(time) > 0) {
- if (id != -1) {
- ir.delete(id);
+ }
+ } else if (t.field().equals("ch")) {
+ // Keep only the version of the document with
+ // the highest score, and then with the shortest url.
+ int id = -1;
+ int ul = 0;
+ float score = 0.0f;
+ Document doc = null;
+ while (td.next()) {
+ int docid = td.doc();
+ if (!ir.isDeleted(docid)) {
+ doc = ir.document(docid);
+ if (ul == 0) {
+ try {
+ ul = Integer.parseInt(doc.get("ul"));
+ score = Float.parseFloat(doc.get("score"));
+ } catch (Exception e) {};
+ id = docid;
+ continue;
+ }
+ int dul = 0;
+ float dscore = 0.0f;
+ try {
+ dul = Integer.parseInt(doc.get("ul"));
+ dscore = Float.parseFloat(doc.get("score"));
+ } catch (Exception e) {};
+ int cmp = Float.compare(dscore, score);
+ if (cmp == 0) {
+ // equal scores, select the one with shortest url
+ if (dul < ul) {
+ if (id != -1) {
+ ir.delete(id);
+ }
+ ul = dul;
+ id = docid;
+ } else {
+ ir.delete(docid);
+ }
+ } else if (cmp < 0) {
+ ir.delete(docid);
+ } else {
+ if (id != -1) {
+ ir.delete(id);
+ }
+ ul = dul;
+ id = docid;
}
- time = dtime;
- id = docid;
- } else {
- ir.delete(docid);
}
}
}
-------------------------------------------------------
This SF.Net email is sponsored by Oracle Space Sweepstakes
Want to be the first software developer in space?
Enter now for the Oracle Space Sweepstakes!
http://ads.osdn.com/?ad_id=7412&alloc_id=16344&op=click
_______________________________________________
Nutch-cvs mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/nutch-cvs