SegmentMergeTool.java

ab Thu, 19 May 2005 16:35:09 -0700

Author: ab
Date: Thu May 19 08:55:53 2005
New Revision: 170951

URL: http://svn.apache.org/viewcvs?rev=170951&view=rev
Log:
Fix SegmentMergeTool so that it uses the same dedup algorithm as the
DeleteDuplicates.java. Slightly refactor IndexSegment, so that the boost
calculation code can be reused.


Modified:
    incubator/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSegment.java
    incubator/nutch/trunk/src/java/org/apache/nutch/tools/SegmentMergeTool.java

Modified: 
incubator/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSegment.java
URL: 
http://svn.apache.org/viewcvs/incubator/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSegment.java?rev=170951&r1=170950&r2=170951&view=diff
==============================================================================
--- incubator/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSegment.java 
(original)
+++ incubator/nutch/trunk/src/java/org/apache/nutch/indexer/IndexSegment.java 
Thu May 19 08:55:53 2005
@@ -202,14 +202,8 @@
     // add digest, used by dedup
     doc.add(Field.UnIndexed("digest", fo.getMD5Hash().toString()));
 
-    // compute boost
-    // 1. Start with page's score from DB -- 1.0 if no link analysis.
-    float boost = fo.getFetchListEntry().getPage().getScore();
-    // 2. Apply scorePower to this.
-    boost = (float)Math.pow(boost, scorePower);
-    // 3. Optionally boost by log of incoming anchor count.
-    if (boostByLinkCount)
-      boost *= (float)Math.log(Math.E + fo.getAnchors().length);
+    float boost = calculateBoost(fo.getFetchListEntry().getPage().getScore(),
+            scorePower, boostByLinkCount, fo.getAnchors().length);
     // 4. Apply boost to all indexed fields.
     doc.setBoost(boost);
 
@@ -219,6 +213,17 @@
     return doc;
   }
 
+  public static float calculateBoost(float pageScore, float scorePower,
+          boolean boostByLinkCount, int linkCount) {
+    // 1. Start with page's score from DB -- 1.0 if no link analysis.
+    float res = pageScore;
+    // 2. Apply scorePower to this.
+    res = (float)Math.pow(pageScore, scorePower);
+    // 3. Optionally boost by log of incoming anchor count.
+    if (boostByLinkCount)
+      res *= (float)Math.log(Math.E + linkCount);
+    return res;
+  }
 
   /** 
    * Create an index for the input files in the named directory. 

Modified: 
incubator/nutch/trunk/src/java/org/apache/nutch/tools/SegmentMergeTool.java
URL: 
http://svn.apache.org/viewcvs/incubator/nutch/trunk/src/java/org/apache/nutch/tools/SegmentMergeTool.java?rev=170951&r1=170950&r2=170951&view=diff
==============================================================================
--- incubator/nutch/trunk/src/java/org/apache/nutch/tools/SegmentMergeTool.java 
(original)
+++ incubator/nutch/trunk/src/java/org/apache/nutch/tools/SegmentMergeTool.java 
Thu May 19 08:55:53 2005
@@ -36,6 +36,7 @@
 import org.apache.nutch.segment.SegmentReader;
 import org.apache.nutch.segment.SegmentWriter;
 import org.apache.nutch.util.LogFormatter;
+import org.apache.nutch.util.NutchConf;
 
 import org.apache.lucene.analysis.WhitespaceAnalyzer;
 import org.apache.lucene.document.DateField;
@@ -95,6 +96,11 @@
   public static int INDEX_MERGE_FACTOR = 30;
   public static int INDEX_MIN_MERGE_DOCS = 100;
   
+  private boolean boostByLinkCount =
+    NutchConf.get().getBoolean("indexer.boost.by.link.count", false);
+
+  private float scorePower = NutchConf.get().getFloat("indexer.score.power", 
0.5f);
+  
   private NutchFileSystem nfs = null;
   private File[] segments = null;
   private int stage = SegmentMergeStatus.STAGE_OPENING;
@@ -231,10 +237,16 @@
             if (!sr.get(i, fo, null, null, null)) break;
 
             Document doc = new Document();
+            
+            // compute boost
+            float boost = 
IndexSegment.calculateBoost(fo.getFetchListEntry().getPage().getScore(),
+                    scorePower, boostByLinkCount, fo.getAnchors().length);
             doc.add(new Field("sd", name + "|" + i, true, false, false));
             doc.add(new Field("uh", 
MD5Hash.digest(fo.getUrl().toString()).toString(), true, true, false));
             doc.add(new Field("ch", fo.getMD5Hash().toString(), true, true, 
false));
             doc.add(new Field("time", 
DateField.timeToString(fo.getFetchDate()), true, false, false));
+            doc.add(new Field("score", boost + "", true, false, false));
+            doc.add(new Field("ul", fo.getUrl().toString().length() + "", 
true, false, false));
             iw.addDocument(doc);
             processedRecords++;
             if (processedRecords > 0 && (processedRecords % LOG_STEP == 0)) {
@@ -304,32 +316,82 @@
         // Enumerate all docs with the same URL hash or content hash
         TermDocs td = ir.termDocs(t);
         if (td == null) continue;
-        int id = -1;
-        String time = null;
-        Document doc = null;
-        // Keep only the latest version of the document with
-        // the same hash (url or content). Note: even if the content
-        // hash is identical, other metadata may be different, so even
-        // in this case it makes sense to keep the latest version.
-        while (td.next()) {
-          int docid = td.doc();
-          if (!ir.isDeleted(docid)) {
-            doc = ir.document(docid);
-            if (time == null) {
-              time = doc.get("time");
-              id = docid;
-              continue;
+        if (t.field().equals("uh")) {
+          // Keep only the latest version of the document with
+          // the same url hash. Note: even if the content
+          // hash is identical, other metadata may be different, so even
+          // in this case it makes sense to keep the latest version.
+          int id = -1;
+          String time = null;
+          Document doc = null;
+          while (td.next()) {
+            int docid = td.doc();
+            if (!ir.isDeleted(docid)) {
+              doc = ir.document(docid);
+              if (time == null) {
+                time = doc.get("time");
+                id = docid;
+                continue;
+              }
+              String dtime = doc.get("time");
+              // "time" is a DateField, and can be compared lexicographically
+              if (dtime.compareTo(time) > 0) {
+                if (id != -1) {
+                  ir.delete(id);
+                }
+                time = dtime;
+                id = docid;
+              } else {
+                ir.delete(docid);
+              }
             }
-            String dtime = doc.get("time");
-            // "time" is a DateField, and can be compared lexicographically
-            if (dtime.compareTo(time) > 0) {
-              if (id != -1) {
-                ir.delete(id);
+          }
+        } else if (t.field().equals("ch")) {
+          // Keep only the version of the document with
+          // the highest score, and then with the shortest url.
+          int id = -1;
+          int ul = 0;
+          float score = 0.0f;
+          Document doc = null;
+          while (td.next()) {
+            int docid = td.doc();
+            if (!ir.isDeleted(docid)) {
+              doc = ir.document(docid);
+              if (ul == 0) {
+                try {
+                  ul = Integer.parseInt(doc.get("ul"));
+                  score = Float.parseFloat(doc.get("score"));
+                } catch (Exception e) {};
+                id = docid;
+                continue;
+              }
+              int dul = 0;
+              float dscore = 0.0f;
+              try {
+                dul = Integer.parseInt(doc.get("ul"));
+                dscore = Float.parseFloat(doc.get("score"));
+              } catch (Exception e) {};
+              int cmp = Float.compare(dscore, score);
+              if (cmp == 0) {
+                // equal scores, select the one with shortest url
+                if (dul < ul) {
+                  if (id != -1) {
+                    ir.delete(id);
+                  }
+                  ul = dul;
+                  id = docid;
+                } else {
+                  ir.delete(docid);
+                }
+              } else if (cmp < 0) {
+                ir.delete(docid);
+              } else {
+                if (id != -1) {
+                  ir.delete(id);
+                }
+                ul = dul;
+                id = docid;
               }
-              time = dtime;
-              id = docid;
-            } else {
-              ir.delete(docid);
             }
           }
         }




-------------------------------------------------------
This SF.Net email is sponsored by Oracle Space Sweepstakes
Want to be the first software developer in space?
Enter now for the Oracle Space Sweepstakes!
http://ads.osdn.com/?ad_id=7412&alloc_id=16344&op=click
_______________________________________________
Nutch-cvs mailing list
Nutch-cvs@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/nutch-cvs

[Nutch-cvs] svn commit: r170951 - in /incubator/nutch/trunk/src/java/org/apache/nutch: indexer/IndexSegment.java tools/SegmentMergeTool.java

Reply via email to