Hello,

During my experiments with adding title and host to scoring, I changed the Nutch source to allow boost values for fields to be set in the Nutch config file. I am attaching a patch against the latest SVN version.

I tested many values of these properties for our site, and we finally settled on 2.0 for host and 1.0 for all others. This helped a little while our index contained a lot of pages from spam sites. Once we removed those bad pages, it did not matter as much as I had hoped. Anyway, I am attaching the patch so anyone can play with it.
Regards
Piotr


Index: conf/nutch-default.xml
===================================================================
--- conf/nutch-default.xml      (revision 161968)
+++ conf/nutch-default.xml      (working copy)
@@ -669,4 +669,43 @@
   </description>
 </property>
 
+<!-- query-basic plugin properties -->
+
+  <property>
+    <name>query.url.boost</name>
+    <value>4.0</value>
+    <description> Used as a boost for url field in Lucene query.
+    </description>
+  </property>
+  
+  <property>
+    <name>query.anchor.boost</name>
+    <value>2.0</value>
+    <description> Used as a boost for anchor field in Lucene query.
+    </description>
+  </property>
+  
+  
+  <property>
+    <name>query.title.boost</name>
+    <value>1.5</value>
+    <description> Used as a boost for title field in Lucene query.
+    </description>
+  </property>
+  
+  <property>
+    <name>query.host.boost</name>
+    <value>2.0</value>
+    <description> Used as a boost for host field in Lucene query.
+    </description>
+  </property>
+  
+  <property>
+    <name>query.phrase.boost</name>
+    <value>1.0</value>
+    <description> Used as a boost for phrase matches in Lucene query.
+    Multiplied by the boost of the field in which the phrase is matched.
+    </description>
+  </property>
+
 </nutch-conf>
Index: src/plugin/query-basic/src/java/org/apache/nutch/searcher/basic/BasicQueryFilter.java
===================================================================
--- src/plugin/query-basic/src/java/org/apache/nutch/searcher/basic/BasicQueryFilter.java      (revision 161968)
+++ src/plugin/query-basic/src/java/org/apache/nutch/searcher/basic/BasicQueryFilter.java      (working copy)
@@ -26,6 +26,7 @@
 import org.apache.nutch.searcher.QueryFilter;
 import org.apache.nutch.searcher.Query;
 import org.apache.nutch.searcher.Query.*;
+import org.apache.nutch.util.NutchConf;
 
 import java.io.IOException;
 import java.util.HashSet;
@@ -33,15 +34,24 @@
 /** The default query filter.  Query terms in the default query field are
  * expanded to search the url, anchor and content document fields.*/
 public class BasicQueryFilter implements QueryFilter {
+    
+    private static float URL_BOOST = NutchConf.get().getFloat(
+            "query.url.boost", 4.0f);
 
-  private static float URL_BOOST = 4.0f;
-  private static float ANCHOR_BOOST = 2.0f;
-  private static float TITLE_BOOST = 1.5f;
-  private static float HOST_BOOST = 2.0f;
+    private static float ANCHOR_BOOST = NutchConf.get().getFloat(
+            "query.anchor.boost", 2.0f);
 
-  private static int SLOP = Integer.MAX_VALUE;
-  private static float PHRASE_BOOST = 1.0f;
+    private static float TITLE_BOOST = NutchConf.get().getFloat(
+            "query.title.boost", 1.5f);
 
+    private static float HOST_BOOST = NutchConf.get().getFloat(
+            "query.host.boost", 2.0f);
+
+    private static int SLOP = Integer.MAX_VALUE;
+
+    private static float PHRASE_BOOST = NutchConf.get().getFloat(
+            "query.phrase.boost", 1.0f);
+
   private static final String[] FIELDS =
   { "url", "anchor", "content", "title", "host" };
 

Reply via email to