Hello,
During my experiments with adding title and host to scoring, I changed the Nutch source to allow setting boost values for fields in the Nutch config file. I am attaching a patch against the latest SVN version.
I tested many values of these properties for our site, and we finally settled on 2.0 for host and 1.0 for all others. It helped a little when our index contained a lot of pages from spam sites. Once we removed those bad pages, it did not matter as much as I had hoped. Anyway, I am attaching the patch so anyone can play with it.
Regards
Piotr
Index: conf/nutch-default.xml
===================================================================
--- conf/nutch-default.xml (revision 161968)
+++ conf/nutch-default.xml (working copy)
@@ -669,4 +669,43 @@
</description>
</property>
+<!-- query-basic plugin properties -->
+
+ <property>
+ <name>query.url.boost</name>
+ <value>4.0</value>
+ <description> Used as a boost for url field in Lucene query.
+ </description>
+ </property>
+
+ <property>
+ <name>query.anchor.boost</name>
+ <value>2.0</value>
+ <description> Used as a boost for anchor field in Lucene query.
+ </description>
+ </property>
+
+
+ <property>
+ <name>query.title.boost</name>
+ <value>1.5</value>
+ <description> Used as a boost for title field in Lucene query.
+ </description>
+ </property>
+
+ <property>
+ <name>query.host.boost</name>
+ <value>2.0</value>
+ <description> Used as a boost for host field in Lucene query.
+ </description>
+ </property>
+
+ <property>
+ <name>query.phrase.boost</name>
+ <value>1.0</value>
+ <description> Used as a boost for phrase in Lucene query.
+ Multiplied by the boost for the field the phrase is matched in.
+ </description>
+ </property>
+
</nutch-conf>
Index:
src/plugin/query-basic/src/java/org/apache/nutch/searcher/basic/BasicQueryFilter.java
===================================================================
---
src/plugin/query-basic/src/java/org/apache/nutch/searcher/basic/BasicQueryFilter.java
(revision 161968)
+++
src/plugin/query-basic/src/java/org/apache/nutch/searcher/basic/BasicQueryFilter.java
(working copy)
@@ -26,6 +26,7 @@
import org.apache.nutch.searcher.QueryFilter;
import org.apache.nutch.searcher.Query;
import org.apache.nutch.searcher.Query.*;
+import org.apache.nutch.util.NutchConf;
import java.io.IOException;
import java.util.HashSet;
@@ -33,15 +34,24 @@
/** The default query filter. Query terms in the default query field are
* expanded to search the url, anchor and content document fields.*/
public class BasicQueryFilter implements QueryFilter {
+
+ private static float URL_BOOST = NutchConf.get().getFloat(
+ "query.url.boost", 4.0f);
- private static float URL_BOOST = 4.0f;
- private static float ANCHOR_BOOST = 2.0f;
- private static float TITLE_BOOST = 1.5f;
- private static float HOST_BOOST = 2.0f;
+ private static float ANCHOR_BOOST = NutchConf.get().getFloat(
+ "query.anchor.boost", 2.0f);
- private static int SLOP = Integer.MAX_VALUE;
- private static float PHRASE_BOOST = 1.0f;
+ private static float TITLE_BOOST = NutchConf.get().getFloat(
+ "query.title.boost", 1.5f);
+ private static float HOST_BOOST = NutchConf.get().getFloat(
+ "query.host.boost", 2.0f);
+
+ private static int SLOP = Integer.MAX_VALUE;
+
+ private static float PHRASE_BOOST = NutchConf.get().getFloat(
+ "query.phrase.boost", 1.0f);
+
private static final String[] FIELDS =
{ "url", "anchor", "content", "title", "host" };
