Author: siren Date: Wed Feb 18 12:53:12 2009 New Revision: 745503 URL: http://svn.apache.org/viewvc?rev=745503&view=rev Log: NUTCH-563 Include custom fields in BasicQueryFilter, contributed by Julien Nioche
Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/conf/nutch-default.xml lucene/nutch/trunk/src/plugin/query-basic/src/java/org/apache/nutch/searcher/basic/BasicQueryFilter.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=745503&r1=745502&r2=745503&view=diff ============================================================================== --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Wed Feb 18 12:53:12 2009 @@ -346,6 +346,9 @@ 129. NUTCH-691 - Update jakarta poi jars to the most relevant version (Dmitry Lihachev via siren) +130. NUTCH-563 - Include custom fields in BasicQueryFilter + (Julien Nioche via siren) + Release 0.9 - 2007-04-02 1. Changed log4j confiquration to log to stdout on commandline Modified: lucene/nutch/trunk/conf/nutch-default.xml URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/conf/nutch-default.xml?rev=745503&r1=745502&r2=745503&view=diff ============================================================================== --- lucene/nutch/trunk/conf/nutch-default.xml (original) +++ lucene/nutch/trunk/conf/nutch-default.xml Wed Feb 18 12:53:12 2009 @@ -1119,6 +1119,15 @@ </description> </property> +<!-- +<property> + <name>query.basic.description.boost</name> + <value>1.0</value> + <description> Declares a custom field and its boost to be added to the default fields of the Lucene query. 
+ </description> +</property> +--> + <!-- creative-commons plugin properties --> <property> Modified: lucene/nutch/trunk/src/plugin/query-basic/src/java/org/apache/nutch/searcher/basic/BasicQueryFilter.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/query-basic/src/java/org/apache/nutch/searcher/basic/BasicQueryFilter.java?rev=745503&r1=745502&r2=745503&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/query-basic/src/java/org/apache/nutch/searcher/basic/BasicQueryFilter.java (original) +++ lucene/nutch/trunk/src/plugin/query-basic/src/java/org/apache/nutch/searcher/basic/BasicQueryFilter.java Wed Feb 18 12:53:12 2009 @@ -22,6 +22,13 @@ import org.apache.lucene.search.PhraseQuery; import org.apache.lucene.search.TermQuery; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + import org.apache.nutch.analysis.NutchDocumentAnalyzer; import org.apache.nutch.analysis.CommonGrams; @@ -31,7 +38,12 @@ import org.apache.hadoop.conf.Configuration; /** The default query filter. Query terms in the default query field are - * expanded to search the url, anchor and content document fields.*/ + * expanded to search the url, anchor and content document fields. + * Additional fields can be added by specifying parameters of the form : query.basic.(fieldname).boost + * to the configuration files (see nutch-default.xml for an example). Such fields will be used in the clauses + * generated by the BasicQueryFilter e.g. for a user query A B, it generates +(field1:A field2:A ...) +(field1:B field2:B....). + * If you don't want the additional fields to be included in the clauses you will need to implement a custom query filter for it. 
+ **/ public class BasicQueryFilter implements QueryFilter { private static final int URL_BOOST = 0; @@ -44,7 +56,7 @@ private float PHRASE_BOOST; - private static final String[] FIELDS = + private String[] FIELDS = { "url", "anchor", "content", "title", "host" }; private float[] FIELD_BOOSTS = new float[5]; @@ -177,9 +189,51 @@ this.FIELD_BOOSTS[TITLE_BOOST] = conf.getFloat("query.title.boost", 1.5f); this.FIELD_BOOSTS[HOST_BOOST] = conf.getFloat("query.host.boost", 2.0f); this.PHRASE_BOOST = conf.getFloat("query.phrase.boost", 1.0f); + findAdditionalFields(conf); } public Configuration getConf() { return this.conf; } + + /** Searches for parameters of the form : query.basic.(fieldname).boost + * and adds the field name to the list of default fields. + **/ + private void findAdditionalFields(Configuration conf) { + // get additional fields specified in parameters + Pattern pat = Pattern.compile("query\\.basic\\.(.+)\\.boost"); + Iterator confEntriesIterator = conf.iterator(); + List existingFields = java.util.Arrays.asList(FIELDS); + ArrayList tempfieldNames = new ArrayList(); + ArrayList tempfieldBoosts = new ArrayList(); + while (confEntriesIterator.hasNext()){ + Map.Entry entry = (Map.Entry) confEntriesIterator.next(); + String key = entry.getKey().toString(); + Matcher match = pat.matcher(key); + if (!match.matches())continue; + String fieldName = match.group(1); + if (fieldName!=null){ + // check whether it matches one of the fields which are used by default + if (existingFields.contains(fieldName)) continue; + // reserved keyword + if (fieldName.equals("phrase")) continue; + float boostCustomField = conf.getFloat(key, 2.0f); + tempfieldNames.add(fieldName); + tempfieldBoosts.add(Float.valueOf(boostCustomField)); + } + } + if (tempfieldNames.size()==0) return; + // store additional fields names and boost values in corresponding fields + String[] tempNames = new String[5+tempfieldNames.size()]; + float[] tempBoosts = new float[5+tempfieldNames.size()]; + 
System.arraycopy(FIELDS, 0,tempNames, 0, 5); + System.arraycopy(this.FIELD_BOOSTS, 0,tempBoosts, 0, 5); + for (int newF=0; newF < tempfieldNames.size();newF++){ + tempNames[5+newF]=(String) tempfieldNames.get(newF); + tempBoosts[5+newF]= ((Float)tempfieldBoosts.get(newF)).floatValue(); + } + // replace original fields + this.FIELDS = tempNames; + this.FIELD_BOOSTS = tempBoosts; + } }