Author: rwesten
Date: Thu Apr 18 09:14:16 2013
New Revision: 1469233

URL: http://svn.apache.org/r1469233
Log:
STANBOL-1027: Implementation of the algorithm that partly compensates the Solr 
lengthNorm by applying an additional index-time boost to entities with multiple labels

Modified:
    
stanbol/trunk/entityhub/yard/solr/src/main/java/org/apache/stanbol/entityhub/yard/solr/impl/SolrYard.java

Modified: 
stanbol/trunk/entityhub/yard/solr/src/main/java/org/apache/stanbol/entityhub/yard/solr/impl/SolrYard.java
URL: 
http://svn.apache.org/viewvc/stanbol/trunk/entityhub/yard/solr/src/main/java/org/apache/stanbol/entityhub/yard/solr/impl/SolrYard.java?rev=1469233&r1=1469232&r2=1469233&view=diff
==============================================================================
--- 
stanbol/trunk/entityhub/yard/solr/src/main/java/org/apache/stanbol/entityhub/yard/solr/impl/SolrYard.java
 (original)
+++ 
stanbol/trunk/entityhub/yard/solr/src/main/java/org/apache/stanbol/entityhub/yard/solr/impl/SolrYard.java
 Thu Apr 18 09:14:16 2013
@@ -23,10 +23,12 @@ import java.security.PrivilegedException
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collection;
+import java.util.HashMap;
 import java.util.HashSet;
 import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
+import java.util.Map.Entry;
 import java.util.Set;
 
 import org.apache.solr.client.solrj.SolrQuery;
@@ -38,6 +40,7 @@ import org.apache.solr.client.solrj.resp
 import org.apache.solr.client.solrj.response.UpdateResponse;
 import org.apache.solr.common.SolrDocument;
 import org.apache.solr.common.SolrInputDocument;
+import org.apache.solr.common.SolrInputField;
 import org.apache.stanbol.commons.namespaceprefix.NamespacePrefixService;
 import org.apache.stanbol.commons.solr.utils.SolrUtil;
 import org.apache.stanbol.commons.solr.utils.StreamQueryRequest;
@@ -744,28 +747,25 @@ public class SolrYard extends AbstractYa
             // But this would also prevent the possibility to intentionally
             // override the boost.
             String field = fields.next();
-            float boost;
+            /*
+             * With STANBOL-1027 the calculation of the boost has changed to
+             * consider multiple values for Representation#get(field).
+             */
+            float baseBoost; //the boost without considering the number of 
values per solr field
             Float fieldBoost = fieldBoostMap == null ? null : 
fieldBoostMap.get(field);
-            //With solr 3.6 one can not set index time boosts on fields that 
omitNorms
-            //because of that we need to restrict the usage of boosts to those 
manually
-            //configured in the fieldBoostMap. Before bosts where dropped for 
fields that
-            //do not support them
+            final Map<String,int[]> fieldsToBoost; //used to keep track of 
field we need boost
             if(fieldBoost != null){
-                boost = documentBoost != null ? fieldBoost * documentBoost : 
fieldBoost;
+                baseBoost = documentBoost != null ? fieldBoost * documentBoost 
: fieldBoost;
+                fieldsToBoost = new HashMap<String,int[]>();
             } else { 
-                boost = -1;
+                baseBoost = -1;
+                fieldsToBoost = null;
             }
-            //the old code that does no longer work with Solr 3.6 :(
-//            if(documentBoost != null){
-//                boost = documentBoost;
-//                if(fieldBoost != null){
-//                    boost = boost*fieldBoost;
-//                }
-//            } else if(fieldBoost != null){
-//                boost = fieldBoost;
-//            } else {
-//                boost = -1;
-//            }
+            //NOTE: Setting a boost requires two iteration
+            //  (1) we add the values to the SolrInputDocument without an boost
+            //  (2) set the boost by using 
doc.setField(field,doc.getFieldValues(),boost)
+            //  Holding field values in an own map does not make sense as the 
SolrInputDocument
+            //  does already exactly that (in an more efficient way)
             for (Iterator<Object> values = representation.get(field); 
values.hasNext();) {
                 // now we need to get the indexField for the value
                 Object next = values.next();
@@ -773,10 +773,23 @@ public class SolrYard extends AbstractYa
                 try {
                     value = indexValueFactory.createIndexValue(next);
                     for (String fieldName : 
fieldMapper.getFieldNames(Arrays.asList(field), value)) {
-                        //Set Boosts only for text data types
-                        if(boost > 0){
-                            inputDocument.addField(fieldName, 
value.getValue(), boost);
+                        //In step (1) of boosting just keep track of the field
+                        if(fieldBoost != null){ //wee need to boost in (2)
+                            int[] numValues = fieldsToBoost.get(fieldName);
+                            if(numValues == null){
+                                numValues = new int[]{1};
+                                fieldsToBoost.put(fieldName, numValues);
+                                //the first time add the document with the 
baseBoost
+                                //as this will be the correct boost for single 
value fields
+                                inputDocument.addField(fieldName, 
value.getValue(),baseBoost);
+                            } else {
+                                numValues[0]++;
+                                //for multi valued fields the correct boost is 
set in (2)
+                                //so we can add here without an boost
+                                inputDocument.addField(fieldName, 
value.getValue());
+                            }
                         } else {
+                            //add add the values without boost
                             inputDocument.addField(fieldName, 
value.getValue());
                         }
                     }
@@ -795,6 +808,18 @@ public class SolrYard extends AbstractYa
                             next.getClass(), field), e);
                 }
             }
+            if(fieldBoost != null){ //we need still to do part (2) of setting 
the correct boost
+                for(Entry<String,int[]> entry : fieldsToBoost.entrySet()){
+                    if(entry.getValue()[0] > 1) { //adapt the boost only for 
multi valued fields
+                        SolrInputField solrField = 
inputDocument.getField(entry.getKey());
+                        //the correct bosst is baseBoost (representing entity 
boost with field
+                        //boost) multiplied with the sqrt(fieldValues). The 
2nd part aims to
+                        //compensate the Solr lengthNorm (1/sqrt(fieldTokens))
+                        //see STANBOL-1027 for details
+                        
solrField.setBoost(baseBoost*(float)Math.sqrt(entry.getValue()[0]));
+                    }
+                }
+            }
         }
         return inputDocument;
     }


Reply via email to