Author: ogrisel
Date: Mon Jan  9 14:28:33 2012
New Revision: 1229168

URL: http://svn.apache.org/viewvc?rev=1229168&view=rev
Log:
STANBOL-197: minimalist implementation of the model training algorithm

Modified:
    
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/engine/topic/TopicClassificationEngine.java
    
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/Batch.java
    
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/SolrTrainingSet.java
    
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/TopicClassifier.java
    
incubator/stanbol/trunk/enhancer/engines/topic/src/test/java/org/apache/stanbol/enhancer/engine/topic/TopicEngineTest.java

Modified: 
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/engine/topic/TopicClassificationEngine.java
URL: 
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/engine/topic/TopicClassificationEngine.java?rev=1229168&r1=1229167&r2=1229168&view=diff
==============================================================================
--- 
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/engine/topic/TopicClassificationEngine.java
 (original)
+++ 
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/engine/topic/TopicClassificationEngine.java
 Mon Jan  9 14:28:33 2012
@@ -22,6 +22,7 @@ import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.Collections;
+import java.util.Date;
 import java.util.Dictionary;
 import java.util.Iterator;
 import java.util.LinkedHashSet;
@@ -58,6 +59,7 @@ import org.apache.stanbol.enhancer.servi
 import org.apache.stanbol.enhancer.servicesapi.ServiceProperties;
 import org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper;
 import org.apache.stanbol.enhancer.servicesapi.rdf.TechnicalClasses;
+import org.apache.stanbol.enhancer.topic.Batch;
 import org.apache.stanbol.enhancer.topic.ClassifierException;
 import org.apache.stanbol.enhancer.topic.ConfiguredSolrCoreTracker;
 import org.apache.stanbol.enhancer.topic.TopicClassifier;
@@ -112,6 +114,12 @@ public class TopicClassificationEngine e
 
     private static final Logger log = 
LoggerFactory.getLogger(TopicClassificationEngine.class);
 
+    // TODO: make the following bounds configurable
+
+    public int MAX_CHARS_PER_TOPIC = 100000;
+
+    public Integer MAX_ROOTS = 1000;
+
     protected String engineId;
 
     protected List<String> acceptedLanguages;
@@ -156,7 +164,7 @@ public class TopicClassificationEngine e
 
         // optional fields, can be null
         broaderField = (String) config.get(BROADER_FIELD);
-        materializedPathField = (String) config.get(TOPIC_URI_FIELD);
+        materializedPathField = (String) config.get(MATERIALIZED_PATH_FIELD);
         modelUpdateDateField = (String) config.get(MODEL_UPDATE_DATE_FIELD);
         Object orderParamValue = config.get(ORDER);
         if (orderParamValue != null) {
@@ -340,10 +348,11 @@ public class TopicClassificationEngine e
 
     @Override
     public Set<String> getTopicRoots() throws ClassifierException {
-        // TODO: this can be very big on flat thesauri: should we enable a 
paging API instead?
         LinkedHashSet<String> rootTopics = new LinkedHashSet<String>();
         SolrServer solrServer = getActiveSolrServer();
         SolrQuery query = new SolrQuery();
+        // TODO: this can be very big on flat thesauri: should we enable a 
paging API instead?
+        query.setRows(MAX_ROOTS);
         if (broaderField != null) {
             // find any topic with an empty broaderField
             query.setParam("q", "-" + broaderField + ":[\"\" TO *]");
@@ -352,7 +361,12 @@ public class TopicClassificationEngine e
             query.setQuery("*:*");
         }
         try {
-            for (SolrDocument result : solrServer.query(query).getResults()) {
+            QueryResponse response = solrServer.query(query);
+            if (response.getResults().size() >= MAX_ROOTS) {
+                log.warn(String.format("TopicClassifier '%s' has more than %d 
registered topic roots."
+                                       + " Some roots might be ignored.", 
engineId, MAX_ROOTS));
+            }
+            for (SolrDocument result : response.getResults()) {
                 rootTopics.add(result.getFirstValue(topicUriField).toString());
             }
         } catch (SolrServerException e) {
@@ -402,16 +416,99 @@ public class TopicClassificationEngine e
     }
 
     @Override
-    public int updateModel(boolean incremental) throws TrainingSetException {
+    public int updateModel(boolean incremental) throws TrainingSetException, 
ClassifierException {
         checkTrainingSet();
-        // TODO:
-        // perform a first query to iterate over all the registered topics 
sorted by id (to allow for paging)
-        // for each topic find the last update date of the union of the topic 
and it's narrower topic
-        return 0;
+        if (incremental && modelUpdateDateField == null) {
+            log.warn(MODEL_UPDATE_DATE_FIELD + " field is not configured: 
switching to batch update mode.");
+            incremental = false;
+        }
+        // TODO: implement incremental update by using the date information
+        int updatedTopics = 0;
+        SolrServer solrServer = getActiveSolrServer();
+        SolrQuery query = new SolrQuery();
+        String q = "*:*";
+        query.setFields(topicUriField, broaderField);
+        String offset = null;
+        boolean done = false;
+        int batchSize = 1000;
+        query.addSortField(topicUriField, SolrQuery.ORDER.asc);
+        query.setRows(batchSize + 1);
+        while (!done) {
+            // batch over all the indexed topics
+            try {
+                if (offset != null) {
+                    q += " AND " + topicUriField + ":[" + offset.toString() + 
" TO *]";
+                }
+                query.setQuery(q);
+                QueryResponse response = solrServer.query(query);
+                int count = 0;
+                for (SolrDocument result : response.getResults()) {
+                    String topicId = 
result.getFirstValue(topicUriField).toString();
+                    if (count == batchSize) {
+                        offset = topicId;
+                    } else {
+                        count++;
+                        updateTopic(topicId, 
result.getFieldValues(broaderField));
+                        updatedTopics++;
+                    }
+                }
+                if (count < batchSize) {
+                    done = true;
+                }
+            } catch (SolrServerException e) {
+                String msg = String.format("Error while updating topics on 
Solr Core '%s'.", solrCoreId);
+                throw new TrainingSetException(msg, e);
+            }
+        }
+        return updatedTopics;
+    }
+
+    /**
+     * @param topicId
+     * @throws TrainingSetException
+     * @throws ClassifierException
+     */
+    public void updateTopic(String topicId, Collection<Object> 
broaderTopicIds) throws TrainingSetException,
+                                                                               
ClassifierException {
+        ArrayList<String> impactedTopics = new ArrayList<String>();
+        impactedTopics.add(topicId);
+        impactedTopics.addAll(getNarrowerTopics(topicId));
+        Batch<String> examples = Batch.emtpyBatch(String.class);
+        StringBuffer sb = new StringBuffer();
+        do {
+            examples = trainingSet.getPositiveExamples(impactedTopics, 
examples.nextOffset);
+            for (String example : examples.items) {
+                sb.append(example);
+                sb.append("\n\n");
+            }
+        } while (sb.length() < MAX_CHARS_PER_TOPIC && examples.hasMore);
+
+        // reindex the topic with the new text data collected from the examples
+        SolrInputDocument doc = new SolrInputDocument();
+        doc.addField(topicUriField, topicId);
+        if (broaderTopicIds != null && broaderField != null) {
+            doc.addField(broaderField, broaderTopicIds);
+        }
+        if (sb.length() > 0) {
+            doc.addField(similarityField, sb);
+        }
+        if (modelUpdateDateField != null) {
+            // TODO: force UTC timezone here
+            doc.addField(modelUpdateDateField, new Date());
+        }
+        SolrServer solrServer = getActiveSolrServer();
+        try {
+            solrServer.add(doc);
+            solrServer.commit();
+        } catch (Exception e) {
+            String msg = String.format("Error updating topic with id '%s' on 
Solr Core '%s'", topicId,
+                solrCoreId);
+            throw new ClassifierException(msg, e);
+        }
     }
 
     protected void checkTrainingSet() throws TrainingSetException {
-        if (trainingSet != null) {
+        if (trainingSet == null) {
             throw new TrainingSetException(
                     String.format("TopicClassificationEngine %s has no 
registered"
                                   + " training set hence cannot be updated.", 
engineId));

Modified: 
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/Batch.java
URL: 
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/Batch.java?rev=1229168&r1=1229167&r2=1229168&view=diff
==============================================================================
--- 
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/Batch.java
 (original)
+++ 
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/Batch.java
 Mon Jan  9 14:28:33 2012
@@ -17,6 +17,7 @@
 package org.apache.stanbol.enhancer.topic;
 
 import java.io.Serializable;
+import java.util.ArrayList;
 import java.util.List;
 
 /**
@@ -53,4 +54,11 @@ public class Batch<T> implements Seriali
         this.hasMore = hasMore;
         this.nextOffset = nextOffset;
     }
+
+    /**
+     * Helper method to return a first empty batch to bootstrap an iteration 
loop.
+     */
+    public static <T2> Batch<T2> emtpyBatch(Class<T2> clazz) {
+        return new Batch<T2>(new ArrayList<T2>(), true, null);
+    }
 }

Modified: 
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/SolrTrainingSet.java
URL: 
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/SolrTrainingSet.java?rev=1229168&r1=1229167&r2=1229168&view=diff
==============================================================================
--- 
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/SolrTrainingSet.java
 (original)
+++ 
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/SolrTrainingSet.java
 Mon Jan  9 14:28:33 2012
@@ -193,8 +193,8 @@ public class SolrTrainingSet extends Con
         String offset = null;
         boolean done = false;
         query.addSortField(exampleIdField, SolrQuery.ORDER.asc);
-        query.set("rows", batchSize + 1);
-        query.set("fl", exampleIdField + "," + topicUrisField);
+        query.setRows(batchSize + 1);
+        query.setFields(exampleIdField, topicUrisField);
         while (!done) {
             try {
                 if (offset != null) {

Modified: 
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/TopicClassifier.java
URL: 
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/TopicClassifier.java?rev=1229168&r1=1229167&r2=1229168&view=diff
==============================================================================
--- 
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/TopicClassifier.java
 (original)
+++ 
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/TopicClassifier.java
 Mon Jan  9 14:28:33 2012
@@ -107,5 +107,5 @@ public interface TopicClassifier {
      * 
      * @return the number of updated topics
      */
-    int updateModel(boolean incremental) throws TrainingSetException;
+    int updateModel(boolean incremental) throws TrainingSetException, 
ClassifierException;
 }

Modified: 
incubator/stanbol/trunk/enhancer/engines/topic/src/test/java/org/apache/stanbol/enhancer/engine/topic/TopicEngineTest.java
URL: 
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/topic/src/test/java/org/apache/stanbol/enhancer/engine/topic/TopicEngineTest.java?rev=1229168&r1=1229167&r2=1229168&view=diff
==============================================================================
--- 
incubator/stanbol/trunk/enhancer/engines/topic/src/test/java/org/apache/stanbol/enhancer/engine/topic/TopicEngineTest.java
 (original)
+++ 
incubator/stanbol/trunk/enhancer/engines/topic/src/test/java/org/apache/stanbol/enhancer/engine/topic/TopicEngineTest.java
 Mon Jan  9 14:28:33 2012
@@ -18,6 +18,7 @@ package org.apache.stanbol.enhancer.engi
 
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertTrue;
 import static org.junit.Assert.fail;
 
 import java.io.File;
@@ -202,8 +203,8 @@ public class TopicEngineTest extends Bas
         assertEquals(bestSuggestion.uri, "Category:American_films");
     }
 
-    //@Test
-    public void testTrainClassifierFromExamples() throws Exception {
+    @Test
+    public void testBatchTrainClassifierFromExamples() throws Exception {
 
         // mini taxonomy for news articles
         String business = "urn:topics/business";
@@ -212,25 +213,28 @@ public class TopicEngineTest extends Bas
         String sport = "urn:topics/sport";
         String football = "urn:topics/football";
         String wordcup = "urn:topics/wordcup";
+        String music = "urn:topics/music";
 
         classifier.addTopic(business, null);
         classifier.addTopic(technology, null);
         classifier.addTopic(sport, null);
+        classifier.addTopic(music, null);
         classifier.addTopic(apple, Arrays.asList(business, technology));
         classifier.addTopic(football, Arrays.asList(sport));
         classifier.addTopic(wordcup, Arrays.asList(football));
 
         // train the classifier on an empty dataset
         classifier.setTrainingSet(trainingSet);
-        assertEquals(6, classifier.updateModel(true));
+        assertEquals(7, classifier.updateModel(false));
 
         // the model is updated but does not predict anything
         List<TopicSuggestion> suggestions = classifier
                 .suggestTopics("I like the sound of vuvuzula in the morning!");
         assertEquals(0, suggestions.size());
 
-        // further update of the model leave do not change any topic
-        assertEquals(0, classifier.updateModel(true));
+        // further updates of the model do not change any topic, but all topics 
are re-indexed anyway because
+        // incremental update is disabled.
+        assertEquals(7, classifier.updateModel(false));
 
         // lets register some examples
         trainingSet.registerExample(null, "Money, money, money is the root of 
all evil.",
@@ -243,15 +247,16 @@ public class TopicEngineTest extends Bas
             Arrays.asList(football));
         trainingSet.registerExample(null, "Vuvuzela made the soundtrack of the"
                                           + " football wordcup of 2010 in 
South Africa.",
-            Arrays.asList(football, wordcup));
+            Arrays.asList(football, wordcup, music));
 
-        // retrain the model: all 6 topics are impacted by the new examples
-        assertEquals(6, classifier.updateModel(true));
+        // retrain the model: all topics are recomputed
+        assertEquals(7, classifier.updateModel(false));
         suggestions = classifier.suggestTopics("I like the sound of vuvuzula 
in the morning!");
-        assertEquals(3, suggestions.size());
-        assertEquals(wordcup, suggestions.get(0).uri);
-        assertEquals(football, suggestions.get(1).uri);
-        assertEquals(sport, suggestions.get(2).uri);
+        assertTrue(suggestions.size() >= 4);
+        assertEquals(music, suggestions.get(0).uri);
+        assertEquals(wordcup, suggestions.get(1).uri);
+        assertEquals(football, suggestions.get(2).uri);
+        assertEquals(sport, suggestions.get(3).uri);
     }
 
     protected Hashtable<String,Object> getDefaultClassifierConfigParams() {


Reply via email to