Author: ogrisel
Date: Mon Jan  9 17:31:07 2012
New Revision: 1229267

URL: http://svn.apache.org/viewvc?rev=1229267&view=rev
Log:
STANBOL-197: WIP refactoring the TrainingSet API for incremental updates

Modified:
    
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/engine/topic/TopicClassificationEngine.java
    
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/SolrTrainingSet.java
    
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/TrainingSet.java

Modified: 
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/engine/topic/TopicClassificationEngine.java
URL: 
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/engine/topic/TopicClassificationEngine.java?rev=1229267&r1=1229266&r2=1229267&view=diff
==============================================================================
--- 
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/engine/topic/TopicClassificationEngine.java
 (original)
+++ 
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/engine/topic/TopicClassificationEngine.java
 Mon Jan  9 17:31:07 2012
@@ -418,6 +418,7 @@ public class TopicClassificationEngine e
     @Override
     public int updateModel(boolean incremental) throws TrainingSetException, 
ClassifierException {
         checkTrainingSet();
+        long start = System.currentTimeMillis();
         if (incremental && modelUpdateDateField == null) {
             log.warn(MODEL_UPDATE_DATE_FIELD + " field is not configured: 
switching to batch update mode.");
             incremental = false;
@@ -427,7 +428,11 @@ public class TopicClassificationEngine e
         SolrServer solrServer = getActiveSolrServer();
         SolrQuery query = new SolrQuery();
         String q = "*:*";
-        query.setFields(topicUriField, broaderField);
+        if (modelUpdateDateField != null) { // also fetch the stored model update date when configured
+            query.setFields(topicUriField, broaderField, modelUpdateDateField);
+        } else {
+            query.setFields(topicUriField, broaderField);
+        }
         String offset = null;
         boolean done = false;
         int batchSize = 1000;
@@ -448,31 +453,46 @@ public class TopicClassificationEngine e
                         offset = topicId;
                     } else {
                         count++;
-                        updateTopic(topicId, 
result.getFieldValues(broaderField));
+                        List<String> impactedTopics = new ArrayList<String>();
+                        impactedTopics.add(topicId);
+                        impactedTopics.addAll(getNarrowerTopics(topicId));
+                        if (incremental) {
+                            Date lastModelUpdate = (Date) 
result.getFirstValue(modelUpdateDateField);
+                            if (lastModelUpdate != null
+                                && 
!trainingSet.hasChangedSince(impactedTopics, lastModelUpdate)) {
+                                continue;
+                            }
+                        }
+                        updateTopic(topicId, impactedTopics, 
result.getFieldValues(broaderField));
                         updatedTopics++;
                     }
                 }
                 if (count < batchSize) {
                     done = true;
                 }
-            } catch (SolrServerException e) {
+                solrServer.optimize();
+            } catch (Exception e) {
                 String msg = String.format("Error while updating topics on 
Solr Core '%s'.", solrCoreId);
                 throw new TrainingSetException(msg, e);
             }
         }
+        long stop = System.currentTimeMillis();
+        log.info("Sucessfully updated {} topics in {}s", updatedTopics, 
(double) (stop - start) / 1000.);
         return updatedTopics;
     }
 
     /**
      * @param topicId
-     * @throws TrainingSetException
-     * @throws ClassifierException
+     *            the topic model to update
+     * @param impactedTopics
+     *            the list of impacted topics (e.g. the topic node and direct 
children)
+     * @param broaderTopics
+     *            the collection of broader topics to re-add in the broader field
      */
-    public void updateTopic(String topicId, Collection<Object> 
broaderTopicIds) throws TrainingSetException,
-                                                                               
ClassifierException {
-        ArrayList<String> impactedTopics = new ArrayList<String>();
-        impactedTopics.add(topicId);
-        impactedTopics.addAll(getNarrowerTopics(topicId));
+    public void updateTopic(String topicId, List<String> impactedTopics, 
Collection<Object> broaderTopics) throws TrainingSetException,
+                                                                               
                           ClassifierException {
+        long start = System.currentTimeMillis();
+
         Batch<String> examples = Batch.emtpyBatch(String.class);
         StringBuffer sb = new StringBuffer();
         do {
@@ -486,8 +506,8 @@ public class TopicClassificationEngine e
         // reindex the topic with the new text data collected from the examples
         SolrInputDocument doc = new SolrInputDocument();
         doc.addField(topicUriField, topicId);
-        if (broaderTopicIds != null && broaderField != null) {
-            doc.addField(broaderField, broaderTopicIds);
+        if (broaderTopics != null && broaderField != null) {
+            doc.addField(broaderField, broaderTopics);
         }
         if (sb.length() > 0) {
             doc.addField(similarityField, sb);
@@ -505,6 +525,8 @@ public class TopicClassificationEngine e
                 solrCoreId);
             throw new ClassifierException(msg, e);
         }
+        long stop = System.currentTimeMillis();
+        log.debug("Sucessfully updated topic {} in {}s", topicId, (double) 
(stop - start) / 1000.);
     }
 
     protected void checkTrainingSet() throws TrainingSetException {

Modified: 
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/SolrTrainingSet.java
URL: 
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/SolrTrainingSet.java?rev=1229267&r1=1229266&r2=1229267&view=diff
==============================================================================
--- 
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/SolrTrainingSet.java
 (original)
+++ 
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/SolrTrainingSet.java
 Mon Jan  9 17:31:07 2012
@@ -184,7 +184,14 @@ public class SolrTrainingSet extends Con
         return exampleId;
     }
 
+
     @Override
+    public boolean hasChangedSince(List<String> topics, Date referenceDate) {
+        // TODO: not implemented yet; always reports a change regardless of the arguments
+        return true;
+    }
+    
+    @Deprecated
     public Set<String> getUpdatedTopics(Calendar lastModificationDate) throws 
TrainingSetException {
         TreeSet<String> collectedTopics = new TreeSet<String>();
         SolrQuery query = new SolrQuery();
@@ -304,4 +311,5 @@ public class SolrTrainingSet extends Con
     public void setBatchSize(int batchSize) {
         this.batchSize = batchSize;
     }
+
 }

Modified: 
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/TrainingSet.java
URL: 
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/TrainingSet.java?rev=1229267&r1=1229266&r2=1229267&view=diff
==============================================================================
--- 
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/TrainingSet.java
 (original)
+++ 
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/TrainingSet.java
 Mon Jan  9 17:31:07 2012
@@ -16,9 +16,8 @@
  */
 package org.apache.stanbol.enhancer.topic;
 
-import java.util.Calendar;
+import java.util.Date;
 import java.util.List;
-import java.util.Set;
 
 /**
  * Source of categorized text documents that can be used to build the 
statistical model of a
@@ -50,15 +49,6 @@ public interface TrainingSet {
     String registerExample(String exampleId, String text, List<String> topics) 
throws TrainingSetException;
 
     /**
-     * @param lastModificationDate
-     *            typically the date of the last classifier model update or 
null to find the list of all
-     *            topics registered in the dataset.
-     * @return the set of topic ids that received some modifications (e.g. new 
or updated examples) since
-     *         {@code lastModificationDate}.
-     */
-    Set<String> getUpdatedTopics(Calendar lastModificationDate) throws 
TrainingSetException;
-
-    /**
      * Fetch examples representative of the set of topics passed as argument 
so as to be able to build a
      * statistical model.
      * 
@@ -93,4 +83,16 @@ public interface TrainingSet {
      */
     void setBatchSize(int batchSize);
 
+    /**
+     * Tells the classifier whether the topic model should be updated, i.e. whether there 
exist examples classified in
+     * one of those topics that have changed.
+     * 
+     * @param topics
+     *            topics to check
+     * @param referenceDate
+     *            look for changes after that date
+     * @return true if one of the passed topics has changed since the reference date
+     */
+    boolean hasChangedSince(List<String> topics, Date referenceDate);
+
 }


Reply via email to