Author: ogrisel
Date: Mon Jan 9 17:31:07 2012
New Revision: 1229267
URL: http://svn.apache.org/viewvc?rev=1229267&view=rev
Log:
STANBOL-197: WIP refactoring the TrainingSet API for incremental updates
Modified:
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/engine/topic/TopicClassificationEngine.java
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/SolrTrainingSet.java
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/TrainingSet.java
Modified:
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/engine/topic/TopicClassificationEngine.java
URL:
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/engine/topic/TopicClassificationEngine.java?rev=1229267&r1=1229266&r2=1229267&view=diff
==============================================================================
---
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/engine/topic/TopicClassificationEngine.java
(original)
+++
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/engine/topic/TopicClassificationEngine.java
Mon Jan 9 17:31:07 2012
@@ -418,6 +418,7 @@ public class TopicClassificationEngine e
@Override
public int updateModel(boolean incremental) throws TrainingSetException,
ClassifierException {
checkTrainingSet();
+ long start = System.currentTimeMillis();
if (incremental && modelUpdateDateField == null) {
log.warn(MODEL_UPDATE_DATE_FIELD + " field is not configured:
switching to batch update mode.");
incremental = false;
@@ -427,7 +428,11 @@ public class TopicClassificationEngine e
SolrServer solrServer = getActiveSolrServer();
SolrQuery query = new SolrQuery();
String q = "*:*";
- query.setFields(topicUriField, broaderField);
+ if (modelUpdateDateField != null) {
+ query.setFields(topicUriField, broaderField);
+ } else {
+ query.setFields(topicUriField, broaderField, modelUpdateDateField);
+ }
String offset = null;
boolean done = false;
int batchSize = 1000;
@@ -448,31 +453,46 @@ public class TopicClassificationEngine e
offset = topicId;
} else {
count++;
- updateTopic(topicId,
result.getFieldValues(broaderField));
+ List<String> impactedTopics = new ArrayList<String>();
+ impactedTopics.add(topicId);
+ impactedTopics.addAll(getNarrowerTopics(topicId));
+ if (incremental) {
+ Date lastModelUpdate = (Date)
result.getFirstValue(modelUpdateDateField);
+ if (lastModelUpdate != null
+ &&
!trainingSet.hasChangedSince(impactedTopics, lastModelUpdate)) {
+ continue;
+ }
+ }
+ updateTopic(topicId, impactedTopics,
result.getFieldValues(broaderField));
updatedTopics++;
}
}
if (count < batchSize) {
done = true;
}
- } catch (SolrServerException e) {
+ solrServer.optimize();
+ } catch (Exception e) {
String msg = String.format("Error while updating topics on
Solr Core '%s'.", solrCoreId);
throw new TrainingSetException(msg, e);
}
}
+ long stop = System.currentTimeMillis();
+ log.info("Sucessfully updated {} topics in {}s", updatedTopics,
(double) (stop - start) / 1000.);
return updatedTopics;
}
/**
* @param topicId
- * @throws TrainingSetException
- * @throws ClassifierException
+ * the topic model to update
+ * @param impactedTopics
+ * the list of impacted topics (e.g. the topic node and direct
children)
+ * @param broaderTopics
+ * the collection of broader topics to re-add in the broader field
*/
- public void updateTopic(String topicId, Collection<Object>
broaderTopicIds) throws TrainingSetException,
-
ClassifierException {
- ArrayList<String> impactedTopics = new ArrayList<String>();
- impactedTopics.add(topicId);
- impactedTopics.addAll(getNarrowerTopics(topicId));
+ public void updateTopic(String topicId, List<String> impactedTopics,
Collection<Object> broaderTopics) throws TrainingSetException,
+
ClassifierException {
+ long start = System.currentTimeMillis();
+
Batch<String> examples = Batch.emtpyBatch(String.class);
StringBuffer sb = new StringBuffer();
do {
@@ -486,8 +506,8 @@ public class TopicClassificationEngine e
// reindex the topic with the new text data collected from the examples
SolrInputDocument doc = new SolrInputDocument();
doc.addField(topicUriField, topicId);
- if (broaderTopicIds != null && broaderField != null) {
- doc.addField(broaderField, broaderTopicIds);
+ if (broaderTopics != null && broaderField != null) {
+ doc.addField(broaderField, broaderTopics);
}
if (sb.length() > 0) {
doc.addField(similarityField, sb);
@@ -505,6 +525,8 @@ public class TopicClassificationEngine e
solrCoreId);
throw new ClassifierException(msg, e);
}
+ long stop = System.currentTimeMillis();
+ log.debug("Sucessfully updated topic {} in {}s", topicId, (double)
(stop - start) / 1000.);
}
protected void checkTrainingSet() throws TrainingSetException {
Modified:
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/SolrTrainingSet.java
URL:
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/SolrTrainingSet.java?rev=1229267&r1=1229266&r2=1229267&view=diff
==============================================================================
---
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/SolrTrainingSet.java
(original)
+++
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/SolrTrainingSet.java
Mon Jan 9 17:31:07 2012
@@ -184,7 +184,14 @@ public class SolrTrainingSet extends Con
return exampleId;
}
+
@Override
+ public boolean hasChangedSince(List<String> topics, Date referenceDate) {
+ // TODO
+ return true;
+ }
+
+ @Deprecated
public Set<String> getUpdatedTopics(Calendar lastModificationDate) throws
TrainingSetException {
TreeSet<String> collectedTopics = new TreeSet<String>();
SolrQuery query = new SolrQuery();
@@ -304,4 +311,5 @@ public class SolrTrainingSet extends Con
public void setBatchSize(int batchSize) {
this.batchSize = batchSize;
}
+
}
Modified:
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/TrainingSet.java
URL:
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/TrainingSet.java?rev=1229267&r1=1229266&r2=1229267&view=diff
==============================================================================
---
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/TrainingSet.java
(original)
+++
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/TrainingSet.java
Mon Jan 9 17:31:07 2012
@@ -16,9 +16,8 @@
*/
package org.apache.stanbol.enhancer.topic;
-import java.util.Calendar;
+import java.util.Date;
import java.util.List;
-import java.util.Set;
/**
* Source of categorized text documents that can be used to build the
statistical model of a
@@ -50,15 +49,6 @@ public interface TrainingSet {
String registerExample(String exampleId, String text, List<String> topics)
throws TrainingSetException;
/**
- * @param lastModificationDate
- * typically the date of the last classifier model update or
null to find the list of all
- * topics registered in the dataset.
- * @return the set of topic ids that received some modifications (e.g. new
or updated examples) since
- * {@code lastModificationDate}.
- */
- Set<String> getUpdatedTopics(Calendar lastModificationDate) throws
TrainingSetException;
-
- /**
* Fetch examples representative of the set of topics passed as argument
so as to be able to build a
* statistical model.
*
@@ -93,4 +83,16 @@ public interface TrainingSet {
*/
void setBatchSize(int batchSize);
+ /**
+ * Tells the classifier whether the topic model should be updated, i.e. whether
there exist examples classified in
+ * one of those topics that has changed.
+ *
+ * @param topics
+ * topics to check
+ * @param referenceDate
+ * look for changes after that date
+ * @return true if one of the passed topics has changed since the reference date
+ */
+ boolean hasChangedSince(List<String> topics, Date referenceDate);
+
}