Author: ogrisel
Date: Mon Jan 16 17:03:37 2012
New Revision: 1232065
URL: http://svn.apache.org/viewvc?rev=1232065&view=rev
Log:
STANBOL-197: refactored scan over the topics to make it reusable for the
evaluation part
Added:
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/engine/topic/BatchProcessor.java
Modified:
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/engine/topic/TopicClassificationEngine.java
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/TopicClassifier.java
incubator/stanbol/trunk/enhancer/engines/topic/src/test/resources/classifier/schema.xml
Added:
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/engine/topic/BatchProcessor.java
URL:
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/engine/topic/BatchProcessor.java?rev=1232065&view=auto
==============================================================================
---
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/engine/topic/BatchProcessor.java
(added)
+++
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/engine/topic/BatchProcessor.java
Mon Jan 16 17:03:37 2012
@@ -0,0 +1,12 @@
+package org.apache.stanbol.enhancer.engine.topic;
+
+import java.util.List;
+
+import org.apache.stanbol.enhancer.topic.ClassifierException;
+import org.apache.stanbol.enhancer.topic.TrainingSetException;
+
+public interface BatchProcessor<T> {
+
+ int process(List<T> batch) throws ClassifierException,
TrainingSetException;
+
+}
Modified:
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/engine/topic/TopicClassificationEngine.java
URL:
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/engine/topic/TopicClassificationEngine.java?rev=1232065&r1=1232064&r2=1232065&view=diff
==============================================================================
---
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/engine/topic/TopicClassificationEngine.java
(original)
+++
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/engine/topic/TopicClassificationEngine.java
Mon Jan 16 17:03:37 2012
@@ -18,6 +18,7 @@ package org.apache.stanbol.enhancer.engi
import static
org.apache.stanbol.enhancer.servicesapi.rdf.Properties.NIE_PLAINTEXTCONTENT;
+import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
@@ -36,6 +37,7 @@ import org.apache.clerezza.rdf.core.MGra
import org.apache.clerezza.rdf.core.Triple;
import org.apache.clerezza.rdf.core.UriRef;
import org.apache.clerezza.rdf.core.impl.TripleImpl;
+import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.felix.scr.annotations.Activate;
@@ -48,6 +50,7 @@ import org.apache.felix.scr.annotations.
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrServer;
import org.apache.solr.client.solrj.SolrServerException;
+import org.apache.solr.client.solrj.embedded.EmbeddedSolrServer;
import org.apache.solr.client.solrj.request.UpdateRequest;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.client.solrj.util.ClientUtils;
@@ -210,6 +213,8 @@ public class TopicClassificationEngine e
protected int cvFoldCount = 0;
+ protected File evaluationFolder;
+
@Activate
protected void activate(ComponentContext context) throws
ConfigurationException, InvalidSyntaxException {
@SuppressWarnings("unchecked")
@@ -553,25 +558,12 @@ public class TopicClassificationEngine e
this.trainingSet = trainingSet;
}
- @Override
- public int updateModel(boolean incremental) throws TrainingSetException,
ClassifierException {
- checkTrainingSet();
- long start = System.currentTimeMillis();
- if (incremental && modelUpdateDateField == null) {
- log.warn(MODEL_UPDATE_DATE_FIELD + " field is not configured:
switching to batch update mode.");
- incremental = false;
- }
+ protected int batchOverTopics(BatchProcessor<SolrDocument> processor)
throws TrainingSetException {
// TODO: implement incremental update by using the date information
- int updatedTopics = 0;
+ int processedCount = 0;
SolrServer solrServer = getActiveSolrServer();
SolrQuery query = new SolrQuery();
String q = entryTypeField + ":" + METADATA_ENTRY;
- if (modelUpdateDateField != null) {
- query.setFields(topicUriField, entryIdField, modelEntryIdField,
broaderField,
- modelUpdateDateField);
- } else {
- query.setFields(topicUriField, entryIdField, modelEntryIdField,
broaderField);
- }
String offset = null;
boolean done = false;
int batchSize = 1000;
@@ -587,29 +579,17 @@ public class TopicClassificationEngine e
query.setQuery(q);
QueryResponse response = solrServer.query(query);
int count = 0;
+ List<SolrDocument> batchDocuments = new
ArrayList<SolrDocument>();
for (SolrDocument result : response.getResults()) {
String topicId =
result.getFirstValue(topicUriField).toString();
if (count == batchSize) {
offset = topicId;
} else {
count++;
- List<String> impactedTopics = new ArrayList<String>();
- impactedTopics.add(topicId);
- impactedTopics.addAll(getNarrowerTopics(topicId));
- if (incremental) {
- Date lastModelUpdate = (Date)
result.getFirstValue(modelUpdateDateField);
- if (lastModelUpdate != null
- &&
!trainingSet.hasChangedSince(impactedTopics, lastModelUpdate)) {
- continue;
- }
- }
- String metadataEntryId =
result.getFirstValue(entryIdField).toString();
- String modelEntryId =
result.getFirstValue(modelEntryIdField).toString();
- updateTopic(topicId, metadataEntryId, modelEntryId,
impactedTopics,
- result.getFieldValues(broaderField));
- updatedTopics++;
+ batchDocuments.add(result);
}
}
+ processedCount += processor.process(batchDocuments);
solrServer.commit();
if (count < batchSize) {
done = true;
@@ -620,6 +600,43 @@ public class TopicClassificationEngine e
String msg = String.format("Error while updating topics on Solr
Core '%s'.", solrCoreId);
throw new TrainingSetException(msg, e);
}
+ return processedCount;
+ }
+
+ @Override
+ public int updateModel(boolean incremental) throws TrainingSetException,
ClassifierException {
+ checkTrainingSet();
+ long start = System.currentTimeMillis();
+ if (incremental && modelUpdateDateField == null) {
+ log.warn(MODEL_UPDATE_DATE_FIELD + " field is not configured:
switching to batch update mode.");
+ incremental = false;
+ }
+ final boolean incr = incremental;
+ int updatedTopics = batchOverTopics(new BatchProcessor<SolrDocument>()
{
+ @Override
+ public int process(List<SolrDocument> batch) throws
ClassifierException, TrainingSetException {
+ int processed = 0;
+ for (SolrDocument result : batch) {
+ String topicId =
result.getFirstValue(topicUriField).toString();
+ List<String> impactedTopics = new ArrayList<String>();
+ impactedTopics.add(topicId);
+ impactedTopics.addAll(getNarrowerTopics(topicId));
+ if (incr) {
+ Date lastModelUpdate = (Date)
result.getFirstValue(modelUpdateDateField);
+ if (lastModelUpdate != null
+ && !trainingSet.hasChangedSince(impactedTopics,
lastModelUpdate)) {
+ continue;
+ }
+ }
+ String metadataEntryId =
result.getFirstValue(entryIdField).toString();
+ String modelEntryId =
result.getFirstValue(modelEntryIdField).toString();
+ updateTopic(topicId, metadataEntryId, modelEntryId,
impactedTopics,
+ result.getFieldValues(broaderField));
+ processed++;
+ }
+ return processed;
+ }
+ });
long stop = System.currentTimeMillis();
log.info("Sucessfully updated {} topics in {}s", updatedTopics,
(double) (stop - start) / 1000.);
return updatedTopics;
@@ -709,25 +726,86 @@ public class TopicClassificationEngine e
cvFoldCount = foldCount;
}
- @Override
- public TopicClassifier cloneWithEmdeddedModel() throws ClassifierException
{
- // TODO Auto-generated method stub
+ protected Dictionary<String,Object>
getCanonicalConfiguration(EmbeddedSolrServer server) {
+ // TODO
return null;
}
- @Override
- public void destroyModel() throws ClassifierException {
- // TODO Auto-generated method stub
+ protected EmbeddedSolrServer makeTopicClassifierSolrServer(File folder) {
+
+ // TODO
+ return null;
+ }
+ public boolean isEvaluationRunning() {
+ return evaluationFolder != null;
}
public int updatePerformanceEstimates(boolean incremental) throws
ClassifierException,
TrainingSetException {
+ if (evaluationFolder != null) {
+ throw new ClassifierException("Another evaluation is already
running");
+ }
int updatedTopics = 0;
- // TODO
+ int cvFoldCount = 3; // 3-folds CV is hardcoded for now
+
+ TopicClassificationEngine classifier = new TopicClassificationEngine();
+ classifier.setTrainingSet(trainingSet);
+ try {
+ // TODO: make the temporary folder path configurable with a
property
+ evaluationFolder =
File.createTempFile("stanbol-classifier-evaluation-", "-solr");
+ for (int cvFoldIndex = 0; cvFoldIndex < cvFoldCount;
cvFoldIndex++) {
+ performCVFold(classifier, cvFoldIndex, cvFoldCount);
+ }
+ } catch (ConfigurationException e) {
+ throw new ClassifierException(e);
+ } catch (IOException e) {
+ throw new ClassifierException(e);
+ } finally {
+ FileUtils.deleteQuietly(evaluationFolder);
+ evaluationFolder = null;
+ }
return updatedTopics;
}
+ protected void performCVFold(TopicClassificationEngine classifier, int
cvFoldIndex, int cvFoldCount) throws ConfigurationException,
+
TrainingSetException,
+
ClassifierException {
+
+ log.info(String.format("Performing evaluation CV iteration %d/%d on
classifier %s", cvFoldIndex + 1,
+ cvFoldCount, engineId));
+ long start = System.currentTimeMillis();
+ FileUtils.deleteQuietly(evaluationFolder);
+ evaluationFolder.mkdir();
+ EmbeddedSolrServer evaluationServer =
makeTopicClassifierSolrServer(evaluationFolder);
+ classifier.configure(getCanonicalConfiguration(evaluationServer));
+
+ // iterate over all the topics to register them in the evaluation
classifier
+ batchOverTopics(new BatchProcessor<SolrDocument>() {
+ @Override
+ public int process(List<SolrDocument> batch) {
+ return 0;
+ }
+ });
+
+ // build the model for the current training CV folds
+ classifier.setCrossValidationInfo(cvFoldIndex, cvFoldCount);
+ classifier.updateModel(false);
+
+ // iterate over the topics again to compute scores on the test fold
+ batchOverTopics(new BatchProcessor<SolrDocument>() {
+ @Override
+ public int process(List<SolrDocument> batch) {
+ return 0;
+ }
+ });
+
+ float averageF1 = 0.0f;
+ long stop = System.currentTimeMillis();
+ log.info(String.format("Finished CV iteration %d/%d on classifier %s
in %fs. F1-score = %f",
+ cvFoldIndex + 1, cvFoldCount, engineId, (stop - start) / 1000.0,
averageF1));
+ }
+
@Override
public ClassificationReport getPerformanceEstimates(String topic) throws
ClassifierException {
Modified:
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/TopicClassifier.java
URL:
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/TopicClassifier.java?rev=1232065&r1=1232064&r2=1232065&view=diff
==============================================================================
---
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/TopicClassifier.java
(original)
+++
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/topic/TopicClassifier.java
Mon Jan 16 17:03:37 2012
@@ -133,18 +133,6 @@ public interface TopicClassifier {
void setCrossValidationInfo(int foldIndex, int foldCount);
/**
- * Clone the classifier to get a new independent instance with an empty
embedded model to be trained on a
- * subsample of the dataset in a cross validation setting for model
evaluation.
- */
- TopicClassifier cloneWithEmdeddedModel() throws ClassifierException;
-
- /**
- * Free the backing resources of the model (e.g. indices persisted on the
harddrive or a DB) once the
- * cross validation process is completed.
- */
- void destroyModel() throws ClassifierException;
-
- /**
* Get a classification report with various accuracy metrics (precision,
recall and f1-score) along with
* the example ids of some mistakes (false positives or false negatives).
*/
Modified:
incubator/stanbol/trunk/enhancer/engines/topic/src/test/resources/classifier/schema.xml
URL:
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/topic/src/test/resources/classifier/schema.xml?rev=1232065&r1=1232064&r2=1232065&view=diff
==============================================================================
---
incubator/stanbol/trunk/enhancer/engines/topic/src/test/resources/classifier/schema.xml
(original)
+++
incubator/stanbol/trunk/enhancer/engines/topic/src/test/resources/classifier/schema.xml
Mon Jan 16 17:03:37 2012
@@ -66,8 +66,8 @@
required="true" />
<!-- entry_type can be either 'model' or 'metadata' -->
- <field name="entry_type" type="string" indexed="true" stored="true"
- required="true" />
+ <field name="entry_type" type="string" indexed="true" stored="true"
+ required="true" />
<!-- Mandatory classifier model attribute when entry_type == 'model' -->
<field name="classifier_features" type="text" indexed="true"
@@ -81,15 +81,21 @@
multiValued="true" />
<field name="last_update_dt" type="tdate" indexed="true"
stored="true" />
- <!-- Accuracy evaluation of the model -->
- <field name="precision" type="tfloat" indexed="true" stored="true" />
- <field name="recall" type="tfloat" indexed="true" stored="true" />
- <field name="f1" type="tfloat" indexed="true" stored="true" />
+ <!-- Accuracy evaluation of the model (across CV folds) -->
+ <field name="precision" type="tfloat" indexed="true" stored="true"
+ multiValued="true" />
+ <field name="recall" type="tfloat" indexed="true" stored="true"
+ multiValued="true" />
+ <field name="f1" type="tfloat" indexed="true" stored="true"
+ multiValued="true" />
<field name="last_evaluation_dt" type="tdate" indexed="true"
stored="true" />
- <field name="positive_support" type="tint" indexed="false" stored="true" />
- <field name="negative_support" type="tint" indexed="false" stored="true" />
- <!-- Store ids of some false positive and negative examples -->
+ <field name="positive_support" type="tint" indexed="false"
+ stored="true" multiValued="true" />
+ <field name="negative_support" type="tint" indexed="false"
+ stored="true" multiValued="true" />
+ <!-- Store ids of some false positive and negative examples (accumulated
+ over several CV folds) -->
<field name="false_positives" type="string" indexed="false"
multiValued="true" stored="true" />
<field name="negative_positives" type="string" indexed="false"