Author: rwesten
Date: Wed Jun 5 07:23:15 2013
New Revision: 1489728
URL: http://svn.apache.org/r1489728
Log:
The Topic Engine now uses the ManagedSolrServer to initialise SolrCores. The
same is true for the TrainingSet. Note that this means that SolrCore
configurations are now loaded via the DataFileProvider infrastructure. This
allows users to load custom models and/or pre-trained models copied to the
'stanbol/datafiles' folder - STANBOL-1087; updated the default schemas used by
the TopicEngine and TrainingSet to Solr 4 - STANBOL-1086; added a default
SolrCore configuration for the Topic Engine that supports n-grams -
STANBOL-1089; removed all configuration properties used to configure the names
of Solr fields from the Felix Webconsole dialog and added constants for their
default values. Configuring those properties is still supported via OSGi
configuration files - STANBOL-1090
Added:
stanbol/trunk/enhancement-engines/topic/engine/src/main/resources/shingle-topic-model/
stanbol/trunk/enhancement-engines/topic/engine/src/main/resources/shingle-topic-model/conf/
stanbol/trunk/enhancement-engines/topic/engine/src/main/resources/shingle-topic-model/conf/schema.xml
stanbol/trunk/enhancement-engines/topic/engine/src/main/resources/shingle-topic-model/conf/solrconfig.xml
Modified:
stanbol/trunk/enhancement-engines/topic/engine/pom.xml
stanbol/trunk/enhancement-engines/topic/engine/src/main/java/org/apache/stanbol/enhancer/engine/topic/TopicClassificationEngine.java
stanbol/trunk/enhancement-engines/topic/engine/src/main/java/org/apache/stanbol/enhancer/topic/ConfiguredSolrCoreTracker.java
stanbol/trunk/enhancement-engines/topic/engine/src/main/java/org/apache/stanbol/enhancer/topic/training/SolrTrainingSet.java
stanbol/trunk/enhancement-engines/topic/engine/src/main/resources/OSGI-INF/metatype/metatype.properties
stanbol/trunk/enhancement-engines/topic/engine/src/main/resources/default-topic-model/conf/schema.xml
stanbol/trunk/enhancement-engines/topic/engine/src/main/resources/default-topic-model/conf/solrconfig.xml
stanbol/trunk/enhancement-engines/topic/engine/src/main/resources/default-topic-trainingset/conf/solrconfig.xml
stanbol/trunk/enhancement-engines/topic/engine/src/test/java/org/apache/stanbol/enhancer/engine/topic/TopicEngineTest.java
Modified: stanbol/trunk/enhancement-engines/topic/engine/pom.xml
URL:
http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/topic/engine/pom.xml?rev=1489728&r1=1489727&r2=1489728&view=diff
==============================================================================
--- stanbol/trunk/enhancement-engines/topic/engine/pom.xml (original)
+++ stanbol/trunk/enhancement-engines/topic/engine/pom.xml Wed Jun 5 07:23:15
2013
@@ -74,7 +74,7 @@
<Private-Package>
org.apache.stanbol.enhancer.engine.topic
</Private-Package>
- <Install-Path>install-config</Install-Path>
+ <!-- Install-Path>install-config</Install-Path -->
<Data-Files>data-files</Data-Files>
<Data-Files-Priority>-100</Data-Files-Priority>
</instructions>
Modified:
stanbol/trunk/enhancement-engines/topic/engine/src/main/java/org/apache/stanbol/enhancer/engine/topic/TopicClassificationEngine.java
URL:
http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/topic/engine/src/main/java/org/apache/stanbol/enhancer/engine/topic/TopicClassificationEngine.java?rev=1489728&r1=1489727&r2=1489728&view=diff
==============================================================================
---
stanbol/trunk/enhancement-engines/topic/engine/src/main/java/org/apache/stanbol/enhancer/engine/topic/TopicClassificationEngine.java
(original)
+++
stanbol/trunk/enhancement-engines/topic/engine/src/main/java/org/apache/stanbol/enhancer/engine/topic/TopicClassificationEngine.java
Wed Jun 5 07:23:15 2013
@@ -25,6 +25,7 @@ import java.util.Collections;
import java.util.Date;
import java.util.Dictionary;
import java.util.HashMap;
+import java.util.HashSet;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.LinkedHashSet;
@@ -142,70 +143,118 @@ import org.slf4j.LoggerFactory;
@Component(metatype = true, immediate = true, configurationFactory = true,
policy = ConfigurationPolicy.REQUIRE)
@Service
@Properties(value = {
- @Property(name = EnhancementEngine.PROPERTY_NAME),
- @Property(name = TopicClassificationEngine.ORDER,
intValue = 100),
- @Property(name = TopicClassificationEngine.SOLR_CORE),
- @Property(name = TopicClassificationEngine.LANGUAGES),
- @Property(name =
TopicClassificationEngine.SIMILARTITY_FIELD, value = "classifier_features"),
- @Property(name =
TopicClassificationEngine.CONCEPT_URI_FIELD, value = "concept"),
- @Property(name =
TopicClassificationEngine.PRIMARY_TOPIC_URI_FIELD, value = "primary_topic"),
- @Property(name = TopicClassificationEngine.BROADER_FIELD,
value = "broader"),
- @Property(name =
TopicClassificationEngine.MODEL_UPDATE_DATE_FIELD, value = "last_update_dt"),
- @Property(name =
TopicClassificationEngine.PRECISION_FIELD, value = "precision"),
- @Property(name = TopicClassificationEngine.RECALL_FIELD,
value = "recall"),
- @Property(name =
TopicClassificationEngine.ENTRY_ID_FIELD, value = "entry_id"),
- @Property(name =
TopicClassificationEngine.MODEL_ENTRY_ID_FIELD, value = "model_entry_id"),
- @Property(name =
TopicClassificationEngine.ENTRY_TYPE_FIELD, value = "entry_type"),
- @Property(name =
TopicClassificationEngine.MODEL_EVALUATION_DATE_FIELD, value =
"last_evaluation_dt"),
- @Property(name =
TopicClassificationEngine.FALSE_NEGATIVES_FIELD, value = "false_negatives"),
- @Property(name =
TopicClassificationEngine.FALSE_POSITIVES_FIELD, value = "false_positives"),
- @Property(name =
TopicClassificationEngine.POSITIVE_SUPPORT_FIELD, value = "positive_support"),
- @Property(name =
TopicClassificationEngine.NEGATIVE_SUPPORT_FIELD, value = "negative_support"),
- @Property(name =
TopicClassificationEngine.TRAINING_SET_ID),
- @Property(name = Constants.SERVICE_RANKING, intValue =
0)})
+ @Property(name = EnhancementEngine.PROPERTY_NAME),
+ @Property(name = TopicClassificationEngine.SOLR_CORE),
+ @Property(name = TopicClassificationEngine.SOLR_CORE_CONFIG,
+ value = TopicClassificationEngine.DEFAULT_SOLR_CORE_CONFIG),
+ @Property(name = TopicClassificationEngine.LANGUAGES),
+// These properties can still be set via a configuration file, but as most
users
+// will not use them they are excluded from the configuration form.
+// @Property(name = TopicClassificationEngine.SIMILARTITY_FIELD, value
= TopicClassificationEngine.DEFAULT_SIMILARTITY_FIELD),
+// @Property(name = TopicClassificationEngine.CONCEPT_URI_FIELD, value
= TopicClassificationEngine.DEFAULT_CONCEPT_URI_FIELD),
+// @Property(name = TopicClassificationEngine.PRIMARY_TOPIC_URI_FIELD,
value = TopicClassificationEngine.DEFAULT_PRIMARY_TOPIC_URI_FIELD),
+// @Property(name = TopicClassificationEngine.BROADER_FIELD, value =
TopicClassificationEngine.DEFAULT_BROADER_FIELD),
+// @Property(name = TopicClassificationEngine.MODEL_UPDATE_DATE_FIELD,
value = TopicClassificationEngine.DEFAULT_MODEL_UPDATE_DATE_FIELD),
+// @Property(name = TopicClassificationEngine.PRECISION_FIELD, value =
TopicClassificationEngine.DEFAULT_PRECISION_FIELD),
+// @Property(name = TopicClassificationEngine.RECALL_FIELD, value =
TopicClassificationEngine.DEFAULT_RECALL_FIELD),
+// @Property(name = TopicClassificationEngine.ENTRY_ID_FIELD, value =
TopicClassificationEngine.DEFAULT_ENTRY_ID_FIELD),
+// @Property(name = TopicClassificationEngine.MODEL_ENTRY_ID_FIELD,
value = TopicClassificationEngine.DEFAULT_MODEL_ENTRY_ID_FIELD),
+// @Property(name = TopicClassificationEngine.ENTRY_TYPE_FIELD, value =
TopicClassificationEngine.DEFAULT_ENTRY_TYPE_FIELD),
+// @Property(name =
TopicClassificationEngine.MODEL_EVALUATION_DATE_FIELD, value =
TopicClassificationEngine.DEFAULT_MODEL_EVALUATION_DATE_FIELD),
+// @Property(name = TopicClassificationEngine.FALSE_NEGATIVES_FIELD,
value = TopicClassificationEngine.DEFAULT_FALSE_NEGATIVES_FIELD),
+// @Property(name = TopicClassificationEngine.FALSE_POSITIVES_FIELD,
value = TopicClassificationEngine.DEFAULT_FALSE_POSITIVES_FIELD),
+// @Property(name = TopicClassificationEngine.POSITIVE_SUPPORT_FIELD,
value = TopicClassificationEngine.DEFAULT_POSITIVE_SUPPORT_FIELD),
+// @Property(name = TopicClassificationEngine.NEGATIVE_SUPPORT_FIELD,
value = TopicClassificationEngine.DEFAULT_NEGATIVE_SUPPORT_FIELD),
+// @Property(name = TopicClassificationEngine.ORDER, intValue =
TopicClassificationEngine.DEFAULT_ENGINE_ORDER),
+ @Property(name = TopicClassificationEngine.TRAINING_SET_ID),
+ @Property(name = Constants.SERVICE_RANKING, intValue = 0)})
public class TopicClassificationEngine extends ConfiguredSolrCoreTracker
implements EnhancementEngine,
ServiceProperties, TopicClassifier {
+ public static final String DEFAULT_SOLR_CORE_CONFIG =
"default-topic-model.solrindex.zip";
+
public static final String MODEL_ENTRY = "model";
public static final String METADATA_ENTRY = "metadata";
-
+ /**
+ * The reference to the SolrCore used by the TopicClassificationEngine.
+ * The default is the engine name with the suffix '-model'. This also
supports
+ * the {server-name}:{index-name} syntax.
+ */
public static final String SOLR_CORE =
"org.apache.stanbol.enhancer.engine.topic.solrCore";
+ /**
+ * The name of the Solr Index archive (default:
"default-topic-model.solrindex.zip").
+ * The file is loaded by using the DataFileProvider infrastructure. The
archive may
+ * also include a pre-trained model.
+ */
+ public static final String SOLR_CORE_CONFIG =
"org.apache.stanbol.enhancer.engine.topic.solrCoreConfig";
public static final String LANGUAGES =
"org.apache.stanbol.enhancer.engine.topic.languages";
public static final String ORDER =
"org.apache.stanbol.enhancer.engine.topic.order";
+
+ public static final Integer DEFAULT_ENGINE_ORDER =
ServiceProperties.ORDERING_CONTENT_EXTRACTION;
public static final String ENTRY_ID_FIELD =
"org.apache.stanbol.enhancer.engine.topic.entryIdField";
+
+ public static final String DEFAULT_ENTRY_ID_FIELD = "entry_id";
public static final String ENTRY_TYPE_FIELD =
"org.apache.stanbol.enhancer.engine.topic.entryTypeField";
+
+ public static final String DEFAULT_ENTRY_TYPE_FIELD = "entry_type";
public static final String SIMILARTITY_FIELD =
"org.apache.stanbol.enhancer.engine.topic.similarityField";
+
+ public static final String DEFAULT_SIMILARTITY_FIELD =
"classifier_features";
public static final String CONCEPT_URI_FIELD =
"org.apache.stanbol.enhancer.engine.topic.conceptUriField";
+
+ public static final String DEFAULT_CONCEPT_URI_FIELD = "concept";
public static final String BROADER_FIELD =
"org.apache.stanbol.enhancer.engine.topic.broaderField";
+
+ public static final String DEFAULT_BROADER_FIELD = "broader";
public static final String PRIMARY_TOPIC_URI_FIELD =
"org.apache.stanbol.enhancer.engine.topic.primaryTopicField";
+
+ public static final String DEFAULT_PRIMARY_TOPIC_URI_FIELD =
"primary_topic";
public static final String MODEL_UPDATE_DATE_FIELD =
"org.apache.stanbol.enhancer.engine.topic.modelUpdateDateField";
+ public static final String DEFAULT_MODEL_UPDATE_DATE_FIELD =
"last_update_dt";
+
public static final String MODEL_EVALUATION_DATE_FIELD =
"org.apache.stanbol.enhancer.engine.topic.modelEvaluationDateField";
+
+ public static final String DEFAULT_MODEL_EVALUATION_DATE_FIELD =
"last_evaluation_dt";
public static final String MODEL_ENTRY_ID_FIELD =
"org.apache.stanbol.enhancer.engine.topic.modelEntryIdField";
+
+ public static final String DEFAULT_MODEL_ENTRY_ID_FIELD = "model_entry_id";
public static final String PRECISION_FIELD =
"org.apache.stanbol.enhancer.engine.topic.precisionField";
+
+ public static final String DEFAULT_PRECISION_FIELD = "precision";
public static final String RECALL_FIELD =
"org.apache.stanbol.enhancer.engine.topic.recallField";
+
+ public static final String DEFAULT_RECALL_FIELD = "recall";
public static final String FALSE_POSITIVES_FIELD =
"org.apache.stanbol.enhancer.engine.topic.falsePositivesField";
+
+ public static final String DEFAULT_FALSE_POSITIVES_FIELD =
"false_positives";
public static final String FALSE_NEGATIVES_FIELD =
"org.apache.stanbol.enhancer.engine.topic.falseNegativesField";
+
+ public static final String DEFAULT_FALSE_NEGATIVES_FIELD =
"false_negatives";
public static final String POSITIVE_SUPPORT_FIELD =
"org.apache.stanbol.enhancer.engine.topic.positiveSupportField";
+ public static final String DEFAULT_POSITIVE_SUPPORT_FIELD =
"positive_support";
+
public static final String NEGATIVE_SUPPORT_FIELD =
"org.apache.stanbol.enhancer.engine.topic.negativeSupportField";
+ public static final String DEFAULT_NEGATIVE_SUPPORT_FIELD =
"negative_support";
+
public static final String TRAINING_SET_ID =
"org.apache.stanbol.enhancer.engine.topic.trainingSetId";
private static final Logger log =
LoggerFactory.getLogger(TopicClassificationEngine.class);
@@ -250,6 +299,7 @@ public class TopicClassificationEngine e
protected String engineName;
protected List<String> acceptedLanguages;
+ private Set<String> acceptedLanguageSet;
protected Integer order = ORDERING_EXTRACTION_ENHANCEMENT;
@@ -337,7 +387,6 @@ public class TopicClassificationEngine e
protected void activate(ComponentContext context,
Dictionary<String,Object> config) throws ConfigurationException,
InvalidSyntaxException {
this.context = context;
- indexArchiveName = "default-topic-model";
configure(config);
// if training set is not null, track it
@@ -376,36 +425,52 @@ public class TopicClassificationEngine e
public void configure(Dictionary<String,Object> config) throws
ConfigurationException {
engineName = getRequiredStringParam(config,
EnhancementEngine.PROPERTY_NAME);
- entryIdField = getRequiredStringParam(config, ENTRY_ID_FIELD);
- modelEntryIdField = getRequiredStringParam(config,
MODEL_ENTRY_ID_FIELD);
- conceptUriField = getRequiredStringParam(config, CONCEPT_URI_FIELD);
- entryTypeField = getRequiredStringParam(config, ENTRY_TYPE_FIELD);
- similarityField = getRequiredStringParam(config, SIMILARTITY_FIELD);
+ entryIdField = getRequiredStringParam(config, ENTRY_ID_FIELD,
DEFAULT_ENTRY_ID_FIELD);
+ modelEntryIdField = getRequiredStringParam(config,
MODEL_ENTRY_ID_FIELD, DEFAULT_MODEL_ENTRY_ID_FIELD);
+ conceptUriField = getRequiredStringParam(config, CONCEPT_URI_FIELD,
DEFAULT_CONCEPT_URI_FIELD);
+ entryTypeField = getRequiredStringParam(config, ENTRY_TYPE_FIELD,
DEFAULT_ENTRY_TYPE_FIELD);
+ similarityField = getRequiredStringParam(config, SIMILARTITY_FIELD,
DEFAULT_SIMILARTITY_FIELD);
acceptedLanguages = getStringListParan(config, LANGUAGES);
- precisionField = getRequiredStringParam(config, PRECISION_FIELD);
- recallField = getRequiredStringParam(config, RECALL_FIELD);
- modelUpdateDateField = getRequiredStringParam(config,
MODEL_UPDATE_DATE_FIELD);
- modelEvaluationDateField = getRequiredStringParam(config,
MODEL_EVALUATION_DATE_FIELD);
- falsePositivesField = getRequiredStringParam(config,
FALSE_POSITIVES_FIELD);
- falseNegativesField = getRequiredStringParam(config,
FALSE_NEGATIVES_FIELD);
- positiveSupportField = getRequiredStringParam(config,
POSITIVE_SUPPORT_FIELD);
- negativeSupportField = getRequiredStringParam(config,
NEGATIVE_SUPPORT_FIELD);
- configureSolrCore(config, SOLR_CORE, engineName + "-model");
+ acceptedLanguageSet = new HashSet<String>(acceptedLanguages);
+ precisionField = getRequiredStringParam(config, PRECISION_FIELD,
DEFAULT_PRECISION_FIELD);
+ recallField = getRequiredStringParam(config, RECALL_FIELD,
DEFAULT_RECALL_FIELD);
+ modelUpdateDateField = getRequiredStringParam(config,
MODEL_UPDATE_DATE_FIELD, DEFAULT_MODEL_UPDATE_DATE_FIELD);
+ modelEvaluationDateField = getRequiredStringParam(config,
MODEL_EVALUATION_DATE_FIELD, DEFAULT_MODEL_EVALUATION_DATE_FIELD);
+ falsePositivesField = getRequiredStringParam(config,
FALSE_POSITIVES_FIELD, DEFAULT_FALSE_POSITIVES_FIELD);
+ falseNegativesField = getRequiredStringParam(config,
FALSE_NEGATIVES_FIELD, DEFAULT_FALSE_NEGATIVES_FIELD);
+ positiveSupportField = getRequiredStringParam(config,
POSITIVE_SUPPORT_FIELD, DEFAULT_POSITIVE_SUPPORT_FIELD);
+ negativeSupportField = getRequiredStringParam(config,
NEGATIVE_SUPPORT_FIELD, DEFAULT_NEGATIVE_SUPPORT_FIELD);
+ configureSolrCore(config, SOLR_CORE, engineName +
"-model",SOLR_CORE_CONFIG);
// optional fields, can be null
- broaderField = (String) config.get(BROADER_FIELD);
- primaryTopicUriField = (String) config.get(PRIMARY_TOPIC_URI_FIELD);
+ broaderField = getRequiredStringParam(config, BROADER_FIELD,
DEFAULT_BROADER_FIELD);
+ primaryTopicUriField = getRequiredStringParam(config,
PRIMARY_TOPIC_URI_FIELD, DEFAULT_PRIMARY_TOPIC_URI_FIELD);
trainingSetId = (String) config.get(TRAINING_SET_ID);
Object orderParamValue = config.get(ORDER);
- if (orderParamValue != null) {
- order = (Integer) orderParamValue;
+ if (orderParamValue instanceof Number) {
+ order = ((Number) orderParamValue).intValue();
+ } else if(orderParamValue != null){
+ try {
+ Integer.parseInt(orderParamValue.toString());
+ }catch (NumberFormatException e) {
+ throw new ConfigurationException(ORDER, "The configured
EnhancementEngine "
+ + "order MUST BE an Intever value!",e);
+ }
+ } else {
+ order = DEFAULT_ENGINE_ORDER;
}
}
@Override
public int canEnhance(ContentItem ci) throws EngineException {
if (ContentItemHelper.getBlob(ci, SUPPORTED_MIMETYPES) != null &&
getActiveSolrServer() != null) {
- return ENHANCE_SYNCHRONOUS;
+ String language = EnhancementEngineHelper.getLanguage(ci);
+ if(acceptedLanguageSet.isEmpty() ||
acceptedLanguageSet.contains(language) ||
+ acceptedLanguageSet.contains("")){
+ return ENHANCE_SYNCHRONOUS;
+ } else {
+ return CANNOT_ENHANCE;
+ }
} else {
return CANNOT_ENHANCE;
}
@@ -421,6 +486,12 @@ public class TopicClassificationEngine e
+ "') -> this indicates that canEnhance was"
+ "NOT called and indicates a bug in the used
EnhancementJobManager!");
}
+ String language = EnhancementEngineHelper.getLanguage(ci);
+ if(!(acceptedLanguageSet.isEmpty() ||
acceptedLanguageSet.contains(language) ||
+ acceptedLanguageSet.contains(""))){
+ throw new IllegalStateException("The language '"+language+"' of
the ContentItem is not configured as "
+ +" active for this Engine (active: "+acceptedLanguageSet+").");
+ }
String text;
try {
text = ContentItemHelper.getText(contentPart.getValue());
@@ -551,7 +622,7 @@ public class TopicClassificationEngine e
List<TopicSuggestion> suggestedTopics = new
ArrayList<TopicSuggestion>(MAX_SUGGESTIONS * 3);
SolrServer solrServer = getActiveSolrServer();
SolrQuery query = new SolrQuery();
- query.setQueryType("/" + MoreLikeThisParams.MLT);
+ query.setRequestHandler("/" + MoreLikeThisParams.MLT);
query.setFilterQueries(entryTypeField + ":" + MODEL_ENTRY);
query.set(MoreLikeThisParams.MATCH_INCLUDE, false);
query.set(MoreLikeThisParams.MIN_DOC_FREQ, 1);
@@ -823,7 +894,15 @@ public class TopicClassificationEngine e
return trainingSet;
}
if (trainingSetTracker != null) {
- return (TrainingSet) trainingSetTracker.getService();
+ TrainingSet trainingsSet = (TrainingSet)
trainingSetTracker.getService();
+ if(trainingsSet == null){
+ for(int i=0; i < 5 && trainingsSet == null; i++){
+ try {
+ trainingsSet = (TrainingSet)
trainingSetTracker.waitForService(1000);
+ } catch (InterruptedException e) {/*ignore*/}
+ }
+ }
+ return trainingsSet;
}
return null;
}
@@ -1023,13 +1102,14 @@ public class TopicClassificationEngine e
cvFoldCount = foldCount;
}
- protected Dictionary<String,Object> getCanonicalConfiguration(Object
server) {
+ protected Dictionary<String,Object> getCanonicalConfiguration(Object
server, Object coreConfig) {
Hashtable<String,Object> config = new Hashtable<String,Object>();
config.put(EnhancementEngine.PROPERTY_NAME, engineName +
"-evaluation");
config.put(TopicClassificationEngine.ENTRY_ID_FIELD, "entry_id");
config.put(TopicClassificationEngine.ENTRY_TYPE_FIELD, "entry_type");
config.put(TopicClassificationEngine.MODEL_ENTRY_ID_FIELD,
"model_entry_id");
config.put(TopicClassificationEngine.SOLR_CORE, server);
+ config.put(TopicClassificationEngine.SOLR_CORE_CONFIG, coreConfig);
config.put(TopicClassificationEngine.CONCEPT_URI_FIELD, "concept");
config.put(TopicClassificationEngine.PRIMARY_TOPIC_URI_FIELD,
"primary_topic");
config.put(TopicClassificationEngine.SIMILARTITY_FIELD,
"classifier_features");
@@ -1085,12 +1165,8 @@ public class TopicClassificationEngine e
return updatedTopics;
}
- protected int performCVFold(int cvFoldIndex,
- int cvFoldCount,
- int cvIterations,
- boolean incremental) throws
ConfigurationException,
- TrainingSetException,
- ClassifierException {
+ protected int performCVFold(int cvFoldIndex, int cvFoldCount, int
cvIterations, boolean incremental)
+ throws ConfigurationException, TrainingSetException,
ClassifierException {
cvIterations = cvIterations <= 0 ? cvFoldCount : cvFoldCount;
log.info(String.format("Performing evaluation %d-fold CV iteration
%d/%d on classifier %s",
@@ -1102,7 +1178,9 @@ public class TopicClassificationEngine e
// OSGi setup: the evaluation server will be generated
automatically using the
// managedSolrServer
classifier.bindManagedSolrServer(managedSolrServer);
- classifier.activate(context,
getCanonicalConfiguration(engineName + "-evaluation"));
+ classifier.activate(context, getCanonicalConfiguration(
+ engineName + "-evaluation", //TODO: maybe we should use
the SolrCoreName instead
+ solrCoreConfig));
} else {
if(__evaluationServer == null){
__evaluationServerDir = new
File(embeddedSolrServerDir,engineName + "-evaluation");
@@ -1112,7 +1190,7 @@ public class TopicClassificationEngine e
__evaluationServer =
EmbeddedSolrHelper.makeEmbeddedSolrServer(__evaluationServerDir,
"evaluationclassifierserver", "default-topic-model",
"default-topic-model");
}
-
classifier.configure(getCanonicalConfiguration(__evaluationServer));
+
classifier.configure(getCanonicalConfiguration(__evaluationServer,solrCoreConfig));
}
} catch (Exception e) {
throw new ClassifierException(e);
Modified:
stanbol/trunk/enhancement-engines/topic/engine/src/main/java/org/apache/stanbol/enhancer/topic/ConfiguredSolrCoreTracker.java
URL:
http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/topic/engine/src/main/java/org/apache/stanbol/enhancer/topic/ConfiguredSolrCoreTracker.java?rev=1489728&r1=1489727&r2=1489728&view=diff
==============================================================================
---
stanbol/trunk/enhancement-engines/topic/engine/src/main/java/org/apache/stanbol/enhancer/topic/ConfiguredSolrCoreTracker.java
(original)
+++
stanbol/trunk/enhancement-engines/topic/engine/src/main/java/org/apache/stanbol/enhancer/topic/ConfiguredSolrCoreTracker.java
Wed Jun 5 07:23:15 2013
@@ -17,20 +17,22 @@
package org.apache.stanbol.enhancer.topic;
import java.io.IOException;
-import java.net.URL;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Dictionary;
import java.util.List;
-import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
import org.apache.solr.client.solrj.SolrServer;
import org.apache.stanbol.commons.solr.IndexReference;
import org.apache.stanbol.commons.solr.RegisteredSolrServerTracker;
import org.apache.stanbol.commons.solr.managed.IndexMetadata;
+import org.apache.stanbol.commons.solr.managed.ManagedIndexState;
import org.apache.stanbol.commons.solr.managed.ManagedSolrServer;
+import org.apache.stanbol.enhancer.engine.topic.TopicClassificationEngine;
import org.osgi.service.cm.ConfigurationException;
import org.osgi.service.component.ComponentContext;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
import org.xml.sax.SAXException;
/**
@@ -38,6 +40,8 @@ import org.xml.sax.SAXException;
*/
public abstract class ConfiguredSolrCoreTracker {
+ protected final Logger log = LoggerFactory.getLogger(getClass());
+
protected ManagedSolrServer managedSolrServer;
protected String solrCoreId;
@@ -49,7 +53,9 @@ public abstract class ConfiguredSolrCore
protected ComponentContext context;
- protected String indexArchiveName;
+ protected String solrCoreConfig;
+
+ //protected String indexArchiveName;
abstract public void configure(Dictionary<String,Object> config) throws
ConfigurationException;
@@ -93,7 +99,22 @@ public abstract class ConfiguredSolrCore
* tracker.
*/
public SolrServer getActiveSolrServer() {
- SolrServer result = solrServer != null ? solrServer :
indexTracker.getService();
+ SolrServer result;
+ if(solrServer != null){
+ result = solrServer;
+ } else {
+ result = indexTracker.getService();
+ if(result == null){
+ //try to wait for the server (mainly because the evaluation
+ //server is created on demand and will need some time to be
+ //initialised).
+ for(int i = 0; i < 5 && result == null; i++){
+ try {
+ result = (SolrServer)
indexTracker.waitForService(1000);
+ } catch (InterruptedException e) {/* ignore */ }
+ }
+ }
+ }
if (result == null) {
if (solrCoreId != null) {
throw new RuntimeException("No Solr Core registered with id: "
+ solrCoreId);
@@ -105,27 +126,32 @@ public abstract class ConfiguredSolrCore
}
protected void configureSolrCore(Dictionary<String,Object> config,
- String solrCoreProperty,
- String defaultCoreId) throws
ConfigurationException {
+ String solrCoreProperty, String defaultCoreId,
+ String solrCoreConfigProperty)
+ throws ConfigurationException {
Object solrCoreInfo = config.get(solrCoreProperty);
if (solrCoreInfo instanceof SolrServer) {
// Bind a fixed Solr server client instead of doing dynamic OSGi
lookup using the service tracker.
// This can be useful both for unit-testing .
solrServer = (SolrServer) config.get(solrCoreProperty);
+ solrCoreConfig =
TopicClassificationEngine.DEFAULT_SOLR_CORE_CONFIG;
} else {
- if (solrCoreInfo != null &&
!solrCoreInfo.toString().trim().isEmpty()) {
- this.solrCoreId = solrCoreInfo.toString();
- } else {
- this.solrCoreId = defaultCoreId;
- }
if (context == null) {
throw new ConfigurationException(solrCoreProperty,
solrCoreProperty + " should be a SolrServer instance
for using"
+ " the engine without any OSGi context. Got:
" + solrCoreId);
}
+ if (solrCoreInfo != null &&
!solrCoreInfo.toString().trim().isEmpty()) {
+ this.solrCoreId = solrCoreInfo.toString().trim();
+ } else {
+ this.solrCoreId = defaultCoreId;
+ }
+ solrCoreConfig = getRequiredStringParam(config,
solrCoreConfigProperty,
+ this.solrCoreId + ".solrindex.zip");
try {
IndexReference indexReference =
IndexReference.parse(solrCoreId);
- indexReference = checkInitSolrIndex(indexReference);
+ //String configName = getRequiredStringParam(config,
SOLR_CONFIG, defaultValue)
+ indexReference = checkInitSolrIndex(indexReference,
solrCoreConfig);
// track the solr core OSGi updates
indexTracker = new
RegisteredSolrServerTracker(context.getBundleContext(), indexReference);
indexTracker.open();
@@ -134,30 +160,61 @@ public abstract class ConfiguredSolrCore
}
}
}
-
- protected IndexReference checkInitSolrIndex(IndexReference indexReference)
throws IOException,
-
ConfigurationException,
-
SAXException {
+ /**
+ * Checks if the SolrIndex is available and if not it tries to initialise
it
+ * @param indexReference the SolrCore reference
+ * @param solrCoreConfig the name of the SolrIndex configuration
({name}.solrindex.zip)
+ * @return
+ * @throws IOException
+ * @throws ConfigurationException
+ * @throws SAXException
+ */
+ protected IndexReference checkInitSolrIndex(IndexReference indexReference,
String solrCoreConfig)
+ throws IOException, ConfigurationException, SAXException {
// if the solr core is managed, check that the index is properly
activated
if (managedSolrServer != null &&
indexReference.checkServer(managedSolrServer.getServerName())
- && context != null) {
+ && context != null && solrCoreConfig != null) {
+ log.info(" > check/init index {} on ManagedSolrServer {}",
indexReference, managedSolrServer.getServerName());
String indexName = indexReference.getIndex();
- IndexMetadata indexMetadata =
managedSolrServer.getIndexMetadata(indexName);
- if (indexMetadata == null) {
- // TODO: debug the DataFileProvider init race conditions
instead
- // indexMetadata =
managedSolrServer.createSolrIndex(indexName, indexArchiveName, null);
- URL archiveUrl = context.getBundleContext().getBundle()
- .getEntry("/data-files/" + indexArchiveName +
".solrindex.zip");
- if (archiveUrl == null) {
- throw new ConfigurationException(solrCoreId, "Could not
find index archive for "
- +
indexArchiveName);
+ final IndexMetadata indexMetadata;
+ ManagedIndexState indexState =
managedSolrServer.getIndexState(indexName);
+ if(indexState == null){
+ if(solrCoreConfig.indexOf(".solrindex.") < 0){ //if the suffix
is missing
+ solrCoreConfig = solrCoreConfig + ".solrindex.zip";
//append it
+ }
+ log.info("Create SolrCore {} (config: {}) on ManagedSolrServer
{} ...",
+ new
Object[]{indexName,solrCoreConfig,managedSolrServer.getServerName()});
+ indexMetadata = managedSolrServer.createSolrIndex(indexName,
+ solrCoreConfig, null);
+ if(indexMetadata != null)
+ log.info(" ... created {}",
indexMetadata.getIndexReference());
+ } else {
+ indexMetadata = managedSolrServer.getIndexMetadata(indexName);
+ if(indexState != ManagedIndexState.ACTIVE){
+ log.info(" ... activate {}",
indexMetadata.getIndexReference());
+ managedSolrServer.activateIndex(indexName);
+ } else {
+ log.info(" ... index {} already active",
indexMetadata.getIndexReference());
}
- ZipArchiveInputStream zis = new
ZipArchiveInputStream(archiveUrl.openStream());
- indexMetadata = managedSolrServer.updateIndex(indexName, zis,
indexArchiveName);
- }
- if (!indexMetadata.isActive()) {
- managedSolrServer.activateIndex(indexName);
}
+// IndexMetadata indexMetadata =
managedSolrServer.getIndexMetadata(indexName);
+// if (indexMetadata == null) {
+// // TODO: debug the DataFileProvider init race conditions
instead
+// // indexMetadata =
managedSolrServer.createSolrIndex(indexName, indexArchiveName, null);
+//
dfp.getInputStream(context.getBundleContext().getBundle().getSymbolicName(),
+// indexArchiveName + ".solrindex.zip", null);
+// URL archiveUrl = context.getBundleContext().getBundle()
+// .getEntry("/data-files/" + indexArchiveName +
".solrindex.zip");
+// if (archiveUrl == null) {
+// throw new ConfigurationException(solrCoreId, "Could not
find index archive for "
+// +
indexArchiveName);
+// }
+// ZipArchiveInputStream zis = new
ZipArchiveInputStream(archiveUrl.openStream());
+// indexMetadata = managedSolrServer.updateIndex(indexName,
zis, indexArchiveName);
+// }
+// if (!indexMetadata.isActive()) {
+// managedSolrServer.activateIndex(indexName);
+// }
indexReference = indexMetadata.getIndexReference();
}
return indexReference;
Modified:
stanbol/trunk/enhancement-engines/topic/engine/src/main/java/org/apache/stanbol/enhancer/topic/training/SolrTrainingSet.java
URL:
http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/topic/engine/src/main/java/org/apache/stanbol/enhancer/topic/training/SolrTrainingSet.java?rev=1489728&r1=1489727&r2=1489728&view=diff
==============================================================================
---
stanbol/trunk/enhancement-engines/topic/engine/src/main/java/org/apache/stanbol/enhancer/topic/training/SolrTrainingSet.java
(original)
+++
stanbol/trunk/enhancement-engines/topic/engine/src/main/java/org/apache/stanbol/enhancer/topic/training/SolrTrainingSet.java
Wed Jun 5 07:23:15 2013
@@ -43,6 +43,7 @@ import org.apache.solr.client.solrj.util
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrInputDocument;
import org.apache.stanbol.commons.solr.managed.ManagedSolrServer;
+import
org.apache.stanbol.commons.stanboltools.datafileprovider.DataFileProvider;
import org.apache.stanbol.enhancer.topic.Batch;
import org.apache.stanbol.enhancer.topic.ConfiguredSolrCoreTracker;
import org.apache.stanbol.enhancer.topic.UTCTimeStamper;
@@ -59,24 +60,38 @@ import org.slf4j.LoggerFactory;
@Component(metatype = true, immediate = true, configurationFactory = true,
policy = ConfigurationPolicy.REQUIRE)
@Service
@Properties(value = {@Property(name = SolrTrainingSet.TRAINING_SET_NAME),
- @Property(name = SolrTrainingSet.SOLR_CORE),
- @Property(name = SolrTrainingSet.EXAMPLE_ID_FIELD, value
= "id"),
- @Property(name = SolrTrainingSet.EXAMPLE_TEXT_FIELD,
value = "text"),
- @Property(name = SolrTrainingSet.TOPICS_URI_FIELD, value
= "topics"),
- @Property(name = SolrTrainingSet.MODIFICATION_DATE_FIELD,
value = "modification_dt")})
+ @Property(name = SolrTrainingSet.SOLR_CORE),
+ @Property(name = SolrTrainingSet.SOLR_CORE_CONFIG, value =
SolrTrainingSet.DEFAULT_SOLR_CORE_CONFIG)
+// @Property(name = SolrTrainingSet.EXAMPLE_ID_FIELD, value =
SolrTrainingSet.DEFAULT_EXAMPLE_ID_FIELD),
+// @Property(name = SolrTrainingSet.EXAMPLE_TEXT_FIELD, value =
SolrTrainingSet.DEFAULT_EXAMPLE_TEXT_FIELD),
+// @Property(name = SolrTrainingSet.TOPICS_URI_FIELD, value =
SolrTrainingSet.DEFAULT_TOPICS_URI_FIELD),
+// @Property(name = SolrTrainingSet.MODIFICATION_DATE_FIELD, value =
SolrTrainingSet.DEFAULT_MODIFICATION_DATE_FIELD)
+})
public class SolrTrainingSet extends ConfiguredSolrCoreTracker implements
TrainingSet {
public static final String TRAINING_SET_NAME =
"org.apache.stanbol.enhancer.topic.trainingset.id";
public static final String SOLR_CORE =
"org.apache.stanbol.enhancer.engine.topic.solrCore";
+ public static final String SOLR_CORE_CONFIG =
"org.apache.stanbol.enhancer.engine.topic.solrCoreConfig";
+
+ public static final String DEFAULT_SOLR_CORE_CONFIG =
"default-topic-trainingset.solrindex.zip";
+
public static final String TOPICS_URI_FIELD =
"org.apache.stanbol.enhancer.engine.topic.topicsUriField";
+
+ public static final String DEFAULT_TOPICS_URI_FIELD = "topics";
public static final String EXAMPLE_ID_FIELD =
"org.apache.stanbol.enhancer.engine.topic.exampleIdField";
+
+ public static final String DEFAULT_EXAMPLE_ID_FIELD = "id";
public static final String EXAMPLE_TEXT_FIELD =
"org.apache.stanbol.enhancer.engine.topic.exampleTextField";
+
+ public static final String DEFAULT_EXAMPLE_TEXT_FIELD = "text";
public static final String MODIFICATION_DATE_FIELD =
"org.apache.stanbol.enhancer.engine.topic.modificiationDateField";
+
+ public static final String DEFAULT_MODIFICATION_DATE_FIELD =
"modification_dt";
@SuppressWarnings("unused")
private static final Logger log =
LoggerFactory.getLogger(SolrTrainingSet.class);
@@ -96,14 +111,13 @@ public class SolrTrainingSet extends Con
@Reference(cardinality = ReferenceCardinality.OPTIONAL_UNARY, bind =
"bindManagedSolrServer", unbind = "unbindManagedSolrServer", strategy =
ReferenceStrategy.EVENT, policy = ReferencePolicy.DYNAMIC)
protected ManagedSolrServer managedSolrServer;
-
+
public String getName() {
return trainingSetId;
}
@Activate
protected void activate(ComponentContext context) throws
ConfigurationException, InvalidSyntaxException {
- indexArchiveName = "default-topic-trainingset";
@SuppressWarnings("unchecked")
Dictionary<String,Object> config = context.getProperties();
this.context = context;
@@ -120,11 +134,11 @@ public class SolrTrainingSet extends Con
@Override
public void configure(Dictionary<String,Object> config) throws
ConfigurationException {
trainingSetId = getRequiredStringParam(config, TRAINING_SET_NAME);
- exampleIdField = getRequiredStringParam(config, EXAMPLE_ID_FIELD);
- exampleTextField = getRequiredStringParam(config, EXAMPLE_TEXT_FIELD);
- topicUrisField = getRequiredStringParam(config, TOPICS_URI_FIELD);
- modificationDateField = getRequiredStringParam(config,
MODIFICATION_DATE_FIELD);
- configureSolrCore(config, SOLR_CORE, trainingSetId);
+ exampleIdField = getRequiredStringParam(config, EXAMPLE_ID_FIELD,
DEFAULT_EXAMPLE_ID_FIELD);
+ exampleTextField = getRequiredStringParam(config, EXAMPLE_TEXT_FIELD,
DEFAULT_EXAMPLE_TEXT_FIELD);
+ topicUrisField = getRequiredStringParam(config, TOPICS_URI_FIELD,
DEFAULT_TOPICS_URI_FIELD);
+ modificationDateField = getRequiredStringParam(config,
MODIFICATION_DATE_FIELD, DEFAULT_MODIFICATION_DATE_FIELD);
+ configureSolrCore(config, SOLR_CORE, trainingSetId, SOLR_CORE_CONFIG);
}
public static ConfiguredSolrCoreTracker
fromParameters(Dictionary<String,Object> config) throws ConfigurationException {
Modified:
stanbol/trunk/enhancement-engines/topic/engine/src/main/resources/OSGI-INF/metatype/metatype.properties
URL:
http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/topic/engine/src/main/resources/OSGI-INF/metatype/metatype.properties?rev=1489728&r1=1489727&r2=1489728&view=diff
==============================================================================
---
stanbol/trunk/enhancement-engines/topic/engine/src/main/resources/OSGI-INF/metatype/metatype.properties
(original)
+++
stanbol/trunk/enhancement-engines/topic/engine/src/main/resources/OSGI-INF/metatype/metatype.properties
Wed Jun 5 07:23:15 2013
@@ -42,9 +42,27 @@ same value can be executed in parallel.
#org.apache.stanbol.enhancer.engine.topic.languages
org.apache.stanbol.enhancer.engine.topic.languages.name=Languages
+org.apache.stanbol.enhancer.engine.topic.languages.description=The list of \
+supported languages (default: all)
#org.apache.stanbol.enhancer.engine.topic.solrCore
org.apache.stanbol.enhancer.engine.topic.solrCore.name=Solr Core
+org.apache.stanbol.enhancer.engine.topic.solrCore.description=The name of the \
+Solr Core (default: '{engine-name}-model'). This also supports the \
+'{server-name}:{core-name}' syntax.
+
+#org.apache.stanbol.enhancer.engine.topic.solrCoreConfig
+org.apache.stanbol.enhancer.engine.topic.solrCoreConfig.name=Solr Core Config
+org.apache.stanbol.enhancer.engine.topic.solrCoreConfig.description=Specifies \
+the Solr Core Configuration used by the Topic Classification instance. \
+The file with this name is loaded via the DataFileProvider. It may also contain \
+a pre-trained model.
+
+#org.apache.stanbol.enhancer.engine.topic.trainingSetId
+org.apache.stanbol.enhancer.engine.topic.trainingSetId.name=Training Set
+org.apache.stanbol.enhancer.engine.topic.trainingSetId.description=The name of
\
+the Training Set used for this Topic Classification engine. If not specified
the \
+model of this Engine will be read-only.
#org.apache.stanbol.enhancer.engine.topic.entryIdField
org.apache.stanbol.enhancer.engine.topic.entryIdField.name=ID Field
@@ -93,3 +111,32 @@ org.apache.stanbol.enhancer.engine.topic
#org.apache.stanbol.enhancer.engine.topic.negativeSupportField
org.apache.stanbol.enhancer.engine.topic.negativeSupportField.name=Negative
Support Field
+
+org.apache.stanbol.enhancer.engine.topic.conceptUriField.name=Concept URI Field
+
+org.apache.stanbol.enhancer.engine.topic.primaryTopicField.name=Primary Topic Field
+
+# Configuration Properties for the Solr Training Set
+# org.apache.stanbol.enhancer.topic.training.SolrTrainingSet
+org.apache.stanbol.enhancer.topic.training.SolrTrainingSet.name=Apache Stanbol
\
+Enhancer: Solr based Topic Classifier TrainingSet
+org.apache.stanbol.enhancer.topic.training.SolrTrainingSet.description=Solr \
+based implementation of a TrainingSet for Topic Classifiers
+
+
+org.apache.stanbol.enhancer.topic.trainingset.id.name=Training Set Name
+org.apache.stanbol.enhancer.engine.topic.exampleIdField.name=Document ID Field
+org.apache.stanbol.enhancer.engine.topic.exampleIdField.description=The Solr \
+field name used to store the ID of the training document
+org.apache.stanbol.enhancer.engine.topic.exampleTextField.name=Document Text Field
+org.apache.stanbol.enhancer.engine.topic.exampleTextField.description=The Solr \
+field name used to store the text of the training document
+org.apache.stanbol.enhancer.engine.topic.topicsUriField.name=Topic URI Field
+org.apache.stanbol.enhancer.engine.topic.topicsUriField.description=The Solr \
+field name used to store the URIs of Concepts the training document is assigned to
+org.apache.stanbol.enhancer.engine.topic.modificiationDateField.name=Modification Date Field
+org.apache.stanbol.enhancer.engine.topic.modificiationDateField.description=The Solr \
+field name used to store the date of the last change to the training document
+
+
+
Modified:
stanbol/trunk/enhancement-engines/topic/engine/src/main/resources/default-topic-model/conf/schema.xml
URL:
http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/topic/engine/src/main/resources/default-topic-model/conf/schema.xml?rev=1489728&r1=1489727&r2=1489728&view=diff
==============================================================================
---
stanbol/trunk/enhancement-engines/topic/engine/src/main/resources/default-topic-model/conf/schema.xml
(original)
+++
stanbol/trunk/enhancement-engines/topic/engine/src/main/resources/default-topic-model/conf/schema.xml
Wed Jun 5 07:23:15 2013
@@ -9,7 +9,7 @@
License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
OF ANY KIND, either express or implied. See the License for the specific
language governing permissions and limitations under the License. -->
-<schema name="example" version="1.3">
+<schema name="default-topic-model" version="1.3">
<types>
<fieldType name="uuid" class="solr.UUIDField" indexed="true" />