Author: rwesten
Date: Mon Apr 15 06:47:13 2013
New Revision: 1467865
URL: http://svn.apache.org/r1467865
Log:
STANBOL-1031: The Jena TDB LDpath RDFBackend implementation now correctly
handles literals with empty language; STANBOL-1016: implemented TripleFilter
for the Jena TDB indexing source
Added:
stanbol/trunk/entityhub/indexing/source/jenatdb/src/main/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/LiteralLanguageFilter.java
(with props)
stanbol/trunk/entityhub/indexing/source/jenatdb/src/main/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/PropertyPrefixFilter.java
(with props)
stanbol/trunk/entityhub/indexing/source/jenatdb/src/main/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/RdfImportFilter.java
(with props)
stanbol/trunk/entityhub/indexing/source/jenatdb/src/test/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/LanguageLiteralFilterTest.java
(with props)
stanbol/trunk/entityhub/indexing/source/jenatdb/src/test/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/PropertyPrefixFilterTest.java
(with props)
stanbol/trunk/entityhub/indexing/source/jenatdb/src/test/resources/prefix.config
Modified:
stanbol/trunk/entityhub/indexing/source/jenatdb/src/main/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/AbstractTdbBackend.java
stanbol/trunk/entityhub/indexing/source/jenatdb/src/main/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/DestinationTripleGraph.java
stanbol/trunk/entityhub/indexing/source/jenatdb/src/main/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/RdfIndexingSource.java
stanbol/trunk/entityhub/indexing/source/jenatdb/src/main/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/RdfResourceImporter.java
Modified:
stanbol/trunk/entityhub/indexing/source/jenatdb/src/main/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/AbstractTdbBackend.java
URL:
http://svn.apache.org/viewvc/stanbol/trunk/entityhub/indexing/source/jenatdb/src/main/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/AbstractTdbBackend.java?rev=1467865&r1=1467864&r2=1467865&view=diff
==============================================================================
---
stanbol/trunk/entityhub/indexing/source/jenatdb/src/main/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/AbstractTdbBackend.java
(original)
+++
stanbol/trunk/entityhub/indexing/source/jenatdb/src/main/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/AbstractTdbBackend.java
Mon Apr 15 06:47:13 2013
@@ -54,7 +54,8 @@ public abstract class AbstractTdbBackend
private TypeMapper typeMapper = TypeMapper.getInstance();
private Locale toLocale(String lang){
- if(lang == null){
+ //Jena TDB uses '' for representing Literals without language
+ if(lang == null || lang.isEmpty()){
return null;
}
Locale locale = localeCache.get(lang);
Modified:
stanbol/trunk/entityhub/indexing/source/jenatdb/src/main/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/DestinationTripleGraph.java
URL:
http://svn.apache.org/viewvc/stanbol/trunk/entityhub/indexing/source/jenatdb/src/main/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/DestinationTripleGraph.java?rev=1467865&r1=1467864&r2=1467865&view=diff
==============================================================================
---
stanbol/trunk/entityhub/indexing/source/jenatdb/src/main/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/DestinationTripleGraph.java
(original)
+++
stanbol/trunk/entityhub/indexing/source/jenatdb/src/main/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/DestinationTripleGraph.java
Mon Apr 15 06:47:13 2013
@@ -1,6 +1,9 @@
package org.apache.stanbol.entityhub.indexing.source.jenatdb;
+import java.util.Map;
+
import org.apache.jena.atlas.lib.Tuple;
+import org.apache.jena.atlas.logging.Log;
import org.slf4j.Logger;
import com.hp.hpl.jena.graph.Node;
@@ -25,23 +28,52 @@ import com.hp.hpl.jena.tdb.sys.Names;
* <p>
* This code is based on the DestinationGraph implementation private to the
* {@link TDBLoader} class.
+ * <p>
+ * In addition this implementation supports an {@link RdfImportFilter} that
+ * can be used to filter RDF triples read from RDF files before adding them
+ * to the RDF TripleStore.
*
* @author Rupert Westenthaler
*
*/
class DestinationTripleGraph implements BulkStreamRDF {
+ /**
+ * ImportFilter that accepts all triples. This is used in case
+ * <code>null</code> is parsed as {@link RdfImportFilter} to the
constructor
+ */
+ private static final RdfImportFilter NO_FILTER = new RdfImportFilter() {
+ @Override
+ public void setConfiguration(Map<String,Object> config) {}
+ @Override
+ public boolean needsInitialisation() { return false;}
+ @Override
+ public void initialise() {}
+ @Override
+ public void close() {}
+ @Override
+ public boolean accept(Node s, Node p, Node o) {return true;}
+ };
final private DatasetGraphTDB dsg ;
final private LoadMonitor monitor ;
final private LoaderNodeTupleTable loaderTriples ;
final private boolean startedEmpty ;
private long count = 0 ;
+ private long filteredCount = 0;
private StatsCollector stats ;
+ private RdfImportFilter importFilter;
+ private final Logger importLog;
- DestinationTripleGraph(final DatasetGraphTDB dsg, Logger log) {
+ DestinationTripleGraph(final DatasetGraphTDB dsg, RdfImportFilter
importFilter, Logger log) {
this.dsg = dsg ;
startedEmpty = dsg.isEmpty() ;
monitor = new LoadMonitor(dsg, log, "triples",
BulkLoader.DataTickPoint, BulkLoader.IndexTickPoint) ;
loaderTriples = new
LoaderNodeTupleTable(dsg.getTripleTable().getNodeTupleTable(), "triples",
monitor) ;
+ if(importFilter == null){
+ this.importFilter = NO_FILTER;
+ } else {
+ this.importFilter = importFilter;
+ }
+ this.importLog = log;
}
@Override
@@ -49,19 +81,25 @@ class DestinationTripleGraph implements
{
loaderTriples.loadStart() ;
loaderTriples.loadDataStart() ;
-
this.stats = new StatsCollector() ;
}
+
+ /**
+  * Adds the triple to the store if it is accepted by the import filter.
+  * Rejected triples are only counted; every 100000 rejected triples the
+  * number and the percentage of filtered triples is logged.
+  * @param s the subject
+  * @param p the predicate
+  * @param o the object
+  */
+ private void triple(Node s, Node p, Node o){
+     if(!importFilter.accept(s, p, o)){
+         filteredCount++;
+         if(filteredCount%100000 == 0){
+             double processed = (double)(filteredCount+count);
+             importLog.info("Filtered: {} triples ({}%)",filteredCount,
+                 ((double)filteredCount*100/processed));
+         }
+         return;
+     }
+     loaderTriples.load(s, p, o);
+     stats.record(null, s, p, o);
+     count++;
+ }
@Override
- final public void triple(Triple triple)
- {
- Node s = triple.getSubject() ;
- Node p = triple.getPredicate() ;
- Node o = triple.getObject() ;
-
- loaderTriples.load(s, p, o) ;
- stats.record(null, s, p, o) ;
- count++ ;
+ final public void triple(Triple triple) {
+ triple(triple.getSubject(),triple.getPredicate(),triple.getObject());
}
@Override
@@ -81,23 +119,21 @@ class DestinationTripleGraph implements
}
@Override
- public void start() {}
+ public void start(){}
@Override
public void quad(Quad quad) {
- triple(quad.asTriple());
+ triple(quad.getSubject(),quad.getPredicate(),quad.getObject());
}
@Override
public void tuple(Tuple<Node> tuple) {
if(tuple.size() >= 3){
- loaderTriples.load(tuple.get(0), tuple.get(1), tuple.get(2)) ;
- stats.record(null, tuple.get(0), tuple.get(1), tuple.get(2)) ;
- count++ ;
+ triple(tuple.get(0),tuple.get(1),tuple.get(2));
} else {
throw new TDBException("Tuple with < 3 Nodes encountered while
loading a single graph");
}
}
@Override
- public void base(String base) { }
+ public void base(String base){}
@Override
public void prefix(String prefix, String iri) { } // TODO
@Override
Added:
stanbol/trunk/entityhub/indexing/source/jenatdb/src/main/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/LiteralLanguageFilter.java
URL:
http://svn.apache.org/viewvc/stanbol/trunk/entityhub/indexing/source/jenatdb/src/main/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/LiteralLanguageFilter.java?rev=1467865&view=auto
==============================================================================
---
stanbol/trunk/entityhub/indexing/source/jenatdb/src/main/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/LiteralLanguageFilter.java
(added)
+++
stanbol/trunk/entityhub/indexing/source/jenatdb/src/main/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/LiteralLanguageFilter.java
Mon Apr 15 06:47:13 2013
@@ -0,0 +1,134 @@
+package org.apache.stanbol.entityhub.indexing.source.jenatdb;
+
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+import java.util.Set;
+
+import org.osgi.service.cm.ConfigurationException;
+
+import com.hp.hpl.jena.graph.Node;
+
+/**
+ * Allows to filter triples based on the language of their value. Triples with
+ * values other than <code>{@link Node#isLiteral()} == true</code> are accepted.
+ * This is also true for all literals that do not have a language assigned.
+ * Language tags are compared case insensitively (both the configured languages
+ * and the language of the literal are converted to lower case).
+ * @author Rupert Westenthaler
+ *
+ */
+public class LiteralLanguageFilter implements RdfImportFilter {
+    /**
+     * Allows to configure the literal languages included/excluded during the
+     * import of RDF data<p>
+     * <b>Syntax: </b><code>{lang1},!{lang2},*</code>
+     * <ul>
+     * <li>'{lang}' includes a language
+     * <li>'!{lang}' excludes a language
+     * <li>',' is the separator, additional spaces are trimmed
+     * <li>'*' will include all languages not explicitly excluded
+     * </ul>
+     */
+    public static final String PARAM_LITERAL_LANGUAGES = "if-literal-language";
+    /**
+     * The explicitly included languages (lower case). Only used for
+     * filtering if {@link #includeAll} is <code>false</code>.
+     */
+    private Set<String> configuredLanguages = Collections.emptySet();
+    /**
+     * The explicitly excluded languages (lower case). Only used for
+     * filtering if {@link #includeAll} is <code>true</code>.
+     */
+    private Set<String> excludedLanguages = Collections.emptySet();
+    /**
+     * If languages that are not explicitly excluded are accepted. Defaults to
+     * <code>true</code> so that a filter that was never configured accepts
+     * all triples instead of failing.
+     */
+    private boolean includeAll = true;
+
+    public LiteralLanguageFilter(){}
+
+    /**
+     * For unit tests
+     * @param config the test config
+     */
+    protected LiteralLanguageFilter(String config){
+        parseLanguages(config);
+    }
+
+    /**
+     * Reads the {@link #PARAM_LITERAL_LANGUAGES} value from the parsed
+     * configuration. If the parameter is missing the filter accepts all
+     * triples.
+     * @param config the configuration
+     * @throws IllegalArgumentException if a language is both included and
+     * excluded
+     */
+    @Override
+    public void setConfiguration(Map<String,Object> config) {
+        Object value = config.get(PARAM_LITERAL_LANGUAGES);
+        if(value == null){
+            includeAll = true;
+            excludedLanguages = Collections.emptySet();
+            configuredLanguages = Collections.emptySet();
+        } else {
+            parseLanguages(value.toString());
+        }
+    }
+
+    /**
+     * Parses the comma separated language configuration and (re)initialises
+     * the state of this filter.
+     * @param config the configuration using the syntax described by
+     * {@link #PARAM_LITERAL_LANGUAGES}
+     * @throws IllegalArgumentException if a language is both included and
+     * excluded
+     */
+    private void parseLanguages(String config){
+        configuredLanguages = new HashSet<String>();
+        excludedLanguages = new HashSet<String>();
+        //reset the state so that a re-configuration does not inherit a '*'
+        //of the previous configuration
+        includeAll = false;
+        for(String entry : config.split(",")){
+            String lang = entry.trim().toLowerCase(Locale.ROOT);
+            if(lang.isEmpty()){
+                continue; //ignore empty values
+            }
+            if(lang.equals("*")){
+                includeAll = true;
+                continue;
+            }
+            boolean exclude = lang.charAt(0) == '!';
+            if(exclude){
+                //trim again to support '! de'
+                lang = lang.substring(1).trim();
+                if(lang.isEmpty()){
+                    continue; //only a '!' without a language
+                }
+            }
+            Set<String> target = exclude ? excludedLanguages : configuredLanguages;
+            Set<String> opposite = exclude ? configuredLanguages : excludedLanguages;
+            if(opposite.contains(lang)){
+                throw new IllegalArgumentException(
+                    "Langauge '"+lang+"' is both included and excluded (config: "
+                    + config+")");
+            }
+            target.add(lang);
+        }
+    }
+
+    @Override
+    public boolean needsInitialisation() {
+        return false;
+    }
+
+    @Override
+    public void initialise() {
+    }
+
+    @Override
+    public void close() {
+    }
+
+    /**
+     * Accepts all triples with non-literal values and all literals without
+     * a language. Literals with a language are accepted if the language is
+     * included (or not excluded in case '*' is configured).
+     * @param s the subject (not used)
+     * @param p the predicate (not used)
+     * @param o the object of the triple
+     * @return <code>true</code> if the triple should be imported
+     */
+    @Override
+    public boolean accept(Node s, Node p, Node o) {
+        if(!o.isLiteral()){
+            return true; //accept all none literals
+        }
+        if(includeAll && excludedLanguages.isEmpty()){
+            return true; //deactivated
+        }
+        String lang = o.getLiteralLanguage();
+        if(lang == null || lang.isEmpty()){
+            //no language (null) or default language (empty)
+            return true; //accept it
+        }
+        //the configured languages are lower case, so the language of the
+        //literal needs to be converted too (e.g. 'en-GB' or 'EN')
+        lang = lang.toLowerCase(Locale.ROOT);
+        if(includeAll){
+            return !excludedLanguages.contains(lang);
+        }
+        return configuredLanguages.contains(lang);
+    }
+
+}
Propchange:
stanbol/trunk/entityhub/indexing/source/jenatdb/src/main/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/LiteralLanguageFilter.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added:
stanbol/trunk/entityhub/indexing/source/jenatdb/src/main/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/PropertyPrefixFilter.java
URL:
http://svn.apache.org/viewvc/stanbol/trunk/entityhub/indexing/source/jenatdb/src/main/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/PropertyPrefixFilter.java?rev=1467865&view=auto
==============================================================================
---
stanbol/trunk/entityhub/indexing/source/jenatdb/src/main/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/PropertyPrefixFilter.java
(added)
+++
stanbol/trunk/entityhub/indexing/source/jenatdb/src/main/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/PropertyPrefixFilter.java
Mon Apr 15 06:47:13 2013
@@ -0,0 +1,182 @@
+package org.apache.stanbol.entityhub.indexing.source.jenatdb;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Map.Entry;
+import java.util.TreeMap;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.stanbol.commons.namespaceprefix.NamespaceMappingUtils;
+import org.apache.stanbol.commons.namespaceprefix.NamespacePrefixProvider;
+import org.apache.stanbol.commons.namespaceprefix.NamespacePrefixService;
+import org.apache.stanbol.entityhub.indexing.core.config.IndexingConfig;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.hp.hpl.jena.graph.Node;
+
+/**
+ * {@link RdfImportFilter} that accepts or rejects triples based on prefixes
+ * of the URI of their predicate. The included/excluded prefixes are read from
+ * the file referenced by {@link #PARAM_PROPERTY_FILTERS}. Triples with a
+ * predicate that is not an URI are rejected.
+ */
+public class PropertyPrefixFilter implements RdfImportFilter {
+
+    private final Logger log = LoggerFactory.getLogger(PropertyPrefixFilter.class);
+
+    /**
+     * Links to a file that defines included & excluded properties (one per line)<p>
+     * <b>Syntax</b>
+     * <ul>
+     * <li>Lines starting with '#' are ignored
+     * <li>'!{prefix}' will exclude all properties starting with the {prefix}.
+     * <li>'{prefix}' will include all properties starting with {prefix}
+     * <li>'*' will include all properties not explicitly excluded
+     * <li> Namespace prefixes are supported!
+     * <li> '{prefix}*' is also supported. However all {prefix} values are
+     * interpreted like that.
+     * </ul>
+     * <b>NOTES</b>: (1) Longer prefixes are matched first. (2) All processed
+     * values are stored in-memory. That means that matching prefixes are only
+     * calculated on the first appearance of a property.
+     */
+    public static final String PARAM_PROPERTY_FILTERS = "if-property-filter";
+
+    /**
+     * The configured prefixes (longest first) mapped to <code>true</code>
+     * if included and <code>false</code> if excluded.
+     */
+    private Map<String, Boolean> propertyPrefixMap = Collections.emptyMap();
+    /**
+     * Cache holding the decision for already encountered properties.
+     */
+    private Map<String, Boolean> propertyMap = Collections.emptyMap();
+    /**
+     * If properties not matching any prefix are accepted. Defaults to
+     * <code>true</code> so that a filter that was never configured accepts
+     * all triples with an URI predicate instead of failing.
+     */
+    private boolean includeAll = true;
+
+    public PropertyPrefixFilter(){}
+    /**
+     * For unit tests only
+     * @param nsPrefixService used to resolve namespace prefixes
+     * @param lines the lines of the configuration. The parsed list is not
+     * modified.
+     */
+    protected PropertyPrefixFilter(NamespacePrefixProvider nsPrefixService,
+            List<String> lines){
+        parsePropertyPrefixConfig(nsPrefixService, lines);
+    }
+
+    /**
+     * Reads the file referenced by {@link #PARAM_PROPERTY_FILTERS} and
+     * initialises the prefix filter. If the parameter is not present all
+     * properties are included.
+     * @param config the configuration. Needs to contain the
+     * {@link IndexingConfig} under {@link IndexingConfig#KEY_INDEXING_CONFIG}
+     * @throws IllegalArgumentException if the configured file can not be read
+     * or a namespace prefix used in the file can not be resolved
+     */
+    @Override
+    public void setConfiguration(Map<String,Object> config) {
+        IndexingConfig indexingConfig = (IndexingConfig)config.get(IndexingConfig.KEY_INDEXING_CONFIG);
+        NamespacePrefixService nsPrefixService = indexingConfig.getNamespacePrefixService();
+        log.info("Configure {}",getClass().getSimpleName());
+        Object value = config.get(PARAM_PROPERTY_FILTERS);
+        if(value == null){
+            propertyPrefixMap = Collections.emptyMap();
+            propertyMap = Collections.emptyMap();
+            includeAll = true;
+            return;
+        }
+        log.info(" > property Prefix Filters");
+        File propertyPrefixConfig = indexingConfig.getConfigFile(value.toString());
+        List<String> lines;
+        InputStream in = null;
+        try {
+            in = new FileInputStream(propertyPrefixConfig);
+            lines = IOUtils.readLines(in,"UTF-8");
+        } catch (IOException e) {
+            throw new IllegalArgumentException("Unable to read property filter configuration "
+                + "from the configured File "+propertyPrefixConfig.getAbsolutePath(),e);
+        } finally {
+            IOUtils.closeQuietly(in);
+        }
+        parsePropertyPrefixConfig(nsPrefixService, lines);
+    }
+
+    /**
+     * Parses the lines of the property prefix configuration and
+     * (re)initialises the state of this filter.
+     * @param nsPrefixService used to resolve namespace prefixes
+     * @param lines the lines of the configuration (not modified)
+     * @throws IllegalArgumentException if a used namespace prefix can not be
+     * resolved
+     */
+    private void parsePropertyPrefixConfig(NamespacePrefixProvider nsPrefixService, List<String> lines) {
+        //ensure that longer prefixes are first
+        propertyPrefixMap = new TreeMap<String,Boolean>(new Comparator<String>() {
+            @Override
+            public int compare(String o1, String o2) {
+                int length = o2.length() - o1.length();
+                if(length != 0){
+                    return length;
+                } else {
+                    return o1.compareTo(o2);
+                }
+            }
+        });
+        propertyMap = new HashMap<String,Boolean>();
+        //search for '*' without modifying the parsed list (that might be
+        //unmodifiable or still used by the caller)
+        includeAll = false;
+        for(String line : lines){
+            if(line != null && line.trim().equals("*")){
+                includeAll = true;
+                break;
+            }
+        }
+        log.info(" - includeAll: {}",includeAll);
+        for(String rawLine : lines){
+            String line = rawLine == null ? "" : rawLine.trim();
+            if(line.isEmpty() || line.startsWith("#") || line.equals("*")){
+                continue; //ignore comment, empty lines and multiple '*'
+            }
+            boolean exclude = line.charAt(0) == '!';
+            String prefix = exclude ? line.substring(1).trim() : line;
+            if(prefix.isEmpty()){
+                continue; //only a '!' without a prefix
+            }
+            if(includeAll && !exclude){
+                continue; //ignore includes if * is active
+            }
+            String uri;
+            String nsPrefix = NamespaceMappingUtils.getPrefix(prefix);
+            if(nsPrefix != null){
+                String ns = nsPrefixService.getNamespace(nsPrefix);
+                if(ns == null){
+                    throw new IllegalArgumentException("Unable to resolve namesoace prefix used by '"
+                        +prefix+"' by using the NamespacePrefixService!");
+                }
+                //replace '{nsPrefix}:' with the namespace
+                uri = new StringBuilder(ns).append(prefix,nsPrefix.length()+1, prefix.length()).toString();
+            } else {
+                uri = prefix;
+            }
+            if(uri.endsWith("*")){ //'{prefix}*' is the same as '{prefix}'
+                uri = uri.substring(0, uri.length()-1);
+            }
+            log.info(" - '{}' {}", uri, exclude ? "excluded" : "included");
+            propertyPrefixMap.put(uri, !exclude);
+        }
+    }
+
+    @Override
+    public boolean needsInitialisation() {
+        return false;
+    }
+
+    @Override
+    public void initialise() {
+    }
+
+    @Override
+    public void close() {
+    }
+
+    /**
+     * Accepts a triple based on the longest configured prefix matching the
+     * URI of the predicate. Properties that do not match any prefix are
+     * accepted only if '*' is configured. The decision is cached per property.
+     * @param s the subject (not used)
+     * @param p the predicate
+     * @param o the object (not used)
+     * @return <code>true</code> if the triple should be imported. Always
+     * <code>false</code> if the predicate is not an URI.
+     */
+    @Override
+    public boolean accept(Node s, Node p, Node o) {
+        if(!p.isURI()){
+            return false;
+        }
+        if(includeAll && propertyPrefixMap.isEmpty()){
+            return true; //nothing is excluded
+        }
+        String property = p.getURI();
+        Boolean state = propertyMap.get(property);
+        if(state == null){
+            //first time we encounter this property ... need to calculate
+            state = Boolean.valueOf(includeAll); //used if no prefix matches
+            for(Entry<String,Boolean> entry : propertyPrefixMap.entrySet()){
+                if(property.startsWith(entry.getKey())){
+                    state = entry.getValue();
+                    break; //longer prefixes are first
+                }
+            }
+            propertyMap.put(property, state);
+        }
+        //return the same decision for the first and all further appearances
+        return state.booleanValue();
+    }
+
+}
Propchange:
stanbol/trunk/entityhub/indexing/source/jenatdb/src/main/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/PropertyPrefixFilter.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added:
stanbol/trunk/entityhub/indexing/source/jenatdb/src/main/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/RdfImportFilter.java
URL:
http://svn.apache.org/viewvc/stanbol/trunk/entityhub/indexing/source/jenatdb/src/main/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/RdfImportFilter.java?rev=1467865&view=auto
==============================================================================
---
stanbol/trunk/entityhub/indexing/source/jenatdb/src/main/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/RdfImportFilter.java
(added)
+++
stanbol/trunk/entityhub/indexing/source/jenatdb/src/main/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/RdfImportFilter.java
Mon Apr 15 06:47:13 2013
@@ -0,0 +1,18 @@
+package org.apache.stanbol.entityhub.indexing.source.jenatdb;
+
+import org.apache.stanbol.entityhub.indexing.core.IndexingComponent;
+
+import com.hp.hpl.jena.graph.Node;
+
+/**
+ * Filter for triples read from RDF files. Can be used to NOT import RDF
+ * triples of RDF dumps that are not relevant for the indexing process.
+ * @author Rupert Westenthaler
+ *
+ */
+public interface RdfImportFilter extends IndexingComponent {
+
+    /**
+     * Decides if the triple with the parsed nodes is imported.
+     * @param s the subject of the triple
+     * @param p the predicate of the triple
+     * @param o the object of the triple
+     * @return <code>true</code> to import the triple or <code>false</code>
+     * to ignore it
+     */
+    boolean accept(Node s, Node p, Node o);
+
+}
Propchange:
stanbol/trunk/entityhub/indexing/source/jenatdb/src/main/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/RdfImportFilter.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Modified:
stanbol/trunk/entityhub/indexing/source/jenatdb/src/main/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/RdfIndexingSource.java
URL:
http://svn.apache.org/viewvc/stanbol/trunk/entityhub/indexing/source/jenatdb/src/main/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/RdfIndexingSource.java?rev=1467865&r1=1467864&r2=1467865&view=diff
==============================================================================
---
stanbol/trunk/entityhub/indexing/source/jenatdb/src/main/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/RdfIndexingSource.java
(original)
+++
stanbol/trunk/entityhub/indexing/source/jenatdb/src/main/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/RdfIndexingSource.java
Mon Apr 15 06:47:13 2013
@@ -44,6 +44,7 @@ import org.apache.stanbol.entityhub.serv
import org.apache.stanbol.entityhub.servicesapi.model.Text;
import org.apache.stanbol.entityhub.servicesapi.model.ValueFactory;
import org.apache.stanbol.entityhub.servicesapi.util.ModelUtils;
+import org.joda.time.field.ImpreciseDateTimeField;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -114,6 +115,13 @@ public class RdfIndexingSource extends A
*/
public static final String PARAM_IMPORT_SOURCE = "import";
/**
+ * Allows to configure one or more {@link RdfImportFilter}s by their fully
+ * qualified class names (multiple names are separated by ','). Each
+ * configured filter gets the full configuration of this component passed.
+ * This means that an import filter can be configured by the same
+ * configuration as this component.
+ */
+ public static final String PARAM_IMPORT_FILTER = "import-filter";
+ /**
* The default directory name used to search for RDF files to be imported
*/
public static final String DEFAULT_SOURCE_FOLDER_NAME = "rdfdata";
@@ -140,6 +148,7 @@ public class RdfIndexingSource extends A
* used for logging a single WARN level entry on the first ignored BNode
*/
private boolean bnodeIgnored = false;
+ private RdfImportFilter importFilter;
/**
* Default Constructor relaying on that {@link #setConfiguration(Map)} is
@@ -168,17 +177,20 @@ public class RdfIndexingSource extends A
* imported
* @param valueFactory The {@link ValueFactory} used to create instances
* or <code>null</code> to use the default implementation.
+ * @param importFilter Optionally an importFilter used for filtering some
+ * triples read from the RDF source files.
*/
public RdfIndexingSource(File modelLocation,
File sourceFileOrDirectory,
- ValueFactory valueFactory){
+ ValueFactory valueFactory,
+ RdfImportFilter importFilter){
if(modelLocation == null){
throw new IllegalArgumentException("The parsed model location MUST
NOT be NULL!");
}
//init the store
this.indexingDataset = initTDBDataset(modelLocation);
//use a ResourceLoader that fails on the first invalid RDF file
(STANBOL-328)
- this.loader = new ResourceLoader(new
RdfResourceImporter(indexingDataset), true,true);
+ this.loader = new ResourceLoader(new
RdfResourceImporter(indexingDataset,importFilter), true,true);
loader.addResource(sourceFileOrDirectory);
}
@Override
@@ -187,10 +199,48 @@ public class RdfIndexingSource extends A
//first init the RDF Model
this.indexingDataset = Utils.getTDBDataset(config);
//second we need to check if we need to import RDF files to the RDF
model
+ //check if one or more import filters are configured (a ',' separated
+ //list of the fully qualified class names)
+ Object value = config.get(PARAM_IMPORT_FILTER);
+ if(value == null){
+     log.info("No RDF Import Filter configured");
+     importFilter = null;
+ } else {
+     String[] filterNames = value.toString().split(",");
+     List<RdfImportFilter> filters = new ArrayList<RdfImportFilter>();
+     ClassLoader cl = indexingConfig.getClass().getClassLoader();
+     for(String filterName : filterNames){
+         filterName = filterName.trim();
+         if(filterName.isEmpty()){
+             continue; //ignore empty entries (e.g. because of ',,')
+         }
+         try {
+             Class<? extends RdfImportFilter> importFilterClass = cl.loadClass(
+                 filterName).asSubclass(RdfImportFilter.class);
+             RdfImportFilter filter = importFilterClass.newInstance();
+             //the filter gets the same configuration as this component
+             filter.setConfiguration(config);
+             filters.add(filter);
+             //log the created filter (the importFilter field is not yet
+             //set at this point)
+             log.info("Use RDF ImportFilter {} (type: {})",filter,importFilterClass.getSimpleName());
+         } catch (ClassNotFoundException e) {
+             throw new IllegalArgumentException("Configured RdfImportFilter '"
+                 +filterName+"' not found", e);
+         } catch (InstantiationException e) {
+             throw new IllegalArgumentException("Configured RdfImportFilter '"
+                 +filterName+"' can not be instantiated", e);
+         } catch (IllegalAccessException e) {
+             throw new IllegalArgumentException("Configured RdfImportFilter '"
+                 +filterName+"' can not be created", e);
+         }
+     }
+     if(filters.isEmpty()){
+         this.importFilter = null;
+     } else if(filters.size() == 1){
+         this.importFilter = filters.get(0);
+     } else { //multiple filters: all of them need to accept a triple
+         this.importFilter = new UnionImportFilter(filters.toArray(
+             new RdfImportFilter[filters.size()]));
+     }
+ }
//create the ResourceLoader
- this.loader = new ResourceLoader(new
RdfResourceImporter(indexingDataset), true);
+ this.loader = new ResourceLoader(new
RdfResourceImporter(indexingDataset, importFilter), true);
- Object value = config.get(PARAM_IMPORTED_FOLDER);
+ value = config.get(PARAM_IMPORTED_FOLDER);
String importedFolderName;
if(value != null && !value.toString().isEmpty()){
importedFolderName = value.toString();
@@ -281,17 +331,25 @@ public class RdfIndexingSource extends A
}
@Override
public boolean needsInitialisation() {
- //if there are resources with the state REGISTERED we need an
initialisation
- return !loader.getResources(ResourceState.REGISTERED).isEmpty();
+ return (importFilter != null && importFilter.needsInitialisation()) ||
+ !loader.getResources(ResourceState.REGISTERED).isEmpty();
}
@Override
public void initialise(){
- loader.loadResources();
+ if(importFilter != null && importFilter.needsInitialisation()){
+ importFilter.initialise();
+ }
+ if(!loader.getResources(ResourceState.REGISTERED).isEmpty()){
+ loader.loadResources();
+ }
}
@Override
public void close() {
loader = null;
indexingDataset.close();
+ if(importFilter != null){
+ importFilter.close();
+ }
}
public void debug(){
String entityVar = "s";
@@ -345,20 +403,32 @@ public class RdfIndexingSource extends A
resource = Node.createURI(id);
}
Representation source = vf.createRepresentation(id);
- ExtendedIterator<Triple> outgoing =
indexingDataset.getDefaultGraph().find(resource, null, null);
- boolean found = outgoing.hasNext();
- while(outgoing.hasNext()){ //iterate over the statements for that
resource
- Triple statement = outgoing.next();
- Node predicate = statement.getPredicate();
- if(predicate == null || !predicate.isURI()){
- log.warn("Ignore field {} for resource {} because it is null
or not an URI!",
- predicate,resource);
- } else {
- String field = predicate.getURI();
- Node value = statement.getObject();
- processValue(value, source, field);
- } //end else predicate != null
- } //end iteration over resource triple
+ boolean found;
+ ExtendedIterator<Triple> outgoing = null;
+ try { // There may still be exceptions while reading triples
+ outgoing = indexingDataset.getDefaultGraph().find(resource, null,
null);
+ found = outgoing.hasNext();
+ while(outgoing.hasNext()){ //iterate over the statements for that
resource
+ Triple statement = outgoing.next();
+ Node predicate = statement.getPredicate();
+ if(predicate == null || !predicate.isURI()){
+ log.warn("Ignore field {} for resource {} because it is
null or not an URI!",
+ predicate,resource);
+ } else {
+ String field = predicate.getURI();
+ Node value = statement.getObject();
+ processValue(value, source, field);
+ } //end else predicate != null
+ } //end iteration over resource triple
+ } catch (Exception e) {
+ log.warn("Unable to retrieve entity data for Entity '"+id+"'",e);
+ found = false;
+ try {
+ if(outgoing != null){
+ outgoing.close();
+ }
+ } catch (Exception e1) { /* ignore */}
+ }
if(found) {
if(log.isTraceEnabled()){
log.info("Resource: \n{}",
ModelUtils.getRepresentationInfo(source));
@@ -407,9 +477,9 @@ public class RdfIndexingSource extends A
if(duration != null && !duration.isEmpty()) {
source.add(field, literalValue.toString());
}
- } else {
+ } else if(!ll.getLexicalForm().isEmpty()){
source.add(field, literalValue);
- }
+ } //else ignore literals that are empty
} catch (DatatypeFormatException e) {
log.warn(" Unable to convert {} to {} -> use lecicalForm",
ll.getLexicalForm(),ll.getDatatype());
@@ -765,5 +835,40 @@ public class RdfIndexingSource extends A
return super.createURI(uri);
}
}
+ /**
+  * Used in case multiple {@link RdfImportFilter}s are configured. A triple
+  * is only accepted if all wrapped filters accept it. The lifecycle methods
+  * {@link #needsInitialisation()}, {@link #initialise()} and
+  * {@link #close()} are forwarded to the wrapped filters.
+  * @author Rupert Westenthaler
+  *
+  */
+ private static class UnionImportFilter implements RdfImportFilter {
+
+     private final RdfImportFilter[] filters;
+
+     /**
+      * @param filters the already configured filters
+      */
+     UnionImportFilter(RdfImportFilter[] filters){
+         this.filters = filters;
+     }
+
+     /**
+      * Does nothing, because the wrapped filters are expected to be
+      * configured before they are parsed to the constructor.
+      */
+     @Override
+     public void setConfiguration(Map<String,Object> config) {}
+
+     /**
+      * @return <code>true</code> if at least one of the wrapped filters
+      * needs to be initialised
+      */
+     @Override
+     public boolean needsInitialisation() {
+         for(RdfImportFilter filter : filters){
+             if(filter.needsInitialisation()){
+                 return true;
+             }
+         }
+         return false;
+     }
+
+     /**
+      * Initialises all wrapped filters that need an initialisation
+      */
+     @Override
+     public void initialise() {
+         for(RdfImportFilter filter : filters){
+             if(filter.needsInitialisation()){
+                 filter.initialise();
+             }
+         }
+     }
+
+     /**
+      * Closes all wrapped filters
+      */
+     @Override
+     public void close() {
+         for(RdfImportFilter filter : filters){
+             filter.close();
+         }
+     }
+
+     /**
+      * @return <code>true</code> only if all wrapped filters accept the
+      * triple. Filters after the first rejecting one are not called.
+      */
+     @Override
+     public boolean accept(Node s, Node p, Node o) {
+         for(RdfImportFilter filter : filters){
+             if(!filter.accept(s, p, o)){
+                 return false;
+             }
+         }
+         return true;
+     }
+
+ }
}
Modified:
stanbol/trunk/entityhub/indexing/source/jenatdb/src/main/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/RdfResourceImporter.java
URL:
http://svn.apache.org/viewvc/stanbol/trunk/entityhub/indexing/source/jenatdb/src/main/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/RdfResourceImporter.java?rev=1467865&r1=1467864&r2=1467865&view=diff
==============================================================================
---
stanbol/trunk/entityhub/indexing/source/jenatdb/src/main/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/RdfResourceImporter.java
(original)
+++
stanbol/trunk/entityhub/indexing/source/jenatdb/src/main/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/RdfResourceImporter.java
Mon Apr 15 06:47:13 2013
@@ -37,12 +37,12 @@ public class RdfResourceImporter impleme
private static final Logger log =
LoggerFactory.getLogger(RdfResourceImporter.class);
// private final DatasetGraphTDB indexingDataset;
private final DestinationTripleGraph destination;
- public RdfResourceImporter(DatasetGraphTDB indexingDataset){
+ public RdfResourceImporter(DatasetGraphTDB indexingDataset,
RdfImportFilter importFilter){
if(indexingDataset == null){
throw new IllegalArgumentException("The parsed DatasetGraphTDB
instance MUST NOT be NULL!");
}
//this.indexingDataset = indexingDataset;
- this.destination = new DestinationTripleGraph(indexingDataset,log);
+ this.destination = new
DestinationTripleGraph(indexingDataset,importFilter,log);
}
@Override
Added:
stanbol/trunk/entityhub/indexing/source/jenatdb/src/test/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/LanguageLiteralFilterTest.java
URL:
http://svn.apache.org/viewvc/stanbol/trunk/entityhub/indexing/source/jenatdb/src/test/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/LanguageLiteralFilterTest.java?rev=1467865&view=auto
==============================================================================
---
stanbol/trunk/entityhub/indexing/source/jenatdb/src/test/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/LanguageLiteralFilterTest.java
(added)
+++
stanbol/trunk/entityhub/indexing/source/jenatdb/src/test/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/LanguageLiteralFilterTest.java
Mon Apr 15 06:47:13 2013
@@ -0,0 +1,58 @@
+package org.apache.stanbol.entityhub.indexing.source.jenatdb;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+import com.hp.hpl.jena.datatypes.RDFDatatype;
+import com.hp.hpl.jena.datatypes.xsd.XSDDatatype;
+import com.hp.hpl.jena.graph.Node;
+
+public class LanguageLiteralFilterTest {
+
+ @Test(expected=IllegalArgumentException.class)
+ public void testIncludeExcludeConfig1(){
+ new LiteralLanguageFilter("en,de,!de");
+ }
+ @Test(expected=IllegalArgumentException.class)
+ public void testIncludeExcludeConfig2(){
+ new LiteralLanguageFilter("en,!de,de");
+ }
+ @Test
+ public void testDataTypes(){
+ RdfImportFilter filter = new LiteralLanguageFilter("en,de");
+
+ Assert.assertTrue(filter.accept(null, null,
+ Node.createLiteral("test", "en", false)));
+ Assert.assertTrue(filter.accept(null, null,
+ Node.createLiteral("test")));
+ Assert.assertTrue(filter.accept(null, null,
+ Node.createLiteral("10",XSDDatatype.XSDint)));
+ Assert.assertTrue(filter.accept(null, null,
+ Node.createAnon()));
+ Assert.assertTrue(filter.accept(null, null,
+ Node.createURI("urn:test")));
+ }
+ @Test
+ public void testIncludeTest(){
+ RdfImportFilter filter = new LiteralLanguageFilter("en,de");
+
+ Assert.assertTrue(filter.accept(null, null,
+ Node.createLiteral("test", "en", false)));
+ Assert.assertTrue(filter.accept(null, null,
+ Node.createLiteral("test", "de", false)));
+ Assert.assertFalse(filter.accept(null, null,
+ Node.createLiteral("test", "fr", false)));
+ }
+ @Test
+ public void testExcludeTest(){
+ RdfImportFilter filter = new LiteralLanguageFilter("*,en,!de");
+
+ Assert.assertTrue(filter.accept(null, null,
+ Node.createLiteral("test", "en", false)));
+ Assert.assertFalse(filter.accept(null, null,
+ Node.createLiteral("test", "de", false)));
+ Assert.assertTrue(filter.accept(null, null,
+ Node.createLiteral("test", "fr", false)));
+ }
+
+}
Propchange:
stanbol/trunk/entityhub/indexing/source/jenatdb/src/test/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/LanguageLiteralFilterTest.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added:
stanbol/trunk/entityhub/indexing/source/jenatdb/src/test/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/PropertyPrefixFilterTest.java
URL:
http://svn.apache.org/viewvc/stanbol/trunk/entityhub/indexing/source/jenatdb/src/test/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/PropertyPrefixFilterTest.java?rev=1467865&view=auto
==============================================================================
--- stanbol/trunk/entityhub/indexing/source/jenatdb/src/test/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/PropertyPrefixFilterTest.java (added)
+++ stanbol/trunk/entityhub/indexing/source/jenatdb/src/test/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/PropertyPrefixFilterTest.java Mon Apr 15 06:47:13 2013
@@ -0,0 +1,87 @@
+package org.apache.stanbol.entityhub.indexing.source.jenatdb;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.stanbol.commons.namespaceprefix.NamespacePrefixProvider;
+import org.apache.stanbol.commons.namespaceprefix.impl.NamespacePrefixProviderImpl;
+import org.apache.stanbol.entityhub.servicesapi.defaults.NamespaceEnum;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import com.hp.hpl.jena.graph.Node;
+
+public class PropertyPrefixFilterTest {
+
+ private static final String FB = "http://rdf.freebase.com/ns/";
+
+ private static final String TEST_CONFIG = "prefix.config";
+
+
+ private static NamespacePrefixProvider nsPrefixProvider;
+
+ private static final Map<String,String> nsMappings = new HashMap<String,String>();
+ static {
+ nsMappings.put("fb", FB);
+ nsMappings.put("rdf", NamespaceEnum.rdf.getNamespace());
+ nsMappings.put("rdfs", NamespaceEnum.rdfs.getNamespace());
+ nsMappings.put("skos", NamespaceEnum.skos.getNamespace());
+ }
+
+ private static List<String> configLines;
+
+ private RdfImportFilter importFilter;
+
+ @BeforeClass
+ public static void init() throws IOException{
+ nsPrefixProvider = new NamespacePrefixProviderImpl(nsMappings);
+ InputStream in = PropertyPrefixFilterTest.class.getClassLoader().getResourceAsStream(TEST_CONFIG);
+ Assert.assertNotNull("Unable to read test config",in);
+ configLines = (List<String>)IOUtils.readLines(in, "UTF-8");
+ }
+
+ @Before
+ public void createImportFilter(){
+ importFilter = new PropertyPrefixFilter(nsPrefixProvider, configLines);
+ }
+
+ @Test
+ public void testMappings(){
+ Node subject = Node.createURI("urn:subject");
+ Node value = Node.createURI("urn:value");
+
+ Node rdfType = Node.createURI(NamespaceEnum.rdf+"type");
+ Assert.assertTrue(importFilter.accept(subject,rdfType,value));
+
+ Node rdfsLabel = Node.createURI(NamespaceEnum.rdfs+"label");
+ Assert.assertTrue(importFilter.accept(subject,rdfsLabel,value));
+
+ Node guid = Node.createURI(FB+"type.object.guid");
+ Assert.assertFalse(importFilter.accept(subject,guid,value));
+
+ Node permission = Node.createURI(FB+"type.object.permission");
+ Assert.assertFalse(importFilter.accept(subject,permission,value));
+
+ Node name = Node.createURI(FB+"type.object.name");
+ Assert.assertTrue(importFilter.accept(subject,name,value));
+
+ Node description = Node.createURI(FB+"type.object.description");
+ Assert.assertTrue(importFilter.accept(subject,description,value));
+
+ Node dummy = Node.createURI(FB+"type.dummy");
+ Assert.assertFalse(importFilter.accept(subject,dummy,value));
+
+ Node typePlain = Node.createURI(FB+"type");
+ Assert.assertFalse(importFilter.accept(subject,typePlain,value));
+
+ Node other = Node.createURI(NamespaceEnum.cc+"license");
+ Assert.assertFalse(importFilter.accept(subject,other,value));
+ }
+
+}
Propchange:
stanbol/trunk/entityhub/indexing/source/jenatdb/src/test/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/PropertyPrefixFilterTest.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added:
stanbol/trunk/entityhub/indexing/source/jenatdb/src/test/resources/prefix.config
URL:
http://svn.apache.org/viewvc/stanbol/trunk/entityhub/indexing/source/jenatdb/src/test/resources/prefix.config?rev=1467865&view=auto
==============================================================================
--- stanbol/trunk/entityhub/indexing/source/jenatdb/src/test/resources/prefix.config (added)
+++ stanbol/trunk/entityhub/indexing/source/jenatdb/src/test/resources/prefix.config Mon Apr 15 06:47:13 2013
@@ -0,0 +1,11 @@
+
+# all from rdf and rdfs namespace
+rdf:*
+rdfs:*
+# exclude some specific type.object properties
+!fb:type.object.guid
+!fb:type.object.permission
+# import all type.object (other than excluded)
+fb:type.object
+# exclude all type properties (other than type.object)
+!fb:type