Author: rwesten
Date: Wed Feb 20 12:44:56 2013
New Revision: 1448128
URL: http://svn.apache.org/r1448128
Log:
STANBOL-947: The TikaEngine now supports adding unmapped properties to the
Metadata of the ContentItem. Currently only properties with a
{prefix}:{localname} syntax are considered. Other properties are still ignored.
Also added a unit test for the new feature.
Modified:
stanbol/trunk/enhancement-engines/tika/src/main/java/org/apache/stanbol/enhancer/engines/tika/TikaEngine.java
stanbol/trunk/enhancement-engines/tika/src/main/java/org/apache/stanbol/enhancer/engines/tika/metadata/OntologyMappings.java
stanbol/trunk/enhancement-engines/tika/src/test/java/org/apache/stanbol/enhancer/engines/tika/TikaEngineTest.java
Modified:
stanbol/trunk/enhancement-engines/tika/src/main/java/org/apache/stanbol/enhancer/engines/tika/TikaEngine.java
URL:
http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/tika/src/main/java/org/apache/stanbol/enhancer/engines/tika/TikaEngine.java?rev=1448128&r1=1448127&r2=1448128&view=diff
==============================================================================
---
stanbol/trunk/enhancement-engines/tika/src/main/java/org/apache/stanbol/enhancer/engines/tika/TikaEngine.java
(original)
+++
stanbol/trunk/enhancement-engines/tika/src/main/java/org/apache/stanbol/enhancer/engines/tika/TikaEngine.java
Wed Feb 20 12:44:56 2013
@@ -39,10 +39,14 @@ import java.security.PrivilegedException
import java.util.Arrays;
import java.util.Collections;
import java.util.Dictionary;
+import java.util.HashSet;
import java.util.Map;
import java.util.Set;
+import org.apache.clerezza.rdf.core.MGraph;
import org.apache.clerezza.rdf.core.UriRef;
+import org.apache.clerezza.rdf.core.impl.PlainLiteralImpl;
+import org.apache.clerezza.rdf.core.impl.TripleImpl;
import org.apache.commons.io.IOUtils;
import org.apache.felix.scr.annotations.Component;
import org.apache.felix.scr.annotations.Properties;
@@ -63,6 +67,7 @@ import org.apache.stanbol.enhancer.servi
import org.apache.tika.config.TikaConfig;
import org.apache.tika.detect.Detector;
import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.MSOffice;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AutoDetectParser;
@@ -96,7 +101,8 @@ import org.xml.sax.SAXException;
@Property(name=TikaEngine.MAPPING_NEPOMUK_EXIF,boolValue=TikaEngine.DEFAULT_MAPPING_NEPOMUK_EXIF_STATE),
@Property(name=TikaEngine.MAPPING_SKOS,boolValue=TikaEngine.DEFAULT_MAPPING_SKOS_STATE),
@Property(name=TikaEngine.MAPPING_RDFS,boolValue=TikaEngine.DEFAULT_MAPPING_RDFS_STATE),
-
@Property(name=TikaEngine.MAPPING_GEO,boolValue=TikaEngine.DEFAULT_MAPPING_GEO_STATE)
+
@Property(name=TikaEngine.MAPPING_GEO,boolValue=TikaEngine.DEFAULT_MAPPING_GEO_STATE),
+
@Property(name=TikaEngine.UNMAPPED_PROPERTIES,boolValue=TikaEngine.DEFAULT_UNMAPPED_PROPERTIES_STATE)
})
public class TikaEngine
extends AbstractEnhancementEngine<RuntimeException,RuntimeException>
@@ -120,9 +126,17 @@ public class TikaEngine
public static final String MAPPING_GEO = "stanbol.engine.tika.mapping.geo";
public static final boolean DEFAULT_MAPPING_GEO_STATE = true;
+ public static final String UNMAPPED_PROPERTIES =
"stanbol.engine.tika.mapping.unmapped";
+ public static final boolean DEFAULT_UNMAPPED_PROPERTIES_STATE = false;
+
public static final boolean DEFAULT_SKIP_LINEBREAKS = false;
private boolean skipLinebreaks = DEFAULT_SKIP_LINEBREAKS;
+
+ /**
+ * Prefix prepended to the names of Tika properties to ensure a valid URN.
+ */
+ public static final String TIKA_URN_PREFIX = "urn:tika.apache.org:tika:";
/**
* The default value for the Execution of this Engine. Currently set to
* {@link ServiceProperties#ORDERING_PRE_PROCESSING}
@@ -142,6 +156,18 @@ public class TikaEngine
*/
@Reference
private ContentItemFactory ciFactory;
+
+ /**
+ * If <code>true</code>, unmapped properties are added to the metadata of the
+ * contentItem, using <code>urn:tika.apache.org:tika:{property-name}</code> as
+ * the property and the URI of the contentItem as the subject.
+ */
+ private boolean includeUnmappedProperties;
+
+ /**
+ * Also include properties without a namespace. Currently those are ignored.
+ */
+ private boolean includeAllUnmappedProperties = false;
private static class MediaTypeAndStream {
String uri;
@@ -261,7 +287,23 @@ public class TikaEngine
}
ci.getLock().writeLock().lock();
try {
- ontologyMappings.apply(ci.getMetadata(), ci.getUri(),
metadata);
+ MGraph graph = ci.getMetadata();
+ UriRef id = ci.getUri();
+ Set<String> mapped = ontologyMappings.apply(graph, id,
metadata);
+ if(includeUnmappedProperties){
+ Set<String> unmapped = new
HashSet<String>(Arrays.asList(metadata.names()));
+ unmapped.removeAll(mapped);
+ for(String name : unmapped){
+ if(name.indexOf(':') >=0 ||
includeAllUnmappedProperties){ //only mapped
+ UriRef prop = new UriRef(new
StringBuilder(TIKA_URN_PREFIX).append(name).toString());
+ for(String value : metadata.getValues(name)){
+ //TODO: without the Property for the name we
have no datatype
+ // information ... so we add
PlainLiterals for now
+ graph.add(new TripleImpl(id, prop, new
PlainLiteralImpl(value)));
+ }
+ }
+ }
+ }
}finally{
ci.getLock().writeLock().unlock();
}
@@ -351,6 +393,8 @@ public class TikaEngine
MAPPING_GEO, DEFAULT_MAPPING_GEO_STATE)){
addGeoMappings(ontologyMappings);
}
+ includeUnmappedProperties = getBoolean(ctx.getProperties(),
+ UNMAPPED_PROPERTIES, DEFAULT_UNMAPPED_PROPERTIES_STATE);
}
@Override
protected void deactivate(ComponentContext ctx) throws RuntimeException {
Modified:
stanbol/trunk/enhancement-engines/tika/src/main/java/org/apache/stanbol/enhancer/engines/tika/metadata/OntologyMappings.java
URL:
http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/tika/src/main/java/org/apache/stanbol/enhancer/engines/tika/metadata/OntologyMappings.java?rev=1448128&r1=1448127&r2=1448128&view=diff
==============================================================================
---
stanbol/trunk/enhancement-engines/tika/src/main/java/org/apache/stanbol/enhancer/engines/tika/metadata/OntologyMappings.java
(original)
+++
stanbol/trunk/enhancement-engines/tika/src/main/java/org/apache/stanbol/enhancer/engines/tika/metadata/OntologyMappings.java
Wed Feb 20 12:44:56 2013
@@ -45,8 +45,12 @@ import org.apache.tika.metadata.HttpHead
import org.apache.tika.metadata.MSOffice;
import org.apache.tika.metadata.Message;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Office;
+import org.apache.tika.metadata.OfficeOpenXMLCore;
+import org.apache.tika.metadata.OfficeOpenXMLExtended;
import org.apache.tika.metadata.TIFF;
import org.apache.tika.metadata.XMPDM;
+import org.apache.tika.parser.opendocument.OpenOfficeParser;
/**
* Defines mappings for keys used by Apache Tika in the {@link Metadata} to
@@ -165,12 +169,12 @@ public class OntologyMappings implements
String dc = NamespaceEnum.dc.getNamespace();
mappings.addMapping(
new PropertyMapping(dc+"contributor",
- DublinCore.CONTRIBUTOR.getName(),MSOffice.LAST_AUTHOR));
+
DublinCore.CONTRIBUTOR.getName(),Office.LAST_AUTHOR.getName()));
mappings.addMapping(
new PropertyMapping(dc+"coverage",DublinCore.COVERAGE.getName()));
mappings.addMappings(
new PropertyMapping(dc+"creator",
-
DublinCore.CREATOR.getName(),MSOffice.AUTHOR,"initial-creator"));
+
DublinCore.CREATOR.getName(),Office.AUTHOR.getName(),"initial-creator"));
mappings.addMappings(
new
PropertyMapping(dc+"description",DublinCore.DESCRIPTION.getName()));
mappings.addMappings(
@@ -195,17 +199,22 @@ public class OntologyMappings implements
new PropertyMapping(dc+"source",DublinCore.SOURCE.getName()));
mappings.addMappings(
new PropertyMapping(dc+"subject",
- DublinCore.SUBJECT.getName(),MSOffice.KEYWORDS));
+ DublinCore.SUBJECT.getName(),Office.KEYWORDS.getName()));
mappings.addMappings(
new PropertyMapping(dc+"title",DublinCore.TITLE.getName()));
mappings.addMappings(
new PropertyMapping(dc+"type",DublinCore.TYPE.getName()));
mappings.addMappings(
new
PropertyMapping(dc+"date",XSD.dateTime,DublinCore.DATE.getName()));
+ mappings.addMappings(
+ new PropertyMapping(dc+"created",XSD.dateTime,
+ DublinCore.CREATED.getName(),"created"));
//MS Office -> DC
+ mappings.addMappings(
+ new
PropertyMapping(dc+"title",OfficeOpenXMLCore.SUBJECT.getName()));
mappings.addMappings(
- new PropertyMapping(NamespaceEnum.dc+"created",XSD.dateTime,
- MSOffice.CREATION_DATE.getName(),"created"));
+ new PropertyMapping(dc+"created",XSD.dateTime,
+ Office.CREATION_DATE.getName(),"created"));
}
public static void addMediaResourceOntologyMappings(OntologyMappings
mappings){
@@ -464,14 +473,25 @@ public class OntologyMappings implements
this.mappings.remove(property);
}
- public void apply(MGraph graph, UriRef context, Metadata metadata){
+ /**
+ * Applies the registered Ontology Mappings to the passed metadata and
+ * context. Mappings are added to the passed Graph.
+ * @param graph
+ * @param context
+ * @param metadata
+ * @return Set containing the Tika property names of all applied mappings
+ */
+ public Set<String> apply(MGraph graph, UriRef context, Metadata metadata){
Set<String> keys = new
HashSet<String>(Arrays.asList(metadata.names()));
+ Set<String> mappedKeys = new HashSet<String>();
for(Mapping mapping : this){
if(mapping.getMappedTikaProperties().isEmpty() ||
!disjoint(keys, mapping.getMappedTikaProperties())){
mapping.apply(graph, context, metadata);
+ mappedKeys.addAll(mapping.getMappedTikaProperties());
}
}
+ return mappedKeys;
}
@Override
public Iterator<Mapping> iterator() {
Modified:
stanbol/trunk/enhancement-engines/tika/src/test/java/org/apache/stanbol/enhancer/engines/tika/TikaEngineTest.java
URL:
http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/tika/src/test/java/org/apache/stanbol/enhancer/engines/tika/TikaEngineTest.java?rev=1448128&r1=1448127&r2=1448128&view=diff
==============================================================================
---
stanbol/trunk/enhancement-engines/tika/src/test/java/org/apache/stanbol/enhancer/engines/tika/TikaEngineTest.java
(original)
+++
stanbol/trunk/enhancement-engines/tika/src/test/java/org/apache/stanbol/enhancer/engines/tika/TikaEngineTest.java
Wed Feb 20 12:44:56 2013
@@ -47,6 +47,7 @@ import java.util.regex.Pattern;
import org.apache.clerezza.rdf.core.Literal;
import org.apache.clerezza.rdf.core.LiteralFactory;
+import org.apache.clerezza.rdf.core.MGraph;
import org.apache.clerezza.rdf.core.NonLiteral;
import org.apache.clerezza.rdf.core.PlainLiteral;
import org.apache.clerezza.rdf.core.Resource;
@@ -93,6 +94,8 @@ public class TikaEngineTest {
public static void setUpServices() throws IOException {
context = new MockComponentContext();
context.properties.put(TikaEngine.PROPERTY_NAME, "tika");
+ //to test unmapped properties
+ context.properties.put(TikaEngine.UNMAPPED_PROPERTIES, "true");
}
@Before
@@ -490,6 +493,24 @@ public class TikaEngineTest {
verifyValues(ci, new UriRef(NamespaceEnum.dc+"subject"), null,
"serbor","moscow-birds","canon-55-250");
}
+ /**
+ * Tests unmapped properties as added by <a
href="https://issues.apache.org/jira/browse/STANBOL-947">
+ * STANBOL-947</a>
+ * @throws EngineException
+ * @throws IOException
+ * @throws ParseException
+ */
+ @Test
+ public void testUnmappedProperties() throws EngineException, IOException,
ParseException {
+ log.info(">>> testUnmappedProperties <<<");
+ //uses the MP4 audio test file ("testMP4.m4a")
+ ContentItem ci = createContentItem("testMP4.m4a", "audio/mp4");
+ assertFalse(engine.canEnhance(ci) == CANNOT_ENHANCE);
+ engine.computeEnhancements(ci);
+ //test that the "xmpDM:logComment" is present
+ verifyValue(ci, new
UriRef("urn:tika.apache.org:tika:xmpDM:logComment"), null,"Test Comments");
+ }
+
@Test
public void testContentTypeDetection() throws EngineException, IOException
{
log.info(">>> testContentTypeDetection <<<");