Author: rwesten
Date: Wed Feb 20 12:44:56 2013
New Revision: 1448128

URL: http://svn.apache.org/r1448128
Log:
STANBOL-947: The TikaEngine now supports adding unmapped properties to the 
Metadata of the ContentItem. Currently only properties with a 
{prefix}:{localname} syntax are considered. Other properties are still ignored. 
Also added a unit test for the new feature.

Modified:
    
stanbol/trunk/enhancement-engines/tika/src/main/java/org/apache/stanbol/enhancer/engines/tika/TikaEngine.java
    
stanbol/trunk/enhancement-engines/tika/src/main/java/org/apache/stanbol/enhancer/engines/tika/metadata/OntologyMappings.java
    
stanbol/trunk/enhancement-engines/tika/src/test/java/org/apache/stanbol/enhancer/engines/tika/TikaEngineTest.java

Modified: 
stanbol/trunk/enhancement-engines/tika/src/main/java/org/apache/stanbol/enhancer/engines/tika/TikaEngine.java
URL: 
http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/tika/src/main/java/org/apache/stanbol/enhancer/engines/tika/TikaEngine.java?rev=1448128&r1=1448127&r2=1448128&view=diff
==============================================================================
--- 
stanbol/trunk/enhancement-engines/tika/src/main/java/org/apache/stanbol/enhancer/engines/tika/TikaEngine.java
 (original)
+++ 
stanbol/trunk/enhancement-engines/tika/src/main/java/org/apache/stanbol/enhancer/engines/tika/TikaEngine.java
 Wed Feb 20 12:44:56 2013
@@ -39,10 +39,14 @@ import java.security.PrivilegedException
 import java.util.Arrays;
 import java.util.Collections;
 import java.util.Dictionary;
+import java.util.HashSet;
 import java.util.Map;
 import java.util.Set;
 
+import org.apache.clerezza.rdf.core.MGraph;
 import org.apache.clerezza.rdf.core.UriRef;
+import org.apache.clerezza.rdf.core.impl.PlainLiteralImpl;
+import org.apache.clerezza.rdf.core.impl.TripleImpl;
 import org.apache.commons.io.IOUtils;
 import org.apache.felix.scr.annotations.Component;
 import org.apache.felix.scr.annotations.Properties;
@@ -63,6 +67,7 @@ import org.apache.stanbol.enhancer.servi
 import org.apache.tika.config.TikaConfig;
 import org.apache.tika.detect.Detector;
 import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.MSOffice;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.AutoDetectParser;
@@ -96,7 +101,8 @@ import org.xml.sax.SAXException;
     
@Property(name=TikaEngine.MAPPING_NEPOMUK_EXIF,boolValue=TikaEngine.DEFAULT_MAPPING_NEPOMUK_EXIF_STATE),
     
@Property(name=TikaEngine.MAPPING_SKOS,boolValue=TikaEngine.DEFAULT_MAPPING_SKOS_STATE),
     
@Property(name=TikaEngine.MAPPING_RDFS,boolValue=TikaEngine.DEFAULT_MAPPING_RDFS_STATE),
-    
@Property(name=TikaEngine.MAPPING_GEO,boolValue=TikaEngine.DEFAULT_MAPPING_GEO_STATE)
+    
@Property(name=TikaEngine.MAPPING_GEO,boolValue=TikaEngine.DEFAULT_MAPPING_GEO_STATE),
+    
@Property(name=TikaEngine.UNMAPPED_PROPERTIES,boolValue=TikaEngine.DEFAULT_UNMAPPED_PROPERTIES_STATE)
 })
 public class TikaEngine 
         extends AbstractEnhancementEngine<RuntimeException,RuntimeException> 
@@ -120,9 +126,17 @@ public class TikaEngine 
     public static final String MAPPING_GEO = "stanbol.engine.tika.mapping.geo";
     public static final boolean DEFAULT_MAPPING_GEO_STATE = true;
     
+    public static final String UNMAPPED_PROPERTIES = 
"stanbol.engine.tika.mapping.unmapped";
+    public static final boolean DEFAULT_UNMAPPED_PROPERTIES_STATE = false;
+    
     public static final boolean DEFAULT_SKIP_LINEBREAKS = false;
     
     private boolean skipLinebreaks = DEFAULT_SKIP_LINEBREAKS;
+    
+    /**
+     * This prefix is used as prefix for Tika properties to ensure valid URN. 
+     */
+    public static final String TIKA_URN_PREFIX = "urn:tika.apache.org:tika:";
     /**
      * The default value for the Execution of this Engine. Currently set to
      * {@link ServiceProperties#ORDERING_PRE_PROCESSING}
@@ -142,6 +156,18 @@ public class TikaEngine 
      */
     @Reference
     private ContentItemFactory ciFactory;
+
+    /**
+     * If <code>true</code> unmapped properties are added by using
+     * <code>urn:tika.apache.org:tika:{property-name}</code> to the URI of the
+     * contentItem.
+     */
+    private boolean includeUnmappedProperties;
+    
+    /**
+     * Include also properties without a namespace. Currently those are ignored
+     */
+    private boolean includeAllUnmappedProperties = false;
     
     private static class MediaTypeAndStream {
         String uri;
@@ -261,7 +287,23 @@ public class TikaEngine 
             }
             ci.getLock().writeLock().lock();
             try {
-                ontologyMappings.apply(ci.getMetadata(), ci.getUri(), 
metadata);
+                MGraph graph = ci.getMetadata();
+                UriRef id = ci.getUri();
+                Set<String> mapped = ontologyMappings.apply(graph, id, 
metadata);
+                if(includeUnmappedProperties){
+                    Set<String> unmapped = new 
HashSet<String>(Arrays.asList(metadata.names()));
+                    unmapped.removeAll(mapped);
+                    for(String name : unmapped){
+                        if(name.indexOf(':') >=0 || 
includeAllUnmappedProperties){ //only mapped
+                            UriRef prop = new UriRef(new 
StringBuilder(TIKA_URN_PREFIX).append(name).toString());
+                            for(String value : metadata.getValues(name)){
+                                //TODO: without the Property for the name we 
have no datatype
+                                //      information ... so we add 
PlainLiterals for now
+                                graph.add(new TripleImpl(id, prop, new 
PlainLiteralImpl(value)));
+                            }
+                        }
+                    }
+                }
             }finally{
                 ci.getLock().writeLock().unlock();
             }
@@ -351,6 +393,8 @@ public class TikaEngine 
             MAPPING_GEO, DEFAULT_MAPPING_GEO_STATE)){
             addGeoMappings(ontologyMappings);
         }
+        includeUnmappedProperties = getBoolean(ctx.getProperties(), 
+            UNMAPPED_PROPERTIES, DEFAULT_UNMAPPED_PROPERTIES_STATE);
     }
     @Override
     protected void deactivate(ComponentContext ctx) throws RuntimeException {

Modified: 
stanbol/trunk/enhancement-engines/tika/src/main/java/org/apache/stanbol/enhancer/engines/tika/metadata/OntologyMappings.java
URL: 
http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/tika/src/main/java/org/apache/stanbol/enhancer/engines/tika/metadata/OntologyMappings.java?rev=1448128&r1=1448127&r2=1448128&view=diff
==============================================================================
--- 
stanbol/trunk/enhancement-engines/tika/src/main/java/org/apache/stanbol/enhancer/engines/tika/metadata/OntologyMappings.java
 (original)
+++ 
stanbol/trunk/enhancement-engines/tika/src/main/java/org/apache/stanbol/enhancer/engines/tika/metadata/OntologyMappings.java
 Wed Feb 20 12:44:56 2013
@@ -45,8 +45,12 @@ import org.apache.tika.metadata.HttpHead
 import org.apache.tika.metadata.MSOffice;
 import org.apache.tika.metadata.Message;
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Office;
+import org.apache.tika.metadata.OfficeOpenXMLCore;
+import org.apache.tika.metadata.OfficeOpenXMLExtended;
 import org.apache.tika.metadata.TIFF;
 import org.apache.tika.metadata.XMPDM;
+import org.apache.tika.parser.opendocument.OpenOfficeParser;
 
 /**
  * Defines mappings for keys used by Apache Tika in the {@link Metadata} to
@@ -165,12 +169,12 @@ public class OntologyMappings implements
         String dc = NamespaceEnum.dc.getNamespace();
         mappings.addMapping(
             new PropertyMapping(dc+"contributor",
-                DublinCore.CONTRIBUTOR.getName(),MSOffice.LAST_AUTHOR));
+                
DublinCore.CONTRIBUTOR.getName(),Office.LAST_AUTHOR.getName()));
         mappings.addMapping(
             new PropertyMapping(dc+"coverage",DublinCore.COVERAGE.getName()));
         mappings.addMappings(
             new PropertyMapping(dc+"creator",
-                
DublinCore.CREATOR.getName(),MSOffice.AUTHOR,"initial-creator"));
+                
DublinCore.CREATOR.getName(),Office.AUTHOR.getName(),"initial-creator"));
         mappings.addMappings( 
             new 
PropertyMapping(dc+"description",DublinCore.DESCRIPTION.getName()));
         mappings.addMappings( 
@@ -195,17 +199,22 @@ public class OntologyMappings implements
             new PropertyMapping(dc+"source",DublinCore.SOURCE.getName()));
         mappings.addMappings( 
             new PropertyMapping(dc+"subject",
-                DublinCore.SUBJECT.getName(),MSOffice.KEYWORDS));
+                DublinCore.SUBJECT.getName(),Office.KEYWORDS.getName()));
         mappings.addMappings( 
             new PropertyMapping(dc+"title",DublinCore.TITLE.getName()));
         mappings.addMappings( 
             new PropertyMapping(dc+"type",DublinCore.TYPE.getName()));
         mappings.addMappings( 
             new 
PropertyMapping(dc+"date",XSD.dateTime,DublinCore.DATE.getName()));
+        mappings.addMappings(
+            new PropertyMapping(dc+"created",XSD.dateTime,
+                DublinCore.CREATED.getName(),"created"));
         //MS Office -> DC
+        mappings.addMappings( 
+            new 
PropertyMapping(dc+"title",OfficeOpenXMLCore.SUBJECT.getName()));
         mappings.addMappings(
-            new PropertyMapping(NamespaceEnum.dc+"created",XSD.dateTime,
-                MSOffice.CREATION_DATE.getName(),"created"));
+            new PropertyMapping(dc+"created",XSD.dateTime,
+                Office.CREATION_DATE.getName(),"created"));
         
     }
     public static void addMediaResourceOntologyMappings(OntologyMappings 
mappings){
@@ -464,14 +473,25 @@ public class OntologyMappings implements
         this.mappings.remove(property);
     }
     
-    public void apply(MGraph graph, UriRef context, Metadata metadata){
+    /**
+     * Applies the registered Ontology Mappings to the parsed metadata and
+     * context. Mappings are added to the parsed Graph
+     * @param graph
+     * @param context
+     * @param metadata
+     * @return Set containing the names of mapped keys
+     */
+    public Set<String> apply(MGraph graph, UriRef context, Metadata metadata){
         Set<String> keys = new 
HashSet<String>(Arrays.asList(metadata.names()));
+        Set<String> mappedKeys = new HashSet<String>();
         for(Mapping mapping : this){
             if(mapping.getMappedTikaProperties().isEmpty() ||
                     !disjoint(keys, mapping.getMappedTikaProperties())){
                 mapping.apply(graph, context, metadata);
+                mappedKeys.addAll(mapping.getMappedTikaProperties());
             }
         }
+        return mappedKeys;
     }
     @Override
     public Iterator<Mapping> iterator() {

Modified: 
stanbol/trunk/enhancement-engines/tika/src/test/java/org/apache/stanbol/enhancer/engines/tika/TikaEngineTest.java
URL: 
http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/tika/src/test/java/org/apache/stanbol/enhancer/engines/tika/TikaEngineTest.java?rev=1448128&r1=1448127&r2=1448128&view=diff
==============================================================================
--- 
stanbol/trunk/enhancement-engines/tika/src/test/java/org/apache/stanbol/enhancer/engines/tika/TikaEngineTest.java
 (original)
+++ 
stanbol/trunk/enhancement-engines/tika/src/test/java/org/apache/stanbol/enhancer/engines/tika/TikaEngineTest.java
 Wed Feb 20 12:44:56 2013
@@ -47,6 +47,7 @@ import java.util.regex.Pattern;
 
 import org.apache.clerezza.rdf.core.Literal;
 import org.apache.clerezza.rdf.core.LiteralFactory;
+import org.apache.clerezza.rdf.core.MGraph;
 import org.apache.clerezza.rdf.core.NonLiteral;
 import org.apache.clerezza.rdf.core.PlainLiteral;
 import org.apache.clerezza.rdf.core.Resource;
@@ -93,6 +94,8 @@ public class TikaEngineTest {
     public static void setUpServices() throws IOException {
         context = new MockComponentContext();
         context.properties.put(TikaEngine.PROPERTY_NAME, "tika");
+        //to test unmapped properties
+        context.properties.put(TikaEngine.UNMAPPED_PROPERTIES, "true");
     }
 
     @Before
@@ -490,6 +493,24 @@ public class TikaEngineTest {
         verifyValues(ci, new UriRef(NamespaceEnum.dc+"subject"), null, 
"serbor","moscow-birds","canon-55-250");
     }
     
+    /**
+     * Tests unmapped properties as added by <a 
href="https://issues.apache.org/jira/browse/STANBOL-947">
+     * STANBOL-947</a>
+     * @throws EngineException
+     * @throws IOException
+     * @throws ParseException 
+     */
+    @Test
+    public void testUnmappedProperties() throws EngineException, IOException, 
ParseException {
+        log.info(">>> testUnmappedProperties <<<");
+        //reuses the MP4 audio test file, which carries xmpDM metadata
+        ContentItem ci = createContentItem("testMP4.m4a", "audio/mp4");
+        assertFalse(engine.canEnhance(ci) == CANNOT_ENHANCE);
+        engine.computeEnhancements(ci);
+        //test that the "xmpDM:logComment" is present
+        verifyValue(ci, new 
UriRef("urn:tika.apache.org:tika:xmpDM:logComment"), null,"Test Comments");
+    }
+    
     @Test
     public void testContentTypeDetection() throws EngineException, IOException 
{
         log.info(">>> testContentTypeDetection <<<");


Reply via email to