This is an automated email from the ASF dual-hosted git repository.

dklco pushed a commit to branch master
in repository 
https://gitbox.apache.org/repos/asf/sling-org-apache-sling-app-cms.git


The following commit(s) were added to refs/heads/master by this push:
     new 7982ca65 SLING-11760 - Improving the metadata parsing
7982ca65 is described below

commit 7982ca65de69e25056cb447f0b76fb118772c305
Author: Dan Klco <[email protected]>
AuthorDate: Mon Jan 23 22:31:29 2023 -0500

    SLING-11760 - Improving the metadata parsing
---
 .../core/internal/FileMetadataExtractorImpl.java   |  92 ++++++++++++---
 .../internal/FileMetadataExtractorImplTest.java    | 124 ++++++++++++++++++++-
 feature/src/main/features/cms/cms-repoinit.txt     |  34 +++++-
 3 files changed, 231 insertions(+), 19 deletions(-)

diff --git 
a/core/src/main/java/org/apache/sling/cms/core/internal/FileMetadataExtractorImpl.java
 
b/core/src/main/java/org/apache/sling/cms/core/internal/FileMetadataExtractorImpl.java
index 9313d68e..48b14744 100644
--- 
a/core/src/main/java/org/apache/sling/cms/core/internal/FileMetadataExtractorImpl.java
+++ 
b/core/src/main/java/org/apache/sling/cms/core/internal/FileMetadataExtractorImpl.java
@@ -22,10 +22,19 @@ import java.util.Calendar;
 import java.util.HashMap;
 import java.util.Map;
 
+import javax.jcr.NamespaceRegistry;
+import javax.jcr.RepositoryException;
+import javax.jcr.Session;
+
+import org.apache.commons.codec.digest.DigestUtils;
+import org.apache.commons.lang3.ArrayUtils;
+import org.apache.commons.lang3.StringUtils;
 import org.apache.jackrabbit.JcrConstants;
-import org.apache.jackrabbit.util.Text;
+import org.apache.sling.api.resource.LoginException;
 import org.apache.sling.api.resource.ModifiableValueMap;
 import org.apache.sling.api.resource.Resource;
+import org.apache.sling.api.resource.ResourceResolver;
+import org.apache.sling.api.resource.ResourceResolverFactory;
 import org.apache.sling.cms.CMSConstants;
 import org.apache.sling.cms.File;
 import org.apache.sling.cms.FileMetadataExtractor;
@@ -36,7 +45,9 @@ import org.apache.tika.parser.AutoDetectParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.sax.BodyContentHandler;
+import org.osgi.service.component.annotations.Activate;
 import org.osgi.service.component.annotations.Component;
+import org.osgi.service.component.annotations.Reference;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.xml.sax.SAXException;
@@ -46,11 +57,18 @@ public class FileMetadataExtractorImpl implements 
FileMetadataExtractor {
 
     private static final Logger log = 
LoggerFactory.getLogger(FileMetadataExtractorImpl.class);
 
+    private ResourceResolverFactory resolverFactory;
+
+    @Activate
+    public FileMetadataExtractorImpl(@Reference ResourceResolverFactory 
resolverFactory) {
+        this.resolverFactory = resolverFactory;
+    }
+
     @Override
     public Map<String, Object> extractMetadata(File file) throws IOException {
         try {
             return extractMetadata(file.getResource());
-        } catch (SAXException | TikaException e) {
+        } catch (SAXException | TikaException | RepositoryException | 
LoginException e) {
             throw new IOException("Failed to parse metadata", e);
         }
     }
@@ -80,6 +98,8 @@ public class FileMetadataExtractorImpl implements 
FileMetadataExtractor {
             }
             if (properties != null) {
                 properties.putAll(extractMetadata(file.getResource()));
+                properties.put("SHA1", generateSha1(resource));
+                resource.getResourceResolver().refresh();
                 if (metadata == null) {
                     resource.getResourceResolver().create(content, 
CMSConstants.NN_METADATA, properties);
                 }
@@ -90,30 +110,74 @@ public class FileMetadataExtractorImpl implements 
FileMetadataExtractor {
             } else {
                 throw new IOException("Unable to update metadata for " + 
resource.getPath());
             }
-        } catch (SAXException | TikaException e) {
+        } catch (SAXException | TikaException | RepositoryException | 
LoginException e) {
             throw new IOException("Failed to parse metadata", e);
         }
+    }
 
+    public String generateSha1(Resource resource) throws IOException {
+        try (InputStream is = resource.adaptTo(InputStream.class)) {
+            String sha1 = DigestUtils.sha1Hex(is);
+            log.info("Generated SHA {} for {}", sha1, resource.getPath());
+            return sha1;
+        }
     }
 
-    public Map<String, Object> extractMetadata(Resource resource) throws 
IOException, SAXException, TikaException {
+    public Map<String, Object> extractMetadata(Resource resource)
+            throws IOException, SAXException, TikaException, 
RepositoryException, LoginException {
         log.info("Extracting metadata from {}", resource.getPath());
-        InputStream is = resource.adaptTo(InputStream.class);
         Map<String, Object> properties = new HashMap<>();
-        Parser parser = new AutoDetectParser();
-        BodyContentHandler handler = new BodyContentHandler();
-        Metadata md = new Metadata();
-        ParseContext context = new ParseContext();
-        parser.parse(is, handler, md, context);
-        for (String name : md.names()) {
-            putMetadata(properties, name, md);
+        try (InputStream is = resource.adaptTo(InputStream.class)) {
+            Parser parser = new AutoDetectParser();
+            BodyContentHandler handler = new BodyContentHandler();
+            Metadata md = new Metadata();
+            ParseContext context = new ParseContext();
+            try {
+                parser.parse(is, handler, md, context);
+            } catch (SAXException se) {
+                if 
("WriteLimitReachedException".equals(se.getClass().getSimpleName())) {
+                    log.info("Write limit reached for {}", resource.getPath());
+                } else {
+                    throw se;
+                }
+            }
+
+            try (ResourceResolver adminResolver = 
resolverFactory.getAdministrativeResourceResolver(null)) {
+                NamespaceRegistry registry = 
adminResolver.adaptTo(Session.class).getWorkspace().getNamespaceRegistry();
+                for (String name : md.names()) {
+                    putMetadata(properties, name, md, registry);
+                }
+            }
         }
         return properties;
     }
 
-    private void putMetadata(Map<String, Object> properties, String name, 
Metadata metadata) {
+    protected String formatKey(String initialKey, NamespaceRegistry registry)
+            throws RepositoryException {
+        String namespace = null;
+        String key = null;
+        if (initialKey.contains(":")) {
+            namespace = StringUtils.substringBefore(initialKey, ":");
+            key = StringUtils.substringAfter(initialKey, ":");
+        } else {
+            key = initialKey;
+        }
+        key = key.replace(" ", "").replace("/", "-");
+        if (namespace != null) {
+            namespace = namespace.replace(" ", "").replace("/", "-");
+            if (!ArrayUtils.contains(registry.getPrefixes(), namespace)) {
+                registry.registerNamespace(namespace, 
"http://sling.apache.org/cms/ns/" + namespace);
+            }
+            return namespace + ":" + key;
+        } else {
+            return key;
+        }
+    }
+
+    private void putMetadata(Map<String, Object> properties, String name, 
Metadata metadata, NamespaceRegistry registry)
+            throws RepositoryException {
         log.trace("Updating property: {}", name);
-        String filtered = Text.escapeIllegalJcrChars(name);
+        String filtered = formatKey(name, registry);
         Property property = Property.get(name);
         if (property != null) {
             if (metadata.isMultiValued(property)) {
diff --git 
a/core/src/test/java/org/apache/sling/cms/core/internal/FileMetadataExtractorImplTest.java
 
b/core/src/test/java/org/apache/sling/cms/core/internal/FileMetadataExtractorImplTest.java
index 9ddf32fb..3600c69d 100644
--- 
a/core/src/test/java/org/apache/sling/cms/core/internal/FileMetadataExtractorImplTest.java
+++ 
b/core/src/test/java/org/apache/sling/cms/core/internal/FileMetadataExtractorImplTest.java
@@ -16,16 +16,31 @@
  */
 package org.apache.sling.cms.core.internal;
 
+import static org.junit.Assert.assertArrayEquals;
 import static org.junit.Assert.assertNotNull;
 import static org.junit.Assert.assertTrue;
+import static org.mockito.ArgumentMatchers.anyString;
+import static org.mockito.ArgumentMatchers.eq;
+import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.verify;
+import static org.mockito.Mockito.when;
 
 import java.io.IOException;
 import java.io.InputStream;
+import java.util.Iterator;
 import java.util.Map;
+import java.util.stream.Stream;
 
+import javax.jcr.NamespaceRegistry;
+import javax.jcr.RepositoryException;
+import javax.jcr.Session;
+import javax.jcr.Workspace;
+
+import org.apache.sling.api.resource.LoginException;
 import org.apache.sling.api.resource.Resource;
+import org.apache.sling.api.resource.ResourceResolver;
+import org.apache.sling.api.resource.ResourceResolverFactory;
 import org.apache.sling.cms.File;
-import org.apache.sling.cms.FileMetadataExtractor;
 import org.junit.Before;
 import org.junit.Test;
 import org.mockito.Mockito;
@@ -38,20 +53,38 @@ public class FileMetadataExtractorImplTest {
 
     private File file;
 
+    private NamespaceRegistry registry;
+
+    private ResourceResolverFactory resolverFactory;
+
+    private FileMetadataExtractorImpl extractor;
+
     @Before
-    public void init() {
+    public void init() throws RepositoryException, LoginException {
+
+        ResourceResolver resolver = mock(ResourceResolver.class);
 
-        Resource resource = Mockito.mock(Resource.class);
+        resolverFactory = mock(ResourceResolverFactory.class);
+        
when(resolverFactory.getAdministrativeResourceResolver(null)).thenReturn(resolver);
+        registry = mock(NamespaceRegistry.class);
+
+        Workspace workspace = mock(Workspace.class);
+        when(workspace.getNamespaceRegistry()).thenReturn(registry);
+        Session session = mock(Session.class);
+        when(session.getWorkspace()).thenReturn(workspace);
+        when(resolver.adaptTo(Session.class)).thenReturn(session);
+
+        Resource resource = mock(Resource.class);
         Mockito.when(resource.adaptTo(InputStream.class))
                 
.thenReturn(FileMetadataExtractorImplTest.class.getClassLoader().getResourceAsStream("apache.png"));
 
         file = Mockito.mock(File.class);
         Mockito.when(file.getResource()).thenReturn(resource);
+        extractor = new FileMetadataExtractorImpl(resolverFactory);
     }
 
     @Test
     public void testExtractMetadata() throws IOException {
-        FileMetadataExtractor extractor = new FileMetadataExtractorImpl();
         Map<String, Object> metadata = extractor.extractMetadata(file);
 
         assertNotNull(metadata);
@@ -59,4 +92,87 @@ public class FileMetadataExtractorImplTest {
 
         log.info("Extracted metadata: {}", metadata);
     }
+
+    @Test
+    public void testFormatKey() throws IOException, RepositoryException {
+
+        when(registry.getPrefixes()).thenReturn(new String[] { "GPS", 
"ExifSubIFD", "ExifIFD0", "tiff", "dcterms" });
+
+        String[] keys = new String[] {
+                "GPS:GPS Img Direction", "Exif SubIFD:Subject Distance Range", 
"Compression Type",
+                "Number of Components", "Component 2", "Component 1", "Exif 
IFD0:X Resolution", "tiff:ResolutionUnit",
+                "Exif SubIFD:Scene Type", "Exif SubIFD:Exposure Mode", 
"tiff:Make", "Exif SubIFD:Sharpness",
+                "Exif SubIFD:Custom Rendered", "Component 3", "Exif 
SubIFD:Components Configuration",
+                "Exif SubIFD:Metering Mode", "Exif SubIFD:White Balance Mode", 
"tiff:BitsPerSample",
+                "Exif SubIFD:Sub-Sec Time Original", "meta:creation-date", 
"Creation-Date", "tiff:Orientation",
+                "tiff:Software", "geo:long", "Exif SubIFD:Digital Zoom Ratio", 
"tiff:YResolution", "Y Resolution",
+                "Exif SubIFD:Flash", "Thumbnail Height Pixels", 
"Last-Modified", "Exif SubIFD:Sub-Sec Time",
+                "exif:ExposureTime", "File Size", "Exif SubIFD:Exif Version", 
"GPS:GPS Img Direction Ref",
+                "Exif SubIFD:Focal Length", "Exif IFD0:Resolution Unit", "Exif 
SubIFD:Lens Model",
+                "Exif SubIFD:Date/Time Original", "Exif SubIFD:Sub-Sec Time 
Digitized", "Resolution Units",
+                "File Modified Date", "Exif SubIFD:Sensing Method", "Image 
Height", "Thumbnail Width Pixels",
+                "GPS:GPS Longitude", "Exif SubIFD:Time Zone Original", 
"GPS:GPS Longitude Ref", "tiff:Model",
+                "Exif SubIFD:Brightness Value", "exif:IsoSpeedRatings", "Exif 
SubIFD:Exposure Program",
+                "Exif IFD0:Make", "GPS:GPS Altitude Ref", "Exif 
SubIFD:Aperture Value",
+                "Exif SubIFD:Date/Time Digitized", "tiff:ImageWidth", "GPS:GPS 
Altitude", "Exif IFD0:Y Resolution",
+                "date", "Exif SubIFD:ISO Speed Ratings", "Number of Tables", 
"Exif SubIFD:Time Zone Digitized",
+                "Exif SubIFD:Exif Image Width", "Exif SubIFD:Contrast", "X 
Resolution",
+                "Exif SubIFD:Exposure Bias Value", "Exif SubIFD:Saturation", 
"modified", "exif:FNumber",
+                "Exif SubIFD:Shutter Speed Value", "Exif IFD0:Orientation", 
"Exif SubIFD:F-Number", "exif:FocalLength",
+                "Exif IFD0:Software", "Exif IFD0:Date/Time", "Exif 
SubIFD:Scene Capture Type", "Exif SubIFD:Time Zone",
+                "geo:lat", "Data Precision", "tiff:ImageLength", "Exif 
IFD0:Model", "dcterms:created",
+                "dcterms:modified", "exif:Flash", "Last-Save-Date", "Exif 
SubIFD:Color Space",
+                "Exif SubIFD:Focal Length 35", "Exif SubIFD:Exposure Time", 
"meta:save-date", "File Name",
+                "GPS:GPS Latitude Ref", "Content-Type", "X-Parsed-By", "Exif 
SubIFD:Max Aperture Value",
+                "tiff:XResolution", "exif:DateTimeOriginal", "Exif 
SubIFD:Subject Distance",
+                "Exif SubIFD:FlashPix Version", "Exif SubIFD:Exif Image 
Height", "Image Width", "GPS:GPS Latitude",
+                "Exif SubIFD:Lens Make", "GPS:GPS Date Stamp" };
+
+        String[] formatted = new String[keys.length];
+        for (int i = 0; i < keys.length; i++) {
+            formatted[i] = extractor.formatKey(keys[i], registry);
+        }
+
+        String[] expected = new String[] {
+                "GPS:GPSImgDirection", "ExifSubIFD:SubjectDistanceRange", 
"CompressionType",
+                "NumberofComponents", "Component2", "Component1", 
"ExifIFD0:XResolution", "tiff:ResolutionUnit",
+                "ExifSubIFD:SceneType", "ExifSubIFD:ExposureMode", 
"tiff:Make", "ExifSubIFD:Sharpness",
+                "ExifSubIFD:CustomRendered", "Component3", 
"ExifSubIFD:ComponentsConfiguration",
+                "ExifSubIFD:MeteringMode", "ExifSubIFD:WhiteBalanceMode", 
"tiff:BitsPerSample",
+                "ExifSubIFD:Sub-SecTimeOriginal", "meta:creation-date", 
"Creation-Date", "tiff:Orientation",
+                "tiff:Software", "geo:long", "ExifSubIFD:DigitalZoomRatio", 
"tiff:YResolution", "YResolution",
+                "ExifSubIFD:Flash", "ThumbnailHeightPixels", "Last-Modified", 
"ExifSubIFD:Sub-SecTime",
+                "exif:ExposureTime", "FileSize", "ExifSubIFD:ExifVersion", 
"GPS:GPSImgDirectionRef",
+                "ExifSubIFD:FocalLength", "ExifIFD0:ResolutionUnit", 
"ExifSubIFD:LensModel",
+                "ExifSubIFD:Date-TimeOriginal", 
"ExifSubIFD:Sub-SecTimeDigitized", "ResolutionUnits",
+                "FileModifiedDate", "ExifSubIFD:SensingMethod", "ImageHeight", 
"ThumbnailWidthPixels",
+                "GPS:GPSLongitude", "ExifSubIFD:TimeZoneOriginal", 
"GPS:GPSLongitudeRef", "tiff:Model",
+                "ExifSubIFD:BrightnessValue", "exif:IsoSpeedRatings", 
"ExifSubIFD:ExposureProgram",
+                "ExifIFD0:Make", "GPS:GPSAltitudeRef", 
"ExifSubIFD:ApertureValue",
+                "ExifSubIFD:Date-TimeDigitized", "tiff:ImageWidth", 
"GPS:GPSAltitude", "ExifIFD0:YResolution",
+                "date", "ExifSubIFD:ISOSpeedRatings", "NumberofTables", 
"ExifSubIFD:TimeZoneDigitized",
+                "ExifSubIFD:ExifImageWidth", "ExifSubIFD:Contrast", 
"XResolution",
+                "ExifSubIFD:ExposureBiasValue", "ExifSubIFD:Saturation", 
"modified", "exif:FNumber",
+                "ExifSubIFD:ShutterSpeedValue", "ExifIFD0:Orientation", 
"ExifSubIFD:F-Number", "exif:FocalLength",
+                "ExifIFD0:Software", "ExifIFD0:Date-Time", 
"ExifSubIFD:SceneCaptureType", "ExifSubIFD:TimeZone",
+                "geo:lat", "DataPrecision", "tiff:ImageLength", 
"ExifIFD0:Model", "dcterms:created",
+                "dcterms:modified", "exif:Flash", "Last-Save-Date", 
"ExifSubIFD:ColorSpace",
+                "ExifSubIFD:FocalLength35", "ExifSubIFD:ExposureTime", 
"meta:save-date", "FileName",
+                "GPS:GPSLatitudeRef", "Content-Type", "X-Parsed-By", 
"ExifSubIFD:MaxApertureValue",
+                "tiff:XResolution", "exif:DateTimeOriginal", 
"ExifSubIFD:SubjectDistance",
+                "ExifSubIFD:FlashPixVersion", "ExifSubIFD:ExifImageHeight", 
"ImageWidth", "GPS:GPSLatitude",
+                "ExifSubIFD:LensMake", "GPS:GPSDateStamp" };
+        assertArrayEquals(expected, formatted);
+    }
+
+    @Test
+    public void willRegisterNamespace() throws RepositoryException {
+        when(registry.getPrefixes()).thenReturn(new String[] { "GPS" });
+        Iterator<String> keys = Stream
+                .of("GPS:GPSLatitudeRef", "Content-Type", "X-Parsed-By", 
"ExifSubIFD:MaxApertureValue").iterator();
+        while (keys.hasNext()) {
+            extractor.formatKey(keys.next(), registry);
+        }
+        verify(registry).registerNamespace(eq("ExifSubIFD"), anyString());
+    }
 }
diff --git a/feature/src/main/features/cms/cms-repoinit.txt 
b/feature/src/main/features/cms/cms-repoinit.txt
index 53187c86..03ed22a0 100644
--- a/feature/src/main/features/cms/cms-repoinit.txt
+++ b/feature/src/main/features/cms/cms-repoinit.txt
@@ -89,4 +89,36 @@ create service user sling-cms-versionmgr
 set ACL for sling-cms-versionmgr
     allow   jcr:write,jcr:nodeTypeManagement,jcr:versionManagement    on /
     allow   jcr:read    on /jcr:system/jcr:versionStorage
-end
\ No newline at end of file
+end
+
+# Namespaces
+register namespace (acdsee) "http://ns.acdsee.com/iptc/1.0/"
+register namespace (album) "http://ns.adobe.com/album/1.0/"
+register namespace (cc) "http://creativecommons.org/ns#"
+register namespace (crs) "http://ns.adobe.com/camera-raw-settings/1.0/"
+register namespace (dc) "http://purl.org/dc/elements/1.1/"
+register namespace (dcterms) "http://purl.org/dc/terms/"
+register namespace (Iptc4xmpCore) "http://iptc.org/std/Iptc4xmpCore/1.0/xmlns/"
+register namespace (lr) "http://ns.adobe.com/lightroom/1.0/"
+register namespace (mediapro) "http://ns.iview-multimedia.com/mediapro/1.0/"
+register namespace (MP) "http://ns.microsoft.com/photo/1.2/"
+register namespace (photoshop) "http://ns.adobe.com/photoshop/1.0/"
+register namespace (plus) "http://ns.useplus.org/ldf/xmp/1.0/"
+register namespace (pdf) "http://ns.adobe.com/pdf/1.3/"
+register namespace (pdfx) "http://ns.adobe.com/pdfx/1.3/"
+register namespace (prism) "http://prismstandard.org/namespaces/basic/2.1/"
+register namespace (prl) "http://prismstandard.org/namespaces/prl/2.1/"
+register namespace (psAux) "http://ns.adobe.com/exif/1.0/aux/"
+register namespace (rdf) "http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+register namespace (stEvt) "http://ns.adobe.com/xap/1.0/sType/ResourceEvent#"
+register namespace (stEvt) "http://ns.adobe.com/xap/1.0/sType/ResourceEvent#"
+register namespace (tiff) "http://ns.adobe.com/tiff/1.0/"
+register namespace (xmp) "http://ns.adobe.com/xap/1.0/"
+register namespace (xmpBJ) "http://ns.adobe.com/xap/1.0/bj/"
+register namespace (xmpDM) "http://ns.adobe.com/xmp/1.0/DynamicMedia/"
+register namespace (xmpMM) "http://ns.adobe.com/xap/1.0/mm/"
+register namespace (xmpNote) "http://ns.adobe.com/xmp/note/"
+register namespace (xmpPLUS) "http://ns.adobe.com/xap/1.0/PLUS/"
+register namespace (xmpRights) "http://ns.adobe.com/xap/1.0/rights/"
+register namespace (xmpTPg) "http://ns.adobe.com/xap/1.0/t/pg/"
+

Reply via email to