This is an automated email from the ASF dual-hosted git repository.
dklco pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/sling-org-apache-sling-app-cms.git
The following commit(s) were added to refs/heads/master by this push:
new 7982ca65 SLING-11760 - Improving the metadata parsing
7982ca65 is described below
commit 7982ca65de69e25056cb447f0b76fb118772c305
Author: Dan Klco <[email protected]>
AuthorDate: Mon Jan 23 22:31:29 2023 -0500
SLING-11760 - Improving the metadata parsing
---
.../core/internal/FileMetadataExtractorImpl.java | 92 ++++++++++++---
.../internal/FileMetadataExtractorImplTest.java | 124 ++++++++++++++++++++-
feature/src/main/features/cms/cms-repoinit.txt | 34 +++++-
3 files changed, 231 insertions(+), 19 deletions(-)
diff --git a/core/src/main/java/org/apache/sling/cms/core/internal/FileMetadataExtractorImpl.java b/core/src/main/java/org/apache/sling/cms/core/internal/FileMetadataExtractorImpl.java
index 9313d68e..48b14744 100644
--- a/core/src/main/java/org/apache/sling/cms/core/internal/FileMetadataExtractorImpl.java
+++ b/core/src/main/java/org/apache/sling/cms/core/internal/FileMetadataExtractorImpl.java
@@ -22,10 +22,19 @@ import java.util.Calendar;
import java.util.HashMap;
import java.util.Map;
+import javax.jcr.NamespaceRegistry;
+import javax.jcr.RepositoryException;
+import javax.jcr.Session;
+
+import org.apache.commons.codec.digest.DigestUtils;
+import org.apache.commons.lang3.ArrayUtils;
+import org.apache.commons.lang3.StringUtils;
import org.apache.jackrabbit.JcrConstants;
-import org.apache.jackrabbit.util.Text;
+import org.apache.sling.api.resource.LoginException;
import org.apache.sling.api.resource.ModifiableValueMap;
import org.apache.sling.api.resource.Resource;
+import org.apache.sling.api.resource.ResourceResolver;
+import org.apache.sling.api.resource.ResourceResolverFactory;
import org.apache.sling.cms.CMSConstants;
import org.apache.sling.cms.File;
import org.apache.sling.cms.FileMetadataExtractor;
@@ -36,7 +45,9 @@ import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
+import org.osgi.service.component.annotations.Activate;
import org.osgi.service.component.annotations.Component;
+import org.osgi.service.component.annotations.Reference;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.SAXException;
@@ -46,11 +57,18 @@ public class FileMetadataExtractorImpl implements FileMetadataExtractor {
private static final Logger log =
LoggerFactory.getLogger(FileMetadataExtractorImpl.class);
+ private ResourceResolverFactory resolverFactory;
+
+ @Activate
+ public FileMetadataExtractorImpl(@Reference ResourceResolverFactory
resolverFactory) {
+ this.resolverFactory = resolverFactory;
+ }
+
@Override
public Map<String, Object> extractMetadata(File file) throws IOException {
try {
return extractMetadata(file.getResource());
- } catch (SAXException | TikaException e) {
+ } catch (SAXException | TikaException | RepositoryException |
LoginException e) {
throw new IOException("Failed to parse metadata", e);
}
}
@@ -80,6 +98,8 @@ public class FileMetadataExtractorImpl implements FileMetadataExtractor {
}
if (properties != null) {
properties.putAll(extractMetadata(file.getResource()));
+ properties.put("SHA1", generateSha1(resource));
+ resource.getResourceResolver().refresh();
if (metadata == null) {
resource.getResourceResolver().create(content,
CMSConstants.NN_METADATA, properties);
}
@@ -90,30 +110,74 @@ public class FileMetadataExtractorImpl implements FileMetadataExtractor {
} else {
throw new IOException("Unable to update metadata for " +
resource.getPath());
}
- } catch (SAXException | TikaException e) {
+ } catch (SAXException | TikaException | RepositoryException |
LoginException e) {
throw new IOException("Failed to parse metadata", e);
}
+ }
+ public String generateSha1(Resource resource) throws IOException {
+ try (InputStream is = resource.adaptTo(InputStream.class)) {
+ String sha1 = DigestUtils.sha1Hex(is);
+ log.info("Generated SHA {} for {}", sha1, resource.getPath());
+ return sha1;
+ }
}
- public Map<String, Object> extractMetadata(Resource resource) throws
IOException, SAXException, TikaException {
+ public Map<String, Object> extractMetadata(Resource resource)
+ throws IOException, SAXException, TikaException,
RepositoryException, LoginException {
log.info("Extracting metadata from {}", resource.getPath());
- InputStream is = resource.adaptTo(InputStream.class);
Map<String, Object> properties = new HashMap<>();
- Parser parser = new AutoDetectParser();
- BodyContentHandler handler = new BodyContentHandler();
- Metadata md = new Metadata();
- ParseContext context = new ParseContext();
- parser.parse(is, handler, md, context);
- for (String name : md.names()) {
- putMetadata(properties, name, md);
+ try (InputStream is = resource.adaptTo(InputStream.class)) {
+ Parser parser = new AutoDetectParser();
+ BodyContentHandler handler = new BodyContentHandler();
+ Metadata md = new Metadata();
+ ParseContext context = new ParseContext();
+ try {
+ parser.parse(is, handler, md, context);
+ } catch (SAXException se) {
+ if
("WriteLimitReachedException".equals(se.getClass().getSimpleName())) {
+ log.info("Write limit reached for {}", resource.getPath());
+ } else {
+ throw se;
+ }
+ }
+
+ try (ResourceResolver adminResolver =
resolverFactory.getAdministrativeResourceResolver(null)) {
+ NamespaceRegistry registry =
adminResolver.adaptTo(Session.class).getWorkspace().getNamespaceRegistry();
+ for (String name : md.names()) {
+ putMetadata(properties, name, md, registry);
+ }
+ }
}
return properties;
}
- private void putMetadata(Map<String, Object> properties, String name,
Metadata metadata) {
+ protected String formatKey(String initialKey, NamespaceRegistry registry)
+ throws RepositoryException {
+ String namespace = null;
+ String key = null;
+ if (initialKey.contains(":")) {
+ namespace = StringUtils.substringBefore(initialKey, ":");
+ key = StringUtils.substringAfter(initialKey, ":");
+ } else {
+ key = initialKey;
+ }
+ key = key.replace(" ", "").replace("/", "-");
+ if (namespace != null) {
+ namespace = namespace.replace(" ", "").replace("/", "-");
+ if (!ArrayUtils.contains(registry.getPrefixes(), namespace)) {
+ registry.registerNamespace(namespace,
"http://sling.apache.org/cms/ns/" + namespace);
+ }
+ return namespace + ":" + key;
+ } else {
+ return key;
+ }
+ }
+
+ private void putMetadata(Map<String, Object> properties, String name,
Metadata metadata, NamespaceRegistry registry)
+ throws RepositoryException {
log.trace("Updating property: {}", name);
- String filtered = Text.escapeIllegalJcrChars(name);
+ String filtered = formatKey(name, registry);
Property property = Property.get(name);
if (property != null) {
if (metadata.isMultiValued(property)) {
diff --git a/core/src/test/java/org/apache/sling/cms/core/internal/FileMetadataExtractorImplTest.java b/core/src/test/java/org/apache/sling/cms/core/internal/FileMetadataExtractorImplTest.java
index 9ddf32fb..3600c69d 100644
--- a/core/src/test/java/org/apache/sling/cms/core/internal/FileMetadataExtractorImplTest.java
+++ b/core/src/test/java/org/apache/sling/cms/core/internal/FileMetadataExtractorImplTest.java
@@ -16,16 +16,31 @@
*/
package org.apache.sling.cms.core.internal;
+import static org.junit.Assert.assertArrayEquals;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertTrue;
+import static org.mockito.ArgumentMatchers.anyString;
+import static org.mockito.ArgumentMatchers.eq;
+import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.verify;
+import static org.mockito.Mockito.when;
import java.io.IOException;
import java.io.InputStream;
+import java.util.Iterator;
import java.util.Map;
+import java.util.stream.Stream;
+import javax.jcr.NamespaceRegistry;
+import javax.jcr.RepositoryException;
+import javax.jcr.Session;
+import javax.jcr.Workspace;
+
+import org.apache.sling.api.resource.LoginException;
import org.apache.sling.api.resource.Resource;
+import org.apache.sling.api.resource.ResourceResolver;
+import org.apache.sling.api.resource.ResourceResolverFactory;
import org.apache.sling.cms.File;
-import org.apache.sling.cms.FileMetadataExtractor;
import org.junit.Before;
import org.junit.Test;
import org.mockito.Mockito;
@@ -38,20 +53,38 @@ public class FileMetadataExtractorImplTest {
private File file;
+ private NamespaceRegistry registry;
+
+ private ResourceResolverFactory resolverFactory;
+
+ private FileMetadataExtractorImpl extractor;
+
@Before
- public void init() {
+ public void init() throws RepositoryException, LoginException {
+
+ ResourceResolver resolver = mock(ResourceResolver.class);
- Resource resource = Mockito.mock(Resource.class);
+ resolverFactory = mock(ResourceResolverFactory.class);
+
when(resolverFactory.getAdministrativeResourceResolver(null)).thenReturn(resolver);
+ registry = mock(NamespaceRegistry.class);
+
+ Workspace workspace = mock(Workspace.class);
+ when(workspace.getNamespaceRegistry()).thenReturn(registry);
+ Session session = mock(Session.class);
+ when(session.getWorkspace()).thenReturn(workspace);
+ when(resolver.adaptTo(Session.class)).thenReturn(session);
+
+ Resource resource = mock(Resource.class);
Mockito.when(resource.adaptTo(InputStream.class))
.thenReturn(FileMetadataExtractorImplTest.class.getClassLoader().getResourceAsStream("apache.png"));
file = Mockito.mock(File.class);
Mockito.when(file.getResource()).thenReturn(resource);
+ extractor = new FileMetadataExtractorImpl(resolverFactory);
}
@Test
public void testExtractMetadata() throws IOException {
- FileMetadataExtractor extractor = new FileMetadataExtractorImpl();
Map<String, Object> metadata = extractor.extractMetadata(file);
assertNotNull(metadata);
@@ -59,4 +92,87 @@ public class FileMetadataExtractorImplTest {
log.info("Extracted metadata: {}", metadata);
}
+
+ @Test
+ public void testFormatKey() throws IOException, RepositoryException {
+
+ when(registry.getPrefixes()).thenReturn(new String[] { "GPS",
"ExifSubIFD", "ExifIFD0", "tiff", "dcterms" });
+
+ String[] keys = new String[] {
+ "GPS:GPS Img Direction", "Exif SubIFD:Subject Distance Range", "Compression Type",
+ "Number of Components", "Component 2", "Component 1", "Exif IFD0:X Resolution", "tiff:ResolutionUnit",
+ "Exif SubIFD:Scene Type", "Exif SubIFD:Exposure Mode", "tiff:Make", "Exif SubIFD:Sharpness",
+ "Exif SubIFD:Custom Rendered", "Component 3", "Exif SubIFD:Components Configuration",
+ "Exif SubIFD:Metering Mode", "Exif SubIFD:White Balance Mode", "tiff:BitsPerSample",
+ "Exif SubIFD:Sub-Sec Time Original", "meta:creation-date", "Creation-Date", "tiff:Orientation",
+ "tiff:Software", "geo:long", "Exif SubIFD:Digital Zoom Ratio", "tiff:YResolution", "Y Resolution",
+ "Exif SubIFD:Flash", "Thumbnail Height Pixels", "Last-Modified", "Exif SubIFD:Sub-Sec Time",
+ "exif:ExposureTime", "File Size", "Exif SubIFD:Exif Version", "GPS:GPS Img Direction Ref",
+ "Exif SubIFD:Focal Length", "Exif IFD0:Resolution Unit", "Exif SubIFD:Lens Model",
+ "Exif SubIFD:Date/Time Original", "Exif SubIFD:Sub-Sec Time Digitized", "Resolution Units",
+ "File Modified Date", "Exif SubIFD:Sensing Method", "Image Height", "Thumbnail Width Pixels",
+ "GPS:GPS Longitude", "Exif SubIFD:Time Zone Original", "GPS:GPS Longitude Ref", "tiff:Model",
+ "Exif SubIFD:Brightness Value", "exif:IsoSpeedRatings", "Exif SubIFD:Exposure Program",
+ "Exif IFD0:Make", "GPS:GPS Altitude Ref", "Exif SubIFD:Aperture Value",
+ "Exif SubIFD:Date/Time Digitized", "tiff:ImageWidth", "GPS:GPS Altitude", "Exif IFD0:Y Resolution",
+ "date", "Exif SubIFD:ISO Speed Ratings", "Number of Tables", "Exif SubIFD:Time Zone Digitized",
+ "Exif SubIFD:Exif Image Width", "Exif SubIFD:Contrast", "X Resolution",
+ "Exif SubIFD:Exposure Bias Value", "Exif SubIFD:Saturation", "modified", "exif:FNumber",
+ "Exif SubIFD:Shutter Speed Value", "Exif IFD0:Orientation", "Exif SubIFD:F-Number", "exif:FocalLength",
+ "Exif IFD0:Software", "Exif IFD0:Date/Time", "Exif SubIFD:Scene Capture Type", "Exif SubIFD:Time Zone",
+ "geo:lat", "Data Precision", "tiff:ImageLength", "Exif IFD0:Model", "dcterms:created",
+ "dcterms:modified", "exif:Flash", "Last-Save-Date", "Exif SubIFD:Color Space",
+ "Exif SubIFD:Focal Length 35", "Exif SubIFD:Exposure Time", "meta:save-date", "File Name",
+ "GPS:GPS Latitude Ref", "Content-Type", "X-Parsed-By", "Exif SubIFD:Max Aperture Value",
+ "tiff:XResolution", "exif:DateTimeOriginal", "Exif SubIFD:Subject Distance",
+ "Exif SubIFD:FlashPix Version", "Exif SubIFD:Exif Image Height", "Image Width", "GPS:GPS Latitude",
+ "Exif SubIFD:Lens Make", "GPS:GPS Date Stamp" };
+
+ String[] formatted = new String[keys.length];
+ for (int i = 0; i < keys.length; i++) {
+ formatted[i] = extractor.formatKey(keys[i], registry);
+ }
+
+ String[] expected = new String[] {
+ "GPS:GPSImgDirection", "ExifSubIFD:SubjectDistanceRange",
"CompressionType",
+ "NumberofComponents", "Component2", "Component1",
"ExifIFD0:XResolution", "tiff:ResolutionUnit",
+ "ExifSubIFD:SceneType", "ExifSubIFD:ExposureMode",
"tiff:Make", "ExifSubIFD:Sharpness",
+ "ExifSubIFD:CustomRendered", "Component3",
"ExifSubIFD:ComponentsConfiguration",
+ "ExifSubIFD:MeteringMode", "ExifSubIFD:WhiteBalanceMode",
"tiff:BitsPerSample",
+ "ExifSubIFD:Sub-SecTimeOriginal", "meta:creation-date",
"Creation-Date", "tiff:Orientation",
+ "tiff:Software", "geo:long", "ExifSubIFD:DigitalZoomRatio",
"tiff:YResolution", "YResolution",
+ "ExifSubIFD:Flash", "ThumbnailHeightPixels", "Last-Modified",
"ExifSubIFD:Sub-SecTime",
+ "exif:ExposureTime", "FileSize", "ExifSubIFD:ExifVersion",
"GPS:GPSImgDirectionRef",
+ "ExifSubIFD:FocalLength", "ExifIFD0:ResolutionUnit",
"ExifSubIFD:LensModel",
+ "ExifSubIFD:Date-TimeOriginal",
"ExifSubIFD:Sub-SecTimeDigitized", "ResolutionUnits",
+ "FileModifiedDate", "ExifSubIFD:SensingMethod", "ImageHeight",
"ThumbnailWidthPixels",
+ "GPS:GPSLongitude", "ExifSubIFD:TimeZoneOriginal",
"GPS:GPSLongitudeRef", "tiff:Model",
+ "ExifSubIFD:BrightnessValue", "exif:IsoSpeedRatings",
"ExifSubIFD:ExposureProgram",
+ "ExifIFD0:Make", "GPS:GPSAltitudeRef",
"ExifSubIFD:ApertureValue",
+ "ExifSubIFD:Date-TimeDigitized", "tiff:ImageWidth",
"GPS:GPSAltitude", "ExifIFD0:YResolution",
+ "date", "ExifSubIFD:ISOSpeedRatings", "NumberofTables",
"ExifSubIFD:TimeZoneDigitized",
+ "ExifSubIFD:ExifImageWidth", "ExifSubIFD:Contrast",
"XResolution",
+ "ExifSubIFD:ExposureBiasValue", "ExifSubIFD:Saturation",
"modified", "exif:FNumber",
+ "ExifSubIFD:ShutterSpeedValue", "ExifIFD0:Orientation",
"ExifSubIFD:F-Number", "exif:FocalLength",
+ "ExifIFD0:Software", "ExifIFD0:Date-Time",
"ExifSubIFD:SceneCaptureType", "ExifSubIFD:TimeZone",
+ "geo:lat", "DataPrecision", "tiff:ImageLength",
"ExifIFD0:Model", "dcterms:created",
+ "dcterms:modified", "exif:Flash", "Last-Save-Date",
"ExifSubIFD:ColorSpace",
+ "ExifSubIFD:FocalLength35", "ExifSubIFD:ExposureTime",
"meta:save-date", "FileName",
+ "GPS:GPSLatitudeRef", "Content-Type", "X-Parsed-By",
"ExifSubIFD:MaxApertureValue",
+ "tiff:XResolution", "exif:DateTimeOriginal",
"ExifSubIFD:SubjectDistance",
+ "ExifSubIFD:FlashPixVersion", "ExifSubIFD:ExifImageHeight",
"ImageWidth", "GPS:GPSLatitude",
+ "ExifSubIFD:LensMake", "GPS:GPSDateStamp" };
+ assertArrayEquals(expected, formatted);
+ }
+
+ @Test
+ public void willRegisterNamespace() throws RepositoryException {
+ when(registry.getPrefixes()).thenReturn(new String[] { "GPS" });
+ Iterator<String> keys = Stream
+ .of("GPS:GPSLatitudeRef", "Content-Type", "X-Parsed-By",
"ExifSubIFD:MaxApertureValue").iterator();
+ while (keys.hasNext()) {
+ extractor.formatKey(keys.next(), registry);
+ }
+ verify(registry).registerNamespace(eq("ExifSubIFD"), anyString());
+ }
}
diff --git a/feature/src/main/features/cms/cms-repoinit.txt b/feature/src/main/features/cms/cms-repoinit.txt
index 53187c86..03ed22a0 100644
--- a/feature/src/main/features/cms/cms-repoinit.txt
+++ b/feature/src/main/features/cms/cms-repoinit.txt
@@ -89,4 +89,36 @@ create service user sling-cms-versionmgr
set ACL for sling-cms-versionmgr
allow jcr:write,jcr:nodeTypeManagement,jcr:versionManagement on /
allow jcr:read on /jcr:system/jcr:versionStorage
-end
\ No newline at end of file
+end
+
+# Namespaces
+register namespace (acdsee) "http://ns.acdsee.com/iptc/1.0/"
+register namespace (album) "http://ns.adobe.com/album/1.0/"
+register namespace (cc) "http://creativecommons.org/ns#"
+register namespace (crs) "http://ns.adobe.com/camera-raw-settings/1.0/"
+register namespace (dc) "http://purl.org/dc/elements/1.1/"
+register namespace (dcterms) "http://purl.org/dc/terms/"
+register namespace (Iptc4xmpCore) "http://iptc.org/std/Iptc4xmpCore/1.0/xmlns/"
+register namespace (lr) "http://ns.adobe.com/lightroom/1.0/"
+register namespace (mediapro) "http://ns.iview-multimedia.com/mediapro/1.0/"
+register namespace (MP) "http://ns.microsoft.com/photo/1.2/"
+register namespace (photoshop) "http://ns.adobe.com/photoshop/1.0/"
+register namespace (plus) "http://ns.useplus.org/ldf/xmp/1.0/"
+register namespace (pdf) "http://ns.adobe.com/pdf/1.3/"
+register namespace (pdfx) "http://ns.adobe.com/pdfx/1.3/"
+register namespace (prism) "http://prismstandard.org/namespaces/basic/2.1/"
+register namespace (prl) "http://prismstandard.org/namespaces/prl/2.1/"
+register namespace (psAux) "http://ns.adobe.com/exif/1.0/aux/"
+register namespace (rdf) "http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+register namespace (stEvt) "http://ns.adobe.com/xap/1.0/sType/ResourceEvent#"
+register namespace (stEvt) "http://ns.adobe.com/xap/1.0/sType/ResourceEvent#"
+register namespace (tiff) "http://ns.adobe.com/tiff/1.0/"
+register namespace (xmp) "http://ns.adobe.com/xap/1.0/"
+register namespace (xmpBJ) "http://ns.adobe.com/xap/1.0/bj/"
+register namespace (xmpDM) "http://ns.adobe.com/xmp/1.0/DynamicMedia/"
+register namespace (xmpMM) "http://ns.adobe.com/xap/1.0/mm/"
+register namespace (xmpNote) "http://ns.adobe.com/xmp/note/"
+register namespace (xmpPLUS) "http://ns.adobe.com/xap/1.0/PLUS/"
+register namespace (xmpRights) "http://ns.adobe.com/xap/1.0/rights/"
+register namespace (xmpTPg) "http://ns.adobe.com/xap/1.0/t/pg/"
+