svn commit: r1930760 - in pdfbox/branches/2.0/xmpbox/src: main/java/org/apache/xmpbox/xml test/java/org/apache/xmpbox/xml

tilman Sat, 20 Dec 2025 09:13:57 -0800

Author: tilman
Date: Sat Dec 20 17:13:48 2025
New Revision: 1930760

Log:
PDFBOX-6130: parse XMP files without processing instructions; improve javadoc; 
fix problem with attributes that had no prefix; put prefix into QName (for 
debugging); add test


Modified:
   
pdfbox/branches/2.0/xmpbox/src/main/java/org/apache/xmpbox/xml/DomXmpParser.java
   
pdfbox/branches/2.0/xmpbox/src/test/java/org/apache/xmpbox/xml/DomXmpParserTest.java

Modified: 
pdfbox/branches/2.0/xmpbox/src/main/java/org/apache/xmpbox/xml/DomXmpParser.java
==============================================================================
--- 
pdfbox/branches/2.0/xmpbox/src/main/java/org/apache/xmpbox/xml/DomXmpParser.java
    Sat Dec 20 16:00:25 2025        (r1930759)
+++ 
pdfbox/branches/2.0/xmpbox/src/main/java/org/apache/xmpbox/xml/DomXmpParser.java
    Sat Dec 20 17:13:48 2025        (r1930760)
@@ -109,9 +109,10 @@ public class DomXmpParser
     /**
      * Enable or disable strict parsing mode.
      *
-     * @param strictParsing Whether to be strict when parsing XMP: true (the 
default) means that
-     * malformed XMP will result in an exception, false means that if 
malformed content is
-     * encountered, the parser will continue its work if possible.
+     * @param strictParsing Whether to be strict or lenient when parsing XMP. 
True (the default)
+     * means that malformed XMP will result in an exception, false (lenient) 
means that if malformed
+     * content is encountered, the parser will continue its work if possible. 
Use strict mode if you
+     * want to work with PDF/A files. Use lenient mode if you care more about 
getting metadata.
      */
     public void setStrictParsing(boolean strictParsing)
     {
@@ -151,7 +152,17 @@ public class DomXmpParser
         // expect xpacket processing instruction
         if (!(node instanceof ProcessingInstruction))
         {
-            throw new XmpParsingException(ErrorType.XpacketBadStart, "xmp 
should start with a processing instruction");
+            if (strictParsing)
+            {
+                throw new XmpParsingException(ErrorType.XpacketBadStart, "xmp 
should start with a processing instruction");
+            }
+            else
+            {
+                xmp = 
XMPMetadata.createXMPMetadata(XmpConstants.DEFAULT_XPACKET_BEGIN,
+                        XmpConstants.DEFAULT_XPACKET_ID, 
+                        XmpConstants.DEFAULT_XPACKET_BYTES,
+                        XmpConstants.DEFAULT_XPACKET_ENCODING);
+            }
         }
         else
         {
@@ -178,7 +189,14 @@ public class DomXmpParser
         // expect xpacket end
         if (!(node instanceof ProcessingInstruction))
         {
-            throw new XmpParsingException(ErrorType.XpacketBadEnd, "xmp should 
end with a processing instruction");
+            if (strictParsing)
+            {
+                throw new XmpParsingException(ErrorType.XpacketBadEnd, "xmp 
should end with a processing instruction");
+            }
+            else
+            {
+                xmp.setEndXPacket(XmpConstants.DEFAULT_XPACKET_END);
+            }
         }
         else
         {
@@ -191,7 +209,7 @@ public class DomXmpParser
             throw new XmpParsingException(ErrorType.XpacketBadEnd,
                     "xmp should end after xpacket end processing instruction");
         }
-        // xpacket is OK and the is no more nodes
+        // xpacket is OK and there are no more nodes
         // Now, parse the content of root
         Element rdfRdf = findDescriptionsParent(root);
         nsFinder.push(rdfRdf); // PDFBOX-6099: push namespaces in rdf:RDF
@@ -320,7 +338,7 @@ public class DomXmpParser
         {
             ComplexPropertyContainer container = schema.getContainer();
             PropertyType type = checkPropertyDefinition(xmp,
-                    new QName(attr.getNamespaceURI(), attr.getLocalName()));
+                    new QName(attr.getNamespaceURI(), attr.getLocalName(), 
attr.getPrefix()));
 
             if (type == null)
             {
@@ -715,8 +733,11 @@ public class DomXmpParser
                     ((XMPSchema) sp).setAboutAsSimple(attr.getValue());
                 }
             }
-            else
+            else if (XMLConstants.XML_NS_URI.equals(attr.getNamespaceURI()))
             {
+                // This part was the fallback before PDFBOX-6130, now 
restricted:
+                // Do not load "ordinary" attributes here because these will 
be handled by
+                // tryParseAttributesAsProperties() and 
parseDescriptionRootAttr()
                 Attribute attribute = new Attribute(XMLConstants.XML_NS_URI, 
attr.getLocalName(), attr.getValue());
                 sp.setAttribute(attribute);
             }
@@ -921,12 +942,20 @@ public class DomXmpParser
 
     private Element findDescriptionsParent(Element root) throws 
XmpParsingException
     {
-        Element rdfRdf;
+        Element rdfRdf = null;
         // check if already rdf element, as xmpmeta wrapper can be optional
         if (!XmpConstants.RDF_NAMESPACE.equals(root.getNamespaceURI()))
         {
             // always <x:xmpmeta xmlns:x="adobe:ns:meta/">
-            expectNaming(root, "adobe:ns:meta/", "x", "xmpmeta");
+            if (!strictParsing && "xapmeta".equals(root.getLocalName()))
+            {
+                // older XMP content
+                expectNaming(root, "adobe:ns:meta/", "x", "xapmeta");
+            }
+            else
+            {
+                expectNaming(root, "adobe:ns:meta/", "x", "xmpmeta");
+            }
             // should only have one child
             NodeList nl = root.getChildNodes();
             if (nl.getLength() == 0)
@@ -937,14 +966,25 @@ public class DomXmpParser
             else if (nl.getLength() > 1)
             {
                 // only expect one element
-                throw new XmpParsingException(ErrorType.Format, "More than one 
element found in x:xmpmeta");
+                if (strictParsing)
+                {
+                    throw new XmpParsingException(ErrorType.Format, "More than 
one element found in x:xmpmeta");
+                }
+            }
+            // find element (there may be a text before the element)
+            for (int i = 0; i < nl.getLength(); ++i)
+            {
+                if (nl.item(i) instanceof Element)
+                {
+                    rdfRdf = (Element) nl.item(i);
+                    break;
+                }
             }
-            else if (!(root.getFirstChild() instanceof Element))
+            if (rdfRdf == null)
             {
                 // should be an element
                 throw new XmpParsingException(ErrorType.Format, "x:xmpmeta 
does not contains rdf:RDF element");
             } // else let's parse
-            rdfRdf = (Element) root.getFirstChild();
         }
         else
         {

Modified: 
pdfbox/branches/2.0/xmpbox/src/test/java/org/apache/xmpbox/xml/DomXmpParserTest.java
==============================================================================
--- 
pdfbox/branches/2.0/xmpbox/src/test/java/org/apache/xmpbox/xml/DomXmpParserTest.java
        Sat Dec 20 16:00:25 2025        (r1930759)
+++ 
pdfbox/branches/2.0/xmpbox/src/test/java/org/apache/xmpbox/xml/DomXmpParserTest.java
        Sat Dec 20 17:13:48 2025        (r1930760)
@@ -24,6 +24,7 @@ import java.io.ByteArrayOutputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.UnsupportedEncodingException;
+import java.nio.charset.StandardCharsets;
 import java.util.Calendar;
 import java.util.List;
 import javax.xml.transform.TransformerException;
@@ -34,6 +35,8 @@ import org.apache.xmpbox.schema.DublinCo
 import org.apache.xmpbox.schema.ExifSchema;
 import org.apache.xmpbox.schema.PDFAIdentificationSchema;
 import org.apache.xmpbox.schema.PhotoshopSchema;
+import org.apache.xmpbox.schema.TiffSchema;
+import org.apache.xmpbox.schema.XMPBasicSchema;
 import org.apache.xmpbox.schema.XMPMediaManagementSchema;
 import org.apache.xmpbox.schema.XMPSchema;
 import org.apache.xmpbox.schema.XMPageTextSchema;
@@ -677,6 +680,10 @@ public class DomXmpParserTest
         {
             assertEquals("Expecting local name 'xmpmeta' and found 'xapmeta'", 
ex.getMessage());
         }
+        DomXmpParser xmpParser2 = new DomXmpParser();
+        xmpParser2.setStrictParsing(false);
+        XMPMetadata xmp2 = xmpParser2.parse(s.getBytes("utf-8"));
+        assertEquals(0, xmp2.getAllSchemas().size());
     }
 
     @Test
@@ -1327,4 +1334,96 @@ public class DomXmpParserTest
         assertEquals("uuid:0b306144-6a43-dcbd-6b3e-c6b6b1df873d", 
xmpMediaManagementSchema.getInstanceID());
         assertEquals("uuid:0b306144-6a43-dcbd-6b3e-c6b6b1df873d", 
xmpMediaManagementSchema.getDocumentID());
     }
-}
\ No newline at end of file
+
+    @Test
+    public void testNoProcessingInstruction() throws XmpParsingException, 
UnsupportedEncodingException, TransformerException
+    {
+        // From file 000163.pdf
+        // Coastal Services Magazine Volume 11_6 November/December
+        String s = "<x:xmpmeta xmlns:x=\"adobe:ns:meta/\" x:xmptk=\"Adobe XMP 
Core 4.1-c037 46.282696, Mon Apr 02 2007 18:36:42        \">\n" +
+                " <rdf:RDF 
xmlns:rdf=\"http://www.w3.org/1999/02/22-rdf-syntax-ns#\">\n" +
+                "  <rdf:Description rdf:about=\"\"\n" +
+                "    xmlns:xapMM=\"http://ns.adobe.com/xap/1.0/mm/\"\n" +
+                "    
xmlns:stRef=\"http://ns.adobe.com/xap/1.0/sType/ResourceRef#\"\n" +
+                "    xmlns:tiff=\"http://ns.adobe.com/tiff/1.0/\"\n" +
+                "    xmlns:xap=\"http://ns.adobe.com/xap/1.0/\"\n" +
+                "    xmlns:exif=\"http://ns.adobe.com/exif/1.0/\"\n" +
+                "    xmlns:dc=\"http://purl.org/dc/elements/1.1/\"\n" +
+                "    xmlns:photoshop=\"http://ns.adobe.com/photoshop/1.0/\"\n" 
+
+                "   
xapMM:DocumentID=\"uuid:F1FEDA1D7D03DA11B0F6E4B4E63B0143\"\n" +
+                "   
xapMM:InstanceID=\"uuid:7A28FBF56920DA11B4BBB356C0A5C72B\"\n" +
+                "   tiff:Orientation=\"1\"\n" +
+                "   tiff:XResolution=\"3050000/10000\"\n" +
+                "   tiff:YResolution=\"3050000/10000\"\n" +
+                "   tiff:ResolutionUnit=\"2\"\n" +
+                "   tiff:NativeDigest=\"123456\"\n" +
+                "   xap:ModifyDate=\"2005-09-08T09:13:10-04:00\"\n" +
+                "   xap:CreatorTool=\"Adobe Photoshop CS2 Windows\"\n" +
+                "   xap:CreateDate=\"2005-08-02T13:47:24-04:00\"\n" +
+                "   xap:MetadataDate=\"2005-09-08T09:13:10-04:00\"\n" +
+                "   exif:ColorSpace=\"-1\"\n" +
+                "   exif:PixelXDimension=\"1525\"\n" +
+                "   exif:PixelYDimension=\"387\"\n" +
+                "   exif:NativeDigest=\"12345678\"\n" +
+                "   dc:format=\"image/tiff\"\n" +
+                "   photoshop:ColorMode=\"4\"\n" +
+                "   photoshop:ICCProfile=\"U.S. Web Coated (SWOP) v2\"\n" +
+                "   photoshop:History=\"\">\n" +
+                "   <xapMM:DerivedFrom\n" +
+                "    
stRef:instanceID=\"adobe:docid:photoshop:28ff3dc5-4801-11d8-85d1-bb49d244e2ef\"\n"
 +
+                "    
stRef:documentID=\"adobe:docid:photoshop:28ff3dc5-4801-11d8-85d1-bb49d244e2ef\"/>\n"
 +
+                "  </rdf:Description>\n" +
+                " </rdf:RDF>\n" +
+                "</x:xmpmeta>";
+        try
+        {
+            new DomXmpParser().parse(s.getBytes("utf-8"));
+            fail("XmpParsingException expected");
+        }
+        catch (XmpParsingException ex)
+        {
+            assertEquals("xmp should start with a processing instruction", 
ex.getMessage());
+        }
+        DomXmpParser xmpParser2 = new DomXmpParser();
+        xmpParser2.setStrictParsing(false);
+        XMPMetadata xmp2 = 
xmpParser2.parse(s.getBytes(StandardCharsets.UTF_8));
+        DublinCoreSchema dublinCoreSchema = xmp2.getDublinCoreSchema();
+        assertEquals("image/tiff", dublinCoreSchema.getFormat());
+        XMPMediaManagementSchema xmpMediaManagementSchema = 
xmp2.getXMPMediaManagementSchema();
+        assertEquals("uuid:F1FEDA1D7D03DA11B0F6E4B4E63B0143", 
xmpMediaManagementSchema.getDocumentID());
+        TiffSchema tiffSchema = (TiffSchema) xmp2.getSchema(TiffSchema.class);
+        assertEquals("[Orientation=IntegerType:1]", 
tiffSchema.getProperty(TiffSchema.ORIENTATION).toString());
+        PhotoshopSchema photoshopSchema = xmp2.getPhotoshopSchema();
+        assertEquals((Integer) 4, photoshopSchema.getColorMode());
+        ExifSchema exifSchema = (ExifSchema) xmp2.getSchema(ExifSchema.class);
+        assertEquals("[PixelXDimension=IntegerType:1525]", 
exifSchema.getProperty(ExifSchema.PIXEL_X_DIMENSION).toString());
+        XMPBasicSchema xmpBasicSchema = xmp2.getXMPBasicSchema();
+        assertEquals("Adobe Photoshop CS2 Windows", 
xmpBasicSchema.getCreatorTool());
+        XmpSerializer serializer = new XmpSerializer();
+        ByteArrayOutputStream baos = new ByteArrayOutputStream();
+        serializer.serialize(xmp2, baos, true);
+        // check that there are no isolated properties
+        // (Happened before the change at the bottom of loadAttributes())
+        String s2 = baos.toString("utf-8");
+        assertFalse(s2.contains(" ColorMode="));
+        assertFalse(s2.contains(" CreateDate="));
+        assertFalse(s2.contains(" CreatorTool="));
+        assertFalse(s2.contains(" DocumentID="));
+        // now make sure that parsing again still brings the same data
+        DomXmpParser xmpParser3 = new DomXmpParser();
+        xmpParser3.setStrictParsing(false);
+        XMPMetadata xmp3 = xmpParser3.parse(baos.toByteArray());
+        DublinCoreSchema dublinCoreSchema3 = xmp3.getDublinCoreSchema();
+        assertEquals("image/tiff", dublinCoreSchema3.getFormat());
+        XMPMediaManagementSchema xmpMediaManagementSchema3 = 
xmp3.getXMPMediaManagementSchema();
+        assertEquals("uuid:F1FEDA1D7D03DA11B0F6E4B4E63B0143", 
xmpMediaManagementSchema3.getDocumentID());
+        TiffSchema tiffSchema3 = (TiffSchema) xmp3.getSchema(TiffSchema.class);
+        assertEquals("[Orientation=IntegerType:1]", 
tiffSchema3.getProperty(TiffSchema.ORIENTATION).toString());
+        PhotoshopSchema photoshopSchema3 = xmp3.getPhotoshopSchema();
+        assertEquals((Integer) 4, photoshopSchema3.getColorMode());
+        ExifSchema exifSchema3 = (ExifSchema) xmp3.getSchema(ExifSchema.class);
+        assertEquals("[PixelXDimension=IntegerType:1525]", 
exifSchema3.getProperty(ExifSchema.PIXEL_X_DIMENSION).toString());
+        XMPBasicSchema xmpBasicSchema3 = xmp3.getXMPBasicSchema();
+        assertEquals("Adobe Photoshop CS2 Windows", 
xmpBasicSchema3.getCreatorTool());
+    }
+}

svn commit: r1930760 - in pdfbox/branches/2.0/xmpbox/src: main/java/org/apache/xmpbox/xml test/java/org/apache/xmpbox/xml

Reply via email to