This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new 6a098b7d2 TIKA-4357 -- improve metadata key prefixing for PDFs and html (#2061)
6a098b7d2 is described below

commit 6a098b7d27b3011caf6c92639678d1413bf8929c
Author: Tim Allison <[email protected]>
AuthorDate: Wed Dec 4 16:11:50 2024 -0500

    TIKA-4357 -- improve metadata key prefixing for PDFs and html (#2061)
    
    * TIKA-4357 -- improve metadata key prefixing for PDFs and html
    
    * TIKA-4357 -- fix unit test
---
 .../test/java/org/apache/tika/cli/TikaCLITest.java    |  2 +-
 .../src/main/java/org/apache/tika/metadata/HTML.java  |  4 ++--
 .../java/org/apache/tika/parser/html/HtmlHandler.java |  7 ++++---
 .../org/apache/tika/parser/html/HtmlParserTest.java   | 19 ++++++++-----------
 .../java/org/apache/tika/parser/pdf/PDFParser.java    |  2 --
 .../org/apache/tika/parser/pdf/PDFParserTest.java     | 12 ++++++------
 6 files changed, 21 insertions(+), 25 deletions(-)

diff --git a/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java b/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
index 286a020ee..bc2211826 100644
--- a/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
+++ b/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
@@ -219,7 +219,7 @@ public class TikaCLITest {
     public void testJsonMetadataOutput() throws Exception {
         String json = getParamOutContent("--json", "--digest=MD2", resourcePrefix + "testJsonMultipleInts.html");
         //TIKA-1310
-        assertTrue(json.contains("\"" + "fb:admins\":\"1,2,3,4\","));
+        assertTrue(json.contains("\"html_meta:fb:admins\":\"1,2,3,4\","));
         assertTrue(json.contains("\"X-TIKA:digest:MD2\":"));
     }
 
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/HTML.java b/tika-core/src/main/java/org/apache/tika/metadata/HTML.java
index 3e37cf632..c5984caba 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/HTML.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/HTML.java
@@ -16,7 +16,7 @@ package org.apache.tika.metadata; /*
  */
 
 public interface HTML {
-    String PREFIX_HTML_META = "html_meta";
+    String PREFIX_HTML_META = "html_meta" + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER;
 
 
     /**
@@ -24,6 +24,6 @@ public interface HTML {
      * is set in the embedded document's metadata
      */
     Property SCRIPT_SOURCE = Property.internalText(
-            PREFIX_HTML_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "scriptSrc");
+            PREFIX_HTML_META + "scriptSrc");
 
 }
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/HtmlHandler.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
index d39fe0da6..f2bcba722 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
@@ -143,7 +143,7 @@ class HtmlHandler extends TextContentHandler {
                     addHtmlMetadata(atts.getValue("name"), atts.getValue("content"));
                 } else if (atts.getValue("property") != null) {
                     // TIKA-983: Handle <meta property="og:xxx" content="yyy" /> tags
-                    metadata.add(atts.getValue("property"), atts.getValue("content"));
+                    metadata.add(HTML.PREFIX_HTML_META + atts.getValue("property"), atts.getValue("content"));
                 }
             } else if ("BASE".equals(name) && atts.getValue("href") != null) {
                 startElementWithSafeAttributes("base", atts);
@@ -222,14 +222,15 @@ class HtmlHandler extends TextContentHandler {
             if (property.equals(TikaCoreProperties.TITLE) && isTitleSetToMetadata) {
                 //prefer the title element if it is already set
                 //do nothing
+                metadata.add(HTML.PREFIX_HTML_META + TikaCoreProperties.TITLE.getName(), value);
             } else if (property.isMultiValuePermitted()) {
                 metadata.add(property, value);
             } else {
                 metadata.set(property, value);
             }
+        } else {
+            metadata.add(HTML.PREFIX_HTML_META + name, value);
         }
-        //TODO -- we should prefix these raw names to avoid collisions
-        metadata.add(name, value);
     }
 
     private void startElementWithSafeAttributes(String name, Attributes atts) throws SAXException {
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
index 2fcc4f6b0..72c93c138 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
@@ -70,6 +70,7 @@ import org.apache.tika.detect.EncodingDetector;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Geographic;
+import org.apache.tika.metadata.HTML;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.Office;
 import org.apache.tika.metadata.TikaCoreProperties;
@@ -109,8 +110,8 @@ public class HtmlParserTest extends TikaTest {
         }
 
         assertEquals("Title : Test Indexation Html", metadata.get(TikaCoreProperties.TITLE));
-        assertEquals("Tika Developers", metadata.get("Author"));
-        assertEquals("5", metadata.get("refresh"));
+        assertEquals("Tika Developers", metadata.get(TikaCoreProperties.CREATOR));
+        assertEquals("5", metadata.get(HTML.PREFIX_HTML_META + "refresh"));
 
         assertEquals("51.2312", metadata.get(Geographic.LATITUDE));
         assertEquals("-5.1987", metadata.get(Geographic.LONGITUDE));
@@ -152,8 +153,8 @@ public class HtmlParserTest extends TikaTest {
                 metadata.get(Metadata.CONTENT_TYPE).startsWith("application/xhtml+xml; charset="));
         assertEquals("XHTML test document", metadata.get(TikaCoreProperties.TITLE));
 
-        assertEquals("Tika Developers", metadata.get("Author"));
-        assertEquals("5", metadata.get("refresh"));
+        assertEquals("Tika Developers", metadata.get(TikaCoreProperties.CREATOR));
+        assertEquals("5", metadata.get(HTML.PREFIX_HTML_META + "refresh"));
         assertContains("ability of Apache Tika", content);
         assertContains("extract content", content);
         assertContains("an XHTML document", content);
@@ -809,8 +810,8 @@ public class HtmlParserTest extends TikaTest {
         Metadata metadata = new Metadata();
         new JSoupParser().parse(new ByteArrayInputStream(test1.getBytes(ISO_8859_1)),
                 new BodyContentHandler(), metadata, new ParseContext());
-        assertEquals("some description", metadata.get("og:description"));
-        assertTrue(metadata.isMultiValued("og:image"));
+        assertEquals("some description", metadata.get(HTML.PREFIX_HTML_META + "og:description"));
+        assertTrue(metadata.isMultiValued(HTML.PREFIX_HTML_META + "og:image"));
     }
 
     // TIKA-1011
@@ -1220,19 +1221,15 @@ public class HtmlParserTest extends TikaTest {
         List<Metadata> metadataList = getRecursiveMetadata("testHTML_metadata.html");
         Metadata m = metadataList.get(0);
         assertEquals("Free Web tutorials", m.get(TikaCoreProperties.DESCRIPTION));
-        assertEquals("Free Web tutorials", m.get("description"));
 
         assertEquals("HTML,CSS,XML,JavaScript", m.get(TikaCoreProperties.SUBJECT));
-        assertEquals("HTML,CSS,XML,JavaScript", m.get("keywords"));
 
         assertEquals("HTML,CSS,XML,JavaScript", m.get(Office.KEYWORDS));
         assertEquals("HTML,CSS,XML,JavaScript", m.get(Office.KEYWORDS));
 
         assertEquals("OldMetaTitle", m.get(TikaCoreProperties.TITLE));
-        assertEquals("OldMetaTitle", m.get("title"));
 
         assertEquals("John Doe", m.get(TikaCoreProperties.CREATOR));
-        assertEquals("John Doe", m.get("author"));
     }
 
     @Test
@@ -1242,7 +1239,7 @@ public class HtmlParserTest extends TikaTest {
         Metadata m = metadataList.get(0);
 
         assertEquals("ActualTitle", m.get(TikaCoreProperties.TITLE));
-        assertEquals("OldMetaTitle", m.get("title"));
+        assertEquals("OldMetaTitle", m.get(HTML.PREFIX_HTML_META + TikaCoreProperties.TITLE.getName()));
     }
 
     @Test
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
index 77ccb9231..afebecfca 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
@@ -630,8 +630,6 @@ public class PDFParser implements Parser, RenderingParser, Initializable {
         for (COSName key : info.getCOSObject().keySet()) {
             String name = key.getName();
             if (!handledMetadata.contains(name)) {
-                PDMetadataExtractor
-                        .addMetadata(metadata, name, info.getCOSObject().getDictionaryObject(key));
                 PDMetadataExtractor.addMetadata(metadata, PDF.PDF_DOC_INFO_CUSTOM_PREFIX + name,
                         info.getCOSObject().getDictionaryObject(key));
             }
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index 0c1a0bae2..b0612f6ce 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -150,7 +150,7 @@ public class PDFParserTest extends TikaTest {
     }
 
     @Test
-    public void testCustomMetadata() throws Exception {
+    public void testCustomMetadataInPDDocInfo() throws Exception {
 
         XMLResult r = getXML("testPDF-custommetadata.pdf");
         Metadata metadata = r.metadata;
@@ -158,12 +158,12 @@ public class PDFParserTest extends TikaTest {
         assertEquals("Document author", metadata.get(TikaCoreProperties.CREATOR));
         assertEquals("Document title", metadata.get(TikaCoreProperties.TITLE));
 
-        assertEquals("Custom Value", metadata.get("Custom Property"));
+        assertEquals("Custom Value", metadata.get(PDF.PDF_DOC_INFO_CUSTOM_PREFIX + "Custom Property"));
 
-        assertEquals("Array Entry 1", metadata.get("Custom Array"));
-        assertEquals(2, metadata.getValues("Custom Array").length);
-        assertEquals("Array Entry 1", metadata.getValues("Custom Array")[0]);
-        assertEquals("Array Entry 2", metadata.getValues("Custom Array")[1]);
+        assertEquals("Array Entry 1", metadata.get(PDF.PDF_DOC_INFO_CUSTOM_PREFIX + "Custom Array"));
+        assertEquals(2, metadata.getValues(PDF.PDF_DOC_INFO_CUSTOM_PREFIX + "Custom Array").length);
+        assertEquals("Array Entry 1", metadata.getValues(PDF.PDF_DOC_INFO_CUSTOM_PREFIX + "Custom Array")[0]);
+        assertEquals("Array Entry 2", metadata.getValues(PDF.PDF_DOC_INFO_CUSTOM_PREFIX + "Custom Array")[1]);
 
         assertContains("Hello World!", r.xml);
     }

Reply via email to