This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 6a098b7d2 TIKA-4357 -- improve metadata key prefixing for PDFs and html (#2061)
6a098b7d2 is described below
commit 6a098b7d27b3011caf6c92639678d1413bf8929c
Author: Tim Allison <[email protected]>
AuthorDate: Wed Dec 4 16:11:50 2024 -0500
TIKA-4357 -- improve metadata key prefixing for PDFs and html (#2061)
* TIKA-4357 -- improve metadata key prefixing for PDFs and html
* TIKA-4357 -- fix unit test
---
.../test/java/org/apache/tika/cli/TikaCLITest.java | 2 +-
.../src/main/java/org/apache/tika/metadata/HTML.java | 4 ++--
.../java/org/apache/tika/parser/html/HtmlHandler.java | 7 ++++---
.../org/apache/tika/parser/html/HtmlParserTest.java | 19 ++++++++-----------
.../java/org/apache/tika/parser/pdf/PDFParser.java | 2 --
.../org/apache/tika/parser/pdf/PDFParserTest.java | 12 ++++++------
6 files changed, 21 insertions(+), 25 deletions(-)
diff --git a/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
b/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
index 286a020ee..bc2211826 100644
--- a/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
+++ b/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
@@ -219,7 +219,7 @@ public class TikaCLITest {
public void testJsonMetadataOutput() throws Exception {
String json = getParamOutContent("--json", "--digest=MD2",
resourcePrefix + "testJsonMultipleInts.html");
//TIKA-1310
- assertTrue(json.contains("\"" + "fb:admins\":\"1,2,3,4\","));
+ assertTrue(json.contains("\"html_meta:fb:admins\":\"1,2,3,4\","));
assertTrue(json.contains("\"X-TIKA:digest:MD2\":"));
}
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/HTML.java
b/tika-core/src/main/java/org/apache/tika/metadata/HTML.java
index 3e37cf632..c5984caba 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/HTML.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/HTML.java
@@ -16,7 +16,7 @@ package org.apache.tika.metadata; /*
*/
public interface HTML {
- String PREFIX_HTML_META = "html_meta";
+ String PREFIX_HTML_META = "html_meta" +
TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER;
/**
@@ -24,6 +24,6 @@ public interface HTML {
* is set in the embedded document's metadata
*/
Property SCRIPT_SOURCE = Property.internalText(
- PREFIX_HTML_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER +
"scriptSrc");
+ PREFIX_HTML_META + "scriptSrc");
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
index d39fe0da6..f2bcba722 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
@@ -143,7 +143,7 @@ class HtmlHandler extends TextContentHandler {
addHtmlMetadata(atts.getValue("name"),
atts.getValue("content"));
} else if (atts.getValue("property") != null) {
// TIKA-983: Handle <meta property="og:xxx" content="yyy"
/> tags
- metadata.add(atts.getValue("property"),
atts.getValue("content"));
+ metadata.add(HTML.PREFIX_HTML_META +
atts.getValue("property"), atts.getValue("content"));
}
} else if ("BASE".equals(name) && atts.getValue("href") != null) {
startElementWithSafeAttributes("base", atts);
@@ -222,14 +222,15 @@ class HtmlHandler extends TextContentHandler {
if (property.equals(TikaCoreProperties.TITLE) &&
isTitleSetToMetadata) {
//prefer the title element if it is already set
//do nothing
+ metadata.add(HTML.PREFIX_HTML_META +
TikaCoreProperties.TITLE.getName(), value);
} else if (property.isMultiValuePermitted()) {
metadata.add(property, value);
} else {
metadata.set(property, value);
}
+ } else {
+ metadata.add(HTML.PREFIX_HTML_META + name, value);
}
- //TODO -- we should prefix these raw names to avoid collisions
- metadata.add(name, value);
}
private void startElementWithSafeAttributes(String name, Attributes atts)
throws SAXException {
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
index 2fcc4f6b0..72c93c138 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
@@ -70,6 +70,7 @@ import org.apache.tika.detect.EncodingDetector;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Geographic;
+import org.apache.tika.metadata.HTML;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Office;
import org.apache.tika.metadata.TikaCoreProperties;
@@ -109,8 +110,8 @@ public class HtmlParserTest extends TikaTest {
}
assertEquals("Title : Test Indexation Html",
metadata.get(TikaCoreProperties.TITLE));
- assertEquals("Tika Developers", metadata.get("Author"));
- assertEquals("5", metadata.get("refresh"));
+ assertEquals("Tika Developers",
metadata.get(TikaCoreProperties.CREATOR));
+ assertEquals("5", metadata.get(HTML.PREFIX_HTML_META + "refresh"));
assertEquals("51.2312", metadata.get(Geographic.LATITUDE));
assertEquals("-5.1987", metadata.get(Geographic.LONGITUDE));
@@ -152,8 +153,8 @@ public class HtmlParserTest extends TikaTest {
metadata.get(Metadata.CONTENT_TYPE).startsWith("application/xhtml+xml;
charset="));
assertEquals("XHTML test document",
metadata.get(TikaCoreProperties.TITLE));
- assertEquals("Tika Developers", metadata.get("Author"));
- assertEquals("5", metadata.get("refresh"));
+ assertEquals("Tika Developers",
metadata.get(TikaCoreProperties.CREATOR));
+ assertEquals("5", metadata.get(HTML.PREFIX_HTML_META + "refresh"));
assertContains("ability of Apache Tika", content);
assertContains("extract content", content);
assertContains("an XHTML document", content);
@@ -809,8 +810,8 @@ public class HtmlParserTest extends TikaTest {
Metadata metadata = new Metadata();
new JSoupParser().parse(new
ByteArrayInputStream(test1.getBytes(ISO_8859_1)),
new BodyContentHandler(), metadata, new ParseContext());
- assertEquals("some description", metadata.get("og:description"));
- assertTrue(metadata.isMultiValued("og:image"));
+ assertEquals("some description", metadata.get(HTML.PREFIX_HTML_META +
"og:description"));
+ assertTrue(metadata.isMultiValued(HTML.PREFIX_HTML_META + "og:image"));
}
// TIKA-1011
@@ -1220,19 +1221,15 @@ public class HtmlParserTest extends TikaTest {
List<Metadata> metadataList =
getRecursiveMetadata("testHTML_metadata.html");
Metadata m = metadataList.get(0);
assertEquals("Free Web tutorials",
m.get(TikaCoreProperties.DESCRIPTION));
- assertEquals("Free Web tutorials", m.get("description"));
assertEquals("HTML,CSS,XML,JavaScript",
m.get(TikaCoreProperties.SUBJECT));
- assertEquals("HTML,CSS,XML,JavaScript", m.get("keywords"));
assertEquals("HTML,CSS,XML,JavaScript", m.get(Office.KEYWORDS));
assertEquals("HTML,CSS,XML,JavaScript", m.get(Office.KEYWORDS));
assertEquals("OldMetaTitle", m.get(TikaCoreProperties.TITLE));
- assertEquals("OldMetaTitle", m.get("title"));
assertEquals("John Doe", m.get(TikaCoreProperties.CREATOR));
- assertEquals("John Doe", m.get("author"));
}
@Test
@@ -1242,7 +1239,7 @@ public class HtmlParserTest extends TikaTest {
Metadata m = metadataList.get(0);
assertEquals("ActualTitle", m.get(TikaCoreProperties.TITLE));
- assertEquals("OldMetaTitle", m.get("title"));
+ assertEquals("OldMetaTitle", m.get(HTML.PREFIX_HTML_META +
TikaCoreProperties.TITLE.getName()));
}
@Test
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
index 77ccb9231..afebecfca 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
@@ -630,8 +630,6 @@ public class PDFParser implements Parser, RenderingParser,
Initializable {
for (COSName key : info.getCOSObject().keySet()) {
String name = key.getName();
if (!handledMetadata.contains(name)) {
- PDMetadataExtractor
- .addMetadata(metadata, name,
info.getCOSObject().getDictionaryObject(key));
PDMetadataExtractor.addMetadata(metadata,
PDF.PDF_DOC_INFO_CUSTOM_PREFIX + name,
info.getCOSObject().getDictionaryObject(key));
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index 0c1a0bae2..b0612f6ce 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -150,7 +150,7 @@ public class PDFParserTest extends TikaTest {
}
@Test
- public void testCustomMetadata() throws Exception {
+ public void testCustomMetadataInPDDocInfo() throws Exception {
XMLResult r = getXML("testPDF-custommetadata.pdf");
Metadata metadata = r.metadata;
@@ -158,12 +158,12 @@ public class PDFParserTest extends TikaTest {
assertEquals("Document author",
metadata.get(TikaCoreProperties.CREATOR));
assertEquals("Document title", metadata.get(TikaCoreProperties.TITLE));
- assertEquals("Custom Value", metadata.get("Custom Property"));
+ assertEquals("Custom Value",
metadata.get(PDF.PDF_DOC_INFO_CUSTOM_PREFIX + "Custom Property"));
- assertEquals("Array Entry 1", metadata.get("Custom Array"));
- assertEquals(2, metadata.getValues("Custom Array").length);
- assertEquals("Array Entry 1", metadata.getValues("Custom Array")[0]);
- assertEquals("Array Entry 2", metadata.getValues("Custom Array")[1]);
+ assertEquals("Array Entry 1",
metadata.get(PDF.PDF_DOC_INFO_CUSTOM_PREFIX + "Custom Array"));
+ assertEquals(2, metadata.getValues(PDF.PDF_DOC_INFO_CUSTOM_PREFIX +
"Custom Array").length);
+ assertEquals("Array Entry 1",
metadata.getValues(PDF.PDF_DOC_INFO_CUSTOM_PREFIX + "Custom Array")[0]);
+ assertEquals("Array Entry 2",
metadata.getValues(PDF.PDF_DOC_INFO_CUSTOM_PREFIX + "Custom Array")[1]);
assertContains("Hello World!", r.xml);
}