This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push: new 1a33c7ec2 TIKA-3915 -- extract errors field from siegfried; add test for warnings 1a33c7ec2 is described below commit 1a33c7ec287f7cbcfde399a74dc1428a16210a2d Author: tballison <talli...@apache.org> AuthorDate: Thu Nov 3 13:37:22 2022 -0400 TIKA-3915 -- extract errors field from siegfried; add test for warnings --- .../tika/detect/siegfried/SiegfriedDetector.java | 19 ++++++++++-- .../detect/siegfried/TestSiegfriedJsonParsing.java | 35 ++++++++++++++++++++++ .../src/test/resources/json/test-basic.json | 2 +- .../src/test/resources/json/test-errors.json | 1 + .../src/test/resources/json/test-warnings.json | 1 + 5 files changed, 55 insertions(+), 3 deletions(-) diff --git a/tika-detectors/tika-detector-siegfried/src/main/java/org/apache/tika/detect/siegfried/SiegfriedDetector.java b/tika-detectors/tika-detector-siegfried/src/main/java/org/apache/tika/detect/siegfried/SiegfriedDetector.java index 5a2c11079..fe42a9f3d 100644 --- a/tika-detectors/tika-detector-siegfried/src/main/java/org/apache/tika/detect/siegfried/SiegfriedDetector.java +++ b/tika-detectors/tika-detector-siegfried/src/main/java/org/apache/tika/detect/siegfried/SiegfriedDetector.java @@ -69,6 +69,9 @@ public class SiegfriedDetector implements Detector { public static Property SIEGFRIED_IDENTIFIERS_DETAILS = Property.externalTextBag(SIEGFRIED_PREFIX + "identifiers_details"); + public static Property SIEGFRIED_ERRORS = + Property.externalTextBag(SIEGFRIED_PREFIX + "errors"); + //TODO -- grab errors and warnings public static String ID = "id"; @@ -78,6 +81,8 @@ public class SiegfriedDetector implements Detector { public static String WARNING = "warning"; public static String BASIS = "basis"; + public static String ERRORS = "errors"; + private static final Logger LOGGER = LoggerFactory.getLogger(SiegfriedDetector.class); private static final long DEFAULT_TIMEOUT_MS = 6000; private static final String DEFAULT_SIEGFRIED_PATH = "sf"; @@ -203,8 +208,18 @@ public class SiegfriedDetector implements Detector { MediaType mt = MediaType.OCTET_STREAM; if (root.has("files")) { for (JsonNode file : root.get("files")) { - //TODO -/// String errors = file.get("errors").asText(""); + + if (file.has(ERRORS)) { + JsonNode errors = file.get(ERRORS); + if (errors.isTextual()) { + metadata.add(SIEGFRIED_ERRORS, file.get(ERRORS).asText()); + } else if (errors.isArray()) { + //is this even possible?! + for (JsonNode e : errors) { + metadata.add(SIEGFRIED_ERRORS, e.asText()); + } + } + } for (JsonNode match : file.get("matches")) { String ns = match.has("ns") ? match.get("ns").asText(StringUtils.EMPTY) : StringUtils.EMPTY; diff --git a/tika-detectors/tika-detector-siegfried/src/test/java/org/apache/tika/detect/siegfried/TestSiegfriedJsonParsing.java b/tika-detectors/tika-detector-siegfried/src/test/java/org/apache/tika/detect/siegfried/TestSiegfriedJsonParsing.java index c438d2c87..237eab51f 100644 --- a/tika-detectors/tika-detector-siegfried/src/test/java/org/apache/tika/detect/siegfried/TestSiegfriedJsonParsing.java +++ b/tika-detectors/tika-detector-siegfried/src/test/java/org/apache/tika/detect/siegfried/TestSiegfriedJsonParsing.java @@ -17,6 +17,8 @@ package org.apache.tika.detect.siegfried; import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNull; +import static org.junit.jupiter.api.Assertions.assertTrue; import java.io.IOException; import java.nio.charset.StandardCharsets; @@ -47,6 +49,39 @@ public class TestSiegfriedJsonParsing extends TikaTest { } + @Test + public void testErrors() throws Exception { + FileProcessResult fileProcessResult = load("test-errors.json"); + Metadata metadata = new Metadata(); + SiegfriedDetector.processResult(fileProcessResult, metadata, false); + //debug(metadata); + assertEquals("1.9.5", metadata.get(SiegfriedDetector.SIEGFRIED_VERSION)); + assertEquals("default.sig", metadata.get(SiegfriedDetector.SIEGFRIED_SIGNATURE)); + assertEquals("x-fmt/111", metadata.get("sf:pronom:id")); + assertEquals("extension match txt", metadata.get("sf:pronom:basis")); + assertEquals("Plain Text File", metadata.get("sf:pronom:format")); + assertEquals("text/plain", metadata.get("sf:pronom:mime")); + assertNull(metadata.get("sf:pronom:version")); + assertEquals("empty source", metadata.get(SiegfriedDetector.SIEGFRIED_ERRORS)); + } + + @Test + public void testWarnings() throws Exception { + FileProcessResult fileProcessResult = load("test-warnings.json"); + Metadata metadata = new Metadata(); + SiegfriedDetector.processResult(fileProcessResult, metadata, false); + assertEquals("1.9.5", metadata.get(SiegfriedDetector.SIEGFRIED_VERSION)); + assertEquals("default.sig", metadata.get(SiegfriedDetector.SIEGFRIED_SIGNATURE)); + assertEquals("UNKNOWN", metadata.get("sf:pronom:id")); + assertNull(metadata.get("sf:pronom:basis")); + assertNull(metadata.get("sf:pronom:format")); + assertNull(metadata.get("sf:pronom:mime")); + assertNull(metadata.get("sf:pronom:version")); + assertTrue(metadata.get("sf:pronom:warning") + .startsWith("no match; possibilities based on extension are fmt/14, fmt/15, fmt/16, " + + "fmt/17, fmt/18, fmt/19")); + } + diff --git a/tika-detectors/tika-detector-siegfried/src/test/resources/json/test-basic.json b/tika-detectors/tika-detector-siegfried/src/test/resources/json/test-basic.json index 2ad099711..0a7c48aab 100644 --- a/tika-detectors/tika-detector-siegfried/src/test/resources/json/test-basic.json +++ b/tika-detectors/tika-detector-siegfried/src/test/resources/json/test-basic.json @@ -3,7 +3,7 @@ "files": [ { "errors": "", - "filename": "/home/tallison/data/jfk/oswald/104-10263-10202.pdf", + "filename": "something.pdf", "filesize": 810825, "matches": [ { diff --git a/tika-detectors/tika-detector-siegfried/src/test/resources/json/test-errors.json b/tika-detectors/tika-detector-siegfried/src/test/resources/json/test-errors.json new file mode 100644 index 000000000..49baf8dd6 --- /dev/null +++ b/tika-detectors/tika-detector-siegfried/src/test/resources/json/test-errors.json @@ -0,0 +1 @@ +{"siegfried":"1.9.5","scandate":"2022-11-03T13:12:46-04:00","signature":"default.sig","created":"2022-09-12T23:45:48+02:00","identifiers":[{"name":"pronom","details":"DROID_SignatureFile_V108.xml; container-signature-20220905.xml"}],"files":[{"filename":"tst.txt","filesize": 0,"modified":"2022-11-03T13:12:41-04:00","errors": "empty source","matches": [{"ns":"pronom","id":"x-fmt/111","format":"Plain Text File","version":"","mime":"text/plain","basis":"extension match txt","warning":"match [...] \ No newline at end of file diff --git a/tika-detectors/tika-detector-siegfried/src/test/resources/json/test-warnings.json b/tika-detectors/tika-detector-siegfried/src/test/resources/json/test-warnings.json new file mode 100644 index 000000000..b83b03749 --- /dev/null +++ b/tika-detectors/tika-detector-siegfried/src/test/resources/json/test-warnings.json @@ -0,0 +1 @@ +{"siegfried":"1.9.5","scandate":"2022-11-03T13:28:19-04:00","signature":"default.sig","created":"2022-09-12T23:45:48+02:00","identifiers":[{"name":"pronom","details":"DROID_SignatureFile_V108.xml; container-signature-20220905.xml"}],"files":[{"filename":"pub1859-1-truncated.pdf","filesize": 159315,"modified":"2022-07-26T11:26:07-04:00","errors": "","matches": [{"ns":"pronom","id":"UNKNOWN","format":"","version":"","mime":"","basis":"","warning":"no match; possibilities based on extension [...] \ No newline at end of file