This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 0b93901 TIKA-3645 -- improve flexibility of specifying maps as params
for parsers in TikaConfig
0b93901 is described below
commit 0b93901c4aa1a48a885b59b95f7c5d6689067ab6
Author: tballison <[email protected]>
AuthorDate: Wed Jan 12 10:41:25 2022 -0500
TIKA-3645 -- improve flexibility of specifying maps as params for parsers
in TikaConfig
---
CHANGES.txt | 3 ++
.../main/java/org/apache/tika/config/Param.java | 15 +++++-
.../tika/parser/external2/ExternalParserTest.java | 20 ++++++++
.../tika/config/TIKA-3557-exiftool-example.xml | 53 +++++++++++++++++++++
.../test-documents/testOverlappingText.pdf | Bin 0 -> 899 bytes
5 files changed, 89 insertions(+), 2 deletions(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index fa692ed..9efa78d 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,8 @@
Release 2.2.2 - ???
+ * Improve configuration of maps as params for parsers in
+ TikaConfig (TIKA-3645).
+
* Improve identification of iWorks 13 files and add parsing
for thumbnails, some metadata and attachments (TIKA-3634).
diff --git a/tika-core/src/main/java/org/apache/tika/config/Param.java b/tika-core/src/main/java/org/apache/tika/config/Param.java
index 73d2154..7de36ac 100644
--- a/tika-core/src/main/java/org/apache/tika/config/Param.java
+++ b/tika-core/src/main/java/org/apache/tika/config/Param.java
@@ -233,8 +233,19 @@ public class Param<T> implements Serializable {
ret.actualValue = (T) new HashMap<>();
while (child != null) {
if (child.getNodeType() == Node.ELEMENT_NODE) {
- String key = child.getLocalName();
- String value = child.getTextContent();
+ String key = "";
+ String value = "";
+ if (child.getAttributes().getNamedItem("key") != null) {
+ key = child.getAttributes().getNamedItem("key").getNodeValue();
+ if (child.getAttributes().getNamedItem("value") != null) {
+ value = child.getAttributes().getNamedItem("value").getNodeValue();
+ } else {
+ value = child.getTextContent();
+ }
+ } else {
+ key = child.getLocalName();
+ value = child.getTextContent();
+ }
if (((Map)ret.actualValue).containsKey(key)) {
throw new TikaConfigException("Duplicate keys are not allowed: " + key);
}
diff --git a/tika-core/src/test/java/org/apache/tika/parser/external2/ExternalParserTest.java b/tika-core/src/test/java/org/apache/tika/parser/external2/ExternalParserTest.java
index d510034..d4c3899 100644
--- a/tika-core/src/test/java/org/apache/tika/parser/external2/ExternalParserTest.java
+++ b/tika-core/src/test/java/org/apache/tika/parser/external2/ExternalParserTest.java
@@ -21,6 +21,7 @@ import static org.junit.jupiter.api.Assumptions.assumeTrue;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
+import java.util.List;
import org.junit.jupiter.api.Test;
import org.xml.sax.ContentHandler;
@@ -30,6 +31,7 @@ import org.apache.tika.TikaTest;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.CompositeParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
@@ -79,4 +81,22 @@ public class ExternalParserTest extends TikaTest {
assertContains("<body>text/xml</body>",
xmlResult.xml.replaceAll("[\r\n]", ""));
}
}
+
+ @Test
+ public void testExifTool() throws Exception {
+ assumeTrue(org.apache.tika.parser.external.ExternalParser.check(new String[]{"exiftool",
+ "-ver"}));
+ try (InputStream is =
+ TikaConfig.class.getResourceAsStream("TIKA-3557-exiftool-example.xml")) {
+ TikaConfig config = new TikaConfig(is);
+ Parser p = new AutoDetectParser(config);
+ //this was the smallest pdf we had
+ List<Metadata> metadataList = getRecursiveMetadata("testOverlappingText.pdf", p);
+ assertEquals(1, metadataList.size());
+ Metadata m = metadataList.get(0);
+ assertEquals("application/pdf", m.get("mime"));
+ assertEquals("1", m.get("pages"));
+ assertEquals("1.4", m.get("pdf:version"));
+ }
+ }
}
diff --git a/tika-core/src/test/resources/org/apache/tika/config/TIKA-3557-exiftool-example.xml b/tika-core/src/test/resources/org/apache/tika/config/TIKA-3557-exiftool-example.xml
new file mode 100644
index 0000000..efe430f
--- /dev/null
+++ b/tika-core/src/test/resources/org/apache/tika/config/TIKA-3557-exiftool-example.xml
@@ -0,0 +1,53 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<properties>
+ <parsers>
+ <parser class="org.apache.tika.parser.external2.ExternalParser">
+ <params>
+ <param name="supportedTypes" type="list">
+ <string>application/octet-stream</string>
+ </param>
+ <param name="commandLine" type="list">
+ <string>exiftool</string>
+ <string>${INPUT_FILE}</string>
+ </param>
+ <param name="checkExitValues" type="list">
+ <!-- which exit values confirm that the application basically works -->
+ <int>0</int>
+ </param>
+ <param name="outputParser" type="class" class="org.apache.tika.parser.RegexCaptureParser">
+ <params>
+ <param name="regexMap" type="map">
+ <match key="mime" value="^MIME Type\s+: ([^\r\n]+)"/>
+ <match key="pages" value="^Page Count\s+: ([^\r\n]+)"/>
+ <match key="pdf:version" value="^PDF Version\s+: ([^\r\n]+)"/>
+ <!--
+ <match key="dc:title" value="^Title\s+: ([^\r\n]+)"/>
+ <match key="dc:creator" value="^Author\s+: ([^\r\n]+)"/>
+ <match key="dc:created" value="^Create Date\s+: ([^\r\n]+)"/>
+ <match key="dc:modify" value="^Modify Date\s+: ([^\r\n]+)"/>
+ <match key="producer" value="^Producer\s+: ([^\r\n]+)"/>
+ <match key="creator_tool" value="^Creator Tool\s+: ([^\r\n]+)"/>
+ <match key="tagged" value="^Tagged PDF\s+: ([^\r\n]+)"/> -->
+ </param>
+ </params>
+ </param>
+ </params>
+ </parser>
+ </parsers>
+</properties>
diff --git a/tika-core/src/test/resources/test-documents/testOverlappingText.pdf b/tika-core/src/test/resources/test-documents/testOverlappingText.pdf
new file mode 100644
index 0000000..282a1ab
Binary files /dev/null and b/tika-core/src/test/resources/test-documents/testOverlappingText.pdf differ