This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new 0b93901  TIKA-3645 -- improve flexibility of specifying maps as params for parsers in TikaConfig
0b93901 is described below

commit 0b93901c4aa1a48a885b59b95f7c5d6689067ab6
Author: tballison <[email protected]>
AuthorDate: Wed Jan 12 10:41:25 2022 -0500

    TIKA-3645 -- improve flexibility of specifying maps as params for parsers in TikaConfig
---
 CHANGES.txt                                        |   3 ++
 .../main/java/org/apache/tika/config/Param.java    |  15 +++++-
 .../tika/parser/external2/ExternalParserTest.java  |  20 ++++++++
 .../tika/config/TIKA-3557-exiftool-example.xml     |  53 +++++++++++++++++++++
 .../test-documents/testOverlappingText.pdf         | Bin 0 -> 899 bytes
 5 files changed, 89 insertions(+), 2 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index fa692ed..9efa78d 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,8 @@
 Release 2.2.2 - ???
 
+   * Improve configuration of maps as params for parsers in
+     TikaConfig (TIKA-3645).
+
    * Improve identification of iWorks 13 files and add parsing
      for thumbnails, some metadata and attachments (TIKA-3634).
 
diff --git a/tika-core/src/main/java/org/apache/tika/config/Param.java 
b/tika-core/src/main/java/org/apache/tika/config/Param.java
index 73d2154..7de36ac 100644
--- a/tika-core/src/main/java/org/apache/tika/config/Param.java
+++ b/tika-core/src/main/java/org/apache/tika/config/Param.java
@@ -233,8 +233,19 @@ public class Param<T> implements Serializable {
         ret.actualValue = (T) new HashMap<>();
         while (child != null) {
             if (child.getNodeType() == Node.ELEMENT_NODE) {
-                String key = child.getLocalName();
-                String value = child.getTextContent();
+                String key = "";
+                String value = "";
+                if (child.getAttributes().getNamedItem("key") != null) {
+                    key = 
child.getAttributes().getNamedItem("key").getNodeValue();
+                    if (child.getAttributes().getNamedItem("value") != null) {
+                        value = 
child.getAttributes().getNamedItem("value").getNodeValue();
+                    } else {
+                        value = child.getTextContent();
+                    }
+                } else {
+                    key = child.getLocalName();
+                    value = child.getTextContent();
+                }
                 if (((Map)ret.actualValue).containsKey(key)) {
                     throw new TikaConfigException("Duplicate keys are not 
allowed: " + key);
                 }
diff --git 
a/tika-core/src/test/java/org/apache/tika/parser/external2/ExternalParserTest.java
 
b/tika-core/src/test/java/org/apache/tika/parser/external2/ExternalParserTest.java
index d510034..d4c3899 100644
--- 
a/tika-core/src/test/java/org/apache/tika/parser/external2/ExternalParserTest.java
+++ 
b/tika-core/src/test/java/org/apache/tika/parser/external2/ExternalParserTest.java
@@ -21,6 +21,7 @@ import static org.junit.jupiter.api.Assumptions.assumeTrue;
 
 import java.io.InputStream;
 import java.nio.charset.StandardCharsets;
+import java.util.List;
 
 import org.junit.jupiter.api.Test;
 import org.xml.sax.ContentHandler;
@@ -30,6 +31,7 @@ import org.apache.tika.TikaTest;
 import org.apache.tika.config.TikaConfig;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AutoDetectParser;
 import org.apache.tika.parser.CompositeParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
@@ -79,4 +81,22 @@ public class ExternalParserTest extends TikaTest {
             assertContains("<body>text/xml</body>", 
xmlResult.xml.replaceAll("[\r\n]", ""));
         }
     }
+
+    @Test
+    public void testExifTool() throws Exception {
+        assumeTrue(org.apache.tika.parser.external.ExternalParser.check(new 
String[]{"exiftool",
+                "-ver"}));
+        try (InputStream is =
+                     
TikaConfig.class.getResourceAsStream("TIKA-3557-exiftool-example.xml")) {
+            TikaConfig config = new TikaConfig(is);
+            Parser p = new AutoDetectParser(config);
+            //this was the smallest pdf we had
+            List<Metadata> metadataList = 
getRecursiveMetadata("testOverlappingText.pdf", p);
+            assertEquals(1, metadataList.size());
+            Metadata m = metadataList.get(0);
+            assertEquals("application/pdf", m.get("mime"));
+            assertEquals("1", m.get("pages"));
+            assertEquals("1.4", m.get("pdf:version"));
+        }
+    }
 }
diff --git 
a/tika-core/src/test/resources/org/apache/tika/config/TIKA-3557-exiftool-example.xml
 
b/tika-core/src/test/resources/org/apache/tika/config/TIKA-3557-exiftool-example.xml
new file mode 100644
index 0000000..efe430f
--- /dev/null
+++ 
b/tika-core/src/test/resources/org/apache/tika/config/TIKA-3557-exiftool-example.xml
@@ -0,0 +1,53 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<properties>
+  <parsers>
+    <parser class="org.apache.tika.parser.external2.ExternalParser">
+      <params>
+        <param name="supportedTypes" type="list">
+          <string>application/octet-stream</string>
+        </param>
+        <param name="commandLine" type="list">
+          <string>exiftool</string>
+          <string>${INPUT_FILE}</string>
+        </param>
+        <param name="checkExitValues" type="list">
+          <!-- which exit values confirm that the application basically works 
-->
+          <int>0</int>
+        </param>
+        <param name="outputParser" type="class" 
class="org.apache.tika.parser.RegexCaptureParser">
+          <params>
+            <param name="regexMap" type="map">
+              <match key="mime" value="^MIME Type\s+: ([^\r\n]+)"/>
+              <match key="pages" value="^Page Count\s+: ([^\r\n]+)"/>
+              <match key="pdf:version" value="^PDF Version\s+: ([^\r\n]+)"/>
+              <!--
+              <match key="dc:title" value="^Title\s+: ([^\r\n]+)"/>
+              <match key="dc:creator" value="^Author\s+: ([^\r\n]+)"/>
+              <match key="dc:created" value="^Create Date\s+: ([^\r\n]+)"/>
+              <match key="dc:modify" value="^Modify Date\s+: ([^\r\n]+)"/>
+              <match key="producer" value="^Producer\s+: ([^\r\n]+)"/>
+              <match key="creator_tool" value="^Creator Tool\s+: ([^\r\n]+)"/>
+              <match key="tagged" value="^Tagged PDF\s+: ([^\r\n]+)"/> -->
+            </param>
+          </params>
+        </param>
+      </params>
+    </parser>
+  </parsers>
+</properties>
diff --git 
a/tika-core/src/test/resources/test-documents/testOverlappingText.pdf 
b/tika-core/src/test/resources/test-documents/testOverlappingText.pdf
new file mode 100644
index 0000000..282a1ab
Binary files /dev/null and 
b/tika-core/src/test/resources/test-documents/testOverlappingText.pdf differ

Reply via email to