This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new 8c7592534 TIKA-3840: add extraction of ODF version from ODF files
8c7592534 is described below

commit 8c759253411562973a7445dcd2a7a24abbc2db84
Author: tallison <[email protected]>
AuthorDate: Mon Aug 29 11:25:45 2022 -0400

    TIKA-3840: add extraction of ODF version from ODF files
---
 CHANGES.txt                                        |   2 ++
 .../tika/parser/odf/OpenDocumentMetaParser.java    |  16 +++++++++++++-
 .../org/apache/tika/parser/odf/ODFParserTest.java  |  23 +++++++++++++++++++++
 .../versions/LibreOfficeBase_odb_1.3.odb           | Bin 0 -> 2419 bytes
 .../versions/LibreOfficeCalc_ods_1.3.ods           | Bin 0 -> 8812 bytes
 .../versions/LibreOfficeDraw_odg_1.3.odg           | Bin 0 -> 9166 bytes
 .../versions/LibreOfficeImpress_odp_1.3.odp        | Bin 0 -> 24035 bytes
 .../versions/LibreOfficeWriter_odt_1.3.odt         | Bin 0 -> 9683 bytes
 8 files changed, 40 insertions(+), 1 deletion(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index 779fe25f6..a02a68c8c 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,7 @@
 Release 2.4.2 - ???
 
+   * Add extraction of ODF version from ODF files (TIKA-3840).
+
    * tika-parser-html-commons (BoilerPipeHandler) is no longer a
      a dependency of tika-parser-html-module. tika-app and tika-server-standard
      have added a dependency on tika-parser-html-commons.  However,
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentMetaParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentMetaParser.java
index 3cdd458a7..ae06aec02 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentMetaParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentMetaParser.java
@@ -50,14 +50,19 @@ import org.apache.tika.sax.xpath.XPathParser;
  * Parser for OpenDocument <code>meta.xml</code> files.
  */
 public class OpenDocumentMetaParser extends XMLParser {
+
+    public static final String ODF_VERSION_KEY = "odf:version";
     /**
      * Serial version UID
      */
     private static final long serialVersionUID = -8739250869531737584L;
 
     private static final String META_NS = "urn:oasis:names:tc:opendocument:xmlns:meta:1.0";
+
+    private static final String OFFICE_NS = "urn:oasis:names:tc:opendocument:xmlns:office:1.0";
     private static final XPathParser META_XPATH = new XPathParser("meta", META_NS);
 
+    private static final XPathParser OFFICE_XPATH = new XPathParser("office", OFFICE_NS);
     private static ContentHandler getDublinCoreHandler(Metadata metadata, Property property,
                                                        String element) {
         return new ElementMetadataHandler(DublinCore.NAMESPACE_URI_DC, element, metadata, property);
@@ -83,6 +88,15 @@ public class OpenDocumentMetaParser extends XMLParser {
         return new TeeContentHandler(ch, branch);
     }
 
+    private static ContentHandler getVersion(ContentHandler ch, Metadata md) {
+        Matcher matcher = OFFICE_XPATH.parse("/office:document-meta/@office:version");
+        ContentHandler branch = new MatchingContentHandler(
+                new AttributeMetadataHandler(
+                        OFFICE_NS, "version", md,
+                        ODF_VERSION_KEY), matcher);
+        return new TeeContentHandler(ch, branch);
+    }
+
     @Deprecated
     private static ContentHandler getStatistic(ContentHandler ch, Metadata md, String name,
                                                String attribute) {
@@ -115,7 +129,7 @@ public class OpenDocumentMetaParser extends XMLParser {
                         getDublinCoreHandler(md, TikaCoreProperties.IDENTIFIER, "identifier"),
                         getDublinCoreHandler(md, TikaCoreProperties.LANGUAGE, "language"),
                         getDublinCoreHandler(md, TikaCoreProperties.RIGHTS, "rights"));
-
+        ch = getVersion(ch, md);
         // Process the OO Meta Attributes
         ch = getMeta(ch, md, TikaCoreProperties.CREATED, "creation-date");
         // ODF uses dc:date for modified
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java
index 7a6decc8a..b9ee45bc2 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java
@@ -118,6 +118,8 @@ public class ODFParserTest extends TikaTest {
                 assertEquals(null, metadata.get("custom:Info 3"));
                 assertEquals(null, metadata.get("custom:Info 4"));
 
+                assertEquals("1.0", metadata.get(OpenDocumentMetaParser.ODF_VERSION_KEY));
+
                 String content = handler.toString();
                 assertTrue(content.contains("This is a sample Open Office document," +
                         " written in NeoOffice 2.2.1 for the Mac."));
@@ -173,6 +175,8 @@ public class ODFParserTest extends TikaTest {
             assertEquals(null, metadata.get("nbPara"));
             assertEquals(null, metadata.get("nbWord"));
             assertEquals(null, metadata.get("nbCharacter"));
+            assertEquals("1.0", metadata.get(OpenDocumentMetaParser.ODF_VERSION_KEY));
+
 
             // Note - contents of maths files not currently supported
             String content = handler.toString().trim();
@@ -221,6 +225,7 @@ public class ODFParserTest extends TikaTest {
             assertEquals("0", metadata.get(Office.TABLE_COUNT));
             assertEquals("2", metadata.get(Office.OBJECT_COUNT));
             assertEquals("0", metadata.get(Office.IMAGE_COUNT));
+            assertEquals("1.1", metadata.get(OpenDocumentMetaParser.ODF_VERSION_KEY));
 
             String content = handler.toString();
             assertTrue(content.contains("Apache Tika Tika is part of the Lucene project."));
@@ -277,6 +282,7 @@ public class ODFParserTest extends TikaTest {
 
             assertEquals("application/vnd.oasis.opendocument.text",
                     metadata.get(Metadata.CONTENT_TYPE));
+            assertEquals("1.1", metadata.get(OpenDocumentMetaParser.ODF_VERSION_KEY));
 
             String content = handler.toString();
             assertContains("Tika is part of the Lucene project.", content);
@@ -490,4 +496,21 @@ public class ODFParserTest extends TikaTest {
         assertTrue(filesTested > 10);
     }
 
+    @Test
+    public void testVersions() throws Exception {
+        //test at least that all files from
+        // https://github.com/openpreserve/format-corpus/tree/master/office-examples/LibreOffice7-ODF-1.3
+        //pass as 1.3.  Note that we don't currently parse base files, so skip that one.
+        for (String name : new String[]{
+                //"LibreOfficeBase_odb_1.3.odb",
+                "LibreOfficeCalc_ods_1.3.ods",
+                "LibreOfficeDraw_odg_1.3.odg",
+                "LibreOfficeImpress_odp_1.3.odp",
+                "LibreOfficeWriter_odt_1.3.odt",
+        }) {
+            List<Metadata> metadataList = getRecursiveMetadata("/versions/" + name);
+            Metadata metadata = metadataList.get(0);
+            assertEquals("1.3", metadata.get(OpenDocumentMetaParser.ODF_VERSION_KEY), "failed on " + name);
+        }
+    }
 }
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/test/resources/test-documents/versions/LibreOfficeBase_odb_1.3.odb b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/test/resources/test-documents/versions/LibreOfficeBase_odb_1.3.odb
new file mode 100644
index 000000000..1e3116708
Binary files /dev/null and b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/test/resources/test-documents/versions/LibreOfficeBase_odb_1.3.odb differ
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/test/resources/test-documents/versions/LibreOfficeCalc_ods_1.3.ods b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/test/resources/test-documents/versions/LibreOfficeCalc_ods_1.3.ods
new file mode 100644
index 000000000..2e67be516
Binary files /dev/null and b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/test/resources/test-documents/versions/LibreOfficeCalc_ods_1.3.ods differ
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/test/resources/test-documents/versions/LibreOfficeDraw_odg_1.3.odg b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/test/resources/test-documents/versions/LibreOfficeDraw_odg_1.3.odg
new file mode 100644
index 000000000..4f85be270
Binary files /dev/null and b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/test/resources/test-documents/versions/LibreOfficeDraw_odg_1.3.odg differ
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/test/resources/test-documents/versions/LibreOfficeImpress_odp_1.3.odp b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/test/resources/test-documents/versions/LibreOfficeImpress_odp_1.3.odp
new file mode 100644
index 000000000..d7200d876
Binary files /dev/null and b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/test/resources/test-documents/versions/LibreOfficeImpress_odp_1.3.odp differ
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/test/resources/test-documents/versions/LibreOfficeWriter_odt_1.3.odt b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/test/resources/test-documents/versions/LibreOfficeWriter_odt_1.3.odt
new file mode 100644
index 000000000..acf29661a
Binary files /dev/null and b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/test/resources/test-documents/versions/LibreOfficeWriter_odt_1.3.odt differ

Reply via email to