This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 8c7592534 TIKA-3840: add extraction of ODF version from ODF files
8c7592534 is described below
commit 8c759253411562973a7445dcd2a7a24abbc2db84
Author: tallison <[email protected]>
AuthorDate: Mon Aug 29 11:25:45 2022 -0400
TIKA-3840: add extraction of ODF version from ODF files
---
CHANGES.txt | 2 ++
.../tika/parser/odf/OpenDocumentMetaParser.java | 16 +++++++++++++-
.../org/apache/tika/parser/odf/ODFParserTest.java | 23 +++++++++++++++++++++
.../versions/LibreOfficeBase_odb_1.3.odb | Bin 0 -> 2419 bytes
.../versions/LibreOfficeCalc_ods_1.3.ods | Bin 0 -> 8812 bytes
.../versions/LibreOfficeDraw_odg_1.3.odg | Bin 0 -> 9166 bytes
.../versions/LibreOfficeImpress_odp_1.3.odp | Bin 0 -> 24035 bytes
.../versions/LibreOfficeWriter_odt_1.3.odt | Bin 0 -> 9683 bytes
8 files changed, 40 insertions(+), 1 deletion(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index 779fe25f6..a02a68c8c 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,7 @@
Release 2.4.2 - ???
+ * Add extraction of ODF version from ODF files (TIKA-3840).
+
* tika-parser-html-commons (BoilerPipeHandler) is no longer a
dependency of tika-parser-html-module. tika-app and tika-server-standard
have added a dependency on tika-parser-html-commons. However,
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentMetaParser.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentMetaParser.java
index 3cdd458a7..ae06aec02 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentMetaParser.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentMetaParser.java
@@ -50,14 +50,19 @@ import org.apache.tika.sax.xpath.XPathParser;
* Parser for OpenDocument <code>meta.xml</code> files.
*/
public class OpenDocumentMetaParser extends XMLParser {
+
+ public static final String ODF_VERSION_KEY = "odf:version";
/**
* Serial version UID
*/
private static final long serialVersionUID = -8739250869531737584L;
private static final String META_NS =
"urn:oasis:names:tc:opendocument:xmlns:meta:1.0";
+
+ private static final String OFFICE_NS =
"urn:oasis:names:tc:opendocument:xmlns:office:1.0";
private static final XPathParser META_XPATH = new XPathParser("meta",
META_NS);
+ private static final XPathParser OFFICE_XPATH = new XPathParser("office",
OFFICE_NS);
private static ContentHandler getDublinCoreHandler(Metadata metadata,
Property property,
String element) {
return new ElementMetadataHandler(DublinCore.NAMESPACE_URI_DC,
element, metadata, property);
@@ -83,6 +88,15 @@ public class OpenDocumentMetaParser extends XMLParser {
return new TeeContentHandler(ch, branch);
}
+ private static ContentHandler getVersion(ContentHandler ch, Metadata md) {
+ Matcher matcher =
OFFICE_XPATH.parse("/office:document-meta/@office:version");
+ ContentHandler branch = new MatchingContentHandler(
+ new AttributeMetadataHandler(
+ OFFICE_NS, "version", md,
+ ODF_VERSION_KEY), matcher);
+ return new TeeContentHandler(ch, branch);
+ }
+
@Deprecated
private static ContentHandler getStatistic(ContentHandler ch, Metadata md,
String name,
String attribute) {
@@ -115,7 +129,7 @@ public class OpenDocumentMetaParser extends XMLParser {
getDublinCoreHandler(md,
TikaCoreProperties.IDENTIFIER, "identifier"),
getDublinCoreHandler(md, TikaCoreProperties.LANGUAGE,
"language"),
getDublinCoreHandler(md, TikaCoreProperties.RIGHTS,
"rights"));
-
+ ch = getVersion(ch, md);
// Process the OO Meta Attributes
ch = getMeta(ch, md, TikaCoreProperties.CREATED, "creation-date");
// ODF uses dc:date for modified
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java
index 7a6decc8a..b9ee45bc2 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java
@@ -118,6 +118,8 @@ public class ODFParserTest extends TikaTest {
assertEquals(null, metadata.get("custom:Info 3"));
assertEquals(null, metadata.get("custom:Info 4"));
+ assertEquals("1.0",
metadata.get(OpenDocumentMetaParser.ODF_VERSION_KEY));
+
String content = handler.toString();
assertTrue(content.contains("This is a sample Open Office
document," +
" written in NeoOffice 2.2.1 for the Mac."));
@@ -173,6 +175,8 @@ public class ODFParserTest extends TikaTest {
assertEquals(null, metadata.get("nbPara"));
assertEquals(null, metadata.get("nbWord"));
assertEquals(null, metadata.get("nbCharacter"));
+ assertEquals("1.0",
metadata.get(OpenDocumentMetaParser.ODF_VERSION_KEY));
+
// Note - contents of maths files not currently supported
String content = handler.toString().trim();
@@ -221,6 +225,7 @@ public class ODFParserTest extends TikaTest {
assertEquals("0", metadata.get(Office.TABLE_COUNT));
assertEquals("2", metadata.get(Office.OBJECT_COUNT));
assertEquals("0", metadata.get(Office.IMAGE_COUNT));
+ assertEquals("1.1",
metadata.get(OpenDocumentMetaParser.ODF_VERSION_KEY));
String content = handler.toString();
assertTrue(content.contains("Apache Tika Tika is part of the
Lucene project."));
@@ -277,6 +282,7 @@ public class ODFParserTest extends TikaTest {
assertEquals("application/vnd.oasis.opendocument.text",
metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals("1.1",
metadata.get(OpenDocumentMetaParser.ODF_VERSION_KEY));
String content = handler.toString();
assertContains("Tika is part of the Lucene project.", content);
@@ -490,4 +496,21 @@ public class ODFParserTest extends TikaTest {
assertTrue(filesTested > 10);
}
+ @Test
+ public void testVersions() throws Exception {
+ //test at least that all files from
+ //
https://github.com/openpreserve/format-corpus/tree/master/office-examples/LibreOffice7-ODF-1.3
+ //pass as 1.3. Note that we don't currently parse base files, so skip
that one.
+ for (String name : new String[]{
+ //"LibreOfficeBase_odb_1.3.odb",
+ "LibreOfficeCalc_ods_1.3.ods",
+ "LibreOfficeDraw_odg_1.3.odg",
+ "LibreOfficeImpress_odp_1.3.odp",
+ "LibreOfficeWriter_odt_1.3.odt",
+ }) {
+ List<Metadata> metadataList = getRecursiveMetadata("/versions/" +
name);
+ Metadata metadata = metadataList.get(0);
+ assertEquals("1.3",
metadata.get(OpenDocumentMetaParser.ODF_VERSION_KEY), "failed on " + name);
+ }
+ }
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/test/resources/test-documents/versions/LibreOfficeBase_odb_1.3.odb
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/test/resources/test-documents/versions/LibreOfficeBase_odb_1.3.odb
new file mode 100644
index 000000000..1e3116708
Binary files /dev/null and
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/test/resources/test-documents/versions/LibreOfficeBase_odb_1.3.odb
differ
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/test/resources/test-documents/versions/LibreOfficeCalc_ods_1.3.ods
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/test/resources/test-documents/versions/LibreOfficeCalc_ods_1.3.ods
new file mode 100644
index 000000000..2e67be516
Binary files /dev/null and
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/test/resources/test-documents/versions/LibreOfficeCalc_ods_1.3.ods
differ
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/test/resources/test-documents/versions/LibreOfficeDraw_odg_1.3.odg
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/test/resources/test-documents/versions/LibreOfficeDraw_odg_1.3.odg
new file mode 100644
index 000000000..4f85be270
Binary files /dev/null and
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/test/resources/test-documents/versions/LibreOfficeDraw_odg_1.3.odg
differ
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/test/resources/test-documents/versions/LibreOfficeImpress_odp_1.3.odp
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/test/resources/test-documents/versions/LibreOfficeImpress_odp_1.3.odp
new file mode 100644
index 000000000..d7200d876
Binary files /dev/null and
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/test/resources/test-documents/versions/LibreOfficeImpress_odp_1.3.odp
differ
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/test/resources/test-documents/versions/LibreOfficeWriter_odt_1.3.odt
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/test/resources/test-documents/versions/LibreOfficeWriter_odt_1.3.odt
new file mode 100644
index 000000000..acf29661a
Binary files /dev/null and
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/test/resources/test-documents/versions/LibreOfficeWriter_odt_1.3.odt
differ