This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-3308 in repository https://gitbox.apache.org/repos/asf/tika.git
commit 04341240651f12bd63585ef3b4dc78df83630c7c Author: tallison <talli...@apache.org> AuthorDate: Fri Nov 18 10:21:11 2022 -0500 TIKA-3308 -- add detection for svg files that lack the xml header --- CHANGES.txt | 2 ++ .../src/main/resources/org/apache/tika/mime/tika-mimetypes.xml | 6 ++++++ .../src/test/resources/test-documents/testSVG_no_xml_header.svg | 4 ++++ .../src/test/java/org/apache/tika/mime/TestMimeTypes.java | 1 + 4 files changed, 13 insertions(+) diff --git a/CHANGES.txt b/CHANGES.txt index 54d14214e..85c7796a6 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -1,5 +1,7 @@ Release 2.6.1 - ??? + * Add SVG detection for svg files lacking the xml header (TIKA-3308). + * Add a JDBCPipesReporter (TIKA-3931). * Add multivalued field strategy option in jdbc-emitter (TIKA-3930). diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml index 2baa84d0e..4d1347c93 100644 --- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml +++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml @@ -5827,6 +5827,12 @@ <acronym>SVG</acronym> <_comment>Scalable Vector Graphics</_comment> <root-XML localName="svg" namespaceURI="http://www.w3.org/2000/svg"/> + <magic priority="50"> + <!-- Version of 0x0001 is PSD --> + <match value="&lt;svg" type="string" offset="0"> + <match value="http://www.w3.org/2000/svg" type="string" offset="5:256"/> + </match> + </magic> <glob pattern="*.svg"/> <glob pattern="*.svgz"/> </mime-type> diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/resources/test-documents/testSVG_no_xml_header.svg b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/resources/test-documents/testSVG_no_xml_header.svg new file mode 100644 index 000000000..0e53461be --- /dev/null +++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/resources/test-documents/testSVG_no_xml_header.svg @@ -0,0 +1,4 @@ +<svg width="1cm" height="1cm" version="1.1" xmlns="http://www.w3.org/2000/svg"> + <desc>Test SVG image</desc> + <rect x="0.1cm" y="0.1cm" width="0.8cm" height="0.8cm"/> +</svg> \ No newline at end of file diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/mime/TestMimeTypes.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/mime/TestMimeTypes.java index 1f475f2bd..d14c5eb9b 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/mime/TestMimeTypes.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/mime/TestMimeTypes.java @@ -607,6 +607,7 @@ public class TestMimeTypes { assertTypeByData("image/svg+xml", "testSVG.svg"); assertTypeByName("image/svg+xml", "x.svg"); assertTypeByName("image/svg+xml", "x.SVG"); + assertTypeByData("image/svg+xml", "testSVG_no_xml_header.svg"); // Should *.svgz be svg or gzip assertType("application/gzip", "testSVG.svgz");