This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-3991 in repository https://gitbox.apache.org/repos/asf/tika.git
commit 93e256523a7e50e0c9688d39dcdc1e0010c931c5 Author: tallison <talli...@apache.org> AuthorDate: Wed Mar 22 11:51:32 2023 -0400 TIKA-3991 -- add detection for Canon raw crw, cr2 and cr3 --- CHANGES.txt | 3 ++ .../org/apache/tika/mime/tika-mimetypes.xml | 36 +++++++++++++++++++++- .../java/org/apache/tika/mime/TestMimeTypes.java | 5 +-- 3 files changed, 41 insertions(+), 3 deletions(-) diff --git a/CHANGES.txt b/CHANGES.txt index 818e935c3..e4fe13726 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -1,5 +1,8 @@ Release 2.7.1 - ??? + * Fix 'image/x-raw-canon' to 'image/x-canon-crw' and add + magic detection for Canon raw file types: crw, cr2 and cr3 (TIKA-3991). + * Add detection and a parser for ActiveMime files (TIKA-3987). * Users may now avoid the ZeroByteFileException via a diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml index cb1b5d48c..e74de8a48 100644 --- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml +++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml @@ -6268,12 +6268,46 @@ <glob pattern="*.raf"/> </mime-type> - <mime-type type="image/x-raw-canon"> + <mime-type type="image/x-canon-crw"> <_comment>Canon raw image</_comment> + <magic priority="50"> + <match value="\x49\x49\x1a\x00\x00\x00HEAPCCDR" type="string" offset="0"/> + </magic> <glob pattern="*.crw"/> + </mime-type> + + <mime-type type="image/x-canon-cr2"> + <_comment>Canon raw image, version 2, TIFF-based</_comment> + <!-- basically tiff header with 'CR' at offsets 8-9; major version is at offset 10, minor at 11. + priority must be higher than tiff --> + <magic priority="60"> + <!-- MM.* = Big endian (M=Motorola) and 0x002a in big endian --> + <match value="MM\x00\x2a" type="string" offset="0"> + <match value="CR" type="string" offset="8"/> + </match> + <!-- II*. 
= Little endian (I=Intel) and 0x002a in little endian --> + <match value="II\x2a\x00" type="string" offset="0"> + <match value="CR" type="string" offset="8"/> + </match> + <!-- MM.+ = Big endian (M=Motorola) and 0x002b in big endian --> + <match value="MM\x00\x2b" type="string" offset="0"> + <match value="CR" type="string" offset="8"/> + </match> + </magic> + <sub-class-of type="image/tiff" /> <glob pattern="*.cr2"/> </mime-type> + <mime-type type="image/x-canon-cr3"> + <_comment>Canon raw image, version 3, Quicktime-based</_comment> + <glob pattern="*.cr3"/> + <!-- needs to be higher than quicktime --> + <magic priority="60"> + <match value="ftypcrx " type="string" offset="4"/> + </magic> + <sub-class-of type="video/quicktime" /> + </mime-type> + <mime-type type="image/x-raw-kodak"> <_comment>Kodak raw image</_comment> <glob pattern="*.k25"/> diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/mime/TestMimeTypes.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/mime/TestMimeTypes.java index d14c5eb9b..ea9d8d5f8 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/mime/TestMimeTypes.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/mime/TestMimeTypes.java @@ -832,8 +832,9 @@ public class TestMimeTypes { assertTypeByName("image/x-raw-adobe", "x.DNG"); assertTypeByName("image/x-raw-hasselblad", "x.3fr"); assertTypeByName("image/x-raw-fuji", "x.raf"); - assertTypeByName("image/x-raw-canon", "x.crw"); - assertTypeByName("image/x-raw-canon", "x.cr2"); + assertTypeByName("image/x-canon-crw", "x.crw"); + assertTypeByName("image/x-canon-cr2", "x.cr2"); + assertTypeByName("image/x-canon-cr3", "x.cr3"); assertTypeByName("image/x-raw-kodak", "x.k25"); assertTypeByName("image/x-raw-kodak", "x.kdc"); assertTypeByName("image/x-raw-kodak", "x.dcs");