This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4244 in repository https://gitbox.apache.org/repos/asf/tika.git
commit 6772787f97471181158e90319ee7d2a682fd6365 Author: tallison <talli...@apache.org> AuthorDate: Thu Apr 25 11:38:34 2024 -0400 TIKA-4244 -- improve ics detection --- .../main/resources/org/apache/tika/mime/tika-mimetypes.xml | 2 +- .../src/test/java/org/apache/tika/mime/TestMimeTypes.java | 4 ++++ .../resources/test-documents/testICalendar_w_prodId.ics | 13 +++++++++++++ 3 files changed, 18 insertions(+), 1 deletion(-) diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml index a1e9de0fd..09bbd963c 100644 --- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml +++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml @@ -7319,7 +7319,7 @@ <mime-type type="text/calendar"> <magic priority="50"> <match value="BEGIN:VCALENDAR" type="string" offset="0"> - <match value="VERSION:2.0" type="string" offset="15:30"/> + <match value="\nVERSION:2.0" type="string" offset="15:360"/> </match> </magic> <glob pattern="*.ics"/> diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/mime/TestMimeTypes.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/mime/TestMimeTypes.java index cd6705b69..a988c440e 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/mime/TestMimeTypes.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/mime/TestMimeTypes.java @@ -1235,6 +1235,10 @@ public class TestMimeTypes { assertType("text/x-vcalendar", "testVCalendar.vcs"); assertTypeByData("text/calendar", "testICalendar.ics"); assertTypeByData("text/x-vcalendar", "testVCalendar.vcs"); + //TIKA-4244 + //this tests detection with content intervening between the BEGIN:VCALENDAR and the VERSION:2.0 entry + assertType("text/calendar", "testICalendar_w_prodId.ics"); + assertTypeByData("text/calendar", "testICalendar_w_prodId.ics"); } @Test diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/test-documents/testICalendar_w_prodId.ics b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/test-documents/testICalendar_w_prodId.ics new file mode 100644 index 000000000..0af25fc46 --- /dev/null +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/test-documents/testICalendar_w_prodId.ics @@ -0,0 +1,13 @@ +BEGIN:VCALENDAR +PRODID:-//Example Corp//iCalendar Export//EN +VERSION:2.0 +BEGIN:VEVENT +UID:1234567...@example.com +DTSTAMP:20240101T080000Z +DTSTART:20240101T100000Z +DTEND:20240101T120000Z +SUMMARY:Sample HTML Event +DESCRIPTION:This is a sample event with an HTML description. +X-ALT-DESC;FMTTYPE=text/html:<html><body><h1>Sample HTML Event</h1><p>This is a sample event with an <strong>HTML</strong> description.</p></body></html> +END:VEVENT +END:VCALENDAR