This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push: new 7d48d00ac TIKA-4188 (#1587) 7d48d00ac is described below commit 7d48d00ac1febfb1ac70e4887268b28fb4951b78 Author: Tim Allison <talli...@apache.org> AuthorDate: Fri Feb 9 10:43:40 2024 -0500 TIKA-4188 (#1587) * TIKA-4188 -- add parsing for arc files --- .../detect/gzip/GZipSpecializationDetector.java | 4 ++ .../org/apache/tika/parser/warc/WARCParser.java | 14 ++++-- .../apache/tika/parser/warc/WARCParserTest.java | 31 ++++++++++++- .../test/resources/test-documents/example.arc.gz | Bin 0 -> 1027 bytes .../src/test/resources/test-documents/testARC.arc | 50 +++++++++++++++++++++ 5 files changed, 94 insertions(+), 5 deletions(-) diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/detect/gzip/GZipSpecializationDetector.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/detect/gzip/GZipSpecializationDetector.java index e3d743ad3..b87115b3b 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/detect/gzip/GZipSpecializationDetector.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/detect/gzip/GZipSpecializationDetector.java @@ -38,6 +38,8 @@ public class GZipSpecializationDetector implements Detector { public static MediaType GZ = MediaType.application("gzip"); public static MediaType WARC_GZ = MediaType.application("warc+gz"); + public static MediaType ARC_GZ = MediaType.application("arc+gz"); + @Override public MediaType detect(InputStream input, Metadata metadata) throws IOException { if (input == null) { @@ -84,6 +86,8 @@ public class GZipSpecializationDetector implements Detector { String s = new String(bytes.toByteArray(), StandardCharsets.UTF_8); if (s.startsWith("WARC/")) { return WARC_GZ; + } else if (s.startsWith("filedesc://")) { + return ARC_GZ; } return GZ; } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/main/java/org/apache/tika/parser/warc/WARCParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/main/java/org/apache/tika/parser/warc/WARCParser.java index 2c61cae91..ad4894b54 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/main/java/org/apache/tika/parser/warc/WARCParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/main/java/org/apache/tika/parser/warc/WARCParser.java @@ -49,11 +49,16 @@ import org.apache.tika.parser.Parser; import org.apache.tika.sax.XHTMLContentHandler; import org.apache.tika.utils.StringUtils; +/** + * This uses jwarc to parse warc files and arc files + */ public class WARCParser implements Parser { private static final Set<MediaType> SUPPORTED_TYPES = Collections.unmodifiableSet( new HashSet<>(Arrays.asList(MediaType.application("warc"), - MediaType.application("warc+gz")))); + MediaType.application("warc+gz"), + MediaType.application("x-internet-archive"), + MediaType.application("arc+gz")))); public static String WARC_PREFIX = "warc:"; public static String WARC_HTTP_PREFIX = WARC_PREFIX + "http:"; @@ -130,9 +135,10 @@ public class WARCParser implements Parser { setNotNull(WARC.WARC_PAYLOAD_CONTENT_TYPE, warcResponse.payloadType(), metadata); processWarcMetadata(warcResponse, metadata); processHttpResponseMetadata(warcResponse.http(), metadata); - - String id = warcResponse.id().toString(); - metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, id); + if (warcResponse.warcinfoID().isPresent()) { + String id = warcResponse.id().toString(); + metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, id); + } WarcPayload payload = optionalPayload.get(); metadata.set(WARC.WARC_RECORD_CONTENT_TYPE, payload.type().toString()); metadata.set(Metadata.CONTENT_LENGTH, Long.toString(payload.body().size())); diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/java/org/apache/tika/parser/warc/WARCParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/java/org/apache/tika/parser/warc/WARCParserTest.java index c92f8ec15..56d49aa2b 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/java/org/apache/tika/parser/warc/WARCParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/java/org/apache/tika/parser/warc/WARCParserTest.java @@ -31,7 +31,7 @@ import org.apache.tika.sax.BasicContentHandlerFactory; public class WARCParserTest extends TikaTest { - // the cc.warc.gz and gzip_extra_sl.warc.gz files come + // the cc.warc.gz and gzip_extra_sl.warc.gz and the testARC.arc files come // from the jwarc unit tests. @Test @@ -64,4 +64,33 @@ public class WARCParserTest extends TikaTest { assertEquals("application/warc", metadataList.get(0).get(Metadata.CONTENT_TYPE)); assertEquals("application/warc+gz", gzMetadataList.get(0).get(Metadata.CONTENT_TYPE)); } + + @Test + public void testARC() throws Exception { + //test file comes from: + // https://github.com/iipc/jwarc/blob/master/test/org/netpreserve/jwarc/apitests/ArcTest.java + + List<Metadata> metadataList = getRecursiveMetadata("testARC.arc", + BasicContentHandlerFactory.HANDLER_TYPE.TEXT); + assertEquals(2, metadataList.size()); + assertContains("The document has moved here", + metadataList.get(1).get(TikaCoreProperties.TIKA_CONTENT)); + assertEquals("http://www.uq.edu.au/robots.txt", + metadataList.get(1).get("warc:WARC-Target-URI")); + assertEquals("http://www.uq.edu.au/", + metadataList.get(1).get("warc:http:Location")); + } + + @Test + public void testArcGZ() throws Exception { + //test file from https://github.com/webrecorder/warcio/blob/master/test/data/example.arc.gz + List<Metadata> metadataList = getRecursiveMetadata("example.arc.gz", + BasicContentHandlerFactory.HANDLER_TYPE.TEXT); + assertEquals(2, metadataList.size()); + assertEquals("application/arc+gz", metadataList.get(0).get(Metadata.CONTENT_TYPE)); + assertContains("This domain is established", + metadataList.get(1).get(TikaCoreProperties.TIKA_CONTENT)); + + //TODO -- we should try to find an example gz with multiple arcs + } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/resources/test-documents/example.arc.gz b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/resources/test-documents/example.arc.gz new file mode 100644 index 000000000..bc959cf18 Binary files /dev/null and b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/resources/test-documents/example.arc.gz differ diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/resources/test-documents/testARC.arc b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/resources/test-documents/testARC.arc new file mode 100644 index 000000000..b7f099eb3 --- /dev/null +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/resources/test-documents/testARC.arc @@ -0,0 +1,50 @@ +filedesc://example.arc 0.0.0.0 20050614070144 text/plain 1338 +1 1 InternetArchive +URL IP-address Archive-date Content-type Archive-length +<?xml version="1.0" encoding="UTF-8" standalone="yes"?> +<arcmetadata xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:dcterms="http://purl.org/dc/terms/" xmlns:arc="http://archive.org/arc/1.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns="http://archive.org/arc/1.0/" xsi:schemaLocation="http://archive.org/arc/1.0/ http://www.archive.org/arc/1.0/arc.xsd"> +<arc:software>Heritrix 1.5.0-200506132127 http://crawler.archive.org</arc:software> +<arc:hostname>example.org</arc:hostname> +<arc:ip>127.0.0.1</arc:ip> +<dcterms:isPartOf>CRAWL</dcterms:isPartOf> +<dc:description>Example crawl</dc:description> +<arc:operator>Example</arc:operator> +<dc:publisher>Example</dc:publisher> +<dcterms:audience>Example</dcterms:audience> +<ns0:date xmlns:ns0="http://purl.org/dc/elements/1.1/" xsi:type="dcterms:W3CDTF">2005-06-14T06:37:49+00:00</ns0:date> +<arc:http-header-user-agent>Mozilla/5.0 (compatible; heritrix/1.5.0-200506132127 +http://example.org/)</arc:http-header-user-agent> +<arc:http-header-from>exam...@example.org</arc:http-header-from> +<arc:robots>classic</arc:robots> +<dc:format>ARC file version 1.1</dc:format> +<dcterms:conformsTo xsi:type="dcterms:URI">http://www.archive.org/web/researcher/ArcFileFormat.php</dcterms:conformsTo> +</arcmetadata> + +dns:www.law.gov.au 207.241.224.11 20050614070144 text/dns 55 +20050614070144 +www.law.gov.au. 6858 IN A 152.91.15.12 + +http://www.uq.edu.au/robots.txt 130.102.5.51 20050614070151 text/html 524 +HTTP/1.1 302 Found +Date: Tue, 14 Jun 2005 07:01:49 GMT +Server: Apache/1.3.28 (Unix) DAV/1.0.3 PHP/4.2.2 mod_perl/1.24_01 mod_ssl/2.8.15 OpenSSL/0.9.7c +Location: http://www.uq.edu.au/ +Connection: close +Content-Type: text/html; charset=iso-8859-1 + +<!DOCTYPE HTML PUBLIC "-//IETF//DTD HTML 2.0//EN"> +<HTML><HEAD> +<TITLE>302 Found</TITLE> +</HEAD><BODY> +<H1>Found</H1> +The document has moved <A HREF="http://www.uq.edu.au/">here</A>.<P> +<HR> +<ADDRESS>Apache/1.3.28 Server at www.uq.edu.au Port 80</ADDRESS> +</BODY></HTML> + + + + + + + +