This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4188 in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/TIKA-4188 by this push: new 6ac12b6c8 TIKA-4188 -- upgrade jwarc and add unit test 6ac12b6c8 is described below commit 6ac12b6c8bff62269e4ecb5a9f6d00f8a7495d20 Author: tallison <talli...@apache.org> AuthorDate: Fri Feb 9 09:43:28 2024 -0500 TIKA-4188 -- upgrade jwarc and add unit test --- tika-parent/pom.xml | 2 +- .../apache/tika/parser/warc/WARCParserTest.java | 12 +++- .../src/test/resources/test-documents/example.arc | 69 ---------------------- 3 files changed, 12 insertions(+), 71 deletions(-) diff --git a/tika-parent/pom.xml b/tika-parent/pom.xml index fe002e538..5f76e65c7 100644 --- a/tika-parent/pom.xml +++ b/tika-parent/pom.xml @@ -378,7 +378,7 @@ <junit5.version>5.10.2</junit5.version> <juniversalchardet.version>2.4.0</juniversalchardet.version> <junrar.version>7.5.5</junrar.version> - <jwarc.version>0.28.5</jwarc.version> + <jwarc.version>0.28.6</jwarc.version> <kafka.version>3.6.1</kafka.version> <libpst.version>0.9.3</libpst.version> <log4j2.version>2.22.1</log4j2.version> diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/java/org/apache/tika/parser/warc/WARCParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/java/org/apache/tika/parser/warc/WARCParserTest.java index bb7031550..8dc35bcf9 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/java/org/apache/tika/parser/warc/WARCParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/java/org/apache/tika/parser/warc/WARCParserTest.java @@ -18,7 +18,6 @@ package org.apache.tika.parser.warc; import static org.junit.jupiter.api.Assertions.assertEquals; -import java.io.File; import java.util.HashSet; import java.util.List; import java.util.Set; @@ -81,4 +80,15 @@ public class WARCParserTest extends TikaTest { assertEquals("http://www.uq.edu.au/", metadataList.get(1).get("warc:http:Location")); } + + @Test + public void testExampleARC() throws Exception { + //test file from https://github.com/webrecorder/warcio/blob/master/test/data/example.arc.gz + List<Metadata> metadataList = getRecursiveMetadata("example.arc.gz", + BasicContentHandlerFactory.HANDLER_TYPE.TEXT); + assertEquals(2, metadataList.size()); + assertEquals("application/arc+gz", metadataList.get(0).get(Metadata.CONTENT_TYPE)); + assertContains("This domain is established", + metadataList.get(1).get(TikaCoreProperties.TIKA_CONTENT)); + } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/resources/test-documents/example.arc b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/resources/test-documents/example.arc deleted file mode 100644 index 0d2af2bd2..000000000 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/resources/test-documents/example.arc +++ /dev/null @@ -1,69 +0,0 @@ -filedesc://live-web-example.arc.gz 127.0.0.1 20140216050221 text/plain 75 -1 0 LiveWeb Capture -URL IP-address Archive-date Content-type Archive-length - -http://example.com/ 93.184.216.119 20140216050221 text/html 1591 -HTTP/1.1 200 OK -Accept-Ranges: bytes -Cache-Control: max-age=604800 -Content-Type: text/html -Date: Sun, 16 Feb 2014 05:02:20 GMT -Etag: "359670651" -Expires: Sun, 23 Feb 2014 05:02:20 GMT -Last-Modified: Fri, 09 Aug 2013 23:54:35 GMT -Server: ECS (sjc/4FCE) -X-Cache: HIT -x-ec-custom-error: 1 -Content-Length: 1270 - -<!doctype html> -<html> -<head> - <title>Example Domain</title> - - <meta charset="utf-8" /> - <meta http-equiv="Content-type" content="text/html; charset=utf-8" /> - <meta name="viewport" content="width=device-width, initial-scale=1" /> - <style type="text/css"> - body { - background-color: #f0f0f2; - margin: 0; - padding: 0; - font-family: "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif; - - } - div { - width: 600px; - margin: 5em auto; - padding: 50px; - background-color: #fff; - border-radius: 1em; - } - a:link, a:visited { - color: #38488f; - text-decoration: none; - } - @media (max-width: 700px) { - body { - background-color: #fff; - } - div { - width: auto; - margin: 0 auto; - border-radius: 0; - padding: 1em; - } - } - </style> -</head> - -<body> -<div> - <h1>Example Domain</h1> - <p>This domain is established to be used for illustrative examples in documents. You may use this - domain in examples without prior coordination or asking for permission.</p> - <p><a href="http://www.iana.org/domains/example">More information...</a></p> -</div> -</body> -</html> -