This is an automated email from the ASF dual-hosted git repository. rzo1 pushed a commit to branch fix_issues_from_ipmc_release_vote in repository https://gitbox.apache.org/repos/asf/incubator-stormcrawler.git
commit 553fb584a9ca04bc40f1b2ec71094b7cf85f4e77 Author: Richard Zowalla <[email protected]> AuthorDate: Fri Nov 22 08:13:54 2024 +0100 Remove references to digitalpebble.com (README, HTML) --- README.md | 2 - .../stormcrawler/bolt/FeedParserBoltTest.java | 2 +- .../stormcrawler/bolt/JSoupParserBoltTest.java | 16 +-- .../stormcrawler/bolt/SiteMapParserBoltTest.java | 18 +-- .../stormcrawler/indexer/BasicIndexingTest.java | 18 +-- .../apache/stormcrawler/json/JsoupFilterTest.java | 6 +- .../stormcrawler/jsoup/JSoupFiltersTest.java | 12 +- .../stormcrawler/parse/DuplicateLinksTest.java | 2 +- .../parse/filter/CSVMetadataFilterTest.java | 4 +- .../parse/filter/SubDocumentsFilterTest.java | 2 +- .../stormcrawler/parse/filter/XPathFilterTest.java | 8 +- core/src/test/resources/digitalpebble.com.html | 156 --------------------- .../test/resources/stormcrawler.apache.org.html | 151 ++++++++++++++++++++ 13 files changed, 195 insertions(+), 202 deletions(-) diff --git a/README.md b/README.md index c8699555..5596019e 100644 --- a/README.md +++ b/README.md @@ -13,8 +13,6 @@ NOTE: These instructions assume that you have [Apache Maven](https://maven.apach StormCrawler requires Java 11 or above. To execute tests, it requires you to have a locally installed and working Docker environment. -DigitalPebble's [Ansible-Storm](https://github.com/DigitalPebble/ansible-storm) repository contains resources to install Apache Storm using Ansible. Alternatively, this [stormcrawler-docker](https://github.com/DigitalPebble/stormcrawler-docker) project should help you run Apache Storm on Docker. 
- Once Storm is installed, the easiest way to get started is to generate a new StormCrawler project following the instructions below: ```shell diff --git a/core/src/test/java/org/apache/stormcrawler/bolt/FeedParserBoltTest.java b/core/src/test/java/org/apache/stormcrawler/bolt/FeedParserBoltTest.java index d1e0f137..c2677529 100644 --- a/core/src/test/java/org/apache/stormcrawler/bolt/FeedParserBoltTest.java +++ b/core/src/test/java/org/apache/stormcrawler/bolt/FeedParserBoltTest.java @@ -86,7 +86,7 @@ class FeedParserBoltTest extends ParsingTester { void testNonFeedParsing() throws IOException { prepareParserBolt("test.parsefilters.json"); // do not specify that it is a feed file - parse("http://www.digitalpebble.com", "digitalpebble.com.html", new Metadata()); + parse("http://stormcrawler.apache.org", "stormcrawler.apache.org.html", new Metadata()); Assertions.assertEquals(1, output.getEmitted().size()); } } diff --git a/core/src/test/java/org/apache/stormcrawler/bolt/JSoupParserBoltTest.java b/core/src/test/java/org/apache/stormcrawler/bolt/JSoupParserBoltTest.java index 68e39047..f7a6d614 100644 --- a/core/src/test/java/org/apache/stormcrawler/bolt/JSoupParserBoltTest.java +++ b/core/src/test/java/org/apache/stormcrawler/bolt/JSoupParserBoltTest.java @@ -119,7 +119,7 @@ class JSoupParserBoltTest extends ParsingTester { void testNoScriptInText() throws IOException { bolt.prepare( new HashMap(), TestUtil.getMockedTopologyContext(), new OutputCollector(output)); - parse("http://www.digitalpebble.com", "digitalpebble.com.html"); + parse("http://stormcrawler.apache.org", "stormcrawler.apache.org.html"); List<Object> parsedTuple = output.getEmitted().remove(0); // check in the metadata that the values match String text = (String) parsedTuple.get(3); @@ -133,9 +133,9 @@ class JSoupParserBoltTest extends ParsingTester { void testNoFollowOutlinks() throws IOException { bolt.prepare( new HashMap(), TestUtil.getMockedTopologyContext(), new OutputCollector(output)); - 
parse("http://www.digitalpebble.com", "digitalpebble.com.html"); + parse("http://stormcrawler.apache.org", "stormcrawler.apache.org.html"); List<List<Object>> statusTuples = output.getEmitted(Constants.StatusStreamName); - Assertions.assertEquals(10, statusTuples.size()); + Assertions.assertEquals(25, statusTuples.size()); } @Test @@ -144,7 +144,7 @@ class JSoupParserBoltTest extends ParsingTester { new HashMap(), TestUtil.getMockedTopologyContext(), new OutputCollector(output)); Metadata metadata = new Metadata(); metadata.setValues("X-Robots-Tag", new String[] {"noindex", "nofollow"}); - parse("http://www.digitalpebble.com", "digitalpebble.com.html", metadata); + parse("http://stormcrawler.apache.org", "stormcrawler.apache.org.html", metadata); List<List<Object>> statusTuples = output.getEmitted(Constants.StatusStreamName); // no outlinks at all Assertions.assertEquals(0, statusTuples.size()); @@ -170,7 +170,7 @@ class JSoupParserBoltTest extends ParsingTester { new HashMap(), TestUtil.getMockedTopologyContext(), new OutputCollector(output)); for (int i = 0; i < tests.length; i++) { byte[] bytes = tests[i].getBytes(StandardCharsets.UTF_8); - parse("http://www.digitalpebble.com", bytes, new Metadata()); + parse("http://stormcrawler.apache.org", bytes, new Metadata()); Assertions.assertEquals(1, output.getEmitted().size()); List<Object> parsedTuple = output.getEmitted().remove(0); // check in the metadata that the values match @@ -205,7 +205,7 @@ class JSoupParserBoltTest extends ParsingTester { void testExecuteWithOutlinksLimit() throws IOException { stormConf.put("parser.emitOutlinks.max.per.page", 5); bolt.prepare(stormConf, TestUtil.getMockedTopologyContext(), new OutputCollector(output)); - parse("http://www.digitalpebble.com", "digitalpebble.com.html"); + parse("http://stormcrawler.apache.org", "stormcrawler.apache.org.html"); List<List<Object>> statusTuples = output.getEmitted(Constants.StatusStreamName); // outlinks being limited by property 
Assertions.assertEquals(5, statusTuples.size()); @@ -215,10 +215,10 @@ class JSoupParserBoltTest extends ParsingTester { void testExecuteWithOutlinksLimitDisabled() throws IOException { stormConf.put("parser.emitOutlinks.max.per.page", -1); bolt.prepare(stormConf, TestUtil.getMockedTopologyContext(), new OutputCollector(output)); - parse("http://www.digitalpebble.com", "digitalpebble.com.html"); + parse("http://stormcrawler.apache.org", "stormcrawler.apache.org.html"); List<List<Object>> statusTuples = output.getEmitted(Constants.StatusStreamName); // outlinks NOT being limited by property, since is disabled with -1 - Assertions.assertEquals(10, statusTuples.size()); + Assertions.assertEquals(25, statusTuples.size()); } @Test diff --git a/core/src/test/java/org/apache/stormcrawler/bolt/SiteMapParserBoltTest.java b/core/src/test/java/org/apache/stormcrawler/bolt/SiteMapParserBoltTest.java index 725028c4..d96ce5f6 100644 --- a/core/src/test/java/org/apache/stormcrawler/bolt/SiteMapParserBoltTest.java +++ b/core/src/test/java/org/apache/stormcrawler/bolt/SiteMapParserBoltTest.java @@ -52,7 +52,7 @@ class SiteMapParserBoltTest extends ParsingTester { metadata.setValue(SiteMapParserBolt.isSitemapKey, "true"); // and its mime-type metadata.setValue(HttpHeaders.CONTENT_TYPE, "application/xml"); - parse("http://www.digitalpebble.com/sitemap.xml", "digitalpebble.sitemap.xml", metadata); + parse("http://stormcrawler.apache.org/sitemap.xml", "digitalpebble.sitemap.xml", metadata); Assertions.assertEquals(6, output.getEmitted(Constants.StatusStreamName).size()); // TODO test that the new links have the right metadata List<Object> fields = output.getEmitted(Constants.StatusStreamName).get(0); @@ -100,7 +100,7 @@ class SiteMapParserBoltTest extends ParsingTester { // and its mime-type metadata.setValue(HttpHeaders.CONTENT_TYPE, "application/xml"); parse( - "http://www.digitalpebble.com/sitemap.xml", + "http://stormcrawler.apache.org/sitemap.xml", 
"digitalpebble.sitemap.extensions.image.xml", metadata); Values values = (Values) output.getEmitted(Constants.StatusStreamName).get(0); @@ -119,7 +119,7 @@ class SiteMapParserBoltTest extends ParsingTester { // and its mime-type metadata.setValue(HttpHeaders.CONTENT_TYPE, "application/xml"); parse( - "http://www.digitalpebble.com/sitemap.xml", + "http://stormcrawler.apache.org/sitemap.xml", "digitalpebble.sitemap.extensions.mobile.xml", metadata); Values values = (Values) output.getEmitted(Constants.StatusStreamName).get(0); @@ -138,7 +138,7 @@ class SiteMapParserBoltTest extends ParsingTester { // and its mime-type metadata.setValue(HttpHeaders.CONTENT_TYPE, "application/xml"); parse( - "http://www.digitalpebble.com/sitemap.xml", + "http://stormcrawler.apache.org/sitemap.xml", "digitalpebble.sitemap.extensions.links.xml", metadata); Values values = (Values) output.getEmitted(Constants.StatusStreamName).get(0); @@ -157,7 +157,7 @@ class SiteMapParserBoltTest extends ParsingTester { // and its mime-type metadata.setValue(HttpHeaders.CONTENT_TYPE, "application/xml"); parse( - "http://www.digitalpebble.com/sitemap.xml", + "http://stormcrawler.apache.org/sitemap.xml", "digitalpebble.sitemap.extensions.news.xml", metadata); Values values = (Values) output.getEmitted(Constants.StatusStreamName).get(0); @@ -176,7 +176,7 @@ class SiteMapParserBoltTest extends ParsingTester { // and its mime-type metadata.setValue(HttpHeaders.CONTENT_TYPE, "application/xml"); parse( - "http://www.digitalpebble.com/sitemap.xml", + "http://stormcrawler.apache.org/sitemap.xml", "digitalpebble.sitemap.extensions.video.xml", metadata); Values values = (Values) output.getEmitted(Constants.StatusStreamName).get(0); @@ -202,7 +202,7 @@ class SiteMapParserBoltTest extends ParsingTester { // and its mime-type metadata.setValue(HttpHeaders.CONTENT_TYPE, "application/xml"); parse( - "http://www.digitalpebble.com/sitemap.xml", + "http://stormcrawler.apache.org/sitemap.xml", 
"digitalpebble.sitemap.extensions.all.xml", metadata); Values values = (Values) output.getEmitted(Constants.StatusStreamName).get(0); @@ -237,7 +237,7 @@ class SiteMapParserBoltTest extends ParsingTester { Metadata metadata = new Metadata(); // do not specify that it is a sitemap file // do not set the mimetype - parse("http://www.digitalpebble.com/sitemap.xml", "digitalpebble.sitemap.xml", metadata); + parse("http://stormcrawler.apache.org/sitemap.xml", "digitalpebble.sitemap.xml", metadata); Assertions.assertEquals(6, output.getEmitted(Constants.StatusStreamName).size()); // TODO test that the new links have the right metadata List<Object> fields = output.getEmitted(Constants.StatusStreamName).get(0); @@ -248,7 +248,7 @@ class SiteMapParserBoltTest extends ParsingTester { void testNonSitemapParsing() throws IOException { prepareParserBolt("test.parsefilters.json"); // do not specify that it is a sitemap file - parse("http://www.digitalpebble.com", "digitalpebble.com.html", new Metadata()); + parse("http://stormcrawler.apache.org", "stormcrawler.apache.org.html", new Metadata()); Assertions.assertEquals(1, output.getEmitted().size()); } diff --git a/core/src/test/java/org/apache/stormcrawler/indexer/BasicIndexingTest.java b/core/src/test/java/org/apache/stormcrawler/indexer/BasicIndexingTest.java index f47c2f16..9b73fc26 100644 --- a/core/src/test/java/org/apache/stormcrawler/indexer/BasicIndexingTest.java +++ b/core/src/test/java/org/apache/stormcrawler/indexer/BasicIndexingTest.java @@ -29,7 +29,7 @@ import org.junit.jupiter.api.Test; class BasicIndexingTest extends IndexerTester { - private static final String URL = "http://www.digitalpebble.com"; + private static final String URL = "http://stormcrawler.apache.org"; @BeforeEach void setupIndexerBolt() { @@ -55,12 +55,12 @@ class BasicIndexingTest extends IndexerTester { config.put(AbstractIndexerBolt.urlFieldParamName, "url"); config.put(AbstractIndexerBolt.canonicalMetadataParamName, "canonical"); Metadata 
metadata = new Metadata(); - metadata.setValue("canonical", "http://www.digitalpebble.com/"); + metadata.setValue("canonical", "http://stormcrawler.apache.org/"); prepareIndexerBolt(config); index(URL, metadata); Map<String, String> fields = ((DummyIndexer) bolt).returnFields(); Assertions.assertEquals( - "http://www.digitalpebble.com/", + "http://stormcrawler.apache.org/", fields.get("url"), "Use the canonical URL if found"); } @@ -76,7 +76,7 @@ class BasicIndexingTest extends IndexerTester { index(URL, metadata); Map<String, String> fields = ((DummyIndexer) bolt).returnFields(); Assertions.assertEquals( - "http://www.digitalpebble.com/home", + "http://stormcrawler.apache.org/home", fields.get("url"), "Use the canonical URL if found"); } @@ -92,7 +92,7 @@ class BasicIndexingTest extends IndexerTester { index(URL, metadata); Map<String, String> fields = ((DummyIndexer) bolt).returnFields(); Assertions.assertEquals( - "http://www.digitalpebble.com", + "http://stormcrawler.apache.org", fields.get("url"), "Use the default URL if a bad canonical URL is found"); } @@ -108,7 +108,7 @@ class BasicIndexingTest extends IndexerTester { index(URL, metadata); Map<String, String> fields = ((DummyIndexer) bolt).returnFields(); Assertions.assertEquals( - "http://www.digitalpebble.com", + "http://stormcrawler.apache.org", fields.get("url"), "Ignore if the canonical URL references other host"); } @@ -118,12 +118,12 @@ class BasicIndexingTest extends IndexerTester { Map config = new HashMap(); config.put(AbstractIndexerBolt.urlFieldParamName, "url"); Metadata metadata = new Metadata(); - metadata.setValue("canonical", "http://www.digitalpebble.com/"); + metadata.setValue("canonical", "http://stormcrawler.apache.org/"); prepareIndexerBolt(config); index(URL, metadata); Map<String, String> fields = ((DummyIndexer) bolt).returnFields(); Assertions.assertEquals( - "http://www.digitalpebble.com", + "http://stormcrawler.apache.org", fields.get("url"), "Use the canonical URL if found"); } 
@@ -139,7 +139,7 @@ class BasicIndexingTest extends IndexerTester { index(URL, metadata); Map<String, String> fields = ((DummyIndexer) bolt).returnFields(); Assertions.assertEquals( - "http://www.digitalpebble.com", + "http://stormcrawler.apache.org", fields.get("url"), "The document must pass if the key/value is found in the metadata"); } diff --git a/core/src/test/java/org/apache/stormcrawler/json/JsoupFilterTest.java b/core/src/test/java/org/apache/stormcrawler/json/JsoupFilterTest.java index dc89f38d..5a56d655 100644 --- a/core/src/test/java/org/apache/stormcrawler/json/JsoupFilterTest.java +++ b/core/src/test/java/org/apache/stormcrawler/json/JsoupFilterTest.java @@ -45,7 +45,7 @@ class JsoupFilterTest extends ParsingTester { @Test void testLDJsonExtraction() throws IOException { prepareParserBolt("test.jsoupfilters.json"); - parse("http://www.digitalpebble.com", "digitalpebble.com.html"); + parse("http://stormcrawler.apache.org", "stormcrawler.apache.org.html"); Assertions.assertEquals(1, output.getEmitted().size()); List<Object> parsedTuple = output.getEmitted().get(0); Metadata metadata = (Metadata) parsedTuple.get(2); @@ -57,9 +57,9 @@ class JsoupFilterTest extends ParsingTester { @Test void testLinkFilter() throws IOException { prepareParserBolt("test.jsoupfilters.json"); - parse("http://www.digitalpebble.com", "digitalpebble.com.html"); + parse("http://stormcrawler.apache.org", "stormcrawler.apache.org.html"); List<List<Object>> status = output.getEmitted("status"); - Assertions.assertEquals(16, status.size()); + Assertions.assertEquals(31, status.size()); List<Object> parsedTuple = status.get(0); parsedTuple.toArray(); } diff --git a/core/src/test/java/org/apache/stormcrawler/jsoup/JSoupFiltersTest.java b/core/src/test/java/org/apache/stormcrawler/jsoup/JSoupFiltersTest.java index 9de60371..de433d1c 100644 --- a/core/src/test/java/org/apache/stormcrawler/jsoup/JSoupFiltersTest.java +++ 
b/core/src/test/java/org/apache/stormcrawler/jsoup/JSoupFiltersTest.java @@ -46,7 +46,7 @@ class JSoupFiltersTest extends ParsingTester { @Test void testBasicExtraction() throws IOException { prepareParserBolt("test.jsoupfilters.json"); - parse("http://www.digitalpebble.com", "digitalpebble.com.html"); + parse("http://stormcrawler.apache.org", "stormcrawler.apache.org.html"); Assertions.assertEquals(1, output.getEmitted().size()); List<Object> parsedTuple = output.getEmitted().get(0); Metadata metadata = (Metadata) parsedTuple.get(2); @@ -61,7 +61,7 @@ class JSoupFiltersTest extends ParsingTester { // https://github.com/DigitalPebble/storm-crawler/issues/219 void testScriptExtraction() throws IOException { prepareParserBolt("test.jsoupfilters.json"); - parse("http://www.digitalpebble.com", "digitalpebble.com.html"); + parse("http://stormcrawler.apache.org", "stormcrawler.apache.org.html"); Assertions.assertEquals(1, output.getEmitted().size()); List<Object> parsedTuple = output.getEmitted().get(0); Metadata metadata = (Metadata) parsedTuple.get(2); @@ -71,13 +71,13 @@ class JSoupFiltersTest extends ParsingTester { // should be 2 of them Assertions.assertEquals(2, scripts.length); Assertions.assertEquals("", scripts[0].trim()); - Assertions.assertTrue(scripts[1].contains("urchinTracker();")); + Assertions.assertTrue(scripts[1].contains("_paq")); } @Test void testLDJsonExtraction() throws IOException { prepareParserBolt("test.jsoupfilters.json"); - parse("http://www.digitalpebble.com", "digitalpebble.com.html"); + parse("http://stormcrawler.apache.org", "stormcrawler.apache.org.html"); Assertions.assertEquals(1, output.getEmitted().size()); List<Object> parsedTuple = output.getEmitted().get(0); Metadata metadata = (Metadata) parsedTuple.get(2); @@ -89,7 +89,7 @@ class JSoupFiltersTest extends ParsingTester { @Test void testExtraLink() throws IOException { prepareParserBolt("test.jsoupfilters.json"); - parse("http://www.digitalpebble.com", "digitalpebble.com.html"); - 
Assertions.assertEquals(16, output.getEmitted("status").size()); + parse("http://stormcrawler.apache.org", "stormcrawler.apache.org.html"); + Assertions.assertEquals(31, output.getEmitted("status").size()); } } diff --git a/core/src/test/java/org/apache/stormcrawler/parse/DuplicateLinksTest.java b/core/src/test/java/org/apache/stormcrawler/parse/DuplicateLinksTest.java index 9483d169..28835355 100644 --- a/core/src/test/java/org/apache/stormcrawler/parse/DuplicateLinksTest.java +++ b/core/src/test/java/org/apache/stormcrawler/parse/DuplicateLinksTest.java @@ -44,7 +44,7 @@ class DuplicateLinksTest extends ParsingTester { config.put("urlfilters.config.file", "basicurlnormalizer.json"); bolt.prepare(config, TestUtil.getMockedTopologyContext(), new OutputCollector(output)); Metadata metadata = new Metadata(); - parse("http://www.digitalpebble.com/duplicates.html", "duplicateLinks.html", metadata); + parse("http://stormcrawler.apache.org/duplicates.html", "duplicateLinks.html", metadata); Assertions.assertEquals(1, output.getEmitted(Constants.StatusStreamName).size()); } } diff --git a/core/src/test/java/org/apache/stormcrawler/parse/filter/CSVMetadataFilterTest.java b/core/src/test/java/org/apache/stormcrawler/parse/filter/CSVMetadataFilterTest.java index 79d2ec5b..d460c6a5 100644 --- a/core/src/test/java/org/apache/stormcrawler/parse/filter/CSVMetadataFilterTest.java +++ b/core/src/test/java/org/apache/stormcrawler/parse/filter/CSVMetadataFilterTest.java @@ -36,13 +36,13 @@ class CSVMetadataFilterTest extends ParsingTester { @Test void testMultivalued() throws IOException { prepareParserBolt("test.parsefilters.json"); - parse("http://www.digitalpebble.com", "digitalpebble.com.html"); + parse("http://stormcrawler.apache.org", "stormcrawler.apache.org.html"); Assertions.assertEquals(1, output.getEmitted().size()); List<Object> parsedTuple = output.getEmitted().get(0); Metadata metadata = (Metadata) parsedTuple.get(2); Assertions.assertNotNull(metadata); String[] kws = 
metadata.getValues("keywords"); Assertions.assertNotNull(kws); - Assertions.assertEquals(12, kws.length); + Assertions.assertEquals(8, kws.length); } } diff --git a/core/src/test/java/org/apache/stormcrawler/parse/filter/SubDocumentsFilterTest.java b/core/src/test/java/org/apache/stormcrawler/parse/filter/SubDocumentsFilterTest.java index d0eb1f18..408d8503 100644 --- a/core/src/test/java/org/apache/stormcrawler/parse/filter/SubDocumentsFilterTest.java +++ b/core/src/test/java/org/apache/stormcrawler/parse/filter/SubDocumentsFilterTest.java @@ -40,7 +40,7 @@ class SubDocumentsFilterTest extends ParsingTester { config.put("detect.mimetype", false); prepareParserBolt("test.subdocfilter.json", config); Metadata metadata = new Metadata(); - parse("http://www.digitalpebble.com/sitemap.xml", "digitalpebble.sitemap.xml", metadata); + parse("http://stormcrawler.apache.org/sitemap.xml", "digitalpebble.sitemap.xml", metadata); Assertions.assertEquals(6, output.getEmitted().size()); } } diff --git a/core/src/test/java/org/apache/stormcrawler/parse/filter/XPathFilterTest.java b/core/src/test/java/org/apache/stormcrawler/parse/filter/XPathFilterTest.java index 4a8c49a0..a15e0833 100644 --- a/core/src/test/java/org/apache/stormcrawler/parse/filter/XPathFilterTest.java +++ b/core/src/test/java/org/apache/stormcrawler/parse/filter/XPathFilterTest.java @@ -36,7 +36,7 @@ class XPathFilterTest extends ParsingTester { @Test void testBasicExtraction() throws IOException { prepareParserBolt("test.parsefilters.json"); - parse("http://www.digitalpebble.com", "digitalpebble.com.html"); + parse("http://stormcrawler.apache.org", "stormcrawler.apache.org.html"); Assertions.assertEquals(1, output.getEmitted().size()); List<Object> parsedTuple = output.getEmitted().get(0); Metadata metadata = (Metadata) parsedTuple.get(2); @@ -51,7 +51,7 @@ class XPathFilterTest extends ParsingTester { // https://github.com/DigitalPebble/storm-crawler/issues/219 void testScriptExtraction() throws IOException { 
prepareParserBolt("test.parsefilters.json"); - parse("http://www.digitalpebble.com", "digitalpebble.com.html"); + parse("http://stormcrawler.apache.org", "stormcrawler.apache.org.html"); Assertions.assertEquals(1, output.getEmitted().size()); List<Object> parsedTuple = output.getEmitted().get(0); Metadata metadata = (Metadata) parsedTuple.get(2); @@ -61,13 +61,13 @@ class XPathFilterTest extends ParsingTester { // should be 2 of them Assertions.assertEquals(2, scripts.length); Assertions.assertEquals("", scripts[0].trim()); - Assertions.assertTrue(scripts[1].contains("urchinTracker();")); + Assertions.assertTrue(scripts[1].contains("_paq")); } @Test void testLDJsonExtraction() throws IOException { prepareParserBolt("test.parsefilters.json"); - parse("http://www.digitalpebble.com", "digitalpebble.com.html"); + parse("http://stormcrawler.apache.org", "stormcrawler.apache.org.html"); Assertions.assertEquals(1, output.getEmitted().size()); List<Object> parsedTuple = output.getEmitted().get(0); Metadata metadata = (Metadata) parsedTuple.get(2); diff --git a/core/src/test/resources/digitalpebble.com.html b/core/src/test/resources/digitalpebble.com.html deleted file mode 100644 index 68332164..00000000 --- a/core/src/test/resources/digitalpebble.com.html +++ /dev/null @@ -1,156 +0,0 @@ -<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"> -<!-- -Licensed to the Apache Software Foundation (ASF) under one -or more contributor license agreements. See the NOTICE file -distributed with this work for additional information -regarding copyright ownership. The ASF licenses this file -to you under the Apache License, Version 2.0 (the -"License"); you may not use this file except in compliance -with the License. 
You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, -software distributed under the License is distributed on an -"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -KIND, either express or implied. See the License for the -specific language governing permissions and limitations -under the License. ---> -<html> -<head> - <!-- #BeginTemplate "/Templates/model3.dwt" --> - <meta name="description" - content="DigitalPebble Ltd is a consultancy specialised in web crawling, natural language processing, information retrieval and extraction. Our expertise is based on open source solutions, such as Nutch, Gate or SOLR."> - <meta name="keywords" content="crawl, gate, consultant, consultancy, consulting, information extraction, information retrieval, NLP, IR, IE, nutch, solr"> - <link rel="icon" href="img/favicon.ico" type="image/vnd.microsoft.icon"> - <link type="text/css" href="style.css" rel="stylesheet"> - <meta name="google-site-verification" content="ZNIbylXN61hwJhB39tK17-u7RsU5kgiHXWbQ5F7lrNc" /> - <!-- #BeginEditable "doctitle" --> - <title>DigitalPebble Ltd - Open Source Solutions for Text Engineering</title> - <!-- #EndEditable --> -</head> - -<script type="application/ld+json"> -{ - "@context": "http://schema.org", - "type": "Organization", - "email": "[email protected]", - "telephone": "(+44)7758085585", - "logo": "http://digitalpebble.com/img/logo.gif", - "location": { - "type": "PostalAddress", - "addressCountry": "United Kingdom", - "addressLocality": "Bristol, Avon,", - "postalCode": "BS7 8ET", - "streetAddress": "16 Codrington Road" - }, - "url": "http://digitalpebble.com/" -} -</script> - -<a rel="nofollow" href="inexistent.html"/> -<a rel="nofollow somevalue" href="another_inexistent.html"/> - -<body style="color: rgb(0, 0, 0); background-color: rgb(255, 255, 255);" -alink="#000000" link="#000000" vlink="#000000"> - -<table align="center" border="0" 
cellspacing="0" width="70%"> - <tbody> - <tr> - <td valign="bottom"><img src="img/logo.gif" alt="digitalpebble" - align="bottom" height="60" width="310"></td> - </tr> - <tr> - <td> </td> - </tr> - <tr> - <td align="left" valign="middle"><!-- #BeginEditable "menu" --> - - <table border="0" cellpadding="0" cellspacing="0"> - <tbody> - <tr> - <td><a href="index.html"><img name="Home" - src="./img/menu/home2.png" border="0"></a></td> - <td> </td> - <td><a href="solutions.html"><img name="Solutions" - src="./img/menu/solutions1.png" border="0"></a></td> - <td> </td> - <td><a href="references.html"><img name="Clients" - src="./img/menu/clients1.png" border="0"></a></td> - <td> </td> - <td><a href="contact.html"><img src="img/menu/contact1.png" - style="border: 0px solid ; width: 108px; height: 32px;" - alt="" name="Contact" onload=""></a></td> - <td> </td> - </tr> - </tbody> - </table> - <!-- #EndEditable --> - </td> - </tr> - <tr> - <td> - <div id="tabs"> - </div> - </td> - </tr> - </tbody> -</table> - -<table align="center" border="0" cellspacing="0" width="70%"> - <!-- #BeginEditable "crumbs" --> - <tbody> - <tr> - <td colspan="3" align="left" valign="top"> </td> - </tr> - <!-- #EndEditable --> - <tr> - <td valign="top" width="270"><!-- #BeginEditable "picture" --> - <img src="img/small5.jpg" - alt="" - height="182" width="255"><!-- #EndEditable --> - </td> - <td valign="top" width="50"> </td> - <td align="left" valign="top" width="*"><!-- #BeginEditable "text" --> - <p class="aligned"><span class="concept">DigitalPebble Ltd</span> - is a consultancy and solution provider specialising in web crawling, natural language processing, - document retrieval and information extraction.</p> - - <p class="aligned">We advise, evaluate and implement solutions based - on leading <a href="solutions.html">open source solutions</a>, such - as <a href="http://nutch.apache.org/">Apache Nutch</a>, - <a href="http://gate.ac.uk">GATE</a> or <a 
href="http://lucene.apache.org/solr">SOLR</a>. We aim to combine open - source tools to provide efficient, reliable and low cost - made-to-order solutions.</p> - - <p class="aligned">Our unique expertise covers all aspects of - documents life cycle, from web-wide crawling and collection, content - analysis, filtering and categorization to indexing. We are - specialised in large scale processing using <a - href="http://hadoop.apache.org/">Hadoop</a> or <a href="http://storm.apache.org/">Storm</a> and have expertise in cloud platforms such as Amazon AWS, which has allowed - us to successfully deploy solutions scaling up to billions of documents for our <a href="references.html">clients</a>. </p> - - <p class="aligned">Not only to we have an extensive knowledge of open source solutions, we are also active contributors and - provide some of the <a href="https://github.com/DigitalPebble">resources</a> that we have developed over the years under open source licenses. - </p> - - <p class="aligned">Our <a href="references.html">clients</a> range from startup in stealth mode to NASDAQ listed companies and operate in domains as varied as business intelligence, media monitoring, - telecommunications or software development. - </p> - - </td> - </td> - - </tr> - </tbody> -</table> -<script type="text/javascript" -src="http://www.google-analytics.com/urchin.js"> -</script> -<script type="text/javascript"> -_uacct = "UA-357582-1"; -urchinTracker();</script> -<!-- #EndTemplate --> -</body> -</html> diff --git a/core/src/test/resources/stormcrawler.apache.org.html b/core/src/test/resources/stormcrawler.apache.org.html new file mode 100644 index 00000000..9d26cce2 --- /dev/null +++ b/core/src/test/resources/stormcrawler.apache.org.html @@ -0,0 +1,151 @@ +<!DOCTYPE html> +<!-- +Licensed to the Apache Software Foundation (ASF) under one +or more contributor license agreements. See the NOTICE file +distributed with this work for additional information +regarding copyright ownership. 
The ASF licenses this file +to you under the Apache License, Version 2.0 (the +"License"); you may not use this file except in compliance +with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, +software distributed under the License is distributed on an +"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +KIND, either express or implied. See the License for the +specific language governing permissions and limitations +under the License. +--> +<html> + +<head> + <meta charset="utf-8"> + <meta http-equiv="X-UA-Compatible" content="IE=edge"> + <meta name="viewport" content="width=device-width, initial-scale=1"> + + <title>Apache StormCrawler (Incubating)</title> + <meta name="description" content="Apache StormCrawler (Incubating) is collection of resources for building low-latency, scalable web crawlers on Apache Storm"> + <meta name="keywords" content="crawl, information extraction, information retrieval, NLP, IR, IE, nutch, solr"> + <link rel="stylesheet" href="/css/main.css"> + <link rel="canonical" href="https://stormcrawler.apache.org/"> + <link rel="alternate" type="application/rss+xml" title="Apache StormCrawler (Incubating)" href="https://stormcrawler.apache.org/feed.xml"> + <link rel="icon" type="/image/png" href="/img/favicon.png" /> +</head> + +<script type="application/ld+json"> + { + "@context": "http://schema.org", + "type": "Organization", + "email": "[email protected]", + "location": { + "type": "PostalAddress", + "addressCountry": "U.S.A", + "addressLocality": "Wilmington", + "streetAddress": "1000 N West Street, Suite 1200" + }, + "url": "http://stormcrawler.apache.org/" + } +</script> + +<body class="home"> + +<header class="site-header"> + <div class="site-header__wrap"> + <div class="site-header__logo"> + <a href="/"><img src="/img/incubator_logo.png" alt="Apache StormCrawler (Incubating)"></a> + </div> + </div> +</header> +<nav 
class="site-nav"> + <ul> + <li><a href="/index.html">Home</a> + <li><a href="/download/index.html">Download</a> + <li><a href="https://github.com/apache/incubator-stormcrawler">Source Code</a></li> + <li><a href="/getting-started/">Getting Started</a></li> + <li><a href="https://javadoc.io/doc/org.apache.stormcrawler/stormcrawler-core/3.1.0/index.html">JavaDocs</a> + <li><a href="/faq/">FAQ</a></li> + <li><a href="/support/">Support</a></li> + </ul> +</nav> + + +<main class="main-content"> + <div class="page-title"> + <h1>A collection of resources for building low-latency, scalable web crawlers on Apache Storm®</h1> + </div> + </div> + <div class="row row-col"> + <p><strong><span class="concept">Apache StormCrawler (Incubating)</span></strong> is an open source SDK for building distributed web crawlers based on <a href="http://storm.apache.org">Apache Storm®</a>. The project is under Apache license v2 and consists of a collection of reusable resources and components, written mostly in Java.</p> + <p>The aim of Apache StormCrawler (Incubating) is to help build web crawlers that are :</p> + <ul> + <li>scalable</li> + <li>resilient</li> + <li>low latency</li> + <li>easy to extend</li> + <li>polite yet efficient</li> + </ul> + <p><strong>Apache StormCrawler (Incubating)</strong> is a library and collection of resources that developers can leverage to build their own crawlers. The good news is that doing so can be pretty straightforward! 
Have a look at the <a href="getting-started/">Getting Started</a> section for more details.</p> + <p>Apart from the core components, we provide some <a href="https://github.com/apache/incubator-stormcrawler/tree/main/external">external resources</a> that you can reuse in your project, like for instance our spout and bolts for <a href="https://opensearch.org/">OpenSearch®</a> or a ParserBolt which uses <a href="http://tika.apache.org">Apache Tika®</a> to parse various document formats.</p> + <p><strong>Apache StormCrawler (Incubating)</strong> is perfectly suited to use cases where the URLs to fetch and parse come as streams but is also an appropriate solution for large scale recursive crawls, particularly where low latency is required. The project is used in production by <a href="https://github.com/apache/incubator-stormcrawler/wiki/Powered-By">many organisations</a> and is actively developed and maintained.</p> + <p>The <a href="https://github.com/apache/incubator-stormcrawler/wiki/Presentations">Presentations</a> page contains links to some recent presentations made about this project.</p> + </div> + + <div class="row row-col"> + <div class="used-by-panel"> + <h2>Used by</h2> + <a href="https://pixray.com/" target="_blank"> + <img src="/img/pixray.png" alt="Pixray" height=80> + </a> + <a href="https://www.gov.nt.ca/" target="_blank"> + <img src="/img/gnwt.png" alt="Government of Northwest Territories"> + </a> + <a href="https://www.stolencamerafinder.com/" target="_blank"> + <img src="/img/stolen-camera-finder.png" alt="StolenCameraFinder"> + </a> + <a href="https://www.polecat.com/" target="_blank"> + <img src="/img/polecat.svg" alt="Polecat" height=70> + </a> + <br> + <a href="http://github.com/apache/incubator-stormcrawler/wiki/Powered-By">and many more...</a> + </div> + </div> + +</main> + +<footer class="site-footer"> + <img src="/img/incubator_feather_egg_logo_bw_crop.png" alt="Apache Incubator Logo" width="500"><br/> + + Apache StormCrawler is an effort 
undergoing incubation at The Apache Software Foundation (ASF), sponsored by the Apache Incubator. Incubation is required of all newly accepted projects until a further review indicates that the infrastructure, communications, and decision making process have stabilized in a manner consistent with other successful ASF projects. While incubation status is not necessarily a reflection of the completeness or stability of the code, it does indicate that th [...] + <br/> <br/> + © 2024 <a href="https://www.apache.org/">The Apache Software Foundation</a><br/><br/> + Licensed under the <a href="https://www.apache.org/licenses/LICENSE-2.0">Apache License, Version 2.0</a>. <br/> Apache StormCrawler, StormCrawler, the Apache feather logo are trademarks of The Apache Software Foundation. <br/> All other marks mentioned may be trademarks or registered trademarks of their respective owners. <br/><br/> + <a href="https://privacy.apache.org/policies/privacy-policy-public.html">Privacy Policy</a> | <a href="https://www.apache.org/security/">Security</a> | <a href="https://www.apache.org/foundation/sponsorship">Sponsorship</a> | <a href="https://www.apache.org/foundation/sponsors">Sponsors</a><br/><br/> + <div class="footer-widget"> + <a class="acevent" data-format="wide" data-mode="dark"></a> + </div> +</footer> + + +</body> + +<script type="text/javascript" src="https://www.apachecon.com/event-images/snippet.js"></script> + +<!-- Matomo --> +<script type="text/javascript"> + var _paq = window._paq = window._paq || []; + /* tracker methods like "setCustomDimension" should be called before "trackPageView" */ + _paq.push(["setDoNotTrack", true]); + _paq.push(["disableCookies"]); + _paq.push(['trackPageView']); + _paq.push(['enableLinkTracking']); + (function() { + var u="https://analytics.apache.org/"; + _paq.push(['setTrackerUrl', u+'matomo.php']); + _paq.push(['setSiteId', '58']); + var d=document, g=d.createElement('script'), s=d.getElementsByTagName('script')[0]; + g.async=true; 
g.src=u+'matomo.js'; s.parentNode.insertBefore(g,s); + })(); +</script> +<!-- End Matomo Code --> +</html>
