This is an automated email from the ASF dual-hosted git repository.

rzo1 pushed a commit to branch fix_issues_from_ipmc_release_vote
in repository https://gitbox.apache.org/repos/asf/incubator-stormcrawler.git

commit 553fb584a9ca04bc40f1b2ec71094b7cf85f4e77
Author: Richard Zowalla <[email protected]>
AuthorDate: Fri Nov 22 08:13:54 2024 +0100

    Remove references to digitalpebble.com (README, HTML)
---
 README.md                                          |   2 -
 .../stormcrawler/bolt/FeedParserBoltTest.java      |   2 +-
 .../stormcrawler/bolt/JSoupParserBoltTest.java     |  16 +--
 .../stormcrawler/bolt/SiteMapParserBoltTest.java   |  18 +--
 .../stormcrawler/indexer/BasicIndexingTest.java    |  18 +--
 .../apache/stormcrawler/json/JsoupFilterTest.java  |   6 +-
 .../stormcrawler/jsoup/JSoupFiltersTest.java       |  12 +-
 .../stormcrawler/parse/DuplicateLinksTest.java     |   2 +-
 .../parse/filter/CSVMetadataFilterTest.java        |   4 +-
 .../parse/filter/SubDocumentsFilterTest.java       |   2 +-
 .../stormcrawler/parse/filter/XPathFilterTest.java |   8 +-
 core/src/test/resources/digitalpebble.com.html     | 156 ---------------------
 .../test/resources/stormcrawler.apache.org.html    | 151 ++++++++++++++++++++
 13 files changed, 195 insertions(+), 202 deletions(-)

diff --git a/README.md b/README.md
index c8699555..5596019e 100644
--- a/README.md
+++ b/README.md
@@ -13,8 +13,6 @@ NOTE: These instructions assume that you have [Apache 
Maven](https://maven.apach
 
 StormCrawler requires Java 11 or above. To execute tests, it requires you to 
have a locally installed and working Docker environment.
 
-DigitalPebble's 
[Ansible-Storm](https://github.com/DigitalPebble/ansible-storm) repository 
contains resources to install Apache Storm using Ansible. Alternatively, this 
[stormcrawler-docker](https://github.com/DigitalPebble/stormcrawler-docker) 
project should help you run Apache Storm on Docker.
-
 Once Storm is installed, the easiest way to get started is to generate a new 
StormCrawler project following the instructions below: 
 
 ```shell
diff --git 
a/core/src/test/java/org/apache/stormcrawler/bolt/FeedParserBoltTest.java 
b/core/src/test/java/org/apache/stormcrawler/bolt/FeedParserBoltTest.java
index d1e0f137..c2677529 100644
--- a/core/src/test/java/org/apache/stormcrawler/bolt/FeedParserBoltTest.java
+++ b/core/src/test/java/org/apache/stormcrawler/bolt/FeedParserBoltTest.java
@@ -86,7 +86,7 @@ class FeedParserBoltTest extends ParsingTester {
     void testNonFeedParsing() throws IOException {
         prepareParserBolt("test.parsefilters.json");
         // do not specify that it is a feed file
-        parse("http://www.digitalpebble.com", "digitalpebble.com.html", new 
Metadata());
+        parse("http://stormcrawler.apache.org", 
"stormcrawler.apache.org.html", new Metadata());
         Assertions.assertEquals(1, output.getEmitted().size());
     }
 }
diff --git 
a/core/src/test/java/org/apache/stormcrawler/bolt/JSoupParserBoltTest.java 
b/core/src/test/java/org/apache/stormcrawler/bolt/JSoupParserBoltTest.java
index 68e39047..f7a6d614 100644
--- a/core/src/test/java/org/apache/stormcrawler/bolt/JSoupParserBoltTest.java
+++ b/core/src/test/java/org/apache/stormcrawler/bolt/JSoupParserBoltTest.java
@@ -119,7 +119,7 @@ class JSoupParserBoltTest extends ParsingTester {
     void testNoScriptInText() throws IOException {
         bolt.prepare(
                 new HashMap(), TestUtil.getMockedTopologyContext(), new 
OutputCollector(output));
-        parse("http://www.digitalpebble.com", "digitalpebble.com.html");
+        parse("http://stormcrawler.apache.org", 
"stormcrawler.apache.org.html");
         List<Object> parsedTuple = output.getEmitted().remove(0);
         // check in the metadata that the values match
         String text = (String) parsedTuple.get(3);
@@ -133,9 +133,9 @@ class JSoupParserBoltTest extends ParsingTester {
     void testNoFollowOutlinks() throws IOException {
         bolt.prepare(
                 new HashMap(), TestUtil.getMockedTopologyContext(), new 
OutputCollector(output));
-        parse("http://www.digitalpebble.com", "digitalpebble.com.html");
+        parse("http://stormcrawler.apache.org", 
"stormcrawler.apache.org.html");
         List<List<Object>> statusTuples = 
output.getEmitted(Constants.StatusStreamName);
-        Assertions.assertEquals(10, statusTuples.size());
+        Assertions.assertEquals(25, statusTuples.size());
     }
 
     @Test
@@ -144,7 +144,7 @@ class JSoupParserBoltTest extends ParsingTester {
                 new HashMap(), TestUtil.getMockedTopologyContext(), new 
OutputCollector(output));
         Metadata metadata = new Metadata();
         metadata.setValues("X-Robots-Tag", new String[] {"noindex", 
"nofollow"});
-        parse("http://www.digitalpebble.com", "digitalpebble.com.html", 
metadata);
+        parse("http://stormcrawler.apache.org", 
"stormcrawler.apache.org.html", metadata);
         List<List<Object>> statusTuples = 
output.getEmitted(Constants.StatusStreamName);
         // no outlinks at all
         Assertions.assertEquals(0, statusTuples.size());
@@ -170,7 +170,7 @@ class JSoupParserBoltTest extends ParsingTester {
                 new HashMap(), TestUtil.getMockedTopologyContext(), new 
OutputCollector(output));
         for (int i = 0; i < tests.length; i++) {
             byte[] bytes = tests[i].getBytes(StandardCharsets.UTF_8);
-            parse("http://www.digitalpebble.com", bytes, new Metadata());
+            parse("http://stormcrawler.apache.org", bytes, new Metadata());
             Assertions.assertEquals(1, output.getEmitted().size());
             List<Object> parsedTuple = output.getEmitted().remove(0);
             // check in the metadata that the values match
@@ -205,7 +205,7 @@ class JSoupParserBoltTest extends ParsingTester {
     void testExecuteWithOutlinksLimit() throws IOException {
         stormConf.put("parser.emitOutlinks.max.per.page", 5);
         bolt.prepare(stormConf, TestUtil.getMockedTopologyContext(), new 
OutputCollector(output));
-        parse("http://www.digitalpebble.com", "digitalpebble.com.html");
+        parse("http://stormcrawler.apache.org", 
"stormcrawler.apache.org.html");
         List<List<Object>> statusTuples = 
output.getEmitted(Constants.StatusStreamName);
         // outlinks being limited by property
         Assertions.assertEquals(5, statusTuples.size());
@@ -215,10 +215,10 @@ class JSoupParserBoltTest extends ParsingTester {
     void testExecuteWithOutlinksLimitDisabled() throws IOException {
         stormConf.put("parser.emitOutlinks.max.per.page", -1);
         bolt.prepare(stormConf, TestUtil.getMockedTopologyContext(), new 
OutputCollector(output));
-        parse("http://www.digitalpebble.com", "digitalpebble.com.html");
+        parse("http://stormcrawler.apache.org", 
"stormcrawler.apache.org.html");
         List<List<Object>> statusTuples = 
output.getEmitted(Constants.StatusStreamName);
         // outlinks NOT being limited by property, since is disabled with -1
-        Assertions.assertEquals(10, statusTuples.size());
+        Assertions.assertEquals(25, statusTuples.size());
     }
 
     @Test
diff --git 
a/core/src/test/java/org/apache/stormcrawler/bolt/SiteMapParserBoltTest.java 
b/core/src/test/java/org/apache/stormcrawler/bolt/SiteMapParserBoltTest.java
index 725028c4..d96ce5f6 100644
--- a/core/src/test/java/org/apache/stormcrawler/bolt/SiteMapParserBoltTest.java
+++ b/core/src/test/java/org/apache/stormcrawler/bolt/SiteMapParserBoltTest.java
@@ -52,7 +52,7 @@ class SiteMapParserBoltTest extends ParsingTester {
         metadata.setValue(SiteMapParserBolt.isSitemapKey, "true");
         // and its mime-type
         metadata.setValue(HttpHeaders.CONTENT_TYPE, "application/xml");
-        parse("http://www.digitalpebble.com/sitemap.xml", 
"digitalpebble.sitemap.xml", metadata);
+        parse("http://stormcrawler.apache.org/sitemap.xml", 
"digitalpebble.sitemap.xml", metadata);
         Assertions.assertEquals(6, 
output.getEmitted(Constants.StatusStreamName).size());
         // TODO test that the new links have the right metadata
         List<Object> fields = 
output.getEmitted(Constants.StatusStreamName).get(0);
@@ -100,7 +100,7 @@ class SiteMapParserBoltTest extends ParsingTester {
         // and its mime-type
         metadata.setValue(HttpHeaders.CONTENT_TYPE, "application/xml");
         parse(
-                "http://www.digitalpebble.com/sitemap.xml",
+                "http://stormcrawler.apache.org/sitemap.xml",
                 "digitalpebble.sitemap.extensions.image.xml",
                 metadata);
         Values values = (Values) 
output.getEmitted(Constants.StatusStreamName).get(0);
@@ -119,7 +119,7 @@ class SiteMapParserBoltTest extends ParsingTester {
         // and its mime-type
         metadata.setValue(HttpHeaders.CONTENT_TYPE, "application/xml");
         parse(
-                "http://www.digitalpebble.com/sitemap.xml",
+                "http://stormcrawler.apache.org/sitemap.xml",
                 "digitalpebble.sitemap.extensions.mobile.xml",
                 metadata);
         Values values = (Values) 
output.getEmitted(Constants.StatusStreamName).get(0);
@@ -138,7 +138,7 @@ class SiteMapParserBoltTest extends ParsingTester {
         // and its mime-type
         metadata.setValue(HttpHeaders.CONTENT_TYPE, "application/xml");
         parse(
-                "http://www.digitalpebble.com/sitemap.xml",
+                "http://stormcrawler.apache.org/sitemap.xml",
                 "digitalpebble.sitemap.extensions.links.xml",
                 metadata);
         Values values = (Values) 
output.getEmitted(Constants.StatusStreamName).get(0);
@@ -157,7 +157,7 @@ class SiteMapParserBoltTest extends ParsingTester {
         // and its mime-type
         metadata.setValue(HttpHeaders.CONTENT_TYPE, "application/xml");
         parse(
-                "http://www.digitalpebble.com/sitemap.xml",
+                "http://stormcrawler.apache.org/sitemap.xml",
                 "digitalpebble.sitemap.extensions.news.xml",
                 metadata);
         Values values = (Values) 
output.getEmitted(Constants.StatusStreamName).get(0);
@@ -176,7 +176,7 @@ class SiteMapParserBoltTest extends ParsingTester {
         // and its mime-type
         metadata.setValue(HttpHeaders.CONTENT_TYPE, "application/xml");
         parse(
-                "http://www.digitalpebble.com/sitemap.xml",
+                "http://stormcrawler.apache.org/sitemap.xml",
                 "digitalpebble.sitemap.extensions.video.xml",
                 metadata);
         Values values = (Values) 
output.getEmitted(Constants.StatusStreamName).get(0);
@@ -202,7 +202,7 @@ class SiteMapParserBoltTest extends ParsingTester {
         // and its mime-type
         metadata.setValue(HttpHeaders.CONTENT_TYPE, "application/xml");
         parse(
-                "http://www.digitalpebble.com/sitemap.xml",
+                "http://stormcrawler.apache.org/sitemap.xml",
                 "digitalpebble.sitemap.extensions.all.xml",
                 metadata);
         Values values = (Values) 
output.getEmitted(Constants.StatusStreamName).get(0);
@@ -237,7 +237,7 @@ class SiteMapParserBoltTest extends ParsingTester {
         Metadata metadata = new Metadata();
         // do not specify that it is a sitemap file
         // do not set the mimetype
-        parse("http://www.digitalpebble.com/sitemap.xml", 
"digitalpebble.sitemap.xml", metadata);
+        parse("http://stormcrawler.apache.org/sitemap.xml", 
"digitalpebble.sitemap.xml", metadata);
         Assertions.assertEquals(6, 
output.getEmitted(Constants.StatusStreamName).size());
         // TODO test that the new links have the right metadata
         List<Object> fields = 
output.getEmitted(Constants.StatusStreamName).get(0);
@@ -248,7 +248,7 @@ class SiteMapParserBoltTest extends ParsingTester {
     void testNonSitemapParsing() throws IOException {
         prepareParserBolt("test.parsefilters.json");
         // do not specify that it is a sitemap file
-        parse("http://www.digitalpebble.com", "digitalpebble.com.html", new 
Metadata());
+        parse("http://stormcrawler.apache.org", 
"stormcrawler.apache.org.html", new Metadata());
         Assertions.assertEquals(1, output.getEmitted().size());
     }
 
diff --git 
a/core/src/test/java/org/apache/stormcrawler/indexer/BasicIndexingTest.java 
b/core/src/test/java/org/apache/stormcrawler/indexer/BasicIndexingTest.java
index f47c2f16..9b73fc26 100644
--- a/core/src/test/java/org/apache/stormcrawler/indexer/BasicIndexingTest.java
+++ b/core/src/test/java/org/apache/stormcrawler/indexer/BasicIndexingTest.java
@@ -29,7 +29,7 @@ import org.junit.jupiter.api.Test;
 
 class BasicIndexingTest extends IndexerTester {
 
-    private static final String URL = "http://www.digitalpebble.com";
+    private static final String URL = "http://stormcrawler.apache.org";
 
     @BeforeEach
     void setupIndexerBolt() {
@@ -55,12 +55,12 @@ class BasicIndexingTest extends IndexerTester {
         config.put(AbstractIndexerBolt.urlFieldParamName, "url");
         config.put(AbstractIndexerBolt.canonicalMetadataParamName, 
"canonical");
         Metadata metadata = new Metadata();
-        metadata.setValue("canonical", "http://www.digitalpebble.com/");
+        metadata.setValue("canonical", "http://stormcrawler.apache.org/");
         prepareIndexerBolt(config);
         index(URL, metadata);
         Map<String, String> fields = ((DummyIndexer) bolt).returnFields();
         Assertions.assertEquals(
-                "http://www.digitalpebble.com/",
+                "http://stormcrawler.apache.org/",
                 fields.get("url"),
                 "Use the canonical URL if found");
     }
@@ -76,7 +76,7 @@ class BasicIndexingTest extends IndexerTester {
         index(URL, metadata);
         Map<String, String> fields = ((DummyIndexer) bolt).returnFields();
         Assertions.assertEquals(
-                "http://www.digitalpebble.com/home",
+                "http://stormcrawler.apache.org/home",
                 fields.get("url"),
                 "Use the canonical URL if found");
     }
@@ -92,7 +92,7 @@ class BasicIndexingTest extends IndexerTester {
         index(URL, metadata);
         Map<String, String> fields = ((DummyIndexer) bolt).returnFields();
         Assertions.assertEquals(
-                "http://www.digitalpebble.com",
+                "http://stormcrawler.apache.org",
                 fields.get("url"),
                 "Use the default URL if a bad canonical URL is found");
     }
@@ -108,7 +108,7 @@ class BasicIndexingTest extends IndexerTester {
         index(URL, metadata);
         Map<String, String> fields = ((DummyIndexer) bolt).returnFields();
         Assertions.assertEquals(
-                "http://www.digitalpebble.com",
+                "http://stormcrawler.apache.org",
                 fields.get("url"),
                 "Ignore if the canonical URL references other host");
     }
@@ -118,12 +118,12 @@ class BasicIndexingTest extends IndexerTester {
         Map config = new HashMap();
         config.put(AbstractIndexerBolt.urlFieldParamName, "url");
         Metadata metadata = new Metadata();
-        metadata.setValue("canonical", "http://www.digitalpebble.com/");
+        metadata.setValue("canonical", "http://stormcrawler.apache.org/");
         prepareIndexerBolt(config);
         index(URL, metadata);
         Map<String, String> fields = ((DummyIndexer) bolt).returnFields();
         Assertions.assertEquals(
-                "http://www.digitalpebble.com",
+                "http://stormcrawler.apache.org",
                 fields.get("url"),
                 "Use the canonical URL if found");
     }
@@ -139,7 +139,7 @@ class BasicIndexingTest extends IndexerTester {
         index(URL, metadata);
         Map<String, String> fields = ((DummyIndexer) bolt).returnFields();
         Assertions.assertEquals(
-                "http://www.digitalpebble.com",
+                "http://stormcrawler.apache.org",
                 fields.get("url"),
                 "The document must pass if the key/value is found in the 
metadata");
     }
diff --git 
a/core/src/test/java/org/apache/stormcrawler/json/JsoupFilterTest.java 
b/core/src/test/java/org/apache/stormcrawler/json/JsoupFilterTest.java
index dc89f38d..5a56d655 100644
--- a/core/src/test/java/org/apache/stormcrawler/json/JsoupFilterTest.java
+++ b/core/src/test/java/org/apache/stormcrawler/json/JsoupFilterTest.java
@@ -45,7 +45,7 @@ class JsoupFilterTest extends ParsingTester {
     @Test
     void testLDJsonExtraction() throws IOException {
         prepareParserBolt("test.jsoupfilters.json");
-        parse("http://www.digitalpebble.com", "digitalpebble.com.html");
+        parse("http://stormcrawler.apache.org", 
"stormcrawler.apache.org.html");
         Assertions.assertEquals(1, output.getEmitted().size());
         List<Object> parsedTuple = output.getEmitted().get(0);
         Metadata metadata = (Metadata) parsedTuple.get(2);
@@ -57,9 +57,9 @@ class JsoupFilterTest extends ParsingTester {
     @Test
     void testLinkFilter() throws IOException {
         prepareParserBolt("test.jsoupfilters.json");
-        parse("http://www.digitalpebble.com", "digitalpebble.com.html");
+        parse("http://stormcrawler.apache.org", 
"stormcrawler.apache.org.html");
         List<List<Object>> status = output.getEmitted("status");
-        Assertions.assertEquals(16, status.size());
+        Assertions.assertEquals(31, status.size());
         List<Object> parsedTuple = status.get(0);
         parsedTuple.toArray();
     }
diff --git 
a/core/src/test/java/org/apache/stormcrawler/jsoup/JSoupFiltersTest.java 
b/core/src/test/java/org/apache/stormcrawler/jsoup/JSoupFiltersTest.java
index 9de60371..de433d1c 100644
--- a/core/src/test/java/org/apache/stormcrawler/jsoup/JSoupFiltersTest.java
+++ b/core/src/test/java/org/apache/stormcrawler/jsoup/JSoupFiltersTest.java
@@ -46,7 +46,7 @@ class JSoupFiltersTest extends ParsingTester {
     @Test
     void testBasicExtraction() throws IOException {
         prepareParserBolt("test.jsoupfilters.json");
-        parse("http://www.digitalpebble.com", "digitalpebble.com.html");
+        parse("http://stormcrawler.apache.org", 
"stormcrawler.apache.org.html");
         Assertions.assertEquals(1, output.getEmitted().size());
         List<Object> parsedTuple = output.getEmitted().get(0);
         Metadata metadata = (Metadata) parsedTuple.get(2);
@@ -61,7 +61,7 @@ class JSoupFiltersTest extends ParsingTester {
     // https://github.com/DigitalPebble/storm-crawler/issues/219
     void testScriptExtraction() throws IOException {
         prepareParserBolt("test.jsoupfilters.json");
-        parse("http://www.digitalpebble.com", "digitalpebble.com.html");
+        parse("http://stormcrawler.apache.org", 
"stormcrawler.apache.org.html");
         Assertions.assertEquals(1, output.getEmitted().size());
         List<Object> parsedTuple = output.getEmitted().get(0);
         Metadata metadata = (Metadata) parsedTuple.get(2);
@@ -71,13 +71,13 @@ class JSoupFiltersTest extends ParsingTester {
         // should be 2 of them
         Assertions.assertEquals(2, scripts.length);
         Assertions.assertEquals("", scripts[0].trim());
-        Assertions.assertTrue(scripts[1].contains("urchinTracker();"));
+        Assertions.assertTrue(scripts[1].contains("_paq"));
     }
 
     @Test
     void testLDJsonExtraction() throws IOException {
         prepareParserBolt("test.jsoupfilters.json");
-        parse("http://www.digitalpebble.com", "digitalpebble.com.html");
+        parse("http://stormcrawler.apache.org", 
"stormcrawler.apache.org.html");
         Assertions.assertEquals(1, output.getEmitted().size());
         List<Object> parsedTuple = output.getEmitted().get(0);
         Metadata metadata = (Metadata) parsedTuple.get(2);
@@ -89,7 +89,7 @@ class JSoupFiltersTest extends ParsingTester {
     @Test
     void testExtraLink() throws IOException {
         prepareParserBolt("test.jsoupfilters.json");
-        parse("http://www.digitalpebble.com", "digitalpebble.com.html");
-        Assertions.assertEquals(16, output.getEmitted("status").size());
+        parse("http://stormcrawler.apache.org", 
"stormcrawler.apache.org.html");
+        Assertions.assertEquals(31, output.getEmitted("status").size());
     }
 }
diff --git 
a/core/src/test/java/org/apache/stormcrawler/parse/DuplicateLinksTest.java 
b/core/src/test/java/org/apache/stormcrawler/parse/DuplicateLinksTest.java
index 9483d169..28835355 100644
--- a/core/src/test/java/org/apache/stormcrawler/parse/DuplicateLinksTest.java
+++ b/core/src/test/java/org/apache/stormcrawler/parse/DuplicateLinksTest.java
@@ -44,7 +44,7 @@ class DuplicateLinksTest extends ParsingTester {
         config.put("urlfilters.config.file", "basicurlnormalizer.json");
         bolt.prepare(config, TestUtil.getMockedTopologyContext(), new 
OutputCollector(output));
         Metadata metadata = new Metadata();
-        parse("http://www.digitalpebble.com/duplicates.html", 
"duplicateLinks.html", metadata);
+        parse("http://stormcrawler.apache.org/duplicates.html", 
"duplicateLinks.html", metadata);
         Assertions.assertEquals(1, 
output.getEmitted(Constants.StatusStreamName).size());
     }
 }
diff --git 
a/core/src/test/java/org/apache/stormcrawler/parse/filter/CSVMetadataFilterTest.java
 
b/core/src/test/java/org/apache/stormcrawler/parse/filter/CSVMetadataFilterTest.java
index 79d2ec5b..d460c6a5 100644
--- 
a/core/src/test/java/org/apache/stormcrawler/parse/filter/CSVMetadataFilterTest.java
+++ 
b/core/src/test/java/org/apache/stormcrawler/parse/filter/CSVMetadataFilterTest.java
@@ -36,13 +36,13 @@ class CSVMetadataFilterTest extends ParsingTester {
     @Test
     void testMultivalued() throws IOException {
         prepareParserBolt("test.parsefilters.json");
-        parse("http://www.digitalpebble.com", "digitalpebble.com.html");
+        parse("http://stormcrawler.apache.org", 
"stormcrawler.apache.org.html");
         Assertions.assertEquals(1, output.getEmitted().size());
         List<Object> parsedTuple = output.getEmitted().get(0);
         Metadata metadata = (Metadata) parsedTuple.get(2);
         Assertions.assertNotNull(metadata);
         String[] kws = metadata.getValues("keywords");
         Assertions.assertNotNull(kws);
-        Assertions.assertEquals(12, kws.length);
+        Assertions.assertEquals(8, kws.length);
     }
 }
diff --git 
a/core/src/test/java/org/apache/stormcrawler/parse/filter/SubDocumentsFilterTest.java
 
b/core/src/test/java/org/apache/stormcrawler/parse/filter/SubDocumentsFilterTest.java
index d0eb1f18..408d8503 100644
--- 
a/core/src/test/java/org/apache/stormcrawler/parse/filter/SubDocumentsFilterTest.java
+++ 
b/core/src/test/java/org/apache/stormcrawler/parse/filter/SubDocumentsFilterTest.java
@@ -40,7 +40,7 @@ class SubDocumentsFilterTest extends ParsingTester {
         config.put("detect.mimetype", false);
         prepareParserBolt("test.subdocfilter.json", config);
         Metadata metadata = new Metadata();
-        parse("http://www.digitalpebble.com/sitemap.xml", 
"digitalpebble.sitemap.xml", metadata);
+        parse("http://stormcrawler.apache.org/sitemap.xml", 
"digitalpebble.sitemap.xml", metadata);
         Assertions.assertEquals(6, output.getEmitted().size());
     }
 }
diff --git 
a/core/src/test/java/org/apache/stormcrawler/parse/filter/XPathFilterTest.java 
b/core/src/test/java/org/apache/stormcrawler/parse/filter/XPathFilterTest.java
index 4a8c49a0..a15e0833 100644
--- 
a/core/src/test/java/org/apache/stormcrawler/parse/filter/XPathFilterTest.java
+++ 
b/core/src/test/java/org/apache/stormcrawler/parse/filter/XPathFilterTest.java
@@ -36,7 +36,7 @@ class XPathFilterTest extends ParsingTester {
     @Test
     void testBasicExtraction() throws IOException {
         prepareParserBolt("test.parsefilters.json");
-        parse("http://www.digitalpebble.com", "digitalpebble.com.html");
+        parse("http://stormcrawler.apache.org", 
"stormcrawler.apache.org.html");
         Assertions.assertEquals(1, output.getEmitted().size());
         List<Object> parsedTuple = output.getEmitted().get(0);
         Metadata metadata = (Metadata) parsedTuple.get(2);
@@ -51,7 +51,7 @@ class XPathFilterTest extends ParsingTester {
     // https://github.com/DigitalPebble/storm-crawler/issues/219
     void testScriptExtraction() throws IOException {
         prepareParserBolt("test.parsefilters.json");
-        parse("http://www.digitalpebble.com", "digitalpebble.com.html");
+        parse("http://stormcrawler.apache.org", 
"stormcrawler.apache.org.html");
         Assertions.assertEquals(1, output.getEmitted().size());
         List<Object> parsedTuple = output.getEmitted().get(0);
         Metadata metadata = (Metadata) parsedTuple.get(2);
@@ -61,13 +61,13 @@ class XPathFilterTest extends ParsingTester {
         // should be 2 of them
         Assertions.assertEquals(2, scripts.length);
         Assertions.assertEquals("", scripts[0].trim());
-        Assertions.assertTrue(scripts[1].contains("urchinTracker();"));
+        Assertions.assertTrue(scripts[1].contains("_paq"));
     }
 
     @Test
     void testLDJsonExtraction() throws IOException {
         prepareParserBolt("test.parsefilters.json");
-        parse("http://www.digitalpebble.com", "digitalpebble.com.html");
+        parse("http://stormcrawler.apache.org", 
"stormcrawler.apache.org.html");
         Assertions.assertEquals(1, output.getEmitted().size());
         List<Object> parsedTuple = output.getEmitted().get(0);
         Metadata metadata = (Metadata) parsedTuple.get(2);
diff --git a/core/src/test/resources/digitalpebble.com.html 
b/core/src/test/resources/digitalpebble.com.html
deleted file mode 100644
index 68332164..00000000
--- a/core/src/test/resources/digitalpebble.com.html
+++ /dev/null
@@ -1,156 +0,0 @@
-<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
-<!--
-Licensed to the Apache Software Foundation (ASF) under one
-or more contributor license agreements.  See the NOTICE file
-distributed with this work for additional information
-regarding copyright ownership.  The ASF licenses this file
-to you under the Apache License, Version 2.0 (the
-"License"); you may not use this file except in compliance
-with the License.  You may obtain a copy of the License at
-
-  http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing,
-software distributed under the License is distributed on an
-"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-KIND, either express or implied.  See the License for the
-specific language governing permissions and limitations
-under the License.
--->
-<html>
-<head>
-  <!-- #BeginTemplate "/Templates/model3.dwt" -->
-  <meta name="description"
-  content="DigitalPebble Ltd is a consultancy specialised in web crawling, 
natural language processing, information retrieval and extraction. Our 
expertise is based on open source solutions, such as Nutch, Gate or SOLR.">
-  <meta name="keywords" content="crawl, gate, consultant, consultancy, 
consulting, information extraction, information retrieval, NLP, IR, IE, nutch, 
solr">
-  <link rel="icon" href="img/favicon.ico" type="image/vnd.microsoft.icon">
-  <link type="text/css" href="style.css" rel="stylesheet">
-  <meta name="google-site-verification" 
content="ZNIbylXN61hwJhB39tK17-u7RsU5kgiHXWbQ5F7lrNc" />
-  <!-- #BeginEditable "doctitle" -->
-  <title>DigitalPebble Ltd - Open Source Solutions for Text Engineering</title>
-  <!-- #EndEditable -->
-</head>
-
-<script type="application/ld+json">
-{
-  "@context": "http://schema.org",
-  "type": "Organization",
-  "email": "[email protected]",
-  "telephone": "(+44)7758085585",
-  "logo": "http://digitalpebble.com/img/logo.gif",
-  "location": {
-    "type": "PostalAddress",
-    "addressCountry": "United Kingdom",
-    "addressLocality": "Bristol, Avon,",
-    "postalCode": "BS7 8ET",
-    "streetAddress": "16 Codrington Road"
-  },
-  "url": "http://digitalpebble.com/"
-}
-</script>
-
-<a rel="nofollow" href="inexistent.html"/>
-<a rel="nofollow somevalue" href="another_inexistent.html"/>
-
-<body style="color: rgb(0, 0, 0); background-color: rgb(255, 255, 255);"
-alink="#000000" link="#000000" vlink="#000000">
-
-<table align="center" border="0" cellspacing="0" width="70%">
-  <tbody>
-    <tr>
-      <td valign="bottom"><img src="img/logo.gif" alt="digitalpebble"
-        align="bottom" height="60" width="310"></td>
-    </tr>
-    <tr>
-      <td>&nbsp;</td>
-    </tr>
-    <tr>
-      <td align="left" valign="middle"><!-- #BeginEditable "menu" -->
-
-        <table border="0" cellpadding="0" cellspacing="0">
-          <tbody>
-            <tr>
-              <td><a href="index.html"><img name="Home"
-                src="./img/menu/home2.png" border="0"></a></td>
-              <td>&nbsp;</td>
-              <td><a href="solutions.html"><img name="Solutions"
-                src="./img/menu/solutions1.png" border="0"></a></td>
-              <td>&nbsp;</td>
-             <td><a href="references.html"><img name="Clients"
-                src="./img/menu/clients1.png" border="0"></a></td>
-              <td>&nbsp;</td>
-              <td><a href="contact.html"><img src="img/menu/contact1.png"
-                style="border: 0px solid ; width: 108px; height: 32px;"
-                alt="" name="Contact" onload=""></a></td>
-              <td>&nbsp;</td>
-            </tr>
-          </tbody>
-        </table>
-        <!-- #EndEditable -->
-      </td>
-    </tr>
-    <tr>
-      <td>
-        <div id="tabs">
-        </div>
-      </td>
-    </tr>
-  </tbody>
-</table>
-
-<table align="center" border="0" cellspacing="0" width="70%">
-  <!-- #BeginEditable "crumbs" -->
-  <tbody>
-    <tr>
-      <td colspan="3" align="left" valign="top">&nbsp;</td>
-    </tr>
-    <!-- #EndEditable -->
-    <tr>
-      <td valign="top" width="270"><!-- #BeginEditable "picture" -->
-        <img src="img/small5.jpg"
-        alt=""
-        height="182" width="255"><!-- #EndEditable -->
-      </td>
-      <td valign="top" width="50">&nbsp;</td>
-      <td align="left" valign="top" width="*"><!-- #BeginEditable "text" -->
-        <p class="aligned"><span class="concept">DigitalPebble Ltd</span>
-        is a consultancy and solution provider specialising in web crawling, 
natural language processing, 
-        document retrieval and information extraction.</p>
-
-        <p class="aligned">We advise, evaluate and implement solutions based
-        on leading <a href="solutions.html">open source solutions</a>, such
-        as <a href="http://nutch.apache.org/">Apache Nutch</a>,
-       <a href="http://gate.ac.uk">GATE</a> or <a 
href="http://lucene.apache.org/solr">SOLR</a>. We aim to combine open
-        source tools to provide efficient, reliable and low cost
-        made-to-order solutions.</p>
-
-        <p class="aligned">Our unique expertise covers all aspects of
-        documents life cycle, from web-wide crawling and collection, content
-        analysis, filtering and categorization to indexing. We are
-        specialised in large scale processing using <a
-        href="http://hadoop.apache.org/">Hadoop</a> or <a 
href="http://storm.apache.org/">Storm</a> and have expertise in cloud platforms 
such as Amazon AWS, which has allowed
-        us to successfully deploy solutions scaling up to billions of 
documents for our <a href="references.html">clients</a>. </p>
-        
-        <p class="aligned">Not only to we have an extensive knowledge of open 
source solutions, we are also active contributors and 
-        provide some of the <a 
href="https://github.com/DigitalPebble";>resources</a> that we have developed 
over the years under open source licenses.
-         </p>
-        
-        <p class="aligned">Our <a href="references.html">clients</a> range 
from startup in stealth mode to NASDAQ listed companies and operate in domains 
as varied as business intelligence, media monitoring, 
-        telecommunications or software development.
-        </p>
-        
-      </td>
-     </td>
-      
-    </tr>
-  </tbody>
-</table>
-<script type="text/javascript"
-src="http://www.google-analytics.com/urchin.js";>
-</script>
-<script type="text/javascript">
-_uacct = "UA-357582-1";
-urchinTracker();</script>
-<!-- #EndTemplate -->
-</body>
-</html>
diff --git a/core/src/test/resources/stormcrawler.apache.org.html 
b/core/src/test/resources/stormcrawler.apache.org.html
new file mode 100644
index 00000000..9d26cce2
--- /dev/null
+++ b/core/src/test/resources/stormcrawler.apache.org.html
@@ -0,0 +1,151 @@
+<!DOCTYPE html>
+<!--
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements.  See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership.  The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing,
+software distributed under the License is distributed on an
+"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+KIND, either express or implied.  See the License for the
+specific language governing permissions and limitations
+under the License.
+-->
+<html>
+
+<head>
+    <meta charset="utf-8">
+    <meta http-equiv="X-UA-Compatible" content="IE=edge">
+    <meta name="viewport" content="width=device-width, initial-scale=1">
+
+    <title>Apache StormCrawler (Incubating)</title>
+    <meta name="description" content="Apache StormCrawler (Incubating) is a 
collection of resources for building low-latency, scalable web crawlers on 
Apache Storm">
+    <meta name="keywords" content="crawl, information extraction, information 
retrieval, NLP, IR, IE, nutch, solr">
+    <link rel="stylesheet" href="/css/main.css">
+    <link rel="canonical" href="https://stormcrawler.apache.org/";>
+    <link rel="alternate" type="application/rss+xml" title="Apache 
StormCrawler (Incubating)" href="https://stormcrawler.apache.org/feed.xml";>
+    <link rel="icon" type="image/png" href="/img/favicon.png" />
+</head>
+
+<script type="application/ld+json">
+    {
+        "@context": "http://schema.org";,
+        "@type": "Organization",
+        "email": "[email protected]",
+        "location": {
+            "@type": "PostalAddress",
+            "addressCountry": "U.S.A",
+            "addressLocality": "Wilmington",
+            "streetAddress": "1000 N West Street, Suite 1200"
+        },
+        "url": "http://stormcrawler.apache.org/";
+    }
+</script>
+
+<body class="home">
+
+<header class="site-header">
+    <div class="site-header__wrap">
+        <div class="site-header__logo">
+            <a href="/"><img src="/img/incubator_logo.png" alt="Apache 
StormCrawler (Incubating)"></a>
+        </div>
+    </div>
+</header>
+<nav class="site-nav">
+    <ul>
+        <li><a href="/index.html">Home</a>
+        <li><a href="/download/index.html">Download</a>
+        <li><a href="https://github.com/apache/incubator-stormcrawler";>Source 
Code</a></li>
+        <li><a href="/getting-started/">Getting Started</a></li>
+        <li><a 
href="https://javadoc.io/doc/org.apache.stormcrawler/stormcrawler-core/3.1.0/index.html";>JavaDocs</a>
+        <li><a href="/faq/">FAQ</a></li>
+        <li><a href="/support/">Support</a></li>
+    </ul>
+</nav>
+
+
+<main class="main-content">
+    <div class="page-title">
+        <h1>A collection of resources for building low-latency, scalable web 
crawlers on Apache Storm®</h1>
+    </div>
+    </div>
+    <div class="row row-col">
+        <p><strong><span class="concept">Apache StormCrawler 
(Incubating)</span></strong> is an open source SDK for building distributed web 
crawlers based on <a href="http://storm.apache.org";>Apache Storm®</a>. The 
project is under Apache license v2 and consists of a collection of reusable 
resources and components, written mostly in Java.</p>
+        <p>The aim of Apache StormCrawler (Incubating) is to help build web 
crawlers that are:</p>
+        <ul>
+            <li>scalable</li>
+            <li>resilient</li>
+            <li>low latency</li>
+            <li>easy to extend</li>
+            <li>polite yet efficient</li>
+        </ul>
+        <p><strong>Apache StormCrawler (Incubating)</strong> is a library and 
collection of resources that developers can leverage to build their own 
crawlers. The good news is that doing so can be pretty straightforward! Have a 
look at the <a href="getting-started/">Getting Started</a> section for more 
details.</p>
+        <p>Apart from the core components, we provide some <a 
href="https://github.com/apache/incubator-stormcrawler/tree/main/external";>external
 resources</a> that you can reuse in your project, like for instance our spout 
and bolts for <a href="https://opensearch.org/";>OpenSearch®</a> or a ParserBolt 
which uses <a href="http://tika.apache.org";>Apache Tika®</a> to parse various 
document formats.</p>
+        <p><strong>Apache StormCrawler (Incubating)</strong> is perfectly 
suited to use cases where the URL to fetch and parse come as streams but is 
also an appropriate solution for large scale recursive crawls, particularly 
where low latency is required. The project is used in production by <a 
href="https://github.com/apache/incubator-stormcrawler/wiki/Powered-By";>many 
organisations</a> and is actively developed and maintained.</p>
+        <p>The <a 
href="https://github.com/apache/incubator-stormcrawler/wiki/Presentations";>Presentations</a>
 page contains links to some recent presentations made about this project.</p>
+    </div>
+
+    <div class="row row-col">
+        <div class="used-by-panel">
+            <h2>Used by</h2>
+            <a href="https://pixray.com/"; target="_blank">
+                <img src="/img/pixray.png" alt="Pixray" height=80>
+            </a>
+            <a href="https://www.gov.nt.ca/"; target="_blank">
+                <img src="/img/gnwt.png" alt="Government of Northwest 
Territories">
+            </a>
+            <a href="https://www.stolencamerafinder.com/"; target="_blank">
+                <img src="/img/stolen-camera-finder.png" 
alt="StolenCameraFinder">
+            </a>
+            <a href="https://www.polecat.com/"; target="_blank">
+                <img src="/img/polecat.svg" alt="Polecat" height=70>
+            </a>
+            <br>
+            <a 
href="http://github.com/apache/incubator-stormcrawler/wiki/Powered-By";>and many 
more...</a>
+        </div>
+    </div>
+
+</main>
+
+<footer class="site-footer">
+    <img src="/img/incubator_feather_egg_logo_bw_crop.png" alt="Apache 
Incubator Logo" width="500"><br/>
+
+    Apache StormCrawler is an effort undergoing incubation at The Apache 
Software Foundation (ASF), sponsored by the Apache Incubator. Incubation is 
required of all newly accepted projects until a further review indicates that 
the infrastructure, communications, and decision making process have stabilized 
in a manner consistent with other successful ASF projects. While incubation 
status is not necessarily a reflection of the completeness or stability of the 
code, it does indicate that th [...]
+    <br/> <br/>
+    &copy; 2024 <a href="https://www.apache.org/";>The Apache Software 
Foundation</a><br/><br/>
+    Licensed under the <a 
href="https://www.apache.org/licenses/LICENSE-2.0";>Apache License, Version 
2.0</a>. <br/> Apache StormCrawler, StormCrawler, the Apache feather logo are 
trademarks of The Apache Software Foundation. <br/> All other marks mentioned 
may be trademarks or registered trademarks of their respective owners. 
<br/><br/>
+    <a 
href="https://privacy.apache.org/policies/privacy-policy-public.html";>Privacy 
Policy</a> | <a href="https://www.apache.org/security/";>Security</a> | <a 
href="https://www.apache.org/foundation/sponsorship";>Sponsorship</a> | <a 
href="https://www.apache.org/foundation/sponsors";>Sponsors</a><br/><br/>
+    <div class="footer-widget">
+        <a class="acevent" data-format="wide" data-mode="dark"></a>
+    </div>
+</footer>
+
+
+</body>
+
+<script type="text/javascript" 
src="https://www.apachecon.com/event-images/snippet.js";></script>
+
+<!-- Matomo -->
+<script type="text/javascript">
+    var _paq = window._paq = window._paq || [];
+    /* tracker methods like "setCustomDimension" should be called before 
"trackPageView" */
+    _paq.push(["setDoNotTrack", true]);
+    _paq.push(["disableCookies"]);
+    _paq.push(['trackPageView']);
+    _paq.push(['enableLinkTracking']);
+    (function() {
+        var u="https://analytics.apache.org/";;
+        _paq.push(['setTrackerUrl', u+'matomo.php']);
+        _paq.push(['setSiteId', '58']);
+        var d=document, g=d.createElement('script'), 
s=d.getElementsByTagName('script')[0];
+        g.async=true; g.src=u+'matomo.js'; s.parentNode.insertBefore(g,s);
+    })();
+</script>
+<!-- End Matomo Code -->
+</html>


Reply via email to