This is an automated email from the ASF dual-hosted git repository. rzo1 pushed a commit to branch fix_issues_from_ipmc_release_vote in repository https://gitbox.apache.org/repos/asf/incubator-stormcrawler.git
commit 60e195d3f189bef56b3e27667def15d8b033bfa8 Author: Richard Zowalla <[email protected]> AuthorDate: Fri Nov 22 08:25:44 2024 +0100 Remove references to digitalpebble.com (sitemaps, src issue refs) --- .../main/resources/archetype-resources/README.md | 3 +- .../org/apache/stormcrawler/bolt/FetcherBolt.java | 6 +- .../apache/stormcrawler/bolt/JSoupParserBolt.java | 2 +- .../stormcrawler/bolt/SimpleFetcherBolt.java | 4 +- .../filtering/basic/BasicURLNormalizer.java | 2 +- .../filtering/regex/FastURLFilter.java | 2 +- .../filtering/sitemap/SitemapFilter.java | 2 +- .../persistence/AbstractStatusUpdaterBolt.java | 2 +- .../stormcrawler/protocol/ProtocolResponse.java | 2 +- .../stormcrawler/util/CharsetIdentification.java | 2 +- .../stormcrawler/bolt/SiteMapParserBoltTest.java | 20 +++--- .../filtering/BasicURLNormalizerTest.java | 2 +- .../stormcrawler/filtering/FastURLFilterTest.java | 2 +- .../stormcrawler/indexer/BasicIndexingTest.java | 2 +- .../stormcrawler/jsoup/JSoupFiltersTest.java | 2 +- .../stormcrawler/parse/StackOverflowTest.java | 4 +- .../parse/filter/SubDocumentsFilterTest.java | 4 +- .../stormcrawler/parse/filter/XPathFilterTest.java | 2 +- .../protocol/DelegationProtocolTest.java | 2 +- .../digitalpebble.sitemap.extensions.news.xml | 69 ------------------ .../digitalpebble.sitemap.extensions.video.xml | 79 --------------------- core/src/test/resources/digitalpebble.sitemap.xml | 57 --------------- core/src/test/resources/fast.urlfilter.json | 2 +- ...xml => stormcrawler.sitemap.extensions.all.xml} | 54 ++++++++------- ...l => stormcrawler.sitemap.extensions.image.xml} | 54 ++++++++------- ...l => stormcrawler.sitemap.extensions.links.xml} | 63 +++++++++-------- ... => stormcrawler.sitemap.extensions.mobile.xml} | 63 +++++++++-------- .../stormcrawler.sitemap.extensions.news.xml | 70 +++++++++++++++++++ .../stormcrawler.sitemap.extensions.video.xml | 81 ++++++++++++++++++++++ core/src/test/resources/stormcrawler.sitemap.xml | 60 ++++++++++++++++ .../stormcrawler/opensearch/bolt/DeletionBolt.java | 2 +- .../stormcrawler/opensearch/bolt/IndexerBolt.java | 2 +- .../opensearch/persistence/StatusUpdaterBolt.java | 2 +- .../opensearch/bolt/IndexerBoltTest.java | 2 +- .../opensearch/bolt/StatusBoltTest.java | 2 +- .../apache/stormcrawler/tika/ParserBoltTest.java | 2 +- .../urlfrontier/ManagedChannelUtil.java | 2 +- .../stormcrawler/warc/WARCRequestRecordFormat.java | 2 +- 38 files changed, 375 insertions(+), 360 deletions(-) diff --git a/archetype/src/main/resources/archetype-resources/README.md b/archetype/src/main/resources/archetype-resources/README.md index e973f08f..9f4fce32 100644 --- a/archetype/src/main/resources/archetype-resources/README.md +++ b/archetype/src/main/resources/archetype-resources/README.md @@ -3,8 +3,7 @@ Have a look at the code and resources and modify them to your heart's content. # Prerequisites -You need to install Apache Storm. The instructions on [setting up a Storm cluster](https://storm.apache.org/releases/2.6.2/Setting-up-a-Storm-cluster.html) should help. Alternatively, -the [stormcrawler-docker](https://github.com/DigitalPebble/stormcrawler-docker) project contains resources for running Apache Storm on Docker. +You need to install Apache Storm. The instructions on [setting up a Storm cluster](https://storm.apache.org/releases/2.6.2/Setting-up-a-Storm-cluster.html) should help. You also need to have an instance of URLFrontier running. See [the URLFrontier README](https://github.com/crawler-commons/url-frontier/tree/master/service); the easiest way is to use Docker, like so: diff --git a/core/src/main/java/org/apache/stormcrawler/bolt/FetcherBolt.java b/core/src/main/java/org/apache/stormcrawler/bolt/FetcherBolt.java index b4da630a..3f1477d1 100644 --- a/core/src/main/java/org/apache/stormcrawler/bolt/FetcherBolt.java +++ b/core/src/main/java/org/apache/stormcrawler/bolt/FetcherBolt.java @@ -509,7 +509,7 @@ public class FetcherBolt extends StatusEmitterBolt { metadata = new Metadata(); } - // https://github.com/DigitalPebble/storm-crawler/issues/813 + // https://github.com/apache/incubator-stormcrawler/issues/813 metadata.remove("fetch.exception"); boolean asap = false; @@ -568,7 +568,7 @@ public class FetcherBolt extends StatusEmitterBolt { } // has found sitemaps - // https://github.com/DigitalPebble/storm-crawler/issues/710 + // https://github.com/apache/incubator-stormcrawler/issues/710 // note: we don't care if the sitemap URLs where actually // kept boolean foundSitemap = (rules.getSitemaps().size() > 0); @@ -732,7 +732,7 @@ public class FetcherBolt extends StatusEmitterBolt { mergedMD.setValue("_redirTo", redirection); } - // https://github.com/DigitalPebble/storm-crawler/issues/954 + // https://github.com/apache/incubator-stormcrawler/issues/954 if (allowRedirs() && StringUtils.isNotBlank(redirection)) { emitOutlink(fit.t, url, redirection, mergedMD); } diff --git a/core/src/main/java/org/apache/stormcrawler/bolt/JSoupParserBolt.java b/core/src/main/java/org/apache/stormcrawler/bolt/JSoupParserBolt.java index 015403d0..17214a4d 100644 --- a/core/src/main/java/org/apache/stormcrawler/bolt/JSoupParserBolt.java +++ b/core/src/main/java/org/apache/stormcrawler/bolt/JSoupParserBolt.java @@ -347,7 +347,7 @@ public class JSoupParserBolt extends StatusEmitterBolt { LOG.info("Found redir in {} to {}", url, redirection); metadata.setValue("_redirTo", redirection); - // https://github.com/DigitalPebble/storm-crawler/issues/954 + // https://github.com/apache/incubator-stormcrawler/issues/954 if (allowRedirs() && StringUtils.isNotBlank(redirection)) { emitOutlink(tuple, new URL(url), redirection, metadata); } diff --git a/core/src/main/java/org/apache/stormcrawler/bolt/SimpleFetcherBolt.java b/core/src/main/java/org/apache/stormcrawler/bolt/SimpleFetcherBolt.java index 7c5ccfcc..0f783d78 100644 --- a/core/src/main/java/org/apache/stormcrawler/bolt/SimpleFetcherBolt.java +++ b/core/src/main/java/org/apache/stormcrawler/bolt/SimpleFetcherBolt.java @@ -256,7 +256,7 @@ public class SimpleFetcherBolt extends StatusEmitterBolt { metadata = new Metadata(); } - // https://github.com/DigitalPebble/storm-crawler/issues/813 + // https://github.com/apache/incubator-stormcrawler/issues/813 metadata.remove("fetch.exception"); URL url; @@ -326,7 +326,7 @@ public class SimpleFetcherBolt extends StatusEmitterBolt { } // has found sitemaps - // https://github.com/DigitalPebble/storm-crawler/issues/710 + // https://github.com/apache/incubator-stormcrawler/issues/710 // note: we don't care if the sitemap URLs where actually // kept boolean foundSitemap = (rules.getSitemaps().size() > 0); diff --git a/core/src/main/java/org/apache/stormcrawler/filtering/basic/BasicURLNormalizer.java b/core/src/main/java/org/apache/stormcrawler/filtering/basic/BasicURLNormalizer.java index 7550327c..629bc976 100644 --- a/core/src/main/java/org/apache/stormcrawler/filtering/basic/BasicURLNormalizer.java +++ b/core/src/main/java/org/apache/stormcrawler/filtering/basic/BasicURLNormalizer.java @@ -50,7 +50,7 @@ public class BasicURLNormalizer extends URLFilter { /** Nutch 1098 - finds URL encoded parts of the URL */ private static final Pattern unescapeRulePattern = Pattern.compile("%([0-9A-Fa-f]{2})"); - /** https://github.com/DigitalPebble/storm-crawler/issues/401 * */ + /** https://github.com/apache/incubator-stormcrawler/issues/401 * */ private static final Pattern illegalEscapePattern = Pattern.compile("%u([0-9A-Fa-f]{4})"); // charset used for encoding URLs before escaping diff --git a/core/src/main/java/org/apache/stormcrawler/filtering/regex/FastURLFilter.java b/core/src/main/java/org/apache/stormcrawler/filtering/regex/FastURLFilter.java index 671b9c0f..50f528f2 100644 --- a/core/src/main/java/org/apache/stormcrawler/filtering/regex/FastURLFilter.java +++ b/core/src/main/java/org/apache/stormcrawler/filtering/regex/FastURLFilter.java @@ -112,7 +112,7 @@ public class FastURLFilter extends URLFilter implements JSONResource { // if it contains a single object // jump directly to its content - // https://github.com/DigitalPebble/storm-crawler/issues/1013 + // https://github.com/apache/incubator-stormcrawler/issues/1013 if (rootNode.size() == 1 && rootNode.isObject()) { rootNode = rootNode.fields().next().getValue(); } diff --git a/core/src/main/java/org/apache/stormcrawler/filtering/sitemap/SitemapFilter.java b/core/src/main/java/org/apache/stormcrawler/filtering/sitemap/SitemapFilter.java index 6670663e..5beec278 100644 --- a/core/src/main/java/org/apache/stormcrawler/filtering/sitemap/SitemapFilter.java +++ b/core/src/main/java/org/apache/stormcrawler/filtering/sitemap/SitemapFilter.java @@ -36,7 +36,7 @@ import org.jetbrains.annotations.Nullable; * </pre> * * Will be replaced by <a href= - * "https://github.com/DigitalPebble/storm-crawler/issues/711">MetadataFilter to filter based on + * "https://github.com/apache/incubator-stormcrawler/issues/711">MetadataFilter to filter based on * multiple key values</a> * * @since 1.14 diff --git a/core/src/main/java/org/apache/stormcrawler/persistence/AbstractStatusUpdaterBolt.java b/core/src/main/java/org/apache/stormcrawler/persistence/AbstractStatusUpdaterBolt.java index 04bf9bfe..44d7a89f 100644 --- a/core/src/main/java/org/apache/stormcrawler/persistence/AbstractStatusUpdaterBolt.java +++ b/core/src/main/java/org/apache/stormcrawler/persistence/AbstractStatusUpdaterBolt.java @@ -207,7 +207,7 @@ public abstract class AbstractStatusUpdaterBolt extends BaseRichBolt { if (!status.equals(Status.FETCH_ERROR)) { metadata.remove(Constants.fetchErrorCountParamName); } - // https://github.com/DigitalPebble/storm-crawler/issues/415 + // https://github.com/apache/incubator-stormcrawler/issues/415 // remove error related key values in case of success if (status.equals(Status.FETCHED) || status.equals(Status.REDIRECTION)) { metadata.remove(Constants.STATUS_ERROR_CAUSE); diff --git a/core/src/main/java/org/apache/stormcrawler/protocol/ProtocolResponse.java b/core/src/main/java/org/apache/stormcrawler/protocol/ProtocolResponse.java index f997957f..b79163d8 100644 --- a/core/src/main/java/org/apache/stormcrawler/protocol/ProtocolResponse.java +++ b/core/src/main/java/org/apache/stormcrawler/protocol/ProtocolResponse.java @@ -58,7 +58,7 @@ public class ProtocolResponse { /** * @since 1.17 - * @see <a href="https://github.com/DigitalPebble/storm-crawler/issues/776">Issue 776</a> + * @see <a href="https://github.com/apache/incubator-stormcrawler/issues/776">Issue 776</a> */ public static final String PROTOCOL_MD_PREFIX_PARAM = "protocol.md.prefix"; diff --git a/core/src/main/java/org/apache/stormcrawler/util/CharsetIdentification.java b/core/src/main/java/org/apache/stormcrawler/util/CharsetIdentification.java index b9a767a9..1ef8a712 100644 --- a/core/src/main/java/org/apache/stormcrawler/util/CharsetIdentification.java +++ b/core/src/main/java/org/apache/stormcrawler/util/CharsetIdentification.java @@ -186,7 +186,7 @@ public class CharsetIdentification { int start = html.indexOf("<meta charset=\""); if (start != -1) { int end = html.indexOf('"', start + 15); - // https://github.com/DigitalPebble/storm-crawler/issues/870 + // https://github.com/apache/incubator-stormcrawler/issues/870 // try on a slightly larger section of text if it is trimmed if (end == -1 && ((maxlength + 10) < buffer.length)) { return getCharsetFromMeta(buffer, maxlength + 10); diff --git a/core/src/test/java/org/apache/stormcrawler/bolt/SiteMapParserBoltTest.java b/core/src/test/java/org/apache/stormcrawler/bolt/SiteMapParserBoltTest.java index d96ce5f6..de8d7778 100644 --- a/core/src/test/java/org/apache/stormcrawler/bolt/SiteMapParserBoltTest.java +++ b/core/src/test/java/org/apache/stormcrawler/bolt/SiteMapParserBoltTest.java @@ -52,8 +52,8 @@ class SiteMapParserBoltTest extends ParsingTester { metadata.setValue(SiteMapParserBolt.isSitemapKey, "true"); // and its mime-type metadata.setValue(HttpHeaders.CONTENT_TYPE, "application/xml"); - parse("http://stormcrawler.apache.org/sitemap.xml", "digitalpebble.sitemap.xml", metadata); - Assertions.assertEquals(6, output.getEmitted(Constants.StatusStreamName).size()); + parse("http://stormcrawler.apache.org/sitemap.xml", "stormcrawler.sitemap.xml", metadata); + Assertions.assertEquals(7, output.getEmitted(Constants.StatusStreamName).size()); // TODO test that the new links have the right metadata List<Object> fields = output.getEmitted(Constants.StatusStreamName).get(0); Assertions.assertEquals(3, fields.size()); @@ -101,7 +101,7 @@ class SiteMapParserBoltTest extends ParsingTester { metadata.setValue(HttpHeaders.CONTENT_TYPE, "application/xml"); parse( "http://stormcrawler.apache.org/sitemap.xml", - "digitalpebble.sitemap.extensions.image.xml", + "stormcrawler.sitemap.extensions.image.xml", metadata); Values values = (Values) output.getEmitted(Constants.StatusStreamName).get(0); Metadata parsedMetadata = (Metadata) values.get(1); @@ -120,7 +120,7 @@ class SiteMapParserBoltTest extends ParsingTester { metadata.setValue(HttpHeaders.CONTENT_TYPE, "application/xml"); parse( "http://stormcrawler.apache.org/sitemap.xml", - "digitalpebble.sitemap.extensions.mobile.xml", + "stormcrawler.sitemap.extensions.mobile.xml", metadata); Values values = (Values) output.getEmitted(Constants.StatusStreamName).get(0); Metadata parsedMetadata = (Metadata) values.get(1); @@ -139,7 +139,7 @@ class SiteMapParserBoltTest extends ParsingTester { metadata.setValue(HttpHeaders.CONTENT_TYPE, "application/xml"); parse( "http://stormcrawler.apache.org/sitemap.xml", - "digitalpebble.sitemap.extensions.links.xml", + "stormcrawler.sitemap.extensions.links.xml", metadata); Values values = (Values) output.getEmitted(Constants.StatusStreamName).get(0); Metadata parsedMetadata = (Metadata) values.get(1); @@ -158,7 +158,7 @@ class SiteMapParserBoltTest extends ParsingTester { metadata.setValue(HttpHeaders.CONTENT_TYPE, "application/xml"); parse( "http://stormcrawler.apache.org/sitemap.xml", - "digitalpebble.sitemap.extensions.news.xml", + "stormcrawler.sitemap.extensions.news.xml", metadata); Values values = (Values) output.getEmitted(Constants.StatusStreamName).get(0); Metadata parsedMetadata = (Metadata) values.get(1); @@ -177,7 +177,7 @@ class SiteMapParserBoltTest extends ParsingTester { metadata.setValue(HttpHeaders.CONTENT_TYPE, "application/xml"); parse( "http://stormcrawler.apache.org/sitemap.xml", - "digitalpebble.sitemap.extensions.video.xml", + "stormcrawler.sitemap.extensions.video.xml", metadata); Values values = (Values) output.getEmitted(Constants.StatusStreamName).get(0); Metadata parsedMetadata = (Metadata) values.get(1); @@ -203,7 +203,7 @@ class SiteMapParserBoltTest extends ParsingTester { metadata.setValue(HttpHeaders.CONTENT_TYPE, "application/xml"); parse( "http://stormcrawler.apache.org/sitemap.xml", - "digitalpebble.sitemap.extensions.all.xml", + "stormcrawler.sitemap.extensions.all.xml", metadata); Values values = (Values) output.getEmitted(Constants.StatusStreamName).get(0); Metadata parsedMetadata = (Metadata) values.get(1); @@ -237,8 +237,8 @@ class SiteMapParserBoltTest extends ParsingTester { Metadata metadata = new Metadata(); // do not specify that it is a sitemap file // do not set the mimetype - parse("http://stormcrawler.apache.org/sitemap.xml", "digitalpebble.sitemap.xml", metadata); - Assertions.assertEquals(6, output.getEmitted(Constants.StatusStreamName).size()); + parse("http://stormcrawler.apache.org/sitemap.xml", "stormcrawler.sitemap.xml", metadata); + Assertions.assertEquals(7, output.getEmitted(Constants.StatusStreamName).size()); // TODO test that the new links have the right metadata List<Object> fields = output.getEmitted(Constants.StatusStreamName).get(0); Assertions.assertEquals(3, fields.size()); diff --git a/core/src/test/java/org/apache/stormcrawler/filtering/BasicURLNormalizerTest.java b/core/src/test/java/org/apache/stormcrawler/filtering/BasicURLNormalizerTest.java index b9594cc3..250ea401 100644 --- a/core/src/test/java/org/apache/stormcrawler/filtering/BasicURLNormalizerTest.java +++ b/core/src/test/java/org/apache/stormcrawler/filtering/BasicURLNormalizerTest.java @@ -289,7 +289,7 @@ class BasicURLNormalizerTest { assertEquals(expectedResult, normalizedUrl, "Failed to filter query string"); } - // https://github.com/DigitalPebble/storm-crawler/issues/401 + // https://github.com/apache/incubator-stormcrawler/issues/401 @Test void testNonStandardPercentEncoding() throws MalformedURLException { URLFilter urlFilter = createFilter(false, false); diff --git a/core/src/test/java/org/apache/stormcrawler/filtering/FastURLFilterTest.java b/core/src/test/java/org/apache/stormcrawler/filtering/FastURLFilterTest.java index fdf68fb3..4ea88b5b 100644 --- a/core/src/test/java/org/apache/stormcrawler/filtering/FastURLFilterTest.java +++ b/core/src/test/java/org/apache/stormcrawler/filtering/FastURLFilterTest.java @@ -53,7 +53,7 @@ class FastURLFilterTest { String filterResult = createFilter().filter(url, metadata, url.toExternalForm()); Assertions.assertEquals(null, filterResult); // allowed - url = new URL("http://stormcrawler.net/digitalpebble/"); + url = new URL("http://stormcrawler.net/bla/"); filterResult = createFilter().filter(url, metadata, url.toExternalForm()); Assertions.assertEquals(url.toString(), filterResult); } diff --git a/core/src/test/java/org/apache/stormcrawler/indexer/BasicIndexingTest.java b/core/src/test/java/org/apache/stormcrawler/indexer/BasicIndexingTest.java index 9b73fc26..d00bd4a6 100644 --- a/core/src/test/java/org/apache/stormcrawler/indexer/BasicIndexingTest.java +++ b/core/src/test/java/org/apache/stormcrawler/indexer/BasicIndexingTest.java @@ -87,7 +87,7 @@ class BasicIndexingTest extends IndexerTester { config.put(AbstractIndexerBolt.urlFieldParamName, "url"); config.put(AbstractIndexerBolt.canonicalMetadataParamName, "canonical"); Metadata metadata = new Metadata(); - metadata.setValue("canonical", "htp://www.digitalpebble.com/"); + metadata.setValue("canonical", "htp://stormcrawler.apache.org/"); prepareIndexerBolt(config); index(URL, metadata); Map<String, String> fields = ((DummyIndexer) bolt).returnFields(); diff --git a/core/src/test/java/org/apache/stormcrawler/jsoup/JSoupFiltersTest.java b/core/src/test/java/org/apache/stormcrawler/jsoup/JSoupFiltersTest.java index de433d1c..33f96dbb 100644 --- a/core/src/test/java/org/apache/stormcrawler/jsoup/JSoupFiltersTest.java +++ b/core/src/test/java/org/apache/stormcrawler/jsoup/JSoupFiltersTest.java @@ -58,7 +58,7 @@ class JSoupFiltersTest extends ParsingTester { } @Test - // https://github.com/DigitalPebble/storm-crawler/issues/219 + // https://github.com/apache/incubator-stormcrawler/issues/219 void testScriptExtraction() throws IOException { prepareParserBolt("test.jsoupfilters.json"); parse("http://stormcrawler.apache.org", "stormcrawler.apache.org.html"); diff --git a/core/src/test/java/org/apache/stormcrawler/parse/StackOverflowTest.java b/core/src/test/java/org/apache/stormcrawler/parse/StackOverflowTest.java index 3a0a3956..02abfab5 100644 --- a/core/src/test/java/org/apache/stormcrawler/parse/StackOverflowTest.java +++ b/core/src/test/java/org/apache/stormcrawler/parse/StackOverflowTest.java @@ -28,7 +28,7 @@ import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; /** - * @see https://github.com/DigitalPebble/storm-crawler/pull/653 * + * @see https://github.com/apache/incubator-stormcrawler/pull/653 * */ class StackOverflowTest extends ParsingTester { @@ -47,7 +47,7 @@ class StackOverflowTest extends ParsingTester { } /** - * @see https://github.com/DigitalPebble/storm-crawler/issues/666 * + * @see https://github.com/apache/incubator-stormcrawler/issues/666 * */ @Test void testNamespaceExtraction() throws IOException { diff --git a/core/src/test/java/org/apache/stormcrawler/parse/filter/SubDocumentsFilterTest.java b/core/src/test/java/org/apache/stormcrawler/parse/filter/SubDocumentsFilterTest.java index 408d8503..f74c34a6 100644 --- a/core/src/test/java/org/apache/stormcrawler/parse/filter/SubDocumentsFilterTest.java +++ b/core/src/test/java/org/apache/stormcrawler/parse/filter/SubDocumentsFilterTest.java @@ -40,7 +40,7 @@ class SubDocumentsFilterTest extends ParsingTester { config.put("detect.mimetype", false); prepareParserBolt("test.subdocfilter.json", config); Metadata metadata = new Metadata(); - parse("http://stormcrawler.apache.org/sitemap.xml", "digitalpebble.sitemap.xml", metadata); - Assertions.assertEquals(6, output.getEmitted().size()); + parse("http://stormcrawler.apache.org/sitemap.xml", "stormcrawler.sitemap.xml", metadata); + Assertions.assertEquals(7, output.getEmitted().size()); } } diff --git a/core/src/test/java/org/apache/stormcrawler/parse/filter/XPathFilterTest.java b/core/src/test/java/org/apache/stormcrawler/parse/filter/XPathFilterTest.java index a15e0833..7a8077f3 100644 --- a/core/src/test/java/org/apache/stormcrawler/parse/filter/XPathFilterTest.java +++ b/core/src/test/java/org/apache/stormcrawler/parse/filter/XPathFilterTest.java @@ -48,7 +48,7 @@ class XPathFilterTest extends ParsingTester { } @Test - // https://github.com/DigitalPebble/storm-crawler/issues/219 + // https://github.com/apache/incubator-stormcrawler/issues/219 void testScriptExtraction() throws IOException { prepareParserBolt("test.parsefilters.json"); parse("http://stormcrawler.apache.org", "stormcrawler.apache.org.html"); diff --git a/core/src/test/java/org/apache/stormcrawler/protocol/DelegationProtocolTest.java b/core/src/test/java/org/apache/stormcrawler/protocol/DelegationProtocolTest.java index 9a706829..a4d25cb6 100644 --- a/core/src/test/java/org/apache/stormcrawler/protocol/DelegationProtocolTest.java +++ b/core/src/test/java/org/apache/stormcrawler/protocol/DelegationProtocolTest.java @@ -40,7 +40,7 @@ class DelegationProtocolTest { // try single filter Metadata meta = new Metadata(); meta.setValue("js", "true"); - FilteredProtocol pf = superProto.getProtocolFor("https://digitalpebble.com", meta); + FilteredProtocol pf = superProto.getProtocolFor("https://stormcrawler.apache.org", meta); Assertions.assertEquals(pf.id, "second"); // no filter at all meta = new Metadata(); diff --git a/core/src/test/resources/digitalpebble.sitemap.extensions.news.xml b/core/src/test/resources/digitalpebble.sitemap.extensions.news.xml deleted file mode 100644 index 9243b66b..00000000 --- a/core/src/test/resources/digitalpebble.sitemap.extensions.news.xml +++ /dev/null @@ -1,69 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<!-- -Licensed to the Apache Software Foundation (ASF) under one -or more contributor license agreements. See the NOTICE file -distributed with this work for additional information -regarding copyright ownership. The ASF licenses this file -to you under the Apache License, Version 2.0 (the -"License"); you may not use this file except in compliance -with the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, -software distributed under the License is distributed on an -"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -KIND, either express or implied. See the License for the -specific language governing permissions and limitations -under the License. ---> -<urlset - xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" - xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" - xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9 - http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd" - xmlns:news="http://www.google.com/schemas/sitemap-news/0.9"> -<!-- created with Free Online Sitemap Generator www.xml-sitemaps.com --> - -<url> - <loc>http://digitalpebble.com/</loc> - <lastmod>2012-12-05T10:59:04+00:00</lastmod> - <changefreq>monthly</changefreq> - <priority>1.00</priority> - <news:news> - <news:publication> - <news:name>The Example Times</news:name> - <news:language>en</news:language> - </news:publication> - <news:genres>PressRelease, Blog</news:genres> - <news:publication_date>2008-12-23</news:publication_date> - <news:title>Companies A, B in Merger Talks</news:title> - <news:keywords>business, merger, acquisition, A, B</news:keywords> - <news:stock_tickers>NASDAQ:A, NASDAQ:B</news:stock_tickers> - </news:news> -</url> -<url> - <loc>http://digitalpebble.com/index.html</loc> - <lastmod>2012-12-05T10:59:04+00:00</lastmod> - <changefreq>monthly</changefreq> - <priority>0.80</priority> -</url> -<url> - <loc>http://digitalpebble.com/solutions.html</loc> - <lastmod>2012-09-06T16:53:04+00:00</lastmod> - <changefreq>monthly</changefreq> - <priority>0.80</priority> -</url> -<url> - <loc>http://digitalpebble.com/references.html</loc> - <lastmod>2014-04-16T14:40:10+00:00</lastmod> - <changefreq>monthly</changefreq> - <priority>0.80</priority> -</url> -<url> - <loc>http://digitalpebble.com/contact.html</loc> - <lastmod>2012-12-05T10:59:00+00:00</lastmod> - <changefreq>monthly</changefreq> - <priority>0.80</priority> -</url> -</urlset> diff --git a/core/src/test/resources/digitalpebble.sitemap.extensions.video.xml b/core/src/test/resources/digitalpebble.sitemap.extensions.video.xml deleted file mode 100644 index 20a6a792..00000000 --- a/core/src/test/resources/digitalpebble.sitemap.extensions.video.xml +++ /dev/null @@ -1,79 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<!-- -Licensed to the Apache Software Foundation (ASF) under one -or more contributor license agreements. See the NOTICE file -distributed with this work for additional information -regarding copyright ownership. The ASF licenses this file -to you under the Apache License, Version 2.0 (the -"License"); you may not use this file except in compliance -with the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, -software distributed under the License is distributed on an -"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -KIND, either express or implied. See the License for the -specific language governing permissions and limitations -under the License. ---> -<urlset - xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" - xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" - xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9 - http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd" - xmlns:video="http://www.google.com/schemas/sitemap-video/1.1"> -<!-- created with Free Online Sitemap Generator www.xml-sitemaps.com --> - -<url> - <loc>http://digitalpebble.com/</loc> - <lastmod>2012-12-05T10:59:04+00:00</lastmod> - <changefreq>monthly</changefreq> - <priority>1.00</priority> - <video:video> - <video:thumbnail_loc>http://www.example.com/thumbs/123.jpg</video:thumbnail_loc> - <video:title>Grilling steaks for summer</video:title> - <video:description>Alkis shows you how to get perfectly done steaks every time</video:description> - <video:content_loc>http://www.example.com/video123.flv</video:content_loc> - <video:player_loc allow_embed="yes" autoplay="ap=1">http://www.example.com/videoplayer.swf?video=123</video:player_loc> - <video:duration>600</video:duration> - <video:expiration_date>2009-11-05T19:20:30+08:00</video:expiration_date> - <video:rating>4.2</video:rating> - <video:view_count>12345</video:view_count> - <video:publication_date>2007-11-05T19:20:30+08:00</video:publication_date> - <video:tag>sample_tag1</video:tag> - <video:tag>sample_tag2</video:tag> - <video:family_friendly>yes</video:family_friendly> - <video:restriction relationship="allow">IE GB US CA</video:restriction> - <video:gallery_loc title="Cooking Videos">http://cooking.example.com</video:gallery_loc> - <video:price currency="EUR">1.99</video:price> - <video:requires_subscription>yes</video:requires_subscription> - <video:uploader info="http://www.example.com/users/grillymcgrillerson">GrillyMcGrillerson</video:uploader> - <video:live>no</video:live> - </video:video> -</url> -<url> - <loc>http://digitalpebble.com/index.html</loc> - <lastmod>2012-12-05T10:59:04+00:00</lastmod> - <changefreq>monthly</changefreq> - <priority>0.80</priority> -</url> -<url> - <loc>http://digitalpebble.com/solutions.html</loc> - <lastmod>2012-09-06T16:53:04+00:00</lastmod> - <changefreq>monthly</changefreq> - <priority>0.80</priority> -</url> -<url> - <loc>http://digitalpebble.com/references.html</loc> - <lastmod>2014-04-16T14:40:10+00:00</lastmod> - <changefreq>monthly</changefreq> - <priority>0.80</priority> -</url> -<url> - <loc>http://digitalpebble.com/contact.html</loc> - <lastmod>2012-12-05T10:59:00+00:00</lastmod> - <changefreq>monthly</changefreq> - <priority>0.80</priority> -</url> -</urlset> diff --git a/core/src/test/resources/digitalpebble.sitemap.xml b/core/src/test/resources/digitalpebble.sitemap.xml deleted file mode 100644 index 09cea4ba..00000000 --- a/core/src/test/resources/digitalpebble.sitemap.xml +++ /dev/null @@ -1,57 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<!-- -Licensed to the Apache Software Foundation (ASF) under one -or more contributor license agreements. See the NOTICE file -distributed with this work for additional information -regarding copyright ownership. The ASF licenses this file -to you under the Apache License, Version 2.0 (the -"License"); you may not use this file except in compliance -with the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, -software distributed under the License is distributed on an -"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -KIND, either express or implied. See the License for the -specific language governing permissions and limitations -under the License. ---> -<urlset - xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" - xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" - xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9 - http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd"> -<!-- created with Free Online Sitemap Generator www.xml-sitemaps.com --> - -<url> - <loc>http://digitalpebble.com/</loc> - <lastmod>2012-12-05T10:59:04+00:00</lastmod> - <changefreq>monthly</changefreq> - <priority>1.00</priority> -</url> -<url> - <loc>http://digitalpebble.com/index.html</loc> - <lastmod>2012-12-05T10:59:04+00:00</lastmod> - <changefreq>monthly</changefreq> - <priority>0.80</priority> -</url> -<url> - <loc>http://digitalpebble.com/solutions.html</loc> - <lastmod>2012-09-06T16:53:04+00:00</lastmod> - <changefreq>monthly</changefreq> - <priority>0.80</priority> -</url> -<url> - <loc>http://digitalpebble.com/references.html</loc> - <lastmod>2014-04-16T14:40:10+00:00</lastmod> - <changefreq>monthly</changefreq> - <priority>0.80</priority> -</url> -<url> - <loc>http://digitalpebble.com/contact.html</loc> - <lastmod>2012-12-05T10:59:00+00:00</lastmod> - <changefreq>monthly</changefreq> - <priority>0.80</priority> -</url> -</urlset> diff --git a/core/src/test/resources/fast.urlfilter.json b/core/src/test/resources/fast.urlfilter.json index d51953b8..866d8c62 100644 --- a/core/src/test/resources/fast.urlfilter.json +++ b/core/src/test/resources/fast.urlfilter.json @@ -4,7 +4,7 @@ "patterns" : [ "DenyPathQuery \\.jpg" ] }, { "scope" : "domain:stormcrawler.net", - "patterns" : [ "AllowPath /digitalpebble/", "DenyPath .+" ] + "patterns" : [ "AllowPath /bla/", "DenyPath .+" ] }, { "scope" : "metadata:key=value", "patterns" : [ "DenyPath .+" ] diff --git a/core/src/test/resources/digitalpebble.sitemap.extensions.all.xml b/core/src/test/resources/stormcrawler.sitemap.extensions.all.xml similarity index 80% rename from core/src/test/resources/digitalpebble.sitemap.extensions.all.xml rename to core/src/test/resources/stormcrawler.sitemap.extensions.all.xml index af3f14c7..6958b115 100644 --- a/core/src/test/resources/digitalpebble.sitemap.extensions.all.xml +++ b/core/src/test/resources/stormcrawler.sitemap.extensions.all.xml @@ -76,28 +76,34 @@ under the License. <video:live>no</video:live> </video:video> </url> -<url> - <loc>http://digitalpebble.com/index.html</loc> - <lastmod>2012-12-05T10:59:04+00:00</lastmod> - <changefreq>monthly</changefreq> - <priority>0.80</priority> -</url> -<url> - <loc>http://digitalpebble.com/solutions.html</loc> - <lastmod>2012-09-06T16:53:04+00:00</lastmod> - <changefreq>monthly</changefreq> - <priority>0.80</priority> -</url> -<url> - <loc>http://digitalpebble.com/references.html</loc> - <lastmod>2014-04-16T14:40:10+00:00</lastmod> - <changefreq>monthly</changefreq> - <priority>0.80</priority> -</url> -<url> - <loc>http://digitalpebble.com/contact.html</loc> - <lastmod>2012-12-05T10:59:00+00:00</lastmod> - <changefreq>monthly</changefreq> - <priority>0.80</priority> -</url> + <url> + <loc>https://stormcrawler.apache.org/</loc> + <lastmod>2024-10-19T11:21:53+00:00</lastmod> + <priority>1.00</priority> + </url> + <url> + <loc>https://stormcrawler.apache.org/index.html</loc> + <lastmod>2024-10-19T11:21:53+00:00</lastmod> + <priority>0.80</priority> + </url> + <url> + <loc>https://stormcrawler.apache.org/download/index.html</loc> + <lastmod>2024-10-19T11:21:53+00:00</lastmod> + <priority>0.80</priority> + </url> + <url> + <loc>https://stormcrawler.apache.org/getting-started/</loc> + <lastmod>2024-10-19T11:21:53+00:00</lastmod> + <priority>0.80</priority> + </url> + <url> + <loc>https://stormcrawler.apache.org/faq/</loc> + <lastmod>2024-10-19T11:21:53+00:00</lastmod> + <priority>0.80</priority> + </url> + <url> + <loc>https://stormcrawler.apache.org/support/</loc> + <lastmod>2024-10-19T11:21:53+00:00</lastmod> + <priority>0.80</priority> + </url> </urlset> diff --git a/core/src/test/resources/digitalpebble.sitemap.extensions.image.xml b/core/src/test/resources/stormcrawler.sitemap.extensions.image.xml similarity index 65% rename from core/src/test/resources/digitalpebble.sitemap.extensions.image.xml rename to core/src/test/resources/stormcrawler.sitemap.extensions.image.xml index f5dd7bbb..99ecb553 100644 --- a/core/src/test/resources/digitalpebble.sitemap.extensions.image.xml +++ b/core/src/test/resources/stormcrawler.sitemap.extensions.image.xml @@ -38,28 +38,34 @@ under the License. <image:license>https://creativecommons.org/licenses/by/4.0/legalcode</image:license> </image:image> </url> -<url> - <loc>http://digitalpebble.com/index.html</loc> - <lastmod>2012-12-05T10:59:04+00:00</lastmod> - <changefreq>monthly</changefreq> - <priority>0.80</priority> -</url> -<url> - <loc>http://digitalpebble.com/solutions.html</loc> - <lastmod>2012-09-06T16:53:04+00:00</lastmod> - <changefreq>monthly</changefreq> - <priority>0.80</priority> -</url> -<url> - <loc>http://digitalpebble.com/references.html</loc> - <lastmod>2014-04-16T14:40:10+00:00</lastmod> - <changefreq>monthly</changefreq> - <priority>0.80</priority> -</url> -<url> - <loc>http://digitalpebble.com/contact.html</loc> - <lastmod>2012-12-05T10:59:00+00:00</lastmod> - <changefreq>monthly</changefreq> - <priority>0.80</priority> -</url> + <url> + <loc>https://stormcrawler.apache.org/</loc> + <lastmod>2024-10-19T11:21:53+00:00</lastmod> + <priority>1.00</priority> + </url> + <url> + <loc>https://stormcrawler.apache.org/index.html</loc> + <lastmod>2024-10-19T11:21:53+00:00</lastmod> + <priority>0.80</priority> + </url> + <url> + <loc>https://stormcrawler.apache.org/download/index.html</loc> + <lastmod>2024-10-19T11:21:53+00:00</lastmod> + <priority>0.80</priority> + </url> + <url> + <loc>https://stormcrawler.apache.org/getting-started/</loc> + <lastmod>2024-10-19T11:21:53+00:00</lastmod> + <priority>0.80</priority> + </url> + <url> + <loc>https://stormcrawler.apache.org/faq/</loc> + <lastmod>2024-10-19T11:21:53+00:00</lastmod> + <priority>0.80</priority> + </url> + <url> + <loc>https://stormcrawler.apache.org/support/</loc> + <lastmod>2024-10-19T11:21:53+00:00</lastmod> + <priority>0.80</priority> + </url> </urlset> diff --git a/core/src/test/resources/digitalpebble.sitemap.extensions.links.xml b/core/src/test/resources/stormcrawler.sitemap.extensions.links.xml similarity index 54% rename from core/src/test/resources/digitalpebble.sitemap.extensions.links.xml rename to core/src/test/resources/stormcrawler.sitemap.extensions.links.xml index 41382dce..4d52b284 100644 --- a/core/src/test/resources/digitalpebble.sitemap.extensions.links.xml +++ b/core/src/test/resources/stormcrawler.sitemap.extensions.links.xml @@ -24,36 +24,35 @@ under the License. http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd" xmlns:xhtml="http://www.w3.org/1999/xhtml"> <!-- created with Free Online Sitemap Generator www.xml-sitemaps.com --> - -<url> - <loc>http://digitalpebble.com/</loc> - <lastmod>2012-12-05T10:59:04+00:00</lastmod> - <changefreq>monthly</changefreq> - <priority>1.00</priority> - <xhtml:link rel="alternate" hreflang="en" href="http://www.example.com/english/" /> -</url> -<url> - <loc>http://digitalpebble.com/index.html</loc> - <lastmod>2012-12-05T10:59:04+00:00</lastmod> - <changefreq>monthly</changefreq> - <priority>0.80</priority> -</url> -<url> - <loc>http://digitalpebble.com/solutions.html</loc> - <lastmod>2012-09-06T16:53:04+00:00</lastmod> - <changefreq>monthly</changefreq> - <priority>0.80</priority> -</url> -<url> - <loc>http://digitalpebble.com/references.html</loc> - <lastmod>2014-04-16T14:40:10+00:00</lastmod> - <changefreq>monthly</changefreq> - <priority>0.80</priority> -</url> -<url> - <loc>http://digitalpebble.com/contact.html</loc> - <lastmod>2012-12-05T10:59:00+00:00</lastmod> - <changefreq>monthly</changefreq> - <priority>0.80</priority> -</url> + <url> + <loc>https://stormcrawler.apache.org/</loc> + <lastmod>2024-10-19T11:21:53+00:00</lastmod> + <priority>1.00</priority> + <xhtml:link rel="alternate" hreflang="en" href="http://www.example.com/english/" /> + </url> + <url> + <loc>https://stormcrawler.apache.org/index.html</loc> + <lastmod>2024-10-19T11:21:53+00:00</lastmod> + <priority>0.80</priority> + </url> + <url> + <loc>https://stormcrawler.apache.org/download/index.html</loc> + <lastmod>2024-10-19T11:21:53+00:00</lastmod> + <priority>0.80</priority> + </url> + <url> + <loc>https://stormcrawler.apache.org/getting-started/</loc> + <lastmod>2024-10-19T11:21:53+00:00</lastmod> + <priority>0.80</priority> + </url> + <url> + <loc>https://stormcrawler.apache.org/faq/</loc> + <lastmod>2024-10-19T11:21:53+00:00</lastmod> + <priority>0.80</priority> + </url> + <url> + <loc>https://stormcrawler.apache.org/support/</loc> + <lastmod>2024-10-19T11:21:53+00:00</lastmod> + <priority>0.80</priority> + </url> </urlset> diff --git a/core/src/test/resources/digitalpebble.sitemap.extensions.mobile.xml b/core/src/test/resources/stormcrawler.sitemap.extensions.mobile.xml similarity index 56% rename from core/src/test/resources/digitalpebble.sitemap.extensions.mobile.xml rename to core/src/test/resources/stormcrawler.sitemap.extensions.mobile.xml index 16351dc7..685e302b 100644 --- a/core/src/test/resources/digitalpebble.sitemap.extensions.mobile.xml +++ b/core/src/test/resources/stormcrawler.sitemap.extensions.mobile.xml @@ -24,36 +24,35 @@ under the License. http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd" xmlns:mobile="http://www.google.com/schemas/sitemap-mobile/1.0"> <!-- created with Free Online Sitemap Generator www.xml-sitemaps.com --> - -<url> - <loc>http://digitalpebble.com/</loc> - <lastmod>2012-12-05T10:59:04+00:00</lastmod> - <changefreq>monthly</changefreq> - <priority>1.00</priority> - <mobile:mobile/> -</url> -<url> - <loc>http://digitalpebble.com/index.html</loc> - <lastmod>2012-12-05T10:59:04+00:00</lastmod> - <changefreq>monthly</changefreq> - <priority>0.80</priority> -</url> -<url> - <loc>http://digitalpebble.com/solutions.html</loc> - <lastmod>2012-09-06T16:53:04+00:00</lastmod> - <changefreq>monthly</changefreq> - <priority>0.80</priority> -</url> -<url> - <loc>http://digitalpebble.com/references.html</loc> - <lastmod>2014-04-16T14:40:10+00:00</lastmod> - <changefreq>monthly</changefreq> - <priority>0.80</priority> -</url> -<url> - <loc>http://digitalpebble.com/contact.html</loc> - <lastmod>2012-12-05T10:59:00+00:00</lastmod> - <changefreq>monthly</changefreq> - <priority>0.80</priority> -</url> + <url> + <loc>https://stormcrawler.apache.org/</loc> + <lastmod>2024-10-19T11:21:53+00:00</lastmod> + <priority>1.00</priority> + <mobile:mobile/> + </url> + <url> + <loc>https://stormcrawler.apache.org/index.html</loc> + <lastmod>2024-10-19T11:21:53+00:00</lastmod> + <priority>0.80</priority> + </url> + <url> + <loc>https://stormcrawler.apache.org/download/index.html</loc> + <lastmod>2024-10-19T11:21:53+00:00</lastmod> + <priority>0.80</priority> + </url> + <url> + <loc>https://stormcrawler.apache.org/getting-started/</loc> + <lastmod>2024-10-19T11:21:53+00:00</lastmod> + <priority>0.80</priority> + </url> + <url> + <loc>https://stormcrawler.apache.org/faq/</loc> + <lastmod>2024-10-19T11:21:53+00:00</lastmod> + <priority>0.80</priority> + </url> + <url> + <loc>https://stormcrawler.apache.org/support/</loc> + <lastmod>2024-10-19T11:21:53+00:00</lastmod> + <priority>0.80</priority> + </url> </urlset> diff --git a/core/src/test/resources/stormcrawler.sitemap.extensions.news.xml b/core/src/test/resources/stormcrawler.sitemap.extensions.news.xml new file mode 100644 index 00000000..7723c3c6 --- /dev/null +++ b/core/src/test/resources/stormcrawler.sitemap.extensions.news.xml @@ -0,0 +1,70 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- +Licensed to the Apache Software Foundation (ASF) under one +or more contributor license agreements. See the NOTICE file +distributed with this work for additional information +regarding copyright ownership. The ASF licenses this file +to you under the Apache License, Version 2.0 (the +"License"); you may not use this file except in compliance +with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, +software distributed under the License is distributed on an +"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +KIND, either express or implied. See the License for the +specific language governing permissions and limitations +under the License. +--> +<urlset + xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9 + http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd" + xmlns:news="http://www.google.com/schemas/sitemap-news/0.9"> +<!-- created with Free Online Sitemap Generator www.xml-sitemaps.com --> + <url> + <loc>https://stormcrawler.apache.org/</loc> + <lastmod>2024-10-19T11:21:53+00:00</lastmod> + <changefreq>monthly</changefreq> + <priority>1.00</priority> + <news:news> + <news:publication> + <news:name>The Example Times</news:name> + <news:language>en</news:language> + </news:publication> + <news:genres>PressRelease, Blog</news:genres> + <news:publication_date>2008-12-23</news:publication_date> + <news:title>Companies A, B in Merger Talks</news:title> + <news:keywords>business, merger, acquisition, A, B</news:keywords> + <news:stock_tickers>NASDAQ:A, NASDAQ:B</news:stock_tickers> + </news:news> + </url> + <url> + <loc>https://stormcrawler.apache.org/index.html</loc> + <lastmod>2024-10-19T11:21:53+00:00</lastmod> + <priority>0.80</priority> + </url> + <url> + <loc>https://stormcrawler.apache.org/download/index.html</loc> + <lastmod>2024-10-19T11:21:53+00:00</lastmod> + <priority>0.80</priority> + </url> + <url> + <loc>https://stormcrawler.apache.org/getting-started/</loc> + <lastmod>2024-10-19T11:21:53+00:00</lastmod> + <priority>0.80</priority> + </url> + <url> + <loc>https://stormcrawler.apache.org/faq/</loc> + <lastmod>2024-10-19T11:21:53+00:00</lastmod> + <priority>0.80</priority> + </url> + <url> + <loc>https://stormcrawler.apache.org/support/</loc> + <lastmod>2024-10-19T11:21:53+00:00</lastmod> + <priority>0.80</priority> + </url> + +</urlset> diff --git a/core/src/test/resources/stormcrawler.sitemap.extensions.video.xml b/core/src/test/resources/stormcrawler.sitemap.extensions.video.xml new file mode 100644 index 00000000..8023bdda --- /dev/null +++ b/core/src/test/resources/stormcrawler.sitemap.extensions.video.xml @@ -0,0 +1,81 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- +Licensed to the Apache Software Foundation (ASF) under one +or more contributor license agreements. See the NOTICE file +distributed with this work for additional information +regarding copyright ownership. The ASF licenses this file +to you under the Apache License, Version 2.0 (the +"License"); you may not use this file except in compliance +with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, +software distributed under the License is distributed on an +"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +KIND, either express or implied. See the License for the +specific language governing permissions and limitations +under the License. +--> +<urlset + xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9 + http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd" + xmlns:video="http://www.google.com/schemas/sitemap-video/1.1"> +<!-- created with Free Online Sitemap Generator www.xml-sitemaps.com --> + + <url> + <loc>https://stormcrawler.apache.org/</loc> + <lastmod>2024-10-19T11:21:53+00:00</lastmod> + <changefreq>monthly</changefreq> + <priority>1.00</priority> + <video:video> + <video:thumbnail_loc>http://www.example.com/thumbs/123.jpg</video:thumbnail_loc> + <video:title>Grilling steaks for summer</video:title> + <video:description>Alkis shows you how to get perfectly done steaks every time</video:description> + <video:content_loc>http://www.example.com/video123.flv</video:content_loc> + <video:player_loc allow_embed="yes" autoplay="ap=1">http://www.example.com/videoplayer.swf?video=123</video:player_loc> + <video:duration>600</video:duration> + <video:expiration_date>2009-11-05T19:20:30+08:00</video:expiration_date> + <video:rating>4.2</video:rating> + <video:view_count>12345</video:view_count> + <video:publication_date>2007-11-05T19:20:30+08:00</video:publication_date> + <video:tag>sample_tag1</video:tag> + <video:tag>sample_tag2</video:tag> + <video:family_friendly>yes</video:family_friendly> + <video:restriction relationship="allow">IE GB US CA</video:restriction> + <video:gallery_loc title="Cooking Videos">http://cooking.example.com</video:gallery_loc> + <video:price currency="EUR">1.99</video:price> + <video:requires_subscription>yes</video:requires_subscription> + <video:uploader info="http://www.example.com/users/grillymcgrillerson">GrillyMcGrillerson</video:uploader> + <video:live>no</video:live> + </video:video> + </url> + <url> + <loc>https://stormcrawler.apache.org/index.html</loc> + <lastmod>2024-10-19T11:21:53+00:00</lastmod> + <priority>0.80</priority> + </url> + <url> + <loc>https://stormcrawler.apache.org/download/index.html</loc> + <lastmod>2024-10-19T11:21:53+00:00</lastmod> + <priority>0.80</priority> + </url> + <url> + <loc>https://stormcrawler.apache.org/getting-started/</loc> + <lastmod>2024-10-19T11:21:53+00:00</lastmod> + <priority>0.80</priority> + </url> + <url> + <loc>https://stormcrawler.apache.org/faq/</loc> + <lastmod>2024-10-19T11:21:53+00:00</lastmod> + <priority>0.80</priority> + </url> + <url> + <loc>https://stormcrawler.apache.org/support/</loc> + <lastmod>2024-10-19T11:21:53+00:00</lastmod> + <priority>0.80</priority> + </url> + +</urlset> diff --git a/core/src/test/resources/stormcrawler.sitemap.xml b/core/src/test/resources/stormcrawler.sitemap.xml new file mode 100644 index 00000000..7561b576 --- /dev/null +++ b/core/src/test/resources/stormcrawler.sitemap.xml @@ -0,0 +1,60 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- +Licensed to the Apache Software Foundation (ASF) under one +or more contributor license agreements. See the NOTICE file +distributed with this work for additional information +regarding copyright ownership. The ASF licenses this file +to you under the Apache License, Version 2.0 (the +"License"); you may not use this file except in compliance +with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, +software distributed under the License is distributed on an +"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +KIND, either express or implied. See the License for the +specific language governing permissions and limitations +under the License. +--> +<urlset + xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9 + http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd"> + <!-- created with Free Online Sitemap Generator www.xml-sitemaps.com --> + + + <url> + <loc>https://stormcrawler.apache.org/</loc> + <lastmod>2024-10-19T11:21:53+00:00</lastmod> + <priority>1.00</priority> + </url> + <url> + <loc>https://stormcrawler.apache.org/index.html</loc> + <lastmod>2024-10-19T11:21:53+00:00</lastmod> + <priority>0.80</priority> + </url> + <url> + <loc>https://stormcrawler.apache.org/download/index.html</loc> + <lastmod>2024-10-19T11:21:53+00:00</lastmod> + <priority>0.80</priority> + </url> + <url> + <loc>https://stormcrawler.apache.org/getting-started/</loc> + <lastmod>2024-10-19T11:21:53+00:00</lastmod> + <priority>0.80</priority> + </url> + <url> + <loc>https://stormcrawler.apache.org/faq/</loc> + <lastmod>2024-10-19T11:21:53+00:00</lastmod> + <priority>0.80</priority> + </url> + <url> + <loc>https://stormcrawler.apache.org/support/</loc> + <lastmod>2024-10-19T11:21:53+00:00</lastmod> + <priority>0.80</priority> + </url> + + +</urlset> \ No newline at end of file diff --git a/external/opensearch/src/main/java/org/apache/stormcrawler/opensearch/bolt/DeletionBolt.java b/external/opensearch/src/main/java/org/apache/stormcrawler/opensearch/bolt/DeletionBolt.java index ceb976c4..d90c4c69 100644 --- a/external/opensearch/src/main/java/org/apache/stormcrawler/opensearch/bolt/DeletionBolt.java +++ b/external/opensearch/src/main/java/org/apache/stormcrawler/opensearch/bolt/DeletionBolt.java @@ -196,7 +196,7 @@ public class DeletionBolt extends BaseRichBolt return new BulkItemResponseToFailedFlag(bir, failed); }) .collect( - // https://github.com/DigitalPebble/storm-crawler/issues/832 + // https://github.com/apache/incubator-stormcrawler/issues/832 Collectors.groupingBy( idWithFailedFlagTuple -> idWithFailedFlagTuple.id, Collectors.toUnmodifiableList())); diff --git a/external/opensearch/src/main/java/org/apache/stormcrawler/opensearch/bolt/IndexerBolt.java b/external/opensearch/src/main/java/org/apache/stormcrawler/opensearch/bolt/IndexerBolt.java index ee553106..183bf15e 100644 --- a/external/opensearch/src/main/java/org/apache/stormcrawler/opensearch/bolt/IndexerBolt.java +++ b/external/opensearch/src/main/java/org/apache/stormcrawler/opensearch/bolt/IndexerBolt.java @@ -306,7 +306,7 @@ public class IndexerBolt extends AbstractIndexerBolt return new BulkItemResponseToFailedFlag(bir, failed); }) .collect( - // https://github.com/DigitalPebble/storm-crawler/issues/832 + // https://github.com/apache/incubator-stormcrawler/issues/832 Collectors.groupingBy( idWithFailedFlagTuple -> idWithFailedFlagTuple.id, Collectors.toUnmodifiableList())); diff --git a/external/opensearch/src/main/java/org/apache/stormcrawler/opensearch/persistence/StatusUpdaterBolt.java b/external/opensearch/src/main/java/org/apache/stormcrawler/opensearch/persistence/StatusUpdaterBolt.java index 1f8ea55a..a7708db3 100644 --- a/external/opensearch/src/main/java/org/apache/stormcrawler/opensearch/persistence/StatusUpdaterBolt.java +++ b/external/opensearch/src/main/java/org/apache/stormcrawler/opensearch/persistence/StatusUpdaterBolt.java @@ -339,7 +339,7 @@ public class StatusUpdaterBolt extends AbstractStatusUpdaterBolt return new BulkItemResponseToFailedFlag(bir, failed); }) .collect( - // https://github.com/DigitalPebble/storm-crawler/issues/832 + // https://github.com/apache/incubator-stormcrawler/issues/832 Collectors.groupingBy( idWithFailedFlagTuple -> idWithFailedFlagTuple.id, Collectors.toUnmodifiableList())); diff --git a/external/opensearch/src/test/java/org/apache/stormcrawler/opensearch/bolt/IndexerBoltTest.java b/external/opensearch/src/test/java/org/apache/stormcrawler/opensearch/bolt/IndexerBoltTest.java index 60afe2f2..a53047da 100644 --- a/external/opensearch/src/test/java/org/apache/stormcrawler/opensearch/bolt/IndexerBoltTest.java +++ b/external/opensearch/src/test/java/org/apache/stormcrawler/opensearch/bolt/IndexerBoltTest.java @@ -114,7 +114,7 @@ class IndexerBoltTest extends AbstractOpenSearchTest { @Test @Timeout(value = 2, unit = TimeUnit.MINUTES) - // https://github.com/DigitalPebble/storm-crawler/issues/832 + // https://github.com/apache/incubator-stormcrawler/issues/832 void simultaneousCanonicals() throws ExecutionException, InterruptedException, TimeoutException { Metadata m1 = new Metadata(); diff --git a/external/opensearch/src/test/java/org/apache/stormcrawler/opensearch/bolt/StatusBoltTest.java b/external/opensearch/src/test/java/org/apache/stormcrawler/opensearch/bolt/StatusBoltTest.java index b6e41f62..6e738b0c 100644 --- a/external/opensearch/src/test/java/org/apache/stormcrawler/opensearch/bolt/StatusBoltTest.java +++ b/external/opensearch/src/test/java/org/apache/stormcrawler/opensearch/bolt/StatusBoltTest.java @@ -129,7 +129,7 @@ class StatusBoltTest extends AbstractOpenSearchTest { @Test @Timeout(value = 2, unit = TimeUnit.MINUTES) - // see https://github.com/DigitalPebble/storm-crawler/issues/885 + // see https://github.com/apache/incubator-stormcrawler/issues/885 void checkListKeyFromOpensearch() throws IOException, ExecutionException, InterruptedException, TimeoutException { String url = "https://www.url.net/something"; diff --git a/external/tika/src/test/java/org/apache/stormcrawler/tika/ParserBoltTest.java b/external/tika/src/test/java/org/apache/stormcrawler/tika/ParserBoltTest.java index c41c1403..f6196b87 100644 --- a/external/tika/src/test/java/org/apache/stormcrawler/tika/ParserBoltTest.java +++ b/external/tika/src/test/java/org/apache/stormcrawler/tika/ParserBoltTest.java @@ -74,7 +74,7 @@ class ParserBoltTest extends ParsingTester { /** * Checks that the mimetype whitelists are handled correctly * - * @see https://github.com/DigitalPebble/storm-crawler/issues/712 + * @see https://github.com/apache/incubator-stormcrawler/issues/712 */ void testMimeTypeWhileList() throws IOException { Map conf = new HashMap(); diff --git a/external/urlfrontier/src/main/java/org/apache/stormcrawler/urlfrontier/ManagedChannelUtil.java b/external/urlfrontier/src/main/java/org/apache/stormcrawler/urlfrontier/ManagedChannelUtil.java index 1a7c65c8..360b04a8 100644 --- a/external/urlfrontier/src/main/java/org/apache/stormcrawler/urlfrontier/ManagedChannelUtil.java +++ b/external/urlfrontier/src/main/java/org/apache/stormcrawler/urlfrontier/ManagedChannelUtil.java @@ -27,7 +27,7 @@ import org.slf4j.LoggerFactory; /* * At some point we have to write a mechanism to share the same ManagedChannel in the same runtime - * see: https://github.com/DigitalPebble/storm-crawler/pull/982#issuecomment-1175272094 + * see: https://github.com/apache/incubator-stormcrawler/pull/982#issuecomment-1175272094 */ final class ManagedChannelUtil { private ManagedChannelUtil() {} diff --git a/external/warc/src/main/java/org/apache/stormcrawler/warc/WARCRequestRecordFormat.java b/external/warc/src/main/java/org/apache/stormcrawler/warc/WARCRequestRecordFormat.java index 7e786dc4..d8c8ec66 100644 --- a/external/warc/src/main/java/org/apache/stormcrawler/warc/WARCRequestRecordFormat.java +++ b/external/warc/src/main/java/org/apache/stormcrawler/warc/WARCRequestRecordFormat.java @@ -74,7 +74,7 @@ public class WARCRequestRecordFormat extends WARCRecordFormat { /* * The request record ID is stored in the metadata so that a WARC * response record can later refer to it. Deactivated because of - * https://github.com/DigitalPebble/storm-crawler/issues/721 + * https://github.com/apache/incubator-stormcrawler/issues/721 */ // metadata.setValue("_request.warc_record_id_", mainID);
