This is an automated email from the ASF dual-hosted git repository.

rzo1 pushed a commit to branch fix_issues
in repository https://gitbox.apache.org/repos/asf/incubator-stormcrawler.git
commit b49b86e0156f68b1b762cdc27dc3f42df27eea7d
Author: Richard Zowalla <[email protected]>
AuthorDate: Fri Nov 29 06:23:31 2024 +0100

    Remove references towards DigitalPebble
---
 .../org/apache/stormcrawler/filtering/regex/FastURLFilter.java    | 2 +-
 .../org/apache/stormcrawler/persistence/DefaultScheduler.java     | 8 ++------
 core/src/test/resources/stormcrawler.sitemap.extensions.all.xml   | 2 +-
 core/src/test/resources/stormcrawler.sitemap.extensions.image.xml | 2 +-
 .../src/main/resources/archetype-resources/opensearch-conf.yaml   | 2 +-
 external/opensearch/opensearch-conf.yaml                          | 2 +-
 external/sql/README.md                                            | 2 --
 .../test/java/org/apache/stormcrawler/tika/ParserBoltTest.java    | 4 ++--
 8 files changed, 9 insertions(+), 15 deletions(-)

diff --git a/core/src/main/java/org/apache/stormcrawler/filtering/regex/FastURLFilter.java b/core/src/main/java/org/apache/stormcrawler/filtering/regex/FastURLFilter.java
index 50f528f2..230796ac 100644
--- a/core/src/main/java/org/apache/stormcrawler/filtering/regex/FastURLFilter.java
+++ b/core/src/main/java/org/apache/stormcrawler/filtering/regex/FastURLFilter.java
@@ -55,7 +55,7 @@ import org.slf4j.LoggerFactory;
  *    "patterns" : [ "DenyPathQuery \\.jpg" ]
  *  }, {
  *    "scope" : "domain:stormcrawler.net",
- *    "patterns" : [ "AllowPath /digitalpebble/", "DenyPath .+" ]
+ *    "patterns" : [ "AllowPath /stormcrawler/", "DenyPath .+" ]
  *  }, {
  *    "scope" : "metadata:key=value",
  *    "patterns" : [ "DenyPath .+" ]
diff --git a/core/src/main/java/org/apache/stormcrawler/persistence/DefaultScheduler.java b/core/src/main/java/org/apache/stormcrawler/persistence/DefaultScheduler.java
index 86ee0295..8ec6d926 100644
--- a/core/src/main/java/org/apache/stormcrawler/persistence/DefaultScheduler.java
+++ b/core/src/main/java/org/apache/stormcrawler/persistence/DefaultScheduler.java
@@ -96,12 +96,8 @@ public class DefaultScheduler extends Scheduler {
         customIntervals = intervals.values().toArray(new CustomInterval[0]);
     }
 
-    /*
-     * (non-Javadoc)
-     *
-     * @see org.apache.stormcrawler.persistence.Scheduler#schedule(com.
-     * digitalpebble. stormcrawler.persistence .Status,
-     * org.apache.stormcrawler.Metadata)
+    /**
+     * @see org.apache.stormcrawler.persistence.Scheduler#schedule(Status, Metadata)
      */
     @Override
     public Optional<Date> schedule(Status status, Metadata metadata) {
diff --git a/core/src/test/resources/stormcrawler.sitemap.extensions.all.xml b/core/src/test/resources/stormcrawler.sitemap.extensions.all.xml
index 6958b115..595cfa43 100644
--- a/core/src/test/resources/stormcrawler.sitemap.extensions.all.xml
+++ b/core/src/test/resources/stormcrawler.sitemap.extensions.all.xml
@@ -30,7 +30,7 @@ under the License.
 
 <!-- created with Free Online Sitemap Generator www.xml-sitemaps.com -->
 <url>
-  <loc>http://digitalpebble.com/</loc>
+  <loc>http://www.example.com/</loc>
   <lastmod>2012-12-05T10:59:04+00:00</lastmod>
   <changefreq>monthly</changefreq>
   <priority>1.00</priority>
diff --git a/core/src/test/resources/stormcrawler.sitemap.extensions.image.xml b/core/src/test/resources/stormcrawler.sitemap.extensions.image.xml
index 99ecb553..177f45e5 100644
--- a/core/src/test/resources/stormcrawler.sitemap.extensions.image.xml
+++ b/core/src/test/resources/stormcrawler.sitemap.extensions.image.xml
@@ -26,7 +26,7 @@ under the License.
 
 <!-- created with Free Online Sitemap Generator www.xml-sitemaps.com -->
 <url>
-  <loc>http://digitalpebble.com/</loc>
+  <loc>http://www.example.com/</loc>
   <lastmod>2012-12-05T10:59:04+00:00</lastmod>
   <changefreq>monthly</changefreq>
   <priority>1.00</priority>
diff --git a/external/opensearch/archetype/src/main/resources/archetype-resources/opensearch-conf.yaml b/external/opensearch/archetype/src/main/resources/archetype-resources/opensearch-conf.yaml
index f8b291fe..b346eb2f 100644
--- a/external/opensearch/archetype/src/main/resources/archetype-resources/opensearch-conf.yaml
+++ b/external/opensearch/archetype/src/main/resources/archetype-resources/opensearch-conf.yaml
@@ -67,7 +67,7 @@ config:
   # positive or negative filters parsable by the Lucene Query Parser
   # opensearch.status.filterQuery:
   #  - "-(key:stormcrawler.net)"
-  #  - "-(key:digitalpebble.com)"
+  #  - "-(key:stormcrawler.apache.org)"
 
   # time in secs for which the URLs will be considered for fetching after a ack of fail
   spout.ttl.purgatory: 30
diff --git a/external/opensearch/opensearch-conf.yaml b/external/opensearch/opensearch-conf.yaml
index f8b291fe..a0a651aa 100644
--- a/external/opensearch/opensearch-conf.yaml
+++ b/external/opensearch/opensearch-conf.yaml
@@ -67,7 +67,7 @@ config:
   # positive or negative filters parsable by the Lucene Query Parser
   # opensearch.status.filterQuery:
   #  - "-(key:stormcrawler.net)"
-  #  - "-(key:digitalpebble.com)"
+  #  - "-(key:apache.stormcrawler.org)"
 
   # time in secs for which the URLs will be considered for fetching after a ack of fail
   spout.ttl.purgatory: 30
diff --git a/external/sql/README.md b/external/sql/README.md
index 25554a4d..880e477a 100644
--- a/external/sql/README.md
+++ b/external/sql/README.md
@@ -4,8 +4,6 @@
 
 The [tableCreation.script](https://github.com/apache/incubator-stormcrawler/blob/main/external/sql/tableCreation.script) is based on MySQL and is used for the creation of the tables.
 
-This [tutorial](https://digitalpebble.blogspot.co.uk/2015/09/index-web-with-aws-cloudsearch.html) uses this module.
-
 Check that you have specified a configuration file such as [sql-conf.yaml](https://github.com/apache/incubator-stormcrawler/blob/master/external/sql/sql-conf.yaml) and have a Java driver in the dependencies of your POM
 
 ```
diff --git a/external/tika/src/test/java/org/apache/stormcrawler/tika/ParserBoltTest.java b/external/tika/src/test/java/org/apache/stormcrawler/tika/ParserBoltTest.java
index f6196b87..6f98ad12 100644
--- a/external/tika/src/test/java/org/apache/stormcrawler/tika/ParserBoltTest.java
+++ b/external/tika/src/test/java/org/apache/stormcrawler/tika/ParserBoltTest.java
@@ -55,7 +55,7 @@ class ParserBoltTest extends ParsingTester {
         conf.put("parser.extract.embedded", true);
         bolt.prepare(conf, TestUtil.getMockedTopologyContext(), new OutputCollector(output));
         parse(
-                "http://www.digitalpebble.com/test_recursive_embedded.docx",
+                "http://stormcrawler.apache.org/test_recursive_embedded.docx",
                 "test_recursive_embedded.docx");
         List<List<Object>> outTuples = output.getEmitted();
         // TODO could we get as many subdocs as embedded in the original one?
@@ -99,7 +99,7 @@ class ParserBoltTest extends ParsingTester {
                 "http." + HttpHeaders.CONTENT_TYPE,
                 "application/vnd.openxmlformats-officedocument.wordprocessingml.document");
         parse(
-                "http://www.digitalpebble.com/test_recursive_embedded.docx",
+                "http://stormcrawler.apache.org/test_recursive_embedded.docx",
                 "test_recursive_embedded.docx",
                 metadata);
         outTuples = output.getEmitted();
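Note for anyone mirroring this rename in their own crawl setup: the javadoc touched in FastURLFilter.java documents the JSON rules format read by that filter. A minimal sketch of the affected entries after this change is shown below; only the two scopes visible in the hunk are reproduced, and the enclosing top-level "rules" array is an assumption based on that javadoc rather than a copy of a shipped configuration file.

{
  "rules" : [ {
    "scope" : "domain:stormcrawler.net",
    "patterns" : [ "AllowPath /stormcrawler/", "DenyPath .+" ]
  }, {
    "scope" : "metadata:key=value",
    "patterns" : [ "DenyPath .+" ]
  } ]
}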
