This is an automated email from the ASF dual-hosted git repository.

rzo1 pushed a commit to branch fix_issues
in repository https://gitbox.apache.org/repos/asf/incubator-stormcrawler.git
commit b49b86e0156f68b1b762cdc27dc3f42df27eea7d
Author: Richard Zowalla <[email protected]>
AuthorDate: Fri Nov 29 06:23:31 2024 +0100

    Remove references towards DigitalPebble
---
 .../org/apache/stormcrawler/filtering/regex/FastURLFilter.java    | 2 +-
 .../org/apache/stormcrawler/persistence/DefaultScheduler.java     | 8 ++------
 core/src/test/resources/stormcrawler.sitemap.extensions.all.xml   | 2 +-
 core/src/test/resources/stormcrawler.sitemap.extensions.image.xml | 2 +-
 .../src/main/resources/archetype-resources/opensearch-conf.yaml   | 2 +-
 external/opensearch/opensearch-conf.yaml                          | 2 +-
 external/sql/README.md                                            | 2 --
 .../test/java/org/apache/stormcrawler/tika/ParserBoltTest.java    | 4 ++--
 8 files changed, 9 insertions(+), 15 deletions(-)

diff --git a/core/src/main/java/org/apache/stormcrawler/filtering/regex/FastURLFilter.java b/core/src/main/java/org/apache/stormcrawler/filtering/regex/FastURLFilter.java
index 50f528f2..230796ac 100644
--- a/core/src/main/java/org/apache/stormcrawler/filtering/regex/FastURLFilter.java
+++ b/core/src/main/java/org/apache/stormcrawler/filtering/regex/FastURLFilter.java
@@ -55,7 +55,7 @@ import org.slf4j.LoggerFactory;
  *    "patterns" : [ "DenyPathQuery \\.jpg" ]
  *  }, {
  *    "scope" : "domain:stormcrawler.net",
- *    "patterns" : [ "AllowPath /digitalpebble/", "DenyPath .+" ]
+ *    "patterns" : [ "AllowPath /stormcrawler/", "DenyPath .+" ]
  *  }, {
  *    "scope" : "metadata:key=value",
  *    "patterns" : [ "DenyPath .+" ]
diff --git a/core/src/main/java/org/apache/stormcrawler/persistence/DefaultScheduler.java b/core/src/main/java/org/apache/stormcrawler/persistence/DefaultScheduler.java
index 86ee0295..8ec6d926 100644
--- a/core/src/main/java/org/apache/stormcrawler/persistence/DefaultScheduler.java
+++ b/core/src/main/java/org/apache/stormcrawler/persistence/DefaultScheduler.java
@@ -96,12 +96,8 @@ public class DefaultScheduler extends Scheduler {
         customIntervals = intervals.values().toArray(new CustomInterval[0]);
     }
 
-    /*
-     * (non-Javadoc)
-     *
-     * @see org.apache.stormcrawler.persistence.Scheduler#schedule(com.
-     * digitalpebble. stormcrawler.persistence .Status,
-     * org.apache.stormcrawler.Metadata)
+    /**
+     * @see org.apache.stormcrawler.persistence.Scheduler#schedule(Status, Metadata)
      */
     @Override
     public Optional<Date> schedule(Status status, Metadata metadata) {
diff --git a/core/src/test/resources/stormcrawler.sitemap.extensions.all.xml b/core/src/test/resources/stormcrawler.sitemap.extensions.all.xml
index 6958b115..595cfa43 100644
--- a/core/src/test/resources/stormcrawler.sitemap.extensions.all.xml
+++ b/core/src/test/resources/stormcrawler.sitemap.extensions.all.xml
@@ -30,7 +30,7 @@ under the License.
 
 <!-- created with Free Online Sitemap Generator www.xml-sitemaps.com -->
 <url>
-  <loc>http://digitalpebble.com/</loc>
+  <loc>http://www.example.com/</loc>
   <lastmod>2012-12-05T10:59:04+00:00</lastmod>
   <changefreq>monthly</changefreq>
   <priority>1.00</priority>
diff --git a/core/src/test/resources/stormcrawler.sitemap.extensions.image.xml b/core/src/test/resources/stormcrawler.sitemap.extensions.image.xml
index 99ecb553..177f45e5 100644
--- a/core/src/test/resources/stormcrawler.sitemap.extensions.image.xml
+++ b/core/src/test/resources/stormcrawler.sitemap.extensions.image.xml
@@ -26,7 +26,7 @@ under the License.
 
 <!-- created with Free Online Sitemap Generator www.xml-sitemaps.com -->
 <url>
-  <loc>http://digitalpebble.com/</loc>
+  <loc>http://www.example.com/</loc>
   <lastmod>2012-12-05T10:59:04+00:00</lastmod>
   <changefreq>monthly</changefreq>
   <priority>1.00</priority>
diff --git a/external/opensearch/archetype/src/main/resources/archetype-resources/opensearch-conf.yaml b/external/opensearch/archetype/src/main/resources/archetype-resources/opensearch-conf.yaml
index f8b291fe..b346eb2f 100644
--- a/external/opensearch/archetype/src/main/resources/archetype-resources/opensearch-conf.yaml
+++ b/external/opensearch/archetype/src/main/resources/archetype-resources/opensearch-conf.yaml
@@ -67,7 +67,7 @@ config:
   # positive or negative filters parsable by the Lucene Query Parser
   # opensearch.status.filterQuery:
   #  - "-(key:stormcrawler.net)"
-  #  - "-(key:digitalpebble.com)"
+  #  - "-(key:stormcrawler.apache.org)"
 
   # time in secs for which the URLs will be considered for fetching after a ack of fail
   spout.ttl.purgatory: 30
diff --git a/external/opensearch/opensearch-conf.yaml b/external/opensearch/opensearch-conf.yaml
index f8b291fe..a0a651aa 100644
--- a/external/opensearch/opensearch-conf.yaml
+++ b/external/opensearch/opensearch-conf.yaml
@@ -67,7 +67,7 @@ config:
   # positive or negative filters parsable by the Lucene Query Parser
   # opensearch.status.filterQuery:
   #  - "-(key:stormcrawler.net)"
-  #  - "-(key:digitalpebble.com)"
+  #  - "-(key:apache.stormcrawler.org)"
 
   # time in secs for which the URLs will be considered for fetching after a ack of fail
   spout.ttl.purgatory: 30
diff --git a/external/sql/README.md b/external/sql/README.md
index 25554a4d..880e477a 100644
--- a/external/sql/README.md
+++ b/external/sql/README.md
@@ -4,8 +4,6 @@
 
 The [tableCreation.script](https://github.com/apache/incubator-stormcrawler/blob/main/external/sql/tableCreation.script) is based on MySQL and is used for the creation of the tables.
 
-This [tutorial](https://digitalpebble.blogspot.co.uk/2015/09/index-web-with-aws-cloudsearch.html) uses this module.
-
 Check that you have specified a configuration file such as [sql-conf.yaml](https://github.com/apache/incubator-stormcrawler/blob/master/external/sql/sql-conf.yaml) and have a Java driver in the dependencies of your POM
 
 ```
diff --git a/external/tika/src/test/java/org/apache/stormcrawler/tika/ParserBoltTest.java b/external/tika/src/test/java/org/apache/stormcrawler/tika/ParserBoltTest.java
index f6196b87..6f98ad12 100644
--- a/external/tika/src/test/java/org/apache/stormcrawler/tika/ParserBoltTest.java
+++ b/external/tika/src/test/java/org/apache/stormcrawler/tika/ParserBoltTest.java
@@ -55,7 +55,7 @@ class ParserBoltTest extends ParsingTester {
         conf.put("parser.extract.embedded", true);
         bolt.prepare(conf, TestUtil.getMockedTopologyContext(), new OutputCollector(output));
         parse(
-                "http://www.digitalpebble.com/test_recursive_embedded.docx",
+                "http://stormcrawler.apache.org/test_recursive_embedded.docx",
                 "test_recursive_embedded.docx");
         List<List<Object>> outTuples = output.getEmitted();
         // TODO could we get as many subdocs as embedded in the original one?
@@ -99,7 +99,7 @@ class ParserBoltTest extends ParsingTester {
                 "http." + HttpHeaders.CONTENT_TYPE,
                 "application/vnd.openxmlformats-officedocument.wordprocessingml.document");
         parse(
-                "http://www.digitalpebble.com/test_recursive_embedded.docx",
+                "http://stormcrawler.apache.org/test_recursive_embedded.docx",
                 "test_recursive_embedded.docx",
                 metadata);
         outTuples = output.getEmitted();
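Note for anyone mirroring this rename in their own crawl setup: the javadoc touched in FastURLFilter.java documents the JSON rules format read by that filter. A minimal sketch of the affected entries after this change is shown below; only the two scopes visible in the hunk are reproduced, and the enclosing top-level "rules" array is an assumption based on that javadoc rather than a copy of a shipped configuration file.

{
  "rules" : [ {
    "scope" : "domain:stormcrawler.net",
    "patterns" : [ "AllowPath /stormcrawler/", "DenyPath .+" ]
  }, {
    "scope" : "metadata:key=value",
    "patterns" : [ "DenyPath .+" ]
  } ]
}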
