This is an automated email from the ASF dual-hosted git repository.

rzo1 pushed a commit to branch docs
in repository https://gitbox.apache.org/repos/asf/stormcrawler.git
commit 59da0d90eff085d5c9edac850a765d89a84a6a7e
Author: Richard Zowalla <r...@apache.org>
AuthorDate: Mon Jun 16 21:05:12 2025 +0200

    WIP: Started to work on a first documentation of SC
---
 docs/pom.xml                             |  67 ++++++++++
 docs/src/main/asciidoc/architecture.adoc |  17 +++
 docs/src/main/asciidoc/index.adoc        |  31 +++++
 docs/src/main/asciidoc/overview.adoc     |  61 +++++++++
 docs/src/main/asciidoc/powered-by.adoc   |  43 ++++++
 docs/src/main/asciidoc/quick-start.adoc  | 217 +++++++++++++++++++++++++++++++
 pom.xml                                  |   3 +-
 7 files changed, 438 insertions(+), 1 deletion(-)

diff --git a/docs/pom.xml b/docs/pom.xml
new file mode 100644
index 00000000..c1a8b79e
--- /dev/null
+++ b/docs/pom.xml
@@ -0,0 +1,67 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements. See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership. The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License. You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing,
+software distributed under the License is distributed on an
+"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+KIND, either express or implied. See the License for the
+specific language governing permissions and limitations
+under the License.
+-->
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+    <parent>
+        <groupId>org.apache.stormcrawler</groupId>
+        <artifactId>stormcrawler</artifactId>
+        <version>3.3.1-SNAPSHOT</version>
+    </parent>
+
+    <artifactId>stormcrawler-docs</artifactId>
+
+    <build>
+        <plugins>
+            <plugin>
+                <groupId>org.asciidoctor</groupId>
+                <artifactId>asciidoctor-maven-plugin</artifactId>
+                <version>3.2.0</version>
+                <executions>
+                    <execution>
+                        <id>output-html</id>
+                        <phase>generate-resources</phase>
+                        <goals>
+                            <goal>process-asciidoc</goal>
+                        </goals>
+                        <configuration>
+                            <doctype>article</doctype>
+                            <attributes>
+                                <source-highlighter>coderay</source-highlighter>
+                                <toc />
+                                <linkcss>false</linkcss>
+                                <icons>font</icons>
+                            </attributes>
+                        </configuration>
+                    </execution>
+                </executions>
+                <configuration>
+                    <sourceDirectory>src/main/asciidoc</sourceDirectory>
+                    <headerFooter>true</headerFooter>
+                    <attributes>
+                        <imagesdir>src/main/asciidoc/images</imagesdir>
+                    </attributes>
+                </configuration>
+            </plugin>
+        </plugins>
+    </build>
+
+</project>
\ No newline at end of file
diff --git a/docs/src/main/asciidoc/architecture.adoc b/docs/src/main/asciidoc/architecture.adoc
new file mode 100644
index 00000000..cf9297d7
--- /dev/null
+++ b/docs/src/main/asciidoc/architecture.adoc
@@ -0,0 +1,17 @@
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+== Understanding StormCrawler's Architecture
+:imagesdir: images
+
diff --git a/docs/src/main/asciidoc/index.adoc b/docs/src/main/asciidoc/index.adoc
new file mode 100644
index 00000000..db791608
--- /dev/null
+++ b/docs/src/main/asciidoc/index.adoc
@@ -0,0 +1,31 @@
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+Apache StormCrawler 3.3.x - Documentation
+=========================================
+Apache Software Foundation
+:doctype: article
+:toc: left
+:toclevels: 3
+:toc-position: left
+:toc-title: Apache StormCrawler 3.3.x - Documentation
+:numbered:
+
+include::overview.adoc[]
+
+include::quick-start.adoc[]
+
+include::architecture.adoc[]
+
+include::powered-by.adoc[]
diff --git a/docs/src/main/asciidoc/overview.adoc b/docs/src/main/asciidoc/overview.adoc
new file mode 100644
index 00000000..97a20305
--- /dev/null
+++ b/docs/src/main/asciidoc/overview.adoc
@@ -0,0 +1,61 @@
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+== Overview
+:imagesdir: images
+
+Apache StormCrawler is an open source collection of resources for building low-latency, scalable web crawlers on link:http://storm.apache.org/[Apache Storm]. It is provided under the link:http://www.apache.org/licenses/LICENSE-2.0[Apache License] and is written mostly in Java.
+
+The aims of StormCrawler are to help build web crawlers that are:
+
+* Scalable
+* Low latency
+* Easy to extend
+* Polite yet efficient
+
+StormCrawler is both a library and a collection of reusable components designed to help developers build custom web crawlers with ease.
+Getting started is simple: the Maven archetypes allow you to quickly scaffold a new project, which you can then adapt to fit your specific needs.
+
+In addition to its core modules, StormCrawler offers a range of external resources that can be easily integrated into your project.
+These include spouts and bolts for OpenSearch, a ParserBolt that leverages Apache Tika to handle various document formats, and many more.
+
+StormCrawler is well suited for scenarios where URLs to fetch and parse arrive as continuous streams, but it also performs exceptionally well in large-scale, recursive crawls where low latency is essential.
+The project is actively maintained, widely adopted in production environments, and supported by an engaged community.
+
+You can find links to recent talks and demos later in this document, showcasing real-world applications and use cases.
+
+== Key Features
+
+Here is a short list of the features provided:
+
+* Integration with link:https://github.com/crawler-commons/url-frontier[URLFrontier] for distributed URL management
+* Pluggable components (Spouts and Bolts from Apache Storm) for flexibility and modularity; adding custom components is straightforward
+* Support for link:https://tika.apache.org/[Apache Tika] for document parsing via `ParserBolt`
+* Integration with OpenSearch and Apache Solr for indexing and status storage
+* Option to store crawled data as WARC (Web ARChive) files
+* Support for headless crawling using Playwright
+* Optional GenAI-based modules for advanced text extraction
+* Proxy support for distributed and controlled crawling
+* Flexible and pluggable filtering mechanisms:
+** URL Filters for pre-fetch filtering
+** Parse Filters for post-fetch content filtering
+* Built-in support for crawl metrics and monitoring
+* Configurable politeness policies (e.g., crawl delay, user agent management); see the sketch after this list
+* Robust HTTP fetcher based on Apache HttpComponents
+* MIME type detection and response-based filtering
+* Support for parsing and honoring `robots.txt` and sitemaps
+* Stream-based, real-time architecture using Apache Storm, suitable for both recursive and one-shot crawling tasks
+* Can run in both local and distributed environments
+* Maven archetypes for quickly bootstrapping new crawler projects
+* Actively developed and used in production by multiple organizations
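+
+To make the politeness bullet above concrete, here is a minimal, hypothetical excerpt from a crawler configuration file; the keys follow the usual `crawler-conf.yaml` conventions, but the values are purely illustrative:
+
+[source,yaml]
+----
+# identify the crawler to the sites it visits (illustrative values)
+http.agent.name: "mycrawler"
+http.agent.version: "1.0"
+http.agent.description: "a polite example crawler"
+http.agent.url: "https://example.org/crawler-info"
+http.agent.email: "ops@example.org"
+
+# seconds to wait between successive requests to the same host
+fetcher.server.delay: 2.0
+----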
diff --git a/docs/src/main/asciidoc/powered-by.adoc b/docs/src/main/asciidoc/powered-by.adoc
new file mode 100644
index 00000000..5cd7a707
--- /dev/null
+++ b/docs/src/main/asciidoc/powered-by.adoc
@@ -0,0 +1,43 @@
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+== Companies & Projects Using StormCrawler
+:imagesdir: images
+
+* link:http://www.careerbuilder.com/[CareerBuilder]
+* link:http://www.stolencamerafinder.com/[StolenCameraFinder]
+* link:http://www.weborama.com/[Weborama]
+* link:http://www.ontopic.io/[Ontopic]
+* link:http://www.shopstyle.com/[ShopStyle]
+* link:http://www.wombatsoftware.de/[Wombat Software]
+* link:http://commoncrawl.org/2016/10/news-dataset-available/[CommonCrawl]
+* link:https://webfinery.com/[WebFinery]
+* link:http://www.reportlinker.com/[ReportLinker]
+* link:http://www.tokenmill.lt/[TokenMill]
+* link:http://www.polecat.com/[Polecat]
+* link:http://www.wizenoze.com/en/[WizeNoze]
+* link:http://iproduct.io/[IProduct.io]
+* link:https://www.cgi.com/[CGI]
+* link:https://github.com/miras-tech/MirasText[MirasText]
+* link:https://www.g2webservices.com/[G2 Web Services]
+* link:https://www.gov.nt.ca/[Government of Northwest Territories]
+* link:https://digitalpebble.blogspot.com/2019/02/meet-stormcrawler-users-q-with-pixray.html[Pixray]
+* link:https://www.cameraforensics.com/[CameraForensics]
+* link:https://gagepiracy.com/[Gage Piracy]
+* link:https://www.clarin.eu/[Clarin ERIC]
+* link:https://openwebsearch.eu/owler/[OpenWebSearch]
+* link:https://shc-info.zml.hs-heilbronn.de/[Heilbronn University]
+* link:https://www.contexity.com[Contexity]
+* link:https://www.kodis.iao.fraunhofer.de/de/projekte/SPIDERWISE.html[Fraunhofer IAO - KODIS]
+
+Drop us a line at mailto:d...@stormcrawler.apache.org[d...@stormcrawler.apache.org] if you want to be added to this page.
diff --git a/docs/src/main/asciidoc/quick-start.adoc b/docs/src/main/asciidoc/quick-start.adoc
new file mode 100644
index 00000000..d353e2d4
--- /dev/null
+++ b/docs/src/main/asciidoc/quick-start.adoc
@@ -0,0 +1,217 @@
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+== Quick Start
+
+These instructions should help you get Apache StormCrawler up and running in 5 to 15 minutes.
+
+=== Prerequisites
+
+To run StormCrawler, you will need Java SE 17 or later.
+
+Additionally, since we'll be running the required Apache Storm cluster using Docker Compose,
+make sure Docker is installed on your operating system.
+
+=== Terminology
+
+Before diving in, here is a quick overview of the **central** Storm concepts and terminology you need to know when working with StormCrawler:
+
+- *Topology*: A topology is the overall data processing graph in Storm, consisting of spouts and bolts connected together to perform continuous, real-time computations.
+
+- *Spout*: A spout is a source component in a Storm topology that emits streams of data into the processing pipeline.
+
+- *Bolt*: A bolt processes, transforms, or routes data streams emitted by spouts or other bolts within the topology.
+
+- *Flux*: In Apache Storm, Flux is a declarative configuration framework that enables you to define and run Storm topologies using YAML files instead of writing Java code. This simplifies topology management and deployment. A minimal sketch follows this list.
+
+- *Frontier*: In the context of a web crawler, the Frontier is the component responsible for managing and prioritizing the list of URLs to be fetched next.
+
+- *Seed*: In web crawling, a Seed is an initial URL or set of URLs from which the crawler starts its discovery and fetching process.
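+
+To illustrate the Flux concept from the list above, here is a minimal, hypothetical Flux file that wires a single spout to a single bolt. The class names are placeholders for whichever components your topology actually uses; the `crawler.flux` generated by the archetype is more complete:
+
+[source,yaml]
+----
+name: "minimal-topology"
+
+config:
+  topology.workers: 1
+
+spouts:
+  - id: "spout"
+    className: "org.apache.stormcrawler.spout.MemorySpout"  # emits URLs kept in memory
+    parallelism: 1
+
+bolts:
+  - id: "fetcher"
+    className: "org.apache.stormcrawler.bolt.FetcherBolt"   # fetches the emitted URLs
+    parallelism: 1
+
+streams:
+  - from: "spout"
+    to: "fetcher"
+    grouping:
+      type: SHUFFLE   # simplified; real crawls typically partition URLs by host
+----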
+
+=== Bootstrapping a StormCrawler Project
+
+You can quickly generate a new StormCrawler project using the Maven archetype:
+
+[source,shell]
+----
+mvn archetype:generate -DarchetypeGroupId=org.apache.stormcrawler \
+    -DarchetypeArtifactId=stormcrawler-archetype \
+    -DarchetypeVersion=3.3.0
+----
+
+During the process, you'll be prompted to provide the following:
+
+* `groupId` (e.g. `com.mycompany.crawler`)
+* `artifactId` (e.g. `stormcrawler`)
+* Version
+* Package name
+* User agent details
+
+IMPORTANT: Specifying a user agent is important for crawler ethics because it identifies your crawler to websites, promoting transparency and allowing site owners to manage or block requests if needed. Be sure to provide a crawler information website as well.
+
+The archetype will generate a fully structured project including:
+
+* A pre-configured `pom.xml` with the necessary dependencies
+* Default resource files
+* A sample `crawler.flux` configuration
+* A basic configuration file
+
+After generation, navigate into the newly created directory (named after the `artifactId` you specified).
+
+TIP: You can learn more about the architecture and how each component works together in link:architecture.adoc[the architecture documentation].
+By exploring that part of the documentation, you can gain a better understanding of how StormCrawler performs crawling and how bolts, spouts, as well as parse and URL filters, collaborate in the process.
+
+==== Docker Compose Setup
+
+Below is a simple `docker-compose.yaml` configuration to spin up URLFrontier, Zookeeper, Storm Nimbus, a Storm Supervisor, and the Storm UI:
+
+[source,yaml]
+----
+services:
+  zookeeper:
+    image: zookeeper:3.9.3
+    container_name: zookeeper
+    restart: always
+
+  nimbus:
+    image: storm:latest
+    container_name: nimbus
+    hostname: nimbus
+    command: storm nimbus
+    depends_on:
+      - zookeeper
+    restart: always
+
+  supervisor:
+    image: storm:latest
+    container_name: supervisor
+    command: storm supervisor -c worker.childopts=-Xmx%HEAP-MEM%m
+    depends_on:
+      - nimbus
+      - zookeeper
+    restart: always
+
+  ui:
+    image: storm:latest
+    container_name: ui
+    command: storm ui
+    depends_on:
+      - nimbus
+    restart: always
+    ports:
+      - "127.0.0.1:8080:8080"
+
+  urlfrontier:
+    image: crawlercommons/url-frontier:latest
+    container_name: urlfrontier
+    restart: always
+    ports:
+      - "127.0.0.1:7071:7071"
+----
+
+Notes:
+
+- This example Docker Compose file uses the official Storm and Zookeeper images.
+- URLFrontier is an additional service used by StormCrawler to act as the Frontier. Please note that we also offer other Frontier implementations, such as OpenSearch or Apache Solr.
+- Ports may need adjustment depending on your environment.
+- The Storm UI runs on port 8080 by default.
+- Ensure network connectivity between services; Docker Compose handles this by default.
+
+After setting up your Docker Compose file, start it up:
+
+[source,shell]
+----
+docker compose up -d
+----
+
+Check the logs to see if every service is up and running:
+
+[source,shell]
+----
+docker compose logs -f
+----
+
+Next, access the Storm UI via `http://localhost:8080` and check that both a Storm Nimbus and a Storm Supervisor are available.
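+
+If you prefer the command line, you can also verify that Nimbus is reachable by asking it for the list of running topologies; at this stage the list should simply be empty. This assumes the service names from the Compose file above:
+
+[source,shell]
+----
+# run the Storm CLI inside the Nimbus container
+docker compose exec nimbus storm list
+----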
+
+==== Compile
+
+Build the generated project by running
+
+[source,shell]
+----
+mvn package
+----
+
+This will create an uberjar named `${artifactId}-${version}.jar` (matching the artifactId and version specified during archetype generation) in your `target` directory.
+
+==== Inject Your First Seeds
+
+Now you are ready to insert your first seeds into URLFrontier. To do so, create a file `seeds.txt` containing your seeds:
+
+[source,text]
+----
+https://stormcrawler.apache.org
+----
+
+After you have saved it, we need to inject the seeds into URLFrontier. This can be done by running URLFrontier's client:
+
+[source,shell]
+----
+java -cp target/${artifactId}-${version}.jar crawlercommons.urlfrontier.client.Client PutURLs -f seeds.txt
+----
+
+where _seeds.txt_ is the previously created file containing the URLs to inject, with one URL per line.
+
+==== Run Your First Crawl
+
+Now it is time to run our first crawl. To do so, we need to start our crawler topology in distributed mode and deploy it on our Storm cluster.
+
+[source,shell]
+----
+docker run --network ${NETWORK} -it \
+--rm \
+-v "$(pwd)/crawler-conf.yaml:/apache-storm/crawler-conf.yaml" \
+-v "$(pwd)/crawler.flux:/apache-storm/crawler.flux" \
+-v "$(pwd)/target/${artifactId}-${version}.jar:/apache-storm/${artifactId}-${version}.jar" \
+storm:latest \
+storm jar ${artifactId}-${version}.jar org.apache.storm.flux.Flux --remote crawler.flux
+----
+
+where `${NETWORK}` is the name of the Docker network created by the previously started Docker Compose setup. You can find this name by running
+
+[source,shell]
+----
+docker network ls
+----
+
+After running the `storm jar` command, you should carefully monitor the logs via
+
+[source,shell]
+----
+docker compose logs -f
+----
+
+as well as the Storm UI. It should now list a running topology.
+
+In the default archetype, the fetched content is printed to standard output.
+
+NOTE: In a Storm topology defined with Flux, parallelism specifies the number of tasks or instances of a spout or bolt to run concurrently, enabling scalable and efficient processing of data streams. In the archetype, every component is set to a parallelism of **1**.
+
+Congratulations! You have learned how to start your first simple crawl using StormCrawler.
+
+Feel free to explore the rest of our documentation to build more complex crawler topologies.
+
+=== Summary
+
+This document has shown how simple it is to get Apache StormCrawler up and running and to launch a first crawl.
diff --git a/pom.xml b/pom.xml
index 36a79bd4..335f94df 100644
--- a/pom.xml
+++ b/pom.xml
@@ -653,6 +653,7 @@ under the License.
         <module>archetype</module>
         <module>external/opensearch/archetype</module>
         <module>external/solr/archetype</module>
-    </modules>
+        <module>docs</module>
+    </modules>
 </project>