This is an automated email from the ASF dual-hosted git repository.

rzo1 pushed a commit to branch docs
in repository https://gitbox.apache.org/repos/asf/stormcrawler.git
commit 59da0d90eff085d5c9edac850a765d89a84a6a7e
Author: Richard Zowalla <r...@apache.org>
AuthorDate: Mon Jun 16 21:05:12 2025 +0200

    WIP: Started to work on a first documentation of SC
---
 docs/pom.xml                             |  67 ++++++++++
 docs/src/main/asciidoc/architecture.adoc |  17 +++
 docs/src/main/asciidoc/index.adoc        |  31 +++++
 docs/src/main/asciidoc/overview.adoc     |  61 +++++++++
 docs/src/main/asciidoc/powered-by.adoc   |  43 ++++++
 docs/src/main/asciidoc/quick-start.adoc  | 217 +++++++++++++++++++++++++++++++
 pom.xml                                  |   3 +-
 7 files changed, 438 insertions(+), 1 deletion(-)

diff --git a/docs/pom.xml b/docs/pom.xml
new file mode 100644
index 00000000..c1a8b79e
--- /dev/null
+++ b/docs/pom.xml
@@ -0,0 +1,67 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements. See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership. The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License. You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing,
+software distributed under the License is distributed on an
+"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+KIND, either express or implied. See the License for the
+specific language governing permissions and limitations
+under the License.
+-->
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+    <parent>
+        <groupId>org.apache.stormcrawler</groupId>
+        <artifactId>stormcrawler</artifactId>
+        <version>3.3.1-SNAPSHOT</version>
+    </parent>
+
+    <artifactId>stormcrawler-docs</artifactId>
+
+    <build>
+        <plugins>
+            <plugin>
+                <groupId>org.asciidoctor</groupId>
+                <artifactId>asciidoctor-maven-plugin</artifactId>
+                <version>3.2.0</version>
+                <executions>
+                    <execution>
+                        <id>output-html</id>
+                        <phase>generate-resources</phase>
+                        <goals>
+                            <goal>process-asciidoc</goal>
+                        </goals>
+                        <configuration>
+                            <doctype>article</doctype>
+                            <attributes>
+                                <source-highlighter>coderay</source-highlighter>
+                                <toc />
+                                <linkcss>false</linkcss>
+                                <icons>font</icons>
+                            </attributes>
+                        </configuration>
+                    </execution>
+                </executions>
+                <configuration>
+                    <sourceDirectory>src/main/asciidoc</sourceDirectory>
+                    <headerFooter>true</headerFooter>
+                    <attributes>
+                        <imagesdir>src/main/asciidoc/images</imagesdir>
+                    </attributes>
+                </configuration>
+            </plugin>
+        </plugins>
+    </build>
+
+</project>
\ No newline at end of file
diff --git a/docs/src/main/asciidoc/architecture.adoc b/docs/src/main/asciidoc/architecture.adoc
new file mode 100644
index 00000000..cf9297d7
--- /dev/null
+++ b/docs/src/main/asciidoc/architecture.adoc
@@ -0,0 +1,17 @@
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+== Understanding StormCrawler's Architecture
+:imagesdir: images
+
diff --git a/docs/src/main/asciidoc/index.adoc b/docs/src/main/asciidoc/index.adoc
new file mode 100644
index 00000000..db791608
--- /dev/null
+++ b/docs/src/main/asciidoc/index.adoc
@@ -0,0 +1,31 @@
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+Apache StormCrawler 3.3.x - Documentation
+=========================================
+Apache Software Foundation
+:doctype: article
+:toc: left
+:toclevels: 3
+:toc-position: left
+:toc-title: Apache StormCrawler 3.3.x - Documentation
+:numbered:
+
+include::overview.adoc[]
+
+include::quick-start.adoc[]
+
+include::architecture.adoc[]
+
+include::powered-by.adoc[]
diff --git a/docs/src/main/asciidoc/overview.adoc b/docs/src/main/asciidoc/overview.adoc
new file mode 100644
index 00000000..97a20305
--- /dev/null
+++ b/docs/src/main/asciidoc/overview.adoc
@@ -0,0 +1,61 @@
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+== Overview
+:imagesdir: images
+
+Apache StormCrawler is an open source collection of resources for building low-latency, scalable web crawlers on link:http://storm.apache.org/[Apache Storm]. It is provided under the link:http://www.apache.org/licenses/LICENSE-2.0[Apache License] and is written mostly in Java.
+
+The aims of StormCrawler are to help build web crawlers that are:
+
+* Scalable
+* Low latency
+* Easy to extend
+* Polite yet efficient
+
+StormCrawler is both a library and a collection of reusable components designed to help developers build custom web crawlers with ease.
+Getting started is simple: the Maven archetypes allow you to quickly scaffold a new project, which you can then adapt to fit your specific needs.
+
+In addition to its core modules, StormCrawler offers a range of external resources that can be easily integrated into your project.
+These include spouts and bolts for OpenSearch, a ParserBolt that leverages Apache Tika to handle various document formats, and many more.
+
+StormCrawler is well suited for scenarios where URLs to fetch and parse arrive as continuous streams, but it also performs exceptionally well in large-scale, recursive crawls where low latency is essential.
+The project is actively maintained, widely adopted in production environments, and supported by an engaged community.
+
+You can find links to recent talks and demos later in this document, showcasing real-world applications and use cases.
+
+== Key Features
+
+Here is a short list of the features provided:
+
+* Integration with link:https://github.com/crawler-commons/url-frontier[URLFrontier] for distributed URL management
+* Pluggable components (Spouts and Bolts from Apache Storm) for flexibility and modularity; adding custom components is straightforward
+* Support for link:https://tika.apache.org/[Apache Tika] for document parsing via `ParserBolt`
+* Integration with OpenSearch and Apache Solr for indexing and status storage
+* Option to store crawled data as WARC (Web ARChive) files
+* Support for headless crawling using Playwright
+* Optional GenAI-based modules for advanced text extraction
+* Proxy support for distributed and controlled crawling
+* Flexible and pluggable filtering mechanisms:
+** URL Filters for pre-fetch filtering
+** Parse Filters for post-fetch content filtering
+* Built-in support for crawl metrics and monitoring
+* Configurable politeness policies (e.g., crawl delay, user agent management); see the sketch after this list
+* Robust HTTP fetcher based on Apache HttpComponents
+* MIME type detection and response-based filtering
+* Support for parsing and honoring `robots.txt` and sitemaps
+* Stream-based, real-time architecture using Apache Storm, suitable for both recursive and one-shot crawling tasks
+* Can run in both local and distributed environments
+* Maven archetypes for quickly bootstrapping new crawler projects
+* Actively developed and used in production by multiple organizations
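+
+To make the politeness bullet above concrete, here is a minimal, hypothetical excerpt from a crawler configuration file; the keys follow the usual `crawler-conf.yaml` conventions, but the values are purely illustrative:
+
+[source,yaml]
+----
+# identify the crawler to the sites it visits (illustrative values)
+http.agent.name: "mycrawler"
+http.agent.version: "1.0"
+http.agent.description: "a polite example crawler"
+http.agent.url: "https://example.org/crawler-info"
+http.agent.email: "ops@example.org"
+
+# seconds to wait between successive requests to the same host
+fetcher.server.delay: 2.0
+----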
diff --git a/docs/src/main/asciidoc/powered-by.adoc b/docs/src/main/asciidoc/powered-by.adoc
new file mode 100644
index 00000000..5cd7a707
--- /dev/null
+++ b/docs/src/main/asciidoc/powered-by.adoc
@@ -0,0 +1,43 @@
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+== Companies & Projects Using StormCrawler
+:imagesdir: images
+
+* link:http://www.careerbuilder.com/[CareerBuilder]
+* link:http://www.stolencamerafinder.com/[StolenCameraFinder]
+* link:http://www.weborama.com/[Weborama]
+* link:http://www.ontopic.io/[Ontopic]
+* link:http://www.shopstyle.com/[ShopStyle]
+* link:http://www.wombatsoftware.de/[Wombat Software]
+* link:http://commoncrawl.org/2016/10/news-dataset-available/[CommonCrawl]
+* link:https://webfinery.com/[WebFinery]
+* link:http://www.reportlinker.com/[ReportLinker]
+* link:http://www.tokenmill.lt/[TokenMill]
+* link:http://www.polecat.com/[Polecat]
+* link:http://www.wizenoze.com/en/[WizeNoze]
+* link:http://iproduct.io/[IProduct.io]
+* link:https://www.cgi.com/[CGI]
+* link:https://github.com/miras-tech/MirasText[MirasText]
+* link:https://www.g2webservices.com/[G2 Web Services]
+* link:https://www.gov.nt.ca/[Government of Northwest Territories]
+* link:https://digitalpebble.blogspot.com/2019/02/meet-stormcrawler-users-q-with-pixray.html[Pixray]
+* link:https://www.cameraforensics.com/[CameraForensics]
+* link:https://gagepiracy.com/[Gage Piracy]
+* link:https://www.clarin.eu/[Clarin ERIC]
+* link:https://openwebsearch.eu/owler/[OpenWebSearch]
+* link:https://shc-info.zml.hs-heilbronn.de/[Heilbronn University]
+* link:https://www.contexity.com[Contexity]
+* link:https://www.kodis.iao.fraunhofer.de/de/projekte/SPIDERWISE.html[Fraunhofer IAO - KODIS]
+
+Drop us a line at mailto:d...@stormcrawler.apache.org[d...@stormcrawler.apache.org] if you want to be added to this page.
diff --git a/docs/src/main/asciidoc/quick-start.adoc b/docs/src/main/asciidoc/quick-start.adoc
new file mode 100644
index 00000000..d353e2d4
--- /dev/null
+++ b/docs/src/main/asciidoc/quick-start.adoc
@@ -0,0 +1,217 @@
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+== Quick Start
+
+These instructions should help you get Apache StormCrawler up and running in 5 to 15 minutes.
+
+=== Prerequisites
+
+To run StormCrawler, you will need Java SE 17 or later.
+
+Additionally, since we'll be running the required Apache Storm cluster using Docker Compose,
+make sure Docker is installed on your operating system.
+
+=== Terminology
+
+Before diving in, here is a quick overview of the **central** Storm concepts and terminology you need to know when working with StormCrawler:
+
+- *Topology*: A topology is the overall data processing graph in Storm, consisting of spouts and bolts connected together to perform continuous, real-time computations.
+
+- *Spout*: A spout is a source component in a Storm topology that emits streams of data into the processing pipeline.
+
+- *Bolt*: A bolt processes, transforms, or routes data streams emitted by spouts or other bolts within the topology.
+
+- *Flux*: In Apache Storm, Flux is a declarative configuration framework that enables you to define and run Storm topologies using YAML files instead of writing Java code. This simplifies topology management and deployment. A minimal sketch follows this list.
+
+- *Frontier*: In the context of a web crawler, the Frontier is the component responsible for managing and prioritizing the list of URLs to be fetched next.
+
+- *Seed*: In web crawling, a Seed is an initial URL or set of URLs from which the crawler starts its discovery and fetching process.
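+
+To illustrate the Flux concept from the list above, here is a minimal, hypothetical Flux file that wires a single spout to a single bolt. The class names are placeholders for whichever components your topology actually uses; the `crawler.flux` generated by the archetype is more complete:
+
+[source,yaml]
+----
+name: "minimal-topology"
+
+config:
+  topology.workers: 1
+
+spouts:
+  - id: "spout"
+    className: "org.apache.stormcrawler.spout.MemorySpout"  # emits URLs kept in memory
+    parallelism: 1
+
+bolts:
+  - id: "fetcher"
+    className: "org.apache.stormcrawler.bolt.FetcherBolt"   # fetches the emitted URLs
+    parallelism: 1
+
+streams:
+  - from: "spout"
+    to: "fetcher"
+    grouping:
+      type: SHUFFLE   # simplified; real crawls typically partition URLs by host
+----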
+
+=== Bootstrapping a StormCrawler Project
+
+You can quickly generate a new StormCrawler project using the Maven archetype:
+
+[source,shell]
+----
+mvn archetype:generate -DarchetypeGroupId=org.apache.stormcrawler \
+    -DarchetypeArtifactId=stormcrawler-archetype \
+    -DarchetypeVersion=3.3.0
+----
+
+During the process, you'll be prompted to provide the following:
+
+* `groupId` (e.g. `com.mycompany.crawler`)
+* `artifactId` (e.g. `stormcrawler`)
+* Version
+* Package name
+* User agent details
+
+IMPORTANT: Specifying a user agent is important for crawler ethics because it identifies your crawler to websites, promoting transparency and allowing site owners to manage or block requests if needed. Be sure to provide a crawler information website as well.
+
+The archetype will generate a fully structured project including:
+
+* A pre-configured `pom.xml` with the necessary dependencies
+* Default resource files
+* A sample `crawler.flux` configuration
+* A basic configuration file
+
+After generation, navigate into the newly created directory (named after the `artifactId` you specified).
+
+TIP: You can learn more about the architecture and how each component works together in link:architecture.adoc[the architecture documentation].
+By exploring that part of the documentation, you can gain a better understanding of how StormCrawler performs crawling and how bolts, spouts, as well as parse and URL filters, collaborate in the process.
+
+==== Docker Compose Setup
+
+Below is a simple `docker-compose.yaml` configuration to spin up URLFrontier, Zookeeper, Storm Nimbus, a Storm Supervisor, and the Storm UI:
+
+[source,yaml]
+----
+services:
+  zookeeper:
+    image: zookeeper:3.9.3
+    container_name: zookeeper
+    restart: always
+
+  nimbus:
+    image: storm:latest
+    container_name: nimbus
+    hostname: nimbus
+    command: storm nimbus
+    depends_on:
+      - zookeeper
+    restart: always
+
+  supervisor:
+    image: storm:latest
+    container_name: supervisor
+    command: storm supervisor -c worker.childopts=-Xmx%HEAP-MEM%m
+    depends_on:
+      - nimbus
+      - zookeeper
+    restart: always
+
+  ui:
+    image: storm:latest
+    container_name: ui
+    command: storm ui
+    depends_on:
+      - nimbus
+    restart: always
+    ports:
+      - "127.0.0.1:8080:8080"
+
+  urlfrontier:
+    image: crawlercommons/url-frontier:latest
+    container_name: urlfrontier
+    restart: always
+    ports:
+      - "127.0.0.1:7071:7071"
+----
+
+Notes:
+
+- This example Docker Compose file uses the official Storm and Zookeeper images.
+- URLFrontier is an additional service used by StormCrawler to act as the Frontier. Please note that we also offer other Frontier implementations, such as OpenSearch or Apache Solr.
+- Ports may need adjustment depending on your environment.
+- The Storm UI runs on port 8080 by default.
+- Ensure network connectivity between services; Docker Compose handles this by default.
+
+After setting up your Docker Compose file, start it up:
+
+[source,shell]
+----
+docker compose up -d
+----
+
+Check the logs to see if every service is up and running:
+
+[source,shell]
+----
+docker compose logs -f
+----
+
+Next, access the Storm UI via `http://localhost:8080` and check that both a Storm Nimbus and a Storm Supervisor are available.
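+
+If you prefer the command line, you can also verify that Nimbus is reachable by asking it for the list of running topologies; at this stage the list should simply be empty. This assumes the service names from the Compose file above:
+
+[source,shell]
+----
+# run the Storm CLI inside the Nimbus container
+docker compose exec nimbus storm list
+----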
+
+==== Compile
+
+Build the generated project by running
+
+[source,shell]
+----
+mvn package
+----
+
+This will create an uberjar named `${artifactId}-${version}.jar` (matching the artifactId and version specified during archetype generation) in your `target` directory.
+
+==== Inject Your First Seeds
+
+Now you are ready to insert your first seeds into URLFrontier. To do so, create a file `seeds.txt` containing your seeds:
+
+[source,text]
+----
+https://stormcrawler.apache.org
+----
+
+After you have saved it, we need to inject the seeds into URLFrontier. This can be done by running URLFrontier's client:
+
+[source,shell]
+----
+java -cp target/${artifactId}-${version}.jar crawlercommons.urlfrontier.client.Client PutURLs -f seeds.txt
+----
+
+where _seeds.txt_ is the previously created file containing the URLs to inject, with one URL per line.
+
+==== Run Your First Crawl
+
+Now it is time to run our first crawl. To do so, we need to start our crawler topology in distributed mode and deploy it on our Storm cluster.
+
+[source,shell]
+----
+docker run --network ${NETWORK} -it \
+--rm \
+-v "$(pwd)/crawler-conf.yaml:/apache-storm/crawler-conf.yaml" \
+-v "$(pwd)/crawler.flux:/apache-storm/crawler.flux" \
+-v "$(pwd)/target/${artifactId}-${version}.jar:/apache-storm/${artifactId}-${version}.jar" \
+storm:latest \
+storm jar ${artifactId}-${version}.jar org.apache.storm.flux.Flux --remote crawler.flux
+----
+
+where `${NETWORK}` is the name of the Docker network created by the previously started Docker Compose setup. You can find this name by running
+
+[source,shell]
+----
+docker network ls
+----
+
+After running the `storm jar` command, you should carefully monitor the logs via
+
+[source,shell]
+----
+docker compose logs -f
+----
+
+as well as the Storm UI. It should now list a running topology.
+
+In the default archetype, the fetched content is printed to standard output.
+
+NOTE: In a Storm topology defined with Flux, parallelism specifies the number of tasks or instances of a spout or bolt to run concurrently, enabling scalable and efficient processing of data streams. In the archetype, every component is set to a parallelism of **1**.
+
+Congratulations! You have learned how to start your first simple crawl using StormCrawler.
+
+Feel free to explore the rest of our documentation to build more complex crawler topologies.
+
+=== Summary
+
+This document has shown how simple it is to get Apache StormCrawler up and running and to launch a first crawl.
diff --git a/pom.xml b/pom.xml
index 36a79bd4..335f94df 100644
--- a/pom.xml
+++ b/pom.xml
@@ -653,6 +653,7 @@ under the License.
         <module>archetype</module>
         <module>external/opensearch/archetype</module>
         <module>external/solr/archetype</module>
-    </modules>
+        <module>docs</module>
+    </modules>
 </project>