This is an automated email from the ASF dual-hosted git repository. jnioche pushed a commit to branch externaliseSelenium in repository https://gitbox.apache.org/repos/asf/stormcrawler.git
commit 02e0e90a65d852305a8f3e2886018c8b76b1778b Author: Julien Nioche <jul...@digitalpebble.com> AuthorDate: Tue Sep 2 06:46:58 2025 +0100 Externalise Selenium, fixes #1604 Signed-off-by: Julien Nioche <jul...@digitalpebble.com> --- core/pom.xml | 29 +----- core/src/main/resources/crawler-default.yaml | 24 ----- external/selenium/README.md | 6 ++ external/selenium/pom.xml | 108 +++++++++++++++++++++ external/selenium/selenium-conf.yaml | 41 ++++++++ .../protocol/selenium/NavigationFilter.java | 0 .../protocol/selenium/NavigationFilters.java | 0 .../protocol/selenium/RemoteDriverProtocol.java | 7 +- .../protocol/selenium/SeleniumProtocol.java | 0 .../protocol/selenium/ProtocolTest.java | 0 pom.xml | 1 + 11 files changed, 166 insertions(+), 50 deletions(-) diff --git a/core/pom.xml b/core/pom.xml index 86d54c35..2ae9ce90 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -46,12 +46,12 @@ under the License. <commons.lang.version>2.6</commons.lang.version> <wiremock.version>3.13.1</wiremock.version> <rometools.version>2.1.0</rometools.version> - <selenium.version>4.35.0</selenium.version> <cli.version>1.10.0</cli.version> <okhttp.version>5.1.0</okhttp.version> <caffeine.version>3.2.2</caffeine.version> <xsoup.version>0.3.7</xsoup.version> <awaitility.version>4.3.0</awaitility.version> + <guava.version>33.4.8-jre</guava.version> <jacoco.haltOnFailure>true</jacoco.haltOnFailure> <jacoco.classRatio>0.73</jacoco.classRatio> <jacoco.instructionRatio>0.54</jacoco.instructionRatio> @@ -220,18 +220,6 @@ under the License. 
<version>${commons.lang.version}</version> </dependency> - <dependency> - <groupId>org.seleniumhq.selenium</groupId> - <artifactId>selenium-remote-driver</artifactId> - <version>${selenium.version}</version> - </dependency> - - <dependency> - <groupId>org.seleniumhq.selenium</groupId> - <artifactId>selenium-support</artifactId> - <version>${selenium.version}</version> - </dependency> - <dependency> <groupId>us.codecraft</groupId> <artifactId>xsoup</artifactId> @@ -258,19 +246,12 @@ under the License. </dependency> <dependency> - <groupId>org.testcontainers</groupId> - <artifactId>selenium</artifactId> - <scope>test</scope> - </dependency> - - <dependency> - <groupId>org.seleniumhq.selenium</groupId> - <artifactId>selenium-chrome-driver</artifactId> - <version>${selenium.version}</version> - <scope>test</scope> + <groupId>com.google.guava</groupId> + <artifactId>guava</artifactId> + <version>${guava.version}</version> </dependency> - </dependencies> + <dependencyManagement> <dependencies> <dependency> diff --git a/core/src/main/resources/crawler-default.yaml b/core/src/main/resources/crawler-default.yaml index 27021730..56350b67 100644 --- a/core/src/main/resources/crawler-default.yaml +++ b/core/src/main/resources/crawler-default.yaml @@ -208,30 +208,6 @@ config: # or transferred protocol metadata must also be prefixed. protocol.md.prefix: "protocol." 
- # navigationfilters.config.file: "navigationfilters.json" - # selenium.addresses: "http://localhost:9515" - - selenium.tracing: false - - # rely on selenium's default values - # set to a value >= 0 to override - selenium.timeouts: - script: -1 - pageLoad: -1 - implicit: -1 - - # selenium.capabilities: - # a browser name is required - # browserName:"chrome" - # illustrates the use of the variable for user agent - # phantomjs.page.settings.userAgent: "$userAgent" - # ChromeDriver config - # goog:chromeOptions: - # args: - # - "--headless" - # - "--disable-gpu" - # - "--mute-audio" - # no url or parsefilters by default # parsefilters.config.file: "parsefilters.json" # urlfilters.config.file: "urlfilters.json" diff --git a/external/selenium/README.md b/external/selenium/README.md new file mode 100644 index 00000000..f0347466 --- /dev/null +++ b/external/selenium/README.md @@ -0,0 +1,6 @@ +# Selenium +Protocol implementation for Apache StormCrawler based on Selenium + +Add `selenium-conf.yaml` to the configuration of your topology. + + diff --git a/external/selenium/pom.xml b/external/selenium/pom.xml new file mode 100644 index 00000000..55ba25fe --- /dev/null +++ b/external/selenium/pom.xml @@ -0,0 +1,108 @@ +<?xml version="1.0" encoding="UTF-8"?> + +<!-- +Licensed to the Apache Software Foundation (ASF) under one +or more contributor license agreements. See the NOTICE file +distributed with this work for additional information +regarding copyright ownership. The ASF licenses this file +to you under the Apache License, Version 2.0 (the +"License"); you may not use this file except in compliance +with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, +software distributed under the License is distributed on an +"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +KIND, either express or implied. 
See the License for the +specific language governing permissions and limitations +under the License. +--> + +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + + <parent> + <groupId>org.apache.stormcrawler</groupId> + <artifactId>stormcrawler-external</artifactId> + <version>3.4.1-SNAPSHOT</version> + <relativePath>../pom.xml</relativePath> + </parent> + + <artifactId>stormcrawler-selenium</artifactId> + <packaging>jar</packaging> + + <name>stormcrawler-selenium</name> + <url>https://github.com/apache/stormcrawler/tree/master/external/selenium</url> + <description>selenium-based protocol for StormCrawler</description> + + <properties> + <selenium.version>4.35.0</selenium.version> + <awaitility.version>4.3.0</awaitility.version> + <wiremock.version>3.13.1</wiremock.version> + <jacoco.haltOnFailure>true</jacoco.haltOnFailure> + <jacoco.classRatio>0.00</jacoco.classRatio> + <jacoco.instructionRatio>0.00</jacoco.instructionRatio> + <jacoco.methodRatio>0.00</jacoco.methodRatio> + <jacoco.branchRatio>0.00</jacoco.branchRatio> + <jacoco.lineRatio>0.00</jacoco.lineRatio> + <jacoco.complexityRatio>0.00</jacoco.complexityRatio> + </properties> + + <dependencies> + <dependency> + <groupId>org.seleniumhq.selenium</groupId> + <artifactId>selenium-remote-driver</artifactId> + <version>${selenium.version}</version> + </dependency> + + <dependency> + <groupId>org.seleniumhq.selenium</groupId> + <artifactId>selenium-support</artifactId> + <version>${selenium.version}</version> + </dependency> + + <dependency> + <groupId>org.apache.stormcrawler</groupId> + <artifactId>stormcrawler-core</artifactId> + <version>${project.version}</version> + <type>test-jar</type> + <scope>test</scope> + </dependency> + + <dependency> + <groupId>org.wiremock</groupId> + <artifactId>wiremock</artifactId> + 
<version>${wiremock.version}</version> + <scope>test</scope> + </dependency> + + <dependency> + <groupId>org.awaitility</groupId> + <artifactId>awaitility</artifactId> + <version>${awaitility.version}</version> + <scope>test</scope> + </dependency> + + <dependency> + <groupId>org.testcontainers</groupId> + <artifactId>selenium</artifactId> + <scope>test</scope> + </dependency> + + <dependency> + <groupId>org.seleniumhq.selenium</groupId> + <artifactId>selenium-chrome-driver</artifactId> + <version>${selenium.version}</version> + <scope>test</scope> + </dependency> + + <dependency> + <groupId>org.testcontainers</groupId> + <artifactId>junit-jupiter</artifactId> + <scope>test</scope> + </dependency> + + </dependencies> + +</project> diff --git a/external/selenium/selenium-conf.yaml b/external/selenium/selenium-conf.yaml new file mode 100644 index 00000000..83250ed7 --- /dev/null +++ b/external/selenium/selenium-conf.yaml @@ -0,0 +1,41 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +config: + # navigationfilters.config.file: "navigationfilters.json" + # selenium.addresses: "http://localhost:9515" + + selenium.tracing: false + + # rely on selenium's default values + # set to a value >= 0 to override + selenium.timeouts: + script: -1 + pageLoad: -1 + implicit: -1 + + # selenium.capabilities: + # a browser name is required + # browserName: "chrome" + # illustrates the use of the variable for user agent + # phantomjs.page.settings.userAgent: "$useragent" + # ChromeDriver config + # goog:chromeOptions: + # args: + # - "--headless" + # - "--disable-gpu" + # - "--mute-audio" + + diff --git a/core/src/main/java/org/apache/stormcrawler/protocol/selenium/NavigationFilter.java b/external/selenium/src/main/java/org/apache/stormcrawler/protocol/selenium/NavigationFilter.java similarity index 100% rename from core/src/main/java/org/apache/stormcrawler/protocol/selenium/NavigationFilter.java rename to external/selenium/src/main/java/org/apache/stormcrawler/protocol/selenium/NavigationFilter.java diff --git a/core/src/main/java/org/apache/stormcrawler/protocol/selenium/NavigationFilters.java b/external/selenium/src/main/java/org/apache/stormcrawler/protocol/selenium/NavigationFilters.java similarity index 100% rename from core/src/main/java/org/apache/stormcrawler/protocol/selenium/NavigationFilters.java rename to external/selenium/src/main/java/org/apache/stormcrawler/protocol/selenium/NavigationFilters.java diff --git a/core/src/main/java/org/apache/stormcrawler/protocol/selenium/RemoteDriverProtocol.java b/external/selenium/src/main/java/org/apache/stormcrawler/protocol/selenium/RemoteDriverProtocol.java similarity index 95% rename from core/src/main/java/org/apache/stormcrawler/protocol/selenium/RemoteDriverProtocol.java rename to external/selenium/src/main/java/org/apache/stormcrawler/protocol/selenium/RemoteDriverProtocol.java index 219f4b0a..75b03bdf 100644 --- a/core/src/main/java/org/apache/stormcrawler/protocol/selenium/RemoteDriverProtocol.java 
+++ b/external/selenium/src/main/java/org/apache/stormcrawler/protocol/selenium/RemoteDriverProtocol.java @@ -19,6 +19,7 @@ package org.apache.stormcrawler.protocol.selenium; import java.net.URL; import java.time.Duration; import java.util.ArrayList; +import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Map.Entry; @@ -38,7 +39,9 @@ public class RemoteDriverProtocol extends SeleniumProtocol { private void substituteUserAgent(Map<String, Object> keyvals, final String userAgentString) { if (keyvals == null) return; - for (Entry<String, Object> entry : keyvals.entrySet()) { + Iterator<Entry<String, Object>> iter = keyvals.entrySet().iterator(); + while (iter.hasNext()) { + Entry<String, Object> entry = iter.next(); Object val = entry.getValue(); // substitute variable $useragent for the real value if (val instanceof String && val.toString().contains("$useragent")) { @@ -47,7 +50,7 @@ public class RemoteDriverProtocol extends SeleniumProtocol { } else if (val instanceof Map<?, ?>) { substituteUserAgent((Map<String, Object>) val, userAgentString); } else if (val instanceof List<?>) { - List<String> newList = new ArrayList<>(); + List newList = new ArrayList<String>(); ((List<String>) val) .forEach( s -> { diff --git a/core/src/main/java/org/apache/stormcrawler/protocol/selenium/SeleniumProtocol.java b/external/selenium/src/main/java/org/apache/stormcrawler/protocol/selenium/SeleniumProtocol.java similarity index 100% rename from core/src/main/java/org/apache/stormcrawler/protocol/selenium/SeleniumProtocol.java rename to external/selenium/src/main/java/org/apache/stormcrawler/protocol/selenium/SeleniumProtocol.java diff --git a/core/src/test/java/org/apache/stormcrawler/protocol/selenium/ProtocolTest.java b/external/selenium/src/test/java/org/apache/stormcrawler/protocol/selenium/ProtocolTest.java similarity index 100% rename from core/src/test/java/org/apache/stormcrawler/protocol/selenium/ProtocolTest.java rename to 
external/selenium/src/test/java/org/apache/stormcrawler/protocol/selenium/ProtocolTest.java diff --git a/pom.xml b/pom.xml index 484f6c6c..2bde4648 100644 --- a/pom.xml +++ b/pom.xml @@ -645,6 +645,7 @@ under the License. <module>external/langid</module> <module>external/opensearch</module> <module>external/playwright</module> + <module>external/selenium</module> <module>external/solr</module> <module>external/sql</module> <module>external/tika</module>