This is an automated email from the ASF dual-hosted git repository.

jnioche pushed a commit to branch externaliseSelenium
in repository https://gitbox.apache.org/repos/asf/stormcrawler.git

commit 02e0e90a65d852305a8f3e2886018c8b76b1778b
Author: Julien Nioche <jul...@digitalpebble.com>
AuthorDate: Tue Sep 2 06:46:58 2025 +0100

    Externalise Selenium, fixes #1604
    
    Signed-off-by: Julien Nioche <jul...@digitalpebble.com>
---
 core/pom.xml                                       |  29 +-----
 core/src/main/resources/crawler-default.yaml       |  24 -----
 external/selenium/README.md                        |   6 ++
 external/selenium/pom.xml                          | 108 +++++++++++++++++++++
 external/selenium/selenium-conf.yaml               |  41 ++++++++
 .../protocol/selenium/NavigationFilter.java        |   0
 .../protocol/selenium/NavigationFilters.java       |   0
 .../protocol/selenium/RemoteDriverProtocol.java    |   7 +-
 .../protocol/selenium/SeleniumProtocol.java        |   0
 .../protocol/selenium/ProtocolTest.java            |   0
 pom.xml                                            |   1 +
 11 files changed, 166 insertions(+), 50 deletions(-)

diff --git a/core/pom.xml b/core/pom.xml
index 86d54c35..2ae9ce90 100644
--- a/core/pom.xml
+++ b/core/pom.xml
@@ -46,12 +46,12 @@ under the License.
                <commons.lang.version>2.6</commons.lang.version>
                <wiremock.version>3.13.1</wiremock.version>
                <rometools.version>2.1.0</rometools.version>
-               <selenium.version>4.35.0</selenium.version>
                <cli.version>1.10.0</cli.version>
                <okhttp.version>5.1.0</okhttp.version>
                <caffeine.version>3.2.2</caffeine.version>
                <xsoup.version>0.3.7</xsoup.version>
                <awaitility.version>4.3.0</awaitility.version>
+               <guava.version>33.4.8-jre</guava.version>
                <jacoco.haltOnFailure>true</jacoco.haltOnFailure>
                <jacoco.classRatio>0.73</jacoco.classRatio>
                <jacoco.instructionRatio>0.54</jacoco.instructionRatio>
@@ -220,18 +220,6 @@ under the License.
                        <version>${commons.lang.version}</version>
                </dependency>
 
-               <dependency>
-                       <groupId>org.seleniumhq.selenium</groupId>
-                       <artifactId>selenium-remote-driver</artifactId>
-                       <version>${selenium.version}</version>
-               </dependency>
-
-               <dependency>
-                       <groupId>org.seleniumhq.selenium</groupId>
-                       <artifactId>selenium-support</artifactId>
-                       <version>${selenium.version}</version>
-               </dependency>
-
                <dependency>
                        <groupId>us.codecraft</groupId>
                        <artifactId>xsoup</artifactId>
@@ -258,19 +246,12 @@ under the License.
                </dependency>
 
                <dependency>
-                       <groupId>org.testcontainers</groupId>
-                       <artifactId>selenium</artifactId>
-                       <scope>test</scope>
-               </dependency>
-
-               <dependency>
-                       <groupId>org.seleniumhq.selenium</groupId>
-                       <artifactId>selenium-chrome-driver</artifactId>
-                       <version>${selenium.version}</version>
-                       <scope>test</scope>
+                       <groupId>com.google.guava</groupId>
+                       <artifactId>guava</artifactId>
+                       <version>${guava.version}</version>
                </dependency>
-
        </dependencies>
+
        <dependencyManagement>
                <dependencies>
                        <dependency>
diff --git a/core/src/main/resources/crawler-default.yaml b/core/src/main/resources/crawler-default.yaml
index 27021730..56350b67 100644
--- a/core/src/main/resources/crawler-default.yaml
+++ b/core/src/main/resources/crawler-default.yaml
@@ -208,30 +208,6 @@ config:
   # or transferred protocol metadata must also be prefixed.
   protocol.md.prefix: "protocol."
 
-  # navigationfilters.config.file: "navigationfilters.json"
-  # selenium.addresses: "http://localhost:9515"
-
-  selenium.tracing: false
-
-  # rely on selenium's default values
-  # set to a value >= 0 to override
-  selenium.timeouts:
-    script: -1
-    pageLoad: -1
-    implicit: -1
-
-  # selenium.capabilities:
-    # a browser name is required
-    # browserName:"chrome"
-    # illustrates the use of the variable for user agent
-    # phantomjs.page.settings.userAgent: "$userAgent"
-    # ChromeDriver config
-    # goog:chromeOptions:
-    #   args: 
-    #      - "--headless"
-    #      - "--disable-gpu"
-    #      - "--mute-audio"
-
   # no url or parsefilters by default
   # parsefilters.config.file: "parsefilters.json"
   # urlfilters.config.file: "urlfilters.json"
diff --git a/external/selenium/README.md b/external/selenium/README.md
new file mode 100644
index 00000000..f0347466
--- /dev/null
+++ b/external/selenium/README.md
@@ -0,0 +1,6 @@
+# Selenium
+Protocol implementation for Apache StormCrawler based on Selenium
+
+Add `selenium-conf.yaml` to the configuration of your topology.
+
+
diff --git a/external/selenium/pom.xml b/external/selenium/pom.xml
new file mode 100644
index 00000000..55ba25fe
--- /dev/null
+++ b/external/selenium/pom.xml
@@ -0,0 +1,108 @@
+<?xml version="1.0" encoding="UTF-8"?>
+
+<!--
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements.  See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership.  The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing,
+software distributed under the License is distributed on an
+"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+KIND, either express or implied.  See the License for the
+specific language governing permissions and limitations
+under the License.
+-->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+       <modelVersion>4.0.0</modelVersion>
+
+       <parent>
+               <groupId>org.apache.stormcrawler</groupId>
+               <artifactId>stormcrawler-external</artifactId>
+               <version>3.4.1-SNAPSHOT</version>
+               <relativePath>../pom.xml</relativePath>
+       </parent>
+
+       <artifactId>stormcrawler-selenium</artifactId>
+       <packaging>jar</packaging>
+
+       <name>stormcrawler-selenium</name>
+       <url>https://github.com/apache/stormcrawler/tree/master/external/selenium</url>
+       <description>selenium-based protocol for StormCrawler</description>
+
+       <properties>
+               <selenium.version>4.35.0</selenium.version>
+               <awaitility.version>4.3.0</awaitility.version>
+               <wiremock.version>3.13.1</wiremock.version>
+               <jacoco.haltOnFailure>true</jacoco.haltOnFailure>
+               <jacoco.classRatio>0.00</jacoco.classRatio>
+               <jacoco.instructionRatio>0.00</jacoco.instructionRatio>
+               <jacoco.methodRatio>0.00</jacoco.methodRatio>
+               <jacoco.branchRatio>0.00</jacoco.branchRatio>
+               <jacoco.lineRatio>0.00</jacoco.lineRatio>
+               <jacoco.complexityRatio>0.00</jacoco.complexityRatio>
+       </properties>
+
+       <dependencies>
+               <dependency>
+                       <groupId>org.seleniumhq.selenium</groupId>
+                       <artifactId>selenium-remote-driver</artifactId>
+                       <version>${selenium.version}</version>
+               </dependency>
+
+               <dependency>
+                       <groupId>org.seleniumhq.selenium</groupId>
+                       <artifactId>selenium-support</artifactId>
+                       <version>${selenium.version}</version>
+               </dependency>
+
+               <dependency>
+                       <groupId>org.apache.stormcrawler</groupId>
+                       <artifactId>stormcrawler-core</artifactId>
+                       <version>${project.version}</version>
+                       <type>test-jar</type>
+                       <scope>test</scope>
+               </dependency>
+
+               <dependency>
+                       <groupId>org.wiremock</groupId>
+                       <artifactId>wiremock</artifactId>
+                       <version>${wiremock.version}</version>
+                       <scope>test</scope>
+               </dependency>
+
+               <dependency>
+                       <groupId>org.awaitility</groupId>
+                       <artifactId>awaitility</artifactId>
+                       <version>${awaitility.version}</version>
+                       <scope>test</scope>
+               </dependency>
+
+               <dependency>
+                       <groupId>org.testcontainers</groupId>
+                       <artifactId>selenium</artifactId>
+                       <scope>test</scope>
+               </dependency>
+
+               <dependency>
+                       <groupId>org.seleniumhq.selenium</groupId>
+                       <artifactId>selenium-chrome-driver</artifactId>
+                       <version>${selenium.version}</version>
+                       <scope>test</scope>
+               </dependency>
+
+               <dependency>
+                       <groupId>org.testcontainers</groupId>
+                       <artifactId>junit-jupiter</artifactId>
+                       <scope>test</scope>
+               </dependency>
+
+       </dependencies>
+
+</project>
diff --git a/external/selenium/selenium-conf.yaml b/external/selenium/selenium-conf.yaml
new file mode 100644
index 00000000..83250ed7
--- /dev/null
+++ b/external/selenium/selenium-conf.yaml
@@ -0,0 +1,41 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+config:
+  # navigationfilters.config.file: "navigationfilters.json"
+  # selenium.addresses: "http://localhost:9515"
+
+  selenium.tracing: false
+
+  # rely on selenium's default values
+  # set to a value >= 0 to override
+  selenium.timeouts:
+    script: -1
+    pageLoad: -1
+    implicit: -1
+
+  # selenium.capabilities:
+    # a browser name is required
+    # browserName: "chrome"
+    # illustrates the use of the variable for user agent
+    # phantomjs.page.settings.userAgent: "$useragent"
+    # ChromeDriver config
+    # goog:chromeOptions:
+    #   args:
+    #      - "--headless"
+    #      - "--disable-gpu"
+    #      - "--mute-audio"
+
+
diff --git a/core/src/main/java/org/apache/stormcrawler/protocol/selenium/NavigationFilter.java b/external/selenium/src/main/java/org/apache/stormcrawler/protocol/selenium/NavigationFilter.java
similarity index 100%
rename from core/src/main/java/org/apache/stormcrawler/protocol/selenium/NavigationFilter.java
rename to external/selenium/src/main/java/org/apache/stormcrawler/protocol/selenium/NavigationFilter.java
diff --git a/core/src/main/java/org/apache/stormcrawler/protocol/selenium/NavigationFilters.java b/external/selenium/src/main/java/org/apache/stormcrawler/protocol/selenium/NavigationFilters.java
similarity index 100%
rename from core/src/main/java/org/apache/stormcrawler/protocol/selenium/NavigationFilters.java
rename to external/selenium/src/main/java/org/apache/stormcrawler/protocol/selenium/NavigationFilters.java
diff --git a/core/src/main/java/org/apache/stormcrawler/protocol/selenium/RemoteDriverProtocol.java b/external/selenium/src/main/java/org/apache/stormcrawler/protocol/selenium/RemoteDriverProtocol.java
similarity index 95%
rename from core/src/main/java/org/apache/stormcrawler/protocol/selenium/RemoteDriverProtocol.java
rename to external/selenium/src/main/java/org/apache/stormcrawler/protocol/selenium/RemoteDriverProtocol.java
index 219f4b0a..75b03bdf 100644
--- a/core/src/main/java/org/apache/stormcrawler/protocol/selenium/RemoteDriverProtocol.java
+++ b/external/selenium/src/main/java/org/apache/stormcrawler/protocol/selenium/RemoteDriverProtocol.java
@@ -19,6 +19,7 @@ package org.apache.stormcrawler.protocol.selenium;
 import java.net.URL;
 import java.time.Duration;
 import java.util.ArrayList;
+import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
 import java.util.Map.Entry;
@@ -38,7 +39,9 @@ public class RemoteDriverProtocol extends SeleniumProtocol {
     private void substituteUserAgent(Map<String, Object> keyvals, final String userAgentString) {
         if (keyvals == null) return;
 
-        for (Entry<String, Object> entry : keyvals.entrySet()) {
+        Iterator<Entry<String, Object>> iter = keyvals.entrySet().iterator();
+        while (iter.hasNext()) {
+            Entry<String, Object> entry = iter.next();
             Object val = entry.getValue();
             // substitute variable $useragent for the real value
             if (val instanceof String && val.toString().contains("$useragent")) {
@@ -47,7 +50,7 @@ public class RemoteDriverProtocol extends SeleniumProtocol {
             } else if (val instanceof Map<?, ?>) {
                 substituteUserAgent((Map<String, Object>) val, userAgentString);
             } else if (val instanceof List<?>) {
-                List<String> newList = new ArrayList<>();
+                List newList = new ArrayList<String>();
                 ((List<String>) val)
                         .forEach(
                                 s -> {
diff --git a/core/src/main/java/org/apache/stormcrawler/protocol/selenium/SeleniumProtocol.java b/external/selenium/src/main/java/org/apache/stormcrawler/protocol/selenium/SeleniumProtocol.java
similarity index 100%
rename from core/src/main/java/org/apache/stormcrawler/protocol/selenium/SeleniumProtocol.java
rename to external/selenium/src/main/java/org/apache/stormcrawler/protocol/selenium/SeleniumProtocol.java
diff --git a/core/src/test/java/org/apache/stormcrawler/protocol/selenium/ProtocolTest.java b/external/selenium/src/test/java/org/apache/stormcrawler/protocol/selenium/ProtocolTest.java
similarity index 100%
rename from core/src/test/java/org/apache/stormcrawler/protocol/selenium/ProtocolTest.java
rename to external/selenium/src/test/java/org/apache/stormcrawler/protocol/selenium/ProtocolTest.java
diff --git a/pom.xml b/pom.xml
index 484f6c6c..2bde4648 100644
--- a/pom.xml
+++ b/pom.xml
@@ -645,6 +645,7 @@ under the License.
                <module>external/langid</module>
                <module>external/opensearch</module>
                <module>external/playwright</module>
+               <module>external/selenium</module>
                <module>external/solr</module>
                <module>external/sql</module>
                <module>external/tika</module>

Reply via email to