This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git
The following commit(s) were added to refs/heads/master by this push:
new 820d129a8 NUTCH-3000 - the selenium protocol should return the full
html, not just the inner body element.
new c1ba16cba Merge pull request #773 from tballison/NUTCH-3000
820d129a8 is described below
commit 820d129a8adff9a34eed2ed3c04cfee377b56b63
Author: tallison <[email protected]>
AuthorDate: Wed Sep 13 10:26:25 2023 -0400
NUTCH-3000 - the selenium protocol should return the full html, not just
the inner body element.
---
.../java/org/apache/nutch/protocol/selenium/HttpWebClient.java | 9 +++------
1 file changed, 3 insertions(+), 6 deletions(-)
diff --git a/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java b/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java
index 4b998d1bc..b0b12004d 100644
--- a/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java
+++ b/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java
@@ -234,7 +234,7 @@ public class HttpWebClient {
}
/**
- * Function for obtaining the HTML BODY using the selected <a href=
+ * Function for obtaining the HTML using the selected <a href=
* 'https://seleniumhq.github.io/selenium/docs/api/java/org/openqa/selenium/WebDriver.html'>selenium
* webdriver</a> There are a number of configuration properties within
* <code>nutch-site.xml</code> which determine whether to take screenshots of
@@ -244,7 +244,7 @@ public class HttpWebClient {
* the URL to fetch and render
* @param conf
* the {@link org.apache.hadoop.conf.Configuration}
- * @return the rendered inner HTML page
+ * @return the html page
*/
public static String getHtmlPage(String url, Configuration conf) {
WebDriver driver = getDriverForPage(url, conf);
@@ -253,10 +253,7 @@ public class HttpWebClient {
if (conf.getBoolean("take.screenshot", false)) {
takeScreenshot(driver, conf);
}
-
- String innerHtml = driver.findElement(By.tagName("body"))
- .getAttribute("innerHTML");
- return innerHtml;
+ return driver.getPageSource();
// I'm sure this catch statement is a code smell ; borrowing it from
// lib-htmlunit