This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git
The following commit(s) were added to refs/heads/master by this push:
     new 820d129a8 NUTCH-3000 - the selenium protocol should return the full html, not just the inner body element.
     new c1ba16cba Merge pull request #773 from tballison/NUTCH-3000
820d129a8 is described below

commit 820d129a8adff9a34eed2ed3c04cfee377b56b63
Author: tallison <talli...@apache.org>
AuthorDate: Wed Sep 13 10:26:25 2023 -0400

    NUTCH-3000 - the selenium protocol should return the full html, not just the inner body element.
---
 .../java/org/apache/nutch/protocol/selenium/HttpWebClient.java | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java b/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java
index 4b998d1bc..b0b12004d 100644
--- a/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java
+++ b/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java
@@ -234,7 +234,7 @@ public class HttpWebClient {
   }
 
   /**
-   * Function for obtaining the HTML BODY using the selected <a href=
+   * Function for obtaining the HTML using the selected <a href=
    * 'https://seleniumhq.github.io/selenium/docs/api/java/org/openqa/selenium/WebDriver.html'>selenium
    * webdriver</a> There are a number of configuration properties within
    * <code>nutch-site.xml</code> which determine whether to take screenshots of
@@ -244,7 +244,7 @@ public class HttpWebClient {
    *          the URL to fetch and render
    * @param conf
    *          the {@link org.apache.hadoop.conf.Configuration}
-   * @return the rendered inner HTML page
+   * @return the html page
    */
   public static String getHtmlPage(String url, Configuration conf) {
     WebDriver driver = getDriverForPage(url, conf);
@@ -253,10 +253,7 @@ public class HttpWebClient {
       if (conf.getBoolean("take.screenshot", false)) {
         takeScreenshot(driver, conf);
       }
-
-      String innerHtml = driver.findElement(By.tagName("body"))
-          .getAttribute("innerHTML");
-      return innerHtml;
+      return driver.getPageSource();
 
       // I'm sure this catch statement is a code smell ; borrowing it from
       // lib-htmlunit