Repository: nutch Updated Branches: refs/heads/master a3e742049 -> a9b2491a3
fix for NUTCH-2241 contributed by karanjeets Project: http://git-wip-us.apache.org/repos/asf/nutch/repo Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/230693d6 Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/230693d6 Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/230693d6 Branch: refs/heads/master Commit: 230693d6dc648f587e88e59817eea934166c9247 Parents: a3e7420 Author: Karanjeet Singh <[email protected]> Authored: Sat Mar 19 16:55:40 2016 -0700 Committer: Karanjeet Singh <[email protected]> Committed: Sat Mar 19 16:55:40 2016 -0700 ---------------------------------------------------------------------- conf/nutch-default.xml | 50 +++++++++++++++++++ .../nutch/protocol/selenium/HttpWebClient.java | 52 ++++++++++++++------ 2 files changed, 86 insertions(+), 16 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/nutch/blob/230693d6/conf/nutch-default.xml ---------------------------------------------------------------------- diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml index 71f7efb..93503f3 100644 --- a/conf/nutch-default.xml +++ b/conf/nutch-default.xml @@ -1953,6 +1953,56 @@ visit https://wiki.apache.org/nutch/SimilarityScoringFilter--> </description> </property> +<!-- selenium firefox configuration; + applies to protocol-selenium and protocol-interactiveselenium plugins --> +<property> + <name>selenium.firefox.allowed.hosts</name> + <value>localhost</value> + <description>A String value representing the allowed hosts preference + according to the operating system hosts file (Example - /etc/hosts in Unix). + Currently this option exist for - 'firefox' </description> +</property> + +<property> + <name>selenium.firefox.binary.timeout</name> + <value>45</value> + <description>A Long value representing the timeout value + for firefox to be available for command execution. The value is in seconds. 
+ Currently this option exists for - 'firefox' </description> +</property> + +<property> + <name>selenium.firefox.enable.flash</name> + <value>false</value> + <description>A Boolean value representing if flash should + be enabled or disabled. The default value is disabled. + Currently this option exists for - 'firefox' </description> +</property> + +<property> + <name>selenium.firefox.load.image</name> + <value>1</value> + <description>An Integer value representing the restriction on + loading images. The default value is no restriction i.e. load all images. + Other options are: + 1: Load all images, regardless of origin + 2: Block all images + 3: Prevent third-party images from loading + Currently this option exists for - 'firefox' </description> +</property> + +<property> + <name>selenium.firefox.load.stylesheet</name> + <value>1</value> + <description>An Integer value representing the restriction on + loading stylesheets. The default value is no restriction i.e. load + all stylesheets. + Other options are: + 1: Load all stylesheets + 2: Block all stylesheets + Currently this option exists for - 'firefox' </description> +</property> + <!-- lib-selenium configuration --> <property> <name>libselenium.page.load.delay</name> http://git-wip-us.apache.org/repos/asf/nutch/blob/230693d6/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java ---------------------------------------------------------------------- diff --git a/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java b/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java index 20d0b23..583b840 100644 --- a/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java +++ b/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java @@ -16,35 +16,36 @@ */ package org.apache.nutch.protocol.selenium; +import java.io.BufferedInputStream; +import java.io.File; +import 
java.io.FileInputStream; +import java.io.InputStream; +import java.io.OutputStream; +import java.net.URL; +import java.util.concurrent.TimeUnit; + import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.IOUtils; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; import org.openqa.selenium.By; import org.openqa.selenium.OutputType; import org.openqa.selenium.TakesScreenshot; +import org.openqa.selenium.TimeoutException; import org.openqa.selenium.WebDriver; import org.openqa.selenium.chrome.ChromeDriver; +import org.openqa.selenium.firefox.FirefoxBinary; import org.openqa.selenium.firefox.FirefoxDriver; import org.openqa.selenium.firefox.FirefoxProfile; +import org.openqa.selenium.io.TemporaryFilesystem; import org.openqa.selenium.remote.DesiredCapabilities; import org.openqa.selenium.remote.RemoteWebDriver; import org.openqa.selenium.safari.SafariDriver; -import org.openqa.selenium.support.ui.WebDriverWait; -import org.openqa.selenium.io.TemporaryFilesystem; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import com.opera.core.systems.OperaDriver; -import java.io.BufferedInputStream; -import java.io.File; -import java.io.FileInputStream; -import java.io.InputStream; -import java.io.OutputStream; -import java.lang.String; -import java.net.URL; - public class HttpWebClient { private static final Logger LOG = LoggerFactory.getLogger(HttpWebClient.class); @@ -58,6 +59,7 @@ public class HttpWebClient { profile.setPreference("permissions.default.stylesheet", 2); profile.setPreference("permissions.default.image", 2); profile.setPreference("dom.ipc.plugins.enabled.libflashplayer.so", "false"); + profile.setPreference(FirefoxProfile.ALLOWED_HOSTS_PREFERENCE, "localhost"); WebDriver driver = new FirefoxDriver(profile); return driver; }; @@ -72,7 +74,19 @@ public class HttpWebClient { String driverType = conf.get("selenium.driver", "firefox"); switch (driverType) { 
case "firefox": - driver = new FirefoxDriver(); + String allowedHost = conf.get("selenium.firefox.allowed.hosts", "localhost"); + long firefoxBinaryTimeout = conf.getLong("selenium.firefox.binary.timeout", 45); + boolean enableFlashPlayer = conf.getBoolean("selenium.firefox.enable.flash", false); + int loadImage = conf.getInt("selenium.firefox.load.image", 1); + int loadStylesheet = conf.getInt("selenium.firefox.load.stylesheet", 1); + FirefoxProfile profile = new FirefoxProfile(); + FirefoxBinary binary = new FirefoxBinary(); + profile.setPreference(FirefoxProfile.ALLOWED_HOSTS_PREFERENCE, allowedHost); + profile.setPreference("dom.ipc.plugins.enabled.libflashplayer.so", enableFlashPlayer); + profile.setPreference("permissions.default.stylesheet", loadStylesheet); + profile.setPreference("permissions.default.image", loadImage); + binary.setTimeout(TimeUnit.SECONDS.toMillis(firefoxBinaryTimeout)); + driver = new FirefoxDriver(binary, profile); break; case "chrome": driver = new ChromeDriver(); @@ -112,11 +126,16 @@ public class HttpWebClient { } LOG.debug("Selenium {} WebDriver selected.", driverType); + driver.manage().timeouts().pageLoadTimeout(pageLoadWait, TimeUnit.SECONDS); driver.get(url); - new WebDriverWait(driver, pageLoadWait); } catch (Exception e) { - throw new RuntimeException(e); - } + if(e instanceof TimeoutException) { + LOG.debug("Selenium WebDriver: Timeout Exception: Capturing whatever loaded so far..."); + return driver; + } + cleanUpDriver(driver); + throw new RuntimeException(e); + } return driver; } @@ -132,6 +151,7 @@ public class HttpWebClient { public static void cleanUpDriver(WebDriver driver) { if (driver != null) { try { + driver.close(); driver.quit(); TemporaryFilesystem.getDefaultTmpFS().deleteTemporaryFiles(); } catch (Exception e) {
