Repository: nutch
Updated Branches:
  refs/heads/master a3e742049 -> a9b2491a3


fix for NUTCH-2241 contributed by karanjeets


Project: http://git-wip-us.apache.org/repos/asf/nutch/repo
Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/230693d6
Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/230693d6
Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/230693d6

Branch: refs/heads/master
Commit: 230693d6dc648f587e88e59817eea934166c9247
Parents: a3e7420
Author: Karanjeet Singh <[email protected]>
Authored: Sat Mar 19 16:55:40 2016 -0700
Committer: Karanjeet Singh <[email protected]>
Committed: Sat Mar 19 16:55:40 2016 -0700

----------------------------------------------------------------------
 conf/nutch-default.xml                          | 50 +++++++++++++++++++
 .../nutch/protocol/selenium/HttpWebClient.java  | 52 ++++++++++++++------
 2 files changed, 86 insertions(+), 16 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/nutch/blob/230693d6/conf/nutch-default.xml
----------------------------------------------------------------------
diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index 71f7efb..93503f3 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml
@@ -1953,6 +1953,56 @@ visit 
https://wiki.apache.org/nutch/SimilarityScoringFilter-->
  </description>
 </property>
 
+<!-- selenium firefox configuration; 
+     applies to protocol-selenium and protocol-interactiveselenium plugins -->
+<property>
+  <name>selenium.firefox.allowed.hosts</name>
+  <value>localhost</value>
+  <description>A String value representing the allowed hosts preference
+  according to the operating system hosts file (Example - /etc/hosts in Unix). 
+  Currently this option exists only for 'firefox'. </description>
+</property>
+
+<property>
+  <name>selenium.firefox.binary.timeout</name>
+  <value>45</value>
+  <description>A Long value representing the timeout value
+  for firefox to be available for command execution. The value is in seconds. 
+  Currently this option exists only for 'firefox'. </description>
+</property>
+
+<property>
+  <name>selenium.firefox.enable.flash</name>
+  <value>false</value>
+  <description>A Boolean value representing if flash should
+  be enabled or disabled. The default value is disabled. 
+  Currently this option exists only for 'firefox'. </description>
+</property>
+
+<property>
+  <name>selenium.firefox.load.image</name>
+  <value>1</value>
+  <description>An Integer value representing the restriction on
+  loading images. The default value is no restriction i.e. load all images.
+  Other options are:
+  1: Load all images, regardless of origin
+  2: Block all images
+  3: Prevent third-party images from loading 
+  Currently this option exists only for 'firefox'. </description>
+</property>
+
+<property>
+  <name>selenium.firefox.load.stylesheet</name>
+  <value>1</value>
+  <description>An Integer value representing the restriction on
+  loading stylesheets. The default value is no restriction i.e. load 
+  all stylesheets.
+  Other options are:
+  1: Load all stylesheets
+  2: Block all stylesheets
+  Currently this option exists only for 'firefox'. </description>
+</property>
+
 <!-- lib-selenium configuration -->
 <property>
   <name>libselenium.page.load.delay</name>

http://git-wip-us.apache.org/repos/asf/nutch/blob/230693d6/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java
----------------------------------------------------------------------
diff --git 
a/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java
 
b/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java
index 20d0b23..583b840 100644
--- 
a/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java
+++ 
b/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java
@@ -16,35 +16,36 @@
  */
 package org.apache.nutch.protocol.selenium;
 
+import java.io.BufferedInputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.net.URL;
+import java.util.concurrent.TimeUnit;
+
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.IOUtils;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
 import org.openqa.selenium.By;
 import org.openqa.selenium.OutputType;
 import org.openqa.selenium.TakesScreenshot;
+import org.openqa.selenium.TimeoutException;
 import org.openqa.selenium.WebDriver;
 import org.openqa.selenium.chrome.ChromeDriver;
+import org.openqa.selenium.firefox.FirefoxBinary;
 import org.openqa.selenium.firefox.FirefoxDriver;
 import org.openqa.selenium.firefox.FirefoxProfile;
+import org.openqa.selenium.io.TemporaryFilesystem;
 import org.openqa.selenium.remote.DesiredCapabilities;
 import org.openqa.selenium.remote.RemoteWebDriver;
 import org.openqa.selenium.safari.SafariDriver;
-import org.openqa.selenium.support.ui.WebDriverWait;
-import org.openqa.selenium.io.TemporaryFilesystem;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 import com.opera.core.systems.OperaDriver;
 
-import java.io.BufferedInputStream;
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.InputStream;
-import java.io.OutputStream;
-import java.lang.String;
-import java.net.URL;
-
 public class HttpWebClient {
 
   private static final Logger LOG = 
LoggerFactory.getLogger(HttpWebClient.class);
@@ -58,6 +59,7 @@ public class HttpWebClient {
       profile.setPreference("permissions.default.stylesheet", 2);
       profile.setPreference("permissions.default.image", 2);
       profile.setPreference("dom.ipc.plugins.enabled.libflashplayer.so", 
"false");
+      profile.setPreference(FirefoxProfile.ALLOWED_HOSTS_PREFERENCE, 
"localhost");
       WebDriver driver = new FirefoxDriver(profile);
       return driver;
     };
@@ -72,7 +74,19 @@ public class HttpWebClient {
         String driverType  = conf.get("selenium.driver", "firefox");
         switch (driverType) {
           case "firefox":
-            driver = new FirefoxDriver();
+               String allowedHost = conf.get("selenium.firefox.allowed.hosts", 
"localhost");
+               long firefoxBinaryTimeout = 
conf.getLong("selenium.firefox.binary.timeout", 45);
+               boolean enableFlashPlayer = 
conf.getBoolean("selenium.firefox.enable.flash", false);
+               int loadImage = conf.getInt("selenium.firefox.load.image", 1);
+               int loadStylesheet = 
conf.getInt("selenium.firefox.load.stylesheet", 1);
+                   FirefoxProfile profile = new FirefoxProfile();
+                   FirefoxBinary binary = new FirefoxBinary();
+                   
profile.setPreference(FirefoxProfile.ALLOWED_HOSTS_PREFERENCE, allowedHost);
+                   
profile.setPreference("dom.ipc.plugins.enabled.libflashplayer.so", 
enableFlashPlayer);
+                   profile.setPreference("permissions.default.stylesheet", 
loadStylesheet);
+               profile.setPreference("permissions.default.image", loadImage);
+                   
binary.setTimeout(TimeUnit.SECONDS.toMillis(firefoxBinaryTimeout));
+            driver = new FirefoxDriver(binary, profile);
             break;
           case "chrome":
             driver = new ChromeDriver();
@@ -112,11 +126,16 @@ public class HttpWebClient {
         }
         LOG.debug("Selenium {} WebDriver selected.", driverType);
   
+        driver.manage().timeouts().pageLoadTimeout(pageLoadWait, 
TimeUnit.SECONDS);
         driver.get(url);
-        new WebDriverWait(driver, pageLoadWait);
       } catch (Exception e) {
-        throw new RuntimeException(e);
-      }
+                       if(e instanceof TimeoutException) {
+                               LOG.debug("Selenium WebDriver: Timeout 
Exception: Capturing whatever loaded so far...");
+                               return driver;
+                       }
+                       cleanUpDriver(driver);
+                   throw new RuntimeException(e);
+           } 
 
       return driver;
   }
@@ -132,6 +151,7 @@ public class HttpWebClient {
   public static void cleanUpDriver(WebDriver driver) {
       if (driver != null) {
           try {
+             driver.close();
               driver.quit();
               TemporaryFilesystem.getDefaultTmpFS().deleteTemporaryFiles();
           } catch (Exception e) {

Reply via email to