Author: mattmann
Date: Sun Aug  2 23:08:51 2015
New Revision: 1693837

URL: http://svn.apache.org/r1693837
Log:
Fix for NUTCH-2062: Add Plugin for interacting with Selenium WebDriver 
contributed by Michael Joyce <[email protected]> this closes #46

Added:
    nutch/trunk/src/plugin/protocol-interactiveselenium/
    nutch/trunk/src/plugin/protocol-interactiveselenium/README.md
    nutch/trunk/src/plugin/protocol-interactiveselenium/build-ivy.xml
    nutch/trunk/src/plugin/protocol-interactiveselenium/build.xml
    nutch/trunk/src/plugin/protocol-interactiveselenium/ivy.xml
    nutch/trunk/src/plugin/protocol-interactiveselenium/plugin.xml
    nutch/trunk/src/plugin/protocol-interactiveselenium/src/
    nutch/trunk/src/plugin/protocol-interactiveselenium/src/java/
    nutch/trunk/src/plugin/protocol-interactiveselenium/src/java/org/
    nutch/trunk/src/plugin/protocol-interactiveselenium/src/java/org/apache/
    
nutch/trunk/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/
    
nutch/trunk/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/
    
nutch/trunk/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/
    
nutch/trunk/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/Http.java
    
nutch/trunk/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/HttpResponse.java
    
nutch/trunk/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/
    
nutch/trunk/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/DefaultHandler.java
    
nutch/trunk/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/InteractiveSeleniumHandler.java
    
nutch/trunk/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/package.html
Modified:
    nutch/trunk/build.xml
    nutch/trunk/conf/nutch-default.xml
    nutch/trunk/src/plugin/build.xml
    
nutch/trunk/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java

Modified: nutch/trunk/build.xml
URL: 
http://svn.apache.org/viewvc/nutch/trunk/build.xml?rev=1693837&r1=1693836&r2=1693837&view=diff
==============================================================================
--- nutch/trunk/build.xml (original)
+++ nutch/trunk/build.xml Sun Aug  2 23:08:51 2015
@@ -198,6 +198,7 @@
       <packageset dir="${plugins.dir}/protocol-ftp/src/java"/>
       <packageset dir="${plugins.dir}/protocol-http/src/java"/>
       <packageset dir="${plugins.dir}/protocol-httpclient/src/java"/>
+      <packageset dir="${plugins.dir}/protocol-interactiveselenium/src/java"/>
       <packageset dir="${plugins.dir}/protocol-selenium/src/java"/>
       <packageset dir="${plugins.dir}/scoring-depth/src/java"/>
       <packageset dir="${plugins.dir}/scoring-similarity/src/java"/>

Modified: nutch/trunk/conf/nutch-default.xml
URL: 
http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1693837&r1=1693836&r2=1693837&view=diff
==============================================================================
--- nutch/trunk/conf/nutch-default.xml (original)
+++ nutch/trunk/conf/nutch-default.xml Sun Aug  2 23:08:51 2015
@@ -1772,4 +1772,26 @@ CAUTION: Set the parser.timeout to -1 or
   </description>
 </property>
 
+<!-- lib-selenium configuration -->
+<property>
+  <name>libselenium.page.load.delay</name>
+  <value>3</value>
+  <description>
+    The delay in seconds to use when loading a page with lib-selenium. This
+    setting is used by protocol-selenium and protocol-interactiveselenium
+    since they depend on lib-selenium for fetching.
+  </description>
+</property>
+
+<!-- protocol-interactiveselenium configuration -->
+<property>
+  <name>interactiveselenium.handlers</name>
+  <value>DefaultHandler</value>
+  <description>
+    A comma separated list of Selenium handlers that should be run for a given
+    URL. The DefaultHandler provides the same functionality as protocol-selenium.
+    Custom handlers can be implemented in the plugin package and included here.
+  </description>
+</property>
+
 </configuration>

Modified: nutch/trunk/src/plugin/build.xml
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/build.xml?rev=1693837&r1=1693836&r2=1693837&view=diff
==============================================================================
--- nutch/trunk/src/plugin/build.xml (original)
+++ nutch/trunk/src/plugin/build.xml Sun Aug  2 23:08:51 2015
@@ -53,6 +53,7 @@
      <ant dir="protocol-httpclient" target="deploy"/>
      <ant dir="lib-selenium" target="deploy"/>
      <ant dir="protocol-selenium" target="deploy" />
+     <ant dir="protocol-interactiveselenium" target="deploy" />
      <ant dir="parse-ext" target="deploy"/>
      <ant dir="parse-js" target="deploy"/>
      <ant dir="parse-html" target="deploy"/>
@@ -160,6 +161,7 @@
     <ant dir="protocol-httpclient" target="clean"/>
     <ant dir="lib-selenium" target="clean"/>
     <ant dir="protocol-selenium" target="clean" />
+    <ant dir="protocol-interactiveselenium" target="clean" />
     <ant dir="parse-ext" target="clean"/>
     <ant dir="parse-js" target="clean"/>
     <ant dir="parse-html" target="clean"/>

Modified: 
nutch/trunk/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java?rev=1693837&r1=1693836&r2=1693837&view=diff
==============================================================================
--- 
nutch/trunk/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java
 (original)
+++ 
nutch/trunk/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java
 Sun Aug  2 23:08:51 2015
@@ -58,6 +58,59 @@ public class HttpWebClient {
     };
   };
 
+  public static WebDriver getDriverForPage(String url, Configuration conf) {
+      WebDriver driver = null;
+      long pageLoadWait = conf.getLong("libselenium.page.load.delay", 3);
+
+      try {
+        String driverType  = conf.get("selenium.driver", "firefox");
+        switch (driverType) {
+          case "firefox":
+            driver = new FirefoxDriver();
+            break;
+          case "chrome":
+            driver = new ChromeDriver();
+            break;
+          case "safari":
+            driver = new SafariDriver();
+            break;
+          case "opera":
+            driver = new OperaDriver();
+            break;
+          default:
+            LOG.error("The Selenium WebDriver choice {} is not available... 
defaulting to FirefoxDriver().", driverType);
+            driver = new FirefoxDriver();
+            break;
+        }
+        LOG.debug("Selenium {} WebDriver selected.", driverType);
+  
+        driver.get(url);
+        new WebDriverWait(driver, pageLoadWait);
+      } catch (Exception e) {
+        throw new RuntimeException(e);
+      }
+
+      return driver;
+  }
+
+  public static String getHTMLContent(WebDriver driver, Configuration conf) {
+      if (conf.getBoolean("selenium.take.screenshot", false)) {
+        takeScreenshot(driver, conf);
+      }
+
+      return driver.findElement(By.tagName("body")).getAttribute("innerHTML");
+  }
+
+  public static void cleanUpDriver(WebDriver driver) {
+      if (driver != null) {
+          try {
+              driver.quit();
+          } catch (Exception e) {
+              throw new RuntimeException(e);
+          }
+      }
+  }
+
   /**
    * Function for obtaining the HTML BODY using the selected
    * {@link org.openqa.selenium.WebDriver}.
@@ -70,52 +123,11 @@ public class HttpWebClient {
    * @return the rendered inner HTML page
    */
   public static String getHtmlPage(String url, Configuration conf) {
+    WebDriver driver = getDriverForPage(url, conf);
 
-    WebDriver driver = null;
     try {
-      String driverType  = conf.get("selenium.driver", "firefox");
-      switch (driverType) {
-      case "firefox":
-        driver = new FirefoxDriver();
-        break;
-      case "chrome":
-        driver = new ChromeDriver();
-        break;
-      case "safari":
-        driver = new SafariDriver();
-        break;
-      case "opera":
-        driver = new OperaDriver();
-        break;
-      default:
-        LOG.error("The Selenium WebDriver choice {} is not available... 
defaulting to FirefoxDriver().", driverType);
-        driver = new FirefoxDriver();
-        break;
-      }
-      LOG.debug("Selenium {} WebDriver selected.", driverType);
-      driver.get(url);
-
-      // Wait for the page to load, timeout after 3 seconds
-      new WebDriverWait(driver, 3);
-
       if (conf.getBoolean("selenium.take.screenshot", false)) {
-        File srcFile = 
((TakesScreenshot)driver).getScreenshotAs(OutputType.FILE);
-        LOG.debug("In-memory screenshot taken of: {}", url);
-        FileSystem fs = FileSystem.get(conf);
-        Path screenshotPath = new 
Path(conf.get("selenium.screenshot.location") + "/" + srcFile.getName());
-        if (screenshotPath != null) {
-          OutputStream os = null;
-          if (!fs.exists(screenshotPath)) {
-            LOG.debug("No existing screenshot already exists... creating new 
file at {} {}.", screenshotPath, srcFile.getName());
-            os = fs.create(screenshotPath);
-          }
-          InputStream is = new BufferedInputStream(new 
FileInputStream(srcFile));
-          IOUtils.copyBytes(is, os, conf);
-          LOG.debug("Screenshot for {} successfully saved to: {} {}", url, 
screenshotPath, srcFile.getName()); 
-        } else {
-          LOG.warn("Screenshot for {} not saved to HDFS (subsequently 
disgarded) as value for "
-              + "'selenium.screenshot.location' is absent from 
nutch-site.xml.", url);
-        }
+        takeScreenshot(driver, conf);
       }
 
       String innerHtml = 
driver.findElement(By.tagName("body")).getAttribute("innerHTML");
@@ -125,11 +137,36 @@ public class HttpWebClient {
     } catch (Exception e) {
       throw new RuntimeException(e);
     } finally {
-      if (driver != null) try { driver.quit(); } catch (Exception e) { throw 
new RuntimeException(e); }
+      cleanUpDriver(driver);
     }
-  };
+  }
 
   public static String getHtmlPage(String url) {
     return getHtmlPage(url, null);
   }
+
+  private static void takeScreenshot(WebDriver driver, Configuration conf) {
+    try {
+      String url = driver.getCurrentUrl();
+      File srcFile = 
((TakesScreenshot)driver).getScreenshotAs(OutputType.FILE);
+      LOG.debug("In-memory screenshot taken of: {}", url);
+      FileSystem fs = FileSystem.get(conf);
+      Path screenshotPath = new Path(conf.get("selenium.screenshot.location") 
+ "/" + srcFile.getName());
+      if (screenshotPath != null) {
+        OutputStream os = null;
+        if (!fs.exists(screenshotPath)) {
+          LOG.debug("No existing screenshot already exists... creating new 
file at {} {}.", screenshotPath, srcFile.getName());
+          os = fs.create(screenshotPath);
+        }
+        InputStream is = new BufferedInputStream(new FileInputStream(srcFile));
+        IOUtils.copyBytes(is, os, conf);
+        LOG.debug("Screenshot for {} successfully saved to: {} {}", url, 
screenshotPath, srcFile.getName()); 
+      } else {
+        LOG.warn("Screenshot for {} not saved to HDFS (subsequently disgarded) 
as value for "
+            + "'selenium.screenshot.location' is absent from nutch-site.xml.", 
url);
+      }
+    } catch (Exception e) {
+      throw new RuntimeException(e);
+    }
+  }
 }

Added: nutch/trunk/src/plugin/protocol-interactiveselenium/README.md
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-interactiveselenium/README.md?rev=1693837&view=auto
==============================================================================
--- nutch/trunk/src/plugin/protocol-interactiveselenium/README.md (added)
+++ nutch/trunk/src/plugin/protocol-interactiveselenium/README.md Sun Aug  2 
23:08:51 2015
@@ -0,0 +1,38 @@
+Nutch Interactive Selenium
+==========================
+
+This protocol plugin allows you to fetch and interact with pages using 
[Selenium](http://www.seleniumhq.org/).
+
+# Dependencies and Configuration
+
+You will need to have [Selenium](http://www.seleniumhq.org/) and a compatible 
version of Firefox installed to use this plugin.
+
+Set the protocol to be used in your Nutch configuration files.
+```
+<!-- NUTCH_HOME/conf/nutch-site.xml -->
+
+<configuration>
+  ...
+  <property>
+    <name>plugin.includes</name>
+    <value>protocol-interactiveselenium|urlfilter-regex| ... </value>
+    <description></description>
+  </property>
+```
+
+# Custom Handlers
+
+Only basic functionality is included in the DefaultHandler that comes with the 
plugin. If you want additional functionality you can implement custom handlers 
by implementing the InteractiveSeleniumHandler interface in the plugin package. 
Be sure to also update the plugin config to include your new handler.
+
+```
+<!-- NUTCH_HOME/conf/nutch-site.xml -->
+<property>
+  <name>interactiveselenium.handlers</name>
+  <value>NewCustomHandler,DefaultHandler</value>
+  <description></description>
+</property>
+```
+
+# Handler Info
+
+Handlers are called in the order that they're specified in the configuration. 
A "clean" driver is used for each handler so multiple handlers won't interfere 
with each other. Page content is appended together from each handler and 
returned for the request.

Added: nutch/trunk/src/plugin/protocol-interactiveselenium/build-ivy.xml
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-interactiveselenium/build-ivy.xml?rev=1693837&view=auto
==============================================================================
--- nutch/trunk/src/plugin/protocol-interactiveselenium/build-ivy.xml (added)
+++ nutch/trunk/src/plugin/protocol-interactiveselenium/build-ivy.xml Sun Aug  
2 23:08:51 2015
@@ -0,0 +1,54 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="protocol-interactiveselenium" default="deps-jar" 
xmlns:ivy="antlib:org.apache.ivy.ant">
+
+    <property name="ivy.install.version" value="2.1.0" />
+    <condition property="ivy.home" value="${env.IVY_HOME}">
+      <isset property="env.IVY_HOME" />
+    </condition>
+    <property name="ivy.home" value="${user.home}/.ant" />
+    <property name="ivy.checksums" value="" />
+    <property name="ivy.jar.dir" value="${ivy.home}/lib" />
+    <property name="ivy.jar.file" value="${ivy.jar.dir}/ivy.jar" />
+
+    <target name="download-ivy" unless="offline">
+
+        <mkdir dir="${ivy.jar.dir}"/>
+        <!-- download Ivy from web site so that it can be used even without 
any special installation -->
+        <get 
src="http://repo2.maven.org/maven2/org/apache/ivy/ivy/${ivy.install.version}/ivy-${ivy.install.version}.jar"
 
+             dest="${ivy.jar.file}" usetimestamp="true"/>
+    </target>
+
+    <target name="init-ivy" depends="download-ivy">
+      <!-- try to load ivy here from ivy home, in case the user has not 
already dropped
+              it into ant's lib dir (note that the latter copy will always 
take precedence).
+              We will not fail as long as local lib dir exists (it may be 
empty) and
+              ivy is in at least one of ant's lib dir or the local lib dir. -->
+        <path id="ivy.lib.path">
+            <fileset dir="${ivy.jar.dir}" includes="*.jar"/>
+
+        </path>
+        <taskdef resource="org/apache/ivy/ant/antlib.xml"
+                 uri="antlib:org.apache.ivy.ant" classpathref="ivy.lib.path"/>
+    </target>
+
+  <target name="deps-jar" depends="init-ivy">
+    <ivy:retrieve pattern="lib/[artifact]-[revision].[ext]"/>
+  </target>
+
+</project>

Added: nutch/trunk/src/plugin/protocol-interactiveselenium/build.xml
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-interactiveselenium/build.xml?rev=1693837&view=auto
==============================================================================
--- nutch/trunk/src/plugin/protocol-interactiveselenium/build.xml (added)
+++ nutch/trunk/src/plugin/protocol-interactiveselenium/build.xml Sun Aug  2 
23:08:51 2015
@@ -0,0 +1,37 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="protocol-interactiveselenium" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+
+  <!-- Build compilation dependencies -->
+  <target name="deps-jar">
+    <ant target="jar" inheritall="false" dir="../lib-http"/>
+    <ant target="jar" inheritall="false" dir="../lib-selenium"/>
+  </target>
+
+  <!-- Add compilation dependencies to classpath -->
+  <path id="plugin.deps">
+    <fileset dir="${nutch.root}/build">
+      <include name="**/lib-http/*.jar" />
+      <include name="**/lib-selenium/*.jar" />
+      <include name="**/protocol-selenium/*.jar" />
+    </fileset>
+  </path>
+
+</project>

Added: nutch/trunk/src/plugin/protocol-interactiveselenium/ivy.xml
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-interactiveselenium/ivy.xml?rev=1693837&view=auto
==============================================================================
--- nutch/trunk/src/plugin/protocol-interactiveselenium/ivy.xml (added)
+++ nutch/trunk/src/plugin/protocol-interactiveselenium/ivy.xml Sun Aug  2 
23:08:51 2015
@@ -0,0 +1,48 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../..//ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="default"/>
+  </publications>
+
+  <dependencies>
+    <!-- begin selenium dependencies -->
+    <dependency org="org.seleniumhq.selenium" name="selenium-java" 
rev="2.44.0" />
+    
+    <dependency org="com.opera" name="operadriver" rev="1.5">
+      <exclude org="org.seleniumhq.selenium" name="selenium-remote-driver" />
+    </dependency>
+    <!-- end selenium dependencies -->
+  </dependencies>
+  
+</ivy-module>

Added: nutch/trunk/src/plugin/protocol-interactiveselenium/plugin.xml
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-interactiveselenium/plugin.xml?rev=1693837&view=auto
==============================================================================
--- nutch/trunk/src/plugin/protocol-interactiveselenium/plugin.xml (added)
+++ nutch/trunk/src/plugin/protocol-interactiveselenium/plugin.xml Sun Aug  2 
23:08:51 2015
@@ -0,0 +1,90 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+   id="protocol-interactiveselenium"
+   name="Http Protocol Plug-in"
+   version="1.0.0"
+   provider-name="nutch.org">
+
+   <runtime>
+      <library name="protocol-interactiveselenium.jar">
+         <export name="*"/>
+      </library>
+      <library name="cglib-nodep-2.1_3.jar"/>
+      <library name="commons-codec-1.9.jar"/>
+      <library name="commons-collections-3.2.1.jar"/>
+      <library name="commons-exec-1.1.jar"/>
+      <library name="commons-io-2.4.jar"/>
+      <library name="commons-jxpath-1.3.jar"/>
+      <library name="commons-lang3-3.3.2.jar"/>
+      <library name="commons-logging-1.1.3.jar"/>
+      <library name="cssparser-0.9.14.jar"/>
+      <library name="gson-2.3.jar"/>
+      <library name="guava-18.0.jar"/>
+      <library name="htmlunit-2.15.jar"/>
+      <library name="htmlunit-core-js-2.15.jar"/>
+      <library name="httpclient-4.3.4.jar"/>
+      <library name="httpcore-4.3.2.jar"/>
+      <library name="httpmime-4.3.3.jar"/>
+      <library name="ini4j-0.5.2.jar"/>
+      <library name="jetty-http-8.1.15.v20140411.jar"/>
+      <library name="jetty-io-8.1.15.v20140411.jar"/>
+      <library name="jetty-util-8.1.15.v20140411.jar"/>
+      <library name="jetty-websocket-8.1.15.v20140411.jar"/>
+      <library name="jna-3.4.0.jar"/>
+      <library name="nekohtml-1.9.21.jar"/>
+      <library name="netty-3.5.2.Final.jar"/>
+      <library name="operadriver-1.5.jar"/>
+      <library name="operalaunchers-1.1.jar"/>
+      <library name="platform-3.4.0.jar"/>
+      <library name="protobuf-java-2.4.1.jar"/>
+      <library name="sac-1.3.jar"/>
+      <library name="selenium-api-2.44.0.jar"/>
+      <library name="selenium-chrome-driver-2.44.0.jar"/>
+      <library name="selenium-firefox-driver-2.44.0.jar"/>
+      <library name="selenium-htmlunit-driver-2.44.0.jar"/>
+      <library name="selenium-ie-driver-2.44.0.jar"/>
+      <library name="selenium-java-2.44.0.jar"/>
+      <library name="selenium-remote-driver-2.44.0.jar"/>
+      <library name="selenium-safari-driver-2.44.0.jar"/>
+      <library name="selenium-support-2.44.0.jar"/>
+      <library name="serializer-2.7.1.jar"/>
+      <library name="webbit-0.4.14.jar"/>
+      <library name="xalan-2.7.1.jar"/>
+      <library name="xercesImpl-2.11.0.jar"/>
+      <library name="xml-apis-1.4.01.jar"/>
+   </runtime>
+
+   <requires>
+      <import plugin="nutch-extensionpoints"/>
+      <import plugin="lib-http"/>
+      <import plugin="lib-selenium"/>
+   </requires>
+
+   <extension id="org.apache.nutch.protocol.interactiveselenium"
+              name="HttpProtocol"
+              point="org.apache.nutch.protocol.Protocol">
+
+      <implementation id="org.apache.nutch.protocol.interactiveselenium.Http"
+                      
class="org.apache.nutch.protocol.interactiveselenium.Http">
+        <parameter name="protocolName" value="http"/>
+      </implementation>
+
+   </extension>
+
+</plugin>

Added: 
nutch/trunk/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/Http.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/Http.java?rev=1693837&view=auto
==============================================================================
--- 
nutch/trunk/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/Http.java
 (added)
+++ 
nutch/trunk/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/Http.java
 Sun Aug  2 23:08:51 2015
@@ -0,0 +1,59 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.protocol.interactiveselenium;
+
+// JDK imports
+import java.io.IOException;
+import java.net.URL;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.protocol.http.api.HttpBase;
+import org.apache.nutch.protocol.ProtocolException;
+import org.apache.nutch.util.NutchConfiguration;
+
+import org.apache.nutch.protocol.interactiveselenium.HttpResponse;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public class Http extends HttpBase {
+
+  public static final Logger LOG = LoggerFactory.getLogger(Http.class);
+
+  public Http() {
+    super(LOG);
+  }
+
+  @Override
+  public void setConf(Configuration conf) {
+    super.setConf(conf);
+  }
+
+  public static void main(String[] args) throws Exception {
+    Http http = new Http();
+    http.setConf(NutchConfiguration.create());
+    main(http, args);
+  }
+
+  @Override
+  protected Response getResponse(URL url, CrawlDatum datum, boolean redirect)
+      throws ProtocolException, IOException {
+    return new HttpResponse(this, url, datum);
+  }
+
+}

Added: 
nutch/trunk/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/HttpResponse.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/HttpResponse.java?rev=1693837&view=auto
==============================================================================
--- 
nutch/trunk/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/HttpResponse.java
 (added)
+++ 
nutch/trunk/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/HttpResponse.java
 Sun Aug  2 23:08:51 2015
@@ -0,0 +1,396 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.protocol.interactiveselenium;
+
+// JDK imports
+import java.io.BufferedInputStream;
+import java.io.EOFException;
+import java.io.IOException;
+import java.io.OutputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.PushbackInputStream;
+import java.net.InetSocketAddress;
+import java.net.Socket;
+import java.net.URL;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.metadata.SpellCheckedMetadata;
+import org.apache.nutch.net.protocols.HttpDateFormat;
+import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.protocol.ProtocolException;
+import org.apache.nutch.protocol.http.api.HttpException;
+import org.apache.nutch.protocol.http.api.HttpBase;
+import org.openqa.selenium.WebDriver;
+
+import org.apache.nutch.protocol.selenium.HttpWebClient;
+
+/* Most of this code was borrowed from protocol-htmlunit, which in turn
+ * borrowed it from protocol-httpclient. */
+
+public class HttpResponse implements Response {
+
+  // Protocol instance passed to the constructor; supplies timeouts, proxy
+  // settings, request header values and the content size limit.
+  private Http http;
+  // URL this response was fetched from; returned by getUrl().
+  private URL url;
+  // String form of the requested URL, assigned in the constructor.
+  // NOTE(review): orig and base are written but never read in this class.
+  private String orig;
+  private String base;
+  // Response body; stays null if no body was read.
+  private byte[] content;
+  // HTTP status code parsed from the status line.
+  private int code;
+  // Response headers, filled line by line in processHeaderLine().
+  private Metadata headers = new SpellCheckedMetadata();
+  // Handler instances, static and therefore shared by every HttpResponse;
+  // created on first use by loadSeleniumHandlers().
+  private static InteractiveSeleniumHandler[] handlers;
+
+  /** The nutch configuration */
+  private Configuration conf = null;
+
+  /**
+   * Fetches {@code url} over a plain socket with an HTTP/1.0 GET request and
+   * fills in the status code, headers and content of this response.
+   * <p>
+   * If the Content-Type header contains {@code text/html} or
+   * {@code application/xhtml}, the content is produced by
+   * {@link #readPlainContent(URL)} (Selenium). Every other response,
+   * including one without a Content-Type header, has its body read directly
+   * from the socket.
+   *
+   * @param http  protocol instance supplying configuration, timeouts, proxy
+   *              settings and request header values
+   * @param url   the URL to fetch; its protocol must be {@code http}
+   * @param datum crawl datum; a positive modified time adds an
+   *              If-Modified-Since request header
+   * @throws ProtocolException (as {@link HttpException}) if the URL is not an
+   *           HTTP URL or the status line / a header line is malformed
+   * @throws IOException on connection or read failure, or when reading the
+   *           body of a 200 response fails
+   */
+  public HttpResponse(Http http, URL url, CrawlDatum datum)
+      throws ProtocolException, IOException {
+
+    this.conf = http.getConf();
+    this.http = http;
+    this.url = url;
+    this.orig = url.toString();
+    this.base = url.toString();
+
+    if (!"http".equals(url.getProtocol())) {
+      throw new HttpException("Not an HTTP url:" + url);
+    }
+
+    if (Http.LOG.isTraceEnabled()) {
+      Http.LOG.trace("fetching " + url);
+    }
+
+    String path = "".equals(url.getFile()) ? "/" : url.getFile();
+
+    // some servers will redirect a request with a host line like
+    // "Host: <hostname>:80" to "http://<hostname>/<orig_path>" - they
+    // don't want the :80, so the port is only sent when the URL names one.
+    String host = url.getHost();
+    int port;
+    String portString;
+    if (url.getPort() == -1) {
+      port = 80;
+      portString = "";
+    } else {
+      port = url.getPort();
+      portString = ":" + port;
+    }
+
+    Socket socket = null;
+    try {
+      socket = new Socket(); // create the socket
+      socket.setSoTimeout(http.getTimeout());
+
+      // connect, either directly or to the configured proxy
+      boolean viaProxy = http.useProxy(url);
+      String sockHost = viaProxy ? http.getProxyHost() : host;
+      int sockPort = viaProxy ? http.getProxyPort() : port;
+      socket.connect(new InetSocketAddress(sockHost, sockPort), http.getTimeout());
+
+      // make request
+      OutputStream req = socket.getOutputStream();
+
+      StringBuilder reqStr = new StringBuilder("GET ");
+      if (viaProxy) {
+        // a proxy needs the absolute URL on the request line
+        reqStr.append(url.getProtocol() + "://" + host + portString + path);
+      } else {
+        reqStr.append(path);
+      }
+      reqStr.append(" HTTP/1.0\r\n");
+
+      reqStr.append("Host: ").append(host).append(portString).append("\r\n");
+
+      // NOTE(review): compressed encodings are advertised here, but nothing in
+      // this class decodes a compressed body - confirm how callers handle it.
+      reqStr.append("Accept-Encoding: x-gzip, gzip, deflate\r\n");
+
+      String userAgent = http.getUserAgent();
+      if ((userAgent == null) || (userAgent.length() == 0)) {
+        if (Http.LOG.isErrorEnabled()) {
+          Http.LOG.error("User-agent is not set!");
+        }
+      } else {
+        reqStr.append("User-Agent: ").append(userAgent).append("\r\n");
+      }
+
+      reqStr.append("Accept-Language: ");
+      reqStr.append(this.http.getAcceptLanguage());
+      reqStr.append("\r\n");
+
+      reqStr.append("Accept: ");
+      reqStr.append(this.http.getAccept());
+      reqStr.append("\r\n");
+
+      if (datum.getModifiedTime() > 0) {
+        reqStr.append("If-Modified-Since: " + HttpDateFormat.toString(datum.getModifiedTime()));
+        reqStr.append("\r\n");
+      }
+      reqStr.append("\r\n");
+
+      // explicit charset so the bytes on the wire do not depend on the
+      // platform default encoding
+      req.write(reqStr.toString().getBytes("UTF-8"));
+      req.flush();
+
+      // process response
+      PushbackInputStream in = new PushbackInputStream(
+          new BufferedInputStream(socket.getInputStream(), Http.BUFFER_SIZE),
+          Http.BUFFER_SIZE);
+
+      StringBuffer line = new StringBuffer();
+
+      // skip over any "100 Continue" interim responses
+      do {
+        // parse status code line
+        this.code = parseStatusLine(in, line);
+        // parse headers
+        parseHeaders(in, line);
+      } while (code == 100);
+
+      // handle with Selenium only if content type is HTML or XHTML
+      String contentType = getHeader(Response.CONTENT_TYPE);
+      boolean isHtml = contentType != null
+          && (contentType.contains("text/html") || contentType.contains("application/xhtml"));
+      if (isHtml) {
+        readPlainContent(url);
+      } else {
+        // also covers responses that carry no Content-Type header at all
+        readRawContent(in);
+      }
+
+    } finally {
+      if (socket != null) {
+        socket.close();
+      }
+    }
+  }
+
+  /**
+   * Reads the response body from {@code in} into {@code content}, stopping at
+   * end of stream or at the smaller of the Content-Length header and the
+   * protocol's maximum content size. The stream is closed afterwards.
+   *
+   * @param in stream positioned at the first byte of the body
+   * @throws IOException if reading fails and the status code is 200; for any
+   *           other status code a failure is ignored and the content is left
+   *           as it was
+   */
+  private void readRawContent(PushbackInputStream in) throws IOException {
+    try {
+      int contentLength = Integer.MAX_VALUE;
+      String contentLengthString = headers.get(Response.CONTENT_LENGTH);
+      if (contentLengthString != null) {
+        try {
+          contentLength = Integer.parseInt(contentLengthString.trim());
+        } catch (NumberFormatException ex) {
+          throw new HttpException("bad content length: " + contentLengthString);
+        }
+      }
+
+      if (http.getMaxContent() >= 0 && contentLength > http.getMaxContent()) {
+        contentLength = http.getMaxContent();
+      }
+
+      byte[] buffer = new byte[HttpBase.BUFFER_SIZE];
+      ByteArrayOutputStream out = new ByteArrayOutputStream();
+      int totalRead = 0;
+      while (totalRead < contentLength) {
+        // never ask for more than is still allowed, so the last chunk is
+        // kept (up to the limit) instead of being thrown away whole
+        int wanted = Math.min(buffer.length, contentLength - totalRead);
+        int read = in.read(buffer, 0, wanted);
+        if (read == -1) {
+          break;
+        }
+        out.write(buffer, 0, read);
+        totalRead += read;
+      }
+
+      content = out.toByteArray();
+
+    } catch (Exception e) {
+      if (code == 200) {
+        // keep the original exception as the cause
+        throw new IOException(e.toString(), e);
+      }
+      // for codes other than 200 OK, we are fine with empty content
+    } finally {
+      in.close();
+    }
+  }
+
+  /* ------------------------- *
+   * <implementation:Response> *
+   * ------------------------- */
+
+  /** @return the URL this response was fetched from */
+  public URL getUrl() {
+    return this.url;
+  }
+
+  /** @return the HTTP status code parsed from the status line */
+  public int getCode() {
+    return this.code;
+  }
+
+  /**
+   * @param name header name to look up
+   * @return the value stored for {@code name} in the response headers
+   */
+  public String getHeader(String name) {
+    return this.headers.get(name);
+  }
+
+  /** @return the metadata object holding all response headers */
+  public Metadata getHeaders() {
+    return this.headers;
+  }
+
+  /** @return the response body; {@code null} if no body was read */
+  public byte[] getContent() {
+    return this.content;
+  }
+
+  /* -------------------------- *
+   * </implementation:Response> *
+   * -------------------------- */
+  /**
+   * Instantiates the configured Selenium handlers and caches them in the
+   * static {@code handlers} array; does nothing if the array already exists.
+   * <p>
+   * Handler names come from the comma-separated property
+   * {@code interactiveselenium.handlers} (default {@code DefaultHandler}) and
+   * are resolved as classes in this class's package. A handler that cannot be
+   * loaded, instantiated or cast is logged and skipped, so the resulting
+   * array never contains {@code null} entries.
+   */
+  private void loadSeleniumHandlers() {
+    if (handlers != null) {
+      return;
+    }
+
+    String handlerConfig = this.conf.get("interactiveselenium.handlers", "DefaultHandler");
+    String packageName = this.getClass().getPackage().getName();
+
+    // collect into a local list and publish the array only once it is
+    // complete, so no caller ever sees a half-filled array
+    java.util.List<InteractiveSeleniumHandler> loaded =
+        new java.util.ArrayList<InteractiveSeleniumHandler>();
+
+    for (String rawName : handlerConfig.split(",")) {
+      // tolerate "A, B" style lists and stray commas
+      String handlerName = rawName.trim();
+      if (handlerName.isEmpty()) {
+        continue;
+      }
+      String classToLoad = packageName + "." + handlerName;
+      try {
+        Object instance = Class.forName(classToLoad).newInstance();
+        loaded.add(InteractiveSeleniumHandler.class.cast(instance));
+        Http.LOG.info("Successfully loaded " + classToLoad);
+      } catch (ClassNotFoundException e) {
+        Http.LOG.warn("Unable to load Handler class for: " + handlerName, e);
+      } catch (InstantiationException e) {
+        Http.LOG.warn("Unable to instantiate Handler: " + handlerName, e);
+      } catch (IllegalAccessException e) {
+        Http.LOG.warn("Illegal access with Handler: " + handlerName, e);
+      } catch (ClassCastException e) {
+        Http.LOG.warn("Handler is not an InteractiveSeleniumHandler: " + handlerName, e);
+      }
+    }
+
+    if (loaded.isEmpty()) {
+      Http.LOG.warn("No Selenium handlers could be loaded from: " + handlerConfig);
+    }
+    handlers = loaded.toArray(new InteractiveSeleniumHandler[loaded.size()]);
+  }
+
+  /**
+   * Produces the content of this response with Selenium: for each loaded
+   * handler a WebDriver is obtained for {@code url}, the handler is run
+   * against it, and the resulting HTML is appended. The concatenation is
+   * stored in {@code content} as UTF-8 bytes.
+   *
+   * @param url the page to load in the WebDriver
+   * @throws IOException declared for the UTF-8 conversion of the result
+   */
+  private void readPlainContent(URL url) throws IOException {
+    if (handlers == null) {
+      loadSeleniumHandlers();
+    }
+
+    StringBuilder processedPage = new StringBuilder();
+
+    for (InteractiveSeleniumHandler handler : handlers) {
+      if (handler == null) {
+        // a slot can be empty when its handler failed to load
+        continue;
+      }
+
+      WebDriver driver = HttpWebClient.getDriverForPage(url.toString(), conf);
+      try {
+        handler.processDriver(driver);
+        processedPage.append(HttpWebClient.getHTMLContent(driver, conf));
+      } finally {
+        // release the driver even if the handler or extraction throws
+        HttpWebClient.cleanUpDriver(driver);
+      }
+    }
+
+    content = processedPage.toString().getBytes("UTF-8");
+  }
+
+  /**
+   * Reads one line from {@code in} and extracts the numeric status code that
+   * follows the first space, e.g. {@code 200} from {@code "HTTP/1.1 200 OK"}.
+   *
+   * @param in   stream positioned at the start of the status line
+   * @param line scratch buffer that receives the status line
+   * @return the parsed status code
+   * @throws IOException if reading the line fails
+   * @throws HttpException if the code portion is not a valid integer
+   */
+  private int parseStatusLine(PushbackInputStream in, StringBuffer line)
+      throws IOException, HttpException {
+    readLine(in, line, false);
+
+    int firstSpace = line.indexOf(" ");
+    int secondSpace = line.indexOf(" ", firstSpace + 1);
+
+    // handle lines with no plaintext result code, ie:
+    // "HTTP/1.1 200" vs "HTTP/1.1 200 OK"
+    int codeEnd = (secondSpace == -1) ? line.length() : secondSpace;
+    String codeText = line.substring(firstSpace + 1, codeEnd);
+
+    try {
+      return Integer.parseInt(codeText);
+    } catch (NumberFormatException e) {
+      throw new HttpException("bad status line '" + line + "': " + e.getMessage(), e);
+    }
+  }
+
+  /**
+   * Splits one header line at its first colon and stores the pair in
+   * {@code headers}. Spaces and tabs directly after the colon are dropped
+   * from the value. A line without a colon is ignored when it consists only
+   * of whitespace and rejected otherwise.
+   *
+   * @param line the header line to process
+   * @throws IOException declared but not thrown by the code in this method
+   * @throws HttpException if a non-blank line contains no colon
+   */
+  private void processHeaderLine(StringBuffer line) throws IOException, HttpException {
+
+    int colonIndex = line.indexOf(":"); // key is up to colon
+    if (colonIndex == -1) {
+      boolean onlyWhitespace = true;
+      for (int i = 0; onlyWhitespace && i < line.length(); i++) {
+        onlyWhitespace = Character.isWhitespace(line.charAt(i));
+      }
+      if (onlyWhitespace) {
+        return;
+      }
+      throw new HttpException("No colon in header:" + line);
+    }
+
+    // skip whitespace between the colon and the value
+    int valueStart = colonIndex + 1;
+    while (valueStart < line.length()
+        && (line.charAt(valueStart) == ' ' || line.charAt(valueStart) == '\t')) {
+      valueStart++;
+    }
+
+    String key = line.substring(0, colonIndex);
+    headers.set(key, line.substring(valueStart));
+  }
+
+  /**
+   * Reads header lines from {@code in} until an empty line and adds each one
+   * to the headers Metadata. If a line contains the start of an HTML document
+   * the markup is pushed back onto the stream, the text before it is
+   * processed as a final header, and parsing stops.
+   *
+   * @param in   stream positioned at the first header line
+   * @param line scratch buffer reused for every line read
+   * @throws IOException if reading fails
+   * @throws HttpException if a regular header line is malformed
+   */
+  private void parseHeaders(PushbackInputStream in, StringBuffer line)
+      throws IOException, HttpException {
+
+    while (readLine(in, line, true) != 0) {
+
+      // handle HTTP responses with missing blank line after headers
+      int markupStart = line.indexOf("<!DOCTYPE");
+      if (markupStart == -1) {
+        markupStart = line.indexOf("<HTML");
+      }
+      if (markupStart == -1) {
+        markupStart = line.indexOf("<html");
+      }
+
+      if (markupStart == -1) {
+        // ordinary header line
+        processHeaderLine(line);
+        continue;
+      }
+
+      in.unread(line.substring(markupStart).getBytes("UTF-8"));
+      line.setLength(markupStart);
+
+      try {
+        //TODO: (CM) We don't know the header names here
+        //since we're just handling them generically. It would
+        //be nice to provide some sort of mapping function here
+        //for the returned header names to the standard metadata
+        //names in the ParseData class
+        processHeaderLine(line);
+      } catch (Exception e) {
+        // fixme:
+        Http.LOG.warn("Error: ", e);
+      }
+      return;
+    }
+  }
+
+  /**
+   * Reads characters from {@code in} into {@code line} up to a line
+   * terminator ({@code \n}, {@code \r} or {@code \r\n}), which is consumed
+   * but not stored.
+   * <p>
+   * With {@code allowContinuedLine} set, a non-empty line whose terminator is
+   * followed by a space or tab is treated as continued: that one whitespace
+   * character is consumed and reading goes on into the same buffer.
+   *
+   * @param in                 stream to read from
+   * @param line               buffer that is cleared and then receives the line
+   * @param allowContinuedLine whether continuation lines are joined
+   * @return the length of the line read
+   * @throws EOFException if the stream ends before a line terminator is seen
+   * @throws IOException if reading fails
+   */
+  private static int readLine(PushbackInputStream in, StringBuffer line,
+      boolean allowContinuedLine) throws IOException {
+    line.setLength(0);
+
+    int ch;
+    while ((ch = in.read()) != -1) {
+      if (ch != '\r' && ch != '\n') {
+        line.append((char) ch);
+        continue;
+      }
+
+      // swallow the '\n' of a "\r\n" pair
+      if (ch == '\r' && peek(in) == '\n') {
+        in.read();
+      }
+
+      // at EOL -- check for continued line if the current
+      // (possibly continued) line wasn't blank
+      if (line.length() > 0 && allowContinuedLine) {
+        int next = peek(in);
+        if (next == ' ' || next == '\t') { // line is continued
+          in.read();
+          continue;
+        }
+      }
+
+      return line.length(); // else complete
+    }
+
+    throw new EOFException();
+  }
+
+  /**
+   * Returns the next byte of {@code in} without consuming it.
+   *
+   * @param in stream to look ahead in
+   * @return the next byte as a value from 0 to 255, or -1 at end of stream
+   * @throws IOException if reading or pushing back fails
+   */
+  private static int peek(PushbackInputStream in) throws IOException {
+    int value = in.read();
+    // unread(-1) would push back the byte 0xFF and make the stream look as
+    // if it had more data, so only push back a real byte
+    if (value != -1) {
+      in.unread(value);
+    }
+    return value;
+  }
+}

Added: 
nutch/trunk/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/DefaultHandler.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/DefaultHandler.java?rev=1693837&view=auto
==============================================================================
--- 
nutch/trunk/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/DefaultHandler.java
 (added)
+++ 
nutch/trunk/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/DefaultHandler.java
 Sun Aug  2 23:08:51 2015
@@ -0,0 +1,24 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.protocol.interactiveselenium;
+
+import org.openqa.selenium.WebDriver;
+
+public class DefaultHandler implements InteractiveSeleniumHandler {
+    public void processDriver(WebDriver driver) {}
+}

Added: 
nutch/trunk/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/InteractiveSeleniumHandler.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/InteractiveSeleniumHandler.java?rev=1693837&view=auto
==============================================================================
--- 
nutch/trunk/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/InteractiveSeleniumHandler.java
 (added)
+++ 
nutch/trunk/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/InteractiveSeleniumHandler.java
 Sun Aug  2 23:08:51 2015
@@ -0,0 +1,24 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.protocol.interactiveselenium;
+
+import org.openqa.selenium.WebDriver;
+
+/**
+ * Callback that is handed a {@link WebDriver} and may act on it.
+ */
+public interface InteractiveSeleniumHandler {
+
+    /**
+     * Performs this handler's work on {@code driver}.
+     *
+     * @param driver the WebDriver to operate on
+     */
+    void processDriver(WebDriver driver);
+}

Added: 
nutch/trunk/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/package.html
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/package.html?rev=1693837&view=auto
==============================================================================
--- 
nutch/trunk/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/package.html
 (added)
+++ 
nutch/trunk/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/package.html
 Sun Aug  2 23:08:51 2015
@@ -0,0 +1,5 @@
+<html>
+<body>
+<p>Protocol plugin which supports retrieving documents via Selenium.</p>
+</body>
+</html>


Reply via email to