Author: lewismc
Date: Wed Jul 22 04:08:20 2015
New Revision: 1692216
URL: http://svn.apache.org/r1692216
Log:
NUTCH-2021 Use protocol-selenium to Capture Screenshots of the Page as it is
Fetched
Added:
nutch/trunk/src/plugin/lib-selenium/build-ivy.xml
- copied, changed from r1687398,
nutch/trunk/src/plugin/parse-tika/build-ivy.xml
nutch/trunk/src/plugin/lib-selenium/howto_upgrade_selenium.txt
- copied, changed from r1687398,
nutch/trunk/src/plugin/parse-tika/howto_upgrade_tika.txt
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/conf/nutch-default.xml
nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java
nutch/trunk/src/plugin/lib-selenium/ivy.xml
nutch/trunk/src/plugin/lib-selenium/plugin.xml
nutch/trunk/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java
nutch/trunk/src/plugin/protocol-selenium/ivy.xml
Modified: nutch/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1692216&r1=1692215&r2=1692216&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Wed Jul 22 04:08:20 2015
@@ -2,6 +2,8 @@ Nutch Change Log
Nutch Current Development 1.11-SNAPSHOT
+* NUTCH-2021 Use protocol-selenium to Capture Screenshots of the Page as it is
Fetched (lewismc)
+
* NUTCH-2058 Indexer plugin that allows RegEx replacements on the
NutchDocument
field values (Peter Ciuffetti via mattmann)
Modified: nutch/trunk/conf/nutch-default.xml
URL:
http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1692216&r1=1692215&r2=1692216&view=diff
==============================================================================
--- nutch/trunk/conf/nutch-default.xml (original)
+++ nutch/trunk/conf/nutch-default.xml Wed Jul 22 04:08:20 2015
@@ -1736,4 +1736,40 @@ CAUTION: Set the parser.timeout to -1 or
</description>
</property>
+<!-- protocol-selenium plugin properties -->
+
+<property>
+ <name>selenium.driver</name>
+ <value>firefox</value>
+ <description>
+ A String value representing the flavour of Selenium
+ WebDriver() to use. Currently the following options
+ exist - firefox, chrome, safari and opera.
+ </description>
+</property>
+
+<property>
+ <name>selenium.take.screenshot</name>
+ <value>false</value>
+ <description>
+ Boolean property determining whether the protocol-selenium
+ WebDriver should capture a screenshot of the URL. If set to
+ true remember to define the 'selenium.screenshot.location'
+ property as this determines the location screenshots should be
+ persisted to on HDFS. If that property is not set, screenshots
+ are simply discarded.
+ </description>
+</property>
+
+<property>
+ <name>selenium.screenshot.location</name>
+ <value></value>
+ <description>
+ The location on disk where a URL screenshot should be saved
+ to if the 'selenium.take.screenshot' property is set to true.
+ By default this is empty; in this case screenshots held in memory
+ are simply discarded.
+ </description>
+</property>
+
</configuration>
Modified: nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java?rev=1692216&r1=1692215&r2=1692216&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java Wed Jul 22
04:08:20 2015
@@ -26,8 +26,6 @@ import java.io.ByteArrayInputStream;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;
-import java.security.MessageDigest;
-
import com.google.common.base.Strings;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
@@ -102,7 +100,7 @@ import org.slf4j.LoggerFactory;
* {"mimeType":"video/quicktime","count":"2"}
* {"mimeType":"image/gif","count":"63"}
* ]
- *
+ * }
* </pre>
* <p>
* In the case above, the tool would have been run with the <b>-mimeType
Copied: nutch/trunk/src/plugin/lib-selenium/build-ivy.xml (from r1687398,
nutch/trunk/src/plugin/parse-tika/build-ivy.xml)
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-selenium/build-ivy.xml?p2=nutch/trunk/src/plugin/lib-selenium/build-ivy.xml&p1=nutch/trunk/src/plugin/parse-tika/build-ivy.xml&r1=1687398&r2=1692216&rev=1692216&view=diff
==============================================================================
--- nutch/trunk/src/plugin/parse-tika/build-ivy.xml (original)
+++ nutch/trunk/src/plugin/lib-selenium/build-ivy.xml Wed Jul 22 04:08:20 2015
@@ -15,7 +15,7 @@
See the License for the specific language governing permissions and
limitations under the License.
-->
-<project name="parse-tika" default="deps-jar"
xmlns:ivy="antlib:org.apache.ivy.ant">
+<project name="lib-selenium" default="deps-jar"
xmlns:ivy="antlib:org.apache.ivy.ant">
<property name="ivy.install.version" value="2.1.0" />
<condition property="ivy.home" value="${env.IVY_HOME}">
Copied: nutch/trunk/src/plugin/lib-selenium/howto_upgrade_selenium.txt (from
r1687398, nutch/trunk/src/plugin/parse-tika/howto_upgrade_tika.txt)
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-selenium/howto_upgrade_selenium.txt?p2=nutch/trunk/src/plugin/lib-selenium/howto_upgrade_selenium.txt&p1=nutch/trunk/src/plugin/parse-tika/howto_upgrade_tika.txt&r1=1687398&r2=1692216&rev=1692216&view=diff
==============================================================================
--- nutch/trunk/src/plugin/parse-tika/howto_upgrade_tika.txt (original)
+++ nutch/trunk/src/plugin/lib-selenium/howto_upgrade_selenium.txt Wed Jul 22
04:08:20 2015
@@ -1,8 +1,6 @@
-1. Upgrade Tika depencency in trunk/ivy/ivy.xml
+1. Upgrade the various driver dependency versions in
src/plugin/lib-selenium/ivy.xml
-2. Upgrade Tika dependency in src/plugin/parse-tika/ivy.xml
-
-3. Upgrade Tika's own dependencies in src/plugin/parse-tika/plugin.xml
+2. Upgrade Selenium's own dependencies in src/plugin/lib-selenium/plugin.xml
To get the list of dependencies and their versions execute:
$ ant -f ./build-ivy.xml
$ ls lib | sed 's/^/ <library name="/g' | sed 's/$/"\/>/g'
Modified: nutch/trunk/src/plugin/lib-selenium/ivy.xml
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-selenium/ivy.xml?rev=1692216&r1=1692215&r2=1692216&view=diff
==============================================================================
--- nutch/trunk/src/plugin/lib-selenium/ivy.xml (original)
+++ nutch/trunk/src/plugin/lib-selenium/ivy.xml Wed Jul 22 04:08:20 2015
@@ -42,6 +42,10 @@
<dependency org="com.opera" name="operadriver" rev="1.5">
<exclude org="org.seleniumhq.selenium" name="selenium-remote-driver" />
</dependency>
+ <dependency org="com.codeborne" name="phantomjsdriver" rev="1.2.1" >
+ <exclude org="org.seleniumhq.selenium" name="selenium-remote-driver" />
+ <exclude org="org.seleniumhq.selenium" name="selenium-java" />
+ </dependency>
<!-- end selenium dependencies -->
</dependencies>
Modified: nutch/trunk/src/plugin/lib-selenium/plugin.xml
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-selenium/plugin.xml?rev=1692216&r1=1692215&r2=1692216&view=diff
==============================================================================
--- nutch/trunk/src/plugin/lib-selenium/plugin.xml (original)
+++ nutch/trunk/src/plugin/lib-selenium/plugin.xml Wed Jul 22 04:08:20 2015
@@ -27,11 +27,54 @@
<runtime>
<library name="lib-selenium.jar">
<export name="*"/>
- </library>
+ </library>
+ <library name="cglib-nodep-2.1_3.jar"/>
+ <library name="commons-codec-1.9.jar"/>
+ <library name="commons-collections-3.2.1.jar"/>
+ <library name="commons-exec-1.1.jar"/>
+ <library name="commons-io-2.4.jar"/>
+ <library name="commons-jxpath-1.3.jar"/>
+ <library name="commons-lang3-3.3.2.jar"/>
+ <library name="commons-logging-1.1.3.jar"/>
+ <library name="cssparser-0.9.14.jar"/>
+ <library name="gson-2.3.jar"/>
+ <library name="guava-18.0.jar"/>
+ <library name="htmlunit-2.15.jar"/>
+ <library name="htmlunit-core-js-2.15.jar"/>
+ <library name="httpclient-4.3.4.jar"/>
+ <library name="httpcore-4.3.2.jar"/>
+ <library name="httpmime-4.3.3.jar"/>
+ <library name="ini4j-0.5.2.jar"/>
+ <library name="jetty-http-8.1.15.v20140411.jar"/>
+ <library name="jetty-io-8.1.15.v20140411.jar"/>
+ <library name="jetty-util-8.1.15.v20140411.jar"/>
+ <library name="jetty-websocket-8.1.15.v20140411.jar"/>
+ <library name="jna-3.4.0.jar"/>
+ <library name="nekohtml-1.9.21.jar"/>
+ <library name="netty-3.5.2.Final.jar"/>
+ <library name="operadriver-1.5.jar"/>
+ <library name="operalaunchers-1.1.jar"/>
+ <library name="platform-3.4.0.jar"/>
+ <library name="protobuf-java-2.4.1.jar"/>
+ <library name="sac-1.3.jar"/>
+ <library name="selenium-api-2.44.0.jar"/>
+ <library name="selenium-chrome-driver-2.44.0.jar"/>
+ <library name="selenium-firefox-driver-2.44.0.jar"/>
+ <library name="selenium-htmlunit-driver-2.44.0.jar"/>
+ <library name="selenium-ie-driver-2.44.0.jar"/>
+ <library name="selenium-java-2.44.0.jar"/>
+ <library name="selenium-remote-driver-2.44.0.jar"/>
+ <library name="selenium-safari-driver-2.44.0.jar"/>
+ <library name="selenium-support-2.44.0.jar"/>
+ <library name="serializer-2.7.1.jar"/>
+ <library name="webbit-0.4.14.jar"/>
+ <library name="xalan-2.7.1.jar"/>
+ <library name="xercesImpl-2.11.0.jar"/>
+ <library name="xml-apis-1.4.01.jar"/>
</runtime>
<requires>
- <library name="selenium-java-2.4.0.jar">
+ <library name="selenium-java-2.44.0.jar">
<export name="*"/>
</library>
<library name="operadriver-1.5.jar">
Modified:
nutch/trunk/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java?rev=1692216&r1=1692215&r2=1692216&view=diff
==============================================================================
---
nutch/trunk/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java
(original)
+++
nutch/trunk/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java
Wed Jul 22 04:08:20 2015
@@ -1,4 +1,4 @@
-/**
+/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
@@ -17,19 +17,32 @@
package org.apache.nutch.protocol.selenium;
import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FSDataOutputStream;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IOUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.openqa.selenium.By;
+import org.openqa.selenium.OutputType;
+import org.openqa.selenium.TakesScreenshot;
import org.openqa.selenium.WebDriver;
+import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.firefox.FirefoxDriver;
import org.openqa.selenium.firefox.FirefoxProfile;
+import org.openqa.selenium.safari.SafariDriver;
import org.openqa.selenium.support.ui.WebDriverWait;
-
+import com.opera.core.systems.OperaDriver;
+import java.io.BufferedInputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.InputStream;
+import java.io.OutputStream;
import java.lang.String;
public class HttpWebClient {
- private static final Logger LOG =
LoggerFactory.getLogger("org.apache.nutch.protocol");
+ private static final Logger LOG =
LoggerFactory.getLogger(HttpWebClient.class);
public static ThreadLocal<WebDriver> threadWebDriver = new
ThreadLocal<WebDriver>() {
@@ -45,23 +58,67 @@ public class HttpWebClient {
};
};
+ /**
+ * Function for obtaining the HTML BODY using the selected
+ * {@link org.openqa.selenium.WebDriver}.
+ * There are a number of configuration properties within
+ * <code>nutch-site.xml</code> which determine whether to
+ * take screenshots of the rendered pages and persist them
+ * as timestamped .png's into HDFS.
+ * @param url the URL to fetch and render
+ * @param conf the {@link org.apache.hadoop.conf.Configuration}
+ * @return the rendered inner HTML page
+ */
public static String getHtmlPage(String url, Configuration conf) {
- WebDriver driver = null;
+ WebDriver driver = null;
try {
- driver = new FirefoxDriver();
- //} WebDriver driver = threadWebDriver.get();
- // if (driver == null) {
- // driver = new FirefoxDriver();
- // }
-
+ String driverType = conf.get("selenium.driver", "firefox");
+ switch (driverType) {
+ case "firefox":
+ driver = new FirefoxDriver();
+ break;
+ case "chrome":
+ driver = new ChromeDriver();
+ break;
+ case "safari":
+ driver = new SafariDriver();
+ break;
+ case "opera":
+ driver = new OperaDriver();
+ break;
+ default:
+ LOG.error("The Selenium WebDriver choice {} is not available...
defaulting to FirefoxDriver().", driverType);
+ driver = new FirefoxDriver();
+ break;
+ }
+ LOG.debug("Selenium {} WebDriver selected.", driverType);
driver.get(url);
// Wait for the page to load, timeout after 3 seconds
new WebDriverWait(driver, 3);
- String innerHtml =
driver.findElement(By.tagName("body")).getAttribute("innerHTML");
+ if (conf.getBoolean("selenium.take.screenshot", false)) {
+ File srcFile =
((TakesScreenshot)driver).getScreenshotAs(OutputType.FILE);
+ LOG.debug("In-memory screenshot taken of: {}", url);
+ FileSystem fs = FileSystem.get(conf);
+ Path screenshotPath = new
Path(conf.get("selenium.screenshot.location") + "/" + srcFile.getName());
+ if (screenshotPath != null) {
+ OutputStream os = null;
+ if (!fs.exists(screenshotPath)) {
+ LOG.debug("No existing screenshot already exists... creating new
file at {} {}.", screenshotPath, srcFile.getName());
+ os = fs.create(screenshotPath);
+ }
+ InputStream is = new BufferedInputStream(new
FileInputStream(srcFile));
+ IOUtils.copyBytes(is, os, conf);
+ LOG.debug("Screenshot for {} successfully saved to: {} {}", url,
screenshotPath, srcFile.getName());
+ } else {
+ LOG.warn("Screenshot for {} not saved to HDFS (subsequently
disgarded) as value for "
+ + "'selenium.screenshot.location' is absent from
nutch-site.xml.", url);
+ }
+ }
+ String innerHtml =
driver.findElement(By.tagName("body")).getAttribute("innerHTML");
return innerHtml;
// I'm sure this catch statement is a code smell ; borrowing it from
lib-htmlunit
@@ -75,4 +132,4 @@ public class HttpWebClient {
public static String getHtmlPage(String url) {
return getHtmlPage(url, null);
}
-}
\ No newline at end of file
+}
Modified: nutch/trunk/src/plugin/protocol-selenium/ivy.xml
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-selenium/ivy.xml?rev=1692216&r1=1692215&r2=1692216&view=diff
==============================================================================
--- nutch/trunk/src/plugin/protocol-selenium/ivy.xml (original)
+++ nutch/trunk/src/plugin/protocol-selenium/ivy.xml Wed Jul 22 04:08:20 2015
@@ -42,6 +42,10 @@
<dependency org="com.opera" name="operadriver" rev="1.5">
<exclude org="org.seleniumhq.selenium" name="selenium-remote-driver" />
</dependency>
+ <dependency org="com.codeborne" name="phantomjsdriver" rev="1.2.1" >
+ <exclude org="org.seleniumhq.selenium" name="selenium-remote-driver" />
+ <exclude org="org.seleniumhq.selenium" name="selenium-java" />
+ </dependency>
<!-- end selenium dependencies -->
</dependencies>