Repository: nutch Updated Branches: refs/heads/master d6bcefd92 -> 044e8e77e
fix for NUTCH-2191 contributed by karanjeets Project: http://git-wip-us.apache.org/repos/asf/nutch/repo Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/fa334722 Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/fa334722 Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/fa334722 Branch: refs/heads/master Commit: fa33472297aca6a6468461bb6945225c93590d6d Parents: a9b2491 Author: Karanjeet Singh <[email protected]> Authored: Sat Mar 26 23:21:28 2016 -0700 Committer: Karanjeet Singh <[email protected]> Committed: Sat Mar 26 23:21:28 2016 -0700 ---------------------------------------------------------------------- build.xml | 6 + conf/nutch-default.xml | 66 ++++ src/plugin/build.xml | 4 + src/plugin/lib-htmlunit/build-ivy.xml | 54 +++ src/plugin/lib-htmlunit/build.xml | 28 ++ src/plugin/lib-htmlunit/ivy.xml | 52 +++ src/plugin/lib-htmlunit/plugin.xml | 166 +++++++++ .../protocol/htmlunit/HtmlUnitWebDriver.java | 190 ++++++++++ .../htmlunit/HtmlUnitWebWindowListener.java | 36 ++ src/plugin/protocol-htmlunit/build.xml | 53 +++ src/plugin/protocol-htmlunit/ivy.xml | 38 ++ src/plugin/protocol-htmlunit/plugin.xml | 51 +++ .../apache/nutch/protocol/htmlunit/Http.java | 67 ++++ .../nutch/protocol/htmlunit/HttpResponse.java | 350 +++++++++++++++++++ .../apache/nutch/protocol/htmlunit/package.html | 5 + 15 files changed, 1166 insertions(+) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/nutch/blob/fa334722/build.xml ---------------------------------------------------------------------- diff --git a/build.xml b/build.xml index f8aa196..5cff1ea 100644 --- a/build.xml +++ b/build.xml @@ -189,6 +189,7 @@ <packageset dir="${plugins.dir}/indexer-elastic/src/java/" /> <packageset dir="${plugins.dir}/indexer-solr/src/java"/> <packageset dir="${plugins.dir}/language-identifier/src/java"/> + <packageset dir="${plugins.dir}/lib-htmlunit/src/java"/> <packageset dir="${plugins.dir}/lib-http/src/java"/> 
<packageset dir="${plugins.dir}/lib-selenium/src/java"/> <packageset dir="${plugins.dir}/lib-regex-filter/src/java"/> @@ -203,6 +204,7 @@ <packageset dir="${plugins.dir}/parsefilter-regex/src/java"/> <packageset dir="${plugins.dir}/protocol-file/src/java"/> <packageset dir="${plugins.dir}/protocol-ftp/src/java"/> + <packageset dir="${plugins.dir}/protocol-htmlunit/src/java"/> <packageset dir="${plugins.dir}/protocol-http/src/java"/> <packageset dir="${plugins.dir}/protocol-httpclient/src/java"/> <packageset dir="${plugins.dir}/protocol-interactiveselenium/src/java"/> @@ -629,6 +631,7 @@ <packageset dir="${plugins.dir}/indexer-elastic/src/java/" /> <packageset dir="${plugins.dir}/indexer-solr/src/java"/> <packageset dir="${plugins.dir}/language-identifier/src/java"/> + <packageset dir="${plugins.dir}/lib-htmlunit/src/java"/> <packageset dir="${plugins.dir}/lib-http/src/java"/> <packageset dir="${plugins.dir}/lib-selenium/src/java"/> <packageset dir="${plugins.dir}/lib-regex-filter/src/java"/> @@ -643,6 +646,7 @@ <packageset dir="${plugins.dir}/parse-zip/src/java"/> <packageset dir="${plugins.dir}/protocol-file/src/java"/> <packageset dir="${plugins.dir}/protocol-ftp/src/java"/> + <packageset dir="${plugins.dir}/protocol-htmlunit/src/java"/> <packageset dir="${plugins.dir}/protocol-http/src/java"/> <packageset dir="${plugins.dir}/protocol-httpclient/src/java"/> <packageset dir="${plugins.dir}/protocol-selenium/src/java"/> @@ -1033,6 +1037,7 @@ <source path="${plugins.dir}/index-static/src/test/" /> <source path="${plugins.dir}/language-identifier/src/java/" /> <source path="${plugins.dir}/language-identifier/src/test/" /> + <source path="${plugins.dir}/lib-htmlunit/src/java/" /> <source path="${plugins.dir}/lib-http/src/java/" /> <source path="${plugins.dir}/lib-http/src/test/" /> <source path="${plugins.dir}/lib-selenium/src/java/" /> @@ -1057,6 +1062,7 @@ <source path="${plugins.dir}/protocol-file/src/java/" /> <source path="${plugins.dir}/protocol-file/src/test/" 
/> <source path="${plugins.dir}/protocol-ftp/src/java/" /> + <source path="${plugins.dir}/protocol-htmlunit/src/java"/> <source path="${plugins.dir}/protocol-httpclient/src/java/" /> <source path="${plugins.dir}/protocol-httpclient/src/test/" /> <source path="${plugins.dir}/protocol-http/src/java/" /> http://git-wip-us.apache.org/repos/asf/nutch/blob/fa334722/conf/nutch-default.xml ---------------------------------------------------------------------- diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml index 93503f3..a5f17bf 100644 --- a/conf/nutch-default.xml +++ b/conf/nutch-default.xml @@ -1874,6 +1874,72 @@ visit https://wiki.apache.org/nutch/SimilarityScoringFilter--> </description> </property> + +<!-- lib-htmlunit plugin properties; applies to protocol-htmlunit --> + +<property> + <name>htmlunit.page.load.delay</name> + <value>3</value> + <description> + The delay in seconds to use when loading a page with lib-htmlunit. This + setting is used by protocol-htmlunit since it depends on + lib-htmlunit for fetching. + </description> +</property> + +<property> + <name>htmlunit.enable.javascript</name> + <value>true</value> + <description> + A Boolean value representing if javascript should + be enabled or disabled when using htmlunit. The default value is enabled. + </description> +</property> + +<property> + <name>htmlunit.javascript.timeout</name> + <value>3500</value> + <description> + The timeout in milliseconds when loading javascript with lib-htmlunit. This + setting is used by protocol-htmlunit since it depends on + lib-htmlunit for fetching. + </description> +</property> + +<property> + <name>htmlunit.enable.css</name> + <value>false</value> + <description> + A Boolean value representing if CSS should + be enabled or disabled when using htmlunit. The default value is disabled. 
+ </description> +</property> + +<property> + <name>htmlunit.take.screenshot</name> + <value>false</value> + <description> + Boolean property determining whether the protocol-htmlunit + WebDriver should capture a screenshot of the URL. If set to + true remember to define the 'htmlunit.screenshot.location' + property as this determines the location screenshots should be + persisted to on HDFS. If that property is not set, screenshots + are simply discarded. + </description> +</property> + +<property> + <name>htmlunit.screenshot.location</name> + <value></value> + <description> + The location on disk where a URL screenshot should be saved + to if the 'htmlunit.take.screenshot' property is set to true. + By default this is null, in this case screenshots held in memory + are simply discarded. + </description> +</property> + + <!-- protocol-selenium plugin properties --> <property> http://git-wip-us.apache.org/repos/asf/nutch/blob/fa334722/src/plugin/build.xml ---------------------------------------------------------------------- diff --git a/src/plugin/build.xml b/src/plugin/build.xml index 10731b3..75ae2e7 100755 --- a/src/plugin/build.xml +++ b/src/plugin/build.xml @@ -53,6 +53,8 @@ <ant dir="protocol-ftp" target="deploy"/> <ant dir="protocol-http" target="deploy"/> <ant dir="protocol-httpclient" target="deploy"/> + <ant dir="lib-htmlunit" target="deploy"/> + <ant dir="protocol-htmlunit" target="deploy" /> <ant dir="lib-selenium" target="deploy"/> <ant dir="protocol-selenium" target="deploy" /> <ant dir="protocol-interactiveselenium" target="deploy" /> @@ -170,6 +172,8 @@ <ant dir="protocol-ftp" target="clean"/> <ant dir="protocol-http" target="clean"/> <ant dir="protocol-httpclient" target="clean"/> + <ant dir="lib-htmlunit" target="clean"/> + <ant dir="protocol-htmlunit" target="clean" /> <ant dir="lib-selenium" target="clean"/> <ant dir="protocol-selenium" target="clean" /> <ant dir="protocol-interactiveselenium" target="clean" /> 
http://git-wip-us.apache.org/repos/asf/nutch/blob/fa334722/src/plugin/lib-htmlunit/build-ivy.xml ---------------------------------------------------------------------- diff --git a/src/plugin/lib-htmlunit/build-ivy.xml b/src/plugin/lib-htmlunit/build-ivy.xml new file mode 100644 index 0000000..7022f4e --- /dev/null +++ b/src/plugin/lib-htmlunit/build-ivy.xml @@ -0,0 +1,54 @@ +<?xml version="1.0"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> +<project name="lib-htmlunit" default="deps-jar" xmlns:ivy="antlib:org.apache.ivy.ant"> + + <property name="ivy.install.version" value="2.1.0" /> + <condition property="ivy.home" value="${env.IVY_HOME}"> + <isset property="env.IVY_HOME" /> + </condition> + <property name="ivy.home" value="${user.home}/.ant" /> + <property name="ivy.checksums" value="" /> + <property name="ivy.jar.dir" value="${ivy.home}/lib" /> + <property name="ivy.jar.file" value="${ivy.jar.dir}/ivy.jar" /> + + <target name="download-ivy" unless="offline"> + + <mkdir dir="${ivy.jar.dir}"/> + <!-- download Ivy from web site so that it can be used even without any special installation --> + <get src="http://repo2.maven.org/maven2/org/apache/ivy/ivy/${ivy.install.version}/ivy-${ivy.install.version}.jar" + dest="${ivy.jar.file}" usetimestamp="true"/> + </target> + + <target name="init-ivy" depends="download-ivy"> + <!-- try to load ivy here from ivy home, in case the user has not already dropped + it into ant's lib dir (note that the latter copy will always take precedence). + We will not fail as long as local lib dir exists (it may be empty) and + ivy is in at least one of ant's lib dir or the local lib dir. 
--> + <path id="ivy.lib.path"> + <fileset dir="${ivy.jar.dir}" includes="*.jar"/> + + </path> + <taskdef resource="org/apache/ivy/ant/antlib.xml" + uri="antlib:org.apache.ivy.ant" classpathref="ivy.lib.path"/> + </target> + + <target name="deps-jar" depends="init-ivy"> + <ivy:retrieve pattern="lib/[artifact]-[revision].[ext]" sync="true"/> + </target> + +</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/fa334722/src/plugin/lib-htmlunit/build.xml ---------------------------------------------------------------------- diff --git a/src/plugin/lib-htmlunit/build.xml b/src/plugin/lib-htmlunit/build.xml new file mode 100644 index 0000000..14f5d8f --- /dev/null +++ b/src/plugin/lib-htmlunit/build.xml @@ -0,0 +1,28 @@ +<?xml version="1.0"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> +<project name="lib-htmlunit" default="jar-core"> + + <import file="../build-plugin.xml"/> + + <!-- Add compilation dependencies to classpath --> + <path id="plugin.deps"> + <fileset dir="${nutch.root}/build"> + <include name="**/lib-http/*.jar" /> + </fileset> + </path> +</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/fa334722/src/plugin/lib-htmlunit/ivy.xml ---------------------------------------------------------------------- diff --git a/src/plugin/lib-htmlunit/ivy.xml b/src/plugin/lib-htmlunit/ivy.xml new file mode 100644 index 0000000..6430535 --- /dev/null +++ b/src/plugin/lib-htmlunit/ivy.xml @@ -0,0 +1,52 @@ +<?xml version="1.0" ?> + +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> + +<ivy-module version="1.0"> + <info organisation="org.apache.nutch" module="${ant.project.name}"> + <license name="Apache 2.0"/> + <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/> + <description> + Apache Nutch + </description> + </info> + + <configurations> + <include file="../../../ivy/ivy-configurations.xml"/> + </configurations> + + <publications> + <!--get the artifact from our module name--> + <artifact conf="master"/> + </publications> + + <dependencies> + <!-- begin selenium dependencies --> + <dependency org="org.seleniumhq.selenium" name="selenium-java" rev="2.44.0" /> + + <dependency org="com.opera" name="operadriver" rev="1.5"> + <exclude org="org.seleniumhq.selenium" name="selenium-remote-driver" /> + </dependency> + <dependency org="com.codeborne" name="phantomjsdriver" rev="1.2.1" > + <exclude org="org.seleniumhq.selenium" name="selenium-remote-driver" /> + <exclude org="org.seleniumhq.selenium" name="selenium-java" /> + </dependency> + <!-- end selenium dependencies --> + </dependencies> + +</ivy-module> http://git-wip-us.apache.org/repos/asf/nutch/blob/fa334722/src/plugin/lib-htmlunit/plugin.xml ---------------------------------------------------------------------- diff --git a/src/plugin/lib-htmlunit/plugin.xml b/src/plugin/lib-htmlunit/plugin.xml new file mode 100644 index 0000000..290a137 --- /dev/null +++ b/src/plugin/lib-htmlunit/plugin.xml @@ -0,0 +1,166 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. 
You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<!-- + ! A common framework for http protocol implementations + !--> +<plugin + id="lib-htmlunit" + name="HTTP Framework" + version="1.0" + provider-name="org.apache.nutch"> + + <runtime> + <library name="lib-htmlunit.jar"> + <export name="*"/> + </library> + <!-- all classes from dependent libraries are exported --> + <library name="cglib-nodep-2.1_3.jar"> + <export name="*"/> + </library> + <library name="commons-codec-1.9.jar"> + <export name="*"/> + </library> + <library name="commons-collections-3.2.1.jar"> + <export name="*"/> + </library> + <library name="commons-exec-1.1.jar"> + <export name="*"/> + </library> + <library name="commons-io-2.4.jar"> + <export name="*"/> + </library> + <library name="commons-jxpath-1.3.jar"> + <export name="*"/> + </library> + <library name="commons-lang3-3.3.2.jar"> + <export name="*"/> + </library> + <library name="commons-logging-1.1.3.jar"> + <export name="*"/> + </library> + <library name="cssparser-0.9.14.jar"> + <export name="*"/> + </library> + <library name="gson-2.3.jar"> + <export name="*"/> + </library> + <library name="guava-18.0.jar"> + <export name="*"/> + </library> + <library name="htmlunit-2.15.jar"> + <export name="*"/> + </library> + <library name="htmlunit-core-js-2.15.jar"> + <export name="*"/> + </library> + <library name="httpclient-4.3.4.jar"> + <export name="*"/> + </library> + <library name="httpcore-4.3.2.jar"> + <export name="*"/> + </library> + <library name="httpmime-4.3.3.jar"> + <export name="*"/> + </library> + <library name="ini4j-0.5.2.jar"> + <export name="*"/> + </library> + <library 
name="jetty-http-8.1.15.v20140411.jar"> + <export name="*"/> + </library> + <library name="jetty-io-8.1.15.v20140411.jar"> + <export name="*"/> + </library> + <library name="jetty-util-8.1.15.v20140411.jar"> + <export name="*"/> + </library> + <library name="jetty-websocket-8.1.15.v20140411.jar"> + <export name="*"/> + </library> + <library name="jna-3.4.0.jar"> + <export name="*"/> + </library> + <library name="nekohtml-1.9.21.jar"> + <export name="*"/> + </library> + <library name="netty-3.5.2.Final.jar"> + <export name="*"/> + </library> + <library name="operadriver-1.5.jar"> + <export name="*"/> + </library> + <library name="operalaunchers-1.1.jar"> + <export name="*"/> + </library> + <library name="phantomjsdriver-1.2.1.jar"> + <export name="*"/> + </library> + <library name="platform-3.4.0.jar"> + <export name="*"/> + </library> + <library name="protobuf-java-2.4.1.jar"> + <export name="*"/> + </library> + <library name="sac-1.3.jar"> + <export name="*"/> + </library> + <library name="selenium-api-2.44.0.jar"> + <export name="*"/> + </library> + <library name="selenium-chrome-driver-2.44.0.jar"> + <export name="*"/> + </library> + <library name="selenium-firefox-driver-2.44.0.jar"> + <export name="*"/> + </library> + <library name="selenium-htmlunit-driver-2.44.0.jar"> + <export name="*"/> + </library> + <library name="selenium-ie-driver-2.44.0.jar"> + <export name="*"/> + </library> + <library name="selenium-java-2.44.0.jar"> + <export name="*"/> + </library> + <library name="selenium-remote-driver-2.44.0.jar"> + <export name="*"/> + </library> + <library name="selenium-safari-driver-2.44.0.jar"> + <export name="*"/> + </library> + <library name="selenium-support-2.44.0.jar"> + <export name="*"/> + </library> + <library name="serializer-2.7.1.jar"> + <export name="*"/> + </library> + <library name="webbit-0.4.14.jar"> + <export name="*"/> + </library> + <library name="xalan-2.7.1.jar"> + <export name="*"/> + </library> + <library 
name="xercesImpl-2.11.0.jar"> + <export name="*"/> + </library> + <library name="xml-apis-1.4.01.jar"> + <export name="*"/> + </library> + </runtime> + +</plugin> http://git-wip-us.apache.org/repos/asf/nutch/blob/fa334722/src/plugin/lib-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HtmlUnitWebDriver.java ---------------------------------------------------------------------- diff --git a/src/plugin/lib-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HtmlUnitWebDriver.java b/src/plugin/lib-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HtmlUnitWebDriver.java new file mode 100644 index 0000000..fc231c3 --- /dev/null +++ b/src/plugin/lib-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HtmlUnitWebDriver.java @@ -0,0 +1,190 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.nutch.protocol.htmlunit; + +import java.io.BufferedInputStream; +import java.io.File; +import java.io.FileInputStream; +import java.io.InputStream; +import java.io.OutputStream; +import java.util.concurrent.TimeUnit; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.IOUtils; +import org.openqa.selenium.By; +import org.openqa.selenium.JavascriptExecutor; +import org.openqa.selenium.OutputType; +import org.openqa.selenium.TakesScreenshot; +import org.openqa.selenium.TimeoutException; +import org.openqa.selenium.WebDriver; +import org.openqa.selenium.WebElement; +import org.openqa.selenium.htmlunit.HtmlUnitDriver; +import org.openqa.selenium.io.TemporaryFilesystem; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.gargoylesoftware.htmlunit.WebClient; + +public class HtmlUnitWebDriver extends HtmlUnitDriver { + + private static final Logger LOG = LoggerFactory.getLogger(HtmlUnitWebDriver.class); + private static boolean enableJavascript; + private static boolean enableCss; + private static boolean enableRedirect; + private static long javascriptTimeout; + private static int maxRedirects; + + public HtmlUnitWebDriver() { + super(enableJavascript); + } + + @Override + protected WebClient modifyWebClient(WebClient client) { + client.getOptions().setJavaScriptEnabled(enableJavascript); + client.getOptions().setCssEnabled(enableCss); + client.getOptions().setRedirectEnabled(enableRedirect); + if(enableJavascript) + client.setJavaScriptTimeout(javascriptTimeout); + client.getOptions().setThrowExceptionOnScriptError(false); + if(enableRedirect) + client.addWebWindowListener(new HtmlUnitWebWindowListener(maxRedirects)); + return client; + } + + public static WebDriver getDriverForPage(String url, Configuration conf) { + long pageLoadTimout = conf.getLong("htmlunit.page.load.delay", 3); + enableJavascript = 
conf.getBoolean("htmlunit.enable.javascript", true); + enableCss = conf.getBoolean("htmlunit.enable.css", false); + javascriptTimeout = conf.getLong("htmlunit.javascript.timeout", 3500); + int redirects = Integer.parseInt(conf.get("http.redirect.max", "0")); + enableRedirect = redirects <= 0 ? false : true; + maxRedirects = redirects; + + WebDriver driver = null; + + try { + driver = new HtmlUnitWebDriver(); + driver.manage().timeouts().pageLoadTimeout(pageLoadTimout, TimeUnit.SECONDS); + driver.get(url); + } catch(Exception e) { + if(e instanceof TimeoutException) { + LOG.debug("HtmlUnit WebDriver: Timeout Exception: Capturing whatever loaded so far..."); + return driver; + } + cleanUpDriver(driver); + throw new RuntimeException(e); + } + + return driver; + } + + public static String getHTMLContent(WebDriver driver, Configuration conf) { + try { + if (conf.getBoolean("htmlunit.take.screenshot", false)) + takeScreenshot(driver, conf); + + String innerHtml = ""; + if(enableJavascript) { + WebElement body = driver.findElement(By.tagName("body")); + innerHtml = (String)((JavascriptExecutor)driver).executeScript("return arguments[0].innerHTML;", body); + } + else + innerHtml = driver.getPageSource().replaceAll("&amp;", "&"); + return innerHtml; + } catch(Exception e) { + TemporaryFilesystem.getDefaultTmpFS().deleteTemporaryFiles(); + cleanUpDriver(driver); + throw new RuntimeException(e); + } + } + + public static void cleanUpDriver(WebDriver driver) { + if (driver != null) { + try { + driver.close(); + driver.quit(); + TemporaryFilesystem.getDefaultTmpFS().deleteTemporaryFiles(); + } catch (Exception e) { + throw new RuntimeException(e); + } + } + } + + /** + * Function for obtaining the HTML BODY using the selected + * {@link org.openqa.selenium.WebDriver}. + * There are a number of configuration properties within + * <code>nutch-site.xml</code> which determine whether to + * take screenshots of the rendered pages and persist them + * as timestamped .png's into HDFS. 
+ * @param url the URL to fetch and render + * @param conf the {@link org.apache.hadoop.conf.Configuration} + * @return the rendered inner HTML page + */ + public static String getHtmlPage(String url, Configuration conf) { + WebDriver driver = getDriverForPage(url, conf); + + try { + if (conf.getBoolean("htmlunit.take.screenshot", false)) + takeScreenshot(driver, conf); + + + String innerHtml = ""; + if(enableJavascript) { + WebElement body = driver.findElement(By.tagName("body")); + innerHtml = (String)((JavascriptExecutor)driver).executeScript("return arguments[0].innerHTML;", body); + } + else + innerHtml = driver.getPageSource().replaceAll("&amp;", "&"); + return innerHtml; + + } catch (Exception e) { + TemporaryFilesystem.getDefaultTmpFS().deleteTemporaryFiles(); + throw new RuntimeException(e); + } finally { + cleanUpDriver(driver); + } + } + + private static void takeScreenshot(WebDriver driver, Configuration conf) { + try { + String url = driver.getCurrentUrl(); + File srcFile = ((TakesScreenshot)driver).getScreenshotAs(OutputType.FILE); + LOG.debug("In-memory screenshot taken of: {}", url); + FileSystem fs = FileSystem.get(conf); + if (conf.get("htmlunit.screenshot.location") != null) { + Path screenshotPath = new Path(conf.get("htmlunit.screenshot.location") + "/" + srcFile.getName()); + OutputStream os = null; + if (!fs.exists(screenshotPath)) { + LOG.debug("No existing screenshot already exists... 
creating new file at {} {}.", screenshotPath, srcFile.getName()); + os = fs.create(screenshotPath); + } + InputStream is = new BufferedInputStream(new FileInputStream(srcFile)); + IOUtils.copyBytes(is, os, conf); + LOG.debug("Screenshot for {} successfully saved to: {} {}", url, screenshotPath, srcFile.getName()); + } else { + LOG.warn("Screenshot for {} not saved to HDFS (subsequently disgarded) as value for " + + "'htmlunit.screenshot.location' is absent from nutch-site.xml.", url); + } + } catch (Exception e) { + cleanUpDriver(driver); + throw new RuntimeException(e); + } + } +} http://git-wip-us.apache.org/repos/asf/nutch/blob/fa334722/src/plugin/lib-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HtmlUnitWebWindowListener.java ---------------------------------------------------------------------- diff --git a/src/plugin/lib-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HtmlUnitWebWindowListener.java b/src/plugin/lib-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HtmlUnitWebWindowListener.java new file mode 100644 index 0000000..760f4aa --- /dev/null +++ b/src/plugin/lib-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HtmlUnitWebWindowListener.java @@ -0,0 +1,36 @@ +package org.apache.nutch.protocol.htmlunit; + +import com.gargoylesoftware.htmlunit.WebWindowEvent; +import com.gargoylesoftware.htmlunit.WebWindowListener; + +public class HtmlUnitWebWindowListener implements WebWindowListener { + + private Integer redirectCount = 0; + private Integer maxRedirects = 0; + + public HtmlUnitWebWindowListener() { + + } + + public HtmlUnitWebWindowListener(int maxRedirects) { + this.maxRedirects = maxRedirects; + } + + @Override + public void webWindowOpened(WebWindowEvent event) { + + } + + @Override + public void webWindowContentChanged(WebWindowEvent event) { + redirectCount++; + if(redirectCount > maxRedirects) + throw new RuntimeException("Redirect Count: " + redirectCount + " exceeded the Maximum Redirects allowed: " + maxRedirects); + } + + 
@Override + public void webWindowClosed(WebWindowEvent event) { + + } + +} http://git-wip-us.apache.org/repos/asf/nutch/blob/fa334722/src/plugin/protocol-htmlunit/build.xml ---------------------------------------------------------------------- diff --git a/src/plugin/protocol-htmlunit/build.xml b/src/plugin/protocol-htmlunit/build.xml new file mode 100644 index 0000000..0ed0228 --- /dev/null +++ b/src/plugin/protocol-htmlunit/build.xml @@ -0,0 +1,53 @@ +<?xml version="1.0"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> +<project name="protocol-htmlunit" default="jar-core"> + + <import file="../build-plugin.xml"/> + + <!-- Build compilation dependencies --> + <target name="deps-jar"> + <ant target="jar" inheritall="false" dir="../lib-http"/> + <ant target="jar" inheritall="false" dir="../lib-htmlunit"/> + </target> + + <!-- Add compilation dependencies to classpath --> + <path id="plugin.deps"> + <fileset dir="${nutch.root}/build"> + <include name="**/lib-http/*.jar" /> + <include name="**/lib-htmlunit/*.jar" /> + </fileset> + <pathelement location="${build.dir}/test/conf"/> + </path> + + <!-- Deploy Unit test dependencies --> + <target name="deps-test"> + <ant target="deploy" inheritall="false" dir="../lib-http"/> + <ant target="deploy" inheritall="false" dir="../nutch-extensionpoints"/> + <copy toDir="${build.test}"> + <fileset dir="${src.test}" excludes="**/*.java"/> + </copy> + </target> + + <!-- for junit test --> + <!-- + <mkdir dir="${build.test}/data" /> + <copy todir="${build.test}/data"> + <fileset dir="jsp"/> + </copy>--> + +</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/fa334722/src/plugin/protocol-htmlunit/ivy.xml ---------------------------------------------------------------------- diff --git a/src/plugin/protocol-htmlunit/ivy.xml b/src/plugin/protocol-htmlunit/ivy.xml new file mode 100644 index 0000000..8aa78d2 --- /dev/null +++ b/src/plugin/protocol-htmlunit/ivy.xml @@ -0,0 +1,38 @@ +<?xml version="1.0" ?> + +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. 
You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> + +<ivy-module version="1.0"> + <info organisation="org.apache.nutch" module="${ant.project.name}"> + <license name="Apache 2.0"/> + <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/> + <description> + Apache Nutch + </description> + </info> + + <configurations> + <include file="../../..//ivy/ivy-configurations.xml"/> + </configurations> + + <publications> + <!--get the artifact from our module name--> + <artifact conf="master"/> + </publications> + +</ivy-module> http://git-wip-us.apache.org/repos/asf/nutch/blob/fa334722/src/plugin/protocol-htmlunit/plugin.xml ---------------------------------------------------------------------- diff --git a/src/plugin/protocol-htmlunit/plugin.xml b/src/plugin/protocol-htmlunit/plugin.xml new file mode 100644 index 0000000..36bcb80 --- /dev/null +++ b/src/plugin/protocol-htmlunit/plugin.xml @@ -0,0 +1,51 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ See the License for the specific language governing permissions and + limitations under the License. +--> +<plugin + id="protocol-htmlunit" + name="HtmlUnit Protocol Plug-in" + version="1.0.0" + provider-name="nutch.apache.org"> + + <runtime> + <library name="protocol-htmlunit.jar"> + <export name="*"/> + </library> + </runtime> + + <requires> + <import plugin="nutch-extensionpoints"/> + <import plugin="lib-http"/> + <import plugin="lib-htmlunit"/> + </requires> + + <extension id="org.apache.nutch.protocol.http" + name="HttpProtocol" + point="org.apache.nutch.protocol.Protocol"> + + <implementation id="org.apache.nutch.protocol.htmlunit.Http" + class="org.apache.nutch.protocol.htmlunit.Http"> + <parameter name="protocolName" value="http"/> + </implementation> + + <implementation id="org.apache.nutch.protocol.htmlunit.Http" + class="org.apache.nutch.protocol.htmlunit.Http"> + <parameter name="protocolName" value="https"/> + </implementation> + + </extension> +</plugin> http://git-wip-us.apache.org/repos/asf/nutch/blob/fa334722/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/Http.java ---------------------------------------------------------------------- diff --git a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/Http.java b/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/Http.java new file mode 100644 index 0000000..83b7687 --- /dev/null +++ b/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/Http.java @@ -0,0 +1,67 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.protocol.htmlunit; + +import java.io.IOException; +import java.net.URL; + +import org.apache.hadoop.conf.Configuration; + +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.net.protocols.Response; +import org.apache.nutch.protocol.ProtocolException; +import org.apache.nutch.protocol.http.api.HttpBase; +import org.apache.nutch.util.NutchConfiguration; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * + * + */ +public class Http extends HttpBase { + + public static final Logger LOG = LoggerFactory.getLogger(Http.class); + + /** + * Default constructor. + */ + public Http() { + super(LOG); + } + + /** + * Set the {@link org.apache.hadoop.conf.Configuration} object. 
+ * + * @param conf + */ + public void setConf(Configuration conf) { + super.setConf(conf); + } + + public static void main(String[] args) throws Exception { + Http http = new Http(); + http.setConf(NutchConfiguration.create()); + main(http, args); + } + + protected Response getResponse(URL url, CrawlDatum datum, boolean redirect) + throws ProtocolException, IOException { + return new HttpResponse(this, url, datum); + } +} http://git-wip-us.apache.org/repos/asf/nutch/blob/fa334722/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java ---------------------------------------------------------------------- diff --git a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java b/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java new file mode 100644 index 0000000..72b1fa1 --- /dev/null +++ b/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java @@ -0,0 +1,350 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * <p/> + * http://www.apache.org/licenses/LICENSE-2.0 + * <p/> + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.nutch.protocol.htmlunit; + +import java.io.BufferedInputStream; +import java.io.ByteArrayOutputStream; +import java.io.EOFException; +import java.io.IOException; +import java.io.OutputStream; +import java.io.PushbackInputStream; +import java.net.InetSocketAddress; +import java.net.Socket; +import java.net.URL; + +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.metadata.SpellCheckedMetadata; +import org.apache.nutch.net.protocols.HttpDateFormat; +import org.apache.nutch.net.protocols.Response; +import org.apache.nutch.protocol.ProtocolException; +import org.apache.nutch.protocol.http.api.HttpBase; +import org.apache.nutch.protocol.http.api.HttpException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * An HTTP response. + * + */ +public class HttpResponse implements Response { + + private static final Logger LOG = LoggerFactory.getLogger(HttpResponse.class); + + private Http http; + private URL url; + private byte[] content; + private int code; + private Metadata headers = new SpellCheckedMetadata(); + + /** The nutch configuration */ + private Configuration conf = null; + + public HttpResponse(Http http, URL url, CrawlDatum datum) throws ProtocolException, IOException { + + this.conf = http.getConf(); + this.http = http; + this.url = url; + + LOG.info("fetching " + url); + + String path = "".equals(url.getFile()) ? "/" : url.getFile(); + + // some servers will redirect a request with a host line like + // "Host: <hostname>:80" to "http://<hpstname>/<orig_path>"- they + // don't want the :80... 
+ String host = url.getHost(); + int port; + String portString; + if (url.getPort() == -1) { + port = 80; + portString = ""; + } else { + port = url.getPort(); + portString = ":" + port; + } + + Socket socket = null; + + try { + socket = new Socket(); // create the socket + socket.setSoTimeout(http.getTimeout()); + + // connect + String sockHost = http.useProxy(url) ? http.getProxyHost() : host; + int sockPort = http.useProxy(url) ? http.getProxyPort() : port; + InetSocketAddress sockAddr = new InetSocketAddress(sockHost, sockPort); + socket.connect(sockAddr, http.getTimeout()); + + // make request + OutputStream req = socket.getOutputStream(); + + StringBuffer reqStr = new StringBuffer("GET "); + if (http.useProxy(url)) { + reqStr.append(url.getProtocol() + "://" + host + portString + path); + } else { + reqStr.append(path); + } + + // TODO: Write code for Https + reqStr.append(" HTTP/1.0\r\n"); + + reqStr.append("Host: "); + reqStr.append(host); + reqStr.append(portString); + reqStr.append("\r\n"); + + reqStr.append("Accept-Encoding: x-gzip, gzip, deflate\r\n"); + + String userAgent = http.getUserAgent(); + if ((userAgent == null) || (userAgent.length() == 0)) { + if (Http.LOG.isErrorEnabled()) { + Http.LOG.error("User-agent is not set!"); + } + } else { + reqStr.append("User-Agent: "); + reqStr.append(userAgent); + reqStr.append("\r\n"); + } + + reqStr.append("Accept-Language: "); + reqStr.append(this.http.getAcceptLanguage()); + reqStr.append("\r\n"); + + reqStr.append("Accept: "); + reqStr.append(this.http.getAccept()); + reqStr.append("\r\n"); + + if (datum.getModifiedTime() > 0) { + reqStr.append("If-Modified-Since: " + HttpDateFormat.toString(datum.getModifiedTime())); + reqStr.append("\r\n"); + } + reqStr.append("\r\n"); + + byte[] reqBytes = reqStr.toString().getBytes(); + + req.write(reqBytes); + req.flush(); + + PushbackInputStream in = // process response + new PushbackInputStream(new BufferedInputStream(socket.getInputStream(), Http.BUFFER_SIZE), + 
Http.BUFFER_SIZE); + + StringBuffer line = new StringBuffer(); + + boolean haveSeenNonContinueStatus = false; + while (!haveSeenNonContinueStatus) { + // parse status code line + this.code = parseStatusLine(in, line); + // parse headers + parseHeaders(in, line); + haveSeenNonContinueStatus = code != 100; // 100 is "Continue" + } + + // Get Content type header + String contentType = getHeader(Response.CONTENT_TYPE); + + // handle with Selenium only if content type in HTML or XHTML + if (contentType != null) { + if (contentType.contains("text/html") || contentType.contains("application/xhtml")) { + readPlainContent(url); + } else { + try { + int contentLength = Integer.MAX_VALUE; + String contentLengthString = headers.get(Response.CONTENT_LENGTH); + if (contentLengthString != null) { + try { + contentLength = Integer.parseInt(contentLengthString.trim()); + } catch (NumberFormatException ex) { + throw new HttpException("bad content length: " + contentLengthString); + } + } + + if (http.getMaxContent() >= 0 && contentLength > http.getMaxContent()) { + contentLength = http.getMaxContent(); + } + + byte[] buffer = new byte[HttpBase.BUFFER_SIZE]; + int bufferFilled = 0; + int totalRead = 0; + ByteArrayOutputStream out = new ByteArrayOutputStream(); + while ((bufferFilled = in.read(buffer, 0, buffer.length)) != -1 + && totalRead + bufferFilled <= contentLength) { + totalRead += bufferFilled; + out.write(buffer, 0, bufferFilled); + } + + content = out.toByteArray(); + + } catch (Exception e) { + if (code == 200) + throw new IOException(e.toString()); + // for codes other than 200 OK, we are fine with empty content + } finally { + if (in != null) { + in.close(); + } + } + } + } + + } finally { + if (socket != null) + socket.close(); + } + } + + private void readPlainContent(URL url) throws IOException { + String page = HtmlUnitWebDriver.getHtmlPage(url.toString(), conf); + content = page.getBytes("UTF-8"); + } + + private int parseStatusLine(PushbackInputStream in, 
StringBuffer line) throws IOException, HttpException { + readLine(in, line, false); + + int codeStart = line.indexOf(" "); + int codeEnd = line.indexOf(" ", codeStart + 1); + + // handle lines with no plaintext result code, ie: + // "HTTP/1.1 200" vs "HTTP/1.1 200 OK" + if (codeEnd == -1) + codeEnd = line.length(); + + int code; + try { + code = Integer.parseInt(line.substring(codeStart + 1, codeEnd)); + } catch (NumberFormatException e) { + throw new HttpException("bad status line '" + line + "': " + e.getMessage(), e); + } + + return code; + } + + private void processHeaderLine(StringBuffer line) throws IOException, HttpException { + + int colonIndex = line.indexOf(":"); // key is up to colon + if (colonIndex == -1) { + int i; + for (i = 0; i < line.length(); i++) + if (!Character.isWhitespace(line.charAt(i))) + break; + if (i == line.length()) + return; + throw new HttpException("No colon in header:" + line); + } + String key = line.substring(0, colonIndex); + + int valueStart = colonIndex + 1; // skip whitespace + while (valueStart < line.length()) { + int c = line.charAt(valueStart); + if (c != ' ' && c != '\t') + break; + valueStart++; + } + String value = line.substring(valueStart); + headers.set(key, value); + } + + // Adds headers to our headers Metadata + private void parseHeaders(PushbackInputStream in, StringBuffer line) throws IOException, HttpException { + + while (readLine(in, line, true) != 0) { + + // handle HTTP responses with missing blank line after headers + int pos; + if (((pos = line.indexOf("<!DOCTYPE")) != -1) || ((pos = line.indexOf("<HTML")) != -1) + || ((pos = line.indexOf("<html")) != -1)) { + + in.unread(line.substring(pos).getBytes("UTF-8")); + line.setLength(pos); + + try { + //TODO: (CM) We don't know the header names here + //since we're just handling them generically. 
It would + //be nice to provide some sort of mapping function here + //for the returned header names to the standard metadata + //names in the ParseData class + processHeaderLine(line); + } catch (Exception e) { + // fixme: + Http.LOG.warn("Error: ", e); + } + return; + } + + processHeaderLine(line); + } + } + + private static int readLine(PushbackInputStream in, StringBuffer line, boolean allowContinuedLine) + throws IOException { + line.setLength(0); + for (int c = in.read(); c != -1; c = in.read()) { + switch (c) { + case '\r': + if (peek(in) == '\n') { + in.read(); + } + case '\n': + if (line.length() > 0) { + // at EOL -- check for continued line if the current + // (possibly continued) line wasn't blank + if (allowContinuedLine) + switch (peek(in)) { + case ' ': + case '\t': // line is continued + in.read(); + continue; + } + } + return line.length(); // else complete + default: + line.append((char) c); + } + } + throw new EOFException(); + } + + private static int peek(PushbackInputStream in) throws IOException { + int value = in.read(); + in.unread(value); + return value; + } + + public URL getUrl() { + return url; + } + + public String getHeader(String name) { + return headers.get(name); + } + + public Metadata getHeaders() { + return headers; + } + + public byte[] getContent() { + return content; + } + + @Override + public int getCode() { + // TODO Auto-generated method stub + return code; + } +} http://git-wip-us.apache.org/repos/asf/nutch/blob/fa334722/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html ---------------------------------------------------------------------- diff --git a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html b/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html new file mode 100644 index 0000000..34d1d1c --- /dev/null +++ b/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html @@ -0,0 +1,5 @@ 
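+<html> +<body> +<p>Protocol plugin which retrieves documents via the http protocol. Responses whose Content-Type is HTML or XHTML are fetched through HtmlUnit; all other content is read directly from the socket.</p><p></p> +</body> +</html>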
