http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/lib-selenium/src/main/java/org/apache/nutch/protocol/selenium/HttpWebClient.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/lib-selenium/src/main/java/org/apache/nutch/protocol/selenium/HttpWebClient.java b/nutch-plugins/lib-selenium/src/main/java/org/apache/nutch/protocol/selenium/HttpWebClient.java new file mode 100644 index 0000000..86692ae --- /dev/null +++ b/nutch-plugins/lib-selenium/src/main/java/org/apache/nutch/protocol/selenium/HttpWebClient.java @@ -0,0 +1,236 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
package org.apache.nutch.protocol.selenium;

import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.concurrent.TimeUnit;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.openqa.selenium.By;
import org.openqa.selenium.OutputType;
import org.openqa.selenium.TakesScreenshot;
import org.openqa.selenium.TimeoutException;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.firefox.FirefoxBinary;
import org.openqa.selenium.firefox.FirefoxDriver;
import org.openqa.selenium.firefox.FirefoxProfile;
import org.openqa.selenium.io.TemporaryFilesystem;
import org.openqa.selenium.remote.DesiredCapabilities;
import org.openqa.selenium.remote.RemoteWebDriver;
import org.openqa.selenium.safari.SafariDriver;
import org.openqa.selenium.phantomjs.PhantomJSDriver;
import org.openqa.selenium.phantomjs.PhantomJSDriverService;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.opera.core.systems.OperaDriver;

/**
 * Static helpers that drive a Selenium {@link WebDriver} to fetch a URL and
 * return the rendered inner HTML of its <code>body</code> element, optionally
 * saving a screenshot of the page first.
 */
public class HttpWebClient {

  private static final Logger LOG = LoggerFactory.getLogger(HttpWebClient.class);

  /**
   * Per-thread Firefox driver whose profile blocks stylesheets and images and
   * disables the Flash plugin.
   */
  public static ThreadLocal<WebDriver> threadWebDriver = new ThreadLocal<WebDriver>() {

    @Override
    protected WebDriver initialValue() {
      FirefoxProfile profile = new FirefoxProfile();
      profile.setPreference("permissions.default.stylesheet", 2);
      profile.setPreference("permissions.default.image", 2);
      // Written as a boolean (it used to be the String "false") so that it
      // matches how getDriverForPage() sets the same preference.
      profile.setPreference("dom.ipc.plugins.enabled.libflashplayer.so", false);
      profile.setPreference(FirefoxProfile.ALLOWED_HOSTS_PREFERENCE, "localhost");
      return new FirefoxDriver(profile);
    }
  };

  /**
   * Creates the driver selected by the <code>selenium.driver</code> property
   * (default <code>firefox</code>) and loads <code>url</code> with it.
   *
   * @param url the URL to load
   * @param conf the configuration to read the <code>selenium.*</code> and
   *          <code>page.load.delay</code> properties from; must not be null
   * @return the driver after the page load; if the load timed out, the driver
   *         is returned with whatever had been loaded at that point
   * @throws RuntimeException wrapping any other failure; the driver is
   *           cleaned up before the exception is thrown
   */
  public static WebDriver getDriverForPage(String url, Configuration conf) {
    WebDriver driver = null;
    long pageLoadWait = conf.getLong("page.load.delay", 3);

    try {
      String driverType = conf.get("selenium.driver", "firefox");
      switch (driverType) {
      case "firefox":
        driver = createFirefoxDriver(conf);
        break;
      case "chrome":
        driver = new ChromeDriver();
        break;
      case "safari":
        driver = new SafariDriver();
        break;
      case "opera":
        driver = new OperaDriver();
        break;
      case "phantomjs":
        driver = new PhantomJSDriver();
        break;
      case "remote":
        driver = createRemoteDriver(conf);
        break;
      default:
        LOG.error("The Selenium WebDriver choice {} is not available... defaulting to FirefoxDriver().", driverType);
        driver = new FirefoxDriver();
        break;
      }
      LOG.debug("Selenium {} WebDriver selected.", driverType);

      driver.manage().timeouts().pageLoadTimeout(pageLoadWait, TimeUnit.SECONDS);
      driver.get(url);
    } catch (TimeoutException e) {
      // A page-load timeout is not fatal: hand back the partially loaded page.
      LOG.debug("Selenium WebDriver: Timeout Exception: Capturing whatever loaded so far...");
      return driver;
    } catch (Exception e) {
      cleanUpDriver(driver);
      throw new RuntimeException(e);
    }

    return driver;
  }

  /** Builds a local Firefox driver from the <code>selenium.firefox.*</code> properties. */
  private static WebDriver createFirefoxDriver(Configuration conf) {
    String allowedHost = conf.get("selenium.firefox.allowed.hosts", "localhost");
    long firefoxBinaryTimeout = conf.getLong("selenium.firefox.binary.timeout", 45);
    boolean enableFlashPlayer = conf.getBoolean("selenium.firefox.enable.flash", false);
    int loadImage = conf.getInt("selenium.firefox.load.image", 1);
    int loadStylesheet = conf.getInt("selenium.firefox.load.stylesheet", 1);

    FirefoxProfile profile = new FirefoxProfile();
    profile.setPreference(FirefoxProfile.ALLOWED_HOSTS_PREFERENCE, allowedHost);
    profile.setPreference("dom.ipc.plugins.enabled.libflashplayer.so", enableFlashPlayer);
    profile.setPreference("permissions.default.stylesheet", loadStylesheet);
    profile.setPreference("permissions.default.image", loadImage);

    FirefoxBinary binary = new FirefoxBinary();
    binary.setTimeout(TimeUnit.SECONDS.toMillis(firefoxBinaryTimeout));
    return new FirefoxDriver(binary, profile);
  }

  /**
   * Builds a {@link RemoteWebDriver} for the hub described by the
   * <code>selenium.hub.*</code> properties, using the browser named by
   * <code>selenium.grid.driver</code>.
   */
  private static WebDriver createRemoteDriver(Configuration conf) throws MalformedURLException {
    String seleniumHubHost = conf.get("selenium.hub.host", "localhost");
    int seleniumHubPort = Integer.parseInt(conf.get("selenium.hub.port", "4444"));
    String seleniumHubPath = conf.get("selenium.hub.path", "/wd/hub");
    String seleniumHubProtocol = conf.get("selenium.hub.protocol", "http");
    String seleniumGridDriver = conf.get("selenium.grid.driver", "firefox");
    String seleniumGridBinary = conf.get("selenium.grid.binary");

    DesiredCapabilities capabilities;
    switch (seleniumGridDriver) {
    case "firefox":
      capabilities = DesiredCapabilities.firefox();
      capabilities.setBrowserName("firefox");
      capabilities.setJavascriptEnabled(true);
      capabilities.setCapability("firefox_binary", seleniumGridBinary);
      System.setProperty("webdriver.reap_profile", "false");
      break;
    case "phantomjs":
      capabilities = DesiredCapabilities.phantomjs();
      capabilities.setBrowserName("phantomjs");
      capabilities.setJavascriptEnabled(true);
      capabilities.setCapability(PhantomJSDriverService.PHANTOMJS_EXECUTABLE_PATH_PROPERTY, seleniumGridBinary);
      break;
    default:
      // Report the unsupported grid browser itself; the outer driver type is
      // always "remote" on this path and would be useless in the message.
      LOG.error("The Selenium Grid WebDriver choice {} is not available... defaulting to FirefoxDriver().", seleniumGridDriver);
      capabilities = DesiredCapabilities.firefox();
      break;
    }

    URL hubUrl = new URL(seleniumHubProtocol, seleniumHubHost, seleniumHubPort, seleniumHubPath);
    return new RemoteWebDriver(hubUrl, capabilities);
  }

  /**
   * Returns the inner HTML of the <code>body</code> element of the page the
   * driver currently shows, taking a screenshot first when
   * <code>take.screenshot</code> is true.
   *
   * @param driver a driver that has already loaded a page
   * @param conf the configuration holding the screenshot properties
   * @return the inner HTML of the page body
   */
  public static String getHTMLContent(WebDriver driver, Configuration conf) {
    if (conf.getBoolean("take.screenshot", false)) {
      takeScreenshot(driver, conf);
    }

    return driver.findElement(By.tagName("body")).getAttribute("innerHTML");
  }

  /**
   * Closes the current window, quits the driver and deletes Selenium's
   * temporary files. Does nothing when <code>driver</code> is null.
   *
   * @param driver the driver to shut down, may be null
   * @throws RuntimeException wrapping any failure raised during shutdown
   */
  public static void cleanUpDriver(WebDriver driver) {
    if (driver == null) {
      return;
    }
    try {
      try {
        driver.close();
      } finally {
        // Always attempt quit(), even when close() failed; otherwise the
        // browser process would be left running.
        driver.quit();
      }
      TemporaryFilesystem.getDefaultTmpFS().deleteTemporaryFiles();
    } catch (Exception e) {
      throw new RuntimeException(e);
    }
  }

  /**
   * Function for obtaining the HTML BODY using the selected
   * {@link org.openqa.selenium.WebDriver}.
   * There are a number of configuration properties within
   * <code>nutch-site.xml</code> which determine whether to
   * take screenshots of the rendered pages and persist them
   * as timestamped .png's into HDFS.
   * @param url the URL to fetch and render
   * @param conf the {@link org.apache.hadoop.conf.Configuration}
   * @return the rendered inner HTML page
   */
  public static String getHtmlPage(String url, Configuration conf) {
    WebDriver driver = getDriverForPage(url, conf);

    try {
      return getHTMLContent(driver, conf);
      // Broad catch kept from the original, which borrowed it from lib-htmlunit.
    } catch (Exception e) {
      TemporaryFilesystem.getDefaultTmpFS().deleteTemporaryFiles();
      throw new RuntimeException(e);
    } finally {
      cleanUpDriver(driver);
    }
  }

  /**
   * Fetches and renders <code>url</code> with a default
   * {@link org.apache.hadoop.conf.Configuration}.
   *
   * @param url the URL to fetch and render
   * @return the rendered inner HTML page
   */
  public static String getHtmlPage(String url) {
    // A null configuration used to be passed here and was dereferenced
    // immediately, so this overload could never succeed.
    return getHtmlPage(url, new Configuration());
  }

  /**
   * Takes a screenshot of the current page and copies it to the directory
   * named by <code>screenshot.location</code>. When that property is unset
   * the screenshot is dropped with a warning; a file that already exists at
   * the target path is left untouched.
   */
  private static void takeScreenshot(WebDriver driver, Configuration conf) {
    try {
      String url = driver.getCurrentUrl();
      File srcFile = ((TakesScreenshot) driver).getScreenshotAs(OutputType.FILE);
      LOG.debug("In-memory screenshot taken of: {}", url);

      String location = conf.get("screenshot.location");
      if (location == null) {
        LOG.warn("Screenshot for {} not saved to HDFS (subsequently disgarded) as value for "
            + "'screenshot.location' is absent from nutch-site.xml.", url);
        return;
      }

      FileSystem fs = FileSystem.get(conf);
      Path screenshotPath = new Path(location + "/" + srcFile.getName());
      if (fs.exists(screenshotPath)) {
        // Previously this path copied into a null OutputStream and failed
        // with a NullPointerException.
        LOG.debug("Screenshot {} already exists; leaving it untouched.", screenshotPath);
        return;
      }

      LOG.debug("No existing screenshot already exists... creating new file at {} {}.", screenshotPath, srcFile.getName());
      try (InputStream is = new BufferedInputStream(new FileInputStream(srcFile));
          OutputStream os = fs.create(screenshotPath)) {
        // close=false: try-with-resources owns both streams.
        IOUtils.copyBytes(is, os, conf, false);
      }
      LOG.debug("Screenshot for {} successfully saved to: {} {}", url, screenshotPath, srcFile.getName());
    } catch (Exception e) {
      cleanUpDriver(driver);
      throw new RuntimeException(e);
    }
  }
}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/lib-xml/build.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/lib-xml/build.xml b/nutch-plugins/lib-xml/build.xml new file mode 100644 index 0000000..0f87c07 --- /dev/null +++ b/nutch-plugins/lib-xml/build.xml @@ -0,0 +1,36 @@ +<?xml version="1.0"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<project name="lib-xml" default="jar"> + + <import file="../build-plugin.xml" /> + + <!-- + ! Override the compile and jar targets, + ! since there is nothing to compile here. + ! 
--> + <target name="compile" depends="init, resolve-default" /> + + <!-- + <target name="jar" depends="compile"> + <copy todir="${build.dir}" verbose="true"> + <fileset dir="./lib" includes="**/*.jar" /> + </copy> + </target> + --> + +</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/lib-xml/ivy.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/lib-xml/ivy.xml b/nutch-plugins/lib-xml/ivy.xml new file mode 100644 index 0000000..414f38a --- /dev/null +++ b/nutch-plugins/lib-xml/ivy.xml @@ -0,0 +1,44 @@ +<?xml version="1.0" ?> + +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> + +<ivy-module version="1.0"> + <info organisation="org.apache.nutch" module="${ant.project.name}"> + <license name="Apache 2.0"/> + <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/> + <description> + Apache Nutch + </description> + </info> + + <configurations> + <include file="../../..//ivy/ivy-configurations.xml"/> + </configurations> + + <publications> + <!--get the artifact from our module name--> + <artifact conf="master"/> + </publications> + + <dependencies> + <dependency org="org.jdom" name="jdom" rev="1.1" conf="*->default"/> + <dependency org="jaxen" name="jaxen" rev="1.1.1" conf="*->master"/> + <dependency org="xerces" name="xercesImpl" rev="2.11.0" conf="*->master"/> + </dependencies> + +</ivy-module> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/lib-xml/plugin.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/lib-xml/plugin.xml b/nutch-plugins/lib-xml/plugin.xml new file mode 100644 index 0000000..79bd17f --- /dev/null +++ b/nutch-plugins/lib-xml/plugin.xml @@ -0,0 +1,65 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<!-- + ! XML library - Gathers many XML related libraries: + ! + ! 
* Jaxen + ! - Download : http://jaxen.org/releases.html + ! - License : http://jaxen.org/license.html + ! + ! * Xerces-J 2.6.1 + ! - Download : http://xerces.apache.org/xerces2-j/download.cgi + ! - License : http://www.apache.org/licenses/LICENSE-2.0 + ! + ! * SAXPath 1.0 FCS + ! - Note : SAXPath has been incorporated into Jaxen. + ! It has been merged into the Jaxen codebase + ! and is no longer being maintained separately + ! - Download : http://sourceforge.net/project/showfiles.php?group_id=26014 + ! - License : OSI-Approved Open Source + ! + ! * jdom 1.0 beta8-dev + ! - Download : http://www.jdom.org/downloads/index.html + ! - License : http://www.jdom.org/docs/faq.html#a0030 + ! + !--> +<plugin + id="lib-xml" + name="XML Libraries" + version="1.0" + provider-name="org.apache.nutch.xml"> + + <runtime> + <library name="jaxen-core.jar"> + <export name="*"/> + </library> + <library name="jaxen-jdom.jar"> + <export name="*"/> + </library> + <library name="xercesImpl.jar"> + <export name="*"/> + </library> + <library name="saxpath.jar"> + <export name="*"/> + </library> + <library name="jdom.jar"> + <export name="*"/> + </library> + </runtime> + +</plugin> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/lib-xml/pom.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/lib-xml/pom.xml b/nutch-plugins/lib-xml/pom.xml new file mode 100644 index 0000000..132d0f2 --- /dev/null +++ b/nutch-plugins/lib-xml/pom.xml @@ -0,0 +1,38 @@ +<!-- + ~ Licensed to the Apache Software Foundation (ASF) under one or more + ~ contributor license agreements. See the NOTICE file distributed with + ~ this work for additional information regarding copyright ownership. + ~ The ASF licenses this file to You under the Apache License, Version 2.0 + ~ (the "License"); you may not use this file except in compliance with + ~ the License. 
You may obtain a copy of the License at + ~ + ~ http://www.apache.org/licenses/LICENSE-2.0 + ~ + ~ Unless required by applicable law or agreed to in writing, software + ~ distributed under the License is distributed on an "AS IS" BASIS, + ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ~ See the License for the specific language governing permissions and + ~ limitations under the License. + --> + +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + + <parent> + <groupId>org.apache.nutch</groupId> + <artifactId>nutch-plugins</artifactId> + <version>1.13-SNAPSHOT</version> + <relativePath>../pom.xml</relativePath> + </parent> + <artifactId>lib-xml</artifactId> + <packaging>jar</packaging> + + <name>lib-xml</name> + <url>http://nutch.apache.org</url> + + <properties> + <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> + </properties> + +</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/microformats-reltag/build.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/microformats-reltag/build.xml b/nutch-plugins/microformats-reltag/build.xml new file mode 100644 index 0000000..395afee --- /dev/null +++ b/nutch-plugins/microformats-reltag/build.xml @@ -0,0 +1,27 @@ +<?xml version="1.0"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. 
You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<project name="microformats-reltag" default="jar-core"> + + <import file="../build-plugin.xml"/> + + <!-- Deploy Unit test dependencies --> + <target name="deps-test"> + <ant target="deploy" inheritall="false" dir="../nutch-extensionpoints"/> + </target> + +</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/microformats-reltag/ivy.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/microformats-reltag/ivy.xml b/nutch-plugins/microformats-reltag/ivy.xml new file mode 100644 index 0000000..1a86d68 --- /dev/null +++ b/nutch-plugins/microformats-reltag/ivy.xml @@ -0,0 +1,41 @@ +<?xml version="1.0" ?> + +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> + +<ivy-module version="1.0"> + <info organisation="org.apache.nutch" module="${ant.project.name}"> + <license name="Apache 2.0"/> + <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/> + <description> + Apache Nutch + </description> + </info> + + <configurations> + <include file="../../..//ivy/ivy-configurations.xml"/> + </configurations> + + <publications> + <!--get the artifact from our module name--> + <artifact conf="master"/> + </publications> + + <dependencies> + </dependencies> + +</ivy-module> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/microformats-reltag/plugin.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/microformats-reltag/plugin.xml b/nutch-plugins/microformats-reltag/plugin.xml new file mode 100644 index 0000000..b35e1f4 --- /dev/null +++ b/nutch-plugins/microformats-reltag/plugin.xml @@ -0,0 +1,49 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> +<plugin + id="microformats-reltag" + name="Rel-Tag microformat Parser/Indexer/Querier" + version="1.0.0" + provider-name="nutch.org"> + + <runtime> + <library name="microformats-reltag.jar"> + <export name="*"/> + </library> + </runtime> + + <requires> + <import plugin="nutch-extensionpoints"/> + </requires> + + <extension id="org.apache.nutch.microformats.reltag.RelTagParser" + name="Rel-Tag parser" + point="org.apache.nutch.parse.HtmlParseFilter"> + <implementation id="RelTagParser" + class="org.apache.nutch.microformats.reltag.RelTagParser"/> + </extension> + + <extension id="org.apache.nutch.microformats.reltag.RelTagIndexingFilter" + name="Rel-Tag indexing filter" + point="org.apache.nutch.indexer.IndexingFilter"> + <implementation id="RelTagIndexingFilter" + class="org.apache.nutch.microformats.reltag.RelTagIndexingFilter"/> + </extension> + +</plugin> + http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/microformats-reltag/pom.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/microformats-reltag/pom.xml b/nutch-plugins/microformats-reltag/pom.xml new file mode 100644 index 0000000..8579cb5 --- /dev/null +++ b/nutch-plugins/microformats-reltag/pom.xml @@ -0,0 +1,38 @@ +<!-- + ~ Licensed to the Apache Software Foundation (ASF) under one or more + ~ contributor license agreements. See the NOTICE file distributed with + ~ this work for additional information regarding copyright ownership. + ~ The ASF licenses this file to You under the Apache License, Version 2.0 + ~ (the "License"); you may not use this file except in compliance with + ~ the License. You may obtain a copy of the License at + ~ + ~ http://www.apache.org/licenses/LICENSE-2.0 + ~ + ~ Unless required by applicable law or agreed to in writing, software + ~ distributed under the License is distributed on an "AS IS" BASIS, + ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ ~ See the License for the specific language governing permissions and + ~ limitations under the License. + --> + +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + + <parent> + <groupId>org.apache.nutch</groupId> + <artifactId>nutch-plugins</artifactId> + <version>1.13-SNAPSHOT</version> + <relativePath>../pom.xml</relativePath> + </parent> + <artifactId>microformats-reltag</artifactId> + <packaging>jar</packaging> + + <name>microformats-reltag</name> + <url>http://nutch.apache.org</url> + + <properties> + <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> + </properties> + +</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/microformats-reltag/src/main/java/org/apache/nutch/microformats/reltag/RelTagIndexingFilter.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/microformats-reltag/src/main/java/org/apache/nutch/microformats/reltag/RelTagIndexingFilter.java b/nutch-plugins/microformats-reltag/src/main/java/org/apache/nutch/microformats/reltag/RelTagIndexingFilter.java new file mode 100644 index 0000000..e50a150 --- /dev/null +++ b/nutch-plugins/microformats-reltag/src/main/java/org/apache/nutch/microformats/reltag/RelTagIndexingFilter.java @@ -0,0 +1,77 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */
package org.apache.nutch.microformats.reltag;

// Nutch imports
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.Inlinks;
import org.apache.nutch.indexer.IndexingFilter;
import org.apache.nutch.indexer.IndexingException;
import org.apache.nutch.indexer.NutchDocument;
import org.apache.hadoop.io.Text;
import org.apache.nutch.parse.Parse;

// Hadoop imports
import org.apache.hadoop.conf.Configuration;

/**
 * An {@link org.apache.nutch.indexer.IndexingFilter} that copies every
 * {@link RelTagParser#REL_TAG} value found in the parse metadata into a
 * <code>tag</code> field of the document.
 *
 * @see <a href="http://www.microformats.org/wiki/rel-tag">
 *      http://www.microformats.org/wiki/rel-tag</a>
 * @author Jérôme Charron
 */
public class RelTagIndexingFilter implements IndexingFilter {

  private Configuration conf;

  /**
   * Adds one <code>tag</code> field to <code>doc</code> for each Rel-Tag
   * value stored in the parse metadata.
   *
   * @return the same <code>doc</code> instance, unchanged when the metadata
   *         holds no Rel-Tag values
   */
  public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
      CrawlDatum datum, Inlinks inlinks) throws IndexingException {

    // Rel-Tag values, if any, are looked up under RelTagParser.REL_TAG.
    String[] relTags = parse.getData().getParseMeta()
        .getValues(RelTagParser.REL_TAG);
    if (relTags == null) {
      return doc;
    }

    for (String relTag : relTags) {
      doc.add("tag", relTag);
    }
    return doc;
  }

  // ---- Configurable implementation ----

  public void setConf(Configuration conf) {
    this.conf = conf;
  }

  public Configuration getConf() {
    return this.conf;
  }

}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/microformats-reltag/src/main/java/org/apache/nutch/microformats/reltag/RelTagParser.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/microformats-reltag/src/main/java/org/apache/nutch/microformats/reltag/RelTagParser.java b/nutch-plugins/microformats-reltag/src/main/java/org/apache/nutch/microformats/reltag/RelTagParser.java new file mode 100644 index 0000000..9176a1e --- /dev/null +++ b/nutch-plugins/microformats-reltag/src/main/java/org/apache/nutch/microformats/reltag/RelTagParser.java @@ -0,0 +1,148 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */
package org.apache.nutch.microformats.reltag;

// JDK imports
import java.net.URL;
import java.net.URLDecoder;
import java.util.Iterator;
import java.util.Set;
import java.util.TreeSet;
import java.util.regex.Pattern;
import org.w3c.dom.DocumentFragment;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;

// SLF4J logging imports
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

// Nutch imports
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.parse.HTMLMetaTags;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseResult;
import org.apache.nutch.parse.HtmlParseFilter;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.util.StringUtil;

// Hadoop imports
import org.apache.hadoop.conf.Configuration;

/**
 * Adds microformat rel-tags of document if found.
 *
 * @see <a href="http://www.microformats.org/wiki/rel-tag">
 *      http://www.microformats.org/wiki/rel-tag</a>
 */
public class RelTagParser implements HtmlParseFilter {

  public final static Logger LOG = LoggerFactory.getLogger(RelTagParser.class);

  public final static String REL_TAG = "Rel-Tag";

  /** Splits a <code>rel</code> attribute value into its individual tokens. */
  private static final Pattern WHITESPACE = Pattern.compile("\\s+");

  private Configuration conf = null;

  /**
   * Scans the HTML document for rel-tag links and adds each tag found to the
   * parse metadata under {@link #REL_TAG}.
   *
   * @return <code>parseResult</code>, left unchanged when it holds no parse
   *         for the content's URL
   */
  public ParseResult filter(Content content, ParseResult parseResult,
      HTMLMetaTags metaTags, DocumentFragment doc) {

    Parse parse = parseResult.get(content.getUrl());
    if (parse == null) {
      // No parse recorded for this URL: nothing to attach the tags to.
      return parseResult;
    }

    Metadata metadata = parse.getData().getParseMeta();
    for (String tag : new Parser(doc).getRelTags()) {
      metadata.add(REL_TAG, tag);
    }

    return parseResult;
  }

  /** Walks a DOM tree and collects the distinct rel-tag values in sorted order. */
  private static class Parser {

    Set<String> tags = null;

    Parser(Node node) {
      tags = new TreeSet<String>();
      parse(node);
    }

    Set<String> getRelTags() {
      return tags;
    }

    /** Records the tag of <code>node</code> if it is a rel-tag link, then recurses into its children. */
    void parse(Node node) {

      if (node.getNodeType() == Node.ELEMENT_NODE
          && "a".equalsIgnoreCase(node.getNodeName())) {
        NamedNodeMap attrs = node.getAttributes();
        Node hrefNode = attrs.getNamedItem("href");
        Node relNode = attrs.getNamedItem("rel");
        // A rel-tag link needs both attributes, with "tag" among the rel values.
        if (hrefNode != null && relNode != null
            && hasTagRel(relNode.getNodeValue())) {
          String tag = parseTag(hrefNode.getNodeValue());
          // Set.add() returns false for a duplicate, so log only new tags.
          if (!StringUtil.isEmpty(tag) && tags.add(tag)) {
            LOG.debug("Adding tag: {} to tag set.", tag);
          }
        }
      }

      // Recurse
      NodeList children = node.getChildNodes();
      for (int i = 0; children != null && i < children.getLength(); i++) {
        parse(children.item(i));
      }
    }

    /**
     * Tells whether a <code>rel</code> attribute value contains the
     * <code>tag</code> link type. The value is treated as a
     * whitespace-separated list, so <code>rel="tag nofollow"</code> matches
     * as well as <code>rel="tag"</code>.
     */
    private static boolean hasTagRel(String rel) {
      if (rel == null) {
        return false;
      }
      for (String token : WHITESPACE.split(rel.trim())) {
        if ("tag".equalsIgnoreCase(token)) {
          return true;
        }
      }
      return false;
    }

    /**
     * Extracts the tag from a link target: the URL-decoded text after the
     * last <code>/</code> of the URL path.
     *
     * @return the tag, or null when <code>url</code> cannot be parsed or
     *         decoded
     */
    private final static String parseTag(String url) {
      String tag = null;
      try {
        URL u = new URL(url);
        String path = u.getPath();
        tag = URLDecoder.decode(path.substring(path.lastIndexOf('/') + 1),
            "UTF-8");
      } catch (Exception e) {
        // Malformed tag: deliberately skipped rather than failing the parse.
        tag = null;
      }
      return tag;
    }

  }

  public void setConf(Configuration conf) {
    this.conf = conf;
  }

  public Configuration getConf() {
    return this.conf;
  }
}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/microformats-reltag/src/main/java/org/apache/nutch/microformats/reltag/package.html ---------------------------------------------------------------------- diff --git a/nutch-plugins/microformats-reltag/src/main/java/org/apache/nutch/microformats/reltag/package.html b/nutch-plugins/microformats-reltag/src/main/java/org/apache/nutch/microformats/reltag/package.html new file mode 100644 index 0000000..bef5409 --- /dev/null +++ b/nutch-plugins/microformats-reltag/src/main/java/org/apache/nutch/microformats/reltag/package.html @@ -0,0 +1,8 @@ +<html> +<body> +<p> +A microformats <a href="http://www.microformats.org/wiki/Rel-Tag">Rel-Tag</a> +Parser/Indexer/Querier plugin. +</p> +</body> +</html> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/mimetype-filter/build.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/mimetype-filter/build.xml b/nutch-plugins/mimetype-filter/build.xml new file mode 100644 index 0000000..977e643 --- /dev/null +++ b/nutch-plugins/mimetype-filter/build.xml @@ -0,0 +1,28 @@ +<?xml version="1.0"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License.
You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<project name="mimetype-filter" default="jar-core"> + + <import file="../build-plugin.xml" /> + + <!-- for junit test --> + <mkdir dir="${build.test}/data"/> + <copy todir="${build.test}/data"> + <fileset dir="sample" includes="**/*.txt"/> + </copy> + +</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/mimetype-filter/ivy.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/mimetype-filter/ivy.xml b/nutch-plugins/mimetype-filter/ivy.xml new file mode 100644 index 0000000..0a363f7 --- /dev/null +++ b/nutch-plugins/mimetype-filter/ivy.xml @@ -0,0 +1,41 @@ +<?xml version="1.0" ?> + +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> + +<ivy-module version="1.0"> + <info organisation="org.apache.nutch" module="${ant.project.name}"> + <license name="Apache 2.0"/> + <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/> + <description> + Apache Nutch + </description> + </info> + + <configurations> + <include file="${nutch.root}/ivy/ivy-configurations.xml"/> + </configurations> + + <publications> + <!--get the artifact from our module name--> + <artifact conf="master"/> + </publications> + + <dependencies> + </dependencies> + +</ivy-module> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/mimetype-filter/plugin.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/mimetype-filter/plugin.xml b/nutch-plugins/mimetype-filter/plugin.xml new file mode 100644 index 0000000..d038447 --- /dev/null +++ b/nutch-plugins/mimetype-filter/plugin.xml @@ -0,0 +1,37 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> +<plugin + id="mimetype-filter" + name="Filter indexed documents by the detected MIME" + version="1.0.0" + provider-name="nutch.org"> + + <runtime> + <library name="mimetype-filter.jar"> + <export name="*"/> + </library> + </runtime> + + <extension id="org.apache.nutch.indexer.filter" + name="Nutch MIME filter" + point="org.apache.nutch.indexer.IndexingFilter"> + <implementation id="MimeTypeIndexingFilter" + class="org.apache.nutch.indexer.filter.MimeTypeIndexingFilter"/> + </extension> + +</plugin> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/mimetype-filter/pom.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/mimetype-filter/pom.xml b/nutch-plugins/mimetype-filter/pom.xml new file mode 100644 index 0000000..29c0798 --- /dev/null +++ b/nutch-plugins/mimetype-filter/pom.xml @@ -0,0 +1,38 @@ +<!-- + ~ Licensed to the Apache Software Foundation (ASF) under one or more + ~ contributor license agreements. See the NOTICE file distributed with + ~ this work for additional information regarding copyright ownership. + ~ The ASF licenses this file to You under the Apache License, Version 2.0 + ~ (the "License"); you may not use this file except in compliance with + ~ the License. You may obtain a copy of the License at + ~ + ~ http://www.apache.org/licenses/LICENSE-2.0 + ~ + ~ Unless required by applicable law or agreed to in writing, software + ~ distributed under the License is distributed on an "AS IS" BASIS, + ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ~ See the License for the specific language governing permissions and + ~ limitations under the License. 
+ --> + +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + + <parent> + <groupId>org.apache.nutch</groupId> + <artifactId>nutch-plugins</artifactId> + <version>1.13-SNAPSHOT</version> + <relativePath>../pom.xml</relativePath> + </parent> + <artifactId>mimetype-filter</artifactId> + <packaging>jar</packaging> + + <name>mimetype-filter</name> + <url>http://nutch.apache.org</url> + + <properties> + <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> + </properties> + +</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/mimetype-filter/sample/allow-images.txt ---------------------------------------------------------------------- diff --git a/nutch-plugins/mimetype-filter/sample/allow-images.txt b/nutch-plugins/mimetype-filter/sample/allow-images.txt new file mode 100644 index 0000000..0f5f136 --- /dev/null +++ b/nutch-plugins/mimetype-filter/sample/allow-images.txt @@ -0,0 +1,34 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# This filter can be configured to work in one of two modes (similar to +# suffix-url-filter) + +# default to reject ('-'): in this mode, only documents with a mimetype that +# match the ones specified in the config file will be accepted, all other +# mimetypes will be rejected. + +# default to accept ('+'): in this mode, only documents with a mimetype +# that match the ones specified in the config file will be rejected, +# all other mimetypes will be accepted. + +# The format of this config file is one mimetype per line, with no preceding +# whitespace. The order in which mimetypes are specified doesn't matter. Blank +# lines and comments (#) are allowed. +# + +- + +image http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/mimetype-filter/sample/block-html.txt ---------------------------------------------------------------------- diff --git a/nutch-plugins/mimetype-filter/sample/block-html.txt b/nutch-plugins/mimetype-filter/sample/block-html.txt new file mode 100644 index 0000000..69600ec --- /dev/null +++ b/nutch-plugins/mimetype-filter/sample/block-html.txt @@ -0,0 +1,34 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+ +# This filter can be configured to work in one of two modes (similar to +# suffix-url-filter) + +# default to reject ('-'): in this mode, only documents with a mimetype that +# match the ones specified in the config file will be accepted, all other +# mimetypes will be rejected. + +# default to accept ('+'): in this mode, only documents with a mimetype +# that match the ones specified in the config file will be rejected, +# all other mimetypes will be accepted. + +# The format of this config file is one mimetype per line, with no preceding +# whitespace. The order in which mimetypes are specified doesn't matter. Blank +# lines and comments (#) are allowed. +# + ++ + +text/html \ No newline at end of file http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/mimetype-filter/src/main/java/org/apache/nutch/indexer/filter/MimeTypeIndexingFilter.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/mimetype-filter/src/main/java/org/apache/nutch/indexer/filter/MimeTypeIndexingFilter.java b/nutch-plugins/mimetype-filter/src/main/java/org/apache/nutch/indexer/filter/MimeTypeIndexingFilter.java new file mode 100644 index 0000000..494d888 --- /dev/null +++ b/nutch-plugins/mimetype-filter/src/main/java/org/apache/nutch/indexer/filter/MimeTypeIndexingFilter.java @@ -0,0 +1,273 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License.
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.indexer.filter; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.commons.cli.Option; +import org.apache.commons.cli.Options; +import org.apache.commons.cli.OptionBuilder; +import org.apache.commons.cli.CommandLine; +import org.apache.commons.cli.CommandLineParser; +import org.apache.commons.cli.HelpFormatter; +import org.apache.commons.cli.GnuParser; +import org.apache.commons.cli.UnrecognizedOptionException; + +// Nutch imports +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.util.StringUtils; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.Writable; + +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.crawl.Inlinks; + +import org.apache.nutch.indexer.IndexingException; +import org.apache.nutch.indexer.IndexingFilter; +import org.apache.nutch.indexer.NutchDocument; + +import org.apache.nutch.net.protocols.Response; + +import org.apache.nutch.parse.Outlink; +import org.apache.nutch.parse.Parse; +import org.apache.nutch.parse.ParseData; +import org.apache.nutch.parse.ParseImpl; +import org.apache.nutch.parse.ParseStatus; + +import org.apache.nutch.metadata.Metadata; + +import org.apache.nutch.util.MimeUtil; +import org.apache.nutch.util.NutchConfiguration; +import org.apache.nutch.util.PrefixStringMatcher; +import org.apache.nutch.util.TrieStringMatcher; +import org.apache.tika.Tika; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStreamReader; +import java.io.Reader; +import java.util.ArrayList; 
+import java.util.List; + +/** + * An {@link org.apache.nutch.indexer.IndexingFilter} that allows filtering + * of documents based on the MIME Type detected by Tika + * + */ +public class MimeTypeIndexingFilter implements IndexingFilter { + + public static final String MIMEFILTER_REGEX_FILE = "mimetype.filter.file"; + + private static final Logger LOG = LoggerFactory + .getLogger(MimeTypeIndexingFilter.class); + + private MimeUtil MIME; + private Tika tika = new Tika(); + + private TrieStringMatcher trie; + + private Configuration conf; + + private boolean acceptMode = true; + + // Inherited JavaDoc + @Override + public NutchDocument filter(NutchDocument doc, Parse parse, Text url, + CrawlDatum datum, Inlinks inlinks) throws IndexingException { + + String mimeType; + String contentType; + + Writable tcontentType = datum.getMetaData() + .get(new Text(Response.CONTENT_TYPE)); + + if (tcontentType != null) { + contentType = tcontentType.toString(); + } else { + contentType = parse.getData().getMeta(Response.CONTENT_TYPE); + } + + if (contentType == null) { + mimeType = tika.detect(url.toString()); + } else { + mimeType = MIME.forName(MimeUtil.cleanMimeType(contentType)); + } + + contentType = mimeType; + + if (LOG.isInfoEnabled()) { + LOG.info(String.format("[%s] %s", contentType, url)); + } + + if (trie != null) { + if (trie.shortestMatch(contentType) == null) { + // no match, but + if (acceptMode) { + return doc; + } + return null; + } else { + // matched, but we are blocking + if (acceptMode) { + return null; + } + } + } + + return doc; + } + + /* + * ----------------------------- + * <implementation:Configurable> * + * ----------------------------- + */ + @Override + public void setConf(Configuration conf) { + this.conf = conf; + MIME = new MimeUtil(conf); + + // load the file of the values + String file = conf.get(MIMEFILTER_REGEX_FILE, ""); + + if (file != null) { + if (file.isEmpty()) { + LOG.warn(String + .format("Missing %s property, ALL mimetypes will be 
allowed", + MIMEFILTER_REGEX_FILE)); + } else { + Reader reader = conf.getConfResourceAsReader(file); + + try { + readConfiguration(reader); + } catch (IOException e) { + if (LOG.isErrorEnabled()) { + LOG.error(e.getMessage()); + } + + throw new RuntimeException(e.getMessage(), e); + } + } + } + } + + private void readConfiguration(Reader reader) throws IOException { + BufferedReader in = new BufferedReader(reader); + String line; + List rules = new ArrayList(); + + while (null != (line = in.readLine())) { + if (line.length() == 0) { + continue; + } + + char first = line.charAt(0); + switch (first) { + case ' ': + case '\n': + case '#': // skip blank & comment lines + break; + case '+': + acceptMode = true; + break; + case '-': + acceptMode = false; + break; + default: + rules.add(line); + break; + } + } + + trie = new PrefixStringMatcher(rules); + } + + @Override + public Configuration getConf() { + return this.conf; + } + + /** + * Main method for invoking this tool + * + * @throws IOException, IndexingException + */ + public static void main(String[] args) throws IOException, IndexingException { + Option helpOpt = new Option("h", "help", false, "show this help message"); + Option rulesOpt = OptionBuilder.withArgName("file").hasArg() + .withDescription( + "Rules file to be used in the tests relative to the conf directory") + .isRequired().create("rules"); + + Options options = new Options(); + options.addOption(helpOpt).addOption(rulesOpt); + + CommandLineParser parser = new GnuParser(); + HelpFormatter formatter = new HelpFormatter(); + String rulesFile; + + try { + CommandLine line = parser.parse(options, args); + + if (line.hasOption("help") || !line.hasOption("rules")) { + formatter + .printHelp("org.apache.nutch.indexer.filter.MimeTypeIndexingFilter", + options, true); + return; + } + + rulesFile = line.getOptionValue("rules"); + } catch (UnrecognizedOptionException e) { + formatter + .printHelp("org.apache.nutch.indexer.filter.MimeTypeIndexingFilter", + 
options, true); + return; + } catch (Exception e) { + LOG.error(StringUtils.stringifyException(e)); + e.printStackTrace(); + return; + } + + MimeTypeIndexingFilter filter = new MimeTypeIndexingFilter(); + Configuration conf = NutchConfiguration.create(); + conf.set(MimeTypeIndexingFilter.MIMEFILTER_REGEX_FILE, rulesFile); + filter.setConf(conf); + + BufferedReader in = new BufferedReader(new InputStreamReader(System.in)); + String line; + + while ((line = in.readLine()) != null && !line.isEmpty()) { + Metadata metadata = new Metadata(); + metadata.set(Response.CONTENT_TYPE, line); + ParseImpl parse = new ParseImpl("text", + new ParseData(new ParseStatus(), "title", new Outlink[0], metadata)); + + NutchDocument doc = filter.filter(new NutchDocument(), parse, + new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks()); + + if (doc != null) { + System.out.print("+ "); + System.out.println(line); + } else { + System.out.print("- "); + System.out.println(line); + } + } + } +} http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/mimetype-filter/src/test/org/apache/nutch/indexer/filter/MimeTypeIndexingFilterTest.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/mimetype-filter/src/test/org/apache/nutch/indexer/filter/MimeTypeIndexingFilterTest.java b/nutch-plugins/mimetype-filter/src/test/org/apache/nutch/indexer/filter/MimeTypeIndexingFilterTest.java new file mode 100644 index 0000000..bca230f --- /dev/null +++ b/nutch-plugins/mimetype-filter/src/test/org/apache/nutch/indexer/filter/MimeTypeIndexingFilterTest.java @@ -0,0 +1,114 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.indexer.filter; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.Text; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.crawl.Inlinks; +import org.apache.nutch.indexer.NutchDocument; +import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.net.protocols.Response; +import org.apache.nutch.parse.Outlink; +import org.apache.nutch.parse.ParseData; +import org.apache.nutch.parse.ParseImpl; +import org.apache.nutch.parse.ParseStatus; +import org.apache.nutch.util.NutchConfiguration; + +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; + +/** + * JUnit based tests of class + * {@link org.apache.nutch.indexer.filter.MimeTypeIndexingFilter} + * + */ +public class MimeTypeIndexingFilterTest { + + private Configuration conf = NutchConfiguration.create(); + private MimeTypeIndexingFilter filter = new MimeTypeIndexingFilter(); + private String[] MIME_TYPES = { "text/html", "image/png", "application/pdf" }; + private ParseImpl[] parses = new ParseImpl[MIME_TYPES.length]; + private String sampleDir = System.getProperty("test.data", "."); + + @Before + public void setUp() throws Exception { + for (int i = 0; i < MIME_TYPES.length; i++) { + Metadata metadata = new Metadata(); + metadata.add(Response.CONTENT_TYPE, MIME_TYPES[i]); + + ParseImpl parse = new ParseImpl("text", + new ParseData(new 
ParseStatus(), "title", new Outlink[0], metadata)); + + parses[i] = parse; + } + } + + @Test + public void testMissingConfigFile() throws Exception { + String file = conf.get(MimeTypeIndexingFilter.MIMEFILTER_REGEX_FILE, ""); + Assert.assertEquals(String + .format("Property %s must not be present in the the configuration file", + MimeTypeIndexingFilter.MIMEFILTER_REGEX_FILE), "", file); + + filter.setConf(conf); + + // property not set so in this cases all documents must pass the filter + for (int i = 0; i < parses.length; i++) { + NutchDocument doc = filter.filter(new NutchDocument(), parses[i], + new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks()); + + Assert.assertNotNull("All documents must be allowed by default", doc); + } + } + + @Test + public void testAllowOnlyImages() throws Exception { + conf.set(MimeTypeIndexingFilter.MIMEFILTER_REGEX_FILE, "allow-images.txt"); + filter.setConf(conf); + + for (int i = 0; i < parses.length; i++) { + NutchDocument doc = filter.filter(new NutchDocument(), parses[i], + new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks()); + + if (MIME_TYPES[i].contains("image")) { + Assert.assertNotNull("Allow only images", doc); + } else { + Assert.assertNull("Block everything else", doc); + } + } + } + + @Test + public void testBlockHTML() throws Exception { + conf.set(MimeTypeIndexingFilter.MIMEFILTER_REGEX_FILE, "block-html.txt"); + filter.setConf(conf); + + for (int i = 0; i < parses.length; i++) { + NutchDocument doc = filter.filter(new NutchDocument(), parses[i], + new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks()); + + if (MIME_TYPES[i].contains("html")) { + Assert.assertNull("Block only HTML documents", doc); + } else { + Assert.assertNotNull("Allow everything else", doc); + } + } + } +} http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/nutch-extensionpoints/build.xml ---------------------------------------------------------------------- diff --git 
a/nutch-plugins/nutch-extensionpoints/build.xml b/nutch-plugins/nutch-extensionpoints/build.xml new file mode 100644 index 0000000..45eb815 --- /dev/null +++ b/nutch-plugins/nutch-extensionpoints/build.xml @@ -0,0 +1,30 @@ +<?xml version="1.0"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<project name="nutch-extensionpoints" default="jar"> + + <import file="../build-plugin.xml"/> + + <!-- + ! Override the compile and jar targets, + ! since there is nothing to compile here. + ! --> + <target name="compile" depends="init, resolve-default"/> + + <!--target name="jar" depends="compile"/--> + +</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/nutch-extensionpoints/ivy.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/nutch-extensionpoints/ivy.xml b/nutch-plugins/nutch-extensionpoints/ivy.xml new file mode 100644 index 0000000..1a86d68 --- /dev/null +++ b/nutch-plugins/nutch-extensionpoints/ivy.xml @@ -0,0 +1,41 @@ +<?xml version="1.0" ?> + +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. 
+ The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> + +<ivy-module version="1.0"> + <info organisation="org.apache.nutch" module="${ant.project.name}"> + <license name="Apache 2.0"/> + <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/> + <description> + Apache Nutch + </description> + </info> + + <configurations> + <include file="../../..//ivy/ivy-configurations.xml"/> + </configurations> + + <publications> + <!--get the artifact from our module name--> + <artifact conf="master"/> + </publications> + + <dependencies> + </dependencies> + +</ivy-module> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/nutch-extensionpoints/plugin.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/nutch-extensionpoints/plugin.xml b/nutch-plugins/nutch-extensionpoints/plugin.xml new file mode 100644 index 0000000..8cf7a23 --- /dev/null +++ b/nutch-plugins/nutch-extensionpoints/plugin.xml @@ -0,0 +1,67 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. 
You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<plugin + id="nutch-extensionpoints" + name="the nutch core extension points" + version="2.0.0" + provider-name="nutch.org"> + + <!-- this file hosts all extension points nutch core code offers. + Please note that plugins can define extension points as well to be extendable.--> + +<extension-point + id="org.apache.nutch.indexer.IndexingFilter" + name="Nutch Indexing Filter"/> + +<extension-point + id="org.apache.nutch.indexer.IndexWriter" + name="Nutch Index Writer"/> + +<extension-point + id="org.apache.nutch.parse.Parser" + name="Nutch Content Parser"/> + +<extension-point + id="org.apache.nutch.parse.HtmlParseFilter" + name="HTML Parse Filter"/> + +<extension-point + id="org.apache.nutch.protocol.Protocol" + name="Nutch Protocol"/> + +<extension-point + id="org.apache.nutch.net.URLFilter" + name="Nutch URL Filter"/> + +<extension-point + id="org.apache.nutch.net.URLExemptionFilter" + name="Nutch URL Ignore Exemption Filter"/> + +<extension-point + id="org.apache.nutch.net.URLNormalizer" + name="Nutch URL Normalizer"/> + +<extension-point + id="org.apache.nutch.scoring.ScoringFilter" + name="Nutch Scoring"/> + +<extension-point + id="org.apache.nutch.segment.SegmentMergeFilter" + name="Nutch Segment Merge Filter"/> + +</plugin> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/nutch-extensionpoints/pom.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/nutch-extensionpoints/pom.xml b/nutch-plugins/nutch-extensionpoints/pom.xml new file mode 100644 index 0000000..db76178 --- /dev/null +++
b/nutch-plugins/nutch-extensionpoints/pom.xml @@ -0,0 +1,38 @@ +<!-- + ~ Licensed to the Apache Software Foundation (ASF) under one or more + ~ contributor license agreements. See the NOTICE file distributed with + ~ this work for additional information regarding copyright ownership. + ~ The ASF licenses this file to You under the Apache License, Version 2.0 + ~ (the "License"); you may not use this file except in compliance with + ~ the License. You may obtain a copy of the License at + ~ + ~ http://www.apache.org/licenses/LICENSE-2.0 + ~ + ~ Unless required by applicable law or agreed to in writing, software + ~ distributed under the License is distributed on an "AS IS" BASIS, + ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ~ See the License for the specific language governing permissions and + ~ limitations under the License. + --> + +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + + <parent> + <groupId>org.apache.nutch</groupId> + <artifactId>nutch-plugins</artifactId> + <version>1.13-SNAPSHOT</version> + <relativePath>../pom.xml</relativePath> + </parent> + <artifactId>nutch-extensionpoints</artifactId> + <packaging>jar</packaging> + + <name>nutch-extensionpoints</name> + <url>http://nutch.apache.org</url> + + <properties> + <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> + </properties> + +</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parse-ext/build.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-ext/build.xml b/nutch-plugins/parse-ext/build.xml new file mode 100644 index 0000000..25552fa --- /dev/null +++ b/nutch-plugins/parse-ext/build.xml @@ -0,0 +1,32 @@ +<?xml version="1.0"?> +<!-- + Licensed to the Apache 
Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<project name="parse-ext" default="jar-core"> + + <import file="../build-plugin.xml"/> + + <!-- Deploy Unit test dependencies --> + <target name="deps-test"> + <ant target="deploy" inheritall="false" dir="../nutch-extensionpoints"/> + <ant target="deploy" inheritall="false" dir="../protocol-file"/> + </target> + + + <copy file="command" todir="${deploy.dir}" preservelastmodified="true"/> + <chmod file="${deploy.dir}/command" perm="755"/> + +</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parse-ext/command ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-ext/command b/nutch-plugins/parse-ext/command new file mode 100644 index 0000000..f42c055 --- /dev/null +++ b/nutch-plugins/parse-ext/command @@ -0,0 +1,24 @@ +#!/bin/bash +# +# Sample bash script as external command invoked by parse-ext plugin +# +# 20040701, John Xing + +set -e # abort as soon as any command exits non-zero + +if [ $# -ne 1 ]; then # exactly one argument (the mime type) is required + echo Usage:$0 mimeType >&2 + exit 1 +fi + +case $1 in +"application/vnd.nutch.example.cat") + cat # copy standard input to standard output unchanged + ;; +"application/vnd.nutch.example.md5sum") + md5sum # print the MD5 digest of standard input + ;; +*) + echo "Can't parse mimeType $1" >&2 + exit 1 +esac 
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parse-ext/ivy.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-ext/ivy.xml b/nutch-plugins/parse-ext/ivy.xml new file mode 100644 index 0000000..1a86d68 --- /dev/null +++ b/nutch-plugins/parse-ext/ivy.xml @@ -0,0 +1,41 @@ +<?xml version="1.0" ?> + +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> + +<ivy-module version="1.0"> + <info organisation="org.apache.nutch" module="${ant.project.name}"> + <license name="Apache 2.0"/> + <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/> + <description> + Apache Nutch + </description> + </info> + + <configurations> + <include file="../../..//ivy/ivy-configurations.xml"/> + </configurations> + + <publications> + <!--get the artifact from our module name--> + <artifact conf="master"/> + </publications> + + <dependencies> + </dependencies> + +</ivy-module> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parse-ext/plugin.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-ext/plugin.xml b/nutch-plugins/parse-ext/plugin.xml new file mode 100644 index 0000000..6819b36 --- /dev/null +++ b/nutch-plugins/parse-ext/plugin.xml @@ -0,0 +1,60 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> +<plugin + id="parse-ext" + name="External Parser Plug-in" + version="1.0.0" + provider-name="nutch.org"> + + <runtime> + <library name="parse-ext.jar"> + <export name="*"/> + </library> + </runtime> + + <requires> + <import plugin="nutch-extensionpoints"/> + </requires> + + <extension id="org.apache.nutch.parse.ext" + name="ExtParse" + point="org.apache.nutch.parse.Parser"> + + <implementation id="ExtParser" + class="org.apache.nutch.parse.ext.ExtParser"> + <parameter name="contentType" value="application/vnd.nutch.example.cat"/> + <parameter name="pathSuffix" value=""/> + <parameter name="command" value="./build/plugins/parse-ext/command"/> + <parameter name="timeout" value="10"/> + <!-- can optionally specify an encoding parameter now, see NUTCH-564--> + <!-- <parameter name="encoding" value="UTF-8"/> --> + </implementation> + + <implementation id="ExtParser" + class="org.apache.nutch.parse.ext.ExtParser"> + <parameter name="contentType" value="application/vnd.nutch.example.md5sum"/> + <parameter name="pathSuffix" value=""/> + <parameter name="command" value="./build/plugins/parse-ext/command"/> + <parameter name="timeout" value="20"/> + <!-- can optionally specify an encoding parameter now, see NUTCH-564--> + <!-- <parameter name="encoding" value="UTF-8"/> --> + </implementation> + + </extension> + +</plugin> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parse-ext/pom.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-ext/pom.xml b/nutch-plugins/parse-ext/pom.xml new file mode 100644 index 0000000..5a7b7be --- /dev/null +++ b/nutch-plugins/parse-ext/pom.xml @@ -0,0 +1,38 @@ +<!-- + ~ Licensed to the Apache Software Foundation (ASF) under one or more + ~ contributor license agreements. See the NOTICE file distributed with + ~ this work for additional information regarding copyright ownership. 
+ ~ The ASF licenses this file to You under the Apache License, Version 2.0 + ~ (the "License"); you may not use this file except in compliance with + ~ the License. You may obtain a copy of the License at + ~ + ~ http://www.apache.org/licenses/LICENSE-2.0 + ~ + ~ Unless required by applicable law or agreed to in writing, software + ~ distributed under the License is distributed on an "AS IS" BASIS, + ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ~ See the License for the specific language governing permissions and + ~ limitations under the License. + --> + +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + + <parent> + <groupId>org.apache.nutch</groupId> + <artifactId>nutch-plugins</artifactId> + <version>1.13-SNAPSHOT</version> + <relativePath>../pom.xml</relativePath> + </parent> + <artifactId>parse-ext</artifactId> + <packaging>jar</packaging> + + <name>parse-ext</name> + <url>http://nutch.apache.org</url> + + <properties> + <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> + </properties> + +</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parse-ext/src/main/java/org/apache/nutch/parse/ext/ExtParser.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-ext/src/main/java/org/apache/nutch/parse/ext/ExtParser.java b/nutch-plugins/parse-ext/src/main/java/org/apache/nutch/parse/ext/ExtParser.java new file mode 100644 index 0000000..94d9b32 --- /dev/null +++ b/nutch-plugins/parse-ext/src/main/java/org/apache/nutch/parse/ext/ExtParser.java @@ -0,0 +1,183 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.parse.ext; + +import org.apache.nutch.protocol.Content; +import org.apache.nutch.parse.ParseResult; +import org.apache.nutch.parse.ParseStatus; +import org.apache.nutch.parse.Parser; +import org.apache.nutch.parse.ParseData; +import org.apache.nutch.parse.ParseImpl; +import org.apache.nutch.parse.Outlink; +import org.apache.nutch.parse.OutlinkExtractor; + +import org.apache.nutch.util.CommandRunner; +import org.apache.nutch.net.protocols.Response; +import org.apache.hadoop.conf.Configuration; + +import org.apache.nutch.plugin.Extension; +import org.apache.nutch.plugin.PluginRepository; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.Hashtable; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.nio.charset.Charset; + +/** + * A wrapper that invokes external command to do real parsing job. 
+ * + * @author John Xing + */ + +public class ExtParser implements Parser { + + public static final Logger LOG = LoggerFactory + .getLogger("org.apache.nutch.parse.ext"); + + /** Initial capacity, in bytes, of the buffer that captures the standard output of the command; the standard-error buffer starts at a quarter of this size. */ static final int BUFFER_SIZE = 4096; + + /** Timeout used when an extension declares no timeout attribute (see setConf). */ static final int TIMEOUT_DEFAULT = 30; // in seconds + + // handy map from String contentType to String[] {command, timeoutString, + // encoding} + Hashtable<String, String[]> TYPE_PARAMS_MAP = new Hashtable<String, String[]>(); + + private Configuration conf; + + public ExtParser() { + } + + /** Parses the content by feeding its raw bytes to the external command registered for its content type and decoding the standard output of that command as the parse text. The content type is appended to the command line as an argument. Returns an empty parse result with a FAILED status when no command is registered for the content type, when the raw length differs from the Response.CONTENT_LENGTH metadata value, when the command exits with a non-zero value, or when an exception is thrown inside the try block. The title is always the empty string and outlinks are extracted from the output text. NOTE(review): the timeout string is parsed with Integer.parseInt before the try block, so a non-numeric timeout attribute surfaces as an uncaught NumberFormatException rather than a FAILED status - confirm whether that is intended. */ public ParseResult getParse(Content content) { + + String contentType = content.getContentType(); + + String[] params = (String[]) TYPE_PARAMS_MAP.get(contentType); + if (params == null) + return new ParseStatus(ParseStatus.FAILED, + "No external command defined for contentType: " + contentType) + .getEmptyParseResult(content.getUrl(), getConf()); + + String command = params[0]; + int timeout = Integer.parseInt(params[1]); + String encoding = params[2]; + + if (LOG.isTraceEnabled()) { + LOG.trace("Use " + command + " with timeout=" + timeout + "secs"); + } + + String text = null; + String title = null; + + try { + + byte[] raw = content.getContent(); + + String contentLength = content.getMetadata().get(Response.CONTENT_LENGTH); + if (contentLength != null + && raw.length != Integer.parseInt(contentLength)) { + return new ParseStatus(ParseStatus.FAILED, + ParseStatus.FAILED_TRUNCATED, "Content truncated at " + raw.length + + " bytes. 
Parser can't handle incomplete " + contentType + + " file.").getEmptyParseResult(content.getUrl(), getConf()); + } + + ByteArrayOutputStream os = new ByteArrayOutputStream(BUFFER_SIZE); + ByteArrayOutputStream es = new ByteArrayOutputStream(BUFFER_SIZE / 4); + + CommandRunner cr = new CommandRunner(); + + cr.setCommand(command + " " + contentType); + cr.setInputStream(new ByteArrayInputStream(raw)); + cr.setStdOutputStream(os); + cr.setStdErrorStream(es); + + cr.setTimeout(timeout); + + cr.evaluate(); + + if (cr.getExitValue() != 0) + return new ParseStatus(ParseStatus.FAILED, "External command " + + command + " failed with error: " + es.toString()) + .getEmptyParseResult(content.getUrl(), getConf()); + + text = os.toString(encoding); + + } catch (Exception e) { // run time exception + return new ParseStatus(e) + .getEmptyParseResult(content.getUrl(), getConf()); + } + + if (text == null) + text = ""; + + if (title == null) + title = ""; + + // collect outlink + Outlink[] outlinks = OutlinkExtractor.getOutlinks(text, getConf()); + + ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title, + outlinks, content.getMetadata()); + return ParseResult.createParseResult(content.getUrl(), new ParseImpl(text, + parseData)); + } + + /** Stores the configuration and fills TYPE_PARAMS_MAP from the extensions of the org.apache.nutch.parse.Parser extension point whose plugin id is parse-ext. Extensions with a null or empty contentType or command attribute are skipped; a null encoding attribute falls back to the platform default charset name and a null or empty timeout attribute falls back to TIMEOUT_DEFAULT. Existing map entries are not cleared; an entry for the same content type is overwritten. */ public void setConf(Configuration conf) { + this.conf = conf; + Extension[] extensions = PluginRepository.get(conf) + .getExtensionPoint("org.apache.nutch.parse.Parser").getExtensions(); + + String contentType, command, timeoutString, encoding; + + for (int i = 0; i < extensions.length; i++) { + Extension extension = extensions[i]; + + // only look for extensions defined by plugin parse-ext + if (!extension.getDescriptor().getPluginId().equals("parse-ext")) + continue; + + contentType = extension.getAttribute("contentType"); + if (contentType == null || contentType.equals("")) + continue; + + command = extension.getAttribute("command"); + if (command == null || command.equals("")) + continue; + + // null encoding means default + 
encoding = extension.getAttribute("encoding"); + if (encoding == null) + encoding = Charset.defaultCharset().name(); + + timeoutString = extension.getAttribute("timeout"); + if (timeoutString == null || timeoutString.equals("")) + timeoutString = "" + TIMEOUT_DEFAULT; + + TYPE_PARAMS_MAP.put(contentType, new String[] { command, timeoutString, + encoding }); + } + } + + /** Returns the configuration most recently passed to setConf, or null if setConf has not been called. */ public Configuration getConf() { + return this.conf; + } +} http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/parse-ext/src/main/java/org/apache/nutch/parse/ext/package-info.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/parse-ext/src/main/java/org/apache/nutch/parse/ext/package-info.java b/nutch-plugins/parse-ext/src/main/java/org/apache/nutch/parse/ext/package-info.java new file mode 100644 index 0000000..6394489 --- /dev/null +++ b/nutch-plugins/parse-ext/src/main/java/org/apache/nutch/parse/ext/package-info.java @@ -0,0 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Parse wrapper to run external command to do the parsing. + */ +package org.apache.nutch.parse.ext; +
