http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java ---------------------------------------------------------------------- diff --git a/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java b/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java deleted file mode 100644 index 86692ae..0000000 --- a/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java +++ /dev/null @@ -1,236 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.nutch.protocol.selenium; - -import java.io.BufferedInputStream; -import java.io.File; -import java.io.FileInputStream; -import java.io.InputStream; -import java.io.OutputStream; -import java.net.URL; -import java.util.concurrent.TimeUnit; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.IOUtils; -import org.openqa.selenium.By; -import org.openqa.selenium.OutputType; -import org.openqa.selenium.TakesScreenshot; -import org.openqa.selenium.TimeoutException; -import org.openqa.selenium.WebDriver; -import org.openqa.selenium.chrome.ChromeDriver; -import org.openqa.selenium.firefox.FirefoxBinary; -import org.openqa.selenium.firefox.FirefoxDriver; -import org.openqa.selenium.firefox.FirefoxProfile; -import org.openqa.selenium.io.TemporaryFilesystem; -import org.openqa.selenium.remote.DesiredCapabilities; -import org.openqa.selenium.remote.RemoteWebDriver; -import org.openqa.selenium.safari.SafariDriver; -import org.openqa.selenium.phantomjs.PhantomJSDriver; -import org.openqa.selenium.phantomjs.PhantomJSDriverService; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.opera.core.systems.OperaDriver; - -public class HttpWebClient { - - private static final Logger LOG = LoggerFactory.getLogger(HttpWebClient.class); - - public static ThreadLocal<WebDriver> threadWebDriver = new ThreadLocal<WebDriver>() { - - @Override - protected WebDriver initialValue() - { - FirefoxProfile profile = new FirefoxProfile(); - profile.setPreference("permissions.default.stylesheet", 2); - profile.setPreference("permissions.default.image", 2); - profile.setPreference("dom.ipc.plugins.enabled.libflashplayer.so", "false"); - profile.setPreference(FirefoxProfile.ALLOWED_HOSTS_PREFERENCE, "localhost"); - WebDriver driver = new FirefoxDriver(profile); - return driver; - }; - }; - - public static WebDriver getDriverForPage(String url, Configuration conf) { - WebDriver driver = null; - DesiredCapabilities capabilities = null; - long pageLoadWait = conf.getLong("page.load.delay", 3); - - try { - String driverType = conf.get("selenium.driver", "firefox"); - switch (driverType) { - case "firefox": - String allowedHost = conf.get("selenium.firefox.allowed.hosts", "localhost"); - long firefoxBinaryTimeout = conf.getLong("selenium.firefox.binary.timeout", 45); - boolean enableFlashPlayer = conf.getBoolean("selenium.firefox.enable.flash", false); - int loadImage = conf.getInt("selenium.firefox.load.image", 1); - int loadStylesheet = conf.getInt("selenium.firefox.load.stylesheet", 1); - FirefoxProfile profile = new FirefoxProfile(); - FirefoxBinary binary = new FirefoxBinary(); - profile.setPreference(FirefoxProfile.ALLOWED_HOSTS_PREFERENCE, allowedHost); - profile.setPreference("dom.ipc.plugins.enabled.libflashplayer.so", enableFlashPlayer); - profile.setPreference("permissions.default.stylesheet", loadStylesheet); - profile.setPreference("permissions.default.image", loadImage); - binary.setTimeout(TimeUnit.SECONDS.toMillis(firefoxBinaryTimeout)); - driver = new FirefoxDriver(binary, profile); - break; - case "chrome": - driver = new ChromeDriver(); - break; - case "safari": - driver = new SafariDriver(); - break; - case "opera": - driver = new OperaDriver(); - break; - case "phantomjs": - driver = new PhantomJSDriver(); - break; - case "remote": - String seleniumHubHost = conf.get("selenium.hub.host", "localhost"); - int seleniumHubPort = Integer.parseInt(conf.get("selenium.hub.port", "4444")); - String seleniumHubPath = conf.get("selenium.hub.path", "/wd/hub"); - String seleniumHubProtocol = conf.get("selenium.hub.protocol", "http"); - String seleniumGridDriver = conf.get("selenium.grid.driver","firefox"); - String seleniumGridBinary = conf.get("selenium.grid.binary"); - - switch (seleniumGridDriver){ - case "firefox": - capabilities = DesiredCapabilities.firefox(); - capabilities.setBrowserName("firefox"); - capabilities.setJavascriptEnabled(true); - capabilities.setCapability("firefox_binary",seleniumGridBinary); - System.setProperty("webdriver.reap_profile", "false"); - driver = new RemoteWebDriver(new URL(seleniumHubProtocol, seleniumHubHost, seleniumHubPort, seleniumHubPath), capabilities); - break; - case "phantomjs": - capabilities = DesiredCapabilities.phantomjs(); - capabilities.setBrowserName("phantomjs"); - capabilities.setJavascriptEnabled(true); - capabilities.setCapability(PhantomJSDriverService.PHANTOMJS_EXECUTABLE_PATH_PROPERTY,seleniumGridBinary); - driver = new RemoteWebDriver(new URL(seleniumHubProtocol, seleniumHubHost, seleniumHubPort, seleniumHubPath), capabilities); - break; - default: - LOG.error("The Selenium Grid WebDriver choice {} is not available... defaulting to FirefoxDriver().", driverType); - driver = new RemoteWebDriver(new URL(seleniumHubProtocol, seleniumHubHost, seleniumHubPort, seleniumHubPath), DesiredCapabilities.firefox()); - break; - } - break; - default: - LOG.error("The Selenium WebDriver choice {} is not available... defaulting to FirefoxDriver().", driverType); - driver = new FirefoxDriver(); - break; - } - LOG.debug("Selenium {} WebDriver selected.", driverType); - - driver.manage().timeouts().pageLoadTimeout(pageLoadWait, TimeUnit.SECONDS); - driver.get(url); - } catch (Exception e) { - if(e instanceof TimeoutException) { - LOG.debug("Selenium WebDriver: Timeout Exception: Capturing whatever loaded so far..."); - return driver; - } - cleanUpDriver(driver); - throw new RuntimeException(e); - } - - return driver; - } - - public static String getHTMLContent(WebDriver driver, Configuration conf) { - if (conf.getBoolean("take.screenshot", false)) { - takeScreenshot(driver, conf); - } - - return driver.findElement(By.tagName("body")).getAttribute("innerHTML"); - } - - public static void cleanUpDriver(WebDriver driver) { - if (driver != null) { - try { - driver.close(); - driver.quit(); - TemporaryFilesystem.getDefaultTmpFS().deleteTemporaryFiles(); - } catch (Exception e) { - throw new RuntimeException(e); - } - } - } - - /** - * Function for obtaining the HTML BODY using the selected - * {@link org.openqa.selenium.WebDriver}. - * There are a number of configuration properties within - * <code>nutch-site.xml</code> which determine whether to - * take screenshots of the rendered pages and persist them - * as timestamped .png's into HDFS. - * @param url the URL to fetch and render - * @param conf the {@link org.apache.hadoop.conf.Configuration} - * @return the rendered inner HTML page - */ - public static String getHtmlPage(String url, Configuration conf) { - WebDriver driver = getDriverForPage(url, conf); - - try { - if (conf.getBoolean("take.screenshot", false)) { - takeScreenshot(driver, conf); - } - - String innerHtml = driver.findElement(By.tagName("body")).getAttribute("innerHTML"); - return innerHtml; - - // I'm sure this catch statement is a code smell ; borrowing it from lib-htmlunit - } catch (Exception e) { - TemporaryFilesystem.getDefaultTmpFS().deleteTemporaryFiles(); - throw new RuntimeException(e); - } finally { - cleanUpDriver(driver); - } - } - - public static String getHtmlPage(String url) { - return getHtmlPage(url, null); - } - - private static void takeScreenshot(WebDriver driver, Configuration conf) { - try { - String url = driver.getCurrentUrl(); - File srcFile = ((TakesScreenshot)driver).getScreenshotAs(OutputType.FILE); - LOG.debug("In-memory screenshot taken of: {}", url); - FileSystem fs = FileSystem.get(conf); - if (conf.get("screenshot.location") != null) { - Path screenshotPath = new Path(conf.get("screenshot.location") + "/" + srcFile.getName()); - OutputStream os = null; - if (!fs.exists(screenshotPath)) { - LOG.debug("No existing screenshot already exists... creating new file at {} {}.", screenshotPath, srcFile.getName()); - os = fs.create(screenshotPath); - } - InputStream is = new BufferedInputStream(new FileInputStream(srcFile)); - IOUtils.copyBytes(is, os, conf); - LOG.debug("Screenshot for {} successfully saved to: {} {}", url, screenshotPath, srcFile.getName()); - } else { - LOG.warn("Screenshot for {} not saved to HDFS (subsequently disgarded) as value for " - + "'screenshot.location' is absent from nutch-site.xml.", url); - } - } catch (Exception e) { - cleanUpDriver(driver); - throw new RuntimeException(e); - } - } -}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/lib-xml/build.xml ---------------------------------------------------------------------- diff --git a/src/plugin/lib-xml/build.xml b/src/plugin/lib-xml/build.xml deleted file mode 100644 index 0f87c07..0000000 --- a/src/plugin/lib-xml/build.xml +++ /dev/null @@ -1,36 +0,0 @@ -<?xml version="1.0"?> -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---> -<project name="lib-xml" default="jar"> - - <import file="../build-plugin.xml" /> - - <!-- - ! Override the compile and jar targets, - ! since there is nothing to compile here. - ! --> - <target name="compile" depends="init, resolve-default" /> - - <!-- - <target name="jar" depends="compile"> - <copy todir="${build.dir}" verbose="true"> - <fileset dir="./lib" includes="**/*.jar" /> - </copy> - </target> - --> - -</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/lib-xml/ivy.xml ---------------------------------------------------------------------- diff --git a/src/plugin/lib-xml/ivy.xml b/src/plugin/lib-xml/ivy.xml deleted file mode 100644 index 414f38a..0000000 --- a/src/plugin/lib-xml/ivy.xml +++ /dev/null @@ -1,44 +0,0 @@ -<?xml version="1.0" ?> - -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---> - -<ivy-module version="1.0"> - <info organisation="org.apache.nutch" module="${ant.project.name}"> - <license name="Apache 2.0"/> - <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/> - <description> - Apache Nutch - </description> - </info> - - <configurations> - <include file="../../..//ivy/ivy-configurations.xml"/> - </configurations> - - <publications> - <!--get the artifact from our module name--> - <artifact conf="master"/> - </publications> - - <dependencies> - <dependency org="org.jdom" name="jdom" rev="1.1" conf="*->default"/> - <dependency org="jaxen" name="jaxen" rev="1.1.1" conf="*->master"/> - <dependency org="xerces" name="xercesImpl" rev="2.11.0" conf="*->master"/> - </dependencies> - -</ivy-module> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/lib-xml/plugin.xml ---------------------------------------------------------------------- diff --git a/src/plugin/lib-xml/plugin.xml b/src/plugin/lib-xml/plugin.xml deleted file mode 100644 index 79bd17f..0000000 --- a/src/plugin/lib-xml/plugin.xml +++ /dev/null @@ -1,65 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---> -<!-- - ! XML library - Gathers many XML related libraries: - ! - ! * Jaxen - ! - Download : http://jaxen.org/releases.html - ! - License : http://jaxen.org/license.html - ! - ! * Xerces-J 2.6.1 - ! - Download : http://xerces.apache.org/xerces2-j/download.cgi - ! - License : http://www.apache.org/licenses/LICENSE-2.0 - ! - ! * SAXPath 1.0 FCS - ! - Note : SAXPath has been incorporated into Jaxen. - ! It has been merged into the Jaxen codebase - ! and is no longer being maintained separately - ! - Download : http://sourceforge.net/project/showfiles.php?group_id=26014 - ! - License : OSI-Approved Open Source - ! - ! * jdom 1.0 beta8-dev - ! - Download : http://www.jdom.org/downloads/index.html - ! - License : http://www.jdom.org/docs/faq.html#a0030 - ! - !--> -<plugin - id="lib-xml" - name="XML Libraries" - version="1.0" - provider-name="org.apache.nutch.xml"> - - <runtime> - <library name="jaxen-core.jar"> - <export name="*"/> - </library> - <library name="jaxen-jdom.jar"> - <export name="*"/> - </library> - <library name="xercesImpl.jar"> - <export name="*"/> - </library> - <library name="saxpath.jar"> - <export name="*"/> - </library> - <library name="jdom.jar"> - <export name="*"/> - </library> - </runtime> - -</plugin> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/microformats-reltag/build.xml ---------------------------------------------------------------------- diff --git a/src/plugin/microformats-reltag/build.xml b/src/plugin/microformats-reltag/build.xml deleted file mode 100644 index 395afee..0000000 --- a/src/plugin/microformats-reltag/build.xml +++ /dev/null @@ -1,27 +0,0 @@ -<?xml version="1.0"?> -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---> -<project name="microformats-reltag" default="jar-core"> - - <import file="../build-plugin.xml"/> - - <!-- Deploy Unit test dependencies --> - <target name="deps-test"> - <ant target="deploy" inheritall="false" dir="../nutch-extensionpoints"/> - </target> - -</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/microformats-reltag/ivy.xml ---------------------------------------------------------------------- diff --git a/src/plugin/microformats-reltag/ivy.xml b/src/plugin/microformats-reltag/ivy.xml deleted file mode 100644 index 1a86d68..0000000 --- a/src/plugin/microformats-reltag/ivy.xml +++ /dev/null @@ -1,41 +0,0 @@ -<?xml version="1.0" ?> - -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---> - -<ivy-module version="1.0"> - <info organisation="org.apache.nutch" module="${ant.project.name}"> - <license name="Apache 2.0"/> - <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/> - <description> - Apache Nutch - </description> - </info> - - <configurations> - <include file="../../..//ivy/ivy-configurations.xml"/> - </configurations> - - <publications> - <!--get the artifact from our module name--> - <artifact conf="master"/> - </publications> - - <dependencies> - </dependencies> - -</ivy-module> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/microformats-reltag/plugin.xml ---------------------------------------------------------------------- diff --git a/src/plugin/microformats-reltag/plugin.xml b/src/plugin/microformats-reltag/plugin.xml deleted file mode 100644 index b35e1f4..0000000 --- a/src/plugin/microformats-reltag/plugin.xml +++ /dev/null @@ -1,49 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---> -<plugin - id="microformats-reltag" - name="Rel-Tag microformat Parser/Indexer/Querier" - version="1.0.0" - provider-name="nutch.org"> - - <runtime> - <library name="microformats-reltag.jar"> - <export name="*"/> - </library> - </runtime> - - <requires> - <import plugin="nutch-extensionpoints"/> - </requires> - - <extension id="org.apache.nutch.microformats.reltag.RelTagParser" - name="Rel-Tag parser" - point="org.apache.nutch.parse.HtmlParseFilter"> - <implementation id="RelTagParser" - class="org.apache.nutch.microformats.reltag.RelTagParser"/> - </extension> - - <extension id="org.apache.nutch.microformats.reltag.RelTagIndexingFilter" - name="Rel-Tag indexing filter" - point="org.apache.nutch.indexer.IndexingFilter"> - <implementation id="RelTagIndexingFilter" - class="org.apache.nutch.microformats.reltag.RelTagIndexingFilter"/> - </extension> - -</plugin> - http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagIndexingFilter.java ---------------------------------------------------------------------- diff --git a/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagIndexingFilter.java b/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagIndexingFilter.java deleted file mode 100644 index e50a150..0000000 --- a/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagIndexingFilter.java +++ /dev/null @@ -1,77 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.nutch.microformats.reltag; - -// Nutch imports -import org.apache.nutch.crawl.CrawlDatum; -import org.apache.nutch.crawl.Inlinks; -import org.apache.nutch.indexer.IndexingFilter; -import org.apache.nutch.indexer.IndexingException; -import org.apache.nutch.indexer.NutchDocument; -import org.apache.hadoop.io.Text; -import org.apache.nutch.parse.Parse; - -// Hadoop imports -import org.apache.hadoop.conf.Configuration; - -/** - * An {@link org.apache.nutch.indexer.IndexingFilter} that add <code>tag</code> - * field(s) to the document. - * - * @see <a href="http://www.microformats.org/wiki/rel-tag"> - * http://www.microformats.org/wiki/rel-tag</a> - * @author Jérôme Charron - */ -public class RelTagIndexingFilter implements IndexingFilter { - - private Configuration conf; - - // Inherited JavaDoc - public NutchDocument filter(NutchDocument doc, Parse parse, Text url, - CrawlDatum datum, Inlinks inlinks) throws IndexingException { - - // Check if some Rel-Tags found, possibly put there by RelTagParser - String[] tags = parse.getData().getParseMeta() - .getValues(RelTagParser.REL_TAG); - if (tags != null) { - for (int i = 0; i < tags.length; i++) { - doc.add("tag", tags[i]); - } - } - - return doc; - } - - /* - * ----------------------------- * <implementation:Configurable> * - * ----------------------------- - */ - - public void setConf(Configuration conf) { - this.conf = conf; - } - - public Configuration getConf() { - return this.conf; - } - - /* - * ------------------------------ * </implementation:Configurable> * - * ------------------------------ - */ - -} http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java ---------------------------------------------------------------------- diff --git a/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java b/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java deleted file mode 100644 index 9176a1e..0000000 --- a/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java +++ /dev/null @@ -1,148 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.nutch.microformats.reltag; - -// JDK imports -import java.net.URL; -import java.net.URLDecoder; -import java.util.Iterator; -import java.util.Set; -import java.util.TreeSet; -import org.w3c.dom.DocumentFragment; -import org.w3c.dom.NamedNodeMap; -import org.w3c.dom.Node; -import org.w3c.dom.NodeList; - -// Commons Logging imports -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -// Nutch imports -import org.apache.nutch.metadata.Metadata; -import org.apache.nutch.parse.HTMLMetaTags; -import org.apache.nutch.parse.Parse; -import org.apache.nutch.parse.ParseResult; -import org.apache.nutch.parse.HtmlParseFilter; -import org.apache.nutch.protocol.Content; -import org.apache.nutch.util.StringUtil; - -// Hadoop imports -import org.apache.hadoop.conf.Configuration; - -/** - * Adds microformat rel-tags of document if found. - * - * @see <a href="http://www.microformats.org/wiki/rel-tag"> - * http://www.microformats.org/wiki/rel-tag</a> - */ -public class RelTagParser implements HtmlParseFilter { - - public final static Logger LOG = LoggerFactory.getLogger(RelTagParser.class); - - public final static String REL_TAG = "Rel-Tag"; - - private Configuration conf = null; - - /** - * Scan the HTML document looking at possible rel-tags - */ - public ParseResult filter(Content content, ParseResult parseResult, - HTMLMetaTags metaTags, DocumentFragment doc) { - - // get parse obj - Parse parse = parseResult.get(content.getUrl()); - // Trying to find the document's rel-tags - Parser parser = new Parser(doc); - Set<?> tags = parser.getRelTags(); - Iterator<?> iter = tags.iterator(); - Metadata metadata = parse.getData().getParseMeta(); - while (iter.hasNext()) - metadata.add(REL_TAG, (String) iter.next()); - - return parseResult; - } - - private static class Parser { - - Set<String> tags = null; - - Parser(Node node) { - tags = new TreeSet<String>(); - parse(node); - } - - Set<String> getRelTags() { - return tags; - } - - void parse(Node node) { - - if (node.getNodeType() == Node.ELEMENT_NODE) { - // Look for <a> tag - if ("a".equalsIgnoreCase(node.getNodeName())) { - NamedNodeMap attrs = node.getAttributes(); - Node hrefNode = attrs.getNamedItem("href"); - // Checks that it contains a href attribute - if (hrefNode != null) { - Node relNode = attrs.getNamedItem("rel"); - // Checks that it contains a rel attribute too - if (relNode != null) { - // Finaly checks that rel=tag - if ("tag".equalsIgnoreCase(relNode.getNodeValue())) { - String tag = parseTag(hrefNode.getNodeValue()); - if (!StringUtil.isEmpty(tag)) { - if (!tags.contains(tag)) { - tags.add(tag); - LOG.debug("Adding tag: " + tag + " to tag set."); - } - } - } - } - } - } - } - - // Recurse - NodeList children = node.getChildNodes(); - for (int i = 0; children != null && i < children.getLength(); i++) - parse(children.item(i)); - } - - private final static String parseTag(String url) { - String tag = null; - try { - URL u = new URL(url); - String path = u.getPath(); - tag = URLDecoder.decode(path.substring(path.lastIndexOf('/') + 1), - "UTF-8"); - } catch (Exception e) { - // Malformed tag... - tag = null; - } - return tag; - } - - } - - public void setConf(Configuration conf) { - this.conf = conf; - } - - public Configuration getConf() { - return this.conf; - } -} http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/package.html ---------------------------------------------------------------------- diff --git a/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/package.html b/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/package.html deleted file mode 100644 index bef5409..0000000 --- a/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/package.html +++ /dev/null @@ -1,8 +0,0 @@ -<html> -<body> -<p> -A microformats <a href="http://www.microformats.org/wiki/Rel-Tag">Rel-Tag</a> -Parser/Indexer/Querier plugin. -</p> -</body> -</html> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/mimetype-filter/build.xml ---------------------------------------------------------------------- diff --git a/src/plugin/mimetype-filter/build.xml b/src/plugin/mimetype-filter/build.xml deleted file mode 100644 index 977e643..0000000 --- a/src/plugin/mimetype-filter/build.xml +++ /dev/null @@ -1,28 +0,0 @@ -<?xml version="1.0"?> -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---> -<project name="mimetype-filter" default="jar-core"> - - <import file="../build-plugin.xml" /> - - <!-- for junit test --> - <mkdir dir="${build.test}/data"/> - <copy todir="${build.test}/data"> - <fileset dir="sample" includes="**/*.txt"/> - </copy> - -</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/mimetype-filter/ivy.xml ---------------------------------------------------------------------- diff --git a/src/plugin/mimetype-filter/ivy.xml b/src/plugin/mimetype-filter/ivy.xml deleted file mode 100644 index 0a363f7..0000000 --- a/src/plugin/mimetype-filter/ivy.xml +++ /dev/null @@ -1,41 +0,0 @@ -<?xml version="1.0" ?> - -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---> - -<ivy-module version="1.0"> - <info organisation="org.apache.nutch" module="${ant.project.name}"> - <license name="Apache 2.0"/> - <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/> - <description> - Apache Nutch - </description> - </info> - - <configurations> - <include file="${nutch.root}/ivy/ivy-configurations.xml"/> - </configurations> - - <publications> - <!--get the artifact from our module name--> - <artifact conf="master"/> - </publications> - - <dependencies> - </dependencies> - -</ivy-module> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/mimetype-filter/plugin.xml ---------------------------------------------------------------------- diff --git a/src/plugin/mimetype-filter/plugin.xml b/src/plugin/mimetype-filter/plugin.xml deleted file mode 100644 index d038447..0000000 --- a/src/plugin/mimetype-filter/plugin.xml +++ /dev/null @@ -1,37 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---> -<plugin - id="mimetype-filter" - name="Filter indexed documents by the detected MIME" - version="1.0.0" - provider-name="nutch.org"> - - <runtime> - <library name="mimetype-filter.jar"> - <export name="*"/> - </library> - </runtime> - - <extension id="org.apache.nutch.indexer.filter" - name="Nutch MIME filter" - point="org.apache.nutch.indexer.IndexingFilter"> - <implementation id="MimeTypeIndexingFilter" - class="org.apache.nutch.indexer.filter.MimeTypeIndexingFilter"/> - </extension> - -</plugin> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/mimetype-filter/sample/allow-images.txt ---------------------------------------------------------------------- diff --git a/src/plugin/mimetype-filter/sample/allow-images.txt b/src/plugin/mimetype-filter/sample/allow-images.txt deleted file mode 100644 index 0f5f136..0000000 --- a/src/plugin/mimetype-filter/sample/allow-images.txt +++ /dev/null @@ -1,34 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# This filter can be configured to work in one of two modes (similar to -# suffix-url-filter) - -# default to reject ('-'): in this mode, only documents with a mimetype that -# match the ones specified in the config file will be accepted, all other -# mimetypes will be rejected. - -# default to accept ('+'): in this mode, only documents with a mimetype -# that match the ones specified in the config file will be rejected, -# all other mimetypes will be accepted. - -# The format of this config file is one mimetype per line, with no preceding -# whitespace. Order, in which suffixes are specified, doesn't matter. Blank -# lines and comments (#) are allowed. -# - -- - -image http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/mimetype-filter/sample/block-html.txt ---------------------------------------------------------------------- diff --git a/src/plugin/mimetype-filter/sample/block-html.txt b/src/plugin/mimetype-filter/sample/block-html.txt deleted file mode 100644 index 69600ec..0000000 --- a/src/plugin/mimetype-filter/sample/block-html.txt +++ /dev/null @@ -1,34 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# This filter can be configured to work in one of two modes (similar to -# suffix-url-filter) - -# default to reject ('-'): in this mode, only documents with a mimetype that -# match the ones specified in the config file will be accepted, all other -# mimetypes will be rejected. - -# default to accept ('+'): in this mode, only documents with a mimetype -# that match the ones specified in the config file will be rejected, -# all other mimetypes will be accepted. - -# The format of this config file is one mimetype per line, with no preceding -# whitespace. Order, in which suffixes are specified, doesn't matter. Blank -# lines and comments (#) are allowed. -# - -+ - -text/html \ No newline at end of file http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/mimetype-filter/src/java/org/apache/nutch/indexer/filter/MimeTypeIndexingFilter.java ---------------------------------------------------------------------- diff --git a/src/plugin/mimetype-filter/src/java/org/apache/nutch/indexer/filter/MimeTypeIndexingFilter.java b/src/plugin/mimetype-filter/src/java/org/apache/nutch/indexer/filter/MimeTypeIndexingFilter.java deleted file mode 100644 index 494d888..0000000 --- a/src/plugin/mimetype-filter/src/java/org/apache/nutch/indexer/filter/MimeTypeIndexingFilter.java +++ /dev/null @@ -1,273 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.nutch.indexer.filter; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import org.apache.commons.cli.Option; -import org.apache.commons.cli.Options; -import org.apache.commons.cli.OptionBuilder; -import org.apache.commons.cli.CommandLine; -import org.apache.commons.cli.CommandLineParser; -import org.apache.commons.cli.HelpFormatter; -import org.apache.commons.cli.GnuParser; -import org.apache.commons.cli.UnrecognizedOptionException; - -// Nutch imports -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.util.StringUtils; -import org.apache.hadoop.io.Text; -import org.apache.hadoop.io.Writable; - -import org.apache.nutch.crawl.CrawlDatum; -import org.apache.nutch.crawl.Inlinks; - -import org.apache.nutch.indexer.IndexingException; -import org.apache.nutch.indexer.IndexingFilter; -import org.apache.nutch.indexer.NutchDocument; - -import org.apache.nutch.net.protocols.Response; - -import org.apache.nutch.parse.Outlink; -import org.apache.nutch.parse.Parse; -import org.apache.nutch.parse.ParseData; -import org.apache.nutch.parse.ParseImpl; -import org.apache.nutch.parse.ParseStatus; - -import org.apache.nutch.metadata.Metadata; - -import org.apache.nutch.util.MimeUtil; -import org.apache.nutch.util.NutchConfiguration; -import org.apache.nutch.util.PrefixStringMatcher; -import org.apache.nutch.util.TrieStringMatcher; -import org.apache.tika.Tika; - -import java.io.BufferedReader; -import java.io.IOException; -import java.io.InputStreamReader; -import java.io.Reader; -import java.util.ArrayList; -import java.util.List; - -/** - * An {@link org.apache.nutch.indexer.IndexingFilter} that allows filtering - * of documents based on the MIME Type detected by Tika - * - */ -public class MimeTypeIndexingFilter implements IndexingFilter { - - public static final String MIMEFILTER_REGEX_FILE = "mimetype.filter.file"; - - private static final Logger LOG = LoggerFactory - .getLogger(MimeTypeIndexingFilter.class); - - private MimeUtil MIME; - private Tika tika = new Tika(); - - private TrieStringMatcher trie; - - private Configuration conf; - - private boolean acceptMode = true; - - // Inherited JavaDoc - @Override - public NutchDocument filter(NutchDocument doc, Parse parse, Text url, - CrawlDatum datum, Inlinks inlinks) throws IndexingException { - - String mimeType; - String contentType; - - Writable tcontentType = datum.getMetaData() - .get(new Text(Response.CONTENT_TYPE)); - - if (tcontentType != null) { - contentType = tcontentType.toString(); - } else { - contentType = parse.getData().getMeta(Response.CONTENT_TYPE); - } - - if (contentType == null) { - mimeType = tika.detect(url.toString()); - } else { - mimeType = MIME.forName(MimeUtil.cleanMimeType(contentType)); - } - - contentType = mimeType; - - if (LOG.isInfoEnabled()) { - LOG.info(String.format("[%s] %s", contentType, url)); - } - - if (trie != null) { - if (trie.shortestMatch(contentType) == null) { - // no match, but - if (acceptMode) { - return doc; - } - return null; - } else { - // matched, but we are blocking - if (acceptMode) { - return null; - } - } - } - - return doc; - } - - /* - * ----------------------------- - * <implementation:Configurable> * - * ----------------------------- - */ - @Override - public void setConf(Configuration conf) { - this.conf = conf; - MIME = new MimeUtil(conf); - - // load the file of the values - String file = conf.get(MIMEFILTER_REGEX_FILE, ""); - - if (file != null) { - if (file.isEmpty()) { - LOG.warn(String - .format("Missing %s property, ALL mimetypes will be allowed", - MIMEFILTER_REGEX_FILE)); - } else { - Reader reader = conf.getConfResourceAsReader(file); - - try { - readConfiguration(reader); - } catch (IOException e) { - if (LOG.isErrorEnabled()) { - LOG.error(e.getMessage()); - } - - throw new RuntimeException(e.getMessage(), e); - } - } - } - } - - private void readConfiguration(Reader reader) throws IOException { - BufferedReader in = new BufferedReader(reader); - String line; - List rules = new ArrayList(); - - while (null != (line = in.readLine())) { - if (line.length() == 0) { - continue; - } - - char first = line.charAt(0); - switch (first) { - case ' ': - case '\n': - case '#': // skip blank & comment lines - break; - case '+': - acceptMode = true; - break; - case '-': - acceptMode = false; - break; - default: - rules.add(line); - break; - } - } - - trie = new PrefixStringMatcher(rules); - } - - @Override - public Configuration getConf() { - return this.conf; - } - - /** - * Main method for invoking this tool - * - * @throws IOException, IndexingException - */ - public static void main(String[] args) throws IOException, IndexingException { - Option helpOpt = new Option("h", "help", false, "show this help message"); - Option rulesOpt = OptionBuilder.withArgName("file").hasArg() - .withDescription( - "Rules file to be used in the tests relative to the conf directory") - .isRequired().create("rules"); - - Options options = new Options(); - options.addOption(helpOpt).addOption(rulesOpt); - - CommandLineParser parser = new GnuParser(); - HelpFormatter formatter = new HelpFormatter(); - String rulesFile; - - try { - CommandLine line = parser.parse(options, args); - - if (line.hasOption("help") || !line.hasOption("rules")) { - formatter - .printHelp("org.apache.nutch.indexer.filter.MimeTypeIndexingFilter", - options, true); - return; - } - - rulesFile = line.getOptionValue("rules"); - } catch (UnrecognizedOptionException e) { - formatter - .printHelp("org.apache.nutch.indexer.filter.MimeTypeIndexingFilter", - options, true); - return; - } catch (Exception e) { - LOG.error(StringUtils.stringifyException(e)); - e.printStackTrace(); - return; - } - - MimeTypeIndexingFilter filter = new MimeTypeIndexingFilter(); - Configuration conf = NutchConfiguration.create(); - conf.set(MimeTypeIndexingFilter.MIMEFILTER_REGEX_FILE, rulesFile); - filter.setConf(conf); - - BufferedReader in = new BufferedReader(new InputStreamReader(System.in)); - String line; - - while ((line = in.readLine()) != null && !line.isEmpty()) { - Metadata metadata = new Metadata(); - metadata.set(Response.CONTENT_TYPE, line); - ParseImpl parse = new ParseImpl("text", - new ParseData(new ParseStatus(), "title", new Outlink[0], metadata)); - - NutchDocument doc = filter.filter(new NutchDocument(), parse, - new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks()); - - if (doc != null) { - System.out.print("+ "); - System.out.println(line); - } else { - System.out.print("- "); - System.out.println(line); - } - } - } -} http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/mimetype-filter/src/test/org/apache/nutch/indexer/filter/MimeTypeIndexingFilterTest.java ---------------------------------------------------------------------- diff --git a/src/plugin/mimetype-filter/src/test/org/apache/nutch/indexer/filter/MimeTypeIndexingFilterTest.java b/src/plugin/mimetype-filter/src/test/org/apache/nutch/indexer/filter/MimeTypeIndexingFilterTest.java deleted file mode 100644 index bca230f..0000000 --- a/src/plugin/mimetype-filter/src/test/org/apache/nutch/indexer/filter/MimeTypeIndexingFilterTest.java +++ /dev/null @@ -1,114 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.nutch.indexer.filter; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.io.Text; -import org.apache.nutch.crawl.CrawlDatum; -import org.apache.nutch.crawl.Inlinks; -import org.apache.nutch.indexer.NutchDocument; -import org.apache.nutch.metadata.Metadata; -import org.apache.nutch.net.protocols.Response; -import org.apache.nutch.parse.Outlink; -import org.apache.nutch.parse.ParseData; -import org.apache.nutch.parse.ParseImpl; -import org.apache.nutch.parse.ParseStatus; -import org.apache.nutch.util.NutchConfiguration; - -import org.junit.Assert; -import org.junit.Before; -import org.junit.Test; - -/** - * JUnit based tests of class - * {@link org.apache.nutch.indexer.filter.MimeTypeIndexingFilter} - * - */ -public class MimeTypeIndexingFilterTest { - - private Configuration conf = NutchConfiguration.create(); - private MimeTypeIndexingFilter filter = new MimeTypeIndexingFilter(); - private String[] MIME_TYPES = { "text/html", "image/png", "application/pdf" }; - private ParseImpl[] parses = new ParseImpl[MIME_TYPES.length]; - private String sampleDir = System.getProperty("test.data", "."); - - @Before - public void setUp() throws Exception { - for (int i = 0; i < MIME_TYPES.length; i++) { - Metadata metadata = new Metadata(); - metadata.add(Response.CONTENT_TYPE, MIME_TYPES[i]); - - ParseImpl parse = new ParseImpl("text", - new ParseData(new ParseStatus(), "title", new Outlink[0], metadata)); - - parses[i] = parse; - } - } - - @Test - public void testMissingConfigFile() throws Exception { - String file = conf.get(MimeTypeIndexingFilter.MIMEFILTER_REGEX_FILE, ""); - Assert.assertEquals(String - .format("Property %s must not be present in the the configuration file", - MimeTypeIndexingFilter.MIMEFILTER_REGEX_FILE), "", file); - - filter.setConf(conf); - - // property not set so in this cases all documents must pass the filter - for (int i = 0; i < parses.length; i++) { - NutchDocument doc = filter.filter(new NutchDocument(), parses[i], - new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks()); - - Assert.assertNotNull("All documents must be allowed by default", doc); - } - } - - @Test - public void testAllowOnlyImages() throws Exception { - conf.set(MimeTypeIndexingFilter.MIMEFILTER_REGEX_FILE, "allow-images.txt"); - filter.setConf(conf); - - for (int i = 0; i < parses.length; i++) { - NutchDocument doc = filter.filter(new NutchDocument(), parses[i], - new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks()); - - if (MIME_TYPES[i].contains("image")) { - Assert.assertNotNull("Allow only images", doc); - } else { - Assert.assertNull("Block everything else", doc); - } - } - } - - @Test - public void testBlockHTML() throws Exception { - conf.set(MimeTypeIndexingFilter.MIMEFILTER_REGEX_FILE, "block-html.txt"); - filter.setConf(conf); - - for (int i = 0; i < parses.length; i++) { - NutchDocument doc = filter.filter(new NutchDocument(), parses[i], - new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks()); - - if (MIME_TYPES[i].contains("html")) { - Assert.assertNull("Block only HTML documents", doc); - } else { - Assert.assertNotNull("Allow everything else", doc); - } - } - } -} http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/nutch-extensionpoints/build.xml ---------------------------------------------------------------------- diff --git a/src/plugin/nutch-extensionpoints/build.xml b/src/plugin/nutch-extensionpoints/build.xml deleted file mode 100644 index 45eb815..0000000 --- a/src/plugin/nutch-extensionpoints/build.xml +++ /dev/null @@ -1,30 +0,0 @@ -<?xml version="1.0"?> -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---> -<project name="nutch-extensionpoints" default="jar"> - - <import file="../build-plugin.xml"/> - - <!-- - ! Override the compile and jar targets, - ! since there is nothing to compile here. - ! --> - <target name="compile" depends="init, resolve-default"/> - - <!--target name="jar" depends="compile"/--> - -</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/nutch-extensionpoints/ivy.xml ---------------------------------------------------------------------- diff --git a/src/plugin/nutch-extensionpoints/ivy.xml b/src/plugin/nutch-extensionpoints/ivy.xml deleted file mode 100644 index 1a86d68..0000000 --- a/src/plugin/nutch-extensionpoints/ivy.xml +++ /dev/null @@ -1,41 +0,0 @@ -<?xml version="1.0" ?> - -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---> - -<ivy-module version="1.0"> - <info organisation="org.apache.nutch" module="${ant.project.name}"> - <license name="Apache 2.0"/> - <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/> - <description> - Apache Nutch - </description> - </info> - - <configurations> - <include file="../../..//ivy/ivy-configurations.xml"/> - </configurations> - - <publications> - <!--get the artifact from our module name--> - <artifact conf="master"/> - </publications> - - <dependencies> - </dependencies> - -</ivy-module> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/nutch-extensionpoints/plugin.xml ---------------------------------------------------------------------- diff --git a/src/plugin/nutch-extensionpoints/plugin.xml b/src/plugin/nutch-extensionpoints/plugin.xml deleted file mode 100644 index 8cf7a23..0000000 --- a/src/plugin/nutch-extensionpoints/plugin.xml +++ /dev/null @@ -1,67 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---> -<plugin - id="nutch-extensionpoints" - name="the nutch core extension points" - version="2.0.0" - provider-name="nutch.org"> - - <!-- this file hosts all extension points nutch core code offers. - Please not that plugins can define extension points as well to be extendable.--> - -<extension-point - id="org.apache.nutch.indexer.IndexingFilter" - name="Nutch Indexing Filter"/> - -<extension-point - id="org.apache.nutch.indexer.IndexWriter" - name="Nutch Index Writer"/> - -<extension-point - id="org.apache.nutch.parse.Parser" - name="Nutch Content Parser"/> - -<extension-point - id="org.apache.nutch.parse.HtmlParseFilter" - name="HTML Parse Filter"/> - -<extension-point - id="org.apache.nutch.protocol.Protocol" - name="Nutch Protocol"/> - -<extension-point - id="org.apache.nutch.net.URLFilter" - name="Nutch URL Filter"/> - -<extension-point - id="org.apache.nutch.net.URLExemptionFilter" - name="Nutch URL Ignore Exemption Filter"/> - -<extension-point - id="org.apache.nutch.net.URLNormalizer" - name="Nutch URL Normalizer"/> - -<extension-point - id="org.apache.nutch.scoring.ScoringFilter" - name="Nutch Scoring"/> - -<extension-point - id="org.apache.nutch.segment.SegmentMergeFilter" - name="Nutch Segment Merge Filter"/> - -</plugin> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-ext/build.xml ---------------------------------------------------------------------- diff --git a/src/plugin/parse-ext/build.xml b/src/plugin/parse-ext/build.xml deleted file mode 100644 index 25552fa..0000000 --- a/src/plugin/parse-ext/build.xml +++ /dev/null @@ -1,32 +0,0 @@ -<?xml version="1.0"?> -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---> -<project name="parse-ext" default="jar-core"> - - <import file="../build-plugin.xml"/> - - <!-- Deploy Unit test dependencies --> - <target name="deps-test"> - <ant target="deploy" inheritall="false" dir="../nutch-extensionpoints"/> - <ant target="deploy" inheritall="false" dir="../protocol-file"/> - </target> - - - <copy file="command" todir="${deploy.dir}" preservelastmodified="true"/> - <chmod file="${deploy.dir}/command" perm="755"/> - -</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-ext/command ---------------------------------------------------------------------- diff --git a/src/plugin/parse-ext/command b/src/plugin/parse-ext/command deleted file mode 100644 index f42c055..0000000 --- a/src/plugin/parse-ext/command +++ /dev/null @@ -1,24 +0,0 @@ -#!/bin/bash -# -# Sample bash script as external command invoked by parse-ext plugin -# -# 20040701, John Xing - -set -e - -if [ $# -ne 1 ]; then - echo Usage:$0 mimeType >&2 - exit 1 -fi - -case $1 in -"application/vnd.nutch.example.cat") - cat - ;; -"application/vnd.nutch.example.md5sum") - md5sum - ;; -*) - echo "Can't parse mimeType $1" >&2 - exit 1 -esac http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-ext/ivy.xml ---------------------------------------------------------------------- diff --git a/src/plugin/parse-ext/ivy.xml b/src/plugin/parse-ext/ivy.xml deleted file mode 100644 index 1a86d68..0000000 --- a/src/plugin/parse-ext/ivy.xml +++ /dev/null @@ -1,41 +0,0 @@ -<?xml version="1.0" ?> - -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---> - -<ivy-module version="1.0"> - <info organisation="org.apache.nutch" module="${ant.project.name}"> - <license name="Apache 2.0"/> - <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/> - <description> - Apache Nutch - </description> - </info> - - <configurations> - <include file="../../..//ivy/ivy-configurations.xml"/> - </configurations> - - <publications> - <!--get the artifact from our module name--> - <artifact conf="master"/> - </publications> - - <dependencies> - </dependencies> - -</ivy-module> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-ext/plugin.xml ---------------------------------------------------------------------- diff --git a/src/plugin/parse-ext/plugin.xml b/src/plugin/parse-ext/plugin.xml deleted file mode 100644 index 6819b36..0000000 --- a/src/plugin/parse-ext/plugin.xml +++ /dev/null @@ -1,60 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---> -<plugin - id="parse-ext" - name="External Parser Plug-in" - version="1.0.0" - provider-name="nutch.org"> - - <runtime> - <library name="parse-ext.jar"> - <export name="*"/> - </library> - </runtime> - - <requires> - <import plugin="nutch-extensionpoints"/> - </requires> - - <extension id="org.apache.nutch.parse.ext" - name="ExtParse" - point="org.apache.nutch.parse.Parser"> - - <implementation id="ExtParser" - class="org.apache.nutch.parse.ext.ExtParser"> - <parameter name="contentType" value="application/vnd.nutch.example.cat"/> - <parameter name="pathSuffix" value=""/> - <parameter name="command" value="./build/plugins/parse-ext/command"/> - <parameter name="timeout" value="10"/> - <!-- can optionally specify an encoding parameter now, see NUTCH-564--> - <!-- <parameter name="encoding" value="UTF-8"/> --> - </implementation> - - <implementation id="ExtParser" - class="org.apache.nutch.parse.ext.ExtParser"> - <parameter name="contentType" value="application/vnd.nutch.example.md5sum"/> - <parameter name="pathSuffix" value=""/> - <parameter name="command" value="./build/plugins/parse-ext/command"/> - <parameter name="timeout" value="20"/> - <!-- can optionally specify an encoding parameter now, see NUTCH-564--> - <!-- <parameter name="encoding" value="UTF-8"/> --> - </implementation> - - </extension> - -</plugin> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java ---------------------------------------------------------------------- diff --git a/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java b/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java deleted file mode 100644 index 94d9b32..0000000 --- a/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java +++ /dev/null @@ -1,183 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.nutch.parse.ext; - -import org.apache.nutch.protocol.Content; -import org.apache.nutch.parse.ParseResult; -import org.apache.nutch.parse.ParseStatus; -import org.apache.nutch.parse.Parser; -import org.apache.nutch.parse.ParseData; -import org.apache.nutch.parse.ParseImpl; -import org.apache.nutch.parse.Outlink; -import org.apache.nutch.parse.OutlinkExtractor; - -import org.apache.nutch.util.CommandRunner; -import org.apache.nutch.net.protocols.Response; -import org.apache.hadoop.conf.Configuration; - -import org.apache.nutch.plugin.Extension; -import org.apache.nutch.plugin.PluginRepository; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.util.Hashtable; - -import java.io.ByteArrayInputStream; -import java.io.ByteArrayOutputStream; -import java.nio.charset.Charset; - -/** - * A wrapper that invokes external command to do real parsing job. - * - * @author John Xing - */ - -public class ExtParser implements Parser { - - public static final Logger LOG = LoggerFactory - .getLogger("org.apache.nutch.parse.ext"); - - static final int BUFFER_SIZE = 4096; - - static final int TIMEOUT_DEFAULT = 30; // in seconds - - // handy map from String contentType to String[] {command, timeoutString, - // encoding} - Hashtable<String, String[]> TYPE_PARAMS_MAP = new Hashtable<String, String[]>(); - - private Configuration conf; - - public ExtParser() { - } - - public ParseResult getParse(Content content) { - - String contentType = content.getContentType(); - - String[] params = (String[]) TYPE_PARAMS_MAP.get(contentType); - if (params == null) - return new ParseStatus(ParseStatus.FAILED, - "No external command defined for contentType: " + contentType) - .getEmptyParseResult(content.getUrl(), getConf()); - - String command = params[0]; - int timeout = Integer.parseInt(params[1]); - String encoding = params[2]; - - if (LOG.isTraceEnabled()) { - LOG.trace("Use " + command + " with timeout=" + timeout + "secs"); - } - - String text = null; - String title = null; - - try { - - byte[] raw = content.getContent(); - - String contentLength = content.getMetadata().get(Response.CONTENT_LENGTH); - if (contentLength != null - && raw.length != Integer.parseInt(contentLength)) { - return new ParseStatus(ParseStatus.FAILED, - ParseStatus.FAILED_TRUNCATED, "Content truncated at " + raw.length - + " bytes. Parser can't handle incomplete " + contentType - + " file.").getEmptyParseResult(content.getUrl(), getConf()); - } - - ByteArrayOutputStream os = new ByteArrayOutputStream(BUFFER_SIZE); - ByteArrayOutputStream es = new ByteArrayOutputStream(BUFFER_SIZE / 4); - - CommandRunner cr = new CommandRunner(); - - cr.setCommand(command + " " + contentType); - cr.setInputStream(new ByteArrayInputStream(raw)); - cr.setStdOutputStream(os); - cr.setStdErrorStream(es); - - cr.setTimeout(timeout); - - cr.evaluate(); - - if (cr.getExitValue() != 0) - return new ParseStatus(ParseStatus.FAILED, "External command " - + command + " failed with error: " + es.toString()) - .getEmptyParseResult(content.getUrl(), getConf()); - - text = os.toString(encoding); - - } catch (Exception e) { // run time exception - return new ParseStatus(e) - .getEmptyParseResult(content.getUrl(), getConf()); - } - - if (text == null) - text = ""; - - if (title == null) - title = ""; - - // collect outlink - Outlink[] outlinks = OutlinkExtractor.getOutlinks(text, getConf()); - - ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title, - outlinks, content.getMetadata()); - return ParseResult.createParseResult(content.getUrl(), new ParseImpl(text, - parseData)); - } - - public void setConf(Configuration conf) { - this.conf = conf; - Extension[] extensions = PluginRepository.get(conf) - .getExtensionPoint("org.apache.nutch.parse.Parser").getExtensions(); - - String contentType, command, timeoutString, encoding; - - for (int i = 0; i < extensions.length; i++) { - Extension extension = extensions[i]; - - // only look for extensions defined by plugin parse-ext - if (!extension.getDescriptor().getPluginId().equals("parse-ext")) - continue; - - contentType = extension.getAttribute("contentType"); - if (contentType == null || contentType.equals("")) - continue; - - command = extension.getAttribute("command"); - if (command == null || command.equals("")) - continue; - - // null encoding means default - encoding = extension.getAttribute("encoding"); - if (encoding == null) - encoding = Charset.defaultCharset().name(); - - timeoutString = extension.getAttribute("timeout"); - if (timeoutString == null || timeoutString.equals("")) - timeoutString = "" + TIMEOUT_DEFAULT; - - TYPE_PARAMS_MAP.put(contentType, new String[] { command, timeoutString, - encoding }); - } - } - - public Configuration getConf() { - return this.conf; - } -} http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/package-info.java ---------------------------------------------------------------------- diff --git a/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/package-info.java b/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/package-info.java deleted file mode 100644 index 6394489..0000000 --- a/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/package-info.java +++ /dev/null @@ -1,22 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * Parse wrapper to run external command to do the parsing. - */ -package org.apache.nutch.parse.ext; - http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-ext/src/test/org/apache/nutch/parse/ext/TestExtParser.java ---------------------------------------------------------------------- diff --git a/src/plugin/parse-ext/src/test/org/apache/nutch/parse/ext/TestExtParser.java b/src/plugin/parse-ext/src/test/org/apache/nutch/parse/ext/TestExtParser.java deleted file mode 100644 index a399273..0000000 --- a/src/plugin/parse-ext/src/test/org/apache/nutch/parse/ext/TestExtParser.java +++ /dev/null @@ -1,130 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.nutch.parse.ext; - -import org.apache.nutch.protocol.ProtocolFactory; -import org.apache.nutch.protocol.Protocol; -import org.apache.nutch.protocol.Content; -import org.apache.nutch.protocol.ProtocolException; -import org.apache.nutch.parse.Parse; -import org.apache.nutch.parse.ParseUtil; -import org.apache.nutch.parse.ParseException; -import org.apache.hadoop.conf.Configuration; -import org.apache.nutch.util.NutchConfiguration; -import org.apache.hadoop.io.Text; -import org.apache.nutch.crawl.CrawlDatum; -import org.junit.After; -import org.junit.Assert; -import org.junit.Before; -import org.junit.Test; - -import java.io.File; -import java.io.FileOutputStream; -import java.io.IOException; - -/** - * Unit tests for ExtParser. First creates a temp file with fixed content, then - * fetch and parse it using external command 'cat' and 'md5sum' alternately for - * 10 times. Doing so also does a light stress test for class CommandRunner.java - * (as used in ExtParser.java). - * - * Warning: currently only do test on linux platform. - * - * @author John Xing - */ -public class TestExtParser { - private File tempFile = null; - private String urlString = null; - private Content content = null; - private Parse parse = null; - - private String expectedText = "nutch rocks nutch rocks nutch rocks"; - // echo -n "nutch rocks nutch rocks nutch rocks" | md5sum - private String expectedMD5sum = "df46711a1a48caafc98b1c3b83aa1526"; - - @Before - protected void setUp() throws ProtocolException, IOException { - // prepare a temp file with expectedText as its content - // This system property is defined in ./src/plugin/build-plugin.xml - String path = System.getProperty("test.data"); - if (path != null) { - File tempDir = new File(path); - if (!tempDir.exists()) - tempDir.mkdir(); - tempFile = File.createTempFile("nutch.test.plugin.ExtParser.", ".txt", - tempDir); - } else { - // otherwise in java.io.tmpdir - tempFile = File.createTempFile("nutch.test.plugin.ExtParser.", ".txt"); - } - urlString = tempFile.toURI().toURL().toString(); - - FileOutputStream fos = new FileOutputStream(tempFile); - fos.write(expectedText.getBytes()); - fos.close(); - - // get nutch content - Protocol protocol = new ProtocolFactory(NutchConfiguration.create()) - .getProtocol(urlString); - content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()) - .getContent(); - protocol = null; - } - - @After - protected void tearDown() { - // clean content - content = null; - - // clean temp file - // if (tempFile != null && tempFile.exists()) - // tempFile.delete(); - } - - @Test - public void testIt() throws ParseException { - String contentType; - - // now test only on linux platform - if (!System.getProperty("os.name").equalsIgnoreCase("linux")) { - System.err - .println("Current OS is " + System.getProperty("os.name") + "."); - System.err.println("No test is run on OS other than linux."); - return; - } - - Configuration conf = NutchConfiguration.create(); - // loop alternately, total 10*2 times of invoking external command - for (int i = 0; i < 10; i++) { - // check external parser that does 'cat' - contentType = "application/vnd.nutch.example.cat"; - content.setContentType(contentType); - parse = new ParseUtil(conf).parseByExtensionId("parse-ext", content).get( - content.getUrl()); - Assert.assertEquals(expectedText, parse.getText()); - - // check external parser that does 'md5sum' - contentType = "application/vnd.nutch.example.md5sum"; - content.setContentType(contentType); - parse = new ParseUtil(conf).parseByExtensionId("parse-ext", content).get( - content.getUrl()); - Assert.assertTrue(parse.getText().startsWith(expectedMD5sum)); - } - } - -} http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-html/build.xml ---------------------------------------------------------------------- diff --git a/src/plugin/parse-html/build.xml b/src/plugin/parse-html/build.xml deleted file mode 100755 index a5b99b5..0000000 --- a/src/plugin/parse-html/build.xml +++ /dev/null @@ -1,40 +0,0 @@ -<?xml version="1.0"?> -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---> -<project name="parse-html" default="jar-core"> - - <import file="../build-plugin.xml"/> - - <!-- Build compilation dependencies --> - <target name="deps-jar"> - <ant target="jar" inheritall="false" dir="../lib-nekohtml"/> - </target> - - <!-- Add compilation dependencies to classpath --> - <path id="plugin.deps"> - <fileset dir="${nutch.root}/build"> - <include name="**/lib-nekohtml/*.jar" /> - </fileset> - </path> - - <!-- Deploy Unit test dependencies --> - <target name="deps-test"> - <ant target="deploy" inheritall="false" dir="../lib-nekohtml"/> - <ant target="deploy" inheritall="false" dir="../nutch-extensionpoints"/> - </target> - -</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-html/ivy.xml ---------------------------------------------------------------------- diff --git a/src/plugin/parse-html/ivy.xml b/src/plugin/parse-html/ivy.xml deleted file mode 100644 index e8a6135..0000000 --- a/src/plugin/parse-html/ivy.xml +++ /dev/null @@ -1,42 +0,0 @@ -<?xml version="1.0" ?> - -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---> - -<ivy-module version="1.0"> - <info organisation="org.apache.nutch" module="${ant.project.name}"> - <license name="Apache 2.0"/> - <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/> - <description> - Apache Nutch - </description> - </info> - - <configurations> - <include file="../../../ivy/ivy-configurations.xml"/> - </configurations> - - <publications> - <!--get the artifact from our module name--> - <artifact conf="master"/> - </publications> - - <dependencies> - <dependency org="org.ccil.cowan.tagsoup" name="tagsoup" rev="1.2.1"/> - </dependencies> - -</ivy-module> http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/parse-html/plugin.xml ---------------------------------------------------------------------- diff --git a/src/plugin/parse-html/plugin.xml b/src/plugin/parse-html/plugin.xml deleted file mode 100755 index 3be70c3..0000000 --- a/src/plugin/parse-html/plugin.xml +++ /dev/null @@ -1,48 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---> -<plugin - id="parse-html" - name="Html Parse Plug-in" - version="1.0.0" - provider-name="nutch.org"> - - <runtime> - <library name="parse-html.jar"> - <export name="*"/> - </library> - <library name="tagsoup-1.2.1.jar"/> - </runtime> - - <requires> - <import plugin="nutch-extensionpoints"/> - <import plugin="lib-nekohtml"/> - </requires> - - <extension id="org.apache.nutch.parse.html" - name="HtmlParse" - point="org.apache.nutch.parse.Parser"> - - <implementation id="org.apache.nutch.parse.html.HtmlParser" - class="org.apache.nutch.parse.html.HtmlParser"> - <parameter name="contentType" value="text/html|application/xhtml+xml"/> - <parameter name="pathSuffix" value=""/> - </implementation> - - </extension> - -</plugin>
