http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parsefilter-regex/plugin.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/parsefilter-regex/plugin.xml b/nutch-plugins/parsefilter-regex/plugin.xml new file mode 100644 index 0000000..0725492 --- /dev/null +++ b/nutch-plugins/parsefilter-regex/plugin.xml @@ -0,0 +1,42 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<plugin + id="parsefilter-regex" + name="Regex Parse Filter" + version="1.0.0" + provider-name="nutch.org"> + + <runtime> + <library name="parsefilter-regex.jar"> + <export name="*"/> + </library> + </runtime> + + <requires> + <import plugin="nutch-extensionpoints"/> + </requires> + + <extension id="org.apache.nutch.htmlparsefilter.regex" + name="Nutch Parser Filter" point="org.apache.nutch.parse.HtmlParseFilter"> + <implementation id="RegexParseFilter" + class="org.apache.nutch.parsefilter.regex.RegexParseFilter"> + <parameter name="file" value="regex-parsefilter.txt"/> + </implementation> + </extension> + +</plugin>
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parsefilter-regex/pom.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/parsefilter-regex/pom.xml b/nutch-plugins/parsefilter-regex/pom.xml new file mode 100644 index 0000000..19b6452 --- /dev/null +++ b/nutch-plugins/parsefilter-regex/pom.xml @@ -0,0 +1,38 @@ +<!-- + ~ Licensed to the Apache Software Foundation (ASF) under one or more + ~ contributor license agreements. See the NOTICE file distributed with + ~ this work for additional information regarding copyright ownership. + ~ The ASF licenses this file to You under the Apache License, Version 2.0 + ~ (the "License"); you may not use this file except in compliance with + ~ the License. You may obtain a copy of the License at + ~ + ~ http://www.apache.org/licenses/LICENSE-2.0 + ~ + ~ Unless required by applicable law or agreed to in writing, software + ~ distributed under the License is distributed on an "AS IS" BASIS, + ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ~ See the License for the specific language governing permissions and + ~ limitations under the License. 
+ --> + +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + + <parent> + <groupId>org.apache.nutch</groupId> + <artifactId>nutch-plugins</artifactId> + <version>1.13-SNAPSHOT</version> + <relativePath>../pom.xml</relativePath> + </parent> + <artifactId>parsefilter-regex</artifactId> + <packaging>jar</packaging> + + <name>parsefilter-regex</name> + <url>http://nutch.apache.org</url> + + <properties> + <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> + </properties> + +</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parsefilter-regex/src/main/java/org/apache/nutch/parsefilter/regex/RegexParseFilter.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/parsefilter-regex/src/main/java/org/apache/nutch/parsefilter/regex/RegexParseFilter.java b/nutch-plugins/parsefilter-regex/src/main/java/org/apache/nutch/parsefilter/regex/RegexParseFilter.java new file mode 100644 index 0000000..0752c91 --- /dev/null +++ b/nutch-plugins/parsefilter-regex/src/main/java/org/apache/nutch/parsefilter/regex/RegexParseFilter.java @@ -0,0 +1,199 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.parsefilter.regex; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.FileReader; +import java.io.Reader; +import java.io.StringReader; +import java.util.HashMap; +import java.util.Map; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.parse.HTMLMetaTags; +import org.apache.nutch.parse.Parse; +import org.apache.nutch.parse.HtmlParseFilter; +import org.apache.nutch.parse.ParseResult; +import org.apache.nutch.plugin.Extension; +import org.apache.nutch.plugin.PluginRepository; +import org.apache.nutch.protocol.Content; + +import org.apache.commons.lang.StringUtils; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.w3c.dom.*; + +/** + * RegexParseFilter. If a regular expression matches either HTML or + * extracted text, a configurable field is set to true. 
+ */ +public class RegexParseFilter implements HtmlParseFilter { + + private static final Logger LOG = LoggerFactory.getLogger(RegexParseFilter.class); + private static String attributeFile = null; + private String regexFile = null; + + private Configuration conf; + private DocumentFragment doc; + + private static final Map<String,RegexRule> rules = new HashMap<String,RegexRule>(); + + public RegexParseFilter() {} + + public RegexParseFilter(String regexFile) { + this.regexFile = regexFile; + } + + public ParseResult filter(Content content, ParseResult parseResult, HTMLMetaTags metaTags, DocumentFragment doc) { + Parse parse = parseResult.get(content.getUrl()); + String html = new String(content.getContent()); + String text = parse.getText(); + + for (Map.Entry<String, RegexRule> entry : rules.entrySet()) { + String field = entry.getKey(); + RegexRule regexRule = entry.getValue(); + + String source = null; + if (regexRule.source.equalsIgnoreCase("html")) { + source = html; + } + if (regexRule.source.equalsIgnoreCase("text")) { + source = text; + } + + if (source == null) { + LOG.error("source for regex rule: " + field + " misconfigured"); + } + + if (matches(source, regexRule.regex)) { + parse.getData().getParseMeta().set(field, "true"); + } else { + parse.getData().getParseMeta().set(field, "false"); + } + } + + return parseResult; + } + + public void setConf(Configuration conf) { + this.conf = conf; + + // get the extensions for domain urlfilter + String pluginName = "parsefilter-regex"; + Extension[] extensions = PluginRepository.get(conf).getExtensionPoint( + HtmlParseFilter.class.getName()).getExtensions(); + for (int i = 0; i < extensions.length; i++) { + Extension extension = extensions[i]; + if (extension.getDescriptor().getPluginId().equals(pluginName)) { + attributeFile = extension.getAttribute("file"); + break; + } + } + + // handle blank non empty input + if (attributeFile != null && attributeFile.trim().equals("")) { + attributeFile = null; + } + + if 
(attributeFile != null) { + if (LOG.isInfoEnabled()) { + LOG.info("Attribute \"file\" is defined for plugin " + pluginName + + " as " + attributeFile); + } + } + else { + if (LOG.isWarnEnabled()) { + LOG.warn("Attribute \"file\" is not defined in plugin.xml for plugin " + + pluginName); + } + } + + // domain file and attribute "file" take precedence if defined + String file = conf.get("parsefilter.regex.file"); + String stringRules = conf.get("parsefilter.regex.rules"); + if (regexFile != null) { + file = regexFile; + } + else if (attributeFile != null) { + file = attributeFile; + } + Reader reader = null; + if (stringRules != null) { // takes precedence over files + reader = new StringReader(stringRules); + } else { + reader = conf.getConfResourceAsReader(file); + } + try { + if (reader == null) { + reader = new FileReader(file); + } + readConfiguration(reader); + } + catch (IOException e) { + LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e)); + } + } + + public Configuration getConf() { + return this.conf; + } + + private boolean matches(String value, Pattern pattern) { + if (value != null) { + Matcher matcher = pattern.matcher(value); + return matcher.find(); + } + + return false; + } + + private synchronized void readConfiguration(Reader configReader) throws IOException { + if (rules.size() > 0) { + return; + } + + String line; + BufferedReader reader = new BufferedReader(configReader); + while ((line = reader.readLine()) != null) { + if (StringUtils.isNotBlank(line) && !line.startsWith("#")) { + line = line.trim(); + String[] parts = line.split("\t"); + + String field = parts[0].trim(); + String source = parts[1].trim(); + String regex = parts[2].trim(); + + rules.put(field, new RegexRule(source, regex)); + } + } + } + + private static class RegexRule { + public RegexRule(String source, String regex) { + this.source = source; + this.regex = Pattern.compile(regex); + } + String source; + Pattern regex; + } +} \ No newline at end of file 
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parsefilter-regex/src/main/java/org/apache/nutch/parsefilter/regex/package-info.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/parsefilter-regex/src/main/java/org/apache/nutch/parsefilter/regex/package-info.java b/nutch-plugins/parsefilter-regex/src/main/java/org/apache/nutch/parsefilter/regex/package-info.java new file mode 100644 index 0000000..f8f46ee --- /dev/null +++ b/nutch-plugins/parsefilter-regex/src/main/java/org/apache/nutch/parsefilter/regex/package-info.java @@ -0,0 +1,23 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * RegexParseFilter. If a regular expression matches either HTML or + * extracted text, a configurable field is set to true. 
+ */ +package org.apache.nutch.parsefilter.regex; + http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parsefilter-regex/src/test/java/org/apache/nutch/parsefilter/regex/TestRegexParseFilter.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/parsefilter-regex/src/test/java/org/apache/nutch/parsefilter/regex/TestRegexParseFilter.java b/nutch-plugins/parsefilter-regex/src/test/java/org/apache/nutch/parsefilter/regex/TestRegexParseFilter.java new file mode 100644 index 0000000..9bd7149 --- /dev/null +++ b/nutch-plugins/parsefilter-regex/src/test/java/org/apache/nutch/parsefilter/regex/TestRegexParseFilter.java @@ -0,0 +1,77 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.nutch.parsefilter.regex; + +import java.net.MalformedURLException; + +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.parse.Parse; +import org.apache.nutch.parse.ParseData; +import org.apache.nutch.parse.ParseImpl; +import org.apache.nutch.parse.ParseResult; +import org.apache.nutch.protocol.Content; +import org.apache.nutch.util.NutchConfiguration; +import junit.framework.TestCase; + +public class TestRegexParseFilter extends TestCase { + + private final static String SEPARATOR = System.getProperty("file.separator"); + private final static String SAMPLES = System.getProperty("test.data", "."); + + public void testPositiveFilter() throws Exception { + Configuration conf = NutchConfiguration.create(); + + String file = SAMPLES + SEPARATOR + "regex-parsefilter.txt"; + RegexParseFilter filter = new RegexParseFilter(file); + filter.setConf(conf); + + String url = "http://nutch.apache.org/"; + String html = "<body><html><h1>nutch</h1><p>this is the extracted text blablabla</p></body></html>"; + Content content = new Content(url, url, html.getBytes("UTF-8"), "text/html", new Metadata(), conf); + Parse parse = new ParseImpl("nutch this is the extracted text blablabla", new ParseData()); + + ParseResult result = ParseResult.createParseResult(url, parse); + result = filter.filter(content, result, null, null); + + Metadata meta = parse.getData().getParseMeta(); + + assertEquals("true", meta.get("first")); + assertEquals("true", meta.get("second")); + } + + public void testNegativeFilter() throws Exception { + Configuration conf = NutchConfiguration.create(); + + String file = SAMPLES + SEPARATOR + "regex-parsefilter.txt"; + RegexParseFilter filter = new RegexParseFilter(file); + filter.setConf(conf); + + String url = "http://nutch.apache.org/"; + String html = "<body><html><h2>nutch</h2><p>this is the extracted text no bla</p></body></html>"; + Content content = new Content(url, url, 
html.getBytes("UTF-8"), "text/html", new Metadata(), conf); + Parse parse = new ParseImpl("nutch this is the extracted text bla", new ParseData()); + + ParseResult result = ParseResult.createParseResult(url, parse); + result = filter.filter(content, result, null, null); + + Metadata meta = parse.getData().getParseMeta(); + + assertEquals("false", meta.get("first")); + assertEquals("false", meta.get("second")); + } +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/parsefilter-regex/src/test/resources/regex-parsefilter.txt ---------------------------------------------------------------------- diff --git a/nutch-plugins/parsefilter-regex/src/test/resources/regex-parsefilter.txt b/nutch-plugins/parsefilter-regex/src/test/resources/regex-parsefilter.txt new file mode 100644 index 0000000..9d15cd8 --- /dev/null +++ b/nutch-plugins/parsefilter-regex/src/test/resources/regex-parsefilter.txt @@ -0,0 +1,10 @@ +# Example configuration file for parsefilter-regex +# +# Parse metadata field <name> is set to true if the HTML matches the regex. The +# source can either be html or text. If source is html, the regex is applied to +# the entire HTML tree. If source is text, the regex is applied to the +# extracted text. +# +# format: <name>\t<source>\t<regex>\n +first html h1 +second text blablabla http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/plugin.dtd ---------------------------------------------------------------------- diff --git a/nutch-plugins/plugin.dtd b/nutch-plugins/plugin.dtd new file mode 100644 index 0000000..9b67da7 --- /dev/null +++ b/nutch-plugins/plugin.dtd @@ -0,0 +1,206 @@ +<?xml version="1.0" encoding="UTF-8"?> + +<!-- + ! Licensed to the Apache Software Foundation (ASF) under one or more + ! contributor license agreements. See the NOTICE file distributed with + ! this work for additional information regarding copyright ownership. + ! 
The ASF licenses this file to You under the Apache License, Version 2.0 + ! (the "License"); you may not use this file except in compliance with + ! the License. You may obtain a copy of the License at + ! + ! http://www.apache.org/licenses/LICENSE-2.0 + ! + ! Unless required by applicable law or agreed to in writing, software + ! distributed under the License is distributed on an "AS IS" BASIS, + ! WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ! See the License for the specific language governing permissions and + ! limitations under the License. + ! + ! + ! Document : plugin.dtd + ! Created on : 14 avril 2006, 22:14 + ! Author : Chris Mattmann, Jerome Charron + ! Description: Nutch plug-in manifest DTD + ! + ! PUBLIC ID : -//Apache Software Fundation//DTD Nutch Plugin Manifest 1.0//EN + ! SYSTEM ID : http://lucene.apache.org/nutch/plugin.dtd +--> + + + +<!-- + ! The <plugin> element defines the body of the manifest. + ! It optionally contains definitions for the plug-in runtime, + ! definitions of other plug-ins required by this one, + ! declarations of any new extension points being introduced by the plug-in, + ! as well as configuration of functional extensions + ! (configured into extension points defined by other plug-ins, + ! or introduced by this plug-in). + !--> +<!ELEMENT plugin (runtime?, requires?, extension-point*, extension*)> + +<!-- A user displayable name for the plug-in --> +<!ATTLIST plugin name CDATA #REQUIRED> + +<!-- + ! A unique identifier for the plug-in. + ! To minimize potential for naming collisions, + ! the identifier should be derived from the internet domain id + ! of the supplying provider (reversing the domain name tokens and + ! appending additional name tokens separated by dot [.]). + ! For example, provider nutch.org could define plug-in identifier + ! org.nutch.myplugin + !--> +<!ATTLIST plugin id CDATA #REQUIRED> + +<!-- + ! The plug-in version number. + ! 
NOTE : Version numbers compatibility are not yet implemented. + !--> +<!ATTLIST plugin version CDATA #REQUIRED> + +<!-- The user-displayable name of the provider supplying the plug-in. --> +<!ATTLIST plugin provider-name CDATA #IMPLIED> + +<!-- + ! The name of the plug-in class for this plug-in. + ! The class must be a subclass of org.apache.nutch.plugin.Plugin + !--> +<!ATTLIST plugin class CDATA #IMPLIED> + + +<!-- + ! The <requires> section of the manifest declares + ! any dependencies on other plug-ins. + !--> +<!ELEMENT requires (import+)> + + +<!-- Each dependency is specified using an <import> element. --> +<!ELEMENT import EMPTY> + +<!-- The identifier of the required plug-in. --> +<!ATTLIST import plugin CDATA #REQUIRED> + + +<!-- + ! The <runtime> section of the manifest contains a definition of one or more + ! libraries that make up the plug-in runtime. + ! The referenced libraries are used by the plugin execution mechanisms + ! (the plug-in class loader) to load and execute the correct code required by + ! the plug-in. + !--> +<!ELEMENT runtime (library+)> + + +<!-- + !The <library> elements collectively define the plug-in runtime. + ! At least one <library> must be specified. + !--> +<!ELEMENT library (export*)> + +<!-- + ! A string reference to a library file or directory containing classes + ! (relative to the plug-in install directory). + ! Directory references must contain trailing file separator. + !--> +<!ATTLIST library name CDATA #REQUIRED> + + +<!-- + ! Each <library> element can specify which portion + ! of the library should be exported. + ! The export rules are specified as a set of export masks. + ! By default (no export rules specified), + ! the library is considered to be private. + ! Each export mask is specified using the name attribute. + !--> +<!ELEMENT export EMPTY> + +<!-- + ! The export mask can have the following values: + ! * - indicates all contents of library are exported (public) + ! 
package.name.* - indicates all classes in the specified package + ! are exported. The matching rules are the same as in the + ! Java import statement. + ! package.name.ClassName - fully qualified java class name + ! + ! NOTE : export mask is not yet implemented in Nutch. + !--> +<!ATTLIST export name CDATA #REQUIRED> + + +<!-- + ! Nutch's architecture is based on the notion of configurable extension points. + ! Nutch itself predefines a set of extension points that cover the task of + ! extending it (for example, adding parser, indexing filter, ...). + ! In addition to the predefined extension points, each supplied plug-in can + ! declare additional extension points. By declaring an extension point the + ! plug-in is essentially advertising the ability to configure the plug-in + ! function with externally supplied extensions. + !--> +<!ELEMENT extension-point EMPTY> + +<!-- A user-displayable name for the extension point. --> +<!ATTLIST extension-point name CDATA #REQUIRED> + +<!-- A simple id, unique within this plug-in --> +<!ATTLIST extension-point id CDATA #REQUIRED> + + +<!-- + ! Actual extensions are configured into extension points + ! (predefined, or newly declared in this plug-in) in the <extension> section. + ! + ! The configuration information is specified by at least one implementation + ! with some parameters. + !--> +<!ELEMENT extension (implementation+)> + +<!-- + ! A reference to an extension point being configured. + ! The extension point can be one defined in this plug-in or another plug-in. + !--> +<!ATTLIST extension point CDATA #REQUIRED> + +<!-- + ! Optional identifier for this extension point configuration instance. + ! This is used by extension points that need to uniquely identify + ! (rather than just enumerate) the specific configured extensions. + ! The identifier is specified as a simple token unique within the definition + ! of the declaring plug-in. When used globally, the extension identifier + ! 
is qualified by the plug-in identifier. + ! FIXME : Seems it is never read in the code. + !--> +<!ATTLIST extension id CDATA #IMPLIED> + +<!-- + ! A user-displayable name for the extension. + ! FIXME : Seems it is never read in the code. + !--> +<!ATTLIST extension name CDATA #IMPLIED> + + +<!-- + ! Defines a specific implementation for the extension. + ! This implementation can define some special name/value parameters + ! used at runtime. + !--> +<!ELEMENT implementation (parameter*)> + +<!-- A unique identifier for this implementation --> +<!ATTLIST implementation id CDATA #REQUIRED> + +<!-- The fully-qualified Java Class that implements this extension-point --> +<!ATTLIST implementation class CDATA #REQUIRED> + + +<!-- Defines a name/value parameter --> +<!ELEMENT parameter EMPTY> + +<!-- The parameter's name (should be unique for an extension) --> +<!ATTLIST parameter name CDATA #REQUIRED> + +<!-- The parameter's value --> +<!ATTLIST parameter value CDATA #REQUIRED> + http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/plugin/pom.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/plugin/pom.xml b/nutch-plugins/plugin/pom.xml new file mode 100644 index 0000000..2ac06ee --- /dev/null +++ b/nutch-plugins/plugin/pom.xml @@ -0,0 +1,38 @@ +<!-- + ~ Licensed to the Apache Software Foundation (ASF) under one or more + ~ contributor license agreements. See the NOTICE file distributed with + ~ this work for additional information regarding copyright ownership. + ~ The ASF licenses this file to You under the Apache License, Version 2.0 + ~ (the "License"); you may not use this file except in compliance with + ~ the License. 
You may obtain a copy of the License at + ~ + ~ http://www.apache.org/licenses/LICENSE-2.0 + ~ + ~ Unless required by applicable law or agreed to in writing, software + ~ distributed under the License is distributed on an "AS IS" BASIS, + ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ~ See the License for the specific language governing permissions and + ~ limitations under the License. + --> + +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + + <parent> + <groupId>org.apache.nutch</groupId> + <artifactId>nutch-plugins</artifactId> + <version>1.13-SNAPSHOT</version> + <relativePath>../pom.xml</relativePath> + </parent> + <artifactId>plugin</artifactId> + <packaging>jar</packaging> + + <name>plugin</name> + <url>http://nutch.apache.org</url> + + <properties> + <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> + </properties> + +</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/pom.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/pom.xml b/nutch-plugins/pom.xml new file mode 100644 index 0000000..e07f487 --- /dev/null +++ b/nutch-plugins/pom.xml @@ -0,0 +1,164 @@ +<!-- + ~ Licensed to the Apache Software Foundation (ASF) under one or more + ~ contributor license agreements. See the NOTICE file distributed with + ~ this work for additional information regarding copyright ownership. + ~ The ASF licenses this file to You under the Apache License, Version 2.0 + ~ (the "License"); you may not use this file except in compliance with + ~ the License. 
You may obtain a copy of the License at + ~ + ~ http://www.apache.org/licenses/LICENSE-2.0 + ~ + ~ Unless required by applicable law or agreed to in writing, software + ~ distributed under the License is distributed on an "AS IS" BASIS, + ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ~ See the License for the specific language governing permissions and + ~ limitations under the License. + --> + +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + + <parent> + <groupId>org.apache.nutch</groupId> + <artifactId>nutch-parent</artifactId> + <version>1.13-SNAPSHOT</version> + <relativePath>../pom.xml</relativePath> + </parent> + <artifactId>nutch-plugins</artifactId> + <packaging>pom</packaging> + + <name>nutch-plugins</name> + <url>http://nutch.apache.org</url> + + <modules> + <!--<module>indexer-solr</module>--> + <module>creativecommons</module> + <module>feed</module> + <module>headings</module> + <module>index-anchor</module> + <module>index-basic</module> + <module>index-geoip</module> + <module>index-links</module> + <module>index-metadata</module> + <module>index-more</module> + <module>index-replace</module> + <module>index-static</module> + <module>indexer-cloudsearch</module> + <module>indexer-dummy</module> + <module>indexer-elastic</module> + <module>indexer-solr</module> + <module>language-identifier</module> + <module>lib-htmlunit</module> + <module>lib-http</module> + <module>lib-nekohtml</module> + <module>lib-regex-filter</module> + <module>lib-selenium</module> + <module>lib-xml</module> + <module>microformats-reltag</module> + <module>mimetype-filter</module> + <module>nutch-extensionpoints</module> + <module>parse-ext</module> + <module>parse-html</module> + <module>parse-js</module> + <module>parse-metatags</module> + 
<module>parse-replace</module> + <module>parse-swf</module> + <module>parse-tika</module> + <module>parse-zip</module> + <module>parsefilter-naivebayes</module> + <module>parsefilter-regex</module> + <module>plugin</module> + <module>protocol-file</module> + <module>protocol-ftp</module> + <module>protocol-htmlunit</module> + <module>protocol-http</module> + <module>protocol-httpclient</module> + <module>protocol-interactiveselenium</module> + <module>protocol-selenium</module> + <module>scoring-depth</module> + <module>scoring-link</module> + <module>scoring-opic</module> + <module>scoring-similarity</module> + <module>subcollection</module> + <module>tld</module> + <module>urlfilter-automaton</module> + <module>urlfilter-domain</module> + <module>urlfilter-domainblacklist</module> + <module>urlfilter-ignoreexempt</module> + <module>urlfilter-prefix</module> + <module>urlfilter-regex</module> + <module>urlfilter-suffix</module> + <module>urlfilter-validator</module> + <module>urlmeta</module> + <module>urlnormalizer-ajax</module> + <module>urlnormalizer-basic</module> + <module>urlnormalizer-host</module> + <module>urlnormalizer-pass</module> + <module>urlnormalizer-protocol</module> + <module>urlnormalizer-querystring</module> + <module>urlnormalizer-regex</module> + <module>urlnormalizer-slash</module> + </modules> + <properties> + <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> + <!-- Note : an additional level is for the child modules (defined ahead in hierarchy)--> + <dir.root>..${file.separator}..${file.separator}</dir.root> + <libs.dir>${dir.local.plugins}${file.separator}${project.artifactId}</libs.dir> + </properties> + <dependencies> + <dependency> + <groupId>org.apache.nutch</groupId> + <artifactId>nutch-core</artifactId> + <version>${project.parent.version}</version> + <scope>provided</scope> + </dependency> + <dependency> + <groupId>org.apache.nutch</groupId> + <artifactId>nutch-core</artifactId> + 
<version>${project.parent.version}</version> + <scope>test</scope> + <type>test-jar</type> + </dependency> + </dependencies> + <build> + <finalName>${project.artifactId}</finalName> + <plugins> + <plugin> + <artifactId>maven-resources-plugin</artifactId> + <version>3.0.1</version> + <executions> + <execution> + <id>copy-resources</id> + <phase>package</phase> + <goals> + <goal>copy-resources</goal> + </goals> + <configuration> + <outputDirectory>${libs.dir}</outputDirectory> + <resources> + <resource> + <directory>${project.build.directory}</directory> + <include>${build.finalName}.jar</include> + </resource> + <resource> + <directory>${project.basedir}</directory> + <include>plugin.xml</include> + </resource> + </resources> + </configuration> + </execution> + </executions> + </plugin> + <plugin> + <artifactId>maven-surefire-plugin</artifactId> + <version>2.19.1</version> + </plugin> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-failsafe-plugin</artifactId> + <version>2.19.1</version> + </plugin> + </plugins> + </build> +</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-file/build.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/protocol-file/build.xml b/nutch-plugins/protocol-file/build.xml new file mode 100644 index 0000000..121b1fe --- /dev/null +++ b/nutch-plugins/protocol-file/build.xml @@ -0,0 +1,29 @@ +<?xml version="1.0"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. 
You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<project name="protocol-file" default="jar-core"> + + <import file="../build-plugin.xml"/> + + <!-- for junit test --> + <mkdir dir="${build.test}/data"/> + <copy todir="${build.test}/data"> + <fileset dir="sample"> + <include name="*.txt"/> + </fileset> + </copy> +</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-file/ivy.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/protocol-file/ivy.xml b/nutch-plugins/protocol-file/ivy.xml new file mode 100644 index 0000000..1a86d68 --- /dev/null +++ b/nutch-plugins/protocol-file/ivy.xml @@ -0,0 +1,41 @@ +<?xml version="1.0" ?> + +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> + +<ivy-module version="1.0"> + <info organisation="org.apache.nutch" module="${ant.project.name}"> + <license name="Apache 2.0"/> + <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/> + <description> + Apache Nutch + </description> + </info> + + <configurations> + <include file="../../..//ivy/ivy-configurations.xml"/> + </configurations> + + <publications> + <!--get the artifact from our module name--> + <artifact conf="master"/> + </publications> + + <dependencies> + </dependencies> + +</ivy-module> http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-file/plugin.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/protocol-file/plugin.xml b/nutch-plugins/protocol-file/plugin.xml new file mode 100644 index 0000000..1647ce4 --- /dev/null +++ b/nutch-plugins/protocol-file/plugin.xml @@ -0,0 +1,46 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> +<plugin + id="protocol-file" + name="File Protocol Plug-in" + version="1.0.0" + provider-name="nutch.org"> + + + <runtime> + <library name="protocol-file.jar"> + <export name="*"/> + </library> + </runtime> + + <requires> + <import plugin="nutch-extensionpoints"/> + </requires> + + <extension id="org.apache.nutch.protocol.file" + name="FileProtocol" + point="org.apache.nutch.protocol.Protocol"> + + <implementation id="org.apache.nutch.protocol.file.File" + class="org.apache.nutch.protocol.file.File"> + <parameter name="protocolName" value="file"/> + </implementation> + + </extension> + +</plugin> http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-file/pom.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/protocol-file/pom.xml b/nutch-plugins/protocol-file/pom.xml new file mode 100644 index 0000000..2ab2f75 --- /dev/null +++ b/nutch-plugins/protocol-file/pom.xml @@ -0,0 +1,38 @@ +<!-- + ~ Licensed to the Apache Software Foundation (ASF) under one or more + ~ contributor license agreements. See the NOTICE file distributed with + ~ this work for additional information regarding copyright ownership. + ~ The ASF licenses this file to You under the Apache License, Version 2.0 + ~ (the "License"); you may not use this file except in compliance with + ~ the License. You may obtain a copy of the License at + ~ + ~ http://www.apache.org/licenses/LICENSE-2.0 + ~ + ~ Unless required by applicable law or agreed to in writing, software + ~ distributed under the License is distributed on an "AS IS" BASIS, + ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ~ See the License for the specific language governing permissions and + ~ limitations under the License. 
+ --> + +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + + <parent> + <groupId>org.apache.nutch</groupId> + <artifactId>nutch-plugins</artifactId> + <version>1.13-SNAPSHOT</version> + <relativePath>../pom.xml</relativePath> + </parent> + <artifactId>protocol-file</artifactId> + <packaging>jar</packaging> + + <name>protocol-file</name> + <url>http://nutch.apache.org</url> + + <properties> + <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> + </properties> + +</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-file/src/main/java/org/apache/nutch/protocol/file/File.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/protocol-file/src/main/java/org/apache/nutch/protocol/file/File.java b/nutch-plugins/protocol-file/src/main/java/org/apache/nutch/protocol/file/File.java new file mode 100644 index 0000000..2712218 --- /dev/null +++ b/nutch-plugins/protocol-file/src/main/java/org/apache/nutch/protocol/file/File.java @@ -0,0 +1,228 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.protocol.file; + +import java.net.URL; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.Text; + +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.net.protocols.Response; +import org.apache.nutch.protocol.Content; +import org.apache.nutch.protocol.Protocol; +import org.apache.nutch.protocol.ProtocolOutput; +import org.apache.nutch.protocol.ProtocolStatus; +import org.apache.nutch.protocol.RobotRulesParser; +import org.apache.nutch.util.NutchConfiguration; + +import crawlercommons.robots.BaseRobotRules; + +/** + * This class is a protocol plugin used for file: scheme. It creates + * {@link FileResponse} object and gets the content of the url from it. + * Configurable parameters are {@code file.content.limit} and + * {@code file.crawl.parent} in nutch-default.xml defined under + * "file properties" section. 
+ * + * @author John Xing + */ +public class File implements Protocol { + + public static final Logger LOG = LoggerFactory.getLogger(File.class); + + /** Number of redirects followed internally before giving up. */ + static final int MAX_REDIRECTS = 5; + + /** + * Content size limit in bytes, read from {@code file.content.limit}; a + * negative value disables truncation. + */ + int maxContentLength; + + /** + * Value of {@code file.crawl.parent}; decides whether directory listings + * link to the parent directory. + */ + boolean crawlParents; + + /** + * if true return a redirect for symbolic links and do not resolve the links + * internally + */ + boolean symlinksAsRedirects = true; + + private Configuration conf; + + public File() { + } + + /** + * Set the {@link Configuration} object and read the plugin settings + * {@code file.content.limit}, {@code file.crawl.parent} and + * {@code file.crawl.redirect_noncanonical} from it. + */ + public void setConf(Configuration conf) { + this.conf = conf; + this.maxContentLength = conf.getInt("file.content.limit", 64 * 1024); + this.crawlParents = conf.getBoolean("file.crawl.parent", true); + this.symlinksAsRedirects = conf.getBoolean( + "file.crawl.redirect_noncanonical", true); + } + + /** + * Get the {@link Configuration} object + */ + public Configuration getConf() { + return this.conf; + } + + /** + * Set the length at which content is truncated. + */ + public void setMaxContentLength(int maxContentLength) { + this.maxContentLength = maxContentLength; + } + + /** + * Creates a {@link FileResponse} object corresponding to the url and returns + * a {@link ProtocolOutput} object as per the content received. Redirects + * (codes 300 to 399) are handed back to the caller when + * {@code symlinksAsRedirects} is set; otherwise they are followed internally + * until {@link #MAX_REDIRECTS} of them have been followed. Any exception is + * logged and returned as the status of an output without content. + * + * @param url + * Text containing the url + * @param datum + * The CrawlDatum object corresponding to the url + * + * @return {@link ProtocolOutput} object for the content of the file indicated + * by url + */ + public ProtocolOutput getProtocolOutput(Text url, CrawlDatum datum) { + String urlString = url.toString(); + try { + URL u = new URL(urlString); + + int redirects = 0; + + while (true) { + // make a request + FileResponse response = new FileResponse(u, datum, this, getConf()); + + int code = response.getCode(); + 
+ if (code == 200) { // got a good response + return new ProtocolOutput(response.toContent()); // return it + + } else if (code == 304) { // got not modified + return new ProtocolOutput(response.toContent(), + ProtocolStatus.STATUS_NOTMODIFIED); + + } else if (code == 401) { // access denied / no read permissions + return new ProtocolOutput(response.toContent(), new ProtocolStatus( + ProtocolStatus.ACCESS_DENIED)); + + } else if (code == 404) { // no such file + return new ProtocolOutput(response.toContent(), + ProtocolStatus.STATUS_NOTFOUND); + + } else if (code >= 300 && code < 400) { // handle redirect + u = new URL(response.getHeader("Location")); + if (LOG.isTraceEnabled()) { + LOG.trace("redirect to " + u); + } + if (symlinksAsRedirects) { + return new ProtocolOutput(response.toContent(), new ProtocolStatus( + ProtocolStatus.MOVED, u)); + } else if (redirects == MAX_REDIRECTS) { + LOG.trace("Too many redirects: {}", url); + return new ProtocolOutput(response.toContent(), new ProtocolStatus( + ProtocolStatus.REDIR_EXCEEDED, u)); + } + redirects++; + + } else { // convert to exception + throw new FileError(code); + } + } + } catch (Exception e) { + // report through the plugin logger instead of printing to stderr + LOG.error("Failed to fetch " + urlString, e); + return new ProtocolOutput(null, new ProtocolStatus(e)); + } + } + + /** + * Quick way for running this class. Useful for debugging. 
+ */ + public static void main(String[] args) throws Exception { + int maxContentLength = Integer.MIN_VALUE; + // the -logLevel value is still parsed so existing command lines keep + // working, but it is not applied to the logger + String logLevel = "info"; + boolean dumpContent = false; + String urlString = null; + + String usage = "Usage: File [-logLevel level] [-maxContentLength L] [-dumpContent] url"; + + if (args.length == 0) { + System.err.println(usage); + System.exit(-1); + } + + for (int i = 0; i < args.length; i++) { + if (args[i].equals("-logLevel")) { + logLevel = args[++i]; + } else if (args[i].equals("-maxContentLength")) { + maxContentLength = Integer.parseInt(args[++i]); + } else if (args[i].equals("-dumpContent")) { + dumpContent = true; + } else if (i != args.length - 1) { + System.err.println(usage); + System.exit(-1); + } else + urlString = args[i]; + } + + // the url has to be the last argument; stop if only options were given + if (urlString == null) { + System.err.println(usage); + System.exit(-1); + } + + File file = new File(); + file.setConf(NutchConfiguration.create()); + + if (maxContentLength != Integer.MIN_VALUE) // set maxContentLength + file.setMaxContentLength(maxContentLength); + + ProtocolOutput output = file.getProtocolOutput(new Text(urlString), + new CrawlDatum()); + Content content = output.getContent(); + + // a failed fetch carries no content, only a status + if (content == null) { + System.err.println("URL: " + urlString); + System.err.println("Status: " + output.getStatus()); + return; + } + + System.err.println("URL: " + content.getUrl()); + System.err.println("Status: " + output.getStatus()); + System.err.println("Content-Type: " + content.getContentType()); + System.err.println("Content-Length: " + + content.getMetadata().get(Response.CONTENT_LENGTH)); + System.err.println("Last-Modified: " + + content.getMetadata().get(Response.LAST_MODIFIED)); + String redirectLocation = content.getMetadata().get("Location"); + if (redirectLocation != null) { + System.err.println("Location: " + redirectLocation); + } + + if (dumpContent) { + System.out.print(new String(content.getContent())); + } + } + + /** + * No robots parsing is done for file protocol. So this returns a set of empty + * rules which will allow every url. 
+ */ + public BaseRobotRules getRobotRules(Text url, CrawlDatum datum) { + return RobotRulesParser.EMPTY_RULES; + } +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-file/src/main/java/org/apache/nutch/protocol/file/FileError.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/protocol-file/src/main/java/org/apache/nutch/protocol/file/FileError.java b/nutch-plugins/protocol-file/src/main/java/org/apache/nutch/protocol/file/FileError.java new file mode 100644 index 0000000..4fef340 --- /dev/null +++ b/nutch-plugins/protocol-file/src/main/java/org/apache/nutch/protocol/file/FileError.java @@ -0,0 +1,36 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.protocol.file; + +/** + * Thrown for File error codes. 
+ */ +public class FileError extends FileException { + + private int code; + + public int getCode(int code) { + return code; + } + + public FileError(int code) { + super("File Error: " + code); + this.code = code; + } + +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-file/src/main/java/org/apache/nutch/protocol/file/FileException.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/protocol-file/src/main/java/org/apache/nutch/protocol/file/FileException.java b/nutch-plugins/protocol-file/src/main/java/org/apache/nutch/protocol/file/FileException.java new file mode 100644 index 0000000..f0467de --- /dev/null +++ b/nutch-plugins/protocol-file/src/main/java/org/apache/nutch/protocol/file/FileException.java @@ -0,0 +1,40 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.nutch.protocol.file; + +import org.apache.nutch.protocol.ProtocolException; + +/** + * Exception type of the file protocol plugin. Every constructor forwards its + * arguments unchanged to {@link ProtocolException}. + */ +public class FileException extends ProtocolException { + + /** Creates an exception without message or cause. */ + public FileException() { + } + + /** + * Creates an exception that wraps another throwable. + * + * @param cause + * the underlying failure + */ + public FileException(Throwable cause) { + super(cause); + } + + /** + * Creates an exception with a message. + * + * @param message + * description of the failure + */ + public FileException(String message) { + super(message); + } + + /** + * Creates an exception with a message and the throwable that triggered it. + * + * @param message + * description of the failure + * @param cause + * the underlying failure + */ + public FileException(String message, Throwable cause) { + super(message, cause); + } + +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-file/src/main/java/org/apache/nutch/protocol/file/FileResponse.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/protocol-file/src/main/java/org/apache/nutch/protocol/file/FileResponse.java b/nutch-plugins/protocol-file/src/main/java/org/apache/nutch/protocol/file/FileResponse.java new file mode 100644 index 0000000..b6e74ff --- /dev/null +++ b/nutch-plugins/protocol-file/src/main/java/org/apache/nutch/protocol/file/FileResponse.java @@ -0,0 +1,317 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.nutch.protocol.file; + +// JDK imports +import java.net.URL; +import java.io.IOException; +import java.io.UnsupportedEncodingException; + +// Nutch imports +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.protocol.Content; +import org.apache.nutch.util.MimeUtil; +import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.net.protocols.HttpDateFormat; +import org.apache.nutch.net.protocols.Response; + +// Tika imports +import org.apache.tika.Tika; + +// Hadoop imports +import org.apache.hadoop.conf.Configuration; + +/************************************ + * FileResponse.java mimics file replies as http response. It tries its best to + * follow http's way for headers, response codes as well as exceptions. + * + * Comments: (1) java.net.URL and java.net.URLConnection can handle file: + * scheme. However they are not flexible enough, so not used in this + * implementation. + * + * (2) java.io.File is used for its abstractness across platforms. Warning: + * java.io.File API (1.4.2) does not elaborate on how special files, such as + * /dev/* in unix and /proc/* on linux, are treated. Tests show (a) + * java.io.File.isFile() return false for /dev/* (b) java.io.File.isFile() + * return true for /proc/* (c) java.io.File.length() return 0 for /proc/* We are + * probably okay for now. Could be buggy here. How about special files on + * windows? + * + * (3) java.io.File API (1.4.2) does not seem to know unix hard link files. They + * are just treated as individual files. + * + * (4) No fancy POSIX file attributes yet. May never need? 
+ * + * @author John Xing + ***********************************/ +public class FileResponse { + + private String orig; + private String base; + private byte[] content; + private static final byte[] EMPTY_CONTENT = new byte[0]; + private int code; + private Metadata headers = new Metadata(); + + private final File file; + private Configuration conf; + + private MimeUtil MIME; + private Tika tika; + + /** Returns the response code. */ + public int getCode() { + return code; + } + + /** Returns the value of a named header. */ + public String getHeader(String name) { + return headers.get(name); + } + + /** Returns the captured content, or null when none was read. */ + public byte[] getContent() { + return content; + } + + /** + * Wraps this response into a {@link Content} object; an empty byte array + * stands in for missing content. + */ + public Content toContent() { + return new Content(orig, base, (content != null ? content : EMPTY_CONTENT), + getHeader(Response.CONTENT_TYPE), headers, this.conf); + } + + /** + * Builds the response for a file: url. The response code is set to 404 when + * the path does not exist, 401 when it cannot be read, 300 (with a Location + * header) when the path differs from its canonical form, 304 when the file + * was not modified after the datum's modified time, 200 after a directory + * was listed or a file was read, and 500 for anything that is neither a + * directory nor a regular file. + * + * @param url + * url to fetch; must use the file protocol + * @param datum + * crawl datum whose modified time is compared with the file's + * @param file + * protocol instance supplying maxContentLength and crawlParents + * @param conf + * configuration passed on to MimeUtil and Content + * @throws FileException + * if url is not a file: url or the file is larger than + * Integer.MAX_VALUE bytes + * @throws IOException + * if resolving the canonical path or reading the file fails + */ + public FileResponse(URL url, CrawlDatum datum, File file, Configuration conf) + throws FileException, IOException { + + this.orig = url.toString(); + this.base = url.toString(); + this.file = file; + this.conf = conf; + + MIME = new MimeUtil(conf); + tika = new Tika(); + + if (!"file".equals(url.getProtocol())) + throw new FileException("Not a file url:" + url); + + if (File.LOG.isTraceEnabled()) { + File.LOG.trace("fetching " + url); + } + + // compare the strings by value; != only compared object references + if (!url.getPath().equals(url.getFile())) { + if (File.LOG.isWarnEnabled()) { + File.LOG.warn("url.getPath() != url.getFile(): " + url); + } + } + + String path = "".equals(url.getPath()) ? "/" : url.getPath(); + 
+ try { + // specify the encoding via the config later? + path = java.net.URLDecoder.decode(path, "UTF-8"); + } catch (UnsupportedEncodingException ignored) { + // cannot happen: every Java platform is required to support UTF-8 + } + + this.content = null; + + // url.toURI() is only in j2se 1.5.0 + // java.io.File f = new java.io.File(url.toURI()); + java.io.File f = new java.io.File(path); + + if (!f.exists()) { + this.code = 404; // http Not Found + return; + } + + if (!f.canRead()) { + this.code = 401; // http Unauthorized + return; + } + + // symbolic link or relative path on unix + // fix me: what's the consequence on windows platform + // where case is insensitive + if (!f.equals(f.getCanonicalFile())) { + // we want to automatically escape characters that are illegal in URLs. + // It is recommended that new code convert an abstract pathname into a + // URL by first converting it into a URI, via the toURI method, and then + // converting the URI into a URL via the URI.toURL method. + headers.set(Response.LOCATION, f.getCanonicalFile().toURI().toURL() + .toString()); + + this.code = 300; // http redirect + return; + } + + if (f.lastModified() <= datum.getModifiedTime()) { + this.code = 304; + this.headers.set("Last-Modified", + HttpDateFormat.toString(f.lastModified())); + return; + } + + if (f.isDirectory()) { + getDirAsHttpResponse(f); + } else if (f.isFile()) { + getFileAsHttpResponse(f); + } else { + this.code = 500; // http Internal Server Error + } + + } + + /** + * Reads the file into the content buffer, truncated to the protocol's + * maxContentLength when that limit is not negative, and sets the + * Content-Length, Last-Modified and Content-Type headers and code 200. + * + * @param f + * regular file to read + * @throws FileException + * if the file is larger than Integer.MAX_VALUE bytes + * @throws IOException + * if opening or reading the file fails + */ + private void getFileAsHttpResponse(java.io.File f) throws FileException, + IOException { + + // ignore file of size larger than + // Integer.MAX_VALUE = 2^31-1 = 2147483647 + long size = f.length(); + if (size > Integer.MAX_VALUE) { + throw new FileException("file is too large, size: " + size); + } + 
+ // capture content + int len = (int) size; + + if (this.file.maxContentLength >= 0 && len > this.file.maxContentLength) + len = this.file.maxContentLength; + + this.content = new byte[len]; + + java.io.InputStream is = new java.io.FileInputStream(f); + int offset = 0; + try { + int n = 0; + while (offset < len + && (n = is.read(this.content, offset, len - offset)) >= 0) { + offset += n; + } + } finally { + // release the file handle even when read() throws + is.close(); + } + if (offset < len) { // keep whatever already have, but issue a warning + if (File.LOG.isWarnEnabled()) { + File.LOG.warn("not enough bytes read from file: " + f.getPath()); + } + } + + // set headers; Content-Length carries the full file size, not the + // possibly truncated buffer length + headers.set(Response.CONTENT_LENGTH, Long.toString(size)); + headers.set(Response.LAST_MODIFIED, + HttpDateFormat.toString(f.lastModified())); + + String mimeType = tika.detect(f); + + headers.set(Response.CONTENT_TYPE, mimeType != null ? mimeType : ""); + + // response code + this.code = 200; // http OK + } + + /** + * get dir list as http response + * + * @param f + * directory to list + * @throws IOException + */ + private void getDirAsHttpResponse(java.io.File f) throws IOException { + + String path = f.toString(); + // link to the parent only when enabled and not already at the root 
+ boolean includeDotDot = this.file.crawlParents && !"/".equals(path); + this.content = list2html(f.listFiles(), path, includeDotDot); + + // set headers + headers.set(Response.CONTENT_LENGTH, + Integer.toString(this.content.length)); + headers.set(Response.CONTENT_TYPE, "text/html"); + headers.set(Response.LAST_MODIFIED, + HttpDateFormat.toString(f.lastModified())); + + // response code + this.code = 200; // http OK + } + + /** + * generate html page from dir list + * + * @param list + * directory entries; null is treated like an empty directory + * @param path + * directory path shown in the title and heading + * @param includeDotDot + * whether to add a link to the parent directory + * @return the page as bytes in the platform default charset + */ + private byte[] list2html(java.io.File[] list, String path, + boolean includeDotDot) { + + StringBuilder x = new StringBuilder("<html><head>"); + x.append("<title>Index of " + path + "</title></head>\n"); + x.append("<body><h1>Index of " + path + "</h1><pre>\n"); + + if (includeDotDot) { + x.append("<a href='../'>../</a>\t-\t-\t-\n"); + } + + // fix me: we might want to sort list here! but not now. + // NOTE(review): path and entry names go into the markup without HTML or + // URL escaping. + + // listFiles() returns null when the directory cannot be listed + if (list != null) { + for (java.io.File f : list) { + String name = f.getName(); + String time = HttpDateFormat.toString(f.lastModified()); + if (f.isDirectory()) { + // java 1.4.2 api says dir itself and parent dir are not listed + // so "." and ".." need no special handling here. 
+ x.append("<a href='" + name + "/" + "'>" + name + "/</a>\t"); + x.append(time + "\t-\n"); + } else if (f.isFile()) { + x.append("<a href='" + name + "'>" + name + "</a>\t"); + x.append(time + "\t" + f.length() + "\n"); + } else { + // ignore any other + } + } + } + + x.append("</pre></body></html>\n"); + + return x.toString().getBytes(); + } + +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-file/src/main/java/org/apache/nutch/protocol/file/package.html ---------------------------------------------------------------------- diff --git a/nutch-plugins/protocol-file/src/main/java/org/apache/nutch/protocol/file/package.html b/nutch-plugins/protocol-file/src/main/java/org/apache/nutch/protocol/file/package.html new file mode 100644 index 0000000..221c79c --- /dev/null +++ b/nutch-plugins/protocol-file/src/main/java/org/apache/nutch/protocol/file/package.html @@ -0,0 +1,5 @@ +<html> +<body> +<p>Protocol plugin which supports retrieving local file resources.</p><p></p> +</body> +</html> http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-file/src/test/java/org/apache/nutch/protocol/file/TestProtocolFile.java ---------------------------------------------------------------------- diff --git a/nutch-plugins/protocol-file/src/test/java/org/apache/nutch/protocol/file/TestProtocolFile.java b/nutch-plugins/protocol-file/src/test/java/org/apache/nutch/protocol/file/TestProtocolFile.java new file mode 100644 index 0000000..5f95377 --- /dev/null +++ b/nutch-plugins/protocol-file/src/test/java/org/apache/nutch/protocol/file/TestProtocolFile.java @@ -0,0 +1,99 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.protocol.file; + +// Hadoop imports +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.Text; + +// Nutch imports +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.net.protocols.Response; +import org.apache.nutch.protocol.Protocol; +import org.apache.nutch.protocol.ProtocolException; +import org.apache.nutch.protocol.ProtocolFactory; +import org.apache.nutch.protocol.ProtocolOutput; +import org.apache.nutch.protocol.ProtocolStatus; +import org.apache.nutch.util.NutchConfiguration; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; + +/** + * @author mattmann + * @version $Revision$ + * + * <p> + * Unit tests for the {@link File}Protocol. + * </p> + * . 
+ */ +public class TestProtocolFile { + + private static final String[] testTextFiles = new String[] { + "testprotocolfile.txt", "testprotocolfile_(encoded).txt", + "testprotocolfile_%28encoded%29.txt" }; + + private static final String expectedMimeType = "text/plain"; + + private static final CrawlDatum datum = new CrawlDatum(); + + private String sampleDir = System.getProperty("test.data", "."); + private String fileSeparator = System.getProperty("file.separator"); + + private Configuration conf; + + /** Creates a fresh Nutch configuration before each test. */ + @Before + public void setUp() { + conf = NutchConfiguration.create(); + } + + /** Runs the content type check once for every sample file name. */ + @Test + public void testSetContentType() throws ProtocolException { + for (int i = 0; i < testTextFiles.length; i++) { + setContentType(testTextFiles[i]); + } + } + + /** + * Fetches the named sample file through the protocol resolved for its file: + * url and checks that the fetch succeeded and that both the content type + * and the <code>Response.CONTENT_TYPE</code> metadata field are text/plain. + * + * @since NUTCH-384 + * + */ + public void setContentType(String testTextFile) throws ProtocolException { + String urlString = "file:" + sampleDir + fileSeparator + testTextFile; + Assert.assertNotNull(urlString); + + Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString); + ProtocolOutput output = protocol.getProtocolOutput(new Text(urlString), + datum); + Assert.assertNotNull(output); + + // build the failure message first, then compare the status code + String failure = "Status code: [" + output.getStatus().getCode() + + "], not equal to: [" + ProtocolStatus.SUCCESS + "]: args: [" + + output.getStatus().getArgs() + "]"; + Assert.assertEquals(failure, ProtocolStatus.SUCCESS, output.getStatus() + .getCode()); + + // content type as reported by the content object itself + Assert.assertNotNull(output.getContent()); + Assert.assertNotNull(output.getContent().getContentType()); + Assert.assertEquals(expectedMimeType, output.getContent().getContentType()); + + // content type as stored in the metadata + Assert.assertNotNull(output.getContent().getMetadata()); + Assert.assertEquals(expectedMimeType, output.getContent().getMetadata() + .get(Response.CONTENT_TYPE)); + } + +} http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-file/src/test/resources/testprotocolfile.txt 
---------------------------------------------------------------------- diff --git a/nutch-plugins/protocol-file/src/test/resources/testprotocolfile.txt b/nutch-plugins/protocol-file/src/test/resources/testprotocolfile.txt new file mode 100644 index 0000000..fbe8a8a --- /dev/null +++ b/nutch-plugins/protocol-file/src/test/resources/testprotocolfile.txt @@ -0,0 +1 @@ +Protocol File Test http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-file/src/test/resources/testprotocolfile_(encoded).txt ---------------------------------------------------------------------- diff --git a/nutch-plugins/protocol-file/src/test/resources/testprotocolfile_(encoded).txt b/nutch-plugins/protocol-file/src/test/resources/testprotocolfile_(encoded).txt new file mode 100644 index 0000000..fbe8a8a --- /dev/null +++ b/nutch-plugins/protocol-file/src/test/resources/testprotocolfile_(encoded).txt @@ -0,0 +1 @@ +Protocol File Test http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-ftp/build.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/protocol-ftp/build.xml b/nutch-plugins/protocol-ftp/build.xml new file mode 100644 index 0000000..79314d4 --- /dev/null +++ b/nutch-plugins/protocol-ftp/build.xml @@ -0,0 +1,22 @@ +<?xml version="1.0"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. 
You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<project name="protocol-ftp" default="jar-core"> + + <import file="../build-plugin.xml"/> + +</project> http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-ftp/ivy.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/protocol-ftp/ivy.xml b/nutch-plugins/protocol-ftp/ivy.xml new file mode 100644 index 0000000..214c445 --- /dev/null +++ b/nutch-plugins/protocol-ftp/ivy.xml @@ -0,0 +1,42 @@ +<?xml version="1.0" ?> + +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> + +<ivy-module version="1.0"> + <info organisation="org.apache.nutch" module="${ant.project.name}"> + <license name="Apache 2.0"/> + <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/> + <description> + Apache Nutch + </description> + </info> + + <configurations> + <include file="../../..//ivy/ivy-configurations.xml"/> + </configurations> + + <publications> + <!--get the artifact from our module name--> + <artifact conf="master"/> + </publications> + + <dependencies> + <dependency org="commons-net" name="commons-net" rev="1.2.2" conf="*->master"/> + </dependencies> + +</ivy-module> http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-ftp/plugin.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/protocol-ftp/plugin.xml b/nutch-plugins/protocol-ftp/plugin.xml new file mode 100644 index 0000000..1421e37 --- /dev/null +++ b/nutch-plugins/protocol-ftp/plugin.xml @@ -0,0 +1,46 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> +<plugin + id="protocol-ftp" + name="Ftp Protocol Plug-in" + version="1.0.0" + provider-name="nutch.org"> + + <runtime> + <library name="protocol-ftp.jar"> + <export name="*"/> + </library> + <!-- Third-party FTP client library; the file name must match the commons-net revision (1.2.2) resolved by this plugin's ivy.xml. --> <library name="commons-net-1.2.2.jar"/> + </runtime> + + <requires> + <import plugin="nutch-extensionpoints"/> + </requires> + + <extension id="org.apache.nutch.protocol.ftp" + name="FtpProtocol" + point="org.apache.nutch.protocol.Protocol"> + + <implementation id="org.apache.nutch.protocol.ftp.Ftp" + class="org.apache.nutch.protocol.ftp.Ftp"> + <parameter name="protocolName" value="ftp"/> + </implementation> + + </extension> + +</plugin> http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-ftp/pom.xml ---------------------------------------------------------------------- diff --git a/nutch-plugins/protocol-ftp/pom.xml b/nutch-plugins/protocol-ftp/pom.xml new file mode 100644 index 0000000..fe9a61b --- /dev/null +++ b/nutch-plugins/protocol-ftp/pom.xml @@ -0,0 +1,38 @@ +<!-- + ~ Licensed to the Apache Software Foundation (ASF) under one or more + ~ contributor license agreements. See the NOTICE file distributed with + ~ this work for additional information regarding copyright ownership. + ~ The ASF licenses this file to You under the Apache License, Version 2.0 + ~ (the "License"); you may not use this file except in compliance with + ~ the License. You may obtain a copy of the License at + ~ + ~ http://www.apache.org/licenses/LICENSE-2.0 + ~ + ~ Unless required by applicable law or agreed to in writing, software + ~ distributed under the License is distributed on an "AS IS" BASIS, + ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ~ See the License for the specific language governing permissions and + ~ limitations under the License. 
+ --> + +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + + <parent> + <groupId>org.apache.nutch</groupId> + <artifactId>nutch-plugins</artifactId> + <version>1.13-SNAPSHOT</version> + <relativePath>../pom.xml</relativePath> + </parent> + <artifactId>protocol-ftp</artifactId> + <packaging>jar</packaging> + + <name>protocol-ftp</name> + <url>http://nutch.apache.org</url> + + <properties> + <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> + </properties> + +</project>
