This is an automated email from the ASF dual-hosted git repository. lewismc pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git
The following commit(s) were added to refs/heads/master by this push: new 2fae4cd NUTCH-2849 Replace remaining package.html files with package-info.java (#569) 2fae4cd is described below commit 2fae4cde67a05cf1fa9ecdd6b6bd5307c0e46fe7 Author: Lewis John McGibbney <lewis.mcgibb...@gmail.com> AuthorDate: Tue Feb 16 10:40:00 2021 -0800 NUTCH-2849 Replace remaining package.html files with package-info.java (#569) --- build.xml | 7 +++- .../org/apache/nutch/crawl/package-info.java} | 8 ++-- src/java/org/apache/nutch/crawl/package.html | 5 --- .../org/apache/nutch/fetcher/package-info.java} | 8 ++-- src/java/org/apache/nutch/fetcher/package.html | 5 --- .../org/apache/nutch/indexer/package-info.java} | 16 ++++--- src/java/org/apache/nutch/indexer/package.html | 10 ----- .../org/apache/nutch/metadata/package-info.java} | 11 ++--- src/java/org/apache/nutch/metadata/package.html | 6 --- src/java/org/apache/nutch/plugin/package-info.java | 42 +++++++++++++++++++ src/java/org/apache/nutch/plugin/package.html | 40 ------------------ .../apache/nutch/util/domain/package-info.java} | 17 +++++--- src/java/org/apache/nutch/util/domain/package.html | 14 ------- .../org/creativecommons/nutch/package-info.java} | 8 ++-- .../java/org/creativecommons/nutch/package.html | 5 --- .../apache/nutch/indexer/anchor/package-info.java} | 8 ++-- .../org/apache/nutch/indexer/anchor/package.html | 5 --- .../apache/nutch/indexer/basic/package-info.java} | 10 ++--- .../org/apache/nutch/indexer/basic/package.html | 5 --- .../apache/nutch/indexer/more/package-info.java} | 11 ++--- .../org/apache/nutch/indexer/more/package.html | 6 --- .../nutch/indexer/staticfield/package-info.java} | 12 +++--- .../apache/nutch/indexer/staticfield/package.html | 5 --- .../apache/nutch/analysis/lang/package-info.java} | 13 +++--- .../org/apache/nutch/analysis/lang/package.html | 6 --- .../nutch/protocol/http/api/package-info.java} | 11 ++--- .../apache/nutch/protocol/http/api/package.html | 6 --- 
.../nutch/microformats/reltag/package-info.java} | 11 ++--- .../apache/nutch/microformats/reltag/package.html | 8 ---- .../org/apache/nutch/parse/html/package-info.java} | 11 ++--- .../java/org/apache/nutch/parse/html/package.html | 5 --- .../apache/nutch/protocol/file/package-info.java} | 8 ++-- .../org/apache/nutch/protocol/file/package.html | 5 --- .../apache/nutch/protocol/ftp/package-info.java} | 8 ++-- .../org/apache/nutch/protocol/ftp/package.html | 5 --- .../htmlunit/{package.html => package-info.java} | 8 ++-- .../apache/nutch/protocol/http/package-info.java} | 8 ++-- .../org/apache/nutch/protocol/http/package.html | 5 --- .../nutch/protocol/httpclient/package-info.java} | 15 ++++--- .../apache/nutch/protocol/httpclient/package.html | 9 ---- .../interactiveselenium/package-info.java} | 8 ++-- .../protocol/interactiveselenium/package.html | 5 --- .../nutch/protocol/selenium/package-info.java} | 8 ++-- .../apache/nutch/protocol/selenium/package.html | 5 --- .../nutch/scoring/metadata/package-info.java | 32 ++++++++++++++ .../org/apache/nutch/scoring/metadata/package.html | 33 --------------- .../org/apache/nutch/collection/package-info.java | 49 ++++++++++++++++++++++ .../java/org/apache/nutch/collection/package.html | 36 ---------------- .../apache/nutch/indexer/tld/package-info.java} | 8 ++-- .../java/org/apache/nutch/indexer/tld/package.html | 5 --- .../apache/nutch/scoring/tld/package-info.java} | 8 ++-- .../java/org/apache/nutch/scoring/tld/package.html | 5 --- .../nutch/urlfilter/automaton/package-info.java} | 12 +++--- .../apache/nutch/urlfilter/automaton/package.html | 9 ---- .../nutch/urlfilter/prefix/package-info.java} | 8 ++-- .../org/apache/nutch/urlfilter/prefix/package.html | 5 --- .../nutch/urlfilter/regex/package-info.java} | 10 ++--- .../org/apache/nutch/urlfilter/regex/package.html | 5 --- .../nutch/urlfilter/validator/package-info.java} | 14 ++++--- .../apache/nutch/urlfilter/validator/package.html | 9 ---- 
.../nutch/indexer/urlmeta/package-info.java} | 16 ++++--- .../org/apache/nutch/indexer/urlmeta/package.html | 12 ------ .../nutch/scoring/urlmeta/package-info.java} | 15 ++++--- .../org/apache/nutch/scoring/urlmeta/package.html | 11 ----- 64 files changed, 292 insertions(+), 442 deletions(-) diff --git a/build.xml b/build.xml index ec003c3..dcb7b94 100644 --- a/build.xml +++ b/build.xml @@ -186,6 +186,7 @@ doctitle="${name} ${version} API" bottom="Copyright &copy; ${year} The Apache Software Foundation" failonerror="true" + failonwarning="true" > <arg value="${javadoc.proxy.host}"/> <arg value="${javadoc.proxy.port}"/> @@ -269,9 +270,9 @@ <link href="${javadoc.link.java}"/> <link href="${javadoc.link.hadoop}"/> - <link href="${javadoc.link.lucene.core}"/> + <!--link href="${javadoc.link.lucene.core}"/> <link href="${javadoc.link.lucene.analyzers-common}"/> - <link href="${javadoc.link.solr-solrj}"/> + <link href="${javadoc.link.solr-solrj}"/--> <classpath refid="classpath"/> <classpath> @@ -718,6 +719,7 @@ doctitle="${name} ${version} API" bottom="Copyright &copy; ${year} The Apache Software Foundation" failonerror="true" + failonwarning="true" > <arg value="${javadoc.proxy.host}"/> <arg value="${javadoc.proxy.port}"/> @@ -809,6 +811,7 @@ <classpath> <fileset dir="${build.plugins}" > <include name="**/*.jar"/> + <exclude name="any23/javax.annotation-api*.jar"/> </fileset> </classpath> diff --git a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html b/src/java/org/apache/nutch/crawl/package-info.java similarity index 87% copy from src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html copy to src/java/org/apache/nutch/crawl/package-info.java index 4181951..f7a529b 100644 --- a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html +++ b/src/java/org/apache/nutch/crawl/package-info.java @@ -14,8 +14,6 @@ * See the License for the specific language governing permissions and 
* limitations under the License. */ -<html> -<body> -<p>Protocol plugin which supports retrieving documents via the http protocol.</p><p></p> -</body> -</html> + +/** Crawl control code and tools to run the crawler. */ +package org.apache.nutch.crawl; diff --git a/src/java/org/apache/nutch/crawl/package.html b/src/java/org/apache/nutch/crawl/package.html deleted file mode 100644 index 05eeb50..0000000 --- a/src/java/org/apache/nutch/crawl/package.html +++ /dev/null @@ -1,5 +0,0 @@ -<html> -<body> -Crawl control code and tools to run the crawler. -</body> -</html> diff --git a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html b/src/java/org/apache/nutch/fetcher/package-info.java similarity index 87% copy from src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html copy to src/java/org/apache/nutch/fetcher/package-info.java index 4181951..c06243c 100644 --- a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html +++ b/src/java/org/apache/nutch/fetcher/package-info.java @@ -14,8 +14,6 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -<html> -<body> -<p>Protocol plugin which supports retrieving documents via the http protocol.</p><p></p> -</body> -</html> + +/** The Nutch multi-threaded fetching module */ +package org.apache.nutch.fetcher; diff --git a/src/java/org/apache/nutch/fetcher/package.html b/src/java/org/apache/nutch/fetcher/package.html deleted file mode 100644 index 9c843e0..0000000 --- a/src/java/org/apache/nutch/fetcher/package.html +++ /dev/null @@ -1,5 +0,0 @@ -<html> -<body> -The Nutch robot. 
-</body> -</html> diff --git a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html b/src/java/org/apache/nutch/indexer/package-info.java similarity index 67% copy from src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html copy to src/java/org/apache/nutch/indexer/package-info.java index 4181951..2307dd9 100644 --- a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html +++ b/src/java/org/apache/nutch/indexer/package-info.java @@ -14,8 +14,14 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -<html> -<body> -<p>Protocol plugin which supports retrieving documents via the http protocol.</p><p></p> -</body> -</html> + +/** + * Index content, configure and run indexing and cleaning jobs to + * add, update, and delete documents from an index. Two tasks are + * delegated to plugins: + * <ul> + * <li>indexing filters, which fill index fields of each document</li> + * <li>index writer plugins, which send documents to index back-ends (Solr, etc.).</li> + * </ul> + */ +package org.apache.nutch.indexer; diff --git a/src/java/org/apache/nutch/indexer/package.html b/src/java/org/apache/nutch/indexer/package.html deleted file mode 100644 index 825eaae..0000000 --- a/src/java/org/apache/nutch/indexer/package.html +++ /dev/null @@ -1,10 +0,0 @@ -<html> -<body> -Index content, configure and run indexing and cleaning jobs to -add, update, and delete documents from an index. Two tasks are -delegated to plugins: -<ul> -<li>indexing filters fill index fields of each documents</li> -<li>index writer plugins send documents to index back-ends (Solr, etc.).
-</body> -</html> diff --git a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html b/src/java/org/apache/nutch/metadata/package-info.java similarity index 85% copy from src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html copy to src/java/org/apache/nutch/metadata/package-info.java index 4181951..b64dca3 100644 --- a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html +++ b/src/java/org/apache/nutch/metadata/package-info.java @@ -14,8 +14,9 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -<html> -<body> -<p>Protocol plugin which supports retrieving documents via the http protocol.</p><p></p> -</body> -</html> + +/** + * A Multi-valued Metadata container, and set + * of constant fields for Nutch Metadata. + */ +package org.apache.nutch.metadata; diff --git a/src/java/org/apache/nutch/metadata/package.html b/src/java/org/apache/nutch/metadata/package.html deleted file mode 100644 index 53281bb..0000000 --- a/src/java/org/apache/nutch/metadata/package.html +++ /dev/null @@ -1,6 +0,0 @@ -<html> -<body> -A Multi-valued Metadata container, and set -of constant fields for Nutch Metadata. -</body> -</html> diff --git a/src/java/org/apache/nutch/plugin/package-info.java b/src/java/org/apache/nutch/plugin/package-info.java new file mode 100644 index 0000000..f7d95f9 --- /dev/null +++ b/src/java/org/apache/nutch/plugin/package-info.java @@ -0,0 +1,42 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * The Nutch {@link org.apache.nutch.plugin.Pluggable Plugin} System. + * <p><b>The Nutch Plugin System provides a way to extend nutch functionality</b>. + * A large part of the functionality of Nutch is provided by plugins: + * All of the parsing, indexing and searching that nutch does is actually + * accomplished by various plugins.</p> + * <p>In writing a plugin, you're actually providing one or more extensions + * of the existing extension-points (<i>hooks</i>). The core Nutch extension-points + * are themselves defined in a plugin, the <code>nutch-extensionpoints</code> plugin. + * Each extension-point defines an interface that must be implemented by the + * extension.
The core extension-points and extensions available in Nutch are + * listed in the {@link org.apache.nutch.plugin.Pluggable} interface.</p> + * @see <a href="./doc-files/plugin.dtd">Nutch plugin manifest DTD</a> + * @see <a href="https://cwiki.apache.org/confluence/display/NUTCH/PluginCentral">Plugin Central</a> + * @see <a href="https://cwiki.apache.org/confluence/display/NUTCH/AboutPlugins">About Plugins</a> + * @see <a href="https://cwiki.apache.org/confluence/display/NUTCH/WhyNutchHasAPluginSystem"> + * Why Nutch has a Plugin System?</a> + * @see <a href="https://cwiki.apache.org/confluence/display/NUTCH/WhichTechnicalConceptsAreBehindTheNutchPluginSystem"> + * Which technical concepts are behind the nutch plugin system?</a> + * @see <a href="https://cwiki.apache.org/confluence/display/NUTCH/WhatsTheProblemWithPluginsAndClass-loading"> + * What's the problem with Plugins and Class loading?</a> + * @see <a href="https://cwiki.apache.org/confluence/display/NUTCH/WritingPluginExample"> + * Writing Plugin Example</a> + */ +package org.apache.nutch.plugin; diff --git a/src/java/org/apache/nutch/plugin/package.html b/src/java/org/apache/nutch/plugin/package.html deleted file mode 100644 index 442ed09..0000000 --- a/src/java/org/apache/nutch/plugin/package.html +++ /dev/null @@ -1,40 +0,0 @@ -<html> -<body> -The Nutch {@link org.apache.nutch.plugin.Pluggable Plugin} System. -<p> -<b>The Nutch Plugin System provides a way to extend nutch functionality</b>. -A large part of the functionality of Nutch are provided by plugins: -All of the parsing, indexing and searching that nutch does is actually -accomplished by various plugins. -</p><p> -In writing a plugin, you're actually providing one or more extensions of the -existing extension-points (<i>hooks</i>). -The core Nutch extension-points are themselves defined in a plugin, -the <code>nutch-extensionpoints</code> plugin. -Each extension-point defines an interface that must be implemented by the -extension. 
The core extension-points and extensions available in Nutch are -listed in the {@link org.apache.nutch.plugin.Pluggable} interface. -</p> - -@see <a href="./doc-files/plugin.dtd">Nutch plugin manifest DTD</a> - -@see <a href="https://cwiki.apache.org/confluence/display/NUTCH/PluginCentral"> - Plugin Central - </a> -@see <a href="https://cwiki.apache.org/confluence/display/NUTCH/AboutPlugins"> - About Plugins - </a> -@see <a href="https://cwiki.apache.org/confluence/display/NUTCH/WhyNutchHasAPluginSystem"> - Why Nutch has a Plugin System? - </a> -@see <a href="https://cwiki.apache.org/confluence/display/NUTCH/WhichTechnicalConceptsAreBehindTheNutchPluginSystem"> - Which technical concepts are behind the nutch plugin system? - </a> -@see <a href="https://cwiki.apache.org/confluence/display/NUTCH/WhatsTheProblemWithPluginsAndClass-loading"> - What's the problem with Plugins and Class loading? - </a> -@see <a href="https://cwiki.apache.org/confluence/display/NUTCH/WritingPluginExample"> - Writing Plugin Example - </a> -</body> -</html> diff --git a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html b/src/java/org/apache/nutch/util/domain/package-info.java similarity index 60% copy from src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html copy to src/java/org/apache/nutch/util/domain/package-info.java index 4181951..6a799a9 100644 --- a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html +++ b/src/java/org/apache/nutch/util/domain/package-info.java @@ -14,8 +14,15 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -<html> -<body> -<p>Protocol plugin which supports retrieving documents via the http protocol.</p><p></p> -</body> -</html> + +/** + * Classes for domain name analysis. 
for information refer to + * following urls : + * <ul> + * <li><a href="http://en.wikipedia.org/wiki/DNS">http://en.wikipedia.org/wiki/DNS</a></li> + * <li><a href="http://en.wikipedia.org/wiki/Top-level_domain">http://en.wikipedia.org/wiki/Top-level_domain</a></li> + * <li><a href="http://wiki.mozilla.org/TLD_List">http://wiki.mozilla.org/TLD_List</a></li> + * <li><a href="http://publicsuffix.org/">http://publicsuffix.org/</a></li> + * </ul> + */ +package org.apache.nutch.util.domain; diff --git a/src/java/org/apache/nutch/util/domain/package.html b/src/java/org/apache/nutch/util/domain/package.html deleted file mode 100644 index 49e0e6a..0000000 --- a/src/java/org/apache/nutch/util/domain/package.html +++ /dev/null @@ -1,14 +0,0 @@ -<html> -<body> -<h2>Classes for domain name analysis.</h2> - -for information please refer to following urls : -<ul> -<li><a href="http://en.wikipedia.org/wiki/DNS">http://en.wikipedia.org/wiki/DNS</a></li> -<li><a href="http://en.wikipedia.org/wiki/Top-level_domain">http://en.wikipedia.org/wiki/Top-level_domain</a></li> -<li><a href="http://wiki.mozilla.org/TLD_List">http://wiki.mozilla.org/TLD_List</a></li> -<li><a href="http://publicsuffix.org/">http://publicsuffix.org/</a></li> -</ul> - -</body> -</html> diff --git a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html b/src/plugin/creativecommons/src/java/org/creativecommons/nutch/package-info.java similarity index 87% copy from src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html copy to src/plugin/creativecommons/src/java/org/creativecommons/nutch/package-info.java index 4181951..138637c 100644 --- a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html +++ b/src/plugin/creativecommons/src/java/org/creativecommons/nutch/package-info.java @@ -14,8 +14,6 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -<html> -<body> -<p>Protocol plugin which supports retrieving documents via the http protocol.</p><p></p> -</body> -</html> + +/** Sample plugins that parse and index Creative Commons metadata. */ +package org.creativecommons.nutch; diff --git a/src/plugin/creativecommons/src/java/org/creativecommons/nutch/package.html b/src/plugin/creativecommons/src/java/org/creativecommons/nutch/package.html deleted file mode 100644 index 0c91293..0000000 --- a/src/plugin/creativecommons/src/java/org/creativecommons/nutch/package.html +++ /dev/null @@ -1,5 +0,0 @@ -<html> -<body> -<p>Sample plugins that parse and index Creative Commons medadata.</p> -</body> -</html> diff --git a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html b/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/package-info.java similarity index 87% copy from src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html copy to src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/package-info.java index 4181951..2f3ee87 100644 --- a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html +++ b/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/package-info.java @@ -14,8 +14,6 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -<html> -<body> -<p>Protocol plugin which supports retrieving documents via the http protocol.</p><p></p> -</body> -</html> + +/** An indexing plugin for inbound anchor text. 
*/ +package org.apache.nutch.indexer.anchor; diff --git a/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/package.html b/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/package.html deleted file mode 100644 index c255029..0000000 --- a/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/package.html +++ /dev/null @@ -1,5 +0,0 @@ -<html> -<body> -<p>An indexing plugin for inbound anchor text.</p><p></p> -</body> -</html> diff --git a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html b/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/package-info.java similarity index 86% copy from src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html copy to src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/package-info.java index 4181951..74bba1e 100644 --- a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html +++ b/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/package-info.java @@ -14,8 +14,8 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -<html> -<body> -<p>Protocol plugin which supports retrieving documents via the http protocol.</p><p></p> -</body> -</html> + +/** + * A basic indexing plugin, adds basic fields: url, host, title, content, etc. 
+ */ +package org.apache.nutch.indexer.basic; diff --git a/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/package.html b/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/package.html deleted file mode 100644 index 3fae405..0000000 --- a/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/package.html +++ /dev/null @@ -1,5 +0,0 @@ -<html> -<body> -<p>A basic indexing plugin, adds basic fields: url, host, title, content, etc.</p><p></p> -</body> -</html> diff --git a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html b/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/package-info.java similarity index 84% copy from src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html copy to src/plugin/index-more/src/java/org/apache/nutch/indexer/more/package-info.java index 4181951..bd20502 100644 --- a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html +++ b/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/package-info.java @@ -14,8 +14,9 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -<html> -<body> -<p>Protocol plugin which supports retrieving documents via the http protocol.</p><p></p> -</body> -</html> + +/** + * A more indexing plugin, adds "more" index fields: last modified + * date, MIME type, content length.
+ */ +package org.apache.nutch.indexer.more; diff --git a/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/package.html b/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/package.html deleted file mode 100644 index 7b8fade..0000000 --- a/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/package.html +++ /dev/null @@ -1,6 +0,0 @@ -<html> -<body> -<p>A more indexing plugin, adds "more" index fields: -last modified date, MIME type, content length.</p><p></p> -</body> -</html> diff --git a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html b/src/plugin/index-static/src/java/org/apache/nutch/indexer/staticfield/package-info.java similarity index 72% copy from src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html copy to src/plugin/index-static/src/java/org/apache/nutch/indexer/staticfield/package-info.java index 4181951..8c6eb3d 100644 --- a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html +++ b/src/plugin/index-static/src/java/org/apache/nutch/indexer/staticfield/package-info.java @@ -14,8 +14,10 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -<html> -<body> -<p>Protocol plugin which supports retrieving documents via the http protocol.</p><p></p> -</body> -</html> + +/** A simple plugin called at indexing that adds fields with static data. + * You can specify a list of fieldname:fieldcontent per nutch job. + * It can be useful when collections can't be created by urlpatterns, + * like in subcollection, but on a job-basis. 
+ */ +package org.apache.nutch.indexer.staticfield; diff --git a/src/plugin/index-static/src/java/org/apache/nutch/indexer/staticfield/package.html b/src/plugin/index-static/src/java/org/apache/nutch/indexer/staticfield/package.html deleted file mode 100644 index f4b5146..0000000 --- a/src/plugin/index-static/src/java/org/apache/nutch/indexer/staticfield/package.html +++ /dev/null @@ -1,5 +0,0 @@ -<html> -<body> -<p>A simple plugin called at indexing that adds fields with static data. You can specify a list of fieldname:fieldcontent per nutch job. It can be useful when collections can't be created by urlpatterns, like in subcollection, but on a job-basis.</p><p></p> -</body> -</html> diff --git a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html b/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/package-info.java similarity index 72% copy from src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html copy to src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/package-info.java index 4181951..b14730b 100644 --- a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html +++ b/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/package-info.java @@ -14,8 +14,11 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -<html> -<body> -<p>Protocol plugin which supports retrieving documents via the http protocol.</p><p></p> -</body> -</html> + +/** + * <p>Text document language identifier.</p> + * <p>Language profiles are based on material from + * <a href="http://www.homepages.inf.ed.ac.uk/pkoehn/publications/europarl.ps"> + * http://www.homepages.inf.ed.ac.uk/pkoehn/publications/europarl.ps</a>.</p> + */ +package org.apache.nutch.analysis.lang; diff --git a/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/package.html b/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/package.html deleted file mode 100644 index 06343c8..0000000 --- a/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/package.html +++ /dev/null @@ -1,6 +0,0 @@ -<html> -<body> -<p>Text document language identifier.</p><p>Language profiles are based on material from -<a href="http://www.homepages.inf.ed.ac.uk/pkoehn/publications/europarl.ps/">http://www.homepages.inf.ed.ac.uk/pkoehn/publications/europarl.ps/</a>.</p> -</body> -</html> diff --git a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/package-info.java similarity index 80% copy from src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html copy to src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/package-info.java index 4181951..a99b4ba 100644 --- a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html +++ b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/package-info.java @@ -14,8 +14,9 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -<html> -<body> -<p>Protocol plugin which supports retrieving documents via the http protocol.</p><p></p> -</body> -</html> + +/** + * Common API used by HTTP plugins ({@link org.apache.nutch.protocol.http http}, + * {@link org.apache.nutch.protocol.httpclient httpclient}) + */ +package org.apache.nutch.protocol.http.api; diff --git a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/package.html b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/package.html deleted file mode 100644 index 972bb3c..0000000 --- a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/package.html +++ /dev/null @@ -1,6 +0,0 @@ -<html> -<body> -<p>Common API used by HTTP plugins ({@link org.apache.nutch.protocol.http http}, -{@link org.apache.nutch.protocol.httpclient httpclient})</p> -</body> -</html> diff --git a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html b/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/package-info.java similarity index 82% copy from src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html copy to src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/package-info.java index 4181951..4a828bd 100644 --- a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html +++ b/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/package-info.java @@ -14,8 +14,9 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -<html> -<body> -<p>Protocol plugin which supports retrieving documents via the http protocol.</p><p></p> -</body> -</html> + +/** + * A microformats <a href="http://www.microformats.org/wiki/Rel-Tag">Rel-Tag</a> + * Parser/Indexer/Querier plugin. 
+ */ +package org.apache.nutch.microformats.reltag; diff --git a/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/package.html b/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/package.html deleted file mode 100644 index bef5409..0000000 --- a/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/package.html +++ /dev/null @@ -1,8 +0,0 @@ -<html> -<body> -<p> -A microformats <a href="http://www.microformats.org/wiki/Rel-Tag">Rel-Tag</a> -Parser/Indexer/Querier plugin. -</p> -</body> -</html> diff --git a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/package-info.java similarity index 81% copy from src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html copy to src/plugin/parse-html/src/java/org/apache/nutch/parse/html/package-info.java index 4181951..c1b3c46 100644 --- a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html +++ b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/package-info.java @@ -14,8 +14,9 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -<html> -<body> -<p>Protocol plugin which supports retrieving documents via the http protocol.</p><p></p> -</body> -</html> + +/** + * <p>An HTML document parsing plugin.</p> + * <p>This package relies on <a href="https://github.com/codelibs/nekohtml">NekoHTML</a>.</p> + */ +package org.apache.nutch.parse.html; diff --git a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/package.html b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/package.html deleted file mode 100644 index c650389..0000000 --- a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/package.html +++ /dev/null @@ -1,5 +0,0 @@ -<html> -<body> -<p>An HTML document parsing plugin.</p><p>This package relies on <a href="http://www.apache.org/~andyc/neko/doc/html/index.html">NekoHTML</a>.</p> -</body> -</html> diff --git a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html b/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/package-info.java similarity index 87% copy from src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html copy to src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/package-info.java index 4181951..0cc9b74 100644 --- a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html +++ b/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/package-info.java @@ -14,8 +14,6 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -<html> -<body> -<p>Protocol plugin which supports retrieving documents via the http protocol.</p><p></p> -</body> -</html> + +/** Protocol plugin which supports retrieving local file resources. 
*/ +package org.apache.nutch.protocol.file; diff --git a/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/package.html b/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/package.html deleted file mode 100644 index 221c79c..0000000 --- a/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/package.html +++ /dev/null @@ -1,5 +0,0 @@ -<html> -<body> -<p>Protocol plugin which supports retrieving local file resources.</p><p></p> -</body> -</html> diff --git a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/package-info.java similarity index 86% copy from src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html copy to src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/package-info.java index 4181951..d64c0fb 100644 --- a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html +++ b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/package-info.java @@ -14,8 +14,6 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -<html> -<body> -<p>Protocol plugin which supports retrieving documents via the http protocol.</p><p></p> -</body> -</html> + +/** Protocol plugin which supports retrieving documents via the ftp protocol. 
*/ +package org.apache.nutch.protocol.ftp; diff --git a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/package.html b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/package.html deleted file mode 100644 index d936930..0000000 --- a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/package.html +++ /dev/null @@ -1,5 +0,0 @@ -<html> -<body> -<p>Protocol plugin which supports retrieving documents via the ftp protocol.</p><p></p> -</body> -</html> diff --git a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html b/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package-info.java similarity index 86% copy from src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html copy to src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package-info.java index 4181951..bf4902c 100644 --- a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html +++ b/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package-info.java @@ -14,8 +14,6 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -<html> -<body> -<p>Protocol plugin which supports retrieving documents via the http protocol.</p><p></p> -</body> -</html> + +/** Protocol plugin which supports retrieving documents via the http protocol.*/ +package org.apache.nutch.protocol.htmlunit; diff --git a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html b/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/package-info.java similarity index 86% copy from src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html copy to src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/package-info.java index 4181951..cc82483 100644 --- a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html +++ b/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/package-info.java @@ -14,8 +14,6 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -<html> -<body> -<p>Protocol plugin which supports retrieving documents via the http protocol.</p><p></p> -</body> -</html> + +/** Protocol plugin which supports retrieving documents via the http protocol. 
*/ +package org.apache.nutch.protocol.http; diff --git a/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/package.html b/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/package.html deleted file mode 100644 index 34d1d1c..0000000 --- a/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/package.html +++ /dev/null @@ -1,5 +0,0 @@ -<html> -<body> -<p>Protocol plugin which supports retrieving documents via the http protocol.</p><p></p> -</body> -</html> diff --git a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html b/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/package-info.java similarity index 67% copy from src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html copy to src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/package-info.java index 4181951..2512044 100644 --- a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html +++ b/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/package-info.java @@ -14,8 +14,13 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -<html> -<body> -<p>Protocol plugin which supports retrieving documents via the http protocol.</p><p></p> -</body> -</html> + +/** + * Protocol plugin which supports retrieving documents via the + * HTTP and HTTPS protocols, optionally with Basic, Digest and + * NTLM authentication schemes for web server as well as + * proxy server. It handles cookies within a single fetch + * operation. This plugin is based on Jakarta Commons + * HttpClient library. 
+ */ +package org.apache.nutch.protocol.httpclient; diff --git a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/package.html b/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/package.html deleted file mode 100644 index 9cbcb14..0000000 --- a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/package.html +++ /dev/null @@ -1,9 +0,0 @@ -<html> -<body> -<p>Protocol plugin which supports retrieving documents via the HTTP and -HTTPS protocols, optionally with Basic, Digest and NTLM authentication -schemes for web server as well as proxy server. It handles cookies -within a single fetch operation. This plugin is based on Jakarta -Commons HttpClient library.</p> -</body> -</html> diff --git a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html b/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/package-info.java similarity index 86% copy from src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html copy to src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/package-info.java index 4181951..f6738ed 100644 --- a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html +++ b/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/package-info.java @@ -14,8 +14,6 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -<html> -<body> -<p>Protocol plugin which supports retrieving documents via the http protocol.</p><p></p> -</body> -</html> + +/** Protocol plugin which supports retrieving documents via selenium. 
*/ +package org.apache.nutch.protocol.interactiveselenium; diff --git a/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/package.html b/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/package.html deleted file mode 100644 index 75cd5b5..0000000 --- a/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/package.html +++ /dev/null @@ -1,5 +0,0 @@ -<html> -<body> -<p>Protocol plugin which supports retrieving documents via selenium.</p><p></p> -</body> -</html> diff --git a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html b/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/package-info.java similarity index 87% copy from src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html copy to src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/package-info.java index 4181951..2441aec 100644 --- a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html +++ b/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/package-info.java @@ -14,8 +14,6 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -<html> -<body> -<p>Protocol plugin which supports retrieving documents via the http protocol.</p><p></p> -</body> -</html> + +/** Protocol plugin which supports retrieving documents via selenium. 
*/ +package org.apache.nutch.protocol.selenium; diff --git a/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/package.html b/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/package.html deleted file mode 100644 index 75cd5b5..0000000 --- a/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/package.html +++ /dev/null @@ -1,5 +0,0 @@ -<html> -<body> -<p>Protocol plugin which supports retrieving documents via selenium.</p><p></p> -</body> -</html> diff --git a/src/plugin/scoring-metadata/src/java/org/apache/nutch/scoring/metadata/package-info.java b/src/plugin/scoring-metadata/src/java/org/apache/nutch/scoring/metadata/package-info.java new file mode 100644 index 0000000..b3ddbd1 --- /dev/null +++ b/src/plugin/scoring-metadata/src/java/org/apache/nutch/scoring/metadata/package-info.java @@ -0,0 +1,32 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * <p>Metadata Scoring Plugin</p> + * <p>Propagates Metadata from an injected or outlink url in the crawldb + * to the url's different processed objects. 
In moving any metadata + * item, you need to copy metadata in three steps:</p> + * <ul> + * <li>Crawldb to content: Copy a metadata entry stored in the crawldb record of the url to the url's fetched content object. You need to specify the entry in the <b>scoring.db.md</b> property</li> + * <li>Content to parsedData: Copy a metadata entry stored in the Content object of a crawled url to its parsedData. You need to specify the entry in the <b>scoring.content.md</b> property</li> + * <li>ParsedData to outlink objects: Copy a metadata entry stored in the parsedData of a crawl item to the crawldb records of the url's outlinks. You need to specify the entry in the <b>scoring.parse.md</b> property</li> + * </ul> + * <p>Note that you can not move data directly from a crawldb record to + * parseData or outlink objects. The sequence of moving the metadata + * should be crawldb -> content -> parsedData -> outlink objects.</p> + */ +package org.apache.nutch.scoring.metadata; diff --git a/src/plugin/scoring-metadata/src/java/org/apache/nutch/scoring/metadata/package.html b/src/plugin/scoring-metadata/src/java/org/apache/nutch/scoring/metadata/package.html deleted file mode 100644 index 0356152..0000000 --- a/src/plugin/scoring-metadata/src/java/org/apache/nutch/scoring/metadata/package.html +++ /dev/null @@ -1,33 +0,0 @@ -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. 
You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---> -<html> - <body> - <p> - Metadata Scoring Plugin - </p> - <p> - Propagates Metadata from an injected or outlink url in the crawldb to the url's different procecssed objects. In moving any metadata item, you need to copy metadata in three steps: - <ul> - <li>Crawldb to content: Copy a metadata entry stored in the crawldb record of the url to the url's fetched content object. You need to specify the entry in the <b>scoring.db.md</b> property</li> - <li>Content to parsedData: Copy a metadata entry stored in the Content object of a crawled url to its parsedData. You need to specify the entry in the <b>scoring.content.md</b> property</li> - <li>ParsedData to outlink objects: Copy a metadata entry stored in the parsedData of a crawl item to the crawldb records of the url's outlinks. You need to specify the entry in the <b>scoring.parse.md</b> property</li> - </ul> - - Note that you can not move data directly from a crawldb record to parseData or outlink objects. The sequence of moving the metadata should be crawldb -> content -> parsedData -> outlink objects. - </p> - </body> -</html> diff --git a/src/plugin/subcollection/src/java/org/apache/nutch/collection/package-info.java b/src/plugin/subcollection/src/java/org/apache/nutch/collection/package-info.java new file mode 100644 index 0000000..055cb45 --- /dev/null +++ b/src/plugin/subcollection/src/java/org/apache/nutch/collection/package-info.java @@ -0,0 +1,49 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * <p>Subcollection is a subset of an index. Subcollections are + * defined by urlpatterns in form of white/blacklist. So to get the + * page into subcollection it must match the whitelist and not + * the blacklist.</p> + * <p> Subcollection definitions are read from a file + * <code>subcollections.xml</code> and the format is as follows + * (imagine here that you are crawling all the virtualhosts from + * apache.org and you want to tag pages with url pattern + * "https://nutch.apache.org" and + * "https://cwiki.apache.org/confluence/display/nutch" to be part of + * subcollection "nutch", this allows you to later search specifically + * from this subcollection)</p> + * <pre> + * {@code + * <?xml version="1.0" encoding="UTF-8"?> + * <subcollections> + * <subcollection> + * <name>nutch</name> + * <id>nutch</id> + * <whitelist>https://nutch.apache.org</whitelist> + * <whitelist>https://cwiki.apache.org/confluence/display/nutch</whitelist> + * <blacklist /> + * </subcollection> + * </subcollections> + * } + * </pre> + * <p>Despite this configuration you still can crawl any urls + * as long as they pass through your global url filters. 
(note that + * you must also seed your urls in normal nutch way)</p> + */ +package org.apache.nutch.collection; diff --git a/src/plugin/subcollection/src/java/org/apache/nutch/collection/package.html b/src/plugin/subcollection/src/java/org/apache/nutch/collection/package.html deleted file mode 100644 index be08d1c..0000000 --- a/src/plugin/subcollection/src/java/org/apache/nutch/collection/package.html +++ /dev/null @@ -1,36 +0,0 @@ -<html> -<body> -<p> -Subcollection is a subset of an index. Subcollections are defined -by urlpatterns in form of white/blacklist. So to get the page into -subcollection it must match the whitelist and not the blacklist. -</p> -<p> -Subcollection definitions are read from a file subcollections.xml -and the format is as follows (imagine here that you are crawling all -the virtualhosts from apache.org and you wan't to tag pages with -url pattern "http://lucene.apache.org/nutch" and http://wiki.apache.org/nutch/ -to be part of subcollection "nutch", this allows you to later search -specifically from this subcollection) -</p> -<p/> -<p/> -<pre> -<?xml version="1.0" encoding="UTF-8"?> -<subcollections> - <subcollection> - <name>nutch</name> - <id>lucene</id> - <whitelist>http://lucene.apache.org/nutch</whitelist> - <whitelist>http://wiki.apache.org/nutch/</whitelist> - <blacklist /> - </subcollection> -</subcollections> -</pre> -</p> -<p>Despite of this configuration you still can crawl any urls -as long as they pass through your global url filters. 
(note that -you must also seed your urls in normal nutch way) -</p> -</body> -</html> diff --git a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html b/src/plugin/tld/src/java/org/apache/nutch/indexer/tld/package-info.java similarity index 87% copy from src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html copy to src/plugin/tld/src/java/org/apache/nutch/indexer/tld/package-info.java index 4181951..6696fdc 100644 --- a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html +++ b/src/plugin/tld/src/java/org/apache/nutch/indexer/tld/package-info.java @@ -14,8 +14,6 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -<html> -<body> -<p>Protocol plugin which supports retrieving documents via the http protocol.</p><p></p> -</body> -</html> + +/** Top Level Domain Indexing plugin. */ +package org.apache.nutch.indexer.tld; diff --git a/src/plugin/tld/src/java/org/apache/nutch/indexer/tld/package.html b/src/plugin/tld/src/java/org/apache/nutch/indexer/tld/package.html deleted file mode 100644 index 75841d9..0000000 --- a/src/plugin/tld/src/java/org/apache/nutch/indexer/tld/package.html +++ /dev/null @@ -1,5 +0,0 @@ -<html> -<body> -<p>Top Level Domain Indexing plugin.</p><p></p> -</body> -</html> diff --git a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html b/src/plugin/tld/src/java/org/apache/nutch/scoring/tld/package-info.java similarity index 87% copy from src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html copy to src/plugin/tld/src/java/org/apache/nutch/scoring/tld/package-info.java index 4181951..6ab8373 100644 --- a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html +++ b/src/plugin/tld/src/java/org/apache/nutch/scoring/tld/package-info.java @@ -14,8 +14,6 @@ * See the License for the specific language 
governing permissions and * limitations under the License. */ -<html> -<body> -<p>Protocol plugin which supports retrieving documents via the http protocol.</p><p></p> -</body> -</html> + +/** Top Level Domain Scoring plugin. */ +package org.apache.nutch.scoring.tld; diff --git a/src/plugin/tld/src/java/org/apache/nutch/scoring/tld/package.html b/src/plugin/tld/src/java/org/apache/nutch/scoring/tld/package.html deleted file mode 100644 index d05e4b8..0000000 --- a/src/plugin/tld/src/java/org/apache/nutch/scoring/tld/package.html +++ /dev/null @@ -1,5 +0,0 @@ -<html> -<body> -<p>Top Level Domain Scoring plugin.</p><p></p> -</body> -</html> diff --git a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html b/src/plugin/urlfilter-automaton/src/java/org/apache/nutch/urlfilter/automaton/package-info.java similarity index 79% copy from src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html copy to src/plugin/urlfilter-automaton/src/java/org/apache/nutch/urlfilter/automaton/package-info.java index 4181951..41743a3 100644 --- a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html +++ b/src/plugin/urlfilter-automaton/src/java/org/apache/nutch/urlfilter/automaton/package-info.java @@ -14,8 +14,10 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -<html> -<body> -<p>Protocol plugin which supports retrieving documents via the http protocol.</p><p></p> -</body> -</html> + +/** + * URL filter plugin based on + * <a href="https://www.brics.dk/automaton/">dk.brics.automaton</a> Finite-State + * Automata for Java<sup>TM</sup>. 
+ */ +package org.apache.nutch.urlfilter.automaton; diff --git a/src/plugin/urlfilter-automaton/src/java/org/apache/nutch/urlfilter/automaton/package.html b/src/plugin/urlfilter-automaton/src/java/org/apache/nutch/urlfilter/automaton/package.html deleted file mode 100644 index 282013f..0000000 --- a/src/plugin/urlfilter-automaton/src/java/org/apache/nutch/urlfilter/automaton/package.html +++ /dev/null @@ -1,9 +0,0 @@ -<html> -<body> -<p> -URL filter plugin based on -<a href="https://www.brics.dk/automaton/">dk.brics.automaton</a> Finite-State -Automata for Java<sup>TM</sup>. -</p> -</body> -</html> diff --git a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html b/src/plugin/urlfilter-prefix/src/java/org/apache/nutch/urlfilter/prefix/package-info.java similarity index 85% copy from src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html copy to src/plugin/urlfilter-prefix/src/java/org/apache/nutch/urlfilter/prefix/package-info.java index 4181951..1718ee8 100644 --- a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html +++ b/src/plugin/urlfilter-prefix/src/java/org/apache/nutch/urlfilter/prefix/package-info.java @@ -14,8 +14,6 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -<html> -<body> -<p>Protocol plugin which supports retrieving documents via the http protocol.</p><p></p> -</body> -</html> + +/** URL filter plugin to include only URLs which match one of a given list of URL prefixes. 
*/ +package org.apache.nutch.urlfilter.prefix; diff --git a/src/plugin/urlfilter-prefix/src/java/org/apache/nutch/urlfilter/prefix/package.html b/src/plugin/urlfilter-prefix/src/java/org/apache/nutch/urlfilter/prefix/package.html deleted file mode 100644 index dbed0be..0000000 --- a/src/plugin/urlfilter-prefix/src/java/org/apache/nutch/urlfilter/prefix/package.html +++ /dev/null @@ -1,5 +0,0 @@ -<html> -<body> -<p>URL filter plugin to include only URLs which match one of a given list of URL prefixes.</p> -</body> -</html> diff --git a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html b/src/plugin/urlfilter-regex/src/java/org/apache/nutch/urlfilter/regex/package-info.java similarity index 85% copy from src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html copy to src/plugin/urlfilter-regex/src/java/org/apache/nutch/urlfilter/regex/package-info.java index 4181951..8cb3afa 100644 --- a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html +++ b/src/plugin/urlfilter-regex/src/java/org/apache/nutch/urlfilter/regex/package-info.java @@ -14,8 +14,8 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -<html> -<body> -<p>Protocol plugin which supports retrieving documents via the http protocol.</p><p></p> -</body> -</html> + +/** + * URL filter plugin to include and/or exclude URLs matching Java regular expressions. 
+ */ +package org.apache.nutch.urlfilter.regex; diff --git a/src/plugin/urlfilter-regex/src/java/org/apache/nutch/urlfilter/regex/package.html b/src/plugin/urlfilter-regex/src/java/org/apache/nutch/urlfilter/regex/package.html deleted file mode 100644 index 7acf73b..0000000 --- a/src/plugin/urlfilter-regex/src/java/org/apache/nutch/urlfilter/regex/package.html +++ /dev/null @@ -1,5 +0,0 @@ -<html> -<body> -<p>URL filter plugin to include and/or exclude URLs matching Java regular expressions.</p> -</body> -</html> diff --git a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html b/src/plugin/urlfilter-validator/src/java/org/apache/nutch/urlfilter/validator/package-info.java similarity index 70% copy from src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html copy to src/plugin/urlfilter-validator/src/java/org/apache/nutch/urlfilter/validator/package-info.java index 4181951..11d2cde 100644 --- a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html +++ b/src/plugin/urlfilter-validator/src/java/org/apache/nutch/urlfilter/validator/package-info.java @@ -14,8 +14,12 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -<html> -<body> -<p>Protocol plugin which supports retrieving documents via the http protocol.</p><p></p> -</body> -</html> + +/** + * <p>URL filter plugin that validates given urls.</p> + * <p>This plugin runs a series of tests for the given url to make sure that given + * url is valid and 'fetchable'.</p> + * <p>Note: This plugin should <b>only</b> be used for web-related protocols such + * as http, https and ftp.</p> + */ +package org.apache.nutch.urlfilter.validator; diff --git a/src/plugin/urlfilter-validator/src/java/org/apache/nutch/urlfilter/validator/package.html b/src/plugin/urlfilter-validator/src/java/org/apache/nutch/urlfilter/validator/package.html deleted file mode 100644 index b5ec8a1..0000000 --- a/src/plugin/urlfilter-validator/src/java/org/apache/nutch/urlfilter/validator/package.html +++ /dev/null @@ -1,9 +0,0 @@ -<html> -<body> -<p>URL filter plugin that validates given urls.</p> -<p>This plugin runs a series of tests for the given url to make sure that given -url is valid and 'fetchable'.</p> -<p>Note: This plugin should <b>only</b> be used for web-related protocols such -as http, https and ftp.</p> -</body> -</html> diff --git a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html b/src/plugin/urlmeta/src/java/org/apache/nutch/indexer/urlmeta/package-info.java similarity index 62% copy from src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html copy to src/plugin/urlmeta/src/java/org/apache/nutch/indexer/urlmeta/package-info.java index 4181951..1a1239b 100644 --- a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html +++ b/src/plugin/urlmeta/src/java/org/apache/nutch/indexer/urlmeta/package-info.java @@ -14,8 +14,14 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -<html> -<body> -<p>Protocol plugin which supports retrieving documents via the http protocol.</p><p></p> -</body> -</html> + +/** + * <p>URL Meta Tag Indexing Plugin</p> + * <p>Takes Meta Tags, injected alongside a URL + * (see <A href="https://issues.apache.org/jira/browse/NUTCH-655">NUTCH-655</a>) + * and specified in the "urlmeta.tags" property, and inserts them into + * the document--which is then sent to the Indexer. If you specify + * these fields in the Nutch schema (as well as the Indexer's), you + * can reasonably assume that they will be indexed.</p> + */ +package org.apache.nutch.indexer.urlmeta; diff --git a/src/plugin/urlmeta/src/java/org/apache/nutch/indexer/urlmeta/package.html b/src/plugin/urlmeta/src/java/org/apache/nutch/indexer/urlmeta/package.html deleted file mode 100644 index 5da5d56..0000000 --- a/src/plugin/urlmeta/src/java/org/apache/nutch/indexer/urlmeta/package.html +++ /dev/null @@ -1,12 +0,0 @@ -<html> - <body> - <p> - URL Meta Tag Indexing Plugin - </p> - <p> - Takes Meta Tags, injected alongside a URL (see NUTCH-655) and specified in the "urlmeta.tags" property, - and inserts them into the document--which is then sent to the Indexer. If you specify these fields in - the Nutch's schema (as well as the Indexer's), you can reasonably assume that they will be indexed. 
- </p> - </body> -</html> diff --git a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html b/src/plugin/urlmeta/src/java/org/apache/nutch/scoring/urlmeta/package-info.java similarity index 69% rename from src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html rename to src/plugin/urlmeta/src/java/org/apache/nutch/scoring/urlmeta/package-info.java index 4181951..df182c4 100644 --- a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html +++ b/src/plugin/urlmeta/src/java/org/apache/nutch/scoring/urlmeta/package-info.java @@ -14,8 +14,13 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -<html> -<body> -<p>Protocol plugin which supports retrieving documents via the http protocol.</p><p></p> -</body> -</html> + +/** + * <p>URL Meta Tag Scoring Plugin</p> + * <p>Propagates Meta Tags, injected alongside a URL + * (see <a href="https://issues.apache.org/jira/browse/NUTCH-655">NUTCH-655</a>) + * and specified in the "urlmeta.tags" property, along to their outlinks. + * This does not actually perform scoring.</p> + */ +package org.apache.nutch.scoring.urlmeta; + diff --git a/src/plugin/urlmeta/src/java/org/apache/nutch/scoring/urlmeta/package.html b/src/plugin/urlmeta/src/java/org/apache/nutch/scoring/urlmeta/package.html deleted file mode 100644 index 5bba7a8..0000000 --- a/src/plugin/urlmeta/src/java/org/apache/nutch/scoring/urlmeta/package.html +++ /dev/null @@ -1,11 +0,0 @@ -<html> - <body> - <p> - URL Meta Tag Scoring Plugin - </p> - <p> - Propagates Meta Tags, injected alongside a URL (see NUTCH-655) and specified in the "urlmeta.tags" property, - along to their outlinks. This does not actually perform scoring. - </p> - </body> -</html>