This is an automated email from the ASF dual-hosted git repository. snagel pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git
The following commit(s) were added to refs/heads/master by this push: new 0e3e021 NUTCH-2869 Add @Override annotations to Nutch plugins - add/complete @Override annotations for methods implementing interfaces - plugins implementing the ScoringFilter interface: extend AbstractScoringFilter and get rid of default method implementations - URL filters/normalizers: remove unused methods including a CrawlDatum parameter - improve Javadoc and documentation in build and config files new 41bf0a1 Merge pull request #650 from sebastian-nagel/NUTCH-2869-plugins-override-annotation 0e3e021 is described below commit 0e3e021d088b03b83de07963cc0c363c90aaacda Author: Sebastian Nagel <sna...@apache.org> AuthorDate: Thu Jun 10 14:50:31 2021 +0200 NUTCH-2869 Add @Override annotations to Nutch plugins - add/complete @Override annotations for methods implementing interfaces - plugins implementing the ScoringFilter interface: extend AbstractScoringFilter and get rid of default method implementations - URL filters/normalizers: remove unused methods including a CrawlDatum parameter - improve Javadoc and documentation in build and config files --- build.xml | 2 +- conf/nutch-default.xml | 6 ++- src/java/org/apache/nutch/net/URLFilter.java | 14 +++-- src/java/org/apache/nutch/net/URLFilters.java | 19 ++++--- .../nutch/scoring/AbstractScoringFilter.java | 11 +++- .../creativecommons/nutch/CCIndexingFilter.java | 3 ++ .../org/creativecommons/nutch/CCParseFilter.java | 3 ++ .../nutch/indexer/feed/FeedIndexingFilter.java | 3 ++ .../org/apache/nutch/parse/feed/FeedParser.java | 3 ++ .../nutch/parse/headings/HeadingsParseFilter.java | 3 ++ .../nutch/indexer/anchor/AnchorIndexingFilter.java | 3 ++ .../nutch/indexer/basic/BasicIndexingFilter.java | 3 ++ .../nutch/indexer/links/LinksIndexingFilter.java | 2 + .../nutch/indexer/metadata/MetadataIndexer.java | 3 ++ .../nutch/indexer/more/MoreIndexingFilter.java | 3 ++ .../nutch/indexer/replace/ReplaceIndexer.java | 12 ++--- 
.../indexer/staticfield/StaticFieldIndexer.java | 3 ++ .../nutch/analysis/lang/HTMLLanguageParser.java | 3 ++ .../analysis/lang/LanguageIndexingFilter.java | 3 ++ .../apache/nutch/protocol/http/api/HttpBase.java | 3 ++ .../nutch/urlfilter/api/RegexURLFilterBase.java | 3 ++ .../microformats/reltag/RelTagIndexingFilter.java | 14 ++--- .../nutch/microformats/reltag/RelTagParser.java | 3 ++ .../java/org/apache/nutch/parse/ext/ExtParser.java | 3 ++ .../org/apache/nutch/parse/html/HtmlParser.java | 1 + .../org/apache/nutch/parse/js/JSParseFilter.java | 2 + .../nutch/parse/metatags/MetaTagsParser.java | 3 ++ .../org/apache/nutch/parse/tika/TikaParser.java | 3 ++ .../java/org/apache/nutch/parse/zip/ZipParser.java | 3 ++ .../naivebayes/NaiveBayesParseFilter.java | 2 + .../nutch/parsefilter/regex/RegexParseFilter.java | 3 ++ .../java/org/apache/nutch/protocol/file/File.java | 3 ++ .../java/org/apache/nutch/protocol/ftp/Ftp.java | 4 ++ .../org/apache/nutch/protocol/htmlunit/Http.java | 4 +- .../nutch/protocol/htmlunit/HttpResponse.java | 5 ++ .../java/org/apache/nutch/protocol/http/Http.java | 2 + .../org/apache/nutch/protocol/httpclient/Http.java | 2 + .../org/apache/nutch/protocol/okhttp/OkHttp.java | 2 + .../nutch/scoring/depth/DepthScoringFilter.java | 1 + .../scoring/link/LinkAnalysisScoringFilter.java | 36 ++++--------- .../scoring/metadata/MetadataScoringFilter.java | 4 ++ .../nutch/scoring/opic/OPICScoringFilter.java | 10 ++++ .../nutch/scoring/orphan/OrphanScoringFilter.java | 3 ++ .../org/apache/nutch/collection/Subcollection.java | 1 + .../subcollection/SubcollectionIndexingFilter.java | 3 ++ .../nutch/indexer/tld/TLDIndexingFilter.java | 7 +-- .../apache/nutch/scoring/tld/TLDScoringFilter.java | 61 ++-------------------- .../nutch/urlfilter/domain/DomainURLFilter.java | 3 ++ .../domaindenylist/DomainDenylistURLFilter.java | 3 ++ .../nutch/urlfilter/prefix/PrefixURLFilter.java | 3 ++ .../nutch/urlfilter/suffix/SuffixURLFilter.java | 3 ++ 
.../nutch/urlfilter/validator/UrlValidator.java | 3 ++ .../indexer/urlmeta/URLMetaIndexingFilter.java | 4 +- .../scoring/urlmeta/URLMetaScoringFilter.java | 49 +++-------------- .../net/urlnormalizer/ajax/AjaxURLNormalizer.java | 3 ++ .../net/urlnormalizer/host/HostURLNormalizer.java | 3 ++ .../net/urlnormalizer/pass/PassURLNormalizer.java | 3 ++ .../protocol/ProtocolURLNormalizer.java | 7 ++- .../querystring/QuerystringURLNormalizer.java | 3 ++ .../urlnormalizer/regex/RegexURLNormalizer.java | 2 + .../urlnormalizer/slash/SlashURLNormalizer.java | 7 ++- 61 files changed, 216 insertions(+), 173 deletions(-) diff --git a/build.xml b/build.xml index dcb7b94..1180dea 100644 --- a/build.xml +++ b/build.xml @@ -647,7 +647,7 @@ <typefound uri="antlib:org.apache.ivy.ant" name="cleancache" /> </not> </condition> - You need Apache Ivy 2.0 or later from http://ant.apache.org/ + You need Apache Ivy 2.5.0 or later from https://ant.apache.org/ It could not be loaded from ${ivy.repo.url} </fail> </target> diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml index 3e867e6..1e89745 100644 --- a/conf/nutch-default.xml +++ b/conf/nutch-default.xml @@ -2635,7 +2635,11 @@ Add scoring-metadata to the list of active plugins <name>publisher.order</name> <value></value> <description> - The order in which the publisher queues would be loaded + The order in which the publisher queues would be loaded. If + empty, all available publishers (see properties plugin-includes + and plugin-excludes) are loaded and applied in system defined + order. If not empty, only named publishers are loaded and applied + in the given order. 
</description> </property> diff --git a/src/java/org/apache/nutch/net/URLFilter.java b/src/java/org/apache/nutch/net/URLFilter.java index afbd1e0..6767b98 100644 --- a/src/java/org/apache/nutch/net/URLFilter.java +++ b/src/java/org/apache/nutch/net/URLFilter.java @@ -21,17 +21,23 @@ import org.apache.hadoop.conf.Configurable; import org.apache.nutch.plugin.Pluggable; /** - * Interface used to limit which URLs enter Nutch. Used by the injector and the - * db updater. + * Interface used to limit which URLs enter Nutch. Used per default by injector, + * fetcher and parser for all URLs seen first (seeds, outlinks, redirects). URL + * filters can be optionally enabled for many more Nutch tools. */ - public interface URLFilter extends Pluggable, Configurable { + /** The name of the extension point. */ public final static String X_POINT_ID = URLFilter.class.getName(); - /* + /** * Interface for a filter that transforms a URL: it can pass the original URL * through or "delete" the URL by returning null + * + * @param urlString + * the URL string the filter is applied on + * @return the original URL string if the URL is accepted by the filter or + * null in case the URL is rejected */ public String filter(String urlString); } diff --git a/src/java/org/apache/nutch/net/URLFilters.java b/src/java/org/apache/nutch/net/URLFilters.java index ed58650..4404626 100644 --- a/src/java/org/apache/nutch/net/URLFilters.java +++ b/src/java/org/apache/nutch/net/URLFilters.java @@ -19,7 +19,11 @@ package org.apache.nutch.net; import org.apache.hadoop.conf.Configuration; import org.apache.nutch.plugin.PluginRepository; -/** Creates and caches {@link URLFilter} implementing plugins. */ +/** + * Creates and caches plugins implementing {@link URLFilter}. Filters URLs using + * the active filters defined by the properties "plugin.includes", + * "plugin.excludes" and "urlfilter.order". 
+ */ public class URLFilters { public static final String URLFILTER_ORDER = "urlfilter.order"; @@ -34,12 +38,15 @@ public class URLFilters { return this.filters; } - /** - * Run all defined filters. Assume logical AND. - * @param urlString to execute filters on + /** + * Run all defined filters. Assume logical AND. To control performance, the + * URLFilter classes can be ordered by the property "urlfilter.order". + * + * @param urlString + * to execute filters on * @return filtered result - * @throws URLFilterException if there is an issue executing - * any URLFilter implementations. + * @throws URLFilterException + * if there is an issue executing any URLFilter implementations. */ public String filter(String urlString) throws URLFilterException { for (int i = 0; i < this.filters.length; i++) { diff --git a/src/java/org/apache/nutch/scoring/AbstractScoringFilter.java b/src/java/org/apache/nutch/scoring/AbstractScoringFilter.java index 94e1732..e6ee206 100644 --- a/src/java/org/apache/nutch/scoring/AbstractScoringFilter.java +++ b/src/java/org/apache/nutch/scoring/AbstractScoringFilter.java @@ -28,48 +28,55 @@ import org.apache.nutch.indexer.NutchDocument; import org.apache.nutch.parse.Parse; import org.apache.nutch.parse.ParseData; import org.apache.nutch.protocol.Content; -import org.apache.nutch.scoring.ScoringFilter; -import org.apache.nutch.scoring.ScoringFilterException; public abstract class AbstractScoringFilter implements ScoringFilter { private Configuration conf; + @Override public Configuration getConf() { return conf; } + @Override public void setConf(Configuration conf) { this.conf = conf; } + @Override public void injectedScore(Text url, CrawlDatum datum) throws ScoringFilterException { } + @Override public void initialScore(Text url, CrawlDatum datum) throws ScoringFilterException { } + @Override public float generatorSortValue(Text url, CrawlDatum datum, float initSort) throws ScoringFilterException { return initSort; } + @Override public void 
passScoreBeforeParsing(Text url, CrawlDatum datum, Content content) throws ScoringFilterException { } + @Override public void passScoreAfterParsing(Text url, Content content, Parse parse) throws ScoringFilterException { } + @Override public CrawlDatum distributeScoreToOutlinks(Text fromUrl, ParseData parseData, Collection<Entry<Text, CrawlDatum>> targets, CrawlDatum adjust, int allCount) throws ScoringFilterException { return adjust; } + @Override public void updateDbScore(Text url, CrawlDatum old, CrawlDatum datum, List<CrawlDatum> inlinked) throws ScoringFilterException { } diff --git a/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java b/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java index e0a4253..bb3560d 100644 --- a/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java +++ b/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java @@ -49,6 +49,7 @@ public class CCIndexingFilter implements IndexingFilter { private Configuration conf; + @Override public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks) throws IndexingException { @@ -115,10 +116,12 @@ public class CCIndexingFilter implements IndexingFilter { doc.add(FIELD, feature); } + @Override public void setConf(Configuration conf) { this.conf = conf; } + @Override public Configuration getConf() { return this.conf; } diff --git a/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java b/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java index ba10432..9e7676d 100644 --- a/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java +++ b/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java @@ -277,6 +277,7 @@ public class CCParseFilter implements HtmlParseFilter { * Adds metadata or otherwise modifies a parse of an HTML document, given 
the * DOM tree of a page. */ + @Override public ParseResult filter(Content content, ParseResult parseResult, HTMLMetaTags metaTags, DocumentFragment doc) { @@ -307,10 +308,12 @@ public class CCParseFilter implements HtmlParseFilter { return parseResult; } + @Override public void setConf(Configuration conf) { this.conf = conf; } + @Override public Configuration getConf() { return this.conf; } diff --git a/src/plugin/feed/src/java/org/apache/nutch/indexer/feed/FeedIndexingFilter.java b/src/plugin/feed/src/java/org/apache/nutch/indexer/feed/FeedIndexingFilter.java index 5a2fa77..901caa6 100644 --- a/src/plugin/feed/src/java/org/apache/nutch/indexer/feed/FeedIndexingFilter.java +++ b/src/plugin/feed/src/java/org/apache/nutch/indexer/feed/FeedIndexingFilter.java @@ -64,6 +64,7 @@ public class FeedIndexingFilter implements IndexingFilter { * And sends them to the {@link org.apache.nutch.indexer Indexer} for indexing within the Nutch index. * */ + @Override public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks) throws IndexingException { ParseData parseData = parse.getData(); @@ -107,6 +108,7 @@ public class FeedIndexingFilter implements IndexingFilter { * @return the {@link Configuration} object used to configure this * {@link IndexingFilter}. */ + @Override public Configuration getConf() { return conf; } @@ -119,6 +121,7 @@ public class FeedIndexingFilter implements IndexingFilter { * The {@link Configuration} object used to configure this * {@link IndexingFilter}. 
*/ + @Override public void setConf(Configuration conf) { this.conf = conf; } diff --git a/src/plugin/feed/src/java/org/apache/nutch/parse/feed/FeedParser.java b/src/plugin/feed/src/java/org/apache/nutch/parse/feed/FeedParser.java index 646c4f9..cecd366 100644 --- a/src/plugin/feed/src/java/org/apache/nutch/parse/feed/FeedParser.java +++ b/src/plugin/feed/src/java/org/apache/nutch/parse/feed/FeedParser.java @@ -100,6 +100,7 @@ public class FeedParser implements Parser { * present in the feed file that this {@link Parser} dealt with. * */ + @Override public ParseResult getParse(Content content) { SyndFeed feed = null; ParseResult parseResult = new ParseResult(content.getUrl()); @@ -162,6 +163,7 @@ public class FeedParser implements Parser { * {@link Parser}. * */ + @Override public void setConf(Configuration conf) { this.conf = conf; this.parserFactory = new ParserFactory(conf); @@ -176,6 +178,7 @@ public class FeedParser implements Parser { * @return The {@link Configuration} object used to configure this * {@link Parser}. 
*/ + @Override public Configuration getConf() { return this.conf; } diff --git a/src/plugin/headings/src/java/org/apache/nutch/parse/headings/HeadingsParseFilter.java b/src/plugin/headings/src/java/org/apache/nutch/parse/headings/HeadingsParseFilter.java index 4b446bb..57d6de9 100644 --- a/src/plugin/headings/src/java/org/apache/nutch/parse/headings/HeadingsParseFilter.java +++ b/src/plugin/headings/src/java/org/apache/nutch/parse/headings/HeadingsParseFilter.java @@ -45,6 +45,7 @@ public class HeadingsParseFilter implements HtmlParseFilter { private String[] headings; private boolean multiValued = false; + @Override public ParseResult filter(Content content, ParseResult parseResult, HTMLMetaTags metaTags, DocumentFragment doc) { Parse parse = parseResult.get(content.getUrl()); @@ -68,6 +69,7 @@ public class HeadingsParseFilter implements HtmlParseFilter { return parseResult; } + @Override public void setConf(Configuration conf) { this.conf = conf; @@ -75,6 +77,7 @@ public class HeadingsParseFilter implements HtmlParseFilter { multiValued = conf.getBoolean("headings.multivalued", false); } + @Override public Configuration getConf() { return this.conf; } diff --git a/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java b/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java index 2b280d5..7493c31 100644 --- a/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java +++ b/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java @@ -46,6 +46,7 @@ public class AnchorIndexingFilter implements IndexingFilter { /** * Set the {@link Configuration} object */ + @Override public void setConf(Configuration conf) { this.conf = conf; @@ -56,6 +57,7 @@ public class AnchorIndexingFilter implements IndexingFilter { /** * Get the {@link Configuration} object */ + @Override public Configuration getConf() { return this.conf; } @@ -77,6 +79,7 @@ 
public class AnchorIndexingFilter implements IndexingFilter { * The {@link Inlinks} containing anchor text * @return filtered NutchDocument */ + @Override public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks) throws IndexingException { diff --git a/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java b/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java index 94cd1fc..0eab1a7 100644 --- a/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java +++ b/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java @@ -68,6 +68,7 @@ public class BasicIndexingFilter implements IndexingFilter { * The {@link Inlinks} containing anchor text * @return filtered NutchDocument */ + @Override public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks) throws IndexingException { @@ -135,6 +136,7 @@ public class BasicIndexingFilter implements IndexingFilter { /** * Set the {@link Configuration} object */ + @Override public void setConf(Configuration conf) { this.conf = conf; this.MAX_TITLE_LENGTH = conf.getInt("indexer.max.title.length", 100); @@ -145,6 +147,7 @@ public class BasicIndexingFilter implements IndexingFilter { /** * Get the {@link Configuration} object */ + @Override public Configuration getConf() { return this.conf; } diff --git a/src/plugin/index-links/src/java/org/apache/nutch/indexer/links/LinksIndexingFilter.java b/src/plugin/index-links/src/java/org/apache/nutch/indexer/links/LinksIndexingFilter.java index 35370f2..4833237 100644 --- a/src/plugin/index-links/src/java/org/apache/nutch/indexer/links/LinksIndexingFilter.java +++ b/src/plugin/index-links/src/java/org/apache/nutch/indexer/links/LinksIndexingFilter.java @@ -156,6 +156,7 @@ public class LinksIndexingFilter implements IndexingFilter { } } + @Override public void 
setConf(Configuration conf) { this.conf = conf; filterOutlinks = conf.getBoolean(LINKS_OUTLINKS_HOST, false); @@ -164,6 +165,7 @@ public class LinksIndexingFilter implements IndexingFilter { indexHost = conf.getBoolean(LINKS_ONLY_HOSTS, false); } + @Override public Configuration getConf() { return this.conf; } diff --git a/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java b/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java index e2f722c..a8eb8ef 100644 --- a/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java +++ b/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java @@ -49,6 +49,7 @@ public class MetadataIndexer implements IndexingFilter { private static final String separator_CONF_PROPERTY = "index.metadata.separator"; private static final String mvfields_CONF_PROPERTY = "index.metadata.multivalued.fields"; + @Override public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks) throws IndexingException { @@ -105,6 +106,7 @@ public class MetadataIndexer implements IndexingFilter { } } + @Override public void setConf(Configuration conf) { this.conf = conf; dbFieldnames = conf.getStrings(db_CONF_PROPERTY); @@ -119,6 +121,7 @@ public class MetadataIndexer implements IndexingFilter { } + @Override public Configuration getConf() { return this.conf; } diff --git a/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java b/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java index 2a475c5..6f40359 100644 --- a/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java +++ b/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java @@ -105,6 +105,7 @@ public class MoreIndexingFilter implements IndexingFilter { "yyyy-MM-dd'T'HH:mm:ssXXX" }; private String[] 
dateStyles = null; + @Override public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks) throws IndexingException { @@ -311,6 +312,7 @@ public class MoreIndexingFilter implements IndexingFilter { return doc; } + @Override public void setConf(Configuration conf) { this.conf = conf; MIME = new MimeUtil(conf); @@ -352,6 +354,7 @@ public class MoreIndexingFilter implements IndexingFilter { } } + @Override public Configuration getConf() { return this.conf; } diff --git a/src/plugin/index-replace/src/java/org/apache/nutch/indexer/replace/ReplaceIndexer.java b/src/plugin/index-replace/src/java/org/apache/nutch/indexer/replace/ReplaceIndexer.java index 503310a..8dde66f 100644 --- a/src/plugin/index-replace/src/java/org/apache/nutch/indexer/replace/ReplaceIndexer.java +++ b/src/plugin/index-replace/src/java/org/apache/nutch/indexer/replace/ReplaceIndexer.java @@ -102,9 +102,7 @@ public class ReplaceIndexer implements IndexingFilter { private Configuration conf; - /** - * {@inheritDoc} - */ + @Override public void setConf(Configuration conf) { this.conf = conf; FIELDREPLACERS_BY_HOST.clear(); @@ -116,9 +114,7 @@ public class ReplaceIndexer implements IndexingFilter { } } - /** - * {@inheritDoc} - */ + @Override public Configuration getConf() { return this.conf; } @@ -233,9 +229,7 @@ public class ReplaceIndexer implements IndexingFilter { } } - /** - * {@inheritDoc} - */ + @Override public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks) throws IndexingException { diff --git a/src/plugin/index-static/src/java/org/apache/nutch/indexer/staticfield/StaticFieldIndexer.java b/src/plugin/index-static/src/java/org/apache/nutch/indexer/staticfield/StaticFieldIndexer.java index bd68dd1..c022ca7 100644 --- a/src/plugin/index-static/src/java/org/apache/nutch/indexer/staticfield/StaticFieldIndexer.java +++ 
b/src/plugin/index-static/src/java/org/apache/nutch/indexer/staticfield/StaticFieldIndexer.java @@ -59,6 +59,7 @@ public class StaticFieldIndexer implements IndexingFilter { * The {@link Inlinks} containing anchor text * @return filtered NutchDocument */ + @Override public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks) throws IndexingException { @@ -99,6 +100,7 @@ public class StaticFieldIndexer implements IndexingFilter { /** * Set the {@link Configuration} object */ + @Override public void setConf(Configuration conf) { this.conf = conf; @@ -117,6 +119,7 @@ public class StaticFieldIndexer implements IndexingFilter { /** * Get the {@link Configuration} object */ + @Override public Configuration getConf() { return this.conf; } diff --git a/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java b/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java index 28878dc..41fe099 100644 --- a/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java +++ b/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java @@ -85,6 +85,7 @@ public class HTMLLanguageParser implements HtmlParseFilter { * -html.shtml#language) <li>3. 
meta http-equiv (content-language) * (http://www.w3.org/TR/REC-html40/struct/global.html#h-7.4.4.2) <br></ul> */ + @Override public ParseResult filter(Content content, ParseResult parseResult, HTMLMetaTags metaTags, DocumentFragment doc) { String lang = null; @@ -300,6 +301,7 @@ public class HTMLLanguageParser implements HtmlParseFilter { } + @Override public void setConf(Configuration conf) { this.conf = conf; contentMaxlength = conf.getInt("lang.analyze.max.length", -1); @@ -314,6 +316,7 @@ public class HTMLLanguageParser implements HtmlParseFilter { } } + @Override public Configuration getConf() { return this.conf; } diff --git a/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java b/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java index 10289e5..ed9362e 100644 --- a/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java +++ b/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java @@ -60,6 +60,7 @@ public class LanguageIndexingFilter implements IndexingFilter { } // Inherited JavaDoc + @Override public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks) throws IndexingException { @@ -84,11 +85,13 @@ public class LanguageIndexingFilter implements IndexingFilter { return doc; } + @Override public void setConf(Configuration conf) { this.conf = conf; indexLangs = new HashSet<>(conf.getStringCollection("lang.index.languages")); } + @Override public Configuration getConf() { return this.conf; } diff --git a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java index 58dfbfe..ce999b3 100644 --- a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java +++ 
b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java @@ -203,6 +203,7 @@ public abstract class HttpBase implements Protocol { robots = new HttpRobotRulesParser(); } + @Override public void setConf(Configuration conf) { this.conf = conf; this.proxyHost = conf.get("http.proxy.host"); @@ -373,10 +374,12 @@ public abstract class HttpBase implements Protocol { logConf(); } + @Override public Configuration getConf() { return this.conf; } + @Override public ProtocolOutput getProtocolOutput(Text url, CrawlDatum datum) { String urlString = url.toString(); diff --git a/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java b/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java index af54c00..0ddb698 100644 --- a/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java +++ b/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java @@ -168,6 +168,7 @@ public abstract class RegexURLFilterBase implements URLFilter { protected abstract Reader getRulesReader(Configuration conf) throws IOException; + @Override public String filter(String url) { String host = null; String domain = null; @@ -205,6 +206,7 @@ public abstract class RegexURLFilterBase implements URLFilter { return null; } + @Override public void setConf(Configuration conf) { this.conf = conf; Reader reader = null; @@ -226,6 +228,7 @@ public abstract class RegexURLFilterBase implements URLFilter { } } + @Override public Configuration getConf() { return this.conf; } diff --git a/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagIndexingFilter.java b/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagIndexingFilter.java index e0fcfa7..b2121d9 100644 --- a/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagIndexingFilter.java +++ 
b/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagIndexingFilter.java @@ -38,7 +38,7 @@ public class RelTagIndexingFilter implements IndexingFilter { private Configuration conf; - // Inherited JavaDoc + @Override public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks) throws IndexingException { @@ -54,22 +54,14 @@ public class RelTagIndexingFilter implements IndexingFilter { return doc; } - /* - * ----------------------------- * <implementation:Configurable> * - * ----------------------------- - */ - + @Override public void setConf(Configuration conf) { this.conf = conf; } + @Override public Configuration getConf() { return this.conf; } - /* - * ------------------------------ * </implementation:Configurable> * - * ------------------------------ - */ - } diff --git a/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java b/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java index 0efcbb3..3d96a7b 100644 --- a/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java +++ b/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java @@ -58,6 +58,7 @@ public class RelTagParser implements HtmlParseFilter { /** * Scan the HTML document looking at possible rel-tags */ + @Override public ParseResult filter(Content content, ParseResult parseResult, HTMLMetaTags metaTags, DocumentFragment doc) { @@ -136,10 +137,12 @@ public class RelTagParser implements HtmlParseFilter { } + @Override public void setConf(Configuration conf) { this.conf = conf; } + @Override public Configuration getConf() { return this.conf; } diff --git a/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java b/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java index dfebb53..525cfc1 100644 --- 
a/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java +++ b/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java @@ -66,6 +66,7 @@ public class ExtParser implements Parser { public ExtParser() { } + @Override public ParseResult getParse(Content content) { String contentType = content.getContentType(); @@ -141,6 +142,7 @@ public class ExtParser implements Parser { parseData)); } + @Override public void setConf(Configuration conf) { this.conf = conf; Extension[] extensions = PluginRepository.get(conf) @@ -177,6 +179,7 @@ public class ExtParser implements Parser { } } + @Override public Configuration getConf() { return this.conf; } diff --git a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java index 5852b14..6a6d49d 100644 --- a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java +++ b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java @@ -143,6 +143,7 @@ public class HtmlParser implements Parser { private String cachingPolicy; + @Override public ParseResult getParse(Content content) { HTMLMetaTags metaTags = new HTMLMetaTags(); diff --git a/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java b/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java index e6527e2..c27ef4a 100644 --- a/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java +++ b/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java @@ -302,10 +302,12 @@ public class JSParseFilter implements HtmlParseFilter, Parser { System.out.println(" - " + links[i]); } + @Override public void setConf(Configuration conf) { this.conf = conf; } + @Override public Configuration getConf() { return this.conf; } diff --git a/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/metatags/MetaTagsParser.java 
b/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/metatags/MetaTagsParser.java index 8deaf18..6cef438 100644 --- a/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/metatags/MetaTagsParser.java +++ b/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/metatags/MetaTagsParser.java @@ -48,6 +48,7 @@ public class MetaTagsParser implements HtmlParseFilter { private Set<String> metatagset = new HashSet<String>(); + @Override public void setConf(Configuration conf) { this.conf = conf; // specify whether we want a specific subset of metadata @@ -58,6 +59,7 @@ public class MetaTagsParser implements HtmlParseFilter { } } + @Override public Configuration getConf() { return this.conf; } @@ -95,6 +97,7 @@ public class MetaTagsParser implements HtmlParseFilter { } } + @Override public ParseResult filter(Content content, ParseResult parseResult, HTMLMetaTags metaTags, DocumentFragment doc) { diff --git a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java index d97e8b4..4b79eee 100644 --- a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java +++ b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java @@ -80,6 +80,7 @@ public class TikaParser implements org.apache.nutch.parse.Parser { private String boilerpipeExtractorName; private Set<String> boilerpipeMimeTypes; + @Override public ParseResult getParse(Content content) { HTMLDocumentImpl doc = new HTMLDocumentImpl(); doc.setErrorChecking(false); @@ -257,6 +258,7 @@ public class TikaParser implements org.apache.nutch.parse.Parser { return filteredParse; } + @Override public void setConf(Configuration conf) { this.conf = conf; this.tikaConfig = null; @@ -324,6 +326,7 @@ public class TikaParser implements org.apache.nutch.parse.Parser { parseEmbedded = conf.getBoolean("tika.parse.embedded", true); } + @Override public Configuration getConf() { return this.conf; } 
diff --git a/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java b/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java index c4b953e..a605f3b 100644 --- a/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java +++ b/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java @@ -54,6 +54,7 @@ public class ZipParser implements Parser { public ZipParser() { } + @Override public ParseResult getParse(final Content content) { String resultText = null; @@ -109,10 +110,12 @@ public class ZipParser implements Parser { resultText, parseData)); } + @Override public void setConf(Configuration conf) { this.conf = conf; } + @Override public Configuration getConf() { return this.conf; } diff --git a/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/NaiveBayesParseFilter.java b/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/NaiveBayesParseFilter.java index 25354bd..76821a2 100644 --- a/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/NaiveBayesParseFilter.java +++ b/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/NaiveBayesParseFilter.java @@ -104,6 +104,7 @@ public class NaiveBayesParseFilter implements HtmlParseFilter { return false; } + @Override public void setConf(Configuration conf) { this.conf = conf; inputFilePath = conf.get(TRAINFILE_MODELFILTER); @@ -150,6 +151,7 @@ public class NaiveBayesParseFilter implements HtmlParseFilter { } + @Override public Configuration getConf() { return this.conf; } diff --git a/src/plugin/parsefilter-regex/src/java/org/apache/nutch/parsefilter/regex/RegexParseFilter.java b/src/plugin/parsefilter-regex/src/java/org/apache/nutch/parsefilter/regex/RegexParseFilter.java index 6e86fc6..bc17eb0 100644 --- a/src/plugin/parsefilter-regex/src/java/org/apache/nutch/parsefilter/regex/RegexParseFilter.java +++ 
b/src/plugin/parsefilter-regex/src/java/org/apache/nutch/parsefilter/regex/RegexParseFilter.java @@ -56,6 +56,7 @@ public class RegexParseFilter implements HtmlParseFilter { private static final Map<String,RegexRule> rules = new HashMap<>(); + @Override public ParseResult filter(Content content, ParseResult parseResult, HTMLMetaTags metaTags, DocumentFragment doc) { Parse parse = parseResult.get(content.getUrl()); String html = new String(content.getContent()); @@ -87,6 +88,7 @@ public class RegexParseFilter implements HtmlParseFilter { return parseResult; } + @Override public void setConf(Configuration conf) { this.conf = conf; @@ -139,6 +141,7 @@ public class RegexParseFilter implements HtmlParseFilter { } } + @Override public Configuration getConf() { return this.conf; } diff --git a/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java b/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java index 4120cbb..d55e42e 100644 --- a/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java +++ b/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java @@ -70,6 +70,7 @@ public class File implements Protocol { /** * Set the {@link Configuration} object */ + @Override public void setConf(Configuration conf) { this.conf = conf; this.maxContentLength = conf.getInt("file.content.limit", 1024 * 1024); @@ -81,6 +82,7 @@ public class File implements Protocol { /** * Get the {@link Configuration} object */ + @Override public Configuration getConf() { return this.conf; } @@ -105,6 +107,7 @@ public class File implements Protocol { * @return {@link ProtocolOutput} object for the content of the file indicated * by url */ + @Override public ProtocolOutput getProtocolOutput(Text url, CrawlDatum datum) { String urlString = url.toString(); try { diff --git a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java index 
470e151..2a47b63 100644 --- a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java +++ b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java @@ -140,6 +140,7 @@ public class Ftp implements Protocol { * * @return {@link ProtocolOutput} object for the url */ + @Override public ProtocolOutput getProtocolOutput(Text url, CrawlDatum datum) { String urlString = url.toString(); try { @@ -186,6 +187,7 @@ public class Ftp implements Protocol { } } + @Override protected void finalize() { try { if (this.client != null && this.client.isConnected()) { @@ -272,6 +274,7 @@ public class Ftp implements Protocol { /** * Set the {@link Configuration} object */ + @Override public void setConf(Configuration conf) { this.conf = conf; this.maxContentLength = conf.getInt("ftp.content.limit", 1024 * 1024); @@ -287,6 +290,7 @@ public class Ftp implements Protocol { /** * Get the {@link Configuration} object */ + @Override public Configuration getConf() { return this.conf; } diff --git a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/Http.java b/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/Http.java index b093e5c..40a6941 100644 --- a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/Http.java +++ b/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/Http.java @@ -48,6 +48,7 @@ public class Http extends HttpBase { * * @param conf a popultaed {@link Configuration} */ + @Override public void setConf(Configuration conf) { super.setConf(conf); } @@ -57,7 +58,8 @@ public class Http extends HttpBase { http.setConf(NutchConfiguration.create()); main(http, args); } - + + @Override protected Response getResponse(URL url, CrawlDatum datum, boolean redirect) throws ProtocolException, IOException { return new HttpResponse(this, url, datum); diff --git a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java 
b/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java index 58e809a..ae876e0 100644 --- a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java +++ b/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java @@ -305,22 +305,27 @@ public class HttpResponse implements Response { * ------------------------- */ + @Override public URL getUrl() { return url; } + @Override public int getCode() { return code; } + @Override public String getHeader(String name) { return headers.get(name); } + @Override public Metadata getHeaders() { return headers; } + @Override public byte[] getContent() { return content; } diff --git a/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/Http.java b/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/Http.java index b85c47a..cc10221 100644 --- a/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/Http.java +++ b/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/Http.java @@ -48,6 +48,7 @@ public class Http extends HttpBase { * * @param conf a populated {@link Configuration} */ + @Override public void setConf(Configuration conf) { super.setConf(conf); } @@ -58,6 +59,7 @@ public class Http extends HttpBase { main(http, args); } + @Override protected Response getResponse(URL url, CrawlDatum datum, boolean redirect) throws ProtocolException, IOException { return new HttpResponse(this, url, datum); diff --git a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java b/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java index 2247f5e..5942486 100644 --- a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java +++ b/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java @@ -128,6 +128,7 @@ public class Http extends HttpBase { * @param conf * Configuration */ + 
@Override public void setConf(Configuration conf) { super.setConf(conf); Http.conf = conf; @@ -174,6 +175,7 @@ public class Http extends HttpBase { * Follow redirects if and only if true * @return HTTP response */ + @Override protected Response getResponse(URL url, CrawlDatum datum, boolean redirect) throws ProtocolException, IOException { resolveCredentials(url); diff --git a/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttp.java b/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttp.java index 9fbe9fa..65cb2d3 100644 --- a/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttp.java +++ b/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttp.java @@ -106,6 +106,7 @@ public class OkHttp extends HttpBase { super(LOG); } + @Override public void setConf(Configuration conf) { super.setConf(conf); @@ -328,6 +329,7 @@ public class OkHttp extends HttpBase { return client; } + @Override protected Response getResponse(URL url, CrawlDatum datum, boolean redirect) throws ProtocolException, IOException { return new OkHttpResponse(this, url, datum); diff --git a/src/plugin/scoring-depth/src/java/org/apache/nutch/scoring/depth/DepthScoringFilter.java b/src/plugin/scoring-depth/src/java/org/apache/nutch/scoring/depth/DepthScoringFilter.java index 29b119b..e6aa7a6 100644 --- a/src/plugin/scoring-depth/src/java/org/apache/nutch/scoring/depth/DepthScoringFilter.java +++ b/src/plugin/scoring-depth/src/java/org/apache/nutch/scoring/depth/DepthScoringFilter.java @@ -138,6 +138,7 @@ public class DepthScoringFilter extends Configured implements ScoringFilter { return initSort * (1 + mul); } + @Override public float indexerScore(Text url, NutchDocument doc, CrawlDatum dbDatum, CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore) throws ScoringFilterException { diff --git a/src/plugin/scoring-link/src/java/org/apache/nutch/scoring/link/LinkAnalysisScoringFilter.java 
b/src/plugin/scoring-link/src/java/org/apache/nutch/scoring/link/LinkAnalysisScoringFilter.java index c98ccce..41895ea 100644 --- a/src/plugin/scoring-link/src/java/org/apache/nutch/scoring/link/LinkAnalysisScoringFilter.java +++ b/src/plugin/scoring-link/src/java/org/apache/nutch/scoring/link/LinkAnalysisScoringFilter.java @@ -16,9 +16,7 @@ */ package org.apache.nutch.scoring.link; -import java.util.Collection; import java.util.List; -import java.util.Map.Entry; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.io.Text; @@ -27,41 +25,31 @@ import org.apache.nutch.crawl.Inlinks; import org.apache.nutch.indexer.NutchDocument; import org.apache.nutch.metadata.Nutch; import org.apache.nutch.parse.Parse; -import org.apache.nutch.parse.ParseData; import org.apache.nutch.protocol.Content; -import org.apache.nutch.scoring.ScoringFilter; +import org.apache.nutch.scoring.AbstractScoringFilter; import org.apache.nutch.scoring.ScoringFilterException; -public class LinkAnalysisScoringFilter implements ScoringFilter { +public class LinkAnalysisScoringFilter extends AbstractScoringFilter { - private Configuration conf; private float normalizedScore = 1.00f; private float initialScore = 0.0f; public LinkAnalysisScoringFilter() { - - } - - public Configuration getConf() { - return conf; } + @Override public void setConf(Configuration conf) { - this.conf = conf; + super.setConf(conf); normalizedScore = conf.getFloat("link.analyze.normalize.score", 1.00f); } - public CrawlDatum distributeScoreToOutlinks(Text fromUrl, - ParseData parseData, Collection<Entry<Text, CrawlDatum>> targets, - CrawlDatum adjust, int allCount) throws ScoringFilterException { - return adjust; - } - + @Override public float generatorSortValue(Text url, CrawlDatum datum, float initSort) throws ScoringFilterException { return datum.getScore() * initSort; } + @Override public float indexerScore(Text url, NutchDocument doc, CrawlDatum dbDatum, CrawlDatum fetchDatum, Parse parse, Inlinks 
inlinks, float initScore) throws ScoringFilterException { @@ -71,29 +59,23 @@ public class LinkAnalysisScoringFilter implements ScoringFilter { return (normalizedScore * dbDatum.getScore()); } + @Override public void initialScore(Text url, CrawlDatum datum) throws ScoringFilterException { datum.setScore(initialScore); } - public void injectedScore(Text url, CrawlDatum datum) - throws ScoringFilterException { - } - + @Override public void passScoreAfterParsing(Text url, Content content, Parse parse) throws ScoringFilterException { parse.getData().getContentMeta() .set(Nutch.SCORE_KEY, content.getMetadata().get(Nutch.SCORE_KEY)); } + @Override public void passScoreBeforeParsing(Text url, CrawlDatum datum, Content content) throws ScoringFilterException { content.getMetadata().set(Nutch.SCORE_KEY, "" + datum.getScore()); } - public void updateDbScore(Text url, CrawlDatum old, CrawlDatum datum, - List<CrawlDatum> inlinked) throws ScoringFilterException { - // nothing to do - } - } diff --git a/src/plugin/scoring-metadata/src/java/org/apache/nutch/scoring/metadata/MetadataScoringFilter.java b/src/plugin/scoring-metadata/src/java/org/apache/nutch/scoring/metadata/MetadataScoringFilter.java index e3ad56e..489491c 100644 --- a/src/plugin/scoring-metadata/src/java/org/apache/nutch/scoring/metadata/MetadataScoringFilter.java +++ b/src/plugin/scoring-metadata/src/java/org/apache/nutch/scoring/metadata/MetadataScoringFilter.java @@ -58,6 +58,7 @@ public class MetadataScoringFilter extends AbstractScoringFilter { * * @see ScoringFilter#distributeScoreToOutlinks */ + @Override public CrawlDatum distributeScoreToOutlinks(Text fromUrl, ParseData parseData, Collection<Entry<Text, CrawlDatum>> targets, CrawlDatum adjust, int allCount) throws ScoringFilterException { @@ -90,6 +91,7 @@ public class MetadataScoringFilter extends AbstractScoringFilter { * @see ScoringFilter#passScoreBeforeParsing * @see MetadataScoringFilter#passScoreAfterParsing */ + @Override public void 
passScoreBeforeParsing(Text url, CrawlDatum datum, Content content) { if (datumMetadata == null || content == null || datum == null) return; @@ -112,6 +114,7 @@ public class MetadataScoringFilter extends AbstractScoringFilter { * @see MetadataScoringFilter#passScoreBeforeParsing * @see ScoringFilter#passScoreAfterParsing */ + @Override public void passScoreAfterParsing(Text url, Content content, Parse parse) { if (contentMetadata == null || content == null || parse == null) return; @@ -130,6 +133,7 @@ public class MetadataScoringFilter extends AbstractScoringFilter { * handles conf assignment and pulls the value assignment from the * "scoring.db.md", "scoring.content.md" and "scoring.parse.md" properties. */ + @Override public void setConf(Configuration conf) { super.setConf(conf); diff --git a/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java b/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java index 4c6c36b..54e2fe5 100644 --- a/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java +++ b/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java @@ -58,10 +58,12 @@ public class OPICScoringFilter implements ScoringFilter { private float externalScoreFactor; private boolean countFiltered; + @Override public Configuration getConf() { return conf; } + @Override public void setConf(Configuration conf) { this.conf = conf; scorePower = conf.getFloat("indexer.score.power", 0.5f); @@ -70,6 +72,7 @@ public class OPICScoringFilter implements ScoringFilter { countFiltered = conf.getBoolean("db.score.count.filtered", false); } + @Override public void injectedScore(Text url, CrawlDatum datum) throws ScoringFilterException { } @@ -78,18 +81,21 @@ public class OPICScoringFilter implements ScoringFilter { * Set to 0.0f (unknown value) - inlink contributions will bring it to a * correct level. Newly discovered pages have at least one inlink. 
*/ + @Override public void initialScore(Text url, CrawlDatum datum) throws ScoringFilterException { datum.setScore(0.0f); } /** Use {@link CrawlDatum#getScore()}. */ + @Override public float generatorSortValue(Text url, CrawlDatum datum, float initSort) throws ScoringFilterException { return datum.getScore() * initSort; } /** Increase the score by a sum of inlinked scores. */ + @Override public void updateDbScore(Text url, CrawlDatum old, CrawlDatum datum, List<CrawlDatum> inlinked) throws ScoringFilterException { float adjust = 0.0f; @@ -103,11 +109,13 @@ public class OPICScoringFilter implements ScoringFilter { } /** Store a float value of CrawlDatum.getScore() under Fetcher.SCORE_KEY. */ + @Override public void passScoreBeforeParsing(Text url, CrawlDatum datum, Content content) { content.getMetadata().set(Nutch.SCORE_KEY, "" + datum.getScore()); } /** Copy the value from Content metadata under Fetcher.SCORE_KEY to parseData. */ + @Override public void passScoreAfterParsing(Text url, Content content, Parse parse) { parse.getData().getContentMeta() .set(Nutch.SCORE_KEY, content.getMetadata().get(Nutch.SCORE_KEY)); @@ -117,6 +125,7 @@ public class OPICScoringFilter implements ScoringFilter { * Get a float value from Fetcher.SCORE_KEY, divide it by the number of * outlinks and apply. */ + @Override public CrawlDatum distributeScoreToOutlinks(Text fromUrl, ParseData parseData, Collection<Entry<Text, CrawlDatum>> targets, CrawlDatum adjust, int allCount) throws ScoringFilterException { @@ -163,6 +172,7 @@ public class OPICScoringFilter implements ScoringFilter { } /** Dampen the boost value by scorePower. 
*/ + @Override public float indexerScore(Text url, NutchDocument doc, CrawlDatum dbDatum, CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore) throws ScoringFilterException { diff --git a/src/plugin/scoring-orphan/src/java/org/apache/nutch/scoring/orphan/OrphanScoringFilter.java b/src/plugin/scoring-orphan/src/java/org/apache/nutch/scoring/orphan/OrphanScoringFilter.java index 3471a95..a0ab439 100644 --- a/src/plugin/scoring-orphan/src/java/org/apache/nutch/scoring/orphan/OrphanScoringFilter.java +++ b/src/plugin/scoring-orphan/src/java/org/apache/nutch/scoring/orphan/OrphanScoringFilter.java @@ -46,6 +46,7 @@ public class OrphanScoringFilter extends AbstractScoringFilter { private long markGoneAfter = DEFAULT_GONE_TIME; private long markOrphanAfter = DEFAULT_ORPHAN_TIME; + @Override public void setConf(Configuration conf) { markGoneAfter = conf.getInt("scoring.orphan.mark.gone.after", DEFAULT_GONE_TIME); @@ -71,6 +72,7 @@ public class OrphanScoringFilter extends AbstractScoringFilter { * @param inlinks * list of inlinked CrawlDatums */ + @Override public void updateDbScore(Text url, CrawlDatum old, CrawlDatum datum, List<CrawlDatum> inlinks) throws ScoringFilterException { @@ -86,6 +88,7 @@ public class OrphanScoringFilter extends AbstractScoringFilter { } } + @Override public void orphanedScore(Text url, CrawlDatum datum) { // Already has an orphaned time? 
if (datum.getMetaData().containsKey(ORPHAN_KEY_WRITABLE)) { diff --git a/src/plugin/subcollection/src/java/org/apache/nutch/collection/Subcollection.java b/src/plugin/subcollection/src/java/org/apache/nutch/collection/Subcollection.java index 007eeae..b82ffd6 100644 --- a/src/plugin/subcollection/src/java/org/apache/nutch/collection/Subcollection.java +++ b/src/plugin/subcollection/src/java/org/apache/nutch/collection/Subcollection.java @@ -180,6 +180,7 @@ public class Subcollection extends Configured implements URLFilter { * * @see org.apache.nutch.net.URLFilter#filter(java.lang.String) */ + @Override public String filter(String urlString) { // first the blacklist Iterator<String> i = blackList.iterator(); diff --git a/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java b/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java index c7ba54e..6aaa452 100644 --- a/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java +++ b/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java @@ -49,6 +49,7 @@ public class SubcollectionIndexingFilter extends Configured implements /** * @param conf A populated {@link Configuration} */ + @Override public void setConf(Configuration conf) { this.conf = conf; fieldName = conf.get("subcollection.default.fieldname", "subcollection"); @@ -60,6 +61,7 @@ public class SubcollectionIndexingFilter extends Configured implements /** * @return Configuration */ + @Override public Configuration getConf() { return this.conf; } @@ -91,6 +93,7 @@ public class SubcollectionIndexingFilter extends Configured implements } } + @Override public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks) throws IndexingException { // Check for subcollection overrride in HTML metadata diff --git 
a/src/plugin/tld/src/java/org/apache/nutch/indexer/tld/TLDIndexingFilter.java b/src/plugin/tld/src/java/org/apache/nutch/indexer/tld/TLDIndexingFilter.java index 4f3a92c..296124d 100644 --- a/src/plugin/tld/src/java/org/apache/nutch/indexer/tld/TLDIndexingFilter.java +++ b/src/plugin/tld/src/java/org/apache/nutch/indexer/tld/TLDIndexingFilter.java @@ -33,9 +33,7 @@ import org.apache.nutch.util.URLUtil; import org.apache.nutch.util.domain.DomainSuffix; /** - * Adds the Top level domain extensions to the index - * - * @author Enis Soztutar <enis.soz.nu...@gmail.com> + * Adds the top-level domain extensions to the index */ public class TLDIndexingFilter implements IndexingFilter { private static final Logger LOG = LoggerFactory @@ -43,6 +41,7 @@ public class TLDIndexingFilter implements IndexingFilter { private Configuration conf; + @Override public NutchDocument filter(NutchDocument doc, Parse parse, Text urlText, CrawlDatum datum, Inlinks inlinks) throws IndexingException { @@ -59,10 +58,12 @@ public class TLDIndexingFilter implements IndexingFilter { return doc; } + @Override public void setConf(Configuration conf) { this.conf = conf; } + @Override public Configuration getConf() { return this.conf; } diff --git a/src/plugin/tld/src/java/org/apache/nutch/scoring/tld/TLDScoringFilter.java b/src/plugin/tld/src/java/org/apache/nutch/scoring/tld/TLDScoringFilter.java index 95891dd..5f30809 100644 --- a/src/plugin/tld/src/java/org/apache/nutch/scoring/tld/TLDScoringFilter.java +++ b/src/plugin/tld/src/java/org/apache/nutch/scoring/tld/TLDScoringFilter.java @@ -16,10 +16,6 @@ */ package org.apache.nutch.scoring.tld; -import java.util.List; -import java.util.Collection; -import java.util.Map.Entry; - import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.io.Text; import org.apache.nutch.crawl.CrawlDatum; @@ -27,27 +23,23 @@ import org.apache.nutch.crawl.Inlinks; import org.apache.nutch.indexer.NutchDocument; import org.apache.nutch.indexer.NutchField; import 
org.apache.nutch.parse.Parse; -import org.apache.nutch.parse.ParseData; -import org.apache.nutch.protocol.Content; -import org.apache.nutch.scoring.ScoringFilter; +import org.apache.nutch.scoring.AbstractScoringFilter; import org.apache.nutch.scoring.ScoringFilterException; import org.apache.nutch.util.domain.DomainSuffix; import org.apache.nutch.util.domain.DomainSuffixes; /** - * Scoring filter to boost tlds. - * - * @author Enis Soztutar <enis.soz.nu...@gmail.com> + * Scoring filter to boost top-level domains (TLDs). */ -public class TLDScoringFilter implements ScoringFilter { +public class TLDScoringFilter extends AbstractScoringFilter { - private Configuration conf; private DomainSuffixes tldEntries; public TLDScoringFilter() { tldEntries = DomainSuffixes.getInstance(); } + @Override public float indexerScore(Text url, NutchDocument doc, CrawlDatum dbDatum, CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore) throws ScoringFilterException { @@ -65,49 +57,4 @@ public class TLDScoringFilter implements ScoringFilter { return initScore * boost; } - public CrawlDatum distributeScoreToOutlink(Text fromUrl, Text toUrl, - ParseData parseData, CrawlDatum target, CrawlDatum adjust, int allCount, - int validCount) throws ScoringFilterException { - return adjust; - } - - public float generatorSortValue(Text url, CrawlDatum datum, float initSort) - throws ScoringFilterException { - return initSort; - } - - public void initialScore(Text url, CrawlDatum datum) - throws ScoringFilterException { - } - - public void injectedScore(Text url, CrawlDatum datum) - throws ScoringFilterException { - } - - public void passScoreAfterParsing(Text url, Content content, Parse parse) - throws ScoringFilterException { - } - - public void passScoreBeforeParsing(Text url, CrawlDatum datum, Content content) - throws ScoringFilterException { - } - - public void updateDbScore(Text url, CrawlDatum old, CrawlDatum datum, - List<CrawlDatum> inlinked) throws ScoringFilterException { 
- } - - public Configuration getConf() { - return conf; - } - - public void setConf(Configuration conf) { - this.conf = conf; - } - - public CrawlDatum distributeScoreToOutlinks(Text fromUrl, - ParseData parseData, Collection<Entry<Text, CrawlDatum>> targets, - CrawlDatum adjust, int allCount) throws ScoringFilterException { - return adjust; - } - } diff --git a/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java b/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java index f629262..c68750c 100644 --- a/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java +++ b/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java @@ -101,6 +101,7 @@ public class DomainURLFilter implements URLFilter { /** * Sets the configuration. */ + @Override public void setConf(Configuration conf) { this.conf = conf; @@ -149,10 +150,12 @@ public class DomainURLFilter implements URLFilter { } } + @Override public Configuration getConf() { return this.conf; } + @Override public String filter(String url) { // https://issues.apache.org/jira/browse/NUTCH-2189 if (domainSet.size() == 0) return url; diff --git a/src/plugin/urlfilter-domaindenylist/src/java/org/apache/nutch/urlfilter/domaindenylist/DomainDenylistURLFilter.java b/src/plugin/urlfilter-domaindenylist/src/java/org/apache/nutch/urlfilter/domaindenylist/DomainDenylistURLFilter.java index 58e3754..7b38bfc 100644 --- a/src/plugin/urlfilter-domaindenylist/src/java/org/apache/nutch/urlfilter/domaindenylist/DomainDenylistURLFilter.java +++ b/src/plugin/urlfilter-domaindenylist/src/java/org/apache/nutch/urlfilter/domaindenylist/DomainDenylistURLFilter.java @@ -101,6 +101,7 @@ public class DomainDenylistURLFilter implements URLFilter { /** * Sets the configuration. 
*/ + @Override public void setConf(Configuration conf) { this.conf = conf; @@ -150,10 +151,12 @@ public class DomainDenylistURLFilter implements URLFilter { } } + @Override public Configuration getConf() { return this.conf; } + @Override public String filter(String url) { try { // match for suffix, domain, and host in that order. more general will diff --git a/src/plugin/urlfilter-prefix/src/java/org/apache/nutch/urlfilter/prefix/PrefixURLFilter.java b/src/plugin/urlfilter-prefix/src/java/org/apache/nutch/urlfilter/prefix/PrefixURLFilter.java index c54740a..ccba29c 100644 --- a/src/plugin/urlfilter-prefix/src/java/org/apache/nutch/urlfilter/prefix/PrefixURLFilter.java +++ b/src/plugin/urlfilter-prefix/src/java/org/apache/nutch/urlfilter/prefix/PrefixURLFilter.java @@ -66,6 +66,7 @@ public class PrefixURLFilter implements URLFilter { trie = readConfiguration(new StringReader(stringRules)); } + @Override public String filter(String url) { if (trie.shortestMatch(url) == null) return null; @@ -115,6 +116,7 @@ public class PrefixURLFilter implements URLFilter { } } + @Override public void setConf(Configuration conf) { this.conf = conf; @@ -165,6 +167,7 @@ public class PrefixURLFilter implements URLFilter { } } + @Override public Configuration getConf() { return this.conf; } diff --git a/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java b/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java index ff3826a..dd8605f 100644 --- a/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java +++ b/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java @@ -141,6 +141,7 @@ public class SuffixURLFilter implements URLFilter { readConfiguration(reader); } + @Override public String filter(String url) { if (url == null) return null; @@ -249,6 +250,7 @@ public class SuffixURLFilter implements URLFilter { } } + @Override public void 
setConf(Configuration conf) { this.conf = conf; @@ -293,6 +295,7 @@ public class SuffixURLFilter implements URLFilter { } } + @Override public Configuration getConf() { return this.conf; } diff --git a/src/plugin/urlfilter-validator/src/java/org/apache/nutch/urlfilter/validator/UrlValidator.java b/src/plugin/urlfilter-validator/src/java/org/apache/nutch/urlfilter/validator/UrlValidator.java index 84d516b..14fed8a 100644 --- a/src/plugin/urlfilter-validator/src/java/org/apache/nutch/urlfilter/validator/UrlValidator.java +++ b/src/plugin/urlfilter-validator/src/java/org/apache/nutch/urlfilter/validator/UrlValidator.java @@ -132,14 +132,17 @@ public class UrlValidator implements URLFilter { private Configuration conf; + @Override public String filter(String urlString) { return isValid(urlString) ? urlString : null; } + @Override public Configuration getConf() { return conf; } + @Override public void setConf(Configuration conf) { this.conf = conf; } diff --git a/src/plugin/urlmeta/src/java/org/apache/nutch/indexer/urlmeta/URLMetaIndexingFilter.java b/src/plugin/urlmeta/src/java/org/apache/nutch/indexer/urlmeta/URLMetaIndexingFilter.java index e34e087..557b8eb 100644 --- a/src/plugin/urlmeta/src/java/org/apache/nutch/indexer/urlmeta/URLMetaIndexingFilter.java +++ b/src/plugin/urlmeta/src/java/org/apache/nutch/indexer/urlmeta/URLMetaIndexingFilter.java @@ -75,6 +75,7 @@ public class URLMetaIndexingFilter implements IndexingFilter { * * @see IndexingFilter#filter */ + @Override public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks) throws IndexingException { if (conf != null) @@ -93,7 +94,7 @@ public class URLMetaIndexingFilter implements IndexingFilter { return doc; } - /** Boilerplate */ + @Override public Configuration getConf() { return conf; } @@ -102,6 +103,7 @@ public class URLMetaIndexingFilter implements IndexingFilter { * handles conf assignment and pulls the value assignment from the * "urlmeta.tags" property */ 
+ @Override public void setConf(Configuration conf) { this.conf = conf; diff --git a/src/plugin/urlmeta/src/java/org/apache/nutch/scoring/urlmeta/URLMetaScoringFilter.java b/src/plugin/urlmeta/src/java/org/apache/nutch/scoring/urlmeta/URLMetaScoringFilter.java index 1b179ba..cb7e1b0 100644 --- a/src/plugin/urlmeta/src/java/org/apache/nutch/scoring/urlmeta/URLMetaScoringFilter.java +++ b/src/plugin/urlmeta/src/java/org/apache/nutch/scoring/urlmeta/URLMetaScoringFilter.java @@ -17,19 +17,16 @@ package org.apache.nutch.scoring.urlmeta; import java.util.Collection; -import java.util.Map.Entry; import java.util.Iterator; -import java.util.List; +import java.util.Map.Entry; import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.conf.Configured; import org.apache.hadoop.io.Text; import org.apache.nutch.crawl.CrawlDatum; -import org.apache.nutch.crawl.Inlinks; -import org.apache.nutch.indexer.NutchDocument; import org.apache.nutch.parse.Parse; import org.apache.nutch.parse.ParseData; import org.apache.nutch.protocol.Content; +import org.apache.nutch.scoring.AbstractScoringFilter; import org.apache.nutch.scoring.ScoringFilter; import org.apache.nutch.scoring.ScoringFilterException; @@ -38,11 +35,10 @@ import org.apache.nutch.scoring.ScoringFilterException; * * {@link org.apache.nutch.scoring.urlmeta} */ -public class URLMetaScoringFilter extends Configured implements ScoringFilter { +public class URLMetaScoringFilter extends AbstractScoringFilter { private static final String CONF_PROPERTY = "urlmeta.tags"; private static String[] urlMetaTags; - private Configuration conf; /** * This will take the metatags that you have listed in your "urlmeta.tags" @@ -52,6 +48,7 @@ public class URLMetaScoringFilter extends Configured implements ScoringFilter { * * @see ScoringFilter#distributeScoreToOutlinks */ + @Override public CrawlDatum distributeScoreToOutlinks(Text fromUrl, ParseData parseData, Collection<Entry<Text, CrawlDatum>> targets, CrawlDatum adjust, int 
allCount) throws ScoringFilterException { @@ -84,6 +81,7 @@ public class URLMetaScoringFilter extends Configured implements ScoringFilter { * @see ScoringFilter#passScoreBeforeParsing * @see URLMetaScoringFilter#passScoreAfterParsing */ + @Override public void passScoreBeforeParsing(Text url, CrawlDatum datum, Content content) { if (urlMetaTags == null || content == null || datum == null) return; @@ -105,6 +103,7 @@ public class URLMetaScoringFilter extends Configured implements ScoringFilter { * @see URLMetaScoringFilter#passScoreBeforeParsing * @see ScoringFilter#passScoreAfterParsing */ + @Override public void passScoreAfterParsing(Text url, Content content, Parse parse) { if (urlMetaTags == null || content == null || parse == null) return; @@ -119,41 +118,11 @@ public class URLMetaScoringFilter extends Configured implements ScoringFilter { } } - /** Boilerplate */ - public float generatorSortValue(Text url, CrawlDatum datum, float initSort) - throws ScoringFilterException { - return initSort; - } - - /** Boilerplate */ - public float indexerScore(Text url, NutchDocument doc, CrawlDatum dbDatum, - CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore) - throws ScoringFilterException { - return initScore; - } - - /** Boilerplate */ - public void initialScore(Text url, CrawlDatum datum) - throws ScoringFilterException { - return; - } - - /** Boilerplate */ - public void injectedScore(Text url, CrawlDatum datum) - throws ScoringFilterException { - return; - } - - /** Boilerplate */ - public void updateDbScore(Text url, CrawlDatum old, CrawlDatum datum, - List<CrawlDatum> inlinked) throws ScoringFilterException { - return; - } - /** * handles conf assignment and pulls the value assignment from the * "urlmeta.tags" property */ + @Override public void setConf(Configuration conf) { super.setConf(conf); @@ -163,8 +132,4 @@ public class URLMetaScoringFilter extends Configured implements ScoringFilter { urlMetaTags = conf.getStrings(CONF_PROPERTY); } - /** 
Boilerplate */ - public Configuration getConf() { - return conf; - } } diff --git a/src/plugin/urlnormalizer-ajax/src/java/org/apache/nutch/net/urlnormalizer/ajax/AjaxURLNormalizer.java b/src/plugin/urlnormalizer-ajax/src/java/org/apache/nutch/net/urlnormalizer/ajax/AjaxURLNormalizer.java index b596400..7c55cd7 100644 --- a/src/plugin/urlnormalizer-ajax/src/java/org/apache/nutch/net/urlnormalizer/ajax/AjaxURLNormalizer.java +++ b/src/plugin/urlnormalizer-ajax/src/java/org/apache/nutch/net/urlnormalizer/ajax/AjaxURLNormalizer.java @@ -60,6 +60,7 @@ public class AjaxURLNormalizer implements URLNormalizer { * @return String * @throws MalformedURLException if the urlString is malformed */ + @Override public String normalize(String urlString, String scope) throws MalformedURLException { LOG.info(scope + " // " + urlString); @@ -224,6 +225,7 @@ public class AjaxURLNormalizer implements URLNormalizer { /** * @param conf a populated {@link Configuration} */ + @Override public void setConf(Configuration conf) { this.conf = conf; } @@ -231,6 +233,7 @@ public class AjaxURLNormalizer implements URLNormalizer { /** * @return Configuration */ + @Override public Configuration getConf() { return this.conf; } diff --git a/src/plugin/urlnormalizer-host/src/java/org/apache/nutch/net/urlnormalizer/host/HostURLNormalizer.java b/src/plugin/urlnormalizer-host/src/java/org/apache/nutch/net/urlnormalizer/host/HostURLNormalizer.java index 3a3c8a4..537868b 100644 --- a/src/plugin/urlnormalizer-host/src/java/org/apache/nutch/net/urlnormalizer/host/HostURLNormalizer.java +++ b/src/plugin/urlnormalizer-host/src/java/org/apache/nutch/net/urlnormalizer/host/HostURLNormalizer.java @@ -77,10 +77,12 @@ public class HostURLNormalizer implements URLNormalizer { } } + @Override public Configuration getConf() { return conf; } + @Override public void setConf(Configuration conf) { this.conf = conf; @@ -137,6 +139,7 @@ public class HostURLNormalizer implements URLNormalizer { } } + @Override public String 
normalize(String urlString, String scope) throws MalformedURLException { String host = new URL(urlString).getHost(); diff --git a/src/plugin/urlnormalizer-pass/src/java/org/apache/nutch/net/urlnormalizer/pass/PassURLNormalizer.java b/src/plugin/urlnormalizer-pass/src/java/org/apache/nutch/net/urlnormalizer/pass/PassURLNormalizer.java index 717471c..18e5fc4 100644 --- a/src/plugin/urlnormalizer-pass/src/java/org/apache/nutch/net/urlnormalizer/pass/PassURLNormalizer.java +++ b/src/plugin/urlnormalizer-pass/src/java/org/apache/nutch/net/urlnormalizer/pass/PassURLNormalizer.java @@ -32,15 +32,18 @@ public class PassURLNormalizer implements URLNormalizer { private Configuration conf; + @Override public String normalize(String urlString, String scope) throws MalformedURLException { return urlString; } + @Override public Configuration getConf() { return conf; } + @Override public void setConf(Configuration conf) { this.conf = conf; } diff --git a/src/plugin/urlnormalizer-protocol/src/java/org/apache/nutch/net/urlnormalizer/protocol/ProtocolURLNormalizer.java b/src/plugin/urlnormalizer-protocol/src/java/org/apache/nutch/net/urlnormalizer/protocol/ProtocolURLNormalizer.java index e1afde8..d747858 100644 --- a/src/plugin/urlnormalizer-protocol/src/java/org/apache/nutch/net/urlnormalizer/protocol/ProtocolURLNormalizer.java +++ b/src/plugin/urlnormalizer-protocol/src/java/org/apache/nutch/net/urlnormalizer/protocol/ProtocolURLNormalizer.java @@ -122,10 +122,12 @@ public class ProtocolURLNormalizer implements URLNormalizer { protocolsMap.size(), domainProtocolsMap.size()); } + @Override public Configuration getConf() { return conf; } + @Override public void setConf(Configuration conf) { this.conf = conf; @@ -185,11 +187,8 @@ public class ProtocolURLNormalizer implements URLNormalizer { } } + @Override public String normalize(String url, String scope) throws MalformedURLException { - return normalize(url, null, scope); - } - - public String normalize(String url, CrawlDatum 
crawlDatum, String scope) throws MalformedURLException { // Get URL repr. URL u = new URL(url); diff --git a/src/plugin/urlnormalizer-querystring/src/java/org/apache/nutch/net/urlnormalizer/querystring/QuerystringURLNormalizer.java b/src/plugin/urlnormalizer-querystring/src/java/org/apache/nutch/net/urlnormalizer/querystring/QuerystringURLNormalizer.java index 60ec55e..f8a547b 100644 --- a/src/plugin/urlnormalizer-querystring/src/java/org/apache/nutch/net/urlnormalizer/querystring/QuerystringURLNormalizer.java +++ b/src/plugin/urlnormalizer-querystring/src/java/org/apache/nutch/net/urlnormalizer/querystring/QuerystringURLNormalizer.java @@ -39,14 +39,17 @@ public class QuerystringURLNormalizer implements URLNormalizer { public QuerystringURLNormalizer() { } + @Override public Configuration getConf() { return conf; } + @Override public void setConf(Configuration conf) { this.conf = conf; } + @Override public String normalize(String urlString, String scope) throws MalformedURLException { URL url = new URL(urlString); diff --git a/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java b/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java index 885944e..c86d55a 100644 --- a/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java +++ b/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java @@ -123,6 +123,7 @@ public class RegexURLNormalizer extends Configured implements URLNormalizer { } } + @Override public void setConf(Configuration conf) { super.setConf(conf); if (conf == null) @@ -202,6 +203,7 @@ public class RegexURLNormalizer extends Configured implements URLNormalizer { return urlString; } + @Override public String normalize(String urlString, String scope) throws MalformedURLException { return regexNormalize(urlString, scope); diff --git 
a/src/plugin/urlnormalizer-slash/src/java/org/apache/nutch/net/urlnormalizer/slash/SlashURLNormalizer.java b/src/plugin/urlnormalizer-slash/src/java/org/apache/nutch/net/urlnormalizer/slash/SlashURLNormalizer.java index 2570427..ce3128d 100644 --- a/src/plugin/urlnormalizer-slash/src/java/org/apache/nutch/net/urlnormalizer/slash/SlashURLNormalizer.java +++ b/src/plugin/urlnormalizer-slash/src/java/org/apache/nutch/net/urlnormalizer/slash/SlashURLNormalizer.java @@ -88,10 +88,12 @@ public class SlashURLNormalizer implements URLNormalizer { } } + @Override public Configuration getConf() { return conf; } + @Override public void setConf(Configuration conf) { this.conf = conf; @@ -150,11 +152,8 @@ public class SlashURLNormalizer implements URLNormalizer { } } + @Override public String normalize(String url, String scope) throws MalformedURLException { - return normalize(url, null, scope); - } - - public String normalize(String url, CrawlDatum crawlDatum, String scope) throws MalformedURLException { // Get URL repr. URL u = new URL(url);