NUTCH-2089 Nutch 2.x is moved to compile on JDK 8
Project: http://git-wip-us.apache.org/repos/asf/nutch/repo Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/0ea78907 Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/0ea78907 Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/0ea78907 Branch: refs/heads/2.x Commit: 0ea78907dee6b07058b66a99e395aea8cf623e92 Parents: 7dcc5fa Author: Furkan KAMACI <[email protected]> Authored: Sun Sep 4 00:53:31 2016 +0300 Committer: Furkan KAMACI <[email protected]> Committed: Sun Sep 4 01:38:13 2016 +0300 ---------------------------------------------------------------------- src/java/org/apache/nutch/api/NutchServer.java | 23 +++--- .../apache/nutch/api/impl/RAMConfManager.java | 3 + .../nutch/crawl/AbstractFetchSchedule.java | 23 +++--- .../nutch/crawl/AdaptiveFetchSchedule.java | 6 +- .../org/apache/nutch/crawl/FetchSchedule.java | 25 ++---- .../org/apache/nutch/crawl/GeneratorJob.java | 20 ++++- .../apache/nutch/crawl/SignatureFactory.java | 7 +- .../nutch/crawl/TextProfileSignature.java | 2 +- .../org/apache/nutch/fetcher/FetcherJob.java | 12 ++- .../org/apache/nutch/indexer/IndexUtil.java | 4 +- .../org/apache/nutch/net/URLNormalizers.java | 16 ++-- .../apache/nutch/parse/NutchSitemapParse.java | 4 +- .../apache/nutch/parse/ParsePluginsReader.java | 6 +- src/java/org/apache/nutch/parse/Parser.java | 2 +- .../org/apache/nutch/parse/ParserChecker.java | 6 +- .../apache/nutch/plugin/PluginRepository.java | 6 +- .../org/apache/nutch/scoring/ScoringFilter.java | 19 ++--- .../org/apache/nutch/storage/StorageUtils.java | 2 +- .../apache/nutch/tools/arc/ArcRecordReader.java | 20 ++--- src/java/org/apache/nutch/util/Bytes.java | 12 +-- .../org/apache/nutch/util/EncodingDetector.java | 10 +-- src/java/org/apache/nutch/util/MimeUtil.java | 4 +- src/java/org/apache/nutch/util/NodeWalker.java | 10 +-- src/java/org/apache/nutch/util/NutchJob.java | 4 +- src/java/org/apache/nutch/util/NutchTool.java | 17 +++- .../apache/nutch/util/PrefixStringMatcher.java | 8 +- 
.../apache/nutch/util/SuffixStringMatcher.java | 8 +- src/java/org/apache/nutch/util/TableUtil.java | 4 +- src/java/org/apache/nutch/util/TimingUtil.java | 2 +- .../apache/nutch/util/TrieStringMatcher.java | 8 +- src/java/org/apache/nutch/util/URLUtil.java | 83 ++++++++++---------- .../apache/nutch/util/domain/DomainSuffix.java | 5 +- .../nutch/util/domain/TopLevelDomain.java | 4 +- .../org/apache/nutch/parse/feed/FeedParser.java | 2 +- .../indexer/anchor/AnchorIndexingFilter.java | 2 +- .../nutch/indexer/metadata/MetadataIndexer.java | 2 +- .../nutch/indexer/more/MoreIndexingFilter.java | 4 +- .../nutch/analysis/lang/HTMLLanguageParser.java | 15 +++- .../nutch/urlfilter/api/RegexURLFilterBase.java | 12 +-- .../org/apache/nutch/parse/html/DOMBuilder.java | 50 ++++++------ .../nutch/parse/html/DOMContentUtils.java | 2 +- .../parse/html/XMLCharacterRecognizer.java | 2 +- .../apache/nutch/parse/js/JSParseFilter.java | 6 +- .../org/apache/nutch/parse/swf/SWFParser.java | 2 +- .../org/apache/nutch/parse/tika/DOMBuilder.java | 50 ++++++------ .../parse/tika/XMLCharacterRecognizer.java | 2 +- .../apache/nutch/parse/tika/TestRSSParser.java | 2 +- .../org/apache/nutch/protocol/file/File.java | 4 +- .../nutch/protocol/file/TestProtocolFile.java | 2 +- .../org/apache/nutch/protocol/ftp/Client.java | 2 +- .../java/org/apache/nutch/protocol/ftp/Ftp.java | 4 +- .../DummySSLProtocolSocketFactory.java | 2 +- .../apache/nutch/protocol/httpclient/Http.java | 4 +- .../httpclient/HttpBasicAuthentication.java | 4 +- .../apache/nutch/scoring/link/package-info.java | 3 +- .../nutch/scoring/opic/OPICScoringFilter.java | 2 +- .../nutch/collection/CollectionManager.java | 2 +- .../nutch/urlfilter/domain/DomainURLFilter.java | 11 +-- .../nutch/urlfilter/domain/package-info.java | 2 - .../nutch/urlfilter/prefix/PrefixURLFilter.java | 2 +- .../nutch/urlfilter/suffix/SuffixURLFilter.java | 14 ++-- .../nutch/urlfilter/validator/UrlValidator.java | 10 +-- 
.../urlnormalizer/regex/RegexURLNormalizer.java | 4 +- .../nutch/api/AbstractNutchAPITestBase.java | 2 +- .../org/apache/nutch/crawl/TestGenerator.java | 2 +- .../org/apache/nutch/fetcher/TestFetcher.java | 6 +- .../org/apache/nutch/util/CrawlTestUtil.java | 7 +- 67 files changed, 318 insertions(+), 309 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/nutch/blob/0ea78907/src/java/org/apache/nutch/api/NutchServer.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/nutch/api/NutchServer.java b/src/java/org/apache/nutch/api/NutchServer.java index 802bbef..5118497 100644 --- a/src/java/org/apache/nutch/api/NutchServer.java +++ b/src/java/org/apache/nutch/api/NutchServer.java @@ -98,9 +98,9 @@ public class NutchServer extends Application { * 'INFO' however best attempts should always be made to specify a logging * level.<br> * {@link org.apache.nutch.api.NutchServer} can be run as secure. restapi.auth property - * should be set to BASIC, DIGEST or SSL at <code>nutch-site.xml</code> to enable HTTP basic authentication, + * should be set to BASIC, DIGEST or SSL at <code>nutch-site.xml</code> to enable HTTP basic authentication, * digest authentication or SSL when communicating with RESTAPI. - * Set restapi.auth.username and restapi.auth.password properties at <code>nutch-site.xml</code> to configure + * Set restapi.auth.username and restapi.auth.password properties at <code>nutch-site.xml</code> to configure * credentials when BASIC or DIGEST authentication is used. * Set restapi.auth.ssl.storepath, restapi.auth.ssl.storepass and restapi.auth.ssl.keypass when SSL is used. * @@ -117,12 +117,14 @@ public class NutchServer extends Application { * 'INFO' however best attempts should always be made to specify a logging * level.<br> * {@link org.apache.nutch.api.NutchServer} can be run as secure. 
restapi.auth property - * should be set to BASIC, DIGEST or SSL at <code>nutch-site.xml</code> to enable HTTP basic authentication, + * should be set to BASIC, DIGEST or SSL at <code>nutch-site.xml</code> to enable HTTP basic authentication, * digest authentication or SSL when communicating with RESTAPI. - * Set restapi.auth.username and restapi.auth.password properties at <code>nutch-site.xml</code> to configure + * Set restapi.auth.username and restapi.auth.password properties at <code>nutch-site.xml</code> to configure * credentials when BASIC or DIGEST authentication is used. * Set restapi.auth.ssl.storepath, restapi.auth.ssl.storepass and restapi.auth.ssl.keypass when SSL is used. * + * @param ramConfManager {@link RAMConfManager} + * * @see org.apache.nutch.api.security.AuthenticationTypeEnum */ public NutchServer(RAMConfManager ramConfManager) { @@ -137,12 +139,15 @@ public class NutchServer extends Application { * 'INFO' however best attempts should always be made to specify a logging * level.<br> * {@link org.apache.nutch.api.NutchServer} can be run as secure. restapi.auth property - * should be set to BASIC, DIGEST or SSL at <code>nutch-site.xml</code> to enable HTTP basic authentication, + * should be set to BASIC, DIGEST or SSL at <code>nutch-site.xml</code> to enable HTTP basic authentication, * digest authentication or SSL when communicating with RESTAPI. - * Set restapi.auth.username and restapi.auth.password properties at <code>nutch-site.xml</code> to configure + * Set restapi.auth.username and restapi.auth.password properties at <code>nutch-site.xml</code> to configure * credentials when BASIC or DIGEST authentication is used. * Set restapi.auth.ssl.storepath, restapi.auth.ssl.storepass and restapi.auth.ssl.keypass when SSL is used. 
* + * @param ramConfManager {@link RAMConfManager} + * @param confId active configuration id + * * @see org.apache.nutch.api.security.AuthenticationTypeEnum */ public NutchServer(RAMConfManager ramConfManager, String confId) { @@ -305,7 +310,7 @@ public class NutchServer extends Application { /** * Safety and convenience method to determine whether or not it is safe to * shut down the server. We make this assertion by consulting the - * {@link org.apache.nutch.api.NutchApp#jobManager} for a list of jobs with + * {@link #getJobMgr()} for a list of jobs with * {@link org.apache.nutch.api.model.response.JobInfo#state} equal to * 'RUNNING'. * @@ -356,8 +361,8 @@ public class NutchServer extends Application { /** * Main method for NutchServer to run via command line. * - * @param args arguments for log level, stopping the Server and port. - * @throws Exception + * @param args arguments for log level, stopping the Server and port. + * @throws Exception exception */ public static void main(String[] args) throws Exception { CommandLineParser parser = new PosixParser(); http://git-wip-us.apache.org/repos/asf/nutch/blob/0ea78907/src/java/org/apache/nutch/api/impl/RAMConfManager.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/nutch/api/impl/RAMConfManager.java b/src/java/org/apache/nutch/api/impl/RAMConfManager.java index 13c05fd..356a8bd 100644 --- a/src/java/org/apache/nutch/api/impl/RAMConfManager.java +++ b/src/java/org/apache/nutch/api/impl/RAMConfManager.java @@ -50,6 +50,9 @@ public class RAMConfManager implements ConfManager { /** * Public constructor which accepts a configuration id and {@link Configuration} type configuration. 
+ * + * @param confId configuration id + * @param configuration configuration */ public RAMConfManager(String confId, Configuration configuration) { configurations.put(confId, configuration); http://git-wip-us.apache.org/repos/asf/nutch/blob/0ea78907/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java b/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java index 045f4cd..8070c7b 100755 --- a/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java +++ b/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java @@ -76,6 +76,7 @@ public abstract class AbstractFetchSchedule extends Configured implements * @param url * URL of the page. * @param page + * {@link WebPage} object relative to the URL */ @Override public void initializeSchedule(String url, WebPage page) { @@ -104,13 +105,7 @@ public abstract class AbstractFetchSchedule extends Configured implements * @param url * URL of the page * @param page - * @return adjusted page information, including all original information. - * NOTE: this may be a different instance than - * @param datum - * , but implementations should make sure that it contains at least - * all information from - * @param datum - * . + * {@link WebPage} object relative to the URL */ @Override public void setPageGoneSchedule(String url, WebPage page, long prevFetchTime, @@ -134,6 +129,7 @@ public abstract class AbstractFetchSchedule extends Configured implements * @param url * URL of the page * @param page + * {@link WebPage} object relative to the URL * @param prevFetchTime * previous fetch time * @param prevModifiedTime @@ -163,15 +159,15 @@ public abstract class AbstractFetchSchedule extends Configured implements * in the current fetchlist. 
NOTE: a true return value does not guarantee that * the page will be fetched, it just allows it to be included in the further * selection process based on scores. The default implementation checks - * <code>fetchTime</code>, if it is higher than the - * - * @param curTime - * it returns false, and true otherwise. It will also check that - * fetchTime is not too remote (more than <code>maxInterval</code), - * in which case it lowers the interval and returns true. + * <code>fetchTime</code>, if it is higher than the current time + * it returns false, and true otherwise. It will also check that + * fetchTime is not too remote (more than <code>maxInterval</code>), + * in which case it lowers the interval and returns true. + * * @param url * URL of the page * @param page + * {@link WebPage} object relative to the URL * @param curTime * reference time (usually set to the time when the fetchlist * generation process was started). @@ -200,6 +196,7 @@ public abstract class AbstractFetchSchedule extends Configured implements * @param url * URL of the page * @param page + * {@link WebPage} object relative to the URL * @param asap * if true, force refetch as soon as possible - this sets the * fetchTime to now. 
If false, force refetch whenever the next fetch http://git-wip-us.apache.org/repos/asf/nutch/blob/0ea78907/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java b/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java index 1c2780a..30c6ec7 100755 --- a/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java +++ b/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java @@ -46,10 +46,8 @@ import org.apache.nutch.storage.WebPage; * <p> * NOTE: values of DEC_FACTOR and INC_FACTOR higher than 0.4f may destabilize * the algorithm, so that the fetch interval either increases or decreases - * infinitely, with little relevance to the page changes. Please use - * {@link #main(String[])} method to test the values before applying them in a - * production system. - * </p> + * infinitely, with little relevance to the page changes. + * * * @author Andrzej Bialecki */ http://git-wip-us.apache.org/repos/asf/nutch/blob/0ea78907/src/java/org/apache/nutch/crawl/FetchSchedule.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/nutch/crawl/FetchSchedule.java b/src/java/org/apache/nutch/crawl/FetchSchedule.java index eb896a6..8219a61 100755 --- a/src/java/org/apache/nutch/crawl/FetchSchedule.java +++ b/src/java/org/apache/nutch/crawl/FetchSchedule.java @@ -49,6 +49,7 @@ public interface FetchSchedule extends Configurable { * @param url * URL of the page. 
* @param page + * {@link WebPage} object relative to the URL */ public void initializeSchedule(String url, WebPage page); @@ -60,21 +61,16 @@ public interface FetchSchedule extends Configurable { * @param url * url of the page * @param page + * {@link WebPage} object relative to the URL * @param prevFetchTime * previous value of fetch time, or -1 if not available * @param prevModifiedTime * previous value of modifiedTime, or -1 if not available * @param fetchTime - * the latest time, when the page was recently re-fetched. Most - * FetchSchedule implementations should update the value in - * @param datum - * to something greater than this value. + * the latest time, when the page was recently re-fetched. * @param modifiedTime * last time the content was modified. This information comes from - * the protocol implementations, or is set to < 0 if not available. - * Most FetchSchedule implementations should update the value in - * @param datum - * to this value. + * the protocol implementations, or is set to &lt; 0 if not available. * @param state * if {@link #STATUS_MODIFIED}, then the content is considered to be * "changed" before the <code>fetchTime</code>, if @@ -90,13 +86,10 @@ public interface FetchSchedule extends Configurable { /** * This method specifies how to schedule refetching of pages marked as GONE. - * Default implementation increases fetchInterval by 50%, and if it exceeds - * the <code>maxInterval</code> it calls - * {@link #forceRefetch(Text, CrawlDatum, boolean)}.
- * * @param url * URL of the page * @param page + * {@link WebPage} object relative to the URL */ public void setPageGoneSchedule(String url, WebPage page, long prevFetchTime, long prevModifiedTime, long fetchTime); @@ -109,6 +102,7 @@ public interface FetchSchedule extends Configurable { * @param url * URL of the page * @param page + * {@link WebPage} object relative to the URL * @param prevFetchTime * previous fetch time * @param prevModifiedTime @@ -133,14 +127,8 @@ public interface FetchSchedule extends Configurable { * selection process based on scores. The default implementation checks * <code>fetchTime</code>, if it is higher than the * - * @param curTime - * it returns false, and true otherwise. It will also check that - * fetchTime is not too remote (more than <code>maxInterval</code), - * in which case it lowers the interval and returns true. * @param url * URL of the page - * @param row - * url's row * @param curTime * reference time (usually set to the time when the fetchlist * generation process was started). @@ -155,7 +143,6 @@ public interface FetchSchedule extends Configurable { * * @param url * URL of the page - * @param page * @param asap * if true, force refetch as soon as possible - this sets the * fetchTime to now. 
If false, force refetch whenever the next fetch http://git-wip-us.apache.org/repos/asf/nutch/blob/0ea78907/src/java/org/apache/nutch/crawl/GeneratorJob.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/nutch/crawl/GeneratorJob.java b/src/java/org/apache/nutch/crawl/GeneratorJob.java index f47637f..1627590 100644 --- a/src/java/org/apache/nutch/crawl/GeneratorJob.java +++ b/src/java/org/apache/nutch/crawl/GeneratorJob.java @@ -165,7 +165,11 @@ public class GeneratorJob extends NutchTool implements Tool { return fields; } - /** Generate a random batch id */ + /** + * Generates a random batch id + * + * @return random batch id + */ public static String randomBatchId() { long curTime = System.currentTimeMillis(); int randomSeed = Math.abs(new Random().nextInt()); @@ -173,6 +177,13 @@ public class GeneratorJob extends NutchTool implements Tool { return batchId; } + /** + * Runs generator + * + * @param args map of arguments + * @return results + * @throws Exception + */ public Map<String, Object> run(Map<String, Object> args) throws Exception { String batchId = (String) args.get(Nutch.ARG_BATCH); if (batchId == null) { @@ -290,6 +301,13 @@ public class GeneratorJob extends NutchTool implements Tool { return batchId; } + /** + * Runs generator from commandline + * + * @param args arguments + * @return returns -1 + * @throws Exception + */ public int run(String[] args) throws Exception { if (args.length <= 0) { System.out http://git-wip-us.apache.org/repos/asf/nutch/blob/0ea78907/src/java/org/apache/nutch/crawl/SignatureFactory.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/nutch/crawl/SignatureFactory.java b/src/java/org/apache/nutch/crawl/SignatureFactory.java index 1577634..8cf7471 100644 --- a/src/java/org/apache/nutch/crawl/SignatureFactory.java +++ b/src/java/org/apache/nutch/crawl/SignatureFactory.java @@ -40,7 +40,12 @@ public class 
SignatureFactory { private SignatureFactory() { } // no public ctor - /** Return the default Signature implementation. */ + /** + * Returns the default {@link Signature} implementation + * + * @param conf configuration + * @return default {@link Signature} implementation + */ public static Signature getSignature(Configuration conf) { String clazz = conf.get("db.signature.class", MD5Signature.class.getName()); ObjectCache objectCache = ObjectCache.get(conf); http://git-wip-us.apache.org/repos/asf/nutch/blob/0ea78907/src/java/org/apache/nutch/crawl/TextProfileSignature.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/nutch/crawl/TextProfileSignature.java b/src/java/org/apache/nutch/crawl/TextProfileSignature.java index 6d7e5e0..f797b10 100644 --- a/src/java/org/apache/nutch/crawl/TextProfileSignature.java +++ b/src/java/org/apache/nutch/crawl/TextProfileSignature.java @@ -33,7 +33,7 @@ import org.apache.nutch.storage.WebPage; * An implementation of a page signature. It calculates an MD5 hash of a plain * text "profile" of a page. In case there is no text, it calculates a hash * using the {@link MD5Signature}. - * </p> + * * <p> * The algorithm to calculate a page "profile" takes the plain text version of a * page and performs the following steps: http://git-wip-us.apache.org/repos/asf/nutch/blob/0ea78907/src/java/org/apache/nutch/fetcher/FetcherJob.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/nutch/fetcher/FetcherJob.java b/src/java/org/apache/nutch/fetcher/FetcherJob.java index a7f3df8..015c209 100644 --- a/src/java/org/apache/nutch/fetcher/FetcherJob.java +++ b/src/java/org/apache/nutch/fetcher/FetcherJob.java @@ -79,19 +79,17 @@ public class FetcherJob extends NutchTool implements Tool { /** * <p> * Mapper class for Fetcher. 
- * </p> + * * <p> * This class reads the random integer written by {@link GeneratorJob} as its * key while outputting the actual key and value arguments through a * {@link FetchEntry} instance. - * </p> + * * <p> - * This approach (combined with the use of {@link PartitionUrlByHost}) makes - * sure that Fetcher is still polite while also randomizing the key order. If * one host has a huge number of URLs in your table while other hosts have * not, {@link FetcherReducer} will not be stuck on one host but process URLs * from other hosts as well. - * </p> + * */ public static class FetcherMapper extends GoraMapper<String, WebPage, IntWritable, FetchEntry> { @@ -246,7 +244,7 @@ public class FetcherJob extends NutchTool implements Tool { * number of threads per map task * @param shouldResume * @param numTasks - * number of fetching tasks (reducers). If set to < 1 then use the + * number of fetching tasks (reducers). If set to &lt; 1 then use the * default, which is mapred.map.tasks. * @return 0 on success * @throws Exception @@ -266,7 +264,7 @@ public class FetcherJob extends NutchTool implements Tool { * number of threads per map task * @param shouldResume * @param numTasks - * number of fetching tasks (reducers). If set to < 1 then use the + * number of fetching tasks (reducers). If set to &lt; 1 then use the * default, which is mapred.map.tasks. * @param stmDetect * If set true, sitemap detection is run.
http://git-wip-us.apache.org/repos/asf/nutch/blob/0ea78907/src/java/org/apache/nutch/indexer/IndexUtil.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/nutch/indexer/IndexUtil.java b/src/java/org/apache/nutch/indexer/IndexUtil.java index 6d12382..ddb6f0d 100644 --- a/src/java/org/apache/nutch/indexer/IndexUtil.java +++ b/src/java/org/apache/nutch/indexer/IndexUtil.java @@ -42,7 +42,7 @@ public class IndexUtil { } /** - * Index a {@link Webpage}, here we add the following fields: + * Index a {@link WebPage}, here we add the following fields: * <ol> * <li><tt>id</tt>: default uniqueKey for the {@link NutchDocument}.</li> * <li><tt>digest</tt>: Digest is used to identify pages (like unique ID) and @@ -60,7 +60,7 @@ public class IndexUtil { * @param key * The key of the page (reversed url). * @param page - * The {@link Webpage}. + * The {@link WebPage}. * @return The indexed document, or null if skipped by index filters. */ public NutchDocument index(String key, WebPage page) { http://git-wip-us.apache.org/repos/asf/nutch/blob/0ea78907/src/java/org/apache/nutch/net/URLNormalizers.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/nutch/net/URLNormalizers.java b/src/java/org/apache/nutch/net/URLNormalizers.java index 03a0b85..1fc1df8 100644 --- a/src/java/org/apache/nutch/net/URLNormalizers.java +++ b/src/java/org/apache/nutch/net/URLNormalizers.java @@ -51,30 +51,30 @@ import org.apache.nutch.util.ObjectCache; * order). If there are more normalizers activated than explicitly named on this * list, the remaining ones will be run in random order after the ones specified * on the list are executed. - * </p> + * * <p> * You can define a set of contexts (or scopes) in which normalizers may be * called. 
Each scope can have its own list of normalizers (defined in - * "urlnormalizer.scope.<scope_name>" property) and its own order (defined in - * "urlnormalizer.order.<scope_name>" property). If any of these properties are + * "urlnormalizer.scope.&lt;scope_name&gt;" property) and its own order (defined in + * "urlnormalizer.order.&lt;scope_name&gt;" property). If any of these properties are * missing, default settings are used for the global scope. - * </p> + * * <p> * In case no normalizers are required for any given scope, a * <code>org.apache.nutch.net.urlnormalizer.pass.PassURLNormalizer</code> should * be used. - * </p> + * * <p> * Each normalizer may further select among many configurations, depending on * the scope in which it is called, because the scope name is passed as a * parameter to each normalizer. You can also use the same normalizer for many * scopes. - * </p> + * * <p> * Several scopes have been defined, and various Nutch tools will attempt using * scope-specific normalizers first (and fall back to default config if * scope-specific configuration is missing). - * </p> + * * <p> * Normalizers may be run several times, to ensure that modifications introduced * by normalizers at the end of the list can be further reduced by normalizers @@ -83,7 +83,7 @@ import org.apache.nutch.util.ObjectCache; * want to run this loop up to the number of activated normalizers. This loop * count can be configured through <tt>urlnormalizer.loop.count</tt> property. * As soon as the url is unchanged the loop will stop and return the result.
- * </p> + * * * @author Andrzej Bialecki */ http://git-wip-us.apache.org/repos/asf/nutch/blob/0ea78907/src/java/org/apache/nutch/parse/NutchSitemapParse.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/nutch/parse/NutchSitemapParse.java b/src/java/org/apache/nutch/parse/NutchSitemapParse.java index c0a9d9b..0e57339 100644 --- a/src/java/org/apache/nutch/parse/NutchSitemapParse.java +++ b/src/java/org/apache/nutch/parse/NutchSitemapParse.java @@ -6,9 +6,9 @@ * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at - * <p/> + * <p> * http://www.apache.org/licenses/LICENSE-2.0 - * <p/> + * <p> * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. http://git-wip-us.apache.org/repos/asf/nutch/blob/0ea78907/src/java/org/apache/nutch/parse/ParsePluginsReader.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/nutch/parse/ParsePluginsReader.java b/src/java/org/apache/nutch/parse/ParsePluginsReader.java index dddd025..b4c6f4e 100644 --- a/src/java/org/apache/nutch/parse/ParsePluginsReader.java +++ b/src/java/org/apache/nutch/parse/ParsePluginsReader.java @@ -69,12 +69,10 @@ public class ParsePluginsReader { /** * Reads the <code>parse-plugins.xml</code> file and returns the - * {@link #ParsePluginList} defined by it. + * {@link ParsePluginList} defined by it. * - * @return A {@link #ParsePluginList} specified by the + * @return A {@link ParsePluginList} specified by the * <code>parse-plugins.xml</code> file. - * @throws Exception - * If any parsing error occurs. 
*/ public ParsePluginList parse(Configuration conf) { http://git-wip-us.apache.org/repos/asf/nutch/blob/0ea78907/src/java/org/apache/nutch/parse/Parser.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/nutch/parse/Parser.java b/src/java/org/apache/nutch/parse/Parser.java index b623fd0..9a8c2b7 100644 --- a/src/java/org/apache/nutch/parse/Parser.java +++ b/src/java/org/apache/nutch/parse/Parser.java @@ -34,7 +34,7 @@ public interface Parser extends FieldPluggable, Configurable { /** * <p> * This method parses content in WebPage instance - * </p> + * * * @param url * Page's URL http://git-wip-us.apache.org/repos/asf/nutch/blob/0ea78907/src/java/org/apache/nutch/parse/ParserChecker.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/nutch/parse/ParserChecker.java b/src/java/org/apache/nutch/parse/ParserChecker.java index 12faeae..4d5c572 100644 --- a/src/java/org/apache/nutch/parse/ParserChecker.java +++ b/src/java/org/apache/nutch/parse/ParserChecker.java @@ -48,13 +48,13 @@ import java.util.Map.Entry; * is used to remove duplicates during the dedup procedure. 
It is calculated * using {@link org.apache.nutch.crawl.MD5Signature} or * {@link org.apache.nutch.crawl.TextProfileSignature}.</li> - * <li><tt>Version</tt>: From {@link org.apache.nutch.parse.ParseData}.</li> - * <li><tt>Status</tt>: From {@link org.apache.nutch.parse.ParseData}.</li> + * <li><tt>Version</tt>: From org.apache.nutch.parse.ParseData.</li> + * <li><tt>Status</tt>: From org.apache.nutch.parse.ParseData.</li> * <li><tt>Title</tt>: of the URL</li> * <li><tt>Outlinks</tt>: associated with the URL</li> * <li><tt>Content Metadata</tt>: such as <i>X-AspNet-Version</i>, <i>Date</i>, * <i>Content-length</i>, <i>servedBy</i>, <i>Content-Type</i>, - * <i>Cache-Control</>, etc.</li> + * <i>Cache-Control</i>, etc.</li> * <li><tt>Parse Metadata</tt>: such as <i>CharEncodingForConversion</i>, * <i>OriginalCharEncoding</i>, <i>language</i>, etc.</li> * <li><tt>ParseText</tt>: The page parse text which varies in length depdnecing http://git-wip-us.apache.org/repos/asf/nutch/blob/0ea78907/src/java/org/apache/nutch/plugin/PluginRepository.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/nutch/plugin/PluginRepository.java b/src/java/org/apache/nutch/plugin/PluginRepository.java index 346ae31..6486f63 100644 --- a/src/java/org/apache/nutch/plugin/PluginRepository.java +++ b/src/java/org/apache/nutch/plugin/PluginRepository.java @@ -59,8 +59,10 @@ public class PluginRepository { .getLogger(PluginRepository.class); /** - * @throws PluginRuntimeException - * @see java.lang.Object#Object() + * Pluging repository constructor + * + * @param conf Configuration + * @throws RuntimeException */ public PluginRepository(Configuration conf) throws RuntimeException { fActivatedPlugins = new HashMap<String, Plugin>(); http://git-wip-us.apache.org/repos/asf/nutch/blob/0ea78907/src/java/org/apache/nutch/scoring/ScoringFilter.java ---------------------------------------------------------------------- diff --git 
a/src/java/org/apache/nutch/scoring/ScoringFilter.java b/src/java/org/apache/nutch/scoring/ScoringFilter.java index 8c06ef6..17c6350 100644 --- a/src/java/org/apache/nutch/scoring/ScoringFilter.java +++ b/src/java/org/apache/nutch/scoring/ScoringFilter.java @@ -72,8 +72,8 @@ public interface ScoringFilter extends Configurable, FieldPluggable { * * @param url * url of the page - * @param datum - * page row. Modifications will be persisted. + * @param page + * {@link WebPage} object relative to the URL * @param initSort * initial sort value, or a value from previous filters in chain */ @@ -85,13 +85,8 @@ public interface ScoringFilter extends Configurable, FieldPluggable { * * @param fromUrl * url of the source page - * @param row - * page row * @param scoreData - * A list of {@link OutlinkedScoreDatum}s for every outlink. These - * {@link OutlinkedScoreDatum}s will be passed to - * {@link #updateScore(String, OldWebTableRow, List)} for every - * outlinked URL. + * A list of {@link ScoreDatum} * @param allCount * number of all collected outlinks from the source page * @throws ScoringFilterException @@ -106,9 +101,9 @@ public interface ScoringFilter extends Configurable, FieldPluggable { * * @param url * url of the page - * @param page - * @param inlinked - * list of {@link OutlinkedScoreDatum}s for all inlinks pointing to + * @param page {@link WebPage} object relative to the URL + * @param inlinkedScoreData + * list of {@link ScoreDatum}s for all inlinks pointing to * this URL. * @throws ScoringFilterException */ @@ -124,8 +119,6 @@ public interface ScoringFilter extends Configurable, FieldPluggable { * document. NOTE: this already contains all information collected by * indexing filters. Implementations may modify this instance, in * order to store/remove some information. - * @param row - * page row * @param initScore * initial boost value for the Lucene document. * @return boost value for the Lucene document. 
This value is passed as an http://git-wip-us.apache.org/repos/asf/nutch/blob/0ea78907/src/java/org/apache/nutch/storage/StorageUtils.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/nutch/storage/StorageUtils.java b/src/java/org/apache/nutch/storage/StorageUtils.java index b68e8f8..e82a3c5 100644 --- a/src/java/org/apache/nutch/storage/StorageUtils.java +++ b/src/java/org/apache/nutch/storage/StorageUtils.java @@ -82,7 +82,7 @@ public class StorageUtils { /** * Return the Persistent Gora class used to persist Nutch Web data. * - * @param the + * @param conf * Nutch configuration * @return the Gora DataStore persistent class * @throws ClassNotFoundException http://git-wip-us.apache.org/repos/asf/nutch/blob/0ea78907/src/java/org/apache/nutch/tools/arc/ArcRecordReader.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/nutch/tools/arc/ArcRecordReader.java b/src/java/org/apache/nutch/tools/arc/ArcRecordReader.java index d3f9799..7f36b52 100644 --- a/src/java/org/apache/nutch/tools/arc/ArcRecordReader.java +++ b/src/java/org/apache/nutch/tools/arc/ArcRecordReader.java @@ -38,21 +38,21 @@ import org.apache.hadoop.util.StringUtils; * <p> * The <code>ArchRecordReader</code> class provides a record reader which reads * records from arc files. - * </p> + * * * <p> * Arc files are essentially tars of gzips. Each record in an arc file is a * compressed gzip. Multiple records are concatenated together to form a - * complete arc. For more information on the arc file format see {@link http - * ://www.archive.org/web/researcher/ArcFileFormat.php}. - * </p> - * + * complete arc. For more information on the arc file format see + * <a href="http://www.archive.org/web/researcher/ArcFileFormat.php"> + * http://www.archive.org/web/researcher/ArcFileFormat.php</a>. + * * <p> * Arc files are used by the internet archive and grub projects. 
- * </p> + * * - * @see http://www.archive.org/ - * @see http://www.grub.org/ + * @see <a href="http://www.archive.org/">http://www.archive.org/</a> + * @see <a href="http://www.grub.org/">http://www.grub.org/</a> */ public class ArcRecordReader implements RecordReader<Text, BytesWritable> { @@ -72,7 +72,7 @@ public class ArcRecordReader implements RecordReader<Text, BytesWritable> { /** * <p> * Returns true if the byte array passed matches the gzip header magic number. - * </p> + * * * @param input * The byte array to check. @@ -174,7 +174,7 @@ public class ArcRecordReader implements RecordReader<Text, BytesWritable> { * Returns true if the next record in the split is read into the key and value * pair. The key will be the arc record header and the values will be the raw * content bytes of the arc record. - * </p> + * * * @param key * The record key http://git-wip-us.apache.org/repos/asf/nutch/blob/0ea78907/src/java/org/apache/nutch/util/Bytes.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/nutch/util/Bytes.java b/src/java/org/apache/nutch/util/Bytes.java index db9f468..043a897 100644 --- a/src/java/org/apache/nutch/util/Bytes.java +++ b/src/java/org/apache/nutch/util/Bytes.java @@ -980,7 +980,7 @@ public class Bytes { * left operand * @param right * right operand - * @return 0 if equal, < 0 if left is less than right, etc. + * @return 0 if equal, &lt; 0 if left is less than right, etc. */ public static int compareTo(final byte[] left, final byte[] right) { return compareTo(left, 0, left.length, right, 0, right.length); @@ -1001,7 +1001,7 @@ public class Bytes { * How much to compare from the left buffer * @param length2 * How much to compare from the right buffer - * @return 0 if equal, < 0 if left is less than right, etc. + * @return 0 if equal, &lt; 0 if left is less than right, etc.
*/ public static int compareTo(byte[] buffer1, int offset1, int length1, byte[] buffer2, int offset2, int length2) { @@ -1050,7 +1050,7 @@ public class Bytes { * @return Runs {@link WritableComparator#hashBytes(byte[], int)} on the * passed in array. This method is what * {@link org.apache.hadoop.io.Text} and - * {@link ImmutableBytesWritable} use calculating hash code. + * org.apache.hadoop.hbase.io.ImmutableBytesWritable use calculating hash code. */ public static int hashCode(final byte[] b) { return hashCode(b, b.length); @@ -1064,7 +1064,7 @@ public class Bytes { * @return Runs {@link WritableComparator#hashBytes(byte[], int)} on the * passed in array. This method is what * {@link org.apache.hadoop.io.Text} and - * {@link ImmutableBytesWritable} use calculating hash code. + * org.apache.hadoop.hbase.io.ImmutableBytesWritable use calculating hash code. */ public static int hashCode(final byte[] b, final int length) { return WritableComparator.hashBytes(b, length); @@ -1366,12 +1366,12 @@ public class Bytes { * given amount. 
* * @param value - * - array of bytes containing long (length <= SIZEOF_LONG) + * - array of bytes containing long (length &lt;= SIZEOF_LONG) * @param amount * value will be incremented on (deincremented if negative) * @return array of bytes containing incremented long (length == SIZEOF_LONG) * @throws IOException - * - if value.length > SIZEOF_LONG + * - if value.length &gt; SIZEOF_LONG */ public static byte[] incrementBytes(byte[] value, long amount) throws IOException { http://git-wip-us.apache.org/repos/asf/nutch/blob/0ea78907/src/java/org/apache/nutch/util/EncodingDetector.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/nutch/util/EncodingDetector.java b/src/java/org/apache/nutch/util/EncodingDetector.java index 5b40e29..25f8eef 100644 --- a/src/java/org/apache/nutch/util/EncodingDetector.java +++ b/src/java/org/apache/nutch/util/EncodingDetector.java @@ -45,7 +45,7 @@ import java.util.Locale; * <li>Taking a set of clues and making a "best guess" as to the "real" * encoding.</li> * </ol> - * </p> + * * * <p> * A caller will often have some extra information about what the encoding might @@ -56,7 +56,7 @@ import java.util.Locale; * <li>Run step (1) to generate a set of auto-detected clues;</li> * <li>Combine these clues with the caller-dependent "extra clues" available;</li> * <li>Run step (2) to guess what the most probable answer is.</li> - * </p> + * </ul> */ public class EncodingDetector { @@ -211,9 +211,7 @@ public class EncodingDetector { /** * Guess the encoding with the previously specified list of clues. - * - * @param row - * URL's row + * * @param defaultValue * Default encoding to return if no encoding can be detected with * enough confidence. Note that this will <b>not</b> be normalized @@ -340,7 +338,7 @@ public class EncodingDetector { /** * Parse the character encoding from the specified content type header.
If the * content type is null, or there is no explicit character encoding, - * <code>null</code> is returned. <br /> + * <code>null</code> is returned. <p> * This method was copied from org.apache.catalina.util.RequestUtil, which is * licensed under the Apache License, Version 2.0 (the "License"). * http://git-wip-us.apache.org/repos/asf/nutch/blob/0ea78907/src/java/org/apache/nutch/util/MimeUtil.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/nutch/util/MimeUtil.java b/src/java/org/apache/nutch/util/MimeUtil.java index 198fdee..241087c 100644 --- a/src/java/org/apache/nutch/util/MimeUtil.java +++ b/src/java/org/apache/nutch/util/MimeUtil.java @@ -50,7 +50,7 @@ import org.apache.nutch.protocol.ProtocolOutput; * substrate library, <a href="http://incubator.apache.org/tika/">Apache * Tika</a>. Any mime handling code should be placed in this utility * class, and hidden from the Nutch classes that rely on it. - * </p> + * */ public final class MimeUtil { @@ -229,7 +229,7 @@ public final class MimeUtil { * method. * * @param url - * A string representation of the document {@link URL} to sense the + * A string representation of the document. URL to sense the * {@link MimeType} for. * @return An appropriate {@link MimeType}, identified from the given Document * url in string form. http://git-wip-us.apache.org/repos/asf/nutch/blob/0ea78907/src/java/org/apache/nutch/util/NodeWalker.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/nutch/util/NodeWalker.java b/src/java/org/apache/nutch/util/NodeWalker.java index 16e84c3..3e0b0e1 100644 --- a/src/java/org/apache/nutch/util/NodeWalker.java +++ b/src/java/org/apache/nutch/util/NodeWalker.java @@ -27,12 +27,12 @@ import org.w3c.dom.NodeList; * of recursion. 
As the node tree is walked the next node is popped off of the * stack and all of its children are automatically added to the stack to be * called in tree order. - * </p> + * * * <p> * Currently this class is not thread safe. It is assumed that only one thread * will be accessing the <code>NodeWalker</code> at any given time. - * </p> + * */ public class NodeWalker { @@ -58,7 +58,7 @@ public class NodeWalker { * children onto the stack, allowing us to walk the node tree without the use * of recursion. If there are no more nodes on the stack then null is * returned. - * </p> + * * * @return Node The next <code>Node</code> on the stack or null if there isn't * a next node. @@ -90,12 +90,12 @@ public class NodeWalker { * When getting a next node from the walker, that node's children are * automatically added to the stack. You can call this method to remove those * children from the stack. - * </p> + * * * <p> * This is useful when you don't want to process deeper into the current path * of the node tree but you want to continue processing sibling nodes. - * </p> + * * */ public void skipChildren() { http://git-wip-us.apache.org/repos/asf/nutch/blob/0ea78907/src/java/org/apache/nutch/util/NutchJob.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/nutch/util/NutchJob.java b/src/java/org/apache/nutch/util/NutchJob.java index c0456c1..029e7ae 100644 --- a/src/java/org/apache/nutch/util/NutchJob.java +++ b/src/java/org/apache/nutch/util/NutchJob.java @@ -63,7 +63,7 @@ public class NutchJob extends Job { * Creates a new {@link NutchJob} with no particular {@link org.apache.hadoop.mapreduce.Cluster} and a * given {@link org.apache.hadoop.conf.Configuration}. * - * The <code>NutchJob</code> makes a copy of the <code>Configuration</code> so + * The <code>NutchJob</code> makes a copy of the <code>Configuration</code> so * that any necessary internal modifications do not reflect on the incoming * parameter. 
* @@ -87,7 +87,7 @@ public class NutchJob extends Job { * and a given jobName. * A Cluster will be created from the conf parameter only when it's needed. * - * The <code>NutchJob</code> makes a copy of the <code>Configuration</code> so + * The <code>NutchJob</code> makes a copy of the <code>Configuration</code> so * that any necessary internal modifications do not reflect on the incoming * parameter. * http://git-wip-us.apache.org/repos/asf/nutch/blob/0ea78907/src/java/org/apache/nutch/util/NutchTool.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/nutch/util/NutchTool.java b/src/java/org/apache/nutch/util/NutchTool.java index 1f5789a..443d1da 100644 --- a/src/java/org/apache/nutch/util/NutchTool.java +++ b/src/java/org/apache/nutch/util/NutchTool.java @@ -36,11 +36,19 @@ public abstract class NutchTool extends Configured { /** * Runs the tool, using a map of arguments. May return results, or null. + * + * @param args map of arguments + * @return results or null + * @throws Exception */ public abstract Map<String, Object> run(Map<String, Object> args) throws Exception; - /** Returns relative progress of the tool, a float in range [0,1]. */ + /** + * Returns relative progress of the tool, a float in range [0,1] + * + * @return relative progress of the tool, a float in range [0,1] + */ public float getProgress() { float res = 0; if (currentJob != null) { @@ -62,7 +70,11 @@ public abstract class NutchTool extends Configured { return res; } - /** Returns current status of the running tool. */ + /** + * Returns current status of the running tool + * + * @return current status of the running tool + */ public Map<String, Object> getStatus() { return status; } @@ -72,6 +84,7 @@ public abstract class NutchTool extends Configured { * this, since by default it calls {@link #killJob()}. 
* * @return true if succeeded, false otherwise + * @throws Exception */ public boolean stopJob() throws Exception { return killJob(); http://git-wip-us.apache.org/repos/asf/nutch/blob/0ea78907/src/java/org/apache/nutch/util/PrefixStringMatcher.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/nutch/util/PrefixStringMatcher.java b/src/java/org/apache/nutch/util/PrefixStringMatcher.java index e323b67..6ca48c8 100644 --- a/src/java/org/apache/nutch/util/PrefixStringMatcher.java +++ b/src/java/org/apache/nutch/util/PrefixStringMatcher.java @@ -70,8 +70,8 @@ public class PrefixStringMatcher extends TrieStringMatcher { } /** - * Returns the shortest prefix of <code>input<code> that is matched, - * or <code>null<code> if no match exists. + * Returns the shortest prefix of <code>input</code> that is matched, + * or <code>null</code> if no match exists. */ public String shortestMatch(String input) { TrieNode node = root; @@ -86,8 +86,8 @@ public class PrefixStringMatcher extends TrieStringMatcher { } /** - * Returns the longest prefix of <code>input<code> that is matched, - * or <code>null<code> if no match exists. + * Returns the longest prefix of <code>input</code> that is matched, + * or <code>null</code> if no match exists. 
*/ public String longestMatch(String input) { TrieNode node = root; http://git-wip-us.apache.org/repos/asf/nutch/blob/0ea78907/src/java/org/apache/nutch/util/SuffixStringMatcher.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/nutch/util/SuffixStringMatcher.java b/src/java/org/apache/nutch/util/SuffixStringMatcher.java index a967c01..6e070b9 100644 --- a/src/java/org/apache/nutch/util/SuffixStringMatcher.java +++ b/src/java/org/apache/nutch/util/SuffixStringMatcher.java @@ -65,8 +65,8 @@ public class SuffixStringMatcher extends TrieStringMatcher { } /** - * Returns the shortest suffix of <code>input<code> that is matched, - * or <code>null<code> if no match exists. + * Returns the shortest suffix of <code>input</code> that is matched, + * or <code>null</code> if no match exists. */ public String shortestMatch(String input) { TrieNode node = root; @@ -81,8 +81,8 @@ public class SuffixStringMatcher extends TrieStringMatcher { } /** - * Returns the longest suffix of <code>input<code> that is matched, - * or <code>null<code> if no match exists. + * Returns the longest suffix of <code>input</code> that is matched, + * or <code>null</code> if no match exists. */ public String longestMatch(String input) { TrieNode node = root; http://git-wip-us.apache.org/repos/asf/nutch/blob/0ea78907/src/java/org/apache/nutch/util/TableUtil.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/nutch/util/TableUtil.java b/src/java/org/apache/nutch/util/TableUtil.java index 68ded69..e6ccbbc 100644 --- a/src/java/org/apache/nutch/util/TableUtil.java +++ b/src/java/org/apache/nutch/util/TableUtil.java @@ -33,7 +33,7 @@ public class TableUtil { * E.g. "http://bar.foo.com:8983/to/index.html?a=b" becomes * "com.foo.bar:8983:http/to/index.html?a=b". 
* - * @param url + * @param urlString * url to be reversed * @return Reversed url * @throws MalformedURLException @@ -111,7 +111,7 @@ public class TableUtil { /** * Given a reversed url, returns the reversed host E.g - * "com.foo.bar:http:8983/to/index.html?a=b" -> "com.foo.bar" + * "com.foo.bar:http:8983/to/index.html?a=b" -&gt; "com.foo.bar" * * @param reversedUrl * Reversed url http://git-wip-us.apache.org/repos/asf/nutch/blob/0ea78907/src/java/org/apache/nutch/util/TimingUtil.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/nutch/util/TimingUtil.java b/src/java/org/apache/nutch/util/TimingUtil.java index 524bee6..497716c 100644 --- a/src/java/org/apache/nutch/util/TimingUtil.java +++ b/src/java/org/apache/nutch/util/TimingUtil.java @@ -32,7 +32,7 @@ public class TimingUtil { * @param end * The end of the time period * @return a string of the form "XhYmZs" when the elapsed time is X hours, Y - * minutes and Z seconds or null if start > end. + * minutes and Z seconds or null if start &gt; end. */ public static String elapsedTime(long start, long end) { if (start > end) { http://git-wip-us.apache.org/repos/asf/nutch/blob/0ea78907/src/java/org/apache/nutch/util/TrieStringMatcher.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/nutch/util/TrieStringMatcher.java b/src/java/org/apache/nutch/util/TrieStringMatcher.java index 95f06ad..e7773cb 100644 --- a/src/java/org/apache/nutch/util/TrieStringMatcher.java +++ b/src/java/org/apache/nutch/util/TrieStringMatcher.java @@ -186,15 +186,15 @@ public abstract class TrieStringMatcher { public abstract boolean matches(String input); /** - * Returns the shortest substring of <code>input<code> that is - * matched by a pattern in the trie, or <code>null<code> if no match + * Returns the shortest substring of <code>input</code> that is + * matched by a pattern in the trie, or <code>null</code> if no match * exists.
*/ public abstract String shortestMatch(String input); /** - * Returns the longest substring of <code>input<code> that is - * matched by a pattern in the trie, or <code>null<code> if no match + * Returns the longest substring of <code>input</code> that is + * matched by a pattern in the trie, or <code>null</code> if no match * exists. */ public abstract String longestMatch(String input); http://git-wip-us.apache.org/repos/asf/nutch/blob/0ea78907/src/java/org/apache/nutch/util/URLUtil.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/nutch/util/URLUtil.java b/src/java/org/apache/nutch/util/URLUtil.java index 5183ba1..e1df9e3 100644 --- a/src/java/org/apache/nutch/util/URLUtil.java +++ b/src/java/org/apache/nutch/util/URLUtil.java @@ -219,52 +219,49 @@ public class URLUtil { * Yahoo! Slurp crawler described here:<br> * <a href= * "http://help.yahoo.com/l/nz/yahooxtra/search/webcrawler/slurp-11.html"> How - * does the Yahoo! webcrawler handle redirects?</a> <br> + * does the Yahoo! 
webcrawler handle redirects?</a> * <br> - * <ol> - * <li>Choose target url if either url is malformed.</li> - * <li>If different domains the keep the destination whether or not the - * redirect is temp or perm</li> - * <ul> - * <li>a.com -> b.com*</li> - * </ul> - * <li>If the redirect is permanent and the source is root, keep the source.</li> - * <ul> - * <li>*a.com -> a.com?y=1 || *a.com -> a.com/xyz/index.html</li> - * </ul> - * <li>If the redirect is permanent and the source is not root and the - * destination is root, keep the destination</li> - * <ul> - * <li>a.com/xyz/index.html -> a.com*</li> - * </ul> - * <li>If the redirect is permanent and neither the source nor the destination - * is root, then keep the destination</li> - * <ul> - * <li>a.com/xyz/index.html -> a.com/abc/page.html*</li> - * </ul> - * <li>If the redirect is temporary and source is root and destination is not - * root, then keep the source</li> - * <ul> - * <li>*a.com -> a.com/xyz/index.html</li> - * </ul> - * <li>If the redirect is temporary and source is not root and destination is - * root, then keep the destination</li> - * <ul> - * <li>a.com/xyz/index.html -> a.com*</li> - * </ul> - * <li>If the redirect is temporary and neither the source or the destination + * + * <dl> + * + * <dt>Choose target url if either url is malformed.</dt> + * + * <dt>If different domains the keep the destination whether or not the + * redirect is temp or perm</dt> + * <dd>a.com -> b.com*</dd> + * + * <dt>If the redirect is permanent and the source is root, keep the source.</dt> + * <dd>*a.com -> a.com?y=1 || *a.com -> a.com/xyz/index.html</dd> + * + * <dt>If the redirect is permanent and the source is not root and the + * destination is root, keep the destination</dt> + * <dd>a.com/xyz/index.html -> a.com*</dd> + * + * <dt>If the redirect is permanent and neither the source nor the destination + * is root, then keep the destination</dt> + * <dd>a.com/xyz/index.html -> a.com/abc/page.html*</dd> + * + * 
<dt>If the redirect is temporary and source is root and destination is not + * root, then keep the source</dt> + * <dd>*a.com -> a.com/xyz/index.html</dd> + * + * <dt>If the redirect is temporary and source is not root and destination is + * root, then keep the destination</dt> + * <dd>a.com/xyz/index.html -> a.com*</dd> + * + * <dt>If the redirect is temporary and neither the source or the destination * is root, then keep the shortest url. First check for the shortest host, and * if both are equal then check by path. Path is first by length then by the - * number of / path separators.</li> - * <ul> - * <li>a.com/xyz/index.html -> a.com/abc/page.html*</li> - * <li>*www.a.com/xyz/index.html -> www.news.a.com/xyz/index.html</li> - * </ul> - * <li>If the redirect is temporary and both the source and the destination - * are root, then keep the shortest sub-domain</li> - * <ul> - * <li>*www.a.com -> www.news.a.com</li> - * </ul> + * number of / path separators.</dt> + * <dd>a.com/xyz/index.html -> a.com/abc/page.html*</dd> + * <dd>*www.a.com/xyz/index.html -> www.news.a.com/xyz/index.html</dd> + * + * <dt>If the redirect is temporary and both the source and the destination + * are root, then keep the shortest sub-domain</dt> + * <dd>*www.a.com -> www.news.a.com</dd> + * + * </dl> + * * <br> * While not in this logic there is a further piece of representative url * logic that occurs during indexing and after scoring. 
During creation of the http://git-wip-us.apache.org/repos/asf/nutch/blob/0ea78907/src/java/org/apache/nutch/util/domain/DomainSuffix.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/nutch/util/domain/DomainSuffix.java b/src/java/org/apache/nutch/util/domain/DomainSuffix.java index ae03ec4..0e0b7b0 100644 --- a/src/java/org/apache/nutch/util/domain/DomainSuffix.java +++ b/src/java/org/apache/nutch/util/domain/DomainSuffix.java @@ -23,17 +23,16 @@ package org.apache.nutch.util.domain; * name of a host. The domain name of a host is defined to be the last part * before the domain suffix, w/o subdomain names. As an example the domain name * of <br> - * <code> http://lucene.apache.org/ + * <code> http://lucene.apache.org/ * </code><br> * is <code> apache.org</code> <br> * This class holds three fields, <strong>domain</strong> field represents the * suffix (such as "co.uk") <strong>boost</strong> is a float for boosting score * of url's with this suffix <strong>status</strong> field represents domain's - * status + * status. Check also domain-suffixes.xml * * @author Enis Soztutar <[email protected]> * @see TopLevelDomain - * @see domain-suffixes.xml */ public class DomainSuffix { http://git-wip-us.apache.org/repos/asf/nutch/blob/0ea78907/src/java/org/apache/nutch/util/domain/TopLevelDomain.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/nutch/util/domain/TopLevelDomain.java b/src/java/org/apache/nutch/util/domain/TopLevelDomain.java index 6386335..87e370e 100644 --- a/src/java/org/apache/nutch/util/domain/TopLevelDomain.java +++ b/src/java/org/apache/nutch/util/domain/TopLevelDomain.java @@ -24,8 +24,8 @@ package org.apache.nutch.util.domain; * top-level domain is <code>com</code>. 
* * @author Enis Soztutar <[email protected]> - * @see http://www.iana.org/ - * @see http://en.wikipedia.org/wiki/Top-level_domain + * @see <a href="http://www.iana.org/">http://www.iana.org/</a> + * @see <a href="http://en.wikipedia.org/wiki/Top-level_domain">http://en.wikipedia.org/wiki/Top-level_domain</a> */ public class TopLevelDomain extends DomainSuffix { http://git-wip-us.apache.org/repos/asf/nutch/blob/0ea78907/src/plugin/feed/src/java/org/apache/nutch/parse/feed/FeedParser.java ---------------------------------------------------------------------- diff --git a/src/plugin/feed/src/java/org/apache/nutch/parse/feed/FeedParser.java b/src/plugin/feed/src/java/org/apache/nutch/parse/feed/FeedParser.java index 9df4e27..0751ddc 100644 --- a/src/plugin/feed/src/java/org/apache/nutch/parse/feed/FeedParser.java +++ b/src/plugin/feed/src/java/org/apache/nutch/parse/feed/FeedParser.java @@ -64,7 +64,7 @@ import com.sun.syndication.io.SyndFeedInput; * <p> * A new RSS/ATOM Feed{@link Parser} that rapidly parses all referenced links * and content present in the feed. - * </p> + * * */ public class FeedParser implements Parser { http://git-wip-us.apache.org/repos/asf/nutch/blob/0ea78907/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java ---------------------------------------------------------------------- diff --git a/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java b/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java index 9e2e75b..7e0e246 100644 --- a/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java +++ b/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java @@ -36,7 +36,7 @@ import java.util.Locale; * Indexing filter that offers an option to either index all inbound anchor text * for a document or deduplicate anchors. 
Deduplication does have it's con's, * - * @see {@code anchorIndexingFilter.deduplicate} in nutch-default.xml. + * Check {@code anchorIndexingFilter.deduplicate} in nutch-default.xml. */ public class AnchorIndexingFilter implements IndexingFilter { http://git-wip-us.apache.org/repos/asf/nutch/blob/0ea78907/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java ---------------------------------------------------------------------- diff --git a/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java b/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java index fdd3b81..a97e9ed 100644 --- a/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java +++ b/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java @@ -37,7 +37,7 @@ import org.apache.nutch.util.Bytes; * Indexer which can be configured to extract metadata from the crawldb, parse * metadata or content metadata. You can specify the properties "index.db", * "index.parse" or "index.content" who's values are comma-delimited - * <value>key1,key2,key3</value>. + * <value>key1,key2,key3</value>. 
*/ public class MetadataIndexer implements IndexingFilter { http://git-wip-us.apache.org/repos/asf/nutch/blob/0ea78907/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java ---------------------------------------------------------------------- diff --git a/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java b/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java index b1d99e5..9171b1c 100644 --- a/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java +++ b/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java @@ -148,14 +148,14 @@ public class MoreIndexingFilter implements IndexingFilter { * primaryType and subType to field "type" as un-stored, indexed and * un-tokenized, so that search results can be confined by contentType or its * primaryType or its subType. - * </p> + * * <p> * For example, if contentType is application/vnd.ms-powerpoint, search can be * done with one of the following qualifiers * type:application/vnd.ms-powerpoint type:application type:vnd.ms-powerpoint * all case insensitive. The query filter is implemented in * {@link TypeQueryFilter}. 
- * </p> + * * * @param doc * @param data http://git-wip-us.apache.org/repos/asf/nutch/blob/0ea78907/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java ---------------------------------------------------------------------- diff --git a/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java b/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java index f3af6a9..064cd8d 100644 --- a/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java +++ b/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java @@ -84,12 +84,19 @@ public class HTMLLanguageParser implements ParseFilter { /** * Scan the HTML document looking at possible indications of content language<br> - * <li>1. html lang attribute - * (http://www.w3.org/TR/REC-html40/struct/dirlang.html#h-8.1) <li>2. meta + * <ol> + * <li>html lang attribute + * (http://www.w3.org/TR/REC-html40/struct/dirlang.html#h-8.1) + * </li> + * <li>meta * dc.language * (http://dublincore.org/documents/2000/07/16/usageguide/qualified - * -html.shtml#language) <li>3. 
meta http-equiv (content-language) - * (http://www.w3.org/TR/REC-html40/struct/global.html#h-7.4.4.2) <br> + * -html.shtml#language) + * </li> + * <li>meta http-equiv (content-language) + * (http://www.w3.org/TR/REC-html40/struct/global.html#h-7.4.4.2) + * </li> + * </ol> */ public Parse filter(String url, WebPage page, Parse parse, HTMLMetaTags metaTags, DocumentFragment doc) { http://git-wip-us.apache.org/repos/asf/nutch/blob/0ea78907/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java ---------------------------------------------------------------------- diff --git a/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java b/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java index d374e95..a1475a7 100644 --- a/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java +++ b/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java @@ -44,19 +44,13 @@ import org.apache.nutch.net.*; * expressions. * * <p> - * The regular expressions rules are expressed in a file. The file of rules is - * provided by each implementation using the - * {@link #getRulesFile(Configuration)} method. - * </p> - * - * <p> - * The format of this file is made of many rules (one per line):<br/> + * The format of this file is made of many rules (one per line):<br> * <code> * [+-]<regex> - * </code><br/> + * </code><br> * where plus (<code>+</code>)means go ahead and index it and minus ( * <code>-</code>)means no. 
- * </p> + * */ public abstract class RegexURLFilterBase implements URLFilter { http://git-wip-us.apache.org/repos/asf/nutch/blob/0ea78907/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMBuilder.java ---------------------------------------------------------------------- diff --git a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMBuilder.java b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMBuilder.java index 31b54da..6bd4305 100644 --- a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMBuilder.java +++ b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMBuilder.java @@ -174,7 +174,7 @@ public class DOMBuilder implements ContentHandler, LexicalHandler { * supply a locator: if it does so, it must supply the locator to the * application by invoking this method before invoking any of the other * methods in the ContentHandler interface. - * </p> + * * * <p> * The locator allows the application to determine the end position of any @@ -183,13 +183,13 @@ public class DOMBuilder implements ContentHandler, LexicalHandler { * errors (such as character content that does not match an application's * business rules). The information returned by the locator is probably not * sufficient for use with a search engine. - * </p> + * * * <p> * Note that the locator will return correct information only during the * invocation of the events in this interface. The application should not * attempt to use it at any other time. - * </p> + * * * @param locator * An object that can return the location of any SAX document event. @@ -206,7 +206,7 @@ public class DOMBuilder implements ContentHandler, LexicalHandler { * <p> * The SAX parser will invoke this method only once, before any other methods * in this interface or in DTDHandler (except for setDocumentLocator). 
- * </p> + * */ public void startDocument() throws org.xml.sax.SAXException { @@ -221,7 +221,7 @@ public class DOMBuilder implements ContentHandler, LexicalHandler { * method invoked during the parse. The parser shall not invoke this method * until it has either abandoned parsing (because of an unrecoverable error) * or reached the end of input. - * </p> + * */ public void endDocument() throws org.xml.sax.SAXException { @@ -237,14 +237,14 @@ public class DOMBuilder implements ContentHandler, LexicalHandler { * startElement() event (even when the element is empty). All of the element's * content will be reported, in order, before the corresponding endElement() * event. - * </p> + * * * <p> * If the element name has a namespace prefix, the prefix will still be * attached. Note that the attribute list provided will contain only * attributes with explicit values (specified or defaulted): #IMPLIED * attributes will be omitted. - * </p> + * * * * @param ns @@ -328,12 +328,12 @@ public class DOMBuilder implements ContentHandler, LexicalHandler { * The SAX parser will invoke this method at the end of every element in the * XML document; there will be a corresponding startElement() event for every * endElement() event (even when the element is empty). - * </p> + * * * <p> * If the element name has a namespace prefix, the prefix will still be * attached to the name. - * </p> + * * * * @param ns @@ -371,18 +371,18 @@ public class DOMBuilder implements ContentHandler, LexicalHandler { * they may split it into several chunks; however, all of the characters in * any single event must come from the same external entity, so that the * Locator provides useful information. - * </p> + * * * <p> * The application must not attempt to read from the array outside of the * specified range. - * </p> + * * * <p> * Note that some parsers will report whitespace using the * ignorableWhitespace() method rather than this one (validating parsers must * do so). 
- * </p> + * * * @param ch * The characters from the XML document. @@ -489,19 +489,19 @@ public class DOMBuilder implements ContentHandler, LexicalHandler { * whitespace (see the W3C XML 1.0 recommendation, section 2.10): * non-validating parsers may also use this method if they are capable of * parsing and using content models. - * </p> + * * * <p> * SAX parsers may return all contiguous whitespace in a single chunk, or they * may split it into several chunks; however, all of the characters in any * single event must come from the same external entity, so that the Locator * provides useful information. - * </p> + * * * <p> * The application must not attempt to read from the array outside of the * specified range. - * </p> + * * * @param ch * The characters from the XML document. @@ -539,12 +539,12 @@ public class DOMBuilder implements ContentHandler, LexicalHandler { * The Parser will invoke this method once for each processing instruction * found: note that processing instructions may occur before or after the main * document element. - * </p> + * * * <p> * A SAX parser should never report an XML declaration (XML 1.0, section 2.8) * or a text declaration (XML 1.0, section 4.3.1) using this method. - * </p> + * * * @param target * The processing instruction target. @@ -608,18 +608,18 @@ public class DOMBuilder implements ContentHandler, LexicalHandler { * they may split it into several chunks; however, all of the characters in * any single event must come from the same external entity, so that the * Locator provides useful information. - * </p> + * * * <p> * The application must not attempt to read from the array outside of the * specified range. - * </p> + * * * <p> * Note that some parsers will report whitespace using the * ignorableWhitespace() method rather than this one (validating parsers must * do so). - * </p> + * * * @param ch * The characters from the XML document. 
@@ -687,14 +687,14 @@ public class DOMBuilder implements ContentHandler, LexicalHandler { * processing: the SAX XML reader will automatically replace prefixes for * element and attribute names when the http://xml.org/sax/features/namespaces * feature is true (the default). - * </p> + * * * <p> * There are cases, however, when applications need to use prefixes in * character data or in attribute values, where they cannot safely be expanded * automatically; the start/endPrefixMapping event supplies the information to * the application to expand prefixes in those contexts itself, if necessary. - * </p> + * * * <p> * Note that start/endPrefixMapping events are not guaranteed to be properly @@ -702,7 +702,7 @@ public class DOMBuilder implements ContentHandler, LexicalHandler { * before the corresponding startElement event, and all endPrefixMapping * events will occur after the corresponding endElement event, but their order * is not guaranteed. - * </p> + * * * @param prefix * The Namespace prefix being declared. @@ -735,7 +735,7 @@ public class DOMBuilder implements ContentHandler, LexicalHandler { * See startPrefixMapping for details. This event will always occur after the * corresponding endElement event, but the order of endPrefixMapping events is * not otherwise guaranteed. - * </p> + * * * @param prefix * The prefix that was being mapping. @@ -755,7 +755,7 @@ public class DOMBuilder implements ContentHandler, LexicalHandler { * DTD subset). All processors may skip external entities, depending on the * values of the http://xml.org/sax/features/external-general-entities and the * http://xml.org/sax/features/external-parameter-entities properties. - * </p> + * * * @param name * The name of the skipped entity. 
If it is a parameter entity, the http://git-wip-us.apache.org/repos/asf/nutch/blob/0ea78907/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java ---------------------------------------------------------------------- diff --git a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java index 8e079fb..488cacd 100644 --- a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java +++ b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java @@ -113,7 +113,7 @@ public class DOMContentUtils { /** * This is a convinience method, equivalent to - * {@link #getText(StringBuffer,Node,boolean) getText(sb, node, false)}. + * {@link #getText(StringBuilder, Node, boolean)} which passes false as third argument * */ public void getText(StringBuilder sb, Node node) { http://git-wip-us.apache.org/repos/asf/nutch/blob/0ea78907/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/XMLCharacterRecognizer.java ---------------------------------------------------------------------- diff --git a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/XMLCharacterRecognizer.java b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/XMLCharacterRecognizer.java index cfef10c..0143f06 100644 --- a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/XMLCharacterRecognizer.java +++ b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/XMLCharacterRecognizer.java @@ -35,7 +35,7 @@ public class XMLCharacterRecognizer { * Returns whether the specified <var>ch</var> conforms to the XML 1.0 * definition of whitespace. Refer to <A * href="http://www.w3.org/TR/1998/REC-xml-19980210#NT-S"> the definition of - * <CODE>S</CODE></A> for details. + * <code>S</code></A> for details. * * @param ch * Character to check as XML whitespace. 
http://git-wip-us.apache.org/repos/asf/nutch/blob/0ea78907/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java ---------------------------------------------------------------------- diff --git a/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java b/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java index a481755..a685844 100644 --- a/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java +++ b/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java @@ -79,10 +79,10 @@ public class JSParseFilter implements ParseFilter, Parser { * {@link WebPage} object relative to the URL * @param parse * {@link Parse} object holding parse status - * @param metatags - * within the {@link NutchDocument} + * @param metaTags + * within the {@link HTMLMetaTags} * @param doc - * The {@link NutchDocument} object + * The {@link DocumentFragment} object * @return parse the actual {@link Parse} object */ @Override http://git-wip-us.apache.org/repos/asf/nutch/blob/0ea78907/src/plugin/parse-swf/src/java/org/apache/nutch/parse/swf/SWFParser.java ---------------------------------------------------------------------- diff --git a/src/plugin/parse-swf/src/java/org/apache/nutch/parse/swf/SWFParser.java b/src/plugin/parse-swf/src/java/org/apache/nutch/parse/swf/SWFParser.java index a3f779a..4fbcad3 100644 --- a/src/plugin/parse-swf/src/java/org/apache/nutch/parse/swf/SWFParser.java +++ b/src/plugin/parse-swf/src/java/org/apache/nutch/parse/swf/SWFParser.java @@ -280,7 +280,7 @@ class ExtractText extends SWFTagTypesImpl { /* * There are some issues with this method: sometimes SWF files define their - * own font, so short of OCR we cannot guess what is the glyph code -> character + * own font, so short of OCR we cannot guess what is the glyph code -&gt; character * mapping. Additionally, some files don't use literal space character, instead * they adjust glyphAdvances.
We don't handle it at all - in such cases the text * will be all glued together. http://git-wip-us.apache.org/repos/asf/nutch/blob/0ea78907/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMBuilder.java ---------------------------------------------------------------------- diff --git a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMBuilder.java b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMBuilder.java index 4f4c8a7..db59d13 100644 --- a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMBuilder.java +++ b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMBuilder.java @@ -174,7 +174,7 @@ class DOMBuilder implements ContentHandler, LexicalHandler { * supply a locator: if it does so, it must supply the locator to the * application by invoking this method before invoking any of the other * methods in the ContentHandler interface. - * </p> + * * * <p> * The locator allows the application to determine the end position of any @@ -183,13 +183,13 @@ class DOMBuilder implements ContentHandler, LexicalHandler { * errors (such as character content that does not match an application's * business rules). The information returned by the locator is probably not * sufficient for use with a search engine. - * </p> + * * * <p> * Note that the locator will return correct information only during the * invocation of the events in this interface. The application should not * attempt to use it at any other time. - * </p> + * * * @param locator * An object that can return the location of any SAX document event. @@ -206,7 +206,7 @@ class DOMBuilder implements ContentHandler, LexicalHandler { * <p> * The SAX parser will invoke this method only once, before any other methods * in this interface or in DTDHandler (except for setDocumentLocator). 
- * </p> + * */ public void startDocument() throws org.xml.sax.SAXException { @@ -221,7 +221,7 @@ class DOMBuilder implements ContentHandler, LexicalHandler { * method invoked during the parse. The parser shall not invoke this method * until it has either abandoned parsing (because of an unrecoverable error) * or reached the end of input. - * </p> + * */ public void endDocument() throws org.xml.sax.SAXException { @@ -237,14 +237,14 @@ class DOMBuilder implements ContentHandler, LexicalHandler { * startElement() event (even when the element is empty). All of the element's * content will be reported, in order, before the corresponding endElement() * event. - * </p> + * * * <p> * If the element name has a namespace prefix, the prefix will still be * attached. Note that the attribute list provided will contain only * attributes with explicit values (specified or defaulted): #IMPLIED * attributes will be omitted. - * </p> + * * * * @param ns @@ -328,12 +328,12 @@ class DOMBuilder implements ContentHandler, LexicalHandler { * The SAX parser will invoke this method at the end of every element in the * XML document; there will be a corresponding startElement() event for every * endElement() event (even when the element is empty). - * </p> + * * * <p> * If the element name has a namespace prefix, the prefix will still be * attached to the name. - * </p> + * * * * @param ns @@ -373,18 +373,18 @@ class DOMBuilder implements ContentHandler, LexicalHandler { * they may split it into several chunks; however, all of the characters in * any single event must come from the same external entity, so that the * Locator provides useful information. - * </p> + * * * <p> * The application must not attempt to read from the array outside of the * specified range. - * </p> + * * * <p> * Note that some parsers will report whitespace using the * ignorableWhitespace() method rather than this one (validating parsers must * do so). 
- * </p> + * * * @param ch * The characters from the XML document. @@ -491,19 +491,19 @@ class DOMBuilder implements ContentHandler, LexicalHandler { * whitespace (see the W3C XML 1.0 recommendation, section 2.10): * non-validating parsers may also use this method if they are capable of * parsing and using content models. - * </p> + * * * <p> * SAX parsers may return all contiguous whitespace in a single chunk, or they * may split it into several chunks; however, all of the characters in any * single event must come from the same external entity, so that the Locator * provides useful information. - * </p> + * * * <p> * The application must not attempt to read from the array outside of the * specified range. - * </p> + * * * @param ch * The characters from the XML document. @@ -541,12 +541,12 @@ class DOMBuilder implements ContentHandler, LexicalHandler { * The Parser will invoke this method once for each processing instruction * found: note that processing instructions may occur before or after the main * document element. - * </p> + * * * <p> * A SAX parser should never report an XML declaration (XML 1.0, section 2.8) * or a text declaration (XML 1.0, section 4.3.1) using this method. - * </p> + * * * @param target * The processing instruction target. @@ -610,18 +610,18 @@ class DOMBuilder implements ContentHandler, LexicalHandler { * they may split it into several chunks; however, all of the characters in * any single event must come from the same external entity, so that the * Locator provides useful information. - * </p> + * * * <p> * The application must not attempt to read from the array outside of the * specified range. - * </p> + * * * <p> * Note that some parsers will report whitespace using the * ignorableWhitespace() method rather than this one (validating parsers must * do so). - * </p> + * * * @param ch * The characters from the XML document. 
@@ -689,14 +689,14 @@ class DOMBuilder implements ContentHandler, LexicalHandler { * processing: the SAX XML reader will automatically replace prefixes for * element and attribute names when the http://xml.org/sax/features/namespaces * feature is true (the default). - * </p> + * * * <p> * There are cases, however, when applications need to use prefixes in * character data or in attribute values, where they cannot safely be expanded * automatically; the start/endPrefixMapping event supplies the information to * the application to expand prefixes in those contexts itself, if necessary. - * </p> + * * * <p> * Note that start/endPrefixMapping events are not guaranteed to be properly @@ -704,7 +704,7 @@ class DOMBuilder implements ContentHandler, LexicalHandler { * before the corresponding startElement event, and all endPrefixMapping * events will occur after the corresponding endElement event, but their order * is not guaranteed. - * </p> + * * * @param prefix * The Namespace prefix being declared. @@ -737,7 +737,7 @@ class DOMBuilder implements ContentHandler, LexicalHandler { * See startPrefixMapping for details. This event will always occur after the * corresponding endElement event, but the order of endPrefixMapping events is * not otherwise guaranteed. - * </p> + * * * @param prefix * The prefix that was being mapping. @@ -757,7 +757,7 @@ class DOMBuilder implements ContentHandler, LexicalHandler { * DTD subset). All processors may skip external entities, depending on the * values of the http://xml.org/sax/features/external-general-entities and the * http://xml.org/sax/features/external-parameter-entities properties. - * </p> + * * * @param name * The name of the skipped entity. 
If it is a parameter entity, the http://git-wip-us.apache.org/repos/asf/nutch/blob/0ea78907/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/XMLCharacterRecognizer.java ---------------------------------------------------------------------- diff --git a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/XMLCharacterRecognizer.java b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/XMLCharacterRecognizer.java index d625c33..b5c95ce 100644 --- a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/XMLCharacterRecognizer.java +++ b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/XMLCharacterRecognizer.java @@ -35,7 +35,7 @@ class XMLCharacterRecognizer { * Returns whether the specified <var>ch</var> conforms to the XML 1.0 * definition of whitespace. Refer to <A * href="http://www.w3.org/TR/1998/REC-xml-19980210#NT-S"> the definition of - * <CODE>S</CODE></A> for details. + * <code>S</code></A> for details. * * @param ch * Character to check as XML whitespace.
