Author: lewismc Date: Sat Jan 14 15:45:46 2012 New Revision: 1231517 URL: http://svn.apache.org/viewvc?rev=1231517&view=rev Log: commit to try and resolve NUTCH-1176, I expect this not to work 1st time, N.B. This doesn't change or even touch syntax of code.
Modified: nutch/trunk/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java nutch/trunk/src/java/org/apache/nutch/crawl/FetchSchedule.java nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrDeleteDuplicates.java nutch/trunk/src/java/org/apache/nutch/tools/arc/ArcRecordReader.java nutch/trunk/src/java/org/apache/nutch/util/MimeUtil.java nutch/trunk/src/java/org/apache/nutch/util/NodeWalker.java nutch/trunk/src/java/org/apache/nutch/util/domain/DomainSuffix.java nutch/trunk/src/java/org/apache/nutch/util/domain/DomainSuffixes.java nutch/trunk/src/java/org/apache/nutch/util/domain/TopLevelDomain.java nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java Modified: nutch/trunk/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java?rev=1231517&r1=1231516&r2=1231517&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java (original) +++ nutch/trunk/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java Sat Jan 14 15:45:46 2012 @@ -64,6 +64,7 @@ public abstract class AbstractFetchSched * default <code>fetchInterval</code>. * * @param url URL of the page. + * * @param datum datum instance to be initialized (modified in place). */ public CrawlDatum initializeSchedule(Text url, CrawlDatum datum) { @@ -91,12 +92,15 @@ public abstract class AbstractFetchSched * marked as GONE. Default implementation increases fetchInterval by 50%, * and if it exceeds the <code>maxInterval</code> it calls * {@link #forceRefetch(Text, CrawlDatum, boolean)}. - * @param url URL of the page - * @param datum datum instance to be adjusted + * + * @param url URL of the page. + * + * @param datum datum instance to be adjusted. + * * @return adjusted page information, including all original information. 
- * NOTE: this may be a different instance than {@param datum}, but + * NOTE: this may be a different instance than {@see datum}, but * implementations should make sure that it contains at least all - * information from {@param datum}. + * information from {@see datum}. */ public CrawlDatum setPageGoneSchedule(Text url, CrawlDatum datum, long prevFetchTime, long prevModifiedTime, long fetchTime) { @@ -113,15 +117,21 @@ public abstract class AbstractFetchSched * re-tried due to transient errors. The default implementation * sets the next fetch time 1 day in the future and increases * the retry counter. - * @param url URL of the page - * @param datum page information - * @param prevFetchTime previous fetch time - * @param prevModifiedTime previous modified time - * @param fetchTime current fetch time + * + * @param url URL of the page. + * + * @param datum page information. + * + * @param prevFetchTime previous fetch time. + * + * @param prevModifiedTime previous modified time. + * + * @param fetchTime current fetch time. + * * @return adjusted page information, including all original information. - * NOTE: this may be a different instance than {@param datum}, but + * NOTE: this may be a different instance than {@see datum}, but * implementations should make sure that it contains at least all - * information from {@param datum}. + * information from {@see datum}. */ public CrawlDatum setPageRetrySchedule(Text url, CrawlDatum datum, long prevFetchTime, long prevModifiedTime, long fetchTime) { @@ -147,10 +157,14 @@ public abstract class AbstractFetchSched * {@param curTime} it returns false, and true otherwise. It will also * check that fetchTime is not too remote (more than <code>maxInterval</code>, * in which case it lowers the interval and returns true. - * @param url URL of the page - * @param datum datum instance + * + * @param url URL of the page. + * + * @param datum datum instance. 
+ * * @param curTime reference time (usually set to the time when the * fetchlist generation process was started). + * * @return true, if the page should be considered for inclusion in the current * fetchlist, otherwise false. */ @@ -173,8 +187,11 @@ public abstract class AbstractFetchSched /** * This method resets fetchTime, fetchInterval, modifiedTime, * retriesSinceFetch and page signature, so that it forces refetching. - * @param url URL of the page - * @param datum datum instance + * + * @param url URL of the page. + * + * @param datum datum instance. + * * @param asap if true, force refetch as soon as possible - this sets * the fetchTime to now. If false, force refetch whenever the next fetch * time is set. Modified: nutch/trunk/src/java/org/apache/nutch/crawl/FetchSchedule.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/FetchSchedule.java?rev=1231517&r1=1231516&r2=1231517&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/crawl/FetchSchedule.java (original) +++ nutch/trunk/src/java/org/apache/nutch/crawl/FetchSchedule.java Sat Jan 14 15:45:46 2012 @@ -43,11 +43,13 @@ public interface FetchSchedule extends C * default <code>fetchInterval</code>. * * @param url URL of the page. + * * @param datum datum instance to be initialized. + * * @return adjusted page information, including all original information. - * NOTE: this may be a different instance than {@param datum}, but + * NOTE: this may be a different instance than {@see datum}, but * implementations should make sure that it contains at least all - * information from {@param datum}. + * information from {@see datum}. */ public CrawlDatum initializeSchedule(Text url, CrawlDatum datum); @@ -58,23 +60,30 @@ public interface FetchSchedule extends C * schedules. * * @param url url of the page + * * @param datum page description to be adjusted. 
NOTE: this instance, passed by reference, * may be modified inside the method. - * @param prevFetchTime previous value of fetch time, or 0 if not available - * @param prevModifiedTime previous value of modifiedTime, or 0 if not available + * + * @param prevFetchTime previous value of fetch time, or 0 if not available. + * + * @param prevModifiedTime previous value of modifiedTime, or 0 if not available. + * * @param fetchTime the latest time, when the page was recently re-fetched. Most FetchSchedule - * implementations should update the value in {@param datum} to something greater than this value. + * implementations should update the value in {@see datum} to something greater than this value. + * * @param modifiedTime last time the content was modified. This information comes from * the protocol implementations, or is set to < 0 if not available. Most FetchSchedule - * implementations should update the value in {@param datum} to this value. + * implementations should update the value in {@see datum} to this value. + * * @param state if {@link #STATUS_MODIFIED}, then the content is considered to be "changed" before the * <code>fetchTime</code>, if {@link #STATUS_NOTMODIFIED} then the content is known to be unchanged. * This information may be obtained by comparing page signatures before and after fetching. If this * is set to {@link #STATUS_UNKNOWN}, then it is unknown whether the page was changed; implementations * are free to follow a sensible default behavior. + * * @return adjusted page information, including all original information. NOTE: this may - * be a different instance than {@param datum}, but implementations should make sure that - * it contains at least all information from {@param datum}. + * be a different instance than {@see datum}, but implementations should make sure that + * it contains at least all information from {@see datum}. 
*/ public CrawlDatum setFetchSchedule(Text url, CrawlDatum datum, long prevFetchTime, long prevModifiedTime, @@ -85,12 +94,15 @@ public interface FetchSchedule extends C * marked as GONE. Default implementation increases fetchInterval by 50%, * and if it exceeds the <code>maxInterval</code> it calls * {@link #forceRefetch(Text, CrawlDatum, boolean)}. + * * @param url URL of the page - * @param datum datum instance to be adjusted + * + * @param datum datum instance to be adjusted. + * * @return adjusted page information, including all original information. - * NOTE: this may be a different instance than {@param datum}, but + * NOTE: this may be a different instance than {@see datum}, but * implementations should make sure that it contains at least all - * information from {@param datum}. + * information from {@see datum}. */ public CrawlDatum setPageGoneSchedule(Text url, CrawlDatum datum, long prevFetchTime, long prevModifiedTime, long fetchTime); @@ -100,15 +112,21 @@ public interface FetchSchedule extends C * re-tried due to transient errors. The default implementation * sets the next fetch time 1 day in the future and increases the * retry counter. - * @param url URL of the page - * @param datum page information - * @param prevFetchTime previous fetch time - * @param prevModifiedTime previous modified time - * @param fetchTime current fetch time + * + * @param url URL of the page. + * + * @param datum page information. + * + * @param prevFetchTime previous fetch time. + * + * @param prevModifiedTime previous modified time. + * + * @param fetchTime current fetch time. + * * @return adjusted page information, including all original information. - * NOTE: this may be a different instance than {@param datum}, but + * NOTE: this may be a different instance than {@see datum}, but * implementations should make sure that it contains at least all - * information from {@param datum}. + * information from {@see datum}. 
*/ public CrawlDatum setPageRetrySchedule(Text url, CrawlDatum datum, long prevFetchTime, long prevModifiedTime, long fetchTime); @@ -125,13 +143,17 @@ public interface FetchSchedule extends C * guarantee that the page will be fetched, it just allows it to be * included in the further selection process based on scores. The default * implementation checks <code>fetchTime</code>, if it is higher than the - * {@param curTime} it returns false, and true otherwise. It will also + * {@see curTime} it returns false, and true otherwise. It will also * check that fetchTime is not too remote (more than <code>maxInterval</code), * in which case it lowers the interval and returns true. - * @param url URL of the page - * @param datum datum instance + * + * @param url URL of the page. + * + * @param datum datum instance. + * * @param curTime reference time (usually set to the time when the * fetchlist generation process was started). + * * @return true, if the page should be considered for inclusion in the current * fetchlist, otherwise false. */ @@ -140,15 +162,19 @@ public interface FetchSchedule extends C /** * This method resets fetchTime, fetchInterval, modifiedTime and * page signature, so that it forces refetching. - * @param url URL of the page - * @param datum datum instance + * + * @param url URL of the page. + * + * @param datum datum instance. + * * @param asap if true, force refetch as soon as possible - this sets * the fetchTime to now. If false, force refetch whenever the next fetch * time is set. + * * @return adjusted page information, including all original information. - * NOTE: this may be a different instance than {@param datum}, but + * NOTE: this may be a different instance than {@see datum}, but * implementations should make sure that it contains at least all - * information from {@param datum}. + * information from {@see datum}. 
*/ public CrawlDatum forceRefetch(Text url, CrawlDatum datum, boolean asap); } Modified: nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrDeleteDuplicates.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrDeleteDuplicates.java?rev=1231517&r1=1231516&r2=1231517&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrDeleteDuplicates.java (original) +++ nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrDeleteDuplicates.java Sat Jan 14 15:45:46 2012 @@ -77,7 +77,7 @@ import org.apache.solr.common.SolrDocume * </li> * </ul> * - * Note that unlike {@link DeleteDuplicates} we assume that two documents in + * Note that unlike {@code DeleteDuplicates} we assume that two documents in * a solr index will never have the same URL. So this class only deals with * documents with <b>different</b> URLs but the same digest. */ Modified: nutch/trunk/src/java/org/apache/nutch/tools/arc/ArcRecordReader.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/arc/ArcRecordReader.java?rev=1231517&r1=1231516&r2=1231517&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/tools/arc/ArcRecordReader.java (original) +++ nutch/trunk/src/java/org/apache/nutch/tools/arc/ArcRecordReader.java Sat Jan 14 15:45:46 2012 @@ -40,12 +40,12 @@ import org.apache.hadoop.util.StringUtil * <p>Arc files are essentially tars of gzips. Each record in an arc file is * a compressed gzip. Multiple records are concatenated together to form a * complete arc. 
For more information on the arc file format see - * {@link http://www.archive.org/web/researcher/ArcFileFormat.php}.</p> + * <a href="http://www.archive.org/web/researcher/ArcFileFormat.php">ArcFileFormat</a>.</p> * * <p>Arc files are used by the internet archive and grub projects.</p> * - * @see http://www.archive.org/ - * @see http://www.grub.org/ + * @see <a href="http://www.archive.org/">archive.org</a> + * @see <a href="http://www.grub.org/">grub.org</a> */ public class ArcRecordReader implements RecordReader<Text, BytesWritable> { Modified: nutch/trunk/src/java/org/apache/nutch/util/MimeUtil.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/MimeUtil.java?rev=1231517&r1=1231516&r2=1231517&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/util/MimeUtil.java (original) +++ nutch/trunk/src/java/org/apache/nutch/util/MimeUtil.java Sat Jan 14 15:45:46 2012 @@ -29,9 +29,14 @@ import org.apache.tika.mime.MimeType; import org.apache.tika.mime.MimeTypeException; import org.apache.tika.mime.MimeTypes; import org.apache.tika.mime.MimeTypesFactory; + +// Slf4j logging imports import org.slf4j.Logger; import org.slf4j.LoggerFactory; +// imported for Javadoc +import org.apache.nutch.protocol.ProtocolOutput; + /** * @author mattmann * @since NUTCH-608 @@ -123,7 +128,7 @@ public final class MimeUtil { * <code>typeName</code> is cleaned, with {@link #cleanMimeType(String)}. * Then the cleaned mime type is looked up in the underlying Tika * {@link MimeTypes} registry, by its cleaned name. If the {@link MimeType} is - * found, then that mime type is used, otherwise {@link URL} resolution is + * found, then that mime type is used, otherwise URL resolution is * used to try and determine the mime type. 
If that means is unsuccessful, and * if <code>mime.type.magic</code> is enabled in {@link NutchConfiguration}, * then mime type magic resolution is used to try and obtain a @@ -132,7 +137,7 @@ public final class MimeUtil { * @param typeName * The original mime type, returned from a {@link ProtocolOutput}. * @param url - * The given {@link URL}, that Nutch was trying to crawl. + * The given <code>url</code>, that Nutch was trying to crawl. * @param data * The byte data, returned from the crawl, if any. * @return The correctly, automatically guessed {@link MimeType} name. @@ -254,4 +259,4 @@ public final class MimeUtil { } -} \ No newline at end of file +} Modified: nutch/trunk/src/java/org/apache/nutch/util/NodeWalker.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/NodeWalker.java?rev=1231517&r1=1231516&r2=1231517&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/util/NodeWalker.java (original) +++ nutch/trunk/src/java/org/apache/nutch/util/NodeWalker.java Sat Jan 14 15:45:46 2012 @@ -102,8 +102,8 @@ public class NodeWalker { } /** - * Returns true if there are more nodes on the current stack. - * @return + * @return returns true if there are more nodes on the current stack. 
+ * */ public boolean hasNext() { return (nodes.size() > 0); Modified: nutch/trunk/src/java/org/apache/nutch/util/domain/DomainSuffix.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/domain/DomainSuffix.java?rev=1231517&r1=1231516&r2=1231517&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/util/domain/DomainSuffix.java (original) +++ nutch/trunk/src/java/org/apache/nutch/util/domain/DomainSuffix.java Sat Jan 14 15:45:46 2012 @@ -32,7 +32,7 @@ package org.apache.nutch.util.domain; * * @author Enis Soztutar <enis.soz.nu...@gmail.com> * @see TopLevelDomain - * @see domain-suffixes.xml + * for info please see conf/domain-suffixes.xml */ public class DomainSuffix { Modified: nutch/trunk/src/java/org/apache/nutch/util/domain/DomainSuffixes.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/domain/DomainSuffixes.java?rev=1231517&r1=1231516&r2=1231517&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/util/domain/DomainSuffixes.java (original) +++ nutch/trunk/src/java/org/apache/nutch/util/domain/DomainSuffixes.java Sat Jan 14 15:45:46 2012 @@ -50,7 +50,7 @@ public class DomainSuffixes { /** * Singleton instance, lazy instantination - * @return + * @return returns the domain suffix instance */ public static DomainSuffixes getInstance() { if(instance == null) { Modified: nutch/trunk/src/java/org/apache/nutch/util/domain/TopLevelDomain.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/domain/TopLevelDomain.java?rev=1231517&r1=1231516&r2=1231517&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/util/domain/TopLevelDomain.java (original) +++ nutch/trunk/src/java/org/apache/nutch/util/domain/TopLevelDomain.java Sat Jan 14 15:45:46 2012 @@ -22,9 
+22,12 @@ package org.apache.nutch.util.domain; * Internet domain name; that is, the letters which follow the final * dot of any domain name. For example, in the domain name * <code>www.website.com</code>, the top-level domain is <code>com</code>. + * * @author Enis Soztutar <enis.soz.nu...@gmail.com> - * @see http://www.iana.org/ - * @see http://en.wikipedia.org/wiki/Top-level_domain + * + * @see <a href="http://www.iana.org/"> iana.org</a> + * + * @see <a href="http://en.wikipedia.org/wiki/Top-level_domain"> Top-level_domain</a> */ public class TopLevelDomain extends DomainSuffix { Modified: nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java?rev=1231517&r1=1231516&r2=1231517&view=diff ============================================================================== --- nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java (original) +++ nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java Sat Jan 14 15:45:46 2012 @@ -43,8 +43,8 @@ import org.apache.nutch.net.*; * regular expressions. * * <p>The regular expressions rules are expressed in a file. The file of rules - * is provided by each implementation using the - * {@link #getRulesFile(Configuration)} method.</p> + * is determined for each implementation using the + * {@link #getRulesReader(Configuration conf)} method.</p> * * <p>The format of this file is made of many rules (one per line):<br/> * <code>