Author: lewismc
Date: Sat Jan 14 15:45:46 2012
New Revision: 1231517
URL: http://svn.apache.org/viewvc?rev=1231517&view=rev
Log:
commit to try and resolve NUTCH-1176, I expect this not to work 1st time, N.B.
This doesn't change or even touch syntax of code.
Modified:
nutch/trunk/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java
nutch/trunk/src/java/org/apache/nutch/crawl/FetchSchedule.java
nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrDeleteDuplicates.java
nutch/trunk/src/java/org/apache/nutch/tools/arc/ArcRecordReader.java
nutch/trunk/src/java/org/apache/nutch/util/MimeUtil.java
nutch/trunk/src/java/org/apache/nutch/util/NodeWalker.java
nutch/trunk/src/java/org/apache/nutch/util/domain/DomainSuffix.java
nutch/trunk/src/java/org/apache/nutch/util/domain/DomainSuffixes.java
nutch/trunk/src/java/org/apache/nutch/util/domain/TopLevelDomain.java
nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java
Modified: nutch/trunk/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java?rev=1231517&r1=1231516&r2=1231517&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java
(original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java Sat
Jan 14 15:45:46 2012
@@ -64,6 +64,7 @@ public abstract class AbstractFetchSched
* default <code>fetchInterval</code>.
*
* @param url URL of the page.
+ *
* @param datum datum instance to be initialized (modified in place).
*/
public CrawlDatum initializeSchedule(Text url, CrawlDatum datum) {
@@ -91,12 +92,15 @@ public abstract class AbstractFetchSched
* marked as GONE. Default implementation increases fetchInterval by 50%,
* and if it exceeds the <code>maxInterval</code> it calls
* {@link #forceRefetch(Text, CrawlDatum, boolean)}.
- * @param url URL of the page
- * @param datum datum instance to be adjusted
+ *
+ * @param url URL of the page.
+ *
+ * @param datum datum instance to be adjusted.
+ *
* @return adjusted page information, including all original information.
- * NOTE: this may be a different instance than {@param datum}, but
+ * NOTE: this may be a different instance than {@code datum}, but
* implementations should make sure that it contains at least all
- * information from {@param datum}.
+ * information from {@code datum}.
*/
public CrawlDatum setPageGoneSchedule(Text url, CrawlDatum datum,
long prevFetchTime, long prevModifiedTime, long fetchTime) {
@@ -113,15 +117,21 @@ public abstract class AbstractFetchSched
* re-tried due to transient errors. The default implementation
* sets the next fetch time 1 day in the future and increases
* the retry counter.
- * @param url URL of the page
- * @param datum page information
- * @param prevFetchTime previous fetch time
- * @param prevModifiedTime previous modified time
- * @param fetchTime current fetch time
+ *
+ * @param url URL of the page.
+ *
+ * @param datum page information.
+ *
+ * @param prevFetchTime previous fetch time.
+ *
+ * @param prevModifiedTime previous modified time.
+ *
+ * @param fetchTime current fetch time.
+ *
* @return adjusted page information, including all original information.
- * NOTE: this may be a different instance than {@param datum}, but
+ * NOTE: this may be a different instance than {@code datum}, but
* implementations should make sure that it contains at least all
- * information from {@param datum}.
+ * information from {@code datum}.
*/
public CrawlDatum setPageRetrySchedule(Text url, CrawlDatum datum,
long prevFetchTime, long prevModifiedTime, long fetchTime) {
@@ -147,10 +157,14 @@ public abstract class AbstractFetchSched
* {@param curTime} it returns false, and true otherwise. It will also
* check that fetchTime is not too remote (more than
<code>maxInterval</code>,
* in which case it lowers the interval and returns true.
- * @param url URL of the page
- * @param datum datum instance
+ *
+ * @param url URL of the page.
+ *
+ * @param datum datum instance.
+ *
* @param curTime reference time (usually set to the time when the
* fetchlist generation process was started).
+ *
* @return true, if the page should be considered for inclusion in the
current
* fetchlist, otherwise false.
*/
@@ -173,8 +187,11 @@ public abstract class AbstractFetchSched
/**
* This method resets fetchTime, fetchInterval, modifiedTime,
* retriesSinceFetch and page signature, so that it forces refetching.
- * @param url URL of the page
- * @param datum datum instance
+ *
+ * @param url URL of the page.
+ *
+ * @param datum datum instance.
+ *
* @param asap if true, force refetch as soon as possible - this sets
* the fetchTime to now. If false, force refetch whenever the next fetch
* time is set.
Modified: nutch/trunk/src/java/org/apache/nutch/crawl/FetchSchedule.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/FetchSchedule.java?rev=1231517&r1=1231516&r2=1231517&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/FetchSchedule.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/FetchSchedule.java Sat Jan 14
15:45:46 2012
@@ -43,11 +43,13 @@ public interface FetchSchedule extends C
* default <code>fetchInterval</code>.
*
* @param url URL of the page.
+ *
* @param datum datum instance to be initialized.
+ *
* @return adjusted page information, including all original information.
- * NOTE: this may be a different instance than {@param datum}, but
+ * NOTE: this may be a different instance than {@code datum}, but
* implementations should make sure that it contains at least all
- * information from {@param datum}.
+ * information from {@code datum}.
*/
public CrawlDatum initializeSchedule(Text url, CrawlDatum datum);
@@ -58,23 +60,30 @@ public interface FetchSchedule extends C
* schedules.
*
* @param url url of the page
+ *
* @param datum page description to be adjusted. NOTE: this instance, passed
by reference,
* may be modified inside the method.
- * @param prevFetchTime previous value of fetch time, or 0 if not available
- * @param prevModifiedTime previous value of modifiedTime, or 0 if not
available
+ *
+ * @param prevFetchTime previous value of fetch time, or 0 if not available.
+ *
+ * @param prevModifiedTime previous value of modifiedTime, or 0 if not
available.
+ *
* @param fetchTime the latest time, when the page was recently re-fetched.
Most FetchSchedule
- * implementations should update the value in {@param datum} to something
greater than this value.
+ * implementations should update the value in {@code datum} to something
greater than this value.
+ *
* @param modifiedTime last time the content was modified. This information
comes from
* the protocol implementations, or is set to < 0 if not available. Most
FetchSchedule
- * implementations should update the value in {@param datum} to this value.
+ * implementations should update the value in {@code datum} to this value.
+ *
* @param state if {@link #STATUS_MODIFIED}, then the content is considered
to be "changed" before the
* <code>fetchTime</code>, if {@link #STATUS_NOTMODIFIED} then the content
is known to be unchanged.
* This information may be obtained by comparing page signatures before and
after fetching. If this
* is set to {@link #STATUS_UNKNOWN}, then it is unknown whether the page
was changed; implementations
* are free to follow a sensible default behavior.
+ *
* @return adjusted page information, including all original information.
NOTE: this may
- * be a different instance than {@param datum}, but implementations should
make sure that
- * it contains at least all information from {@param datum}.
+ * be a different instance than {@code datum}, but implementations should
make sure that
+ * it contains at least all information from {@code datum}.
*/
public CrawlDatum setFetchSchedule(Text url, CrawlDatum datum,
long prevFetchTime, long prevModifiedTime,
@@ -85,12 +94,15 @@ public interface FetchSchedule extends C
* marked as GONE. Default implementation increases fetchInterval by 50%,
* and if it exceeds the <code>maxInterval</code> it calls
* {@link #forceRefetch(Text, CrawlDatum, boolean)}.
+ *
* @param url URL of the page
- * @param datum datum instance to be adjusted
+ *
+ * @param datum datum instance to be adjusted.
+ *
* @return adjusted page information, including all original information.
- * NOTE: this may be a different instance than {@param datum}, but
+ * NOTE: this may be a different instance than {@code datum}, but
* implementations should make sure that it contains at least all
- * information from {@param datum}.
+ * information from {@code datum}.
*/
public CrawlDatum setPageGoneSchedule(Text url, CrawlDatum datum,
long prevFetchTime, long prevModifiedTime, long fetchTime);
@@ -100,15 +112,21 @@ public interface FetchSchedule extends C
* re-tried due to transient errors. The default implementation
* sets the next fetch time 1 day in the future and increases the
* retry counter.
- * @param url URL of the page
- * @param datum page information
- * @param prevFetchTime previous fetch time
- * @param prevModifiedTime previous modified time
- * @param fetchTime current fetch time
+ *
+ * @param url URL of the page.
+ *
+ * @param datum page information.
+ *
+ * @param prevFetchTime previous fetch time.
+ *
+ * @param prevModifiedTime previous modified time.
+ *
+ * @param fetchTime current fetch time.
+ *
* @return adjusted page information, including all original information.
- * NOTE: this may be a different instance than {@param datum}, but
+ * NOTE: this may be a different instance than {@code datum}, but
* implementations should make sure that it contains at least all
- * information from {@param datum}.
+ * information from {@code datum}.
*/
public CrawlDatum setPageRetrySchedule(Text url, CrawlDatum datum,
long prevFetchTime, long prevModifiedTime, long fetchTime);
@@ -125,13 +143,17 @@ public interface FetchSchedule extends C
* guarantee that the page will be fetched, it just allows it to be
* included in the further selection process based on scores. The default
* implementation checks <code>fetchTime</code>, if it is higher than the
- * {@param curTime} it returns false, and true otherwise. It will also
+ * {@code curTime} it returns false, and true otherwise. It will also
* check that fetchTime is not too remote (more than
<code>maxInterval</code>,
* in which case it lowers the interval and returns true.
- * @param url URL of the page
- * @param datum datum instance
+ *
+ * @param url URL of the page.
+ *
+ * @param datum datum instance.
+ *
* @param curTime reference time (usually set to the time when the
* fetchlist generation process was started).
+ *
* @return true, if the page should be considered for inclusion in the
current
* fetchlist, otherwise false.
*/
@@ -140,15 +162,19 @@ public interface FetchSchedule extends C
/**
* This method resets fetchTime, fetchInterval, modifiedTime and
* page signature, so that it forces refetching.
- * @param url URL of the page
- * @param datum datum instance
+ *
+ * @param url URL of the page.
+ *
+ * @param datum datum instance.
+ *
* @param asap if true, force refetch as soon as possible - this sets
* the fetchTime to now. If false, force refetch whenever the next fetch
* time is set.
+ *
* @return adjusted page information, including all original information.
- * NOTE: this may be a different instance than {@param datum}, but
+ * NOTE: this may be a different instance than {@code datum}, but
* implementations should make sure that it contains at least all
- * information from {@param datum}.
+ * information from {@code datum}.
*/
public CrawlDatum forceRefetch(Text url, CrawlDatum datum, boolean asap);
}
Modified:
nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrDeleteDuplicates.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrDeleteDuplicates.java?rev=1231517&r1=1231516&r2=1231517&view=diff
==============================================================================
---
nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrDeleteDuplicates.java
(original)
+++
nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrDeleteDuplicates.java
Sat Jan 14 15:45:46 2012
@@ -77,7 +77,7 @@ import org.apache.solr.common.SolrDocume
* </li>
* </ul>
*
- * Note that unlike {@link DeleteDuplicates} we assume that two documents in
+ * Note that unlike {@link DeleteDuplicates} we assume that two documents in
* a solr index will never have the same URL. So this class only deals with
* documents with <b>different</b> URLs but the same digest.
*/
Modified: nutch/trunk/src/java/org/apache/nutch/tools/arc/ArcRecordReader.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/arc/ArcRecordReader.java?rev=1231517&r1=1231516&r2=1231517&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/tools/arc/ArcRecordReader.java
(original)
+++ nutch/trunk/src/java/org/apache/nutch/tools/arc/ArcRecordReader.java Sat
Jan 14 15:45:46 2012
@@ -40,12 +40,12 @@ import org.apache.hadoop.util.StringUtil
* <p>Arc files are essentially tars of gzips. Each record in an arc file is
* a compressed gzip. Multiple records are concatenated together to form a
* complete arc. For more information on the arc file format see
- * {@link http://www.archive.org/web/researcher/ArcFileFormat.php}.</p>
+ * <a href="http://www.archive.org/web/researcher/ArcFileFormat.php">the Arc file format specification</a>.</p>
*
* <p>Arc files are used by the internet archive and grub projects.</p>
*
- * @see http://www.archive.org/
- * @see http://www.grub.org/
+ * @see <a href="http://www.archive.org/">archive.org</a>
+ * @see <a href="http://www.grub.org/">grub.org</a>
*/
public class ArcRecordReader
implements RecordReader<Text, BytesWritable> {
Modified: nutch/trunk/src/java/org/apache/nutch/util/MimeUtil.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/MimeUtil.java?rev=1231517&r1=1231516&r2=1231517&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/util/MimeUtil.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/util/MimeUtil.java Sat Jan 14
15:45:46 2012
@@ -29,9 +29,14 @@ import org.apache.tika.mime.MimeType;
import org.apache.tika.mime.MimeTypeException;
import org.apache.tika.mime.MimeTypes;
import org.apache.tika.mime.MimeTypesFactory;
+
+// Slf4j logging imports
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+// imported for Javadoc
+import org.apache.nutch.protocol.ProtocolOutput;
+
/**
* @author mattmann
* @since NUTCH-608
@@ -123,7 +128,7 @@ public final class MimeUtil {
* <code>typeName</code> is cleaned, with {@link #cleanMimeType(String)}.
* Then the cleaned mime type is looked up in the underlying Tika
* {@link MimeTypes} registry, by its cleaned name. If the {@link MimeType}
is
- * found, then that mime type is used, otherwise {@link URL} resolution is
+ * found, then that mime type is used, otherwise URL resolution is
* used to try and determine the mime type. If that means is unsuccessful,
and
* if <code>mime.type.magic</code> is enabled in {@link NutchConfiguration},
* then mime type magic resolution is used to try and obtain a
@@ -132,7 +137,7 @@ public final class MimeUtil {
* @param typeName
* The original mime type, returned from a {@link ProtocolOutput}.
* @param url
- * The given {@link URL}, that Nutch was trying to crawl.
+ * The given {@code url}, that Nutch was trying to crawl.
* @param data
* The byte data, returned from the crawl, if any.
* @return The correctly, automatically guessed {@link MimeType} name.
@@ -254,4 +259,4 @@ public final class MimeUtil {
}
-}
\ No newline at end of file
+}
Modified: nutch/trunk/src/java/org/apache/nutch/util/NodeWalker.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/NodeWalker.java?rev=1231517&r1=1231516&r2=1231517&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/util/NodeWalker.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/util/NodeWalker.java Sat Jan 14
15:45:46 2012
@@ -102,8 +102,8 @@ public class NodeWalker {
}
/**
- * Returns true if there are more nodes on the current stack.
- * @return
+ * @return true if there are more nodes on the current stack.
+ *
*/
public boolean hasNext() {
return (nodes.size() > 0);
Modified: nutch/trunk/src/java/org/apache/nutch/util/domain/DomainSuffix.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/domain/DomainSuffix.java?rev=1231517&r1=1231516&r2=1231517&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/util/domain/DomainSuffix.java
(original)
+++ nutch/trunk/src/java/org/apache/nutch/util/domain/DomainSuffix.java Sat Jan
14 15:45:46 2012
@@ -32,7 +32,7 @@ package org.apache.nutch.util.domain;
*
* @author Enis Soztutar <[email protected]>
* @see TopLevelDomain
- * @see domain-suffixes.xml
+ * for info please see conf/domain-suffixes.xml
*/
public class DomainSuffix {
Modified: nutch/trunk/src/java/org/apache/nutch/util/domain/DomainSuffixes.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/domain/DomainSuffixes.java?rev=1231517&r1=1231516&r2=1231517&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/util/domain/DomainSuffixes.java
(original)
+++ nutch/trunk/src/java/org/apache/nutch/util/domain/DomainSuffixes.java Sat
Jan 14 15:45:46 2012
@@ -50,7 +50,7 @@ public class DomainSuffixes {
/**
* Singleton instance, lazy instantination
- * @return
+ * @return the DomainSuffixes instance
*/
public static DomainSuffixes getInstance() {
if(instance == null) {
Modified: nutch/trunk/src/java/org/apache/nutch/util/domain/TopLevelDomain.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/domain/TopLevelDomain.java?rev=1231517&r1=1231516&r2=1231517&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/util/domain/TopLevelDomain.java
(original)
+++ nutch/trunk/src/java/org/apache/nutch/util/domain/TopLevelDomain.java Sat
Jan 14 15:45:46 2012
@@ -22,9 +22,12 @@ package org.apache.nutch.util.domain;
* Internet domain name; that is, the letters which follow the final
* dot of any domain name. For example, in the domain name
* <code>www.website.com</code>, the top-level domain is <code>com</code>.
+ *
* @author Enis Soztutar <[email protected]>
- * @see http://www.iana.org/
- * @see http://en.wikipedia.org/wiki/Top-level_domain
+ *
+ * @see <a href="http://www.iana.org/"> iana.org</a>
+ *
+ * @see <a href="http://en.wikipedia.org/wiki/Top-level_domain">
Top-level_domain</a>
*/
public class TopLevelDomain extends DomainSuffix {
Modified:
nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java?rev=1231517&r1=1231516&r2=1231517&view=diff
==============================================================================
---
nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java
(original)
+++
nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java
Sat Jan 14 15:45:46 2012
@@ -43,8 +43,8 @@ import org.apache.nutch.net.*;
* regular expressions.
*
* <p>The regular expressions rules are expressed in a file. The file of rules
- * is provided by each implementation using the
- * {@link #getRulesFile(Configuration)} method.</p>
+ * is determined for each implementation using the
+ * {@link #getRulesReader(Configuration)} method.</p>
*
* <p>The format of this file is made of many rules (one per line):<br/>
* <code>