Author: lewismc Date: Sat Jan 14 15:45:46 2012 New Revision: 1231517 URL: http://svn.apache.org/viewvc?rev=1231517&view=rev Log: commit to try and resolve NUTCH-1176, I expect this not to work 1st time, N.B. This doesn't change or even touch syntax of code.
Modified: nutch/trunk/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java nutch/trunk/src/java/org/apache/nutch/crawl/FetchSchedule.java nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrDeleteDuplicates.java nutch/trunk/src/java/org/apache/nutch/tools/arc/ArcRecordReader.java nutch/trunk/src/java/org/apache/nutch/util/MimeUtil.java nutch/trunk/src/java/org/apache/nutch/util/NodeWalker.java nutch/trunk/src/java/org/apache/nutch/util/domain/DomainSuffix.java nutch/trunk/src/java/org/apache/nutch/util/domain/DomainSuffixes.java nutch/trunk/src/java/org/apache/nutch/util/domain/TopLevelDomain.java nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java Modified: nutch/trunk/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java?rev=1231517&r1=1231516&r2=1231517&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java (original) +++ nutch/trunk/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java Sat Jan 14 15:45:46 2012 @@ -64,6 +64,7 @@ public abstract class AbstractFetchSched * default <code>fetchInterval</code>. * * @param url URL of the page. + * * @param datum datum instance to be initialized (modified in place). */ public CrawlDatum initializeSchedule(Text url, CrawlDatum datum) { @@ -91,12 +92,15 @@ public abstract class AbstractFetchSched * marked as GONE. Default implementation increases fetchInterval by 50%, * and if it exceeds the <code>maxInterval</code> it calls * {@link #forceRefetch(Text, CrawlDatum, boolean)}. - * @param url URL of the page - * @param datum datum instance to be adjusted + * + * @param url URL of the page. + * + * @param datum datum instance to be adjusted. + * * @return adjusted page information, including all original information. 
- * NOTE: this may be a different instance than {@param datum}, but + * NOTE: this may be a different instance than {@see datum}, but * implementations should make sure that it contains at least all - * information from {@param datum}. + * information from {@see datum}. */ public CrawlDatum setPageGoneSchedule(Text url, CrawlDatum datum, long prevFetchTime, long prevModifiedTime, long fetchTime) { @@ -113,15 +117,21 @@ public abstract class AbstractFetchSched * re-tried due to transient errors. The default implementation * sets the next fetch time 1 day in the future and increases * the retry counter. - * @param url URL of the page - * @param datum page information - * @param prevFetchTime previous fetch time - * @param prevModifiedTime previous modified time - * @param fetchTime current fetch time + * + * @param url URL of the page. + * + * @param datum page information. + * + * @param prevFetchTime previous fetch time. + * + * @param prevModifiedTime previous modified time. + * + * @param fetchTime current fetch time. + * * @return adjusted page information, including all original information. - * NOTE: this may be a different instance than {@param datum}, but + * NOTE: this may be a different instance than {@see datum}, but * implementations should make sure that it contains at least all - * information from {@param datum}. + * information from {@see datum}. */ public CrawlDatum setPageRetrySchedule(Text url, CrawlDatum datum, long prevFetchTime, long prevModifiedTime, long fetchTime) { @@ -147,10 +157,14 @@ public abstract class AbstractFetchSched * {@param curTime} it returns false, and true otherwise. It will also * check that fetchTime is not too remote (more than <code>maxInterval</code>, * in which case it lowers the interval and returns true. - * @param url URL of the page - * @param datum datum instance + * + * @param url URL of the page. + * + * @param datum datum instance. 
+ * * @param curTime reference time (usually set to the time when the * fetchlist generation process was started). + * * @return true, if the page should be considered for inclusion in the current * fetchlist, otherwise false. */ @@ -173,8 +187,11 @@ public abstract class AbstractFetchSched /** * This method resets fetchTime, fetchInterval, modifiedTime, * retriesSinceFetch and page signature, so that it forces refetching. - * @param url URL of the page - * @param datum datum instance + * + * @param url URL of the page. + * + * @param datum datum instance. + * * @param asap if true, force refetch as soon as possible - this sets * the fetchTime to now. If false, force refetch whenever the next fetch * time is set. Modified: nutch/trunk/src/java/org/apache/nutch/crawl/FetchSchedule.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/FetchSchedule.java?rev=1231517&r1=1231516&r2=1231517&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/crawl/FetchSchedule.java (original) +++ nutch/trunk/src/java/org/apache/nutch/crawl/FetchSchedule.java Sat Jan 14 15:45:46 2012 @@ -43,11 +43,13 @@ public interface FetchSchedule extends C * default <code>fetchInterval</code>. * * @param url URL of the page. + * * @param datum datum instance to be initialized. + * * @return adjusted page information, including all original information. - * NOTE: this may be a different instance than {@param datum}, but + * NOTE: this may be a different instance than {@see datum}, but * implementations should make sure that it contains at least all - * information from {@param datum}. + * information from {@see datum}. */ public CrawlDatum initializeSchedule(Text url, CrawlDatum datum); @@ -58,23 +60,30 @@ public interface FetchSchedule extends C * schedules. * * @param url url of the page + * * @param datum page description to be adjusted. 
NOTE: this instance, passed by reference, * may be modified inside the method. - * @param prevFetchTime previous value of fetch time, or 0 if not available - * @param prevModifiedTime previous value of modifiedTime, or 0 if not available + * + * @param prevFetchTime previous value of fetch time, or 0 if not available. + * + * @param prevModifiedTime previous value of modifiedTime, or 0 if not available. + * * @param fetchTime the latest time, when the page was recently re-fetched. Most FetchSchedule - * implementations should update the value in {@param datum} to something greater than this value. + * implementations should update the value in {@see datum} to something greater than this value. + * * @param modifiedTime last time the content was modified. This information comes from * the protocol implementations, or is set to < 0 if not available. Most FetchSchedule - * implementations should update the value in {@param datum} to this value. + * implementations should update the value in {@see datum} to this value. + * * @param state if {@link #STATUS_MODIFIED}, then the content is considered to be "changed" before the * <code>fetchTime</code>, if {@link #STATUS_NOTMODIFIED} then the content is known to be unchanged. * This information may be obtained by comparing page signatures before and after fetching. If this * is set to {@link #STATUS_UNKNOWN}, then it is unknown whether the page was changed; implementations * are free to follow a sensible default behavior. + * * @return adjusted page information, including all original information. NOTE: this may - * be a different instance than {@param datum}, but implementations should make sure that - * it contains at least all information from {@param datum}. + * be a different instance than {@see datum}, but implementations should make sure that + * it contains at least all information from {@see datum}. 
*/ public CrawlDatum setFetchSchedule(Text url, CrawlDatum datum, long prevFetchTime, long prevModifiedTime, @@ -85,12 +94,15 @@ public interface FetchSchedule extends C * marked as GONE. Default implementation increases fetchInterval by 50%, * and if it exceeds the <code>maxInterval</code> it calls * {@link #forceRefetch(Text, CrawlDatum, boolean)}. + * * @param url URL of the page - * @param datum datum instance to be adjusted + * + * @param datum datum instance to be adjusted. + * * @return adjusted page information, including all original information. - * NOTE: this may be a different instance than {@param datum}, but + * NOTE: this may be a different instance than {@see datum}, but * implementations should make sure that it contains at least all - * information from {@param datum}. + * information from {@see datum}. */ public CrawlDatum setPageGoneSchedule(Text url, CrawlDatum datum, long prevFetchTime, long prevModifiedTime, long fetchTime); @@ -100,15 +112,21 @@ public interface FetchSchedule extends C * re-tried due to transient errors. The default implementation * sets the next fetch time 1 day in the future and increases the * retry counter. - * @param url URL of the page - * @param datum page information - * @param prevFetchTime previous fetch time - * @param prevModifiedTime previous modified time - * @param fetchTime current fetch time + * + * @param url URL of the page. + * + * @param datum page information. + * + * @param prevFetchTime previous fetch time. + * + * @param prevModifiedTime previous modified time. + * + * @param fetchTime current fetch time. + * * @return adjusted page information, including all original information. - * NOTE: this may be a different instance than {@param datum}, but + * NOTE: this may be a different instance than {@see datum}, but * implementations should make sure that it contains at least all - * information from {@param datum}. + * information from {@see datum}. 
*/ public CrawlDatum setPageRetrySchedule(Text url, CrawlDatum datum, long prevFetchTime, long prevModifiedTime, long fetchTime); @@ -125,13 +143,17 @@ public interface FetchSchedule extends C * guarantee that the page will be fetched, it just allows it to be * included in the further selection process based on scores. The default * implementation checks <code>fetchTime</code>, if it is higher than the - * {@param curTime} it returns false, and true otherwise. It will also + * {@see curTime} it returns false, and true otherwise. It will also * check that fetchTime is not too remote (more than <code>maxInterval</code), * in which case it lowers the interval and returns true. - * @param url URL of the page - * @param datum datum instance + * + * @param url URL of the page. + * + * @param datum datum instance. + * * @param curTime reference time (usually set to the time when the * fetchlist generation process was started). + * * @return true, if the page should be considered for inclusion in the current * fetchlist, otherwise false. */ @@ -140,15 +162,19 @@ public interface FetchSchedule extends C /** * This method resets fetchTime, fetchInterval, modifiedTime and * page signature, so that it forces refetching. - * @param url URL of the page - * @param datum datum instance + * + * @param url URL of the page. + * + * @param datum datum instance. + * * @param asap if true, force refetch as soon as possible - this sets * the fetchTime to now. If false, force refetch whenever the next fetch * time is set. + * * @return adjusted page information, including all original information. - * NOTE: this may be a different instance than {@param datum}, but + * NOTE: this may be a different instance than {@see datum}, but * implementations should make sure that it contains at least all - * information from {@param datum}. + * information from {@see datum}. 
*/ public CrawlDatum forceRefetch(Text url, CrawlDatum datum, boolean asap); } Modified: nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrDeleteDuplicates.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrDeleteDuplicates.java?rev=1231517&r1=1231516&r2=1231517&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrDeleteDuplicates.java (original) +++ nutch/trunk/src/java/org/apache/nutch/indexer/solr/SolrDeleteDuplicates.java Sat Jan 14 15:45:46 2012 @@ -77,7 +77,7 @@ import org.apache.solr.common.SolrDocume * </li> * </ul> * - * Note that unlike {@link DeleteDuplicates} we assume that two documents in + * Note that unlike {@code DeleteDuplicates} we assume that two documents in * a solr index will never have the same URL. So this class only deals with * documents with <b>different</b> URLs but the same digest. */ Modified: nutch/trunk/src/java/org/apache/nutch/tools/arc/ArcRecordReader.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/arc/ArcRecordReader.java?rev=1231517&r1=1231516&r2=1231517&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/tools/arc/ArcRecordReader.java (original) +++ nutch/trunk/src/java/org/apache/nutch/tools/arc/ArcRecordReader.java Sat Jan 14 15:45:46 2012 @@ -40,12 +40,12 @@ import org.apache.hadoop.util.StringUtil * <p>Arc files are essentially tars of gzips. Each record in an arc file is * a compressed gzip. Multiple records are concatenated together to form a * complete arc. 
For more information on the arc file format see - * {@link http://www.archive.org/web/researcher/ArcFileFormat.php}.</p> + * <a href="http://www.archive.org/web/researcher/ArcFileFormat.php">ArcFileFormat</a>.</p> * * <p>Arc files are used by the internet archive and grub projects.</p> * - * @see http://www.archive.org/ - * @see http://www.grub.org/ + * @see <a href="http://www.archive.org/">archive.org</a> + * @see <a href="http://www.grub.org/">grub.org</a> */ public class ArcRecordReader implements RecordReader<Text, BytesWritable> { Modified: nutch/trunk/src/java/org/apache/nutch/util/MimeUtil.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/MimeUtil.java?rev=1231517&r1=1231516&r2=1231517&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/util/MimeUtil.java (original) +++ nutch/trunk/src/java/org/apache/nutch/util/MimeUtil.java Sat Jan 14 15:45:46 2012 @@ -29,9 +29,14 @@ import org.apache.tika.mime.MimeType; import org.apache.tika.mime.MimeTypeException; import org.apache.tika.mime.MimeTypes; import org.apache.tika.mime.MimeTypesFactory; + +// Slf4j logging imports import org.slf4j.Logger; import org.slf4j.LoggerFactory; +// imported for Javadoc +import org.apache.nutch.protocol.ProtocolOutput; + /** * @author mattmann * @since NUTCH-608 @@ -123,7 +128,7 @@ public final class MimeUtil { * <code>typeName</code> is cleaned, with {@link #cleanMimeType(String)}. * Then the cleaned mime type is looked up in the underlying Tika * {@link MimeTypes} registry, by its cleaned name. If the {@link MimeType} is - * found, then that mime type is used, otherwise {@link URL} resolution is + * found, then that mime type is used, otherwise URL resolution is * used to try and determine the mime type. 
If that means is unsuccessful, and * if <code>mime.type.magic</code> is enabled in {@link NutchConfiguration}, * then mime type magic resolution is used to try and obtain a @@ -132,7 +137,7 @@ public final class MimeUtil { * @param typeName * The original mime type, returned from a {@link ProtocolOutput}. * @param url - * The given {@link URL}, that Nutch was trying to crawl. + * The given <code>url</code>, that Nutch was trying to crawl. * @param data * The byte data, returned from the crawl, if any. * @return The correctly, automatically guessed {@link MimeType} name. @@ -254,4 +259,4 @@ public final class MimeUtil { } -} \ No newline at end of file +} Modified: nutch/trunk/src/java/org/apache/nutch/util/NodeWalker.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/NodeWalker.java?rev=1231517&r1=1231516&r2=1231517&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/util/NodeWalker.java (original) +++ nutch/trunk/src/java/org/apache/nutch/util/NodeWalker.java Sat Jan 14 15:45:46 2012 @@ -102,8 +102,8 @@ public class NodeWalker { } /** - * Returns true if there are more nodes on the current stack. - * @return + * @return returns true if there are more nodes on the current stack. 
+ * */ public boolean hasNext() { return (nodes.size() > 0); Modified: nutch/trunk/src/java/org/apache/nutch/util/domain/DomainSuffix.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/domain/DomainSuffix.java?rev=1231517&r1=1231516&r2=1231517&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/util/domain/DomainSuffix.java (original) +++ nutch/trunk/src/java/org/apache/nutch/util/domain/DomainSuffix.java Sat Jan 14 15:45:46 2012 @@ -32,7 +32,7 @@ package org.apache.nutch.util.domain; * * @author Enis Soztutar <enis.soz.nu...@gmail.com> * @see TopLevelDomain - * @see domain-suffixes.xml + * for info please see conf/domain-suffixes.xml */ public class DomainSuffix { Modified: nutch/trunk/src/java/org/apache/nutch/util/domain/DomainSuffixes.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/domain/DomainSuffixes.java?rev=1231517&r1=1231516&r2=1231517&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/util/domain/DomainSuffixes.java (original) +++ nutch/trunk/src/java/org/apache/nutch/util/domain/DomainSuffixes.java Sat Jan 14 15:45:46 2012 @@ -50,7 +50,7 @@ public class DomainSuffixes { /** * Singleton instance, lazy instantination - * @return + * @return returns the domain suffix instance */ public static DomainSuffixes getInstance() { if(instance == null) { Modified: nutch/trunk/src/java/org/apache/nutch/util/domain/TopLevelDomain.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/domain/TopLevelDomain.java?rev=1231517&r1=1231516&r2=1231517&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/util/domain/TopLevelDomain.java (original) +++ nutch/trunk/src/java/org/apache/nutch/util/domain/TopLevelDomain.java Sat Jan 14 15:45:46 2012 @@ -22,9 
+22,12 @@ package org.apache.nutch.util.domain; * Internet domain name; that is, the letters which follow the final * dot of any domain name. For example, in the domain name * <code>www.website.com</code>, the top-level domain is <code>com</code>. + * * @author Enis Soztutar <enis.soz.nu...@gmail.com> - * @see http://www.iana.org/ - * @see http://en.wikipedia.org/wiki/Top-level_domain + * + * @see <a href="http://www.iana.org/"> iana.org</a> + * + * @see <a href="http://en.wikipedia.org/wiki/Top-level_domain"> Top-level_domain</a> */ public class TopLevelDomain extends DomainSuffix { Modified: nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java?rev=1231517&r1=1231516&r2=1231517&view=diff ============================================================================== --- nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java (original) +++ nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java Sat Jan 14 15:45:46 2012 @@ -43,8 +43,8 @@ import org.apache.nutch.net.*; * regular expressions. * * <p>The regular expressions rules are expressed in a file. The file of rules - * is provided by each implementation using the - * {@link #getRulesFile(Configuration)} method.</p> + * is determined for each implementation using the + * {@link #getRulesReader(Configuration conf)} method.</p> * * <p>The format of this file is made of many rules (one per line):<br/> * <code>