[ https://issues.apache.org/jira/browse/NUTCH-2553?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16446881#comment-16446881 ]
ASF GitHub Bot commented on NUTCH-2553:
---------------------------------------
sebastian-nagel closed pull request #317: NUTCH-2553 Fetcher not to modify URLs to be fetched
URL: https://github.com/apache/nutch/pull/317
This is a PR merged from a forked repository. As GitHub hides the original diff on merge, it is displayed below for the sake of provenance:
diff --git a/src/java/org/apache/nutch/fetcher/Fetcher.java b/src/java/org/apache/nutch/fetcher/Fetcher.java
index ba34d68a8..dc3e2f9b7 100644
--- a/src/java/org/apache/nutch/fetcher/Fetcher.java
+++ b/src/java/org/apache/nutch/fetcher/Fetcher.java
@@ -109,7 +109,6 @@
SequenceFileInputFormat<Text, CrawlDatum> {
/** Don't split inputs, to keep things polite. */
public InputSplit[] getSplits(JobContext job, int nSplits) throws
IOException {
- Configuration conf = job.getConfiguration();
List<FileStatus> files = listStatus(job);
FileSplit[] splits = new FileSplit[files.size()];
Iterator<FileStatus> iterator= files.listIterator();
diff --git a/src/java/org/apache/nutch/fetcher/FetcherOutputFormat.java b/src/java/org/apache/nutch/fetcher/FetcherOutputFormat.java
index 11c09ae1d..56b24e4a7 100644
--- a/src/java/org/apache/nutch/fetcher/FetcherOutputFormat.java
+++ b/src/java/org/apache/nutch/fetcher/FetcherOutputFormat.java
@@ -67,8 +67,6 @@ public void checkOutputSpecs(JobContext job) throws IOException {
Configuration conf = context.getConfiguration();
String name = getUniqueFile(context, "part", "");
- Path dir = FileOutputFormat.getOutputPath(context);
- FileSystem fs = dir.getFileSystem(context.getConfiguration());
Path out = FileOutputFormat.getOutputPath(context);
final Path fetch = new Path(new Path(out, CrawlDatum.FETCH_DIR_NAME),
name);
final Path content = new Path(new Path(out, Content.DIR_NAME), name);
diff --git a/src/java/org/apache/nutch/fetcher/FetcherThread.java b/src/java/org/apache/nutch/fetcher/FetcherThread.java
index d4f0a956b..4f67391f0 100644
--- a/src/java/org/apache/nutch/fetcher/FetcherThread.java
+++ b/src/java/org/apache/nutch/fetcher/FetcherThread.java
@@ -31,11 +31,11 @@
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
-import org.apache.hadoop.mapreduce.Mapper.Context;
import org.apache.hadoop.util.StringUtils;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.NutchWritable;
import org.apache.nutch.crawl.SignatureFactory;
+import org.apache.nutch.fetcher.Fetcher.FetcherRun;
import org.apache.nutch.fetcher.FetcherThreadEvent.PublishEventType;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.metadata.Nutch;
@@ -105,7 +105,6 @@
URLFilters urlFiltersForOutlinks;
URLNormalizers normalizersForOutlinks;
- private int outlinksDepthDivisor;
private boolean skipTruncated;
private boolean halted = false;
@@ -126,7 +125,7 @@
private boolean parsing;
- private Context context;
+ private FetcherRun.Context context;
private boolean storingContent;
@@ -145,7 +144,7 @@
private boolean activatePublisher;
public FetcherThread(Configuration conf, AtomicInteger activeThreads,
FetchItemQueues fetchQueues,
- QueueFeeder feeder, AtomicInteger spinWaiting, AtomicLong lastRequestStart, Context context,
+ QueueFeeder feeder, AtomicInteger spinWaiting, AtomicLong lastRequestStart, FetcherRun.Context context,
AtomicInteger errors, String segmentName, boolean parsing, boolean storingContent,
AtomicInteger pages, AtomicLong bytes) {
this.setDaemon(true); // don't hang JVM on exit
@@ -191,11 +190,11 @@ public FetcherThread(Configuration conf, AtomicInteger activeThreads, FetchItemQ
if (!queueMode.equals(FetchItemQueues.QUEUE_MODE_IP)
&& !queueMode.equals(FetchItemQueues.QUEUE_MODE_DOMAIN)
&& !queueMode.equals(FetchItemQueues.QUEUE_MODE_HOST)) {
- LOG.error("Unknown partition mode : " + queueMode
- + " - forcing to byHost");
+ LOG.error("Unknown partition mode : {} - forcing to byHost", queueMode);
queueMode = FetchItemQueues.QUEUE_MODE_HOST;
}
- LOG.info(getName() + " " + Thread.currentThread().getId() + " Using queue mode : " + queueMode);
+ LOG.info("{} {} Using queue mode : {}", getName(),
+ Thread.currentThread().getId(), queueMode);
this.maxRedirect = conf.getInt("http.redirect.max", 3);
maxOutlinksPerPage = conf.getInt("db.max.outlinks.per.page", 100);
@@ -211,13 +210,13 @@ public FetcherThread(Configuration conf, AtomicInteger activeThreads, FetchItemQ
"fetcher.follow.outlinks.ignore.external", false);
maxOutlinkDepthNumLinks = conf.getInt(
"fetcher.follow.outlinks.num.links", 4);
- outlinksDepthDivisor = conf.getInt(
- "fetcher.follow.outlinks.depth.divisor", 2);
if (conf.getBoolean("fetcher.store.robotstxt", false)) {
if (storingContent) {
robotsTxtContent = new LinkedList<>();
} else {
- LOG.warn(getName() + " " + Thread.currentThread().getId() + " Ignoring fetcher.store.robotstxt because not storing content (fetcher.store.content)!");
+ LOG.warn(
+ "{} {} Ignoring fetcher.store.robotstxt because not storing
content (fetcher.store.content)!",
+ getName(), Thread.currentThread().getId());
}
}
}
@@ -226,7 +225,6 @@ public FetcherThread(Configuration conf, AtomicInteger activeThreads, FetchItemQ
public void run() {
activeThreads.incrementAndGet(); // count threads
- Text url = new Text();
FetchItem fit = null;
try {
// checking for the server to be running and fetcher.parse to be true
@@ -242,20 +240,15 @@ public void run() {
// check whether must be stopped
if (isHalted()) {
- LOG.debug(getName() + " set to halted");
+ LOG.debug("{} set to halted", getName());
fit = null;
return;
}
fit = ((FetchItemQueues) fetchQueues).getFetchItem();
- if(fit!=null){
- URL u = fit.u;
- String temp_url = u.getProtocol() + "://" + u.getHost() + ":" + u.getPort() + u.getFile();
- url = new Text(temp_url);
- }
if (fit == null) {
if (feeder.isAlive() || ((FetchItemQueues)
fetchQueues).getTotalSize() > 0) {
- LOG.debug(getName() + " spin-waiting ...");
+ LOG.debug("{} spin-waiting ...", getName());
// spin-wait.
((AtomicInteger) spinWaiting).incrementAndGet();
try {
@@ -266,18 +259,21 @@ public void run() {
continue;
} else {
// all done, finish this thread
- LOG.info(getName() + " " + Thread.currentThread().getId() + " has no more work available");
+ LOG.info("{} {} has no more work available", getName(),
+ Thread.currentThread().getId());
return;
}
}
lastRequestStart.set(System.currentTimeMillis());
+
Text reprUrlWritable = (Text) fit.datum.getMetaData().get(
Nutch.WRITABLE_REPR_URL_KEY);
if (reprUrlWritable == null) {
- setReprUrl(url.toString());
+ setReprUrl(fit.url.toString());
} else {
setReprUrl(reprUrlWritable.toString());
}
+
try {
// fetch the page
redirecting = false;
@@ -291,17 +287,18 @@ public void run() {
do {
if (LOG.isInfoEnabled()) {
- LOG.info(getName() + " " + Thread.currentThread().getId() + " fetching " + url + " (queue crawl delay="
- + ((FetchItemQueues) fetchQueues).getFetchItemQueue(fit.queueID).crawlDelay
- + "ms)");
+ LOG.info("{} {} fetching {} (queue crawl delay={}ms)", getName(),
+ Thread.currentThread().getId(), fit.url,
+ ((FetchItemQueues) fetchQueues)
+ .getFetchItemQueue(fit.queueID).crawlDelay);
}
if (LOG.isDebugEnabled()) {
- LOG.debug("redirectCount=" + redirectCount);
+ LOG.debug("redirectCount={}", redirectCount);
}
redirecting = false;
- Protocol protocol = this.protocolFactory.getProtocol(url.toString());
+ Protocol protocol = this.protocolFactory.getProtocol(fit.url.toString());
- BaseRobotRules rules = protocol.getRobotRules(url, fit.datum, robotsTxtContent);
+ BaseRobotRules rules = protocol.getRobotRules(fit.url, fit.datum, robotsTxtContent);
if (robotsTxtContent != null) {
outputRobotsTxt(robotsTxtContent);
robotsTxtContent.clear();
@@ -310,9 +307,9 @@ public void run() {
// unblock
((FetchItemQueues) fetchQueues).finishFetchItem(fit, true);
if (LOG.isDebugEnabled()) {
- LOG.debug("Denied by robots.txt: " + url);
+ LOG.debug("Denied by robots.txt: {}", fit.url);
}
- output(url, fit.datum, null,
+ output(fit.url, fit.datum, null,
ProtocolStatus.STATUS_ROBOTS_DENIED,
CrawlDatum.STATUS_FETCH_GONE);
context.getCounter("FetcherStatus",
"robots_denied").increment(1);
@@ -322,9 +319,9 @@ public void run() {
if (rules.getCrawlDelay() > maxCrawlDelay && maxCrawlDelay >= 0) {
// unblock
((FetchItemQueues) fetchQueues).finishFetchItem(fit, true);
- LOG.debug("Crawl-Delay for " + url + " too long ("
- + rules.getCrawlDelay() + "), skipping");
- output(url, fit.datum, null,
+ LOG.debug("Crawl-Delay for {} too long ({}), skipping",
fit.url,
+ rules.getCrawlDelay());
+ output(fit.url, fit.datum, null,
ProtocolStatus.STATUS_ROBOTS_DENIED,
CrawlDatum.STATUS_FETCH_GONE);
context.getCounter("FetcherStatus",
@@ -337,11 +334,11 @@ public void run() {
if (LOG.isDebugEnabled()) {
LOG.debug("Crawl delay for queue: " + fit.queueID
+ " is set to " + fiq.crawlDelay
- + " as per robots.txt. url: " + url);
+ + " as per robots.txt. url: " + fit.url);
}
}
}
- ProtocolOutput output = protocol.getProtocolOutput(url,
+ ProtocolOutput output = protocol.getProtocolOutput(fit.url,
fit.datum);
ProtocolStatus status = output.getStatus();
Content content = output.getContent();
@@ -349,13 +346,11 @@ public void run() {
// unblock queue
((FetchItemQueues) fetchQueues).finishFetchItem(fit);
- String urlString = url.toString();
-
// used for FetchNode
if (fetchNode != null) {
fetchNode.setStatus(status.getCode());
fetchNode.setFetchTime(System.currentTimeMillis());
- fetchNode.setUrl(url);
+ fetchNode.setUrl(fit.url);
}
//Publish fetch finish event
@@ -374,19 +369,18 @@ public void run() {
break;
case ProtocolStatus.SUCCESS: // got a page
- pstatus = output(url, fit.datum, content, status,
+ pstatus = output(fit.url, fit.datum, content, status,
CrawlDatum.STATUS_FETCH_SUCCESS, fit.outlinkDepth);
updateStatus(content.getContent().length);
if (pstatus != null && pstatus.isSuccess()
&& pstatus.getMinorCode() == ParseStatus.SUCCESS_REDIRECT) {
String newUrl = pstatus.getMessage();
int refreshTime = Integer.valueOf(pstatus.getArgs()[1]);
- Text redirUrl = handleRedirect(url, fit.datum, urlString,
- newUrl, refreshTime < Fetcher.PERM_REFRESH_TIME,
+ Text redirUrl = handleRedirect(fit, newUrl,
+ refreshTime < Fetcher.PERM_REFRESH_TIME,
Fetcher.CONTENT_REDIR);
if (redirUrl != null) {
fit = queueRedirect(redirUrl, fit);
- url = fit.url;
}
}
break;
@@ -402,13 +396,12 @@ public void run() {
code = CrawlDatum.STATUS_FETCH_REDIR_TEMP;
temp = true;
}
- output(url, fit.datum, content, status, code);
+ output(fit.url, fit.datum, content, status, code);
String newUrl = status.getMessage();
- Text redirUrl = handleRedirect(url, fit.datum, urlString,
- newUrl, temp, Fetcher.PROTOCOL_REDIR);
+ Text redirUrl = handleRedirect(fit, newUrl, temp,
+ Fetcher.PROTOCOL_REDIR);
if (redirUrl != null) {
fit = queueRedirect(redirUrl, fit);
- url = fit.url;
} else {
// stop redirecting
redirecting = false;
@@ -416,7 +409,7 @@ public void run() {
break;
case ProtocolStatus.EXCEPTION:
- logError(url, status.getMessage());
+ logError(fit.url, status.getMessage());
int killedURLs = ((FetchItemQueues)
fetchQueues).checkExceptionThreshold(fit
.getQueueID());
if (killedURLs != 0)
@@ -425,7 +418,7 @@ public void run() {
/* FALLTHROUGH */
case ProtocolStatus.RETRY: // retry
case ProtocolStatus.BLOCKED:
- output(url, fit.datum, null, status,
+ output(fit.url, fit.datum, null, status,
CrawlDatum.STATUS_FETCH_RETRY);
break;
@@ -433,29 +426,31 @@ public void run() {
case ProtocolStatus.NOTFOUND:
case ProtocolStatus.ACCESS_DENIED:
case ProtocolStatus.ROBOTS_DENIED:
- output(url, fit.datum, null, status,
+ output(fit.url, fit.datum, null, status,
CrawlDatum.STATUS_FETCH_GONE);
break;
case ProtocolStatus.NOTMODIFIED:
- output(url, fit.datum, null, status,
+ output(fit.url, fit.datum, null, status,
CrawlDatum.STATUS_FETCH_NOTMODIFIED);
break;
default:
if (LOG.isWarnEnabled()) {
- LOG.warn(getName() + " " + Thread.currentThread().getId() + " Unknown ProtocolStatus: " + status.getCode());
+ LOG.warn("{} {} Unknown ProtocolStatus: {}", getName(),
+ Thread.currentThread().getId(), status.getCode());
}
- output(url, fit.datum, null, status,
+ output(fit.url, fit.datum, null, status,
CrawlDatum.STATUS_FETCH_RETRY);
}
if (redirecting && redirectCount > maxRedirect) {
((FetchItemQueues) fetchQueues).finishFetchItem(fit);
if (LOG.isInfoEnabled()) {
- LOG.info(getName() + " " + Thread.currentThread().getId() + " - redirect count exceeded " + url);
+ LOG.info("{} {} - redirect count exceeded {}", getName(),
+ Thread.currentThread().getId(), fit.url);
}
- output(url, fit.datum, null,
+ output(fit.url, fit.datum, null,
ProtocolStatus.STATUS_REDIR_EXCEEDED,
CrawlDatum.STATUS_FETCH_GONE);
}
@@ -465,30 +460,31 @@ public void run() {
} catch (Throwable t) { // unexpected exception
// unblock
((FetchItemQueues) fetchQueues).finishFetchItem(fit);
- logError(url, StringUtils.stringifyException(t));
- output(url, fit.datum, null, ProtocolStatus.STATUS_FAILED,
+ logError(fit.url, StringUtils.stringifyException(t));
+ output(fit.url, fit.datum, null, ProtocolStatus.STATUS_FAILED,
CrawlDatum.STATUS_FETCH_RETRY);
}
}
} catch (Throwable e) {
if (LOG.isErrorEnabled()) {
- LOG.error("fetcher caught:" + e.toString());
+ LOG.error("fetcher caught:", e);
}
} finally {
if (fit != null)
((FetchItemQueues) fetchQueues).finishFetchItem(fit);
activeThreads.decrementAndGet(); // count threads
- LOG.info(getName() + " " + Thread.currentThread().getId() + " -finishing thread " + getName() + ", activeThreads="
- + activeThreads);
+ LOG.info("{} {} -finishing thread {}, activeThreads={}", getName(),
+ Thread.currentThread().getId(), getName(), activeThreads);
}
}
- private Text handleRedirect(Text url, CrawlDatum datum, String urlString,
- String newUrl, boolean temp, String redirType)
+ private Text handleRedirect(FetchItem fit, String newUrl,
+ boolean temp, String redirType)
throws MalformedURLException, URLFilterException, InterruptedException {
newUrl = normalizers.normalize(newUrl, URLNormalizers.SCOPE_FETCHER);
newUrl = urlFilters.filter(newUrl);
+ String urlString = fit.url.toString();
if (newUrl == null || newUrl.equals(urlString)) {
LOG.debug(" - {} redirect skipped: {}", redirType,
@@ -498,7 +494,7 @@ private Text handleRedirect(Text url, CrawlDatum datum, String urlString,
if (ignoreAlsoRedirects && (ignoreExternalLinks || ignoreInternalLinks)) {
try {
- URL origUrl = new URL(urlString);
+ URL origUrl = fit.u;
URL redirUrl = new URL(newUrl);
if (ignoreExternalLinks) {
String origHostOrDomain, newHostOrDomain;
@@ -534,7 +530,7 @@ private Text handleRedirect(Text url, CrawlDatum datum, String urlString,
}
reprUrl = URLUtil.chooseRepr(reprUrl, newUrl, temp);
- url = new Text(newUrl);
+ Text url = new Text(newUrl);
if (maxRedirect > 0) {
redirecting = true;
redirectCount++;
@@ -542,9 +538,9 @@ private Text handleRedirect(Text url, CrawlDatum datum, String urlString,
return url;
} else {
CrawlDatum newDatum = new CrawlDatum(CrawlDatum.STATUS_LINKED,
- datum.getFetchInterval(), datum.getScore());
+ fit.datum.getFetchInterval(), fit.datum.getScore());
// transfer existing metadata
- newDatum.getMetaData().putAll(datum.getMetaData());
+ newDatum.getMetaData().putAll(fit.datum.getMetaData());
try {
scfilters.initialScore(url, newDatum);
} catch (ScoringFilterException e) {
@@ -585,7 +581,8 @@ private FetchItem queueRedirect(Text redirUrl, FetchItem fit)
private void logError(Text url, String message) {
if (LOG.isInfoEnabled()) {
- LOG.info(getName() + " " + Thread.currentThread().getId() + " fetch of " + url + " failed with: " + message);
+ LOG.info("{} {} fetch of {} failed with: ", getName(),
+ Thread.currentThread().getId(), url, message);
}
errors.incrementAndGet();
}
@@ -619,7 +616,8 @@ private ParseStatus output(Text key, CrawlDatum datum, Content content,
scfilters.passScoreBeforeParsing(key, datum, content);
} catch (Exception e) {
if (LOG.isWarnEnabled()) {
- LOG.warn(getName() + " " + Thread.currentThread().getId() + " Couldn't pass score, url " + key + " (" + e + ")");
+ LOG.warn("{} {} Couldn't pass score, url {} ({})", getName(),
+ Thread.currentThread().getId(), key, e);
}
}
/*
@@ -632,8 +630,9 @@ private ParseStatus output(Text key, CrawlDatum datum, Content content,
try {
parseResult = this.parseUtil.parse(content);
} catch (Exception e) {
- LOG.warn(getName() + " " + Thread.currentThread().getId() + " Error parsing: " + key + ": "
- + StringUtils.stringifyException(e));
+ LOG.warn("{} {} Error parsing: {}: {}", getName(),
+ Thread.currentThread().getId(), key,
+ StringUtils.stringifyException(e));
}
}
@@ -664,7 +663,8 @@ private ParseStatus output(Text key, CrawlDatum datum, Content content,
ParseData parseData = parse.getData();
if (!parseStatus.isSuccess()) {
- LOG.warn(getName() + " " + Thread.currentThread().getId() + " Error parsing: " + key + ": " + parseStatus);
+ LOG.warn("{} {} Error parsing: {}: {}", getName(),
+ Thread.currentThread().getId(), key, parseStatus);
parse = parseStatus.getEmptyParse(conf);
}
@@ -685,7 +685,8 @@ private ParseStatus output(Text key, CrawlDatum datum, Content content,
scfilters.passScoreAfterParsing(url, content, parse);
} catch (Exception e) {
if (LOG.isWarnEnabled()) {
- LOG.warn(getName() + " " + Thread.currentThread().getId() + " Couldn't pass score, url " + key + " (" + e + ")");
+ LOG.warn("{} {} Couldn't pass score, url {} ({})", getName(),
+ Thread.currentThread().getId(), key, e);
}
}
@@ -757,11 +758,6 @@ private ParseStatus output(Text key, CrawlDatum datum, Content content,
// Counter to limit num outlinks to follow per page
int outlinkCounter = 0;
- // Calculate variable number of outlinks by depth using the
- // divisor (outlinks = Math.floor(divisor / depth * num.links))
- int maxOutlinksByDepth = (int) Math.floor(outlinksDepthDivisor
- / (outlinkDepth + 1) * maxOutlinkDepthNumLinks);
-
String followUrl;
// Walk over the outlinks and add as new FetchItem to the queues
@@ -808,7 +804,7 @@ private ParseStatus output(Text key, CrawlDatum datum, Content content,
}
} catch (IOException e) {
if (LOG.isErrorEnabled()) {
- LOG.error("fetcher caught:" + e.toString());
+ LOG.error("fetcher caught:", e);
}
}
@@ -832,7 +828,7 @@ private void outputRobotsTxt(List<Content> robotsTxtContent) throws InterruptedE
context.write(new Text(robotsTxt.getUrl()),
new NutchWritable(robotsTxt));
} catch (IOException e) {
- LOG.error("fetcher caught: {}", e.toString());
+ LOG.error("fetcher caught:", e);
}
}
}
diff --git a/src/java/org/apache/nutch/fetcher/QueueFeeder.java b/src/java/org/apache/nutch/fetcher/QueueFeeder.java
index de02c4847..72009ad3a 100644
--- a/src/java/org/apache/nutch/fetcher/QueueFeeder.java
+++ b/src/java/org/apache/nutch/fetcher/QueueFeeder.java
@@ -20,8 +20,8 @@
import java.lang.invoke.MethodHandles;
import org.apache.hadoop.io.Text;
-import org.apache.hadoop.mapreduce.Mapper.Context;
import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.fetcher.Fetcher.FetcherRun;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -34,12 +34,12 @@
private static final Logger LOG = LoggerFactory
.getLogger(MethodHandles.lookup().lookupClass());
- private Context context;
+ private FetcherRun.Context context;
private FetchItemQueues queues;
private int size;
private long timelimit = -1;
- public QueueFeeder(Context context,
+ public QueueFeeder(FetcherRun.Context context,
FetchItemQueues queues, int size) {
this.context = context;
this.queues = queues;
@@ -67,7 +67,7 @@ public void run() {
LOG.error("QueueFeeder error reading input, record " + cnt, e);
return;
} catch (InterruptedException e) {
- LOG.info("QueueFeeder interrupted, exception: "+e);
+ LOG.info("QueueFeeder interrupted, exception:", e);
return;
}
continue;
@@ -82,12 +82,19 @@ public void run() {
;
continue;
} else {
- LOG.debug("-feeding " + feed + " input urls ...");
+ LOG.debug("-feeding {} input urls ...", feed);
while (feed > 0 && hasMore) {
try {
hasMore = context.nextKeyValue();
if (hasMore) {
- queues.addFetchItem((Text)context.getCurrentKey(),(CrawlDatum)context.getCurrentValue());
+ /*
+ * Need to copy key and value objects because MapReduce will reuse
+ * the original objects while the objects are stored in the queue.
+ */
+ Text url = new Text((Text)context.getCurrentKey());
+ CrawlDatum datum = new CrawlDatum();
+ datum.set((CrawlDatum)context.getCurrentValue());
+ queues.addFetchItem(url, datum);
cnt++;
feed--;
}
@@ -95,12 +102,12 @@ public void run() {
LOG.error("QueueFeeder error reading input, record " + cnt, e);
return;
} catch (InterruptedException e) {
- LOG.info("QueueFeeder interrupted, exception: "+e);
+ LOG.info("QueueFeeder interrupted, exception:", e);
}
}
}
}
- LOG.info("QueueFeeder finished: total " + cnt
- + " records hit by time limit :" + timelimitcount);
+ LOG.info("QueueFeeder finished: total {} records hit by time limit : {}",
+ cnt, timelimitcount);
}
}
diff --git a/src/test/org/apache/nutch/crawl/CrawlDBTestUtil.java b/src/test/org/apache/nutch/crawl/CrawlDBTestUtil.java
index fae629694..e271e88cf 100644
--- a/src/test/org/apache/nutch/crawl/CrawlDBTestUtil.java
+++ b/src/test/org/apache/nutch/crawl/CrawlDBTestUtil.java
@@ -34,14 +34,11 @@
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.MapFile.Writer.Option;
import org.apache.hadoop.io.Text;
-import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configuration.IntegerRanges;
-import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.RawComparator;
import org.apache.hadoop.mapred.Counters;
import org.apache.hadoop.mapred.Counters.Counter;
import org.apache.hadoop.mapreduce.InputFormat;
-import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobID;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
@@ -49,7 +46,6 @@
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.TaskAttemptID;
-import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.Reducer.Context;
import org.apache.hadoop.security.Credentials;
import org.mortbay.jetty.Server;
@@ -91,19 +87,6 @@ public static void createCrawlDb(Configuration conf, FileSystem fs,
writer.close();
}
- /**
- * For now we need to manually construct our Configuration, because we need to
- * override the default one and it is currently not possible to use
- * dynamically set values.
- *
- * @return
- * @deprecated Use {@link #createConfiguration()} instead
- */
- @Deprecated
- public static Context create() {
- return createContext();
- }
-
/** {@link Context} to collect all values in a {@link List} */
private static class DummyContext extends Reducer<Text, CrawlDatum, Text,
CrawlDatum>.Context {
@@ -377,12 +360,12 @@ public Path getWorkingDirectory() throws IOException {
*
* @return
*/
- public static Context createContext() {
+ public static Reducer<Text, CrawlDatum, Text, CrawlDatum>.Context createContext() {
DummyContext context = new DummyContext();
Configuration conf = context.getConfiguration();
conf.addResource("nutch-default.xml");
conf.addResource("crawl-tests.xml");
- return (Reducer<Text, CrawlDatum, Text, CrawlDatum>.Context)context;
+ return (Reducer<Text, CrawlDatum, Text, CrawlDatum>.Context) context;
}
public static class URLCrawlDatum {
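
Editorial note on the QueueFeeder hunk above: the central behavioral fix is the defensive copy of the key and value objects. Hadoop's MapReduce runtime reuses the same Writable instances across calls to context.nextKeyValue(), so queuing them by reference would let each new record silently overwrite the entries already sitting in the fetch queue. Below is a minimal standalone sketch of the pitfall — illustrative only, not part of the PR; the class name and URLs are made up, and it assumes hadoop-common's Text on the classpath:
{noformat}
import java.util.ArrayDeque;
import java.util.Queue;

import org.apache.hadoop.io.Text;

public class WritableReuseDemo {
  public static void main(String[] args) {
    Text reused = new Text();           // stands in for context.getCurrentKey()
    Queue<Text> byReference = new ArrayDeque<>();
    Queue<Text> byCopy = new ArrayDeque<>();

    for (String url : new String[] { "http://a.example/", "http://b.example/" }) {
      reused.set(url);                  // the framework refills the same instance
      byReference.add(reused);          // broken: both queue slots alias one object
      byCopy.add(new Text(reused));     // correct: defensive copy, as the PR does
    }

    System.out.println(byReference);    // [http://b.example/, http://b.example/]
    System.out.println(byCopy);         // [http://a.example/, http://b.example/]
  }
}
{noformat}
The PR applies the same copy idiom to both the Text key and the CrawlDatum value before handing them to queues.addFetchItem().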
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]
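
A side note on the pervasive logging changes in the diff: they swap string concatenation for SLF4J's parameterized messages and pass exceptions as the final argument. A short illustrative sketch of the two idioms the patch relies on (not from the PR; the class name is made up):
{noformat}
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class LogIdioms {
  private static final Logger LOG = LoggerFactory.getLogger(LogIdioms.class);

  public static void main(String[] args) {
    String url = "http://nutch.apache.org/";
    // Placeholders: the message is only assembled if INFO is enabled,
    // so disabled levels pay no concatenation cost.
    LOG.info("{} fetching {}", Thread.currentThread().getId(), url);
    try {
      throw new IllegalStateException("boom");
    } catch (Exception e) {
      // A Throwable as the last argument (with no matching {}) is logged
      // with its full stack trace, unlike concatenating e.toString().
      LOG.error("fetcher caught:", e);
    }
  }
}
{noformat}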
> Fetcher not to modify URLs to be fetched
> ----------------------------------------
>
> Key: NUTCH-2553
> URL: https://issues.apache.org/jira/browse/NUTCH-2553
> Project: Nutch
> Issue Type: Bug
> Components: fetcher
> Affects Versions: 1.15
> Reporter: Sebastian Nagel
> Priority: Critical
> Fix For: 1.15
>
>
> Fetcher modifies the URLs being fetched (introduced with NUTCH-2375 in
> [c93d908|https://github.com/apache/nutch/commit/c93d908bb635d3c5b59f8c8a22e0584ebf588794#diff-847479d08597eb30da1c715310438685R253]):
> {noformat}
> FetcherThread 22 fetching http://nutch.apache.org:-1/ (queue crawl
> delay=5000ms)
> {noformat}
> which makes it hard to trace the URLs in the log files and likely causes
> other issues because URLs in CrawlDb and segments do not match
> (http://nutch.apache.org/ in CrawlDb and http://nutch.apache.org:-1/ in
> segment).
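
The spurious ":-1" comes from java.net.URL.getPort(), which returns -1 when a URL carries no explicit port; the removed Fetcher code rebuilt the URL from protocol, host, port, and file, baking that -1 into the string. A minimal reproduction (illustrative only; class name made up):
{noformat}
import java.net.URL;

public class PortDemo {
  public static void main(String[] args) throws Exception {
    URL u = new URL("http://nutch.apache.org/");
    // getPort() is -1 because the URL has no explicit port,
    // so naive reconstruction appends ":-1" to the host.
    String rebuilt = u.getProtocol() + "://" + u.getHost() + ":" + u.getPort()
        + u.getFile();
    System.out.println(rebuilt); // http://nutch.apache.org:-1/
  }
}
{noformat}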
--
This message was sent by Atlassian JIRA
(v7.6.3#76005)