Thanks. I made the changes you suggested but the problem persisted.
After about 5 rounds of 1000 URLs, one site would "take over." I made
the attached small change to work around this problem. It lets you
specify the maximum number of URLs you want from any single host. I
now use -topN 1000 -maxSite 500 and things are going as I had hoped.
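For anyone skimming the archive, the change boils down to a per-host
counter consulted while emitting fetchlist entries. Below is a rough
standalone sketch of just that logic; the class name and URL list are
made up for illustration, and the real change is in the attached diff:

  import java.net.URL;
  import java.util.HashMap;
  import java.util.Map;

  // Standalone illustration of the per-host cap the patch applies
  // inside emitFetchList(). Only the counting logic mirrors the patch;
  // the candidate URLs here are hard-coded for the example.
  public class MaxSiteSketch {
    public static void main(String[] args) throws Exception {
      int maxSite = 2;                  // e.g. what -maxSite 2 would set
      String[] candidates = {
        "http://a.example.com/1", "http://a.example.com/2",
        "http://a.example.com/3", "http://b.example.com/1" };

      Map perHost = new HashMap();      // host -> URLs emitted so far
      for (int i = 0; i < candidates.length; i++) {
        String host = new URL(candidates[i]).getHost();
        Integer seen = (Integer) perHost.get(host);
        int count = (seen == null) ? 0 : seen.intValue();
        if (maxSite > 0 && count >= maxSite) {
          System.out.println("Over " + maxSite + " URLs from " + host
              + ", skipping " + candidates[i]);
          continue;                     // drop this entry from the list
        }
        perHost.put(host, new Integer(count + 1));
        System.out.println("Emitting " + candidates[i]);
      }
    }
  }

In the patch the same counting is done with a Hashtable keyed on
url.getHost(), and entries past the limit are skipped with a continue.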
Thanks,
Tim
On 4/19/05, Doug Cutting <[EMAIL PROTECTED]> wrote:
> Tim Martin wrote:
> > I first had the "bin/nutch generate" command only add the top
> > 1000 URLs as is done in the example. I may be wrong but it appears
> > that the rest of the URLs are then thrown out.
>
> They are not thrown out, just delayed until the next generate.
>
> > This isn't necessarily
> > a problem, but sometimes one of my 3 sites has _all_ of the top 1000 URLs,
> > so after that the other 2 sites don't get crawled at all. If I don't use
> > the -topN option to generate, then by the 4th round the fetchlist is very
> > large and the round takes much longer than my desired 30 minutes.
>
> Are you using link analysis? Perhaps it is doing you a disservice by
> prioritizing one site above the others. Try, in place of the analyze
> command, setting both fetchlist.score.by.link.count and
> indexer.boost.by.link.count to true. Please tell us how that works for you.
>
> Doug
>
--- FetchListTool.java 2005-01-14 11:49:41.000000000 -0800
+++ /home/tmartin/nutch-0.6/src/java/net/nutch/tools/FetchListTool.java 2005-04-19 23:19:14.000000000 -0700
@@ -261,7 +261,7 @@
* Spit out several fetchlists, so that we can fetch across
* several machines.
*/
- public void emitMultipleLists(File dir, int numLists, long topN, long curTime) throws IOException {
+ public void emitMultipleLists(File dir, int numLists, long topN, int maxsite, long curTime) throws IOException {
//
// Create tables (and directories) for each fetchlist we want.
// Add them all to a TableSet object.
@@ -281,7 +281,7 @@
}
// Now go through the fetchlist.
- emitFetchList(tables, workingDir, topN, curTime);
+ emitFetchList(tables, workingDir, topN, maxsite, curTime);
} finally {
FileUtil.fullyDelete(nfs, workingDir);
}
@@ -293,7 +293,7 @@
/**
* Spit out the fetchlist, to a BDB at the indicated filename.
*/
- public void emitFetchList(File segmentDir, long topN, long curTime) throws IOException {
+ public void emitFetchList(File segmentDir, long topN, int maxsite, long curTime) throws IOException {
TableSet tables = new TableSet();
File workingDir = new File(segmentDir, "tmp_" + getDate());
nfs.mkdirs(workingDir);
@@ -304,7 +304,7 @@
tables.add(new File(subdir, FetchListEntry.DIR_NAME).getPath());
try {
- emitFetchList(tables, workingDir, topN, curTime);
+ emitFetchList(tables, workingDir, topN, maxsite, curTime);
} finally {
tables.close();
}
@@ -323,7 +323,7 @@
* responsible for actually appending the item to the output file,
* which is from this function.
*/
- void emitFetchList(TableSet tables, File workingDir, long topN, long curTime) throws IOException {
+ void emitFetchList(TableSet tables, File workingDir, long topN, int maxsite, long curTime) throws IOException {
// Iterate through all the Pages, by URL. Iterating
// through by URL means we can save disk seeks when
// calling webdb.getLinks(URL).
@@ -340,7 +340,8 @@
long count = 0;
TreeMap anchorTable = new TreeMap();
Vector unknownDomainLinks = new Vector();
-
+ Hashtable htable = new Hashtable();
+
//
// Create a comparator that matches the domainIDs for
// Link objects.
@@ -386,7 +387,7 @@
if ((cutoffScore >= 0) && (page.getScore() < cutoffScore)) {
continue;
}
-
+
//
// If the item is not yet ready to be fetched, move on.
//
@@ -399,6 +400,26 @@
}
//
+ // If maxsite is greater than zero, emit at most
+ // maxsite URLs from any single host.
+ //
+ if (maxsite > 0) {
+ URL url = new URL(page.getURL().toString());
+ String key = url.getHost();
+ Integer num = (Integer)htable.get(key);
+ if (num != null) {
+ if (num.intValue() >= maxsite) {
+ System.out.println("Over " + maxsite + " URLs from " + key);
+ continue;
+ }
+ num = new Integer(num.intValue() + 1);
+ } else {
+ num = new Integer(1);
+ }
+ htable.put(key, num);
+ }
+
+ //
// If we're in refetchOnly mode, set shouldFetch to FALSE
// for any Pages whose URL's MD5 is the same as the
// listed MD5. That indicates that no content has been
@@ -588,6 +609,7 @@
//
boolean refetchOnly = false, anchorOptimize = false;
long topN = Long.MAX_VALUE;
+ int maxsite = 0;
float cutoffScore = -1.0f;
int numFetchers = 1;
int seed = new Random().nextInt();
@@ -615,6 +637,14 @@
System.out.println("No argument present for -cutoffscore");
return;
}
+ } else if ("-maxSite".equals(argv[i])) {
+ if (i+1 < argv.length) {
+ maxsite = Integer.parseInt(argv[i+1]);
+ i++;
+ } else {
+ System.out.println("No argument present for -maxsite");
+ return;
+ }
} else if ("-numFetchers".equals(argv[i])) {
if (i+1 < argv.length) {
numFetchers = Integer.parseInt(argv[i+1]);
@@ -654,6 +684,9 @@
if (topN != Long.MAX_VALUE) {
LOG.info("topN:" + topN);
}
+ if (maxsite > 0) {
+ LOG.info("maxSite:" + maxsite);
+ }
if (cutoffScore >= 0) {
LOG.info("cutoffscore:" + cutoffScore);
}
@@ -663,9 +696,9 @@
FetchListTool flt = new FetchListTool(nfs, dbDir, refetchOnly, anchorOptimize, cutoffScore, seed);
if (numFetchers > 1) {
- flt.emitMultipleLists(segmentDir, numFetchers, topN, curTime);
+ flt.emitMultipleLists(segmentDir, numFetchers, topN, maxsite, curTime);
} else {
- flt.emitFetchList(segmentDir, topN, curTime);
+ flt.emitFetchList(segmentDir, topN, maxsite, curTime);
}
nfs.close();
LOG.info("FetchListTool completed");