Thanks. I made the changes you suggested but the problem persisted.
After about 5 rounds of 1000 URLs, one site would "take over." I made
the attached small change to work around this problem. It lets you
specify the maximum number of URLs you want from any single host. I
now use -topN 1000 -maxSite 500 and things are going as I had hoped.
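For anyone skimming the archive, the change boils down to a per-host
counter consulted while emitting fetchlist entries. Below is a rough
standalone sketch of just that logic; the class name and URL list are
made up for illustration, and the real change is in the attached diff:

  import java.net.URL;
  import java.util.HashMap;
  import java.util.Map;

  // Standalone illustration of the per-host cap the patch applies
  // inside emitFetchList(). Only the counting logic mirrors the patch;
  // the candidate URLs here are hard-coded for the example.
  public class MaxSiteSketch {
    public static void main(String[] args) throws Exception {
      int maxSite = 2;                  // e.g. what -maxSite 2 would set
      String[] candidates = {
        "http://a.example.com/1", "http://a.example.com/2",
        "http://a.example.com/3", "http://b.example.com/1" };

      Map perHost = new HashMap();      // host -> URLs emitted so far
      for (int i = 0; i < candidates.length; i++) {
        String host = new URL(candidates[i]).getHost();
        Integer seen = (Integer) perHost.get(host);
        int count = (seen == null) ? 0 : seen.intValue();
        if (maxSite > 0 && count >= maxSite) {
          System.out.println("Over " + maxSite + " URLs from " + host
              + ", skipping " + candidates[i]);
          continue;                     // drop this entry from the list
        }
        perHost.put(host, new Integer(count + 1));
        System.out.println("Emitting " + candidates[i]);
      }
    }
  }

In the patch the same counting is done with a Hashtable keyed on
url.getHost(), and entries past the limit are skipped with a continue.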
Thanks,
Tim
On 4/19/05, Doug Cutting <[EMAIL PROTECTED]> wrote:
> Tim Martin wrote:
> > I first had the "bin/nutch generate" command only add the top
> > 1000 URLs as is done in the example. I may be wrong but it appears
> > that the rest of the URLs are then thrown out.
>
> They are not thrown out, just delayed until the next generate.
>
> > This isn't necessarily
> > a problem, but sometimes one of my 3 sites has _all_ of the top 1000 URLs,
> > so after that the other 2 sites don't get crawled at all. If I don't use
> > the -topN option to generate, then by the 4th round the fetchlist is very
> > large and the round takes much longer than my desired 30 minutes.
>
> Are you using link analysis? Perhaps it is doing you a disservice by
> prioritizing one site above the others. Try, in place of the analyze
> command, setting both fetchlist.score.by.link.count and
> indexer.boost.by.link.count to true. Please tell us how that works for you.
>
> Doug
>
--- FetchListTool.java 2005-01-14 11:49:41.000000000 -0800
+++ /home/tmartin/nutch-0.6/src/java/net/nutch/tools/FetchListTool.java 2005-04-19 23:19:14.000000000 -0700
@@ -261,7 +261,7 @@
* Spit out several fetchlists, so that we can fetch across
* several machines.
*/
- public void emitMultipleLists(File dir, int numLists, long topN, long curTime) throws IOException {
+ public void emitMultipleLists(File dir, int numLists, long topN, int maxsite, long curTime) throws IOException {
//
// Create tables (and directories) for each fetchlist we want.
// Add them all to a TableSet object.
@@ -281,7 +281,7 @@
}
// Now go through the fetchlist.
- emitFetchList(tables, workingDir, topN, curTime);
+ emitFetchList(tables, workingDir, topN, maxsite, curTime);
} finally {
FileUtil.fullyDelete(nfs, workingDir);
}
@@ -293,7 +293,7 @@
/**
* Spit out the fetchlist, to a BDB at the indicated filename.
*/
- public void emitFetchList(File segmentDir, long topN, long curTime) throws IOException {
+ public void emitFetchList(File segmentDir, long topN, int maxsite, long curTime) throws IOException {
TableSet tables = new TableSet();
File workingDir = new File(segmentDir, "tmp_" + getDate());
nfs.mkdirs(workingDir);
@@ -304,7 +304,7 @@
tables.add(new File(subdir, FetchListEntry.DIR_NAME).getPath());
try {
- emitFetchList(tables, workingDir, topN, curTime);
+ emitFetchList(tables, workingDir, topN, maxsite, curTime);
} finally {
tables.close();
}
@@ -323,7 +323,7 @@
* responsible for actually appending the item to the output file,
* which is from this function.
*/
- void emitFetchList(TableSet tables, File workingDir, long topN, long curTime) throws IOException {
+ void emitFetchList(TableSet tables, File workingDir, long topN, int maxsite, long curTime) throws IOException {
// Iterate through all the Pages, by URL. Iterating
// through by URL means we can save disk seeks when
// calling webdb.getLinks(URL).
@@ -340,7 +340,8 @@
long count = 0;
TreeMap anchorTable = new TreeMap();
Vector unknownDomainLinks = new Vector();
-
+ Hashtable htable = new Hashtable();
+
//
// Create a comparator that matches the domainIDs for
// Link objects.
@@ -386,7 +387,7 @@
if ((cutoffScore >= 0) && (page.getScore() < cutoffScore)) {
continue;
}
-
+
//
// If the item is not yet ready to be fetched, move on.
//
@@ -399,6 +400,26 @@
}
//
+ // If maxsite is greater than zero, emit at most
+ // maxsite URLs from any single host.
+ //
+ if (maxsite > 0) {
+ URL url = new URL(page.getURL().toString());
+ String key = url.getHost();
+ Integer num = (Integer)htable.get(key);
+ if (num != null) {
+ if (num.intValue() >= maxsite) {
+ System.out.println("Over " + maxsite + " URLs from " + key);
+ continue;
+ }
+ num = new Integer(num.intValue() + 1);
+ } else {
+ num = new Integer(1);
+ }
+ htable.put(key, num);
+ }
+
+ //
// If we're in refetchOnly mode, set shouldFetch to FALSE
// for any Pages whose URL's MD5 is the same as the
// listed MD5. That indicates that no content has been
@@ -588,6 +609,7 @@
//
boolean refetchOnly = false, anchorOptimize = false;
long topN = Long.MAX_VALUE;
+ int maxsite = 0;
float cutoffScore = -1.0f;
int numFetchers = 1;
int seed = new Random().nextInt();
@@ -615,6 +637,14 @@
System.out.println("No argument present for -cutoffscore");
return;
}
+ } else if ("-maxSite".equals(argv[i])) {
+ if (i+1 < argv.length) {
+ maxsite = Integer.parseInt(argv[i+1]);
+ i++;
+ } else {
+ System.out.println("No argument present for -maxsite");
+ return;
+ }
} else if ("-numFetchers".equals(argv[i])) {
if (i+1 < argv.length) {
numFetchers = Integer.parseInt(argv[i+1]);
@@ -654,6 +684,9 @@
if (topN != Long.MAX_VALUE) {
LOG.info("topN:" + topN);
}
+ if (maxsite > 0) {
+ LOG.info("maxSite:" + maxsite);
+ }
if (cutoffScore >= 0) {
LOG.info("cutoffscore:" + cutoffScore);
}
@@ -663,9 +696,9 @@
FetchListTool flt = new FetchListTool(nfs, dbDir, refetchOnly, anchorOptimize, cutoffScore, seed);
if (numFetchers > 1) {
- flt.emitMultipleLists(segmentDir, numFetchers, topN, curTime);
+ flt.emitMultipleLists(segmentDir, numFetchers, topN, maxsite, curTime);
} else {
- flt.emitFetchList(segmentDir, topN, curTime);
+ flt.emitFetchList(segmentDir, topN, maxsite, curTime);
}
nfs.close();
LOG.info("FetchListTool completed");