Generate main problems
----------------------
Key: NUTCH-1269
URL: https://issues.apache.org/jira/browse/NUTCH-1269
Project: Nutch
Issue Type: Improvement
Components: generator
Affects Versions: 1.4
Environment: software
Reporter: behnam nikbakht
There are some problems with the current Generate method when the maxNumSegments and
maxHostCount options are used:
1. The generated segments differ in size.
2. With the maxHostCount option, it is unclear whether the limit was applied or not.
3. URLs from one host are distributed non-uniformly between the segments.
We changed Generator.java as described below.
In the Selector class:
// Number of segments the selected URLs are spread over
// (read from GENERATOR_MAX_NUM_SEGMENTS, default 1).
private int maxNumSegments;
// Target number of URLs per segment: GENERATOR_TOP_N divided by maxNumSegments.
private int segmentSize;
// Upper bound on URLs of a single host (or domain) within one segment (default 100).
private int maxHostCount;
// Fragment of the Selector configuration method; the author elided the rest with "...".
public void config
...
maxNumSegments = job.getInt(GENERATOR_MAX_NUM_SEGMENTS, 1);
// NOTE(review): integer division -- if GENERATOR_TOP_N is smaller than maxNumSegments,
// segmentSize becomes 0 and reduce()'s `count % segmentSize` divides by zero.
// NOTE(review): the (int) cast looks redundant if job.getInt already returns int -- confirm.
segmentSize=(int)job.getInt(GENERATOR_TOP_N, 10000000)/maxNumSegments;
// NOTE(review): the key is a quoted string literal, unlike the two constants above;
// presumably an existing Generator constant was intended -- verify the property name.
maxHostCount=job.getInt("GENERATE_MAX_PER_HOST", 100);
...
public void reduce(FloatWritable key, Iterator<SelectorEntry> values,
OutputCollector<FloatWritable,SelectorEntry> output, Reporter reporter)
throws IOException {
int limit2=(int)((limit*3)/2);
while (values.hasNext()) {
if(count == limit)
break;
if (count % segmentSize == 0 ) {
if (currentsegmentnum < maxNumSegments-1){
currentsegmentnum++;
}
else
currentsegmentnum=0;
}
boolean full=true;
for(int jk=0;jk<maxNumSegments;jk++){
if (segCounts[jk]<segmentSize){
full=false;
}
}
if(full){
break;
}
SelectorEntry entry = values.next();
Text url = entry.url;
//logWrite("Generated3:"+limit+"-"+count+"-"+url.toString());
String urlString = url.toString();
URL u = null;
String hostordomain = null;
try {
if (normalise && normalizers != null) {
urlString = normalizers.normalize(urlString,
URLNormalizers.SCOPE_GENERATE_HOST_COUNT);
}
u = new URL(urlString);
if (byDomain) {
hostordomain = URLUtil.getDomainName(u);
} else {
hostordomain = new URL(urlString).getHost();
}
hostordomain = hostordomain.toLowerCase();
boolean countLimit=true;
// only filter if we are counting hosts or domains
int[] hostCount = hostCounts.get(hostordomain);
//host count: {a,b,c,d} means that from this host there are a urls
in segment 0 and b urls in seg 1 and ...
if (hostCount == null) {
hostCount = new int[maxNumSegments];
for(int kl=0;kl<hostCount.length;kl++)
hostCount[kl]=0;
hostCounts.put(hostordomain, hostCount);
}
int selectedSeg=currentsegmentnum;
int minCount=hostCount[selectedSeg];
for(int jk=0;jk<maxNumSegments;jk++){
if(hostCount[jk]<minCount){
minCount=hostCount[jk];
selectedSeg=jk;
}
}
if(hostCount[selectedSeg]<=maxHostCount){
count++;
entry.segnum = new IntWritable(selectedSeg);
hostCount[selectedSeg]++;
output.collect(key, entry);
}
} catch (Exception e) {
LOG.warn("Malformed URL: '" + urlString + "', skipping ("
logWrite("Generate-malform:"+hostordomain+"-"+url.toString());
+ StringUtils.stringifyException(e) + ")");
//continue;
}
}
}
--
This message is automatically generated by JIRA.
If you think it was sent incorrectly, please contact your JIRA administrators:
https://issues.apache.org/jira/secure/ContactAdministrators!default.jspa
For more information on JIRA, see: http://www.atlassian.com/software/jira