[ 
https://issues.apache.org/jira/browse/NUTCH-1269?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Lewis John McGibbney updated NUTCH-1269:
----------------------------------------

    Fix Version/s: 1.7
    
> Generate main problems
> ----------------------
>
>                 Key: NUTCH-1269
>                 URL: https://issues.apache.org/jira/browse/NUTCH-1269
>             Project: Nutch
>          Issue Type: Improvement
>          Components: generator
>    Affects Versions: 1.4
>         Environment: software
>            Reporter: behnam nikbakht
>              Labels: Generate, MaxHostCount, MaxNumSegments
>             Fix For: 1.7
>
>         Attachments: NUTCH-1269.patch, NUTCH-1269-v.2.patch
>
>
> There are some problems with the current Generate method when the
> maxNumSegments and maxHostCount options are used:
> 1. The sizes of the generated segments differ.
> 2. With the maxHostCount option, it is unclear whether it was applied or not.
> 3. URLs from one host are distributed non-uniformly between segments.
> We changed Generator.java as described below:
> In the Selector class:
>     private int maxNumSegments;
>     private int segmentSize;
>     private int maxHostCount;
> public void config
> ...
>       maxNumSegments = job.getInt(GENERATOR_MAX_NUM_SEGMENTS, 1);
>       segmentSize=(int)job.getInt(GENERATOR_TOP_N, 10000000)/maxNumSegments;
>       maxHostCount=job.getInt("GENERATE_MAX_PER_HOST", 100);  
> ...
> public void reduce(FloatWritable key, Iterator<SelectorEntry> values,
>         OutputCollector<FloatWritable,SelectorEntry> output, Reporter 
> reporter)
>         throws IOException {
>       int limit2=(int)((limit*3)/2);
>       while (values.hasNext()) {
>       if(count == limit)
>                 break;
>         if (count % segmentSize == 0 ) {
>           if (currentsegmentnum < maxNumSegments-1){
>             currentsegmentnum++;
>           }
>           else
>                 currentsegmentnum=0;
>         }
>         boolean full=true;
>         for(int jk=0;jk<maxNumSegments;jk++){
>               if (segCounts[jk]<segmentSize){
>                       full=false;
>               }
>         }
>         if(full){
>               break;
>         }
>         SelectorEntry entry = values.next();
>         Text url = entry.url;
>                 //logWrite("Generated3:"+limit+"-"+count+"-"+url.toString());
>         String urlString = url.toString();
>         URL u = null;
>         String hostordomain = null;
>         try {
>           if (normalise && normalizers != null) {
>             urlString = normalizers.normalize(urlString,
>                 URLNormalizers.SCOPE_GENERATE_HOST_COUNT);
>           }
>        
>           u = new URL(urlString);
>           if (byDomain) {
>             hostordomain = URLUtil.getDomainName(u);
>           } else {
>             hostordomain = new URL(urlString).getHost();
>           }
>  
>       hostordomain = hostordomain.toLowerCase();
>         boolean countLimit=true;
>         // only filter if we are counting hosts or domains
>              int[] hostCount = hostCounts.get(hostordomain);
>              //host count: {a,b,c,d} means that from this host there are a 
> urls in segment 0 and b urls in seg 1 and ...
>              if (hostCount == null) {
>                  hostCount = new int[maxNumSegments];
>                  for(int kl=0;kl<hostCount.length;kl++)
>                          hostCount[kl]=0;
>                  hostCounts.put(hostordomain, hostCount);
>              }  
>                  int selectedSeg=currentsegmentnum;
>                  int minCount=hostCount[selectedSeg];
>                  for(int jk=0;jk<maxNumSegments;jk++){
>                          if(hostCount[jk]<minCount){
>                                  minCount=hostCount[jk];
>                                  selectedSeg=jk;
>                          }
>                 }
>                 if(hostCount[selectedSeg]<=maxHostCount){
>                         count++;
>                         entry.segnum = new IntWritable(selectedSeg);
>                         hostCount[selectedSeg]++;
>                         output.collect(key, entry);
>                 }
>         } catch (Exception e) {
>           LOG.warn("Malformed URL: '" + urlString + "', skipping ("
>                 logWrite("Generate-malform:"+hostordomain+"-"+url.toString());
>               + StringUtils.stringifyException(e) + ")");
>           //continue;
>         }
>       }
>     }
>     

--
This message is automatically generated by JIRA.
If you think it was sent incorrectly, please contact your JIRA administrators
For more information on JIRA, see: http://www.atlassian.com/software/jira

Reply via email to