Hi list !

Can someone explain to me in more detail what the following code does? It is
the code of OPICScoringFilter.java. Can I modify it so that the scoring used
when the results are displayed also takes the value of a META tag into account?
It is very difficult for me to program my own plugin, so I need your help.

This is the code:

package org.apache.nutch.scoring.opic;

import java.net.MalformedURLException;
import java.net.URL;
import java.util.List;

// Commons Logging imports
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.lucene.document.Document;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.Inlinks;
import org.apache.nutch.fetcher.Fetcher;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.scoring.ScoringFilter;
import org.apache.nutch.scoring.ScoringFilterException;
import org.apache.nutch.util.LogUtil;

/**
 * This plugin implements a variant of an Online Page Importance Computation
 * (OPIC) score, described in this paper:
 * <a href="http://www2003.org/cdrom/papers/refereed/p007/p7-abiteboul.html">
 * Abiteboul, Serge and Preda, Mihai and Cobena, Gregory (2003),
 * Adaptive On-Line Page Importance Computation
 * </a>.
 *
 * <p>The filter is driven by the following configuration properties, all read
 * once in {@link #setConf(Configuration)}:
 * <ul>
 *   <li><code>db.score.injected</code> (default 1.0) - score of injected URLs,
 *       also the fallback score when a page carries no parsable score;</li>
 *   <li><code>indexer.score.power</code> (default 0.5) - exponent applied to
 *       the stored score in {@link #indexerScore};</li>
 *   <li><code>db.score.link.internal</code> (default 1.0) - factor applied to
 *       score passed along links whose source and target hosts match;</li>
 *   <li><code>db.score.link.external</code> (default 1.0) - factor applied to
 *       score passed along all other links;</li>
 *   <li><code>db.score.count.filtered</code> (default false) - whether the
 *       score is divided by the count of all outlinks or only the valid
 *       ones.</li>
 * </ul>
 *
 * @author Andrzej Bialecki
 */
public class OPICScoringFilter implements ScoringFilter {

  private static final Log LOG = LogFactory.getLog(OPICScoringFilter.class);

  private Configuration conf;
  private float scoreInjected;
  private float scorePower;
  private float internalScoreFactor;
  private float externalScoreFactor;
  private boolean countFiltered;

  /** Returns the configuration last passed to {@link #setConf(Configuration)}. */
  public Configuration getConf() {
    return conf;
  }

  /**
   * Stores the configuration and caches every scoring parameter this filter
   * uses, so that the scoring methods do not have to look them up again.
   *
   * @param conf configuration to read the scoring properties from
   */
  public void setConf(Configuration conf) {
    this.conf = conf;
    scoreInjected = conf.getFloat("db.score.injected", 1.0f);
    scorePower = conf.getFloat("indexer.score.power", 0.5f);
    internalScoreFactor = conf.getFloat("db.score.link.internal", 1.0f);
    externalScoreFactor = conf.getFloat("db.score.link.external", 1.0f);
    countFiltered = conf.getBoolean("db.score.count.filtered", false);
  }

  /** Set to the value defined in config, 1.0f by default. */
  public void injectedScore(Text url, CrawlDatum datum)
      throws ScoringFilterException {
    datum.setScore(scoreInjected);
  }

  /**
   * Set to 0.0f (unknown value) - inlink contributions will bring it to
   * a correct level. Newly discovered pages have at least one inlink.
   */
  public void initialScore(Text url, CrawlDatum datum)
      throws ScoringFilterException {
    datum.setScore(0.0f);
  }

  /** Use {@link CrawlDatum#getScore()}; <code>initSort</code> is ignored. */
  public float generatorSortValue(Text url, CrawlDatum datum, float initSort)
      throws ScoringFilterException {
    return datum.getScore();
  }

  /**
   * Increase the score by a sum of inlinked scores.
   *
   * @param url      URL of the page being updated (not used here)
   * @param old      previous entry for the page, or <code>null</code> if there
   *                 is none, in which case <code>datum</code>'s own score is
   *                 used as the base
   * @param datum    entry whose score is overwritten with the new value
   * @param inlinked list of {@link CrawlDatum} instances whose scores are
   *                 added to the base score
   */
  public void updateDbScore(Text url, CrawlDatum old, CrawlDatum datum,
      List inlinked) throws ScoringFilterException {
    // The parameter is kept as a raw List: parameterizing it here could clash
    // with the signature declared by the ScoringFilter interface.
    float adjust = 0.0f;
    for (int i = 0; i < inlinked.size(); i++) {
      CrawlDatum linked = (CrawlDatum) inlinked.get(i);
      adjust += linked.getScore();
    }
    if (old == null) {
      old = datum;
    }
    datum.setScore(old.getScore() + adjust);
  }

  /** Store a float value of CrawlDatum.getScore() under Nutch.SCORE_KEY. */
  public void passScoreBeforeParsing(Text url, CrawlDatum datum,
      Content content) {
    content.getMetadata().set(Nutch.SCORE_KEY,
        Float.toString(datum.getScore()));
  }

  /** Copy the value from Content metadata under Nutch.SCORE_KEY to parseData. */
  public void passScoreAfterParsing(Text url, Content content, Parse parse) {
    parse.getData().getContentMeta().set(Nutch.SCORE_KEY,
        content.getMetadata().get(Nutch.SCORE_KEY));
  }

  /**
   * Get a float value from Nutch.SCORE_KEY, divide it by the number of
   * outlinks, scale it by the internal or external link factor and set it as
   * the score of <code>target</code>.
   *
   * @param fromUrl    URL of the page that contains the link
   * @param toUrl      URL the link points to
   * @param parseData  parse data of the source page; its content metadata is
   *                   expected to carry the page score under Nutch.SCORE_KEY
   * @param target     entry for the linked page; receives the computed score
   * @param adjust     returned unchanged
   * @param allCount   number of all outlinks of the source page
   * @param validCount number of outlinks that were not filtered out
   * @return <code>adjust</code>, unmodified
   */
  public CrawlDatum distributeScoreToOutlink(Text fromUrl, Text toUrl,
      ParseData parseData, CrawlDatum target, CrawlDatum adjust, int allCount,
      int validCount) throws ScoringFilterException {
    // Fall back to the injected score when the page carries no usable score.
    float score = scoreInjected;
    String scoreString = parseData.getContentMeta().get(Nutch.SCORE_KEY);
    if (scoreString != null) {
      try {
        score = Float.parseFloat(scoreString);
      } catch (NumberFormatException e) {
        LOG.warn("Unparsable score '" + scoreString + "' for " + fromUrl
            + ", falling back to the injected score", e);
      }
    }

    int linkCount = countFiltered ? allCount : validCount;
    if (linkCount <= 0) {
      // Float division by zero does not throw; it would silently store
      // Infinity or NaN as the target's score. With nothing to divide among,
      // leave the target untouched instead.
      return adjust;
    }
    score /= linkCount;

    // internal or external score factor
    try {
      String toHost = new URL(toUrl.toString()).getHost();
      String fromHost = new URL(fromUrl.toString()).getHost();
      if (toHost.equalsIgnoreCase(fromHost)) {
        score *= internalScoreFactor;
      } else {
        score *= externalScoreFactor;
      }
    } catch (MalformedURLException e) {
      // Hosts cannot be compared, so treat the link as external.
      LOG.warn("Malformed URL while scoring link " + fromUrl + " -> " + toUrl
          + ", treating the link as external", e);
      score *= externalScoreFactor;
    }
    target.setScore(score);
    // XXX (ab) no adjustment? I think this is contrary to the algorithm descr.
    // XXX in the paper, where page "loses" its score if it's distributed to
    // XXX linked pages...
    return adjust;
  }

  /**
   * Dampen the boost value by scorePower.
   *
   * @param dbDatum   entry holding the stored score of the page
   * @param initScore score to fall back to when <code>dbDatum</code> is
   *                  <code>null</code>
   * @return the stored score raised to the power of
   *         <code>indexer.score.power</code>, or <code>initScore</code> when
   *         no stored entry is available
   */
  public float indexerScore(Text url, Document doc, CrawlDatum dbDatum,
      CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore)
      throws ScoringFilterException {
    if (dbDatum == null) {
      // Defensive: without a stored entry there is no score to dampen.
      return initScore;
    }
    return (float) Math.pow(dbDatum.getScore(), scorePower);
  }
}


Thanks in advance,

Jisay

_________________________________________________________________
Changez votre Live en un clic !
http://get.live.com

Reply via email to