Author: ab Date: Sat Sep 23 10:30:45 2006 New Revision: 449279 URL: http://svn.apache.org/viewvc?view=rev&rev=449279 Log: NUTCH-336: differentiate between newly discovered pages (known value through inlink contributions) and newly injected pages (arbitrarily defined initial value).
Modified: lucene/nutch/branches/branch-0.8/CHANGES.txt lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/crawl/CrawlDbReducer.java lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/crawl/Injector.java lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/scoring/ScoringFilter.java lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/scoring/ScoringFilters.java lucene/nutch/branches/branch-0.8/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java Modified: lucene/nutch/branches/branch-0.8/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/branches/branch-0.8/CHANGES.txt?view=diff&rev=449279&r1=449278&r2=449279 ============================================================================== --- lucene/nutch/branches/branch-0.8/CHANGES.txt (original) +++ lucene/nutch/branches/branch-0.8/CHANGES.txt Sat Sep 23 10:30:45 2006 @@ -31,6 +31,10 @@ 10. NUTCH-332 - Fix doubling score caused by links to self (Stefan Groschupf via ab) + +11. NUTCH-336 - Differentiate between newly discovered pages and newly + injected pages (Chris Schneider via ab) NOTE: this changes the + scoring API, filter implementations need to be updated. 
Release 0.8 - 2006-07-25 Modified: lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/crawl/CrawlDbReducer.java URL: http://svn.apache.org/viewvc/lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/crawl/CrawlDbReducer.java?view=diff&rev=449279&r1=449278&r2=449279 ============================================================================== --- lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/crawl/CrawlDbReducer.java (original) +++ lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/crawl/CrawlDbReducer.java Sat Sep 23 10:30:45 2006 @@ -36,12 +36,10 @@ private CrawlDatum result = new CrawlDatum(); private ArrayList linked = new ArrayList(); private ScoringFilters scfilters = null; - private float scoreInjected; public void configure(JobConf job) { retryMax = job.getInt("db.fetch.retry.max", 3); scfilters = new ScoringFilters(job); - scoreInjected = job.getFloat("db.score.injected", 1.0f); } public void close() {} @@ -112,7 +110,7 @@ LOG.warn("Cannot filter init score for url " + key + ", using default: " + e.getMessage()); } - result.setScore(scoreInjected); + result.setScore(0.0f); } } break; Modified: lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/crawl/Injector.java URL: http://svn.apache.org/viewvc/lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/crawl/Injector.java?view=diff&rev=449279&r1=449278&r2=449279 ============================================================================== --- lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/crawl/Injector.java (original) +++ lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/crawl/Injector.java Sat Sep 23 10:30:45 2006 @@ -78,10 +78,10 @@ CrawlDatum datum = new CrawlDatum(CrawlDatum.STATUS_DB_UNFETCHED, interval); datum.setScore(scoreInjected); try { - scfilters.initialScore(value, datum); + scfilters.injectedScore(value, datum); } catch (ScoringFilterException e) { if (LOG.isWarnEnabled()) { - LOG.warn("Cannot filter init score for url " + 
url + + LOG.warn("Cannot filter injected score for url " + url + ", using default (" + e.getMessage() + ")"); } datum.setScore(scoreInjected); Modified: lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/scoring/ScoringFilter.java URL: http://svn.apache.org/viewvc/lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/scoring/ScoringFilter.java?view=diff&rev=449279&r1=449278&r2=449279 ============================================================================== --- lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/scoring/ScoringFilter.java (original) +++ lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/scoring/ScoringFilter.java Sat Sep 23 10:30:45 2006 @@ -41,7 +41,21 @@ public final static String X_POINT_ID = ScoringFilter.class.getName(); /** - * Set an initial score for newly injected pages. + * Set an initial score for newly injected pages. Note: newly injected pages + * may have no inlinks, so filter implementations may wish to set this + * score to a non-zero value, to give newly injected pages some initial + * credit. + * @param url url of the page + * @param datum new datum. Filters will modify it in-place. + * @throws ScoringFilterException + */ + public void injectedScore(UTF8 url, CrawlDatum datum) throws ScoringFilterException; + + /** + * Set an initial score for newly discovered pages. Note: newly discovered pages + * have at least one inlink with its score contribution, so filter implementations + * may choose to set initial score to zero (unknown value), and then the inlink + * score contribution will set the "real" value of the new page. * @param url url of the page * @param datum new datum. Filters will modify it in-place. 
* @throws ScoringFilterException Modified: lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/scoring/ScoringFilters.java URL: http://svn.apache.org/viewvc/lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/scoring/ScoringFilters.java?view=diff&rev=449279&r1=449278&r2=449279 ============================================================================== --- lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/scoring/ScoringFilters.java (original) +++ lucene/nutch/branches/branch-0.8/src/java/org/apache/nutch/scoring/ScoringFilters.java Sat Sep 23 10:30:45 2006 @@ -92,10 +92,17 @@ return initSort; } - /** Calculate a new initial score, used when adding new pages. */ + /** Calculate a new initial score, used when adding newly discovered pages. */ public void initialScore(UTF8 url, CrawlDatum datum) throws ScoringFilterException { for (int i = 0; i < this.filters.length; i++) { this.filters[i].initialScore(url, datum); + } + } + + /** Calculate a new initial score, used when injecting new pages. */ + public void injectedScore(UTF8 url, CrawlDatum datum) throws ScoringFilterException { + for (int i = 0; i < this.filters.length; i++) { + this.filters[i].injectedScore(url, datum); } } Modified: lucene/nutch/branches/branch-0.8/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java URL: http://svn.apache.org/viewvc/lucene/nutch/branches/branch-0.8/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java?view=diff&rev=449279&r1=449278&r2=449279 ============================================================================== --- lucene/nutch/branches/branch-0.8/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java (original) +++ lucene/nutch/branches/branch-0.8/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java Sat Sep 23 10:30:45 2006 @@ -73,8 +73,14 @@ } /** Set to the value defined in config, 1.0f by default. 
*/ - public void initialScore(UTF8 url, CrawlDatum datum) throws ScoringFilterException { + public void injectedScore(UTF8 url, CrawlDatum datum) throws ScoringFilterException { datum.setScore(scoreInjected); + } + + /** Set to 0.0f (unknown value) - inlink contributions will bring it to + * a correct level. Newly discovered pages have at least one inlink. */ + public void initialScore(UTF8 url, CrawlDatum datum) throws ScoringFilterException { + datum.setScore(0.0f); } /** Use {@link CrawlDatum#getScore()}. */ ------------------------------------------------------------------------- Take Surveys. Earn Cash. Influence the Future of IT Join SourceForge.net's Techsay panel and you'll get the chance to share your opinions on IT & business topics through brief surveys -- and earn cash http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV _______________________________________________ Nutch-cvs mailing list Nutch-cvs@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/nutch-cvs