Update of /cvsroot/nutch/nutch/src/java/net/nutch/indexer
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv9211/src/java/net/nutch/indexer
Modified Files:
NutchSimilarity.java
Log Message:
Some scoring tweaks.
Index: NutchSimilarity.java
===================================================================
RCS file: /cvsroot/nutch/nutch/src/java/net/nutch/indexer/NutchSimilarity.java,v
retrieving revision 1.4
retrieving revision 1.5
diff -C2 -d -r1.4 -r1.5
*** NutchSimilarity.java 5 Apr 2004 16:44:56 -0000 1.4
--- NutchSimilarity.java 6 Apr 2004 02:54:05 -0000 1.5
***************
*** 10,23 ****
private static final int MIN_CONTENT_LENGTH = 1000;
! /** Normalize field by length. */
public float lengthNorm(String fieldName, int numTokens) {
if ("url".equals(fieldName)) { // URL: prefer short
return 1.0f / numTokens; // use linear normalization
} else if ("content".equals(fieldName)) { // Content: penalize short
return super.lengthNorm(fieldName, // treat short as longer
Math.max(numTokens, MIN_CONTENT_LENGTH));
! } else { // Anchor: use default
return super.lengthNorm(fieldName, numTokens);
}
--- 10,26 ----
private static final int MIN_CONTENT_LENGTH = 1000;
! /** Normalize field by length. Called at index time. */
public float lengthNorm(String fieldName, int numTokens) {
if ("url".equals(fieldName)) { // URL: prefer short
return 1.0f / numTokens; // use linear normalization
+ } else if ("anchor".equals(fieldName)) { // Anchor: prefer more
+ return (float)(1.0 / Math.log(numTokens)); // use log
+
} else if ("content".equals(fieldName)) { // Content: penalize short
return super.lengthNorm(fieldName, // treat short as longer
Math.max(numTokens, MIN_CONTENT_LENGTH));
! } else { // use default
return super.lengthNorm(fieldName, numTokens);
}
-------------------------------------------------------
This SF.Net email is sponsored by: IBM Linux Tutorials
Free Linux tutorial presented by Daniel Robbins, President and CEO of
Gentoo Technologies. Learn everything from fundamentals to system
administration. http://ads.osdn.com/?ad_id=1470&alloc_id=3638&op=click
_______________________________________________
Nutch-cvs mailing list
[EMAIL PROTECTED]
https://lists.sourceforge.net/lists/listinfo/nutch-cvs