Author: markus Date: Tue Feb 23 10:23:24 2016 New Revision: 1731831 URL: http://svn.apache.org/viewvc?rev=1731831&view=rev Log: NUTCH-2220 Rename db.* options used only by the linkdb to linkdb.*
Modified: nutch/trunk/CHANGES.txt nutch/trunk/conf/nutch-default.xml nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbMerger.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1731831&r1=1731830&r2=1731831&view=diff ============================================================================== --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Tue Feb 23 10:23:24 2016 @@ -1,5 +1,17 @@ +Fellow committers, Nutch 1.12 contains a breaking change NUTCH-2220. Please use the note below and +in the release announcement and keep it on top in this CHANGES.txt for the Nutch 1.12 release. + +* replace your old conf/nutch-default.xml with the conf/nutch-default.xml from Nutch 1.12 release +* if you use LinkDB (e.g. invertlinks) and modified parameters db.max.inlinks and/or db.max.anchor.length + and/or db.ignore.internal.links, rename those parameters to linkdb.max.inlinks and + linkdb.max.anchor.length and linkdb.ignore.internal.links +* db.ignore.internal.links and db.ignore.external.links now operate on the CrawlDB only +* linkdb.ignore.internal.links and linkdb.ignore.external.links now operate on the LinkDB only + Nutch Change Log +* NUTCH-2220 Rename db.* options used only by the linkdb to linkdb.* (markus) + * NUTCH-2228 Plugin index-replace unit test broken on Java 8 (snagel via markus) * NUTCH-2219 Criteria order to be configurable in DeduplicationJob (Ron van der Vegt via markus) Modified: nutch/trunk/conf/nutch-default.xml URL: http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1731831&r1=1731830&r2=1731831&view=diff ============================================================================== --- nutch/trunk/conf/nutch-default.xml (original) +++ nutch/trunk/conf/nutch-default.xml Tue Feb 23 10:23:24 2016 @@ -538,16 +538,6 @@ </property> <property> - <name>db.ignore.internal.links</name> - <value>true</value> - <description>If 
true, when adding new links to a page, links from - the same host are ignored. This is an effective way to limit the - size of the link database, keeping only the highest quality - links. - </description> -</property> - -<property> <name>db.ignore.external.links</name> <value>false</value> <description>If true, outlinks leading from a page to external hosts or domain @@ -616,15 +606,6 @@ </property> <property> - <name>db.max.inlinks</name> - <value>10000</value> - <description>Maximum number of Inlinks per URL to be kept in LinkDb. - If "invertlinks" finds more inlinks than this number, only the first - N inlinks will be stored, and the rest will be discarded. - </description> -</property> - -<property> <name>db.max.outlinks.per.page</name> <value>100</value> <description>The maximum number of outlinks that we'll process for a page. @@ -681,6 +662,35 @@ </description> </property> +<!-- linkdb properties --> + +<property> + <name>linkdb.max.inlinks</name> + <value>10000</value> + <description>Maximum number of Inlinks per URL to be kept in LinkDb. + If "invertlinks" finds more inlinks than this number, only the first + N inlinks will be stored, and the rest will be discarded. + </description> +</property> + +<property> + <name>linkdb.ignore.internal.links</name> + <value>true</value> + <description>If true, when adding new links to a page, links from + the same host are ignored. This is an effective way to limit the + size of the link database, keeping only the highest quality + links. + </description> +</property> + +<property> + <name>linkdb.ignore.external.links</name> + <value>false</value> + <description>If true, when adding new links to a page, links from + a different host are ignored. 
+ </description> +</property> + <!-- generate properties --> <property> Modified: nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java?rev=1731831&r1=1731830&r2=1731831&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java (original) +++ nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java Tue Feb 23 10:23:24 2016 @@ -48,8 +48,8 @@ public class LinkDb extends NutchTool im public static final Logger LOG = LoggerFactory.getLogger(LinkDb.class); - public static final String IGNORE_INTERNAL_LINKS = "db.ignore.internal.links"; - public static final String IGNORE_EXTERNAL_LINKS = "db.ignore.external.links"; + public static final String IGNORE_INTERNAL_LINKS = "linkdb.ignore.internal.links"; + public static final String IGNORE_EXTERNAL_LINKS = "linkdb.ignore.external.links"; public static final String CURRENT_NAME = "current"; public static final String LOCK_NAME = ".locked"; @@ -68,7 +68,7 @@ public class LinkDb extends NutchTool im } public void configure(JobConf job) { - maxAnchorLength = job.getInt("db.max.anchor.length", 100); + maxAnchorLength = job.getInt("linkdb.max.anchor.length", 100); ignoreInternalLinks = job.getBoolean(IGNORE_INTERNAL_LINKS, true); ignoreExternalLinks = job.getBoolean(IGNORE_EXTERNAL_LINKS, false); Modified: nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbMerger.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbMerger.java?rev=1731831&r1=1731830&r2=1731831&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbMerger.java (original) +++ nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbMerger.java Tue Feb 23 10:23:24 2016 @@ -55,7 +55,7 @@ import org.apache.nutch.util.TimingUtil; * </p> * <p> * If more than one LinkDb contains 
information about the same URL, all inlinks - * are accumulated, but only at most <code>db.max.inlinks</code> inlinks will + * are accumulated, but only at most <code>linkdb.max.inlinks</code> inlinks will * ever be added. * </p> * <p> @@ -104,7 +104,7 @@ public class LinkDbMerger extends Config } public void configure(JobConf job) { - maxInlinks = job.getInt("db.max.inlinks", 10000); + maxInlinks = job.getInt("linkdb.max.inlinks", 10000); } public void close() throws IOException {