Author: markus
Date: Tue Feb 23 10:23:24 2016
New Revision: 1731831
URL: http://svn.apache.org/viewvc?rev=1731831&view=rev
Log:
NUTCH-2220 Rename db.* options used only by the linkdb to linkdb.*
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/conf/nutch-default.xml
nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java
nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbMerger.java
Modified: nutch/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1731831&r1=1731830&r2=1731831&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Tue Feb 23 10:23:24 2016
@@ -1,5 +1,17 @@
+Fellow committers, Nutch 1.12 contains a breaking change (NUTCH-2220). Please use the note below
+in the release announcement and keep it at the top of this CHANGES.txt for the Nutch 1.12 release.
+
+* replace your old conf/nutch-default.xml with the conf/nutch-default.xml from
Nutch 1.12 release
+* if you use LinkDB (e.g. invertlinks) and modified parameters db.max.inlinks and/or db.max.anchor.length
+ and/or db.ignore.internal.links, rename those parameters to linkdb.max.inlinks,
+ linkdb.max.anchor.length and linkdb.ignore.internal.links, respectively
+* db.ignore.internal.links and db.ignore.external.links now operate on the
CrawlDB only
+* linkdb.ignore.internal.links and linkdb.ignore.external.links now operate on
the LinkDB only
+
Nutch Change Log
+* NUTCH-2220 Rename db.* options used only by the linkdb to linkdb.* (markus)
+
* NUTCH-2228 Plugin index-replace unit test broken on Java 8 (snagel via
markus)
* NUTCH-2219 Criteria order to be configurable in DeduplicationJob (Ron van
der Vegt via markus)
Modified: nutch/trunk/conf/nutch-default.xml
URL:
http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1731831&r1=1731830&r2=1731831&view=diff
==============================================================================
--- nutch/trunk/conf/nutch-default.xml (original)
+++ nutch/trunk/conf/nutch-default.xml Tue Feb 23 10:23:24 2016
@@ -538,16 +538,6 @@
</property>
<property>
- <name>db.ignore.internal.links</name>
- <value>true</value>
- <description>If true, when adding new links to a page, links from
- the same host are ignored. This is an effective way to limit the
- size of the link database, keeping only the highest quality
- links.
- </description>
-</property>
-
-<property>
<name>db.ignore.external.links</name>
<value>false</value>
<description>If true, outlinks leading from a page to external hosts or
domain
@@ -616,15 +606,6 @@
</property>
<property>
- <name>db.max.inlinks</name>
- <value>10000</value>
- <description>Maximum number of Inlinks per URL to be kept in LinkDb.
- If "invertlinks" finds more inlinks than this number, only the first
- N inlinks will be stored, and the rest will be discarded.
- </description>
-</property>
-
-<property>
<name>db.max.outlinks.per.page</name>
<value>100</value>
<description>The maximum number of outlinks that we'll process for a page.
@@ -681,6 +662,35 @@
</description>
</property>
+<!-- linkdb properties -->
+
+<property>
+ <name>linkdb.max.inlinks</name>
+ <value>10000</value>
+ <description>Maximum number of Inlinks per URL to be kept in LinkDb.
+ If "invertlinks" finds more inlinks than this number, only the first
+ N inlinks will be stored, and the rest will be discarded.
+ </description>
+</property>
+
+<property>
+ <name>linkdb.ignore.internal.links</name>
+ <value>true</value>
+ <description>If true, when adding new links to a page, links from
+ the same host are ignored. This is an effective way to limit the
+ size of the link database, keeping only the highest quality
+ links.
+ </description>
+</property>
+
+<property>
+ <name>linkdb.ignore.external.links</name>
+ <value>false</value>
+ <description>If true, when adding new links to a page, links from
+ a different host are ignored.
+ </description>
+</property>
+
<!-- generate properties -->
<property>
Modified: nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java?rev=1731831&r1=1731830&r2=1731831&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java Tue Feb 23 10:23:24
2016
@@ -48,8 +48,8 @@ public class LinkDb extends NutchTool im
public static final Logger LOG = LoggerFactory.getLogger(LinkDb.class);
- public static final String IGNORE_INTERNAL_LINKS =
"db.ignore.internal.links";
- public static final String IGNORE_EXTERNAL_LINKS =
"db.ignore.external.links";
+ public static final String IGNORE_INTERNAL_LINKS =
"linkdb.ignore.internal.links";
+ public static final String IGNORE_EXTERNAL_LINKS =
"linkdb.ignore.external.links";
public static final String CURRENT_NAME = "current";
public static final String LOCK_NAME = ".locked";
@@ -68,7 +68,7 @@ public class LinkDb extends NutchTool im
}
public void configure(JobConf job) {
- maxAnchorLength = job.getInt("db.max.anchor.length", 100);
+ maxAnchorLength = job.getInt("linkdb.max.anchor.length", 100);
ignoreInternalLinks = job.getBoolean(IGNORE_INTERNAL_LINKS, true);
ignoreExternalLinks = job.getBoolean(IGNORE_EXTERNAL_LINKS, false);
Modified: nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbMerger.java
URL:
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbMerger.java?rev=1731831&r1=1731830&r2=1731831&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbMerger.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbMerger.java Tue Feb 23
10:23:24 2016
@@ -55,7 +55,7 @@ import org.apache.nutch.util.TimingUtil;
* </p>
* <p>
* If more than one LinkDb contains information about the same URL, all inlinks
- * are accumulated, but only at most <code>db.max.inlinks</code> inlinks will
+ * are accumulated, but only at most <code>linkdb.max.inlinks</code> inlinks
will
* ever be added.
* </p>
* <p>
@@ -104,7 +104,7 @@ public class LinkDbMerger extends Config
}
public void configure(JobConf job) {
- maxInlinks = job.getInt("db.max.inlinks", 10000);
+ maxInlinks = job.getInt("linkdb.max.inlinks", 10000);
}
public void close() throws IOException {