(nutch) branch master updated: NUTCH-3029
This is an automated email from the ASF dual-hosted git repository. markus pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git The following commit(s) were added to refs/heads/master by this push: new 98902236d NUTCH-3029 98902236d is described below commit 98902236d782615ea1b8676a477bfa735499810a Author: Markus Jelsma AuthorDate: Thu Mar 14 10:49:34 2024 + NUTCH-3029 --- src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java b/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java index 64719cdae..8ee957c09 100644 --- a/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java +++ b/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java @@ -192,7 +192,7 @@ public class AdaptiveFetchSchedule extends AbstractFetchSchedule { * * @param url url to get hostname for * @return hostname - * @throws URISyntaxException + * @throws URISyntaxException if the given string violates RFC 2396 */ public static String getHostName(String url) throws URISyntaxException { URI uri = new URI(url);
(nutch) branch master updated: NUTCH-3029 Host specific max. and min. intervals in adaptive scheduler
This is an automated email from the ASF dual-hosted git repository. markus pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git The following commit(s) were added to refs/heads/master by this push: new a8ec17ca8 NUTCH-3029 Host specific max. and min. intervals in adaptive scheduler a8ec17ca8 is described below commit a8ec17ca853b2488bf5d96538915a00a05064a31 Author: Markus Jelsma AuthorDate: Wed Mar 13 18:35:22 2024 + NUTCH-3029 Host specific max. and min. intervals in adaptive scheduler --- src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java | 1 + 1 file changed, 1 insertion(+) diff --git a/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java b/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java index 4d4a3af73..64719cdae 100644 --- a/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java +++ b/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java @@ -192,6 +192,7 @@ public class AdaptiveFetchSchedule extends AbstractFetchSchedule { * * @param url url to get hostname for * @return hostname + * @throws URISyntaxException */ public static String getHostName(String url) throws URISyntaxException { URI uri = new URI(url);
(nutch) branch master updated: NUTCH-3029 Host specific max. and min. intervals in adaptive scheduler
This is an automated email from the ASF dual-hosted git repository. markus pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git The following commit(s) were added to refs/heads/master by this push: new 84cda2abd NUTCH-3029 Host specific max. and min. intervals in adaptive scheduler 84cda2abd is described below commit 84cda2abd500667222fdb00e503780ee0bdaaab4 Author: Markus Jelsma AuthorDate: Wed Mar 13 16:12:21 2024 + NUTCH-3029 Host specific max. and min. intervals in adaptive scheduler --- .../org/apache/nutch/crawl/AdaptiveFetchSchedule.java | 17 +++-- 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java b/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java index a403d5649..4d4a3af73 100644 --- a/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java +++ b/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java @@ -189,6 +189,9 @@ public class AdaptiveFetchSchedule extends AbstractFetchSchedule { /** * Strip a URL, leaving only the host name. + * + * @param url url to get hostname for + * @return hostname */ public static String getHostName(String url) throws URISyntaxException { URI uri = new URI(url); @@ -198,9 +201,10 @@ public class AdaptiveFetchSchedule extends AbstractFetchSchedule { /** * Returns the max_interval for this URL, which might depend on the host. 
- * @param url the URL to be scheduled - * @param defaultMaxInterval the value to which to default - * if max_interval has not been configured for this host + * + * @param url the URL to be scheduled + * @param defaultMaxInterval the value to which to default if max_interval has not been configured for this host + * @return the configured maximum interval or the default interval */ public float getMaxInterval(Text url, float defaultMaxInterval){ if (hostSpecificMaxInterval.isEmpty()) { @@ -220,9 +224,10 @@ public class AdaptiveFetchSchedule extends AbstractFetchSchedule { /** * Returns the min_interval for this URL, which might depend on the host. - * @param url the URL to be scheduled - * @param defaultMinInterval the value to which to default - * if min_interval has not been configured for this host + * + * @param url the URL to be scheduled + * @param defaultMinInterval the value to which to default if min_interval has not been configured for this host + * @return the configured minimum interval or the default interval */ public float getMinInterval(Text url, float defaultMinInterval){ if (hostSpecificMinInterval.isEmpty()) {
(nutch) branch master updated: NUTCH-3029 Host specific max. and min. intervals in adaptive scheduler
This is an automated email from the ASF dual-hosted git repository. markus pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git The following commit(s) were added to refs/heads/master by this push: new 5ba50c0c6 NUTCH-3029 Host specific max. and min. intervals in adaptive scheduler 5ba50c0c6 is described below commit 5ba50c0c6091a95818d3788f0d5b7c0ff49bec57 Author: Markus Jelsma AuthorDate: Wed Mar 13 14:53:10 2024 + NUTCH-3029 Host specific max. and min. intervals in adaptive scheduler --- .../apache/nutch/crawl/AdaptiveFetchSchedule.java | 159 - 1 file changed, 155 insertions(+), 4 deletions(-) diff --git a/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java b/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java index 5bccd4f30..a403d5649 100644 --- a/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java +++ b/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java @@ -22,11 +22,20 @@ import org.apache.hadoop.io.FloatWritable; import org.apache.nutch.crawl.CrawlDatum; import org.apache.nutch.metadata.Nutch; import org.apache.nutch.util.NutchConfiguration; +import org.apache.commons.lang.StringUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.io.Reader; +import java.io.FileReader; +import java.io.BufferedReader; +import java.io.IOException; +import java.util.Map; +import java.util.HashMap; import java.lang.invoke.MethodHandles; +import java.net.URI; +import java.net.URISyntaxException; /** * This class implements an adaptive re-fetch algorithm. 
This works as follows: @@ -79,9 +88,16 @@ public class AdaptiveFetchSchedule extends AbstractFetchSchedule { private double SYNC_DELTA_RATE; + private Configuration conf; + + private Map<String, Float> hostSpecificMaxInterval = new HashMap<>(); + + private Map<String, Float> hostSpecificMinInterval = new HashMap<>(); + @Override public void setConf(Configuration conf) { super.setConf(conf); +this.conf = conf; if (conf == null) return; INC_RATE = conf.getFloat("db.fetch.schedule.adaptive.inc_rate", 0.2f); @@ -92,6 +108,136 @@ public class AdaptiveFetchSchedule extends AbstractFetchSchedule { SYNC_DELTA = conf.getBoolean("db.fetch.schedule.adaptive.sync_delta", true); SYNC_DELTA_RATE = conf.getFloat( "db.fetch.schedule.adaptive.sync_delta_rate", 0.2f); +try { + setHostSpecificIntervals("adaptive-host-specific-intervals.txt", +MIN_INTERVAL, MAX_INTERVAL); +} catch (IOException e){ + LOG.error("Failed reading the configuration file. ", e); +} + } + + /** + * Load host-specific min_intervals and max_intervals + * from the configuration file into the HashMaps. + */ + private void setHostSpecificIntervals(String fileName, +float defaultMin, float defaultMax) throws IOException { +Reader configReader = null; +configReader = conf.getConfResourceAsReader(fileName); +if (configReader == null) { + configReader = new FileReader(fileName); +} +BufferedReader reader = new BufferedReader(configReader); +String line; +int lineNo = 0; +while ((line = reader.readLine()) != null) { + lineNo++; + if (StringUtils.isNotBlank(line) && !line.startsWith("#")) { +line = line.trim(); +String[] parts = line.split("\\s+"); +if (parts.length == 3) { + // TODO: Maybe add host validation here? + // It might get computationally expensive for large files, though. 
+ String host = parts[0].trim().toLowerCase(); + String minInt = parts[1].trim(); + String maxInt = parts[2].trim(); + if (minInt.equalsIgnoreCase("default")){ minInt = "0"; } + if (maxInt.equalsIgnoreCase("default")){ maxInt = "0"; } + float m,M; + try { +m = Float.parseFloat(minInt); +M = Float.parseFloat(maxInt); + +//negative values and mismatched boundaries are ignored +//(default to global settings) +if (m < 0 || M < 0 || m > M){ + LOG.error("Improper fetch intervals given on line " + String.valueOf(lineNo) ++ " in the config. file: " + line); +} else { + + // min. interval should be positive and above the global minimum + if (m > 0 && m > defaultMin){ + hostSpecificMinInterval.put(host,m); + LOG.debug("Added custom min. interval " + m + " for host " + host + "."); + } else if (m > 0) { +LOG.error("Min. interval out of bounds on line " + String.valueOf(lineNo) + + " in the config. file: " + line); + } + +
(nutch) branch master updated: NUTCH-3029 Host specific max. and min. intervals in adaptive scheduler
This is an automated email from the ASF dual-hosted git repository. markus pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git The following commit(s) were added to refs/heads/master by this push: new 4642c30c2 NUTCH-3029 Host specific max. and min. intervals in adaptive scheduler 4642c30c2 is described below commit 4642c30c2aeb2a1fa2436541bd4af877d0aad86a Author: Markus Jelsma AuthorDate: Wed Mar 13 12:58:05 2024 +0000 NUTCH-3029 Host specific max. and min. intervals in adaptive scheduler --- conf/adaptive-host-specific-intervals.txt.template | 14 ++ 1 file changed, 14 insertions(+) diff --git a/conf/adaptive-host-specific-intervals.txt.template b/conf/adaptive-host-specific-intervals.txt.template new file mode 100644 index 0..4aa7920d3 --- /dev/null +++ b/conf/adaptive-host-specific-intervals.txt.template @@ -0,0 +1,14 @@ +# This file defines a mapping that associates specific min. and max. refetching time intervals +# to a host, that deviate from the default settings of the AdaptiveFetchSchedule class. +# +# Format: <hostname> <min_interval> <max_interval> +# +# The two values will be parsed as float and should be STRICTLY between +# db.fetch.schedule.adaptive.min_interval and db.fetch.schedule.adaptive.max_interval. +# +# To use default values, write "default" or "0". +# The default min. is 60 (1 min) and default max. is 31536000 (1 year). +# +www.apache.org default 1728000 +www.example.org 1296000 0 +nutch.apache.org 864000 216
(nutch) branch master updated: NUTCH-3030 Use system default cipher suites instead of hard-coded set
This is an automated email from the ASF dual-hosted git repository. markus pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git The following commit(s) were added to refs/heads/master by this push: new 551c50b1c NUTCH-3030 Use system default cipher suites instead of hard-coded set 551c50b1c is described below commit 551c50b1caac27ae65f25517de5b202b314fef0e Author: Markus Jelsma AuthorDate: Wed Mar 13 11:50:25 2024 + NUTCH-3030 Use system default cipher suites instead of hard-coded set --- .../apache/nutch/protocol/http/api/HttpBase.java | 63 +- 1 file changed, 12 insertions(+), 51 deletions(-) diff --git a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java index 1438754ce..034fa7840 100644 --- a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java +++ b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java @@ -31,6 +31,7 @@ import java.util.List; import java.util.Map; import java.util.Set; import java.util.concurrent.ThreadLocalRandom; +import javax.net.ssl.SSLSocketFactory; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -320,57 +321,17 @@ public abstract class HttpBase implements Protocol { } } -String[] protocols = conf.getStrings("http.tls.supported.protocols", -"TLSv1.3", "TLSv1.2", "TLSv1.1", "TLSv1", "SSLv3"); -String[] ciphers = conf.getStrings("http.tls.supported.cipher.suites", -"ECDHE-ECDSA-AES128-GCM-SHA256", "ECDHE-RSA-AES128-GCM-SHA256", -"ECDHE-ECDSA-AES256-GCM-SHA384", "ECDHE-RSA-AES256-GCM-SHA384", -"ECDHE-ECDSA-CHACHA20-POLY1305", "ECDHE-RSA-CHACHA20-POLY1305", -"DHE-RSA-AES128-GCM-SHA256", "DHE-RSA-AES256-GCM-SHA384", -"TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA384", -"TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA384", -"TLS_RSA_WITH_AES_256_CBC_SHA256", -"TLS_ECDH_ECDSA_WITH_AES_256_CBC_SHA384", -"TLS_ECDH_RSA_WITH_AES_256_CBC_SHA384", 
-"TLS_DHE_RSA_WITH_AES_256_CBC_SHA256", -"TLS_DHE_DSS_WITH_AES_256_CBC_SHA256", -"TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA", -"TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA", "TLS_RSA_WITH_AES_256_CBC_SHA", -"TLS_ECDH_ECDSA_WITH_AES_256_CBC_SHA", -"TLS_ECDH_RSA_WITH_AES_256_CBC_SHA", "TLS_DHE_RSA_WITH_AES_256_CBC_SHA", -"TLS_DHE_DSS_WITH_AES_256_CBC_SHA", -"TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256", -"TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA256", -"TLS_RSA_WITH_AES_128_CBC_SHA256", -"TLS_ECDH_ECDSA_WITH_AES_128_CBC_SHA256", -"TLS_ECDH_RSA_WITH_AES_128_CBC_SHA256", -"TLS_DHE_RSA_WITH_AES_128_CBC_SHA256", -"TLS_DHE_DSS_WITH_AES_128_CBC_SHA256", -"TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA", -"TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA", "TLS_RSA_WITH_AES_128_CBC_SHA", -"TLS_ECDH_ECDSA_WITH_AES_128_CBC_SHA", -"TLS_ECDH_RSA_WITH_AES_128_CBC_SHA", "TLS_DHE_RSA_WITH_AES_128_CBC_SHA", -"TLS_DHE_DSS_WITH_AES_128_CBC_SHA", "TLS_ECDHE_ECDSA_WITH_RC4_128_SHA", -"TLS_ECDHE_RSA_WITH_RC4_128_SHA", "SSL_RSA_WITH_RC4_128_SHA", -"TLS_ECDH_ECDSA_WITH_RC4_128_SHA", "TLS_ECDH_RSA_WITH_RC4_128_SHA", -"TLS_ECDHE_ECDSA_WITH_3DES_EDE_CBC_SHA", -"TLS_ECDHE_RSA_WITH_3DES_EDE_CBC_SHA", "SSL_RSA_WITH_3DES_EDE_CBC_SHA", -"TLS_ECDH_ECDSA_WITH_3DES_EDE_CBC_SHA", -"TLS_ECDH_RSA_WITH_3DES_EDE_CBC_SHA", -"SSL_DHE_RSA_WITH_3DES_EDE_CBC_SHA", -"SSL_DHE_DSS_WITH_3DES_EDE_CBC_SHA", "SSL_RSA_WITH_RC4_128_MD5", -"TLS_EMPTY_RENEGOTIATION_INFO_SCSV", "TLS_RSA_WITH_NULL_SHA256", -"TLS_ECDHE_ECDSA_WITH_NULL_SHA", "TLS_ECDHE_RSA_WITH_NULL_SHA", -"SSL_RSA_WITH_NULL_SHA", "TLS_ECDH_ECDSA_WITH_NULL_SHA", -"TLS_ECDH_RSA_WITH_NULL_SHA", "SSL_RSA_WITH_NULL_MD5", -"SSL_RSA_WITH_DES_CBC_SHA", "SSL_DHE_RSA_WITH_DES_CBC_SHA", -"SSL_DHE_DSS_WITH_DES_CBC_SHA", "TLS_KRB5_WITH_RC4_128_SHA", -"TLS_KRB5_WITH_RC4_128_MD5", "TLS_KRB5_WITH_3DES_EDE_CBC_SHA", -"TLS_KRB5_WITH_3DES_EDE_CBC_MD5", "TLS_KRB5_WITH_DES_CBC_SHA", -
(nutch) branch master updated: NUTCH-3031 ProtocolFactory host mapper to support domains
This is an automated email from the ASF dual-hosted git repository. markus pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git The following commit(s) were added to refs/heads/master by this push: new c390dfc8b NUTCH-3031 ProtocolFactory host mapper to support domains c390dfc8b is described below commit c390dfc8b5c15db74d61c83e79f8e17d9bdc7b3f Author: Markus Jelsma AuthorDate: Tue Mar 12 17:29:20 2024 + NUTCH-3031 ProtocolFactory host mapper to support domains --- src/java/org/apache/nutch/protocol/ProtocolFactory.java | 15 --- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/src/java/org/apache/nutch/protocol/ProtocolFactory.java b/src/java/org/apache/nutch/protocol/ProtocolFactory.java index a545a4cd0..dc274b7e1 100644 --- a/src/java/org/apache/nutch/protocol/ProtocolFactory.java +++ b/src/java/org/apache/nutch/protocol/ProtocolFactory.java @@ -29,6 +29,7 @@ import org.apache.nutch.plugin.ExtensionPoint; import org.apache.nutch.plugin.PluginRepository; import org.apache.nutch.plugin.PluginRuntimeException; import org.apache.nutch.util.ObjectCache; +import org.apache.nutch.util.URLUtil; import org.apache.commons.lang.StringUtils; @@ -130,8 +131,16 @@ public class ProtocolFactory { // First attempt to resolve a protocol implementation by hostname String host = url.getHost(); + String domain = URLUtil.getDomainName(url).toLowerCase().trim(); + String hostOrDomain = null; + Extension extension = null; if (hostProtocolMapping.containsKey(host)) { -Extension extension = getExtensionById(hostProtocolMapping.get(host)); +hostOrDomain = host; + } else if (hostProtocolMapping.containsKey(domain)) { +hostOrDomain = domain; + } + if (hostOrDomain != null) { +extension = getExtensionById(hostProtocolMapping.get(hostOrDomain)); if (extension != null) { protocol = getProtocolInstanceByExtension(extension); } @@ -141,7 +150,7 @@ public class ProtocolFactory { if (protocol == null) { // Protocol listed in default map? 
if (defaultProtocolImplMapping.containsKey(url.getProtocol())) { - Extension extension = getExtensionById(defaultProtocolImplMapping.get(url.getProtocol())); + extension = getExtensionById(defaultProtocolImplMapping.get(url.getProtocol())); if (extension != null) { protocol = getProtocolInstanceByExtension(extension); } @@ -150,7 +159,7 @@ public class ProtocolFactory { // Still couldn't find a protocol? Attempt by protocol if (protocol == null) { -Extension extension = findExtension(url.getProtocol(), "protocolName"); +extension = findExtension(url.getProtocol(), "protocolName"); if (extension != null) { protocol = getProtocolInstanceByExtension(extension); }
(nutch) branch master updated: NUTCH-3027 Trivial resource leak patch in DomainSuffixes.java
This is an automated email from the ASF dual-hosted git repository. markus pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git The following commit(s) were added to refs/heads/master by this push: new d95e1a79d NUTCH-3027 Trivial resource leak patch in DomainSuffixes.java new 6b0455454 Merge branch 'master' of https://gitbox.apache.org/repos/asf/nutch d95e1a79d is described below commit d95e1a79d665dfd10ae88e9985f3d85e398a751e Author: Markus Jelsma AuthorDate: Fri Jan 19 12:53:40 2024 +0100 NUTCH-3027 Trivial resource leak patch in DomainSuffixes.java --- src/java/org/apache/nutch/util/domain/DomainSuffixes.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/java/org/apache/nutch/util/domain/DomainSuffixes.java b/src/java/org/apache/nutch/util/domain/DomainSuffixes.java index ae0d31b52..455f36712 100644 --- a/src/java/org/apache/nutch/util/domain/DomainSuffixes.java +++ b/src/java/org/apache/nutch/util/domain/DomainSuffixes.java @@ -41,9 +41,9 @@ public class DomainSuffixes { /** private ctor */ private DomainSuffixes() { String file = "domain-suffixes.xml"; -InputStream input = this.getClass().getClassLoader() -.getResourceAsStream(file); -try { + +try (InputStream input = this.getClass().getClassLoader() +.getResourceAsStream(file)) { new DomainSuffixesReader().read(this, input); } catch (Exception ex) { LOG.warn(StringUtils.stringifyException(ex));
[nutch] branch master updated: NUTCH-2924 Generate maxCount expr evaluated only once
This is an automated email from the ASF dual-hosted git repository. markus pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git The following commit(s) were added to refs/heads/master by this push: new 7d3900450 NUTCH-2924 Generate maxCount expr evaluated only once 7d3900450 is described below commit 7d390045049036541d2fd94302ab97c8cb3e3cb1 Author: Markus Jelsma AuthorDate: Mon Dec 12 16:13:40 2022 +0100 NUTCH-2924 Generate maxCount expr evaluated only once --- src/java/org/apache/nutch/crawl/Generator.java | 103 +++-- 1 file changed, 44 insertions(+), 59 deletions(-) diff --git a/src/java/org/apache/nutch/crawl/Generator.java b/src/java/org/apache/nutch/crawl/Generator.java index 0fce6b3b0..8a2f87ba4 100644 --- a/src/java/org/apache/nutch/crawl/Generator.java +++ b/src/java/org/apache/nutch/crawl/Generator.java @@ -311,27 +311,30 @@ public class Generator extends NutchTool implements Tool { private SequenceFile.Reader[] hostdbReaders = null; private JexlScript maxCountExpr = null; private JexlScript fetchDelayExpr = null; - -public void open() { - if (conf.get(GENERATOR_HOSTDB) != null) { -try { - Path path = new Path(conf.get(GENERATOR_HOSTDB), "current"); - hostdbReaders = SegmentReaderUtil.getReaders(path, conf); -} catch (IOException e) { - LOG.error("Error reading HostDB because {}", e.getMessage()); -} +private Map hostDatumCache = new HashMap<>(); + +public void readHostDb() throws IOException { + if (conf.get(GENERATOR_HOSTDB) == null) { +return; } -} - -public void close() { - if (hostdbReaders != null) { -try { - for (int i = 0; i < hostdbReaders.length; i++) { -hostdbReaders[i].close(); + + Path path = new Path(conf.get(GENERATOR_HOSTDB), "current"); + hostdbReaders = SegmentReaderUtil.getReaders(path, conf); + + try { +Text key = new Text(); +HostDatum value = new HostDatum(); +for (int i = 0; i < hostdbReaders.length; i++) { + while (hostdbReaders[i].next(key, value)) { +hostDatumCache.put(key.toString(), 
(HostDatum)value.clone()); } -} catch (IOException e) { - LOG.error("Error closing HostDB because {}", e.getMessage()); } + } catch (Exception e) { +throw new IOException(e); + } + + for (int i = 0; i < hostdbReaders.length; i++) { +hostdbReaders[i].close(); } } @@ -402,6 +405,8 @@ public class Generator extends NutchTool implements Tool { fetchDelayExpr = JexlUtil .parseExpression(conf.get(GENERATOR_FETCH_DELAY_EXPR, null)); } + + readHostDb(); } @Override @@ -414,7 +419,7 @@ public class Generator extends NutchTool implements Tool { public void reduce(FloatWritable key, Iterable values, Context context) throws IOException, InterruptedException { - String hostname = null; + String currentHostname = null; HostDatum host = null; LongWritable variableFetchDelayWritable = null; // in millis Text variableFetchDelayKey = new Text("_variableFetchDelay_"); @@ -425,33 +430,31 @@ public class Generator extends NutchTool implements Tool { String urlString = url.toString(); URL u = null; -// Do this only once per queue -if (host == null) { - try { -hostname = URLUtil.getHost(urlString); -host = getHostDatum(hostname); - } catch (Exception e) { - } +String hostname = URLUtil.getHost(urlString); +if (!hostname.equals(currentHostname)) { + currentHostname = hostname; + host = hostDatumCache.get(hostname); // Got it? 
- if (host == null) { -// Didn't work, prevent future lookups -host = new HostDatum(); - } else { + if (host != null) { if (maxCountExpr != null) { - long variableMaxCount = Math - .round((double) maxCountExpr.execute(createContext(host))); - LOG.info("Generator: variable maxCount: {} for {}", - variableMaxCount, hostname); - maxCount = (int) variableMaxCount; + try { +long variableMaxCount = Math.round((double)maxCountExpr.execute(createContext(host))); +LOG.debug("Generator: variable maxCount: {} for {}", variableMaxCount, hostname); +maxCount = (int)variableMaxCount; + } catch (Exception e) { +LOG.error("Unable to execute variable maxCount expression because: " + e.getMessage(), e); + } } if (fetchDelay
[nutch] branch master updated: NUTCH-2977
This is an automated email from the ASF dual-hosted git repository. markus pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git The following commit(s) were added to refs/heads/master by this push: new d806aa450 NUTCH-2977 d806aa450 is described below commit d806aa4507c59dcd680eac6f116df1eab22d996a Author: Markus Jelsma AuthorDate: Wed Dec 7 18:08:53 2022 +0100 NUTCH-2977 --- build.xml | 4 1 file changed, 4 insertions(+) diff --git a/build.xml b/build.xml index d7377ab25..004a12191 100644 --- a/build.xml +++ b/build.xml @@ -86,6 +86,10 @@ + + + +
[nutch] branch master updated: NUTCH-2794 Add additional ciphers to HTTP base's default cipher suite
This is an automated email from the ASF dual-hosted git repository. markus pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git The following commit(s) were added to refs/heads/master by this push: new 1c2e411 NUTCH-2794 Add additional ciphers to HTTP base's default cipher suite 1c2e411 is described below commit 1c2e4110ca4f4d739c6f9cde42d7a54ab52fa860 Author: Markus Jelsma AuthorDate: Wed Jun 17 13:21:24 2020 +0200 NUTCH-2794 Add additional ciphers to HTTP base's default cipher suite --- .../src/java/org/apache/nutch/protocol/http/api/HttpBase.java | 8 1 file changed, 8 insertions(+) diff --git a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java index d7e330e..30e2432 100644 --- a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java +++ b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java @@ -311,6 +311,14 @@ public abstract class HttpBase implements Protocol { String[] protocols = conf.getStrings("http.tls.supported.protocols", "TLSv1.2", "TLSv1.1", "TLSv1", "SSLv3"); String[] ciphers = conf.getStrings("http.tls.supported.cipher.suites", +"ECDHE-ECDSA-AES128-GCM-SHA256", +"ECDHE-RSA-AES128-GCM-SHA256", +"ECDHE-ECDSA-AES256-GCM-SHA384", +"ECDHE-RSA-AES256-GCM-SHA384", +"ECDHE-ECDSA-CHACHA20-POLY1305", +"ECDHE-RSA-CHACHA20-POLY1305", +"DHE-RSA-AES128-GCM-SHA256", +"DHE-RSA-AES256-GCM-SHA384", "TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA384", "TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA384", "TLS_RSA_WITH_AES_256_CBC_SHA256",
[nutch] branch master updated: NUTCH-2612 Support for sitemap processing by hostname
This is an automated email from the ASF dual-hosted git repository. markus pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git The following commit(s) were added to refs/heads/master by this push: new 9dbb4be NUTCH-2612 Support for sitemap processing by hostname new 87b08fc Merge branch 'master' of https://gitbox.apache.org/repos/asf/nutch 9dbb4be is described below commit 9dbb4be71b248f61437375b21fc29934e03190db Author: Markus Jelsma AuthorDate: Mon Sep 9 15:00:30 2019 +0200 NUTCH-2612 Support for sitemap processing by hostname --- .../org/apache/nutch/util/SitemapProcessor.java| 98 +- 1 file changed, 58 insertions(+), 40 deletions(-) diff --git a/src/java/org/apache/nutch/util/SitemapProcessor.java b/src/java/org/apache/nutch/util/SitemapProcessor.java index cbfbe0c..18e3871 100644 --- a/src/java/org/apache/nutch/util/SitemapProcessor.java +++ b/src/java/org/apache/nutch/util/SitemapProcessor.java @@ -132,46 +132,27 @@ public class SitemapProcessor extends Configured implements Tool { context.write(key, (CrawlDatum) value); } else if (value instanceof HostDatum) { - // For entry from hostdb, get sitemap url(s) from robots.txt, fetch the sitemap, - // extract urls and emit those - - // try different combinations of schemes one by one till we get rejection in all cases - String host = key.toString(); - if((url = filterNormalize("http://; + host + "/")) == null && - (url = filterNormalize("https://; + host + "/")) == null && - (url = filterNormalize("ftp://; + host + "/")) == null && - (url = filterNormalize("file:/" + host + "/")) == null) { -context.getCounter("Sitemap", "filtered_records").increment(1); -return; - } - // We may wish to use the robots.txt content as the third parameter for .getRobotRules - BaseRobotRules rules = protocolFactory.getProtocol(url).getRobotRules(new Text(url), datum, null); - List sitemaps = rules.getSitemaps(); - - if (tryDefaultSitemapXml && sitemaps.size() == 0) { -sitemaps.add(url + 
"sitemap.xml"); - } - for (String sitemap : sitemaps) { -context.getCounter("Sitemap", "sitemaps_from_hostdb").increment(1); -sitemap = filterNormalize(sitemap); -if (sitemap == null) { - context.getCounter("Sitemap", "filtered_sitemaps_from_hostdb") - .increment(1); -} else { - generateSitemapUrlDatum(protocolFactory.getProtocol(sitemap), - sitemap, context); -} - } + generateSitemapsFromHostname(key.toString(), context); } else if (value instanceof Text) { - // For entry from sitemap urls file, fetch the sitemap, extract urls and emit those - if((url = filterNormalize(key.toString())) == null) { -context.getCounter("Sitemap", "filtered_records").increment(1); -return; - } + // Input can be sitemap URL or hostname + url = key.toString(); + if (url.startsWith("http://;) || +url.startsWith("https://;) || +url.startsWith("ftp://;) || +url.startsWith("file:/")) { +// For entry from sitemap urls file, fetch the sitemap, extract urls and emit those +if((url = filterNormalize(url)) == null) { + context.getCounter("Sitemap", "filtered_records").increment(1); + return; +} - context.getCounter("Sitemap", "sitemap_seeds").increment(1); - generateSitemapUrlDatum(protocolFactory.getProtocol(url), url, context); +context.getCounter("Sitemap", "sitemap_seeds").increment(1); +generateSitemapUrlDatum(protocolFactory.getProtocol(url), url, context); + } else { +LOG.info("generateSitemapsFromHostname: " + key.toString()); +generateSitemapsFromHostname(key.toString(), context); + } } } catch (Exception e) { LOG.warn("Exception for record {} : {}", key.toString(), StringUtils.stringifyException(e)); @@ -191,6 +172,43 @@ public class SitemapProcessor extends Configured implements Tool { } return url; } + +private void generateSitemapsFromHostname(String host, Context context) { + try { +// For entry from hostdb, get sitemap url(s) from robots.txt, fetch the sitemap, +// extract urls and emit those + +// try diffe
[nutch] branch master updated: NUTCH-2725 Plugin lib-http to support per-host configurable cookies
This is an automated email from the ASF dual-hosted git repository. markus pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git The following commit(s) were added to refs/heads/master by this push: new 54f73bf NUTCH-2725 Plugin lib-http to support per-host configurable cookies 54f73bf is described below commit 54f73bf78ded8b66ba262270d069232417bbe391 Author: Markus Jelsma AuthorDate: Mon Jul 29 12:44:49 2019 +0200 NUTCH-2725 Plugin lib-http to support per-host configurable cookies --- conf/cookies.txt | 3 ++ conf/nutch-default.xml | 8 .../apache/nutch/protocol/http/api/HttpBase.java | 56 ++ .../apache/nutch/protocol/http/HttpResponse.java | 23 ++--- .../nutch/protocol/httpclient/HttpResponse.java| 17 +-- .../nutch/protocol/okhttp/OkHttpResponse.java | 19 ++-- 6 files changed, 111 insertions(+), 15 deletions(-) diff --git a/conf/cookies.txt b/conf/cookies.txt new file mode 100644 index 000..f75f220 --- /dev/null +++ b/conf/cookies.txt @@ -0,0 +1,3 @@ +# Optional per-host configurable cookies. Format: +# +# \t diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml index a9ce899..e88991c 100644 --- a/conf/nutch-default.xml +++ b/conf/nutch-default.xml @@ -190,6 +190,14 @@ + http.agent.host.cookie.file + cookies.txt + +File containing per-host configured cookies. 
+ + + + http.agent.host Name or IP address of the host on which the Nutch crawler diff --git a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java index bcc2e29..4b91f9c 100644 --- a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java +++ b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java @@ -28,6 +28,7 @@ import java.util.Arrays; import java.util.HashMap; import java.util.HashSet; import java.util.List; +import java.util.Map; import java.util.Set; import java.util.concurrent.ThreadLocalRandom; @@ -45,6 +46,7 @@ import org.apache.nutch.protocol.ProtocolStatus; import org.apache.nutch.util.GZIPUtils; import org.apache.nutch.util.MimeUtil; import org.apache.nutch.util.DeflateUtils; +import org.apache.nutch.util.URLUtil; import org.apache.hadoop.util.StringUtils; import org.apache.hadoop.conf.Configuration; @@ -66,6 +68,9 @@ public abstract class HttpBase implements Protocol { private HttpRobotRulesParser robots = null; private ArrayList userAgentNames = null; + + /** Mapping hostnames to cookies */ + private Map hostCookies = null; /** The proxy hostname. 
*/ protected String proxyHost = null; @@ -257,6 +262,42 @@ public abstract class HttpBase implements Protocol { .warn("Falling back to fixed user agent set via property http.agent.name"); } } + +// If cookies are enabled, try to load a per-host cookie file +if (enableCookieHeader) { + String cookieFile = conf.get("http.agent.host.cookie.file", "cookies.txt"); + BufferedReader br = null; + try { +Reader reader = conf.getConfResourceAsReader(cookieFile); +br = new BufferedReader(reader); +hostCookies = new HashMap(); +String word = ""; +while ((word = br.readLine()) != null) { + if (!word.trim().isEmpty()) { +if (word.indexOf("#") == -1) { // skip comment + String[] parts = word.split("\t"); + if (parts.length == 2) { +hostCookies.put(parts[0], parts[1]); + } else { +LOG.warn("Unable to parse cookie file correctly at: " + word); + } +} + } +} + } catch (Exception e) { +logger.warn("Failed to read http.agent.host.cookie.file {}: {}", cookieFile, +StringUtils.stringifyException(e)); +hostCookies = null; + } finally { +if (br != null) { + try { +br.close(); + } catch (IOException e) { +// ignore + } +} + } +} String[] protocols = conf.getStrings("http.tls.supported.protocols", "TLSv1.2", "TLSv1.1", "TLSv1", "SSLv3"); @@ -479,6 +520,21 @@ public abstract class HttpBase implements Protocol { } return userAgent; } + + /** + * If per-host cookies are configured, this method will look it up + * for the given url. + * + * @param url the url to look-up a cookie for + * @return the cookie or null + */ + public String getCookie(URL url) { +if (hostCookies != null) { + return hostCookies.get(url.getHost()); +} + +return null; + } /** * Value of "Accept-Language&
[nutch] branch master updated: NUTCH-2724 Metadata indexer not to emit empty values
This is an automated email from the ASF dual-hosted git repository. markus pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git The following commit(s) were added to refs/heads/master by this push: new a67c9be NUTCH-2724 Metadata indexer not to emit empty values a67c9be is described below commit a67c9bee94049d37dad9278cdf8dd9131735da43 Author: Markus Jelsma AuthorDate: Mon Jul 15 12:25:42 2019 +0200 NUTCH-2724 Metadata indexer not to emit empty values --- .../java/org/apache/nutch/indexer/metadata/MetadataIndexer.java| 7 +-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java b/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java index 3927bd3..be56377 100644 --- a/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java +++ b/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java @@ -93,12 +93,15 @@ public class MetadataIndexer implements IndexingFilter { protected void add(NutchDocument doc, String key, String value) { if (separator == null || value.indexOf(separator) == -1 || !mvFields.contains(key)) { - doc.add(key, value); + value = value.trim(); + if (!value.isEmpty()) { +doc.add(key, value); + } } else { String[] parts = value.split(separator); for (String part : parts) { part = part.trim(); -if (part.length() != 0) { +if (!part.isEmpty()) { doc.add(key, part); } }
[nutch] branch master updated: NUTCH-2723 Indexer Solr not to decode URLs before deletion
This is an automated email from the ASF dual-hosted git repository. markus pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git The following commit(s) were added to refs/heads/master by this push: new 5150c44 NUTCH-2723 Indexer Solr not to decode URLs before deletion new 9692464 Merge branch 'master' of https://gitbox.apache.org/repos/asf/nutch 5150c44 is described below commit 5150c442a78d15c042ee6fb12e6dbea8ec5341e6 Author: Markus Jelsma AuthorDate: Fri Jul 12 12:09:34 2019 +0200 NUTCH-2723 Indexer Solr not to decode URLs before deletion --- .../org/apache/nutch/indexwriter/solr/SolrIndexWriter.java | 12 1 file changed, 12 deletions(-) diff --git a/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrIndexWriter.java b/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrIndexWriter.java index 475d313..cc2e8d7 100644 --- a/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrIndexWriter.java +++ b/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrIndexWriter.java @@ -18,8 +18,6 @@ package org.apache.nutch.indexwriter.solr; import java.lang.invoke.MethodHandles; import java.io.IOException; -import java.io.UnsupportedEncodingException; -import java.net.URLDecoder; import java.time.format.DateTimeFormatter; import java.util.AbstractMap; import java.util.ArrayList; @@ -153,16 +151,6 @@ public class SolrIndexWriter implements IndexWriter { } public void delete(String key) throws IOException { -try { - key = URLDecoder.decode(key, "UTF8"); -} catch (UnsupportedEncodingException e) { - LOG.error("Error decoding: " + key); - throw new IOException("UnsupportedEncodingException for " + key); -} catch (IllegalArgumentException e) { - LOG.warn("Could not decode: " + key - + ", it probably wasn't encoded in the first place.."); -} - // escape solr hash separator key = key.replaceAll("!", "\\!");
[nutch] branch master updated: NUTCH-2703 parse-tika: Boilerpipe should not run for non-(X)HTML pages
This is an automated email from the ASF dual-hosted git repository. markus pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git The following commit(s) were added to refs/heads/master by this push: new 7e6eabb NUTCH-2703 parse-tika: Boilerpipe should not run for non-(X)HTML pages 7e6eabb is described below commit 7e6eabbc2b0a0b5ee91148a9effc6447af5057ba Author: Markus Jelsma AuthorDate: Thu Apr 11 12:32:22 2019 +0200 NUTCH-2703 parse-tika: Boilerpipe should not run for non-(X)HTML pages --- conf/nutch-default.xml | 9 + .../org/apache/nutch/parse/tika/TikaParser.java| 22 ++ 2 files changed, 23 insertions(+), 8 deletions(-) diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml index a4b202f..951494e 100644 --- a/conf/nutch-default.xml +++ b/conf/nutch-default.xml @@ -1600,6 +1600,15 @@ CAUTION: Set the parser.timeout to -1 or a bigger value than 30, when using this + + tika.extractor.boilerpipe.mime.types + text/html,application/xhtml+xml + +Comma-separated list of MIME types accepted for Boilerpipe extraction, +documents of other MIME types are not passed to the Boilerpipe extractor. 
+ + + diff --git a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java index 7440333..40aa265 100644 --- a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java +++ b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java @@ -21,8 +21,11 @@ import java.io.ByteArrayInputStream; import java.net.MalformedURLException; import java.net.URL; import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashSet; import java.util.List; import java.util.Map; +import java.util.Set; import org.apache.commons.lang.StringUtils; import org.apache.hadoop.conf.Configuration; @@ -73,6 +76,7 @@ public class TikaParser implements org.apache.nutch.parse.Parser { private boolean upperCaseElementNames = true; private String boilerpipeExtractorName; private boolean useBoilerpipe; + private Set boilerpipeMimeTypes; public ParseResult getParse(Content content) { HTMLDocumentImpl doc = new HTMLDocumentImpl(); @@ -114,7 +118,7 @@ public class TikaParser implements org.apache.nutch.parse.Parser { ContentHandler domHandler; // Check whether to use Tika's BoilerplateContentHandler -if (useBoilerpipe) { +if (useBoilerpipe && boilerpipeMimeTypes.contains(mimeType)) { BoilerpipeContentHandler bpHandler = new BoilerpipeContentHandler( (ContentHandler) new DOMBuilder(doc, root), BoilerpipeExtractorRepository.getExtractor(boilerpipeExtractorName)); @@ -291,16 +295,18 @@ public class TikaParser implements org.apache.nutch.parse.Parser { } } -htmlParseFilters = new HtmlParseFilters(getConf()); +htmlParseFilters = new HtmlParseFilters(conf); utils = new DOMContentUtils(conf); -cachingPolicy = getConf().get("parser.caching.forbidden.policy", +cachingPolicy = conf.get("parser.caching.forbidden.policy", Nutch.CACHING_FORBIDDEN_CONTENT); -upperCaseElementNames = getConf().getBoolean("tika.uppercase.element.names", +upperCaseElementNames = 
conf.getBoolean("tika.uppercase.element.names", true); -useBoilerpipe = getConf().get("tika.extractor", "none") -.equals("boilerpipe"); -boilerpipeExtractorName = getConf() -.get("tika.extractor.boilerpipe.algorithm", "ArticleExtractor"); +useBoilerpipe = conf.get("tika.extractor", "none").equals("boilerpipe"); +boilerpipeExtractorName = conf.get("tika.extractor.boilerpipe.algorithm", +"ArticleExtractor"); +boilerpipeMimeTypes = new HashSet<>(Arrays +.asList(conf.getTrimmedStrings("tika.extractor.boilerpipe.mime.types", +"text/html", "application/xhtml+xml"))); } public Configuration getConf() {
[nutch] branch master updated: NUTCH-2692 Removing previously accidentally added file
This is an automated email from the ASF dual-hosted git repository. markus pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git The following commit(s) were added to refs/heads/master by this push: new f7fdca3 NUTCH-2692 Removing previously accidentally added file f7fdca3 is described below commit f7fdca37fe15f95955ec9082943a9392a578b728 Author: Markus Jelsma AuthorDate: Fri Feb 22 17:07:29 2019 +0100 NUTCH-2692 Removing previously accidentally added file --- conf/host-protocol-mapping.txt | 11 --- 1 file changed, 11 deletions(-) diff --git a/conf/host-protocol-mapping.txt b/conf/host-protocol-mapping.txt deleted file mode 100644 index d0a1b70..000 --- a/conf/host-protocol-mapping.txt +++ /dev/null @@ -1,11 +0,0 @@ -# This file defines a hostname to protocol plugin mapping. Each line takes a -# host name followed by a tab, followed by the ID of the protocol plugin. You -# can find the ID in the protocol plugin's plugin.xml file. -# -# \t\n -# nutch.apache.org org.apache.nutch.protocol.httpclient.Http -# tika.apache.org org.apache.nutch.protocol.http.Http -# -nutch.apache.org org.apache.nutch.protocol.httpclient.Http -tika.apache.org org.apache.nutch.protocol.http.Http -
[nutch] 02/03: NUTCH-2692 Subcollection to support case-insensitive white and black lists
This is an automated email from the ASF dual-hosted git repository. markus pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git commit 3fa2f4a7efac598258eb01a4387b5fde43c1a813 Author: Markus Jelsma AuthorDate: Fri Feb 22 16:46:42 2019 +0100 NUTCH-2692 Subcollection to support case-insensitive white and black lists --- conf/host-protocol-mapping.txt | 11 +++ 1 file changed, 11 insertions(+) diff --git a/conf/host-protocol-mapping.txt b/conf/host-protocol-mapping.txt new file mode 100644 index 000..d0a1b70 --- /dev/null +++ b/conf/host-protocol-mapping.txt @@ -0,0 +1,11 @@ +# This file defines a hostname to protocol plugin mapping. Each line takes a +# host name followed by a tab, followed by the ID of the protocol plugin. You +# can find the ID in the protocol plugin's plugin.xml file. +# +# \t\n +# nutch.apache.org org.apache.nutch.protocol.httpclient.Http +# tika.apache.org org.apache.nutch.protocol.http.Http +# +nutch.apache.org org.apache.nutch.protocol.httpclient.Http +tika.apache.org org.apache.nutch.protocol.http.Http +
[nutch] 01/03: NUTCH-2692 Subcollection to support case-insensitive white and black lists
This is an automated email from the ASF dual-hosted git repository. markus pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git commit 89c41e1b5a245322b27e8dd0728b543faa171e9d Author: Markus Jelsma AuthorDate: Fri Feb 22 16:44:25 2019 +0100 NUTCH-2692 Subcollection to support case-insensitive white and black lists --- conf/nutch-default.xml | 8 .../src/java/org/apache/nutch/collection/Subcollection.java | 13 - .../indexer/subcollection/SubcollectionIndexingFilter.java | 6 ++ 3 files changed, 26 insertions(+), 1 deletion(-) diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml index a42e6a9..69fbb7d 100644 --- a/conf/nutch-default.xml +++ b/conf/nutch-default.xml @@ -2407,6 +2407,14 @@ visit https://wiki.apache.org/nutch/SimilarityScoringFilter--> + + subcollection.case.insensitive + false + + Whether the URL prefixes are to be treated case insensitive. + + + diff --git a/src/plugin/subcollection/src/java/org/apache/nutch/collection/Subcollection.java b/src/plugin/subcollection/src/java/org/apache/nutch/collection/Subcollection.java index 13064eb..8478390 100644 --- a/src/plugin/subcollection/src/java/org/apache/nutch/collection/Subcollection.java +++ b/src/plugin/subcollection/src/java/org/apache/nutch/collection/Subcollection.java @@ -69,6 +69,11 @@ public class Subcollection extends Configured implements URLFilter { * SubCollection blacklist as String */ String blString; + + /** + * Whether the white and black lists are case sensitive + */ + boolean caseInsensitive = false; /** * public Constructor @@ -95,10 +100,12 @@ public class Subcollection extends Configured implements URLFilter { this.id = id; this.key = key; this.name = name; +caseInsensitive = conf.getBoolean("subcollection.case.insensitive", false); } public Subcollection(Configuration conf) { super(conf); +caseInsensitive = conf.getBoolean("subcollection.case.insensitive", false); } /** @@ -231,7 +238,11 @@ public class Subcollection extends Configured 
implements URLFilter { while (st.hasMoreElements()) { String line = (String) st.nextElement(); - list.add(line.trim()); + line = line.trim(); + if (caseInsensitive) { +line = line.toLowerCase(); + } + list.add(line); } } diff --git a/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java b/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java index 898d314..767d54d 100644 --- a/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java +++ b/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java @@ -36,6 +36,7 @@ public class SubcollectionIndexingFilter extends Configured implements IndexingFilter { private Configuration conf; + private boolean caseInsensitive = false; public SubcollectionIndexingFilter() { super(NutchConfiguration.create()); @@ -52,7 +53,9 @@ public class SubcollectionIndexingFilter extends Configured implements this.conf = conf; fieldName = conf.get("subcollection.default.fieldname", "subcollection"); metadataSource = conf.get("subcollection.metadata.source", "subcollection"); +caseInsensitive = conf.getBoolean("subcollection.case.insensitive", false); } + /** * @return Configuration @@ -102,6 +105,9 @@ public class SubcollectionIndexingFilter extends Configured implements } String sUrl = url.toString(); +if (caseInsensitive) { + sUrl = sUrl.toLowerCase(); +} addSubCollectionField(doc, sUrl); return doc; }
[nutch] branch master updated (78af89f -> 0085ee7)
This is an automated email from the ASF dual-hosted git repository. markus pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git. from 78af89f Merge pull request #436 from r0ann3l/NUTCH-2684 new 89c41e1 NUTCH-2692 Subcollection to support case-insensitive white and black lists new 3fa2f4a NUTCH-2692 Subcollection to support case-insensitive white and black lists new 0085ee7 Merge branch 'master' of https://gitbox.apache.org/repos/asf/nutch The 3 revisions listed above as "new" are entirely new to this repository and will be described in separate emails. The revisions listed as "add" were already present in the repository and have only been added to this reference. Summary of changes: ...tocol-mapping.txt.template => host-protocol-mapping.txt} | 13 - conf/nutch-default.xml | 8 .../src/java/org/apache/nutch/collection/Subcollection.java | 13 - .../indexer/subcollection/SubcollectionIndexingFilter.java | 6 ++ 4 files changed, 30 insertions(+), 10 deletions(-) copy conf/{host-protocol-mapping.txt.template => host-protocol-mapping.txt} (50%)
[nutch] 03/03: Merge branch 'master' of https://gitbox.apache.org/repos/asf/nutch
This is an automated email from the ASF dual-hosted git repository. markus pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git commit 0085ee740e78b58091d1aa39614277f1a612810c Merge: 3fa2f4a 78af89f Author: Markus Jelsma AuthorDate: Fri Feb 22 16:48:45 2019 +0100 Merge branch 'master' of https://gitbox.apache.org/repos/asf/nutch conf/nutch-default.xml | 18 - src/java/org/apache/nutch/crawl/CrawlDbReader.java | 2 +- .../org/apache/nutch/crawl/CrawlDbReducer.java | 4 +- src/java/org/apache/nutch/crawl/Generator.java | 8 +- src/java/org/apache/nutch/fetcher/QueueFeeder.java | 91 -- .../apache/nutch/hostdb/UpdateHostDbMapper.java| 3 - .../apache/nutch/hostdb/UpdateHostDbReducer.java | 2 - .../nutch/indexer/IndexingFiltersChecker.java | 2 +- .../org/apache/nutch/net/protocols/Response.java | 2 +- .../org/apache/nutch/parse/OutlinkExtractor.java | 2 +- src/java/org/apache/nutch/parse/ParseData.java | 18 + .../org/apache/nutch/parse/ParsePluginsReader.java | 2 +- .../org/apache/nutch/segment/SegmentMerger.java| 4 +- .../org/apache/nutch/service/impl/LinkReader.java | 8 +- .../org/apache/nutch/service/impl/NodeReader.java | 8 +- .../service/impl/NutchServerPoolExecutor.java | 2 +- .../apache/nutch/service/impl/SequenceReader.java | 8 +- .../org/apache/nutch/tools/arc/ArcInputFormat.java | 4 +- .../apache/nutch/tools/arc/ArcRecordReader.java| 2 +- .../apache/nutch/tools/arc/ArcSegmentCreator.java | 4 +- .../org/apache/nutch/util/EncodingDetector.java| 6 +- src/java/org/apache/nutch/util/MimeUtil.java | 3 +- src/plugin/indexer-cloudsearch/README.md | 54 ++--- src/plugin/indexer-csv/README.md | 42 ++ .../nutch/indexwriter/csv/CSVIndexWriter.java | 4 +- src/plugin/indexer-dummy/README.md | 34 src/plugin/indexer-elastic-rest/README.md | 45 +++ src/plugin/indexer-elastic/README.md | 41 ++ src/plugin/indexer-rabbit/README.md| 44 +++ src/plugin/indexer-solr/README.md | 40 ++ .../apache/nutch/parse/html/HTMLMetaProcessor.java | 45 +-- 
.../apache/nutch/parse/tika/HTMLMetaProcessor.java | 45 +-- .../apache/nutch/protocol/http/HttpResponse.java | 4 +- .../org/apache/nutch/protocol/okhttp/OkHttp.java | 22 +++--- .../nutch/scoring/orphan/OrphanScoringFilter.java | 12 ++- .../scoring/orphan/TestOrphanScoringFilter.java| 4 +- src/test/org/apache/nutch/crawl/TestGenerator.java | 6 +- 37 files changed, 463 insertions(+), 182 deletions(-)
[nutch] branch master updated: NUTCH-2694 HostDB to aggregate by long instead of integer
This is an automated email from the ASF dual-hosted git repository. markus pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git The following commit(s) were added to refs/heads/master by this push: new 33922fe NUTCH-2694 HostDB to aggregate by long instead of integer 33922fe is described below commit 33922feb804d740180fb4abd833884dae6d62cc0 Author: Markus Jelsma AuthorDate: Fri Feb 22 14:08:08 2019 +0100 NUTCH-2694 HostDB to aggregate by long instead of integer --- CHANGES.txt| 9 +- src/java/org/apache/nutch/hostdb/HostDatum.java| 110 ++--- .../org/apache/nutch/hostdb/ResolverThread.java| 6 +- .../apache/nutch/hostdb/UpdateHostDbReducer.java | 34 +++ 4 files changed, 81 insertions(+), 78 deletions(-) diff --git a/CHANGES.txt b/CHANGES.txt index 96bd05a..12f5aad 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -6,9 +6,12 @@ Comments Breaking Changes -The value of crawl.gen.delay is now read in milliseconds as stated in the description -in nutch-default.xml. Previously, the value has been read in days, see NUTCH-1842 for -further information. +- The value of crawl.gen.delay is now read in milliseconds as stated in the description + in nutch-default.xml. Previously, the value has been read in days, see NUTCH-1842 for + further information. + +- HostDB entries have been moved from Integer to Long in order to accomodate very large + hosts. Remove your existing HostDB and recreate it with bin/nutch updatehostdb. 
Nutch 1.15 Release (25/07/2018) diff --git a/src/java/org/apache/nutch/hostdb/HostDatum.java b/src/java/org/apache/nutch/hostdb/HostDatum.java index fe3b73e..2bc9244 100644 --- a/src/java/org/apache/nutch/hostdb/HostDatum.java +++ b/src/java/org/apache/nutch/hostdb/HostDatum.java @@ -30,7 +30,7 @@ import org.apache.hadoop.io.Writable; /** */ public class HostDatum implements Writable, Cloneable { - protected int failures = 0; + protected long failures = 0; protected float score = 0; protected Date lastCheck = new Date(0); protected String homepageUrl = new String(); @@ -38,17 +38,17 @@ public class HostDatum implements Writable, Cloneable { protected MapWritable metaData = new MapWritable(); // Records the number of times DNS look-up failed, may indicate host no longer exists - protected int dnsFailures = 0; + protected long dnsFailures = 0; // Records the number of connection failures, may indicate our netwerk being blocked by firewall - protected int connectionFailures = 0; + protected long connectionFailures = 0; - protected int unfetched = 0; - protected int fetched = 0; - protected int notModified = 0; - protected int redirTemp = 0; - protected int redirPerm = 0; - protected int gone = 0; + protected long unfetched = 0; + protected long fetched = 0; + protected long notModified = 0; + protected long redirTemp = 0; + protected long redirPerm = 0; + protected long gone = 0; public HostDatum() { } @@ -68,15 +68,15 @@ public class HostDatum implements Writable, Cloneable { } public void resetFailures() { -setDnsFailures(0); -setConnectionFailures(0); +setDnsFailures(0l); +setConnectionFailures(0l); } - public void setDnsFailures(Integer dnsFailures) { + public void setDnsFailures(Long dnsFailures) { this.dnsFailures = dnsFailures; } - public void setConnectionFailures(Integer connectionFailures) { + public void setConnectionFailures(Long connectionFailures) { this.connectionFailures = connectionFailures; } @@ -88,15 +88,15 @@ public class HostDatum implements 
Writable, Cloneable { this.connectionFailures++; } - public Integer numFailures() { + public Long numFailures() { return getDnsFailures() + getConnectionFailures(); } - public Integer getDnsFailures() { + public Long getDnsFailures() { return dnsFailures; } - public Integer getConnectionFailures() { + public Long getConnectionFailures() { return connectionFailures; } @@ -120,7 +120,7 @@ public class HostDatum implements Writable, Cloneable { return score; } - public Integer numRecords() { + public Long numRecords() { return unfetched + fetched + gone + redirPerm + redirTemp + notModified; } @@ -140,51 +140,51 @@ public class HostDatum implements Writable, Cloneable { this.homepageUrl = homepageUrl; } - public void setUnfetched(int val) { + public void setUnfetched(long val) { unfetched = val; } - public int getUnfetched() { + public long getUnfetched() { return unfetched; } - public void setFetched(int val) { + public void setFetched(long val) { fetched = val; } - public int getFetched() { + public long getFetched() { return fetched; } - public void setNotModified(int val) { + public void setNotModified(long val) { notModified = val
[nutch] branch master updated: NUTCH-2687 Regex for reading title from Content-Disposition is wrong
This is an automated email from the ASF dual-hosted git repository. markus pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git The following commit(s) were added to refs/heads/master by this push: new 9cc076f NUTCH-2687 Regex for reading title from Content-Disposition is wrong 9cc076f is described below commit 9cc076f33746c34acfdeef8b3007bb5b0dec736d Author: Markus Jelsma AuthorDate: Fri Jan 18 11:36:49 2019 +0100 NUTCH-2687 Regex for reading title from Content-Disposition is wrong --- .../src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java b/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java index c16d233..8c4a2d6 100644 --- a/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java +++ b/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java @@ -273,7 +273,7 @@ public class MoreIndexingFilter implements IndexingFilter { static { try { // order here is important - patterns[0] = Pattern.compile("\\bfilename=['\"](.+)['\"]"); + patterns[0] = Pattern.compile("\\bfilename=['\"]([^\"]+)"); patterns[1] = Pattern.compile("\\bfilename=(\\S+)\\b"); } catch (PatternSyntaxException e) { // just ignore
[nutch] branch master updated: NUTCH-2647 Skip TLS certificate checks in protocol-http plugin
This is an automated email from the ASF dual-hosted git repository. markus pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git The following commit(s) were added to refs/heads/master by this push: new 61d7e8c NUTCH-2647 Skip TLS certificate checks in protocol-http plugin 61d7e8c is described below commit 61d7e8ce440aa544ce23e98a6fc6f811c482c5a0 Author: Markus Jelsma AuthorDate: Fri Sep 28 11:25:31 2018 +0200 NUTCH-2647 Skip TLS certificate checks in protocol-http plugin --- .../nutch/protocol/http/DummyX509TrustManager.java | 93 ++ .../apache/nutch/protocol/http/HttpResponse.java | 14 ++-- 2 files changed, 102 insertions(+), 5 deletions(-) diff --git a/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/DummyX509TrustManager.java b/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/DummyX509TrustManager.java new file mode 100644 index 000..879f703 --- /dev/null +++ b/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/DummyX509TrustManager.java @@ -0,0 +1,93 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* + * Based on EasyX509TrustManager from commons-httpclient. 
+ */ + +package org.apache.nutch.protocol.http; + +import java.lang.invoke.MethodHandles; +import java.security.KeyStore; +import java.security.KeyStoreException; +import java.security.NoSuchAlgorithmException; +import java.security.cert.CertificateException; +import java.security.cert.X509Certificate; + +import javax.net.ssl.TrustManagerFactory; +import javax.net.ssl.TrustManager; +import javax.net.ssl.X509TrustManager; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class DummyX509TrustManager implements X509TrustManager { + private X509TrustManager standardTrustManager = null; + + /** Logger object for this class. */ + private static final Logger LOG = LoggerFactory + .getLogger(MethodHandles.lookup().lookupClass()); + + /** + * Constructor for DummyX509TrustManager. + */ + public DummyX509TrustManager(KeyStore keystore) + throws NoSuchAlgorithmException, KeyStoreException { +super(); +String algo = TrustManagerFactory.getDefaultAlgorithm(); +TrustManagerFactory factory = TrustManagerFactory.getInstance(algo); +factory.init(keystore); +TrustManager[] trustmanagers = factory.getTrustManagers(); +if (trustmanagers.length == 0) { + throw new NoSuchAlgorithmException(algo + " trust manager not supported"); +} +this.standardTrustManager = (X509TrustManager) trustmanagers[0]; + } + + /** + * @see javax.net.ssl.X509TrustManager#checkClientTrusted(X509Certificate[], + * String) + */ + public boolean isClientTrusted(X509Certificate[] certificates) { +return true; + } + + /** + * @see javax.net.ssl.X509TrustManager#checkServerTrusted(X509Certificate[], + * String) + */ + public boolean isServerTrusted(X509Certificate[] certificates) { +return true; + } + + /** + * @see javax.net.ssl.X509TrustManager#getAcceptedIssuers() + */ + public X509Certificate[] getAcceptedIssuers() { +return this.standardTrustManager.getAcceptedIssuers(); + } + + public void checkClientTrusted(X509Certificate[] arg0, String arg1) + throws CertificateException { +// do nothing + 
+ } + + public void checkServerTrusted(X509Certificate[] arg0, String arg1) + throws CertificateException { +// do nothing + + } +} diff --git a/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java b/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java index 4b5544e..95ae352 100644 --- a/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java +++ b/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java @@ -30,8 +30,10 @@ import java.util.Arrays; import java.util.HashSet; import java.util.Set; +import javax.net.ssl.SSLContext; import javax.net.ssl.SSLSocket; import javax.net.ssl.SSLSocketFactory; +import javax.net.ssl.TrustManager; import org.apache.hadoop.io.Text; import org.apache.nutch.crawl.CrawlDatum; @@
[nutch] branch master updated: NUTCH-2411 Index-metadata to support indexing multiple values for a field
This is an automated email from the ASF dual-hosted git repository. markus pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git The following commit(s) were added to refs/heads/master by this push: new 9a77f43 NUTCH-2411 Index-metadata to support indexing multiple values for a field 9a77f43 is described below commit 9a77f43774b2c3cd70785895afb989e9ee2d8d5f Author: Markus Jelsma <mar...@apache.org> AuthorDate: Thu Mar 8 14:03:12 2018 +0100 NUTCH-2411 Index-metadata to support indexing multiple values for a field --- conf/nutch-default.xml | 9 ++ .../nutch/indexer/metadata/MetadataIndexer.java| 35 ++ 2 files changed, 38 insertions(+), 6 deletions(-) diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml index 87c4058..71ef51b 100644 --- a/conf/nutch-default.xml +++ b/conf/nutch-default.xml @@ -1738,6 +1738,15 @@ visit https://wiki.apache.org/nutch/SimilarityScoringFilter--> + + index.metadata.separator + + + Separator to use if you want to index multiple values for a given field. Leave empty to + treat each value as a single value. + + + index.geoip.usage diff --git a/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java b/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java index edb8b15..74d9eb1 100644 --- a/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java +++ b/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java @@ -17,9 +17,12 @@ package org.apache.nutch.indexer.metadata; +import java.util.Arrays; import java.util.HashMap; +import java.util.HashSet; import java.util.Locale; import java.util.Map; +import java.util.Set; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.io.Text; @@ -35,17 +38,21 @@ import org.apache.nutch.parse.Parse; * Indexer which can be configured to extract metadata from the crawldb, parse * metadata or content metadata. 
You can specify the properties "index.db.md", * "index.parse.md" or "index.content.md" who's values are comma-delimited - * Example : key1,key2,key3. + * key1,key2,key3. */ public class MetadataIndexer implements IndexingFilter { private Configuration conf; private String[] dbFieldnames; private Map<String, String> parseFieldnames; private String[] contentFieldnames; + private String separator; + private Set mvFields; private static final String db_CONF_PROPERTY = "index.db.md"; private static final String parse_CONF_PROPERTY = "index.parse.md"; private static final String content_CONF_PROPERTY = "index.content.md"; - + private static final String separator_CONF_PROPERTY = "index.metadata.separator"; + private static final String mvfields_CONF_PROPERTY = "index.metadata.multivalued.fields"; + public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks) throws IndexingException { @@ -58,7 +65,7 @@ public class MetadataIndexer implements IndexingFilter { for (String metatag : dbFieldnames) { Writable metadata = datum.getMetaData().get(new Text(metatag)); if (metadata != null) - doc.add(metatag, metadata.toString()); + add(doc, metatag, metadata.toString()); } } @@ -67,7 +74,7 @@ public class MetadataIndexer implements IndexingFilter { for (String metatag : parseFieldnames.keySet()) { for (String value : parse.getData().getParseMeta().getValues(metatag)) { if (value != null) -doc.add(parseFieldnames.get(metatag), value); +add(doc, parseFieldnames.get(metatag), value); } } } @@ -77,13 +84,27 @@ public class MetadataIndexer implements IndexingFilter { for (String metatag : contentFieldnames) { for (String value : parse.getData().getContentMeta().getValues(metatag)) { if (value != null) -doc.add(metatag, value); +add(doc, metatag, value); } } } return doc; } + + protected void add(NutchDocument doc, String key, String value) { +if (separator == null || value.indexOf(separator) == -1 || !mvFields.contains(key)) { + doc.add(key, 
value); +} else { + String[] parts = value.split(separator); + for (String part : parts) { +part = part.trim(); +if (part.length() != 0) { + doc.add(key, part); +} + } +} + } public void setConf(Configuration conf) { this.conf = conf; @@ -93,7 +114,9 @@ public class MetadataIndexer implements IndexingFilter { parseFieldnames.put(metatag.toLowerCase(Locale.ROOT), metatag); } contentFieldnames = conf.getStrings(content_CONF_P
[nutch] branch master updated: NUTCH-2458
This is an automated email from the ASF dual-hosted git repository. markus pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git The following commit(s) were added to refs/heads/master by this push: new c345618 NUTCH-2458 new 705686e Merge branch 'master' of https://gitbox.apache.org/repos/asf/nutch c345618 is described below commit c345618ec425f0e907a6e54565f2d0577139b45f Author: Markus Jelsma <mar...@apache.org> AuthorDate: Fri Nov 10 10:56:56 2017 +0100 NUTCH-2458 --- .../parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java index 49dc378..73cd083 100644 --- a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java +++ b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java @@ -239,7 +239,7 @@ public class TikaParser implements org.apache.nutch.parse.Parser { // see if a Tika config file can be found in the job file URL customTikaConfig = conf.getResource(customConfFile); if (customTikaConfig != null) - tikaConfig = new TikaConfig(customTikaConfig); + tikaConfig = new TikaConfig(customTikaConfig, this.getClass().getClassLoader()); } catch (Exception e1) { String message = "Problem loading custom Tika configuration from " + customConfFile; -- To stop receiving notification emails like this one, please contact ['"commits@nutch.apache.org" <commits@nutch.apache.org>'].
[nutch] branch master updated: NUTCH-2420 Bug in variable generate.max.count and fetcher.server.delay
This is an automated email from the ASF dual-hosted git repository. markus pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git The following commit(s) were added to refs/heads/master by this push: new 6199492 NUTCH-2420 Bug in variable generate.max.count and fetcher.server.delay 6199492 is described below commit 6199492f5e1e8811022257c88dbf63f1e1c739d0 Author: Markus Jelsma <mar...@apache.org> AuthorDate: Mon Nov 6 17:08:09 2017 +0100 NUTCH-2420 Bug in variable generate.max.count and fetcher.server.delay --- src/java/org/apache/nutch/crawl/Generator.java | 16 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/src/java/org/apache/nutch/crawl/Generator.java b/src/java/org/apache/nutch/crawl/Generator.java index 21607ec..e5f4831 100644 --- a/src/java/org/apache/nutch/crawl/Generator.java +++ b/src/java/org/apache/nutch/crawl/Generator.java @@ -179,11 +179,16 @@ public class Generator extends NutchTool implements Tool { segCounts = new int[maxNumSegments]; if (job.get(GENERATOR_HOSTDB) != null) { +maxCountExpr = JexlUtil.parseExpression(job.get(GENERATOR_MAX_COUNT_EXPR, null)); +fetchDelayExpr = JexlUtil.parseExpression(job.get(GENERATOR_FETCH_DELAY_EXPR, null)); + } +} + +public void open() { + if (conf.get(GENERATOR_HOSTDB) != null) { try { - Path path = new Path(job.get(GENERATOR_HOSTDB), "current"); - hostdbReaders = SequenceFileOutputFormat.getReaders(job, path); - maxCountExpr = JexlUtil.parseExpression(job.get(GENERATOR_MAX_COUNT_EXPR, null)); - fetchDelayExpr = JexlUtil.parseExpression(job.get(GENERATOR_FETCH_DELAY_EXPR, null)); + Path path = new Path(conf.get(GENERATOR_HOSTDB), "current"); + hostdbReaders = SequenceFileOutputFormat.getReaders(conf, path); } catch (IOException e) { LOG.error("Error reading HostDB because {}", e.getMessage()); } @@ -287,14 +292,17 @@ public class Generator extends NutchTool implements Tool { Text key = new Text(); HostDatum value = new HostDatum(); + open(); for (int i = 0; i < 
hostdbReaders.length; i++) { while (hostdbReaders[i].next(key, value)) { if (host.equals(key.toString())) { +close(); return value; } } } + close(); return null; } -- To stop receiving notification emails like this one, please contact ['"commits@nutch.apache.org" <commits@nutch.apache.org>'].
[nutch] branch master updated: NUTCH-2386 BasicURLNormalizer does not encode curly braces
This is an automated email from the ASF dual-hosted git repository. markus pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git The following commit(s) were added to refs/heads/master by this push: new bd8c847 NUTCH-2386 BasicURLNormalizer does not encode curly braces bd8c847 is described below commit bd8c8476b36a465159703c88b75eb08008650136 Author: Markus Jelsma <mar...@apache.org> AuthorDate: Wed Oct 25 15:00:33 2017 +0200 NUTCH-2386 BasicURLNormalizer does not encode curly braces --- .../apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java | 2 +- .../nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java | 8 +++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java b/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java index ffd22ce..b6033ae 100644 --- a/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java +++ b/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java @@ -250,7 +250,7 @@ public class BasicURLNormalizer extends Configured implements URLNormalizer { // Traverse over all bytes in this URL for (byte b: path.getBytes(utf8)) { // Is this a control character? 
- if (b < 33 || b == 91 || b == 93) { + if (b < 0x21 || b == 0x5B || b == 0x5D || b == 0x7B || b == 0x7D) { // Start escape sequence sb.append('%'); diff --git a/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java b/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java index 2625ea3..5cefbf3 100644 --- a/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java +++ b/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java @@ -171,6 +171,12 @@ public class TestBasicURLNormalizer { normalizeTest("http:;, "http:/"); normalizeTest("http:///;, "http:/"); } + + @Test + public void testCurlyBraces() throws Exception { +// check that leading and trailing spaces are removed +normalizeTest("http://foo.com/{{stuff}} ", "http://foo.com/%7B%7Bstuff%7D%7D;); + } private void normalizeTest(String weird, String normal) throws Exception { Assert.assertEquals("normalizing: " + weird, normal, @@ -181,4 +187,4 @@ public class TestBasicURLNormalizer { new TestBasicURLNormalizer().testNormalizer(); } -} \ No newline at end of file +} -- To stop receiving notification emails like this one, please contact ['"commits@nutch.apache.org" <commits@nutch.apache.org>'].
[nutch] branch master updated: NUTCH-2445 Fetcher following outlinks to keep track of already fetched items
This is an automated email from the ASF dual-hosted git repository. markus pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git The following commit(s) were added to refs/heads/master by this push: new 0cdd095 NUTCH-2445 Fetcher following outlinks to keep track of already fetched items 0cdd095 is described below commit 0cdd095c881eed52dc461e559ce6ae278e99157f Author: Markus Jelsma <mar...@apache.org> AuthorDate: Mon Oct 23 15:59:13 2017 +0200 NUTCH-2445 Fetcher following outlinks to keep track of already fetched items --- .../org/apache/nutch/fetcher/FetchItemQueue.java | 6 .../org/apache/nutch/fetcher/FetcherThread.java| 41 ++ 2 files changed, 32 insertions(+), 15 deletions(-) diff --git a/src/java/org/apache/nutch/fetcher/FetchItemQueue.java b/src/java/org/apache/nutch/fetcher/FetchItemQueue.java index b67be74..5096b37 100644 --- a/src/java/org/apache/nutch/fetcher/FetchItemQueue.java +++ b/src/java/org/apache/nutch/fetcher/FetchItemQueue.java @@ -22,6 +22,8 @@ import java.util.LinkedList; import java.util.List; import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicLong; +import java.util.HashSet; +import java.util.Set; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.io.LongWritable; @@ -51,6 +53,10 @@ public class FetchItemQueue { Text cookie; Text variableFetchDelayKey = new Text("_variableFetchDelay_"); boolean variableFetchDelaySet = false; + // keep track of duplicates if fetcher.follow.outlinks.depth > 0. Some urls may + // not get followed due to hash collisions. Hashing is used to reduce memory + // usage. 
+ Set alreadyFetched = new HashSet<>(); public FetchItemQueue(Configuration conf, int maxThreads, long crawlDelay, long minCrawlDelay) { diff --git a/src/java/org/apache/nutch/fetcher/FetcherThread.java b/src/java/org/apache/nutch/fetcher/FetcherThread.java index 77947b6..42d5d50 100644 --- a/src/java/org/apache/nutch/fetcher/FetcherThread.java +++ b/src/java/org/apache/nutch/fetcher/FetcherThread.java @@ -198,7 +198,7 @@ public class FetcherThread extends Thread { + " - forcing to byHost"); queueMode = FetchItemQueues.QUEUE_MODE_HOST; } -LOG.info("Using queue mode : " + queueMode); +LOG.info(getName() + " " + Thread.currentThread().getId() + " Using queue mode : " + queueMode); this.maxRedirect = conf.getInt("http.redirect.max", 3); maxOutlinksPerPage = conf.getInt("db.max.outlinks.per.page", 100); @@ -219,7 +219,7 @@ public class FetcherThread extends Thread { if (storingContent) { robotsTxtContent = new LinkedList<>(); } else { -LOG.warn("Ignoring fetcher.store.robotstxt because not storing content (fetcher.store.content)!"); +LOG.warn(getName() + " " + Thread.currentThread().getId() + " Ignoring fetcher.store.robotstxt because not storing content (fetcher.store.content)!"); } } } @@ -262,7 +262,7 @@ public class FetcherThread extends Thread { continue; } else { // all done, finish this thread -LOG.info("Thread " + getName() + " has no more work available"); +LOG.info(getName() + " " + Thread.currentThread().getId() + " has no more work available"); return; } } @@ -287,7 +287,7 @@ public class FetcherThread extends Thread { do { if (LOG.isInfoEnabled()) { - LOG.info("fetching " + fit.url + " (queue crawl delay=" + LOG.info(getName() + " " + Thread.currentThread().getId() + " fetching " + fit.url + " (queue crawl delay=" + ((FetchItemQueues) fetchQueues).getFetchItemQueue(fit.queueID).crawlDelay + "ms)"); } @@ -438,7 +438,7 @@ public class FetcherThread extends Thread { default: if (LOG.isWarnEnabled()) { -LOG.warn("Unknown ProtocolStatus: " + status.getCode()); 
+LOG.warn(getName() + " " + Thread.currentThread().getId() + " Unknown ProtocolStatus: " + status.getCode()); } output(fit.url, fit.datum, null, status, CrawlDatum.STATUS_FETCH_RETRY); @@ -447,7 +447,7 @@ public class FetcherThread extends Thread { if (redirecting && redirectCount > maxRedirect) { ((FetchItemQueues) fetchQueues).finishFetchItem(fit); if (LOG.isInfoEnabled()) { -LOG.info(" - redirect count exceeded " + fit.url); +LOG.info(getName() + " " + Thr
[nutch] branch master updated: NUTCH-2444 HostDB CSV dumper to emit field header by default
This is an automated email from the ASF dual-hosted git repository. markus pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git The following commit(s) were added to refs/heads/master by this push: new d7e4046 NUTCH-2444 HostDB CSV dumper to emit field header by default new 3c21a6b Merge branch 'master' of https://gitbox.apache.org/repos/asf/nutch d7e4046 is described below commit d7e4046e6e725ed759d0c43e37c51c5c3122e006 Author: Markus Jelsma <mar...@apache.org> AuthorDate: Mon Oct 23 15:11:17 2017 +0200 NUTCH-2444 HostDB CSV dumper to emit field header by default --- src/java/org/apache/nutch/hostdb/ReadHostDb.java | 10 +- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/java/org/apache/nutch/hostdb/ReadHostDb.java b/src/java/org/apache/nutch/hostdb/ReadHostDb.java index 54649e4..28a7eb7 100644 --- a/src/java/org/apache/nutch/hostdb/ReadHostDb.java +++ b/src/java/org/apache/nutch/hostdb/ReadHostDb.java @@ -62,6 +62,7 @@ public class ReadHostDb extends Configured implements Tool { private static final Logger LOG = LoggerFactory .getLogger(MethodHandles.lookup().lookupClass()); + public static final String HOSTDB_DUMP_HEADER = "hostdb.dump.field.header"; public static final String HOSTDB_DUMP_HOSTNAMES = "hostdb.dump.hostnames"; public static final String HOSTDB_DUMP_HOMEPAGES = "hostdb.dump.homepages"; public static final String HOSTDB_FILTER_EXPRESSION = "hostdb.filter.expression"; @@ -69,12 +70,14 @@ public class ReadHostDb extends Configured implements Tool { static class ReadHostDbMapper extends Mapper<Text, HostDatum, Text, Text> { protected boolean dumpHostnames = false; protected boolean dumpHomepages = false; +protected boolean fieldHeader = true; protected Text emptyText = new Text(); protected Expression expr = null; public void setup(Context context) { dumpHomepages = context.getConfiguration().getBoolean(HOSTDB_DUMP_HOMEPAGES, false); dumpHostnames = 
context.getConfiguration().getBoolean(HOSTDB_DUMP_HOSTNAMES, false); + fieldHeader = context.getConfiguration().getBoolean(HOSTDB_DUMP_HEADER, true); String expr = context.getConfiguration().get(HOSTDB_FILTER_EXPRESSION); if (expr != null) { // Create or retrieve a JexlEngine @@ -89,7 +92,12 @@ public class ReadHostDb extends Configured implements Tool { } } -public void map(Text key, HostDatum datum, Context context) throws IOException, InterruptedException { +public void map(Text key, HostDatum datum, Context context) throws IOException, InterruptedException { + if (fieldHeader && !dumpHomepages && !dumpHostnames) { +context.write(new Text("hostname"), new Text("unfetched\tfetched\tgone\tredirTemp\tredirPerm\tredirSum\tok\tnumRecords\tdnsFail\tcnxFail\tsumFail\tscore\tlastCheck\thomepage\tmetadata")); +fieldHeader = false; + } + if (expr != null) { // Create a context and add data JexlContext jcontext = new MapContext(); -- To stop receiving notification emails like this one, please contact ['"commits@nutch.apache.org" <commits@nutch.apache.org>'].
[nutch] branch master updated: NUTCH-2367 Get single record from HostDB
This is an automated email from the ASF dual-hosted git repository. markus pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git The following commit(s) were added to refs/heads/master by this push: new be3aea1 NUTCH-2367 Get single record from HostDB be3aea1 is described below commit be3aea1410835b34cfacdff7c3def9fb01a83e76 Author: Markus Jelsma <mar...@apache.org> AuthorDate: Thu Mar 16 11:40:02 2017 +0100 NUTCH-2367 Get single record from HostDB --- src/java/org/apache/nutch/hostdb/ReadHostDb.java | 39 ++-- 1 file changed, 37 insertions(+), 2 deletions(-) diff --git a/src/java/org/apache/nutch/hostdb/ReadHostDb.java b/src/java/org/apache/nutch/hostdb/ReadHostDb.java index 5b08504..17e135a 100644 --- a/src/java/org/apache/nutch/hostdb/ReadHostDb.java +++ b/src/java/org/apache/nutch/hostdb/ReadHostDb.java @@ -30,9 +30,11 @@ import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.io.FloatWritable; import org.apache.hadoop.io.IntWritable; +import org.apache.hadoop.io.SequenceFile; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.Writable; import org.apache.hadoop.mapreduce.Job; +import org.apache.hadoop.mapred.SequenceFileOutputFormat; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; @@ -200,6 +202,29 @@ public class ReadHostDb extends Configured implements Tool { long end = System.currentTimeMillis(); LOG.info("ReadHostDb: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end)); } + + private void getHostDbRecord(Path hostDb, String host) throws Exception { +Configuration conf = getConf(); +SequenceFile.Reader[] readers = SequenceFileOutputFormat.getReaders(conf, hostDb); + +Class keyClass = readers[0].getKeyClass(); +Class valueClass = readers[0].getValueClass(); + +if 
(!keyClass.getName().equals("org.apache.hadoop.io.Text")) + throw new IOException("Incompatible key (" + keyClass.getName() + ")"); + +Text key = (Text) keyClass.newInstance(); +HostDatum value = (HostDatum) valueClass.newInstance(); + +for (int i = 0; i < readers.length; i++) { + while (readers[i].next(key, value)) { +if (host.equals(key.toString())) { + System.out.println(value.toString()); +} + } + readers[i].close(); +} + } public static void main(String args[]) throws Exception { int res = ToolRunner.run(NutchConfiguration.create(), new ReadHostDb(), args); @@ -208,13 +233,14 @@ public class ReadHostDb extends Configured implements Tool { public int run(String[] args) throws Exception { if (args.length < 2) { - System.err.println("Usage: ReadHostDb [-dumpHomepages | -dumpHostnames | -expr ]"); + System.err.println("Usage: ReadHostDb [-get ] [ [-dumpHomepages | -dumpHostnames | -expr ]]"); return -1; } boolean dumpHomepages = false; boolean dumpHostnames = false; String expr = null; +String get = null; for (int i = 0; i < args.length; i++) { if (args[i].equals("-dumpHomepages")) { @@ -225,6 +251,11 @@ public class ReadHostDb extends Configured implements Tool { LOG.info("ReadHostDb: dumping hostnames"); dumpHostnames = true; } + if (args[i].equals("-get")) { +get = args[i + 1]; +LOG.info("ReadHostDb: get: "+ get); +i++; + } if (args[i].equals("-expr")) { expr = args[i + 1]; LOG.info("ReadHostDb: evaluating expression: " + expr); @@ -233,7 +264,11 @@ public class ReadHostDb extends Configured implements Tool { } try { - readHostDb(new Path(args[0]), new Path(args[1]), dumpHomepages, dumpHostnames, expr); + if (get != null) { +getHostDbRecord(new Path(args[0], "current"), get); + } else { +readHostDb(new Path(args[0]), new Path(args[1]), dumpHomepages, dumpHostnames, expr); + } return 0; } catch (Exception e) { LOG.error("ReadHostDb: " + StringUtils.stringifyException(e)); -- To stop receiving notification emails like this one, please contact 
['"commits@nutch.apache.org" <commits@nutch.apache.org>'].
[nutch] branch master updated: NUTCH-2366 Deprecated Job constructor in hostdb/ReadHostDb.java\
This is an automated email from the ASF dual-hosted git repository. markus pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git The following commit(s) were added to refs/heads/master by this push: new 3926910 NUTCH-2366 Deprecated Job constructor in hostdb/ReadHostDb.java\ 3926910 is described below commit 3926910e145df083ec9d42cd397c0cbd9b3a16da Author: Markus Jelsma <mar...@apache.org> AuthorDate: Wed Mar 15 13:04:25 2017 +0100 NUTCH-2366 Deprecated Job constructor in hostdb/ReadHostDb.java\ --- src/java/org/apache/nutch/hostdb/ReadHostDb.java | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/java/org/apache/nutch/hostdb/ReadHostDb.java b/src/java/org/apache/nutch/hostdb/ReadHostDb.java index ab3ec0c..5b08504 100644 --- a/src/java/org/apache/nutch/hostdb/ReadHostDb.java +++ b/src/java/org/apache/nutch/hostdb/ReadHostDb.java @@ -173,7 +173,8 @@ public class ReadHostDb extends Configured implements Tool { conf.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false); conf.set("mapred.textoutputformat.separator", "\t"); -Job job = new Job(conf, "ReadHostDb"); +Job job = Job.getInstance(conf); +job.setJobName("ReadHostDb"); job.setJarByClass(ReadHostDb.class); FileInputFormat.addInputPath(job, new Path(hostDb, "current")); @@ -239,4 +240,4 @@ public class ReadHostDb extends Configured implements Tool { return -1; } } -} \ No newline at end of file +} -- To stop receiving notification emails like this one, please contact ['"commits@nutch.apache.org" <commits@nutch.apache.org>'].
[nutch] branch master updated: remove test again
This is an automated email from the ASF dual-hosted git repository. markus pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git The following commit(s) were added to refs/heads/master by this push: new 6d47e14 remove test again 6d47e14 is described below commit 6d47e14352540bdd0f0630e7a2aa0967f08122bc Author: Markus Jelsma <mar...@apache.org> AuthorDate: Wed Mar 15 12:59:21 2017 +0100 remove test again --- test | 0 1 file changed, 0 insertions(+), 0 deletions(-) diff --git a/test b/test deleted file mode 100644 index e69de29..000 -- To stop receiving notification emails like this one, please contact ['"commits@nutch.apache.org" <commits@nutch.apache.org>'].
[nutch] branch master updated: test markus using git box
This is an automated email from the ASF dual-hosted git repository. markus pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git The following commit(s) were added to refs/heads/master by this push: new 7143a4c test markus using git box 7143a4c is described below commit 7143a4c68f52905537a6f22c8b2d46cb7610e238 Author: Markus Jelsma <mar...@apache.org> AuthorDate: Wed Mar 15 12:58:15 2017 +0100 test markus using git box --- test | 0 1 file changed, 0 insertions(+), 0 deletions(-) diff --git a/test b/test new file mode 100644 index 000..e69de29 -- To stop receiving notification emails like this one, please contact ['"commits@nutch.apache.org" <commits@nutch.apache.org>'].
nutch git commit: NUTCH-2359 Parsefilter-regex raises IndexOutOfBoundsException when rules are ill-formed
Repository: nutch Updated Branches: refs/heads/master 76aedcb78 -> 9a9c4b32b NUTCH-2359 Parsefilter-regex raises IndexOutOfBoundsException when rules are ill-formed Project: http://git-wip-us.apache.org/repos/asf/nutch/repo Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/9a9c4b32 Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/9a9c4b32 Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/9a9c4b32 Branch: refs/heads/master Commit: 9a9c4b32b9c1ab9c47583a217665e4694272d58a Parents: 76aedcb Author: Markus Jelsma <mar...@apache.org> Authored: Tue Feb 14 14:15:32 2017 +0100 Committer: Markus Jelsma <mar...@apache.org> Committed: Tue Feb 14 14:15:32 2017 +0100 -- src/plugin/parsefilter-regex/README.txt | 41 .../parsefilter/regex/RegexParseFilter.java | 18 + 2 files changed, 52 insertions(+), 7 deletions(-) -- http://git-wip-us.apache.org/repos/asf/nutch/blob/9a9c4b32/src/plugin/parsefilter-regex/README.txt -- diff --git a/src/plugin/parsefilter-regex/README.txt b/src/plugin/parsefilter-regex/README.txt new file mode 100644 index 000..1fac05f --- /dev/null +++ b/src/plugin/parsefilter-regex/README.txt @@ -0,0 +1,41 @@ +Parsefilter-regex plugin + +Allow parsing and set custom defined fields using regex. Rules can be defined +in a separate rule file or in the nutch configuration. + +If a rule file is used, should create a text file regex-parsefilter.txt (which +is the default name of the rules file). To use a different filename, either +update the file value in plugin's build.xml or add parsefilter.regex.file +config to the nutch config. 
+ +ie: + + parsefilter.regex.file + + /path/to/rulefile + +\t\t\n + +ie: + my_first_field htmlh1 + my_second_field textmy_pattern + + +If a rule file is not used, rules can be directly set in the nutch config: + +ie: + + parsefilter.regex.rules + + my_first_field htmlh1 + my_second_field textmy_pattern + +http://git-wip-us.apache.org/repos/asf/nutch/blob/9a9c4b32/src/plugin/parsefilter-regex/src/java/org/apache/nutch/parsefilter/regex/RegexParseFilter.java -- diff --git a/src/plugin/parsefilter-regex/src/java/org/apache/nutch/parsefilter/regex/RegexParseFilter.java b/src/plugin/parsefilter-regex/src/java/org/apache/nutch/parsefilter/regex/RegexParseFilter.java index 6955166..f799e5f 100644 --- a/src/plugin/parsefilter-regex/src/java/org/apache/nutch/parsefilter/regex/RegexParseFilter.java +++ b/src/plugin/parsefilter-regex/src/java/org/apache/nutch/parsefilter/regex/RegexParseFilter.java @@ -179,13 +179,17 @@ public class RegexParseFilter implements HtmlParseFilter { while ((line = reader.readLine()) != null) { if (StringUtils.isNotBlank(line) && !line.startsWith("#")) { line = line.trim(); -String[] parts = line.split("\t"); - -String field = parts[0].trim(); -String source = parts[1].trim(); -String regex = parts[2].trim(); - -rules.put(field, new RegexRule(source, regex)); +String[] parts = line.split("\\s"); + +if (parts.length == 3) { +String field = parts[0].trim(); +String source = parts[1].trim(); +String regex = parts[2].trim(); + +rules.put(field, new RegexRule(source, regex)); +} else { +LOG.info("RegexParseFilter rule is invalid. " + line); +} } } }
nutch git commit: revert 2320
Repository: nutch Updated Branches: refs/heads/master 836b2e01d -> d4c924e56 revert 2320 Project: http://git-wip-us.apache.org/repos/asf/nutch/repo Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/d4c924e5 Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/d4c924e5 Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/d4c924e5 Branch: refs/heads/master Commit: d4c924e56030d6b1fa3b115686e80c8cf516db61 Parents: 836b2e0 Author: Markus Jelsma <mar...@apache.org> Authored: Thu Oct 6 10:56:50 2016 +0200 Committer: Markus Jelsma <mar...@apache.org> Committed: Thu Oct 6 10:56:50 2016 +0200 -- .../org/apache/nutch/net/URLFilterChecker.java | 181 ++- 1 file changed, 59 insertions(+), 122 deletions(-) -- http://git-wip-us.apache.org/repos/asf/nutch/blob/d4c924e5/src/java/org/apache/nutch/net/URLFilterChecker.java -- diff --git a/src/java/org/apache/nutch/net/URLFilterChecker.java b/src/java/org/apache/nutch/net/URLFilterChecker.java index 86b91e2..89a3d00 100644 --- a/src/java/org/apache/nutch/net/URLFilterChecker.java +++ b/src/java/org/apache/nutch/net/URLFilterChecker.java @@ -17,27 +17,16 @@ package org.apache.nutch.net; -import java.io.BufferedReader; -import java.io.InputStreamReader; -import java.io.PrintWriter; -import java.net.ServerSocket; -import java.net.Socket; -import java.net.InetSocketAddress; -import java.nio.charset.Charset; -import java.util.HashMap; -import java.util.Iterator; -import java.util.List; -import java.util.Map; - import org.apache.nutch.plugin.Extension; import org.apache.nutch.plugin.ExtensionPoint; import org.apache.nutch.plugin.PluginRepository; -import org.apache.nutch.util.NutchConfiguration; import org.apache.hadoop.conf.Configuration; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; +import org.apache.nutch.util.NutchConfiguration; + +import java.io.BufferedReader; +import java.io.InputStreamReader; /** * Checks one given filter or all filters. 
@@ -47,118 +36,62 @@ import org.slf4j.LoggerFactory; public class URLFilterChecker { private Configuration conf; - private static String filterName = null; - protected static boolean keepClientCnxOpen = false; - protected static int tcpPort = -1; - protected URLFilters filters = null; - - public static final Logger LOG = LoggerFactory - .getLogger(URLFilterChecker.class); - + public URLFilterChecker(Configuration conf) { -System.out.println("Checking combination of all URLFilters available"); this.conf = conf; -if (filterName != null) { -this.conf.set("plugin.includes", filterName); -} -filters = new URLFilters(this.conf); } - - public void run() throws Exception { -// In listening mode? -if (tcpPort == -1) { - // No, just fetch and display - checkStdin(); -} else { - // Listen on socket and start workers on incoming requests - listen(); -} - } - - private void listen() throws Exception { -ServerSocket server = null; - -try{ - server = new ServerSocket(); - server.bind(new InetSocketAddress(tcpPort)); - LOG.info(server.toString()); -} catch (Exception e) { - LOG.error("Could not listen on port " + tcpPort); - System.exit(-1); -} - -while(true){ - Worker worker; - try{ -worker = new Worker(server.accept()); -Thread thread = new Thread(worker); -thread.start(); - } catch (Exception e) { -LOG.error("Accept failed: " + tcpPort); -System.exit(-1); - } -} - } - - private class Worker implements Runnable { -private Socket client; -Worker(Socket client) { - this.client = client; - LOG.info(client.toString()); -} + private void checkOne(String filterName) throws Exception { +URLFilter filter = null; + +ExtensionPoint point = PluginRepository.get(conf).getExtensionPoint( +URLFilter.X_POINT_ID); + +if (point == null) + throw new RuntimeException(URLFilter.X_POINT_ID + " not found."); -public void run() { - if (keepClientCnxOpen) { -while (true) { // keep connection open until closes - readWrite(); -} +Extension[] extensions = point.getExtensions(); + +for (int i = 0; i < 
extensions.length; i++) { + Extension extension = extensions[i]; + filter = (URLFilter) extension.getExtensionInstance(); + if (filter.getClass().getName().equals(filterName)) { +break; } else { -readWrite(); - -try { // close ourselves - client.close(); -} catch (Exception e){ - LOG.error(e.toString()); -} +filter = null;
nutch git commit: NUTCH-2320 URLFilterChecker to run as TCP Telnet service
Repository: nutch Updated Branches: refs/heads/master e53b34b23 -> 836b2e01d NUTCH-2320 URLFilterChecker to run as TCP Telnet service Project: http://git-wip-us.apache.org/repos/asf/nutch/repo Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/836b2e01 Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/836b2e01 Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/836b2e01 Branch: refs/heads/master Commit: 836b2e01d1a4e0e9443601da755ea37de91b8c7d Parents: e53b34b Author: Markus Jelsma <mar...@apache.org> Authored: Wed Oct 5 14:53:05 2016 +0200 Committer: Markus Jelsma <mar...@apache.org> Committed: Wed Oct 5 14:53:05 2016 +0200 -- .../org/apache/nutch/net/URLFilterChecker.java | 181 +-- 1 file changed, 122 insertions(+), 59 deletions(-) -- http://git-wip-us.apache.org/repos/asf/nutch/blob/836b2e01/src/java/org/apache/nutch/net/URLFilterChecker.java -- diff --git a/src/java/org/apache/nutch/net/URLFilterChecker.java b/src/java/org/apache/nutch/net/URLFilterChecker.java index 89a3d00..86b91e2 100644 --- a/src/java/org/apache/nutch/net/URLFilterChecker.java +++ b/src/java/org/apache/nutch/net/URLFilterChecker.java @@ -17,16 +17,27 @@ package org.apache.nutch.net; +import java.io.BufferedReader; +import java.io.InputStreamReader; +import java.io.PrintWriter; +import java.net.ServerSocket; +import java.net.Socket; +import java.net.InetSocketAddress; +import java.nio.charset.Charset; +import java.util.HashMap; +import java.util.Iterator; +import java.util.List; +import java.util.Map; + import org.apache.nutch.plugin.Extension; import org.apache.nutch.plugin.ExtensionPoint; import org.apache.nutch.plugin.PluginRepository; +import org.apache.nutch.util.NutchConfiguration; import org.apache.hadoop.conf.Configuration; -import org.apache.nutch.util.NutchConfiguration; - -import java.io.BufferedReader; -import java.io.InputStreamReader; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** * Checks one given filter or all filters. 
@@ -36,62 +47,118 @@ import java.io.InputStreamReader; public class URLFilterChecker { private Configuration conf; - + private static String filterName = null; + protected static boolean keepClientCnxOpen = false; + protected static int tcpPort = -1; + protected URLFilters filters = null; + + public static final Logger LOG = LoggerFactory + .getLogger(URLFilterChecker.class); + public URLFilterChecker(Configuration conf) { +System.out.println("Checking combination of all URLFilters available"); this.conf = conf; +if (filterName != null) { +this.conf.set("plugin.includes", filterName); +} +filters = new URLFilters(this.conf); } + + public void run() throws Exception { +// In listening mode? +if (tcpPort == -1) { + // No, just fetch and display + checkStdin(); +} else { + // Listen on socket and start workers on incoming requests + listen(); +} + } + + private void listen() throws Exception { +ServerSocket server = null; + +try{ + server = new ServerSocket(); + server.bind(new InetSocketAddress(tcpPort)); + LOG.info(server.toString()); +} catch (Exception e) { + LOG.error("Could not listen on port " + tcpPort); + System.exit(-1); +} + +while(true){ + Worker worker; + try{ +worker = new Worker(server.accept()); +Thread thread = new Thread(worker); +thread.start(); + } catch (Exception e) { +LOG.error("Accept failed: " + tcpPort); +System.exit(-1); + } +} + } + + private class Worker implements Runnable { +private Socket client; - private void checkOne(String filterName) throws Exception { -URLFilter filter = null; - -ExtensionPoint point = PluginRepository.get(conf).getExtensionPoint( -URLFilter.X_POINT_ID); - -if (point == null) - throw new RuntimeException(URLFilter.X_POINT_ID + " not found."); - -Extension[] extensions = point.getExtensions(); +Worker(Socket client) { + this.client = client; + LOG.info(client.toString()); +} -for (int i = 0; i < extensions.length; i++) { - Extension extension = extensions[i]; - filter = (URLFilter) extension.getExtensionInstance(); 
- if (filter.getClass().getName().equals(filterName)) { -break; +public void run() { + if (keepClientCnxOpen) { +while (true) { // keep connection open until closes + readWrite(); +} } else { -filter = null; +readWrite(); + +try { // close ourselves + client.close(); +} catch (Exception e){ + LOG.error
nutch git commit: NUTCH-2272 Index checker server to optionally keep client connection open
Repository: nutch Updated Branches: refs/heads/master 7956daee8 -> beb48a84b NUTCH-2272 Index checker server to optionally keep client connection open Project: http://git-wip-us.apache.org/repos/asf/nutch/repo Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/beb48a84 Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/beb48a84 Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/beb48a84 Branch: refs/heads/master Commit: beb48a84b2be52f92af24956ae59286ad116913c Parents: 7956dae Author: Markus Jelsma <mar...@apache.org> Authored: Fri Jun 3 15:02:12 2016 +0200 Committer: Markus Jelsma <mar...@apache.org> Committed: Fri Jun 3 15:02:12 2016 +0200 -- CHANGES.txt | 1 + .../nutch/indexer/IndexingFiltersChecker.java | 35 ++-- 2 files changed, 25 insertions(+), 11 deletions(-) -- http://git-wip-us.apache.org/repos/asf/nutch/blob/beb48a84/CHANGES.txt -- diff --git a/CHANGES.txt b/CHANGES.txt index ffcf5ae..877f23b 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -37,6 +37,7 @@ Bug Improvement +[NUTCH-2272] - Index checker server to optionally keep client connection open [NUTCH-1233] - Rely on Tika for outlink extraction [NUTCH-1712] - Use MultipleInputs in Injector to make it a single mapreduce job [NUTCH-2172] - index-more: document format of contenttype-mapping.txt http://git-wip-us.apache.org/repos/asf/nutch/blob/beb48a84/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java -- diff --git a/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java b/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java index da4123f..2e1b9c2 100644 --- a/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java +++ b/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java @@ -69,6 +69,7 @@ public class IndexingFiltersChecker extends Configured implements Tool { protected URLNormalizers normalizers = null; protected boolean dumpText = false; protected boolean followRedirects = false; + protected boolean keepClientCnxOpen = false; // used to 
simulate the metadata propagated from injection protected HashMap<String, String> metadata = new HashMap<String, String>(); protected int tcpPort = -1; @@ -82,7 +83,7 @@ public class IndexingFiltersChecker extends Configured implements Tool { public int run(String[] args) throws Exception { String url = null; -String usage = "Usage: IndexingFiltersChecker [-normalize] [-followRedirects] [-dumpText] [-md key=value] [-listen ] "; +String usage = "Usage: IndexingFiltersChecker [-normalize] [-followRedirects] [-dumpText] [-md key=value] [-listen ] [-keepClientCnxOpen]"; if (args.length == 0) { System.err.println(usage); @@ -96,6 +97,8 @@ public class IndexingFiltersChecker extends Configured implements Tool { tcpPort = Integer.parseInt(args[++i]); } else if (args[i].equals("-followRedirects")) { followRedirects = true; + } else if (args[i].equals("-keepClientCnxOpen")) { +keepClientCnxOpen = true; } else if (args[i].equals("-dumpText")) { dumpText = true; } else if (args[i].equals("-md")) { @@ -164,7 +167,23 @@ public class IndexingFiltersChecker extends Configured implements Tool { LOG.info(client.toString()); } -public void run(){ +public void run() { + if (keepClientCnxOpen) { +while (true) { // keep connection open until closes + readWrite(); +} + } else { +readWrite(); + +try { // close ourselves + client.close(); +} catch (Exception e){ + LOG.error(e.toString()); +} + } +} + +protected void readWrite() { String line; BufferedReader in = null; PrintWriter out = null; @@ -185,14 +204,6 @@ public class IndexingFiltersChecker extends Configured implements Tool { }catch (Exception e) { LOG.error("Read/Write failed: " + e); } - - try { -client.close(); - } catch (Exception e){ -LOG.error(e.toString()); - } - - return; } } @@ -331,6 +342,8 @@ public class IndexingFiltersChecker extends Configured implements Tool { } } } + +output.append("\n"); // For readability if keepClientCnxOpen if (getConf().getBoolean("doIndex", false) && doc != null) { IndexWriters writers = new 
IndexWriters(getConf()); @@ -355,4 +368,4 @@ public class IndexingFiltersChecker extends Configured implements Tool { new IndexingFiltersChecker(), args); System.exit(res); } -} +} \ No newline at end of file
svn commit: r1732332 - /nutch/trunk/src/java/org/apache/nutch/util/JexlUtil.java
Author: markus Date: Thu Feb 25 16:44:18 2016 New Revision: 1732332 URL: http://svn.apache.org/viewvc?rev=1732332=rev Log: NUTCH-2231 Jexl support in generator job Modified: nutch/trunk/src/java/org/apache/nutch/util/JexlUtil.java Modified: nutch/trunk/src/java/org/apache/nutch/util/JexlUtil.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/JexlUtil.java?rev=1732332=1732331=1732332=diff == --- nutch/trunk/src/java/org/apache/nutch/util/JexlUtil.java (original) +++ nutch/trunk/src/java/org/apache/nutch/util/JexlUtil.java Thu Feb 25 16:44:18 2016 @@ -47,6 +47,8 @@ public class JexlUtil { * @return parsed Jexl expression or null in case of parse error */ public static Expression parseExpression(String expr) { +if (expr == null) return null; + try { // Translate any date object into a long, dates must be specified as 20-03-2016T00:00:00Z Matcher matcher = datePattern.matcher(expr);
svn commit: r1732177 - in /nutch/trunk: CHANGES.txt src/java/org/apache/nutch/crawl/CrawlDatum.java src/java/org/apache/nutch/crawl/CrawlDbReader.java src/java/org/apache/nutch/crawl/Generator.java sr
Author: markus Date: Wed Feb 24 15:51:21 2016 New Revision: 1732177 URL: http://svn.apache.org/viewvc?rev=1732177=rev Log: NUTCH-2231 Jexl support in generator job Added: nutch/trunk/src/java/org/apache/nutch/util/JexlUtil.java Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1732177=1732176=1732177=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Wed Feb 24 15:51:21 2016 @@ -10,6 +10,8 @@ in the release announcement and keep it Nutch Change Log +* NUTCH-2231 Jexl support in generator job (markus) + * NUTCH-2232 DeduplicationJob should decode URL's before length is compared (Ron van der Vegt via markus) * NUTCH-2229 Allow Jexl expressions on CrawlDatum's fixed attributes (markus) Modified: nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java?rev=1732177=1732176=1732177=diff == --- nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java (original) +++ nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java Wed Feb 24 15:51:21 2016 @@ -534,7 +534,7 @@ public class CrawlDatum implements Writa jcontext.set("interval", new Integer(getFetchInterval())); jcontext.set("score", getScore()); jcontext.set("signature", StringUtil.toHexString(getSignature())); - + // Set metadata variables for (Map.Entry<Writable, Writable> entry : getMetaData().entrySet()) { Object value = entry.getValue(); @@ -553,15 +553,11 @@ public class CrawlDatum implements Writa if (value instanceof Text) { Text tvalue = (Text)value; - Text tkey = (Text)entry.getKey(); - - try { -Float number = Float.parseFloat(tvalue.toString()); -jcontext.set(tkey.toString(), number); - } catch (Exception e) {} + Text tkey = 
(Text)entry.getKey(); + jcontext.set(tkey.toString().replace("-", "_"), tvalue.toString()); } } - + try { if (Boolean.TRUE.equals(expr.evaluate(jcontext))) { return true; Modified: nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java?rev=1732177=1732176=1732177=diff == --- nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java (original) +++ nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java Wed Feb 24 15:51:21 2016 @@ -65,6 +65,7 @@ import org.apache.hadoop.mapred.lib.Iden import org.apache.hadoop.util.Progressable; import org.apache.hadoop.util.Tool; import org.apache.hadoop.util.ToolRunner; +import org.apache.nutch.util.JexlUtil; import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.NutchJob; import org.apache.nutch.util.StringUtil; @@ -508,8 +509,10 @@ public class CrawlDbReader extends Confi job.set("regex", regex); if (retry != null) job.setInt("retry", retry); -if (expr != null) +if (expr != null) { job.set("expr", expr); + LOG.info("CrawlDb db: expr: " + expr); +} job.setMapperClass(CrawlDbDumpMapper.class); job.setOutputKeyClass(Text.class); @@ -523,7 +526,6 @@ public class CrawlDbReader extends Confi public static class CrawlDbDumpMapper implements Mapper<Text, CrawlDatum, Text, CrawlDatum> { -Pattern datePattern = Pattern.compile("\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}Z"); Pattern pattern = null; Matcher matcher = null; String status = null; @@ -536,30 +538,9 @@ public class CrawlDbReader extends Confi } status = job.get("status", null); retry = job.getInt("retry", -1); - String exprStr = job.get("expr", null); if (job.get("expr", null) != null) { -try { - // Translate any date object into a long, dates must be specified as 20-03-2016T00:00:00Z - Matcher matcher = datePattern.matcher(exprStr); - if (matcher.find()) { -String date = matcher.group(); - -// Parse the thing and get epoch! 
-Date parsedDate = DateUtils.parseDateStrictly(date, new String[] {"-MM-dd'T'HH:mm:ss'Z'"}); -
svn commit: r1732160 - in /nutch/trunk: CHANGES.txt src/java/org/apache/nutch/crawl/DeduplicationJob.java
Author: markus Date: Wed Feb 24 14:12:42 2016 New Revision: 1732160 URL: http://svn.apache.org/viewvc?rev=1732160=rev Log: NUTCH-2232 DeduplicationJob should decode URL's before length is compared Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/java/org/apache/nutch/crawl/DeduplicationJob.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1732160=1732159=1732160=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Wed Feb 24 14:12:42 2016 @@ -10,6 +10,8 @@ in the release announcement and keep it Nutch Change Log +* NUTCH-2232 DeduplicationJob should decode URL's before length is compared (Ron van der Vegt via markus) + * NUTCH-2229 Allow Jexl expressions on CrawlDatum's fixed attributes (markus) * NUTCH-2227 RegexParseFilter (markus) Modified: nutch/trunk/src/java/org/apache/nutch/crawl/DeduplicationJob.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/DeduplicationJob.java?rev=1732160=1732159=1732160=diff == --- nutch/trunk/src/java/org/apache/nutch/crawl/DeduplicationJob.java (original) +++ nutch/trunk/src/java/org/apache/nutch/crawl/DeduplicationJob.java Wed Feb 24 14:12:42 2016 @@ -17,6 +17,8 @@ package org.apache.nutch.crawl; import java.io.IOException; +import java.io.UnsupportedEncodingException; +import java.net.URLDecoder; import java.text.SimpleDateFormat; import java.util.HashMap; import java.util.Iterator; @@ -193,8 +195,15 @@ public class DeduplicationJob extends Nu break; case "urlLength": // same time? 
keep the one which has the shortest URL - String urlExisting = existingDoc.getMetaData().get(urlKey).toString(); - String urlnewDoc = newDoc.getMetaData().get(urlKey).toString(); + String urlExisting; + String urlnewDoc; + try { +urlExisting = URLDecoder.decode(existingDoc.getMetaData().get(urlKey).toString(), "UTF8"); +urlnewDoc = URLDecoder.decode(newDoc.getMetaData().get(urlKey).toString(), "UTF8"); + } catch (UnsupportedEncodingException e) { +LOG.error("Error decoding: " + urlKey); +throw new IOException("UnsupportedEncodingException for " + urlKey); + } if (urlExisting.length() < urlnewDoc.length()) { // mark new one as duplicate writeOutAsDuplicate(newDoc, output, reporter);
svn commit: r1732140 - in /nutch/trunk: CHANGES.txt src/java/org/apache/nutch/crawl/CrawlDatum.java src/java/org/apache/nutch/crawl/CrawlDbReader.java
Author: markus Date: Wed Feb 24 13:05:02 2016 New Revision: 1732140 URL: http://svn.apache.org/viewvc?rev=1732140=rev Log: NUTCH-2229 Allow Jexl expressions on CrawlDatum's fixed attributes Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1732140=1732139=1732140=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Wed Feb 24 13:05:02 2016 @@ -10,6 +10,8 @@ in the release announcement and keep it Nutch Change Log +* NUTCH-2229 Allow Jexl expressions on CrawlDatum's fixed attributes (markus) + * NUTCH-2227 RegexParseFilter (markus) * NUTCH-2221 Introduce db.ignore.internal.links to FetcherThread (markus) Modified: nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java?rev=1732140=1732139=1732140=diff == --- nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java (original) +++ nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java Wed Feb 24 13:05:02 2016 @@ -521,30 +521,20 @@ public class CrawlDatum implements Writa } } - public boolean evaluate(String expr) { -return evaluate(expr, true, true); - } - - public boolean evaluate(String expr, boolean silent, boolean strict) { -if (expr != null) { - // Create or retrieve a JexlEngine - JexlEngine jexl = new JexlEngine(); - - jexl.setSilent(silent); - jexl.setStrict(strict); - - // Create an expression object and evaluate - return evaluate(jexl.createExpression(expr)); -} - -return false; - } - public boolean evaluate(Expression expr) { if (expr != null) { // Create a context and add data JexlContext jcontext = new MapContext(); - + + // https://issues.apache.org/jira/browse/NUTCH-2229 + jcontext.set("status", getStatusName(getStatus())); + jcontext.set("fetchTime", (long)(getFetchTime())); + 
jcontext.set("modifiedTime", (long)(getModifiedTime())); + jcontext.set("retries", getRetriesSinceFetch()); + jcontext.set("interval", new Integer(getFetchInterval())); + jcontext.set("score", getScore()); + jcontext.set("signature", StringUtil.toHexString(getSignature())); + // Set metadata variables for (Map.Entry<Writable, Writable> entry : getMetaData().entrySet()) { Object value = entry.getValue(); @@ -571,7 +561,7 @@ public class CrawlDatum implements Writa } catch (Exception e) {} } } - + try { if (Boolean.TRUE.equals(expr.evaluate(jcontext))) { return true; Modified: nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java?rev=1732140=1732139=1732140=diff == --- nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java (original) +++ nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java Wed Feb 24 13:05:02 2016 @@ -70,6 +70,7 @@ import org.apache.nutch.util.NutchJob; import org.apache.nutch.util.StringUtil; import org.apache.commons.jexl2.Expression; import org.apache.commons.jexl2.JexlEngine; +import org.apache.commons.lang.time.DateUtils; /** * Read utility for the CrawlDB. 
@@ -522,6 +523,7 @@ public class CrawlDbReader extends Confi public static class CrawlDbDumpMapper implements Mapper<Text, CrawlDatum, Text, CrawlDatum> { +Pattern datePattern = Pattern.compile("\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}Z"); Pattern pattern = null; Matcher matcher = null; String status = null; @@ -534,12 +536,30 @@ public class CrawlDbReader extends Confi } status = job.get("status", null); retry = job.getInt("retry", -1); - + String exprStr = job.get("expr", null); + if (job.get("expr", null) != null) { -JexlEngine jexl = new JexlEngine(); -jexl.setSilent(true); -jexl.setStrict(true); -expr = jexl.createExpression(job.get("expr", null)); +try { + // Translate any date object into a long, dates must be specified as 20-03-2016T00:00:00Z + Matcher matcher = datePattern.matcher(exprStr); + if (matcher.find()) { +String date = matcher.group(); + +// Parse the thing and get epoch! +
svn commit: r1731849 - in /nutch/trunk: ./ conf/ src/plugin/ src/plugin/parsefilter-regex/ src/plugin/parsefilter-regex/data/ src/plugin/parsefilter-regex/src/ src/plugin/parsefilter-regex/src/java/ s
Author: markus Date: Tue Feb 23 12:58:54 2016 New Revision: 1731849 URL: http://svn.apache.org/viewvc?rev=1731849=rev Log: NUTCH-2227 RegexParseFilter Added: nutch/trunk/conf/regex-parsefilter.txt nutch/trunk/src/plugin/parsefilter-regex/ nutch/trunk/src/plugin/parsefilter-regex/build.xml nutch/trunk/src/plugin/parsefilter-regex/data/ nutch/trunk/src/plugin/parsefilter-regex/data/regex-parsefilter.txt nutch/trunk/src/plugin/parsefilter-regex/ivy.xml nutch/trunk/src/plugin/parsefilter-regex/plugin.xml nutch/trunk/src/plugin/parsefilter-regex/src/ nutch/trunk/src/plugin/parsefilter-regex/src/java/ nutch/trunk/src/plugin/parsefilter-regex/src/java/org/ nutch/trunk/src/plugin/parsefilter-regex/src/java/org/apache/ nutch/trunk/src/plugin/parsefilter-regex/src/java/org/apache/nutch/ nutch/trunk/src/plugin/parsefilter-regex/src/java/org/apache/nutch/parsefilter/ nutch/trunk/src/plugin/parsefilter-regex/src/java/org/apache/nutch/parsefilter/regex/ nutch/trunk/src/plugin/parsefilter-regex/src/java/org/apache/nutch/parsefilter/regex/RegexParseFilter.java nutch/trunk/src/plugin/parsefilter-regex/src/java/org/apache/nutch/parsefilter/regex/package-info.java nutch/trunk/src/plugin/parsefilter-regex/src/test/ nutch/trunk/src/plugin/parsefilter-regex/src/test/org/ nutch/trunk/src/plugin/parsefilter-regex/src/test/org/apache/ nutch/trunk/src/plugin/parsefilter-regex/src/test/org/apache/nutch/ nutch/trunk/src/plugin/parsefilter-regex/src/test/org/apache/nutch/parsefilter/ nutch/trunk/src/plugin/parsefilter-regex/src/test/org/apache/nutch/parsefilter/regex/ nutch/trunk/src/plugin/parsefilter-regex/src/test/org/apache/nutch/parsefilter/regex/TestRegexParseFilter.java Modified: nutch/trunk/CHANGES.txt nutch/trunk/build.xml nutch/trunk/default.properties nutch/trunk/src/plugin/build.xml Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1731849=1731848=1731849=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Tue Feb 
23 12:58:54 2016 @@ -10,6 +10,8 @@ in the release announcement and keep it Nutch Change Log +* NUTCH-2227 RegexParseFilter (markus) + * NUTCH-2221 Introduce db.ignore.internal.links to FetcherThread (markus) * NUTCH-2220 Rename db.* options used only by the linkdb to linkdb.* (markus) Modified: nutch/trunk/build.xml URL: http://svn.apache.org/viewvc/nutch/trunk/build.xml?rev=1731849=1731848=1731849=diff == --- nutch/trunk/build.xml (original) +++ nutch/trunk/build.xml Tue Feb 23 12:58:54 2016 @@ -200,6 +200,7 @@ + @@ -637,6 +638,7 @@ + @@ -1048,6 +1050,8 @@ + + Added: nutch/trunk/conf/regex-parsefilter.txt URL: http://svn.apache.org/viewvc/nutch/trunk/conf/regex-parsefilter.txt?rev=1731849=auto == --- nutch/trunk/conf/regex-parsefilter.txt (added) +++ nutch/trunk/conf/regex-parsefilter.txt Tue Feb 23 12:58:54 2016 @@ -0,0 +1,8 @@ +# Example configuration file for parsefilter-regex +# +# Parse metadata field is set to true if the HTML matches the regex. The +# source can either be html or text. If source is html, the regex is applied to +# the entire HTML tree. If source is text, the regex is applied to the +# extracted text. 
+# +# format: \t\t\n Modified: nutch/trunk/default.properties URL: http://svn.apache.org/viewvc/nutch/trunk/default.properties?rev=1731849=1731848=1731849=diff == --- nutch/trunk/default.properties (original) +++ nutch/trunk/default.properties Tue Feb 23 12:58:54 2016 @@ -143,6 +143,7 @@ plugins.parse=\ plugins.parsefilter=\ org.apache.nutch.parse.headings*:\ org.apache.nutch.parsefilter.naivebayes*:\ + org.apache.nutch.parsefilter.regex*:\ org.apache.nutch.parse.metatags* # Modified: nutch/trunk/src/plugin/build.xml URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/build.xml?rev=1731849=1731848=1731849=diff == --- nutch/trunk/src/plugin/build.xml (original) +++ nutch/trunk/src/plugin/build.xml Tue Feb 23 12:58:54 2016 @@ -77,6 +77,7 @@ + @@ -114,6 +115,7 @@ + @@ -176,6 +178,7 @@ + Added: nutch/trunk/src/plugin/parsefilter-regex/build.xml URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parsefilter-regex/build.xml?rev=1731849=auto
svn commit: r1731836 - in /nutch/trunk: CHANGES.txt conf/nutch-default.xml src/java/org/apache/nutch/fetcher/FetcherThread.java src/java/org/apache/nutch/parse/ParseOutputFormat.java
Author: markus Date: Tue Feb 23 10:38:31 2016 New Revision: 1731836 URL: http://svn.apache.org/viewvc?rev=1731836=rev Log: NUTCH-2221 Introduce db.ignore.internal.links to FetcherThread Modified: nutch/trunk/CHANGES.txt nutch/trunk/conf/nutch-default.xml nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherThread.java nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1731836=1731835=1731836=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Tue Feb 23 10:38:31 2016 @@ -10,6 +10,8 @@ in the release announcement and keep it Nutch Change Log +* NUTCH-2221 Introduce db.ignore.internal.links to FetcherThread (markus) + * NUTCH-2220 Rename db.* options used only by the linkdb to linkdb.* (markus) * NUTCH-2228 Plugin index-replace unit test broken on Java 8 (snagel via markus) Modified: nutch/trunk/conf/nutch-default.xml URL: http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1731836=1731835=1731836=diff == --- nutch/trunk/conf/nutch-default.xml (original) +++ nutch/trunk/conf/nutch-default.xml Tue Feb 23 10:38:31 2016 @@ -538,6 +538,16 @@ + db.ignore.internal.links + false + If true, outlinks leading from a page to internal hosts or domain + will be ignored. This is an effective way to limit the crawl to include + only initially injected hosts, without creating complex URLFilters. + See 'db.ignore.external.links.mode'. 
+ + + + db.ignore.external.links false If true, outlinks leading from a page to external hosts or domain Modified: nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherThread.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherThread.java?rev=1731836=1731835=1731836=diff == --- nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherThread.java (original) +++ nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherThread.java Tue Feb 23 10:38:31 2016 @@ -84,6 +84,7 @@ public class FetcherThread extends Threa private String reprUrl; private boolean redirecting; private int redirectCount; + private boolean ignoreInternalLinks; private boolean ignoreExternalLinks; private String ignoreExternalLinksMode; @@ -174,6 +175,7 @@ public class FetcherThread extends Threa maxOutlinks = (maxOutlinksPerPage < 0) ? Integer.MAX_VALUE : maxOutlinksPerPage; interval = conf.getInt("db.fetch.interval.default", 2592000); +ignoreInternalLinks = conf.getBoolean("db.ignore.internal.links", false); ignoreExternalLinks = conf.getBoolean("db.ignore.external.links", false); ignoreExternalLinksMode = conf.get("db.ignore.external.links.mode", "byHost"); maxOutlinkDepth = conf.getInt("fetcher.follow.outlinks.depth", -1); @@ -428,10 +430,10 @@ public class FetcherThread extends Threa newUrl = normalizers.normalize(newUrl, URLNormalizers.SCOPE_FETCHER); newUrl = urlFilters.filter(newUrl); -if (ignoreExternalLinks) { - try { -String origHost = new URL(urlString).getHost().toLowerCase(); -String newHost = new URL(newUrl).getHost().toLowerCase(); +try { + String origHost = new URL(urlString).getHost().toLowerCase(); + String newHost = new URL(newUrl).getHost().toLowerCase(); + if (ignoreExternalLinks) { if (!origHost.equals(newHost)) { if (LOG.isDebugEnabled()) { LOG.debug(" - ignoring redirect " + redirType + " from " @@ -440,10 +442,20 @@ public class FetcherThread extends Threa } return null; } - } catch (MalformedURLException e) { } -} - + + if 
(ignoreInternalLinks) { +if (origHost.equals(newHost)) { + if (LOG.isDebugEnabled()) { +LOG.debug(" - ignoring redirect " + redirType + " from " ++ urlString + " to " + newUrl ++ " because internal links are ignored"); + } + return null; +} + } +} catch (MalformedURLException e) { } + if (newUrl != null && !newUrl.equals(urlString)) { reprUrl = URLUtil.chooseRepr(reprUrl, newUrl, temp); url = new Text(newUrl); @@ -621,7 +633,7 @@ public class FetcherThread extends Threa // collect outlinks for subsequent db update Outlink[] links = parseData.getOutlinks(); int outlinksToStore = Math.min(maxOutlinks, links.length); - if (ignoreExternalLinks) { + if (ignoreExternalLinks || ignoreInternalLinks) { URL originU
svn commit: r1731831 - in /nutch/trunk: CHANGES.txt conf/nutch-default.xml src/java/org/apache/nutch/crawl/LinkDb.java src/java/org/apache/nutch/crawl/LinkDbMerger.java
Author: markus Date: Tue Feb 23 10:23:24 2016 New Revision: 1731831 URL: http://svn.apache.org/viewvc?rev=1731831=rev Log: NUTCH-2220 Rename db.* options used only by the linkdb to linkdb.* Modified: nutch/trunk/CHANGES.txt nutch/trunk/conf/nutch-default.xml nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbMerger.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1731831=1731830=1731831=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Tue Feb 23 10:23:24 2016 @@ -1,5 +1,17 @@ +Fellow committers, Nutch 1.12 contains a breaking change NUTCH-2220. Please use the note below and +in the release announcement and keep it on top in this CHANGES.txt for the Nutch 1.12 release. + +* replace your old conf/nutch-default.xml with the conf/nutch-default.xml from Nutch 1.12 release +* if you use LinkDB (e.g. invertlinks) and modified parameters db.max.inlinks and/or db.max.anchor.length + and/or db.ignore.internal.links, rename those parameters to linkdb.max.inlinks and + linkdb.max.anchor.length and linkdb.ignore.internal.links +* db.ignore.internal.links and db.ignore.external.links now operate on the CrawlDB only +* linkdb.ignore.internal.links and linkdb.ignore.external.links now operate on the LinkDB only + Nutch Change Log +* NUTCH-2220 Rename db.* options used only by the linkdb to linkdb.* (markus) + * NUTCH-2228 Plugin index-replace unit test broken on Java 8 (snagel via markus) * NUTCH-2219 Criteria order to be configurable in DeduplicationJob (Ron van der Vegt via markus) Modified: nutch/trunk/conf/nutch-default.xml URL: http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1731831=1731830=1731831=diff == --- nutch/trunk/conf/nutch-default.xml (original) +++ nutch/trunk/conf/nutch-default.xml Tue Feb 23 10:23:24 2016 @@ -538,16 +538,6 @@ - db.ignore.internal.links - true - If true, when adding new links to a page, links from 
- the same host are ignored. This is an effective way to limit the - size of the link database, keeping only the highest quality - links. - - - - db.ignore.external.links false If true, outlinks leading from a page to external hosts or domain @@ -616,15 +606,6 @@ - db.max.inlinks - 1 - Maximum number of Inlinks per URL to be kept in LinkDb. - If "invertlinks" finds more inlinks than this number, only the first - N inlinks will be stored, and the rest will be discarded. - - - - db.max.outlinks.per.page 100 The maximum number of outlinks that we'll process for a page. @@ -681,6 +662,35 @@ + + + + linkdb.max.inlinks + 1 + Maximum number of Inlinks per URL to be kept in LinkDb. + If "invertlinks" finds more inlinks than this number, only the first + N inlinks will be stored, and the rest will be discarded. + + + + + linkdb.ignore.internal.links + true + If true, when adding new links to a page, links from + the same host are ignored. This is an effective way to limit the + size of the link database, keeping only the highest quality + links. + + + + + linkdb.ignore.external.links + false + If true, when adding new links to a page, links from + the a different host are ignored. 
+ + + Modified: nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java?rev=1731831=1731830=1731831=diff == --- nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java (original) +++ nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java Tue Feb 23 10:23:24 2016 @@ -48,8 +48,8 @@ public class LinkDb extends NutchTool im public static final Logger LOG = LoggerFactory.getLogger(LinkDb.class); - public static final String IGNORE_INTERNAL_LINKS = "db.ignore.internal.links"; - public static final String IGNORE_EXTERNAL_LINKS = "db.ignore.external.links"; + public static final String IGNORE_INTERNAL_LINKS = "linkdb.ignore.internal.links"; + public static final String IGNORE_EXTERNAL_LINKS = "linkdb.ignore.external.links"; public static final String CURRENT_NAME = "current"; public static final String LOCK_NAME = ".locked"; @@ -68,7 +68,7 @@ public class LinkDb extends NutchTool im } public void configure(JobConf job) { -maxAnchorLength = job.getInt("db.max.anchor.length", 100); +maxAnchorLength = job.getInt("linkdb.max.anchor.length", 100); ignoreInternalLinks = job.getBoolean(IGNORE_INTERNAL_LINKS, true); ignoreExternalLinks = job.getBoolean(IGNORE_EXTERNAL_LINKS, false); Modified: nu
svn commit: r1731824 - in /nutch/trunk: CHANGES.txt src/plugin/index-replace/src/test/org/apache/nutch/indexer/replace/TestIndexReplace.java
Author: markus Date: Tue Feb 23 09:50:05 2016 New Revision: 1731824 URL: http://svn.apache.org/viewvc?rev=1731824=rev Log: NUTCH-2228 Plugin index-replace unit test broken on Java 8 Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/plugin/index-replace/src/test/org/apache/nutch/indexer/replace/TestIndexReplace.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1731824=1731823=1731824=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Tue Feb 23 09:50:05 2016 @@ -1,5 +1,7 @@ Nutch Change Log +* NUTCH-2228 Plugin index-replace unit test broken on Java 8 (snagel via markus) + * NUTCH-2219 Criteria order to be configurable in DeduplicationJob (Ron van der Vegt via markus) * NUTCH-2218 Update CrawlComplete util to use Commons CLI (Joyce) Modified: nutch/trunk/src/plugin/index-replace/src/test/org/apache/nutch/indexer/replace/TestIndexReplace.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-replace/src/test/org/apache/nutch/indexer/replace/TestIndexReplace.java?rev=1731824=1731823=1731824=diff == --- nutch/trunk/src/plugin/index-replace/src/test/org/apache/nutch/indexer/replace/TestIndexReplace.java (original) +++ nutch/trunk/src/plugin/index-replace/src/test/org/apache/nutch/indexer/replace/TestIndexReplace.java Tue Feb 23 09:50:05 2016 @@ -182,7 +182,7 @@ public class TestIndexReplace { String expectedKeywords = "Breathtaking, Riveting, Two Thumbs Up!"; String expectedAuthor = "Peter Ciuffetti"; // Contains: invalid pattern, invalid flags, incomplete property -String indexReplaceProperty = " metatag.description=/this\\hplugin/this awesome plugin/\n" +String indexReplaceProperty = " metatag.description=/this\\s+**plugin/this awesome plugin/\n" + " metatag.keywords=/\\,/\\!/what\n" + " metatag.author=#notcomplete"; Configuration conf = NutchConfiguration.create();
svn commit: r1731651 - in /nutch/trunk: CHANGES.txt src/java/org/apache/nutch/crawl/DeduplicationJob.java
Author: markus Date: Mon Feb 22 14:41:37 2016 New Revision: 1731651 URL: http://svn.apache.org/viewvc?rev=1731651=rev Log: NUTCH-2219 Criteria order to be configurable in DeduplicationJob Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/java/org/apache/nutch/crawl/DeduplicationJob.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1731651=1731650=1731651=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Mon Feb 22 14:41:37 2016 @@ -1,5 +1,7 @@ Nutch Change Log +* NUTCH-2219 Criteria order to be configurable in DeduplicationJob (Ron van der Vegt via markus) + * NUTCH-2218 Update CrawlComplete util to use Commons CLI (Joyce) * NUTCH-2223 Upgrade xercesImpl to 2.11.0 to fix hang on issue in tika mimetype detection (Tien Nguyen Manh via markus) Modified: nutch/trunk/src/java/org/apache/nutch/crawl/DeduplicationJob.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/DeduplicationJob.java?rev=1731651=1731650=1731651=diff == --- nutch/trunk/src/java/org/apache/nutch/crawl/DeduplicationJob.java (original) +++ nutch/trunk/src/java/org/apache/nutch/crawl/DeduplicationJob.java Mon Feb 22 14:41:37 2016 @@ -22,6 +22,7 @@ import java.util.HashMap; import java.util.Iterator; import java.util.Map; import java.util.Random; +import java.util.Arrays; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; @@ -69,6 +70,7 @@ public class DeduplicationJob extends Nu private final static Text urlKey = new Text("_URLTEMPKEY_"); private final static String DEDUPLICATION_GROUP_MODE = "deduplication.group.mode"; + private final static String DEDUPLICATION_COMPARE_ORDER = "deduplication.compare.order"; public static class DBFilter implements Mapper<Text, CrawlDatum, BytesWritable, CrawlDatum> { @@ -128,6 +130,13 @@ public class DeduplicationJob extends Nu public static class DedupReducer implements Reducer<BytesWritable, CrawlDatum, Text, CrawlDatum> { +private String[] 
compareOrder; + +@Override +public void configure(JobConf arg0) { + compareOrder = arg0.get(DEDUPLICATION_COMPARE_ORDER).split(","); +} + private void writeOutAsDuplicate(CrawlDatum datum, OutputCollector<Text, CrawlDatum> output, Reporter reporter) throws IOException { @@ -144,6 +153,7 @@ public class DeduplicationJob extends Nu throws IOException { CrawlDatum existingDoc = null; + outerloop: while (values.hasNext()) { if (existingDoc == null) { existingDoc = new CrawlDatum(); @@ -151,48 +161,56 @@ public class DeduplicationJob extends Nu continue; } CrawlDatum newDoc = values.next(); -// compare based on score -if (existingDoc.getScore() < newDoc.getScore()) { - writeOutAsDuplicate(existingDoc, output, reporter); - existingDoc = new CrawlDatum(); - existingDoc.set(newDoc); - continue; -} else if (existingDoc.getScore() > newDoc.getScore()) { - // mark new one as duplicate - writeOutAsDuplicate(newDoc, output, reporter); - continue; -} -// same score? delete the one which is oldest -if (existingDoc.getFetchTime() > newDoc.getFetchTime()) { - // mark new one as duplicate - writeOutAsDuplicate(newDoc, output, reporter); - continue; -} else if (existingDoc.getFetchTime() < newDoc.getFetchTime()) { - // mark existing one as duplicate - writeOutAsDuplicate(existingDoc, output, reporter); - existingDoc = new CrawlDatum(); - existingDoc.set(newDoc); - continue; -} -// same time? 
keep the one which has the shortest URL -String urlExisting = existingDoc.getMetaData().get(urlKey).toString(); -String urlnewDoc = newDoc.getMetaData().get(urlKey).toString(); -if (urlExisting.length() < urlnewDoc.length()) { - // mark new one as duplicate - writeOutAsDuplicate(newDoc, output, reporter); - continue; -} else if (urlExisting.length() > urlnewDoc.length()) { - // mark existing one as duplicate - writeOutAsDuplicate(existingDoc, output, reporter); - existingDoc = new CrawlDatum(); - existingDoc.set(newDoc); - continue; + +for (int i = 0; i < compareOrder.length; i++) { + switch (compareOrder[i]) { +case "score": + // compare based on score + if (existingDoc.getScore() < newDoc.getScore()) { +writeOutAsDuplicate(existingDoc, output, reporte
svn commit: r1730803 - in /nutch/trunk: CHANGES.txt src/java/org/apache/nutch/fetcher/Fetcher.java
Author: markus Date: Wed Feb 17 09:55:27 2016 New Revision: 1730803 URL: http://svn.apache.org/viewvc?rev=1730803=rev Log: NUTCH-2224 Average bytes/second calculated incorrectly in fetcher Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1730803=1730802=1730803=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Wed Feb 17 09:55:27 2016 @@ -1,5 +1,7 @@ Nutch Change Log +* NUTCH-2224 Average bytes/second calculated incorrectly in fetcher (Tien Nguyen Manh via markus) + * NUTCH-2225 Parsed time calculated incorrectly (Tien Nguyen Manh via markus) * NUTCH-961 Expose Tika's Boilerpipe support (Gabriele Kahlout, Vincent Slot, markus) Modified: nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java?rev=1730803=1730802=1730803=diff == --- nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java (original) +++ nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Wed Feb 17 09:55:27 2016 @@ -138,7 +138,7 @@ public class Fetcher extends NutchTool i Long elapsed = new Long((System.currentTimeMillis() - start) / 1000); float avgPagesSec = (float) pages.get() / elapsed.floatValue(); -long avgBytesSec = (bytes.get() / 125l) / elapsed.longValue(); +long avgBytesSec = (bytes.get() / 128l) / elapsed.longValue(); status.append(activeThreads).append(" threads (").append(spinWaiting.get()) .append(" waiting), "); @@ -148,7 +148,7 @@ public class Fetcher extends NutchTool i status.append(String.format("%.2f", avgPagesSec)).append(" pages/s ("); status.append(pagesLastSec).append(" last sec), "); status.append(avgBytesSec).append(" kbits/s (") -.append((bytesLastSec / 125)).append(" last sec)"); +.append((bytesLastSec / 128)).append(" last sec)"); reporter.setStatus(status.toString()); }
svn commit: r1730802 - in /nutch/trunk: CHANGES.txt src/java/org/apache/nutch/parse/ParseSegment.java
Author: markus Date: Wed Feb 17 09:51:14 2016 New Revision: 1730802 URL: http://svn.apache.org/viewvc?rev=1730802=rev Log: NUTCH-2225 Parsed time calculated incorrectly Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1730802=1730801=1730802=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Wed Feb 17 09:51:14 2016 @@ -1,5 +1,7 @@ Nutch Change Log +* NUTCH-2225 Parsed time calculated incorrectly (Tien Nguyen Manh via markus) + * NUTCH-961 Expose Tika's Boilerpipe support (Gabriele Kahlout, Vincent Slot, markus) * NUTCH-1233 Rely on Tika for outlink extraction (markus) Modified: nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java?rev=1730802=1730801=1730802=diff == --- nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java (original) +++ nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java Wed Feb 17 09:51:14 2016 @@ -96,6 +96,7 @@ public class ParseSegment extends NutchT return; } +long start = System.currentTimeMillis(); ParseResult parseResult = null; try { if (parseUtil == null) @@ -112,8 +113,6 @@ public class ParseSegment extends NutchT Parse parse = entry.getValue(); ParseStatus parseStatus = parse.getData().getStatus(); - long start = System.currentTimeMillis(); - reporter.incrCounter("ParserStatus", ParseStatus.majorCodes[parseStatus.getMajorCode()], 1);
svn commit: r1730687 - in /nutch/trunk: ./ src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/
Author: markus Date: Tue Feb 16 13:39:18 2016 New Revision: 1730687 URL: http://svn.apache.org/viewvc?rev=1730687=rev Log: NUTCH-1233 Rely on Tika for outlink extraction Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMBuilder.java nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1730687=1730686=1730687=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Tue Feb 16 13:39:18 2016 @@ -1,5 +1,7 @@ Nutch Change Log +* NUTCH-1233 Rely on Tika for outlink extraction (markus) + * NUTCH-2210 Upgrade to Tika 1.12 (markus) * NUTCH-2209 Improved Tokenization for Similarity Scoring plugin (Sujen) Modified: nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMBuilder.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMBuilder.java?rev=1730687=1730686=1730687=diff == --- nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMBuilder.java (original) +++ nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMBuilder.java Tue Feb 16 13:39:18 2016 @@ -355,7 +355,9 @@ class DOMBuilder implements ContentHandl */ public void endElement(String ns, String localName, String name) throws org.xml.sax.SAXException { -m_elemStack.pop(); +if (!m_elemStack.isEmpty()) { + m_elemStack.pop(); +} m_currentNode = m_elemStack.isEmpty() ? 
null : (Node) m_elemStack.peek(); } Modified: nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java?rev=1730687=1730686=1730687=diff == --- nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java (original) +++ nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java Tue Feb 16 13:39:18 2016 @@ -22,11 +22,14 @@ import java.net.URL; import java.util.ArrayList; import java.util.Collection; import java.util.HashMap; +import java.util.HashSet; +import java.util.List; import org.apache.hadoop.conf.Configuration; import org.apache.nutch.parse.Outlink; import org.apache.nutch.util.NodeWalker; import org.apache.nutch.util.URLUtil; +import org.apache.tika.sax.Link; import org.w3c.dom.NamedNodeMap; import org.w3c.dom.Node; import org.w3c.dom.NodeList; @@ -57,6 +60,7 @@ public class DOMContentUtils { } private HashMap<String, LinkParams> linkParams = new HashMap<String, LinkParams>(); + private HashSet ignoredTags = new HashSet(); private Configuration conf; public DOMContentUtils(Configuration conf) { @@ -85,6 +89,7 @@ public class DOMContentUtils { // remove unwanted link tags from the linkParams map String[] ignoreTags = conf.getStrings("parser.html.outlinks.ignore_tags"); for (int i = 0; ignoreTags != null && i < ignoreTags.length; i++) { + ignoredTags.add(ignoreTags[i].toLowerCase()); if (!forceTags.contains(ignoreTags[i])) linkParams.remove(ignoreTags[i]); } @@ -244,7 +249,7 @@ public class DOMContentUtils { } return true; } - + // this only covers a few cases of empty links that are symptomatic // of nekohtml's DOM-fixup process... 
private boolean shouldThrowAwayLink(Node node, NodeList children, @@ -365,5 +370,33 @@ public class DOMContentUtils { } } } - -} + + // This one is used by NUTCH-1918 + public void getOutlinks(URL base, ArrayList outlinks, List tikaExtractedOutlinks) { +String target = null; +String anchor = null; +boolean noFollow = false; + +for (Link link : tikaExtractedOutlinks) { + target = link.getUri(); + noFollow = (link.getRel().toLowerCase().equals("nofollow")) ? true : false; + anchor = link.getText(); + + if (!ignoredTags.contains(link.getType())) { +if (target != null && !noFollow) { + try { +URL url = URLUtil.resolveURL(base, target); + +// clean the anchor +anchor = anchor.replaceAll("\\s+", " "); +anchor = anchor.trim(); + +outlinks.add(new Outlink(url.toString(), anchor)); + } catch (MalformedURLException e) { +// don't care + } +} + } +} + } +} \ No newline at
svn commit: r1728313 - in /nutch/trunk: ./ src/plugin/indexer-solr/ src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/
Author: markus Date: Wed Feb 3 13:51:10 2016 New Revision: 1728313 URL: http://svn.apache.org/viewvc?rev=1728313=rev Log: NUTCH-2197 Add Solr 5 cloud indexer support Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/plugin/indexer-solr/ivy.xml nutch/trunk/src/plugin/indexer-solr/plugin.xml nutch/trunk/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrConstants.java nutch/trunk/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrIndexWriter.java nutch/trunk/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrUtils.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1728313=1728312=1728313=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Wed Feb 3 13:51:10 2016 @@ -1,5 +1,7 @@ Nutch Change Log +* NUTCH-2197 Add Solr 5 cloud indexer support (Jurian Broertjes via markus) + * NUTCH-2206 Provide example scoring.similarity.stopword.file (sujen) * NUTCH-2204 Remove junit lib from runtime (snagel) Modified: nutch/trunk/src/plugin/indexer-solr/ivy.xml URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/indexer-solr/ivy.xml?rev=1728313=1728312=1728313=diff == --- nutch/trunk/src/plugin/indexer-solr/ivy.xml (original) +++ nutch/trunk/src/plugin/indexer-solr/ivy.xml Wed Feb 3 13:51:10 2016 @@ -36,9 +36,9 @@ - - - + + + Modified: nutch/trunk/src/plugin/indexer-solr/plugin.xml URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/indexer-solr/plugin.xml?rev=1728313=1728312=1728313=diff == --- nutch/trunk/src/plugin/indexer-solr/plugin.xml (original) +++ nutch/trunk/src/plugin/indexer-solr/plugin.xml Wed Feb 3 13:51:10 2016 @@ -22,17 +22,16 @@ - - - - - - - - - - - + + + + + + + + + + Modified: nutch/trunk/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrConstants.java URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrConstants.java?rev=1728313=1728312=1728313=diff == --- nutch/trunk/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrConstants.java (original) +++ nutch/trunk/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrConstants.java Wed Feb 3 13:51:10 2016 @@ -17,7 +17,6 @@ package org.apache.nutch.indexwriter.solr; public interface SolrConstants { - public static final String SOLR_PREFIX = "solr."; public static final String SERVER_URL = SOLR_PREFIX + "server.url"; @@ -31,13 +30,23 @@ public interface SolrConstants { public static final String USERNAME = SOLR_PREFIX + "auth.username"; public static final String PASSWORD = SOLR_PREFIX + "auth.password"; - - public static final String SERVER_TYPE = SOLR_PREFIX + "server.type"; - - public static final String ZOOKEEPER_URL = SOLR_PREFIX + "zookeeper.url"; - - public static final String LOADBALANCE_URLS = SOLR_PREFIX + "loadbalance.urls"; - + + public static final String COLLECTION = SOLR_PREFIX + "collection"; + + public static final String ZOOKEEPER_HOSTS = SOLR_PREFIX + "zookeeper.hosts"; + + public static final String ID_FIELD = "id"; + + public static final String URL_FIELD = "url"; + + public static final String BOOST_FIELD = "boost"; + + public static final String TIMESTAMP_FIELD = "tstamp"; + + public static final String DIGEST_FIELD = "digest"; + + + @Deprecated public static final String COMMIT_INDEX = SOLR_PREFIX + "commit.index"; Modified: nutch/trunk/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrIndexWriter.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrIndexWriter.java?rev=1728313=1728312=1728313=diff == --- nutch/trunk/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrIndexWriter.java (original) +++ 
nutch/trunk/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrIndexWriter.java Wed Feb 3 13:51:10 2016 @@ -17,6 +17,7 @@ package org.apache.nutch.indexwriter.solr; import java.io.IOException; +import java.io.UnsupportedEncodingException; import java.util.ArrayList; import java.util.Date; import java.util.List; @@ -28,2
svn commit: r1725981 - in /nutch/trunk: ./ src/java/org/apache/nutch/scoring/webgraph/
Author: markus Date: Thu Jan 21 15:18:07 2016 New Revision: 1725981 URL: http://svn.apache.org/viewvc?rev=1725981=rev Log: NUTCH-2201 Remove loops program from webgraph package Removed: nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LoopReader.java nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/Loops.java Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LinkDumper.java nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LinkRank.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1725981=1725980=1725981=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Thu Jan 21 15:18:07 2016 @@ -1,5 +1,7 @@ Nutch Change Log +* NUTCH-2201 Remove loops program from webgraph package (markus) + * NUTCH-1325 HostDB for Nutch (Gui Forget, markus, tejasp) * NUTCH-2203 Suffix URL filter can't handle trailing/leading whitespaces (Jurian Broertjes via markus) Modified: nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LinkDumper.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LinkDumper.java?rev=1725981=1725980=1725981=diff == --- nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LinkDumper.java (original) +++ nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LinkDumper.java Thu Jan 21 15:18:07 2016 @@ -59,7 +59,6 @@ import org.apache.hadoop.mapred.lib.Hash import org.apache.hadoop.util.StringUtils; import org.apache.hadoop.util.Tool; import org.apache.hadoop.util.ToolRunner; -import org.apache.nutch.scoring.webgraph.Loops.LoopSet; import org.apache.nutch.util.FSUtils; import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.NutchJob; @@ -246,9 +245,8 @@ public class LinkDumper extends Configur String fromUrl = key.toString(); List outlinks = new ArrayList(); Node node = null; - LoopSet loops = null; - - // loop through all values aggregating outlinks, saving node and loopset + 
+ // loop through all values aggregating outlinks, saving node while (values.hasNext()) { ObjectWritable write = values.next(); Object obj = write.get(); @@ -256,25 +254,16 @@ public class LinkDumper extends Configur node = (Node) obj; } else if (obj instanceof LinkDatum) { outlinks.add(WritableUtils.clone((LinkDatum) obj, conf)); -} else if (obj instanceof LoopSet) { - loops = (LoopSet) obj; } } // only collect if there are outlinks int numOutlinks = node.getNumOutlinks(); if (numOutlinks > 0) { - -Set loopSet = (loops != null) ? loops.getLoopSet() : null; for (int i = 0; i < outlinks.size(); i++) { LinkDatum outlink = outlinks.get(i); String toUrl = outlink.getUrl(); - // remove any url that is in the loopset, same as LinkRank - if (loopSet != null && loopSet.contains(toUrl)) { -continue; - } - // collect the outlink as an inlink with the node output.collect(new Text(toUrl), new LinkNode(fromUrl, node)); } @@ -343,8 +332,6 @@ public class LinkDumper extends Configur Path linkdump = new Path(webGraphDb, DUMP_DIR); Path nodeDb = new Path(webGraphDb, WebGraph.NODE_DIR); -Path loopSetDb = new Path(webGraphDb, Loops.LOOPS_DIR); -boolean loopsExists = fs.exists(loopSetDb); Path outlinkDb = new Path(webGraphDb, WebGraph.OUTLINK_DIR); // run the inverter job @@ -353,9 +340,6 @@ public class LinkDumper extends Configur JobConf inverter = new NutchJob(conf); inverter.setJobName("LinkDumper: inverter"); FileInputFormat.addInputPath(inverter, nodeDb); -if (loopsExists) { - FileInputFormat.addInputPath(inverter, loopSetDb); -} FileInputFormat.addInputPath(inverter, outlinkDb); inverter.setInputFormat(SequenceFileInputFormat.class); inverter.setMapperClass(Inverter.class); Modified: nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LinkRank.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LinkRank.java?rev=1725981=1725980=1725981=diff == --- nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LinkRank.java (original) +++ 
nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LinkRank.java Thu Jan 21 15:18:07 2016 @@ -61,7 +61,6 @@ import org.apache.hadoop.mapred.TextOutp import org.apache.hadoop.util.StringUtils; import org.apache.hadoop.util.Tool; import org.apache.hadoop.util.ToolRunner; -import org.apache.nutch.scoring.webgraph.Loops.LoopSet; import org.apache.nut
svn commit: r1725538 - in /nutch/trunk: CHANGES.txt src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java
Author: markus Date: Tue Jan 19 14:53:05 2016 New Revision: 1725538 URL: http://svn.apache.org/viewvc?rev=1725538=rev Log: NUTCH-2203 Suffix URL filter can't handle trailing/leading whitespaces Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1725538=1725537=1725538=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Tue Jan 19 14:53:05 2016 @@ -1,5 +1,7 @@ Nutch Change Log +* NUTCH-2203 Suffix URL filter can't handle trailing/leading whitespaces (Jurian Broertjes via markus) + * NUTCH-2194 Run IndexingFilterChecker as simple Telnet server (markus) * NUTCH-2196 IndexingFilterChecker to optionally normalize (markus) Modified: nutch/trunk/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java?rev=1725538=1725537=1725538=diff == --- nutch/trunk/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java (original) +++ nutch/trunk/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java Tue Jan 19 14:53:05 2016 @@ -196,6 +196,7 @@ public class SuffixURLFilter implements String line; while ((line = in.readLine()) != null) { + line = line.trim(); if (line.length() == 0) continue;
svn commit: r1724771 - in /nutch/trunk: CHANGES.txt src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
Author: markus Date: Fri Jan 15 10:45:27 2016 New Revision: 1724771 URL: http://svn.apache.org/viewvc?rev=1724771=rev Log: NUTCH-2194 Run IndexingFilterChecker as simple Telnet server Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1724771=1724770=1724771=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Fri Jan 15 10:45:27 2016 @@ -1,5 +1,7 @@ Nutch Change Log +* NUTCH-2194 Run IndexingFilterChecker as simple Telnet server (markus) + * NUTCH-2196 IndexingFilterChecker to optionally normalize (markus) * NUTCH-2195 IndexingFilterChecker to optionally follow N redirects (markus) Modified: nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java?rev=1724771=1724770=1724771=diff == --- nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java (original) +++ nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java Fri Jan 15 10:45:27 2016 @@ -17,6 +17,13 @@ package org.apache.nutch.indexer; +import java.io.BufferedReader; +import java.io.InputStreamReader; +import java.io.PrintWriter; +import java.net.ServerSocket; +import java.net.Socket; +import java.net.InetSocketAddress; +import java.nio.charset.Charset; import java.util.HashMap; import java.util.Iterator; import java.util.List; @@ -59,6 +66,13 @@ import org.slf4j.LoggerFactory; public class IndexingFiltersChecker extends Configured implements Tool { + protected URLNormalizers normalizers = null; + protected boolean dumpText = false; + protected boolean followRedirects = false; + // used to simulate the metadata propagated from injection + protected HashMap<String, String> metadata = new HashMap<String, String>(); + protected int tcpPort = -1; + public static final Logger LOG = 
LoggerFactory .getLogger(IndexingFiltersChecker.class); @@ -67,25 +81,19 @@ public class IndexingFiltersChecker exte } public int run(String[] args) throws Exception { -String contentType = null; String url = null; -URLNormalizers normalizers = null; -boolean dumpText = false; -boolean followRedirects = false; - -String usage = "Usage: IndexingFiltersChecker [-normalize] [-followRedirects] [-dumpText] [-md key=value] "; +String usage = "Usage: IndexingFiltersChecker [-normalize] [-followRedirects] [-dumpText] [-md key=value] [-listen ] "; if (args.length == 0) { System.err.println(usage); return -1; } -// used to simulate the metadata propagated from injection -HashMap<String, String> metadata = new HashMap<String, String>(); - for (int i = 0; i < args.length; i++) { if (args[i].equals("-normalize")) { normalizers = new URLNormalizers(getConf(), URLNormalizers.SCOPE_DEFAULT); + } else if (args[i].equals("-listen")) { +tcpPort = Integer.parseInt(args[++i]); } else if (args[i].equals("-followRedirects")) { followRedirects = true; } else if (args[i].equals("-dumpText")) { @@ -108,6 +116,88 @@ public class IndexingFiltersChecker exte } } +// In listening mode? 
+if (tcpPort == -1) { + // No, just fetch and display + StringBuilder output = new StringBuilder(); + int ret = fetch(url, output); + System.out.println(output); + return ret; +} else { + // Listen on socket and start workers on incoming requests + listen(); +} + +return 0; + } + + protected void listen() throws Exception { +ServerSocket server = null; + +try{ + server = new ServerSocket(); + server.bind(new InetSocketAddress(tcpPort)); + LOG.info(server.toString()); +} catch (Exception e) { + LOG.error("Could not listen on port " + tcpPort); + System.exit(-1); +} + +while(true){ + Worker worker; + try{ +worker = new Worker(server.accept()); +Thread thread = new Thread(worker); +thread.start(); + } catch (Exception e) { +LOG.error("Accept failed: " + tcpPort); +System.exit(-1); + } +} + } + + private class Worker implements Runnable { +private Socket client; + +Worker(Socket client) { + this.client = client; + LOG.info(client.toString()); +} + +public void run(){ + String line; + BufferedReader in = null; + PrintWriter out = null; + + try{ +in = new BufferedReader(new InputStreamReader(client.getInpu
svn commit: r1724418 - in /nutch/trunk: CHANGES.txt src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
Author: markus Date: Wed Jan 13 13:10:19 2016 New Revision: 1724418 URL: http://svn.apache.org/viewvc?rev=1724418=rev Log: NUTCH-2196 IndexingFilterChecker to optionally normalize Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1724418=1724417=1724418=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Wed Jan 13 13:10:19 2016 @@ -1,5 +1,7 @@ Nutch Change Log +* NUTCH-2196 IndexingFilterChecker to optionally normalize (markus) + * NUTCH-2195 IndexingFilterChecker to optionally follow N redirects (markus) * NUTCH-2190 Protocol normalizer (markus) Modified: nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java?rev=1724418=1724417=1724418=diff == --- nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java (original) +++ nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java Wed Jan 13 13:10:19 2016 @@ -32,6 +32,7 @@ import org.apache.nutch.crawl.Inlinks; import org.apache.nutch.crawl.SignatureFactory; import org.apache.nutch.metadata.Metadata; import org.apache.nutch.metadata.Nutch; +import org.apache.nutch.net.URLNormalizers; import org.apache.nutch.parse.Parse; import org.apache.nutch.parse.ParseResult; import org.apache.nutch.parse.ParseSegment; @@ -43,7 +44,6 @@ import org.apache.nutch.protocol.Protoco import org.apache.nutch.scoring.ScoringFilters; import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.StringUtil; -import org.apache.nutch.util.URLUtil; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -69,10 +69,11 @@ public class IndexingFiltersChecker exte public int run(String[] args) throws Exception { String contentType = null; String url = null; +URLNormalizers normalizers = null; 
boolean dumpText = false; boolean followRedirects = false; -String usage = "Usage: IndexingFiltersChecker [-followRedirects] [-dumpText] [-md key=value] "; +String usage = "Usage: IndexingFiltersChecker [-normalize] [-followRedirects] [-dumpText] [-md key=value] "; if (args.length == 0) { System.err.println(usage); @@ -83,7 +84,9 @@ public class IndexingFiltersChecker exte HashMap<String, String> metadata = new HashMap<String, String>(); for (int i = 0; i < args.length; i++) { - if (args[i].equals("-followRedirects")) { + if (args[i].equals("-normalize")) { +normalizers = new URLNormalizers(getConf(), URLNormalizers.SCOPE_DEFAULT); + } else if (args[i].equals("-followRedirects")) { followRedirects = true; } else if (args[i].equals("-dumpText")) { dumpText = true; @@ -101,9 +104,13 @@ public class IndexingFiltersChecker exte System.err.println(usage); System.exit(-1); } else { -url = URLUtil.toASCII(args[i]); +url =args[i]; } } + +if (normalizers != null) { + url = normalizers.normalize(url, URLNormalizers.SCOPE_DEFAULT); +} LOG.info("fetching: " + url); @@ -129,6 +136,11 @@ public class IndexingFiltersChecker exte while (!output.getStatus().isSuccess() && followRedirects && output.getStatus().isRedirect() && maxRedirects != 0) { String[] stuff = output.getStatus().getArgs(); url = stuff[0]; + + if (normalizers != null) { +url = normalizers.normalize(url, URLNormalizers.SCOPE_DEFAULT); + } + turl.set(url); // try again
svn commit: r1724409 - in /nutch/trunk: CHANGES.txt src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
Author: markus Date: Wed Jan 13 12:17:03 2016 New Revision: 1724409 URL: http://svn.apache.org/viewvc?rev=1724409=rev Log: NUTCH-2195 IndexingFilterChecker to optionally follow N redirects Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1724409=1724408=1724409=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Wed Jan 13 12:17:03 2016 @@ -1,5 +1,7 @@ Nutch Change Log +* NUTCH-2195 IndexingFilterChecker to optionally follow N redirects (markus) + * NUTCH-2190 Protocol normalizer (markus) * NUTCH-1838 Host and domain based regex and automaton filtering (markus) Modified: nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java?rev=1724409=1724408=1724409=diff == --- nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java (original) +++ nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java Wed Jan 13 12:17:03 2016 @@ -70,8 +70,9 @@ public class IndexingFiltersChecker exte String contentType = null; String url = null; boolean dumpText = false; +boolean followRedirects = false; -String usage = "Usage: IndexingFiltersChecker [-dumpText] [-md key=value] "; +String usage = "Usage: IndexingFiltersChecker [-followRedirects] [-dumpText] [-md key=value] "; if (args.length == 0) { System.err.println(usage); @@ -82,7 +83,9 @@ public class IndexingFiltersChecker exte HashMap<String, String> metadata = new HashMap<String, String>(); for (int i = 0; i < args.length; i++) { - if (args[i].equals("-dumpText")) { + if (args[i].equals("-followRedirects")) { +followRedirects = true; + } else if (args[i].equals("-dumpText")) { dumpText = true; } else if (args[i].equals("-md")) { String k = null, v = null; @@ -116,11 +119,22 @@ public class 
IndexingFiltersChecker exte } IndexingFilters indexers = new IndexingFilters(getConf()); + +int maxRedirects = 3; -ProtocolFactory factory = new ProtocolFactory(getConf()); -Protocol protocol = factory.getProtocol(url); +ProtocolOutput output = getProtocolOutput(url, datum); Text turl = new Text(url); -ProtocolOutput output = protocol.getProtocolOutput(turl, datum); + +// Following redirects and not reached maxRedirects? +while (!output.getStatus().isSuccess() && followRedirects && output.getStatus().isRedirect() && maxRedirects != 0) { + String[] stuff = output.getStatus().getArgs(); + url = stuff[0]; + turl.set(url); + + // try again + output = getProtocolOutput(url, datum); + maxRedirects--; +} if (!output.getStatus().isSuccess()) { System.out.println("Fetch failed with protocol status: " @@ -224,6 +238,14 @@ public class IndexingFiltersChecker exte return 0; } + + protected ProtocolOutput getProtocolOutput(String url, CrawlDatum datum) throws Exception { +ProtocolFactory factory = new ProtocolFactory(getConf()); +Protocol protocol = factory.getProtocol(url); +Text turl = new Text(url); +ProtocolOutput output = protocol.getProtocolOutput(turl, datum); +return output; + } public static void main(String[] args) throws Exception { final int res = ToolRunner.run(NutchConfiguration.create(),
svn commit: r1724199 - /nutch/trunk/conf/protocols.txt
Author: markus Date: Tue Jan 12 10:33:59 2016 New Revision: 1724199 URL: http://svn.apache.org/viewvc?rev=1724199&view=rev Log: NUTCH-2190 Protocol normalizer Added: nutch/trunk/conf/protocols.txt Added: nutch/trunk/conf/protocols.txt URL: http://svn.apache.org/viewvc/nutch/trunk/conf/protocols.txt?rev=1724199&view=auto == --- nutch/trunk/conf/protocols.txt (added) +++ nutch/trunk/conf/protocols.txt Tue Jan 12 10:33:59 2016 @@ -0,0 +1,7 @@ +# Example configuration file for urlnormalizer-protocol +# +# URLs of hosts listed in the configuration are normalized to the target +# protocol. Useful in cases where a host accepts both http and https, doubling +# the site's size. +# +# format: <host>\t<protocol>\n
svn commit: r1724085 - in /nutch/trunk: ./ src/plugin/ src/plugin/urlnormalizer-protocol/ src/plugin/urlnormalizer-protocol/data/ src/plugin/urlnormalizer-protocol/src/ src/plugin/urlnormalizer-protoc
Author: markus Date: Mon Jan 11 17:10:30 2016 New Revision: 1724085 URL: http://svn.apache.org/viewvc?rev=1724085=rev Log: NUTCH-2190 Protocol normalizer Added: nutch/trunk/src/plugin/urlnormalizer-protocol/ nutch/trunk/src/plugin/urlnormalizer-protocol/build.xml nutch/trunk/src/plugin/urlnormalizer-protocol/data/ nutch/trunk/src/plugin/urlnormalizer-protocol/data/protocols.txt nutch/trunk/src/plugin/urlnormalizer-protocol/ivy.xml nutch/trunk/src/plugin/urlnormalizer-protocol/plugin.xml nutch/trunk/src/plugin/urlnormalizer-protocol/src/ nutch/trunk/src/plugin/urlnormalizer-protocol/src/java/ nutch/trunk/src/plugin/urlnormalizer-protocol/src/java/org/ nutch/trunk/src/plugin/urlnormalizer-protocol/src/java/org/apache/ nutch/trunk/src/plugin/urlnormalizer-protocol/src/java/org/apache/nutch/ nutch/trunk/src/plugin/urlnormalizer-protocol/src/java/org/apache/nutch/net/ nutch/trunk/src/plugin/urlnormalizer-protocol/src/java/org/apache/nutch/net/urlnormalizer/ nutch/trunk/src/plugin/urlnormalizer-protocol/src/java/org/apache/nutch/net/urlnormalizer/protocol/ nutch/trunk/src/plugin/urlnormalizer-protocol/src/java/org/apache/nutch/net/urlnormalizer/protocol/ProtocolURLNormalizer.java nutch/trunk/src/plugin/urlnormalizer-protocol/src/test/ nutch/trunk/src/plugin/urlnormalizer-protocol/src/test/org/ nutch/trunk/src/plugin/urlnormalizer-protocol/src/test/org/apache/ nutch/trunk/src/plugin/urlnormalizer-protocol/src/test/org/apache/nutch/ nutch/trunk/src/plugin/urlnormalizer-protocol/src/test/org/apache/nutch/net/ nutch/trunk/src/plugin/urlnormalizer-protocol/src/test/org/apache/nutch/net/urlnormalizer/ nutch/trunk/src/plugin/urlnormalizer-protocol/src/test/org/apache/nutch/net/urlnormalizer/protocol/ nutch/trunk/src/plugin/urlnormalizer-protocol/src/test/org/apache/nutch/net/urlnormalizer/protocol/TestProtocolURLNormalizer.java Modified: nutch/trunk/CHANGES.txt nutch/trunk/build.xml nutch/trunk/default.properties nutch/trunk/src/plugin/build.xml Modified: 
nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1724085=1724084=1724085=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Mon Jan 11 17:10:30 2016 @@ -1,5 +1,7 @@ Nutch Change Log +* NUTCH-2190 Protocol normalizer (markus) + * NUTCH-1838 Host and domain based regex and automaton filtering (markus) * NUTCH-2178 DeduplicationJob to optionally group on host or domain (markus) Modified: nutch/trunk/build.xml URL: http://svn.apache.org/viewvc/nutch/trunk/build.xml?rev=1724085=1724084=1724085=diff == --- nutch/trunk/build.xml (original) +++ nutch/trunk/build.xml Mon Jan 11 17:10:30 2016 @@ -224,6 +224,7 @@ + @@ -660,6 +661,7 @@ + @@ -1082,6 +1084,8 @@ + + Modified: nutch/trunk/default.properties URL: http://svn.apache.org/viewvc/nutch/trunk/default.properties?rev=1724085=1724084=1724085=diff == --- nutch/trunk/default.properties (original) +++ nutch/trunk/default.properties Mon Jan 11 17:10:30 2016 @@ -110,6 +110,7 @@ plugins.urlnormalizer=\ org.apache.nutch.net.urlnormalizer.basic*:\ org.apache.nutch.net.urlnormalizer.host*:\ org.apache.nutch.net.urlnormalizer.pass*:\ + org.apache.nutch.net.urlnormalizer.protocol*:\ org.apache.nutch.net.urlnormalizer.querystring*:\ org.apache.nutch.net.urlnormalizer.regex* Modified: nutch/trunk/src/plugin/build.xml URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/build.xml?rev=1724085=1724084=1724085=diff == --- nutch/trunk/src/plugin/build.xml (original) +++ nutch/trunk/src/plugin/build.xml Mon Jan 11 17:10:30 2016 @@ -82,6 +82,7 @@ + @@ -125,6 +126,7 @@ + @@ -193,6 +195,7 @@ + Added: nutch/trunk/src/plugin/urlnormalizer-protocol/build.xml URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-protocol/build.xml?rev=1724085=auto == --- nutch/trunk/src/plugin/urlnormalizer-protocol/build.xml (added) +++ nutch/trunk/src/plugin/urlnormalizer-protocol/build.xml Mon Jan 11 17:10:30 2016 @@ -0,0 +1,27 @@ + + + + + + + + + + + + Added: 
nutch/trunk/src/plugin/urlnormalizer-protocol/data/protocols.txt URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-protocol/data/protocols.txt?rev
svn commit: r1723688 - in /nutch/trunk: CHANGES.txt conf/nutch-default.xml src/java/org/apache/nutch/indexer/IndexerMapReduce.java
Author: markus Date: Fri Jan 8 11:10:38 2016 New Revision: 1723688 URL: http://svn.apache.org/viewvc?rev=1723688=rev Log: NUTCH-1449 Optionally delete documents skipped by IndexingFilters Modified: nutch/trunk/CHANGES.txt nutch/trunk/conf/nutch-default.xml nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1723688=1723687=1723688=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Fri Jan 8 11:10:38 2016 @@ -1,5 +1,7 @@ Nutch Change Log +* NUTCH-1449 Optionally delete documents skipped by IndexingFilters (markus) + * NUTCH-2189 Domain filter must deactivate if no rules are present (markus) * NUTCH-2182 Make reverseUrlDirs file dumper option hash the URL for consistency (joyce) Modified: nutch/trunk/conf/nutch-default.xml URL: http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1723688=1723687=1723688=diff == --- nutch/trunk/conf/nutch-default.xml (original) +++ nutch/trunk/conf/nutch-default.xml Fri Jan 8 11:10:38 2016 @@ -1043,6 +1043,20 @@ + + indexer.delete.robots.noindex + false + Whether the indexer will delete documents marked by robots=noindex + + + + + indexer.delete.skipped.by.indexingfilter + false + Whether the indexer will delete documents that were skipped by indexing filters + + + Modified: nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java?rev=1723688=1723687=1723688=diff == --- nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java (original) +++ nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java Fri Jan 8 11:10:38 2016 @@ -63,6 +63,7 @@ public class IndexerMapReduce extends Co public static final String INDEXER_PARAMS = "indexer.additional.params"; public static final String INDEXER_DELETE = "indexer.delete"; public static final String 
INDEXER_DELETE_ROBOTS_NOINDEX = "indexer.delete.robots.noindex"; + public static final String INDEXER_DELETE_SKIPPED = "indexer.delete.skipped.by.indexingfilter"; public static final String INDEXER_SKIP_NOTMODIFIED = "indexer.skip.notmodified"; public static final String URL_FILTERING = "indexer.url.filters"; public static final String URL_NORMALIZING = "indexer.url.normalizers"; @@ -71,6 +72,7 @@ public class IndexerMapReduce extends Co private boolean skip = false; private boolean delete = false; private boolean deleteRobotsNoIndex = false; + private boolean deleteSkippedByIndexingFilter = false; private boolean base64 = false; private IndexingFilters filters; private ScoringFilters scfilters; @@ -94,6 +96,8 @@ public class IndexerMapReduce extends Co this.delete = job.getBoolean(INDEXER_DELETE, false); this.deleteRobotsNoIndex = job.getBoolean(INDEXER_DELETE_ROBOTS_NOINDEX, false); +this.deleteSkippedByIndexingFilter = job.getBoolean(INDEXER_DELETE_SKIPPED, +false); this.skip = job.getBoolean(INDEXER_SKIP_NOTMODIFIED, false); this.base64 = job.getBoolean(INDEXER_BINARY_AS_BASE64, false); @@ -245,7 +249,7 @@ public class IndexerMapReduce extends Co || fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_REDIR_TEMP || dbDatum.getStatus() == CrawlDatum.STATUS_DB_REDIR_PERM || dbDatum.getStatus() == CrawlDatum.STATUS_DB_REDIR_TEMP) { -reporter.incrCounter("IndexerStatus", "deleted redirects", 1); +reporter.incrCounter("IndexerStatus", "deleted (redirects)", 1); output.collect(key, DELETE_ACTION); return; } @@ -258,7 +262,7 @@ public class IndexerMapReduce extends Co // Whether to delete pages marked as duplicates if (delete && dbDatum.getStatus() == CrawlDatum.STATUS_DB_DUPLICATE) { - reporter.incrCounter("IndexerStatus", "deleted duplicates", 1); + reporter.incrCounter("IndexerStatus", "deleted (duplicates)", 1); output.collect(key, DELETE_ACTION); return; } @@ -284,8 +288,25 @@ public class IndexerMapReduce extends Co // add digest, used by dedup doc.add("digest", 
metadata.get(Nutch.SIGNATURE_KEY)); - + final Parse parse = new ParseImpl(parseText, parseData); +float boost = 1.0f; +// run scoring filters +try { + boost = this.scfilters.indexerScore(key, doc, dbDatum, fetchDatum, parse, + inlinks, boost); +} catch (final ScoringFilterException e) { + reporter.incrCoun
svn commit: r1723690 - in /nutch/trunk: CHANGES.txt src/java/org/apache/nutch/crawl/DeduplicationJob.java
Author: markus Date: Fri Jan 8 11:14:33 2016 New Revision: 1723690 URL: http://svn.apache.org/viewvc?rev=1723690=rev Log: NUTCH-2178 DeduplicationJob to optionally group on host or domain Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/java/org/apache/nutch/crawl/DeduplicationJob.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1723690=1723689=1723690=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Fri Jan 8 11:14:33 2016 @@ -1,5 +1,7 @@ Nutch Change Log +* NUTCH-2178 DeduplicationJob to optionally group on host or domain (markus) + * NUTCH-1449 Optionally delete documents skipped by IndexingFilters (markus) * NUTCH-2189 Domain filter must deactivate if no rules are present (markus) Modified: nutch/trunk/src/java/org/apache/nutch/crawl/DeduplicationJob.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/DeduplicationJob.java?rev=1723690=1723689=1723690=diff == --- nutch/trunk/src/java/org/apache/nutch/crawl/DeduplicationJob.java (original) +++ nutch/trunk/src/java/org/apache/nutch/crawl/DeduplicationJob.java Fri Jan 8 11:14:33 2016 @@ -49,6 +49,7 @@ import org.apache.nutch.util.NutchConfig import org.apache.nutch.util.NutchJob; import org.apache.nutch.util.NutchTool; import org.apache.nutch.util.TimingUtil; +import org.apache.nutch.util.URLUtil; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -67,12 +68,16 @@ public class DeduplicationJob extends Nu .getLogger(DeduplicationJob.class); private final static Text urlKey = new Text("_URLTEMPKEY_"); + private final static String DEDUPLICATION_GROUP_MODE = "deduplication.group.mode"; public static class DBFilter implements Mapper<Text, CrawlDatum, BytesWritable, CrawlDatum> { + +private String groupMode; @Override public void configure(JobConf arg0) { + groupMode = arg0.get(DEDUPLICATION_GROUP_MODE); } @Override @@ -90,10 +95,31 @@ public class DeduplicationJob extends Nu byte[] signature = 
value.getSignature(); if (signature == null) return; -BytesWritable sig = new BytesWritable(signature); +String url = key.toString(); +BytesWritable sig = null; +byte[] data; +switch (groupMode) { + case "none": +sig = new BytesWritable(signature); +break; + case "host": +byte[] host = URLUtil.getHost(url).getBytes(); +data = new byte[signature.length + host.length]; +System.arraycopy(signature, 0, data, 0, signature.length); +System.arraycopy(host, 0, data, signature.length, host.length); +sig = new BytesWritable(data); +break; + case "domain": +byte[] domain = URLUtil.getDomainName(url).getBytes(); +data = new byte[signature.length + domain.length]; +System.arraycopy(signature, 0, data, 0, signature.length); +System.arraycopy(domain, 0, data, signature.length, domain.length); +sig = new BytesWritable(data); +break; +} // add the URL as a temporary MD value.getMetaData().put(urlKey, key); -// reduce on the signature +// reduce on the signature optionall grouped on host or domain or not at all output.collect(sig, value); } } @@ -216,11 +242,17 @@ public class DeduplicationJob extends Nu public int run(String[] args) throws IOException { if (args.length < 1) { - System.err.println("Usage: DeduplicationJob "); + System.err.println("Usage: DeduplicationJob [-group <none|host|domain>]"); return 1; } +String group = "none"; String crawldb = args[0]; + +for (int i = 1; i < args.length; i++) { + if (args[i].equals("-group")) +group = args[++i]; +} SimpleDateFormat sdf = new SimpleDateFormat("-MM-dd HH:mm:ss"); long start = System.currentTimeMillis(); @@ -233,6 +265,7 @@ public class DeduplicationJob extends Nu JobConf job = new NutchJob(getConf()); job.setJobName("Deduplication on " + crawldb); +job.set(DEDUPLICATION_GROUP_MODE, group); FileInputFormat.addInputPath(job, new Path(crawldb, CrawlDb.CURRENT_NAME)); job.setInputFormat(SequenceFileInputFormat.class);
svn commit: r1723710 - in /nutch/trunk: ./ src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/ src/plugin/urlfilter-automaton/src/java/org/apache/nutch/urlfilter/automaton/ src/plugin
Author: markus Date: Fri Jan 8 12:11:18 2016 New Revision: 1723710 URL: http://svn.apache.org/viewvc?rev=1723710=rev Log: NUTCH-1838 Host and domain based regex and automaton filtering Added: nutch/trunk/src/plugin/urlfilter-regex/sample/nutch1838.rules nutch/trunk/src/plugin/urlfilter-regex/sample/nutch1838.urls Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexRule.java nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java nutch/trunk/src/plugin/urlfilter-automaton/src/java/org/apache/nutch/urlfilter/automaton/AutomatonURLFilter.java nutch/trunk/src/plugin/urlfilter-regex/src/java/org/apache/nutch/urlfilter/regex/RegexURLFilter.java nutch/trunk/src/plugin/urlfilter-regex/src/test/org/apache/nutch/urlfilter/regex/TestRegexURLFilter.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1723710=1723709=1723710=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Fri Jan 8 12:11:18 2016 @@ -1,5 +1,7 @@ Nutch Change Log +* NUTCH-1838 Host and domain based regex and automaton filtering (markus) + * NUTCH-2178 DeduplicationJob to optionally group on host or domain (markus) * NUTCH-1449 Optionally delete documents skipped by IndexingFilters (markus) Modified: nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexRule.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexRule.java?rev=1723710=1723709=1723710=diff == --- nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexRule.java (original) +++ nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexRule.java Fri Jan 8 12:11:18 2016 @@ -24,6 +24,10 @@ package org.apache.nutch.urlfilter.api; public abstract class RegexRule { private final boolean sign; + + private final String 
hostOrDomain; + + private final String regex; /** * Constructs a new regular expression rule. @@ -38,7 +42,27 @@ public abstract class RegexRule { * {@link #match(String)} method). */ protected RegexRule(boolean sign, String regex) { +this(sign, regex, null); + } + + /** + * Constructs a new regular expression rule. + * + * @param sign + * specifies if this rule must filter-in or filter-out. A + * true value means that any url matching this rule must + * be accepted, a false value means that any url + * matching this rule must be rejected. + * @param regex + * is the regular expression used for matching (see + * {@link #match(String)} method). + * @param hostOrDomain + * the host or domain to which this regex belongs + */ + protected RegexRule(boolean sign, String regex, String hostOrDomain) { this.sign = sign; +this.hostOrDomain = hostOrDomain; +this.regex = regex; } /** @@ -52,6 +76,20 @@ public abstract class RegexRule { } /** + * Return if this rule is used for filtering-in or out. + * + * @return host or domain this regex rule belongs to + */ + protected String hostOrDomain() { return hostOrDomain; } + + /** + * Return if this rule's regex. + * + * @return this regex + */ + protected String regex() { return regex; } + + /** * Checks if a url matches this rule. 
* * @param url Modified: nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java?rev=1723710=1723709=1723710=diff == --- nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java (original) +++ nutch/trunk/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java Fri Jan 8 12:11:18 2016 @@ -24,6 +24,7 @@ import java.io.BufferedReader; import java.io.InputStreamReader; import java.io.IOException; import java.io.StringReader; +import java.net.MalformedURLException; import java.util.List; import java.util.ArrayList; @@ -36,6 +37,7 @@ import org.apache.hadoop.conf.Configurat // Nutch imports import org.apache.nutch.net.*; +import org.apache.nutch.util.URLUtil; /** * Generic {@link org.apache.nutch.net.URLFilter URL filter} based on regular @@ -123,6 +125,20 @@ public abstract class RegexURLFilterBase * is the regular expression associated to this rule. */ protected abstract RegexRule createRule
svn commit: r1721615 - in /nutch/trunk: CHANGES.txt src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java src/plugin/urlfilter-domain/src/test/org/apache/nutch/ur
Author: markus Date: Thu Dec 24 12:45:27 2015 New Revision: 1721615 URL: http://svn.apache.org/viewvc?rev=1721615=rev Log: NUTCH-2189 Domain filter must deactivate if no rules are present Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java nutch/trunk/src/plugin/urlfilter-domain/src/test/org/apache/nutch/urlfilter/domain/TestDomainURLFilter.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1721615=1721614=1721615=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Thu Dec 24 12:45:27 2015 @@ -1,6 +1,8 @@ Nutch Change Log -* NUTCH-2182 Make reverseUrlDirs file dumper option hash the URL for consistency +* NUTCH-2189 Domain filter must deactivate if no rules are present (markus) + +* NUTCH-2182 Make reverseUrlDirs file dumper option hash the URL for consistency (joyce) * NUTCH-2183 Improvement to SegmentChecker for skipping non-segments present in segments directory (lewismc) Modified: nutch/trunk/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java?rev=1721615=1721614=1721615=diff == --- nutch/trunk/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java (original) +++ nutch/trunk/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java Thu Dec 24 12:45:27 2015 @@ -180,9 +180,10 @@ public class DomainURLFilter implements } public String filter(String url) { - +// https://issues.apache.org/jira/browse/NUTCH-2189 +if (domainSet.size() == 0) return url; + try { - // match for suffix, domain, and host in that order. 
more general will // override more specific String domain = URLUtil.getDomainName(url).toLowerCase().trim(); Modified: nutch/trunk/src/plugin/urlfilter-domain/src/test/org/apache/nutch/urlfilter/domain/TestDomainURLFilter.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlfilter-domain/src/test/org/apache/nutch/urlfilter/domain/TestDomainURLFilter.java?rev=1721615=1721614=1721615=diff == --- nutch/trunk/src/plugin/urlfilter-domain/src/test/org/apache/nutch/urlfilter/domain/TestDomainURLFilter.java (original) +++ nutch/trunk/src/plugin/urlfilter-domain/src/test/org/apache/nutch/urlfilter/domain/TestDomainURLFilter.java Thu Dec 24 12:45:27 2015 @@ -44,5 +44,24 @@ public class TestDomainURLFilter { Assert.assertNotNull(domainFilter.filter("http://www.foobar.be;)); Assert.assertNull(domainFilter.filter("http://www.adobe.com;)); } + + @Test + public void testNoFilter() throws Exception { +// https://issues.apache.org/jira/browse/NUTCH-2189 +String domainFile = SAMPLES + SEPARATOR + "this-file-does-not-exist.txt"; +Configuration conf = NutchConfiguration.create(); +DomainURLFilter domainFilter = new DomainURLFilter(domainFile); +domainFilter.setConf(conf); +Assert.assertNotNull(domainFilter.filter("http://lucene.apache.org;)); +Assert.assertNotNull(domainFilter.filter("http://hadoop.apache.org;)); +Assert.assertNotNull(domainFilter.filter("http://www.apache.org;)); +Assert.assertNotNull(domainFilter.filter("http://www.google.com;)); +Assert.assertNotNull(domainFilter.filter("http://mail.yahoo.com;)); +Assert.assertNotNull(domainFilter.filter("http://www.foobar.net;)); +Assert.assertNotNull(domainFilter.filter("http://www.foobas.net;)); +Assert.assertNotNull(domainFilter.filter("http://www.yahoo.com;)); +Assert.assertNotNull(domainFilter.filter("http://www.foobar.be;)); +Assert.assertNotNull(domainFilter.filter("http://www.adobe.com;)); + } }
svn commit: r1717622 - in /nutch/trunk: CHANGES.txt conf/log4j.properties
Author: markus Date: Wed Dec 2 12:40:27 2015 New Revision: 1717622 URL: http://svn.apache.org/viewvc?rev=1717622=rev Log: NUTCH-2176 Clean up of log4j.properties Modified: nutch/trunk/CHANGES.txt nutch/trunk/conf/log4j.properties Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1717622=1717621=1717622=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Wed Dec 2 12:40:27 2015 @@ -3,6 +3,8 @@ Nutch Change Log Nutch 1.11 Release 25/10/2015 (dd/mm/) Release Report: http://s.apache.org/nutch11 +* NUTCH-2176 Clean up of log4j.properties (markus) + * NUTCH-2107 plugin.xml to validate against plugin.dtd (snagel) * NUTCH-2177 Generator produces only one partition even in distributed mode (jnioche, snagel) Modified: nutch/trunk/conf/log4j.properties URL: http://svn.apache.org/viewvc/nutch/trunk/conf/log4j.properties?rev=1717622=1717621=1717622=diff == --- nutch/trunk/conf/log4j.properties (original) +++ nutch/trunk/conf/log4j.properties Wed Dec 2 12:40:27 2015 @@ -24,41 +24,39 @@ log4j.rootLogger=INFO,DRFA log4j.threshold=ALL #special logging requirements for some commandline tools +log4j.logger.org.apache.nutch.crawl.CrawlDb=INFO,cmdstdout +log4j.logger.org.apache.nutch.crawl.CrawlDbMerger=INFO,cmdstdout +log4j.logger.org.apache.nutch.crawl.CrawlDbReader=INFO,cmdstdout log4j.logger.org.apache.nutch.crawl.Crawl=INFO,cmdstdout -log4j.logger.org.apache.nutch.crawl.Injector=INFO,cmdstdout +log4j.logger.org.apache.nutch.crawl.DeduplicationJob=INFO,cmdstdout log4j.logger.org.apache.nutch.crawl.Generator=INFO,cmdstdout +log4j.logger.org.apache.nutch.crawl.Injector=INFO,cmdstdout +log4j.logger.org.apache.nutch.crawl.LinkDb=INFO,cmdstdout +log4j.logger.org.apache.nutch.crawl.LinkDbMerger=INFO,cmdstdout +log4j.logger.org.apache.nutch.crawl.LinkDbReader=INFO,cmdstdout log4j.logger.org.apache.nutch.fetcher.Fetcher=INFO,cmdstdout -log4j.logger.org.apache.nutch.fetcher.FetcherThread=INFO,cmdstdout 
log4j.logger.org.apache.nutch.fetcher.FetcherItem=INFO,cmdstdout log4j.logger.org.apache.nutch.fetcher.FetcherItemQueue=INFO,cmdstdout log4j.logger.org.apache.nutch.fetcher.FetcherItemQueues=INFO,cmdstdout +log4j.logger.org.apache.nutch.fetcher.FetcherThread=INFO,cmdstdout log4j.logger.org.apache.nutch.fetcher.QueueFeeder=INFO,cmdstdout -log4j.logger.org.apache.nutch.parse.ParseSegment=INFO,cmdstdout -log4j.logger.org.apache.nutch.crawl.CrawlDbReader=INFO,cmdstdout -log4j.logger.org.apache.nutch.crawl.CrawlDbMerger=INFO,cmdstdout -log4j.logger.org.apache.nutch.crawl.LinkDbReader=INFO,cmdstdout -log4j.logger.org.apache.nutch.segment.SegmentChecker=INFO,cmdstdout -log4j.logger.org.apache.nutch.segment.SegmentReader=INFO,cmdstdout -log4j.logger.org.apache.nutch.segment.SegmentMerger=INFO,cmdstdout -log4j.logger.org.apache.nutch.crawl.CrawlDb=INFO,cmdstdout -log4j.logger.org.apache.nutch.crawl.LinkDb=INFO,cmdstdout -log4j.logger.org.apache.nutch.crawl.LinkDbMerger=INFO,cmdstdout +log4j.logger.org.apache.nutch.indexer.IndexingFiltersChecker=INFO,cmdstdout log4j.logger.org.apache.nutch.indexer.IndexingJob=INFO,cmdstdout log4j.logger.org.apache.nutch.indexwriter.solr.SolrIndexWriter=INFO,cmdstdout log4j.logger.org.apache.nutch.indexwriter.solr.SolrUtils-INFO,cmdstdout -log4j.logger.org.apache.nutch.scoring.webgraph.WebGraph=INFO,cmdstdout +log4j.logger.org.apache.nutch.parse.ParserChecker=INFO,cmdstdout +log4j.logger.org.apache.nutch.parse.ParseSegment=INFO,cmdstdout +log4j.logger.org.apache.nutch.plugin.PluginRepository=WARN +log4j.logger.org.apache.nutch.protocol.RobotRulesParser=INFO,cmdstdout log4j.logger.org.apache.nutch.scoring.webgraph.LinkRank=INFO,cmdstdout log4j.logger.org.apache.nutch.scoring.webgraph.Loops=INFO,cmdstdout log4j.logger.org.apache.nutch.scoring.webgraph.ScoreUpdater=INFO,cmdstdout -log4j.logger.org.apache.nutch.util.hostdb.HostDb=INFO,cmdstdout -log4j.logger.org.apache.nutch.util.hostdb.DumpHostDb=INFO,cmdstdout 
-log4j.logger.org.apache.nutch.parse.ParserChecker=INFO,cmdstdout -log4j.logger.org.apache.nutch.indexer.IndexingFiltersChecker=INFO,cmdstdout +log4j.logger.org.apache.nutch.scoring.webgraph.WebGraph=INFO,cmdstdout +log4j.logger.org.apache.nutch.segment.SegmentChecker=INFO,cmdstdout +log4j.logger.org.apache.nutch.segment.SegmentMerger=INFO,cmdstdout +log4j.logger.org.apache.nutch.segment.SegmentReader=INFO,cmdstdout log4j.logger.org.apache.nutch.tools.FreeGenerator=INFO,cmdstdout log4j.logger.org.apache.nutch.util.domain.DomainStatistics=INFO,cmdstdout -log4j.logger.org.apache.nutch.tools.CrawlDBScanner=INFO,cmdstdout -log4j.logger.org.apache.nutch.protocol.RobotRulesParser=INFO,cmdstdout -log4j.logger.org.apache.nutch.plugin.PluginRepository=WARN log4j.logger.org.apache.nutch=INFO log4j.logger.org.apache.hadoop=WARN
svn commit: r1703111 - in /nutch/trunk: CHANGES.txt src/java/org/apache/nutch/indexer/IndexerMapReduce.java
Author: markus Date: Tue Sep 15 06:51:48 2015 New Revision: 1703111 URL: http://svn.apache.org/r1703111 Log: NUTCH-2093 Indexing filters to use current signatures Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1703111=1703110=1703111=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Tue Sep 15 06:51:48 2015 @@ -2,6 +2,8 @@ Nutch Change Log Nutch Current Development 1.11-SNAPSHOT +* NUTCH-2093 Indexing filters to use current signatures (markus) + * NUTCH-2092: Unit Test for NutchServer (Sujen Shah via mattmann) * NUTCH-2096 Explicitly indicate broswer binary to use when selecting Modified: nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java?rev=1703111=1703110=1703111=diff == --- nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java (original) +++ nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java Tue Sep 15 06:51:48 2015 @@ -287,6 +287,9 @@ public class IndexerMapReduce extends Co final Parse parse = new ParseImpl(parseText, parseData); try { + // Indexing filters may also be interested in the signature + fetchDatum.setSignature(dbDatum.getSignature()); + // extract information from dbDatum and pass it to // fetchDatum so that indexing filters can use it final Text url = (Text) dbDatum.getMetaData().get(
svn commit: r1688566 - in /nutch/trunk: CHANGES.txt src/java/org/apache/nutch/segment/SegmentReader.java
Author: markus Date: Wed Jul 1 07:00:40 2015 New Revision: 1688566 URL: http://svn.apache.org/r1688566 Log: NUTCH-1692 SegmentReader was broken in distributed mode Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1688566r1=1688565r2=1688566view=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Wed Jul 1 07:00:40 2015 @@ -2,6 +2,8 @@ Nutch Change Log Nutch Current Development 1.11-SNAPSHOT +* NUTCH-1692 SegmentReader was broken in distributed mode (markus, tejasp) + * NUTCH-1684 ParseMeta to be added before fetch schedulers are run (markus) * NUTCH-2038 fix for NUTCH-2038: Naive Bayes classifier based html Parse filter (for filtering outlinks) Modified: nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java?rev=1688566r1=1688565r2=1688566view=diff == --- nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java (original) +++ nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java Wed Jul 1 07:00:40 2015 @@ -507,55 +507,64 @@ public class SegmentReader extends Confi public void getStats(Path segment, final SegmentReaderStats stats) throws Exception { -SequenceFile.Reader[] readers = SequenceFileOutputFormat.getReaders( -getConf(), new Path(segment, CrawlDatum.GENERATE_DIR_NAME)); long cnt = 0L; Text key = new Text(); -for (int i = 0; i readers.length; i++) { - while (readers[i].next(key)) -cnt++; - readers[i].close(); -} -stats.generated = cnt; -Path fetchDir = new Path(segment, CrawlDatum.FETCH_DIR_NAME); -if (fs.exists(fetchDir) fs.getFileStatus(fetchDir).isDir()) { - cnt = 0L; - long start = Long.MAX_VALUE; - long end = Long.MIN_VALUE; - CrawlDatum value = new CrawlDatum(); - MapFile.Reader[] mreaders = MapFileOutputFormat.getReaders(fs, fetchDir, - getConf()); - for (int 
i = 0; i mreaders.length; i++) { -while (mreaders[i].next(key, value)) { + +if (ge) { + SequenceFile.Reader[] readers = SequenceFileOutputFormat.getReaders( + getConf(), new Path(segment, CrawlDatum.GENERATE_DIR_NAME)); + for (int i = 0; i readers.length; i++) { +while (readers[i].next(key)) cnt++; - if (value.getFetchTime() start) -start = value.getFetchTime(); - if (value.getFetchTime() end) -end = value.getFetchTime(); +readers[i].close(); + } + stats.generated = cnt; +} + +if (fe) { + Path fetchDir = new Path(segment, CrawlDatum.FETCH_DIR_NAME); + if (fs.exists(fetchDir) fs.getFileStatus(fetchDir).isDir()) { +cnt = 0L; +long start = Long.MAX_VALUE; +long end = Long.MIN_VALUE; +CrawlDatum value = new CrawlDatum(); +MapFile.Reader[] mreaders = MapFileOutputFormat.getReaders(fs, fetchDir, +getConf()); +for (int i = 0; i mreaders.length; i++) { + while (mreaders[i].next(key, value)) { +cnt++; +if (value.getFetchTime() start) + start = value.getFetchTime(); +if (value.getFetchTime() end) + end = value.getFetchTime(); + } + mreaders[i].close(); } -mreaders[i].close(); +stats.start = start; +stats.end = end; +stats.fetched = cnt; } - stats.start = start; - stats.end = end; - stats.fetched = cnt; } -Path parseDir = new Path(segment, ParseData.DIR_NAME); -if (fs.exists(parseDir) fs.getFileStatus(parseDir).isDir()) { - cnt = 0L; - long errors = 0L; - ParseData value = new ParseData(); - MapFile.Reader[] mreaders = MapFileOutputFormat.getReaders(fs, parseDir, - getConf()); - for (int i = 0; i mreaders.length; i++) { -while (mreaders[i].next(key, value)) { - cnt++; - if (!value.getStatus().isSuccess()) -errors++; + +if (pd) { + Path parseDir = new Path(segment, ParseData.DIR_NAME); + if (fs.exists(parseDir) fs.getFileStatus(parseDir).isDir()) { +cnt = 0L; +long errors = 0L; +ParseData value = new ParseData(); +MapFile.Reader[] mreaders = MapFileOutputFormat.getReaders(fs, parseDir, +getConf()); +for (int i = 0; i mreaders.length; i++) { + while (mreaders[i].next(key, 
value)) { +cnt++; +if (!value.getStatus().isSuccess()) + errors++; + } + mreaders[i].close(); } -mreaders[i
svn commit: r1688561 - in /nutch/trunk: CHANGES.txt src/java/org/apache/nutch/crawl/CrawlDbReducer.java
Author: markus Date: Wed Jul 1 06:56:32 2015 New Revision: 1688561 URL: http://svn.apache.org/r1688561 Log: NUTCH-1684 ParseMeta to be added before fetch schedulers are run Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1688561r1=1688560r2=1688561view=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Wed Jul 1 06:56:32 2015 @@ -2,6 +2,8 @@ Nutch Change Log Nutch Current Development 1.11-SNAPSHOT +* NUTCH-1684 ParseMeta to be added before fetch schedulers are run (markus) + * NUTCH-2038 fix for NUTCH-2038: Naive Bayes classifier based html Parse filter (for filtering outlinks) (Asitang Mishra, snagel via mattmann) Modified: nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java?rev=1688561r1=1688560r2=1688561view=diff == --- nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java (original) +++ nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java Wed Jul 1 06:56:32 2015 @@ -209,6 +209,13 @@ public class CrawlDbReducer implements case CrawlDatum.STATUS_FETCH_REDIR_TEMP: // successful fetch, redirected case CrawlDatum.STATUS_FETCH_REDIR_PERM: case CrawlDatum.STATUS_FETCH_NOTMODIFIED: // successful fetch, notmodified + // https://issues.apache.org/jira/browse/NUTCH-1656 + if (metaFromParse != null) { +for (EntryWritable, Writable e : metaFromParse.entrySet()) { + result.getMetaData().put(e.getKey(), e.getValue()); +} + } + // determine the modification status int modified = FetchSchedule.STATUS_UNKNOWN; if (fetch.getStatus() == CrawlDatum.STATUS_FETCH_NOTMODIFIED) { @@ -260,13 +267,6 @@ public class CrawlDbReducer implements result.setSignature(signature); } - // https://issues.apache.org/jira/browse/NUTCH-1656 - if (metaFromParse != null) { -for (EntryWritable, Writable e : 
metaFromParse.entrySet()) { - result.getMetaData().put(e.getKey(), e.getValue()); -} - } - // if fetchInterval is larger than the system-wide maximum, trigger // an unconditional recrawl. This prevents the page to be stuck at // NOTMODIFIED state, when the old fetched copy was already removed with
svn commit: r1675058 - in /nutch/trunk: CHANGES.txt src/java/org/apache/nutch/segment/SegmentMerger.java
Author: markus Date: Tue Apr 21 07:43:32 2015 New Revision: 1675058 URL: http://svn.apache.org/r1675058 Log: NUTCH-1697 SegmentMerger to implement Tool Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1675058r1=1675057r2=1675058view=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Tue Apr 21 07:43:32 2015 @@ -2,6 +2,8 @@ Nutch Change Log Nutch Current Development 1.10-SNAPSHOT +* NUTCH-1697 SegmentMerger to implement Tool (markus, snagel) + * NUTCH-1987 - Make bin/crawl indexer agnostic (Michael Joyce, snagel via mattmann) * NUTCH-1854 bin/crawl fails with a parsing fetcher (Asitang Mishra via snagel) Modified: nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java?rev=1675058r1=1675057r2=1675058view=diff == --- nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java (original) +++ nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java Tue Apr 21 07:43:32 2015 @@ -51,6 +51,8 @@ import org.apache.hadoop.mapred.Sequence import org.apache.hadoop.mapred.SequenceFileOutputFormat; import org.apache.hadoop.mapred.SequenceFileRecordReader; import org.apache.hadoop.util.Progressable; +import org.apache.hadoop.util.Tool; +import org.apache.hadoop.util.ToolRunner; import org.apache.nutch.crawl.CrawlDatum; import org.apache.nutch.crawl.Generator; import org.apache.nutch.metadata.MetaWrapper; @@ -118,7 +120,7 @@ import org.apache.nutch.util.NutchJob; * * @author Andrzej Bialecki */ -public class SegmentMerger extends Configured implements +public class SegmentMerger extends Configured implements Tool, MapperText, MetaWrapper, Text, MetaWrapper, ReducerText, MetaWrapper, Text, MetaWrapper { private static final Logger LOG = LoggerFactory @@ -691,7 +693,7 @@ public class SegmentMerger 
extends Confi /** * @param args */ - public static void main(String[] args) throws Exception { + public int run(String[] args) throws Exception { if (args.length 2) { System.err .println(SegmentMerger output_dir (-dir segments | seg1 seg2 ...) [-filter] [-slice ]); @@ -706,7 +708,7 @@ public class SegmentMerger extends Confi .println(\t-normalize\t\tnormalize URL via current URLNormalizers); System.err .println(\t-slice \tcreate many output segments, each containing URLs); - return; + return -1; } Configuration conf = NutchConfiguration.create(); final FileSystem fs = FileSystem.get(conf); @@ -734,11 +736,18 @@ public class SegmentMerger extends Confi } if (segs.size() == 0) { System.err.println(ERROR: No input segments.); - return; + return -1; } -SegmentMerger merger = new SegmentMerger(conf); -merger.merge(out, segs.toArray(new Path[segs.size()]), filter, normalize, + +merge(out, segs.toArray(new Path[segs.size()]), filter, normalize, sliceSize); +return 0; + } + + public static void main(String[] args) throws Exception { +int result = ToolRunner.run(NutchConfiguration.create(), +new SegmentMerger(), args); +System.exit(result); } }
svn commit: r1666471 - in /nutch/trunk: CHANGES.txt src/java/org/apache/nutch/crawl/NutchWritable.java
Author: markus Date: Fri Mar 13 14:58:05 2015 New Revision: 1666471 URL: http://svn.apache.org/r1666471 Log: NUTCH-1955 ByteWritable missing in NutchWritable Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/java/org/apache/nutch/crawl/NutchWritable.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1666471r1=1666470r2=1666471view=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Fri Mar 13 14:58:05 2015 @@ -2,6 +2,8 @@ Nutch Change Log Nutch Current Development 1.10-SNAPSHOT +* NUTCH-1955 ByteWritable missing in NutchWritable (markus) + * NUTCH-1956 Members to be public in URLCrawlDatum (markus) * NUTCH-1954 FilenameTooLong error appears in CommonCrawlDumper (mattmann) Modified: nutch/trunk/src/java/org/apache/nutch/crawl/NutchWritable.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/NutchWritable.java?rev=1666471r1=1666470r2=1666471view=diff == --- nutch/trunk/src/java/org/apache/nutch/crawl/NutchWritable.java (original) +++ nutch/trunk/src/java/org/apache/nutch/crawl/NutchWritable.java Fri Mar 13 14:58:05 2015 @@ -29,6 +29,7 @@ public class NutchWritable extends Gener org.apache.hadoop.io.NullWritable.class, org.apache.hadoop.io.BooleanWritable.class, org.apache.hadoop.io.LongWritable.class, +org.apache.hadoop.io.ByteWritable.class, org.apache.hadoop.io.BytesWritable.class, org.apache.hadoop.io.FloatWritable.class, org.apache.hadoop.io.IntWritable.class,
svn commit: r1663698 - in /nutch/trunk: ./ conf/ src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/ src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/ src/plugin/protocol-
Author: markus Date: Tue Mar 3 13:16:39 2015 New Revision: 1663698 URL: http://svn.apache.org/r1663698 Log: NUTCH 1921 Optionally disable HTTP if-modified-since header Modified: nutch/trunk/CHANGES.txt nutch/trunk/conf/nutch-default.xml nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1663698r1=1663697r2=1663698view=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Tue Mar 3 13:16:39 2015 @@ -2,6 +2,8 @@ Nutch Change Log Nutch Current Development 1.10-SNAPSHOT +* NUTCH-1921 Optionally disable HTTP if-modified-since header (markus) + * NUTCH-1933 nutch-selenium plugin (Mo Omer, Mohammad Al-Moshin, lewismc) * NUTCH-827 HTTP POST Authentication (Jasper van Veghel, yuanyun.cn, snagel, lewismc) Modified: nutch/trunk/conf/nutch-default.xml URL: http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1663698r1=1663697r2=1663698view=diff == --- nutch/trunk/conf/nutch-default.xml (original) +++ nutch/trunk/conf/nutch-default.xml Tue Mar 3 13:16:39 2015 @@ -297,6 +297,17 @@ /description /property +property + namehttp.enable.if.modified.since.header/name + valuetrue/value + descriptionWhether Nutch sends an HTTP If-Modified-Since header. It reduces + bandwidth when enabled by not downloading pages that respond with an HTTP + Not-Modified header. URL's that are not downloaded are not passed through + parse or indexing filters. If you regularly modify filters, you should force + Nutch to also download unmodified pages by disabling this feature. 
+ /description +/property + !-- FTP properties -- property Modified: nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java?rev=1663698r1=1663697r2=1663698view=diff == --- nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java (original) +++ nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java Tue Mar 3 13:16:39 2015 @@ -107,6 +107,9 @@ public abstract class HttpBase implement /** Which TLS/SSL cipher suites to support */ protected SetString tlsPreferredCipherSuites; + + /** Configuration directive for If-Modified-Since HTTP header */ + public boolean enableIfModifiedsinceHeader = true; /** Creates a new instance of HttpBase */ public HttpBase() { @@ -137,6 +140,7 @@ public abstract class HttpBase implement // backward-compatible default setting this.useHttp11 = conf.getBoolean(http.useHttp11, false); this.responseTime = conf.getBoolean(http.store.responsetime, true); +this.enableIfModifiedsinceHeader = conf.getBoolean(http.enable.if.modified.since.header, true); this.robots.setConf(conf); String[] protocols = conf.getStrings(http.tls.supported.protocols, @@ -298,6 +302,10 @@ public abstract class HttpBase implement public int getTimeout() { return timeout; } + + public boolean isIfModifiedSinceEnabled() { +return enableIfModifiedsinceHeader; + } public int getMaxContent() { return maxContent; Modified: nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java?rev=1663698r1=1663697r2=1663698view=diff == --- nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java (original) +++ 
nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java Tue Mar 3 13:16:39 2015 @@ -192,7 +192,7 @@ public class HttpResponse implements Res reqStr.append(this.http.getAccept()); reqStr.append(\r\n); - if (datum.getModifiedTime() 0) { + if (http.isIfModifiedSinceEnabled() datum.getModifiedTime() 0) { reqStr.append(If-Modified-Since: + HttpDateFormat.toString(datum.getModifiedTime())); reqStr.append(\r\n); Modified: nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient
svn commit: r1659532 - in /nutch/branches/2.x: CHANGES.txt ivy/ivy.xml src/plugin/parse-tika/ivy.xml src/plugin/parse-tika/plugin.xml
Author: markus Date: Fri Feb 13 12:25:13 2015 New Revision: 1659532 URL: http://svn.apache.org/r1659532 Log: NUTCH-1925 Upgrade to Apache Tika 1.7 Modified: nutch/branches/2.x/CHANGES.txt nutch/branches/2.x/ivy/ivy.xml nutch/branches/2.x/src/plugin/parse-tika/ivy.xml nutch/branches/2.x/src/plugin/parse-tika/plugin.xml Modified: nutch/branches/2.x/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1659532r1=1659531r2=1659532view=diff == --- nutch/branches/2.x/CHANGES.txt (original) +++ nutch/branches/2.x/CHANGES.txt Fri Feb 13 12:25:13 2015 @@ -2,6 +2,8 @@ Nutch Change Log Current Development 2.4-SNAPSHOT +* NUTCH-1925 Upgrade to Apache Tika 1.7 (Tyler Palsulich via markus) + * NUTCH-1924 Nutch + HBase Docker (RadosÅaw Stankiewicz via lewismc) * NUTCH-1920 Upgrade Nutch to use Java 1.7 (lewismc) Modified: nutch/branches/2.x/ivy/ivy.xml URL: http://svn.apache.org/viewvc/nutch/branches/2.x/ivy/ivy.xml?rev=1659532r1=1659531r2=1659532view=diff == --- nutch/branches/2.x/ivy/ivy.xml (original) +++ nutch/branches/2.x/ivy/ivy.xml Fri Feb 13 12:25:13 2015 @@ -55,7 +55,7 @@ /dependency dependency org=com.ibm.icu name=icu4j rev=4.0.1 / -dependency org=org.apache.tika name=tika-core rev=1.6 / +dependency org=org.apache.tika name=tika-core rev=1.7 / dependency org=com.googlecode.juniversalchardet name=juniversalchardet rev=1.0.3/ dependency org=log4j name=log4j rev=1.2.15 conf=*-master / Modified: nutch/branches/2.x/src/plugin/parse-tika/ivy.xml URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-tika/ivy.xml?rev=1659532r1=1659531r2=1659532view=diff == --- nutch/branches/2.x/src/plugin/parse-tika/ivy.xml (original) +++ nutch/branches/2.x/src/plugin/parse-tika/ivy.xml Fri Feb 13 12:25:13 2015 @@ -36,7 +36,7 @@ /publications dependencies -dependency org=org.apache.tika name=tika-parsers rev=1.6 conf=*-default +dependency org=org.apache.tika name=tika-parsers rev=1.7 conf=*-default exclude org=org.apache.tika name=tika-core / 
/dependency override module=rome rev=0.9/ Modified: nutch/branches/2.x/src/plugin/parse-tika/plugin.xml URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-tika/plugin.xml?rev=1659532r1=1659531r2=1659532view=diff == --- nutch/branches/2.x/src/plugin/parse-tika/plugin.xml (original) +++ nutch/branches/2.x/src/plugin/parse-tika/plugin.xml Fri Feb 13 12:25:13 2015 @@ -38,27 +38,27 @@ library name=commons-httpclient-3.1.jar/ library name=commons-logging-1.1.1.jar/ library name=dom4j-1.6.1.jar/ - library name=fontbox-1.8.6.jar/ + library name=fontbox-1.8.8.jar/ library name=geronimo-stax-api_1.0_spec-1.0.1.jar/ library name=isoparser-1.0.2.jar/ library name=java-libpst-0.8.1.jar/ library name=jcip-annotations-1.0.jar/ library name=jdom-1.0.jar/ - library name=jempbox-1.8.6.jar/ + library name=jempbox-1.8.8.jar/ library name=jhighlight-1.0.jar/ library name=jmatio-1.0.jar/ library name=juniversalchardet-1.0.3.jar/ library name=metadata-extractor-2.6.2.jar/ library name=netcdf-4.2.20.jar/ - library name=pdfbox-1.8.6.jar/ - library name=poi-3.11-beta2.jar/ - library name=poi-ooxml-3.11-beta2.jar/ - library name=poi-ooxml-schemas-3.11-beta2.jar/ - library name=poi-scratchpad-3.11-beta2.jar/ + library name=pdfbox-1.8.8.jar/ + library name=poi-3.11.jar/ + library name=poi-ooxml-3.11.jar/ + library name=poi-ooxml-schemas-3.11.jar/ + library name=poi-scratchpad-3.11.jar/ library name=rome-0.9.jar/ library name=slf4j-api-1.6.1.jar/ library name=tagsoup-1.2.1.jar/ - library name=tika-parsers-1.6.jar/ + library name=tika-parsers-1.7.jar/ library name=unidataCommon-4.2.20.jar/ library name=vorbis-java-core-0.6.jar/ library name=vorbis-java-tika-0.6.jar/
svn commit: r1659533 - in /nutch/trunk: CHANGES.txt src/java/org/apache/nutch/crawl/LinkDbReader.java
Author: markus Date: Fri Feb 13 12:28:13 2015 New Revision: 1659533 URL: http://svn.apache.org/r1659533 Log: NUTCH-1724 LinkDBReader to support regex output filtering Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbReader.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1659533r1=1659532r2=1659533view=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Fri Feb 13 12:28:13 2015 @@ -2,6 +2,8 @@ Nutch Change Log Nutch Current Development 1.10-SNAPSHOT +* NUTCH-1724 LinkDBReader to support regex output filtering (markus) + * NUTCH-1939 Fetcher fails to follow redirects (Leo Ye via snagel) * NUTCH-1913 LinkDB to implement db.ignore.external.links (markus, snagel) Modified: nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbReader.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbReader.java?rev=1659533r1=1659532r2=1659533view=diff == --- nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbReader.java (original) +++ nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbReader.java Fri Feb 13 12:28:13 2015 @@ -19,6 +19,9 @@ package org.apache.nutch.crawl; import java.io.IOException; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + // Commons Logging imports import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -43,7 +46,7 @@ import java.io.Closeable; public class LinkDbReader extends Configured implements Tool, Closeable { public static final Logger LOG = LoggerFactory.getLogger(LinkDbReader.class); - private static final PartitionerWritableComparable?, Writable PARTITIONER = new HashPartitionerWritableComparable?, Writable(); + private static final PartitionerWritableComparable, Writable PARTITIONER = new HashPartitionerWritableComparable, Writable(); private FileSystem fs; private Path directory; @@ -90,8 +93,33 @@ public class LinkDbReader extends Config } } } + + public static class LinkDBDumpMapper 
implements MapperText, Inlinks, Text, Inlinks { +Pattern pattern = null; +Matcher matcher = null; + +public void configure(JobConf job) { + if (job.get(linkdb.regex, null) != null) { +pattern = Pattern.compile(job.get(linkdb.regex)); + } +} + +public void close() {} +public void map(Text key, Inlinks value, OutputCollectorText, Inlinks output, Reporter reporter) +throws IOException { + + if (pattern != null) { +matcher = pattern.matcher(key.toString()); +if (!matcher.matches()) { + return; +} + } + + output.collect(key, value); +} + } - public void processDumpJob(String linkdb, String output) throws IOException { + public void processDumpJob(String linkdb, String output, String regex) throws IOException { SimpleDateFormat sdf = new SimpleDateFormat(-MM-dd HH:mm:ss); long start = System.currentTimeMillis(); if (LOG.isInfoEnabled()) { @@ -102,6 +130,11 @@ public class LinkDbReader extends Config JobConf job = new NutchJob(getConf()); job.setJobName(read + linkdb); + +if (regex != null) { + job.set(linkdb.regex, regex); + job.setMapperClass(LinkDBDumpMapper.class); +} FileInputFormat.addInputPath(job, new Path(linkdb, LinkDb.CURRENT_NAME)); job.setInputFormat(SequenceFileInputFormat.class); @@ -127,16 +160,24 @@ public class LinkDbReader extends Config public int run(String[] args) throws Exception { if (args.length 2) { System.err - .println(Usage: LinkDbReader linkdb (-dump out_dir | -url url)); + .println(Usage: LinkDbReader linkdb (-dump out_dir [-regex regex]) | -url url); System.err .println(\t-dump out_dir\tdump whole link db to a text file in out_dir); System.err + .println(\t\t-regex regex\trestrict to url's matching expression); + System.err .println(\t-url url\tprint information about url to System.out); return -1; } try { if (args[1].equals(-dump)) { -processDumpJob(args[0], args[2]); +String regex = null; +for (int i = 2; i args.length; i++) { + if (args[i].equals(-regex)) { +regex = args[++i]; + } +} +processDumpJob(args[0], args[2], regex); return 0; } 
else if (args[1].equals(-url)) { init(new Path(args[0]));
svn commit: r1659169 - in /nutch/trunk: CHANGES.txt src/java/org/apache/nutch/crawl/LinkDb.java
Author: markus Date: Thu Feb 12 08:42:49 2015 New Revision: 1659169 URL: http://svn.apache.org/r1659169 Log: NUTCH-1913 LinkDB to implement db.ignore.external.links Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1659169r1=1659168r2=1659169view=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Thu Feb 12 08:42:49 2015 @@ -2,6 +2,8 @@ Nutch Change Log Nutch Current Development 1.10-SNAPSHOT +* NUTCH-1913 LinkDB to implement db.ignore.external.links (markus, snagel) + * NUTCH-1925 Upgrade to Apache Tika 1.7 (Tyler Palsulich via markus) * NUTCH-1323 AjaxNormalizer (markus) Modified: nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java?rev=1659169r1=1659168r2=1659169view=diff == --- nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java (original) +++ nutch/trunk/src/java/org/apache/nutch/crawl/LinkDb.java Thu Feb 12 08:42:49 2015 @@ -49,12 +49,14 @@ public class LinkDb extends Configured i public static final Logger LOG = LoggerFactory.getLogger(LinkDb.class); public static final String IGNORE_INTERNAL_LINKS = db.ignore.internal.links; + public static final String IGNORE_EXTERNAL_LINKS = db.ignore.external.links; public static final String CURRENT_NAME = current; public static final String LOCK_NAME = .locked; private int maxAnchorLength; private boolean ignoreInternalLinks; + private boolean ignoreExternalLinks; private URLFilters urlFilters; private URLNormalizers urlNormalizers; @@ -68,6 +70,8 @@ public class LinkDb extends Configured i public void configure(JobConf job) { maxAnchorLength = job.getInt(db.max.anchor.length, 100); ignoreInternalLinks = job.getBoolean(IGNORE_INTERNAL_LINKS, true); +ignoreExternalLinks = job.getBoolean(IGNORE_EXTERNAL_LINKS, false); + if (job.getBoolean(LinkDbFilter.URL_FILTERING, 
false)) { urlFilters = new URLFilters(job); } @@ -115,6 +119,11 @@ public class LinkDb extends Configured i if (toHost == null || toHost.equals(fromHost)) { // internal link continue; // skip it } + } else if (ignoreExternalLinks) { +String toHost = getHost(toUrl); +if (toHost == null || !toHost.equals(fromHost)) { // external link + continue; // skip it +} } if (urlNormalizers != null) { try { @@ -180,6 +189,15 @@ public class LinkDb extends Configured i if (job.getBoolean(IGNORE_INTERNAL_LINKS, true)) { LOG.info(LinkDb: internal links will be ignored.); } + if (job.getBoolean(IGNORE_EXTERNAL_LINKS, false)) { +LOG.info(LinkDb: external links will be ignored.); + } +} +if (job.getBoolean(IGNORE_INTERNAL_LINKS, true) + job.getBoolean(IGNORE_EXTERNAL_LINKS, false)) { + LOG.warn(LinkDb: internal and external links are ignored! + + Nothing to do, actually. Exiting.); + return; } for (int i = 0; i segments.length; i++) { @@ -291,7 +309,6 @@ public class LinkDb extends Configured i System.err.println(\t-noFilter\tdon't apply URLFilters to link URLs); return -1; } -Path segDir = null; final FileSystem fs = FileSystem.get(getConf()); Path db = new Path(args[0]); ArrayListPath segs = new ArrayListPath();
svn commit: r1659167 - in /nutch/trunk: ./ src/plugin/ src/plugin/urlnormalizer-ajax/ src/plugin/urlnormalizer-ajax/src/ src/plugin/urlnormalizer-ajax/src/java/ src/plugin/urlnormalizer-ajax/src/java/
Author: markus Date: Thu Feb 12 08:30:31 2015 New Revision: 1659167 URL: http://svn.apache.org/r1659167 Log: NUTCH-1323 AjaxNormalizer Added: nutch/trunk/src/plugin/urlnormalizer-ajax/ nutch/trunk/src/plugin/urlnormalizer-ajax/build.xml nutch/trunk/src/plugin/urlnormalizer-ajax/ivy.xml nutch/trunk/src/plugin/urlnormalizer-ajax/plugin.xml nutch/trunk/src/plugin/urlnormalizer-ajax/src/ nutch/trunk/src/plugin/urlnormalizer-ajax/src/java/ nutch/trunk/src/plugin/urlnormalizer-ajax/src/java/org/ nutch/trunk/src/plugin/urlnormalizer-ajax/src/java/org/apache/ nutch/trunk/src/plugin/urlnormalizer-ajax/src/java/org/apache/nutch/ nutch/trunk/src/plugin/urlnormalizer-ajax/src/java/org/apache/nutch/net/ nutch/trunk/src/plugin/urlnormalizer-ajax/src/java/org/apache/nutch/net/urlnormalizer/ nutch/trunk/src/plugin/urlnormalizer-ajax/src/java/org/apache/nutch/net/urlnormalizer/ajax/ nutch/trunk/src/plugin/urlnormalizer-ajax/src/java/org/apache/nutch/net/urlnormalizer/ajax/AjaxURLNormalizer.java nutch/trunk/src/plugin/urlnormalizer-ajax/src/test/ nutch/trunk/src/plugin/urlnormalizer-ajax/src/test/org/ nutch/trunk/src/plugin/urlnormalizer-ajax/src/test/org/apache/ nutch/trunk/src/plugin/urlnormalizer-ajax/src/test/org/apache/nutch/ nutch/trunk/src/plugin/urlnormalizer-ajax/src/test/org/apache/nutch/net/ nutch/trunk/src/plugin/urlnormalizer-ajax/src/test/org/apache/nutch/net/urlnormalizer/ nutch/trunk/src/plugin/urlnormalizer-ajax/src/test/org/apache/nutch/net/urlnormalizer/ajax/ nutch/trunk/src/plugin/urlnormalizer-ajax/src/test/org/apache/nutch/net/urlnormalizer/ajax/TestAjaxURLNormalizer.java Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/plugin/build.xml Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1659167r1=1659166r2=1659167view=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Thu Feb 12 08:30:31 2015 @@ -2,6 +2,8 @@ Nutch Change Log Nutch Current Development 1.10-SNAPSHOT +* NUTCH-1323 AjaxNormalizer 
(markus) + * NUTCH-1918 TikaParser specifies a default namespace when generating DOM (jnioche) * NUTCH-1889 Store all values from Tika metadata in Nutch metadata (jnioche) Modified: nutch/trunk/src/plugin/build.xml URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/build.xml?rev=1659167r1=1659166r2=1659167view=diff == --- nutch/trunk/src/plugin/build.xml (original) +++ nutch/trunk/src/plugin/build.xml Thu Feb 12 08:30:31 2015 @@ -69,6 +69,7 @@ ant dir=urlfilter-suffix target=deploy/ ant dir=urlfilter-validator target=deploy/ ant dir=urlmeta target=deploy/ + ant dir=urlnormalizer-ajax target=deploy/ ant dir=urlnormalizer-basic target=deploy/ ant dir=urlnormalizer-host target=deploy/ ant dir=urlnormalizer-pass target=deploy/ @@ -107,6 +108,7 @@ ant dir=urlfilter-regex target=test/ ant dir=urlfilter-suffix target=test/ ant dir=urlfilter-validator target=test/ + ant dir=urlnormalizer-ajax target=test/ ant dir=urlnormalizer-basic target=test/ ant dir=urlnormalizer-host target=test/ ant dir=urlnormalizer-pass target=test/ @@ -164,8 +166,9 @@ ant dir=urlfilter-suffix target=clean/ ant dir=urlfilter-validator target=clean/ ant dir=urlmeta target=clean/ -ant dir=urlnormalizer-host target=clean/ +ant dir=urlnormalizer-ajax target=clean/ ant dir=urlnormalizer-basic target=clean/ +ant dir=urlnormalizer-host target=clean/ ant dir=urlnormalizer-pass target=clean/ ant dir=urlnormalizer-querystring target=clean/ ant dir=urlnormalizer-regex target=clean/ Added: nutch/trunk/src/plugin/urlnormalizer-ajax/build.xml URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-ajax/build.xml?rev=1659167view=auto == --- nutch/trunk/src/plugin/urlnormalizer-ajax/build.xml (added) +++ nutch/trunk/src/plugin/urlnormalizer-ajax/build.xml Thu Feb 12 08:30:31 2015 @@ -0,0 +1,22 @@ +?xml version=1.0? +!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. 
See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the License); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an AS IS BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License
svn commit: r1607043 - /nutch/cms_site/trunk/templates/std.html
Author: markus Date: Tue Jul 1 11:07:57 2014 New Revision: 1607043 URL: http://svn.apache.org/r1607043 Log: have at least a title on all pages Modified: nutch/cms_site/trunk/templates/std.html Modified: nutch/cms_site/trunk/templates/std.html URL: http://svn.apache.org/viewvc/nutch/cms_site/trunk/templates/std.html?rev=1607043r1=1607042r2=1607043view=diff == --- nutch/cms_site/trunk/templates/std.html (original) +++ nutch/cms_site/trunk/templates/std.html Tue Jul 1 11:07:57 2014 @@ -314,7 +314,7 @@ under the License. script type=text/javascript src=http://w.sharethis.com/button/buttons.js;/script script type=text/javascript src=http://s.sharethis.com/loader.js;/script script type=text/javascript src=./assets/js/jquery.js/script - title{% block title %}{% endblock %} -- Apache Nutch/title + titleApache Nutch/title /head body
svn commit: r914579 - /websites/production/nutch/content/
Author: markus Date: Tue Jul 1 11:09:09 2014 New Revision: 914579 Log: Add title to pages. Added: websites/production/nutch/content/ - copied from r914578, websites/staging/nutch/trunk/content/
svn commit: r1606693 - /nutch/cms_site/trunk/content/index.md
Author: markus Date: Mon Jun 30 11:44:03 2014 New Revision: 1606693 URL: http://svn.apache.org/r1606693 Log: page title missing Modified: nutch/cms_site/trunk/content/index.md Modified: nutch/cms_site/trunk/content/index.md URL: http://svn.apache.org/viewvc/nutch/cms_site/trunk/content/index.md?rev=1606693r1=1606692r2=1606693view=diff == --- nutch/cms_site/trunk/content/index.md (original) +++ nutch/cms_site/trunk/content/index.md Mon Jun 30 11:44:03 2014 @@ -1,3 +1,4 @@ +Welcome to Apache Nutch !-- Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the NOTICE file
svn commit: r1606694 - /nutch/cms_site/trunk/content/index.md
Author: markus Date: Mon Jun 30 11:46:31 2014 New Revision: 1606694 URL: http://svn.apache.org/r1606694 Log: Apparently the page header input box does not result in a title Modified: nutch/cms_site/trunk/content/index.md Modified: nutch/cms_site/trunk/content/index.md URL: http://svn.apache.org/viewvc/nutch/cms_site/trunk/content/index.md?rev=1606694r1=1606693r2=1606694view=diff == --- nutch/cms_site/trunk/content/index.md (original) +++ nutch/cms_site/trunk/content/index.md Mon Jun 30 11:46:31 2014 @@ -1,4 +1,3 @@ -Welcome to Apache Nutch !-- Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the NOTICE file
svn commit: r1606695 - /nutch/cms_site/trunk/templates/std.html
Author: markus Date: Mon Jun 30 11:50:44 2014 New Revision: 1606695 URL: http://svn.apache.org/r1606695 Log: added title Modified: nutch/cms_site/trunk/templates/std.html Modified: nutch/cms_site/trunk/templates/std.html URL: http://svn.apache.org/viewvc/nutch/cms_site/trunk/templates/std.html?rev=1606695r1=1606694r2=1606695view=diff == --- nutch/cms_site/trunk/templates/std.html (original) +++ nutch/cms_site/trunk/templates/std.html Mon Jun 30 11:50:44 2014 @@ -314,7 +314,7 @@ under the License. script type=text/javascript src=http://w.sharethis.com/button/buttons.js;/script script type=text/javascript src=http://s.sharethis.com/loader.js;/script script type=text/javascript src=./assets/js/jquery.js/script - + title{% block title %}{% endblock %}/title /head body
svn commit: r1606696 - /nutch/cms_site/trunk/templates/std.html
Author: markus Date: Mon Jun 30 11:52:25 2014 New Revision: 1606696 URL: http://svn.apache.org/r1606696 Log: actually put something in the title Modified: nutch/cms_site/trunk/templates/std.html Modified: nutch/cms_site/trunk/templates/std.html URL: http://svn.apache.org/viewvc/nutch/cms_site/trunk/templates/std.html?rev=1606696r1=1606695r2=1606696view=diff == --- nutch/cms_site/trunk/templates/std.html (original) +++ nutch/cms_site/trunk/templates/std.html Mon Jun 30 11:52:25 2014 @@ -314,7 +314,7 @@ under the License. script type=text/javascript src=http://w.sharethis.com/button/buttons.js;/script script type=text/javascript src=http://s.sharethis.com/loader.js;/script script type=text/javascript src=./assets/js/jquery.js/script - title{% block title %}{% endblock %}/title + title{% block title %}{% endblock %} -- Apache Nutch/title /head body
svn commit: r1606703 - /nutch/cms_site/trunk/content/index.md
Author: markus Date: Mon Jun 30 12:01:53 2014 New Revision: 1606703 URL: http://svn.apache.org/r1606703 Log: CMS commit to nutch by markus Modified: nutch/cms_site/trunk/content/index.md Modified: nutch/cms_site/trunk/content/index.md URL: http://svn.apache.org/viewvc/nutch/cms_site/trunk/content/index.md?rev=1606703r1=1606702r2=1606703view=diff == --- nutch/cms_site/trunk/content/index.md (original) +++ nutch/cms_site/trunk/content/index.md Mon Jun 30 12:01:53 2014 @@ -1,3 +1,4 @@ +test title? will it work? !-- Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the NOTICE file
svn commit: r1606704 - /nutch/cms_site/trunk/content/index.md
Author: markus Date: Mon Jun 30 12:03:56 2014 New Revision: 1606704 URL: http://svn.apache.org/r1606704 Log: will this work? Modified: nutch/cms_site/trunk/content/index.md Modified: nutch/cms_site/trunk/content/index.md URL: http://svn.apache.org/viewvc/nutch/cms_site/trunk/content/index.md?rev=1606704r1=1606703r2=1606704view=diff == --- nutch/cms_site/trunk/content/index.md (original) +++ nutch/cms_site/trunk/content/index.md Mon Jun 30 12:03:56 2014 @@ -1,4 +1,3 @@ -test title? will it work? !-- Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the NOTICE file @@ -17,6 +16,7 @@ KIND, either express or implied. See th specific language governing permissions and limitations under the License. -- +{% block title %}does this work ten?{% endblock %} !-- Carousel== -- div id=myCarousel class=carousel slide div class=carousel-inner
svn commit: r1606705 - /nutch/cms_site/trunk/content/index.md
Author: markus Date: Mon Jun 30 12:04:50 2014 New Revision: 1606705 URL: http://svn.apache.org/r1606705 Log: restore stuff i broke Modified: nutch/cms_site/trunk/content/index.md Modified: nutch/cms_site/trunk/content/index.md URL: http://svn.apache.org/viewvc/nutch/cms_site/trunk/content/index.md?rev=1606705r1=1606704r2=1606705view=diff == --- nutch/cms_site/trunk/content/index.md (original) +++ nutch/cms_site/trunk/content/index.md Mon Jun 30 12:04:50 2014 @@ -16,7 +16,6 @@ KIND, either express or implied. See th specific language governing permissions and limitations under the License. -- -{% block title %}does this work ten?{% endblock %} !-- Carousel== -- div id=myCarousel class=carousel slide div class=carousel-inner
svn commit: r1600566 - in /nutch/trunk: CHANGES.txt src/java/org/apache/nutch/util/NodeWalker.java
Author: markus Date: Thu Jun 5 08:34:01 2014 New Revision: 1600566 URL: http://svn.apache.org/r1600566 Log: NUTCH-1782 NodeWalker to return current node Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/java/org/apache/nutch/util/NodeWalker.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1600566r1=1600565r2=1600566view=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Thu Jun 5 08:34:01 2014 @@ -2,6 +2,8 @@ Nutch Change Log Nutch Current Development +* NUTCH-1782 NodeWalker to return current node (markus) + * NUTCH-1758 IndexChecker to send document to IndexWriters (jnioche) * NUTCH-1786 CrawlDb should follow db.url.normalizers and db.url.filters (Diaa via markus) Modified: nutch/trunk/src/java/org/apache/nutch/util/NodeWalker.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/NodeWalker.java?rev=1600566r1=1600565r2=1600566view=diff == --- nutch/trunk/src/java/org/apache/nutch/util/NodeWalker.java (original) +++ nutch/trunk/src/java/org/apache/nutch/util/NodeWalker.java Thu Jun 5 08:34:01 2014 @@ -102,6 +102,14 @@ public class NodeWalker { } /** + * Return the current node. + * @return Node + */ + public Node getCurrentNode() { +return currentNode; + } + + /** * @return returns true if there are more nodes on the current stack. * */
svn commit: r1562058 - in /nutch/trunk: CHANGES.txt src/java/org/apache/nutch/util/hostdb/HostDb.java
Author: markus Date: Tue Jan 28 13:07:09 2014 New Revision: 1562058 URL: http://svn.apache.org/r1562058 Log: NUTCH-1717 HostDB not to complain if filters/normalizers are disabled Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/java/org/apache/nutch/util/hostdb/HostDb.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1562058r1=1562057r2=1562058view=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Tue Jan 28 13:07:09 2014 @@ -2,6 +2,8 @@ Nutch Change Log Nutch Development Trunk +* NUTCH-1717 HostDB not to complain if filters/normalizers are disabled (markus) + * NUTCH-1715 RobotRulesParser adds additional '*' to the robots name (tejasp) * NUTCH-356 Plugin repository cache can lead to memory leak (Enrico Triolo, DoÄacan Güney via markus) Modified: nutch/trunk/src/java/org/apache/nutch/util/hostdb/HostDb.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/hostdb/HostDb.java?rev=1562058r1=1562057r2=1562058view=diff == --- nutch/trunk/src/java/org/apache/nutch/util/hostdb/HostDb.java (original) +++ nutch/trunk/src/java/org/apache/nutch/util/hostdb/HostDb.java Tue Jan 28 13:07:09 2014 @@ -505,12 +505,12 @@ public class HostDb extends Configured i conf.setBoolean(HOSTDB_URL_NORMALIZING, normalize); // Check whether the urlfilter-domainblacklist plugin is loaded -if (urlfilter-domainblacklist.matches(conf.get(plugin.includes))) { +if (filter urlfilter-domainblacklist.matches(conf.get(plugin.includes))) { throw new Exception(domainblacklist-urlfilter must not be enabled); } // Check whether the urlnormalizer-host plugin is loaded -if (urlnormalizer-host.matches(conf.get(plugin.includes))) { +if (normalize urlnormalizer-host.matches(conf.get(plugin.includes))) { throw new Exception(urlnormalizer-host must not be enabled); }
svn commit: r1560985 - in /nutch/trunk: CHANGES.txt src/java/org/apache/nutch/plugin/Extension.java src/java/org/apache/nutch/plugin/PluginClassLoader.java src/java/org/apache/nutch/plugin/PluginRepos
Author: markus Date: Fri Jan 24 13:12:00 2014 New Revision: 1560985 URL: http://svn.apache.org/r1560985 Log: NUTCH-356 Plugin repository cache can lead to memory leak Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/java/org/apache/nutch/plugin/Extension.java nutch/trunk/src/java/org/apache/nutch/plugin/PluginClassLoader.java nutch/trunk/src/java/org/apache/nutch/plugin/PluginRepository.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1560985r1=1560984r2=1560985view=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Fri Jan 24 13:12:00 2014 @@ -2,6 +2,8 @@ Nutch Change Log Nutch Development Trunk +* NUTCH-356 Plugin repository cache can lead to memory leak (Enrico Triolo, DoÄacan Güney via markus) + * NUTCH-1413 Record response time (Yasin Kılınç, Talat Uyarer, snagel) * NUTCH-1325 HostDB for Nutch (markus, tejasp) Modified: nutch/trunk/src/java/org/apache/nutch/plugin/Extension.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/plugin/Extension.java?rev=1560985r1=1560984r2=1560985view=diff == --- nutch/trunk/src/java/org/apache/nutch/plugin/Extension.java (original) +++ nutch/trunk/src/java/org/apache/nutch/plugin/Extension.java Fri Jan 24 13:12:00 2014 @@ -33,7 +33,6 @@ public class Extension { private String fClazz; private HashMapString, String fAttributes; private Configuration conf; - private PluginRepository pluginRepository; /** * @param pDescriptor @@ -52,7 +51,6 @@ public class Extension { setId(pId); setClazz(pExtensionClass); this.conf = conf; -this.pluginRepository = pluginRepository; } /** @@ -149,12 +147,13 @@ public class Extension { // The same is in PluginRepository.getPluginInstance(). // Suggested by Stefan Groschupf s...@media-style.com synchronized (getId()) { - try { -PluginClassLoader loader = fDescriptor.getClassLoader(); -Class? 
extensionClazz = loader.loadClass(getClazz()); + try { +PluginRepository pluginRepository = PluginRepository.get(conf); +Class extensionClazz = + pluginRepository.getCachedClass(fDescriptor, getClazz()); // lazy loading of Plugin in case there is no instance of the plugin // already. -this.pluginRepository.getPluginInstance(getDescriptor()); +pluginRepository.getPluginInstance(getDescriptor()); Object object = extensionClazz.newInstance(); if (object instanceof Configurable) { ((Configurable) object).setConf(this.conf); Modified: nutch/trunk/src/java/org/apache/nutch/plugin/PluginClassLoader.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/plugin/PluginClassLoader.java?rev=1560985r1=1560984r2=1560985view=diff == --- nutch/trunk/src/java/org/apache/nutch/plugin/PluginClassLoader.java (original) +++ nutch/trunk/src/java/org/apache/nutch/plugin/PluginClassLoader.java Fri Jan 24 13:12:00 2014 @@ -18,6 +18,7 @@ package org.apache.nutch.plugin; import java.net.URL; import java.net.URLClassLoader; +import java.util.Arrays; /** * The codePluginClassLoader/code contains only classes of the runtime @@ -30,6 +31,10 @@ import java.net.URLClassLoader; * @author joa23 */ public class PluginClassLoader extends URLClassLoader { + + private URL[] urls; + private ClassLoader parent; + /** * Construtor * @@ -40,5 +45,36 @@ public class PluginClassLoader extends U */ public PluginClassLoader(URL[] urls, ClassLoader parent) { super(urls, parent); + +this.urls = urls; +this.parent = parent; + } + + @Override + public int hashCode() { +final int PRIME = 31; +int result = 1; +result = PRIME * result + ((parent == null) ? 
0 : parent.hashCode()); +result = PRIME * result + Arrays.hashCode(urls); +return result; + } + + @Override + public boolean equals(Object obj) { +if (this == obj) + return true; +if (obj == null) + return false; +if (getClass() != obj.getClass()) + return false; +final PluginClassLoader other = (PluginClassLoader) obj; +if (parent == null) { + if (other.parent != null) +return false; +} else if (!parent.equals(other.parent)) + return false; +if (!Arrays.equals(urls, other.urls)) + return false; +return true; } } Modified: nutch/trunk/src/java/org/apache/nutch/plugin/PluginRepository.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/plugin/PluginRepository.java?rev=1560985r1=1560984r2=1560985view=diff == --- nutch/trunk/src/java/org
svn commit: r1559657 - in /nutch/trunk: CHANGES.txt src/java/org/apache/nutch/crawl/CrawlDbReader.java
Author: markus Date: Mon Jan 20 09:29:42 2014 New Revision: 1559657 URL: http://svn.apache.org/r1559657 Log: NUTCH-1680 CrawlDbReader to dump minRetry value Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1559657r1=1559656r2=1559657view=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Mon Jan 20 09:29:42 2014 @@ -2,6 +2,8 @@ Nutch Change Log Nutch Development Trunk +* NUTCH-1680 CrawlDbReader to dump minRetry value (markus) + * NUTCH-1699 Tika Parser - Image Parse Bug (Mehmet Zahid Yüzügüldü, snagel via lewismc) * NUTCH-1695 Add NutchDocument.toString() to ease debugging (markus) Modified: nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java?rev=1559657r1=1559656r2=1559657view=diff == --- nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java (original) +++ nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java Mon Jan 20 09:29:42 2014 @@ -407,7 +407,7 @@ public class CrawlDbReader implements Cl } } - public void processDumpJob(String crawlDb, String output, Configuration config, String format, String regex, String status) throws IOException { + public void processDumpJob(String crawlDb, String output, Configuration config, String format, String regex, String status, Integer retry) throws IOException { if (LOG.isInfoEnabled()) { LOG.info(CrawlDb dump: starting); LOG.info(CrawlDb db: + crawlDb); @@ -433,7 +433,8 @@ public class CrawlDbReader implements Cl if (status != null) job.set(status, status); if (regex != null) job.set(regex, regex); - +if (retry != null) job.setInt(retry, retry); + job.setMapperClass(CrawlDbDumpMapper.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(CrawlDatum.class); @@ -446,17 +447,26 @@ public class CrawlDbReader implements Cl 
Pattern pattern = null; Matcher matcher = null; String status = null; +Integer retry = null; public void configure(JobConf job) { if (job.get(regex, null) != null) { pattern = Pattern.compile(job.get(regex)); } status = job.get(status, null); + retry = job.getInt(retry, -1); } public void close() {} public void map(Text key, CrawlDatum value, OutputCollectorText, CrawlDatum output, Reporter reporter) throws IOException { + + // check retry + if (retry != -1) { +if (value.getRetriesSinceFetch() retry) { + return; +} + } // check status if (status != null @@ -542,6 +552,7 @@ public class CrawlDbReader implements Cl System.err.println(\t\t[-format normal]\tdump in standard format (default option)); System.err.println(\t\t[-format crawldb]\tdump as CrawlDB); System.err.println(\t\t[-regex expr]\tfilter records with expression); + System.err.println(\t\t[-retry num]\tminimum retry count); System.err.println(\t\t[-status status]\tfilter records by CrawlDatum status); System.err.println(\t-url url\tprint information on url to System.out); System.err.println(\t-topN out_dir [min]\tdump top urls sorted by score to out_dir); @@ -564,6 +575,7 @@ public class CrawlDbReader implements Cl param = args[++i]; String format = normal; String regex = null; +Integer retry = null; String status = null; for (int j = i + 1; j args.length; j++) { if (args[j].equals(-format)) { @@ -574,12 +586,16 @@ public class CrawlDbReader implements Cl regex = args[++j]; i=i+2; } + if (args[j].equals(-retry)) { +retry = Integer.parseInt(args[++j]); +i=i+2; + } if (args[j].equals(-status)) { status = args[++j]; i=i+2; } } -dbr.processDumpJob(crawlDb, param, conf, format, regex, status); +dbr.processDumpJob(crawlDb, param, conf, format, regex, status, retry); } else if (args[i].equals(-url)) { param = args[++i]; dbr.readUrl(crawlDb, param, conf);
svn commit: r1556474 - in /nutch/trunk: CHANGES.txt src/java/org/apache/nutch/indexer/NutchDocument.java
Author: markus Date: Wed Jan 8 09:39:47 2014 New Revision: 1556474 URL: http://svn.apache.org/r1556474 Log: NUTCH-1695 Add NutchDocument.toString() to ease debugging Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/java/org/apache/nutch/indexer/NutchDocument.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1556474r1=1556473r2=1556474view=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Wed Jan 8 09:39:47 2014 @@ -2,6 +2,8 @@ Nutch Change Log Nutch Development Trunk +* NUTCH-1695 Add NutchDocument.toString() to ease debugging (markus) + * NUTCH-1675 NutchField to support long (markus) * NUTCH-1670 set same crawldb directory in mergedb parameter (lufeng via tejasp) Modified: nutch/trunk/src/java/org/apache/nutch/indexer/NutchDocument.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/indexer/NutchDocument.java?rev=1556474r1=1556473r2=1556474view=diff == --- nutch/trunk/src/java/org/apache/nutch/indexer/NutchDocument.java (original) +++ nutch/trunk/src/java/org/apache/nutch/indexer/NutchDocument.java Wed Jan 8 09:39:47 2014 @@ -127,4 +127,18 @@ implements Writable, IterableEntryStri out.writeFloat(weight); documentMeta.write(out); } + + public String toString() { +StringBuilder sb = new StringBuilder(); +sb.append(doc {\n); +for (Map.EntryString, NutchField entry : fields.entrySet()) { + sb.append(\t); + sb.append(entry.getKey()); + sb.append(:\t); + sb.append(entry.getValue()); + sb.append(\n); +} +sb.append(}\n); +return sb.toString(); + } }
svn commit: r1554791 - /nutch/trunk/conf/nutch-default.xml
Author: markus Date: Thu Jan 2 11:53:36 2014 New Revision: 1554791 URL: http://svn.apache.org/r1554791 Log: NUTCH-1360 fix entity in configuration Modified: nutch/trunk/conf/nutch-default.xml Modified: nutch/trunk/conf/nutch-default.xml URL: http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1554791r1=1554790r2=1554791view=diff == --- nutch/trunk/conf/nutch-default.xml (original) +++ nutch/trunk/conf/nutch-default.xml Thu Jan 2 11:53:36 2014 @@ -29,7 +29,7 @@ valuefalse/value descriptionEnables us to capture the specific IP address (InetSocketAddress) of the host which we connect to via - the given protocol. Currently supported is protocol-ftp + the given protocol. Currently supported is protocol-ftp and http. /description /property
svn commit: r1553115 - in /nutch/trunk: CHANGES.txt src/java/org/apache/nutch/util/URLUtil.java src/test/org/apache/nutch/util/TestURLUtil.java
Author: markus Date: Mon Dec 23 14:17:40 2013 New Revision: 1553115 URL: http://svn.apache.org/r1553115 Log: NUTCH-1681 In URLUtil.java, toUNICODE method does not work correctly Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/java/org/apache/nutch/util/URLUtil.java nutch/trunk/src/test/org/apache/nutch/util/TestURLUtil.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1553115r1=1553114r2=1553115view=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Mon Dec 23 14:17:40 2013 @@ -2,6 +2,8 @@ Nutch Change Log Nutch Development Trunk +* NUTCH-1681 In URLUtil.java, toUNICODE method does not work correctly (İlhami KALKAN, snagel via markus) + * NUTCH-1668 Remove package org.apache.nutch.indexer.solr (jnioche) * NUTCH-1621 Remove deprecated class o.a.n.crawl.Crawler (Rui Gao via jnioche) Modified: nutch/trunk/src/java/org/apache/nutch/util/URLUtil.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/URLUtil.java?rev=1553115r1=1553114r2=1553115view=diff == --- nutch/trunk/src/java/org/apache/nutch/util/URLUtil.java (original) +++ nutch/trunk/src/java/org/apache/nutch/util/URLUtil.java Mon Dec 23 14:17:40 2013 @@ -481,7 +481,7 @@ public class URLUtil { try { URL u = new URL(url); URI p = new URI(u.getProtocol(), -null, +u.getUserInfo(), IDN.toASCII(u.getHost()), u.getPort(), u.getPath(), @@ -498,15 +498,25 @@ public class URLUtil { public static String toUNICODE(String url) { try { URL u = new URL(url); - URI p = new URI(u.getProtocol(), -null, -IDN.toUnicode(u.getHost()), -u.getPort(), -u.getPath(), -u.getQuery(), -u.getRef()); + StringBuilder sb = new StringBuilder(); + sb.append(u.getProtocol()); + sb.append(://); + if (u.getUserInfo() != null) { +sb.append(u.getUserInfo()); +sb.append('@'); + } + sb.append(IDN.toUnicode(u.getHost())); + if (u.getPort() != -1) { +sb.append(':'); +sb.append(u.getPort()); + } + sb.append(u.getFile()); // includes query + if (u.getRef() 
!= null) { +sb.append('#'); +sb.append(u.getRef()); + } - return p.toString(); + return sb.toString(); } catch (Exception e) { return null; Modified: nutch/trunk/src/test/org/apache/nutch/util/TestURLUtil.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/test/org/apache/nutch/util/TestURLUtil.java?rev=1553115r1=1553114r2=1553115view=diff == --- nutch/trunk/src/test/org/apache/nutch/util/TestURLUtil.java (original) +++ nutch/trunk/src/test/org/apache/nutch/util/TestURLUtil.java Mon Dec 23 14:17:40 2013 @@ -258,5 +258,22 @@ public class TestURLUtil assertEquals(targets[i][1], targets[i][1], u.toString()); } } + + public void testToUNICODE() throws Exception { +assertEquals(http://www.çevir.com;, URLUtil.toUNICODE(http://www.xn--evir-zoa.com;)); +assertEquals(http://uni-tübingen.de/;, URLUtil.toUNICODE(http://xn--uni-tbingen-xhb.de/;)); +assertEquals( +http://www.medizin.uni-tübingen.de:8080/search.php?q=abc#p1;, + URLUtil.toUNICODE(http://www.medizin.xn--uni-tbingen-xhb.de:8080/search.php?q=abc#p1;)); + + } + + public void testToASCII() throws Exception { +assertEquals(http://www.xn--evir-zoa.com;, URLUtil.toASCII(http://www.çevir.com;)); +assertEquals(http://xn--uni-tbingen-xhb.de/;, URLUtil.toASCII(http://uni-tübingen.de/;)); +assertEquals( +http://www.medizin.xn--uni-tbingen-xhb.de:8080/search.php?q=abc#p1;, + URLUtil.toASCII(http://www.medizin.uni-tübingen.de:8080/search.php?q=abc#p1;)); + } }
svn commit: r1528072 - in /nutch/trunk: CHANGES.txt src/java/org/apache/nutch/indexer/IndexerMapReduce.java
Author: markus Date: Tue Oct 1 12:50:06 2013 New Revision: 1528072 URL: http://svn.apache.org/r1528072 Log: NUTCH-1646 IndexerMapReduce to consider DB status Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1528072r1=1528071r2=1528072view=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Tue Oct 1 12:50:06 2013 @@ -2,6 +2,8 @@ Nutch Change Log Nutch Development Trunk +* NUTCH-1646 IndexerMapReduce to consider DB status (markus) + * NUTCH-1636 Indexer to normalize and filter repr URL (Iain Lopata via snagel) * NUTCH-1637 URLUtil is missing getProtocol (markus) Modified: nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java?rev=1528072r1=1528071r2=1528072view=diff == --- nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java (original) +++ nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java Tue Oct 1 12:50:06 2013 @@ -189,14 +189,18 @@ implements MapperText, Writable, Text, * Check if we need to delete 404 NOT FOUND and 301 PERMANENT REDIRECT. 
*/ if (delete) { - if (fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_GONE) { + if (fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_GONE || dbDatum.getStatus() == CrawlDatum.STATUS_DB_GONE) { reporter.incrCounter(IndexerStatus, Documents deleted, 1); NutchIndexAction action = new NutchIndexAction(null, NutchIndexAction.DELETE); output.collect(key, action); return; } - if (fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_REDIR_PERM) { + if (fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_REDIR_PERM || + fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_REDIR_TEMP || + dbDatum.getStatus() == CrawlDatum.STATUS_DB_REDIR_PERM || + dbDatum.getStatus() == CrawlDatum.STATUS_DB_REDIR_TEMP) { +reporter.incrCounter(IndexerStatus, Deleted redirects, 1); reporter.incrCounter(IndexerStatus, Perm redirects deleted, 1); NutchIndexAction action = new NutchIndexAction(null, NutchIndexAction.DELETE);
svn commit: r1499948 - in /nutch/trunk: CHANGES.txt src/java/org/apache/nutch/segment/SegmentMerger.java
Author: markus Date: Fri Jul 5 08:52:51 2013 New Revision: 1499948 URL: http://svn.apache.org/r1499948 Log: NUTCH-1520 SegmentMerger looses records Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1499948r1=1499947r2=1499948view=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Fri Jul 5 08:52:51 2013 @@ -2,6 +2,8 @@ Nutch Change Log Nutch Development Trunk +* NUTCH-1520 SegmentMerger looses records (markus) + * NUTCH-1602 improve the readability of metadata in readdb dump normal (lufeng) * NUTCH-1596 HeadingsParseFilter not thread safe (snagel via markus) Modified: nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java?rev=1499948r1=1499947r2=1499948view=diff == --- nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java (original) +++ nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java Fri Jul 5 08:52:51 2013 @@ -412,10 +412,14 @@ public class SegmentMerger extends Confi lastF = val; lastFname = sp.segmentName; } else { -// take newer -if (lastFname.compareTo(sp.segmentName) 0) { - lastF = val; - lastFname = sp.segmentName; +// only consider fetch status +// https://issues.apache.org/jira/browse/NUTCH-1520 +if (CrawlDatum.hasFetchStatus(val)) { + // take newer + if (lastFname.compareTo(sp.segmentName) 0) { +lastF = val; +lastFname = sp.segmentName; + } } } } else if (sp.partName.equals(CrawlDatum.PARSE_DIR_NAME)) { @@ -480,7 +484,7 @@ public class SegmentMerger extends Confi linked.isEmpty() ? null : linked.lastEntry().getValue())){ return; } - + curCount++; String sliceName = null; MetaWrapper wrapper = new MetaWrapper();
svn commit: r1499959 - in /nutch/branches/2.x: CHANGES.txt ivy/ivy.xml src/plugin/parse-tika/howto_upgrade_tika.txt src/plugin/parse-tika/ivy.xml src/plugin/parse-tika/plugin.xml
Author: markus Date: Fri Jul 5 10:27:47 2013 New Revision: 1499959 URL: http://svn.apache.org/r1499959 Log: NUTCH-1595 Upgrade to Tika 1.4 (jnioche, markus) Added: nutch/branches/2.x/src/plugin/parse-tika/howto_upgrade_tika.txt Modified: nutch/branches/2.x/CHANGES.txt nutch/branches/2.x/ivy/ivy.xml nutch/branches/2.x/src/plugin/parse-tika/ivy.xml nutch/branches/2.x/src/plugin/parse-tika/plugin.xml Modified: nutch/branches/2.x/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1499959r1=1499958r2=1499959view=diff == --- nutch/branches/2.x/CHANGES.txt (original) +++ nutch/branches/2.x/CHANGES.txt Fri Jul 5 10:27:47 2013 @@ -2,6 +2,8 @@ Nutch Change Log Current Development +* NUTCH-1595 Upgrade to Tika 1.4 (jnioche, markus) + * NUTCH-1594 count variable is never changed in ParseUtil class (Canan via Feng) Release 2.2.1 - 06/27/2013 (mm/dd/) Modified: nutch/branches/2.x/ivy/ivy.xml URL: http://svn.apache.org/viewvc/nutch/branches/2.x/ivy/ivy.xml?rev=1499959r1=1499958r2=1499959view=diff == --- nutch/branches/2.x/ivy/ivy.xml (original) +++ nutch/branches/2.x/ivy/ivy.xml Fri Jul 5 10:27:47 2013 @@ -58,7 +58,7 @@ /dependency dependency org=com.ibm.icu name=icu4j rev=4.0.1 / -dependency org=org.apache.tika name=tika-core rev=1.3 / +dependency org=org.apache.tika name=tika-core rev=1.4 / dependency org=com.googlecode.juniversalchardet name=juniversalchardet rev=1.0.3/ dependency org=log4j name=log4j rev=1.2.15 conf=*-master / Added: nutch/branches/2.x/src/plugin/parse-tika/howto_upgrade_tika.txt URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-tika/howto_upgrade_tika.txt?rev=1499959view=auto == --- nutch/branches/2.x/src/plugin/parse-tika/howto_upgrade_tika.txt (added) +++ nutch/branches/2.x/src/plugin/parse-tika/howto_upgrade_tika.txt Fri Jul 5 10:27:47 2013 @@ -0,0 +1,8 @@ +1. Upgrade Tika depencency in trunk/ivy/ivy.xml + +2. Upgrade Tika dependency in src/plugin/parse-tika/ivy.xml + +3. 
Upgrade Tika's own dependencies in src/plugin/parse-tika/plugin.xml + To get the list of dependencies and their versions execute: + $ ant -f ./build-ivy.xml + $ ls lib/ Modified: nutch/branches/2.x/src/plugin/parse-tika/ivy.xml URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-tika/ivy.xml?rev=1499959r1=1499958r2=1499959view=diff == --- nutch/branches/2.x/src/plugin/parse-tika/ivy.xml (original) +++ nutch/branches/2.x/src/plugin/parse-tika/ivy.xml Fri Jul 5 10:27:47 2013 @@ -36,7 +36,7 @@ /publications dependencies -dependency org=org.apache.tika name=tika-parsers rev=1.3 conf=*-default +dependency org=org.apache.tika name=tika-parsers rev=1.4 conf=*-default exclude org=org.apache.tika name=tika-core / /dependency /dependencies Modified: nutch/branches/2.x/src/plugin/parse-tika/plugin.xml URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-tika/plugin.xml?rev=1499959r1=1499958r2=1499959view=diff == --- nutch/branches/2.x/src/plugin/parse-tika/plugin.xml (original) +++ nutch/branches/2.x/src/plugin/parse-tika/plugin.xml Fri Jul 5 10:27:47 2013 @@ -28,39 +28,39 @@ library name=apache-mime4j-core-0.7.2.jar/ library name=apache-mime4j-dom-0.7.2.jar/ - library name=asm-3.1.jar/ + library name=asm-5.1.jar/ library name=aspectjrt-1.6.11.jar/ library name=bcmail-jdk15-1.45.jar/ library name=bcprov-jdk15-1.45.jar/ library name=boilerpipe-1.1.0.jar/ library name=commons-codec-1.5.jar/ - library name=commons-compress-1.4.1.jar/ + library name=commons-compress-1.5.jar/ library name=commons-logging-1.1.1.jar/ library name=dom4j-1.6.1.jar/ - library name=fontbox-1.7.1.jar/ + library name=fontbox-1.8.1.jar/ library name=geronimo-stax-api_1.0_spec-1.0.1.jar/ library name=isoparser-1.0-RC-1.jar/ library name=jdom-1.0.jar/ - library name=jempbox-1.7.1.jar/ + library name=jempbox-1.8.1.jar/ library name=juniversalchardet-1.0.3.jar/ library name=metadata-extractor-2.6.2.jar/ library name=netcdf-4.2-min.jar/ - library name=pdfbox-1.7.1.jar/ - 
library name=poi-3.8.jar/ - library name=poi-ooxml-3.8.jar/ - library name=poi-ooxml-schemas-3.8.jar/ - library name=poi-scratchpad-3.8.jar/ + library name=pdfbox-1.8.1.jar/ + library name=poi-3.9.jar/ + library name=poi-ooxml-3.9.jar/ + library name=poi-ooxml-schemas-3.9.jar/ + library name=poi-scratchpad-3.9.jar/ library name
svn commit: r1499960 - in /nutch/trunk: CHANGES.txt ivy/ivy.xml src/plugin/parse-tika/howto_upgrade_tika.txt src/plugin/parse-tika/ivy.xml src/plugin/parse-tika/plugin.xml
Author: markus Date: Fri Jul 5 10:28:46 2013 New Revision: 1499960 URL: http://svn.apache.org/r1499960 Log: NUTCH-1595 Upgrade to Tika 1.4 Added: nutch/trunk/src/plugin/parse-tika/howto_upgrade_tika.txt Modified: nutch/trunk/CHANGES.txt nutch/trunk/ivy/ivy.xml nutch/trunk/src/plugin/parse-tika/ivy.xml nutch/trunk/src/plugin/parse-tika/plugin.xml Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1499960r1=1499959r2=1499960view=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Fri Jul 5 10:28:46 2013 @@ -2,6 +2,8 @@ Nutch Change Log Nutch Development Trunk +* NUTCH-1595 Upgrade to Tika 1.4 (jnioche, markus) + * NUTCH-1598 ElasticSearchIndexer to read ImmutableSettings from config (markus) * NUTCH-1520 SegmentMerger looses records (markus) Modified: nutch/trunk/ivy/ivy.xml URL: http://svn.apache.org/viewvc/nutch/trunk/ivy/ivy.xml?rev=1499960r1=1499959r2=1499960view=diff == --- nutch/trunk/ivy/ivy.xml (original) +++ nutch/trunk/ivy/ivy.xml Fri Jul 5 10:28:46 2013 @@ -64,7 +64,7 @@ exclude org=ant name=ant / /dependency - dependency org=org.apache.tika name=tika-core rev=1.3 / + dependency org=org.apache.tika name=tika-core rev=1.4 / dependency org=com.ibm.icu name=icu4j rev=4.0.1 / dependency org=org.mortbay.jetty name=jetty-client Added: nutch/trunk/src/plugin/parse-tika/howto_upgrade_tika.txt URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-tika/howto_upgrade_tika.txt?rev=1499960view=auto == --- nutch/trunk/src/plugin/parse-tika/howto_upgrade_tika.txt (added) +++ nutch/trunk/src/plugin/parse-tika/howto_upgrade_tika.txt Fri Jul 5 10:28:46 2013 @@ -0,0 +1,8 @@ +1. Upgrade Tika depencency in trunk/ivy/ivy.xml + +2. Upgrade Tika dependency in src/plugin/parse-tika/ivy.xml + +3. 
Upgrade Tika's own dependencies in src/plugin/parse-tika/plugin.xml + To get the list of dependencies and their versions execute: + $ ant -f ./build-ivy.xml + $ ls lib/ Modified: nutch/trunk/src/plugin/parse-tika/ivy.xml URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-tika/ivy.xml?rev=1499960r1=1499959r2=1499960view=diff == --- nutch/trunk/src/plugin/parse-tika/ivy.xml (original) +++ nutch/trunk/src/plugin/parse-tika/ivy.xml Fri Jul 5 10:28:46 2013 @@ -36,7 +36,7 @@ /publications dependencies -dependency org=org.apache.tika name=tika-parsers rev=1.3 conf=*-default +dependency org=org.apache.tika name=tika-parsers rev=1.4 conf=*-default exclude org=org.apache.tika name=tika-core / /dependency /dependencies Modified: nutch/trunk/src/plugin/parse-tika/plugin.xml URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-tika/plugin.xml?rev=1499960r1=1499959r2=1499960view=diff == --- nutch/trunk/src/plugin/parse-tika/plugin.xml (original) +++ nutch/trunk/src/plugin/parse-tika/plugin.xml Fri Jul 5 10:28:46 2013 @@ -28,39 +28,39 @@ library name=apache-mime4j-core-0.7.2.jar/ library name=apache-mime4j-dom-0.7.2.jar/ - library name=asm-3.1.jar/ + library name=asm-4.1.jar/ library name=aspectjrt-1.6.11.jar/ library name=bcmail-jdk15-1.45.jar/ library name=bcprov-jdk15-1.45.jar/ library name=boilerpipe-1.1.0.jar/ library name=commons-codec-1.5.jar/ - library name=commons-compress-1.4.1.jar/ + library name=commons-compress-1.5.jar/ library name=commons-logging-1.1.1.jar/ library name=dom4j-1.6.1.jar/ - library name=fontbox-1.7.1.jar/ + library name=fontbox-1.8.1.jar/ library name=geronimo-stax-api_1.0_spec-1.0.1.jar/ library name=isoparser-1.0-RC-1.jar/ library name=jdom-1.0.jar/ - library name=jempbox-1.7.1.jar/ + library name=jempbox-1.8.1.jar/ library name=juniversalchardet-1.0.3.jar/ library name=metadata-extractor-2.6.2.jar/ library name=netcdf-4.2-min.jar/ - library name=pdfbox-1.7.1.jar/ - library name=poi-3.8.jar/ - library name=poi-ooxml-3.8.jar/ - 
library name=poi-ooxml-schemas-3.8.jar/ - library name=poi-scratchpad-3.8.jar/ + library name=pdfbox-1.8.1.jar/ + library name=poi-3.9.jar/ + library name=poi-ooxml-3.9.jar/ + library name=poi-ooxml-schemas-3.9.jar/ + library name=poi-scratchpad-3.9.jar/ library name=rome-0.9.jar/ library name=slf4j-api-1.5.6.jar/ library name=tagsoup-1.2.1.jar/ - library name=tika-parsers-1.3.jar/ + library name=tika
svn commit: r1499684 - in /nutch/trunk: CHANGES.txt src/java/org/apache/nutch/crawl/Injector.java
Author: markus Date: Thu Jul 4 08:50:25 2013 New Revision: 1499684 URL: http://svn.apache.org/r1499684 Log: NUTCH-1600 Injector overwrite does not always work properly Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1499684r1=1499683r2=1499684view=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Thu Jul 4 08:50:25 2013 @@ -2,6 +2,8 @@ Nutch Change Log Nutch Development Trunk +* NUTCH-1600 Injector overwrite does not always work properly (markus) + * NUTCH-1581 CrawlDB csv output to include metadata (markus) * NUTCH-1327 QueryStringNormalizer (markus) Modified: nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java?rev=1499684r1=1499683r2=1499684view=diff == --- nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java (original) +++ nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java Thu Jul 4 08:50:25 2013 @@ -186,6 +186,8 @@ public class Injector extends Configured scoreInjected = job.getFloat(db.score.injected, 1.0f); overwrite = job.getBoolean(db.injector.overwrite, false); update = job.getBoolean(db.injector.update, false); + LOG.info(Injector: overwrite: + overwrite); + LOG.info(Injector: update: + update); } public void close() {} @@ -209,22 +211,20 @@ public class Injector extends Configured oldSet = true; } } + CrawlDatum res = null; + + // Old default behaviour + if (injectedSet !oldSet) { +res = injected; + } else { +res = old; + } /** * Whether to overwrite, ignore or update existing records * @see https://issues.apache.org/jira/browse/NUTCH-1405 */ - - // Injected record already exists and overwrite but not update - if (injectedSet oldSet overwrite) { -res = injected; - -if (update) { - LOG.info(key.toString() + overwritten with injected record but update was specified.); -} - } - // 
Injected record already exists and update but not overwrite if (injectedSet oldSet update !overwrite) { res = old; @@ -233,11 +233,9 @@ public class Injector extends Configured old.setFetchInterval(injected.getFetchInterval() != interval ? injected.getFetchInterval() : old.getFetchInterval()); } - // Old default behaviour - if (injectedSet !oldSet) { + // Injected record already exists and overwrite + if (injectedSet oldSet overwrite) { res = injected; - } else { -res = old; } output.collect(key, res);
svn commit: r1499696 - in /nutch/trunk: CHANGES.txt src/plugin/headings/src/java/org/apache/nutch/parse/headings/HeadingsParseFilter.java
Author: markus Date: Thu Jul 4 09:07:12 2013 New Revision: 1499696 URL: http://svn.apache.org/r1499696 Log: NUTCH-1597 HeadingsParseFilter to trim and remove exess whitespace Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/plugin/headings/src/java/org/apache/nutch/parse/headings/HeadingsParseFilter.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1499696r1=1499695r2=1499696view=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Thu Jul 4 09:07:12 2013 @@ -2,6 +2,8 @@ Nutch Change Log Nutch Development Trunk +* NUTCH-1597 HeadingsParseFilter to trim and remove exess whitespace (markus) + * NUTCH-1601 ElasticSearchIndexer fails to properly delete documents (markus) * NUTCH-1600 Injector overwrite does not always work properly (markus) Modified: nutch/trunk/src/plugin/headings/src/java/org/apache/nutch/parse/headings/HeadingsParseFilter.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/headings/src/java/org/apache/nutch/parse/headings/HeadingsParseFilter.java?rev=1499696r1=1499695r2=1499696view=diff == --- nutch/trunk/src/plugin/headings/src/java/org/apache/nutch/parse/headings/HeadingsParseFilter.java (original) +++ nutch/trunk/src/plugin/headings/src/java/org/apache/nutch/parse/headings/HeadingsParseFilter.java Thu Jul 4 09:07:12 2013 @@ -19,6 +19,7 @@ package org.apache.nutch.parse.headings; import java.util.ArrayList; import java.util.List; +import java.util.regex.*; import org.apache.hadoop.conf.Configuration; import org.apache.nutch.parse.HTMLMetaTags; import org.apache.nutch.parse.Parse; @@ -33,6 +34,11 @@ import org.w3c.dom.*; */ public class HeadingsParseFilter implements HtmlParseFilter { + /** + * Pattern used to strip surpluss whitespace + */ + protected static Pattern whitespacePattern = Pattern.compile(\\s+); + private Configuration conf; private DocumentFragment doc; private String[] headings; @@ -113,6 +119,8 @@ public class HeadingsParseFilter impleme } } -return 
buffer.toString(); +// Return with stripped surplus whitespace +Matcher matcher = whitespacePattern.matcher(buffer.toString().trim()); +return matcher.replaceAll( ).trim(); } }
svn commit: r1499722 - in /nutch/trunk: CHANGES.txt src/plugin/headings/src/java/org/apache/nutch/parse/headings/HeadingsParseFilter.java
Author: markus Date: Thu Jul 4 11:13:34 2013 New Revision: 1499722 URL: http://svn.apache.org/r1499722 Log: NUTCH-1596 HeadingsParseFilter not thread safe Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/plugin/headings/src/java/org/apache/nutch/parse/headings/HeadingsParseFilter.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1499722r1=1499721r2=1499722view=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Thu Jul 4 11:13:34 2013 @@ -2,7 +2,9 @@ Nutch Change Log Nutch Development Trunk -* NUTCH-1597 HeadingsParseFilter to trim and remove exess whitespace (markus) +* NUTCH-1596 HeadingsParseFilter not thread safe (snagel via markus) + +* NUTCH-1597 HeadingsParseFilter to trim and remove exess whitespace (markus) * NUTCH-1601 ElasticSearchIndexer fails to properly delete documents (markus) Modified: nutch/trunk/src/plugin/headings/src/java/org/apache/nutch/parse/headings/HeadingsParseFilter.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/headings/src/java/org/apache/nutch/parse/headings/HeadingsParseFilter.java?rev=1499722r1=1499721r2=1499722view=diff == --- nutch/trunk/src/plugin/headings/src/java/org/apache/nutch/parse/headings/HeadingsParseFilter.java (original) +++ nutch/trunk/src/plugin/headings/src/java/org/apache/nutch/parse/headings/HeadingsParseFilter.java Thu Jul 4 11:13:34 2013 @@ -40,17 +40,14 @@ public class HeadingsParseFilter impleme protected static Pattern whitespacePattern = Pattern.compile(\\s+); private Configuration conf; - private DocumentFragment doc; private String[] headings; private boolean multiValued = false; public ParseResult filter(Content content, ParseResult parseResult, HTMLMetaTags metaTags, DocumentFragment doc) { -this.doc = doc; - Parse parse = parseResult.get(content.getUrl()); for (int i = 0 ; headings != null i headings.length ; i++ ) { - ListString discoveredHeadings = getElement(headings[i]); + ListString discoveredHeadings = 
getElement(doc, headings[i]); if (discoveredHeadings.size() 0) { for (String heading : discoveredHeadings) { @@ -82,7 +79,7 @@ public class HeadingsParseFilter impleme /** * Finds the specified element and returns its value */ - protected ListString getElement(String element) { + protected ListString getElement(DocumentFragment doc, String element) { ListString headings = new ArrayListString(); NodeWalker walker = new NodeWalker(doc);
svn commit: r1498830 - in /nutch/trunk: CHANGES.txt src/java/org/apache/nutch/crawl/CrawlDbReader.java
Author: markus Date: Tue Jul 2 08:36:13 2013 New Revision: 1498830 URL: http://svn.apache.org/r1498830 Log: NUTCH-1327 QueryStringNormalizer Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1498830r1=1498829r2=1498830view=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Tue Jul 2 08:36:13 2013 @@ -2,6 +2,8 @@ Nutch Change Log Nutch Development Trunk +* NUTCH-1327 QueryStringNormalizer (markus) + * NUTCH-1593 Normalize option missing in SegmentMerger's usage (markus) * NUTCH-1580 index-static returns object instead of value for index.static (Antoinette, lewismc, snagel) Modified: nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java?rev=1498830r1=1498829r2=1498830view=diff == --- nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java (original) +++ nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java Tue Jul 2 08:36:13 2013 @@ -24,6 +24,7 @@ import java.net.URL; import java.util.Date; import java.util.Iterator; import java.util.Map; +import java.util.Map.Entry; import java.util.Random; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -41,6 +42,7 @@ import org.apache.hadoop.io.LongWritable import org.apache.hadoop.io.MapFile; import org.apache.hadoop.io.SequenceFile; import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.Writable; import org.apache.hadoop.mapred.FileInputFormat; import org.apache.hadoop.mapred.FileOutputFormat; import org.apache.hadoop.mapred.JobClient; @@ -64,42 +66,41 @@ import org.apache.nutch.util.StringUtil; /** * Read utility for the CrawlDB. 
- * + * * @author Andrzej Bialecki - * + * */ public class CrawlDbReader implements Closeable { public static final Logger LOG = LoggerFactory.getLogger(CrawlDbReader.class); private MapFile.Reader[] readers = null; - + private void openReaders(String crawlDb, Configuration config) throws IOException { if (readers != null) return; FileSystem fs = FileSystem.get(config); readers = MapFileOutputFormat.getReaders(fs, new Path(crawlDb, CrawlDb.CURRENT_NAME), config); } - + private void closeReaders() { if (readers == null) return; for (int i = 0; i readers.length; i++) { try { readers[i].close(); } catch (Exception e) { - + } } } - + public static class CrawlDatumCsvOutputFormat extends FileOutputFormatText,CrawlDatum { protected static class LineRecordWriter implements RecordWriterText,CrawlDatum { private DataOutputStream out; - public LineRecordWriter(DataOutputStream out) { this.out = out; try { - out.writeBytes(Url;Status code;Status name;Fetch Time;Modified Time;Retries since fetch;Retry interval seconds;Retry interval days;Score;Signature\n); + out.writeBytes(Url;Status code;Status name;Fetch Time;Modified Time;Retries since fetch;Retry interval seconds;Retry interval days;Score;Signature;Metadata\n); } catch (IOException e) {} } @@ -129,6 +130,18 @@ public class CrawlDbReader implements Cl out.writeByte(''); out.writeBytes(value.getSignature() != null ? 
StringUtil.toHexString(value.getSignature()): null); out.writeByte(''); + out.writeByte(';'); + out.writeByte(''); + if (value.getMetaData() != null) { +for (EntryWritable, Writable e : value.getMetaData().entrySet()) { + out.writeBytes(e.getKey().toString()); + out.writeByte(':'); + out.writeBytes(e.getValue().toString()); + out.writeBytes(|||); +} + } + out.writeByte(''); + out.writeByte('\n'); } @@ -165,10 +178,10 @@ public class CrawlDbReader implements Cl } } } - + public static class CrawlDbStatCombiner implements ReducerText, LongWritable, Text, LongWritable { LongWritable val = new LongWritable(); - + public CrawlDbStatCombiner() { } public void configure(JobConf job) { } public void close() {} @@ -249,7 +262,7 @@ public class CrawlDbReader implements Cl public static class CrawlDbTopNMapper implements MapperText, CrawlDatum, FloatWritable, Text { private static final FloatWritable fw = new FloatWritable(); private float min = 0.0f; - + public void configure(JobConf job) { long lmin = job.getLong(db.reader.topn.min, 0); if (lmin != 0) { @@ -264,11 +277,11 @@ public class CrawlDbReader implements Cl output.collect(fw, key
svn commit: r1498832 - in /nutch/trunk: ./ src/plugin/ src/plugin/urlnormalizer-querystring/ src/plugin/urlnormalizer-querystring/src/ src/plugin/urlnormalizer-querystring/src/java/ src/plugin/urlnorm
Author: markus Date: Tue Jul 2 08:37:40 2013 New Revision: 1498832 URL: http://svn.apache.org/r1498832 Log: NUTCH-1581 CrawlDB csv output to include metadata Added: nutch/trunk/src/plugin/urlnormalizer-querystring/ nutch/trunk/src/plugin/urlnormalizer-querystring/build.xml nutch/trunk/src/plugin/urlnormalizer-querystring/ivy.xml nutch/trunk/src/plugin/urlnormalizer-querystring/plugin.xml nutch/trunk/src/plugin/urlnormalizer-querystring/src/ nutch/trunk/src/plugin/urlnormalizer-querystring/src/java/ nutch/trunk/src/plugin/urlnormalizer-querystring/src/java/org/ nutch/trunk/src/plugin/urlnormalizer-querystring/src/java/org/apache/ nutch/trunk/src/plugin/urlnormalizer-querystring/src/java/org/apache/nutch/ nutch/trunk/src/plugin/urlnormalizer-querystring/src/java/org/apache/nutch/net/ nutch/trunk/src/plugin/urlnormalizer-querystring/src/java/org/apache/nutch/net/urlnormalizer/ nutch/trunk/src/plugin/urlnormalizer-querystring/src/java/org/apache/nutch/net/urlnormalizer/querystring/ nutch/trunk/src/plugin/urlnormalizer-querystring/src/java/org/apache/nutch/net/urlnormalizer/querystring/QuerystringURLNormalizer.java nutch/trunk/src/plugin/urlnormalizer-querystring/src/test/ nutch/trunk/src/plugin/urlnormalizer-querystring/src/test/org/ nutch/trunk/src/plugin/urlnormalizer-querystring/src/test/org/apache/ nutch/trunk/src/plugin/urlnormalizer-querystring/src/test/org/apache/nutch/ nutch/trunk/src/plugin/urlnormalizer-querystring/src/test/org/apache/nutch/net/ nutch/trunk/src/plugin/urlnormalizer-querystring/src/test/org/apache/nutch/net/urlnormalizer/ nutch/trunk/src/plugin/urlnormalizer-querystring/src/test/org/apache/nutch/net/urlnormalizer/querystring/ nutch/trunk/src/plugin/urlnormalizer-querystring/src/test/org/apache/nutch/net/urlnormalizer/querystring/TestQuerystringURLNormalizer.java Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/plugin/build.xml Modified: nutch/trunk/CHANGES.txt URL: 
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1498832r1=1498831r2=1498832view=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Tue Jul 2 08:37:40 2013 @@ -2,6 +2,8 @@ Nutch Change Log Nutch Development Trunk +* NUTCH-1581 CrawlDB csv output to include metadata (markus) + * NUTCH-1327 QueryStringNormalizer (markus) * NUTCH-1593 Normalize option missing in SegmentMerger's usage (markus) Modified: nutch/trunk/src/plugin/build.xml URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/build.xml?rev=1498832r1=1498831r2=1498832view=diff == --- nutch/trunk/src/plugin/build.xml (original) +++ nutch/trunk/src/plugin/build.xml Tue Jul 2 08:37:40 2013 @@ -70,6 +70,7 @@ ant dir=urlnormalizer-basic target=deploy/ ant dir=urlnormalizer-host target=deploy/ ant dir=urlnormalizer-pass target=deploy/ + ant dir=urlnormalizer-querystring target=deploy/ ant dir=urlnormalizer-regex target=deploy/ /target @@ -105,6 +106,7 @@ ant dir=urlnormalizer-basic target=test/ ant dir=urlnormalizer-host target=test/ ant dir=urlnormalizer-pass target=test/ + ant dir=urlnormalizer-querystring target=test/ ant dir=urlnormalizer-regex target=test/ /parallel /target @@ -159,6 +161,7 @@ ant dir=urlnormalizer-host target=clean/ ant dir=urlnormalizer-basic target=clean/ ant dir=urlnormalizer-pass target=clean/ +ant dir=urlnormalizer-querystring target=clean/ ant dir=urlnormalizer-regex target=clean/ /target /project Added: nutch/trunk/src/plugin/urlnormalizer-querystring/build.xml URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlnormalizer-querystring/build.xml?rev=1498832view=auto == --- nutch/trunk/src/plugin/urlnormalizer-querystring/build.xml (added) +++ nutch/trunk/src/plugin/urlnormalizer-querystring/build.xml Tue Jul 2 08:37:40 2013 @@ -0,0 +1,22 @@ +?xml version=1.0? +!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. 
See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the License); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an AS IS BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +-- +project name=urlnormalizer
svn commit: r1498346 - in /nutch/trunk: CHANGES.txt src/java/org/apache/nutch/segment/SegmentMerger.java
Author: markus Date: Mon Jul 1 10:03:12 2013 New Revision: 1498346 URL: http://svn.apache.org/r1498346 Log: NUTCH-1593 Normalize option missing in SegmentMerger's usage Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1498346r1=1498345r2=1498346view=diff == --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Mon Jul 1 10:03:12 2013 @@ -2,6 +2,8 @@ Nutch Change Log Nutch Development Trunk +* NUTCH-1593 Normalize option missing in SegmentMerger's usage (markus) + * NUTCH-1580 index-static returns object instead of value for index.static (Antoinette, lewismc, snagel) * NUTCH-1126 JUnit test for urlfilter-prefix (Talat UYARER via markus) Modified: nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java?rev=1498346r1=1498345r2=1498346view=diff == --- nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java (original) +++ nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java Mon Jul 1 10:03:12 2013 @@ -649,6 +649,7 @@ public class SegmentMerger extends Confi System.err.println(\t-dir segments\tparent dir containing several segments); System.err.println(\tseg1 seg2 ...\tlist of segment dirs); System.err.println(\t-filter\t\tfilter out URL-s prohibited by current URLFilters); + System.err.println(\t-normalize\t\tnormalize URL via current URLNormalizers); System.err.println(\t-slice \tcreate many output segments, each containing URLs); return; }
svn commit: r1496023 - in /nutch/branches/2.x: ./ src/plugin/ src/plugin/urlfilter-prefix/src/test/ src/plugin/urlfilter-prefix/src/test/org/ src/plugin/urlfilter-prefix/src/test/org/apache/ src/plugi
Author: markus Date: Mon Jun 24 13:12:59 2013 New Revision: 1496023 URL: http://svn.apache.org/r1496023 Log: NUTCH-1126 JUnit test for urlfilter-prefix Added: nutch/branches/2.x/src/plugin/urlfilter-prefix/src/test/ nutch/branches/2.x/src/plugin/urlfilter-prefix/src/test/org/ nutch/branches/2.x/src/plugin/urlfilter-prefix/src/test/org/apache/ nutch/branches/2.x/src/plugin/urlfilter-prefix/src/test/org/apache/nutch/ nutch/branches/2.x/src/plugin/urlfilter-prefix/src/test/org/apache/nutch/urlfilter/ nutch/branches/2.x/src/plugin/urlfilter-prefix/src/test/org/apache/nutch/urlfilter/prefix/ nutch/branches/2.x/src/plugin/urlfilter-prefix/src/test/org/apache/nutch/urlfilter/prefix/TestPrefixURLFilter.java Modified: nutch/branches/2.x/CHANGES.txt nutch/branches/2.x/src/plugin/build.xml Modified: nutch/branches/2.x/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1496023r1=1496022r2=1496023view=diff == --- nutch/branches/2.x/CHANGES.txt (original) +++ nutch/branches/2.x/CHANGES.txt Mon Jun 24 13:12:59 2013 @@ -2,6 +2,8 @@ Nutch Change Log Current Development +* NUTCH-1126 JUnit test for urlfilter-prefix (Talat UYARER via markus) + * NUTCH-1585 Ensure duplicate tags do not exist in microformat-reltag tag set (lewismc) * NUTCH-1475 Index-More Plugin -- A better fall back value for date field (James Sullivan, snagel via lewismc) Modified: nutch/branches/2.x/src/plugin/build.xml URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/build.xml?rev=1496023r1=1496022r2=1496023view=diff == --- nutch/branches/2.x/src/plugin/build.xml (original) +++ nutch/branches/2.x/src/plugin/build.xml Mon Jun 24 13:12:59 2013 @@ -81,7 +81,8 @@ ant dir=language-identifier target=test/ ant dir=protocol-httpclient target=test/ ant dir=urlfilter-automaton target=test/ - ant dir=urlfilter-domain target=test / + ant dir=urlfilter-domain target=test/ + ant dir=urlfilter-prefix target=test/ ant dir=urlfilter-regex target=test/ ant dir=urlfilter-suffix 
target=test/ ant dir=urlnormalizer-basic target=test/ Added: nutch/branches/2.x/src/plugin/urlfilter-prefix/src/test/org/apache/nutch/urlfilter/prefix/TestPrefixURLFilter.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/urlfilter-prefix/src/test/org/apache/nutch/urlfilter/prefix/TestPrefixURLFilter.java?rev=1496023view=auto == --- nutch/branches/2.x/src/plugin/urlfilter-prefix/src/test/org/apache/nutch/urlfilter/prefix/TestPrefixURLFilter.java (added) +++ nutch/branches/2.x/src/plugin/urlfilter-prefix/src/test/org/apache/nutch/urlfilter/prefix/TestPrefixURLFilter.java Mon Jun 24 13:12:59 2013 @@ -0,0 +1,79 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the License); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.urlfilter.prefix; + +import junit.framework.Test; +import junit.framework.TestCase; +import junit.framework.TestSuite; +import junit.textui.TestRunner; + +import java.io.IOException; + + +/** + * JUnit test for codePrefixURLFilter/code. 
+ * + * @author Talat Uyarer + * @author Cihad Guzel + */ +public class TestPrefixURLFilter extends TestCase { + private static final String prefixes = +# this is a comment\n + +\n + +http://\n; + +https://\n; + +file://\n + +ftp://\n;; + + private static final String[] urls = new String[] { +http://www.example.com/;, +https://www.example.com/;, +ftp://www.example.com/;, +file://www.example.com/, +abcd://www.example.com/, +www.example.com/, + }; + + private static String[] urlsModeAccept = new String[] { +urls[0], +urls[1], +urls[2], +urls[3], +null, +null + }; + + private PrefixURLFilter filter = null; + + public static Test suite() { +return new TestSuite(TestPrefixURLFilter.class); + } + + public static void main(String[] args) { +TestRunner.run(suite()); + } + + public void setUp() throws IOException { +filter = new PrefixURLFilter(prefixes